diff options
| author | James Morris <james.l.morris@oracle.com> | 2017-07-24 20:44:18 -0400 |
|---|---|---|
| committer | James Morris <james.l.morris@oracle.com> | 2017-07-24 20:44:18 -0400 |
| commit | 53a2ebaaabc1eb8458796fec3bc1e0e80746b642 (patch) | |
| tree | 9d1f9227b49392cdd2edcc01057517da4f4b09c2 /kernel | |
| parent | 3cf29931453215536916d0c4da953fce1911ced3 (diff) | |
| parent | 520eccdfe187591a51ea9ab4c1a024ae4d0f68d9 (diff) | |
sync to Linus v4.13-rc2 for subsystem developers to work against
Diffstat (limited to 'kernel')
163 files changed, 11323 insertions, 7595 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 72aa080f91f0..4cb8e8b23c6e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
| @@ -82,7 +82,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o | |||
| 82 | obj-$(CONFIG_KGDB) += debug/ | 82 | obj-$(CONFIG_KGDB) += debug/ |
| 83 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o | 83 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o |
| 84 | obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o | 84 | obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o |
| 85 | obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o | 85 | obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o |
| 86 | obj-$(CONFIG_SECCOMP) += seccomp.o | 86 | obj-$(CONFIG_SECCOMP) += seccomp.o |
| 87 | obj-$(CONFIG_RELAY) += relay.o | 87 | obj-$(CONFIG_RELAY) += relay.o |
| 88 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o | 88 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o |
diff --git a/kernel/async.c b/kernel/async.c index d2edd6efec56..2cbd3dd5940d 100644 --- a/kernel/async.c +++ b/kernel/async.c | |||
| @@ -114,14 +114,14 @@ static void async_run_entry_fn(struct work_struct *work) | |||
| 114 | ktime_t uninitialized_var(calltime), delta, rettime; | 114 | ktime_t uninitialized_var(calltime), delta, rettime; |
| 115 | 115 | ||
| 116 | /* 1) run (and print duration) */ | 116 | /* 1) run (and print duration) */ |
| 117 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 117 | if (initcall_debug && system_state < SYSTEM_RUNNING) { |
| 118 | pr_debug("calling %lli_%pF @ %i\n", | 118 | pr_debug("calling %lli_%pF @ %i\n", |
| 119 | (long long)entry->cookie, | 119 | (long long)entry->cookie, |
| 120 | entry->func, task_pid_nr(current)); | 120 | entry->func, task_pid_nr(current)); |
| 121 | calltime = ktime_get(); | 121 | calltime = ktime_get(); |
| 122 | } | 122 | } |
| 123 | entry->func(entry->data, entry->cookie); | 123 | entry->func(entry->data, entry->cookie); |
| 124 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 124 | if (initcall_debug && system_state < SYSTEM_RUNNING) { |
| 125 | rettime = ktime_get(); | 125 | rettime = ktime_get(); |
| 126 | delta = ktime_sub(rettime, calltime); | 126 | delta = ktime_sub(rettime, calltime); |
| 127 | pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", | 127 | pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", |
| @@ -284,14 +284,14 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain | |||
| 284 | { | 284 | { |
| 285 | ktime_t uninitialized_var(starttime), delta, endtime; | 285 | ktime_t uninitialized_var(starttime), delta, endtime; |
| 286 | 286 | ||
| 287 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 287 | if (initcall_debug && system_state < SYSTEM_RUNNING) { |
| 288 | pr_debug("async_waiting @ %i\n", task_pid_nr(current)); | 288 | pr_debug("async_waiting @ %i\n", task_pid_nr(current)); |
| 289 | starttime = ktime_get(); | 289 | starttime = ktime_get(); |
| 290 | } | 290 | } |
| 291 | 291 | ||
| 292 | wait_event(async_done, lowest_in_progress(domain) >= cookie); | 292 | wait_event(async_done, lowest_in_progress(domain) >= cookie); |
| 293 | 293 | ||
| 294 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 294 | if (initcall_debug && system_state < SYSTEM_RUNNING) { |
| 295 | endtime = ktime_get(); | 295 | endtime = ktime_get(); |
| 296 | delta = ktime_sub(endtime, starttime); | 296 | delta = ktime_sub(endtime, starttime); |
| 297 | 297 | ||
diff --git a/kernel/audit.c b/kernel/audit.c index 4b7d49868ce1..6dd556931739 100644 --- a/kernel/audit.c +++ b/kernel/audit.c | |||
| @@ -575,12 +575,16 @@ static void kauditd_retry_skb(struct sk_buff *skb) | |||
| 575 | 575 | ||
| 576 | /** | 576 | /** |
| 577 | * auditd_reset - Disconnect the auditd connection | 577 | * auditd_reset - Disconnect the auditd connection |
| 578 | * @ac: auditd connection state | ||
| 578 | * | 579 | * |
| 579 | * Description: | 580 | * Description: |
| 580 | * Break the auditd/kauditd connection and move all the queued records into the | 581 | * Break the auditd/kauditd connection and move all the queued records into the |
| 581 | * hold queue in case auditd reconnects. | 582 | * hold queue in case auditd reconnects. It is important to note that the @ac |
| 583 | * pointer should never be dereferenced inside this function as it may be NULL | ||
| 584 | * or invalid, you can only compare the memory address! If @ac is NULL then | ||
| 585 | * the connection will always be reset. | ||
| 582 | */ | 586 | */ |
| 583 | static void auditd_reset(void) | 587 | static void auditd_reset(const struct auditd_connection *ac) |
| 584 | { | 588 | { |
| 585 | unsigned long flags; | 589 | unsigned long flags; |
| 586 | struct sk_buff *skb; | 590 | struct sk_buff *skb; |
| @@ -590,17 +594,21 @@ static void auditd_reset(void) | |||
| 590 | spin_lock_irqsave(&auditd_conn_lock, flags); | 594 | spin_lock_irqsave(&auditd_conn_lock, flags); |
| 591 | ac_old = rcu_dereference_protected(auditd_conn, | 595 | ac_old = rcu_dereference_protected(auditd_conn, |
| 592 | lockdep_is_held(&auditd_conn_lock)); | 596 | lockdep_is_held(&auditd_conn_lock)); |
| 597 | if (ac && ac != ac_old) { | ||
| 598 | /* someone already registered a new auditd connection */ | ||
| 599 | spin_unlock_irqrestore(&auditd_conn_lock, flags); | ||
| 600 | return; | ||
| 601 | } | ||
| 593 | rcu_assign_pointer(auditd_conn, NULL); | 602 | rcu_assign_pointer(auditd_conn, NULL); |
| 594 | spin_unlock_irqrestore(&auditd_conn_lock, flags); | 603 | spin_unlock_irqrestore(&auditd_conn_lock, flags); |
| 595 | 604 | ||
| 596 | if (ac_old) | 605 | if (ac_old) |
| 597 | call_rcu(&ac_old->rcu, auditd_conn_free); | 606 | call_rcu(&ac_old->rcu, auditd_conn_free); |
| 598 | 607 | ||
| 599 | /* flush all of the main and retry queues to the hold queue */ | 608 | /* flush the retry queue to the hold queue, but don't touch the main |
| 609 | * queue since we need to process that normally for multicast */ | ||
| 600 | while ((skb = skb_dequeue(&audit_retry_queue))) | 610 | while ((skb = skb_dequeue(&audit_retry_queue))) |
| 601 | kauditd_hold_skb(skb); | 611 | kauditd_hold_skb(skb); |
| 602 | while ((skb = skb_dequeue(&audit_queue))) | ||
| 603 | kauditd_hold_skb(skb); | ||
| 604 | } | 612 | } |
| 605 | 613 | ||
| 606 | /** | 614 | /** |
| @@ -633,6 +641,7 @@ static int auditd_send_unicast_skb(struct sk_buff *skb) | |||
| 633 | ac = rcu_dereference(auditd_conn); | 641 | ac = rcu_dereference(auditd_conn); |
| 634 | if (!ac) { | 642 | if (!ac) { |
| 635 | rcu_read_unlock(); | 643 | rcu_read_unlock(); |
| 644 | kfree_skb(skb); | ||
| 636 | rc = -ECONNREFUSED; | 645 | rc = -ECONNREFUSED; |
| 637 | goto err; | 646 | goto err; |
| 638 | } | 647 | } |
| @@ -649,8 +658,8 @@ static int auditd_send_unicast_skb(struct sk_buff *skb) | |||
| 649 | return rc; | 658 | return rc; |
| 650 | 659 | ||
| 651 | err: | 660 | err: |
| 652 | if (rc == -ECONNREFUSED) | 661 | if (ac && rc == -ECONNREFUSED) |
| 653 | auditd_reset(); | 662 | auditd_reset(ac); |
| 654 | return rc; | 663 | return rc; |
| 655 | } | 664 | } |
| 656 | 665 | ||
| @@ -795,9 +804,9 @@ static int kauditd_thread(void *dummy) | |||
| 795 | rc = kauditd_send_queue(sk, portid, | 804 | rc = kauditd_send_queue(sk, portid, |
| 796 | &audit_hold_queue, UNICAST_RETRIES, | 805 | &audit_hold_queue, UNICAST_RETRIES, |
| 797 | NULL, kauditd_rehold_skb); | 806 | NULL, kauditd_rehold_skb); |
| 798 | if (rc < 0) { | 807 | if (ac && rc < 0) { |
| 799 | sk = NULL; | 808 | sk = NULL; |
| 800 | auditd_reset(); | 809 | auditd_reset(ac); |
| 801 | goto main_queue; | 810 | goto main_queue; |
| 802 | } | 811 | } |
| 803 | 812 | ||
| @@ -805,9 +814,9 @@ static int kauditd_thread(void *dummy) | |||
| 805 | rc = kauditd_send_queue(sk, portid, | 814 | rc = kauditd_send_queue(sk, portid, |
| 806 | &audit_retry_queue, UNICAST_RETRIES, | 815 | &audit_retry_queue, UNICAST_RETRIES, |
| 807 | NULL, kauditd_hold_skb); | 816 | NULL, kauditd_hold_skb); |
| 808 | if (rc < 0) { | 817 | if (ac && rc < 0) { |
| 809 | sk = NULL; | 818 | sk = NULL; |
| 810 | auditd_reset(); | 819 | auditd_reset(ac); |
| 811 | goto main_queue; | 820 | goto main_queue; |
| 812 | } | 821 | } |
| 813 | 822 | ||
| @@ -815,12 +824,13 @@ main_queue: | |||
| 815 | /* process the main queue - do the multicast send and attempt | 824 | /* process the main queue - do the multicast send and attempt |
| 816 | * unicast, dump failed record sends to the retry queue; if | 825 | * unicast, dump failed record sends to the retry queue; if |
| 817 | * sk == NULL due to previous failures we will just do the | 826 | * sk == NULL due to previous failures we will just do the |
| 818 | * multicast send and move the record to the retry queue */ | 827 | * multicast send and move the record to the hold queue */ |
| 819 | rc = kauditd_send_queue(sk, portid, &audit_queue, 1, | 828 | rc = kauditd_send_queue(sk, portid, &audit_queue, 1, |
| 820 | kauditd_send_multicast_skb, | 829 | kauditd_send_multicast_skb, |
| 821 | kauditd_retry_skb); | 830 | (sk ? |
| 822 | if (sk == NULL || rc < 0) | 831 | kauditd_retry_skb : kauditd_hold_skb)); |
| 823 | auditd_reset(); | 832 | if (ac && rc < 0) |
| 833 | auditd_reset(ac); | ||
| 824 | sk = NULL; | 834 | sk = NULL; |
| 825 | 835 | ||
| 826 | /* drop our netns reference, no auditd sends past this line */ | 836 | /* drop our netns reference, no auditd sends past this line */ |
| @@ -1230,7 +1240,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 1230 | auditd_pid, 1); | 1240 | auditd_pid, 1); |
| 1231 | 1241 | ||
| 1232 | /* unregister the auditd connection */ | 1242 | /* unregister the auditd connection */ |
| 1233 | auditd_reset(); | 1243 | auditd_reset(NULL); |
| 1234 | } | 1244 | } |
| 1235 | } | 1245 | } |
| 1236 | if (s.mask & AUDIT_STATUS_RATE_LIMIT) { | 1246 | if (s.mask & AUDIT_STATUS_RATE_LIMIT) { |
| @@ -1999,22 +2009,10 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) | |||
| 1999 | 2009 | ||
| 2000 | static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) | 2010 | static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) |
| 2001 | { | 2011 | { |
| 2002 | kernel_cap_t *perm = &name->fcap.permitted; | 2012 | audit_log_cap(ab, "cap_fp", &name->fcap.permitted); |
| 2003 | kernel_cap_t *inh = &name->fcap.inheritable; | 2013 | audit_log_cap(ab, "cap_fi", &name->fcap.inheritable); |
| 2004 | int log = 0; | 2014 | audit_log_format(ab, " cap_fe=%d cap_fver=%x", |
| 2005 | 2015 | name->fcap.fE, name->fcap_ver); | |
| 2006 | if (!cap_isclear(*perm)) { | ||
| 2007 | audit_log_cap(ab, "cap_fp", perm); | ||
| 2008 | log = 1; | ||
| 2009 | } | ||
| 2010 | if (!cap_isclear(*inh)) { | ||
| 2011 | audit_log_cap(ab, "cap_fi", inh); | ||
| 2012 | log = 1; | ||
| 2013 | } | ||
| 2014 | |||
| 2015 | if (log) | ||
| 2016 | audit_log_format(ab, " cap_fe=%d cap_fver=%x", | ||
| 2017 | name->fcap.fE, name->fcap_ver); | ||
| 2018 | } | 2016 | } |
| 2019 | 2017 | ||
| 2020 | static inline int audit_copy_fcaps(struct audit_names *name, | 2018 | static inline int audit_copy_fcaps(struct audit_names *name, |
diff --git a/kernel/audit.h b/kernel/audit.h index ddfce2ea4891..b331d9b83f63 100644 --- a/kernel/audit.h +++ b/kernel/audit.h | |||
| @@ -68,6 +68,7 @@ struct audit_cap_data { | |||
| 68 | unsigned int fE; /* effective bit of file cap */ | 68 | unsigned int fE; /* effective bit of file cap */ |
| 69 | kernel_cap_t effective; /* effective set of process */ | 69 | kernel_cap_t effective; /* effective set of process */ |
| 70 | }; | 70 | }; |
| 71 | kernel_cap_t ambient; | ||
| 71 | }; | 72 | }; |
| 72 | 73 | ||
| 73 | /* When fs/namei.c:getname() is called, we store the pointer in name and bump | 74 | /* When fs/namei.c:getname() is called, we store the pointer in name and bump |
| @@ -247,13 +248,13 @@ struct audit_netlink_list { | |||
| 247 | struct sk_buff_head q; | 248 | struct sk_buff_head q; |
| 248 | }; | 249 | }; |
| 249 | 250 | ||
| 250 | int audit_send_list(void *); | 251 | int audit_send_list(void *_dest); |
| 251 | 252 | ||
| 252 | extern int selinux_audit_rule_update(void); | 253 | extern int selinux_audit_rule_update(void); |
| 253 | 254 | ||
| 254 | extern struct mutex audit_filter_mutex; | 255 | extern struct mutex audit_filter_mutex; |
| 255 | extern int audit_del_rule(struct audit_entry *); | 256 | extern int audit_del_rule(struct audit_entry *entry); |
| 256 | extern void audit_free_rule_rcu(struct rcu_head *); | 257 | extern void audit_free_rule_rcu(struct rcu_head *head); |
| 257 | extern struct list_head audit_filter_list[]; | 258 | extern struct list_head audit_filter_list[]; |
| 258 | 259 | ||
| 259 | extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); | 260 | extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); |
| @@ -301,17 +302,17 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark | |||
| 301 | #endif /* CONFIG_AUDIT_WATCH */ | 302 | #endif /* CONFIG_AUDIT_WATCH */ |
| 302 | 303 | ||
| 303 | #ifdef CONFIG_AUDIT_TREE | 304 | #ifdef CONFIG_AUDIT_TREE |
| 304 | extern struct audit_chunk *audit_tree_lookup(const struct inode *); | 305 | extern struct audit_chunk *audit_tree_lookup(const struct inode *inode); |
| 305 | extern void audit_put_chunk(struct audit_chunk *); | 306 | extern void audit_put_chunk(struct audit_chunk *chunk); |
| 306 | extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *); | 307 | extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree); |
| 307 | extern int audit_make_tree(struct audit_krule *, char *, u32); | 308 | extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op); |
| 308 | extern int audit_add_tree_rule(struct audit_krule *); | 309 | extern int audit_add_tree_rule(struct audit_krule *rule); |
| 309 | extern int audit_remove_tree_rule(struct audit_krule *); | 310 | extern int audit_remove_tree_rule(struct audit_krule *rule); |
| 310 | extern void audit_trim_trees(void); | 311 | extern void audit_trim_trees(void); |
| 311 | extern int audit_tag_tree(char *old, char *new); | 312 | extern int audit_tag_tree(char *old, char *new); |
| 312 | extern const char *audit_tree_path(struct audit_tree *); | 313 | extern const char *audit_tree_path(struct audit_tree *tree); |
| 313 | extern void audit_put_tree(struct audit_tree *); | 314 | extern void audit_put_tree(struct audit_tree *tree); |
| 314 | extern void audit_kill_trees(struct list_head *); | 315 | extern void audit_kill_trees(struct list_head *list); |
| 315 | #else | 316 | #else |
| 316 | #define audit_remove_tree_rule(rule) BUG() | 317 | #define audit_remove_tree_rule(rule) BUG() |
| 317 | #define audit_add_tree_rule(rule) -EINVAL | 318 | #define audit_add_tree_rule(rule) -EINVAL |
| @@ -323,7 +324,7 @@ extern void audit_kill_trees(struct list_head *); | |||
| 323 | #define audit_kill_trees(list) BUG() | 324 | #define audit_kill_trees(list) BUG() |
| 324 | #endif | 325 | #endif |
| 325 | 326 | ||
| 326 | extern char *audit_unpack_string(void **, size_t *, size_t); | 327 | extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); |
| 327 | 328 | ||
| 328 | extern pid_t audit_sig_pid; | 329 | extern pid_t audit_sig_pid; |
| 329 | extern kuid_t audit_sig_uid; | 330 | extern kuid_t audit_sig_uid; |
| @@ -333,7 +334,7 @@ extern int audit_filter(int msgtype, unsigned int listtype); | |||
| 333 | 334 | ||
| 334 | #ifdef CONFIG_AUDITSYSCALL | 335 | #ifdef CONFIG_AUDITSYSCALL |
| 335 | extern int audit_signal_info(int sig, struct task_struct *t); | 336 | extern int audit_signal_info(int sig, struct task_struct *t); |
| 336 | extern void audit_filter_inodes(struct task_struct *, struct audit_context *); | 337 | extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); |
| 337 | extern struct list_head *audit_killed_trees(void); | 338 | extern struct list_head *audit_killed_trees(void); |
| 338 | #else | 339 | #else |
| 339 | #define audit_signal_info(s,t) AUDIT_DISABLED | 340 | #define audit_signal_info(s,t) AUDIT_DISABLED |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index bb724baa7ac9..3260ba2312a9 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
| @@ -1261,6 +1261,7 @@ static void show_special(struct audit_context *context, int *call_panic) | |||
| 1261 | audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); | 1261 | audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); |
| 1262 | audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); | 1262 | audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); |
| 1263 | audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); | 1263 | audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); |
| 1264 | audit_log_cap(ab, "cap_pa", &context->capset.cap.ambient); | ||
| 1264 | break; | 1265 | break; |
| 1265 | case AUDIT_MMAP: | 1266 | case AUDIT_MMAP: |
| 1266 | audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, | 1267 | audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, |
| @@ -1382,9 +1383,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
| 1382 | audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted); | 1383 | audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted); |
| 1383 | audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable); | 1384 | audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable); |
| 1384 | audit_log_cap(ab, "old_pe", &axs->old_pcap.effective); | 1385 | audit_log_cap(ab, "old_pe", &axs->old_pcap.effective); |
| 1385 | audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted); | 1386 | audit_log_cap(ab, "old_pa", &axs->old_pcap.ambient); |
| 1386 | audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable); | 1387 | audit_log_cap(ab, "pp", &axs->new_pcap.permitted); |
| 1387 | audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); | 1388 | audit_log_cap(ab, "pi", &axs->new_pcap.inheritable); |
| 1389 | audit_log_cap(ab, "pe", &axs->new_pcap.effective); | ||
| 1390 | audit_log_cap(ab, "pa", &axs->new_pcap.ambient); | ||
| 1388 | break; } | 1391 | break; } |
| 1389 | 1392 | ||
| 1390 | } | 1393 | } |
| @@ -2342,10 +2345,12 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, | |||
| 2342 | ax->old_pcap.permitted = old->cap_permitted; | 2345 | ax->old_pcap.permitted = old->cap_permitted; |
| 2343 | ax->old_pcap.inheritable = old->cap_inheritable; | 2346 | ax->old_pcap.inheritable = old->cap_inheritable; |
| 2344 | ax->old_pcap.effective = old->cap_effective; | 2347 | ax->old_pcap.effective = old->cap_effective; |
| 2348 | ax->old_pcap.ambient = old->cap_ambient; | ||
| 2345 | 2349 | ||
| 2346 | ax->new_pcap.permitted = new->cap_permitted; | 2350 | ax->new_pcap.permitted = new->cap_permitted; |
| 2347 | ax->new_pcap.inheritable = new->cap_inheritable; | 2351 | ax->new_pcap.inheritable = new->cap_inheritable; |
| 2348 | ax->new_pcap.effective = new->cap_effective; | 2352 | ax->new_pcap.effective = new->cap_effective; |
| 2353 | ax->new_pcap.ambient = new->cap_ambient; | ||
| 2349 | return 0; | 2354 | return 0; |
| 2350 | } | 2355 | } |
| 2351 | 2356 | ||
| @@ -2364,6 +2369,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old) | |||
| 2364 | context->capset.cap.effective = new->cap_effective; | 2369 | context->capset.cap.effective = new->cap_effective; |
| 2365 | context->capset.cap.inheritable = new->cap_effective; | 2370 | context->capset.cap.inheritable = new->cap_effective; |
| 2366 | context->capset.cap.permitted = new->cap_permitted; | 2371 | context->capset.cap.permitted = new->cap_permitted; |
| 2372 | context->capset.cap.ambient = new->cap_ambient; | ||
| 2367 | context->type = AUDIT_CAPSET; | 2373 | context->type = AUDIT_CAPSET; |
| 2368 | } | 2374 | } |
| 2369 | 2375 | ||
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 5e00b2333c26..d771a3872500 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c | |||
| @@ -86,6 +86,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) | |||
| 86 | array->map.key_size = attr->key_size; | 86 | array->map.key_size = attr->key_size; |
| 87 | array->map.value_size = attr->value_size; | 87 | array->map.value_size = attr->value_size; |
| 88 | array->map.max_entries = attr->max_entries; | 88 | array->map.max_entries = attr->max_entries; |
| 89 | array->map.map_flags = attr->map_flags; | ||
| 89 | array->elem_size = elem_size; | 90 | array->elem_size = elem_size; |
| 90 | 91 | ||
| 91 | if (!percpu) | 92 | if (!percpu) |
| @@ -334,6 +335,26 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) | |||
| 334 | } | 335 | } |
| 335 | 336 | ||
| 336 | /* only called from syscall */ | 337 | /* only called from syscall */ |
| 338 | int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) | ||
| 339 | { | ||
| 340 | void **elem, *ptr; | ||
| 341 | int ret = 0; | ||
| 342 | |||
| 343 | if (!map->ops->map_fd_sys_lookup_elem) | ||
| 344 | return -ENOTSUPP; | ||
| 345 | |||
| 346 | rcu_read_lock(); | ||
| 347 | elem = array_map_lookup_elem(map, key); | ||
| 348 | if (elem && (ptr = READ_ONCE(*elem))) | ||
| 349 | *value = map->ops->map_fd_sys_lookup_elem(ptr); | ||
| 350 | else | ||
| 351 | ret = -ENOENT; | ||
| 352 | rcu_read_unlock(); | ||
| 353 | |||
| 354 | return ret; | ||
| 355 | } | ||
| 356 | |||
| 357 | /* only called from syscall */ | ||
| 337 | int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, | 358 | int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, |
| 338 | void *key, void *value, u64 map_flags) | 359 | void *key, void *value, u64 map_flags) |
| 339 | { | 360 | { |
| @@ -399,6 +420,11 @@ static void prog_fd_array_put_ptr(void *ptr) | |||
| 399 | bpf_prog_put(ptr); | 420 | bpf_prog_put(ptr); |
| 400 | } | 421 | } |
| 401 | 422 | ||
| 423 | static u32 prog_fd_array_sys_lookup_elem(void *ptr) | ||
| 424 | { | ||
| 425 | return ((struct bpf_prog *)ptr)->aux->id; | ||
| 426 | } | ||
| 427 | |||
| 402 | /* decrement refcnt of all bpf_progs that are stored in this map */ | 428 | /* decrement refcnt of all bpf_progs that are stored in this map */ |
| 403 | void bpf_fd_array_map_clear(struct bpf_map *map) | 429 | void bpf_fd_array_map_clear(struct bpf_map *map) |
| 404 | { | 430 | { |
| @@ -417,6 +443,7 @@ const struct bpf_map_ops prog_array_map_ops = { | |||
| 417 | .map_delete_elem = fd_array_map_delete_elem, | 443 | .map_delete_elem = fd_array_map_delete_elem, |
| 418 | .map_fd_get_ptr = prog_fd_array_get_ptr, | 444 | .map_fd_get_ptr = prog_fd_array_get_ptr, |
| 419 | .map_fd_put_ptr = prog_fd_array_put_ptr, | 445 | .map_fd_put_ptr = prog_fd_array_put_ptr, |
| 446 | .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, | ||
| 420 | }; | 447 | }; |
| 421 | 448 | ||
| 422 | static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, | 449 | static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, |
| @@ -451,38 +478,24 @@ static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) | |||
| 451 | static void *perf_event_fd_array_get_ptr(struct bpf_map *map, | 478 | static void *perf_event_fd_array_get_ptr(struct bpf_map *map, |
| 452 | struct file *map_file, int fd) | 479 | struct file *map_file, int fd) |
| 453 | { | 480 | { |
| 454 | const struct perf_event_attr *attr; | ||
| 455 | struct bpf_event_entry *ee; | 481 | struct bpf_event_entry *ee; |
| 456 | struct perf_event *event; | 482 | struct perf_event *event; |
| 457 | struct file *perf_file; | 483 | struct file *perf_file; |
| 484 | u64 value; | ||
| 458 | 485 | ||
| 459 | perf_file = perf_event_get(fd); | 486 | perf_file = perf_event_get(fd); |
| 460 | if (IS_ERR(perf_file)) | 487 | if (IS_ERR(perf_file)) |
| 461 | return perf_file; | 488 | return perf_file; |
| 462 | 489 | ||
| 490 | ee = ERR_PTR(-EOPNOTSUPP); | ||
| 463 | event = perf_file->private_data; | 491 | event = perf_file->private_data; |
| 464 | ee = ERR_PTR(-EINVAL); | 492 | if (perf_event_read_local(event, &value) == -EOPNOTSUPP) |
| 465 | |||
| 466 | attr = perf_event_attrs(event); | ||
| 467 | if (IS_ERR(attr) || attr->inherit) | ||
| 468 | goto err_out; | 493 | goto err_out; |
| 469 | 494 | ||
| 470 | switch (attr->type) { | 495 | ee = bpf_event_entry_gen(perf_file, map_file); |
| 471 | case PERF_TYPE_SOFTWARE: | 496 | if (ee) |
| 472 | if (attr->config != PERF_COUNT_SW_BPF_OUTPUT) | 497 | return ee; |
| 473 | goto err_out; | 498 | ee = ERR_PTR(-ENOMEM); |
| 474 | /* fall-through */ | ||
| 475 | case PERF_TYPE_RAW: | ||
| 476 | case PERF_TYPE_HARDWARE: | ||
| 477 | ee = bpf_event_entry_gen(perf_file, map_file); | ||
| 478 | if (ee) | ||
| 479 | return ee; | ||
| 480 | ee = ERR_PTR(-ENOMEM); | ||
| 481 | /* fall-through */ | ||
| 482 | default: | ||
| 483 | break; | ||
| 484 | } | ||
| 485 | |||
| 486 | err_out: | 499 | err_out: |
| 487 | fput(perf_file); | 500 | fput(perf_file); |
| 488 | return ee; | 501 | return ee; |
| @@ -598,4 +611,5 @@ const struct bpf_map_ops array_of_maps_map_ops = { | |||
| 598 | .map_delete_elem = fd_array_map_delete_elem, | 611 | .map_delete_elem = fd_array_map_delete_elem, |
| 599 | .map_fd_get_ptr = bpf_map_fd_get_ptr, | 612 | .map_fd_get_ptr = bpf_map_fd_get_ptr, |
| 600 | .map_fd_put_ptr = bpf_map_fd_put_ptr, | 613 | .map_fd_put_ptr = bpf_map_fd_put_ptr, |
| 614 | .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, | ||
| 601 | }; | 615 | }; |
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ea6033cba947..546113430049 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c | |||
| @@ -236,3 +236,40 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, | |||
| 236 | return ret; | 236 | return ret; |
| 237 | } | 237 | } |
| 238 | EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); | 238 | EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); |
| 239 | |||
| 240 | /** | ||
| 241 | * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock | ||
| 242 | * @sk: socket to get cgroup from | ||
| 243 | * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains | ||
| 244 | * sk with connection information (IP addresses, etc.) May not contain | ||
| 245 | * cgroup info if it is a req sock. | ||
| 246 | * @type: The type of program to be exectuted | ||
| 247 | * | ||
| 248 | * socket passed is expected to be of type INET or INET6. | ||
| 249 | * | ||
| 250 | * The program type passed in via @type must be suitable for sock_ops | ||
| 251 | * filtering. No further check is performed to assert that. | ||
| 252 | * | ||
| 253 | * This function will return %-EPERM if any if an attached program was found | ||
| 254 | * and if it returned != 1 during execution. In all other cases, 0 is returned. | ||
| 255 | */ | ||
| 256 | int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, | ||
| 257 | struct bpf_sock_ops_kern *sock_ops, | ||
| 258 | enum bpf_attach_type type) | ||
| 259 | { | ||
| 260 | struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); | ||
| 261 | struct bpf_prog *prog; | ||
| 262 | int ret = 0; | ||
| 263 | |||
| 264 | |||
| 265 | rcu_read_lock(); | ||
| 266 | |||
| 267 | prog = rcu_dereference(cgrp->bpf.effective[type]); | ||
| 268 | if (prog) | ||
| 269 | ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM; | ||
| 270 | |||
| 271 | rcu_read_unlock(); | ||
| 272 | |||
| 273 | return ret; | ||
| 274 | } | ||
| 275 | EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); | ||
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index dedf367f59bb..ad5f55922a13 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c | |||
| @@ -763,10 +763,10 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); | |||
| 763 | * | 763 | * |
| 764 | * Decode and execute eBPF instructions. | 764 | * Decode and execute eBPF instructions. |
| 765 | */ | 765 | */ |
| 766 | static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) | 766 | static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, |
| 767 | u64 *stack) | ||
| 767 | { | 768 | { |
| 768 | u64 stack[MAX_BPF_STACK / sizeof(u64)]; | 769 | u64 tmp; |
| 769 | u64 regs[MAX_BPF_REG], tmp; | ||
| 770 | static const void *jumptable[256] = { | 770 | static const void *jumptable[256] = { |
| 771 | [0 ... 255] = &&default_label, | 771 | [0 ... 255] = &&default_label, |
| 772 | /* Now overwrite non-defaults ... */ | 772 | /* Now overwrite non-defaults ... */ |
| @@ -824,7 +824,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) | |||
| 824 | [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, | 824 | [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, |
| 825 | /* Call instruction */ | 825 | /* Call instruction */ |
| 826 | [BPF_JMP | BPF_CALL] = &&JMP_CALL, | 826 | [BPF_JMP | BPF_CALL] = &&JMP_CALL, |
| 827 | [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL, | 827 | [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, |
| 828 | /* Jumps */ | 828 | /* Jumps */ |
| 829 | [BPF_JMP | BPF_JA] = &&JMP_JA, | 829 | [BPF_JMP | BPF_JA] = &&JMP_JA, |
| 830 | [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, | 830 | [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, |
| @@ -874,9 +874,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) | |||
| 874 | #define CONT ({ insn++; goto select_insn; }) | 874 | #define CONT ({ insn++; goto select_insn; }) |
| 875 | #define CONT_JMP ({ insn++; goto select_insn; }) | 875 | #define CONT_JMP ({ insn++; goto select_insn; }) |
| 876 | 876 | ||
| 877 | FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; | ||
| 878 | ARG1 = (u64) (unsigned long) ctx; | ||
| 879 | |||
| 880 | select_insn: | 877 | select_insn: |
| 881 | goto *jumptable[insn->code]; | 878 | goto *jumptable[insn->code]; |
| 882 | 879 | ||
| @@ -1219,7 +1216,39 @@ load_byte: | |||
| 1219 | WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); | 1216 | WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); |
| 1220 | return 0; | 1217 | return 0; |
| 1221 | } | 1218 | } |
| 1222 | STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */ | 1219 | STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ |
| 1220 | |||
| 1221 | #define PROG_NAME(stack_size) __bpf_prog_run##stack_size | ||
| 1222 | #define DEFINE_BPF_PROG_RUN(stack_size) \ | ||
| 1223 | static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ | ||
| 1224 | { \ | ||
| 1225 | u64 stack[stack_size / sizeof(u64)]; \ | ||
| 1226 | u64 regs[MAX_BPF_REG]; \ | ||
| 1227 | \ | ||
| 1228 | FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ | ||
| 1229 | ARG1 = (u64) (unsigned long) ctx; \ | ||
| 1230 | return ___bpf_prog_run(regs, insn, stack); \ | ||
| 1231 | } | ||
| 1232 | |||
| 1233 | #define EVAL1(FN, X) FN(X) | ||
| 1234 | #define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) | ||
| 1235 | #define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y) | ||
| 1236 | #define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y) | ||
| 1237 | #define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y) | ||
| 1238 | #define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y) | ||
| 1239 | |||
| 1240 | EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); | ||
| 1241 | EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); | ||
| 1242 | EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); | ||
| 1243 | |||
| 1244 | #define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), | ||
| 1245 | |||
| 1246 | static unsigned int (*interpreters[])(const void *ctx, | ||
| 1247 | const struct bpf_insn *insn) = { | ||
| 1248 | EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) | ||
| 1249 | EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) | ||
| 1250 | EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) | ||
| 1251 | }; | ||
| 1223 | 1252 | ||
| 1224 | bool bpf_prog_array_compatible(struct bpf_array *array, | 1253 | bool bpf_prog_array_compatible(struct bpf_array *array, |
| 1225 | const struct bpf_prog *fp) | 1254 | const struct bpf_prog *fp) |
| @@ -1268,7 +1297,9 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) | |||
| 1268 | */ | 1297 | */ |
| 1269 | struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) | 1298 | struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) |
| 1270 | { | 1299 | { |
| 1271 | fp->bpf_func = (void *) __bpf_prog_run; | 1300 | u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); |
| 1301 | |||
| 1302 | fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; | ||
| 1272 | 1303 | ||
| 1273 | /* eBPF JITs can rewrite the program in case constant | 1304 | /* eBPF JITs can rewrite the program in case constant |
| 1274 | * blinding is active. However, in case of error during | 1305 | * blinding is active. However, in case of error during |
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 004334ea13ba..4fb463172aa8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c | |||
| @@ -1244,6 +1244,26 @@ static void fd_htab_map_free(struct bpf_map *map) | |||
| 1244 | } | 1244 | } |
| 1245 | 1245 | ||
| 1246 | /* only called from syscall */ | 1246 | /* only called from syscall */ |
| 1247 | int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) | ||
| 1248 | { | ||
| 1249 | void **ptr; | ||
| 1250 | int ret = 0; | ||
| 1251 | |||
| 1252 | if (!map->ops->map_fd_sys_lookup_elem) | ||
| 1253 | return -ENOTSUPP; | ||
| 1254 | |||
| 1255 | rcu_read_lock(); | ||
| 1256 | ptr = htab_map_lookup_elem(map, key); | ||
| 1257 | if (ptr) | ||
| 1258 | *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); | ||
| 1259 | else | ||
| 1260 | ret = -ENOENT; | ||
| 1261 | rcu_read_unlock(); | ||
| 1262 | |||
| 1263 | return ret; | ||
| 1264 | } | ||
| 1265 | |||
| 1266 | /* only called from syscall */ | ||
| 1247 | int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, | 1267 | int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, |
| 1248 | void *key, void *value, u64 map_flags) | 1268 | void *key, void *value, u64 map_flags) |
| 1249 | { | 1269 | { |
| @@ -1305,4 +1325,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = { | |||
| 1305 | .map_delete_elem = htab_map_delete_elem, | 1325 | .map_delete_elem = htab_map_delete_elem, |
| 1306 | .map_fd_get_ptr = bpf_map_fd_get_ptr, | 1326 | .map_fd_get_ptr = bpf_map_fd_get_ptr, |
| 1307 | .map_fd_put_ptr = bpf_map_fd_put_ptr, | 1327 | .map_fd_put_ptr = bpf_map_fd_put_ptr, |
| 1328 | .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, | ||
| 1308 | }; | 1329 | }; |
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9bbd33497d3d..e833ed914358 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c | |||
| @@ -377,10 +377,22 @@ static void bpf_evict_inode(struct inode *inode) | |||
| 377 | bpf_any_put(inode->i_private, type); | 377 | bpf_any_put(inode->i_private, type); |
| 378 | } | 378 | } |
| 379 | 379 | ||
| 380 | /* | ||
| 381 | * Display the mount options in /proc/mounts. | ||
| 382 | */ | ||
| 383 | static int bpf_show_options(struct seq_file *m, struct dentry *root) | ||
| 384 | { | ||
| 385 | umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; | ||
| 386 | |||
| 387 | if (mode != S_IRWXUGO) | ||
| 388 | seq_printf(m, ",mode=%o", mode); | ||
| 389 | return 0; | ||
| 390 | } | ||
| 391 | |||
| 380 | static const struct super_operations bpf_super_ops = { | 392 | static const struct super_operations bpf_super_ops = { |
| 381 | .statfs = simple_statfs, | 393 | .statfs = simple_statfs, |
| 382 | .drop_inode = generic_delete_inode, | 394 | .drop_inode = generic_delete_inode, |
| 383 | .show_options = generic_show_options, | 395 | .show_options = bpf_show_options, |
| 384 | .evict_inode = bpf_evict_inode, | 396 | .evict_inode = bpf_evict_inode, |
| 385 | }; | 397 | }; |
| 386 | 398 | ||
| @@ -434,8 +446,6 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) | |||
| 434 | struct inode *inode; | 446 | struct inode *inode; |
| 435 | int ret; | 447 | int ret; |
| 436 | 448 | ||
| 437 | save_mount_options(sb, data); | ||
| 438 | |||
| 439 | ret = bpf_parse_options(data, &opts); | 449 | ret = bpf_parse_options(data, &opts); |
| 440 | if (ret) | 450 | if (ret) |
| 441 | return ret; | 451 | return ret; |
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 39cfafd895b8..b09185f0f17d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c | |||
| @@ -432,6 +432,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) | |||
| 432 | trie->map.key_size = attr->key_size; | 432 | trie->map.key_size = attr->key_size; |
| 433 | trie->map.value_size = attr->value_size; | 433 | trie->map.value_size = attr->value_size; |
| 434 | trie->map.max_entries = attr->max_entries; | 434 | trie->map.max_entries = attr->max_entries; |
| 435 | trie->map.map_flags = attr->map_flags; | ||
| 435 | trie->data_size = attr->key_size - | 436 | trie->data_size = attr->key_size - |
| 436 | offsetof(struct bpf_lpm_trie_key, data); | 437 | offsetof(struct bpf_lpm_trie_key, data); |
| 437 | trie->max_prefixlen = trie->data_size * 8; | 438 | trie->max_prefixlen = trie->data_size * 8; |
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 59bcdf821ae4..1da574612bea 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c | |||
| @@ -95,3 +95,8 @@ void bpf_map_fd_put_ptr(void *ptr) | |||
| 95 | */ | 95 | */ |
| 96 | bpf_map_put(ptr); | 96 | bpf_map_put(ptr); |
| 97 | } | 97 | } |
| 98 | |||
| 99 | u32 bpf_map_fd_sys_lookup_elem(void *ptr) | ||
| 100 | { | ||
| 101 | return ((struct bpf_map *)ptr)->id; | ||
| 102 | } | ||
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index 177fadb689dc..6183db9ec08c 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h | |||
| @@ -19,5 +19,6 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, | |||
| 19 | void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, | 19 | void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, |
| 20 | int ufd); | 20 | int ufd); |
| 21 | void bpf_map_fd_put_ptr(void *ptr); | 21 | void bpf_map_fd_put_ptr(void *ptr); |
| 22 | u32 bpf_map_fd_sys_lookup_elem(void *ptr); | ||
| 22 | 23 | ||
| 23 | #endif | 24 | #endif |
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4dfd6f2ec2f9..31147d730abf 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c | |||
| @@ -88,6 +88,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) | |||
| 88 | smap->map.key_size = attr->key_size; | 88 | smap->map.key_size = attr->key_size; |
| 89 | smap->map.value_size = value_size; | 89 | smap->map.value_size = value_size; |
| 90 | smap->map.max_entries = attr->max_entries; | 90 | smap->map.max_entries = attr->max_entries; |
| 91 | smap->map.map_flags = attr->map_flags; | ||
| 91 | smap->n_buckets = n_buckets; | 92 | smap->n_buckets = n_buckets; |
| 92 | smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; | 93 | smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; |
| 93 | 94 | ||
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 265a0d854e33..045646da97cc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c | |||
| @@ -22,8 +22,20 @@ | |||
| 22 | #include <linux/filter.h> | 22 | #include <linux/filter.h> |
| 23 | #include <linux/version.h> | 23 | #include <linux/version.h> |
| 24 | #include <linux/kernel.h> | 24 | #include <linux/kernel.h> |
| 25 | #include <linux/idr.h> | ||
| 26 | |||
| 27 | #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ | ||
| 28 | (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ | ||
| 29 | (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ | ||
| 30 | (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) | ||
| 31 | #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) | ||
| 32 | #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map)) | ||
| 25 | 33 | ||
| 26 | DEFINE_PER_CPU(int, bpf_prog_active); | 34 | DEFINE_PER_CPU(int, bpf_prog_active); |
| 35 | static DEFINE_IDR(prog_idr); | ||
| 36 | static DEFINE_SPINLOCK(prog_idr_lock); | ||
| 37 | static DEFINE_IDR(map_idr); | ||
| 38 | static DEFINE_SPINLOCK(map_idr_lock); | ||
| 27 | 39 | ||
| 28 | int sysctl_unprivileged_bpf_disabled __read_mostly; | 40 | int sysctl_unprivileged_bpf_disabled __read_mostly; |
| 29 | 41 | ||
| @@ -114,6 +126,37 @@ static void bpf_map_uncharge_memlock(struct bpf_map *map) | |||
| 114 | free_uid(user); | 126 | free_uid(user); |
| 115 | } | 127 | } |
| 116 | 128 | ||
| 129 | static int bpf_map_alloc_id(struct bpf_map *map) | ||
| 130 | { | ||
| 131 | int id; | ||
| 132 | |||
| 133 | spin_lock_bh(&map_idr_lock); | ||
| 134 | id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); | ||
| 135 | if (id > 0) | ||
| 136 | map->id = id; | ||
| 137 | spin_unlock_bh(&map_idr_lock); | ||
| 138 | |||
| 139 | if (WARN_ON_ONCE(!id)) | ||
| 140 | return -ENOSPC; | ||
| 141 | |||
| 142 | return id > 0 ? 0 : id; | ||
| 143 | } | ||
| 144 | |||
| 145 | static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) | ||
| 146 | { | ||
| 147 | if (do_idr_lock) | ||
| 148 | spin_lock_bh(&map_idr_lock); | ||
| 149 | else | ||
| 150 | __acquire(&map_idr_lock); | ||
| 151 | |||
| 152 | idr_remove(&map_idr, map->id); | ||
| 153 | |||
| 154 | if (do_idr_lock) | ||
| 155 | spin_unlock_bh(&map_idr_lock); | ||
| 156 | else | ||
| 157 | __release(&map_idr_lock); | ||
| 158 | } | ||
| 159 | |||
| 117 | /* called from workqueue */ | 160 | /* called from workqueue */ |
| 118 | static void bpf_map_free_deferred(struct work_struct *work) | 161 | static void bpf_map_free_deferred(struct work_struct *work) |
| 119 | { | 162 | { |
| @@ -135,14 +178,21 @@ static void bpf_map_put_uref(struct bpf_map *map) | |||
| 135 | /* decrement map refcnt and schedule it for freeing via workqueue | 178 | /* decrement map refcnt and schedule it for freeing via workqueue |
| 136 | * (unrelying map implementation ops->map_free() might sleep) | 179 | * (unrelying map implementation ops->map_free() might sleep) |
| 137 | */ | 180 | */ |
| 138 | void bpf_map_put(struct bpf_map *map) | 181 | static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) |
| 139 | { | 182 | { |
| 140 | if (atomic_dec_and_test(&map->refcnt)) { | 183 | if (atomic_dec_and_test(&map->refcnt)) { |
| 184 | /* bpf_map_free_id() must be called first */ | ||
| 185 | bpf_map_free_id(map, do_idr_lock); | ||
| 141 | INIT_WORK(&map->work, bpf_map_free_deferred); | 186 | INIT_WORK(&map->work, bpf_map_free_deferred); |
| 142 | schedule_work(&map->work); | 187 | schedule_work(&map->work); |
| 143 | } | 188 | } |
| 144 | } | 189 | } |
| 145 | 190 | ||
| 191 | void bpf_map_put(struct bpf_map *map) | ||
| 192 | { | ||
| 193 | __bpf_map_put(map, true); | ||
| 194 | } | ||
| 195 | |||
| 146 | void bpf_map_put_with_uref(struct bpf_map *map) | 196 | void bpf_map_put_with_uref(struct bpf_map *map) |
| 147 | { | 197 | { |
| 148 | bpf_map_put_uref(map); | 198 | bpf_map_put_uref(map); |
| @@ -166,10 +216,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) | |||
| 166 | const struct bpf_map *map = filp->private_data; | 216 | const struct bpf_map *map = filp->private_data; |
| 167 | const struct bpf_array *array; | 217 | const struct bpf_array *array; |
| 168 | u32 owner_prog_type = 0; | 218 | u32 owner_prog_type = 0; |
| 219 | u32 owner_jited = 0; | ||
| 169 | 220 | ||
| 170 | if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { | 221 | if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { |
| 171 | array = container_of(map, struct bpf_array, map); | 222 | array = container_of(map, struct bpf_array, map); |
| 172 | owner_prog_type = array->owner_prog_type; | 223 | owner_prog_type = array->owner_prog_type; |
| 224 | owner_jited = array->owner_jited; | ||
| 173 | } | 225 | } |
| 174 | 226 | ||
| 175 | seq_printf(m, | 227 | seq_printf(m, |
| @@ -186,9 +238,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) | |||
| 186 | map->map_flags, | 238 | map->map_flags, |
| 187 | map->pages * 1ULL << PAGE_SHIFT); | 239 | map->pages * 1ULL << PAGE_SHIFT); |
| 188 | 240 | ||
| 189 | if (owner_prog_type) | 241 | if (owner_prog_type) { |
| 190 | seq_printf(m, "owner_prog_type:\t%u\n", | 242 | seq_printf(m, "owner_prog_type:\t%u\n", |
| 191 | owner_prog_type); | 243 | owner_prog_type); |
| 244 | seq_printf(m, "owner_jited:\t%u\n", | ||
| 245 | owner_jited); | ||
| 246 | } | ||
| 192 | } | 247 | } |
| 193 | #endif | 248 | #endif |
| 194 | 249 | ||
| @@ -236,11 +291,22 @@ static int map_create(union bpf_attr *attr) | |||
| 236 | if (err) | 291 | if (err) |
| 237 | goto free_map_nouncharge; | 292 | goto free_map_nouncharge; |
| 238 | 293 | ||
| 239 | err = bpf_map_new_fd(map); | 294 | err = bpf_map_alloc_id(map); |
| 240 | if (err < 0) | 295 | if (err) |
| 241 | /* failed to allocate fd */ | ||
| 242 | goto free_map; | 296 | goto free_map; |
| 243 | 297 | ||
| 298 | err = bpf_map_new_fd(map); | ||
| 299 | if (err < 0) { | ||
| 300 | /* failed to allocate fd. | ||
| 301 | * bpf_map_put() is needed because the above | ||
| 302 | * bpf_map_alloc_id() has published the map | ||
| 303 | * to the userspace and the userspace may | ||
| 304 | * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. | ||
| 305 | */ | ||
| 306 | bpf_map_put(map); | ||
| 307 | return err; | ||
| 308 | } | ||
| 309 | |||
| 244 | trace_bpf_map_create(map, err); | 310 | trace_bpf_map_create(map, err); |
| 245 | return err; | 311 | return err; |
| 246 | 312 | ||
| @@ -295,6 +361,28 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) | |||
| 295 | return map; | 361 | return map; |
| 296 | } | 362 | } |
| 297 | 363 | ||
| 364 | /* map_idr_lock should have been held */ | ||
| 365 | static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, | ||
| 366 | bool uref) | ||
| 367 | { | ||
| 368 | int refold; | ||
| 369 | |||
| 370 | refold = __atomic_add_unless(&map->refcnt, 1, 0); | ||
| 371 | |||
| 372 | if (refold >= BPF_MAX_REFCNT) { | ||
| 373 | __bpf_map_put(map, false); | ||
| 374 | return ERR_PTR(-EBUSY); | ||
| 375 | } | ||
| 376 | |||
| 377 | if (!refold) | ||
| 378 | return ERR_PTR(-ENOENT); | ||
| 379 | |||
| 380 | if (uref) | ||
| 381 | atomic_inc(&map->usercnt); | ||
| 382 | |||
| 383 | return map; | ||
| 384 | } | ||
| 385 | |||
| 298 | int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) | 386 | int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) |
| 299 | { | 387 | { |
| 300 | return -ENOTSUPP; | 388 | return -ENOTSUPP; |
| @@ -322,19 +410,18 @@ static int map_lookup_elem(union bpf_attr *attr) | |||
| 322 | if (IS_ERR(map)) | 410 | if (IS_ERR(map)) |
| 323 | return PTR_ERR(map); | 411 | return PTR_ERR(map); |
| 324 | 412 | ||
| 325 | err = -ENOMEM; | 413 | key = memdup_user(ukey, map->key_size); |
| 326 | key = kmalloc(map->key_size, GFP_USER); | 414 | if (IS_ERR(key)) { |
| 327 | if (!key) | 415 | err = PTR_ERR(key); |
| 328 | goto err_put; | 416 | goto err_put; |
| 329 | 417 | } | |
| 330 | err = -EFAULT; | ||
| 331 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 332 | goto free_key; | ||
| 333 | 418 | ||
| 334 | if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || | 419 | if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || |
| 335 | map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || | 420 | map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || |
| 336 | map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) | 421 | map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) |
| 337 | value_size = round_up(map->value_size, 8) * num_possible_cpus(); | 422 | value_size = round_up(map->value_size, 8) * num_possible_cpus(); |
| 423 | else if (IS_FD_MAP(map)) | ||
| 424 | value_size = sizeof(u32); | ||
| 338 | else | 425 | else |
| 339 | value_size = map->value_size; | 426 | value_size = map->value_size; |
| 340 | 427 | ||
| @@ -350,9 +437,10 @@ static int map_lookup_elem(union bpf_attr *attr) | |||
| 350 | err = bpf_percpu_array_copy(map, key, value); | 437 | err = bpf_percpu_array_copy(map, key, value); |
| 351 | } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { | 438 | } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { |
| 352 | err = bpf_stackmap_copy(map, key, value); | 439 | err = bpf_stackmap_copy(map, key, value); |
| 353 | } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || | 440 | } else if (IS_FD_ARRAY(map)) { |
| 354 | map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { | 441 | err = bpf_fd_array_map_lookup_elem(map, key, value); |
| 355 | err = -ENOTSUPP; | 442 | } else if (IS_FD_HASH(map)) { |
| 443 | err = bpf_fd_htab_map_lookup_elem(map, key, value); | ||
| 356 | } else { | 444 | } else { |
| 357 | rcu_read_lock(); | 445 | rcu_read_lock(); |
| 358 | ptr = map->ops->map_lookup_elem(map, key); | 446 | ptr = map->ops->map_lookup_elem(map, key); |
| @@ -402,14 +490,11 @@ static int map_update_elem(union bpf_attr *attr) | |||
| 402 | if (IS_ERR(map)) | 490 | if (IS_ERR(map)) |
| 403 | return PTR_ERR(map); | 491 | return PTR_ERR(map); |
| 404 | 492 | ||
| 405 | err = -ENOMEM; | 493 | key = memdup_user(ukey, map->key_size); |
| 406 | key = kmalloc(map->key_size, GFP_USER); | 494 | if (IS_ERR(key)) { |
| 407 | if (!key) | 495 | err = PTR_ERR(key); |
| 408 | goto err_put; | 496 | goto err_put; |
| 409 | 497 | } | |
| 410 | err = -EFAULT; | ||
| 411 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 412 | goto free_key; | ||
| 413 | 498 | ||
| 414 | if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || | 499 | if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || |
| 415 | map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || | 500 | map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || |
| @@ -488,14 +573,11 @@ static int map_delete_elem(union bpf_attr *attr) | |||
| 488 | if (IS_ERR(map)) | 573 | if (IS_ERR(map)) |
| 489 | return PTR_ERR(map); | 574 | return PTR_ERR(map); |
| 490 | 575 | ||
| 491 | err = -ENOMEM; | 576 | key = memdup_user(ukey, map->key_size); |
| 492 | key = kmalloc(map->key_size, GFP_USER); | 577 | if (IS_ERR(key)) { |
| 493 | if (!key) | 578 | err = PTR_ERR(key); |
| 494 | goto err_put; | 579 | goto err_put; |
| 495 | 580 | } | |
| 496 | err = -EFAULT; | ||
| 497 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 498 | goto free_key; | ||
| 499 | 581 | ||
| 500 | preempt_disable(); | 582 | preempt_disable(); |
| 501 | __this_cpu_inc(bpf_prog_active); | 583 | __this_cpu_inc(bpf_prog_active); |
| @@ -507,7 +589,6 @@ static int map_delete_elem(union bpf_attr *attr) | |||
| 507 | 589 | ||
| 508 | if (!err) | 590 | if (!err) |
| 509 | trace_bpf_map_delete_elem(map, ufd, key); | 591 | trace_bpf_map_delete_elem(map, ufd, key); |
| 510 | free_key: | ||
| 511 | kfree(key); | 592 | kfree(key); |
| 512 | err_put: | 593 | err_put: |
| 513 | fdput(f); | 594 | fdput(f); |
| @@ -536,14 +617,11 @@ static int map_get_next_key(union bpf_attr *attr) | |||
| 536 | return PTR_ERR(map); | 617 | return PTR_ERR(map); |
| 537 | 618 | ||
| 538 | if (ukey) { | 619 | if (ukey) { |
| 539 | err = -ENOMEM; | 620 | key = memdup_user(ukey, map->key_size); |
| 540 | key = kmalloc(map->key_size, GFP_USER); | 621 | if (IS_ERR(key)) { |
| 541 | if (!key) | 622 | err = PTR_ERR(key); |
| 542 | goto err_put; | 623 | goto err_put; |
| 543 | 624 | } | |
| 544 | err = -EFAULT; | ||
| 545 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 546 | goto free_key; | ||
| 547 | } else { | 625 | } else { |
| 548 | key = NULL; | 626 | key = NULL; |
| 549 | } | 627 | } |
| @@ -650,6 +728,42 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) | |||
| 650 | free_uid(user); | 728 | free_uid(user); |
| 651 | } | 729 | } |
| 652 | 730 | ||
| 731 | static int bpf_prog_alloc_id(struct bpf_prog *prog) | ||
| 732 | { | ||
| 733 | int id; | ||
| 734 | |||
| 735 | spin_lock_bh(&prog_idr_lock); | ||
| 736 | id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); | ||
| 737 | if (id > 0) | ||
| 738 | prog->aux->id = id; | ||
| 739 | spin_unlock_bh(&prog_idr_lock); | ||
| 740 | |||
| 741 | /* id is in [1, INT_MAX) */ | ||
| 742 | if (WARN_ON_ONCE(!id)) | ||
| 743 | return -ENOSPC; | ||
| 744 | |||
| 745 | return id > 0 ? 0 : id; | ||
| 746 | } | ||
| 747 | |||
| 748 | static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) | ||
| 749 | { | ||
| 750 | /* cBPF to eBPF migrations are currently not in the idr store. */ | ||
| 751 | if (!prog->aux->id) | ||
| 752 | return; | ||
| 753 | |||
| 754 | if (do_idr_lock) | ||
| 755 | spin_lock_bh(&prog_idr_lock); | ||
| 756 | else | ||
| 757 | __acquire(&prog_idr_lock); | ||
| 758 | |||
| 759 | idr_remove(&prog_idr, prog->aux->id); | ||
| 760 | |||
| 761 | if (do_idr_lock) | ||
| 762 | spin_unlock_bh(&prog_idr_lock); | ||
| 763 | else | ||
| 764 | __release(&prog_idr_lock); | ||
| 765 | } | ||
| 766 | |||
| 653 | static void __bpf_prog_put_rcu(struct rcu_head *rcu) | 767 | static void __bpf_prog_put_rcu(struct rcu_head *rcu) |
| 654 | { | 768 | { |
| 655 | struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); | 769 | struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); |
| @@ -659,14 +773,21 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) | |||
| 659 | bpf_prog_free(aux->prog); | 773 | bpf_prog_free(aux->prog); |
| 660 | } | 774 | } |
| 661 | 775 | ||
| 662 | void bpf_prog_put(struct bpf_prog *prog) | 776 | static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) |
| 663 | { | 777 | { |
| 664 | if (atomic_dec_and_test(&prog->aux->refcnt)) { | 778 | if (atomic_dec_and_test(&prog->aux->refcnt)) { |
| 665 | trace_bpf_prog_put_rcu(prog); | 779 | trace_bpf_prog_put_rcu(prog); |
| 780 | /* bpf_prog_free_id() must be called first */ | ||
| 781 | bpf_prog_free_id(prog, do_idr_lock); | ||
| 666 | bpf_prog_kallsyms_del(prog); | 782 | bpf_prog_kallsyms_del(prog); |
| 667 | call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); | 783 | call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); |
| 668 | } | 784 | } |
| 669 | } | 785 | } |
| 786 | |||
| 787 | void bpf_prog_put(struct bpf_prog *prog) | ||
| 788 | { | ||
| 789 | __bpf_prog_put(prog, true); | ||
| 790 | } | ||
| 670 | EXPORT_SYMBOL_GPL(bpf_prog_put); | 791 | EXPORT_SYMBOL_GPL(bpf_prog_put); |
| 671 | 792 | ||
| 672 | static int bpf_prog_release(struct inode *inode, struct file *filp) | 793 | static int bpf_prog_release(struct inode *inode, struct file *filp) |
| @@ -748,6 +869,24 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) | |||
| 748 | } | 869 | } |
| 749 | EXPORT_SYMBOL_GPL(bpf_prog_inc); | 870 | EXPORT_SYMBOL_GPL(bpf_prog_inc); |
| 750 | 871 | ||
| 872 | /* prog_idr_lock should have been held */ | ||
| 873 | static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) | ||
| 874 | { | ||
| 875 | int refold; | ||
| 876 | |||
| 877 | refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0); | ||
| 878 | |||
| 879 | if (refold >= BPF_MAX_REFCNT) { | ||
| 880 | __bpf_prog_put(prog, false); | ||
| 881 | return ERR_PTR(-EBUSY); | ||
| 882 | } | ||
| 883 | |||
| 884 | if (!refold) | ||
| 885 | return ERR_PTR(-ENOENT); | ||
| 886 | |||
| 887 | return prog; | ||
| 888 | } | ||
| 889 | |||
| 751 | static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) | 890 | static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) |
| 752 | { | 891 | { |
| 753 | struct fd f = fdget(ufd); | 892 | struct fd f = fdget(ufd); |
| @@ -815,7 +954,9 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
| 815 | attr->kern_version != LINUX_VERSION_CODE) | 954 | attr->kern_version != LINUX_VERSION_CODE) |
| 816 | return -EINVAL; | 955 | return -EINVAL; |
| 817 | 956 | ||
| 818 | if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) | 957 | if (type != BPF_PROG_TYPE_SOCKET_FILTER && |
| 958 | type != BPF_PROG_TYPE_CGROUP_SKB && | ||
| 959 | !capable(CAP_SYS_ADMIN)) | ||
| 819 | return -EPERM; | 960 | return -EPERM; |
| 820 | 961 | ||
| 821 | /* plain bpf_prog allocation */ | 962 | /* plain bpf_prog allocation */ |
| @@ -855,11 +996,22 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
| 855 | if (err < 0) | 996 | if (err < 0) |
| 856 | goto free_used_maps; | 997 | goto free_used_maps; |
| 857 | 998 | ||
| 858 | err = bpf_prog_new_fd(prog); | 999 | err = bpf_prog_alloc_id(prog); |
| 859 | if (err < 0) | 1000 | if (err) |
| 860 | /* failed to allocate fd */ | ||
| 861 | goto free_used_maps; | 1001 | goto free_used_maps; |
| 862 | 1002 | ||
| 1003 | err = bpf_prog_new_fd(prog); | ||
| 1004 | if (err < 0) { | ||
| 1005 | /* failed to allocate fd. | ||
| 1006 | * bpf_prog_put() is needed because the above | ||
| 1007 | * bpf_prog_alloc_id() has published the prog | ||
| 1008 | * to the userspace and the userspace may | ||
| 1009 | * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID. | ||
| 1010 | */ | ||
| 1011 | bpf_prog_put(prog); | ||
| 1012 | return err; | ||
| 1013 | } | ||
| 1014 | |||
| 863 | bpf_prog_kallsyms_add(prog); | 1015 | bpf_prog_kallsyms_add(prog); |
| 864 | trace_bpf_prog_load(prog, err); | 1016 | trace_bpf_prog_load(prog, err); |
| 865 | return err; | 1017 | return err; |
| @@ -919,6 +1071,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) | |||
| 919 | case BPF_CGROUP_INET_SOCK_CREATE: | 1071 | case BPF_CGROUP_INET_SOCK_CREATE: |
| 920 | ptype = BPF_PROG_TYPE_CGROUP_SOCK; | 1072 | ptype = BPF_PROG_TYPE_CGROUP_SOCK; |
| 921 | break; | 1073 | break; |
| 1074 | case BPF_CGROUP_SOCK_OPS: | ||
| 1075 | ptype = BPF_PROG_TYPE_SOCK_OPS; | ||
| 1076 | break; | ||
| 922 | default: | 1077 | default: |
| 923 | return -EINVAL; | 1078 | return -EINVAL; |
| 924 | } | 1079 | } |
| @@ -959,6 +1114,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) | |||
| 959 | case BPF_CGROUP_INET_INGRESS: | 1114 | case BPF_CGROUP_INET_INGRESS: |
| 960 | case BPF_CGROUP_INET_EGRESS: | 1115 | case BPF_CGROUP_INET_EGRESS: |
| 961 | case BPF_CGROUP_INET_SOCK_CREATE: | 1116 | case BPF_CGROUP_INET_SOCK_CREATE: |
| 1117 | case BPF_CGROUP_SOCK_OPS: | ||
| 962 | cgrp = cgroup_get_from_fd(attr->target_fd); | 1118 | cgrp = cgroup_get_from_fd(attr->target_fd); |
| 963 | if (IS_ERR(cgrp)) | 1119 | if (IS_ERR(cgrp)) |
| 964 | return PTR_ERR(cgrp); | 1120 | return PTR_ERR(cgrp); |
| @@ -973,6 +1129,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) | |||
| 973 | 1129 | ||
| 974 | return ret; | 1130 | return ret; |
| 975 | } | 1131 | } |
| 1132 | |||
| 976 | #endif /* CONFIG_CGROUP_BPF */ | 1133 | #endif /* CONFIG_CGROUP_BPF */ |
| 977 | 1134 | ||
| 978 | #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration | 1135 | #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration |
| @@ -997,6 +1154,237 @@ static int bpf_prog_test_run(const union bpf_attr *attr, | |||
| 997 | return ret; | 1154 | return ret; |
| 998 | } | 1155 | } |
| 999 | 1156 | ||
| 1157 | #define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id | ||
| 1158 | |||
| 1159 | static int bpf_obj_get_next_id(const union bpf_attr *attr, | ||
| 1160 | union bpf_attr __user *uattr, | ||
| 1161 | struct idr *idr, | ||
| 1162 | spinlock_t *lock) | ||
| 1163 | { | ||
| 1164 | u32 next_id = attr->start_id; | ||
| 1165 | int err = 0; | ||
| 1166 | |||
| 1167 | if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) | ||
| 1168 | return -EINVAL; | ||
| 1169 | |||
| 1170 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1171 | return -EPERM; | ||
| 1172 | |||
| 1173 | next_id++; | ||
| 1174 | spin_lock_bh(lock); | ||
| 1175 | if (!idr_get_next(idr, &next_id)) | ||
| 1176 | err = -ENOENT; | ||
| 1177 | spin_unlock_bh(lock); | ||
| 1178 | |||
| 1179 | if (!err) | ||
| 1180 | err = put_user(next_id, &uattr->next_id); | ||
| 1181 | |||
| 1182 | return err; | ||
| 1183 | } | ||
| 1184 | |||
| 1185 | #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id | ||
| 1186 | |||
| 1187 | static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) | ||
| 1188 | { | ||
| 1189 | struct bpf_prog *prog; | ||
| 1190 | u32 id = attr->prog_id; | ||
| 1191 | int fd; | ||
| 1192 | |||
| 1193 | if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) | ||
| 1194 | return -EINVAL; | ||
| 1195 | |||
| 1196 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1197 | return -EPERM; | ||
| 1198 | |||
| 1199 | spin_lock_bh(&prog_idr_lock); | ||
| 1200 | prog = idr_find(&prog_idr, id); | ||
| 1201 | if (prog) | ||
| 1202 | prog = bpf_prog_inc_not_zero(prog); | ||
| 1203 | else | ||
| 1204 | prog = ERR_PTR(-ENOENT); | ||
| 1205 | spin_unlock_bh(&prog_idr_lock); | ||
| 1206 | |||
| 1207 | if (IS_ERR(prog)) | ||
| 1208 | return PTR_ERR(prog); | ||
| 1209 | |||
| 1210 | fd = bpf_prog_new_fd(prog); | ||
| 1211 | if (fd < 0) | ||
| 1212 | bpf_prog_put(prog); | ||
| 1213 | |||
| 1214 | return fd; | ||
| 1215 | } | ||
| 1216 | |||
| 1217 | #define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id | ||
| 1218 | |||
| 1219 | static int bpf_map_get_fd_by_id(const union bpf_attr *attr) | ||
| 1220 | { | ||
| 1221 | struct bpf_map *map; | ||
| 1222 | u32 id = attr->map_id; | ||
| 1223 | int fd; | ||
| 1224 | |||
| 1225 | if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID)) | ||
| 1226 | return -EINVAL; | ||
| 1227 | |||
| 1228 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1229 | return -EPERM; | ||
| 1230 | |||
| 1231 | spin_lock_bh(&map_idr_lock); | ||
| 1232 | map = idr_find(&map_idr, id); | ||
| 1233 | if (map) | ||
| 1234 | map = bpf_map_inc_not_zero(map, true); | ||
| 1235 | else | ||
| 1236 | map = ERR_PTR(-ENOENT); | ||
| 1237 | spin_unlock_bh(&map_idr_lock); | ||
| 1238 | |||
| 1239 | if (IS_ERR(map)) | ||
| 1240 | return PTR_ERR(map); | ||
| 1241 | |||
| 1242 | fd = bpf_map_new_fd(map); | ||
| 1243 | if (fd < 0) | ||
| 1244 | bpf_map_put(map); | ||
| 1245 | |||
| 1246 | return fd; | ||
| 1247 | } | ||
| 1248 | |||
| 1249 | static int check_uarg_tail_zero(void __user *uaddr, | ||
| 1250 | size_t expected_size, | ||
| 1251 | size_t actual_size) | ||
| 1252 | { | ||
| 1253 | unsigned char __user *addr; | ||
| 1254 | unsigned char __user *end; | ||
| 1255 | unsigned char val; | ||
| 1256 | int err; | ||
| 1257 | |||
| 1258 | if (actual_size <= expected_size) | ||
| 1259 | return 0; | ||
| 1260 | |||
| 1261 | addr = uaddr + expected_size; | ||
| 1262 | end = uaddr + actual_size; | ||
| 1263 | |||
| 1264 | for (; addr < end; addr++) { | ||
| 1265 | err = get_user(val, addr); | ||
| 1266 | if (err) | ||
| 1267 | return err; | ||
| 1268 | if (val) | ||
| 1269 | return -E2BIG; | ||
| 1270 | } | ||
| 1271 | |||
| 1272 | return 0; | ||
| 1273 | } | ||
| 1274 | |||
| 1275 | static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, | ||
| 1276 | const union bpf_attr *attr, | ||
| 1277 | union bpf_attr __user *uattr) | ||
| 1278 | { | ||
| 1279 | struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); | ||
| 1280 | struct bpf_prog_info info = {}; | ||
| 1281 | u32 info_len = attr->info.info_len; | ||
| 1282 | char __user *uinsns; | ||
| 1283 | u32 ulen; | ||
| 1284 | int err; | ||
| 1285 | |||
| 1286 | err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); | ||
| 1287 | if (err) | ||
| 1288 | return err; | ||
| 1289 | info_len = min_t(u32, sizeof(info), info_len); | ||
| 1290 | |||
| 1291 | if (copy_from_user(&info, uinfo, info_len)) | ||
| 1292 | return err; | ||
| 1293 | |||
| 1294 | info.type = prog->type; | ||
| 1295 | info.id = prog->aux->id; | ||
| 1296 | |||
| 1297 | memcpy(info.tag, prog->tag, sizeof(prog->tag)); | ||
| 1298 | |||
| 1299 | if (!capable(CAP_SYS_ADMIN)) { | ||
| 1300 | info.jited_prog_len = 0; | ||
| 1301 | info.xlated_prog_len = 0; | ||
| 1302 | goto done; | ||
| 1303 | } | ||
| 1304 | |||
| 1305 | ulen = info.jited_prog_len; | ||
| 1306 | info.jited_prog_len = prog->jited_len; | ||
| 1307 | if (info.jited_prog_len && ulen) { | ||
| 1308 | uinsns = u64_to_user_ptr(info.jited_prog_insns); | ||
| 1309 | ulen = min_t(u32, info.jited_prog_len, ulen); | ||
| 1310 | if (copy_to_user(uinsns, prog->bpf_func, ulen)) | ||
| 1311 | return -EFAULT; | ||
| 1312 | } | ||
| 1313 | |||
| 1314 | ulen = info.xlated_prog_len; | ||
| 1315 | info.xlated_prog_len = bpf_prog_size(prog->len); | ||
| 1316 | if (info.xlated_prog_len && ulen) { | ||
| 1317 | uinsns = u64_to_user_ptr(info.xlated_prog_insns); | ||
| 1318 | ulen = min_t(u32, info.xlated_prog_len, ulen); | ||
| 1319 | if (copy_to_user(uinsns, prog->insnsi, ulen)) | ||
| 1320 | return -EFAULT; | ||
| 1321 | } | ||
| 1322 | |||
| 1323 | done: | ||
| 1324 | if (copy_to_user(uinfo, &info, info_len) || | ||
| 1325 | put_user(info_len, &uattr->info.info_len)) | ||
| 1326 | return -EFAULT; | ||
| 1327 | |||
| 1328 | return 0; | ||
| 1329 | } | ||
| 1330 | |||
| 1331 | static int bpf_map_get_info_by_fd(struct bpf_map *map, | ||
| 1332 | const union bpf_attr *attr, | ||
| 1333 | union bpf_attr __user *uattr) | ||
| 1334 | { | ||
| 1335 | struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); | ||
| 1336 | struct bpf_map_info info = {}; | ||
| 1337 | u32 info_len = attr->info.info_len; | ||
| 1338 | int err; | ||
| 1339 | |||
| 1340 | err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); | ||
| 1341 | if (err) | ||
| 1342 | return err; | ||
| 1343 | info_len = min_t(u32, sizeof(info), info_len); | ||
| 1344 | |||
| 1345 | info.type = map->map_type; | ||
| 1346 | info.id = map->id; | ||
| 1347 | info.key_size = map->key_size; | ||
| 1348 | info.value_size = map->value_size; | ||
| 1349 | info.max_entries = map->max_entries; | ||
| 1350 | info.map_flags = map->map_flags; | ||
| 1351 | |||
| 1352 | if (copy_to_user(uinfo, &info, info_len) || | ||
| 1353 | put_user(info_len, &uattr->info.info_len)) | ||
| 1354 | return -EFAULT; | ||
| 1355 | |||
| 1356 | return 0; | ||
| 1357 | } | ||
| 1358 | |||
| 1359 | #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info | ||
| 1360 | |||
| 1361 | static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, | ||
| 1362 | union bpf_attr __user *uattr) | ||
| 1363 | { | ||
| 1364 | int ufd = attr->info.bpf_fd; | ||
| 1365 | struct fd f; | ||
| 1366 | int err; | ||
| 1367 | |||
| 1368 | if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) | ||
| 1369 | return -EINVAL; | ||
| 1370 | |||
| 1371 | f = fdget(ufd); | ||
| 1372 | if (!f.file) | ||
| 1373 | return -EBADFD; | ||
| 1374 | |||
| 1375 | if (f.file->f_op == &bpf_prog_fops) | ||
| 1376 | err = bpf_prog_get_info_by_fd(f.file->private_data, attr, | ||
| 1377 | uattr); | ||
| 1378 | else if (f.file->f_op == &bpf_map_fops) | ||
| 1379 | err = bpf_map_get_info_by_fd(f.file->private_data, attr, | ||
| 1380 | uattr); | ||
| 1381 | else | ||
| 1382 | err = -EINVAL; | ||
| 1383 | |||
| 1384 | fdput(f); | ||
| 1385 | return err; | ||
| 1386 | } | ||
| 1387 | |||
| 1000 | SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) | 1388 | SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) |
| 1001 | { | 1389 | { |
| 1002 | union bpf_attr attr = {}; | 1390 | union bpf_attr attr = {}; |
| @@ -1016,23 +1404,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz | |||
| 1016 | * user-space does not rely on any kernel feature | 1404 | * user-space does not rely on any kernel feature |
| 1017 | * extensions we dont know about yet. | 1405 | * extensions we dont know about yet. |
| 1018 | */ | 1406 | */ |
| 1019 | if (size > sizeof(attr)) { | 1407 | err = check_uarg_tail_zero(uattr, sizeof(attr), size); |
| 1020 | unsigned char __user *addr; | 1408 | if (err) |
| 1021 | unsigned char __user *end; | 1409 | return err; |
| 1022 | unsigned char val; | 1410 | size = min_t(u32, size, sizeof(attr)); |
| 1023 | |||
| 1024 | addr = (void __user *)uattr + sizeof(attr); | ||
| 1025 | end = (void __user *)uattr + size; | ||
| 1026 | |||
| 1027 | for (; addr < end; addr++) { | ||
| 1028 | err = get_user(val, addr); | ||
| 1029 | if (err) | ||
| 1030 | return err; | ||
| 1031 | if (val) | ||
| 1032 | return -E2BIG; | ||
| 1033 | } | ||
| 1034 | size = sizeof(attr); | ||
| 1035 | } | ||
| 1036 | 1411 | ||
| 1037 | /* copy attributes from user space, may be less than sizeof(bpf_attr) */ | 1412 | /* copy attributes from user space, may be less than sizeof(bpf_attr) */ |
| 1038 | if (copy_from_user(&attr, uattr, size) != 0) | 1413 | if (copy_from_user(&attr, uattr, size) != 0) |
| @@ -1074,6 +1449,23 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz | |||
| 1074 | case BPF_PROG_TEST_RUN: | 1449 | case BPF_PROG_TEST_RUN: |
| 1075 | err = bpf_prog_test_run(&attr, uattr); | 1450 | err = bpf_prog_test_run(&attr, uattr); |
| 1076 | break; | 1451 | break; |
| 1452 | case BPF_PROG_GET_NEXT_ID: | ||
| 1453 | err = bpf_obj_get_next_id(&attr, uattr, | ||
| 1454 | &prog_idr, &prog_idr_lock); | ||
| 1455 | break; | ||
| 1456 | case BPF_MAP_GET_NEXT_ID: | ||
| 1457 | err = bpf_obj_get_next_id(&attr, uattr, | ||
| 1458 | &map_idr, &map_idr_lock); | ||
| 1459 | break; | ||
| 1460 | case BPF_PROG_GET_FD_BY_ID: | ||
| 1461 | err = bpf_prog_get_fd_by_id(&attr); | ||
| 1462 | break; | ||
| 1463 | case BPF_MAP_GET_FD_BY_ID: | ||
| 1464 | err = bpf_map_get_fd_by_id(&attr); | ||
| 1465 | break; | ||
| 1466 | case BPF_OBJ_GET_INFO_BY_FD: | ||
| 1467 | err = bpf_obj_get_info_by_fd(&attr, uattr); | ||
| 1468 | break; | ||
| 1077 | default: | 1469 | default: |
| 1078 | err = -EINVAL; | 1470 | err = -EINVAL; |
| 1079 | break; | 1471 | break; |
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1eddb713b815..af9e84a4944e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c | |||
| @@ -463,19 +463,22 @@ static const int caller_saved[CALLER_SAVED_REGS] = { | |||
| 463 | BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 | 463 | BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 |
| 464 | }; | 464 | }; |
| 465 | 465 | ||
| 466 | static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) | ||
| 467 | { | ||
| 468 | BUG_ON(regno >= MAX_BPF_REG); | ||
| 469 | |||
| 470 | memset(®s[regno], 0, sizeof(regs[regno])); | ||
| 471 | regs[regno].type = NOT_INIT; | ||
| 472 | regs[regno].min_value = BPF_REGISTER_MIN_RANGE; | ||
| 473 | regs[regno].max_value = BPF_REGISTER_MAX_RANGE; | ||
| 474 | } | ||
| 475 | |||
| 466 | static void init_reg_state(struct bpf_reg_state *regs) | 476 | static void init_reg_state(struct bpf_reg_state *regs) |
| 467 | { | 477 | { |
| 468 | int i; | 478 | int i; |
| 469 | 479 | ||
| 470 | for (i = 0; i < MAX_BPF_REG; i++) { | 480 | for (i = 0; i < MAX_BPF_REG; i++) |
| 471 | regs[i].type = NOT_INIT; | 481 | mark_reg_not_init(regs, i); |
| 472 | regs[i].imm = 0; | ||
| 473 | regs[i].min_value = BPF_REGISTER_MIN_RANGE; | ||
| 474 | regs[i].max_value = BPF_REGISTER_MAX_RANGE; | ||
| 475 | regs[i].min_align = 0; | ||
| 476 | regs[i].aux_off = 0; | ||
| 477 | regs[i].aux_off_align = 0; | ||
| 478 | } | ||
| 479 | 482 | ||
| 480 | /* frame pointer */ | 483 | /* frame pointer */ |
| 481 | regs[BPF_REG_FP].type = FRAME_PTR; | 484 | regs[BPF_REG_FP].type = FRAME_PTR; |
| @@ -501,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) | |||
| 501 | { | 504 | { |
| 502 | regs[regno].min_value = BPF_REGISTER_MIN_RANGE; | 505 | regs[regno].min_value = BPF_REGISTER_MIN_RANGE; |
| 503 | regs[regno].max_value = BPF_REGISTER_MAX_RANGE; | 506 | regs[regno].max_value = BPF_REGISTER_MAX_RANGE; |
| 507 | regs[regno].value_from_signed = false; | ||
| 504 | regs[regno].min_align = 0; | 508 | regs[regno].min_align = 0; |
| 505 | } | 509 | } |
| 506 | 510 | ||
| @@ -543,20 +547,6 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, | |||
| 543 | return 0; | 547 | return 0; |
| 544 | } | 548 | } |
| 545 | 549 | ||
| 546 | static int bpf_size_to_bytes(int bpf_size) | ||
| 547 | { | ||
| 548 | if (bpf_size == BPF_W) | ||
| 549 | return 4; | ||
| 550 | else if (bpf_size == BPF_H) | ||
| 551 | return 2; | ||
| 552 | else if (bpf_size == BPF_B) | ||
| 553 | return 1; | ||
| 554 | else if (bpf_size == BPF_DW) | ||
| 555 | return 8; | ||
| 556 | else | ||
| 557 | return -EINVAL; | ||
| 558 | } | ||
| 559 | |||
| 560 | static bool is_spillable_regtype(enum bpf_reg_type type) | 550 | static bool is_spillable_regtype(enum bpf_reg_type type) |
| 561 | { | 551 | { |
| 562 | switch (type) { | 552 | switch (type) { |
| @@ -755,15 +745,29 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, | |||
| 755 | } | 745 | } |
| 756 | 746 | ||
| 757 | /* check access to 'struct bpf_context' fields */ | 747 | /* check access to 'struct bpf_context' fields */ |
| 758 | static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, | 748 | static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, |
| 759 | enum bpf_access_type t, enum bpf_reg_type *reg_type) | 749 | enum bpf_access_type t, enum bpf_reg_type *reg_type) |
| 760 | { | 750 | { |
| 751 | struct bpf_insn_access_aux info = { | ||
| 752 | .reg_type = *reg_type, | ||
| 753 | }; | ||
| 754 | |||
| 761 | /* for analyzer ctx accesses are already validated and converted */ | 755 | /* for analyzer ctx accesses are already validated and converted */ |
| 762 | if (env->analyzer_ops) | 756 | if (env->analyzer_ops) |
| 763 | return 0; | 757 | return 0; |
| 764 | 758 | ||
| 765 | if (env->prog->aux->ops->is_valid_access && | 759 | if (env->prog->aux->ops->is_valid_access && |
| 766 | env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { | 760 | env->prog->aux->ops->is_valid_access(off, size, t, &info)) { |
| 761 | /* A non zero info.ctx_field_size indicates that this field is a | ||
| 762 | * candidate for later verifier transformation to load the whole | ||
| 763 | * field and then apply a mask when accessed with a narrower | ||
| 764 | * access than actual ctx access size. A zero info.ctx_field_size | ||
| 765 | * will only allow for whole field access and rejects any other | ||
| 766 | * type of narrower access. | ||
| 767 | */ | ||
| 768 | env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; | ||
| 769 | *reg_type = info.reg_type; | ||
| 770 | |||
| 767 | /* remember the offset of last byte accessed in ctx */ | 771 | /* remember the offset of last byte accessed in ctx */ |
| 768 | if (env->prog->aux->max_ctx_offset < off + size) | 772 | if (env->prog->aux->max_ctx_offset < off + size) |
| 769 | env->prog->aux->max_ctx_offset = off + size; | 773 | env->prog->aux->max_ctx_offset = off + size; |
| @@ -774,12 +778,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, | |||
| 774 | return -EACCES; | 778 | return -EACCES; |
| 775 | } | 779 | } |
| 776 | 780 | ||
| 777 | static bool is_pointer_value(struct bpf_verifier_env *env, int regno) | 781 | static bool __is_pointer_value(bool allow_ptr_leaks, |
| 782 | const struct bpf_reg_state *reg) | ||
| 778 | { | 783 | { |
| 779 | if (env->allow_ptr_leaks) | 784 | if (allow_ptr_leaks) |
| 780 | return false; | 785 | return false; |
| 781 | 786 | ||
| 782 | switch (env->cur_state.regs[regno].type) { | 787 | switch (reg->type) { |
| 783 | case UNKNOWN_VALUE: | 788 | case UNKNOWN_VALUE: |
| 784 | case CONST_IMM: | 789 | case CONST_IMM: |
| 785 | return false; | 790 | return false; |
| @@ -788,6 +793,11 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) | |||
| 788 | } | 793 | } |
| 789 | } | 794 | } |
| 790 | 795 | ||
| 796 | static bool is_pointer_value(struct bpf_verifier_env *env, int regno) | ||
| 797 | { | ||
| 798 | return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); | ||
| 799 | } | ||
| 800 | |||
| 791 | static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, | 801 | static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, |
| 792 | int off, int size, bool strict) | 802 | int off, int size, bool strict) |
| 793 | { | 803 | { |
| @@ -808,11 +818,15 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, | |||
| 808 | reg_off += reg->aux_off; | 818 | reg_off += reg->aux_off; |
| 809 | } | 819 | } |
| 810 | 820 | ||
| 811 | /* skb->data is NET_IP_ALIGN-ed, but for strict alignment checking | 821 | /* For platforms that do not have a Kconfig enabling |
| 812 | * we force this to 2 which is universally what architectures use | 822 | * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of |
| 813 | * when they don't set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. | 823 | * NET_IP_ALIGN is universally set to '2'. And on platforms |
| 824 | * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get | ||
| 825 | * to this code only in strict mode where we want to emulate | ||
| 826 | * the NET_IP_ALIGN==2 checking. Therefore use an | ||
| 827 | * unconditional IP align value of '2'. | ||
| 814 | */ | 828 | */ |
| 815 | ip_align = strict ? 2 : NET_IP_ALIGN; | 829 | ip_align = 2; |
| 816 | if ((ip_align + reg_off + off) % size != 0) { | 830 | if ((ip_align + reg_off + off) % size != 0) { |
| 817 | verbose("misaligned packet access off %d+%d+%d size %d\n", | 831 | verbose("misaligned packet access off %d+%d+%d size %d\n", |
| 818 | ip_align, reg_off, off, size); | 832 | ip_align, reg_off, off, size); |
| @@ -839,9 +853,6 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, | |||
| 839 | { | 853 | { |
| 840 | bool strict = env->strict_alignment; | 854 | bool strict = env->strict_alignment; |
| 841 | 855 | ||
| 842 | if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) | ||
| 843 | strict = true; | ||
| 844 | |||
| 845 | switch (reg->type) { | 856 | switch (reg->type) { |
| 846 | case PTR_TO_PACKET: | 857 | case PTR_TO_PACKET: |
| 847 | return check_pkt_ptr_alignment(reg, off, size, strict); | 858 | return check_pkt_ptr_alignment(reg, off, size, strict); |
| @@ -864,7 +875,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, | |||
| 864 | * if t==write && value_regno==-1, some unknown value is stored into memory | 875 | * if t==write && value_regno==-1, some unknown value is stored into memory |
| 865 | * if t==read && value_regno==-1, don't care what we read from memory | 876 | * if t==read && value_regno==-1, don't care what we read from memory |
| 866 | */ | 877 | */ |
| 867 | static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, | 878 | static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, |
| 868 | int bpf_size, enum bpf_access_type t, | 879 | int bpf_size, enum bpf_access_type t, |
| 869 | int value_regno) | 880 | int value_regno) |
| 870 | { | 881 | { |
| @@ -907,7 +918,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, | |||
| 907 | verbose("R%d leaks addr into ctx\n", value_regno); | 918 | verbose("R%d leaks addr into ctx\n", value_regno); |
| 908 | return -EACCES; | 919 | return -EACCES; |
| 909 | } | 920 | } |
| 910 | err = check_ctx_access(env, off, size, t, ®_type); | 921 | err = check_ctx_access(env, insn_idx, off, size, t, ®_type); |
| 911 | if (!err && t == BPF_READ && value_regno >= 0) { | 922 | if (!err && t == BPF_READ && value_regno >= 0) { |
| 912 | mark_reg_unknown_value_and_range(state->regs, | 923 | mark_reg_unknown_value_and_range(state->regs, |
| 913 | value_regno); | 924 | value_regno); |
| @@ -922,6 +933,10 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, | |||
| 922 | verbose("invalid stack off=%d size=%d\n", off, size); | 933 | verbose("invalid stack off=%d size=%d\n", off, size); |
| 923 | return -EACCES; | 934 | return -EACCES; |
| 924 | } | 935 | } |
| 936 | |||
| 937 | if (env->prog->aux->stack_depth < -off) | ||
| 938 | env->prog->aux->stack_depth = -off; | ||
| 939 | |||
| 925 | if (t == BPF_WRITE) { | 940 | if (t == BPF_WRITE) { |
| 926 | if (!env->allow_ptr_leaks && | 941 | if (!env->allow_ptr_leaks && |
| 927 | state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && | 942 | state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && |
| @@ -964,7 +979,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, | |||
| 964 | return err; | 979 | return err; |
| 965 | } | 980 | } |
| 966 | 981 | ||
| 967 | static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) | 982 | static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) |
| 968 | { | 983 | { |
| 969 | struct bpf_reg_state *regs = env->cur_state.regs; | 984 | struct bpf_reg_state *regs = env->cur_state.regs; |
| 970 | int err; | 985 | int err; |
| @@ -985,14 +1000,19 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
| 985 | if (err) | 1000 | if (err) |
| 986 | return err; | 1001 | return err; |
| 987 | 1002 | ||
| 1003 | if (is_pointer_value(env, insn->src_reg)) { | ||
| 1004 | verbose("R%d leaks addr into mem\n", insn->src_reg); | ||
| 1005 | return -EACCES; | ||
| 1006 | } | ||
| 1007 | |||
| 988 | /* check whether atomic_add can read the memory */ | 1008 | /* check whether atomic_add can read the memory */ |
| 989 | err = check_mem_access(env, insn->dst_reg, insn->off, | 1009 | err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, |
| 990 | BPF_SIZE(insn->code), BPF_READ, -1); | 1010 | BPF_SIZE(insn->code), BPF_READ, -1); |
| 991 | if (err) | 1011 | if (err) |
| 992 | return err; | 1012 | return err; |
| 993 | 1013 | ||
| 994 | /* check whether atomic_add can write into the same memory */ | 1014 | /* check whether atomic_add can write into the same memory */ |
| 995 | return check_mem_access(env, insn->dst_reg, insn->off, | 1015 | return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, |
| 996 | BPF_SIZE(insn->code), BPF_WRITE, -1); | 1016 | BPF_SIZE(insn->code), BPF_WRITE, -1); |
| 997 | } | 1017 | } |
| 998 | 1018 | ||
| @@ -1028,6 +1048,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, | |||
| 1028 | return -EACCES; | 1048 | return -EACCES; |
| 1029 | } | 1049 | } |
| 1030 | 1050 | ||
| 1051 | if (env->prog->aux->stack_depth < -off) | ||
| 1052 | env->prog->aux->stack_depth = -off; | ||
| 1053 | |||
| 1031 | if (meta && meta->raw_mode) { | 1054 | if (meta && meta->raw_mode) { |
| 1032 | meta->access_size = access_size; | 1055 | meta->access_size = access_size; |
| 1033 | meta->regno = regno; | 1056 | meta->regno = regno; |
| @@ -1335,8 +1358,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) | |||
| 1335 | if (reg->type != PTR_TO_PACKET && | 1358 | if (reg->type != PTR_TO_PACKET && |
| 1336 | reg->type != PTR_TO_PACKET_END) | 1359 | reg->type != PTR_TO_PACKET_END) |
| 1337 | continue; | 1360 | continue; |
| 1338 | reg->type = UNKNOWN_VALUE; | 1361 | __mark_reg_unknown_value(state->spilled_regs, |
| 1339 | reg->imm = 0; | 1362 | i / BPF_REG_SIZE); |
| 1340 | } | 1363 | } |
| 1341 | } | 1364 | } |
| 1342 | 1365 | ||
| @@ -1345,7 +1368,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) | |||
| 1345 | struct bpf_verifier_state *state = &env->cur_state; | 1368 | struct bpf_verifier_state *state = &env->cur_state; |
| 1346 | const struct bpf_func_proto *fn = NULL; | 1369 | const struct bpf_func_proto *fn = NULL; |
| 1347 | struct bpf_reg_state *regs = state->regs; | 1370 | struct bpf_reg_state *regs = state->regs; |
| 1348 | struct bpf_reg_state *reg; | ||
| 1349 | struct bpf_call_arg_meta meta; | 1371 | struct bpf_call_arg_meta meta; |
| 1350 | bool changes_data; | 1372 | bool changes_data; |
| 1351 | int i, err; | 1373 | int i, err; |
| @@ -1406,17 +1428,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) | |||
| 1406 | * is inferred from register state. | 1428 | * is inferred from register state. |
| 1407 | */ | 1429 | */ |
| 1408 | for (i = 0; i < meta.access_size; i++) { | 1430 | for (i = 0; i < meta.access_size; i++) { |
| 1409 | err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1); | 1431 | err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1); |
| 1410 | if (err) | 1432 | if (err) |
| 1411 | return err; | 1433 | return err; |
| 1412 | } | 1434 | } |
| 1413 | 1435 | ||
| 1414 | /* reset caller saved regs */ | 1436 | /* reset caller saved regs */ |
| 1415 | for (i = 0; i < CALLER_SAVED_REGS; i++) { | 1437 | for (i = 0; i < CALLER_SAVED_REGS; i++) |
| 1416 | reg = regs + caller_saved[i]; | 1438 | mark_reg_not_init(regs, caller_saved[i]); |
| 1417 | reg->type = NOT_INIT; | ||
| 1418 | reg->imm = 0; | ||
| 1419 | } | ||
| 1420 | 1439 | ||
| 1421 | /* update return register */ | 1440 | /* update return register */ |
| 1422 | if (fn->ret_type == RET_INTEGER) { | 1441 | if (fn->ret_type == RET_INTEGER) { |
| @@ -1645,6 +1664,65 @@ static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
| 1645 | return 0; | 1664 | return 0; |
| 1646 | } | 1665 | } |
| 1647 | 1666 | ||
| 1667 | static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env, | ||
| 1668 | struct bpf_insn *insn) | ||
| 1669 | { | ||
| 1670 | struct bpf_reg_state *regs = env->cur_state.regs; | ||
| 1671 | struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; | ||
| 1672 | struct bpf_reg_state *src_reg = ®s[insn->src_reg]; | ||
| 1673 | u8 opcode = BPF_OP(insn->code); | ||
| 1674 | s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm); | ||
| 1675 | |||
| 1676 | /* BPF_X code with src_reg->type UNKNOWN_VALUE here. */ | ||
| 1677 | if (src_reg->imm > 0 && dst_reg->imm) { | ||
| 1678 | switch (opcode) { | ||
| 1679 | case BPF_ADD: | ||
| 1680 | /* dreg += sreg | ||
| 1681 | * where both have zero upper bits. Adding them | ||
| 1682 | * can only result making one more bit non-zero | ||
| 1683 | * in the larger value. | ||
| 1684 | * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47) | ||
| 1685 | * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47) | ||
| 1686 | */ | ||
| 1687 | dst_reg->imm = min(src_reg->imm, 63 - imm_log2); | ||
| 1688 | dst_reg->imm--; | ||
| 1689 | break; | ||
| 1690 | case BPF_AND: | ||
| 1691 | /* dreg &= sreg | ||
| 1692 | * AND can not extend zero bits only shrink | ||
| 1693 | * Ex. 0x00..00ffffff | ||
| 1694 | * & 0x0f..ffffffff | ||
| 1695 | * ---------------- | ||
| 1696 | * 0x00..00ffffff | ||
| 1697 | */ | ||
| 1698 | dst_reg->imm = max(src_reg->imm, 63 - imm_log2); | ||
| 1699 | break; | ||
| 1700 | case BPF_OR: | ||
| 1701 | /* dreg |= sreg | ||
| 1702 | * OR can only extend zero bits | ||
| 1703 | * Ex. 0x00..00ffffff | ||
| 1704 | * | 0x0f..ffffffff | ||
| 1705 | * ---------------- | ||
| 1706 | * 0x0f..00ffffff | ||
| 1707 | */ | ||
| 1708 | dst_reg->imm = min(src_reg->imm, 63 - imm_log2); | ||
| 1709 | break; | ||
| 1710 | case BPF_SUB: | ||
| 1711 | case BPF_MUL: | ||
| 1712 | case BPF_RSH: | ||
| 1713 | case BPF_LSH: | ||
| 1714 | /* These may be flushed out later */ | ||
| 1715 | default: | ||
| 1716 | mark_reg_unknown_value(regs, insn->dst_reg); | ||
| 1717 | } | ||
| 1718 | } else { | ||
| 1719 | mark_reg_unknown_value(regs, insn->dst_reg); | ||
| 1720 | } | ||
| 1721 | |||
| 1722 | dst_reg->type = UNKNOWN_VALUE; | ||
| 1723 | return 0; | ||
| 1724 | } | ||
| 1725 | |||
| 1648 | static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, | 1726 | static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, |
| 1649 | struct bpf_insn *insn) | 1727 | struct bpf_insn *insn) |
| 1650 | { | 1728 | { |
| @@ -1654,6 +1732,9 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, | |||
| 1654 | u8 opcode = BPF_OP(insn->code); | 1732 | u8 opcode = BPF_OP(insn->code); |
| 1655 | u64 dst_imm = dst_reg->imm; | 1733 | u64 dst_imm = dst_reg->imm; |
| 1656 | 1734 | ||
| 1735 | if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE) | ||
| 1736 | return evaluate_reg_imm_alu_unknown(env, insn); | ||
| 1737 | |||
| 1657 | /* dst_reg->type == CONST_IMM here. Simulate execution of insns | 1738 | /* dst_reg->type == CONST_IMM here. Simulate execution of insns |
| 1658 | * containing ALU ops. Don't care about overflow or negative | 1739 | * containing ALU ops. Don't care about overflow or negative |
| 1659 | * values, just add/sub/... them; registers are in u64. | 1740 | * values, just add/sub/... them; registers are in u64. |
| @@ -1758,10 +1839,24 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, | |||
| 1758 | dst_align = dst_reg->min_align; | 1839 | dst_align = dst_reg->min_align; |
| 1759 | 1840 | ||
| 1760 | /* We don't know anything about what was done to this register, mark it | 1841 | /* We don't know anything about what was done to this register, mark it |
| 1761 | * as unknown. | 1842 | * as unknown. Also, if both derived bounds came from signed/unsigned |
| 1843 | * mixed compares and one side is unbounded, we cannot really do anything | ||
| 1844 | * with them as boundaries cannot be trusted. Thus, arithmetic of two | ||
| 1845 | * regs of such kind will get invalidated bounds on the dst side. | ||
| 1762 | */ | 1846 | */ |
| 1763 | if (min_val == BPF_REGISTER_MIN_RANGE && | 1847 | if ((min_val == BPF_REGISTER_MIN_RANGE && |
| 1764 | max_val == BPF_REGISTER_MAX_RANGE) { | 1848 | max_val == BPF_REGISTER_MAX_RANGE) || |
| 1849 | (BPF_SRC(insn->code) == BPF_X && | ||
| 1850 | ((min_val != BPF_REGISTER_MIN_RANGE && | ||
| 1851 | max_val == BPF_REGISTER_MAX_RANGE) || | ||
| 1852 | (min_val == BPF_REGISTER_MIN_RANGE && | ||
| 1853 | max_val != BPF_REGISTER_MAX_RANGE) || | ||
| 1854 | (dst_reg->min_value != BPF_REGISTER_MIN_RANGE && | ||
| 1855 | dst_reg->max_value == BPF_REGISTER_MAX_RANGE) || | ||
| 1856 | (dst_reg->min_value == BPF_REGISTER_MIN_RANGE && | ||
| 1857 | dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) && | ||
| 1858 | regs[insn->dst_reg].value_from_signed != | ||
| 1859 | regs[insn->src_reg].value_from_signed)) { | ||
| 1765 | reset_reg_range_values(regs, insn->dst_reg); | 1860 | reset_reg_range_values(regs, insn->dst_reg); |
| 1766 | return; | 1861 | return; |
| 1767 | } | 1862 | } |
| @@ -1945,9 +2040,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
| 1945 | */ | 2040 | */ |
| 1946 | regs[insn->dst_reg].type = CONST_IMM; | 2041 | regs[insn->dst_reg].type = CONST_IMM; |
| 1947 | regs[insn->dst_reg].imm = insn->imm; | 2042 | regs[insn->dst_reg].imm = insn->imm; |
| 2043 | regs[insn->dst_reg].id = 0; | ||
| 1948 | regs[insn->dst_reg].max_value = insn->imm; | 2044 | regs[insn->dst_reg].max_value = insn->imm; |
| 1949 | regs[insn->dst_reg].min_value = insn->imm; | 2045 | regs[insn->dst_reg].min_value = insn->imm; |
| 1950 | regs[insn->dst_reg].min_align = calc_align(insn->imm); | 2046 | regs[insn->dst_reg].min_align = calc_align(insn->imm); |
| 2047 | regs[insn->dst_reg].value_from_signed = false; | ||
| 1951 | } | 2048 | } |
| 1952 | 2049 | ||
| 1953 | } else if (opcode > BPF_END) { | 2050 | } else if (opcode > BPF_END) { |
| @@ -2123,40 +2220,63 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, | |||
| 2123 | struct bpf_reg_state *false_reg, u64 val, | 2220 | struct bpf_reg_state *false_reg, u64 val, |
| 2124 | u8 opcode) | 2221 | u8 opcode) |
| 2125 | { | 2222 | { |
| 2223 | bool value_from_signed = true; | ||
| 2224 | bool is_range = true; | ||
| 2225 | |||
| 2126 | switch (opcode) { | 2226 | switch (opcode) { |
| 2127 | case BPF_JEQ: | 2227 | case BPF_JEQ: |
| 2128 | /* If this is false then we know nothing Jon Snow, but if it is | 2228 | /* If this is false then we know nothing Jon Snow, but if it is |
| 2129 | * true then we know for sure. | 2229 | * true then we know for sure. |
| 2130 | */ | 2230 | */ |
| 2131 | true_reg->max_value = true_reg->min_value = val; | 2231 | true_reg->max_value = true_reg->min_value = val; |
| 2232 | is_range = false; | ||
| 2132 | break; | 2233 | break; |
| 2133 | case BPF_JNE: | 2234 | case BPF_JNE: |
| 2134 | /* If this is true we know nothing Jon Snow, but if it is false | 2235 | /* If this is true we know nothing Jon Snow, but if it is false |
| 2135 | * we know the value for sure; | 2236 | * we know the value for sure; |
| 2136 | */ | 2237 | */ |
| 2137 | false_reg->max_value = false_reg->min_value = val; | 2238 | false_reg->max_value = false_reg->min_value = val; |
| 2239 | is_range = false; | ||
| 2138 | break; | 2240 | break; |
| 2139 | case BPF_JGT: | 2241 | case BPF_JGT: |
| 2140 | /* Unsigned comparison, the minimum value is 0. */ | 2242 | value_from_signed = false; |
| 2141 | false_reg->min_value = 0; | ||
| 2142 | /* fallthrough */ | 2243 | /* fallthrough */ |
| 2143 | case BPF_JSGT: | 2244 | case BPF_JSGT: |
| 2245 | if (true_reg->value_from_signed != value_from_signed) | ||
| 2246 | reset_reg_range_values(true_reg, 0); | ||
| 2247 | if (false_reg->value_from_signed != value_from_signed) | ||
| 2248 | reset_reg_range_values(false_reg, 0); | ||
| 2249 | if (opcode == BPF_JGT) { | ||
| 2250 | /* Unsigned comparison, the minimum value is 0. */ | ||
| 2251 | false_reg->min_value = 0; | ||
| 2252 | } | ||
| 2144 | /* If this is false then we know the maximum val is val, | 2253 | /* If this is false then we know the maximum val is val, |
| 2145 | * otherwise we know the min val is val+1. | 2254 | * otherwise we know the min val is val+1. |
| 2146 | */ | 2255 | */ |
| 2147 | false_reg->max_value = val; | 2256 | false_reg->max_value = val; |
| 2257 | false_reg->value_from_signed = value_from_signed; | ||
| 2148 | true_reg->min_value = val + 1; | 2258 | true_reg->min_value = val + 1; |
| 2259 | true_reg->value_from_signed = value_from_signed; | ||
| 2149 | break; | 2260 | break; |
| 2150 | case BPF_JGE: | 2261 | case BPF_JGE: |
| 2151 | /* Unsigned comparison, the minimum value is 0. */ | 2262 | value_from_signed = false; |
| 2152 | false_reg->min_value = 0; | ||
| 2153 | /* fallthrough */ | 2263 | /* fallthrough */ |
| 2154 | case BPF_JSGE: | 2264 | case BPF_JSGE: |
| 2265 | if (true_reg->value_from_signed != value_from_signed) | ||
| 2266 | reset_reg_range_values(true_reg, 0); | ||
| 2267 | if (false_reg->value_from_signed != value_from_signed) | ||
| 2268 | reset_reg_range_values(false_reg, 0); | ||
| 2269 | if (opcode == BPF_JGE) { | ||
| 2270 | /* Unsigned comparison, the minimum value is 0. */ | ||
| 2271 | false_reg->min_value = 0; | ||
| 2272 | } | ||
| 2155 | /* If this is false then we know the maximum value is val - 1, | 2273 | /* If this is false then we know the maximum value is val - 1, |
| 2156 | * otherwise we know the mimimum value is val. | 2274 | * otherwise we know the mimimum value is val. |
| 2157 | */ | 2275 | */ |
| 2158 | false_reg->max_value = val - 1; | 2276 | false_reg->max_value = val - 1; |
| 2277 | false_reg->value_from_signed = value_from_signed; | ||
| 2159 | true_reg->min_value = val; | 2278 | true_reg->min_value = val; |
| 2279 | true_reg->value_from_signed = value_from_signed; | ||
| 2160 | break; | 2280 | break; |
| 2161 | default: | 2281 | default: |
| 2162 | break; | 2282 | break; |
| @@ -2164,6 +2284,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, | |||
| 2164 | 2284 | ||
| 2165 | check_reg_overflow(false_reg); | 2285 | check_reg_overflow(false_reg); |
| 2166 | check_reg_overflow(true_reg); | 2286 | check_reg_overflow(true_reg); |
| 2287 | if (is_range) { | ||
| 2288 | if (__is_pointer_value(false, false_reg)) | ||
| 2289 | reset_reg_range_values(false_reg, 0); | ||
| 2290 | if (__is_pointer_value(false, true_reg)) | ||
| 2291 | reset_reg_range_values(true_reg, 0); | ||
| 2292 | } | ||
| 2167 | } | 2293 | } |
| 2168 | 2294 | ||
| 2169 | /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg | 2295 | /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg |
| @@ -2173,41 +2299,64 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, | |||
| 2173 | struct bpf_reg_state *false_reg, u64 val, | 2299 | struct bpf_reg_state *false_reg, u64 val, |
| 2174 | u8 opcode) | 2300 | u8 opcode) |
| 2175 | { | 2301 | { |
| 2302 | bool value_from_signed = true; | ||
| 2303 | bool is_range = true; | ||
| 2304 | |||
| 2176 | switch (opcode) { | 2305 | switch (opcode) { |
| 2177 | case BPF_JEQ: | 2306 | case BPF_JEQ: |
| 2178 | /* If this is false then we know nothing Jon Snow, but if it is | 2307 | /* If this is false then we know nothing Jon Snow, but if it is |
| 2179 | * true then we know for sure. | 2308 | * true then we know for sure. |
| 2180 | */ | 2309 | */ |
| 2181 | true_reg->max_value = true_reg->min_value = val; | 2310 | true_reg->max_value = true_reg->min_value = val; |
| 2311 | is_range = false; | ||
| 2182 | break; | 2312 | break; |
| 2183 | case BPF_JNE: | 2313 | case BPF_JNE: |
| 2184 | /* If this is true we know nothing Jon Snow, but if it is false | 2314 | /* If this is true we know nothing Jon Snow, but if it is false |
| 2185 | * we know the value for sure; | 2315 | * we know the value for sure; |
| 2186 | */ | 2316 | */ |
| 2187 | false_reg->max_value = false_reg->min_value = val; | 2317 | false_reg->max_value = false_reg->min_value = val; |
| 2318 | is_range = false; | ||
| 2188 | break; | 2319 | break; |
| 2189 | case BPF_JGT: | 2320 | case BPF_JGT: |
| 2190 | /* Unsigned comparison, the minimum value is 0. */ | 2321 | value_from_signed = false; |
| 2191 | true_reg->min_value = 0; | ||
| 2192 | /* fallthrough */ | 2322 | /* fallthrough */ |
| 2193 | case BPF_JSGT: | 2323 | case BPF_JSGT: |
| 2324 | if (true_reg->value_from_signed != value_from_signed) | ||
| 2325 | reset_reg_range_values(true_reg, 0); | ||
| 2326 | if (false_reg->value_from_signed != value_from_signed) | ||
| 2327 | reset_reg_range_values(false_reg, 0); | ||
| 2328 | if (opcode == BPF_JGT) { | ||
| 2329 | /* Unsigned comparison, the minimum value is 0. */ | ||
| 2330 | true_reg->min_value = 0; | ||
| 2331 | } | ||
| 2194 | /* | 2332 | /* |
| 2195 | * If this is false, then the val is <= the register, if it is | 2333 | * If this is false, then the val is <= the register, if it is |
| 2196 | * true the register <= to the val. | 2334 | * true the register <= to the val. |
| 2197 | */ | 2335 | */ |
| 2198 | false_reg->min_value = val; | 2336 | false_reg->min_value = val; |
| 2337 | false_reg->value_from_signed = value_from_signed; | ||
| 2199 | true_reg->max_value = val - 1; | 2338 | true_reg->max_value = val - 1; |
| 2339 | true_reg->value_from_signed = value_from_signed; | ||
| 2200 | break; | 2340 | break; |
| 2201 | case BPF_JGE: | 2341 | case BPF_JGE: |
| 2202 | /* Unsigned comparison, the minimum value is 0. */ | 2342 | value_from_signed = false; |
| 2203 | true_reg->min_value = 0; | ||
| 2204 | /* fallthrough */ | 2343 | /* fallthrough */ |
| 2205 | case BPF_JSGE: | 2344 | case BPF_JSGE: |
| 2345 | if (true_reg->value_from_signed != value_from_signed) | ||
| 2346 | reset_reg_range_values(true_reg, 0); | ||
| 2347 | if (false_reg->value_from_signed != value_from_signed) | ||
| 2348 | reset_reg_range_values(false_reg, 0); | ||
| 2349 | if (opcode == BPF_JGE) { | ||
| 2350 | /* Unsigned comparison, the minimum value is 0. */ | ||
| 2351 | true_reg->min_value = 0; | ||
| 2352 | } | ||
| 2206 | /* If this is false then constant < register, if it is true then | 2353 | /* If this is false then constant < register, if it is true then |
| 2207 | * the register < constant. | 2354 | * the register < constant. |
| 2208 | */ | 2355 | */ |
| 2209 | false_reg->min_value = val + 1; | 2356 | false_reg->min_value = val + 1; |
| 2357 | false_reg->value_from_signed = value_from_signed; | ||
| 2210 | true_reg->max_value = val; | 2358 | true_reg->max_value = val; |
| 2359 | true_reg->value_from_signed = value_from_signed; | ||
| 2211 | break; | 2360 | break; |
| 2212 | default: | 2361 | default: |
| 2213 | break; | 2362 | break; |
| @@ -2215,6 +2364,12 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, | |||
| 2215 | 2364 | ||
| 2216 | check_reg_overflow(false_reg); | 2365 | check_reg_overflow(false_reg); |
| 2217 | check_reg_overflow(true_reg); | 2366 | check_reg_overflow(true_reg); |
| 2367 | if (is_range) { | ||
| 2368 | if (__is_pointer_value(false, false_reg)) | ||
| 2369 | reset_reg_range_values(false_reg, 0); | ||
| 2370 | if (__is_pointer_value(false, true_reg)) | ||
| 2371 | reset_reg_range_values(true_reg, 0); | ||
| 2372 | } | ||
| 2218 | } | 2373 | } |
| 2219 | 2374 | ||
| 2220 | static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, | 2375 | static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, |
| @@ -2402,6 +2557,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
| 2402 | 2557 | ||
| 2403 | regs[insn->dst_reg].type = CONST_IMM; | 2558 | regs[insn->dst_reg].type = CONST_IMM; |
| 2404 | regs[insn->dst_reg].imm = imm; | 2559 | regs[insn->dst_reg].imm = imm; |
| 2560 | regs[insn->dst_reg].id = 0; | ||
| 2405 | return 0; | 2561 | return 0; |
| 2406 | } | 2562 | } |
| 2407 | 2563 | ||
| @@ -2444,7 +2600,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
| 2444 | { | 2600 | { |
| 2445 | struct bpf_reg_state *regs = env->cur_state.regs; | 2601 | struct bpf_reg_state *regs = env->cur_state.regs; |
| 2446 | u8 mode = BPF_MODE(insn->code); | 2602 | u8 mode = BPF_MODE(insn->code); |
| 2447 | struct bpf_reg_state *reg; | ||
| 2448 | int i, err; | 2603 | int i, err; |
| 2449 | 2604 | ||
| 2450 | if (!may_access_skb(env->prog->type)) { | 2605 | if (!may_access_skb(env->prog->type)) { |
| @@ -2477,11 +2632,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
| 2477 | } | 2632 | } |
| 2478 | 2633 | ||
| 2479 | /* reset caller saved regs to unreadable */ | 2634 | /* reset caller saved regs to unreadable */ |
| 2480 | for (i = 0; i < CALLER_SAVED_REGS; i++) { | 2635 | for (i = 0; i < CALLER_SAVED_REGS; i++) |
| 2481 | reg = regs + caller_saved[i]; | 2636 | mark_reg_not_init(regs, caller_saved[i]); |
| 2482 | reg->type = NOT_INIT; | ||
| 2483 | reg->imm = 0; | ||
| 2484 | } | ||
| 2485 | 2637 | ||
| 2486 | /* mark destination R0 register as readable, since it contains | 2638 | /* mark destination R0 register as readable, since it contains |
| 2487 | * the value fetched from the packet | 2639 | * the value fetched from the packet |
| @@ -2692,7 +2844,8 @@ err_free: | |||
| 2692 | /* the following conditions reduce the number of explored insns | 2844 | /* the following conditions reduce the number of explored insns |
| 2693 | * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet | 2845 | * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet |
| 2694 | */ | 2846 | */ |
| 2695 | static bool compare_ptrs_to_packet(struct bpf_reg_state *old, | 2847 | static bool compare_ptrs_to_packet(struct bpf_verifier_env *env, |
| 2848 | struct bpf_reg_state *old, | ||
| 2696 | struct bpf_reg_state *cur) | 2849 | struct bpf_reg_state *cur) |
| 2697 | { | 2850 | { |
| 2698 | if (old->id != cur->id) | 2851 | if (old->id != cur->id) |
| @@ -2735,7 +2888,7 @@ static bool compare_ptrs_to_packet(struct bpf_reg_state *old, | |||
| 2735 | * 'if (R4 > data_end)' and all further insn were already good with r=20, | 2888 | * 'if (R4 > data_end)' and all further insn were already good with r=20, |
| 2736 | * so they will be good with r=30 and we can prune the search. | 2889 | * so they will be good with r=30 and we can prune the search. |
| 2737 | */ | 2890 | */ |
| 2738 | if (old->off <= cur->off && | 2891 | if (!env->strict_alignment && old->off <= cur->off && |
| 2739 | old->off >= old->range && cur->off >= cur->range) | 2892 | old->off >= old->range && cur->off >= cur->range) |
| 2740 | return true; | 2893 | return true; |
| 2741 | 2894 | ||
| @@ -2806,7 +2959,7 @@ static bool states_equal(struct bpf_verifier_env *env, | |||
| 2806 | continue; | 2959 | continue; |
| 2807 | 2960 | ||
| 2808 | if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && | 2961 | if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && |
| 2809 | compare_ptrs_to_packet(rold, rcur)) | 2962 | compare_ptrs_to_packet(env, rold, rcur)) |
| 2810 | continue; | 2963 | continue; |
| 2811 | 2964 | ||
| 2812 | return false; | 2965 | return false; |
| @@ -2824,6 +2977,8 @@ static bool states_equal(struct bpf_verifier_env *env, | |||
| 2824 | return false; | 2977 | return false; |
| 2825 | if (i % BPF_REG_SIZE) | 2978 | if (i % BPF_REG_SIZE) |
| 2826 | continue; | 2979 | continue; |
| 2980 | if (old->stack_slot_type[i] != STACK_SPILL) | ||
| 2981 | continue; | ||
| 2827 | if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], | 2982 | if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], |
| 2828 | &cur->spilled_regs[i / BPF_REG_SIZE], | 2983 | &cur->spilled_regs[i / BPF_REG_SIZE], |
| 2829 | sizeof(old->spilled_regs[0]))) | 2984 | sizeof(old->spilled_regs[0]))) |
| @@ -2985,18 +3140,12 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 2985 | /* check that memory (src_reg + off) is readable, | 3140 | /* check that memory (src_reg + off) is readable, |
| 2986 | * the state of dst_reg will be updated by this func | 3141 | * the state of dst_reg will be updated by this func |
| 2987 | */ | 3142 | */ |
| 2988 | err = check_mem_access(env, insn->src_reg, insn->off, | 3143 | err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, |
| 2989 | BPF_SIZE(insn->code), BPF_READ, | 3144 | BPF_SIZE(insn->code), BPF_READ, |
| 2990 | insn->dst_reg); | 3145 | insn->dst_reg); |
| 2991 | if (err) | 3146 | if (err) |
| 2992 | return err; | 3147 | return err; |
| 2993 | 3148 | ||
| 2994 | if (BPF_SIZE(insn->code) != BPF_W && | ||
| 2995 | BPF_SIZE(insn->code) != BPF_DW) { | ||
| 2996 | insn_idx++; | ||
| 2997 | continue; | ||
| 2998 | } | ||
| 2999 | |||
| 3000 | prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; | 3149 | prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; |
| 3001 | 3150 | ||
| 3002 | if (*prev_src_type == NOT_INIT) { | 3151 | if (*prev_src_type == NOT_INIT) { |
| @@ -3024,7 +3173,7 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 3024 | enum bpf_reg_type *prev_dst_type, dst_reg_type; | 3173 | enum bpf_reg_type *prev_dst_type, dst_reg_type; |
| 3025 | 3174 | ||
| 3026 | if (BPF_MODE(insn->code) == BPF_XADD) { | 3175 | if (BPF_MODE(insn->code) == BPF_XADD) { |
| 3027 | err = check_xadd(env, insn); | 3176 | err = check_xadd(env, insn_idx, insn); |
| 3028 | if (err) | 3177 | if (err) |
| 3029 | return err; | 3178 | return err; |
| 3030 | insn_idx++; | 3179 | insn_idx++; |
| @@ -3043,7 +3192,7 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 3043 | dst_reg_type = regs[insn->dst_reg].type; | 3192 | dst_reg_type = regs[insn->dst_reg].type; |
| 3044 | 3193 | ||
| 3045 | /* check that memory (dst_reg + off) is writeable */ | 3194 | /* check that memory (dst_reg + off) is writeable */ |
| 3046 | err = check_mem_access(env, insn->dst_reg, insn->off, | 3195 | err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, |
| 3047 | BPF_SIZE(insn->code), BPF_WRITE, | 3196 | BPF_SIZE(insn->code), BPF_WRITE, |
| 3048 | insn->src_reg); | 3197 | insn->src_reg); |
| 3049 | if (err) | 3198 | if (err) |
| @@ -3072,7 +3221,7 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 3072 | return err; | 3221 | return err; |
| 3073 | 3222 | ||
| 3074 | /* check that memory (dst_reg + off) is writeable */ | 3223 | /* check that memory (dst_reg + off) is writeable */ |
| 3075 | err = check_mem_access(env, insn->dst_reg, insn->off, | 3224 | err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, |
| 3076 | BPF_SIZE(insn->code), BPF_WRITE, | 3225 | BPF_SIZE(insn->code), BPF_WRITE, |
| 3077 | -1); | 3226 | -1); |
| 3078 | if (err) | 3227 | if (err) |
| @@ -3170,7 +3319,8 @@ process_bpf_exit: | |||
| 3170 | insn_idx++; | 3319 | insn_idx++; |
| 3171 | } | 3320 | } |
| 3172 | 3321 | ||
| 3173 | verbose("processed %d insns\n", insn_processed); | 3322 | verbose("processed %d insns, stack depth %d\n", |
| 3323 | insn_processed, env->prog->aux->stack_depth); | ||
| 3174 | return 0; | 3324 | return 0; |
| 3175 | } | 3325 | } |
| 3176 | 3326 | ||
| @@ -3370,11 +3520,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of | |||
| 3370 | static int convert_ctx_accesses(struct bpf_verifier_env *env) | 3520 | static int convert_ctx_accesses(struct bpf_verifier_env *env) |
| 3371 | { | 3521 | { |
| 3372 | const struct bpf_verifier_ops *ops = env->prog->aux->ops; | 3522 | const struct bpf_verifier_ops *ops = env->prog->aux->ops; |
| 3523 | int i, cnt, size, ctx_field_size, delta = 0; | ||
| 3373 | const int insn_cnt = env->prog->len; | 3524 | const int insn_cnt = env->prog->len; |
| 3374 | struct bpf_insn insn_buf[16], *insn; | 3525 | struct bpf_insn insn_buf[16], *insn; |
| 3375 | struct bpf_prog *new_prog; | 3526 | struct bpf_prog *new_prog; |
| 3376 | enum bpf_access_type type; | 3527 | enum bpf_access_type type; |
| 3377 | int i, cnt, delta = 0; | 3528 | bool is_narrower_load; |
| 3529 | u32 target_size; | ||
| 3378 | 3530 | ||
| 3379 | if (ops->gen_prologue) { | 3531 | if (ops->gen_prologue) { |
| 3380 | cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, | 3532 | cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, |
| @@ -3414,12 +3566,52 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) | |||
| 3414 | if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) | 3566 | if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) |
| 3415 | continue; | 3567 | continue; |
| 3416 | 3568 | ||
| 3417 | cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); | 3569 | ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; |
| 3418 | if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { | 3570 | size = BPF_LDST_BYTES(insn); |
| 3571 | |||
| 3572 | /* If the read access is a narrower load of the field, | ||
| 3573 | * convert to a 4/8-byte load, to minimum program type specific | ||
| 3574 | * convert_ctx_access changes. If conversion is successful, | ||
| 3575 | * we will apply proper mask to the result. | ||
| 3576 | */ | ||
| 3577 | is_narrower_load = size < ctx_field_size; | ||
| 3578 | if (is_narrower_load) { | ||
| 3579 | u32 off = insn->off; | ||
| 3580 | u8 size_code; | ||
| 3581 | |||
| 3582 | if (type == BPF_WRITE) { | ||
| 3583 | verbose("bpf verifier narrow ctx access misconfigured\n"); | ||
| 3584 | return -EINVAL; | ||
| 3585 | } | ||
| 3586 | |||
| 3587 | size_code = BPF_H; | ||
| 3588 | if (ctx_field_size == 4) | ||
| 3589 | size_code = BPF_W; | ||
| 3590 | else if (ctx_field_size == 8) | ||
| 3591 | size_code = BPF_DW; | ||
| 3592 | |||
| 3593 | insn->off = off & ~(ctx_field_size - 1); | ||
| 3594 | insn->code = BPF_LDX | BPF_MEM | size_code; | ||
| 3595 | } | ||
| 3596 | |||
| 3597 | target_size = 0; | ||
| 3598 | cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog, | ||
| 3599 | &target_size); | ||
| 3600 | if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || | ||
| 3601 | (ctx_field_size && !target_size)) { | ||
| 3419 | verbose("bpf verifier is misconfigured\n"); | 3602 | verbose("bpf verifier is misconfigured\n"); |
| 3420 | return -EINVAL; | 3603 | return -EINVAL; |
| 3421 | } | 3604 | } |
| 3422 | 3605 | ||
| 3606 | if (is_narrower_load && size < target_size) { | ||
| 3607 | if (ctx_field_size <= 4) | ||
| 3608 | insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, | ||
| 3609 | (1 << size * 8) - 1); | ||
| 3610 | else | ||
| 3611 | insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, | ||
| 3612 | (1 << size * 8) - 1); | ||
| 3613 | } | ||
| 3614 | |||
| 3423 | new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); | 3615 | new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); |
| 3424 | if (!new_prog) | 3616 | if (!new_prog) |
| 3425 | return -ENOMEM; | 3617 | return -ENOMEM; |
| @@ -3465,6 +3657,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) | |||
| 3465 | * the program array. | 3657 | * the program array. |
| 3466 | */ | 3658 | */ |
| 3467 | prog->cb_access = 1; | 3659 | prog->cb_access = 1; |
| 3660 | env->prog->aux->stack_depth = MAX_BPF_STACK; | ||
| 3468 | 3661 | ||
| 3469 | /* mark bpf_tail_call as different opcode to avoid | 3662 | /* mark bpf_tail_call as different opcode to avoid |
| 3470 | * conditional branch in the interpeter for every normal | 3663 | * conditional branch in the interpeter for every normal |
| @@ -3472,7 +3665,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) | |||
| 3472 | * that doesn't support bpf_tail_call yet | 3665 | * that doesn't support bpf_tail_call yet |
| 3473 | */ | 3666 | */ |
| 3474 | insn->imm = 0; | 3667 | insn->imm = 0; |
| 3475 | insn->code |= BPF_X; | 3668 | insn->code = BPF_JMP | BPF_TAIL_CALL; |
| 3476 | continue; | 3669 | continue; |
| 3477 | } | 3670 | } |
| 3478 | 3671 | ||
| @@ -3584,10 +3777,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) | |||
| 3584 | } else { | 3777 | } else { |
| 3585 | log_level = 0; | 3778 | log_level = 0; |
| 3586 | } | 3779 | } |
| 3587 | if (attr->prog_flags & BPF_F_STRICT_ALIGNMENT) | 3780 | |
| 3781 | env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); | ||
| 3782 | if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) | ||
| 3588 | env->strict_alignment = true; | 3783 | env->strict_alignment = true; |
| 3589 | else | ||
| 3590 | env->strict_alignment = false; | ||
| 3591 | 3784 | ||
| 3592 | ret = replace_map_fd_with_map_ptr(env); | 3785 | ret = replace_map_fd_with_map_ptr(env); |
| 3593 | if (ret < 0) | 3786 | if (ret < 0) |
| @@ -3693,7 +3886,10 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, | |||
| 3693 | mutex_lock(&bpf_verifier_lock); | 3886 | mutex_lock(&bpf_verifier_lock); |
| 3694 | 3887 | ||
| 3695 | log_level = 0; | 3888 | log_level = 0; |
| 3889 | |||
| 3696 | env->strict_alignment = false; | 3890 | env->strict_alignment = false; |
| 3891 | if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) | ||
| 3892 | env->strict_alignment = true; | ||
| 3697 | 3893 | ||
| 3698 | env->explored_states = kcalloc(env->prog->len, | 3894 | env->explored_states = kcalloc(env->prog->len, |
| 3699 | sizeof(struct bpf_verifier_state_list *), | 3895 | sizeof(struct bpf_verifier_state_list *), |
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 387348a40c64..ce693ccb8c58 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile | |||
| @@ -4,3 +4,4 @@ obj-$(CONFIG_CGROUP_FREEZER) += freezer.o | |||
| 4 | obj-$(CONFIG_CGROUP_PIDS) += pids.o | 4 | obj-$(CONFIG_CGROUP_PIDS) += pids.o |
| 5 | obj-$(CONFIG_CGROUP_RDMA) += rdma.o | 5 | obj-$(CONFIG_CGROUP_RDMA) += rdma.o |
| 6 | obj-$(CONFIG_CPUSETS) += cpuset.o | 6 | obj-$(CONFIG_CPUSETS) += cpuset.o |
| 7 | obj-$(CONFIG_CGROUP_DEBUG) += debug.o | ||
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 00f4d6bf048f..793565c05742 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h | |||
| @@ -192,6 +192,8 @@ int cgroup_rmdir(struct kernfs_node *kn); | |||
| 192 | int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, | 192 | int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, |
| 193 | struct kernfs_root *kf_root); | 193 | struct kernfs_root *kf_root); |
| 194 | 194 | ||
| 195 | int cgroup_task_count(const struct cgroup *cgrp); | ||
| 196 | |||
| 195 | /* | 197 | /* |
| 196 | * namespace.c | 198 | * namespace.c |
| 197 | */ | 199 | */ |
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 85d75152402d..7bf4b1533f34 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c | |||
| @@ -334,19 +334,15 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, | |||
| 334 | /** | 334 | /** |
| 335 | * cgroup_task_count - count the number of tasks in a cgroup. | 335 | * cgroup_task_count - count the number of tasks in a cgroup. |
| 336 | * @cgrp: the cgroup in question | 336 | * @cgrp: the cgroup in question |
| 337 | * | ||
| 338 | * Return the number of tasks in the cgroup. The returned number can be | ||
| 339 | * higher than the actual number of tasks due to css_set references from | ||
| 340 | * namespace roots and temporary usages. | ||
| 341 | */ | 337 | */ |
| 342 | static int cgroup_task_count(const struct cgroup *cgrp) | 338 | int cgroup_task_count(const struct cgroup *cgrp) |
| 343 | { | 339 | { |
| 344 | int count = 0; | 340 | int count = 0; |
| 345 | struct cgrp_cset_link *link; | 341 | struct cgrp_cset_link *link; |
| 346 | 342 | ||
| 347 | spin_lock_irq(&css_set_lock); | 343 | spin_lock_irq(&css_set_lock); |
| 348 | list_for_each_entry(link, &cgrp->cset_links, cset_link) | 344 | list_for_each_entry(link, &cgrp->cset_links, cset_link) |
| 349 | count += refcount_read(&link->cset->refcount); | 345 | count += link->cset->nr_tasks; |
| 350 | spin_unlock_irq(&css_set_lock); | 346 | spin_unlock_irq(&css_set_lock); |
| 351 | return count; | 347 | return count; |
| 352 | } | 348 | } |
| @@ -1263,150 +1259,3 @@ static int __init cgroup_no_v1(char *str) | |||
| 1263 | return 1; | 1259 | return 1; |
| 1264 | } | 1260 | } |
| 1265 | __setup("cgroup_no_v1=", cgroup_no_v1); | 1261 | __setup("cgroup_no_v1=", cgroup_no_v1); |
| 1266 | |||
| 1267 | |||
| 1268 | #ifdef CONFIG_CGROUP_DEBUG | ||
| 1269 | static struct cgroup_subsys_state * | ||
| 1270 | debug_css_alloc(struct cgroup_subsys_state *parent_css) | ||
| 1271 | { | ||
| 1272 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | ||
| 1273 | |||
| 1274 | if (!css) | ||
| 1275 | return ERR_PTR(-ENOMEM); | ||
| 1276 | |||
| 1277 | return css; | ||
| 1278 | } | ||
| 1279 | |||
| 1280 | static void debug_css_free(struct cgroup_subsys_state *css) | ||
| 1281 | { | ||
| 1282 | kfree(css); | ||
| 1283 | } | ||
| 1284 | |||
| 1285 | static u64 debug_taskcount_read(struct cgroup_subsys_state *css, | ||
| 1286 | struct cftype *cft) | ||
| 1287 | { | ||
| 1288 | return cgroup_task_count(css->cgroup); | ||
| 1289 | } | ||
| 1290 | |||
| 1291 | static u64 current_css_set_read(struct cgroup_subsys_state *css, | ||
| 1292 | struct cftype *cft) | ||
| 1293 | { | ||
| 1294 | return (u64)(unsigned long)current->cgroups; | ||
| 1295 | } | ||
| 1296 | |||
| 1297 | static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, | ||
| 1298 | struct cftype *cft) | ||
| 1299 | { | ||
| 1300 | u64 count; | ||
| 1301 | |||
| 1302 | rcu_read_lock(); | ||
| 1303 | count = refcount_read(&task_css_set(current)->refcount); | ||
| 1304 | rcu_read_unlock(); | ||
| 1305 | return count; | ||
| 1306 | } | ||
| 1307 | |||
| 1308 | static int current_css_set_cg_links_read(struct seq_file *seq, void *v) | ||
| 1309 | { | ||
| 1310 | struct cgrp_cset_link *link; | ||
| 1311 | struct css_set *cset; | ||
| 1312 | char *name_buf; | ||
| 1313 | |||
| 1314 | name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); | ||
| 1315 | if (!name_buf) | ||
| 1316 | return -ENOMEM; | ||
| 1317 | |||
| 1318 | spin_lock_irq(&css_set_lock); | ||
| 1319 | rcu_read_lock(); | ||
| 1320 | cset = rcu_dereference(current->cgroups); | ||
| 1321 | list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { | ||
| 1322 | struct cgroup *c = link->cgrp; | ||
| 1323 | |||
| 1324 | cgroup_name(c, name_buf, NAME_MAX + 1); | ||
| 1325 | seq_printf(seq, "Root %d group %s\n", | ||
| 1326 | c->root->hierarchy_id, name_buf); | ||
| 1327 | } | ||
| 1328 | rcu_read_unlock(); | ||
| 1329 | spin_unlock_irq(&css_set_lock); | ||
| 1330 | kfree(name_buf); | ||
| 1331 | return 0; | ||
| 1332 | } | ||
| 1333 | |||
| 1334 | #define MAX_TASKS_SHOWN_PER_CSS 25 | ||
| 1335 | static int cgroup_css_links_read(struct seq_file *seq, void *v) | ||
| 1336 | { | ||
| 1337 | struct cgroup_subsys_state *css = seq_css(seq); | ||
| 1338 | struct cgrp_cset_link *link; | ||
| 1339 | |||
| 1340 | spin_lock_irq(&css_set_lock); | ||
| 1341 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { | ||
| 1342 | struct css_set *cset = link->cset; | ||
| 1343 | struct task_struct *task; | ||
| 1344 | int count = 0; | ||
| 1345 | |||
| 1346 | seq_printf(seq, "css_set %pK\n", cset); | ||
| 1347 | |||
| 1348 | list_for_each_entry(task, &cset->tasks, cg_list) { | ||
| 1349 | if (count++ > MAX_TASKS_SHOWN_PER_CSS) | ||
| 1350 | goto overflow; | ||
| 1351 | seq_printf(seq, " task %d\n", task_pid_vnr(task)); | ||
| 1352 | } | ||
| 1353 | |||
| 1354 | list_for_each_entry(task, &cset->mg_tasks, cg_list) { | ||
| 1355 | if (count++ > MAX_TASKS_SHOWN_PER_CSS) | ||
| 1356 | goto overflow; | ||
| 1357 | seq_printf(seq, " task %d\n", task_pid_vnr(task)); | ||
| 1358 | } | ||
| 1359 | continue; | ||
| 1360 | overflow: | ||
| 1361 | seq_puts(seq, " ...\n"); | ||
| 1362 | } | ||
| 1363 | spin_unlock_irq(&css_set_lock); | ||
| 1364 | return 0; | ||
| 1365 | } | ||
| 1366 | |||
| 1367 | static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) | ||
| 1368 | { | ||
| 1369 | return (!cgroup_is_populated(css->cgroup) && | ||
| 1370 | !css_has_online_children(&css->cgroup->self)); | ||
| 1371 | } | ||
| 1372 | |||
| 1373 | static struct cftype debug_files[] = { | ||
| 1374 | { | ||
| 1375 | .name = "taskcount", | ||
| 1376 | .read_u64 = debug_taskcount_read, | ||
| 1377 | }, | ||
| 1378 | |||
| 1379 | { | ||
| 1380 | .name = "current_css_set", | ||
| 1381 | .read_u64 = current_css_set_read, | ||
| 1382 | }, | ||
| 1383 | |||
| 1384 | { | ||
| 1385 | .name = "current_css_set_refcount", | ||
| 1386 | .read_u64 = current_css_set_refcount_read, | ||
| 1387 | }, | ||
| 1388 | |||
| 1389 | { | ||
| 1390 | .name = "current_css_set_cg_links", | ||
| 1391 | .seq_show = current_css_set_cg_links_read, | ||
| 1392 | }, | ||
| 1393 | |||
| 1394 | { | ||
| 1395 | .name = "cgroup_css_links", | ||
| 1396 | .seq_show = cgroup_css_links_read, | ||
| 1397 | }, | ||
| 1398 | |||
| 1399 | { | ||
| 1400 | .name = "releasable", | ||
| 1401 | .read_u64 = releasable_read, | ||
| 1402 | }, | ||
| 1403 | |||
| 1404 | { } /* terminate */ | ||
| 1405 | }; | ||
| 1406 | |||
| 1407 | struct cgroup_subsys debug_cgrp_subsys = { | ||
| 1408 | .css_alloc = debug_css_alloc, | ||
| 1409 | .css_free = debug_css_free, | ||
| 1410 | .legacy_cftypes = debug_files, | ||
| 1411 | }; | ||
| 1412 | #endif /* CONFIG_CGROUP_DEBUG */ | ||
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c3c9a0e1b3c9..620794a20a33 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c | |||
| @@ -573,6 +573,11 @@ static int css_set_count = 1; /* 1 for init_css_set */ | |||
| 573 | /** | 573 | /** |
| 574 | * css_set_populated - does a css_set contain any tasks? | 574 | * css_set_populated - does a css_set contain any tasks? |
| 575 | * @cset: target css_set | 575 | * @cset: target css_set |
| 576 | * | ||
| 577 | * css_set_populated() should be the same as !!cset->nr_tasks at steady | ||
| 578 | * state. However, css_set_populated() can be called while a task is being | ||
| 579 | * added to or removed from the linked list before the nr_tasks is | ||
| 580 | * properly updated. Hence, we can't just look at ->nr_tasks here. | ||
| 576 | */ | 581 | */ |
| 577 | static bool css_set_populated(struct css_set *cset) | 582 | static bool css_set_populated(struct css_set *cset) |
| 578 | { | 583 | { |
| @@ -1542,10 +1547,56 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, | |||
| 1542 | return len; | 1547 | return len; |
| 1543 | } | 1548 | } |
| 1544 | 1549 | ||
| 1550 | static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) | ||
| 1551 | { | ||
| 1552 | char *token; | ||
| 1553 | |||
| 1554 | *root_flags = 0; | ||
| 1555 | |||
| 1556 | if (!data) | ||
| 1557 | return 0; | ||
| 1558 | |||
| 1559 | while ((token = strsep(&data, ",")) != NULL) { | ||
| 1560 | if (!strcmp(token, "nsdelegate")) { | ||
| 1561 | *root_flags |= CGRP_ROOT_NS_DELEGATE; | ||
| 1562 | continue; | ||
| 1563 | } | ||
| 1564 | |||
| 1565 | pr_err("cgroup2: unknown option \"%s\"\n", token); | ||
| 1566 | return -EINVAL; | ||
| 1567 | } | ||
| 1568 | |||
| 1569 | return 0; | ||
| 1570 | } | ||
| 1571 | |||
| 1572 | static void apply_cgroup_root_flags(unsigned int root_flags) | ||
| 1573 | { | ||
| 1574 | if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { | ||
| 1575 | if (root_flags & CGRP_ROOT_NS_DELEGATE) | ||
| 1576 | cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; | ||
| 1577 | else | ||
| 1578 | cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; | ||
| 1579 | } | ||
| 1580 | } | ||
| 1581 | |||
| 1582 | static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) | ||
| 1583 | { | ||
| 1584 | if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) | ||
| 1585 | seq_puts(seq, ",nsdelegate"); | ||
| 1586 | return 0; | ||
| 1587 | } | ||
| 1588 | |||
| 1545 | static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) | 1589 | static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) |
| 1546 | { | 1590 | { |
| 1547 | pr_err("remount is not allowed\n"); | 1591 | unsigned int root_flags; |
| 1548 | return -EINVAL; | 1592 | int ret; |
| 1593 | |||
| 1594 | ret = parse_cgroup_root_flags(data, &root_flags); | ||
| 1595 | if (ret) | ||
| 1596 | return ret; | ||
| 1597 | |||
| 1598 | apply_cgroup_root_flags(root_flags); | ||
| 1599 | return 0; | ||
| 1549 | } | 1600 | } |
| 1550 | 1601 | ||
| 1551 | /* | 1602 | /* |
| @@ -1598,6 +1649,7 @@ static void cgroup_enable_task_cg_lists(void) | |||
| 1598 | css_set_update_populated(cset, true); | 1649 | css_set_update_populated(cset, true); |
| 1599 | list_add_tail(&p->cg_list, &cset->tasks); | 1650 | list_add_tail(&p->cg_list, &cset->tasks); |
| 1600 | get_css_set(cset); | 1651 | get_css_set(cset); |
| 1652 | cset->nr_tasks++; | ||
| 1601 | } | 1653 | } |
| 1602 | spin_unlock(&p->sighand->siglock); | 1654 | spin_unlock(&p->sighand->siglock); |
| 1603 | } while_each_thread(g, p); | 1655 | } while_each_thread(g, p); |
| @@ -1784,6 +1836,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1784 | { | 1836 | { |
| 1785 | struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; | 1837 | struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; |
| 1786 | struct dentry *dentry; | 1838 | struct dentry *dentry; |
| 1839 | int ret; | ||
| 1787 | 1840 | ||
| 1788 | get_cgroup_ns(ns); | 1841 | get_cgroup_ns(ns); |
| 1789 | 1842 | ||
| @@ -1801,16 +1854,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1801 | cgroup_enable_task_cg_lists(); | 1854 | cgroup_enable_task_cg_lists(); |
| 1802 | 1855 | ||
| 1803 | if (fs_type == &cgroup2_fs_type) { | 1856 | if (fs_type == &cgroup2_fs_type) { |
| 1804 | if (data) { | 1857 | unsigned int root_flags; |
| 1805 | pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); | 1858 | |
| 1859 | ret = parse_cgroup_root_flags(data, &root_flags); | ||
| 1860 | if (ret) { | ||
| 1806 | put_cgroup_ns(ns); | 1861 | put_cgroup_ns(ns); |
| 1807 | return ERR_PTR(-EINVAL); | 1862 | return ERR_PTR(ret); |
| 1808 | } | 1863 | } |
| 1864 | |||
| 1809 | cgrp_dfl_visible = true; | 1865 | cgrp_dfl_visible = true; |
| 1810 | cgroup_get_live(&cgrp_dfl_root.cgrp); | 1866 | cgroup_get_live(&cgrp_dfl_root.cgrp); |
| 1811 | 1867 | ||
| 1812 | dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, | 1868 | dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, |
| 1813 | CGROUP2_SUPER_MAGIC, ns); | 1869 | CGROUP2_SUPER_MAGIC, ns); |
| 1870 | if (!IS_ERR(dentry)) | ||
| 1871 | apply_cgroup_root_flags(root_flags); | ||
| 1814 | } else { | 1872 | } else { |
| 1815 | dentry = cgroup1_mount(&cgroup_fs_type, flags, data, | 1873 | dentry = cgroup1_mount(&cgroup_fs_type, flags, data, |
| 1816 | CGROUP_SUPER_MAGIC, ns); | 1874 | CGROUP_SUPER_MAGIC, ns); |
| @@ -2064,8 +2122,10 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) | |||
| 2064 | struct css_set *to_cset = cset->mg_dst_cset; | 2122 | struct css_set *to_cset = cset->mg_dst_cset; |
| 2065 | 2123 | ||
| 2066 | get_css_set(to_cset); | 2124 | get_css_set(to_cset); |
| 2125 | to_cset->nr_tasks++; | ||
| 2067 | css_set_move_task(task, from_cset, to_cset, true); | 2126 | css_set_move_task(task, from_cset, to_cset, true); |
| 2068 | put_css_set_locked(from_cset); | 2127 | put_css_set_locked(from_cset); |
| 2128 | from_cset->nr_tasks--; | ||
| 2069 | } | 2129 | } |
| 2070 | } | 2130 | } |
| 2071 | spin_unlock_irq(&css_set_lock); | 2131 | spin_unlock_irq(&css_set_lock); |
| @@ -2355,27 +2415,14 @@ static int cgroup_procs_write_permission(struct task_struct *task, | |||
| 2355 | struct cgroup *dst_cgrp, | 2415 | struct cgroup *dst_cgrp, |
| 2356 | struct kernfs_open_file *of) | 2416 | struct kernfs_open_file *of) |
| 2357 | { | 2417 | { |
| 2358 | int ret = 0; | 2418 | struct super_block *sb = of->file->f_path.dentry->d_sb; |
| 2359 | 2419 | struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; | |
| 2360 | if (cgroup_on_dfl(dst_cgrp)) { | 2420 | struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp; |
| 2361 | struct super_block *sb = of->file->f_path.dentry->d_sb; | 2421 | struct cgroup *src_cgrp, *com_cgrp; |
| 2362 | struct cgroup *cgrp; | 2422 | struct inode *inode; |
| 2363 | struct inode *inode; | 2423 | int ret; |
| 2364 | |||
| 2365 | spin_lock_irq(&css_set_lock); | ||
| 2366 | cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); | ||
| 2367 | spin_unlock_irq(&css_set_lock); | ||
| 2368 | |||
| 2369 | while (!cgroup_is_descendant(dst_cgrp, cgrp)) | ||
| 2370 | cgrp = cgroup_parent(cgrp); | ||
| 2371 | 2424 | ||
| 2372 | ret = -ENOMEM; | 2425 | if (!cgroup_on_dfl(dst_cgrp)) { |
| 2373 | inode = kernfs_get_inode(sb, cgrp->procs_file.kn); | ||
| 2374 | if (inode) { | ||
| 2375 | ret = inode_permission(inode, MAY_WRITE); | ||
| 2376 | iput(inode); | ||
| 2377 | } | ||
| 2378 | } else { | ||
| 2379 | const struct cred *cred = current_cred(); | 2426 | const struct cred *cred = current_cred(); |
| 2380 | const struct cred *tcred = get_task_cred(task); | 2427 | const struct cred *tcred = get_task_cred(task); |
| 2381 | 2428 | ||
| @@ -2383,14 +2430,47 @@ static int cgroup_procs_write_permission(struct task_struct *task, | |||
| 2383 | * even if we're attaching all tasks in the thread group, | 2430 | * even if we're attaching all tasks in the thread group, |
| 2384 | * we only need to check permissions on one of them. | 2431 | * we only need to check permissions on one of them. |
| 2385 | */ | 2432 | */ |
| 2386 | if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && | 2433 | if (uid_eq(cred->euid, GLOBAL_ROOT_UID) || |
| 2387 | !uid_eq(cred->euid, tcred->uid) && | 2434 | uid_eq(cred->euid, tcred->uid) || |
| 2388 | !uid_eq(cred->euid, tcred->suid)) | 2435 | uid_eq(cred->euid, tcred->suid)) |
| 2436 | ret = 0; | ||
| 2437 | else | ||
| 2389 | ret = -EACCES; | 2438 | ret = -EACCES; |
| 2439 | |||
| 2390 | put_cred(tcred); | 2440 | put_cred(tcred); |
| 2441 | return ret; | ||
| 2391 | } | 2442 | } |
| 2392 | 2443 | ||
| 2393 | return ret; | 2444 | /* find the source cgroup */ |
| 2445 | spin_lock_irq(&css_set_lock); | ||
| 2446 | src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); | ||
| 2447 | spin_unlock_irq(&css_set_lock); | ||
| 2448 | |||
| 2449 | /* and the common ancestor */ | ||
| 2450 | com_cgrp = src_cgrp; | ||
| 2451 | while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) | ||
| 2452 | com_cgrp = cgroup_parent(com_cgrp); | ||
| 2453 | |||
| 2454 | /* %current should be authorized to migrate to the common ancestor */ | ||
| 2455 | inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); | ||
| 2456 | if (!inode) | ||
| 2457 | return -ENOMEM; | ||
| 2458 | |||
| 2459 | ret = inode_permission(inode, MAY_WRITE); | ||
| 2460 | iput(inode); | ||
| 2461 | if (ret) | ||
| 2462 | return ret; | ||
| 2463 | |||
| 2464 | /* | ||
| 2465 | * If namespaces are delegation boundaries, %current must be able | ||
| 2466 | * to see both source and destination cgroups from its namespace. | ||
| 2467 | */ | ||
| 2468 | if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && | ||
| 2469 | (!cgroup_is_descendant(src_cgrp, root_cgrp) || | ||
| 2470 | !cgroup_is_descendant(dst_cgrp, root_cgrp))) | ||
| 2471 | return -ENOENT; | ||
| 2472 | |||
| 2473 | return 0; | ||
| 2394 | } | 2474 | } |
| 2395 | 2475 | ||
| 2396 | /* | 2476 | /* |
| @@ -2954,11 +3034,23 @@ static void cgroup_file_release(struct kernfs_open_file *of) | |||
| 2954 | static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, | 3034 | static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, |
| 2955 | size_t nbytes, loff_t off) | 3035 | size_t nbytes, loff_t off) |
| 2956 | { | 3036 | { |
| 3037 | struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; | ||
| 2957 | struct cgroup *cgrp = of->kn->parent->priv; | 3038 | struct cgroup *cgrp = of->kn->parent->priv; |
| 2958 | struct cftype *cft = of->kn->priv; | 3039 | struct cftype *cft = of->kn->priv; |
| 2959 | struct cgroup_subsys_state *css; | 3040 | struct cgroup_subsys_state *css; |
| 2960 | int ret; | 3041 | int ret; |
| 2961 | 3042 | ||
| 3043 | /* | ||
| 3044 | * If namespaces are delegation boundaries, disallow writes to | ||
| 3045 | * files in an non-init namespace root from inside the namespace | ||
| 3046 | * except for the files explicitly marked delegatable - | ||
| 3047 | * cgroup.procs and cgroup.subtree_control. | ||
| 3048 | */ | ||
| 3049 | if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && | ||
| 3050 | !(cft->flags & CFTYPE_NS_DELEGATABLE) && | ||
| 3051 | ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) | ||
| 3052 | return -EPERM; | ||
| 3053 | |||
| 2962 | if (cft->write) | 3054 | if (cft->write) |
| 2963 | return cft->write(of, buf, nbytes, off); | 3055 | return cft->write(of, buf, nbytes, off); |
| 2964 | 3056 | ||
| @@ -3792,6 +3884,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v) | |||
| 3792 | static struct cftype cgroup_base_files[] = { | 3884 | static struct cftype cgroup_base_files[] = { |
| 3793 | { | 3885 | { |
| 3794 | .name = "cgroup.procs", | 3886 | .name = "cgroup.procs", |
| 3887 | .flags = CFTYPE_NS_DELEGATABLE, | ||
| 3795 | .file_offset = offsetof(struct cgroup, procs_file), | 3888 | .file_offset = offsetof(struct cgroup, procs_file), |
| 3796 | .release = cgroup_procs_release, | 3889 | .release = cgroup_procs_release, |
| 3797 | .seq_start = cgroup_procs_start, | 3890 | .seq_start = cgroup_procs_start, |
| @@ -3805,6 +3898,7 @@ static struct cftype cgroup_base_files[] = { | |||
| 3805 | }, | 3898 | }, |
| 3806 | { | 3899 | { |
| 3807 | .name = "cgroup.subtree_control", | 3900 | .name = "cgroup.subtree_control", |
| 3901 | .flags = CFTYPE_NS_DELEGATABLE, | ||
| 3808 | .seq_show = cgroup_subtree_control_show, | 3902 | .seq_show = cgroup_subtree_control_show, |
| 3809 | .write = cgroup_subtree_control_write, | 3903 | .write = cgroup_subtree_control_write, |
| 3810 | }, | 3904 | }, |
| @@ -4265,6 +4359,11 @@ static void kill_css(struct cgroup_subsys_state *css) | |||
| 4265 | { | 4359 | { |
| 4266 | lockdep_assert_held(&cgroup_mutex); | 4360 | lockdep_assert_held(&cgroup_mutex); |
| 4267 | 4361 | ||
| 4362 | if (css->flags & CSS_DYING) | ||
| 4363 | return; | ||
| 4364 | |||
| 4365 | css->flags |= CSS_DYING; | ||
| 4366 | |||
| 4268 | /* | 4367 | /* |
| 4269 | * This must happen before css is disassociated with its cgroup. | 4368 | * This must happen before css is disassociated with its cgroup. |
| 4270 | * See seq_css() for details. | 4369 | * See seq_css() for details. |
| @@ -4388,6 +4487,7 @@ int cgroup_rmdir(struct kernfs_node *kn) | |||
| 4388 | } | 4487 | } |
| 4389 | 4488 | ||
| 4390 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { | 4489 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { |
| 4490 | .show_options = cgroup_show_options, | ||
| 4391 | .remount_fs = cgroup_remount, | 4491 | .remount_fs = cgroup_remount, |
| 4392 | .mkdir = cgroup_mkdir, | 4492 | .mkdir = cgroup_mkdir, |
| 4393 | .rmdir = cgroup_rmdir, | 4493 | .rmdir = cgroup_rmdir, |
| @@ -4784,6 +4884,7 @@ void cgroup_post_fork(struct task_struct *child) | |||
| 4784 | cset = task_css_set(current); | 4884 | cset = task_css_set(current); |
| 4785 | if (list_empty(&child->cg_list)) { | 4885 | if (list_empty(&child->cg_list)) { |
| 4786 | get_css_set(cset); | 4886 | get_css_set(cset); |
| 4887 | cset->nr_tasks++; | ||
| 4787 | css_set_move_task(child, NULL, cset, false); | 4888 | css_set_move_task(child, NULL, cset, false); |
| 4788 | } | 4889 | } |
| 4789 | spin_unlock_irq(&css_set_lock); | 4890 | spin_unlock_irq(&css_set_lock); |
| @@ -4833,6 +4934,7 @@ void cgroup_exit(struct task_struct *tsk) | |||
| 4833 | if (!list_empty(&tsk->cg_list)) { | 4934 | if (!list_empty(&tsk->cg_list)) { |
| 4834 | spin_lock_irq(&css_set_lock); | 4935 | spin_lock_irq(&css_set_lock); |
| 4835 | css_set_move_task(tsk, cset, NULL, false); | 4936 | css_set_move_task(tsk, cset, NULL, false); |
| 4937 | cset->nr_tasks--; | ||
| 4836 | spin_unlock_irq(&css_set_lock); | 4938 | spin_unlock_irq(&css_set_lock); |
| 4837 | } else { | 4939 | } else { |
| 4838 | get_css_set(cset); | 4940 | get_css_set(cset); |
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f6501f4f6040..ca8376e5008c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c | |||
| @@ -176,9 +176,9 @@ typedef enum { | |||
| 176 | } cpuset_flagbits_t; | 176 | } cpuset_flagbits_t; |
| 177 | 177 | ||
| 178 | /* convenient tests for these bits */ | 178 | /* convenient tests for these bits */ |
| 179 | static inline bool is_cpuset_online(const struct cpuset *cs) | 179 | static inline bool is_cpuset_online(struct cpuset *cs) |
| 180 | { | 180 | { |
| 181 | return test_bit(CS_ONLINE, &cs->flags); | 181 | return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); |
| 182 | } | 182 | } |
| 183 | 183 | ||
| 184 | static inline int is_cpu_exclusive(const struct cpuset *cs) | 184 | static inline int is_cpu_exclusive(const struct cpuset *cs) |
| @@ -1038,40 +1038,25 @@ static void cpuset_post_attach(void) | |||
| 1038 | * @tsk: the task to change | 1038 | * @tsk: the task to change |
| 1039 | * @newmems: new nodes that the task will be set | 1039 | * @newmems: new nodes that the task will be set |
| 1040 | * | 1040 | * |
| 1041 | * In order to avoid seeing no nodes if the old and new nodes are disjoint, | 1041 | * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed |
| 1042 | * we structure updates as setting all new allowed nodes, then clearing newly | 1042 | * and rebind an eventual tasks' mempolicy. If the task is allocating in |
| 1043 | * disallowed ones. | 1043 | * parallel, it might temporarily see an empty intersection, which results in |
| 1044 | * a seqlock check and retry before OOM or allocation failure. | ||
| 1044 | */ | 1045 | */ |
| 1045 | static void cpuset_change_task_nodemask(struct task_struct *tsk, | 1046 | static void cpuset_change_task_nodemask(struct task_struct *tsk, |
| 1046 | nodemask_t *newmems) | 1047 | nodemask_t *newmems) |
| 1047 | { | 1048 | { |
| 1048 | bool need_loop; | ||
| 1049 | |||
| 1050 | task_lock(tsk); | 1049 | task_lock(tsk); |
| 1051 | /* | ||
| 1052 | * Determine if a loop is necessary if another thread is doing | ||
| 1053 | * read_mems_allowed_begin(). If at least one node remains unchanged and | ||
| 1054 | * tsk does not have a mempolicy, then an empty nodemask will not be | ||
| 1055 | * possible when mems_allowed is larger than a word. | ||
| 1056 | */ | ||
| 1057 | need_loop = task_has_mempolicy(tsk) || | ||
| 1058 | !nodes_intersects(*newmems, tsk->mems_allowed); | ||
| 1059 | 1050 | ||
| 1060 | if (need_loop) { | 1051 | local_irq_disable(); |
| 1061 | local_irq_disable(); | 1052 | write_seqcount_begin(&tsk->mems_allowed_seq); |
| 1062 | write_seqcount_begin(&tsk->mems_allowed_seq); | ||
| 1063 | } | ||
| 1064 | 1053 | ||
| 1065 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); | 1054 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); |
| 1066 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); | 1055 | mpol_rebind_task(tsk, newmems); |
| 1067 | |||
| 1068 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); | ||
| 1069 | tsk->mems_allowed = *newmems; | 1056 | tsk->mems_allowed = *newmems; |
| 1070 | 1057 | ||
| 1071 | if (need_loop) { | 1058 | write_seqcount_end(&tsk->mems_allowed_seq); |
| 1072 | write_seqcount_end(&tsk->mems_allowed_seq); | 1059 | local_irq_enable(); |
| 1073 | local_irq_enable(); | ||
| 1074 | } | ||
| 1075 | 1060 | ||
| 1076 | task_unlock(tsk); | 1061 | task_unlock(tsk); |
| 1077 | } | 1062 | } |
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c new file mode 100644 index 000000000000..dac46af22782 --- /dev/null +++ b/kernel/cgroup/debug.c | |||
| @@ -0,0 +1,357 @@ | |||
| 1 | /* | ||
| 2 | * Debug controller | ||
| 3 | * | ||
| 4 | * WARNING: This controller is for cgroup core debugging only. | ||
| 5 | * Its interfaces are unstable and subject to changes at any time. | ||
| 6 | */ | ||
| 7 | #include <linux/ctype.h> | ||
| 8 | #include <linux/mm.h> | ||
| 9 | #include <linux/slab.h> | ||
| 10 | |||
| 11 | #include "cgroup-internal.h" | ||
| 12 | |||
| 13 | static struct cgroup_subsys_state * | ||
| 14 | debug_css_alloc(struct cgroup_subsys_state *parent_css) | ||
| 15 | { | ||
| 16 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | ||
| 17 | |||
| 18 | if (!css) | ||
| 19 | return ERR_PTR(-ENOMEM); | ||
| 20 | |||
| 21 | return css; | ||
| 22 | } | ||
| 23 | |||
| 24 | static void debug_css_free(struct cgroup_subsys_state *css) | ||
| 25 | { | ||
| 26 | kfree(css); | ||
| 27 | } | ||
| 28 | |||
| 29 | /* | ||
| 30 | * debug_taskcount_read - return the number of tasks in a cgroup. | ||
| 31 | * @cgrp: the cgroup in question | ||
| 32 | */ | ||
| 33 | static u64 debug_taskcount_read(struct cgroup_subsys_state *css, | ||
| 34 | struct cftype *cft) | ||
| 35 | { | ||
| 36 | return cgroup_task_count(css->cgroup); | ||
| 37 | } | ||
| 38 | |||
| 39 | static int current_css_set_read(struct seq_file *seq, void *v) | ||
| 40 | { | ||
| 41 | struct kernfs_open_file *of = seq->private; | ||
| 42 | struct css_set *cset; | ||
| 43 | struct cgroup_subsys *ss; | ||
| 44 | struct cgroup_subsys_state *css; | ||
| 45 | int i, refcnt; | ||
| 46 | |||
| 47 | if (!cgroup_kn_lock_live(of->kn, false)) | ||
| 48 | return -ENODEV; | ||
| 49 | |||
| 50 | spin_lock_irq(&css_set_lock); | ||
| 51 | rcu_read_lock(); | ||
| 52 | cset = rcu_dereference(current->cgroups); | ||
| 53 | refcnt = refcount_read(&cset->refcount); | ||
| 54 | seq_printf(seq, "css_set %pK %d", cset, refcnt); | ||
| 55 | if (refcnt > cset->nr_tasks) | ||
| 56 | seq_printf(seq, " +%d", refcnt - cset->nr_tasks); | ||
| 57 | seq_puts(seq, "\n"); | ||
| 58 | |||
| 59 | /* | ||
| 60 | * Print the css'es stored in the current css_set. | ||
| 61 | */ | ||
| 62 | for_each_subsys(ss, i) { | ||
| 63 | css = cset->subsys[ss->id]; | ||
| 64 | if (!css) | ||
| 65 | continue; | ||
| 66 | seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name, | ||
| 67 | (unsigned long)css, css->id); | ||
| 68 | } | ||
| 69 | rcu_read_unlock(); | ||
| 70 | spin_unlock_irq(&css_set_lock); | ||
| 71 | cgroup_kn_unlock(of->kn); | ||
| 72 | return 0; | ||
| 73 | } | ||
| 74 | |||
| 75 | static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, | ||
| 76 | struct cftype *cft) | ||
| 77 | { | ||
| 78 | u64 count; | ||
| 79 | |||
| 80 | rcu_read_lock(); | ||
| 81 | count = refcount_read(&task_css_set(current)->refcount); | ||
| 82 | rcu_read_unlock(); | ||
| 83 | return count; | ||
| 84 | } | ||
| 85 | |||
| 86 | static int current_css_set_cg_links_read(struct seq_file *seq, void *v) | ||
| 87 | { | ||
| 88 | struct cgrp_cset_link *link; | ||
| 89 | struct css_set *cset; | ||
| 90 | char *name_buf; | ||
| 91 | |||
| 92 | name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); | ||
| 93 | if (!name_buf) | ||
| 94 | return -ENOMEM; | ||
| 95 | |||
| 96 | spin_lock_irq(&css_set_lock); | ||
| 97 | rcu_read_lock(); | ||
| 98 | cset = rcu_dereference(current->cgroups); | ||
| 99 | list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { | ||
| 100 | struct cgroup *c = link->cgrp; | ||
| 101 | |||
| 102 | cgroup_name(c, name_buf, NAME_MAX + 1); | ||
| 103 | seq_printf(seq, "Root %d group %s\n", | ||
| 104 | c->root->hierarchy_id, name_buf); | ||
| 105 | } | ||
| 106 | rcu_read_unlock(); | ||
| 107 | spin_unlock_irq(&css_set_lock); | ||
| 108 | kfree(name_buf); | ||
| 109 | return 0; | ||
| 110 | } | ||
| 111 | |||
| 112 | #define MAX_TASKS_SHOWN_PER_CSS 25 | ||
| 113 | static int cgroup_css_links_read(struct seq_file *seq, void *v) | ||
| 114 | { | ||
| 115 | struct cgroup_subsys_state *css = seq_css(seq); | ||
| 116 | struct cgrp_cset_link *link; | ||
| 117 | int dead_cnt = 0, extra_refs = 0; | ||
| 118 | |||
| 119 | spin_lock_irq(&css_set_lock); | ||
| 120 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { | ||
| 121 | struct css_set *cset = link->cset; | ||
| 122 | struct task_struct *task; | ||
| 123 | int count = 0; | ||
| 124 | int refcnt = refcount_read(&cset->refcount); | ||
| 125 | |||
| 126 | seq_printf(seq, " %d", refcnt); | ||
| 127 | if (refcnt - cset->nr_tasks > 0) { | ||
| 128 | int extra = refcnt - cset->nr_tasks; | ||
| 129 | |||
| 130 | seq_printf(seq, " +%d", extra); | ||
| 131 | /* | ||
| 132 | * Take out the one additional reference in | ||
| 133 | * init_css_set. | ||
| 134 | */ | ||
| 135 | if (cset == &init_css_set) | ||
| 136 | extra--; | ||
| 137 | extra_refs += extra; | ||
| 138 | } | ||
| 139 | seq_puts(seq, "\n"); | ||
| 140 | |||
| 141 | list_for_each_entry(task, &cset->tasks, cg_list) { | ||
| 142 | if (count++ <= MAX_TASKS_SHOWN_PER_CSS) | ||
| 143 | seq_printf(seq, " task %d\n", | ||
| 144 | task_pid_vnr(task)); | ||
| 145 | } | ||
| 146 | |||
| 147 | list_for_each_entry(task, &cset->mg_tasks, cg_list) { | ||
| 148 | if (count++ <= MAX_TASKS_SHOWN_PER_CSS) | ||
| 149 | seq_printf(seq, " task %d\n", | ||
| 150 | task_pid_vnr(task)); | ||
| 151 | } | ||
| 152 | /* show # of overflowed tasks */ | ||
| 153 | if (count > MAX_TASKS_SHOWN_PER_CSS) | ||
| 154 | seq_printf(seq, " ... (%d)\n", | ||
| 155 | count - MAX_TASKS_SHOWN_PER_CSS); | ||
| 156 | |||
| 157 | if (cset->dead) { | ||
| 158 | seq_puts(seq, " [dead]\n"); | ||
| 159 | dead_cnt++; | ||
| 160 | } | ||
| 161 | |||
| 162 | WARN_ON(count != cset->nr_tasks); | ||
| 163 | } | ||
| 164 | spin_unlock_irq(&css_set_lock); | ||
| 165 | |||
| 166 | if (!dead_cnt && !extra_refs) | ||
| 167 | return 0; | ||
| 168 | |||
| 169 | seq_puts(seq, "\n"); | ||
| 170 | if (extra_refs) | ||
| 171 | seq_printf(seq, "extra references = %d\n", extra_refs); | ||
| 172 | if (dead_cnt) | ||
| 173 | seq_printf(seq, "dead css_sets = %d\n", dead_cnt); | ||
| 174 | |||
| 175 | return 0; | ||
| 176 | } | ||
| 177 | |||
| 178 | static int cgroup_subsys_states_read(struct seq_file *seq, void *v) | ||
| 179 | { | ||
| 180 | struct kernfs_open_file *of = seq->private; | ||
| 181 | struct cgroup *cgrp; | ||
| 182 | struct cgroup_subsys *ss; | ||
| 183 | struct cgroup_subsys_state *css; | ||
| 184 | char pbuf[16]; | ||
| 185 | int i; | ||
| 186 | |||
| 187 | cgrp = cgroup_kn_lock_live(of->kn, false); | ||
| 188 | if (!cgrp) | ||
| 189 | return -ENODEV; | ||
| 190 | |||
| 191 | for_each_subsys(ss, i) { | ||
| 192 | css = rcu_dereference_check(cgrp->subsys[ss->id], true); | ||
| 193 | if (!css) | ||
| 194 | continue; | ||
| 195 | |||
| 196 | pbuf[0] = '\0'; | ||
| 197 | |||
| 198 | /* Show the parent CSS if applicable*/ | ||
| 199 | if (css->parent) | ||
| 200 | snprintf(pbuf, sizeof(pbuf) - 1, " P=%d", | ||
| 201 | css->parent->id); | ||
| 202 | seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name, | ||
| 203 | (unsigned long)css, css->id, | ||
| 204 | atomic_read(&css->online_cnt), pbuf); | ||
| 205 | } | ||
| 206 | |||
| 207 | cgroup_kn_unlock(of->kn); | ||
| 208 | return 0; | ||
| 209 | } | ||
| 210 | |||
| 211 | static void cgroup_masks_read_one(struct seq_file *seq, const char *name, | ||
| 212 | u16 mask) | ||
| 213 | { | ||
| 214 | struct cgroup_subsys *ss; | ||
| 215 | int ssid; | ||
| 216 | bool first = true; | ||
| 217 | |||
| 218 | seq_printf(seq, "%-17s: ", name); | ||
| 219 | for_each_subsys(ss, ssid) { | ||
| 220 | if (!(mask & (1 << ssid))) | ||
| 221 | continue; | ||
| 222 | if (!first) | ||
| 223 | seq_puts(seq, ", "); | ||
| 224 | seq_puts(seq, ss->name); | ||
| 225 | first = false; | ||
| 226 | } | ||
| 227 | seq_putc(seq, '\n'); | ||
| 228 | } | ||
| 229 | |||
| 230 | static int cgroup_masks_read(struct seq_file *seq, void *v) | ||
| 231 | { | ||
| 232 | struct kernfs_open_file *of = seq->private; | ||
| 233 | struct cgroup *cgrp; | ||
| 234 | |||
| 235 | cgrp = cgroup_kn_lock_live(of->kn, false); | ||
| 236 | if (!cgrp) | ||
| 237 | return -ENODEV; | ||
| 238 | |||
| 239 | cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control); | ||
| 240 | cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask); | ||
| 241 | |||
| 242 | cgroup_kn_unlock(of->kn); | ||
| 243 | return 0; | ||
| 244 | } | ||
| 245 | |||
| 246 | static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) | ||
| 247 | { | ||
| 248 | return (!cgroup_is_populated(css->cgroup) && | ||
| 249 | !css_has_online_children(&css->cgroup->self)); | ||
| 250 | } | ||
| 251 | |||
| 252 | static struct cftype debug_legacy_files[] = { | ||
| 253 | { | ||
| 254 | .name = "taskcount", | ||
| 255 | .read_u64 = debug_taskcount_read, | ||
| 256 | }, | ||
| 257 | |||
| 258 | { | ||
| 259 | .name = "current_css_set", | ||
| 260 | .seq_show = current_css_set_read, | ||
| 261 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
| 262 | }, | ||
| 263 | |||
| 264 | { | ||
| 265 | .name = "current_css_set_refcount", | ||
| 266 | .read_u64 = current_css_set_refcount_read, | ||
| 267 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
| 268 | }, | ||
| 269 | |||
| 270 | { | ||
| 271 | .name = "current_css_set_cg_links", | ||
| 272 | .seq_show = current_css_set_cg_links_read, | ||
| 273 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
| 274 | }, | ||
| 275 | |||
| 276 | { | ||
| 277 | .name = "cgroup_css_links", | ||
| 278 | .seq_show = cgroup_css_links_read, | ||
| 279 | }, | ||
| 280 | |||
| 281 | { | ||
| 282 | .name = "cgroup_subsys_states", | ||
| 283 | .seq_show = cgroup_subsys_states_read, | ||
| 284 | }, | ||
| 285 | |||
| 286 | { | ||
| 287 | .name = "cgroup_masks", | ||
| 288 | .seq_show = cgroup_masks_read, | ||
| 289 | }, | ||
| 290 | |||
| 291 | { | ||
| 292 | .name = "releasable", | ||
| 293 | .read_u64 = releasable_read, | ||
| 294 | }, | ||
| 295 | |||
| 296 | { } /* terminate */ | ||
| 297 | }; | ||
| 298 | |||
| 299 | static struct cftype debug_files[] = { | ||
| 300 | { | ||
| 301 | .name = "taskcount", | ||
| 302 | .read_u64 = debug_taskcount_read, | ||
| 303 | }, | ||
| 304 | |||
| 305 | { | ||
| 306 | .name = "current_css_set", | ||
| 307 | .seq_show = current_css_set_read, | ||
| 308 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
| 309 | }, | ||
| 310 | |||
| 311 | { | ||
| 312 | .name = "current_css_set_refcount", | ||
| 313 | .read_u64 = current_css_set_refcount_read, | ||
| 314 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
| 315 | }, | ||
| 316 | |||
| 317 | { | ||
| 318 | .name = "current_css_set_cg_links", | ||
| 319 | .seq_show = current_css_set_cg_links_read, | ||
| 320 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
| 321 | }, | ||
| 322 | |||
| 323 | { | ||
| 324 | .name = "css_links", | ||
| 325 | .seq_show = cgroup_css_links_read, | ||
| 326 | }, | ||
| 327 | |||
| 328 | { | ||
| 329 | .name = "csses", | ||
| 330 | .seq_show = cgroup_subsys_states_read, | ||
| 331 | }, | ||
| 332 | |||
| 333 | { | ||
| 334 | .name = "masks", | ||
| 335 | .seq_show = cgroup_masks_read, | ||
| 336 | }, | ||
| 337 | |||
| 338 | { } /* terminate */ | ||
| 339 | }; | ||
| 340 | |||
| 341 | struct cgroup_subsys debug_cgrp_subsys = { | ||
| 342 | .css_alloc = debug_css_alloc, | ||
| 343 | .css_free = debug_css_free, | ||
| 344 | .legacy_cftypes = debug_legacy_files, | ||
| 345 | }; | ||
| 346 | |||
| 347 | /* | ||
| 348 | * On v2, debug is an implicit controller enabled by "cgroup_debug" boot | ||
| 349 | * parameter. | ||
| 350 | */ | ||
| 351 | static int __init enable_cgroup_debug(char *str) | ||
| 352 | { | ||
| 353 | debug_cgrp_subsys.dfl_cftypes = debug_files; | ||
| 354 | debug_cgrp_subsys.implicit_on_dfl = true; | ||
| 355 | return 1; | ||
| 356 | } | ||
| 357 | __setup("cgroup_debug", enable_cgroup_debug); | ||
diff --git a/kernel/compat.c b/kernel/compat.c index 933bcb31ae10..6f0a0e723a06 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
| @@ -30,100 +30,66 @@ | |||
| 30 | 30 | ||
| 31 | #include <linux/uaccess.h> | 31 | #include <linux/uaccess.h> |
| 32 | 32 | ||
| 33 | static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) | 33 | int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp) |
| 34 | { | 34 | { |
| 35 | memset(txc, 0, sizeof(struct timex)); | 35 | struct compat_timex tx32; |
| 36 | |||
| 37 | if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || | ||
| 38 | __get_user(txc->modes, &utp->modes) || | ||
| 39 | __get_user(txc->offset, &utp->offset) || | ||
| 40 | __get_user(txc->freq, &utp->freq) || | ||
| 41 | __get_user(txc->maxerror, &utp->maxerror) || | ||
| 42 | __get_user(txc->esterror, &utp->esterror) || | ||
| 43 | __get_user(txc->status, &utp->status) || | ||
| 44 | __get_user(txc->constant, &utp->constant) || | ||
| 45 | __get_user(txc->precision, &utp->precision) || | ||
| 46 | __get_user(txc->tolerance, &utp->tolerance) || | ||
| 47 | __get_user(txc->time.tv_sec, &utp->time.tv_sec) || | ||
| 48 | __get_user(txc->time.tv_usec, &utp->time.tv_usec) || | ||
| 49 | __get_user(txc->tick, &utp->tick) || | ||
| 50 | __get_user(txc->ppsfreq, &utp->ppsfreq) || | ||
| 51 | __get_user(txc->jitter, &utp->jitter) || | ||
| 52 | __get_user(txc->shift, &utp->shift) || | ||
| 53 | __get_user(txc->stabil, &utp->stabil) || | ||
| 54 | __get_user(txc->jitcnt, &utp->jitcnt) || | ||
| 55 | __get_user(txc->calcnt, &utp->calcnt) || | ||
| 56 | __get_user(txc->errcnt, &utp->errcnt) || | ||
| 57 | __get_user(txc->stbcnt, &utp->stbcnt)) | ||
| 58 | return -EFAULT; | ||
| 59 | 36 | ||
| 60 | return 0; | 37 | if (copy_from_user(&tx32, utp, sizeof(struct compat_timex))) |
| 61 | } | ||
| 62 | |||
| 63 | static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc) | ||
| 64 | { | ||
| 65 | if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || | ||
| 66 | __put_user(txc->modes, &utp->modes) || | ||
| 67 | __put_user(txc->offset, &utp->offset) || | ||
| 68 | __put_user(txc->freq, &utp->freq) || | ||
| 69 | __put_user(txc->maxerror, &utp->maxerror) || | ||
| 70 | __put_user(txc->esterror, &utp->esterror) || | ||
| 71 | __put_user(txc->status, &utp->status) || | ||
| 72 | __put_user(txc->constant, &utp->constant) || | ||
| 73 | __put_user(txc->precision, &utp->precision) || | ||
| 74 | __put_user(txc->tolerance, &utp->tolerance) || | ||
| 75 | __put_user(txc->time.tv_sec, &utp->time.tv_sec) || | ||
| 76 | __put_user(txc->time.tv_usec, &utp->time.tv_usec) || | ||
| 77 | __put_user(txc->tick, &utp->tick) || | ||
| 78 | __put_user(txc->ppsfreq, &utp->ppsfreq) || | ||
| 79 | __put_user(txc->jitter, &utp->jitter) || | ||
| 80 | __put_user(txc->shift, &utp->shift) || | ||
| 81 | __put_user(txc->stabil, &utp->stabil) || | ||
| 82 | __put_user(txc->jitcnt, &utp->jitcnt) || | ||
| 83 | __put_user(txc->calcnt, &utp->calcnt) || | ||
| 84 | __put_user(txc->errcnt, &utp->errcnt) || | ||
| 85 | __put_user(txc->stbcnt, &utp->stbcnt) || | ||
| 86 | __put_user(txc->tai, &utp->tai)) | ||
| 87 | return -EFAULT; | 38 | return -EFAULT; |
| 88 | return 0; | ||
| 89 | } | ||
| 90 | 39 | ||
| 91 | COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, | 40 | txc->modes = tx32.modes; |
| 92 | struct timezone __user *, tz) | 41 | txc->offset = tx32.offset; |
| 93 | { | 42 | txc->freq = tx32.freq; |
| 94 | if (tv) { | 43 | txc->maxerror = tx32.maxerror; |
| 95 | struct timeval ktv; | 44 | txc->esterror = tx32.esterror; |
| 96 | do_gettimeofday(&ktv); | 45 | txc->status = tx32.status; |
| 97 | if (compat_put_timeval(&ktv, tv)) | 46 | txc->constant = tx32.constant; |
| 98 | return -EFAULT; | 47 | txc->precision = tx32.precision; |
| 99 | } | 48 | txc->tolerance = tx32.tolerance; |
| 100 | if (tz) { | 49 | txc->time.tv_sec = tx32.time.tv_sec; |
| 101 | if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) | 50 | txc->time.tv_usec = tx32.time.tv_usec; |
| 102 | return -EFAULT; | 51 | txc->tick = tx32.tick; |
| 103 | } | 52 | txc->ppsfreq = tx32.ppsfreq; |
| 53 | txc->jitter = tx32.jitter; | ||
| 54 | txc->shift = tx32.shift; | ||
| 55 | txc->stabil = tx32.stabil; | ||
| 56 | txc->jitcnt = tx32.jitcnt; | ||
| 57 | txc->calcnt = tx32.calcnt; | ||
| 58 | txc->errcnt = tx32.errcnt; | ||
| 59 | txc->stbcnt = tx32.stbcnt; | ||
| 104 | 60 | ||
| 105 | return 0; | 61 | return 0; |
| 106 | } | 62 | } |
| 107 | 63 | ||
| 108 | COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, | 64 | int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc) |
| 109 | struct timezone __user *, tz) | 65 | { |
| 110 | { | 66 | struct compat_timex tx32; |
| 111 | struct timespec64 new_ts; | 67 | |
| 112 | struct timeval user_tv; | 68 | memset(&tx32, 0, sizeof(struct compat_timex)); |
| 113 | struct timezone new_tz; | 69 | tx32.modes = txc->modes; |
| 114 | 70 | tx32.offset = txc->offset; | |
| 115 | if (tv) { | 71 | tx32.freq = txc->freq; |
| 116 | if (compat_get_timeval(&user_tv, tv)) | 72 | tx32.maxerror = txc->maxerror; |
| 117 | return -EFAULT; | 73 | tx32.esterror = txc->esterror; |
| 118 | new_ts.tv_sec = user_tv.tv_sec; | 74 | tx32.status = txc->status; |
| 119 | new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; | 75 | tx32.constant = txc->constant; |
| 120 | } | 76 | tx32.precision = txc->precision; |
| 121 | if (tz) { | 77 | tx32.tolerance = txc->tolerance; |
| 122 | if (copy_from_user(&new_tz, tz, sizeof(*tz))) | 78 | tx32.time.tv_sec = txc->time.tv_sec; |
| 123 | return -EFAULT; | 79 | tx32.time.tv_usec = txc->time.tv_usec; |
| 124 | } | 80 | tx32.tick = txc->tick; |
| 125 | 81 | tx32.ppsfreq = txc->ppsfreq; | |
| 126 | return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); | 82 | tx32.jitter = txc->jitter; |
| 83 | tx32.shift = txc->shift; | ||
| 84 | tx32.stabil = txc->stabil; | ||
| 85 | tx32.jitcnt = txc->jitcnt; | ||
| 86 | tx32.calcnt = txc->calcnt; | ||
| 87 | tx32.errcnt = txc->errcnt; | ||
| 88 | tx32.stbcnt = txc->stbcnt; | ||
| 89 | tx32.tai = txc->tai; | ||
| 90 | if (copy_to_user(utp, &tx32, sizeof(struct compat_timex))) | ||
| 91 | return -EFAULT; | ||
| 92 | return 0; | ||
| 127 | } | 93 | } |
| 128 | 94 | ||
| 129 | static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) | 95 | static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) |
| @@ -154,6 +120,50 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp | |||
| 154 | __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; | 120 | __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; |
| 155 | } | 121 | } |
| 156 | 122 | ||
| 123 | static int __compat_get_timespec64(struct timespec64 *ts64, | ||
| 124 | const struct compat_timespec __user *cts) | ||
| 125 | { | ||
| 126 | struct compat_timespec ts; | ||
| 127 | int ret; | ||
| 128 | |||
| 129 | ret = copy_from_user(&ts, cts, sizeof(ts)); | ||
| 130 | if (ret) | ||
| 131 | return -EFAULT; | ||
| 132 | |||
| 133 | ts64->tv_sec = ts.tv_sec; | ||
| 134 | ts64->tv_nsec = ts.tv_nsec; | ||
| 135 | |||
| 136 | return 0; | ||
| 137 | } | ||
| 138 | |||
| 139 | static int __compat_put_timespec64(const struct timespec64 *ts64, | ||
| 140 | struct compat_timespec __user *cts) | ||
| 141 | { | ||
| 142 | struct compat_timespec ts = { | ||
| 143 | .tv_sec = ts64->tv_sec, | ||
| 144 | .tv_nsec = ts64->tv_nsec | ||
| 145 | }; | ||
| 146 | return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; | ||
| 147 | } | ||
| 148 | |||
| 149 | int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) | ||
| 150 | { | ||
| 151 | if (COMPAT_USE_64BIT_TIME) | ||
| 152 | return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; | ||
| 153 | else | ||
| 154 | return __compat_get_timespec64(ts, uts); | ||
| 155 | } | ||
| 156 | EXPORT_SYMBOL_GPL(compat_get_timespec64); | ||
| 157 | |||
| 158 | int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) | ||
| 159 | { | ||
| 160 | if (COMPAT_USE_64BIT_TIME) | ||
| 161 | return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; | ||
| 162 | else | ||
| 163 | return __compat_put_timespec64(ts, uts); | ||
| 164 | } | ||
| 165 | EXPORT_SYMBOL_GPL(compat_put_timespec64); | ||
| 166 | |||
| 157 | int compat_get_timeval(struct timeval *tv, const void __user *utv) | 167 | int compat_get_timeval(struct timeval *tv, const void __user *utv) |
| 158 | { | 168 | { |
| 159 | if (COMPAT_USE_64BIT_TIME) | 169 | if (COMPAT_USE_64BIT_TIME) |
| @@ -213,190 +223,30 @@ int compat_convert_timespec(struct timespec __user **kts, | |||
| 213 | return 0; | 223 | return 0; |
| 214 | } | 224 | } |
| 215 | 225 | ||
| 216 | static long compat_nanosleep_restart(struct restart_block *restart) | 226 | int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i) |
| 217 | { | ||
| 218 | struct compat_timespec __user *rmtp; | ||
| 219 | struct timespec rmt; | ||
| 220 | mm_segment_t oldfs; | ||
| 221 | long ret; | ||
| 222 | |||
| 223 | restart->nanosleep.rmtp = (struct timespec __user *) &rmt; | ||
| 224 | oldfs = get_fs(); | ||
| 225 | set_fs(KERNEL_DS); | ||
| 226 | ret = hrtimer_nanosleep_restart(restart); | ||
| 227 | set_fs(oldfs); | ||
| 228 | |||
| 229 | if (ret == -ERESTART_RESTARTBLOCK) { | ||
| 230 | rmtp = restart->nanosleep.compat_rmtp; | ||
| 231 | |||
| 232 | if (rmtp && compat_put_timespec(&rmt, rmtp)) | ||
| 233 | return -EFAULT; | ||
| 234 | } | ||
| 235 | |||
| 236 | return ret; | ||
| 237 | } | ||
| 238 | |||
| 239 | COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, | ||
| 240 | struct compat_timespec __user *, rmtp) | ||
| 241 | { | ||
| 242 | struct timespec tu, rmt; | ||
| 243 | struct timespec64 tu64; | ||
| 244 | mm_segment_t oldfs; | ||
| 245 | long ret; | ||
| 246 | |||
| 247 | if (compat_get_timespec(&tu, rqtp)) | ||
| 248 | return -EFAULT; | ||
| 249 | |||
| 250 | tu64 = timespec_to_timespec64(tu); | ||
| 251 | if (!timespec64_valid(&tu64)) | ||
| 252 | return -EINVAL; | ||
| 253 | |||
| 254 | oldfs = get_fs(); | ||
| 255 | set_fs(KERNEL_DS); | ||
| 256 | ret = hrtimer_nanosleep(&tu64, | ||
| 257 | rmtp ? (struct timespec __user *)&rmt : NULL, | ||
| 258 | HRTIMER_MODE_REL, CLOCK_MONOTONIC); | ||
| 259 | set_fs(oldfs); | ||
| 260 | |||
| 261 | /* | ||
| 262 | * hrtimer_nanosleep() can only return 0 or | ||
| 263 | * -ERESTART_RESTARTBLOCK here because: | ||
| 264 | * | ||
| 265 | * - we call it with HRTIMER_MODE_REL and therefor exclude the | ||
| 266 | * -ERESTARTNOHAND return path. | ||
| 267 | * | ||
| 268 | * - we supply the rmtp argument from the task stack (due to | ||
| 269 | * the necessary compat conversion. So the update cannot | ||
| 270 | * fail, which excludes the -EFAULT return path as well. If | ||
| 271 | * it fails nevertheless we have a bigger problem and wont | ||
| 272 | * reach this place anymore. | ||
| 273 | * | ||
| 274 | * - if the return value is 0, we do not have to update rmtp | ||
| 275 | * because there is no remaining time. | ||
| 276 | * | ||
| 277 | * We check for -ERESTART_RESTARTBLOCK nevertheless if the | ||
| 278 | * core implementation decides to return random nonsense. | ||
| 279 | */ | ||
| 280 | if (ret == -ERESTART_RESTARTBLOCK) { | ||
| 281 | struct restart_block *restart = ¤t->restart_block; | ||
| 282 | |||
| 283 | restart->fn = compat_nanosleep_restart; | ||
| 284 | restart->nanosleep.compat_rmtp = rmtp; | ||
| 285 | |||
| 286 | if (rmtp && compat_put_timespec(&rmt, rmtp)) | ||
| 287 | return -EFAULT; | ||
| 288 | } | ||
| 289 | return ret; | ||
| 290 | } | ||
| 291 | |||
| 292 | static inline long get_compat_itimerval(struct itimerval *o, | ||
| 293 | struct compat_itimerval __user *i) | ||
| 294 | { | 227 | { |
| 295 | return (!access_ok(VERIFY_READ, i, sizeof(*i)) || | 228 | struct compat_itimerval v32; |
| 296 | (__get_user(o->it_interval.tv_sec, &i->it_interval.tv_sec) | | ||
| 297 | __get_user(o->it_interval.tv_usec, &i->it_interval.tv_usec) | | ||
| 298 | __get_user(o->it_value.tv_sec, &i->it_value.tv_sec) | | ||
| 299 | __get_user(o->it_value.tv_usec, &i->it_value.tv_usec))); | ||
| 300 | } | ||
| 301 | |||
| 302 | static inline long put_compat_itimerval(struct compat_itimerval __user *o, | ||
| 303 | struct itimerval *i) | ||
| 304 | { | ||
| 305 | return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) || | ||
| 306 | (__put_user(i->it_interval.tv_sec, &o->it_interval.tv_sec) | | ||
| 307 | __put_user(i->it_interval.tv_usec, &o->it_interval.tv_usec) | | ||
| 308 | __put_user(i->it_value.tv_sec, &o->it_value.tv_sec) | | ||
| 309 | __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); | ||
| 310 | } | ||
| 311 | |||
| 312 | asmlinkage long sys_ni_posix_timers(void); | ||
| 313 | 229 | ||
| 314 | COMPAT_SYSCALL_DEFINE2(getitimer, int, which, | 230 | if (copy_from_user(&v32, i, sizeof(struct compat_itimerval))) |
| 315 | struct compat_itimerval __user *, it) | ||
| 316 | { | ||
| 317 | struct itimerval kit; | ||
| 318 | int error; | ||
| 319 | |||
| 320 | if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) | ||
| 321 | return sys_ni_posix_timers(); | ||
| 322 | |||
| 323 | error = do_getitimer(which, &kit); | ||
| 324 | if (!error && put_compat_itimerval(it, &kit)) | ||
| 325 | error = -EFAULT; | ||
| 326 | return error; | ||
| 327 | } | ||
| 328 | |||
| 329 | COMPAT_SYSCALL_DEFINE3(setitimer, int, which, | ||
| 330 | struct compat_itimerval __user *, in, | ||
| 331 | struct compat_itimerval __user *, out) | ||
| 332 | { | ||
| 333 | struct itimerval kin, kout; | ||
| 334 | int error; | ||
| 335 | |||
| 336 | if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) | ||
| 337 | return sys_ni_posix_timers(); | ||
| 338 | |||
| 339 | if (in) { | ||
| 340 | if (get_compat_itimerval(&kin, in)) | ||
| 341 | return -EFAULT; | ||
| 342 | } else | ||
| 343 | memset(&kin, 0, sizeof(kin)); | ||
| 344 | |||
| 345 | error = do_setitimer(which, &kin, out ? &kout : NULL); | ||
| 346 | if (error || !out) | ||
| 347 | return error; | ||
| 348 | if (put_compat_itimerval(out, &kout)) | ||
| 349 | return -EFAULT; | 231 | return -EFAULT; |
| 232 | o->it_interval.tv_sec = v32.it_interval.tv_sec; | ||
| 233 | o->it_interval.tv_usec = v32.it_interval.tv_usec; | ||
| 234 | o->it_value.tv_sec = v32.it_value.tv_sec; | ||
| 235 | o->it_value.tv_usec = v32.it_value.tv_usec; | ||
| 350 | return 0; | 236 | return 0; |
| 351 | } | 237 | } |
| 352 | 238 | ||
| 353 | static compat_clock_t clock_t_to_compat_clock_t(clock_t x) | 239 | int put_compat_itimerval(struct compat_itimerval __user *o, const struct itimerval *i) |
| 354 | { | ||
| 355 | return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); | ||
| 356 | } | ||
| 357 | |||
| 358 | COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) | ||
| 359 | { | 240 | { |
| 360 | if (tbuf) { | 241 | struct compat_itimerval v32; |
| 361 | struct tms tms; | ||
| 362 | struct compat_tms tmp; | ||
| 363 | |||
| 364 | do_sys_times(&tms); | ||
| 365 | /* Convert our struct tms to the compat version. */ | ||
| 366 | tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); | ||
| 367 | tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); | ||
| 368 | tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); | ||
| 369 | tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); | ||
| 370 | if (copy_to_user(tbuf, &tmp, sizeof(tmp))) | ||
| 371 | return -EFAULT; | ||
| 372 | } | ||
| 373 | force_successful_syscall_return(); | ||
| 374 | return compat_jiffies_to_clock_t(jiffies); | ||
| 375 | } | ||
| 376 | |||
| 377 | #ifdef __ARCH_WANT_SYS_SIGPENDING | ||
| 378 | 242 | ||
| 379 | /* | 243 | v32.it_interval.tv_sec = i->it_interval.tv_sec; |
| 380 | * Assumption: old_sigset_t and compat_old_sigset_t are both | 244 | v32.it_interval.tv_usec = i->it_interval.tv_usec; |
| 381 | * types that can be passed to put_user()/get_user(). | 245 | v32.it_value.tv_sec = i->it_value.tv_sec; |
| 382 | */ | 246 | v32.it_value.tv_usec = i->it_value.tv_usec; |
| 383 | 247 | return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? -EFAULT : 0; | |
| 384 | COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set) | ||
| 385 | { | ||
| 386 | old_sigset_t s; | ||
| 387 | long ret; | ||
| 388 | mm_segment_t old_fs = get_fs(); | ||
| 389 | |||
| 390 | set_fs(KERNEL_DS); | ||
| 391 | ret = sys_sigpending((old_sigset_t __user *) &s); | ||
| 392 | set_fs(old_fs); | ||
| 393 | if (ret == 0) | ||
| 394 | ret = put_user(s, set); | ||
| 395 | return ret; | ||
| 396 | } | 248 | } |
| 397 | 249 | ||
| 398 | #endif | ||
| 399 | |||
| 400 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK | 250 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK |
| 401 | 251 | ||
| 402 | /* | 252 | /* |
| @@ -451,164 +301,33 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, | |||
| 451 | 301 | ||
| 452 | #endif | 302 | #endif |
| 453 | 303 | ||
| 454 | COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, | ||
| 455 | struct compat_rlimit __user *, rlim) | ||
| 456 | { | ||
| 457 | struct rlimit r; | ||
| 458 | |||
| 459 | if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || | ||
| 460 | __get_user(r.rlim_cur, &rlim->rlim_cur) || | ||
| 461 | __get_user(r.rlim_max, &rlim->rlim_max)) | ||
| 462 | return -EFAULT; | ||
| 463 | |||
| 464 | if (r.rlim_cur == COMPAT_RLIM_INFINITY) | ||
| 465 | r.rlim_cur = RLIM_INFINITY; | ||
| 466 | if (r.rlim_max == COMPAT_RLIM_INFINITY) | ||
| 467 | r.rlim_max = RLIM_INFINITY; | ||
| 468 | return do_prlimit(current, resource, &r, NULL); | ||
| 469 | } | ||
| 470 | |||
| 471 | #ifdef COMPAT_RLIM_OLD_INFINITY | ||
| 472 | |||
| 473 | COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, | ||
| 474 | struct compat_rlimit __user *, rlim) | ||
| 475 | { | ||
| 476 | struct rlimit r; | ||
| 477 | int ret; | ||
| 478 | mm_segment_t old_fs = get_fs(); | ||
| 479 | |||
| 480 | set_fs(KERNEL_DS); | ||
| 481 | ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r); | ||
| 482 | set_fs(old_fs); | ||
| 483 | |||
| 484 | if (!ret) { | ||
| 485 | if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY) | ||
| 486 | r.rlim_cur = COMPAT_RLIM_INFINITY; | ||
| 487 | if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY) | ||
| 488 | r.rlim_max = COMPAT_RLIM_INFINITY; | ||
| 489 | |||
| 490 | if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || | ||
| 491 | __put_user(r.rlim_cur, &rlim->rlim_cur) || | ||
| 492 | __put_user(r.rlim_max, &rlim->rlim_max)) | ||
| 493 | return -EFAULT; | ||
| 494 | } | ||
| 495 | return ret; | ||
| 496 | } | ||
| 497 | |||
| 498 | #endif | ||
| 499 | |||
| 500 | COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, | ||
| 501 | struct compat_rlimit __user *, rlim) | ||
| 502 | { | ||
| 503 | struct rlimit r; | ||
| 504 | int ret; | ||
| 505 | |||
| 506 | ret = do_prlimit(current, resource, NULL, &r); | ||
| 507 | if (!ret) { | ||
| 508 | if (r.rlim_cur > COMPAT_RLIM_INFINITY) | ||
| 509 | r.rlim_cur = COMPAT_RLIM_INFINITY; | ||
| 510 | if (r.rlim_max > COMPAT_RLIM_INFINITY) | ||
| 511 | r.rlim_max = COMPAT_RLIM_INFINITY; | ||
| 512 | |||
| 513 | if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || | ||
| 514 | __put_user(r.rlim_cur, &rlim->rlim_cur) || | ||
| 515 | __put_user(r.rlim_max, &rlim->rlim_max)) | ||
| 516 | return -EFAULT; | ||
| 517 | } | ||
| 518 | return ret; | ||
| 519 | } | ||
| 520 | |||
| 521 | int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) | 304 | int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) |
| 522 | { | 305 | { |
| 523 | if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) || | 306 | struct compat_rusage r32; |
| 524 | __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) || | 307 | memset(&r32, 0, sizeof(r32)); |
| 525 | __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) || | 308 | r32.ru_utime.tv_sec = r->ru_utime.tv_sec; |
| 526 | __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) || | 309 | r32.ru_utime.tv_usec = r->ru_utime.tv_usec; |
| 527 | __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) || | 310 | r32.ru_stime.tv_sec = r->ru_stime.tv_sec; |
| 528 | __put_user(r->ru_maxrss, &ru->ru_maxrss) || | 311 | r32.ru_stime.tv_usec = r->ru_stime.tv_usec; |
| 529 | __put_user(r->ru_ixrss, &ru->ru_ixrss) || | 312 | r32.ru_maxrss = r->ru_maxrss; |
| 530 | __put_user(r->ru_idrss, &ru->ru_idrss) || | 313 | r32.ru_ixrss = r->ru_ixrss; |
| 531 | __put_user(r->ru_isrss, &ru->ru_isrss) || | 314 | r32.ru_idrss = r->ru_idrss; |
| 532 | __put_user(r->ru_minflt, &ru->ru_minflt) || | 315 | r32.ru_isrss = r->ru_isrss; |
| 533 | __put_user(r->ru_majflt, &ru->ru_majflt) || | 316 | r32.ru_minflt = r->ru_minflt; |
| 534 | __put_user(r->ru_nswap, &ru->ru_nswap) || | 317 | r32.ru_majflt = r->ru_majflt; |
| 535 | __put_user(r->ru_inblock, &ru->ru_inblock) || | 318 | r32.ru_nswap = r->ru_nswap; |
| 536 | __put_user(r->ru_oublock, &ru->ru_oublock) || | 319 | r32.ru_inblock = r->ru_inblock; |
| 537 | __put_user(r->ru_msgsnd, &ru->ru_msgsnd) || | 320 | r32.ru_oublock = r->ru_oublock; |
| 538 | __put_user(r->ru_msgrcv, &ru->ru_msgrcv) || | 321 | r32.ru_msgsnd = r->ru_msgsnd; |
| 539 | __put_user(r->ru_nsignals, &ru->ru_nsignals) || | 322 | r32.ru_msgrcv = r->ru_msgrcv; |
| 540 | __put_user(r->ru_nvcsw, &ru->ru_nvcsw) || | 323 | r32.ru_nsignals = r->ru_nsignals; |
| 541 | __put_user(r->ru_nivcsw, &ru->ru_nivcsw)) | 324 | r32.ru_nvcsw = r->ru_nvcsw; |
| 325 | r32.ru_nivcsw = r->ru_nivcsw; | ||
| 326 | if (copy_to_user(ru, &r32, sizeof(r32))) | ||
| 542 | return -EFAULT; | 327 | return -EFAULT; |
| 543 | return 0; | 328 | return 0; |
| 544 | } | 329 | } |
| 545 | 330 | ||
| 546 | COMPAT_SYSCALL_DEFINE4(wait4, | ||
| 547 | compat_pid_t, pid, | ||
| 548 | compat_uint_t __user *, stat_addr, | ||
| 549 | int, options, | ||
| 550 | struct compat_rusage __user *, ru) | ||
| 551 | { | ||
| 552 | if (!ru) { | ||
| 553 | return sys_wait4(pid, stat_addr, options, NULL); | ||
| 554 | } else { | ||
| 555 | struct rusage r; | ||
| 556 | int ret; | ||
| 557 | unsigned int status; | ||
| 558 | mm_segment_t old_fs = get_fs(); | ||
| 559 | |||
| 560 | set_fs (KERNEL_DS); | ||
| 561 | ret = sys_wait4(pid, | ||
| 562 | (stat_addr ? | ||
| 563 | (unsigned int __user *) &status : NULL), | ||
| 564 | options, (struct rusage __user *) &r); | ||
| 565 | set_fs (old_fs); | ||
| 566 | |||
| 567 | if (ret > 0) { | ||
| 568 | if (put_compat_rusage(&r, ru)) | ||
| 569 | return -EFAULT; | ||
| 570 | if (stat_addr && put_user(status, stat_addr)) | ||
| 571 | return -EFAULT; | ||
| 572 | } | ||
| 573 | return ret; | ||
| 574 | } | ||
| 575 | } | ||
| 576 | |||
| 577 | COMPAT_SYSCALL_DEFINE5(waitid, | ||
| 578 | int, which, compat_pid_t, pid, | ||
| 579 | struct compat_siginfo __user *, uinfo, int, options, | ||
| 580 | struct compat_rusage __user *, uru) | ||
| 581 | { | ||
| 582 | siginfo_t info; | ||
| 583 | struct rusage ru; | ||
| 584 | long ret; | ||
| 585 | mm_segment_t old_fs = get_fs(); | ||
| 586 | |||
| 587 | memset(&info, 0, sizeof(info)); | ||
| 588 | |||
| 589 | set_fs(KERNEL_DS); | ||
| 590 | ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options, | ||
| 591 | uru ? (struct rusage __user *)&ru : NULL); | ||
| 592 | set_fs(old_fs); | ||
| 593 | |||
| 594 | if ((ret < 0) || (info.si_signo == 0)) | ||
| 595 | return ret; | ||
| 596 | |||
| 597 | if (uru) { | ||
| 598 | /* sys_waitid() overwrites everything in ru */ | ||
| 599 | if (COMPAT_USE_64BIT_TIME) | ||
| 600 | ret = copy_to_user(uru, &ru, sizeof(ru)); | ||
| 601 | else | ||
| 602 | ret = put_compat_rusage(&ru, uru); | ||
| 603 | if (ret) | ||
| 604 | return -EFAULT; | ||
| 605 | } | ||
| 606 | |||
| 607 | BUG_ON(info.si_code & __SI_MASK); | ||
| 608 | info.si_code |= __SI_CHLD; | ||
| 609 | return copy_siginfo_to_user32(uinfo, &info); | ||
| 610 | } | ||
| 611 | |||
| 612 | static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, | 331 | static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, |
| 613 | unsigned len, struct cpumask *new_mask) | 332 | unsigned len, struct cpumask *new_mask) |
| 614 | { | 333 | { |
| @@ -689,192 +408,26 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst, | |||
| 689 | return 0; | 408 | return 0; |
| 690 | } | 409 | } |
| 691 | 410 | ||
| 692 | COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, | 411 | int get_compat_itimerspec64(struct itimerspec64 *its, |
| 693 | struct compat_sigevent __user *, timer_event_spec, | 412 | const struct compat_itimerspec __user *uits) |
| 694 | timer_t __user *, created_timer_id) | ||
| 695 | { | ||
| 696 | struct sigevent __user *event = NULL; | ||
| 697 | |||
| 698 | if (timer_event_spec) { | ||
| 699 | struct sigevent kevent; | ||
| 700 | |||
| 701 | event = compat_alloc_user_space(sizeof(*event)); | ||
| 702 | if (get_compat_sigevent(&kevent, timer_event_spec) || | ||
| 703 | copy_to_user(event, &kevent, sizeof(*event))) | ||
| 704 | return -EFAULT; | ||
| 705 | } | ||
| 706 | |||
| 707 | return sys_timer_create(which_clock, event, created_timer_id); | ||
| 708 | } | ||
| 709 | |||
| 710 | COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, | ||
| 711 | struct compat_itimerspec __user *, new, | ||
| 712 | struct compat_itimerspec __user *, old) | ||
| 713 | { | ||
| 714 | long err; | ||
| 715 | mm_segment_t oldfs; | ||
| 716 | struct itimerspec newts, oldts; | ||
| 717 | |||
| 718 | if (!new) | ||
| 719 | return -EINVAL; | ||
| 720 | if (get_compat_itimerspec(&newts, new)) | ||
| 721 | return -EFAULT; | ||
| 722 | oldfs = get_fs(); | ||
| 723 | set_fs(KERNEL_DS); | ||
| 724 | err = sys_timer_settime(timer_id, flags, | ||
| 725 | (struct itimerspec __user *) &newts, | ||
| 726 | (struct itimerspec __user *) &oldts); | ||
| 727 | set_fs(oldfs); | ||
| 728 | if (!err && old && put_compat_itimerspec(old, &oldts)) | ||
| 729 | return -EFAULT; | ||
| 730 | return err; | ||
| 731 | } | ||
| 732 | |||
| 733 | COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, | ||
| 734 | struct compat_itimerspec __user *, setting) | ||
| 735 | { | ||
| 736 | long err; | ||
| 737 | mm_segment_t oldfs; | ||
| 738 | struct itimerspec ts; | ||
| 739 | |||
| 740 | oldfs = get_fs(); | ||
| 741 | set_fs(KERNEL_DS); | ||
| 742 | err = sys_timer_gettime(timer_id, | ||
| 743 | (struct itimerspec __user *) &ts); | ||
| 744 | set_fs(oldfs); | ||
| 745 | if (!err && put_compat_itimerspec(setting, &ts)) | ||
| 746 | return -EFAULT; | ||
| 747 | return err; | ||
| 748 | } | ||
| 749 | |||
| 750 | COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, | ||
| 751 | struct compat_timespec __user *, tp) | ||
| 752 | { | ||
| 753 | long err; | ||
| 754 | mm_segment_t oldfs; | ||
| 755 | struct timespec ts; | ||
| 756 | |||
| 757 | if (compat_get_timespec(&ts, tp)) | ||
| 758 | return -EFAULT; | ||
| 759 | oldfs = get_fs(); | ||
| 760 | set_fs(KERNEL_DS); | ||
| 761 | err = sys_clock_settime(which_clock, | ||
| 762 | (struct timespec __user *) &ts); | ||
| 763 | set_fs(oldfs); | ||
| 764 | return err; | ||
| 765 | } | ||
| 766 | |||
| 767 | COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, | ||
| 768 | struct compat_timespec __user *, tp) | ||
| 769 | { | ||
| 770 | long err; | ||
| 771 | mm_segment_t oldfs; | ||
| 772 | struct timespec ts; | ||
| 773 | |||
| 774 | oldfs = get_fs(); | ||
| 775 | set_fs(KERNEL_DS); | ||
| 776 | err = sys_clock_gettime(which_clock, | ||
| 777 | (struct timespec __user *) &ts); | ||
| 778 | set_fs(oldfs); | ||
| 779 | if (!err && compat_put_timespec(&ts, tp)) | ||
| 780 | return -EFAULT; | ||
| 781 | return err; | ||
| 782 | } | ||
| 783 | |||
| 784 | COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, | ||
| 785 | struct compat_timex __user *, utp) | ||
| 786 | { | 413 | { |
| 787 | struct timex txc; | ||
| 788 | mm_segment_t oldfs; | ||
| 789 | int err, ret; | ||
| 790 | 414 | ||
| 791 | err = compat_get_timex(&txc, utp); | 415 | if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) || |
| 792 | if (err) | 416 | __compat_get_timespec64(&its->it_value, &uits->it_value)) |
| 793 | return err; | ||
| 794 | |||
| 795 | oldfs = get_fs(); | ||
| 796 | set_fs(KERNEL_DS); | ||
| 797 | ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc); | ||
| 798 | set_fs(oldfs); | ||
| 799 | |||
| 800 | err = compat_put_timex(utp, &txc); | ||
| 801 | if (err) | ||
| 802 | return err; | ||
| 803 | |||
| 804 | return ret; | ||
| 805 | } | ||
| 806 | |||
| 807 | COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, | ||
| 808 | struct compat_timespec __user *, tp) | ||
| 809 | { | ||
| 810 | long err; | ||
| 811 | mm_segment_t oldfs; | ||
| 812 | struct timespec ts; | ||
| 813 | |||
| 814 | oldfs = get_fs(); | ||
| 815 | set_fs(KERNEL_DS); | ||
| 816 | err = sys_clock_getres(which_clock, | ||
| 817 | (struct timespec __user *) &ts); | ||
| 818 | set_fs(oldfs); | ||
| 819 | if (!err && tp && compat_put_timespec(&ts, tp)) | ||
| 820 | return -EFAULT; | 417 | return -EFAULT; |
| 821 | return err; | 418 | return 0; |
| 822 | } | ||
| 823 | |||
| 824 | static long compat_clock_nanosleep_restart(struct restart_block *restart) | ||
| 825 | { | ||
| 826 | long err; | ||
| 827 | mm_segment_t oldfs; | ||
| 828 | struct timespec tu; | ||
| 829 | struct compat_timespec __user *rmtp = restart->nanosleep.compat_rmtp; | ||
| 830 | |||
| 831 | restart->nanosleep.rmtp = (struct timespec __user *) &tu; | ||
| 832 | oldfs = get_fs(); | ||
| 833 | set_fs(KERNEL_DS); | ||
| 834 | err = clock_nanosleep_restart(restart); | ||
| 835 | set_fs(oldfs); | ||
| 836 | |||
| 837 | if ((err == -ERESTART_RESTARTBLOCK) && rmtp && | ||
| 838 | compat_put_timespec(&tu, rmtp)) | ||
| 839 | return -EFAULT; | ||
| 840 | |||
| 841 | if (err == -ERESTART_RESTARTBLOCK) { | ||
| 842 | restart->fn = compat_clock_nanosleep_restart; | ||
| 843 | restart->nanosleep.compat_rmtp = rmtp; | ||
| 844 | } | ||
| 845 | return err; | ||
| 846 | } | 419 | } |
| 420 | EXPORT_SYMBOL_GPL(get_compat_itimerspec64); | ||
| 847 | 421 | ||
| 848 | COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, | 422 | int put_compat_itimerspec64(const struct itimerspec64 *its, |
| 849 | struct compat_timespec __user *, rqtp, | 423 | struct compat_itimerspec __user *uits) |
| 850 | struct compat_timespec __user *, rmtp) | ||
| 851 | { | 424 | { |
| 852 | long err; | 425 | if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) || |
| 853 | mm_segment_t oldfs; | 426 | __compat_put_timespec64(&its->it_value, &uits->it_value)) |
| 854 | struct timespec in, out; | ||
| 855 | struct restart_block *restart; | ||
| 856 | |||
| 857 | if (compat_get_timespec(&in, rqtp)) | ||
| 858 | return -EFAULT; | ||
| 859 | |||
| 860 | oldfs = get_fs(); | ||
| 861 | set_fs(KERNEL_DS); | ||
| 862 | err = sys_clock_nanosleep(which_clock, flags, | ||
| 863 | (struct timespec __user *) &in, | ||
| 864 | (struct timespec __user *) &out); | ||
| 865 | set_fs(oldfs); | ||
| 866 | |||
| 867 | if ((err == -ERESTART_RESTARTBLOCK) && rmtp && | ||
| 868 | compat_put_timespec(&out, rmtp)) | ||
| 869 | return -EFAULT; | 427 | return -EFAULT; |
| 870 | 428 | return 0; | |
| 871 | if (err == -ERESTART_RESTARTBLOCK) { | ||
| 872 | restart = ¤t->restart_block; | ||
| 873 | restart->fn = compat_clock_nanosleep_restart; | ||
| 874 | restart->nanosleep.compat_rmtp = rmtp; | ||
| 875 | } | ||
| 876 | return err; | ||
| 877 | } | 429 | } |
| 430 | EXPORT_SYMBOL_GPL(put_compat_itimerspec64); | ||
| 878 | 431 | ||
| 879 | /* | 432 | /* |
| 880 | * We currently only need the following fields from the sigevent | 433 | * We currently only need the following fields from the sigevent |
| @@ -900,84 +453,59 @@ int get_compat_sigevent(struct sigevent *event, | |||
| 900 | long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, | 453 | long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, |
| 901 | unsigned long bitmap_size) | 454 | unsigned long bitmap_size) |
| 902 | { | 455 | { |
| 903 | int i, j; | ||
| 904 | unsigned long m; | ||
| 905 | compat_ulong_t um; | ||
| 906 | unsigned long nr_compat_longs; | 456 | unsigned long nr_compat_longs; |
| 907 | 457 | ||
| 908 | /* align bitmap up to nearest compat_long_t boundary */ | 458 | /* align bitmap up to nearest compat_long_t boundary */ |
| 909 | bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); | 459 | bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); |
| 460 | nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); | ||
| 910 | 461 | ||
| 911 | if (!access_ok(VERIFY_READ, umask, bitmap_size / 8)) | 462 | if (!access_ok(VERIFY_READ, umask, bitmap_size / 8)) |
| 912 | return -EFAULT; | 463 | return -EFAULT; |
| 913 | 464 | ||
| 914 | nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); | 465 | user_access_begin(); |
| 915 | 466 | while (nr_compat_longs > 1) { | |
| 916 | for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { | 467 | compat_ulong_t l1, l2; |
| 917 | m = 0; | 468 | unsafe_get_user(l1, umask++, Efault); |
| 918 | 469 | unsafe_get_user(l2, umask++, Efault); | |
| 919 | for (j = 0; j < sizeof(m)/sizeof(um); j++) { | 470 | *mask++ = ((unsigned long)l2 << BITS_PER_COMPAT_LONG) | l1; |
| 920 | /* | 471 | nr_compat_longs -= 2; |
| 921 | * We dont want to read past the end of the userspace | ||
| 922 | * bitmap. We must however ensure the end of the | ||
| 923 | * kernel bitmap is zeroed. | ||
| 924 | */ | ||
| 925 | if (nr_compat_longs) { | ||
| 926 | nr_compat_longs--; | ||
| 927 | if (__get_user(um, umask)) | ||
| 928 | return -EFAULT; | ||
| 929 | } else { | ||
| 930 | um = 0; | ||
| 931 | } | ||
| 932 | |||
| 933 | umask++; | ||
| 934 | m |= (long)um << (j * BITS_PER_COMPAT_LONG); | ||
| 935 | } | ||
| 936 | *mask++ = m; | ||
| 937 | } | 472 | } |
| 938 | 473 | if (nr_compat_longs) | |
| 474 | unsafe_get_user(*mask, umask++, Efault); | ||
| 475 | user_access_end(); | ||
| 939 | return 0; | 476 | return 0; |
| 477 | |||
| 478 | Efault: | ||
| 479 | user_access_end(); | ||
| 480 | return -EFAULT; | ||
| 940 | } | 481 | } |
| 941 | 482 | ||
| 942 | long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, | 483 | long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, |
| 943 | unsigned long bitmap_size) | 484 | unsigned long bitmap_size) |
| 944 | { | 485 | { |
| 945 | int i, j; | ||
| 946 | unsigned long m; | ||
| 947 | compat_ulong_t um; | ||
| 948 | unsigned long nr_compat_longs; | 486 | unsigned long nr_compat_longs; |
| 949 | 487 | ||
| 950 | /* align bitmap up to nearest compat_long_t boundary */ | 488 | /* align bitmap up to nearest compat_long_t boundary */ |
| 951 | bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); | 489 | bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); |
| 490 | nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); | ||
| 952 | 491 | ||
| 953 | if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8)) | 492 | if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8)) |
| 954 | return -EFAULT; | 493 | return -EFAULT; |
| 955 | 494 | ||
| 956 | nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); | 495 | user_access_begin(); |
| 957 | 496 | while (nr_compat_longs > 1) { | |
| 958 | for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { | 497 | unsigned long m = *mask++; |
| 959 | m = *mask++; | 498 | unsafe_put_user((compat_ulong_t)m, umask++, Efault); |
| 960 | 499 | unsafe_put_user(m >> BITS_PER_COMPAT_LONG, umask++, Efault); | |
| 961 | for (j = 0; j < sizeof(m)/sizeof(um); j++) { | 500 | nr_compat_longs -= 2; |
| 962 | um = m; | ||
| 963 | |||
| 964 | /* | ||
| 965 | * We dont want to write past the end of the userspace | ||
| 966 | * bitmap. | ||
| 967 | */ | ||
| 968 | if (nr_compat_longs) { | ||
| 969 | nr_compat_longs--; | ||
| 970 | if (__put_user(um, umask)) | ||
| 971 | return -EFAULT; | ||
| 972 | } | ||
| 973 | |||
| 974 | umask++; | ||
| 975 | m >>= 4*sizeof(um); | ||
| 976 | m >>= 4*sizeof(um); | ||
| 977 | } | ||
| 978 | } | 501 | } |
| 979 | 502 | if (nr_compat_longs) | |
| 503 | unsafe_put_user((compat_ulong_t)*mask, umask++, Efault); | ||
| 504 | user_access_end(); | ||
| 980 | return 0; | 505 | return 0; |
| 506 | Efault: | ||
| 507 | user_access_end(); | ||
| 508 | return -EFAULT; | ||
| 981 | } | 509 | } |
| 982 | 510 | ||
| 983 | void | 511 | void |
| @@ -1003,96 +531,6 @@ sigset_to_compat(compat_sigset_t *compat, const sigset_t *set) | |||
| 1003 | } | 531 | } |
| 1004 | } | 532 | } |
| 1005 | 533 | ||
| 1006 | COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, | ||
| 1007 | struct compat_siginfo __user *, uinfo, | ||
| 1008 | struct compat_timespec __user *, uts, compat_size_t, sigsetsize) | ||
| 1009 | { | ||
| 1010 | compat_sigset_t s32; | ||
| 1011 | sigset_t s; | ||
| 1012 | struct timespec t; | ||
| 1013 | siginfo_t info; | ||
| 1014 | long ret; | ||
| 1015 | |||
| 1016 | if (sigsetsize != sizeof(sigset_t)) | ||
| 1017 | return -EINVAL; | ||
| 1018 | |||
| 1019 | if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) | ||
| 1020 | return -EFAULT; | ||
| 1021 | sigset_from_compat(&s, &s32); | ||
| 1022 | |||
| 1023 | if (uts) { | ||
| 1024 | if (compat_get_timespec(&t, uts)) | ||
| 1025 | return -EFAULT; | ||
| 1026 | } | ||
| 1027 | |||
| 1028 | ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); | ||
| 1029 | |||
| 1030 | if (ret > 0 && uinfo) { | ||
| 1031 | if (copy_siginfo_to_user32(uinfo, &info)) | ||
| 1032 | ret = -EFAULT; | ||
| 1033 | } | ||
| 1034 | |||
| 1035 | return ret; | ||
| 1036 | } | ||
| 1037 | |||
| 1038 | #ifdef __ARCH_WANT_COMPAT_SYS_TIME | ||
| 1039 | |||
| 1040 | /* compat_time_t is a 32 bit "long" and needs to get converted. */ | ||
| 1041 | |||
| 1042 | COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) | ||
| 1043 | { | ||
| 1044 | compat_time_t i; | ||
| 1045 | struct timeval tv; | ||
| 1046 | |||
| 1047 | do_gettimeofday(&tv); | ||
| 1048 | i = tv.tv_sec; | ||
| 1049 | |||
| 1050 | if (tloc) { | ||
| 1051 | if (put_user(i,tloc)) | ||
| 1052 | return -EFAULT; | ||
| 1053 | } | ||
| 1054 | force_successful_syscall_return(); | ||
| 1055 | return i; | ||
| 1056 | } | ||
| 1057 | |||
| 1058 | COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) | ||
| 1059 | { | ||
| 1060 | struct timespec tv; | ||
| 1061 | int err; | ||
| 1062 | |||
| 1063 | if (get_user(tv.tv_sec, tptr)) | ||
| 1064 | return -EFAULT; | ||
| 1065 | |||
| 1066 | tv.tv_nsec = 0; | ||
| 1067 | |||
| 1068 | err = security_settime(&tv, NULL); | ||
| 1069 | if (err) | ||
| 1070 | return err; | ||
| 1071 | |||
| 1072 | do_settimeofday(&tv); | ||
| 1073 | return 0; | ||
| 1074 | } | ||
| 1075 | |||
| 1076 | #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ | ||
| 1077 | |||
| 1078 | COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) | ||
| 1079 | { | ||
| 1080 | struct timex txc; | ||
| 1081 | int err, ret; | ||
| 1082 | |||
| 1083 | err = compat_get_timex(&txc, utp); | ||
| 1084 | if (err) | ||
| 1085 | return err; | ||
| 1086 | |||
| 1087 | ret = do_adjtimex(&txc); | ||
| 1088 | |||
| 1089 | err = compat_put_timex(utp, &txc); | ||
| 1090 | if (err) | ||
| 1091 | return err; | ||
| 1092 | |||
| 1093 | return ret; | ||
| 1094 | } | ||
| 1095 | |||
| 1096 | #ifdef CONFIG_NUMA | 534 | #ifdef CONFIG_NUMA |
| 1097 | COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, | 535 | COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, |
| 1098 | compat_uptr_t __user *, pages32, | 536 | compat_uptr_t __user *, pages32, |
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 26a06e09a5bd..d70829033bb7 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config | |||
| @@ -1,10 +1,13 @@ | |||
| 1 | # KEEP ALPHABETICALLY SORTED | 1 | # KEEP ALPHABETICALLY SORTED |
| 2 | # CONFIG_DEVKMEM is not set | 2 | # CONFIG_DEVKMEM is not set |
| 3 | # CONFIG_DEVMEM is not set | 3 | # CONFIG_DEVMEM is not set |
| 4 | # CONFIG_FHANDLE is not set | ||
| 4 | # CONFIG_INET_LRO is not set | 5 | # CONFIG_INET_LRO is not set |
| 5 | # CONFIG_MODULES is not set | 6 | # CONFIG_NFSD is not set |
| 7 | # CONFIG_NFS_FS is not set | ||
| 6 | # CONFIG_OABI_COMPAT is not set | 8 | # CONFIG_OABI_COMPAT is not set |
| 7 | # CONFIG_SYSVIPC is not set | 9 | # CONFIG_SYSVIPC is not set |
| 10 | # CONFIG_USELIB is not set | ||
| 8 | CONFIG_ANDROID=y | 11 | CONFIG_ANDROID=y |
| 9 | CONFIG_ANDROID_BINDER_IPC=y | 12 | CONFIG_ANDROID_BINDER_IPC=y |
| 10 | CONFIG_ANDROID_LOW_MEMORY_KILLER=y | 13 | CONFIG_ANDROID_LOW_MEMORY_KILLER=y |
| @@ -13,6 +16,7 @@ CONFIG_ASHMEM=y | |||
| 13 | CONFIG_AUDIT=y | 16 | CONFIG_AUDIT=y |
| 14 | CONFIG_BLK_DEV_INITRD=y | 17 | CONFIG_BLK_DEV_INITRD=y |
| 15 | CONFIG_CGROUPS=y | 18 | CONFIG_CGROUPS=y |
| 19 | CONFIG_CGROUP_BPF=y | ||
| 16 | CONFIG_CGROUP_CPUACCT=y | 20 | CONFIG_CGROUP_CPUACCT=y |
| 17 | CONFIG_CGROUP_DEBUG=y | 21 | CONFIG_CGROUP_DEBUG=y |
| 18 | CONFIG_CGROUP_FREEZER=y | 22 | CONFIG_CGROUP_FREEZER=y |
| @@ -23,6 +27,8 @@ CONFIG_EMBEDDED=y | |||
| 23 | CONFIG_FB=y | 27 | CONFIG_FB=y |
| 24 | CONFIG_HARDENED_USERCOPY=y | 28 | CONFIG_HARDENED_USERCOPY=y |
| 25 | CONFIG_HIGH_RES_TIMERS=y | 29 | CONFIG_HIGH_RES_TIMERS=y |
| 30 | CONFIG_IKCONFIG=y | ||
| 31 | CONFIG_IKCONFIG_PROC=y | ||
| 26 | CONFIG_INET6_AH=y | 32 | CONFIG_INET6_AH=y |
| 27 | CONFIG_INET6_ESP=y | 33 | CONFIG_INET6_ESP=y |
| 28 | CONFIG_INET6_IPCOMP=y | 34 | CONFIG_INET6_IPCOMP=y |
| @@ -60,6 +66,9 @@ CONFIG_IP_NF_TARGET_MASQUERADE=y | |||
| 60 | CONFIG_IP_NF_TARGET_NETMAP=y | 66 | CONFIG_IP_NF_TARGET_NETMAP=y |
| 61 | CONFIG_IP_NF_TARGET_REDIRECT=y | 67 | CONFIG_IP_NF_TARGET_REDIRECT=y |
| 62 | CONFIG_IP_NF_TARGET_REJECT=y | 68 | CONFIG_IP_NF_TARGET_REJECT=y |
| 69 | CONFIG_MODULES=y | ||
| 70 | CONFIG_MODULE_UNLOAD=y | ||
| 71 | CONFIG_MODVERSIONS=y | ||
| 63 | CONFIG_NET=y | 72 | CONFIG_NET=y |
| 64 | CONFIG_NETDEVICES=y | 73 | CONFIG_NETDEVICES=y |
| 65 | CONFIG_NETFILTER=y | 74 | CONFIG_NETFILTER=y |
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index 28ee064b6744..946fb92418f7 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config | |||
| @@ -6,13 +6,15 @@ | |||
| 6 | # CONFIG_NF_CONNTRACK_SIP is not set | 6 | # CONFIG_NF_CONNTRACK_SIP is not set |
| 7 | # CONFIG_PM_WAKELOCKS_GC is not set | 7 | # CONFIG_PM_WAKELOCKS_GC is not set |
| 8 | # CONFIG_VT is not set | 8 | # CONFIG_VT is not set |
| 9 | CONFIG_ARM64_SW_TTBR0_PAN=y | ||
| 9 | CONFIG_BACKLIGHT_LCD_SUPPORT=y | 10 | CONFIG_BACKLIGHT_LCD_SUPPORT=y |
| 10 | CONFIG_BLK_DEV_DM=y | 11 | CONFIG_BLK_DEV_DM=y |
| 11 | CONFIG_BLK_DEV_LOOP=y | 12 | CONFIG_BLK_DEV_LOOP=y |
| 12 | CONFIG_BLK_DEV_RAM=y | 13 | CONFIG_BLK_DEV_RAM=y |
| 13 | CONFIG_BLK_DEV_RAM_SIZE=8192 | 14 | CONFIG_BLK_DEV_RAM_SIZE=8192 |
| 15 | CONFIG_CC_STACKPROTECTOR_STRONG=y | ||
| 14 | CONFIG_COMPACTION=y | 16 | CONFIG_COMPACTION=y |
| 15 | CONFIG_STRICT_KERNEL_RWX=y | 17 | CONFIG_CPU_SW_DOMAIN_PAN=y |
| 16 | CONFIG_DM_CRYPT=y | 18 | CONFIG_DM_CRYPT=y |
| 17 | CONFIG_DM_UEVENT=y | 19 | CONFIG_DM_UEVENT=y |
| 18 | CONFIG_DM_VERITY=y | 20 | CONFIG_DM_VERITY=y |
| @@ -105,6 +107,7 @@ CONFIG_SCHEDSTATS=y | |||
| 105 | CONFIG_SMARTJOYPLUS_FF=y | 107 | CONFIG_SMARTJOYPLUS_FF=y |
| 106 | CONFIG_SND=y | 108 | CONFIG_SND=y |
| 107 | CONFIG_SOUND=y | 109 | CONFIG_SOUND=y |
| 110 | CONFIG_STRICT_KERNEL_RWX=y | ||
| 108 | CONFIG_SUSPEND_TIME=y | 111 | CONFIG_SUSPEND_TIME=y |
| 109 | CONFIG_TABLET_USB_ACECAD=y | 112 | CONFIG_TABLET_USB_ACECAD=y |
| 110 | CONFIG_TABLET_USB_AIPTEK=y | 113 | CONFIG_TABLET_USB_AIPTEK=y |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 9ae6fbe5b5cf..eee033134262 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -27,6 +27,7 @@ | |||
| 27 | #include <linux/smpboot.h> | 27 | #include <linux/smpboot.h> |
| 28 | #include <linux/relay.h> | 28 | #include <linux/relay.h> |
| 29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
| 30 | #include <linux/percpu-rwsem.h> | ||
| 30 | 31 | ||
| 31 | #include <trace/events/power.h> | 32 | #include <trace/events/power.h> |
| 32 | #define CREATE_TRACE_POINTS | 33 | #define CREATE_TRACE_POINTS |
| @@ -65,6 +66,12 @@ struct cpuhp_cpu_state { | |||
| 65 | 66 | ||
| 66 | static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); | 67 | static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); |
| 67 | 68 | ||
| 69 | #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) | ||
| 70 | static struct lock_class_key cpuhp_state_key; | ||
| 71 | static struct lockdep_map cpuhp_state_lock_map = | ||
| 72 | STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key); | ||
| 73 | #endif | ||
| 74 | |||
| 68 | /** | 75 | /** |
| 69 | * cpuhp_step - Hotplug state machine step | 76 | * cpuhp_step - Hotplug state machine step |
| 70 | * @name: Name of the step | 77 | * @name: Name of the step |
| @@ -196,121 +203,41 @@ void cpu_maps_update_done(void) | |||
| 196 | mutex_unlock(&cpu_add_remove_lock); | 203 | mutex_unlock(&cpu_add_remove_lock); |
| 197 | } | 204 | } |
| 198 | 205 | ||
| 199 | /* If set, cpu_up and cpu_down will return -EBUSY and do nothing. | 206 | /* |
| 207 | * If set, cpu_up and cpu_down will return -EBUSY and do nothing. | ||
| 200 | * Should always be manipulated under cpu_add_remove_lock | 208 | * Should always be manipulated under cpu_add_remove_lock |
| 201 | */ | 209 | */ |
| 202 | static int cpu_hotplug_disabled; | 210 | static int cpu_hotplug_disabled; |
| 203 | 211 | ||
| 204 | #ifdef CONFIG_HOTPLUG_CPU | 212 | #ifdef CONFIG_HOTPLUG_CPU |
| 205 | 213 | ||
| 206 | static struct { | 214 | DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock); |
| 207 | struct task_struct *active_writer; | ||
| 208 | /* wait queue to wake up the active_writer */ | ||
| 209 | wait_queue_head_t wq; | ||
| 210 | /* verifies that no writer will get active while readers are active */ | ||
| 211 | struct mutex lock; | ||
| 212 | /* | ||
| 213 | * Also blocks the new readers during | ||
| 214 | * an ongoing cpu hotplug operation. | ||
| 215 | */ | ||
| 216 | atomic_t refcount; | ||
| 217 | |||
| 218 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 219 | struct lockdep_map dep_map; | ||
| 220 | #endif | ||
| 221 | } cpu_hotplug = { | ||
| 222 | .active_writer = NULL, | ||
| 223 | .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), | ||
| 224 | .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), | ||
| 225 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 226 | .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map), | ||
| 227 | #endif | ||
| 228 | }; | ||
| 229 | 215 | ||
| 230 | /* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ | 216 | void cpus_read_lock(void) |
| 231 | #define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) | ||
| 232 | #define cpuhp_lock_acquire_tryread() \ | ||
| 233 | lock_map_acquire_tryread(&cpu_hotplug.dep_map) | ||
| 234 | #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) | ||
| 235 | #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) | ||
| 236 | |||
| 237 | |||
| 238 | void get_online_cpus(void) | ||
| 239 | { | 217 | { |
| 240 | might_sleep(); | 218 | percpu_down_read(&cpu_hotplug_lock); |
| 241 | if (cpu_hotplug.active_writer == current) | ||
| 242 | return; | ||
| 243 | cpuhp_lock_acquire_read(); | ||
| 244 | mutex_lock(&cpu_hotplug.lock); | ||
| 245 | atomic_inc(&cpu_hotplug.refcount); | ||
| 246 | mutex_unlock(&cpu_hotplug.lock); | ||
| 247 | } | 219 | } |
| 248 | EXPORT_SYMBOL_GPL(get_online_cpus); | 220 | EXPORT_SYMBOL_GPL(cpus_read_lock); |
| 249 | 221 | ||
| 250 | void put_online_cpus(void) | 222 | void cpus_read_unlock(void) |
| 251 | { | 223 | { |
| 252 | int refcount; | 224 | percpu_up_read(&cpu_hotplug_lock); |
| 253 | |||
| 254 | if (cpu_hotplug.active_writer == current) | ||
| 255 | return; | ||
| 256 | |||
| 257 | refcount = atomic_dec_return(&cpu_hotplug.refcount); | ||
| 258 | if (WARN_ON(refcount < 0)) /* try to fix things up */ | ||
| 259 | atomic_inc(&cpu_hotplug.refcount); | ||
| 260 | |||
| 261 | if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq)) | ||
| 262 | wake_up(&cpu_hotplug.wq); | ||
| 263 | |||
| 264 | cpuhp_lock_release(); | ||
| 265 | |||
| 266 | } | 225 | } |
| 267 | EXPORT_SYMBOL_GPL(put_online_cpus); | 226 | EXPORT_SYMBOL_GPL(cpus_read_unlock); |
| 268 | 227 | ||
| 269 | /* | 228 | void cpus_write_lock(void) |
| 270 | * This ensures that the hotplug operation can begin only when the | ||
| 271 | * refcount goes to zero. | ||
| 272 | * | ||
| 273 | * Note that during a cpu-hotplug operation, the new readers, if any, | ||
| 274 | * will be blocked by the cpu_hotplug.lock | ||
| 275 | * | ||
| 276 | * Since cpu_hotplug_begin() is always called after invoking | ||
| 277 | * cpu_maps_update_begin(), we can be sure that only one writer is active. | ||
| 278 | * | ||
| 279 | * Note that theoretically, there is a possibility of a livelock: | ||
| 280 | * - Refcount goes to zero, last reader wakes up the sleeping | ||
| 281 | * writer. | ||
| 282 | * - Last reader unlocks the cpu_hotplug.lock. | ||
| 283 | * - A new reader arrives at this moment, bumps up the refcount. | ||
| 284 | * - The writer acquires the cpu_hotplug.lock finds the refcount | ||
| 285 | * non zero and goes to sleep again. | ||
| 286 | * | ||
| 287 | * However, this is very difficult to achieve in practice since | ||
| 288 | * get_online_cpus() not an api which is called all that often. | ||
| 289 | * | ||
| 290 | */ | ||
| 291 | void cpu_hotplug_begin(void) | ||
| 292 | { | 229 | { |
| 293 | DEFINE_WAIT(wait); | 230 | percpu_down_write(&cpu_hotplug_lock); |
| 294 | 231 | } | |
| 295 | cpu_hotplug.active_writer = current; | ||
| 296 | cpuhp_lock_acquire(); | ||
| 297 | 232 | ||
| 298 | for (;;) { | 233 | void cpus_write_unlock(void) |
| 299 | mutex_lock(&cpu_hotplug.lock); | 234 | { |
| 300 | prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE); | 235 | percpu_up_write(&cpu_hotplug_lock); |
| 301 | if (likely(!atomic_read(&cpu_hotplug.refcount))) | ||
| 302 | break; | ||
| 303 | mutex_unlock(&cpu_hotplug.lock); | ||
| 304 | schedule(); | ||
| 305 | } | ||
| 306 | finish_wait(&cpu_hotplug.wq, &wait); | ||
| 307 | } | 236 | } |
| 308 | 237 | ||
| 309 | void cpu_hotplug_done(void) | 238 | void lockdep_assert_cpus_held(void) |
| 310 | { | 239 | { |
| 311 | cpu_hotplug.active_writer = NULL; | 240 | percpu_rwsem_assert_held(&cpu_hotplug_lock); |
| 312 | mutex_unlock(&cpu_hotplug.lock); | ||
| 313 | cpuhp_lock_release(); | ||
| 314 | } | 241 | } |
| 315 | 242 | ||
| 316 | /* | 243 | /* |
| @@ -344,13 +271,26 @@ void cpu_hotplug_enable(void) | |||
| 344 | EXPORT_SYMBOL_GPL(cpu_hotplug_enable); | 271 | EXPORT_SYMBOL_GPL(cpu_hotplug_enable); |
| 345 | #endif /* CONFIG_HOTPLUG_CPU */ | 272 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 346 | 273 | ||
| 347 | /* Notifier wrappers for transitioning to state machine */ | 274 | static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st); |
| 348 | 275 | ||
| 349 | static int bringup_wait_for_ap(unsigned int cpu) | 276 | static int bringup_wait_for_ap(unsigned int cpu) |
| 350 | { | 277 | { |
| 351 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | 278 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); |
| 352 | 279 | ||
| 280 | /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ | ||
| 353 | wait_for_completion(&st->done); | 281 | wait_for_completion(&st->done); |
| 282 | if (WARN_ON_ONCE((!cpu_online(cpu)))) | ||
| 283 | return -ECANCELED; | ||
| 284 | |||
| 285 | /* Unpark the stopper thread and the hotplug thread of the target cpu */ | ||
| 286 | stop_machine_unpark(cpu); | ||
| 287 | kthread_unpark(st->thread); | ||
| 288 | |||
| 289 | /* Should we go further up ? */ | ||
| 290 | if (st->target > CPUHP_AP_ONLINE_IDLE) { | ||
| 291 | __cpuhp_kick_ap_work(st); | ||
| 292 | wait_for_completion(&st->done); | ||
| 293 | } | ||
| 354 | return st->result; | 294 | return st->result; |
| 355 | } | 295 | } |
| 356 | 296 | ||
| @@ -371,9 +311,7 @@ static int bringup_cpu(unsigned int cpu) | |||
| 371 | irq_unlock_sparse(); | 311 | irq_unlock_sparse(); |
| 372 | if (ret) | 312 | if (ret) |
| 373 | return ret; | 313 | return ret; |
| 374 | ret = bringup_wait_for_ap(cpu); | 314 | return bringup_wait_for_ap(cpu); |
| 375 | BUG_ON(!cpu_online(cpu)); | ||
| 376 | return ret; | ||
| 377 | } | 315 | } |
| 378 | 316 | ||
| 379 | /* | 317 | /* |
| @@ -484,6 +422,7 @@ static void cpuhp_thread_fun(unsigned int cpu) | |||
| 484 | 422 | ||
| 485 | st->should_run = false; | 423 | st->should_run = false; |
| 486 | 424 | ||
| 425 | lock_map_acquire(&cpuhp_state_lock_map); | ||
| 487 | /* Single callback invocation for [un]install ? */ | 426 | /* Single callback invocation for [un]install ? */ |
| 488 | if (st->single) { | 427 | if (st->single) { |
| 489 | if (st->cb_state < CPUHP_AP_ONLINE) { | 428 | if (st->cb_state < CPUHP_AP_ONLINE) { |
| @@ -510,6 +449,7 @@ static void cpuhp_thread_fun(unsigned int cpu) | |||
| 510 | else if (st->state > st->target) | 449 | else if (st->state > st->target) |
| 511 | ret = cpuhp_ap_offline(cpu, st); | 450 | ret = cpuhp_ap_offline(cpu, st); |
| 512 | } | 451 | } |
| 452 | lock_map_release(&cpuhp_state_lock_map); | ||
| 513 | st->result = ret; | 453 | st->result = ret; |
| 514 | complete(&st->done); | 454 | complete(&st->done); |
| 515 | } | 455 | } |
| @@ -524,6 +464,9 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup, | |||
| 524 | if (!cpu_online(cpu)) | 464 | if (!cpu_online(cpu)) |
| 525 | return 0; | 465 | return 0; |
| 526 | 466 | ||
| 467 | lock_map_acquire(&cpuhp_state_lock_map); | ||
| 468 | lock_map_release(&cpuhp_state_lock_map); | ||
| 469 | |||
| 527 | /* | 470 | /* |
| 528 | * If we are up and running, use the hotplug thread. For early calls | 471 | * If we are up and running, use the hotplug thread. For early calls |
| 529 | * we invoke the thread function directly. | 472 | * we invoke the thread function directly. |
| @@ -567,6 +510,8 @@ static int cpuhp_kick_ap_work(unsigned int cpu) | |||
| 567 | enum cpuhp_state state = st->state; | 510 | enum cpuhp_state state = st->state; |
| 568 | 511 | ||
| 569 | trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); | 512 | trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); |
| 513 | lock_map_acquire(&cpuhp_state_lock_map); | ||
| 514 | lock_map_release(&cpuhp_state_lock_map); | ||
| 570 | __cpuhp_kick_ap_work(st); | 515 | __cpuhp_kick_ap_work(st); |
| 571 | wait_for_completion(&st->done); | 516 | wait_for_completion(&st->done); |
| 572 | trace_cpuhp_exit(cpu, st->state, state, st->result); | 517 | trace_cpuhp_exit(cpu, st->state, state, st->result); |
| @@ -630,30 +575,6 @@ void clear_tasks_mm_cpumask(int cpu) | |||
| 630 | rcu_read_unlock(); | 575 | rcu_read_unlock(); |
| 631 | } | 576 | } |
| 632 | 577 | ||
| 633 | static inline void check_for_tasks(int dead_cpu) | ||
| 634 | { | ||
| 635 | struct task_struct *g, *p; | ||
| 636 | |||
| 637 | read_lock(&tasklist_lock); | ||
| 638 | for_each_process_thread(g, p) { | ||
| 639 | if (!p->on_rq) | ||
| 640 | continue; | ||
| 641 | /* | ||
| 642 | * We do the check with unlocked task_rq(p)->lock. | ||
| 643 | * Order the reading to do not warn about a task, | ||
| 644 | * which was running on this cpu in the past, and | ||
| 645 | * it's just been woken on another cpu. | ||
| 646 | */ | ||
| 647 | rmb(); | ||
| 648 | if (task_cpu(p) != dead_cpu) | ||
| 649 | continue; | ||
| 650 | |||
| 651 | pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", | ||
| 652 | p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); | ||
| 653 | } | ||
| 654 | read_unlock(&tasklist_lock); | ||
| 655 | } | ||
| 656 | |||
| 657 | /* Take this CPU down. */ | 578 | /* Take this CPU down. */ |
| 658 | static int take_cpu_down(void *_param) | 579 | static int take_cpu_down(void *_param) |
| 659 | { | 580 | { |
| @@ -701,7 +622,7 @@ static int takedown_cpu(unsigned int cpu) | |||
| 701 | /* | 622 | /* |
| 702 | * So now all preempt/rcu users must observe !cpu_active(). | 623 | * So now all preempt/rcu users must observe !cpu_active(). |
| 703 | */ | 624 | */ |
| 704 | err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu)); | 625 | err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu)); |
| 705 | if (err) { | 626 | if (err) { |
| 706 | /* CPU refused to die */ | 627 | /* CPU refused to die */ |
| 707 | irq_unlock_sparse(); | 628 | irq_unlock_sparse(); |
| @@ -773,7 +694,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, | |||
| 773 | if (!cpu_present(cpu)) | 694 | if (!cpu_present(cpu)) |
| 774 | return -EINVAL; | 695 | return -EINVAL; |
| 775 | 696 | ||
| 776 | cpu_hotplug_begin(); | 697 | cpus_write_lock(); |
| 777 | 698 | ||
| 778 | cpuhp_tasks_frozen = tasks_frozen; | 699 | cpuhp_tasks_frozen = tasks_frozen; |
| 779 | 700 | ||
| @@ -811,7 +732,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, | |||
| 811 | } | 732 | } |
| 812 | 733 | ||
| 813 | out: | 734 | out: |
| 814 | cpu_hotplug_done(); | 735 | cpus_write_unlock(); |
| 815 | return ret; | 736 | return ret; |
| 816 | } | 737 | } |
| 817 | 738 | ||
| @@ -859,31 +780,20 @@ void notify_cpu_starting(unsigned int cpu) | |||
| 859 | } | 780 | } |
| 860 | 781 | ||
| 861 | /* | 782 | /* |
| 862 | * Called from the idle task. We need to set active here, so we can kick off | 783 | * Called from the idle task. Wake up the controlling task which brings the |
| 863 | * the stopper thread and unpark the smpboot threads. If the target state is | 784 | * stopper and the hotplug thread of the upcoming CPU up and then delegates |
| 864 | * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the | 785 | * the rest of the online bringup to the hotplug thread. |
| 865 | * cpu further. | ||
| 866 | */ | 786 | */ |
| 867 | void cpuhp_online_idle(enum cpuhp_state state) | 787 | void cpuhp_online_idle(enum cpuhp_state state) |
| 868 | { | 788 | { |
| 869 | struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); | 789 | struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); |
| 870 | unsigned int cpu = smp_processor_id(); | ||
| 871 | 790 | ||
| 872 | /* Happens for the boot cpu */ | 791 | /* Happens for the boot cpu */ |
| 873 | if (state != CPUHP_AP_ONLINE_IDLE) | 792 | if (state != CPUHP_AP_ONLINE_IDLE) |
| 874 | return; | 793 | return; |
| 875 | 794 | ||
| 876 | st->state = CPUHP_AP_ONLINE_IDLE; | 795 | st->state = CPUHP_AP_ONLINE_IDLE; |
| 877 | 796 | complete(&st->done); | |
| 878 | /* Unpark the stopper thread and the hotplug thread of this cpu */ | ||
| 879 | stop_machine_unpark(cpu); | ||
| 880 | kthread_unpark(st->thread); | ||
| 881 | |||
| 882 | /* Should we go further up ? */ | ||
| 883 | if (st->target > CPUHP_AP_ONLINE_IDLE) | ||
| 884 | __cpuhp_kick_ap_work(st); | ||
| 885 | else | ||
| 886 | complete(&st->done); | ||
| 887 | } | 797 | } |
| 888 | 798 | ||
| 889 | /* Requires cpu_add_remove_lock to be held */ | 799 | /* Requires cpu_add_remove_lock to be held */ |
| @@ -893,7 +803,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) | |||
| 893 | struct task_struct *idle; | 803 | struct task_struct *idle; |
| 894 | int ret = 0; | 804 | int ret = 0; |
| 895 | 805 | ||
| 896 | cpu_hotplug_begin(); | 806 | cpus_write_lock(); |
| 897 | 807 | ||
| 898 | if (!cpu_present(cpu)) { | 808 | if (!cpu_present(cpu)) { |
| 899 | ret = -EINVAL; | 809 | ret = -EINVAL; |
| @@ -941,7 +851,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) | |||
| 941 | target = min((int)target, CPUHP_BRINGUP_CPU); | 851 | target = min((int)target, CPUHP_BRINGUP_CPU); |
| 942 | ret = cpuhp_up_callbacks(cpu, st, target); | 852 | ret = cpuhp_up_callbacks(cpu, st, target); |
| 943 | out: | 853 | out: |
| 944 | cpu_hotplug_done(); | 854 | cpus_write_unlock(); |
| 945 | return ret; | 855 | return ret; |
| 946 | } | 856 | } |
| 947 | 857 | ||
| @@ -1252,6 +1162,11 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
| 1252 | .startup.single = smpboot_unpark_threads, | 1162 | .startup.single = smpboot_unpark_threads, |
| 1253 | .teardown.single = NULL, | 1163 | .teardown.single = NULL, |
| 1254 | }, | 1164 | }, |
| 1165 | [CPUHP_AP_IRQ_AFFINITY_ONLINE] = { | ||
| 1166 | .name = "irq/affinity:online", | ||
| 1167 | .startup.single = irq_affinity_online_cpu, | ||
| 1168 | .teardown.single = NULL, | ||
| 1169 | }, | ||
| 1255 | [CPUHP_AP_PERF_ONLINE] = { | 1170 | [CPUHP_AP_PERF_ONLINE] = { |
| 1256 | .name = "perf:online", | 1171 | .name = "perf:online", |
| 1257 | .startup.single = perf_event_init_cpu, | 1172 | .startup.single = perf_event_init_cpu, |
| @@ -1413,18 +1328,20 @@ static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, | |||
| 1413 | } | 1328 | } |
| 1414 | } | 1329 | } |
| 1415 | 1330 | ||
| 1416 | int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, | 1331 | int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state, |
| 1417 | bool invoke) | 1332 | struct hlist_node *node, |
| 1333 | bool invoke) | ||
| 1418 | { | 1334 | { |
| 1419 | struct cpuhp_step *sp; | 1335 | struct cpuhp_step *sp; |
| 1420 | int cpu; | 1336 | int cpu; |
| 1421 | int ret; | 1337 | int ret; |
| 1422 | 1338 | ||
| 1339 | lockdep_assert_cpus_held(); | ||
| 1340 | |||
| 1423 | sp = cpuhp_get_step(state); | 1341 | sp = cpuhp_get_step(state); |
| 1424 | if (sp->multi_instance == false) | 1342 | if (sp->multi_instance == false) |
| 1425 | return -EINVAL; | 1343 | return -EINVAL; |
| 1426 | 1344 | ||
| 1427 | get_online_cpus(); | ||
| 1428 | mutex_lock(&cpuhp_state_mutex); | 1345 | mutex_lock(&cpuhp_state_mutex); |
| 1429 | 1346 | ||
| 1430 | if (!invoke || !sp->startup.multi) | 1347 | if (!invoke || !sp->startup.multi) |
| @@ -1453,13 +1370,23 @@ add_node: | |||
| 1453 | hlist_add_head(node, &sp->list); | 1370 | hlist_add_head(node, &sp->list); |
| 1454 | unlock: | 1371 | unlock: |
| 1455 | mutex_unlock(&cpuhp_state_mutex); | 1372 | mutex_unlock(&cpuhp_state_mutex); |
| 1456 | put_online_cpus(); | 1373 | return ret; |
| 1374 | } | ||
| 1375 | |||
| 1376 | int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, | ||
| 1377 | bool invoke) | ||
| 1378 | { | ||
| 1379 | int ret; | ||
| 1380 | |||
| 1381 | cpus_read_lock(); | ||
| 1382 | ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke); | ||
| 1383 | cpus_read_unlock(); | ||
| 1457 | return ret; | 1384 | return ret; |
| 1458 | } | 1385 | } |
| 1459 | EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); | 1386 | EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); |
| 1460 | 1387 | ||
| 1461 | /** | 1388 | /** |
| 1462 | * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state | 1389 | * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state |
| 1463 | * @state: The state to setup | 1390 | * @state: The state to setup |
| 1464 | * @invoke: If true, the startup function is invoked for cpus where | 1391 | * @invoke: If true, the startup function is invoked for cpus where |
| 1465 | * cpu state >= @state | 1392 | * cpu state >= @state |
| @@ -1468,25 +1395,27 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); | |||
| 1468 | * @multi_instance: State is set up for multiple instances which get | 1395 | * @multi_instance: State is set up for multiple instances which get |
| 1469 | * added afterwards. | 1396 | * added afterwards. |
| 1470 | * | 1397 | * |
| 1398 | * The caller needs to hold cpus read locked while calling this function. | ||
| 1471 | * Returns: | 1399 | * Returns: |
| 1472 | * On success: | 1400 | * On success: |
| 1473 | * Positive state number if @state is CPUHP_AP_ONLINE_DYN | 1401 | * Positive state number if @state is CPUHP_AP_ONLINE_DYN |
| 1474 | * 0 for all other states | 1402 | * 0 for all other states |
| 1475 | * On failure: proper (negative) error code | 1403 | * On failure: proper (negative) error code |
| 1476 | */ | 1404 | */ |
| 1477 | int __cpuhp_setup_state(enum cpuhp_state state, | 1405 | int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, |
| 1478 | const char *name, bool invoke, | 1406 | const char *name, bool invoke, |
| 1479 | int (*startup)(unsigned int cpu), | 1407 | int (*startup)(unsigned int cpu), |
| 1480 | int (*teardown)(unsigned int cpu), | 1408 | int (*teardown)(unsigned int cpu), |
| 1481 | bool multi_instance) | 1409 | bool multi_instance) |
| 1482 | { | 1410 | { |
| 1483 | int cpu, ret = 0; | 1411 | int cpu, ret = 0; |
| 1484 | bool dynstate; | 1412 | bool dynstate; |
| 1485 | 1413 | ||
| 1414 | lockdep_assert_cpus_held(); | ||
| 1415 | |||
| 1486 | if (cpuhp_cb_check(state) || !name) | 1416 | if (cpuhp_cb_check(state) || !name) |
| 1487 | return -EINVAL; | 1417 | return -EINVAL; |
| 1488 | 1418 | ||
| 1489 | get_online_cpus(); | ||
| 1490 | mutex_lock(&cpuhp_state_mutex); | 1419 | mutex_lock(&cpuhp_state_mutex); |
| 1491 | 1420 | ||
| 1492 | ret = cpuhp_store_callbacks(state, name, startup, teardown, | 1421 | ret = cpuhp_store_callbacks(state, name, startup, teardown, |
| @@ -1522,7 +1451,6 @@ int __cpuhp_setup_state(enum cpuhp_state state, | |||
| 1522 | } | 1451 | } |
| 1523 | out: | 1452 | out: |
| 1524 | mutex_unlock(&cpuhp_state_mutex); | 1453 | mutex_unlock(&cpuhp_state_mutex); |
| 1525 | put_online_cpus(); | ||
| 1526 | /* | 1454 | /* |
| 1527 | * If the requested state is CPUHP_AP_ONLINE_DYN, return the | 1455 | * If the requested state is CPUHP_AP_ONLINE_DYN, return the |
| 1528 | * dynamically allocated state in case of success. | 1456 | * dynamically allocated state in case of success. |
| @@ -1531,6 +1459,22 @@ out: | |||
| 1531 | return state; | 1459 | return state; |
| 1532 | return ret; | 1460 | return ret; |
| 1533 | } | 1461 | } |
| 1462 | EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked); | ||
| 1463 | |||
| 1464 | int __cpuhp_setup_state(enum cpuhp_state state, | ||
| 1465 | const char *name, bool invoke, | ||
| 1466 | int (*startup)(unsigned int cpu), | ||
| 1467 | int (*teardown)(unsigned int cpu), | ||
| 1468 | bool multi_instance) | ||
| 1469 | { | ||
| 1470 | int ret; | ||
| 1471 | |||
| 1472 | cpus_read_lock(); | ||
| 1473 | ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup, | ||
| 1474 | teardown, multi_instance); | ||
| 1475 | cpus_read_unlock(); | ||
| 1476 | return ret; | ||
| 1477 | } | ||
| 1534 | EXPORT_SYMBOL(__cpuhp_setup_state); | 1478 | EXPORT_SYMBOL(__cpuhp_setup_state); |
| 1535 | 1479 | ||
| 1536 | int __cpuhp_state_remove_instance(enum cpuhp_state state, | 1480 | int __cpuhp_state_remove_instance(enum cpuhp_state state, |
| @@ -1544,7 +1488,7 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, | |||
| 1544 | if (!sp->multi_instance) | 1488 | if (!sp->multi_instance) |
| 1545 | return -EINVAL; | 1489 | return -EINVAL; |
| 1546 | 1490 | ||
| 1547 | get_online_cpus(); | 1491 | cpus_read_lock(); |
| 1548 | mutex_lock(&cpuhp_state_mutex); | 1492 | mutex_lock(&cpuhp_state_mutex); |
| 1549 | 1493 | ||
| 1550 | if (!invoke || !cpuhp_get_teardown_cb(state)) | 1494 | if (!invoke || !cpuhp_get_teardown_cb(state)) |
| @@ -1565,29 +1509,30 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state, | |||
| 1565 | remove: | 1509 | remove: |
| 1566 | hlist_del(node); | 1510 | hlist_del(node); |
| 1567 | mutex_unlock(&cpuhp_state_mutex); | 1511 | mutex_unlock(&cpuhp_state_mutex); |
| 1568 | put_online_cpus(); | 1512 | cpus_read_unlock(); |
| 1569 | 1513 | ||
| 1570 | return 0; | 1514 | return 0; |
| 1571 | } | 1515 | } |
| 1572 | EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); | 1516 | EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance); |
| 1573 | 1517 | ||
| 1574 | /** | 1518 | /** |
| 1575 | * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state | 1519 | * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state |
| 1576 | * @state: The state to remove | 1520 | * @state: The state to remove |
| 1577 | * @invoke: If true, the teardown function is invoked for cpus where | 1521 | * @invoke: If true, the teardown function is invoked for cpus where |
| 1578 | * cpu state >= @state | 1522 | * cpu state >= @state |
| 1579 | * | 1523 | * |
| 1524 | * The caller needs to hold cpus read locked while calling this function. | ||
| 1580 | * The teardown callback is currently not allowed to fail. Think | 1525 | * The teardown callback is currently not allowed to fail. Think |
| 1581 | * about module removal! | 1526 | * about module removal! |
| 1582 | */ | 1527 | */ |
| 1583 | void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) | 1528 | void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke) |
| 1584 | { | 1529 | { |
| 1585 | struct cpuhp_step *sp = cpuhp_get_step(state); | 1530 | struct cpuhp_step *sp = cpuhp_get_step(state); |
| 1586 | int cpu; | 1531 | int cpu; |
| 1587 | 1532 | ||
| 1588 | BUG_ON(cpuhp_cb_check(state)); | 1533 | BUG_ON(cpuhp_cb_check(state)); |
| 1589 | 1534 | ||
| 1590 | get_online_cpus(); | 1535 | lockdep_assert_cpus_held(); |
| 1591 | 1536 | ||
| 1592 | mutex_lock(&cpuhp_state_mutex); | 1537 | mutex_lock(&cpuhp_state_mutex); |
| 1593 | if (sp->multi_instance) { | 1538 | if (sp->multi_instance) { |
| @@ -1615,7 +1560,14 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) | |||
| 1615 | remove: | 1560 | remove: |
| 1616 | cpuhp_store_callbacks(state, NULL, NULL, NULL, false); | 1561 | cpuhp_store_callbacks(state, NULL, NULL, NULL, false); |
| 1617 | mutex_unlock(&cpuhp_state_mutex); | 1562 | mutex_unlock(&cpuhp_state_mutex); |
| 1618 | put_online_cpus(); | 1563 | } |
| 1564 | EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked); | ||
| 1565 | |||
| 1566 | void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) | ||
| 1567 | { | ||
| 1568 | cpus_read_lock(); | ||
| 1569 | __cpuhp_remove_state_cpuslocked(state, invoke); | ||
| 1570 | cpus_read_unlock(); | ||
| 1619 | } | 1571 | } |
| 1620 | EXPORT_SYMBOL(__cpuhp_remove_state); | 1572 | EXPORT_SYMBOL(__cpuhp_remove_state); |
| 1621 | 1573 | ||
| @@ -1658,13 +1610,13 @@ static ssize_t write_cpuhp_target(struct device *dev, | |||
| 1658 | ret = !sp->name || sp->cant_stop ? -EINVAL : 0; | 1610 | ret = !sp->name || sp->cant_stop ? -EINVAL : 0; |
| 1659 | mutex_unlock(&cpuhp_state_mutex); | 1611 | mutex_unlock(&cpuhp_state_mutex); |
| 1660 | if (ret) | 1612 | if (ret) |
| 1661 | return ret; | 1613 | goto out; |
| 1662 | 1614 | ||
| 1663 | if (st->state < target) | 1615 | if (st->state < target) |
| 1664 | ret = do_cpu_up(dev->id, target); | 1616 | ret = do_cpu_up(dev->id, target); |
| 1665 | else | 1617 | else |
| 1666 | ret = do_cpu_down(dev->id, target); | 1618 | ret = do_cpu_down(dev->id, target); |
| 1667 | 1619 | out: | |
| 1668 | unlock_device_hotplug(); | 1620 | unlock_device_hotplug(); |
| 1669 | return ret ? ret : count; | 1621 | return ret ? ret : count; |
| 1670 | } | 1622 | } |
| @@ -1684,7 +1636,7 @@ static struct attribute *cpuhp_cpu_attrs[] = { | |||
| 1684 | NULL | 1636 | NULL |
| 1685 | }; | 1637 | }; |
| 1686 | 1638 | ||
| 1687 | static struct attribute_group cpuhp_cpu_attr_group = { | 1639 | static const struct attribute_group cpuhp_cpu_attr_group = { |
| 1688 | .attrs = cpuhp_cpu_attrs, | 1640 | .attrs = cpuhp_cpu_attrs, |
| 1689 | .name = "hotplug", | 1641 | .name = "hotplug", |
| 1690 | NULL | 1642 | NULL |
| @@ -1716,7 +1668,7 @@ static struct attribute *cpuhp_cpu_root_attrs[] = { | |||
| 1716 | NULL | 1668 | NULL |
| 1717 | }; | 1669 | }; |
| 1718 | 1670 | ||
| 1719 | static struct attribute_group cpuhp_cpu_root_attr_group = { | 1671 | static const struct attribute_group cpuhp_cpu_root_attr_group = { |
| 1720 | .attrs = cpuhp_cpu_root_attrs, | 1672 | .attrs = cpuhp_cpu_root_attrs, |
| 1721 | .name = "hotplug", | 1673 | .name = "hotplug", |
| 1722 | NULL | 1674 | NULL |
diff --git a/kernel/crash_core.c b/kernel/crash_core.c index fcbd568f1e95..6db80fc0810b 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c | |||
| @@ -14,10 +14,12 @@ | |||
| 14 | #include <asm/sections.h> | 14 | #include <asm/sections.h> |
| 15 | 15 | ||
| 16 | /* vmcoreinfo stuff */ | 16 | /* vmcoreinfo stuff */ |
| 17 | static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; | 17 | static unsigned char *vmcoreinfo_data; |
| 18 | u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; | 18 | static size_t vmcoreinfo_size; |
| 19 | size_t vmcoreinfo_size; | 19 | u32 *vmcoreinfo_note; |
| 20 | size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); | 20 | |
| 21 | /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ | ||
| 22 | static unsigned char *vmcoreinfo_data_safecopy; | ||
| 21 | 23 | ||
| 22 | /* | 24 | /* |
| 23 | * parsing the "crashkernel" commandline | 25 | * parsing the "crashkernel" commandline |
| @@ -324,8 +326,23 @@ static void update_vmcoreinfo_note(void) | |||
| 324 | final_note(buf); | 326 | final_note(buf); |
| 325 | } | 327 | } |
| 326 | 328 | ||
| 329 | void crash_update_vmcoreinfo_safecopy(void *ptr) | ||
| 330 | { | ||
| 331 | if (ptr) | ||
| 332 | memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size); | ||
| 333 | |||
| 334 | vmcoreinfo_data_safecopy = ptr; | ||
| 335 | } | ||
| 336 | |||
| 327 | void crash_save_vmcoreinfo(void) | 337 | void crash_save_vmcoreinfo(void) |
| 328 | { | 338 | { |
| 339 | if (!vmcoreinfo_note) | ||
| 340 | return; | ||
| 341 | |||
| 342 | /* Use the safe copy to generate vmcoreinfo note if have */ | ||
| 343 | if (vmcoreinfo_data_safecopy) | ||
| 344 | vmcoreinfo_data = vmcoreinfo_data_safecopy; | ||
| 345 | |||
| 329 | vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); | 346 | vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); |
| 330 | update_vmcoreinfo_note(); | 347 | update_vmcoreinfo_note(); |
| 331 | } | 348 | } |
| @@ -340,7 +357,7 @@ void vmcoreinfo_append_str(const char *fmt, ...) | |||
| 340 | r = vscnprintf(buf, sizeof(buf), fmt, args); | 357 | r = vscnprintf(buf, sizeof(buf), fmt, args); |
| 341 | va_end(args); | 358 | va_end(args); |
| 342 | 359 | ||
| 343 | r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); | 360 | r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size); |
| 344 | 361 | ||
| 345 | memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); | 362 | memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); |
| 346 | 363 | ||
| @@ -356,11 +373,26 @@ void __weak arch_crash_save_vmcoreinfo(void) | |||
| 356 | 373 | ||
| 357 | phys_addr_t __weak paddr_vmcoreinfo_note(void) | 374 | phys_addr_t __weak paddr_vmcoreinfo_note(void) |
| 358 | { | 375 | { |
| 359 | return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); | 376 | return __pa(vmcoreinfo_note); |
| 360 | } | 377 | } |
| 361 | 378 | ||
| 362 | static int __init crash_save_vmcoreinfo_init(void) | 379 | static int __init crash_save_vmcoreinfo_init(void) |
| 363 | { | 380 | { |
| 381 | vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); | ||
| 382 | if (!vmcoreinfo_data) { | ||
| 383 | pr_warn("Memory allocation for vmcoreinfo_data failed\n"); | ||
| 384 | return -ENOMEM; | ||
| 385 | } | ||
| 386 | |||
| 387 | vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, | ||
| 388 | GFP_KERNEL | __GFP_ZERO); | ||
| 389 | if (!vmcoreinfo_note) { | ||
| 390 | free_page((unsigned long)vmcoreinfo_data); | ||
| 391 | vmcoreinfo_data = NULL; | ||
| 392 | pr_warn("Memory allocation for vmcoreinfo_note failed\n"); | ||
| 393 | return -ENOMEM; | ||
| 394 | } | ||
| 395 | |||
| 364 | VMCOREINFO_OSRELEASE(init_uts_ns.name.release); | 396 | VMCOREINFO_OSRELEASE(init_uts_ns.name.release); |
| 365 | VMCOREINFO_PAGESIZE(PAGE_SIZE); | 397 | VMCOREINFO_PAGESIZE(PAGE_SIZE); |
| 366 | 398 | ||
diff --git a/kernel/cred.c b/kernel/cred.c index 2bc66075740f..ecf03657e71c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
| @@ -1,4 +1,4 @@ | |||
| 1 | /* Task credentials management - see Documentation/security/credentials.txt | 1 | /* Task credentials management - see Documentation/security/credentials.rst |
| 2 | * | 2 | * |
| 3 | * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. | 3 | * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. |
| 4 | * Written by David Howells (dhowells@redhat.com) | 4 | * Written by David Howells (dhowells@redhat.com) |
diff --git a/kernel/events/core.c b/kernel/events/core.c index 6e75a5c9412d..426c2ffba16d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -389,6 +389,7 @@ static atomic_t nr_switch_events __read_mostly; | |||
| 389 | static LIST_HEAD(pmus); | 389 | static LIST_HEAD(pmus); |
| 390 | static DEFINE_MUTEX(pmus_lock); | 390 | static DEFINE_MUTEX(pmus_lock); |
| 391 | static struct srcu_struct pmus_srcu; | 391 | static struct srcu_struct pmus_srcu; |
| 392 | static cpumask_var_t perf_online_mask; | ||
| 392 | 393 | ||
| 393 | /* | 394 | /* |
| 394 | * perf event paranoia level: | 395 | * perf event paranoia level: |
| @@ -925,11 +926,6 @@ static inline int is_cgroup_event(struct perf_event *event) | |||
| 925 | return 0; | 926 | return 0; |
| 926 | } | 927 | } |
| 927 | 928 | ||
| 928 | static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) | ||
| 929 | { | ||
| 930 | return 0; | ||
| 931 | } | ||
| 932 | |||
| 933 | static inline void update_cgrp_time_from_event(struct perf_event *event) | 929 | static inline void update_cgrp_time_from_event(struct perf_event *event) |
| 934 | { | 930 | { |
| 935 | } | 931 | } |
| @@ -1456,6 +1452,13 @@ static enum event_type_t get_event_type(struct perf_event *event) | |||
| 1456 | 1452 | ||
| 1457 | lockdep_assert_held(&ctx->lock); | 1453 | lockdep_assert_held(&ctx->lock); |
| 1458 | 1454 | ||
| 1455 | /* | ||
| 1456 | * It's 'group type', really, because if our group leader is | ||
| 1457 | * pinned, so are we. | ||
| 1458 | */ | ||
| 1459 | if (event->group_leader != event) | ||
| 1460 | event = event->group_leader; | ||
| 1461 | |||
| 1459 | event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; | 1462 | event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; |
| 1460 | if (!ctx->task) | 1463 | if (!ctx->task) |
| 1461 | event_type |= EVENT_CPU; | 1464 | event_type |= EVENT_CPU; |
| @@ -3636,10 +3639,10 @@ static inline u64 perf_event_count(struct perf_event *event) | |||
| 3636 | * will not be local and we cannot read them atomically | 3639 | * will not be local and we cannot read them atomically |
| 3637 | * - must not have a pmu::count method | 3640 | * - must not have a pmu::count method |
| 3638 | */ | 3641 | */ |
| 3639 | u64 perf_event_read_local(struct perf_event *event) | 3642 | int perf_event_read_local(struct perf_event *event, u64 *value) |
| 3640 | { | 3643 | { |
| 3641 | unsigned long flags; | 3644 | unsigned long flags; |
| 3642 | u64 val; | 3645 | int ret = 0; |
| 3643 | 3646 | ||
| 3644 | /* | 3647 | /* |
| 3645 | * Disabling interrupts avoids all counter scheduling (context | 3648 | * Disabling interrupts avoids all counter scheduling (context |
| @@ -3647,25 +3650,37 @@ u64 perf_event_read_local(struct perf_event *event) | |||
| 3647 | */ | 3650 | */ |
| 3648 | local_irq_save(flags); | 3651 | local_irq_save(flags); |
| 3649 | 3652 | ||
| 3650 | /* If this is a per-task event, it must be for current */ | ||
| 3651 | WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) && | ||
| 3652 | event->hw.target != current); | ||
| 3653 | |||
| 3654 | /* If this is a per-CPU event, it must be for this CPU */ | ||
| 3655 | WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) && | ||
| 3656 | event->cpu != smp_processor_id()); | ||
| 3657 | |||
| 3658 | /* | 3653 | /* |
| 3659 | * It must not be an event with inherit set, we cannot read | 3654 | * It must not be an event with inherit set, we cannot read |
| 3660 | * all child counters from atomic context. | 3655 | * all child counters from atomic context. |
| 3661 | */ | 3656 | */ |
| 3662 | WARN_ON_ONCE(event->attr.inherit); | 3657 | if (event->attr.inherit) { |
| 3658 | ret = -EOPNOTSUPP; | ||
| 3659 | goto out; | ||
| 3660 | } | ||
| 3663 | 3661 | ||
| 3664 | /* | 3662 | /* |
| 3665 | * It must not have a pmu::count method, those are not | 3663 | * It must not have a pmu::count method, those are not |
| 3666 | * NMI safe. | 3664 | * NMI safe. |
| 3667 | */ | 3665 | */ |
| 3668 | WARN_ON_ONCE(event->pmu->count); | 3666 | if (event->pmu->count) { |
| 3667 | ret = -EOPNOTSUPP; | ||
| 3668 | goto out; | ||
| 3669 | } | ||
| 3670 | |||
| 3671 | /* If this is a per-task event, it must be for current */ | ||
| 3672 | if ((event->attach_state & PERF_ATTACH_TASK) && | ||
| 3673 | event->hw.target != current) { | ||
| 3674 | ret = -EINVAL; | ||
| 3675 | goto out; | ||
| 3676 | } | ||
| 3677 | |||
| 3678 | /* If this is a per-CPU event, it must be for this CPU */ | ||
| 3679 | if (!(event->attach_state & PERF_ATTACH_TASK) && | ||
| 3680 | event->cpu != smp_processor_id()) { | ||
| 3681 | ret = -EINVAL; | ||
| 3682 | goto out; | ||
| 3683 | } | ||
| 3669 | 3684 | ||
| 3670 | /* | 3685 | /* |
| 3671 | * If the event is currently on this CPU, its either a per-task event, | 3686 | * If the event is currently on this CPU, its either a per-task event, |
| @@ -3675,10 +3690,11 @@ u64 perf_event_read_local(struct perf_event *event) | |||
| 3675 | if (event->oncpu == smp_processor_id()) | 3690 | if (event->oncpu == smp_processor_id()) |
| 3676 | event->pmu->read(event); | 3691 | event->pmu->read(event); |
| 3677 | 3692 | ||
| 3678 | val = local64_read(&event->count); | 3693 | *value = local64_read(&event->count); |
| 3694 | out: | ||
| 3679 | local_irq_restore(flags); | 3695 | local_irq_restore(flags); |
| 3680 | 3696 | ||
| 3681 | return val; | 3697 | return ret; |
| 3682 | } | 3698 | } |
| 3683 | 3699 | ||
| 3684 | static int perf_event_read(struct perf_event *event, bool group) | 3700 | static int perf_event_read(struct perf_event *event, bool group) |
| @@ -3812,14 +3828,6 @@ find_get_context(struct pmu *pmu, struct task_struct *task, | |||
| 3812 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) | 3828 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) |
| 3813 | return ERR_PTR(-EACCES); | 3829 | return ERR_PTR(-EACCES); |
| 3814 | 3830 | ||
| 3815 | /* | ||
| 3816 | * We could be clever and allow to attach a event to an | ||
| 3817 | * offline CPU and activate it when the CPU comes up, but | ||
| 3818 | * that's for later. | ||
| 3819 | */ | ||
| 3820 | if (!cpu_online(cpu)) | ||
| 3821 | return ERR_PTR(-ENODEV); | ||
| 3822 | |||
| 3823 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | 3831 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); |
| 3824 | ctx = &cpuctx->ctx; | 3832 | ctx = &cpuctx->ctx; |
| 3825 | get_ctx(ctx); | 3833 | get_ctx(ctx); |
| @@ -4377,7 +4385,9 @@ EXPORT_SYMBOL_GPL(perf_event_read_value); | |||
| 4377 | static int __perf_read_group_add(struct perf_event *leader, | 4385 | static int __perf_read_group_add(struct perf_event *leader, |
| 4378 | u64 read_format, u64 *values) | 4386 | u64 read_format, u64 *values) |
| 4379 | { | 4387 | { |
| 4388 | struct perf_event_context *ctx = leader->ctx; | ||
| 4380 | struct perf_event *sub; | 4389 | struct perf_event *sub; |
| 4390 | unsigned long flags; | ||
| 4381 | int n = 1; /* skip @nr */ | 4391 | int n = 1; /* skip @nr */ |
| 4382 | int ret; | 4392 | int ret; |
| 4383 | 4393 | ||
| @@ -4407,12 +4417,15 @@ static int __perf_read_group_add(struct perf_event *leader, | |||
| 4407 | if (read_format & PERF_FORMAT_ID) | 4417 | if (read_format & PERF_FORMAT_ID) |
| 4408 | values[n++] = primary_event_id(leader); | 4418 | values[n++] = primary_event_id(leader); |
| 4409 | 4419 | ||
| 4420 | raw_spin_lock_irqsave(&ctx->lock, flags); | ||
| 4421 | |||
| 4410 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { | 4422 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { |
| 4411 | values[n++] += perf_event_count(sub); | 4423 | values[n++] += perf_event_count(sub); |
| 4412 | if (read_format & PERF_FORMAT_ID) | 4424 | if (read_format & PERF_FORMAT_ID) |
| 4413 | values[n++] = primary_event_id(sub); | 4425 | values[n++] = primary_event_id(sub); |
| 4414 | } | 4426 | } |
| 4415 | 4427 | ||
| 4428 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | ||
| 4416 | return 0; | 4429 | return 0; |
| 4417 | } | 4430 | } |
| 4418 | 4431 | ||
| @@ -5729,9 +5742,6 @@ static void perf_output_read_one(struct perf_output_handle *handle, | |||
| 5729 | __output_copy(handle, values, n * sizeof(u64)); | 5742 | __output_copy(handle, values, n * sizeof(u64)); |
| 5730 | } | 5743 | } |
| 5731 | 5744 | ||
| 5732 | /* | ||
| 5733 | * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. | ||
| 5734 | */ | ||
| 5735 | static void perf_output_read_group(struct perf_output_handle *handle, | 5745 | static void perf_output_read_group(struct perf_output_handle *handle, |
| 5736 | struct perf_event *event, | 5746 | struct perf_event *event, |
| 5737 | u64 enabled, u64 running) | 5747 | u64 enabled, u64 running) |
| @@ -5776,6 +5786,13 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
| 5776 | #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ | 5786 | #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ |
| 5777 | PERF_FORMAT_TOTAL_TIME_RUNNING) | 5787 | PERF_FORMAT_TOTAL_TIME_RUNNING) |
| 5778 | 5788 | ||
| 5789 | /* | ||
| 5790 | * XXX PERF_SAMPLE_READ vs inherited events seems difficult. | ||
| 5791 | * | ||
| 5792 | * The problem is that its both hard and excessively expensive to iterate the | ||
| 5793 | * child list, not to mention that its impossible to IPI the children running | ||
| 5794 | * on another CPU, from interrupt/NMI context. | ||
| 5795 | */ | ||
| 5779 | static void perf_output_read(struct perf_output_handle *handle, | 5796 | static void perf_output_read(struct perf_output_handle *handle, |
| 5780 | struct perf_event *event) | 5797 | struct perf_event *event) |
| 5781 | { | 5798 | { |
| @@ -7703,7 +7720,8 @@ static int swevent_hlist_get_cpu(int cpu) | |||
| 7703 | int err = 0; | 7720 | int err = 0; |
| 7704 | 7721 | ||
| 7705 | mutex_lock(&swhash->hlist_mutex); | 7722 | mutex_lock(&swhash->hlist_mutex); |
| 7706 | if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { | 7723 | if (!swevent_hlist_deref(swhash) && |
| 7724 | cpumask_test_cpu(cpu, perf_online_mask)) { | ||
| 7707 | struct swevent_hlist *hlist; | 7725 | struct swevent_hlist *hlist; |
| 7708 | 7726 | ||
| 7709 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); | 7727 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); |
| @@ -7724,7 +7742,7 @@ static int swevent_hlist_get(void) | |||
| 7724 | { | 7742 | { |
| 7725 | int err, cpu, failed_cpu; | 7743 | int err, cpu, failed_cpu; |
| 7726 | 7744 | ||
| 7727 | get_online_cpus(); | 7745 | mutex_lock(&pmus_lock); |
| 7728 | for_each_possible_cpu(cpu) { | 7746 | for_each_possible_cpu(cpu) { |
| 7729 | err = swevent_hlist_get_cpu(cpu); | 7747 | err = swevent_hlist_get_cpu(cpu); |
| 7730 | if (err) { | 7748 | if (err) { |
| @@ -7732,8 +7750,7 @@ static int swevent_hlist_get(void) | |||
| 7732 | goto fail; | 7750 | goto fail; |
| 7733 | } | 7751 | } |
| 7734 | } | 7752 | } |
| 7735 | put_online_cpus(); | 7753 | mutex_unlock(&pmus_lock); |
| 7736 | |||
| 7737 | return 0; | 7754 | return 0; |
| 7738 | fail: | 7755 | fail: |
| 7739 | for_each_possible_cpu(cpu) { | 7756 | for_each_possible_cpu(cpu) { |
| @@ -7741,8 +7758,7 @@ fail: | |||
| 7741 | break; | 7758 | break; |
| 7742 | swevent_hlist_put_cpu(cpu); | 7759 | swevent_hlist_put_cpu(cpu); |
| 7743 | } | 7760 | } |
| 7744 | 7761 | mutex_unlock(&pmus_lock); | |
| 7745 | put_online_cpus(); | ||
| 7746 | return err; | 7762 | return err; |
| 7747 | } | 7763 | } |
| 7748 | 7764 | ||
| @@ -8037,12 +8053,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) | |||
| 8037 | bool is_kprobe, is_tracepoint; | 8053 | bool is_kprobe, is_tracepoint; |
| 8038 | struct bpf_prog *prog; | 8054 | struct bpf_prog *prog; |
| 8039 | 8055 | ||
| 8040 | if (event->attr.type == PERF_TYPE_HARDWARE || | ||
| 8041 | event->attr.type == PERF_TYPE_SOFTWARE) | ||
| 8042 | return perf_event_set_bpf_handler(event, prog_fd); | ||
| 8043 | |||
| 8044 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | 8056 | if (event->attr.type != PERF_TYPE_TRACEPOINT) |
| 8045 | return -EINVAL; | 8057 | return perf_event_set_bpf_handler(event, prog_fd); |
| 8046 | 8058 | ||
| 8047 | if (event->tp_event->prog) | 8059 | if (event->tp_event->prog) |
| 8048 | return -EEXIST; | 8060 | return -EEXIST; |
| @@ -8920,7 +8932,7 @@ perf_event_mux_interval_ms_store(struct device *dev, | |||
| 8920 | pmu->hrtimer_interval_ms = timer; | 8932 | pmu->hrtimer_interval_ms = timer; |
| 8921 | 8933 | ||
| 8922 | /* update all cpuctx for this PMU */ | 8934 | /* update all cpuctx for this PMU */ |
| 8923 | get_online_cpus(); | 8935 | cpus_read_lock(); |
| 8924 | for_each_online_cpu(cpu) { | 8936 | for_each_online_cpu(cpu) { |
| 8925 | struct perf_cpu_context *cpuctx; | 8937 | struct perf_cpu_context *cpuctx; |
| 8926 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | 8938 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); |
| @@ -8929,7 +8941,7 @@ perf_event_mux_interval_ms_store(struct device *dev, | |||
| 8929 | cpu_function_call(cpu, | 8941 | cpu_function_call(cpu, |
| 8930 | (remote_function_f)perf_mux_hrtimer_restart, cpuctx); | 8942 | (remote_function_f)perf_mux_hrtimer_restart, cpuctx); |
| 8931 | } | 8943 | } |
| 8932 | put_online_cpus(); | 8944 | cpus_read_unlock(); |
| 8933 | mutex_unlock(&mux_interval_mutex); | 8945 | mutex_unlock(&mux_interval_mutex); |
| 8934 | 8946 | ||
| 8935 | return count; | 8947 | return count; |
| @@ -9059,6 +9071,7 @@ skip_type: | |||
| 9059 | lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); | 9071 | lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); |
| 9060 | lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); | 9072 | lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); |
| 9061 | cpuctx->ctx.pmu = pmu; | 9073 | cpuctx->ctx.pmu = pmu; |
| 9074 | cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); | ||
| 9062 | 9075 | ||
| 9063 | __perf_mux_hrtimer_init(cpuctx, cpu); | 9076 | __perf_mux_hrtimer_init(cpuctx, cpu); |
| 9064 | } | 9077 | } |
| @@ -9172,7 +9185,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) | |||
| 9172 | 9185 | ||
| 9173 | static struct pmu *perf_init_event(struct perf_event *event) | 9186 | static struct pmu *perf_init_event(struct perf_event *event) |
| 9174 | { | 9187 | { |
| 9175 | struct pmu *pmu = NULL; | 9188 | struct pmu *pmu; |
| 9176 | int idx; | 9189 | int idx; |
| 9177 | int ret; | 9190 | int ret; |
| 9178 | 9191 | ||
| @@ -9441,9 +9454,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 9441 | local64_set(&hwc->period_left, hwc->sample_period); | 9454 | local64_set(&hwc->period_left, hwc->sample_period); |
| 9442 | 9455 | ||
| 9443 | /* | 9456 | /* |
| 9444 | * we currently do not support PERF_FORMAT_GROUP on inherited events | 9457 | * We currently do not support PERF_SAMPLE_READ on inherited events. |
| 9458 | * See perf_output_read(). | ||
| 9445 | */ | 9459 | */ |
| 9446 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) | 9460 | if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) |
| 9447 | goto err_ns; | 9461 | goto err_ns; |
| 9448 | 9462 | ||
| 9449 | if (!has_branch_stack(event)) | 9463 | if (!has_branch_stack(event)) |
| @@ -9456,9 +9470,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 9456 | } | 9470 | } |
| 9457 | 9471 | ||
| 9458 | pmu = perf_init_event(event); | 9472 | pmu = perf_init_event(event); |
| 9459 | if (!pmu) | 9473 | if (IS_ERR(pmu)) { |
| 9460 | goto err_ns; | ||
| 9461 | else if (IS_ERR(pmu)) { | ||
| 9462 | err = PTR_ERR(pmu); | 9474 | err = PTR_ERR(pmu); |
| 9463 | goto err_ns; | 9475 | goto err_ns; |
| 9464 | } | 9476 | } |
| @@ -9471,8 +9483,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 9471 | event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, | 9483 | event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, |
| 9472 | sizeof(unsigned long), | 9484 | sizeof(unsigned long), |
| 9473 | GFP_KERNEL); | 9485 | GFP_KERNEL); |
| 9474 | if (!event->addr_filters_offs) | 9486 | if (!event->addr_filters_offs) { |
| 9487 | err = -ENOMEM; | ||
| 9475 | goto err_per_task; | 9488 | goto err_per_task; |
| 9489 | } | ||
| 9476 | 9490 | ||
| 9477 | /* force hw sync on the address filters */ | 9491 | /* force hw sync on the address filters */ |
| 9478 | event->addr_filters_gen = 1; | 9492 | event->addr_filters_gen = 1; |
| @@ -9882,12 +9896,10 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 9882 | goto err_task; | 9896 | goto err_task; |
| 9883 | } | 9897 | } |
| 9884 | 9898 | ||
| 9885 | get_online_cpus(); | ||
| 9886 | |||
| 9887 | if (task) { | 9899 | if (task) { |
| 9888 | err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); | 9900 | err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); |
| 9889 | if (err) | 9901 | if (err) |
| 9890 | goto err_cpus; | 9902 | goto err_task; |
| 9891 | 9903 | ||
| 9892 | /* | 9904 | /* |
| 9893 | * Reuse ptrace permission checks for now. | 9905 | * Reuse ptrace permission checks for now. |
| @@ -10073,6 +10085,23 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 10073 | goto err_locked; | 10085 | goto err_locked; |
| 10074 | } | 10086 | } |
| 10075 | 10087 | ||
| 10088 | if (!task) { | ||
| 10089 | /* | ||
| 10090 | * Check if the @cpu we're creating an event for is online. | ||
| 10091 | * | ||
| 10092 | * We use the perf_cpu_context::ctx::mutex to serialize against | ||
| 10093 | * the hotplug notifiers. See perf_event_{init,exit}_cpu(). | ||
| 10094 | */ | ||
| 10095 | struct perf_cpu_context *cpuctx = | ||
| 10096 | container_of(ctx, struct perf_cpu_context, ctx); | ||
| 10097 | |||
| 10098 | if (!cpuctx->online) { | ||
| 10099 | err = -ENODEV; | ||
| 10100 | goto err_locked; | ||
| 10101 | } | ||
| 10102 | } | ||
| 10103 | |||
| 10104 | |||
| 10076 | /* | 10105 | /* |
| 10077 | * Must be under the same ctx::mutex as perf_install_in_context(), | 10106 | * Must be under the same ctx::mutex as perf_install_in_context(), |
| 10078 | * because we need to serialize with concurrent event creation. | 10107 | * because we need to serialize with concurrent event creation. |
| @@ -10162,8 +10191,6 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 10162 | put_task_struct(task); | 10191 | put_task_struct(task); |
| 10163 | } | 10192 | } |
| 10164 | 10193 | ||
| 10165 | put_online_cpus(); | ||
| 10166 | |||
| 10167 | mutex_lock(¤t->perf_event_mutex); | 10194 | mutex_lock(¤t->perf_event_mutex); |
| 10168 | list_add_tail(&event->owner_entry, ¤t->perf_event_list); | 10195 | list_add_tail(&event->owner_entry, ¤t->perf_event_list); |
| 10169 | mutex_unlock(¤t->perf_event_mutex); | 10196 | mutex_unlock(¤t->perf_event_mutex); |
| @@ -10197,8 +10224,6 @@ err_alloc: | |||
| 10197 | err_cred: | 10224 | err_cred: |
| 10198 | if (task) | 10225 | if (task) |
| 10199 | mutex_unlock(&task->signal->cred_guard_mutex); | 10226 | mutex_unlock(&task->signal->cred_guard_mutex); |
| 10200 | err_cpus: | ||
| 10201 | put_online_cpus(); | ||
| 10202 | err_task: | 10227 | err_task: |
| 10203 | if (task) | 10228 | if (task) |
| 10204 | put_task_struct(task); | 10229 | put_task_struct(task); |
| @@ -10253,6 +10278,21 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
| 10253 | goto err_unlock; | 10278 | goto err_unlock; |
| 10254 | } | 10279 | } |
| 10255 | 10280 | ||
| 10281 | if (!task) { | ||
| 10282 | /* | ||
| 10283 | * Check if the @cpu we're creating an event for is online. | ||
| 10284 | * | ||
| 10285 | * We use the perf_cpu_context::ctx::mutex to serialize against | ||
| 10286 | * the hotplug notifiers. See perf_event_{init,exit}_cpu(). | ||
| 10287 | */ | ||
| 10288 | struct perf_cpu_context *cpuctx = | ||
| 10289 | container_of(ctx, struct perf_cpu_context, ctx); | ||
| 10290 | if (!cpuctx->online) { | ||
| 10291 | err = -ENODEV; | ||
| 10292 | goto err_unlock; | ||
| 10293 | } | ||
| 10294 | } | ||
| 10295 | |||
| 10256 | if (!exclusive_event_installable(event, ctx)) { | 10296 | if (!exclusive_event_installable(event, ctx)) { |
| 10257 | err = -EBUSY; | 10297 | err = -EBUSY; |
| 10258 | goto err_unlock; | 10298 | goto err_unlock; |
| @@ -10920,6 +10960,8 @@ static void __init perf_event_init_all_cpus(void) | |||
| 10920 | struct swevent_htable *swhash; | 10960 | struct swevent_htable *swhash; |
| 10921 | int cpu; | 10961 | int cpu; |
| 10922 | 10962 | ||
| 10963 | zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); | ||
| 10964 | |||
| 10923 | for_each_possible_cpu(cpu) { | 10965 | for_each_possible_cpu(cpu) { |
| 10924 | swhash = &per_cpu(swevent_htable, cpu); | 10966 | swhash = &per_cpu(swevent_htable, cpu); |
| 10925 | mutex_init(&swhash->hlist_mutex); | 10967 | mutex_init(&swhash->hlist_mutex); |
| @@ -10935,7 +10977,7 @@ static void __init perf_event_init_all_cpus(void) | |||
| 10935 | } | 10977 | } |
| 10936 | } | 10978 | } |
| 10937 | 10979 | ||
| 10938 | int perf_event_init_cpu(unsigned int cpu) | 10980 | void perf_swevent_init_cpu(unsigned int cpu) |
| 10939 | { | 10981 | { |
| 10940 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); | 10982 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
| 10941 | 10983 | ||
| @@ -10948,7 +10990,6 @@ int perf_event_init_cpu(unsigned int cpu) | |||
| 10948 | rcu_assign_pointer(swhash->swevent_hlist, hlist); | 10990 | rcu_assign_pointer(swhash->swevent_hlist, hlist); |
| 10949 | } | 10991 | } |
| 10950 | mutex_unlock(&swhash->hlist_mutex); | 10992 | mutex_unlock(&swhash->hlist_mutex); |
| 10951 | return 0; | ||
| 10952 | } | 10993 | } |
| 10953 | 10994 | ||
| 10954 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE | 10995 | #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE |
| @@ -10966,19 +11007,22 @@ static void __perf_event_exit_context(void *__info) | |||
| 10966 | 11007 | ||
| 10967 | static void perf_event_exit_cpu_context(int cpu) | 11008 | static void perf_event_exit_cpu_context(int cpu) |
| 10968 | { | 11009 | { |
| 11010 | struct perf_cpu_context *cpuctx; | ||
| 10969 | struct perf_event_context *ctx; | 11011 | struct perf_event_context *ctx; |
| 10970 | struct pmu *pmu; | 11012 | struct pmu *pmu; |
| 10971 | int idx; | ||
| 10972 | 11013 | ||
| 10973 | idx = srcu_read_lock(&pmus_srcu); | 11014 | mutex_lock(&pmus_lock); |
| 10974 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 11015 | list_for_each_entry(pmu, &pmus, entry) { |
| 10975 | ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; | 11016 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); |
| 11017 | ctx = &cpuctx->ctx; | ||
| 10976 | 11018 | ||
| 10977 | mutex_lock(&ctx->mutex); | 11019 | mutex_lock(&ctx->mutex); |
| 10978 | smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); | 11020 | smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); |
| 11021 | cpuctx->online = 0; | ||
| 10979 | mutex_unlock(&ctx->mutex); | 11022 | mutex_unlock(&ctx->mutex); |
| 10980 | } | 11023 | } |
| 10981 | srcu_read_unlock(&pmus_srcu, idx); | 11024 | cpumask_clear_cpu(cpu, perf_online_mask); |
| 11025 | mutex_unlock(&pmus_lock); | ||
| 10982 | } | 11026 | } |
| 10983 | #else | 11027 | #else |
| 10984 | 11028 | ||
| @@ -10986,6 +11030,29 @@ static void perf_event_exit_cpu_context(int cpu) { } | |||
| 10986 | 11030 | ||
| 10987 | #endif | 11031 | #endif |
| 10988 | 11032 | ||
| 11033 | int perf_event_init_cpu(unsigned int cpu) | ||
| 11034 | { | ||
| 11035 | struct perf_cpu_context *cpuctx; | ||
| 11036 | struct perf_event_context *ctx; | ||
| 11037 | struct pmu *pmu; | ||
| 11038 | |||
| 11039 | perf_swevent_init_cpu(cpu); | ||
| 11040 | |||
| 11041 | mutex_lock(&pmus_lock); | ||
| 11042 | cpumask_set_cpu(cpu, perf_online_mask); | ||
| 11043 | list_for_each_entry(pmu, &pmus, entry) { | ||
| 11044 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
| 11045 | ctx = &cpuctx->ctx; | ||
| 11046 | |||
| 11047 | mutex_lock(&ctx->mutex); | ||
| 11048 | cpuctx->online = 1; | ||
| 11049 | mutex_unlock(&ctx->mutex); | ||
| 11050 | } | ||
| 11051 | mutex_unlock(&pmus_lock); | ||
| 11052 | |||
| 11053 | return 0; | ||
| 11054 | } | ||
| 11055 | |||
| 10989 | int perf_event_exit_cpu(unsigned int cpu) | 11056 | int perf_event_exit_cpu(unsigned int cpu) |
| 10990 | { | 11057 | { |
| 10991 | perf_event_exit_cpu_context(cpu); | 11058 | perf_event_exit_cpu_context(cpu); |
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 2831480c63a2..ee97196bb151 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
| @@ -580,7 +580,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, | |||
| 580 | int ret = -ENOMEM, max_order = 0; | 580 | int ret = -ENOMEM, max_order = 0; |
| 581 | 581 | ||
| 582 | if (!has_aux(event)) | 582 | if (!has_aux(event)) |
| 583 | return -ENOTSUPP; | 583 | return -EOPNOTSUPP; |
| 584 | 584 | ||
| 585 | if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) { | 585 | if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) { |
| 586 | /* | 586 | /* |
diff --git a/kernel/exit.c b/kernel/exit.c index 516acdb0e0ec..c5548faa9f37 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -51,7 +51,6 @@ | |||
| 51 | #include <linux/task_io_accounting_ops.h> | 51 | #include <linux/task_io_accounting_ops.h> |
| 52 | #include <linux/tracehook.h> | 52 | #include <linux/tracehook.h> |
| 53 | #include <linux/fs_struct.h> | 53 | #include <linux/fs_struct.h> |
| 54 | #include <linux/userfaultfd_k.h> | ||
| 55 | #include <linux/init_task.h> | 54 | #include <linux/init_task.h> |
| 56 | #include <linux/perf_event.h> | 55 | #include <linux/perf_event.h> |
| 57 | #include <trace/events/sched.h> | 56 | #include <trace/events/sched.h> |
| @@ -62,6 +61,7 @@ | |||
| 62 | #include <linux/kcov.h> | 61 | #include <linux/kcov.h> |
| 63 | #include <linux/random.h> | 62 | #include <linux/random.h> |
| 64 | #include <linux/rcuwait.h> | 63 | #include <linux/rcuwait.h> |
| 64 | #include <linux/compat.h> | ||
| 65 | 65 | ||
| 66 | #include <linux/uaccess.h> | 66 | #include <linux/uaccess.h> |
| 67 | #include <asm/unistd.h> | 67 | #include <asm/unistd.h> |
| @@ -318,19 +318,6 @@ void rcuwait_wake_up(struct rcuwait *w) | |||
| 318 | rcu_read_unlock(); | 318 | rcu_read_unlock(); |
| 319 | } | 319 | } |
| 320 | 320 | ||
| 321 | struct task_struct *try_get_task_struct(struct task_struct **ptask) | ||
| 322 | { | ||
| 323 | struct task_struct *task; | ||
| 324 | |||
| 325 | rcu_read_lock(); | ||
| 326 | task = task_rcu_dereference(ptask); | ||
| 327 | if (task) | ||
| 328 | get_task_struct(task); | ||
| 329 | rcu_read_unlock(); | ||
| 330 | |||
| 331 | return task; | ||
| 332 | } | ||
| 333 | |||
| 334 | /* | 321 | /* |
| 335 | * Determine if a process group is "orphaned", according to the POSIX | 322 | * Determine if a process group is "orphaned", according to the POSIX |
| 336 | * definition in 2.2.2.52. Orphaned process groups are not to be affected | 323 | * definition in 2.2.2.52. Orphaned process groups are not to be affected |
| @@ -995,16 +982,23 @@ SYSCALL_DEFINE1(exit_group, int, error_code) | |||
| 995 | return 0; | 982 | return 0; |
| 996 | } | 983 | } |
| 997 | 984 | ||
| 985 | struct waitid_info { | ||
| 986 | pid_t pid; | ||
| 987 | uid_t uid; | ||
| 988 | int status; | ||
| 989 | int cause; | ||
| 990 | }; | ||
| 991 | |||
| 998 | struct wait_opts { | 992 | struct wait_opts { |
| 999 | enum pid_type wo_type; | 993 | enum pid_type wo_type; |
| 1000 | int wo_flags; | 994 | int wo_flags; |
| 1001 | struct pid *wo_pid; | 995 | struct pid *wo_pid; |
| 1002 | 996 | ||
| 1003 | struct siginfo __user *wo_info; | 997 | struct waitid_info *wo_info; |
| 1004 | int __user *wo_stat; | 998 | int wo_stat; |
| 1005 | struct rusage __user *wo_rusage; | 999 | struct rusage *wo_rusage; |
| 1006 | 1000 | ||
| 1007 | wait_queue_t child_wait; | 1001 | wait_queue_entry_t child_wait; |
| 1008 | int notask_error; | 1002 | int notask_error; |
| 1009 | }; | 1003 | }; |
| 1010 | 1004 | ||
| @@ -1049,34 +1043,6 @@ eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) | |||
| 1049 | return 1; | 1043 | return 1; |
| 1050 | } | 1044 | } |
| 1051 | 1045 | ||
| 1052 | static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, | ||
| 1053 | pid_t pid, uid_t uid, int why, int status) | ||
| 1054 | { | ||
| 1055 | struct siginfo __user *infop; | ||
| 1056 | int retval = wo->wo_rusage | ||
| 1057 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | ||
| 1058 | |||
| 1059 | put_task_struct(p); | ||
| 1060 | infop = wo->wo_info; | ||
| 1061 | if (infop) { | ||
| 1062 | if (!retval) | ||
| 1063 | retval = put_user(SIGCHLD, &infop->si_signo); | ||
| 1064 | if (!retval) | ||
| 1065 | retval = put_user(0, &infop->si_errno); | ||
| 1066 | if (!retval) | ||
| 1067 | retval = put_user((short)why, &infop->si_code); | ||
| 1068 | if (!retval) | ||
| 1069 | retval = put_user(pid, &infop->si_pid); | ||
| 1070 | if (!retval) | ||
| 1071 | retval = put_user(uid, &infop->si_uid); | ||
| 1072 | if (!retval) | ||
| 1073 | retval = put_user(status, &infop->si_status); | ||
| 1074 | } | ||
| 1075 | if (!retval) | ||
| 1076 | retval = pid; | ||
| 1077 | return retval; | ||
| 1078 | } | ||
| 1079 | |||
| 1080 | /* | 1046 | /* |
| 1081 | * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold | 1047 | * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold |
| 1082 | * read_lock(&tasklist_lock) on entry. If we return zero, we still hold | 1048 | * read_lock(&tasklist_lock) on entry. If we return zero, we still hold |
| @@ -1085,30 +1051,23 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, | |||
| 1085 | */ | 1051 | */ |
| 1086 | static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | 1052 | static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) |
| 1087 | { | 1053 | { |
| 1088 | int state, retval, status; | 1054 | int state, status; |
| 1089 | pid_t pid = task_pid_vnr(p); | 1055 | pid_t pid = task_pid_vnr(p); |
| 1090 | uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); | 1056 | uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
| 1091 | struct siginfo __user *infop; | 1057 | struct waitid_info *infop; |
| 1092 | 1058 | ||
| 1093 | if (!likely(wo->wo_flags & WEXITED)) | 1059 | if (!likely(wo->wo_flags & WEXITED)) |
| 1094 | return 0; | 1060 | return 0; |
| 1095 | 1061 | ||
| 1096 | if (unlikely(wo->wo_flags & WNOWAIT)) { | 1062 | if (unlikely(wo->wo_flags & WNOWAIT)) { |
| 1097 | int exit_code = p->exit_code; | 1063 | status = p->exit_code; |
| 1098 | int why; | ||
| 1099 | |||
| 1100 | get_task_struct(p); | 1064 | get_task_struct(p); |
| 1101 | read_unlock(&tasklist_lock); | 1065 | read_unlock(&tasklist_lock); |
| 1102 | sched_annotate_sleep(); | 1066 | sched_annotate_sleep(); |
| 1103 | 1067 | if (wo->wo_rusage) | |
| 1104 | if ((exit_code & 0x7f) == 0) { | 1068 | getrusage(p, RUSAGE_BOTH, wo->wo_rusage); |
| 1105 | why = CLD_EXITED; | 1069 | put_task_struct(p); |
| 1106 | status = exit_code >> 8; | 1070 | goto out_info; |
| 1107 | } else { | ||
| 1108 | why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; | ||
| 1109 | status = exit_code & 0x7f; | ||
| 1110 | } | ||
| 1111 | return wait_noreap_copyout(wo, p, pid, uid, why, status); | ||
| 1112 | } | 1071 | } |
| 1113 | /* | 1072 | /* |
| 1114 | * Move the task's state to DEAD/TRACE, only one thread can do this. | 1073 | * Move the task's state to DEAD/TRACE, only one thread can do this. |
| @@ -1181,38 +1140,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1181 | spin_unlock_irq(¤t->sighand->siglock); | 1140 | spin_unlock_irq(¤t->sighand->siglock); |
| 1182 | } | 1141 | } |
| 1183 | 1142 | ||
| 1184 | retval = wo->wo_rusage | 1143 | if (wo->wo_rusage) |
| 1185 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | 1144 | getrusage(p, RUSAGE_BOTH, wo->wo_rusage); |
| 1186 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) | 1145 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) |
| 1187 | ? p->signal->group_exit_code : p->exit_code; | 1146 | ? p->signal->group_exit_code : p->exit_code; |
| 1188 | if (!retval && wo->wo_stat) | 1147 | wo->wo_stat = status; |
| 1189 | retval = put_user(status, wo->wo_stat); | ||
| 1190 | |||
| 1191 | infop = wo->wo_info; | ||
| 1192 | if (!retval && infop) | ||
| 1193 | retval = put_user(SIGCHLD, &infop->si_signo); | ||
| 1194 | if (!retval && infop) | ||
| 1195 | retval = put_user(0, &infop->si_errno); | ||
| 1196 | if (!retval && infop) { | ||
| 1197 | int why; | ||
| 1198 | |||
| 1199 | if ((status & 0x7f) == 0) { | ||
| 1200 | why = CLD_EXITED; | ||
| 1201 | status >>= 8; | ||
| 1202 | } else { | ||
| 1203 | why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; | ||
| 1204 | status &= 0x7f; | ||
| 1205 | } | ||
| 1206 | retval = put_user((short)why, &infop->si_code); | ||
| 1207 | if (!retval) | ||
| 1208 | retval = put_user(status, &infop->si_status); | ||
| 1209 | } | ||
| 1210 | if (!retval && infop) | ||
| 1211 | retval = put_user(pid, &infop->si_pid); | ||
| 1212 | if (!retval && infop) | ||
| 1213 | retval = put_user(uid, &infop->si_uid); | ||
| 1214 | if (!retval) | ||
| 1215 | retval = pid; | ||
| 1216 | 1148 | ||
| 1217 | if (state == EXIT_TRACE) { | 1149 | if (state == EXIT_TRACE) { |
| 1218 | write_lock_irq(&tasklist_lock); | 1150 | write_lock_irq(&tasklist_lock); |
| @@ -1229,7 +1161,21 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1229 | if (state == EXIT_DEAD) | 1161 | if (state == EXIT_DEAD) |
| 1230 | release_task(p); | 1162 | release_task(p); |
| 1231 | 1163 | ||
| 1232 | return retval; | 1164 | out_info: |
| 1165 | infop = wo->wo_info; | ||
| 1166 | if (infop) { | ||
| 1167 | if ((status & 0x7f) == 0) { | ||
| 1168 | infop->cause = CLD_EXITED; | ||
| 1169 | infop->status = status >> 8; | ||
| 1170 | } else { | ||
| 1171 | infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; | ||
| 1172 | infop->status = status & 0x7f; | ||
| 1173 | } | ||
| 1174 | infop->pid = pid; | ||
| 1175 | infop->uid = uid; | ||
| 1176 | } | ||
| 1177 | |||
| 1178 | return pid; | ||
| 1233 | } | 1179 | } |
| 1234 | 1180 | ||
| 1235 | static int *task_stopped_code(struct task_struct *p, bool ptrace) | 1181 | static int *task_stopped_code(struct task_struct *p, bool ptrace) |
| @@ -1265,8 +1211,8 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace) | |||
| 1265 | static int wait_task_stopped(struct wait_opts *wo, | 1211 | static int wait_task_stopped(struct wait_opts *wo, |
| 1266 | int ptrace, struct task_struct *p) | 1212 | int ptrace, struct task_struct *p) |
| 1267 | { | 1213 | { |
| 1268 | struct siginfo __user *infop; | 1214 | struct waitid_info *infop; |
| 1269 | int retval, exit_code, *p_code, why; | 1215 | int exit_code, *p_code, why; |
| 1270 | uid_t uid = 0; /* unneeded, required by compiler */ | 1216 | uid_t uid = 0; /* unneeded, required by compiler */ |
| 1271 | pid_t pid; | 1217 | pid_t pid; |
| 1272 | 1218 | ||
| @@ -1311,34 +1257,21 @@ unlock_sig: | |||
| 1311 | why = ptrace ? CLD_TRAPPED : CLD_STOPPED; | 1257 | why = ptrace ? CLD_TRAPPED : CLD_STOPPED; |
| 1312 | read_unlock(&tasklist_lock); | 1258 | read_unlock(&tasklist_lock); |
| 1313 | sched_annotate_sleep(); | 1259 | sched_annotate_sleep(); |
| 1260 | if (wo->wo_rusage) | ||
| 1261 | getrusage(p, RUSAGE_BOTH, wo->wo_rusage); | ||
| 1262 | put_task_struct(p); | ||
| 1314 | 1263 | ||
| 1315 | if (unlikely(wo->wo_flags & WNOWAIT)) | 1264 | if (likely(!(wo->wo_flags & WNOWAIT))) |
| 1316 | return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); | 1265 | wo->wo_stat = (exit_code << 8) | 0x7f; |
| 1317 | |||
| 1318 | retval = wo->wo_rusage | ||
| 1319 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | ||
| 1320 | if (!retval && wo->wo_stat) | ||
| 1321 | retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat); | ||
| 1322 | 1266 | ||
| 1323 | infop = wo->wo_info; | 1267 | infop = wo->wo_info; |
| 1324 | if (!retval && infop) | 1268 | if (infop) { |
| 1325 | retval = put_user(SIGCHLD, &infop->si_signo); | 1269 | infop->cause = why; |
| 1326 | if (!retval && infop) | 1270 | infop->status = exit_code; |
| 1327 | retval = put_user(0, &infop->si_errno); | 1271 | infop->pid = pid; |
| 1328 | if (!retval && infop) | 1272 | infop->uid = uid; |
| 1329 | retval = put_user((short)why, &infop->si_code); | 1273 | } |
| 1330 | if (!retval && infop) | 1274 | return pid; |
| 1331 | retval = put_user(exit_code, &infop->si_status); | ||
| 1332 | if (!retval && infop) | ||
| 1333 | retval = put_user(pid, &infop->si_pid); | ||
| 1334 | if (!retval && infop) | ||
| 1335 | retval = put_user(uid, &infop->si_uid); | ||
| 1336 | if (!retval) | ||
| 1337 | retval = pid; | ||
| 1338 | put_task_struct(p); | ||
| 1339 | |||
| 1340 | BUG_ON(!retval); | ||
| 1341 | return retval; | ||
| 1342 | } | 1275 | } |
| 1343 | 1276 | ||
| 1344 | /* | 1277 | /* |
| @@ -1349,7 +1282,7 @@ unlock_sig: | |||
| 1349 | */ | 1282 | */ |
| 1350 | static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) | 1283 | static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) |
| 1351 | { | 1284 | { |
| 1352 | int retval; | 1285 | struct waitid_info *infop; |
| 1353 | pid_t pid; | 1286 | pid_t pid; |
| 1354 | uid_t uid; | 1287 | uid_t uid; |
| 1355 | 1288 | ||
| @@ -1374,22 +1307,20 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) | |||
| 1374 | get_task_struct(p); | 1307 | get_task_struct(p); |
| 1375 | read_unlock(&tasklist_lock); | 1308 | read_unlock(&tasklist_lock); |
| 1376 | sched_annotate_sleep(); | 1309 | sched_annotate_sleep(); |
| 1310 | if (wo->wo_rusage) | ||
| 1311 | getrusage(p, RUSAGE_BOTH, wo->wo_rusage); | ||
| 1312 | put_task_struct(p); | ||
| 1377 | 1313 | ||
| 1378 | if (!wo->wo_info) { | 1314 | infop = wo->wo_info; |
| 1379 | retval = wo->wo_rusage | 1315 | if (!infop) { |
| 1380 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | 1316 | wo->wo_stat = 0xffff; |
| 1381 | put_task_struct(p); | ||
| 1382 | if (!retval && wo->wo_stat) | ||
| 1383 | retval = put_user(0xffff, wo->wo_stat); | ||
| 1384 | if (!retval) | ||
| 1385 | retval = pid; | ||
| 1386 | } else { | 1317 | } else { |
| 1387 | retval = wait_noreap_copyout(wo, p, pid, uid, | 1318 | infop->cause = CLD_CONTINUED; |
| 1388 | CLD_CONTINUED, SIGCONT); | 1319 | infop->pid = pid; |
| 1389 | BUG_ON(retval == 0); | 1320 | infop->uid = uid; |
| 1321 | infop->status = SIGCONT; | ||
| 1390 | } | 1322 | } |
| 1391 | 1323 | return pid; | |
| 1392 | return retval; | ||
| 1393 | } | 1324 | } |
| 1394 | 1325 | ||
| 1395 | /* | 1326 | /* |
| @@ -1541,7 +1472,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) | |||
| 1541 | return 0; | 1472 | return 0; |
| 1542 | } | 1473 | } |
| 1543 | 1474 | ||
| 1544 | static int child_wait_callback(wait_queue_t *wait, unsigned mode, | 1475 | static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, |
| 1545 | int sync, void *key) | 1476 | int sync, void *key) |
| 1546 | { | 1477 | { |
| 1547 | struct wait_opts *wo = container_of(wait, struct wait_opts, | 1478 | struct wait_opts *wo = container_of(wait, struct wait_opts, |
| @@ -1617,8 +1548,8 @@ end: | |||
| 1617 | return retval; | 1548 | return retval; |
| 1618 | } | 1549 | } |
| 1619 | 1550 | ||
| 1620 | SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, | 1551 | static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, |
| 1621 | infop, int, options, struct rusage __user *, ru) | 1552 | int options, struct rusage *ru) |
| 1622 | { | 1553 | { |
| 1623 | struct wait_opts wo; | 1554 | struct wait_opts wo; |
| 1624 | struct pid *pid = NULL; | 1555 | struct pid *pid = NULL; |
| @@ -1656,38 +1587,48 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, | |||
| 1656 | wo.wo_pid = pid; | 1587 | wo.wo_pid = pid; |
| 1657 | wo.wo_flags = options; | 1588 | wo.wo_flags = options; |
| 1658 | wo.wo_info = infop; | 1589 | wo.wo_info = infop; |
| 1659 | wo.wo_stat = NULL; | ||
| 1660 | wo.wo_rusage = ru; | 1590 | wo.wo_rusage = ru; |
| 1661 | ret = do_wait(&wo); | 1591 | ret = do_wait(&wo); |
| 1662 | 1592 | ||
| 1663 | if (ret > 0) { | ||
| 1664 | ret = 0; | ||
| 1665 | } else if (infop) { | ||
| 1666 | /* | ||
| 1667 | * For a WNOHANG return, clear out all the fields | ||
| 1668 | * we would set so the user can easily tell the | ||
| 1669 | * difference. | ||
| 1670 | */ | ||
| 1671 | if (!ret) | ||
| 1672 | ret = put_user(0, &infop->si_signo); | ||
| 1673 | if (!ret) | ||
| 1674 | ret = put_user(0, &infop->si_errno); | ||
| 1675 | if (!ret) | ||
| 1676 | ret = put_user(0, &infop->si_code); | ||
| 1677 | if (!ret) | ||
| 1678 | ret = put_user(0, &infop->si_pid); | ||
| 1679 | if (!ret) | ||
| 1680 | ret = put_user(0, &infop->si_uid); | ||
| 1681 | if (!ret) | ||
| 1682 | ret = put_user(0, &infop->si_status); | ||
| 1683 | } | ||
| 1684 | |||
| 1685 | put_pid(pid); | 1593 | put_pid(pid); |
| 1686 | return ret; | 1594 | return ret; |
| 1687 | } | 1595 | } |
| 1688 | 1596 | ||
| 1689 | SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, | 1597 | SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, |
| 1690 | int, options, struct rusage __user *, ru) | 1598 | infop, int, options, struct rusage __user *, ru) |
| 1599 | { | ||
| 1600 | struct rusage r; | ||
| 1601 | struct waitid_info info = {.status = 0}; | ||
| 1602 | long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); | ||
| 1603 | int signo = 0; | ||
| 1604 | if (err > 0) { | ||
| 1605 | signo = SIGCHLD; | ||
| 1606 | err = 0; | ||
| 1607 | } | ||
| 1608 | |||
| 1609 | if (!err) { | ||
| 1610 | if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) | ||
| 1611 | return -EFAULT; | ||
| 1612 | } | ||
| 1613 | if (!infop) | ||
| 1614 | return err; | ||
| 1615 | |||
| 1616 | user_access_begin(); | ||
| 1617 | unsafe_put_user(signo, &infop->si_signo, Efault); | ||
| 1618 | unsafe_put_user(0, &infop->si_errno, Efault); | ||
| 1619 | unsafe_put_user((short)info.cause, &infop->si_code, Efault); | ||
| 1620 | unsafe_put_user(info.pid, &infop->si_pid, Efault); | ||
| 1621 | unsafe_put_user(info.uid, &infop->si_uid, Efault); | ||
| 1622 | unsafe_put_user(info.status, &infop->si_status, Efault); | ||
| 1623 | user_access_end(); | ||
| 1624 | return err; | ||
| 1625 | Efault: | ||
| 1626 | user_access_end(); | ||
| 1627 | return -EFAULT; | ||
| 1628 | } | ||
| 1629 | |||
| 1630 | long kernel_wait4(pid_t upid, int __user *stat_addr, int options, | ||
| 1631 | struct rusage *ru) | ||
| 1691 | { | 1632 | { |
| 1692 | struct wait_opts wo; | 1633 | struct wait_opts wo; |
| 1693 | struct pid *pid = NULL; | 1634 | struct pid *pid = NULL; |
| @@ -1698,6 +1639,10 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, | |||
| 1698 | __WNOTHREAD|__WCLONE|__WALL)) | 1639 | __WNOTHREAD|__WCLONE|__WALL)) |
| 1699 | return -EINVAL; | 1640 | return -EINVAL; |
| 1700 | 1641 | ||
| 1642 | /* -INT_MIN is not defined */ | ||
| 1643 | if (upid == INT_MIN) | ||
| 1644 | return -ESRCH; | ||
| 1645 | |||
| 1701 | if (upid == -1) | 1646 | if (upid == -1) |
| 1702 | type = PIDTYPE_MAX; | 1647 | type = PIDTYPE_MAX; |
| 1703 | else if (upid < 0) { | 1648 | else if (upid < 0) { |
| @@ -1715,14 +1660,29 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, | |||
| 1715 | wo.wo_pid = pid; | 1660 | wo.wo_pid = pid; |
| 1716 | wo.wo_flags = options | WEXITED; | 1661 | wo.wo_flags = options | WEXITED; |
| 1717 | wo.wo_info = NULL; | 1662 | wo.wo_info = NULL; |
| 1718 | wo.wo_stat = stat_addr; | 1663 | wo.wo_stat = 0; |
| 1719 | wo.wo_rusage = ru; | 1664 | wo.wo_rusage = ru; |
| 1720 | ret = do_wait(&wo); | 1665 | ret = do_wait(&wo); |
| 1721 | put_pid(pid); | 1666 | put_pid(pid); |
| 1667 | if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr)) | ||
| 1668 | ret = -EFAULT; | ||
| 1722 | 1669 | ||
| 1723 | return ret; | 1670 | return ret; |
| 1724 | } | 1671 | } |
| 1725 | 1672 | ||
| 1673 | SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, | ||
| 1674 | int, options, struct rusage __user *, ru) | ||
| 1675 | { | ||
| 1676 | struct rusage r; | ||
| 1677 | long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL); | ||
| 1678 | |||
| 1679 | if (err > 0) { | ||
| 1680 | if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) | ||
| 1681 | return -EFAULT; | ||
| 1682 | } | ||
| 1683 | return err; | ||
| 1684 | } | ||
| 1685 | |||
| 1726 | #ifdef __ARCH_WANT_SYS_WAITPID | 1686 | #ifdef __ARCH_WANT_SYS_WAITPID |
| 1727 | 1687 | ||
| 1728 | /* | 1688 | /* |
| @@ -1735,3 +1695,61 @@ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) | |||
| 1735 | } | 1695 | } |
| 1736 | 1696 | ||
| 1737 | #endif | 1697 | #endif |
| 1698 | |||
| 1699 | #ifdef CONFIG_COMPAT | ||
| 1700 | COMPAT_SYSCALL_DEFINE4(wait4, | ||
| 1701 | compat_pid_t, pid, | ||
| 1702 | compat_uint_t __user *, stat_addr, | ||
| 1703 | int, options, | ||
| 1704 | struct compat_rusage __user *, ru) | ||
| 1705 | { | ||
| 1706 | struct rusage r; | ||
| 1707 | long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL); | ||
| 1708 | if (err > 0) { | ||
| 1709 | if (ru && put_compat_rusage(&r, ru)) | ||
| 1710 | return -EFAULT; | ||
| 1711 | } | ||
| 1712 | return err; | ||
| 1713 | } | ||
| 1714 | |||
| 1715 | COMPAT_SYSCALL_DEFINE5(waitid, | ||
| 1716 | int, which, compat_pid_t, pid, | ||
| 1717 | struct compat_siginfo __user *, infop, int, options, | ||
| 1718 | struct compat_rusage __user *, uru) | ||
| 1719 | { | ||
| 1720 | struct rusage ru; | ||
| 1721 | struct waitid_info info = {.status = 0}; | ||
| 1722 | long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); | ||
| 1723 | int signo = 0; | ||
| 1724 | if (err > 0) { | ||
| 1725 | signo = SIGCHLD; | ||
| 1726 | err = 0; | ||
| 1727 | } | ||
| 1728 | |||
| 1729 | if (!err && uru) { | ||
| 1730 | /* kernel_waitid() overwrites everything in ru */ | ||
| 1731 | if (COMPAT_USE_64BIT_TIME) | ||
| 1732 | err = copy_to_user(uru, &ru, sizeof(ru)); | ||
| 1733 | else | ||
| 1734 | err = put_compat_rusage(&ru, uru); | ||
| 1735 | if (err) | ||
| 1736 | return -EFAULT; | ||
| 1737 | } | ||
| 1738 | |||
| 1739 | if (!infop) | ||
| 1740 | return err; | ||
| 1741 | |||
| 1742 | user_access_begin(); | ||
| 1743 | unsafe_put_user(signo, &infop->si_signo, Efault); | ||
| 1744 | unsafe_put_user(0, &infop->si_errno, Efault); | ||
| 1745 | unsafe_put_user((short)info.cause, &infop->si_code, Efault); | ||
| 1746 | unsafe_put_user(info.pid, &infop->si_pid, Efault); | ||
| 1747 | unsafe_put_user(info.uid, &infop->si_uid, Efault); | ||
| 1748 | unsafe_put_user(info.status, &infop->si_status, Efault); | ||
| 1749 | user_access_end(); | ||
| 1750 | return err; | ||
| 1751 | Efault: | ||
| 1752 | user_access_end(); | ||
| 1753 | return -EFAULT; | ||
| 1754 | } | ||
| 1755 | #endif | ||
diff --git a/kernel/extable.c b/kernel/extable.c index 2676d7f8baf6..38c2412401a1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
| @@ -55,7 +55,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) | |||
| 55 | { | 55 | { |
| 56 | const struct exception_table_entry *e; | 56 | const struct exception_table_entry *e; |
| 57 | 57 | ||
| 58 | e = search_extable(__start___ex_table, __stop___ex_table-1, addr); | 58 | e = search_extable(__start___ex_table, |
| 59 | __stop___ex_table - __start___ex_table, addr); | ||
| 59 | if (!e) | 60 | if (!e) |
| 60 | e = search_module_extables(addr); | 61 | e = search_module_extables(addr); |
| 61 | return e; | 62 | return e; |
| @@ -69,13 +70,13 @@ static inline int init_kernel_text(unsigned long addr) | |||
| 69 | return 0; | 70 | return 0; |
| 70 | } | 71 | } |
| 71 | 72 | ||
| 72 | int core_kernel_text(unsigned long addr) | 73 | int notrace core_kernel_text(unsigned long addr) |
| 73 | { | 74 | { |
| 74 | if (addr >= (unsigned long)_stext && | 75 | if (addr >= (unsigned long)_stext && |
| 75 | addr < (unsigned long)_etext) | 76 | addr < (unsigned long)_etext) |
| 76 | return 1; | 77 | return 1; |
| 77 | 78 | ||
| 78 | if (system_state == SYSTEM_BOOTING && | 79 | if (system_state < SYSTEM_RUNNING && |
| 79 | init_kernel_text(addr)) | 80 | init_kernel_text(addr)) |
| 80 | return 1; | 81 | return 1; |
| 81 | return 0; | 82 | return 0; |
diff --git a/kernel/fork.c b/kernel/fork.c index 3a13a940a6ea..5ff0ebcaafc3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -205,19 +205,17 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) | |||
| 205 | void *stack; | 205 | void *stack; |
| 206 | int i; | 206 | int i; |
| 207 | 207 | ||
| 208 | local_irq_disable(); | ||
| 209 | for (i = 0; i < NR_CACHED_STACKS; i++) { | 208 | for (i = 0; i < NR_CACHED_STACKS; i++) { |
| 210 | struct vm_struct *s = this_cpu_read(cached_stacks[i]); | 209 | struct vm_struct *s; |
| 210 | |||
| 211 | s = this_cpu_xchg(cached_stacks[i], NULL); | ||
| 211 | 212 | ||
| 212 | if (!s) | 213 | if (!s) |
| 213 | continue; | 214 | continue; |
| 214 | this_cpu_write(cached_stacks[i], NULL); | ||
| 215 | 215 | ||
| 216 | tsk->stack_vm_area = s; | 216 | tsk->stack_vm_area = s; |
| 217 | local_irq_enable(); | ||
| 218 | return s->addr; | 217 | return s->addr; |
| 219 | } | 218 | } |
| 220 | local_irq_enable(); | ||
| 221 | 219 | ||
| 222 | stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, | 220 | stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, |
| 223 | VMALLOC_START, VMALLOC_END, | 221 | VMALLOC_START, VMALLOC_END, |
| @@ -245,19 +243,15 @@ static inline void free_thread_stack(struct task_struct *tsk) | |||
| 245 | { | 243 | { |
| 246 | #ifdef CONFIG_VMAP_STACK | 244 | #ifdef CONFIG_VMAP_STACK |
| 247 | if (task_stack_vm_area(tsk)) { | 245 | if (task_stack_vm_area(tsk)) { |
| 248 | unsigned long flags; | ||
| 249 | int i; | 246 | int i; |
| 250 | 247 | ||
| 251 | local_irq_save(flags); | ||
| 252 | for (i = 0; i < NR_CACHED_STACKS; i++) { | 248 | for (i = 0; i < NR_CACHED_STACKS; i++) { |
| 253 | if (this_cpu_read(cached_stacks[i])) | 249 | if (this_cpu_cmpxchg(cached_stacks[i], |
| 250 | NULL, tsk->stack_vm_area) != NULL) | ||
| 254 | continue; | 251 | continue; |
| 255 | 252 | ||
| 256 | this_cpu_write(cached_stacks[i], tsk->stack_vm_area); | ||
| 257 | local_irq_restore(flags); | ||
| 258 | return; | 253 | return; |
| 259 | } | 254 | } |
| 260 | local_irq_restore(flags); | ||
| 261 | 255 | ||
| 262 | vfree_atomic(tsk->stack); | 256 | vfree_atomic(tsk->stack); |
| 263 | return; | 257 | return; |
| @@ -326,8 +320,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) | |||
| 326 | } | 320 | } |
| 327 | 321 | ||
| 328 | /* All stack pages belong to the same memcg. */ | 322 | /* All stack pages belong to the same memcg. */ |
| 329 | memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, | 323 | mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, |
| 330 | account * (THREAD_SIZE / 1024)); | 324 | account * (THREAD_SIZE / 1024)); |
| 331 | } else { | 325 | } else { |
| 332 | /* | 326 | /* |
| 333 | * All stack pages are in the same zone and belong to the | 327 | * All stack pages are in the same zone and belong to the |
| @@ -338,8 +332,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) | |||
| 338 | mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, | 332 | mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, |
| 339 | THREAD_SIZE / 1024 * account); | 333 | THREAD_SIZE / 1024 * account); |
| 340 | 334 | ||
| 341 | memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, | 335 | mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, |
| 342 | account * (THREAD_SIZE / 1024)); | 336 | account * (THREAD_SIZE / 1024)); |
| 343 | } | 337 | } |
| 344 | } | 338 | } |
| 345 | 339 | ||
| @@ -560,7 +554,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
| 560 | set_task_stack_end_magic(tsk); | 554 | set_task_stack_end_magic(tsk); |
| 561 | 555 | ||
| 562 | #ifdef CONFIG_CC_STACKPROTECTOR | 556 | #ifdef CONFIG_CC_STACKPROTECTOR |
| 563 | tsk->stack_canary = get_random_long(); | 557 | tsk->stack_canary = get_random_canary(); |
| 564 | #endif | 558 | #endif |
| 565 | 559 | ||
| 566 | /* | 560 | /* |
| @@ -579,6 +573,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
| 579 | 573 | ||
| 580 | kcov_task_init(tsk); | 574 | kcov_task_init(tsk); |
| 581 | 575 | ||
| 576 | #ifdef CONFIG_FAULT_INJECTION | ||
| 577 | tsk->fail_nth = 0; | ||
| 578 | #endif | ||
| 579 | |||
| 582 | return tsk; | 580 | return tsk; |
| 583 | 581 | ||
| 584 | free_stack: | 582 | free_stack: |
| @@ -1573,6 +1571,18 @@ static __latent_entropy struct task_struct *copy_process( | |||
| 1573 | if (!p) | 1571 | if (!p) |
| 1574 | goto fork_out; | 1572 | goto fork_out; |
| 1575 | 1573 | ||
| 1574 | /* | ||
| 1575 | * This _must_ happen before we call free_task(), i.e. before we jump | ||
| 1576 | * to any of the bad_fork_* labels. This is to avoid freeing | ||
| 1577 | * p->set_child_tid which is (ab)used as a kthread's data pointer for | ||
| 1578 | * kernel threads (PF_KTHREAD). | ||
| 1579 | */ | ||
| 1580 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; | ||
| 1581 | /* | ||
| 1582 | * Clear TID on mm_release()? | ||
| 1583 | */ | ||
| 1584 | p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; | ||
| 1585 | |||
| 1576 | ftrace_graph_init_task(p); | 1586 | ftrace_graph_init_task(p); |
| 1577 | 1587 | ||
| 1578 | rt_mutex_init_task(p); | 1588 | rt_mutex_init_task(p); |
| @@ -1621,9 +1631,9 @@ static __latent_entropy struct task_struct *copy_process( | |||
| 1621 | prev_cputime_init(&p->prev_cputime); | 1631 | prev_cputime_init(&p->prev_cputime); |
| 1622 | 1632 | ||
| 1623 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 1633 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
| 1624 | seqcount_init(&p->vtime_seqcount); | 1634 | seqcount_init(&p->vtime.seqcount); |
| 1625 | p->vtime_snap = 0; | 1635 | p->vtime.starttime = 0; |
| 1626 | p->vtime_snap_whence = VTIME_INACTIVE; | 1636 | p->vtime.state = VTIME_INACTIVE; |
| 1627 | #endif | 1637 | #endif |
| 1628 | 1638 | ||
| 1629 | #if defined(SPLIT_RSS_COUNTING) | 1639 | #if defined(SPLIT_RSS_COUNTING) |
| @@ -1739,11 +1749,6 @@ static __latent_entropy struct task_struct *copy_process( | |||
| 1739 | } | 1749 | } |
| 1740 | } | 1750 | } |
| 1741 | 1751 | ||
| 1742 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; | ||
| 1743 | /* | ||
| 1744 | * Clear TID on mm_release()? | ||
| 1745 | */ | ||
| 1746 | p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; | ||
| 1747 | #ifdef CONFIG_BLOCK | 1752 | #ifdef CONFIG_BLOCK |
| 1748 | p->plug = NULL; | 1753 | p->plug = NULL; |
| 1749 | #endif | 1754 | #endif |
diff --git a/kernel/futex.c b/kernel/futex.c index 357348a6cf6b..16dbe4c93895 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -212,7 +212,7 @@ struct futex_pi_state { | |||
| 212 | atomic_t refcount; | 212 | atomic_t refcount; |
| 213 | 213 | ||
| 214 | union futex_key key; | 214 | union futex_key key; |
| 215 | }; | 215 | } __randomize_layout; |
| 216 | 216 | ||
| 217 | /** | 217 | /** |
| 218 | * struct futex_q - The hashed futex queue entry, one per waiting task | 218 | * struct futex_q - The hashed futex queue entry, one per waiting task |
| @@ -225,7 +225,7 @@ struct futex_pi_state { | |||
| 225 | * @requeue_pi_key: the requeue_pi target futex key | 225 | * @requeue_pi_key: the requeue_pi target futex key |
| 226 | * @bitset: bitset for the optional bitmasked wakeup | 226 | * @bitset: bitset for the optional bitmasked wakeup |
| 227 | * | 227 | * |
| 228 | * We use this hashed waitqueue, instead of a normal wait_queue_t, so | 228 | * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so |
| 229 | * we can wake only the relevant ones (hashed queues may be shared). | 229 | * we can wake only the relevant ones (hashed queues may be shared). |
| 230 | * | 230 | * |
| 231 | * A futex_q has a woken state, just like tasks have TASK_RUNNING. | 231 | * A futex_q has a woken state, just like tasks have TASK_RUNNING. |
| @@ -246,7 +246,7 @@ struct futex_q { | |||
| 246 | struct rt_mutex_waiter *rt_waiter; | 246 | struct rt_mutex_waiter *rt_waiter; |
| 247 | union futex_key *requeue_pi_key; | 247 | union futex_key *requeue_pi_key; |
| 248 | u32 bitset; | 248 | u32 bitset; |
| 249 | }; | 249 | } __randomize_layout; |
| 250 | 250 | ||
| 251 | static const struct futex_q futex_q_init = { | 251 | static const struct futex_q futex_q_init = { |
| 252 | /* list gets initialized in queue_me()*/ | 252 | /* list gets initialized in queue_me()*/ |
| @@ -488,7 +488,7 @@ static void drop_futex_key_refs(union futex_key *key) | |||
| 488 | * | 488 | * |
| 489 | * Return: a negative error code or 0 | 489 | * Return: a negative error code or 0 |
| 490 | * | 490 | * |
| 491 | * The key words are stored in *key on success. | 491 | * The key words are stored in @key on success. |
| 492 | * | 492 | * |
| 493 | * For shared mappings, it's (page->index, file_inode(vma->vm_file), | 493 | * For shared mappings, it's (page->index, file_inode(vma->vm_file), |
| 494 | * offset_within_page). For private mappings, it's (uaddr, current->mm). | 494 | * offset_within_page). For private mappings, it's (uaddr, current->mm). |
| @@ -1259,9 +1259,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) | |||
| 1259 | * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) | 1259 | * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) |
| 1260 | * | 1260 | * |
| 1261 | * Return: | 1261 | * Return: |
| 1262 | * 0 - ready to wait; | 1262 | * - 0 - ready to wait; |
| 1263 | * 1 - acquired the lock; | 1263 | * - 1 - acquired the lock; |
| 1264 | * <0 - error | 1264 | * - <0 - error |
| 1265 | * | 1265 | * |
| 1266 | * The hb->lock and futex_key refs shall be held by the caller. | 1266 | * The hb->lock and futex_key refs shall be held by the caller. |
| 1267 | */ | 1267 | */ |
| @@ -1717,9 +1717,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, | |||
| 1717 | * hb1 and hb2 must be held by the caller. | 1717 | * hb1 and hb2 must be held by the caller. |
| 1718 | * | 1718 | * |
| 1719 | * Return: | 1719 | * Return: |
| 1720 | * 0 - failed to acquire the lock atomically; | 1720 | * - 0 - failed to acquire the lock atomically; |
| 1721 | * >0 - acquired the lock, return value is vpid of the top_waiter | 1721 | * - >0 - acquired the lock, return value is vpid of the top_waiter |
| 1722 | * <0 - error | 1722 | * - <0 - error |
| 1723 | */ | 1723 | */ |
| 1724 | static int futex_proxy_trylock_atomic(u32 __user *pifutex, | 1724 | static int futex_proxy_trylock_atomic(u32 __user *pifutex, |
| 1725 | struct futex_hash_bucket *hb1, | 1725 | struct futex_hash_bucket *hb1, |
| @@ -1785,8 +1785,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
| 1785 | * uaddr2 atomically on behalf of the top waiter. | 1785 | * uaddr2 atomically on behalf of the top waiter. |
| 1786 | * | 1786 | * |
| 1787 | * Return: | 1787 | * Return: |
| 1788 | * >=0 - on success, the number of tasks requeued or woken; | 1788 | * - >=0 - on success, the number of tasks requeued or woken; |
| 1789 | * <0 - on error | 1789 | * - <0 - on error |
| 1790 | */ | 1790 | */ |
| 1791 | static int futex_requeue(u32 __user *uaddr1, unsigned int flags, | 1791 | static int futex_requeue(u32 __user *uaddr1, unsigned int flags, |
| 1792 | u32 __user *uaddr2, int nr_wake, int nr_requeue, | 1792 | u32 __user *uaddr2, int nr_wake, int nr_requeue, |
| @@ -2142,8 +2142,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) | |||
| 2142 | * be paired with exactly one earlier call to queue_me(). | 2142 | * be paired with exactly one earlier call to queue_me(). |
| 2143 | * | 2143 | * |
| 2144 | * Return: | 2144 | * Return: |
| 2145 | * 1 - if the futex_q was still queued (and we removed unqueued it); | 2145 | * - 1 - if the futex_q was still queued (and we removed unqueued it); |
| 2146 | * 0 - if the futex_q was already removed by the waking thread | 2146 | * - 0 - if the futex_q was already removed by the waking thread |
| 2147 | */ | 2147 | */ |
| 2148 | static int unqueue_me(struct futex_q *q) | 2148 | static int unqueue_me(struct futex_q *q) |
| 2149 | { | 2149 | { |
| @@ -2333,9 +2333,9 @@ static long futex_wait_restart(struct restart_block *restart); | |||
| 2333 | * acquire the lock. Must be called with the hb lock held. | 2333 | * acquire the lock. Must be called with the hb lock held. |
| 2334 | * | 2334 | * |
| 2335 | * Return: | 2335 | * Return: |
| 2336 | * 1 - success, lock taken; | 2336 | * - 1 - success, lock taken; |
| 2337 | * 0 - success, lock not taken; | 2337 | * - 0 - success, lock not taken; |
| 2338 | * <0 - on error (-EFAULT) | 2338 | * - <0 - on error (-EFAULT) |
| 2339 | */ | 2339 | */ |
| 2340 | static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) | 2340 | static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) |
| 2341 | { | 2341 | { |
| @@ -2422,8 +2422,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, | |||
| 2422 | * with no q.key reference on failure. | 2422 | * with no q.key reference on failure. |
| 2423 | * | 2423 | * |
| 2424 | * Return: | 2424 | * Return: |
| 2425 | * 0 - uaddr contains val and hb has been locked; | 2425 | * - 0 - uaddr contains val and hb has been locked; |
| 2426 | * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked | 2426 | * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked |
| 2427 | */ | 2427 | */ |
| 2428 | static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, | 2428 | static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, |
| 2429 | struct futex_q *q, struct futex_hash_bucket **hb) | 2429 | struct futex_q *q, struct futex_hash_bucket **hb) |
| @@ -2895,8 +2895,8 @@ pi_faulted: | |||
| 2895 | * called with the hb lock held. | 2895 | * called with the hb lock held. |
| 2896 | * | 2896 | * |
| 2897 | * Return: | 2897 | * Return: |
| 2898 | * 0 = no early wakeup detected; | 2898 | * - 0 = no early wakeup detected; |
| 2899 | * <0 = -ETIMEDOUT or -ERESTARTNOINTR | 2899 | * - <0 = -ETIMEDOUT or -ERESTARTNOINTR |
| 2900 | */ | 2900 | */ |
| 2901 | static inline | 2901 | static inline |
| 2902 | int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | 2902 | int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, |
| @@ -2968,8 +2968,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
| 2968 | * If 4 or 7, we cleanup and return with -ETIMEDOUT. | 2968 | * If 4 or 7, we cleanup and return with -ETIMEDOUT. |
| 2969 | * | 2969 | * |
| 2970 | * Return: | 2970 | * Return: |
| 2971 | * 0 - On success; | 2971 | * - 0 - On success; |
| 2972 | * <0 - On error | 2972 | * - <0 - On error |
| 2973 | */ | 2973 | */ |
| 2974 | static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | 2974 | static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, |
| 2975 | u32 val, ktime_t *abs_time, u32 bitset, | 2975 | u32 val, ktime_t *abs_time, u32 bitset, |
diff --git a/kernel/groups.c b/kernel/groups.c index d09727692a2a..434f6665f187 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
| @@ -5,6 +5,7 @@ | |||
| 5 | #include <linux/export.h> | 5 | #include <linux/export.h> |
| 6 | #include <linux/slab.h> | 6 | #include <linux/slab.h> |
| 7 | #include <linux/security.h> | 7 | #include <linux/security.h> |
| 8 | #include <linux/sort.h> | ||
| 8 | #include <linux/syscalls.h> | 9 | #include <linux/syscalls.h> |
| 9 | #include <linux/user_namespace.h> | 10 | #include <linux/user_namespace.h> |
| 10 | #include <linux/vmalloc.h> | 11 | #include <linux/vmalloc.h> |
| @@ -76,32 +77,18 @@ static int groups_from_user(struct group_info *group_info, | |||
| 76 | return 0; | 77 | return 0; |
| 77 | } | 78 | } |
| 78 | 79 | ||
| 79 | /* a simple Shell sort */ | 80 | static int gid_cmp(const void *_a, const void *_b) |
| 81 | { | ||
| 82 | kgid_t a = *(kgid_t *)_a; | ||
| 83 | kgid_t b = *(kgid_t *)_b; | ||
| 84 | |||
| 85 | return gid_gt(a, b) - gid_lt(a, b); | ||
| 86 | } | ||
| 87 | |||
| 80 | static void groups_sort(struct group_info *group_info) | 88 | static void groups_sort(struct group_info *group_info) |
| 81 | { | 89 | { |
| 82 | int base, max, stride; | 90 | sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), |
| 83 | int gidsetsize = group_info->ngroups; | 91 | gid_cmp, NULL); |
| 84 | |||
| 85 | for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) | ||
| 86 | ; /* nothing */ | ||
| 87 | stride /= 3; | ||
| 88 | |||
| 89 | while (stride) { | ||
| 90 | max = gidsetsize - stride; | ||
| 91 | for (base = 0; base < max; base++) { | ||
| 92 | int left = base; | ||
| 93 | int right = left + stride; | ||
| 94 | kgid_t tmp = group_info->gid[right]; | ||
| 95 | |||
| 96 | while (left >= 0 && gid_gt(group_info->gid[left], tmp)) { | ||
| 97 | group_info->gid[right] = group_info->gid[left]; | ||
| 98 | right = left; | ||
| 99 | left -= stride; | ||
| 100 | } | ||
| 101 | group_info->gid[right] = tmp; | ||
| 102 | } | ||
| 103 | stride /= 3; | ||
| 104 | } | ||
| 105 | } | 92 | } |
| 106 | 93 | ||
| 107 | /* a simple bsearch */ | 94 | /* a simple bsearch */ |
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 3bbfd6a9c475..27c4e774071c 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
| @@ -21,6 +21,10 @@ config GENERIC_IRQ_SHOW | |||
| 21 | config GENERIC_IRQ_SHOW_LEVEL | 21 | config GENERIC_IRQ_SHOW_LEVEL |
| 22 | bool | 22 | bool |
| 23 | 23 | ||
| 24 | # Supports effective affinity mask | ||
| 25 | config GENERIC_IRQ_EFFECTIVE_AFF_MASK | ||
| 26 | bool | ||
| 27 | |||
| 24 | # Facility to allocate a hardware interrupt. This is legacy support | 28 | # Facility to allocate a hardware interrupt. This is legacy support |
| 25 | # and should not be used in new code. Use irq domains instead. | 29 | # and should not be used in new code. Use irq domains instead. |
| 26 | config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ | 30 | config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ |
| @@ -81,6 +85,9 @@ config GENERIC_MSI_IRQ_DOMAIN | |||
| 81 | config HANDLE_DOMAIN_IRQ | 85 | config HANDLE_DOMAIN_IRQ |
| 82 | bool | 86 | bool |
| 83 | 87 | ||
| 88 | config IRQ_TIMINGS | ||
| 89 | bool | ||
| 90 | |||
| 84 | config IRQ_DOMAIN_DEBUG | 91 | config IRQ_DOMAIN_DEBUG |
| 85 | bool "Expose hardware/virtual IRQ mapping via debugfs" | 92 | bool "Expose hardware/virtual IRQ mapping via debugfs" |
| 86 | depends on IRQ_DOMAIN && DEBUG_FS | 93 | depends on IRQ_DOMAIN && DEBUG_FS |
| @@ -108,4 +115,15 @@ config SPARSE_IRQ | |||
| 108 | 115 | ||
| 109 | If you don't know what to do here, say N. | 116 | If you don't know what to do here, say N. |
| 110 | 117 | ||
| 118 | config GENERIC_IRQ_DEBUGFS | ||
| 119 | bool "Expose irq internals in debugfs" | ||
| 120 | depends on DEBUG_FS | ||
| 121 | default n | ||
| 122 | ---help--- | ||
| 123 | |||
| 124 | Exposes internal state information through debugfs. Mostly for | ||
| 125 | developers and debugging of hard to diagnose interrupt problems. | ||
| 126 | |||
| 127 | If you don't know what to do here, say N. | ||
| 128 | |||
| 111 | endmenu | 129 | endmenu |
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 1d3ee3169202..e4aef7351f2b 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
| @@ -1,5 +1,6 @@ | |||
| 1 | 1 | ||
| 2 | obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o | 2 | obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o |
| 3 | obj-$(CONFIG_IRQ_TIMINGS) += timings.o | ||
| 3 | obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o | 4 | obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o |
| 4 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o | 5 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o |
| 5 | obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o | 6 | obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o |
| @@ -10,3 +11,4 @@ obj-$(CONFIG_PM_SLEEP) += pm.o | |||
| 10 | obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o | 11 | obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o |
| 11 | obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o | 12 | obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o |
| 12 | obj-$(CONFIG_SMP) += affinity.o | 13 | obj-$(CONFIG_SMP) += affinity.o |
| 14 | obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o | ||
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index e2d356dd7581..d69bd77252a7 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c | |||
| @@ -1,4 +1,7 @@ | |||
| 1 | 1 | /* | |
| 2 | * Copyright (C) 2016 Thomas Gleixner. | ||
| 3 | * Copyright (C) 2016-2017 Christoph Hellwig. | ||
| 4 | */ | ||
| 2 | #include <linux/interrupt.h> | 5 | #include <linux/interrupt.h> |
| 3 | #include <linux/kernel.h> | 6 | #include <linux/kernel.h> |
| 4 | #include <linux/slab.h> | 7 | #include <linux/slab.h> |
| @@ -35,13 +38,54 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, | |||
| 35 | } | 38 | } |
| 36 | } | 39 | } |
| 37 | 40 | ||
| 38 | static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) | 41 | static cpumask_var_t *alloc_node_to_present_cpumask(void) |
| 42 | { | ||
| 43 | cpumask_var_t *masks; | ||
| 44 | int node; | ||
| 45 | |||
| 46 | masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL); | ||
| 47 | if (!masks) | ||
| 48 | return NULL; | ||
| 49 | |||
| 50 | for (node = 0; node < nr_node_ids; node++) { | ||
| 51 | if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL)) | ||
| 52 | goto out_unwind; | ||
| 53 | } | ||
| 54 | |||
| 55 | return masks; | ||
| 56 | |||
| 57 | out_unwind: | ||
| 58 | while (--node >= 0) | ||
| 59 | free_cpumask_var(masks[node]); | ||
| 60 | kfree(masks); | ||
| 61 | return NULL; | ||
| 62 | } | ||
| 63 | |||
| 64 | static void free_node_to_present_cpumask(cpumask_var_t *masks) | ||
| 65 | { | ||
| 66 | int node; | ||
| 67 | |||
| 68 | for (node = 0; node < nr_node_ids; node++) | ||
| 69 | free_cpumask_var(masks[node]); | ||
| 70 | kfree(masks); | ||
| 71 | } | ||
| 72 | |||
| 73 | static void build_node_to_present_cpumask(cpumask_var_t *masks) | ||
| 74 | { | ||
| 75 | int cpu; | ||
| 76 | |||
| 77 | for_each_present_cpu(cpu) | ||
| 78 | cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); | ||
| 79 | } | ||
| 80 | |||
| 81 | static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask, | ||
| 82 | const struct cpumask *mask, nodemask_t *nodemsk) | ||
| 39 | { | 83 | { |
| 40 | int n, nodes = 0; | 84 | int n, nodes = 0; |
| 41 | 85 | ||
| 42 | /* Calculate the number of nodes in the supplied affinity mask */ | 86 | /* Calculate the number of nodes in the supplied affinity mask */ |
| 43 | for_each_online_node(n) { | 87 | for_each_node(n) { |
| 44 | if (cpumask_intersects(mask, cpumask_of_node(n))) { | 88 | if (cpumask_intersects(mask, node_to_present_cpumask[n])) { |
| 45 | node_set(n, *nodemsk); | 89 | node_set(n, *nodemsk); |
| 46 | nodes++; | 90 | nodes++; |
| 47 | } | 91 | } |
| @@ -64,7 +108,14 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) | |||
| 64 | int last_affv = affv + affd->pre_vectors; | 108 | int last_affv = affv + affd->pre_vectors; |
| 65 | nodemask_t nodemsk = NODE_MASK_NONE; | 109 | nodemask_t nodemsk = NODE_MASK_NONE; |
| 66 | struct cpumask *masks; | 110 | struct cpumask *masks; |
| 67 | cpumask_var_t nmsk; | 111 | cpumask_var_t nmsk, *node_to_present_cpumask; |
| 112 | |||
| 113 | /* | ||
| 114 | * If there aren't any vectors left after applying the pre/post | ||
| 115 | * vectors don't bother with assigning affinity. | ||
| 116 | */ | ||
| 117 | if (!affv) | ||
| 118 | return NULL; | ||
| 68 | 119 | ||
| 69 | if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) | 120 | if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) |
| 70 | return NULL; | 121 | return NULL; |
| @@ -73,13 +124,19 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) | |||
| 73 | if (!masks) | 124 | if (!masks) |
| 74 | goto out; | 125 | goto out; |
| 75 | 126 | ||
| 127 | node_to_present_cpumask = alloc_node_to_present_cpumask(); | ||
| 128 | if (!node_to_present_cpumask) | ||
| 129 | goto out; | ||
| 130 | |||
| 76 | /* Fill out vectors at the beginning that don't need affinity */ | 131 | /* Fill out vectors at the beginning that don't need affinity */ |
| 77 | for (curvec = 0; curvec < affd->pre_vectors; curvec++) | 132 | for (curvec = 0; curvec < affd->pre_vectors; curvec++) |
| 78 | cpumask_copy(masks + curvec, irq_default_affinity); | 133 | cpumask_copy(masks + curvec, irq_default_affinity); |
| 79 | 134 | ||
| 80 | /* Stabilize the cpumasks */ | 135 | /* Stabilize the cpumasks */ |
| 81 | get_online_cpus(); | 136 | get_online_cpus(); |
| 82 | nodes = get_nodes_in_cpumask(cpu_online_mask, &nodemsk); | 137 | build_node_to_present_cpumask(node_to_present_cpumask); |
| 138 | nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask, | ||
| 139 | &nodemsk); | ||
| 83 | 140 | ||
| 84 | /* | 141 | /* |
| 85 | * If the number of nodes in the mask is greater than or equal the | 142 | * If the number of nodes in the mask is greater than or equal the |
| @@ -87,7 +144,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) | |||
| 87 | */ | 144 | */ |
| 88 | if (affv <= nodes) { | 145 | if (affv <= nodes) { |
| 89 | for_each_node_mask(n, nodemsk) { | 146 | for_each_node_mask(n, nodemsk) { |
| 90 | cpumask_copy(masks + curvec, cpumask_of_node(n)); | 147 | cpumask_copy(masks + curvec, |
| 148 | node_to_present_cpumask[n]); | ||
| 91 | if (++curvec == last_affv) | 149 | if (++curvec == last_affv) |
| 92 | break; | 150 | break; |
| 93 | } | 151 | } |
| @@ -101,7 +159,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) | |||
| 101 | vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; | 159 | vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; |
| 102 | 160 | ||
| 103 | /* Get the cpus on this node which are in the mask */ | 161 | /* Get the cpus on this node which are in the mask */ |
| 104 | cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n)); | 162 | cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]); |
| 105 | 163 | ||
| 106 | /* Calculate the number of cpus per vector */ | 164 | /* Calculate the number of cpus per vector */ |
| 107 | ncpus = cpumask_weight(nmsk); | 165 | ncpus = cpumask_weight(nmsk); |
| @@ -133,6 +191,7 @@ done: | |||
| 133 | /* Fill out vectors at the end that don't need affinity */ | 191 | /* Fill out vectors at the end that don't need affinity */ |
| 134 | for (; curvec < nvecs; curvec++) | 192 | for (; curvec < nvecs; curvec++) |
| 135 | cpumask_copy(masks + curvec, irq_default_affinity); | 193 | cpumask_copy(masks + curvec, irq_default_affinity); |
| 194 | free_node_to_present_cpumask(node_to_present_cpumask); | ||
| 136 | out: | 195 | out: |
| 137 | free_cpumask_var(nmsk); | 196 | free_cpumask_var(nmsk); |
| 138 | return masks; | 197 | return masks; |
| @@ -140,19 +199,21 @@ out: | |||
| 140 | 199 | ||
| 141 | /** | 200 | /** |
| 142 | * irq_calc_affinity_vectors - Calculate the optimal number of vectors | 201 | * irq_calc_affinity_vectors - Calculate the optimal number of vectors |
| 202 | * @minvec: The minimum number of vectors available | ||
| 143 | * @maxvec: The maximum number of vectors available | 203 | * @maxvec: The maximum number of vectors available |
| 144 | * @affd: Description of the affinity requirements | 204 | * @affd: Description of the affinity requirements |
| 145 | */ | 205 | */ |
| 146 | int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd) | 206 | int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd) |
| 147 | { | 207 | { |
| 148 | int resv = affd->pre_vectors + affd->post_vectors; | 208 | int resv = affd->pre_vectors + affd->post_vectors; |
| 149 | int vecs = maxvec - resv; | 209 | int vecs = maxvec - resv; |
| 150 | int cpus; | 210 | int ret; |
| 211 | |||
| 212 | if (resv > minvec) | ||
| 213 | return 0; | ||
| 151 | 214 | ||
| 152 | /* Stabilize the cpumasks */ | ||
| 153 | get_online_cpus(); | 215 | get_online_cpus(); |
| 154 | cpus = cpumask_weight(cpu_online_mask); | 216 | ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv; |
| 155 | put_online_cpus(); | 217 | put_online_cpus(); |
| 156 | 218 | return ret; | |
| 157 | return min(cpus, vecs) + resv; | ||
| 158 | } | 219 | } |
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 0119b9d467ae..d30a0dd5cc02 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c | |||
| @@ -53,7 +53,7 @@ unsigned long probe_irq_on(void) | |||
| 53 | if (desc->irq_data.chip->irq_set_type) | 53 | if (desc->irq_data.chip->irq_set_type) |
| 54 | desc->irq_data.chip->irq_set_type(&desc->irq_data, | 54 | desc->irq_data.chip->irq_set_type(&desc->irq_data, |
| 55 | IRQ_TYPE_PROBE); | 55 | IRQ_TYPE_PROBE); |
| 56 | irq_startup(desc, false); | 56 | irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE); |
| 57 | } | 57 | } |
| 58 | raw_spin_unlock_irq(&desc->lock); | 58 | raw_spin_unlock_irq(&desc->lock); |
| 59 | } | 59 | } |
| @@ -70,7 +70,7 @@ unsigned long probe_irq_on(void) | |||
| 70 | raw_spin_lock_irq(&desc->lock); | 70 | raw_spin_lock_irq(&desc->lock); |
| 71 | if (!desc->action && irq_settings_can_probe(desc)) { | 71 | if (!desc->action && irq_settings_can_probe(desc)) { |
| 72 | desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; | 72 | desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; |
| 73 | if (irq_startup(desc, false)) | 73 | if (irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE)) |
| 74 | desc->istate |= IRQS_PENDING; | 74 | desc->istate |= IRQS_PENDING; |
| 75 | } | 75 | } |
| 76 | raw_spin_unlock_irq(&desc->lock); | 76 | raw_spin_unlock_irq(&desc->lock); |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c94da688ee9b..a3cc37c0c85e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -7,7 +7,7 @@ | |||
| 7 | * This file contains the core interrupt handling code, for irq-chip | 7 | * This file contains the core interrupt handling code, for irq-chip |
| 8 | * based architectures. | 8 | * based architectures. |
| 9 | * | 9 | * |
| 10 | * Detailed information is available in Documentation/DocBook/genericirq | 10 | * Detailed information is available in Documentation/core-api/genericirq.rst |
| 11 | */ | 11 | */ |
| 12 | 12 | ||
| 13 | #include <linux/irq.h> | 13 | #include <linux/irq.h> |
| @@ -170,62 +170,167 @@ static void irq_state_clr_disabled(struct irq_desc *desc) | |||
| 170 | irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); | 170 | irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); |
| 171 | } | 171 | } |
| 172 | 172 | ||
| 173 | static void irq_state_set_disabled(struct irq_desc *desc) | 173 | static void irq_state_clr_masked(struct irq_desc *desc) |
| 174 | { | 174 | { |
| 175 | irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); | 175 | irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); |
| 176 | } | 176 | } |
| 177 | 177 | ||
| 178 | static void irq_state_clr_masked(struct irq_desc *desc) | 178 | static void irq_state_clr_started(struct irq_desc *desc) |
| 179 | { | 179 | { |
| 180 | irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); | 180 | irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); |
| 181 | } | 181 | } |
| 182 | 182 | ||
| 183 | static void irq_state_set_masked(struct irq_desc *desc) | 183 | static void irq_state_set_started(struct irq_desc *desc) |
| 184 | { | 184 | { |
| 185 | irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); | 185 | irqd_set(&desc->irq_data, IRQD_IRQ_STARTED); |
| 186 | } | 186 | } |
| 187 | 187 | ||
| 188 | int irq_startup(struct irq_desc *desc, bool resend) | 188 | enum { |
| 189 | IRQ_STARTUP_NORMAL, | ||
| 190 | IRQ_STARTUP_MANAGED, | ||
| 191 | IRQ_STARTUP_ABORT, | ||
| 192 | }; | ||
| 193 | |||
| 194 | #ifdef CONFIG_SMP | ||
| 195 | static int | ||
| 196 | __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) | ||
| 189 | { | 197 | { |
| 190 | int ret = 0; | 198 | struct irq_data *d = irq_desc_get_irq_data(desc); |
| 191 | 199 | ||
| 192 | irq_state_clr_disabled(desc); | 200 | if (!irqd_affinity_is_managed(d)) |
| 193 | desc->depth = 0; | 201 | return IRQ_STARTUP_NORMAL; |
| 202 | |||
| 203 | irqd_clr_managed_shutdown(d); | ||
| 204 | |||
| 205 | if (cpumask_any_and(aff, cpu_online_mask) > nr_cpu_ids) { | ||
| 206 | /* | ||
| 207 | * Catch code which fiddles with enable_irq() on a managed | ||
| 208 | * and potentially shutdown IRQ. Chained interrupt | ||
| 209 | * installment or irq auto probing should not happen on | ||
| 210 | * managed irqs either. Emit a warning, break the affinity | ||
| 211 | * and start it up as a normal interrupt. | ||
| 212 | */ | ||
| 213 | if (WARN_ON_ONCE(force)) | ||
| 214 | return IRQ_STARTUP_NORMAL; | ||
| 215 | /* | ||
| 216 | * The interrupt was requested, but there is no online CPU | ||
| 217 | * in it's affinity mask. Put it into managed shutdown | ||
| 218 | * state and let the cpu hotplug mechanism start it up once | ||
| 219 | * a CPU in the mask becomes available. | ||
| 220 | */ | ||
| 221 | irqd_set_managed_shutdown(d); | ||
| 222 | return IRQ_STARTUP_ABORT; | ||
| 223 | } | ||
| 224 | return IRQ_STARTUP_MANAGED; | ||
| 225 | } | ||
| 226 | #else | ||
| 227 | static __always_inline int | ||
| 228 | __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) | ||
| 229 | { | ||
| 230 | return IRQ_STARTUP_NORMAL; | ||
| 231 | } | ||
| 232 | #endif | ||
| 194 | 233 | ||
| 195 | irq_domain_activate_irq(&desc->irq_data); | 234 | static int __irq_startup(struct irq_desc *desc) |
| 196 | if (desc->irq_data.chip->irq_startup) { | 235 | { |
| 197 | ret = desc->irq_data.chip->irq_startup(&desc->irq_data); | 236 | struct irq_data *d = irq_desc_get_irq_data(desc); |
| 237 | int ret = 0; | ||
| 238 | |||
| 239 | irq_domain_activate_irq(d); | ||
| 240 | if (d->chip->irq_startup) { | ||
| 241 | ret = d->chip->irq_startup(d); | ||
| 242 | irq_state_clr_disabled(desc); | ||
| 198 | irq_state_clr_masked(desc); | 243 | irq_state_clr_masked(desc); |
| 199 | } else { | 244 | } else { |
| 200 | irq_enable(desc); | 245 | irq_enable(desc); |
| 201 | } | 246 | } |
| 247 | irq_state_set_started(desc); | ||
| 248 | return ret; | ||
| 249 | } | ||
| 250 | |||
| 251 | int irq_startup(struct irq_desc *desc, bool resend, bool force) | ||
| 252 | { | ||
| 253 | struct irq_data *d = irq_desc_get_irq_data(desc); | ||
| 254 | struct cpumask *aff = irq_data_get_affinity_mask(d); | ||
| 255 | int ret = 0; | ||
| 256 | |||
| 257 | desc->depth = 0; | ||
| 258 | |||
| 259 | if (irqd_is_started(d)) { | ||
| 260 | irq_enable(desc); | ||
| 261 | } else { | ||
| 262 | switch (__irq_startup_managed(desc, aff, force)) { | ||
| 263 | case IRQ_STARTUP_NORMAL: | ||
| 264 | ret = __irq_startup(desc); | ||
| 265 | irq_setup_affinity(desc); | ||
| 266 | break; | ||
| 267 | case IRQ_STARTUP_MANAGED: | ||
| 268 | ret = __irq_startup(desc); | ||
| 269 | irq_set_affinity_locked(d, aff, false); | ||
| 270 | break; | ||
| 271 | case IRQ_STARTUP_ABORT: | ||
| 272 | return 0; | ||
| 273 | } | ||
| 274 | } | ||
| 202 | if (resend) | 275 | if (resend) |
| 203 | check_irq_resend(desc); | 276 | check_irq_resend(desc); |
| 277 | |||
| 204 | return ret; | 278 | return ret; |
| 205 | } | 279 | } |
| 206 | 280 | ||
| 281 | static void __irq_disable(struct irq_desc *desc, bool mask); | ||
| 282 | |||
| 207 | void irq_shutdown(struct irq_desc *desc) | 283 | void irq_shutdown(struct irq_desc *desc) |
| 208 | { | 284 | { |
| 209 | irq_state_set_disabled(desc); | 285 | if (irqd_is_started(&desc->irq_data)) { |
| 210 | desc->depth = 1; | 286 | desc->depth = 1; |
| 211 | if (desc->irq_data.chip->irq_shutdown) | 287 | if (desc->irq_data.chip->irq_shutdown) { |
| 212 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); | 288 | desc->irq_data.chip->irq_shutdown(&desc->irq_data); |
| 213 | else if (desc->irq_data.chip->irq_disable) | 289 | irq_state_set_disabled(desc); |
| 214 | desc->irq_data.chip->irq_disable(&desc->irq_data); | 290 | irq_state_set_masked(desc); |
| 215 | else | 291 | } else { |
| 216 | desc->irq_data.chip->irq_mask(&desc->irq_data); | 292 | __irq_disable(desc, true); |
| 293 | } | ||
| 294 | irq_state_clr_started(desc); | ||
| 295 | } | ||
| 296 | /* | ||
| 297 | * This must be called even if the interrupt was never started up, | ||
| 298 | * because the activation can happen before the interrupt is | ||
| 299 | * available for request/startup. It has it's own state tracking so | ||
| 300 | * it's safe to call it unconditionally. | ||
| 301 | */ | ||
| 217 | irq_domain_deactivate_irq(&desc->irq_data); | 302 | irq_domain_deactivate_irq(&desc->irq_data); |
| 218 | irq_state_set_masked(desc); | ||
| 219 | } | 303 | } |
| 220 | 304 | ||
| 221 | void irq_enable(struct irq_desc *desc) | 305 | void irq_enable(struct irq_desc *desc) |
| 222 | { | 306 | { |
| 223 | irq_state_clr_disabled(desc); | 307 | if (!irqd_irq_disabled(&desc->irq_data)) { |
| 224 | if (desc->irq_data.chip->irq_enable) | 308 | unmask_irq(desc); |
| 225 | desc->irq_data.chip->irq_enable(&desc->irq_data); | 309 | } else { |
| 226 | else | 310 | irq_state_clr_disabled(desc); |
| 227 | desc->irq_data.chip->irq_unmask(&desc->irq_data); | 311 | if (desc->irq_data.chip->irq_enable) { |
| 228 | irq_state_clr_masked(desc); | 312 | desc->irq_data.chip->irq_enable(&desc->irq_data); |
| 313 | irq_state_clr_masked(desc); | ||
| 314 | } else { | ||
| 315 | unmask_irq(desc); | ||
| 316 | } | ||
| 317 | } | ||
| 318 | } | ||
| 319 | |||
| 320 | static void __irq_disable(struct irq_desc *desc, bool mask) | ||
| 321 | { | ||
| 322 | if (irqd_irq_disabled(&desc->irq_data)) { | ||
| 323 | if (mask) | ||
| 324 | mask_irq(desc); | ||
| 325 | } else { | ||
| 326 | irq_state_set_disabled(desc); | ||
| 327 | if (desc->irq_data.chip->irq_disable) { | ||
| 328 | desc->irq_data.chip->irq_disable(&desc->irq_data); | ||
| 329 | irq_state_set_masked(desc); | ||
| 330 | } else if (mask) { | ||
| 331 | mask_irq(desc); | ||
| 332 | } | ||
| 333 | } | ||
| 229 | } | 334 | } |
| 230 | 335 | ||
| 231 | /** | 336 | /** |
| @@ -250,13 +355,7 @@ void irq_enable(struct irq_desc *desc) | |||
| 250 | */ | 355 | */ |
| 251 | void irq_disable(struct irq_desc *desc) | 356 | void irq_disable(struct irq_desc *desc) |
| 252 | { | 357 | { |
| 253 | irq_state_set_disabled(desc); | 358 | __irq_disable(desc, irq_settings_disable_unlazy(desc)); |
| 254 | if (desc->irq_data.chip->irq_disable) { | ||
| 255 | desc->irq_data.chip->irq_disable(&desc->irq_data); | ||
| 256 | irq_state_set_masked(desc); | ||
| 257 | } else if (irq_settings_disable_unlazy(desc)) { | ||
| 258 | mask_irq(desc); | ||
| 259 | } | ||
| 260 | } | 359 | } |
| 261 | 360 | ||
| 262 | void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu) | 361 | void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu) |
| @@ -279,18 +378,21 @@ void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu) | |||
| 279 | 378 | ||
| 280 | static inline void mask_ack_irq(struct irq_desc *desc) | 379 | static inline void mask_ack_irq(struct irq_desc *desc) |
| 281 | { | 380 | { |
| 282 | if (desc->irq_data.chip->irq_mask_ack) | 381 | if (desc->irq_data.chip->irq_mask_ack) { |
| 283 | desc->irq_data.chip->irq_mask_ack(&desc->irq_data); | 382 | desc->irq_data.chip->irq_mask_ack(&desc->irq_data); |
| 284 | else { | 383 | irq_state_set_masked(desc); |
| 285 | desc->irq_data.chip->irq_mask(&desc->irq_data); | 384 | } else { |
| 385 | mask_irq(desc); | ||
| 286 | if (desc->irq_data.chip->irq_ack) | 386 | if (desc->irq_data.chip->irq_ack) |
| 287 | desc->irq_data.chip->irq_ack(&desc->irq_data); | 387 | desc->irq_data.chip->irq_ack(&desc->irq_data); |
| 288 | } | 388 | } |
| 289 | irq_state_set_masked(desc); | ||
| 290 | } | 389 | } |
| 291 | 390 | ||
| 292 | void mask_irq(struct irq_desc *desc) | 391 | void mask_irq(struct irq_desc *desc) |
| 293 | { | 392 | { |
| 393 | if (irqd_irq_masked(&desc->irq_data)) | ||
| 394 | return; | ||
| 395 | |||
| 294 | if (desc->irq_data.chip->irq_mask) { | 396 | if (desc->irq_data.chip->irq_mask) { |
| 295 | desc->irq_data.chip->irq_mask(&desc->irq_data); | 397 | desc->irq_data.chip->irq_mask(&desc->irq_data); |
| 296 | irq_state_set_masked(desc); | 398 | irq_state_set_masked(desc); |
| @@ -299,6 +401,9 @@ void mask_irq(struct irq_desc *desc) | |||
| 299 | 401 | ||
| 300 | void unmask_irq(struct irq_desc *desc) | 402 | void unmask_irq(struct irq_desc *desc) |
| 301 | { | 403 | { |
| 404 | if (!irqd_irq_masked(&desc->irq_data)) | ||
| 405 | return; | ||
| 406 | |||
| 302 | if (desc->irq_data.chip->irq_unmask) { | 407 | if (desc->irq_data.chip->irq_unmask) { |
| 303 | desc->irq_data.chip->irq_unmask(&desc->irq_data); | 408 | desc->irq_data.chip->irq_unmask(&desc->irq_data); |
| 304 | irq_state_clr_masked(desc); | 409 | irq_state_clr_masked(desc); |
| @@ -312,10 +417,7 @@ void unmask_threaded_irq(struct irq_desc *desc) | |||
| 312 | if (chip->flags & IRQCHIP_EOI_THREADED) | 417 | if (chip->flags & IRQCHIP_EOI_THREADED) |
| 313 | chip->irq_eoi(&desc->irq_data); | 418 | chip->irq_eoi(&desc->irq_data); |
| 314 | 419 | ||
| 315 | if (chip->irq_unmask) { | 420 | unmask_irq(desc); |
| 316 | chip->irq_unmask(&desc->irq_data); | ||
| 317 | irq_state_clr_masked(desc); | ||
| 318 | } | ||
| 319 | } | 421 | } |
| 320 | 422 | ||
| 321 | /* | 423 | /* |
| @@ -851,7 +953,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, | |||
| 851 | irq_settings_set_norequest(desc); | 953 | irq_settings_set_norequest(desc); |
| 852 | irq_settings_set_nothread(desc); | 954 | irq_settings_set_nothread(desc); |
| 853 | desc->action = &chained_action; | 955 | desc->action = &chained_action; |
| 854 | irq_startup(desc, true); | 956 | irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE); |
| 855 | } | 957 | } |
| 856 | } | 958 | } |
| 857 | 959 | ||
| @@ -903,6 +1005,13 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) | |||
| 903 | 1005 | ||
| 904 | if (!desc) | 1006 | if (!desc) |
| 905 | return; | 1007 | return; |
| 1008 | |||
| 1009 | /* | ||
| 1010 | * Warn when a driver sets the no autoenable flag on an already | ||
| 1011 | * active interrupt. | ||
| 1012 | */ | ||
| 1013 | WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN)); | ||
| 1014 | |||
| 906 | irq_settings_clr_and_set(desc, clr, set); | 1015 | irq_settings_clr_and_set(desc, clr, set); |
| 907 | 1016 | ||
| 908 | irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | | 1017 | irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | |
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 011f8c4c63da..aee8f7ec40af 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c | |||
| @@ -14,37 +14,99 @@ | |||
| 14 | 14 | ||
| 15 | #include "internals.h" | 15 | #include "internals.h" |
| 16 | 16 | ||
| 17 | /* For !GENERIC_IRQ_EFFECTIVE_AFF_MASK this looks at general affinity mask */ | ||
| 18 | static inline bool irq_needs_fixup(struct irq_data *d) | ||
| 19 | { | ||
| 20 | const struct cpumask *m = irq_data_get_effective_affinity_mask(d); | ||
| 21 | |||
| 22 | return cpumask_test_cpu(smp_processor_id(), m); | ||
| 23 | } | ||
| 24 | |||
| 17 | static bool migrate_one_irq(struct irq_desc *desc) | 25 | static bool migrate_one_irq(struct irq_desc *desc) |
| 18 | { | 26 | { |
| 19 | struct irq_data *d = irq_desc_get_irq_data(desc); | 27 | struct irq_data *d = irq_desc_get_irq_data(desc); |
| 20 | const struct cpumask *affinity = d->common->affinity; | 28 | struct irq_chip *chip = irq_data_get_irq_chip(d); |
| 21 | struct irq_chip *c; | 29 | bool maskchip = !irq_can_move_pcntxt(d) && !irqd_irq_masked(d); |
| 22 | bool ret = false; | 30 | const struct cpumask *affinity; |
| 31 | bool brokeaff = false; | ||
| 32 | int err; | ||
| 23 | 33 | ||
| 24 | /* | 34 | /* |
| 25 | * If this is a per-CPU interrupt, or the affinity does not | 35 | * IRQ chip might be already torn down, but the irq descriptor is |
| 26 | * include this CPU, then we have nothing to do. | 36 | * still in the radix tree. Also if the chip has no affinity setter, |
| 37 | * nothing can be done here. | ||
| 27 | */ | 38 | */ |
| 28 | if (irqd_is_per_cpu(d) || | 39 | if (!chip || !chip->irq_set_affinity) { |
| 29 | !cpumask_test_cpu(smp_processor_id(), affinity)) | 40 | pr_debug("IRQ %u: Unable to migrate away\n", d->irq); |
| 30 | return false; | 41 | return false; |
| 42 | } | ||
| 43 | |||
| 44 | /* | ||
| 45 | * No move required, if: | ||
| 46 | * - Interrupt is per cpu | ||
| 47 | * - Interrupt is not started | ||
| 48 | * - Affinity mask does not include this CPU. | ||
| 49 | * | ||
| 50 | * Note: Do not check desc->action as this might be a chained | ||
| 51 | * interrupt. | ||
| 52 | */ | ||
| 53 | if (irqd_is_per_cpu(d) || !irqd_is_started(d) || !irq_needs_fixup(d)) { | ||
| 54 | /* | ||
| 55 | * If an irq move is pending, abort it if the dying CPU is | ||
| 56 | * the sole target. | ||
| 57 | */ | ||
| 58 | irq_fixup_move_pending(desc, false); | ||
| 59 | return false; | ||
| 60 | } | ||
| 61 | |||
| 62 | /* | ||
| 63 | * Complete an eventually pending irq move cleanup. If this | ||
| 64 | * interrupt was moved in hard irq context, then the vectors need | ||
| 65 | * to be cleaned up. It can't wait until this interrupt actually | ||
| 66 | * happens and this CPU was involved. | ||
| 67 | */ | ||
| 68 | irq_force_complete_move(desc); | ||
| 69 | |||
| 70 | /* | ||
| 71 | * If there is a setaffinity pending, then try to reuse the pending | ||
| 72 | * mask, so the last change of the affinity does not get lost. If | ||
| 73 | * there is no move pending or the pending mask does not contain | ||
| 74 | * any online CPU, use the current affinity mask. | ||
| 75 | */ | ||
| 76 | if (irq_fixup_move_pending(desc, true)) | ||
| 77 | affinity = irq_desc_get_pending_mask(desc); | ||
| 78 | else | ||
| 79 | affinity = irq_data_get_affinity_mask(d); | ||
| 80 | |||
| 81 | /* Mask the chip for interrupts which cannot move in process context */ | ||
| 82 | if (maskchip && chip->irq_mask) | ||
| 83 | chip->irq_mask(d); | ||
| 31 | 84 | ||
| 32 | if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { | 85 | if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { |
| 86 | /* | ||
| 87 | * If the interrupt is managed, then shut it down and leave | ||
| 88 | * the affinity untouched. | ||
| 89 | */ | ||
| 90 | if (irqd_affinity_is_managed(d)) { | ||
| 91 | irqd_set_managed_shutdown(d); | ||
| 92 | irq_shutdown(desc); | ||
| 93 | return false; | ||
| 94 | } | ||
| 33 | affinity = cpu_online_mask; | 95 | affinity = cpu_online_mask; |
| 34 | ret = true; | 96 | brokeaff = true; |
| 35 | } | 97 | } |
| 36 | 98 | ||
| 37 | c = irq_data_get_irq_chip(d); | 99 | err = irq_do_set_affinity(d, affinity, true); |
| 38 | if (!c->irq_set_affinity) { | 100 | if (err) { |
| 39 | pr_debug("IRQ%u: unable to set affinity\n", d->irq); | 101 | pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", |
| 40 | } else { | 102 | d->irq, err); |
| 41 | int r = irq_do_set_affinity(d, affinity, false); | 103 | brokeaff = false; |
| 42 | if (r) | ||
| 43 | pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", | ||
| 44 | d->irq, r); | ||
| 45 | } | 104 | } |
| 46 | 105 | ||
| 47 | return ret; | 106 | if (maskchip && chip->irq_unmask) |
| 107 | chip->irq_unmask(d); | ||
| 108 | |||
| 109 | return brokeaff; | ||
| 48 | } | 110 | } |
| 49 | 111 | ||
| 50 | /** | 112 | /** |
| @@ -59,11 +121,8 @@ static bool migrate_one_irq(struct irq_desc *desc) | |||
| 59 | */ | 121 | */ |
| 60 | void irq_migrate_all_off_this_cpu(void) | 122 | void irq_migrate_all_off_this_cpu(void) |
| 61 | { | 123 | { |
| 62 | unsigned int irq; | ||
| 63 | struct irq_desc *desc; | 124 | struct irq_desc *desc; |
| 64 | unsigned long flags; | 125 | unsigned int irq; |
| 65 | |||
| 66 | local_irq_save(flags); | ||
| 67 | 126 | ||
| 68 | for_each_active_irq(irq) { | 127 | for_each_active_irq(irq) { |
| 69 | bool affinity_broken; | 128 | bool affinity_broken; |
| @@ -73,10 +132,53 @@ void irq_migrate_all_off_this_cpu(void) | |||
| 73 | affinity_broken = migrate_one_irq(desc); | 132 | affinity_broken = migrate_one_irq(desc); |
| 74 | raw_spin_unlock(&desc->lock); | 133 | raw_spin_unlock(&desc->lock); |
| 75 | 134 | ||
| 76 | if (affinity_broken) | 135 | if (affinity_broken) { |
| 77 | pr_warn_ratelimited("IRQ%u no longer affine to CPU%u\n", | 136 | pr_warn_ratelimited("IRQ %u: no longer affine to CPU%u\n", |
| 78 | irq, smp_processor_id()); | 137 | irq, smp_processor_id()); |
| 138 | } | ||
| 139 | } | ||
| 140 | } | ||
| 141 | |||
| 142 | static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu) | ||
| 143 | { | ||
| 144 | struct irq_data *data = irq_desc_get_irq_data(desc); | ||
| 145 | const struct cpumask *affinity = irq_data_get_affinity_mask(data); | ||
| 146 | |||
| 147 | if (!irqd_affinity_is_managed(data) || !desc->action || | ||
| 148 | !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity)) | ||
| 149 | return; | ||
| 150 | |||
| 151 | if (irqd_is_managed_and_shutdown(data)) { | ||
| 152 | irq_startup(desc, IRQ_RESEND, IRQ_START_COND); | ||
| 153 | return; | ||
| 154 | } | ||
| 155 | |||
| 156 | /* | ||
| 157 | * If the interrupt can only be directed to a single target | ||
| 158 | * CPU then it is already assigned to a CPU in the affinity | ||
| 159 | * mask. No point in trying to move it around. | ||
| 160 | */ | ||
| 161 | if (!irqd_is_single_target(data)) | ||
| 162 | irq_set_affinity_locked(data, affinity, false); | ||
| 163 | } | ||
| 164 | |||
| 165 | /** | ||
| 166 | * irq_affinity_online_cpu - Restore affinity for managed interrupts | ||
| 167 | * @cpu: Upcoming CPU for which interrupts should be restored | ||
| 168 | */ | ||
| 169 | int irq_affinity_online_cpu(unsigned int cpu) | ||
| 170 | { | ||
| 171 | struct irq_desc *desc; | ||
| 172 | unsigned int irq; | ||
| 173 | |||
| 174 | irq_lock_sparse(); | ||
| 175 | for_each_active_irq(irq) { | ||
| 176 | desc = irq_to_desc(irq); | ||
| 177 | raw_spin_lock_irq(&desc->lock); | ||
| 178 | irq_restore_affinity_of_irq(desc, cpu); | ||
| 179 | raw_spin_unlock_irq(&desc->lock); | ||
| 79 | } | 180 | } |
| 181 | irq_unlock_sparse(); | ||
| 80 | 182 | ||
| 81 | local_irq_restore(flags); | 183 | return 0; |
| 82 | } | 184 | } |
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c new file mode 100644 index 000000000000..4d384edc0c64 --- /dev/null +++ b/kernel/irq/debugfs.c | |||
| @@ -0,0 +1,213 @@ | |||
| 1 | /* | ||
| 2 | * Copyright 2017 Thomas Gleixner <tglx@linutronix.de> | ||
| 3 | * | ||
| 4 | * This file is licensed under the GPL V2. | ||
| 5 | */ | ||
| 6 | #include <linux/irqdomain.h> | ||
| 7 | #include <linux/irq.h> | ||
| 8 | |||
| 9 | #include "internals.h" | ||
| 10 | |||
| 11 | static struct dentry *irq_dir; | ||
| 12 | |||
| 13 | struct irq_bit_descr { | ||
| 14 | unsigned int mask; | ||
| 15 | char *name; | ||
| 16 | }; | ||
| 17 | #define BIT_MASK_DESCR(m) { .mask = m, .name = #m } | ||
| 18 | |||
| 19 | static void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state, | ||
| 20 | const struct irq_bit_descr *sd, int size) | ||
| 21 | { | ||
| 22 | int i; | ||
| 23 | |||
| 24 | for (i = 0; i < size; i++, sd++) { | ||
| 25 | if (state & sd->mask) | ||
| 26 | seq_printf(m, "%*s%s\n", ind + 12, "", sd->name); | ||
| 27 | } | ||
| 28 | } | ||
| 29 | |||
| 30 | #ifdef CONFIG_SMP | ||
| 31 | static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc) | ||
| 32 | { | ||
| 33 | struct irq_data *data = irq_desc_get_irq_data(desc); | ||
| 34 | struct cpumask *msk; | ||
| 35 | |||
| 36 | msk = irq_data_get_affinity_mask(data); | ||
| 37 | seq_printf(m, "affinity: %*pbl\n", cpumask_pr_args(msk)); | ||
| 38 | #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK | ||
| 39 | msk = irq_data_get_effective_affinity_mask(data); | ||
| 40 | seq_printf(m, "effectiv: %*pbl\n", cpumask_pr_args(msk)); | ||
| 41 | #endif | ||
| 42 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
| 43 | msk = desc->pending_mask; | ||
| 44 | seq_printf(m, "pending: %*pbl\n", cpumask_pr_args(msk)); | ||
| 45 | #endif | ||
| 46 | } | ||
| 47 | #else | ||
| 48 | static void irq_debug_show_masks(struct seq_file *m, struct irq_desc *desc) { } | ||
| 49 | #endif | ||
| 50 | |||
| 51 | static const struct irq_bit_descr irqchip_flags[] = { | ||
| 52 | BIT_MASK_DESCR(IRQCHIP_SET_TYPE_MASKED), | ||
| 53 | BIT_MASK_DESCR(IRQCHIP_EOI_IF_HANDLED), | ||
| 54 | BIT_MASK_DESCR(IRQCHIP_MASK_ON_SUSPEND), | ||
| 55 | BIT_MASK_DESCR(IRQCHIP_ONOFFLINE_ENABLED), | ||
| 56 | BIT_MASK_DESCR(IRQCHIP_SKIP_SET_WAKE), | ||
| 57 | BIT_MASK_DESCR(IRQCHIP_ONESHOT_SAFE), | ||
| 58 | BIT_MASK_DESCR(IRQCHIP_EOI_THREADED), | ||
| 59 | }; | ||
| 60 | |||
| 61 | static void | ||
| 62 | irq_debug_show_chip(struct seq_file *m, struct irq_data *data, int ind) | ||
| 63 | { | ||
| 64 | struct irq_chip *chip = data->chip; | ||
| 65 | |||
| 66 | if (!chip) { | ||
| 67 | seq_printf(m, "chip: None\n"); | ||
| 68 | return; | ||
| 69 | } | ||
| 70 | seq_printf(m, "%*schip: %s\n", ind, "", chip->name); | ||
| 71 | seq_printf(m, "%*sflags: 0x%lx\n", ind + 1, "", chip->flags); | ||
| 72 | irq_debug_show_bits(m, ind, chip->flags, irqchip_flags, | ||
| 73 | ARRAY_SIZE(irqchip_flags)); | ||
| 74 | } | ||
| 75 | |||
| 76 | static void | ||
| 77 | irq_debug_show_data(struct seq_file *m, struct irq_data *data, int ind) | ||
| 78 | { | ||
| 79 | seq_printf(m, "%*sdomain: %s\n", ind, "", | ||
| 80 | data->domain ? data->domain->name : ""); | ||
| 81 | seq_printf(m, "%*shwirq: 0x%lx\n", ind + 1, "", data->hwirq); | ||
| 82 | irq_debug_show_chip(m, data, ind + 1); | ||
| 83 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
| 84 | if (!data->parent_data) | ||
| 85 | return; | ||
| 86 | seq_printf(m, "%*sparent:\n", ind + 1, ""); | ||
| 87 | irq_debug_show_data(m, data->parent_data, ind + 4); | ||
| 88 | #endif | ||
| 89 | } | ||
| 90 | |||
| 91 | static const struct irq_bit_descr irqdata_states[] = { | ||
| 92 | BIT_MASK_DESCR(IRQ_TYPE_EDGE_RISING), | ||
| 93 | BIT_MASK_DESCR(IRQ_TYPE_EDGE_FALLING), | ||
| 94 | BIT_MASK_DESCR(IRQ_TYPE_LEVEL_HIGH), | ||
| 95 | BIT_MASK_DESCR(IRQ_TYPE_LEVEL_LOW), | ||
| 96 | BIT_MASK_DESCR(IRQD_LEVEL), | ||
| 97 | |||
| 98 | BIT_MASK_DESCR(IRQD_ACTIVATED), | ||
| 99 | BIT_MASK_DESCR(IRQD_IRQ_STARTED), | ||
| 100 | BIT_MASK_DESCR(IRQD_IRQ_DISABLED), | ||
| 101 | BIT_MASK_DESCR(IRQD_IRQ_MASKED), | ||
| 102 | BIT_MASK_DESCR(IRQD_IRQ_INPROGRESS), | ||
| 103 | |||
| 104 | BIT_MASK_DESCR(IRQD_PER_CPU), | ||
| 105 | BIT_MASK_DESCR(IRQD_NO_BALANCING), | ||
| 106 | |||
| 107 | BIT_MASK_DESCR(IRQD_SINGLE_TARGET), | ||
| 108 | BIT_MASK_DESCR(IRQD_MOVE_PCNTXT), | ||
| 109 | BIT_MASK_DESCR(IRQD_AFFINITY_SET), | ||
| 110 | BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), | ||
| 111 | BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), | ||
| 112 | BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), | ||
| 113 | |||
| 114 | BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), | ||
| 115 | |||
| 116 | BIT_MASK_DESCR(IRQD_WAKEUP_STATE), | ||
| 117 | BIT_MASK_DESCR(IRQD_WAKEUP_ARMED), | ||
| 118 | }; | ||
| 119 | |||
| 120 | static const struct irq_bit_descr irqdesc_states[] = { | ||
| 121 | BIT_MASK_DESCR(_IRQ_NOPROBE), | ||
| 122 | BIT_MASK_DESCR(_IRQ_NOREQUEST), | ||
| 123 | BIT_MASK_DESCR(_IRQ_NOTHREAD), | ||
| 124 | BIT_MASK_DESCR(_IRQ_NOAUTOEN), | ||
| 125 | BIT_MASK_DESCR(_IRQ_NESTED_THREAD), | ||
| 126 | BIT_MASK_DESCR(_IRQ_PER_CPU_DEVID), | ||
| 127 | BIT_MASK_DESCR(_IRQ_IS_POLLED), | ||
| 128 | BIT_MASK_DESCR(_IRQ_DISABLE_UNLAZY), | ||
| 129 | }; | ||
| 130 | |||
| 131 | static const struct irq_bit_descr irqdesc_istates[] = { | ||
| 132 | BIT_MASK_DESCR(IRQS_AUTODETECT), | ||
| 133 | BIT_MASK_DESCR(IRQS_SPURIOUS_DISABLED), | ||
| 134 | BIT_MASK_DESCR(IRQS_POLL_INPROGRESS), | ||
| 135 | BIT_MASK_DESCR(IRQS_ONESHOT), | ||
| 136 | BIT_MASK_DESCR(IRQS_REPLAY), | ||
| 137 | BIT_MASK_DESCR(IRQS_WAITING), | ||
| 138 | BIT_MASK_DESCR(IRQS_PENDING), | ||
| 139 | BIT_MASK_DESCR(IRQS_SUSPENDED), | ||
| 140 | }; | ||
| 141 | |||
| 142 | |||
| 143 | static int irq_debug_show(struct seq_file *m, void *p) | ||
| 144 | { | ||
| 145 | struct irq_desc *desc = m->private; | ||
| 146 | struct irq_data *data; | ||
| 147 | |||
| 148 | raw_spin_lock_irq(&desc->lock); | ||
| 149 | data = irq_desc_get_irq_data(desc); | ||
| 150 | seq_printf(m, "handler: %pf\n", desc->handle_irq); | ||
| 151 | seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); | ||
| 152 | irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, | ||
| 153 | ARRAY_SIZE(irqdesc_states)); | ||
| 154 | seq_printf(m, "istate: 0x%08x\n", desc->istate); | ||
| 155 | irq_debug_show_bits(m, 0, desc->istate, irqdesc_istates, | ||
| 156 | ARRAY_SIZE(irqdesc_istates)); | ||
| 157 | seq_printf(m, "ddepth: %u\n", desc->depth); | ||
| 158 | seq_printf(m, "wdepth: %u\n", desc->wake_depth); | ||
| 159 | seq_printf(m, "dstate: 0x%08x\n", irqd_get(data)); | ||
| 160 | irq_debug_show_bits(m, 0, irqd_get(data), irqdata_states, | ||
| 161 | ARRAY_SIZE(irqdata_states)); | ||
| 162 | seq_printf(m, "node: %d\n", irq_data_get_node(data)); | ||
| 163 | irq_debug_show_masks(m, desc); | ||
| 164 | irq_debug_show_data(m, data, 0); | ||
| 165 | raw_spin_unlock_irq(&desc->lock); | ||
| 166 | return 0; | ||
| 167 | } | ||
| 168 | |||
| 169 | static int irq_debug_open(struct inode *inode, struct file *file) | ||
| 170 | { | ||
| 171 | return single_open(file, irq_debug_show, inode->i_private); | ||
| 172 | } | ||
| 173 | |||
| 174 | static const struct file_operations dfs_irq_ops = { | ||
| 175 | .open = irq_debug_open, | ||
| 176 | .read = seq_read, | ||
| 177 | .llseek = seq_lseek, | ||
| 178 | .release = single_release, | ||
| 179 | }; | ||
| 180 | |||
| 181 | void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc) | ||
| 182 | { | ||
| 183 | char name [10]; | ||
| 184 | |||
| 185 | if (!irq_dir || !desc || desc->debugfs_file) | ||
| 186 | return; | ||
| 187 | |||
| 188 | sprintf(name, "%d", irq); | ||
| 189 | desc->debugfs_file = debugfs_create_file(name, 0444, irq_dir, desc, | ||
| 190 | &dfs_irq_ops); | ||
| 191 | } | ||
| 192 | |||
| 193 | static int __init irq_debugfs_init(void) | ||
| 194 | { | ||
| 195 | struct dentry *root_dir; | ||
| 196 | int irq; | ||
| 197 | |||
| 198 | root_dir = debugfs_create_dir("irq", NULL); | ||
| 199 | if (!root_dir) | ||
| 200 | return -ENOMEM; | ||
| 201 | |||
| 202 | irq_domain_debugfs_init(root_dir); | ||
| 203 | |||
| 204 | irq_dir = debugfs_create_dir("irqs", root_dir); | ||
| 205 | |||
| 206 | irq_lock_sparse(); | ||
| 207 | for_each_active_irq(irq) | ||
| 208 | irq_add_debugfs_entry(irq, irq_to_desc(irq)); | ||
| 209 | irq_unlock_sparse(); | ||
| 210 | |||
| 211 | return 0; | ||
| 212 | } | ||
| 213 | __initcall(irq_debugfs_init); | ||
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 1613bfd48365..194c506d9d20 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c | |||
| @@ -4,6 +4,8 @@ | |||
| 4 | #include <linux/gfp.h> | 4 | #include <linux/gfp.h> |
| 5 | #include <linux/irq.h> | 5 | #include <linux/irq.h> |
| 6 | 6 | ||
| 7 | #include "internals.h" | ||
| 8 | |||
| 7 | /* | 9 | /* |
| 8 | * Device resource management aware IRQ request/free implementation. | 10 | * Device resource management aware IRQ request/free implementation. |
| 9 | */ | 11 | */ |
| @@ -198,3 +200,87 @@ int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from, | |||
| 198 | return base; | 200 | return base; |
| 199 | } | 201 | } |
| 200 | EXPORT_SYMBOL_GPL(__devm_irq_alloc_descs); | 202 | EXPORT_SYMBOL_GPL(__devm_irq_alloc_descs); |
| 203 | |||
| 204 | #ifdef CONFIG_GENERIC_IRQ_CHIP | ||
| 205 | /** | ||
| 206 | * devm_irq_alloc_generic_chip - Allocate and initialize a generic chip | ||
| 207 | * for a managed device | ||
| 208 | * @dev: Device to allocate the generic chip for | ||
| 209 | * @name: Name of the irq chip | ||
| 210 | * @num_ct: Number of irq_chip_type instances associated with this | ||
| 211 | * @irq_base: Interrupt base nr for this chip | ||
| 212 | * @reg_base: Register base address (virtual) | ||
| 213 | * @handler: Default flow handler associated with this chip | ||
| 214 | * | ||
| 215 | * Returns an initialized irq_chip_generic structure. The chip defaults | ||
| 216 | * to the primary (index 0) irq_chip_type and @handler | ||
| 217 | */ | ||
| 218 | struct irq_chip_generic * | ||
| 219 | devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, | ||
| 220 | unsigned int irq_base, void __iomem *reg_base, | ||
| 221 | irq_flow_handler_t handler) | ||
| 222 | { | ||
| 223 | struct irq_chip_generic *gc; | ||
| 224 | unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); | ||
| 225 | |||
| 226 | gc = devm_kzalloc(dev, sz, GFP_KERNEL); | ||
| 227 | if (gc) | ||
| 228 | irq_init_generic_chip(gc, name, num_ct, | ||
| 229 | irq_base, reg_base, handler); | ||
| 230 | |||
| 231 | return gc; | ||
| 232 | } | ||
| 233 | EXPORT_SYMBOL_GPL(devm_irq_alloc_generic_chip); | ||
| 234 | |||
| 235 | struct irq_generic_chip_devres { | ||
| 236 | struct irq_chip_generic *gc; | ||
| 237 | u32 msk; | ||
| 238 | unsigned int clr; | ||
| 239 | unsigned int set; | ||
| 240 | }; | ||
| 241 | |||
| 242 | static void devm_irq_remove_generic_chip(struct device *dev, void *res) | ||
| 243 | { | ||
| 244 | struct irq_generic_chip_devres *this = res; | ||
| 245 | |||
| 246 | irq_remove_generic_chip(this->gc, this->msk, this->clr, this->set); | ||
| 247 | } | ||
| 248 | |||
| 249 | /** | ||
| 250 | * devm_irq_setup_generic_chip - Setup a range of interrupts with a generic | ||
| 251 | * chip for a managed device | ||
| 252 | * | ||
| 253 | * @dev: Device to setup the generic chip for | ||
| 254 | * @gc: Generic irq chip holding all data | ||
| 255 | * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base | ||
| 256 | * @flags: Flags for initialization | ||
| 257 | * @clr: IRQ_* bits to clear | ||
| 258 | * @set: IRQ_* bits to set | ||
| 259 | * | ||
| 260 | * Set up max. 32 interrupts starting from gc->irq_base. Note, this | ||
| 261 | * initializes all interrupts to the primary irq_chip_type and its | ||
| 262 | * associated handler. | ||
| 263 | */ | ||
| 264 | int devm_irq_setup_generic_chip(struct device *dev, struct irq_chip_generic *gc, | ||
| 265 | u32 msk, enum irq_gc_flags flags, | ||
| 266 | unsigned int clr, unsigned int set) | ||
| 267 | { | ||
| 268 | struct irq_generic_chip_devres *dr; | ||
| 269 | |||
| 270 | dr = devres_alloc(devm_irq_remove_generic_chip, | ||
| 271 | sizeof(*dr), GFP_KERNEL); | ||
| 272 | if (!dr) | ||
| 273 | return -ENOMEM; | ||
| 274 | |||
| 275 | irq_setup_generic_chip(gc, msk, flags, clr, set); | ||
| 276 | |||
| 277 | dr->gc = gc; | ||
| 278 | dr->msk = msk; | ||
| 279 | dr->clr = clr; | ||
| 280 | dr->set = set; | ||
| 281 | devres_add(dev, dr); | ||
| 282 | |||
| 283 | return 0; | ||
| 284 | } | ||
| 285 | EXPORT_SYMBOL_GPL(devm_irq_setup_generic_chip); | ||
| 286 | #endif /* CONFIG_GENERIC_IRQ_CHIP */ | ||
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index ee32870079c9..f7086b78ad6e 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c | |||
| @@ -201,10 +201,9 @@ static void irq_writel_be(u32 val, void __iomem *addr) | |||
| 201 | iowrite32be(val, addr); | 201 | iowrite32be(val, addr); |
| 202 | } | 202 | } |
| 203 | 203 | ||
| 204 | static void | 204 | void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, |
| 205 | irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, | 205 | int num_ct, unsigned int irq_base, |
| 206 | int num_ct, unsigned int irq_base, | 206 | void __iomem *reg_base, irq_flow_handler_t handler) |
| 207 | void __iomem *reg_base, irq_flow_handler_t handler) | ||
| 208 | { | 207 | { |
| 209 | raw_spin_lock_init(&gc->lock); | 208 | raw_spin_lock_init(&gc->lock); |
| 210 | gc->num_ct = num_ct; | 209 | gc->num_ct = num_ct; |
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d3f24905852c..79f987b942b8 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
| @@ -6,7 +6,7 @@ | |||
| 6 | * | 6 | * |
| 7 | * This file contains the core interrupt handling code. | 7 | * This file contains the core interrupt handling code. |
| 8 | * | 8 | * |
| 9 | * Detailed information is available in Documentation/DocBook/genericirq | 9 | * Detailed information is available in Documentation/core-api/genericirq.rst |
| 10 | * | 10 | * |
| 11 | */ | 11 | */ |
| 12 | 12 | ||
| @@ -138,6 +138,8 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags | |||
| 138 | unsigned int irq = desc->irq_data.irq; | 138 | unsigned int irq = desc->irq_data.irq; |
| 139 | struct irqaction *action; | 139 | struct irqaction *action; |
| 140 | 140 | ||
| 141 | record_irq_time(desc); | ||
| 142 | |||
| 141 | for_each_action_of_desc(desc, action) { | 143 | for_each_action_of_desc(desc, action) { |
| 142 | irqreturn_t res; | 144 | irqreturn_t res; |
| 143 | 145 | ||
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index bc226e783bd2..a2c48058354c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
| @@ -8,6 +8,7 @@ | |||
| 8 | #include <linux/irqdesc.h> | 8 | #include <linux/irqdesc.h> |
| 9 | #include <linux/kernel_stat.h> | 9 | #include <linux/kernel_stat.h> |
| 10 | #include <linux/pm_runtime.h> | 10 | #include <linux/pm_runtime.h> |
| 11 | #include <linux/sched/clock.h> | ||
| 11 | 12 | ||
| 12 | #ifdef CONFIG_SPARSE_IRQ | 13 | #ifdef CONFIG_SPARSE_IRQ |
| 13 | # define IRQ_BITMAP_BITS (NR_IRQS + 8196) | 14 | # define IRQ_BITMAP_BITS (NR_IRQS + 8196) |
| @@ -57,6 +58,7 @@ enum { | |||
| 57 | IRQS_WAITING = 0x00000080, | 58 | IRQS_WAITING = 0x00000080, |
| 58 | IRQS_PENDING = 0x00000200, | 59 | IRQS_PENDING = 0x00000200, |
| 59 | IRQS_SUSPENDED = 0x00000800, | 60 | IRQS_SUSPENDED = 0x00000800, |
| 61 | IRQS_TIMINGS = 0x00001000, | ||
| 60 | }; | 62 | }; |
| 61 | 63 | ||
| 62 | #include "debug.h" | 64 | #include "debug.h" |
| @@ -66,7 +68,14 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags); | |||
| 66 | extern void __disable_irq(struct irq_desc *desc); | 68 | extern void __disable_irq(struct irq_desc *desc); |
| 67 | extern void __enable_irq(struct irq_desc *desc); | 69 | extern void __enable_irq(struct irq_desc *desc); |
| 68 | 70 | ||
| 69 | extern int irq_startup(struct irq_desc *desc, bool resend); | 71 | #define IRQ_RESEND true |
| 72 | #define IRQ_NORESEND false | ||
| 73 | |||
| 74 | #define IRQ_START_FORCE true | ||
| 75 | #define IRQ_START_COND false | ||
| 76 | |||
| 77 | extern int irq_startup(struct irq_desc *desc, bool resend, bool force); | ||
| 78 | |||
| 70 | extern void irq_shutdown(struct irq_desc *desc); | 79 | extern void irq_shutdown(struct irq_desc *desc); |
| 71 | extern void irq_enable(struct irq_desc *desc); | 80 | extern void irq_enable(struct irq_desc *desc); |
| 72 | extern void irq_disable(struct irq_desc *desc); | 81 | extern void irq_disable(struct irq_desc *desc); |
| @@ -109,13 +118,19 @@ static inline void unregister_handler_proc(unsigned int irq, | |||
| 109 | 118 | ||
| 110 | extern bool irq_can_set_affinity_usr(unsigned int irq); | 119 | extern bool irq_can_set_affinity_usr(unsigned int irq); |
| 111 | 120 | ||
| 112 | extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); | 121 | extern int irq_select_affinity_usr(unsigned int irq); |
| 113 | 122 | ||
| 114 | extern void irq_set_thread_affinity(struct irq_desc *desc); | 123 | extern void irq_set_thread_affinity(struct irq_desc *desc); |
| 115 | 124 | ||
| 116 | extern int irq_do_set_affinity(struct irq_data *data, | 125 | extern int irq_do_set_affinity(struct irq_data *data, |
| 117 | const struct cpumask *dest, bool force); | 126 | const struct cpumask *dest, bool force); |
| 118 | 127 | ||
| 128 | #ifdef CONFIG_SMP | ||
| 129 | extern int irq_setup_affinity(struct irq_desc *desc); | ||
| 130 | #else | ||
| 131 | static inline int irq_setup_affinity(struct irq_desc *desc) { return 0; } | ||
| 132 | #endif | ||
| 133 | |||
| 119 | /* Inline functions for support of irq chips on slow busses */ | 134 | /* Inline functions for support of irq chips on slow busses */ |
| 120 | static inline void chip_bus_lock(struct irq_desc *desc) | 135 | static inline void chip_bus_lock(struct irq_desc *desc) |
| 121 | { | 136 | { |
| @@ -169,6 +184,11 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) | |||
| 169 | 184 | ||
| 170 | #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) | 185 | #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) |
| 171 | 186 | ||
| 187 | static inline unsigned int irqd_get(struct irq_data *d) | ||
| 188 | { | ||
| 189 | return __irqd_to_state(d); | ||
| 190 | } | ||
| 191 | |||
| 172 | /* | 192 | /* |
| 173 | * Manipulation functions for irq_data.state | 193 | * Manipulation functions for irq_data.state |
| 174 | */ | 194 | */ |
| @@ -182,6 +202,16 @@ static inline void irqd_clr_move_pending(struct irq_data *d) | |||
| 182 | __irqd_to_state(d) &= ~IRQD_SETAFFINITY_PENDING; | 202 | __irqd_to_state(d) &= ~IRQD_SETAFFINITY_PENDING; |
| 183 | } | 203 | } |
| 184 | 204 | ||
| 205 | static inline void irqd_set_managed_shutdown(struct irq_data *d) | ||
| 206 | { | ||
| 207 | __irqd_to_state(d) |= IRQD_MANAGED_SHUTDOWN; | ||
| 208 | } | ||
| 209 | |||
| 210 | static inline void irqd_clr_managed_shutdown(struct irq_data *d) | ||
| 211 | { | ||
| 212 | __irqd_to_state(d) &= ~IRQD_MANAGED_SHUTDOWN; | ||
| 213 | } | ||
| 214 | |||
| 185 | static inline void irqd_clear(struct irq_data *d, unsigned int mask) | 215 | static inline void irqd_clear(struct irq_data *d, unsigned int mask) |
| 186 | { | 216 | { |
| 187 | __irqd_to_state(d) &= ~mask; | 217 | __irqd_to_state(d) &= ~mask; |
| @@ -197,6 +227,16 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) | |||
| 197 | return __irqd_to_state(d) & mask; | 227 | return __irqd_to_state(d) & mask; |
| 198 | } | 228 | } |
| 199 | 229 | ||
| 230 | static inline void irq_state_set_disabled(struct irq_desc *desc) | ||
| 231 | { | ||
| 232 | irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); | ||
| 233 | } | ||
| 234 | |||
| 235 | static inline void irq_state_set_masked(struct irq_desc *desc) | ||
| 236 | { | ||
| 237 | irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); | ||
| 238 | } | ||
| 239 | |||
| 200 | #undef __irqd_to_state | 240 | #undef __irqd_to_state |
| 201 | 241 | ||
| 202 | static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) | 242 | static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) |
| @@ -226,3 +266,196 @@ irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { } | |||
| 226 | static inline void | 266 | static inline void |
| 227 | irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { } | 267 | irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { } |
| 228 | #endif | 268 | #endif |
| 269 | |||
| 270 | #ifdef CONFIG_IRQ_TIMINGS | ||
| 271 | |||
| 272 | #define IRQ_TIMINGS_SHIFT 5 | ||
| 273 | #define IRQ_TIMINGS_SIZE (1 << IRQ_TIMINGS_SHIFT) | ||
| 274 | #define IRQ_TIMINGS_MASK (IRQ_TIMINGS_SIZE - 1) | ||
| 275 | |||
| 276 | /** | ||
| 277 | * struct irq_timings - irq timings storing structure | ||
| 278 | * @values: a circular buffer of u64 encoded <timestamp,irq> values | ||
| 279 | * @count: the number of elements in the array | ||
| 280 | */ | ||
| 281 | struct irq_timings { | ||
| 282 | u64 values[IRQ_TIMINGS_SIZE]; | ||
| 283 | int count; | ||
| 284 | }; | ||
| 285 | |||
| 286 | DECLARE_PER_CPU(struct irq_timings, irq_timings); | ||
| 287 | |||
| 288 | extern void irq_timings_free(int irq); | ||
| 289 | extern int irq_timings_alloc(int irq); | ||
| 290 | |||
| 291 | static inline void irq_remove_timings(struct irq_desc *desc) | ||
| 292 | { | ||
| 293 | desc->istate &= ~IRQS_TIMINGS; | ||
| 294 | |||
| 295 | irq_timings_free(irq_desc_get_irq(desc)); | ||
| 296 | } | ||
| 297 | |||
| 298 | static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *act) | ||
| 299 | { | ||
| 300 | int irq = irq_desc_get_irq(desc); | ||
| 301 | int ret; | ||
| 302 | |||
| 303 | /* | ||
| 304 | * We don't need the measurement because the idle code already | ||
| 305 | * knows the next expiry event. | ||
| 306 | */ | ||
| 307 | if (act->flags & __IRQF_TIMER) | ||
| 308 | return; | ||
| 309 | |||
| 310 | /* | ||
| 311 | * In case the timing allocation fails, we just want to warn, | ||
| 312 | * not fail, so letting the system boot anyway. | ||
| 313 | */ | ||
| 314 | ret = irq_timings_alloc(irq); | ||
| 315 | if (ret) { | ||
| 316 | pr_warn("Failed to allocate irq timing stats for irq%d (%d)", | ||
| 317 | irq, ret); | ||
| 318 | return; | ||
| 319 | } | ||
| 320 | |||
| 321 | desc->istate |= IRQS_TIMINGS; | ||
| 322 | } | ||
| 323 | |||
| 324 | extern void irq_timings_enable(void); | ||
| 325 | extern void irq_timings_disable(void); | ||
| 326 | |||
| 327 | DECLARE_STATIC_KEY_FALSE(irq_timing_enabled); | ||
| 328 | |||
| 329 | /* | ||
| 330 | * The interrupt number and the timestamp are encoded into a single | ||
| 331 | * u64 variable to optimize the size. | ||
| 332 | * 48 bit time stamp and 16 bit IRQ number is way sufficient. | ||
| 333 | * Who cares an IRQ after 78 hours of idle time? | ||
| 334 | */ | ||
| 335 | static inline u64 irq_timing_encode(u64 timestamp, int irq) | ||
| 336 | { | ||
| 337 | return (timestamp << 16) | irq; | ||
| 338 | } | ||
| 339 | |||
| 340 | static inline int irq_timing_decode(u64 value, u64 *timestamp) | ||
| 341 | { | ||
| 342 | *timestamp = value >> 16; | ||
| 343 | return value & U16_MAX; | ||
| 344 | } | ||
| 345 | |||
| 346 | /* | ||
| 347 | * The function record_irq_time is only called in one place in the | ||
| 348 | * interrupts handler. We want this function always inline so the code | ||
| 349 | * inside is embedded in the function and the static key branching | ||
| 350 | * code can act at the higher level. Without the explicit | ||
| 351 | * __always_inline we can end up with a function call and a small | ||
| 352 | * overhead in the hotpath for nothing. | ||
| 353 | */ | ||
| 354 | static __always_inline void record_irq_time(struct irq_desc *desc) | ||
| 355 | { | ||
| 356 | if (!static_branch_likely(&irq_timing_enabled)) | ||
| 357 | return; | ||
| 358 | |||
| 359 | if (desc->istate & IRQS_TIMINGS) { | ||
| 360 | struct irq_timings *timings = this_cpu_ptr(&irq_timings); | ||
| 361 | |||
| 362 | timings->values[timings->count & IRQ_TIMINGS_MASK] = | ||
| 363 | irq_timing_encode(local_clock(), | ||
| 364 | irq_desc_get_irq(desc)); | ||
| 365 | |||
| 366 | timings->count++; | ||
| 367 | } | ||
| 368 | } | ||
| 369 | #else | ||
| 370 | static inline void irq_remove_timings(struct irq_desc *desc) {} | ||
| 371 | static inline void irq_setup_timings(struct irq_desc *desc, | ||
| 372 | struct irqaction *act) {}; | ||
| 373 | static inline void record_irq_time(struct irq_desc *desc) {} | ||
| 374 | #endif /* CONFIG_IRQ_TIMINGS */ | ||
| 375 | |||
| 376 | |||
| 377 | #ifdef CONFIG_GENERIC_IRQ_CHIP | ||
| 378 | void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, | ||
| 379 | int num_ct, unsigned int irq_base, | ||
| 380 | void __iomem *reg_base, irq_flow_handler_t handler); | ||
| 381 | #else | ||
| 382 | static inline void | ||
| 383 | irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, | ||
| 384 | int num_ct, unsigned int irq_base, | ||
| 385 | void __iomem *reg_base, irq_flow_handler_t handler) { } | ||
| 386 | #endif /* CONFIG_GENERIC_IRQ_CHIP */ | ||
| 387 | |||
| 388 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
| 389 | static inline bool irq_can_move_pcntxt(struct irq_data *data) | ||
| 390 | { | ||
| 391 | return irqd_can_move_in_process_context(data); | ||
| 392 | } | ||
| 393 | static inline bool irq_move_pending(struct irq_data *data) | ||
| 394 | { | ||
| 395 | return irqd_is_setaffinity_pending(data); | ||
| 396 | } | ||
| 397 | static inline void | ||
| 398 | irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) | ||
| 399 | { | ||
| 400 | cpumask_copy(desc->pending_mask, mask); | ||
| 401 | } | ||
| 402 | static inline void | ||
| 403 | irq_get_pending(struct cpumask *mask, struct irq_desc *desc) | ||
| 404 | { | ||
| 405 | cpumask_copy(mask, desc->pending_mask); | ||
| 406 | } | ||
| 407 | static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) | ||
| 408 | { | ||
| 409 | return desc->pending_mask; | ||
| 410 | } | ||
| 411 | bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); | ||
| 412 | #else /* CONFIG_GENERIC_PENDING_IRQ */ | ||
| 413 | static inline bool irq_can_move_pcntxt(struct irq_data *data) | ||
| 414 | { | ||
| 415 | return true; | ||
| 416 | } | ||
| 417 | static inline bool irq_move_pending(struct irq_data *data) | ||
| 418 | { | ||
| 419 | return false; | ||
| 420 | } | ||
| 421 | static inline void | ||
| 422 | irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) | ||
| 423 | { | ||
| 424 | } | ||
| 425 | static inline void | ||
| 426 | irq_get_pending(struct cpumask *mask, struct irq_desc *desc) | ||
| 427 | { | ||
| 428 | } | ||
| 429 | static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) | ||
| 430 | { | ||
| 431 | return NULL; | ||
| 432 | } | ||
| 433 | static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) | ||
| 434 | { | ||
| 435 | return false; | ||
| 436 | } | ||
| 437 | #endif /* !CONFIG_GENERIC_PENDING_IRQ */ | ||
| 438 | |||
| 439 | #ifdef CONFIG_GENERIC_IRQ_DEBUGFS | ||
| 440 | #include <linux/debugfs.h> | ||
| 441 | |||
| 442 | void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc); | ||
| 443 | static inline void irq_remove_debugfs_entry(struct irq_desc *desc) | ||
| 444 | { | ||
| 445 | debugfs_remove(desc->debugfs_file); | ||
| 446 | } | ||
| 447 | # ifdef CONFIG_IRQ_DOMAIN | ||
| 448 | void irq_domain_debugfs_init(struct dentry *root); | ||
| 449 | # else | ||
| 450 | static inline void irq_domain_debugfs_init(struct dentry *root) | ||
| 451 | { | ||
| 452 | } | ||
| 453 | # endif | ||
| 454 | #else /* CONFIG_GENERIC_IRQ_DEBUGFS */ | ||
| 455 | static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) | ||
| 456 | { | ||
| 457 | } | ||
| 458 | static inline void irq_remove_debugfs_entry(struct irq_desc *d) | ||
| 459 | { | ||
| 460 | } | ||
| 461 | #endif /* CONFIG_GENERIC_IRQ_DEBUGFS */ | ||
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 00bb0aeea1d0..73be2b3909bd 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
| @@ -4,7 +4,7 @@ | |||
| 4 | * | 4 | * |
| 5 | * This file contains the interrupt descriptor management code | 5 | * This file contains the interrupt descriptor management code |
| 6 | * | 6 | * |
| 7 | * Detailed information is available in Documentation/DocBook/genericirq | 7 | * Detailed information is available in Documentation/core-api/genericirq.rst |
| 8 | * | 8 | * |
| 9 | */ | 9 | */ |
| 10 | #include <linux/irq.h> | 10 | #include <linux/irq.h> |
| @@ -54,14 +54,25 @@ static void __init init_irq_default_affinity(void) | |||
| 54 | #endif | 54 | #endif |
| 55 | 55 | ||
| 56 | #ifdef CONFIG_SMP | 56 | #ifdef CONFIG_SMP |
| 57 | static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) | 57 | static int alloc_masks(struct irq_desc *desc, int node) |
| 58 | { | 58 | { |
| 59 | if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity, | 59 | if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity, |
| 60 | gfp, node)) | 60 | GFP_KERNEL, node)) |
| 61 | return -ENOMEM; | 61 | return -ENOMEM; |
| 62 | 62 | ||
| 63 | #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK | ||
| 64 | if (!zalloc_cpumask_var_node(&desc->irq_common_data.effective_affinity, | ||
| 65 | GFP_KERNEL, node)) { | ||
| 66 | free_cpumask_var(desc->irq_common_data.affinity); | ||
| 67 | return -ENOMEM; | ||
| 68 | } | ||
| 69 | #endif | ||
| 70 | |||
| 63 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 71 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
| 64 | if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { | 72 | if (!zalloc_cpumask_var_node(&desc->pending_mask, GFP_KERNEL, node)) { |
| 73 | #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK | ||
| 74 | free_cpumask_var(desc->irq_common_data.effective_affinity); | ||
| 75 | #endif | ||
| 65 | free_cpumask_var(desc->irq_common_data.affinity); | 76 | free_cpumask_var(desc->irq_common_data.affinity); |
| 66 | return -ENOMEM; | 77 | return -ENOMEM; |
| 67 | } | 78 | } |
| @@ -86,7 +97,7 @@ static void desc_smp_init(struct irq_desc *desc, int node, | |||
| 86 | 97 | ||
| 87 | #else | 98 | #else |
| 88 | static inline int | 99 | static inline int |
| 89 | alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } | 100 | alloc_masks(struct irq_desc *desc, int node) { return 0; } |
| 90 | static inline void | 101 | static inline void |
| 91 | desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { } | 102 | desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { } |
| 92 | #endif | 103 | #endif |
| @@ -105,6 +116,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, | |||
| 105 | desc->irq_data.chip_data = NULL; | 116 | desc->irq_data.chip_data = NULL; |
| 106 | irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); | 117 | irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); |
| 107 | irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); | 118 | irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); |
| 119 | irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); | ||
| 108 | desc->handle_irq = handle_bad_irq; | 120 | desc->handle_irq = handle_bad_irq; |
| 109 | desc->depth = 1; | 121 | desc->depth = 1; |
| 110 | desc->irq_count = 0; | 122 | desc->irq_count = 0; |
| @@ -324,6 +336,9 @@ static void free_masks(struct irq_desc *desc) | |||
| 324 | free_cpumask_var(desc->pending_mask); | 336 | free_cpumask_var(desc->pending_mask); |
| 325 | #endif | 337 | #endif |
| 326 | free_cpumask_var(desc->irq_common_data.affinity); | 338 | free_cpumask_var(desc->irq_common_data.affinity); |
| 339 | #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK | ||
| 340 | free_cpumask_var(desc->irq_common_data.effective_affinity); | ||
| 341 | #endif | ||
| 327 | } | 342 | } |
| 328 | #else | 343 | #else |
| 329 | static inline void free_masks(struct irq_desc *desc) { } | 344 | static inline void free_masks(struct irq_desc *desc) { } |
| @@ -344,9 +359,8 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, | |||
| 344 | struct module *owner) | 359 | struct module *owner) |
| 345 | { | 360 | { |
| 346 | struct irq_desc *desc; | 361 | struct irq_desc *desc; |
| 347 | gfp_t gfp = GFP_KERNEL; | ||
| 348 | 362 | ||
| 349 | desc = kzalloc_node(sizeof(*desc), gfp, node); | 363 | desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node); |
| 350 | if (!desc) | 364 | if (!desc) |
| 351 | return NULL; | 365 | return NULL; |
| 352 | /* allocate based on nr_cpu_ids */ | 366 | /* allocate based on nr_cpu_ids */ |
| @@ -354,11 +368,12 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, | |||
| 354 | if (!desc->kstat_irqs) | 368 | if (!desc->kstat_irqs) |
| 355 | goto err_desc; | 369 | goto err_desc; |
| 356 | 370 | ||
| 357 | if (alloc_masks(desc, gfp, node)) | 371 | if (alloc_masks(desc, node)) |
| 358 | goto err_kstat; | 372 | goto err_kstat; |
| 359 | 373 | ||
| 360 | raw_spin_lock_init(&desc->lock); | 374 | raw_spin_lock_init(&desc->lock); |
| 361 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | 375 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); |
| 376 | mutex_init(&desc->request_mutex); | ||
| 362 | init_rcu_head(&desc->rcu); | 377 | init_rcu_head(&desc->rcu); |
| 363 | 378 | ||
| 364 | desc_set_defaults(irq, desc, node, affinity, owner); | 379 | desc_set_defaults(irq, desc, node, affinity, owner); |
| @@ -394,6 +409,7 @@ static void free_desc(unsigned int irq) | |||
| 394 | { | 409 | { |
| 395 | struct irq_desc *desc = irq_to_desc(irq); | 410 | struct irq_desc *desc = irq_to_desc(irq); |
| 396 | 411 | ||
| 412 | irq_remove_debugfs_entry(desc); | ||
| 397 | unregister_irq_proc(irq, desc); | 413 | unregister_irq_proc(irq, desc); |
| 398 | 414 | ||
| 399 | /* | 415 | /* |
| @@ -480,7 +496,8 @@ int __init early_irq_init(void) | |||
| 480 | 496 | ||
| 481 | /* Let arch update nr_irqs and return the nr of preallocated irqs */ | 497 | /* Let arch update nr_irqs and return the nr of preallocated irqs */ |
| 482 | initcnt = arch_probe_nr_irqs(); | 498 | initcnt = arch_probe_nr_irqs(); |
| 483 | printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); | 499 | printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n", |
| 500 | NR_IRQS, nr_irqs, initcnt); | ||
| 484 | 501 | ||
| 485 | if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS)) | 502 | if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS)) |
| 486 | nr_irqs = IRQ_BITMAP_BITS; | 503 | nr_irqs = IRQ_BITMAP_BITS; |
| @@ -516,14 +533,14 @@ int __init early_irq_init(void) | |||
| 516 | 533 | ||
| 517 | init_irq_default_affinity(); | 534 | init_irq_default_affinity(); |
| 518 | 535 | ||
| 519 | printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); | 536 | printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS); |
| 520 | 537 | ||
| 521 | desc = irq_desc; | 538 | desc = irq_desc; |
| 522 | count = ARRAY_SIZE(irq_desc); | 539 | count = ARRAY_SIZE(irq_desc); |
| 523 | 540 | ||
| 524 | for (i = 0; i < count; i++) { | 541 | for (i = 0; i < count; i++) { |
| 525 | desc[i].kstat_irqs = alloc_percpu(unsigned int); | 542 | desc[i].kstat_irqs = alloc_percpu(unsigned int); |
| 526 | alloc_masks(&desc[i], GFP_KERNEL, node); | 543 | alloc_masks(&desc[i], node); |
| 527 | raw_spin_lock_init(&desc[i].lock); | 544 | raw_spin_lock_init(&desc[i].lock); |
| 528 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | 545 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); |
| 529 | desc_set_defaults(i, &desc[i], node, NULL, NULL); | 546 | desc_set_defaults(i, &desc[i], node, NULL, NULL); |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 31805f237396..f1f251479aa6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
| @@ -1,5 +1,6 @@ | |||
| 1 | #define pr_fmt(fmt) "irq: " fmt | 1 | #define pr_fmt(fmt) "irq: " fmt |
| 2 | 2 | ||
| 3 | #include <linux/acpi.h> | ||
| 3 | #include <linux/debugfs.h> | 4 | #include <linux/debugfs.h> |
| 4 | #include <linux/hardirq.h> | 5 | #include <linux/hardirq.h> |
| 5 | #include <linux/interrupt.h> | 6 | #include <linux/interrupt.h> |
| @@ -26,39 +27,69 @@ static struct irq_domain *irq_default_domain; | |||
| 26 | static void irq_domain_check_hierarchy(struct irq_domain *domain); | 27 | static void irq_domain_check_hierarchy(struct irq_domain *domain); |
| 27 | 28 | ||
| 28 | struct irqchip_fwid { | 29 | struct irqchip_fwid { |
| 29 | struct fwnode_handle fwnode; | 30 | struct fwnode_handle fwnode; |
| 30 | char *name; | 31 | unsigned int type; |
| 32 | char *name; | ||
| 31 | void *data; | 33 | void *data; |
| 32 | }; | 34 | }; |
| 33 | 35 | ||
| 36 | #ifdef CONFIG_GENERIC_IRQ_DEBUGFS | ||
| 37 | static void debugfs_add_domain_dir(struct irq_domain *d); | ||
| 38 | static void debugfs_remove_domain_dir(struct irq_domain *d); | ||
| 39 | #else | ||
| 40 | static inline void debugfs_add_domain_dir(struct irq_domain *d) { } | ||
| 41 | static inline void debugfs_remove_domain_dir(struct irq_domain *d) { } | ||
| 42 | #endif | ||
| 43 | |||
| 34 | /** | 44 | /** |
| 35 | * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for | 45 | * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for |
| 36 | * identifying an irq domain | 46 | * identifying an irq domain |
| 37 | * @data: optional user-provided data | 47 | * @type: Type of irqchip_fwnode. See linux/irqdomain.h |
| 48 | * @name: Optional user provided domain name | ||
| 49 | * @id: Optional user provided id if name != NULL | ||
| 50 | * @data: Optional user-provided data | ||
| 38 | * | 51 | * |
| 39 | * Allocate a struct device_node, and return a poiner to the embedded | 52 | * Allocate a struct irqchip_fwid, and return a poiner to the embedded |
| 40 | * fwnode_handle (or NULL on failure). | 53 | * fwnode_handle (or NULL on failure). |
| 54 | * | ||
| 55 | * Note: The types IRQCHIP_FWNODE_NAMED and IRQCHIP_FWNODE_NAMED_ID are | ||
| 56 | * solely to transport name information to irqdomain creation code. The | ||
| 57 | * node is not stored. For other types the pointer is kept in the irq | ||
| 58 | * domain struct. | ||
| 41 | */ | 59 | */ |
| 42 | struct fwnode_handle *irq_domain_alloc_fwnode(void *data) | 60 | struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, |
| 61 | const char *name, void *data) | ||
| 43 | { | 62 | { |
| 44 | struct irqchip_fwid *fwid; | 63 | struct irqchip_fwid *fwid; |
| 45 | char *name; | 64 | char *n; |
| 46 | 65 | ||
| 47 | fwid = kzalloc(sizeof(*fwid), GFP_KERNEL); | 66 | fwid = kzalloc(sizeof(*fwid), GFP_KERNEL); |
| 48 | name = kasprintf(GFP_KERNEL, "irqchip@%p", data); | ||
| 49 | 67 | ||
| 50 | if (!fwid || !name) { | 68 | switch (type) { |
| 69 | case IRQCHIP_FWNODE_NAMED: | ||
| 70 | n = kasprintf(GFP_KERNEL, "%s", name); | ||
| 71 | break; | ||
| 72 | case IRQCHIP_FWNODE_NAMED_ID: | ||
| 73 | n = kasprintf(GFP_KERNEL, "%s-%d", name, id); | ||
| 74 | break; | ||
| 75 | default: | ||
| 76 | n = kasprintf(GFP_KERNEL, "irqchip@%p", data); | ||
| 77 | break; | ||
| 78 | } | ||
| 79 | |||
| 80 | if (!fwid || !n) { | ||
| 51 | kfree(fwid); | 81 | kfree(fwid); |
| 52 | kfree(name); | 82 | kfree(n); |
| 53 | return NULL; | 83 | return NULL; |
| 54 | } | 84 | } |
| 55 | 85 | ||
| 56 | fwid->name = name; | 86 | fwid->type = type; |
| 87 | fwid->name = n; | ||
| 57 | fwid->data = data; | 88 | fwid->data = data; |
| 58 | fwid->fwnode.type = FWNODE_IRQCHIP; | 89 | fwid->fwnode.type = FWNODE_IRQCHIP; |
| 59 | return &fwid->fwnode; | 90 | return &fwid->fwnode; |
| 60 | } | 91 | } |
| 61 | EXPORT_SYMBOL_GPL(irq_domain_alloc_fwnode); | 92 | EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode); |
| 62 | 93 | ||
| 63 | /** | 94 | /** |
| 64 | * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle | 95 | * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle |
| @@ -97,26 +128,97 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, | |||
| 97 | void *host_data) | 128 | void *host_data) |
| 98 | { | 129 | { |
| 99 | struct device_node *of_node = to_of_node(fwnode); | 130 | struct device_node *of_node = to_of_node(fwnode); |
| 131 | struct irqchip_fwid *fwid; | ||
| 100 | struct irq_domain *domain; | 132 | struct irq_domain *domain; |
| 101 | 133 | ||
| 134 | static atomic_t unknown_domains; | ||
| 135 | |||
| 102 | domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), | 136 | domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), |
| 103 | GFP_KERNEL, of_node_to_nid(of_node)); | 137 | GFP_KERNEL, of_node_to_nid(of_node)); |
| 104 | if (WARN_ON(!domain)) | 138 | if (WARN_ON(!domain)) |
| 105 | return NULL; | 139 | return NULL; |
| 106 | 140 | ||
| 141 | if (fwnode && is_fwnode_irqchip(fwnode)) { | ||
| 142 | fwid = container_of(fwnode, struct irqchip_fwid, fwnode); | ||
| 143 | |||
| 144 | switch (fwid->type) { | ||
| 145 | case IRQCHIP_FWNODE_NAMED: | ||
| 146 | case IRQCHIP_FWNODE_NAMED_ID: | ||
| 147 | domain->name = kstrdup(fwid->name, GFP_KERNEL); | ||
| 148 | if (!domain->name) { | ||
| 149 | kfree(domain); | ||
| 150 | return NULL; | ||
| 151 | } | ||
| 152 | domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; | ||
| 153 | break; | ||
| 154 | default: | ||
| 155 | domain->fwnode = fwnode; | ||
| 156 | domain->name = fwid->name; | ||
| 157 | break; | ||
| 158 | } | ||
| 159 | #ifdef CONFIG_ACPI | ||
| 160 | } else if (is_acpi_device_node(fwnode)) { | ||
| 161 | struct acpi_buffer buf = { | ||
| 162 | .length = ACPI_ALLOCATE_BUFFER, | ||
| 163 | }; | ||
| 164 | acpi_handle handle; | ||
| 165 | |||
| 166 | handle = acpi_device_handle(to_acpi_device_node(fwnode)); | ||
| 167 | if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) { | ||
| 168 | domain->name = buf.pointer; | ||
| 169 | domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; | ||
| 170 | } | ||
| 171 | |||
| 172 | domain->fwnode = fwnode; | ||
| 173 | #endif | ||
| 174 | } else if (of_node) { | ||
| 175 | char *name; | ||
| 176 | |||
| 177 | /* | ||
| 178 | * DT paths contain '/', which debugfs is legitimately | ||
| 179 | * unhappy about. Replace them with ':', which does | ||
| 180 | * the trick and is not as offensive as '\'... | ||
| 181 | */ | ||
| 182 | name = kstrdup(of_node_full_name(of_node), GFP_KERNEL); | ||
| 183 | if (!name) { | ||
| 184 | kfree(domain); | ||
| 185 | return NULL; | ||
| 186 | } | ||
| 187 | |||
| 188 | strreplace(name, '/', ':'); | ||
| 189 | |||
| 190 | domain->name = name; | ||
| 191 | domain->fwnode = fwnode; | ||
| 192 | domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; | ||
| 193 | } | ||
| 194 | |||
| 195 | if (!domain->name) { | ||
| 196 | if (fwnode) { | ||
| 197 | pr_err("Invalid fwnode type (%d) for irqdomain\n", | ||
| 198 | fwnode->type); | ||
| 199 | } | ||
| 200 | domain->name = kasprintf(GFP_KERNEL, "unknown-%d", | ||
| 201 | atomic_inc_return(&unknown_domains)); | ||
| 202 | if (!domain->name) { | ||
| 203 | kfree(domain); | ||
| 204 | return NULL; | ||
| 205 | } | ||
| 206 | domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; | ||
| 207 | } | ||
| 208 | |||
| 107 | of_node_get(of_node); | 209 | of_node_get(of_node); |
| 108 | 210 | ||
| 109 | /* Fill structure */ | 211 | /* Fill structure */ |
| 110 | INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); | 212 | INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); |
| 111 | domain->ops = ops; | 213 | domain->ops = ops; |
| 112 | domain->host_data = host_data; | 214 | domain->host_data = host_data; |
| 113 | domain->fwnode = fwnode; | ||
| 114 | domain->hwirq_max = hwirq_max; | 215 | domain->hwirq_max = hwirq_max; |
| 115 | domain->revmap_size = size; | 216 | domain->revmap_size = size; |
| 116 | domain->revmap_direct_max_irq = direct_max; | 217 | domain->revmap_direct_max_irq = direct_max; |
| 117 | irq_domain_check_hierarchy(domain); | 218 | irq_domain_check_hierarchy(domain); |
| 118 | 219 | ||
| 119 | mutex_lock(&irq_domain_mutex); | 220 | mutex_lock(&irq_domain_mutex); |
| 221 | debugfs_add_domain_dir(domain); | ||
| 120 | list_add(&domain->link, &irq_domain_list); | 222 | list_add(&domain->link, &irq_domain_list); |
| 121 | mutex_unlock(&irq_domain_mutex); | 223 | mutex_unlock(&irq_domain_mutex); |
| 122 | 224 | ||
| @@ -136,6 +238,7 @@ EXPORT_SYMBOL_GPL(__irq_domain_add); | |||
| 136 | void irq_domain_remove(struct irq_domain *domain) | 238 | void irq_domain_remove(struct irq_domain *domain) |
| 137 | { | 239 | { |
| 138 | mutex_lock(&irq_domain_mutex); | 240 | mutex_lock(&irq_domain_mutex); |
| 241 | debugfs_remove_domain_dir(domain); | ||
| 139 | 242 | ||
| 140 | WARN_ON(!radix_tree_empty(&domain->revmap_tree)); | 243 | WARN_ON(!radix_tree_empty(&domain->revmap_tree)); |
| 141 | 244 | ||
| @@ -152,10 +255,43 @@ void irq_domain_remove(struct irq_domain *domain) | |||
| 152 | pr_debug("Removed domain %s\n", domain->name); | 255 | pr_debug("Removed domain %s\n", domain->name); |
| 153 | 256 | ||
| 154 | of_node_put(irq_domain_get_of_node(domain)); | 257 | of_node_put(irq_domain_get_of_node(domain)); |
| 258 | if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) | ||
| 259 | kfree(domain->name); | ||
| 155 | kfree(domain); | 260 | kfree(domain); |
| 156 | } | 261 | } |
| 157 | EXPORT_SYMBOL_GPL(irq_domain_remove); | 262 | EXPORT_SYMBOL_GPL(irq_domain_remove); |
| 158 | 263 | ||
| 264 | void irq_domain_update_bus_token(struct irq_domain *domain, | ||
| 265 | enum irq_domain_bus_token bus_token) | ||
| 266 | { | ||
| 267 | char *name; | ||
| 268 | |||
| 269 | if (domain->bus_token == bus_token) | ||
| 270 | return; | ||
| 271 | |||
| 272 | mutex_lock(&irq_domain_mutex); | ||
| 273 | |||
| 274 | domain->bus_token = bus_token; | ||
| 275 | |||
| 276 | name = kasprintf(GFP_KERNEL, "%s-%d", domain->name, bus_token); | ||
| 277 | if (!name) { | ||
| 278 | mutex_unlock(&irq_domain_mutex); | ||
| 279 | return; | ||
| 280 | } | ||
| 281 | |||
| 282 | debugfs_remove_domain_dir(domain); | ||
| 283 | |||
| 284 | if (domain->flags & IRQ_DOMAIN_NAME_ALLOCATED) | ||
| 285 | kfree(domain->name); | ||
| 286 | else | ||
| 287 | domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; | ||
| 288 | |||
| 289 | domain->name = name; | ||
| 290 | debugfs_add_domain_dir(domain); | ||
| 291 | |||
| 292 | mutex_unlock(&irq_domain_mutex); | ||
| 293 | } | ||
| 294 | |||
| 159 | /** | 295 | /** |
| 160 | * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs | 296 | * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs |
| 161 | * @of_node: pointer to interrupt controller's device tree node. | 297 | * @of_node: pointer to interrupt controller's device tree node. |
| @@ -344,6 +480,7 @@ void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) | |||
| 344 | 480 | ||
| 345 | irq_data->domain = NULL; | 481 | irq_data->domain = NULL; |
| 346 | irq_data->hwirq = 0; | 482 | irq_data->hwirq = 0; |
| 483 | domain->mapcount--; | ||
| 347 | 484 | ||
| 348 | /* Clear reverse map for this hwirq */ | 485 | /* Clear reverse map for this hwirq */ |
| 349 | if (hwirq < domain->revmap_size) { | 486 | if (hwirq < domain->revmap_size) { |
| @@ -395,6 +532,7 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq, | |||
| 395 | domain->name = irq_data->chip->name; | 532 | domain->name = irq_data->chip->name; |
| 396 | } | 533 | } |
| 397 | 534 | ||
| 535 | domain->mapcount++; | ||
| 398 | if (hwirq < domain->revmap_size) { | 536 | if (hwirq < domain->revmap_size) { |
| 399 | domain->linear_revmap[hwirq] = virq; | 537 | domain->linear_revmap[hwirq] = virq; |
| 400 | } else { | 538 | } else { |
| @@ -746,13 +884,54 @@ unsigned int irq_find_mapping(struct irq_domain *domain, | |||
| 746 | EXPORT_SYMBOL_GPL(irq_find_mapping); | 884 | EXPORT_SYMBOL_GPL(irq_find_mapping); |
| 747 | 885 | ||
| 748 | #ifdef CONFIG_IRQ_DOMAIN_DEBUG | 886 | #ifdef CONFIG_IRQ_DOMAIN_DEBUG |
| 887 | static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc) | ||
| 888 | { | ||
| 889 | struct irq_domain *domain; | ||
| 890 | struct irq_data *data; | ||
| 891 | |||
| 892 | domain = desc->irq_data.domain; | ||
| 893 | data = &desc->irq_data; | ||
| 894 | |||
| 895 | while (domain) { | ||
| 896 | unsigned int irq = data->irq; | ||
| 897 | unsigned long hwirq = data->hwirq; | ||
| 898 | struct irq_chip *chip; | ||
| 899 | bool direct; | ||
| 900 | |||
| 901 | if (data == &desc->irq_data) | ||
| 902 | seq_printf(m, "%5d ", irq); | ||
| 903 | else | ||
| 904 | seq_printf(m, "%5d+ ", irq); | ||
| 905 | seq_printf(m, "0x%05lx ", hwirq); | ||
| 906 | |||
| 907 | chip = irq_data_get_irq_chip(data); | ||
| 908 | seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); | ||
| 909 | |||
| 910 | seq_printf(m, data ? "0x%p " : " %p ", | ||
| 911 | irq_data_get_irq_chip_data(data)); | ||
| 912 | |||
| 913 | seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); | ||
| 914 | direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq); | ||
| 915 | seq_printf(m, "%6s%-8s ", | ||
| 916 | (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", | ||
| 917 | direct ? "(DIRECT)" : ""); | ||
| 918 | seq_printf(m, "%s\n", domain->name); | ||
| 919 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
| 920 | domain = domain->parent; | ||
| 921 | data = data->parent_data; | ||
| 922 | #else | ||
| 923 | domain = NULL; | ||
| 924 | #endif | ||
| 925 | } | ||
| 926 | } | ||
| 927 | |||
| 749 | static int virq_debug_show(struct seq_file *m, void *private) | 928 | static int virq_debug_show(struct seq_file *m, void *private) |
| 750 | { | 929 | { |
| 751 | unsigned long flags; | 930 | unsigned long flags; |
| 752 | struct irq_desc *desc; | 931 | struct irq_desc *desc; |
| 753 | struct irq_domain *domain; | 932 | struct irq_domain *domain; |
| 754 | struct radix_tree_iter iter; | 933 | struct radix_tree_iter iter; |
| 755 | void *data, **slot; | 934 | void **slot; |
| 756 | int i; | 935 | int i; |
| 757 | 936 | ||
| 758 | seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", | 937 | seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", |
| @@ -760,15 +939,26 @@ static int virq_debug_show(struct seq_file *m, void *private) | |||
| 760 | mutex_lock(&irq_domain_mutex); | 939 | mutex_lock(&irq_domain_mutex); |
| 761 | list_for_each_entry(domain, &irq_domain_list, link) { | 940 | list_for_each_entry(domain, &irq_domain_list, link) { |
| 762 | struct device_node *of_node; | 941 | struct device_node *of_node; |
| 942 | const char *name; | ||
| 943 | |||
| 763 | int count = 0; | 944 | int count = 0; |
| 945 | |||
| 764 | of_node = irq_domain_get_of_node(domain); | 946 | of_node = irq_domain_get_of_node(domain); |
| 947 | if (of_node) | ||
| 948 | name = of_node_full_name(of_node); | ||
| 949 | else if (is_fwnode_irqchip(domain->fwnode)) | ||
| 950 | name = container_of(domain->fwnode, struct irqchip_fwid, | ||
| 951 | fwnode)->name; | ||
| 952 | else | ||
| 953 | name = ""; | ||
| 954 | |||
| 765 | radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) | 955 | radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) |
| 766 | count++; | 956 | count++; |
| 767 | seq_printf(m, "%c%-16s %6u %10u %10u %s\n", | 957 | seq_printf(m, "%c%-16s %6u %10u %10u %s\n", |
| 768 | domain == irq_default_domain ? '*' : ' ', domain->name, | 958 | domain == irq_default_domain ? '*' : ' ', domain->name, |
| 769 | domain->revmap_size + count, domain->revmap_size, | 959 | domain->revmap_size + count, domain->revmap_size, |
| 770 | domain->revmap_direct_max_irq, | 960 | domain->revmap_direct_max_irq, |
| 771 | of_node ? of_node_full_name(of_node) : ""); | 961 | name); |
| 772 | } | 962 | } |
| 773 | mutex_unlock(&irq_domain_mutex); | 963 | mutex_unlock(&irq_domain_mutex); |
| 774 | 964 | ||
| @@ -782,30 +972,7 @@ static int virq_debug_show(struct seq_file *m, void *private) | |||
| 782 | continue; | 972 | continue; |
| 783 | 973 | ||
| 784 | raw_spin_lock_irqsave(&desc->lock, flags); | 974 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 785 | domain = desc->irq_data.domain; | 975 | virq_debug_show_one(m, desc); |
| 786 | |||
| 787 | if (domain) { | ||
| 788 | struct irq_chip *chip; | ||
| 789 | int hwirq = desc->irq_data.hwirq; | ||
| 790 | bool direct; | ||
| 791 | |||
| 792 | seq_printf(m, "%5d ", i); | ||
| 793 | seq_printf(m, "0x%05x ", hwirq); | ||
| 794 | |||
| 795 | chip = irq_desc_get_chip(desc); | ||
| 796 | seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); | ||
| 797 | |||
| 798 | data = irq_desc_get_chip_data(desc); | ||
| 799 | seq_printf(m, data ? "0x%p " : " %p ", data); | ||
| 800 | |||
| 801 | seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); | ||
| 802 | direct = (i == hwirq) && (i < domain->revmap_direct_max_irq); | ||
| 803 | seq_printf(m, "%6s%-8s ", | ||
| 804 | (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", | ||
| 805 | direct ? "(DIRECT)" : ""); | ||
| 806 | seq_printf(m, "%s\n", desc->irq_data.domain->name); | ||
| 807 | } | ||
| 808 | |||
| 809 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 976 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 810 | } | 977 | } |
| 811 | 978 | ||
| @@ -973,6 +1140,7 @@ static void irq_domain_insert_irq(int virq) | |||
| 973 | struct irq_domain *domain = data->domain; | 1140 | struct irq_domain *domain = data->domain; |
| 974 | irq_hw_number_t hwirq = data->hwirq; | 1141 | irq_hw_number_t hwirq = data->hwirq; |
| 975 | 1142 | ||
| 1143 | domain->mapcount++; | ||
| 976 | if (hwirq < domain->revmap_size) { | 1144 | if (hwirq < domain->revmap_size) { |
| 977 | domain->linear_revmap[hwirq] = virq; | 1145 | domain->linear_revmap[hwirq] = virq; |
| 978 | } else { | 1146 | } else { |
| @@ -1002,6 +1170,7 @@ static void irq_domain_remove_irq(int virq) | |||
| 1002 | struct irq_domain *domain = data->domain; | 1170 | struct irq_domain *domain = data->domain; |
| 1003 | irq_hw_number_t hwirq = data->hwirq; | 1171 | irq_hw_number_t hwirq = data->hwirq; |
| 1004 | 1172 | ||
| 1173 | domain->mapcount--; | ||
| 1005 | if (hwirq < domain->revmap_size) { | 1174 | if (hwirq < domain->revmap_size) { |
| 1006 | domain->linear_revmap[hwirq] = 0; | 1175 | domain->linear_revmap[hwirq] = 0; |
| 1007 | } else { | 1176 | } else { |
| @@ -1189,43 +1358,18 @@ void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq, | |||
| 1189 | irq_domain_free_irqs_common(domain, virq, nr_irqs); | 1358 | irq_domain_free_irqs_common(domain, virq, nr_irqs); |
| 1190 | } | 1359 | } |
| 1191 | 1360 | ||
| 1192 | static bool irq_domain_is_auto_recursive(struct irq_domain *domain) | 1361 | static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, |
| 1193 | { | ||
| 1194 | return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE; | ||
| 1195 | } | ||
| 1196 | |||
| 1197 | static void irq_domain_free_irqs_recursive(struct irq_domain *domain, | ||
| 1198 | unsigned int irq_base, | 1362 | unsigned int irq_base, |
| 1199 | unsigned int nr_irqs) | 1363 | unsigned int nr_irqs) |
| 1200 | { | 1364 | { |
| 1201 | domain->ops->free(domain, irq_base, nr_irqs); | 1365 | domain->ops->free(domain, irq_base, nr_irqs); |
| 1202 | if (irq_domain_is_auto_recursive(domain)) { | ||
| 1203 | BUG_ON(!domain->parent); | ||
| 1204 | irq_domain_free_irqs_recursive(domain->parent, irq_base, | ||
| 1205 | nr_irqs); | ||
| 1206 | } | ||
| 1207 | } | 1366 | } |
| 1208 | 1367 | ||
| 1209 | int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, | 1368 | int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, |
| 1210 | unsigned int irq_base, | 1369 | unsigned int irq_base, |
| 1211 | unsigned int nr_irqs, void *arg) | 1370 | unsigned int nr_irqs, void *arg) |
| 1212 | { | 1371 | { |
| 1213 | int ret = 0; | 1372 | return domain->ops->alloc(domain, irq_base, nr_irqs, arg); |
| 1214 | struct irq_domain *parent = domain->parent; | ||
| 1215 | bool recursive = irq_domain_is_auto_recursive(domain); | ||
| 1216 | |||
| 1217 | BUG_ON(recursive && !parent); | ||
| 1218 | if (recursive) | ||
| 1219 | ret = irq_domain_alloc_irqs_recursive(parent, irq_base, | ||
| 1220 | nr_irqs, arg); | ||
| 1221 | if (ret < 0) | ||
| 1222 | return ret; | ||
| 1223 | |||
| 1224 | ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); | ||
| 1225 | if (ret < 0 && recursive) | ||
| 1226 | irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); | ||
| 1227 | |||
| 1228 | return ret; | ||
| 1229 | } | 1373 | } |
| 1230 | 1374 | ||
| 1231 | /** | 1375 | /** |
| @@ -1286,7 +1430,7 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, | |||
| 1286 | } | 1430 | } |
| 1287 | 1431 | ||
| 1288 | mutex_lock(&irq_domain_mutex); | 1432 | mutex_lock(&irq_domain_mutex); |
| 1289 | ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg); | 1433 | ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg); |
| 1290 | if (ret < 0) { | 1434 | if (ret < 0) { |
| 1291 | mutex_unlock(&irq_domain_mutex); | 1435 | mutex_unlock(&irq_domain_mutex); |
| 1292 | goto out_free_irq_data; | 1436 | goto out_free_irq_data; |
| @@ -1321,7 +1465,7 @@ void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs) | |||
| 1321 | mutex_lock(&irq_domain_mutex); | 1465 | mutex_lock(&irq_domain_mutex); |
| 1322 | for (i = 0; i < nr_irqs; i++) | 1466 | for (i = 0; i < nr_irqs; i++) |
| 1323 | irq_domain_remove_irq(virq + i); | 1467 | irq_domain_remove_irq(virq + i); |
| 1324 | irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs); | 1468 | irq_domain_free_irqs_hierarchy(data->domain, virq, nr_irqs); |
| 1325 | mutex_unlock(&irq_domain_mutex); | 1469 | mutex_unlock(&irq_domain_mutex); |
| 1326 | 1470 | ||
| 1327 | irq_domain_free_irq_data(virq, nr_irqs); | 1471 | irq_domain_free_irq_data(virq, nr_irqs); |
| @@ -1341,15 +1485,11 @@ int irq_domain_alloc_irqs_parent(struct irq_domain *domain, | |||
| 1341 | unsigned int irq_base, unsigned int nr_irqs, | 1485 | unsigned int irq_base, unsigned int nr_irqs, |
| 1342 | void *arg) | 1486 | void *arg) |
| 1343 | { | 1487 | { |
| 1344 | /* irq_domain_alloc_irqs_recursive() has called parent's alloc() */ | 1488 | if (!domain->parent) |
| 1345 | if (irq_domain_is_auto_recursive(domain)) | 1489 | return -ENOSYS; |
| 1346 | return 0; | ||
| 1347 | 1490 | ||
| 1348 | domain = domain->parent; | 1491 | return irq_domain_alloc_irqs_hierarchy(domain->parent, irq_base, |
| 1349 | if (domain) | 1492 | nr_irqs, arg); |
| 1350 | return irq_domain_alloc_irqs_recursive(domain, irq_base, | ||
| 1351 | nr_irqs, arg); | ||
| 1352 | return -ENOSYS; | ||
| 1353 | } | 1493 | } |
| 1354 | EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); | 1494 | EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); |
| 1355 | 1495 | ||
| @@ -1364,10 +1504,10 @@ EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); | |||
| 1364 | void irq_domain_free_irqs_parent(struct irq_domain *domain, | 1504 | void irq_domain_free_irqs_parent(struct irq_domain *domain, |
| 1365 | unsigned int irq_base, unsigned int nr_irqs) | 1505 | unsigned int irq_base, unsigned int nr_irqs) |
| 1366 | { | 1506 | { |
| 1367 | /* irq_domain_free_irqs_recursive() will call parent's free */ | 1507 | if (!domain->parent) |
| 1368 | if (!irq_domain_is_auto_recursive(domain) && domain->parent) | 1508 | return; |
| 1369 | irq_domain_free_irqs_recursive(domain->parent, irq_base, | 1509 | |
| 1370 | nr_irqs); | 1510 | irq_domain_free_irqs_hierarchy(domain->parent, irq_base, nr_irqs); |
| 1371 | } | 1511 | } |
| 1372 | EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); | 1512 | EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); |
| 1373 | 1513 | ||
| @@ -1487,3 +1627,77 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain) | |||
| 1487 | { | 1627 | { |
| 1488 | } | 1628 | } |
| 1489 | #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ | 1629 | #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ |
| 1630 | |||
| 1631 | #ifdef CONFIG_GENERIC_IRQ_DEBUGFS | ||
| 1632 | static struct dentry *domain_dir; | ||
| 1633 | |||
| 1634 | static void | ||
| 1635 | irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind) | ||
| 1636 | { | ||
| 1637 | seq_printf(m, "%*sname: %s\n", ind, "", d->name); | ||
| 1638 | seq_printf(m, "%*ssize: %u\n", ind + 1, "", | ||
| 1639 | d->revmap_size + d->revmap_direct_max_irq); | ||
| 1640 | seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount); | ||
| 1641 | seq_printf(m, "%*sflags: 0x%08x\n", ind +1 , "", d->flags); | ||
| 1642 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
| 1643 | if (!d->parent) | ||
| 1644 | return; | ||
| 1645 | seq_printf(m, "%*sparent: %s\n", ind + 1, "", d->parent->name); | ||
| 1646 | irq_domain_debug_show_one(m, d->parent, ind + 4); | ||
| 1647 | #endif | ||
| 1648 | } | ||
| 1649 | |||
| 1650 | static int irq_domain_debug_show(struct seq_file *m, void *p) | ||
| 1651 | { | ||
| 1652 | struct irq_domain *d = m->private; | ||
| 1653 | |||
| 1654 | /* Default domain? Might be NULL */ | ||
| 1655 | if (!d) { | ||
| 1656 | if (!irq_default_domain) | ||
| 1657 | return 0; | ||
| 1658 | d = irq_default_domain; | ||
| 1659 | } | ||
| 1660 | irq_domain_debug_show_one(m, d, 0); | ||
| 1661 | return 0; | ||
| 1662 | } | ||
| 1663 | |||
| 1664 | static int irq_domain_debug_open(struct inode *inode, struct file *file) | ||
| 1665 | { | ||
| 1666 | return single_open(file, irq_domain_debug_show, inode->i_private); | ||
| 1667 | } | ||
| 1668 | |||
| 1669 | static const struct file_operations dfs_domain_ops = { | ||
| 1670 | .open = irq_domain_debug_open, | ||
| 1671 | .read = seq_read, | ||
| 1672 | .llseek = seq_lseek, | ||
| 1673 | .release = single_release, | ||
| 1674 | }; | ||
| 1675 | |||
| 1676 | static void debugfs_add_domain_dir(struct irq_domain *d) | ||
| 1677 | { | ||
| 1678 | if (!d->name || !domain_dir || d->debugfs_file) | ||
| 1679 | return; | ||
| 1680 | d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d, | ||
| 1681 | &dfs_domain_ops); | ||
| 1682 | } | ||
| 1683 | |||
| 1684 | static void debugfs_remove_domain_dir(struct irq_domain *d) | ||
| 1685 | { | ||
| 1686 | debugfs_remove(d->debugfs_file); | ||
| 1687 | } | ||
| 1688 | |||
| 1689 | void __init irq_domain_debugfs_init(struct dentry *root) | ||
| 1690 | { | ||
| 1691 | struct irq_domain *d; | ||
| 1692 | |||
| 1693 | domain_dir = debugfs_create_dir("domains", root); | ||
| 1694 | if (!domain_dir) | ||
| 1695 | return; | ||
| 1696 | |||
| 1697 | debugfs_create_file("default", 0444, domain_dir, NULL, &dfs_domain_ops); | ||
| 1698 | mutex_lock(&irq_domain_mutex); | ||
| 1699 | list_for_each_entry(d, &irq_domain_list, link) | ||
| 1700 | debugfs_add_domain_dir(d); | ||
| 1701 | mutex_unlock(&irq_domain_mutex); | ||
| 1702 | } | ||
| 1703 | #endif | ||
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 070be980c37a..1d1a5b945ab4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -168,34 +168,6 @@ void irq_set_thread_affinity(struct irq_desc *desc) | |||
| 168 | set_bit(IRQTF_AFFINITY, &action->thread_flags); | 168 | set_bit(IRQTF_AFFINITY, &action->thread_flags); |
| 169 | } | 169 | } |
| 170 | 170 | ||
| 171 | #ifdef CONFIG_GENERIC_PENDING_IRQ | ||
| 172 | static inline bool irq_can_move_pcntxt(struct irq_data *data) | ||
| 173 | { | ||
| 174 | return irqd_can_move_in_process_context(data); | ||
| 175 | } | ||
| 176 | static inline bool irq_move_pending(struct irq_data *data) | ||
| 177 | { | ||
| 178 | return irqd_is_setaffinity_pending(data); | ||
| 179 | } | ||
| 180 | static inline void | ||
| 181 | irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) | ||
| 182 | { | ||
| 183 | cpumask_copy(desc->pending_mask, mask); | ||
| 184 | } | ||
| 185 | static inline void | ||
| 186 | irq_get_pending(struct cpumask *mask, struct irq_desc *desc) | ||
| 187 | { | ||
| 188 | cpumask_copy(mask, desc->pending_mask); | ||
| 189 | } | ||
| 190 | #else | ||
| 191 | static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; } | ||
| 192 | static inline bool irq_move_pending(struct irq_data *data) { return false; } | ||
| 193 | static inline void | ||
| 194 | irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } | ||
| 195 | static inline void | ||
| 196 | irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } | ||
| 197 | #endif | ||
| 198 | |||
| 199 | int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, | 171 | int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, |
| 200 | bool force) | 172 | bool force) |
| 201 | { | 173 | { |
| @@ -345,15 +317,18 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); | |||
| 345 | /* | 317 | /* |
| 346 | * Generic version of the affinity autoselector. | 318 | * Generic version of the affinity autoselector. |
| 347 | */ | 319 | */ |
| 348 | static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) | 320 | int irq_setup_affinity(struct irq_desc *desc) |
| 349 | { | 321 | { |
| 350 | struct cpumask *set = irq_default_affinity; | 322 | struct cpumask *set = irq_default_affinity; |
| 351 | int node = irq_desc_get_node(desc); | 323 | int ret, node = irq_desc_get_node(desc); |
| 324 | static DEFINE_RAW_SPINLOCK(mask_lock); | ||
| 325 | static struct cpumask mask; | ||
| 352 | 326 | ||
| 353 | /* Excludes PER_CPU and NO_BALANCE interrupts */ | 327 | /* Excludes PER_CPU and NO_BALANCE interrupts */ |
| 354 | if (!__irq_can_set_affinity(desc)) | 328 | if (!__irq_can_set_affinity(desc)) |
| 355 | return 0; | 329 | return 0; |
| 356 | 330 | ||
| 331 | raw_spin_lock(&mask_lock); | ||
| 357 | /* | 332 | /* |
| 358 | * Preserve the managed affinity setting and a userspace affinity | 333 | * Preserve the managed affinity setting and a userspace affinity |
| 359 | * setup, but make sure that one of the targets is online. | 334 | * setup, but make sure that one of the targets is online. |
| @@ -367,46 +342,40 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask) | |||
| 367 | irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); | 342 | irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); |
| 368 | } | 343 | } |
| 369 | 344 | ||
| 370 | cpumask_and(mask, cpu_online_mask, set); | 345 | cpumask_and(&mask, cpu_online_mask, set); |
| 371 | if (node != NUMA_NO_NODE) { | 346 | if (node != NUMA_NO_NODE) { |
| 372 | const struct cpumask *nodemask = cpumask_of_node(node); | 347 | const struct cpumask *nodemask = cpumask_of_node(node); |
| 373 | 348 | ||
| 374 | /* make sure at least one of the cpus in nodemask is online */ | 349 | /* make sure at least one of the cpus in nodemask is online */ |
| 375 | if (cpumask_intersects(mask, nodemask)) | 350 | if (cpumask_intersects(&mask, nodemask)) |
| 376 | cpumask_and(mask, mask, nodemask); | 351 | cpumask_and(&mask, &mask, nodemask); |
| 377 | } | 352 | } |
| 378 | irq_do_set_affinity(&desc->irq_data, mask, false); | 353 | ret = irq_do_set_affinity(&desc->irq_data, &mask, false); |
| 379 | return 0; | 354 | raw_spin_unlock(&mask_lock); |
| 355 | return ret; | ||
| 380 | } | 356 | } |
| 381 | #else | 357 | #else |
| 382 | /* Wrapper for ALPHA specific affinity selector magic */ | 358 | /* Wrapper for ALPHA specific affinity selector magic */ |
| 383 | static inline int setup_affinity(struct irq_desc *d, struct cpumask *mask) | 359 | int irq_setup_affinity(struct irq_desc *desc) |
| 384 | { | 360 | { |
| 385 | return irq_select_affinity(irq_desc_get_irq(d)); | 361 | return irq_select_affinity(irq_desc_get_irq(desc)); |
| 386 | } | 362 | } |
| 387 | #endif | 363 | #endif |
| 388 | 364 | ||
| 389 | /* | 365 | /* |
| 390 | * Called when affinity is set via /proc/irq | 366 | * Called when a bogus affinity is set via /proc/irq |
| 391 | */ | 367 | */ |
| 392 | int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) | 368 | int irq_select_affinity_usr(unsigned int irq) |
| 393 | { | 369 | { |
| 394 | struct irq_desc *desc = irq_to_desc(irq); | 370 | struct irq_desc *desc = irq_to_desc(irq); |
| 395 | unsigned long flags; | 371 | unsigned long flags; |
| 396 | int ret; | 372 | int ret; |
| 397 | 373 | ||
| 398 | raw_spin_lock_irqsave(&desc->lock, flags); | 374 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 399 | ret = setup_affinity(desc, mask); | 375 | ret = irq_setup_affinity(desc); |
| 400 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 376 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 401 | return ret; | 377 | return ret; |
| 402 | } | 378 | } |
| 403 | |||
| 404 | #else | ||
| 405 | static inline int | ||
| 406 | setup_affinity(struct irq_desc *desc, struct cpumask *mask) | ||
| 407 | { | ||
| 408 | return 0; | ||
| 409 | } | ||
| 410 | #endif | 379 | #endif |
| 411 | 380 | ||
| 412 | /** | 381 | /** |
| @@ -533,9 +502,15 @@ void __enable_irq(struct irq_desc *desc) | |||
| 533 | goto err_out; | 502 | goto err_out; |
| 534 | /* Prevent probing on this irq: */ | 503 | /* Prevent probing on this irq: */ |
| 535 | irq_settings_set_noprobe(desc); | 504 | irq_settings_set_noprobe(desc); |
| 536 | irq_enable(desc); | 505 | /* |
| 537 | check_irq_resend(desc); | 506 | * Call irq_startup() not irq_enable() here because the |
| 538 | /* fall-through */ | 507 | * interrupt might be marked NOAUTOEN. So irq_startup() |
| 508 | * needs to be invoked when it gets enabled the first | ||
| 509 | * time. If it was already started up, then irq_startup() | ||
| 510 | * will invoke irq_enable() under the hood. | ||
| 511 | */ | ||
| 512 | irq_startup(desc, IRQ_RESEND, IRQ_START_COND); | ||
| 513 | break; | ||
| 539 | } | 514 | } |
| 540 | default: | 515 | default: |
| 541 | desc->depth--; | 516 | desc->depth--; |
| @@ -1115,6 +1090,16 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) | |||
| 1115 | /* | 1090 | /* |
| 1116 | * Internal function to register an irqaction - typically used to | 1091 | * Internal function to register an irqaction - typically used to |
| 1117 | * allocate special interrupts that are part of the architecture. | 1092 | * allocate special interrupts that are part of the architecture. |
| 1093 | * | ||
| 1094 | * Locking rules: | ||
| 1095 | * | ||
| 1096 | * desc->request_mutex Provides serialization against a concurrent free_irq() | ||
| 1097 | * chip_bus_lock Provides serialization for slow bus operations | ||
| 1098 | * desc->lock Provides serialization against hard interrupts | ||
| 1099 | * | ||
| 1100 | * chip_bus_lock and desc->lock are sufficient for all other management and | ||
| 1101 | * interrupt related functions. desc->request_mutex solely serializes | ||
| 1102 | * request/free_irq(). | ||
| 1118 | */ | 1103 | */ |
| 1119 | static int | 1104 | static int |
| 1120 | __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | 1105 | __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) |
| @@ -1122,7 +1107,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 1122 | struct irqaction *old, **old_ptr; | 1107 | struct irqaction *old, **old_ptr; |
| 1123 | unsigned long flags, thread_mask = 0; | 1108 | unsigned long flags, thread_mask = 0; |
| 1124 | int ret, nested, shared = 0; | 1109 | int ret, nested, shared = 0; |
| 1125 | cpumask_var_t mask; | ||
| 1126 | 1110 | ||
| 1127 | if (!desc) | 1111 | if (!desc) |
| 1128 | return -EINVAL; | 1112 | return -EINVAL; |
| @@ -1181,11 +1165,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 1181 | } | 1165 | } |
| 1182 | } | 1166 | } |
| 1183 | 1167 | ||
| 1184 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { | ||
| 1185 | ret = -ENOMEM; | ||
| 1186 | goto out_thread; | ||
| 1187 | } | ||
| 1188 | |||
| 1189 | /* | 1168 | /* |
| 1190 | * Drivers are often written to work w/o knowledge about the | 1169 | * Drivers are often written to work w/o knowledge about the |
| 1191 | * underlying irq chip implementation, so a request for a | 1170 | * underlying irq chip implementation, so a request for a |
| @@ -1199,7 +1178,34 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 1199 | new->flags &= ~IRQF_ONESHOT; | 1178 | new->flags &= ~IRQF_ONESHOT; |
| 1200 | 1179 | ||
| 1201 | /* | 1180 | /* |
| 1181 | * Protects against a concurrent __free_irq() call which might wait | ||
| 1182 | * for synchronize_irq() to complete without holding the optional | ||
| 1183 | * chip bus lock and desc->lock. | ||
| 1184 | */ | ||
| 1185 | mutex_lock(&desc->request_mutex); | ||
| 1186 | |||
| 1187 | /* | ||
| 1188 | * Acquire bus lock as the irq_request_resources() callback below | ||
| 1189 | * might rely on the serialization or the magic power management | ||
| 1190 | * functions which are abusing the irq_bus_lock() callback, | ||
| 1191 | */ | ||
| 1192 | chip_bus_lock(desc); | ||
| 1193 | |||
| 1194 | /* First installed action requests resources. */ | ||
| 1195 | if (!desc->action) { | ||
| 1196 | ret = irq_request_resources(desc); | ||
| 1197 | if (ret) { | ||
| 1198 | pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", | ||
| 1199 | new->name, irq, desc->irq_data.chip->name); | ||
| 1200 | goto out_bus_unlock; | ||
| 1201 | } | ||
| 1202 | } | ||
| 1203 | |||
| 1204 | /* | ||
| 1202 | * The following block of code has to be executed atomically | 1205 | * The following block of code has to be executed atomically |
| 1206 | * protected against a concurrent interrupt and any of the other | ||
| 1207 | * management calls which are not serialized via | ||
| 1208 | * desc->request_mutex or the optional bus lock. | ||
| 1203 | */ | 1209 | */ |
| 1204 | raw_spin_lock_irqsave(&desc->lock, flags); | 1210 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 1205 | old_ptr = &desc->action; | 1211 | old_ptr = &desc->action; |
| @@ -1250,7 +1256,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 1250 | */ | 1256 | */ |
| 1251 | if (thread_mask == ~0UL) { | 1257 | if (thread_mask == ~0UL) { |
| 1252 | ret = -EBUSY; | 1258 | ret = -EBUSY; |
| 1253 | goto out_mask; | 1259 | goto out_unlock; |
| 1254 | } | 1260 | } |
| 1255 | /* | 1261 | /* |
| 1256 | * The thread_mask for the action is or'ed to | 1262 | * The thread_mask for the action is or'ed to |
| @@ -1294,17 +1300,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 1294 | pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", | 1300 | pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", |
| 1295 | irq); | 1301 | irq); |
| 1296 | ret = -EINVAL; | 1302 | ret = -EINVAL; |
| 1297 | goto out_mask; | 1303 | goto out_unlock; |
| 1298 | } | 1304 | } |
| 1299 | 1305 | ||
| 1300 | if (!shared) { | 1306 | if (!shared) { |
| 1301 | ret = irq_request_resources(desc); | ||
| 1302 | if (ret) { | ||
| 1303 | pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", | ||
| 1304 | new->name, irq, desc->irq_data.chip->name); | ||
| 1305 | goto out_mask; | ||
| 1306 | } | ||
| 1307 | |||
| 1308 | init_waitqueue_head(&desc->wait_for_threads); | 1307 | init_waitqueue_head(&desc->wait_for_threads); |
| 1309 | 1308 | ||
| 1310 | /* Setup the type (level, edge polarity) if configured: */ | 1309 | /* Setup the type (level, edge polarity) if configured: */ |
| @@ -1313,7 +1312,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 1313 | new->flags & IRQF_TRIGGER_MASK); | 1312 | new->flags & IRQF_TRIGGER_MASK); |
| 1314 | 1313 | ||
| 1315 | if (ret) | 1314 | if (ret) |
| 1316 | goto out_mask; | 1315 | goto out_unlock; |
| 1317 | } | 1316 | } |
| 1318 | 1317 | ||
| 1319 | desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ | 1318 | desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ |
| @@ -1328,20 +1327,25 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 1328 | if (new->flags & IRQF_ONESHOT) | 1327 | if (new->flags & IRQF_ONESHOT) |
| 1329 | desc->istate |= IRQS_ONESHOT; | 1328 | desc->istate |= IRQS_ONESHOT; |
| 1330 | 1329 | ||
| 1331 | if (irq_settings_can_autoenable(desc)) | ||
| 1332 | irq_startup(desc, true); | ||
| 1333 | else | ||
| 1334 | /* Undo nested disables: */ | ||
| 1335 | desc->depth = 1; | ||
| 1336 | |||
| 1337 | /* Exclude IRQ from balancing if requested */ | 1330 | /* Exclude IRQ from balancing if requested */ |
| 1338 | if (new->flags & IRQF_NOBALANCING) { | 1331 | if (new->flags & IRQF_NOBALANCING) { |
| 1339 | irq_settings_set_no_balancing(desc); | 1332 | irq_settings_set_no_balancing(desc); |
| 1340 | irqd_set(&desc->irq_data, IRQD_NO_BALANCING); | 1333 | irqd_set(&desc->irq_data, IRQD_NO_BALANCING); |
| 1341 | } | 1334 | } |
| 1342 | 1335 | ||
| 1343 | /* Set default affinity mask once everything is setup */ | 1336 | if (irq_settings_can_autoenable(desc)) { |
| 1344 | setup_affinity(desc, mask); | 1337 | irq_startup(desc, IRQ_RESEND, IRQ_START_COND); |
| 1338 | } else { | ||
| 1339 | /* | ||
| 1340 | * Shared interrupts do not go well with disabling | ||
| 1341 | * auto enable. The sharing interrupt might request | ||
| 1342 | * it while it's still disabled and then wait for | ||
| 1343 | * interrupts forever. | ||
| 1344 | */ | ||
| 1345 | WARN_ON_ONCE(new->flags & IRQF_SHARED); | ||
| 1346 | /* Undo nested disables: */ | ||
| 1347 | desc->depth = 1; | ||
| 1348 | } | ||
| 1345 | 1349 | ||
| 1346 | } else if (new->flags & IRQF_TRIGGER_MASK) { | 1350 | } else if (new->flags & IRQF_TRIGGER_MASK) { |
| 1347 | unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; | 1351 | unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; |
| @@ -1371,6 +1375,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 1371 | } | 1375 | } |
| 1372 | 1376 | ||
| 1373 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 1377 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 1378 | chip_bus_sync_unlock(desc); | ||
| 1379 | mutex_unlock(&desc->request_mutex); | ||
| 1380 | |||
| 1381 | irq_setup_timings(desc, new); | ||
| 1374 | 1382 | ||
| 1375 | /* | 1383 | /* |
| 1376 | * Strictly no need to wake it up, but hung_task complains | 1384 | * Strictly no need to wake it up, but hung_task complains |
| @@ -1382,10 +1390,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 1382 | wake_up_process(new->secondary->thread); | 1390 | wake_up_process(new->secondary->thread); |
| 1383 | 1391 | ||
| 1384 | register_irq_proc(irq, desc); | 1392 | register_irq_proc(irq, desc); |
| 1393 | irq_add_debugfs_entry(irq, desc); | ||
| 1385 | new->dir = NULL; | 1394 | new->dir = NULL; |
| 1386 | register_handler_proc(irq, new); | 1395 | register_handler_proc(irq, new); |
| 1387 | free_cpumask_var(mask); | ||
| 1388 | |||
| 1389 | return 0; | 1396 | return 0; |
| 1390 | 1397 | ||
| 1391 | mismatch: | 1398 | mismatch: |
| @@ -1398,9 +1405,14 @@ mismatch: | |||
| 1398 | } | 1405 | } |
| 1399 | ret = -EBUSY; | 1406 | ret = -EBUSY; |
| 1400 | 1407 | ||
| 1401 | out_mask: | 1408 | out_unlock: |
| 1402 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 1409 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 1403 | free_cpumask_var(mask); | 1410 | |
| 1411 | if (!desc->action) | ||
| 1412 | irq_release_resources(desc); | ||
| 1413 | out_bus_unlock: | ||
| 1414 | chip_bus_sync_unlock(desc); | ||
| 1415 | mutex_unlock(&desc->request_mutex); | ||
| 1404 | 1416 | ||
| 1405 | out_thread: | 1417 | out_thread: |
| 1406 | if (new->thread) { | 1418 | if (new->thread) { |
| @@ -1441,9 +1453,7 @@ int setup_irq(unsigned int irq, struct irqaction *act) | |||
| 1441 | if (retval < 0) | 1453 | if (retval < 0) |
| 1442 | return retval; | 1454 | return retval; |
| 1443 | 1455 | ||
| 1444 | chip_bus_lock(desc); | ||
| 1445 | retval = __setup_irq(irq, desc, act); | 1456 | retval = __setup_irq(irq, desc, act); |
| 1446 | chip_bus_sync_unlock(desc); | ||
| 1447 | 1457 | ||
| 1448 | if (retval) | 1458 | if (retval) |
| 1449 | irq_chip_pm_put(&desc->irq_data); | 1459 | irq_chip_pm_put(&desc->irq_data); |
| @@ -1467,6 +1477,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
| 1467 | if (!desc) | 1477 | if (!desc) |
| 1468 | return NULL; | 1478 | return NULL; |
| 1469 | 1479 | ||
| 1480 | mutex_lock(&desc->request_mutex); | ||
| 1470 | chip_bus_lock(desc); | 1481 | chip_bus_lock(desc); |
| 1471 | raw_spin_lock_irqsave(&desc->lock, flags); | 1482 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 1472 | 1483 | ||
| @@ -1482,6 +1493,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
| 1482 | WARN(1, "Trying to free already-free IRQ %d\n", irq); | 1493 | WARN(1, "Trying to free already-free IRQ %d\n", irq); |
| 1483 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 1494 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 1484 | chip_bus_sync_unlock(desc); | 1495 | chip_bus_sync_unlock(desc); |
| 1496 | mutex_unlock(&desc->request_mutex); | ||
| 1485 | return NULL; | 1497 | return NULL; |
| 1486 | } | 1498 | } |
| 1487 | 1499 | ||
| @@ -1499,7 +1511,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
| 1499 | if (!desc->action) { | 1511 | if (!desc->action) { |
| 1500 | irq_settings_clr_disable_unlazy(desc); | 1512 | irq_settings_clr_disable_unlazy(desc); |
| 1501 | irq_shutdown(desc); | 1513 | irq_shutdown(desc); |
| 1502 | irq_release_resources(desc); | ||
| 1503 | } | 1514 | } |
| 1504 | 1515 | ||
| 1505 | #ifdef CONFIG_SMP | 1516 | #ifdef CONFIG_SMP |
| @@ -1509,6 +1520,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
| 1509 | #endif | 1520 | #endif |
| 1510 | 1521 | ||
| 1511 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 1522 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 1523 | /* | ||
| 1524 | * Drop bus_lock here so the changes which were done in the chip | ||
| 1525 | * callbacks above are synced out to the irq chips which hang | ||
| 1526 | * behind a slow bus (I2C, SPI) before calling synchronize_irq(). | ||
| 1527 | * | ||
| 1528 | * Aside of that the bus_lock can also be taken from the threaded | ||
| 1529 | * handler in irq_finalize_oneshot() which results in a deadlock | ||
| 1530 | * because synchronize_irq() would wait forever for the thread to | ||
| 1531 | * complete, which is blocked on the bus lock. | ||
| 1532 | * | ||
| 1533 | * The still held desc->request_mutex() protects against a | ||
| 1534 | * concurrent request_irq() of this irq so the release of resources | ||
| 1535 | * and timing data is properly serialized. | ||
| 1536 | */ | ||
| 1512 | chip_bus_sync_unlock(desc); | 1537 | chip_bus_sync_unlock(desc); |
| 1513 | 1538 | ||
| 1514 | unregister_handler_proc(irq, action); | 1539 | unregister_handler_proc(irq, action); |
| @@ -1541,6 +1566,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
| 1541 | } | 1566 | } |
| 1542 | } | 1567 | } |
| 1543 | 1568 | ||
| 1569 | /* Last action releases resources */ | ||
| 1570 | if (!desc->action) { | ||
| 1571 | /* | ||
| 1572 | * Reaquire bus lock as irq_release_resources() might | ||
| 1573 | * require it to deallocate resources over the slow bus. | ||
| 1574 | */ | ||
| 1575 | chip_bus_lock(desc); | ||
| 1576 | irq_release_resources(desc); | ||
| 1577 | chip_bus_sync_unlock(desc); | ||
| 1578 | irq_remove_timings(desc); | ||
| 1579 | } | ||
| 1580 | |||
| 1581 | mutex_unlock(&desc->request_mutex); | ||
| 1582 | |||
| 1544 | irq_chip_pm_put(&desc->irq_data); | 1583 | irq_chip_pm_put(&desc->irq_data); |
| 1545 | module_put(desc->owner); | 1584 | module_put(desc->owner); |
| 1546 | kfree(action->secondary); | 1585 | kfree(action->secondary); |
| @@ -1697,9 +1736,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, | |||
| 1697 | return retval; | 1736 | return retval; |
| 1698 | } | 1737 | } |
| 1699 | 1738 | ||
| 1700 | chip_bus_lock(desc); | ||
| 1701 | retval = __setup_irq(irq, desc, action); | 1739 | retval = __setup_irq(irq, desc, action); |
| 1702 | chip_bus_sync_unlock(desc); | ||
| 1703 | 1740 | ||
| 1704 | if (retval) { | 1741 | if (retval) { |
| 1705 | irq_chip_pm_put(&desc->irq_data); | 1742 | irq_chip_pm_put(&desc->irq_data); |
| @@ -1947,9 +1984,7 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) | |||
| 1947 | if (retval < 0) | 1984 | if (retval < 0) |
| 1948 | return retval; | 1985 | return retval; |
| 1949 | 1986 | ||
| 1950 | chip_bus_lock(desc); | ||
| 1951 | retval = __setup_irq(irq, desc, act); | 1987 | retval = __setup_irq(irq, desc, act); |
| 1952 | chip_bus_sync_unlock(desc); | ||
| 1953 | 1988 | ||
| 1954 | if (retval) | 1989 | if (retval) |
| 1955 | irq_chip_pm_put(&desc->irq_data); | 1990 | irq_chip_pm_put(&desc->irq_data); |
| @@ -1958,9 +1993,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) | |||
| 1958 | } | 1993 | } |
| 1959 | 1994 | ||
| 1960 | /** | 1995 | /** |
| 1961 | * request_percpu_irq - allocate a percpu interrupt line | 1996 | * __request_percpu_irq - allocate a percpu interrupt line |
| 1962 | * @irq: Interrupt line to allocate | 1997 | * @irq: Interrupt line to allocate |
| 1963 | * @handler: Function to be called when the IRQ occurs. | 1998 | * @handler: Function to be called when the IRQ occurs. |
| 1999 | * @flags: Interrupt type flags (IRQF_TIMER only) | ||
| 1964 | * @devname: An ascii name for the claiming device | 2000 | * @devname: An ascii name for the claiming device |
| 1965 | * @dev_id: A percpu cookie passed back to the handler function | 2001 | * @dev_id: A percpu cookie passed back to the handler function |
| 1966 | * | 2002 | * |
| @@ -1973,8 +2009,9 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) | |||
| 1973 | * the handler gets called with the interrupted CPU's instance of | 2009 | * the handler gets called with the interrupted CPU's instance of |
| 1974 | * that variable. | 2010 | * that variable. |
| 1975 | */ | 2011 | */ |
| 1976 | int request_percpu_irq(unsigned int irq, irq_handler_t handler, | 2012 | int __request_percpu_irq(unsigned int irq, irq_handler_t handler, |
| 1977 | const char *devname, void __percpu *dev_id) | 2013 | unsigned long flags, const char *devname, |
| 2014 | void __percpu *dev_id) | ||
| 1978 | { | 2015 | { |
| 1979 | struct irqaction *action; | 2016 | struct irqaction *action; |
| 1980 | struct irq_desc *desc; | 2017 | struct irq_desc *desc; |
| @@ -1988,12 +2025,15 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, | |||
| 1988 | !irq_settings_is_per_cpu_devid(desc)) | 2025 | !irq_settings_is_per_cpu_devid(desc)) |
| 1989 | return -EINVAL; | 2026 | return -EINVAL; |
| 1990 | 2027 | ||
| 2028 | if (flags && flags != IRQF_TIMER) | ||
| 2029 | return -EINVAL; | ||
| 2030 | |||
| 1991 | action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); | 2031 | action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); |
| 1992 | if (!action) | 2032 | if (!action) |
| 1993 | return -ENOMEM; | 2033 | return -ENOMEM; |
| 1994 | 2034 | ||
| 1995 | action->handler = handler; | 2035 | action->handler = handler; |
| 1996 | action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND; | 2036 | action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; |
| 1997 | action->name = devname; | 2037 | action->name = devname; |
| 1998 | action->percpu_dev_id = dev_id; | 2038 | action->percpu_dev_id = dev_id; |
| 1999 | 2039 | ||
| @@ -2003,9 +2043,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, | |||
| 2003 | return retval; | 2043 | return retval; |
| 2004 | } | 2044 | } |
| 2005 | 2045 | ||
| 2006 | chip_bus_lock(desc); | ||
| 2007 | retval = __setup_irq(irq, desc, action); | 2046 | retval = __setup_irq(irq, desc, action); |
| 2008 | chip_bus_sync_unlock(desc); | ||
| 2009 | 2047 | ||
| 2010 | if (retval) { | 2048 | if (retval) { |
| 2011 | irq_chip_pm_put(&desc->irq_data); | 2049 | irq_chip_pm_put(&desc->irq_data); |
| @@ -2014,7 +2052,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, | |||
| 2014 | 2052 | ||
| 2015 | return retval; | 2053 | return retval; |
| 2016 | } | 2054 | } |
| 2017 | EXPORT_SYMBOL_GPL(request_percpu_irq); | 2055 | EXPORT_SYMBOL_GPL(__request_percpu_irq); |
| 2018 | 2056 | ||
| 2019 | /** | 2057 | /** |
| 2020 | * irq_get_irqchip_state - returns the irqchip state of a interrupt. | 2058 | * irq_get_irqchip_state - returns the irqchip state of a interrupt. |
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 37ddb7bda651..6ca054a3f91d 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c | |||
| @@ -4,6 +4,36 @@ | |||
| 4 | 4 | ||
| 5 | #include "internals.h" | 5 | #include "internals.h" |
| 6 | 6 | ||
| 7 | /** | ||
| 8 | * irq_fixup_move_pending - Cleanup irq move pending from a dying CPU | ||
| 9 | * @desc: Interrupt descpriptor to clean up | ||
| 10 | * @force_clear: If set clear the move pending bit unconditionally. | ||
| 11 | * If not set, clear it only when the dying CPU is the | ||
| 12 | * last one in the pending mask. | ||
| 13 | * | ||
| 14 | * Returns true if the pending bit was set and the pending mask contains an | ||
| 15 | * online CPU other than the dying CPU. | ||
| 16 | */ | ||
| 17 | bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear) | ||
| 18 | { | ||
| 19 | struct irq_data *data = irq_desc_get_irq_data(desc); | ||
| 20 | |||
| 21 | if (!irqd_is_setaffinity_pending(data)) | ||
| 22 | return false; | ||
| 23 | |||
| 24 | /* | ||
| 25 | * The outgoing CPU might be the last online target in a pending | ||
| 26 | * interrupt move. If that's the case clear the pending move bit. | ||
| 27 | */ | ||
| 28 | if (cpumask_any_and(desc->pending_mask, cpu_online_mask) >= nr_cpu_ids) { | ||
| 29 | irqd_clr_move_pending(data); | ||
| 30 | return false; | ||
| 31 | } | ||
| 32 | if (force_clear) | ||
| 33 | irqd_clr_move_pending(data); | ||
| 34 | return true; | ||
| 35 | } | ||
| 36 | |||
| 7 | void irq_move_masked_irq(struct irq_data *idata) | 37 | void irq_move_masked_irq(struct irq_data *idata) |
| 8 | { | 38 | { |
| 9 | struct irq_desc *desc = irq_data_to_desc(idata); | 39 | struct irq_desc *desc = irq_data_to_desc(idata); |
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index ddc2f5427f75..48eadf416c24 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c | |||
| @@ -265,13 +265,20 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, | |||
| 265 | struct msi_domain_info *info, | 265 | struct msi_domain_info *info, |
| 266 | struct irq_domain *parent) | 266 | struct irq_domain *parent) |
| 267 | { | 267 | { |
| 268 | struct irq_domain *domain; | ||
| 269 | |||
| 268 | if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) | 270 | if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) |
| 269 | msi_domain_update_dom_ops(info); | 271 | msi_domain_update_dom_ops(info); |
| 270 | if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) | 272 | if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) |
| 271 | msi_domain_update_chip_ops(info); | 273 | msi_domain_update_chip_ops(info); |
| 272 | 274 | ||
| 273 | return irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0, | 275 | domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0, |
| 274 | fwnode, &msi_domain_ops, info); | 276 | fwnode, &msi_domain_ops, info); |
| 277 | |||
| 278 | if (domain && !domain->name && info->chip) | ||
| 279 | domain->name = info->chip->name; | ||
| 280 | |||
| 281 | return domain; | ||
| 275 | } | 282 | } |
| 276 | 283 | ||
| 277 | int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, | 284 | int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, |
| @@ -308,7 +315,7 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, | |||
| 308 | 315 | ||
| 309 | ops->set_desc(arg, desc); | 316 | ops->set_desc(arg, desc); |
| 310 | /* Assumes the domain mutex is held! */ | 317 | /* Assumes the domain mutex is held! */ |
| 311 | ret = irq_domain_alloc_irqs_recursive(domain, virq, 1, arg); | 318 | ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg); |
| 312 | if (ret) | 319 | if (ret) |
| 313 | break; | 320 | break; |
| 314 | 321 | ||
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index cea1de0161f1..6bd9b58429cc 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
| @@ -149,6 +149,8 @@ static void resume_irq(struct irq_desc *desc) | |||
| 149 | 149 | ||
| 150 | /* Pretend that it got disabled ! */ | 150 | /* Pretend that it got disabled ! */ |
| 151 | desc->depth++; | 151 | desc->depth++; |
| 152 | irq_state_set_disabled(desc); | ||
| 153 | irq_state_set_masked(desc); | ||
| 152 | resume: | 154 | resume: |
| 153 | desc->istate &= ~IRQS_SUSPENDED; | 155 | desc->istate &= ~IRQS_SUSPENDED; |
| 154 | __enable_irq(desc); | 156 | __enable_irq(desc); |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index c53edad7b459..7f9642a1e267 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
| @@ -37,19 +37,47 @@ static struct proc_dir_entry *root_irq_dir; | |||
| 37 | 37 | ||
| 38 | #ifdef CONFIG_SMP | 38 | #ifdef CONFIG_SMP |
| 39 | 39 | ||
| 40 | static int show_irq_affinity(int type, struct seq_file *m, void *v) | 40 | enum { |
| 41 | AFFINITY, | ||
| 42 | AFFINITY_LIST, | ||
| 43 | EFFECTIVE, | ||
| 44 | EFFECTIVE_LIST, | ||
| 45 | }; | ||
| 46 | |||
| 47 | static int show_irq_affinity(int type, struct seq_file *m) | ||
| 41 | { | 48 | { |
| 42 | struct irq_desc *desc = irq_to_desc((long)m->private); | 49 | struct irq_desc *desc = irq_to_desc((long)m->private); |
| 43 | const struct cpumask *mask = desc->irq_common_data.affinity; | 50 | const struct cpumask *mask; |
| 44 | 51 | ||
| 52 | switch (type) { | ||
| 53 | case AFFINITY: | ||
| 54 | case AFFINITY_LIST: | ||
| 55 | mask = desc->irq_common_data.affinity; | ||
| 45 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 56 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
| 46 | if (irqd_is_setaffinity_pending(&desc->irq_data)) | 57 | if (irqd_is_setaffinity_pending(&desc->irq_data)) |
| 47 | mask = desc->pending_mask; | 58 | mask = desc->pending_mask; |
| 48 | #endif | 59 | #endif |
| 49 | if (type) | 60 | break; |
| 61 | case EFFECTIVE: | ||
| 62 | case EFFECTIVE_LIST: | ||
| 63 | #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK | ||
| 64 | mask = desc->irq_common_data.effective_affinity; | ||
| 65 | break; | ||
| 66 | #else | ||
| 67 | return -EINVAL; | ||
| 68 | #endif | ||
| 69 | }; | ||
| 70 | |||
| 71 | switch (type) { | ||
| 72 | case AFFINITY_LIST: | ||
| 73 | case EFFECTIVE_LIST: | ||
| 50 | seq_printf(m, "%*pbl\n", cpumask_pr_args(mask)); | 74 | seq_printf(m, "%*pbl\n", cpumask_pr_args(mask)); |
| 51 | else | 75 | break; |
| 76 | case AFFINITY: | ||
| 77 | case EFFECTIVE: | ||
| 52 | seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); | 78 | seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); |
| 79 | break; | ||
| 80 | } | ||
| 53 | return 0; | 81 | return 0; |
| 54 | } | 82 | } |
| 55 | 83 | ||
| @@ -80,12 +108,12 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) | |||
| 80 | int no_irq_affinity; | 108 | int no_irq_affinity; |
| 81 | static int irq_affinity_proc_show(struct seq_file *m, void *v) | 109 | static int irq_affinity_proc_show(struct seq_file *m, void *v) |
| 82 | { | 110 | { |
| 83 | return show_irq_affinity(0, m, v); | 111 | return show_irq_affinity(AFFINITY, m); |
| 84 | } | 112 | } |
| 85 | 113 | ||
| 86 | static int irq_affinity_list_proc_show(struct seq_file *m, void *v) | 114 | static int irq_affinity_list_proc_show(struct seq_file *m, void *v) |
| 87 | { | 115 | { |
| 88 | return show_irq_affinity(1, m, v); | 116 | return show_irq_affinity(AFFINITY_LIST, m); |
| 89 | } | 117 | } |
| 90 | 118 | ||
| 91 | 119 | ||
| @@ -120,9 +148,11 @@ static ssize_t write_irq_affinity(int type, struct file *file, | |||
| 120 | * one online CPU still has to be targeted. | 148 | * one online CPU still has to be targeted. |
| 121 | */ | 149 | */ |
| 122 | if (!cpumask_intersects(new_value, cpu_online_mask)) { | 150 | if (!cpumask_intersects(new_value, cpu_online_mask)) { |
| 123 | /* Special case for empty set - allow the architecture | 151 | /* |
| 124 | code to set default SMP affinity. */ | 152 | * Special case for empty set - allow the architecture code |
| 125 | err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count; | 153 | * to set default SMP affinity. |
| 154 | */ | ||
| 155 | err = irq_select_affinity_usr(irq) ? -EINVAL : count; | ||
| 126 | } else { | 156 | } else { |
| 127 | irq_set_affinity(irq, new_value); | 157 | irq_set_affinity(irq, new_value); |
| 128 | err = count; | 158 | err = count; |
| @@ -183,6 +213,44 @@ static const struct file_operations irq_affinity_list_proc_fops = { | |||
| 183 | .write = irq_affinity_list_proc_write, | 213 | .write = irq_affinity_list_proc_write, |
| 184 | }; | 214 | }; |
| 185 | 215 | ||
| 216 | #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK | ||
| 217 | static int irq_effective_aff_proc_show(struct seq_file *m, void *v) | ||
| 218 | { | ||
| 219 | return show_irq_affinity(EFFECTIVE, m); | ||
| 220 | } | ||
| 221 | |||
| 222 | static int irq_effective_aff_list_proc_show(struct seq_file *m, void *v) | ||
| 223 | { | ||
| 224 | return show_irq_affinity(EFFECTIVE_LIST, m); | ||
| 225 | } | ||
| 226 | |||
| 227 | static int irq_effective_aff_proc_open(struct inode *inode, struct file *file) | ||
| 228 | { | ||
| 229 | return single_open(file, irq_effective_aff_proc_show, PDE_DATA(inode)); | ||
| 230 | } | ||
| 231 | |||
| 232 | static int irq_effective_aff_list_proc_open(struct inode *inode, | ||
| 233 | struct file *file) | ||
| 234 | { | ||
| 235 | return single_open(file, irq_effective_aff_list_proc_show, | ||
| 236 | PDE_DATA(inode)); | ||
| 237 | } | ||
| 238 | |||
| 239 | static const struct file_operations irq_effective_aff_proc_fops = { | ||
| 240 | .open = irq_effective_aff_proc_open, | ||
| 241 | .read = seq_read, | ||
| 242 | .llseek = seq_lseek, | ||
| 243 | .release = single_release, | ||
| 244 | }; | ||
| 245 | |||
| 246 | static const struct file_operations irq_effective_aff_list_proc_fops = { | ||
| 247 | .open = irq_effective_aff_list_proc_open, | ||
| 248 | .read = seq_read, | ||
| 249 | .llseek = seq_lseek, | ||
| 250 | .release = single_release, | ||
| 251 | }; | ||
| 252 | #endif | ||
| 253 | |||
| 186 | static int default_affinity_show(struct seq_file *m, void *v) | 254 | static int default_affinity_show(struct seq_file *m, void *v) |
| 187 | { | 255 | { |
| 188 | seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity)); | 256 | seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity)); |
| @@ -324,6 +392,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) | |||
| 324 | void register_irq_proc(unsigned int irq, struct irq_desc *desc) | 392 | void register_irq_proc(unsigned int irq, struct irq_desc *desc) |
| 325 | { | 393 | { |
| 326 | static DEFINE_MUTEX(register_lock); | 394 | static DEFINE_MUTEX(register_lock); |
| 395 | void __maybe_unused *irqp = (void *)(unsigned long) irq; | ||
| 327 | char name [MAX_NAMELEN]; | 396 | char name [MAX_NAMELEN]; |
| 328 | 397 | ||
| 329 | if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip)) | 398 | if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip)) |
| @@ -349,20 +418,25 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
| 349 | #ifdef CONFIG_SMP | 418 | #ifdef CONFIG_SMP |
| 350 | /* create /proc/irq/<irq>/smp_affinity */ | 419 | /* create /proc/irq/<irq>/smp_affinity */ |
| 351 | proc_create_data("smp_affinity", 0644, desc->dir, | 420 | proc_create_data("smp_affinity", 0644, desc->dir, |
| 352 | &irq_affinity_proc_fops, (void *)(long)irq); | 421 | &irq_affinity_proc_fops, irqp); |
| 353 | 422 | ||
| 354 | /* create /proc/irq/<irq>/affinity_hint */ | 423 | /* create /proc/irq/<irq>/affinity_hint */ |
| 355 | proc_create_data("affinity_hint", 0444, desc->dir, | 424 | proc_create_data("affinity_hint", 0444, desc->dir, |
| 356 | &irq_affinity_hint_proc_fops, (void *)(long)irq); | 425 | &irq_affinity_hint_proc_fops, irqp); |
| 357 | 426 | ||
| 358 | /* create /proc/irq/<irq>/smp_affinity_list */ | 427 | /* create /proc/irq/<irq>/smp_affinity_list */ |
| 359 | proc_create_data("smp_affinity_list", 0644, desc->dir, | 428 | proc_create_data("smp_affinity_list", 0644, desc->dir, |
| 360 | &irq_affinity_list_proc_fops, (void *)(long)irq); | 429 | &irq_affinity_list_proc_fops, irqp); |
| 361 | 430 | ||
| 362 | proc_create_data("node", 0444, desc->dir, | 431 | proc_create_data("node", 0444, desc->dir, |
| 363 | &irq_node_proc_fops, (void *)(long)irq); | 432 | &irq_node_proc_fops, irqp); |
| 433 | # ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK | ||
| 434 | proc_create_data("effective_affinity", 0444, desc->dir, | ||
| 435 | &irq_effective_aff_proc_fops, irqp); | ||
| 436 | proc_create_data("effective_affinity_list", 0444, desc->dir, | ||
| 437 | &irq_effective_aff_list_proc_fops, irqp); | ||
| 438 | # endif | ||
| 364 | #endif | 439 | #endif |
| 365 | |||
| 366 | proc_create_data("spurious", 0444, desc->dir, | 440 | proc_create_data("spurious", 0444, desc->dir, |
| 367 | &irq_spurious_proc_fops, (void *)(long)irq); | 441 | &irq_spurious_proc_fops, (void *)(long)irq); |
| 368 | 442 | ||
| @@ -381,6 +455,10 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
| 381 | remove_proc_entry("affinity_hint", desc->dir); | 455 | remove_proc_entry("affinity_hint", desc->dir); |
| 382 | remove_proc_entry("smp_affinity_list", desc->dir); | 456 | remove_proc_entry("smp_affinity_list", desc->dir); |
| 383 | remove_proc_entry("node", desc->dir); | 457 | remove_proc_entry("node", desc->dir); |
| 458 | # ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK | ||
| 459 | remove_proc_entry("effective_affinity", desc->dir); | ||
| 460 | remove_proc_entry("effective_affinity_list", desc->dir); | ||
| 461 | # endif | ||
| 384 | #endif | 462 | #endif |
| 385 | remove_proc_entry("spurious", desc->dir); | 463 | remove_proc_entry("spurious", desc->dir); |
| 386 | 464 | ||
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c new file mode 100644 index 000000000000..c8c1d073fbf1 --- /dev/null +++ b/kernel/irq/timings.c | |||
| @@ -0,0 +1,369 @@ | |||
| 1 | /* | ||
| 2 | * linux/kernel/irq/timings.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org> | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License version 2 as | ||
| 8 | * published by the Free Software Foundation. | ||
| 9 | * | ||
| 10 | */ | ||
| 11 | #include <linux/kernel.h> | ||
| 12 | #include <linux/percpu.h> | ||
| 13 | #include <linux/slab.h> | ||
| 14 | #include <linux/static_key.h> | ||
| 15 | #include <linux/interrupt.h> | ||
| 16 | #include <linux/idr.h> | ||
| 17 | #include <linux/irq.h> | ||
| 18 | #include <linux/math64.h> | ||
| 19 | |||
| 20 | #include <trace/events/irq.h> | ||
| 21 | |||
| 22 | #include "internals.h" | ||
| 23 | |||
| 24 | DEFINE_STATIC_KEY_FALSE(irq_timing_enabled); | ||
| 25 | |||
| 26 | DEFINE_PER_CPU(struct irq_timings, irq_timings); | ||
| 27 | |||
| 28 | struct irqt_stat { | ||
| 29 | u64 next_evt; | ||
| 30 | u64 last_ts; | ||
| 31 | u64 variance; | ||
| 32 | u32 avg; | ||
| 33 | u32 nr_samples; | ||
| 34 | int anomalies; | ||
| 35 | int valid; | ||
| 36 | }; | ||
| 37 | |||
| 38 | static DEFINE_IDR(irqt_stats); | ||
| 39 | |||
| 40 | void irq_timings_enable(void) | ||
| 41 | { | ||
| 42 | static_branch_enable(&irq_timing_enabled); | ||
| 43 | } | ||
| 44 | |||
| 45 | void irq_timings_disable(void) | ||
| 46 | { | ||
| 47 | static_branch_disable(&irq_timing_enabled); | ||
| 48 | } | ||
| 49 | |||
| 50 | /** | ||
| 51 | * irqs_update - update the irq timing statistics with a new timestamp | ||
| 52 | * | ||
| 53 | * @irqs: an irqt_stat struct pointer | ||
| 54 | * @ts: the new timestamp | ||
| 55 | * | ||
| 56 | * The statistics are computed online, in other words, the code is | ||
| 57 | * designed to compute the statistics on a stream of values rather | ||
| 58 | * than doing multiple passes on the values to compute the average, | ||
| 59 | * then the variance. The integer division introduces a loss of | ||
| 60 | * precision but with an acceptable error margin regarding the results | ||
| 61 | * we would have with the double floating precision: we are dealing | ||
| 62 | * with nanosec, so big numbers, consequently the mantisse is | ||
| 63 | * negligeable, especially when converting the time in usec | ||
| 64 | * afterwards. | ||
| 65 | * | ||
| 66 | * The computation happens at idle time. When the CPU is not idle, the | ||
| 67 | * interrupts' timestamps are stored in the circular buffer, when the | ||
| 68 | * CPU goes idle and this routine is called, all the buffer's values | ||
| 69 | * are injected in the statistical model continuying to extend the | ||
| 70 | * statistics from the previous busy-idle cycle. | ||
| 71 | * | ||
| 72 | * The observations showed a device will trigger a burst of periodic | ||
| 73 | * interrupts followed by one or two peaks of longer time, for | ||
| 74 | * instance when a SD card device flushes its cache, then the periodic | ||
| 75 | * intervals occur again. A one second inactivity period resets the | ||
| 76 | * stats, that gives us the certitude the statistical values won't | ||
| 77 | * exceed 1x10^9, thus the computation won't overflow. | ||
| 78 | * | ||
| 79 | * Basically, the purpose of the algorithm is to watch the periodic | ||
| 80 | * interrupts and eliminate the peaks. | ||
| 81 | * | ||
| 82 | * An interrupt is considered periodically stable if the interval of | ||
| 83 | * its occurences follow the normal distribution, thus the values | ||
| 84 | * comply with: | ||
| 85 | * | ||
| 86 | * avg - 3 x stddev < value < avg + 3 x stddev | ||
| 87 | * | ||
| 88 | * Which can be simplified to: | ||
| 89 | * | ||
| 90 | * -3 x stddev < value - avg < 3 x stddev | ||
| 91 | * | ||
| 92 | * abs(value - avg) < 3 x stddev | ||
| 93 | * | ||
| 94 | * In order to save a costly square root computation, we use the | ||
| 95 | * variance. For the record, stddev = sqrt(variance). The equation | ||
| 96 | * above becomes: | ||
| 97 | * | ||
| 98 | * abs(value - avg) < 3 x sqrt(variance) | ||
| 99 | * | ||
| 100 | * And finally we square it: | ||
| 101 | * | ||
| 102 | * (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2 | ||
| 103 | * | ||
| 104 | * (value - avg) x (value - avg) < 9 x variance | ||
| 105 | * | ||
| 106 | * Statistically speaking, any values out of this interval is | ||
| 107 | * considered as an anomaly and is discarded. However, a normal | ||
| 108 | * distribution appears when the number of samples is 30 (it is the | ||
| 109 | * rule of thumb in statistics, cf. "30 samples" on Internet). When | ||
| 110 | * there are three consecutive anomalies, the statistics are resetted. | ||
| 111 | * | ||
| 112 | */ | ||
| 113 | static void irqs_update(struct irqt_stat *irqs, u64 ts) | ||
| 114 | { | ||
| 115 | u64 old_ts = irqs->last_ts; | ||
| 116 | u64 variance = 0; | ||
| 117 | u64 interval; | ||
| 118 | s64 diff; | ||
| 119 | |||
| 120 | /* | ||
| 121 | * The timestamps are absolute time values, we need to compute | ||
| 122 | * the timing interval between two interrupts. | ||
| 123 | */ | ||
| 124 | irqs->last_ts = ts; | ||
| 125 | |||
| 126 | /* | ||
| 127 | * The interval type is u64 in order to deal with the same | ||
| 128 | * type in our computation, that prevent mindfuck issues with | ||
| 129 | * overflow, sign and division. | ||
| 130 | */ | ||
| 131 | interval = ts - old_ts; | ||
| 132 | |||
| 133 | /* | ||
| 134 | * The interrupt triggered more than one second apart, that | ||
| 135 | * ends the sequence as predictible for our purpose. In this | ||
| 136 | * case, assume we have the beginning of a sequence and the | ||
| 137 | * timestamp is the first value. As it is impossible to | ||
| 138 | * predict anything at this point, return. | ||
| 139 | * | ||
| 140 | * Note the first timestamp of the sequence will always fall | ||
| 141 | * in this test because the old_ts is zero. That is what we | ||
| 142 | * want as we need another timestamp to compute an interval. | ||
| 143 | */ | ||
| 144 | if (interval >= NSEC_PER_SEC) { | ||
| 145 | memset(irqs, 0, sizeof(*irqs)); | ||
| 146 | irqs->last_ts = ts; | ||
| 147 | return; | ||
| 148 | } | ||
| 149 | |||
| 150 | /* | ||
| 151 | * Pre-compute the delta with the average as the result is | ||
| 152 | * used several times in this function. | ||
| 153 | */ | ||
| 154 | diff = interval - irqs->avg; | ||
| 155 | |||
| 156 | /* | ||
| 157 | * Increment the number of samples. | ||
| 158 | */ | ||
| 159 | irqs->nr_samples++; | ||
| 160 | |||
| 161 | /* | ||
| 162 | * Online variance divided by the number of elements if there | ||
| 163 | * is more than one sample. Normally the formula is division | ||
| 164 | * by nr_samples - 1 but we assume the number of element will be | ||
| 165 | * more than 32 and dividing by 32 instead of 31 is enough | ||
| 166 | * precise. | ||
| 167 | */ | ||
| 168 | if (likely(irqs->nr_samples > 1)) | ||
| 169 | variance = irqs->variance >> IRQ_TIMINGS_SHIFT; | ||
| 170 | |||
| 171 | /* | ||
| 172 | * The rule of thumb in statistics for the normal distribution | ||
| 173 | * is having at least 30 samples in order to have the model to | ||
| 174 | * apply. Values outside the interval are considered as an | ||
| 175 | * anomaly. | ||
| 176 | */ | ||
| 177 | if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) { | ||
| 178 | /* | ||
| 179 | * After three consecutive anomalies, we reset the | ||
| 180 | * stats as it is no longer stable enough. | ||
| 181 | */ | ||
| 182 | if (irqs->anomalies++ >= 3) { | ||
| 183 | memset(irqs, 0, sizeof(*irqs)); | ||
| 184 | irqs->last_ts = ts; | ||
| 185 | return; | ||
| 186 | } | ||
| 187 | } else { | ||
| 188 | /* | ||
| 189 | * The anomalies must be consecutives, so at this | ||
| 190 | * point, we reset the anomalies counter. | ||
| 191 | */ | ||
| 192 | irqs->anomalies = 0; | ||
| 193 | } | ||
| 194 | |||
| 195 | /* | ||
| 196 | * The interrupt is considered stable enough to try to predict | ||
| 197 | * the next event on it. | ||
| 198 | */ | ||
| 199 | irqs->valid = 1; | ||
| 200 | |||
| 201 | /* | ||
| 202 | * Online average algorithm: | ||
| 203 | * | ||
| 204 | * new_average = average + ((value - average) / count) | ||
| 205 | * | ||
| 206 | * The variance computation depends on the new average | ||
| 207 | * to be computed here first. | ||
| 208 | * | ||
| 209 | */ | ||
| 210 | irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT); | ||
| 211 | |||
| 212 | /* | ||
| 213 | * Online variance algorithm: | ||
| 214 | * | ||
| 215 | * new_variance = variance + (value - average) x (value - new_average) | ||
| 216 | * | ||
| 217 | * Warning: irqs->avg is updated with the line above, hence | ||
| 218 | * 'interval - irqs->avg' is no longer equal to 'diff' | ||
| 219 | */ | ||
| 220 | irqs->variance = irqs->variance + (diff * (interval - irqs->avg)); | ||
| 221 | |||
| 222 | /* | ||
| 223 | * Update the next event | ||
| 224 | */ | ||
| 225 | irqs->next_evt = ts + irqs->avg; | ||
| 226 | } | ||
| 227 | |||
| 228 | /** | ||
| 229 | * irq_timings_next_event - Return when the next event is supposed to arrive | ||
| 230 | * | ||
| 231 | * During the last busy cycle, the number of interrupts is incremented | ||
| 232 | * and stored in the irq_timings structure. This information is | ||
| 233 | * necessary to: | ||
| 234 | * | ||
| 235 | * - know if the index in the table wrapped up: | ||
| 236 | * | ||
| 237 | * If more than the array size interrupts happened during the | ||
| 238 | * last busy/idle cycle, the index wrapped up and we have to | ||
| 239 | * begin with the next element in the array which is the last one | ||
| 240 | * in the sequence, otherwise it is a the index 0. | ||
| 241 | * | ||
| 242 | * - have an indication of the interrupts activity on this CPU | ||
| 243 | * (eg. irq/sec) | ||
| 244 | * | ||
| 245 | * The values are 'consumed' after inserting in the statistical model, | ||
| 246 | * thus the count is reinitialized. | ||
| 247 | * | ||
| 248 | * The array of values **must** be browsed in the time direction, the | ||
| 249 | * timestamp must increase between an element and the next one. | ||
| 250 | * | ||
| 251 | * Returns a nanosec time based estimation of the earliest interrupt, | ||
| 252 | * U64_MAX otherwise. | ||
| 253 | */ | ||
| 254 | u64 irq_timings_next_event(u64 now) | ||
| 255 | { | ||
| 256 | struct irq_timings *irqts = this_cpu_ptr(&irq_timings); | ||
| 257 | struct irqt_stat *irqs; | ||
| 258 | struct irqt_stat __percpu *s; | ||
| 259 | u64 ts, next_evt = U64_MAX; | ||
| 260 | int i, irq = 0; | ||
| 261 | |||
| 262 | /* | ||
| 263 | * This function must be called with the local irq disabled in | ||
| 264 | * order to prevent the timings circular buffer to be updated | ||
| 265 | * while we are reading it. | ||
| 266 | */ | ||
| 267 | WARN_ON_ONCE(!irqs_disabled()); | ||
| 268 | |||
| 269 | /* | ||
| 270 | * Number of elements in the circular buffer: If it happens it | ||
| 271 | * was flushed before, then the number of elements could be | ||
| 272 | * smaller than IRQ_TIMINGS_SIZE, so the count is used, | ||
| 273 | * otherwise the array size is used as we wrapped. The index | ||
| 274 | * begins from zero when we did not wrap. That could be done | ||
| 275 | * in a nicer way with the proper circular array structure | ||
| 276 | * type but with the cost of extra computation in the | ||
| 277 | * interrupt handler hot path. We choose efficiency. | ||
| 278 | * | ||
| 279 | * Inject measured irq/timestamp to the statistical model | ||
| 280 | * while decrementing the counter because we consume the data | ||
| 281 | * from our circular buffer. | ||
| 282 | */ | ||
| 283 | for (i = irqts->count & IRQ_TIMINGS_MASK, | ||
| 284 | irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); | ||
| 285 | irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { | ||
| 286 | |||
| 287 | irq = irq_timing_decode(irqts->values[i], &ts); | ||
| 288 | |||
| 289 | s = idr_find(&irqt_stats, irq); | ||
| 290 | if (s) { | ||
| 291 | irqs = this_cpu_ptr(s); | ||
| 292 | irqs_update(irqs, ts); | ||
| 293 | } | ||
| 294 | } | ||
| 295 | |||
| 296 | /* | ||
| 297 | * Look in the list of interrupts' statistics, the earliest | ||
| 298 | * next event. | ||
| 299 | */ | ||
| 300 | idr_for_each_entry(&irqt_stats, s, i) { | ||
| 301 | |||
| 302 | irqs = this_cpu_ptr(s); | ||
| 303 | |||
| 304 | if (!irqs->valid) | ||
| 305 | continue; | ||
| 306 | |||
| 307 | if (irqs->next_evt <= now) { | ||
| 308 | irq = i; | ||
| 309 | next_evt = now; | ||
| 310 | |||
| 311 | /* | ||
| 312 | * This interrupt mustn't use in the future | ||
| 313 | * until new events occur and update the | ||
| 314 | * statistics. | ||
| 315 | */ | ||
| 316 | irqs->valid = 0; | ||
| 317 | break; | ||
| 318 | } | ||
| 319 | |||
| 320 | if (irqs->next_evt < next_evt) { | ||
| 321 | irq = i; | ||
| 322 | next_evt = irqs->next_evt; | ||
| 323 | } | ||
| 324 | } | ||
| 325 | |||
| 326 | return next_evt; | ||
| 327 | } | ||
| 328 | |||
| 329 | void irq_timings_free(int irq) | ||
| 330 | { | ||
| 331 | struct irqt_stat __percpu *s; | ||
| 332 | |||
| 333 | s = idr_find(&irqt_stats, irq); | ||
| 334 | if (s) { | ||
| 335 | free_percpu(s); | ||
| 336 | idr_remove(&irqt_stats, irq); | ||
| 337 | } | ||
| 338 | } | ||
| 339 | |||
| 340 | int irq_timings_alloc(int irq) | ||
| 341 | { | ||
| 342 | struct irqt_stat __percpu *s; | ||
| 343 | int id; | ||
| 344 | |||
| 345 | /* | ||
| 346 | * Some platforms can have the same private interrupt per cpu, | ||
| 347 | * so this function may be be called several times with the | ||
| 348 | * same interrupt number. Just bail out in case the per cpu | ||
| 349 | * stat structure is already allocated. | ||
| 350 | */ | ||
| 351 | s = idr_find(&irqt_stats, irq); | ||
| 352 | if (s) | ||
| 353 | return 0; | ||
| 354 | |||
| 355 | s = alloc_percpu(*s); | ||
| 356 | if (!s) | ||
| 357 | return -ENOMEM; | ||
| 358 | |||
| 359 | idr_preload(GFP_KERNEL); | ||
| 360 | id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT); | ||
| 361 | idr_preload_end(); | ||
| 362 | |||
| 363 | if (id < 0) { | ||
| 364 | free_percpu(s); | ||
| 365 | return id; | ||
| 366 | } | ||
| 367 | |||
| 368 | return 0; | ||
| 369 | } | ||
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 6c9cb208ac48..d11c506a6ac3 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/static_key.h> | 15 | #include <linux/static_key.h> |
| 16 | #include <linux/jump_label_ratelimit.h> | 16 | #include <linux/jump_label_ratelimit.h> |
| 17 | #include <linux/bug.h> | 17 | #include <linux/bug.h> |
| 18 | #include <linux/cpu.h> | ||
| 18 | 19 | ||
| 19 | #ifdef HAVE_JUMP_LABEL | 20 | #ifdef HAVE_JUMP_LABEL |
| 20 | 21 | ||
| @@ -124,6 +125,7 @@ void static_key_slow_inc(struct static_key *key) | |||
| 124 | return; | 125 | return; |
| 125 | } | 126 | } |
| 126 | 127 | ||
| 128 | cpus_read_lock(); | ||
| 127 | jump_label_lock(); | 129 | jump_label_lock(); |
| 128 | if (atomic_read(&key->enabled) == 0) { | 130 | if (atomic_read(&key->enabled) == 0) { |
| 129 | atomic_set(&key->enabled, -1); | 131 | atomic_set(&key->enabled, -1); |
| @@ -133,12 +135,14 @@ void static_key_slow_inc(struct static_key *key) | |||
| 133 | atomic_inc(&key->enabled); | 135 | atomic_inc(&key->enabled); |
| 134 | } | 136 | } |
| 135 | jump_label_unlock(); | 137 | jump_label_unlock(); |
| 138 | cpus_read_unlock(); | ||
| 136 | } | 139 | } |
| 137 | EXPORT_SYMBOL_GPL(static_key_slow_inc); | 140 | EXPORT_SYMBOL_GPL(static_key_slow_inc); |
| 138 | 141 | ||
| 139 | static void __static_key_slow_dec(struct static_key *key, | 142 | static void __static_key_slow_dec(struct static_key *key, |
| 140 | unsigned long rate_limit, struct delayed_work *work) | 143 | unsigned long rate_limit, struct delayed_work *work) |
| 141 | { | 144 | { |
| 145 | cpus_read_lock(); | ||
| 142 | /* | 146 | /* |
| 143 | * The negative count check is valid even when a negative | 147 | * The negative count check is valid even when a negative |
| 144 | * key->enabled is in use by static_key_slow_inc(); a | 148 | * key->enabled is in use by static_key_slow_inc(); a |
| @@ -149,6 +153,7 @@ static void __static_key_slow_dec(struct static_key *key, | |||
| 149 | if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { | 153 | if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { |
| 150 | WARN(atomic_read(&key->enabled) < 0, | 154 | WARN(atomic_read(&key->enabled) < 0, |
| 151 | "jump label: negative count!\n"); | 155 | "jump label: negative count!\n"); |
| 156 | cpus_read_unlock(); | ||
| 152 | return; | 157 | return; |
| 153 | } | 158 | } |
| 154 | 159 | ||
| @@ -159,6 +164,7 @@ static void __static_key_slow_dec(struct static_key *key, | |||
| 159 | jump_label_update(key); | 164 | jump_label_update(key); |
| 160 | } | 165 | } |
| 161 | jump_label_unlock(); | 166 | jump_label_unlock(); |
| 167 | cpus_read_unlock(); | ||
| 162 | } | 168 | } |
| 163 | 169 | ||
| 164 | static void jump_label_update_timeout(struct work_struct *work) | 170 | static void jump_label_update_timeout(struct work_struct *work) |
| @@ -334,6 +340,7 @@ void __init jump_label_init(void) | |||
| 334 | if (static_key_initialized) | 340 | if (static_key_initialized) |
| 335 | return; | 341 | return; |
| 336 | 342 | ||
| 343 | cpus_read_lock(); | ||
| 337 | jump_label_lock(); | 344 | jump_label_lock(); |
| 338 | jump_label_sort_entries(iter_start, iter_stop); | 345 | jump_label_sort_entries(iter_start, iter_stop); |
| 339 | 346 | ||
| @@ -353,6 +360,7 @@ void __init jump_label_init(void) | |||
| 353 | } | 360 | } |
| 354 | static_key_initialized = true; | 361 | static_key_initialized = true; |
| 355 | jump_label_unlock(); | 362 | jump_label_unlock(); |
| 363 | cpus_read_unlock(); | ||
| 356 | } | 364 | } |
| 357 | 365 | ||
| 358 | #ifdef CONFIG_MODULES | 366 | #ifdef CONFIG_MODULES |
| @@ -590,28 +598,28 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, | |||
| 590 | struct module *mod = data; | 598 | struct module *mod = data; |
| 591 | int ret = 0; | 599 | int ret = 0; |
| 592 | 600 | ||
| 601 | cpus_read_lock(); | ||
| 602 | jump_label_lock(); | ||
| 603 | |||
| 593 | switch (val) { | 604 | switch (val) { |
| 594 | case MODULE_STATE_COMING: | 605 | case MODULE_STATE_COMING: |
| 595 | jump_label_lock(); | ||
| 596 | ret = jump_label_add_module(mod); | 606 | ret = jump_label_add_module(mod); |
| 597 | if (ret) { | 607 | if (ret) { |
| 598 | WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n"); | 608 | WARN(1, "Failed to allocatote memory: jump_label may not work properly.\n"); |
| 599 | jump_label_del_module(mod); | 609 | jump_label_del_module(mod); |
| 600 | } | 610 | } |
| 601 | jump_label_unlock(); | ||
| 602 | break; | 611 | break; |
| 603 | case MODULE_STATE_GOING: | 612 | case MODULE_STATE_GOING: |
| 604 | jump_label_lock(); | ||
| 605 | jump_label_del_module(mod); | 613 | jump_label_del_module(mod); |
| 606 | jump_label_unlock(); | ||
| 607 | break; | 614 | break; |
| 608 | case MODULE_STATE_LIVE: | 615 | case MODULE_STATE_LIVE: |
| 609 | jump_label_lock(); | ||
| 610 | jump_label_invalidate_module_init(mod); | 616 | jump_label_invalidate_module_init(mod); |
| 611 | jump_label_unlock(); | ||
| 612 | break; | 617 | break; |
| 613 | } | 618 | } |
| 614 | 619 | ||
| 620 | jump_label_unlock(); | ||
| 621 | cpus_read_unlock(); | ||
| 622 | |||
| 615 | return notifier_from_errno(ret); | 623 | return notifier_from_errno(ret); |
| 616 | } | 624 | } |
| 617 | 625 | ||
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6a3b249a2ae1..127e7cfafa55 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
| @@ -28,12 +28,6 @@ | |||
| 28 | 28 | ||
| 29 | #include <asm/sections.h> | 29 | #include <asm/sections.h> |
| 30 | 30 | ||
| 31 | #ifdef CONFIG_KALLSYMS_ALL | ||
| 32 | #define all_var 1 | ||
| 33 | #else | ||
| 34 | #define all_var 0 | ||
| 35 | #endif | ||
| 36 | |||
| 37 | /* | 31 | /* |
| 38 | * These will be re-linked against their real values | 32 | * These will be re-linked against their real values |
| 39 | * during the second link stage. | 33 | * during the second link stage. |
| @@ -82,7 +76,7 @@ static inline int is_kernel(unsigned long addr) | |||
| 82 | 76 | ||
| 83 | static int is_ksym_addr(unsigned long addr) | 77 | static int is_ksym_addr(unsigned long addr) |
| 84 | { | 78 | { |
| 85 | if (all_var) | 79 | if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) |
| 86 | return is_kernel(addr); | 80 | return is_kernel(addr); |
| 87 | 81 | ||
| 88 | return is_kernel_text(addr) || is_kernel_inittext(addr); | 82 | return is_kernel_text(addr) || is_kernel_inittext(addr); |
| @@ -280,7 +274,7 @@ static unsigned long get_symbol_pos(unsigned long addr, | |||
| 280 | if (!symbol_end) { | 274 | if (!symbol_end) { |
| 281 | if (is_kernel_inittext(addr)) | 275 | if (is_kernel_inittext(addr)) |
| 282 | symbol_end = (unsigned long)_einittext; | 276 | symbol_end = (unsigned long)_einittext; |
| 283 | else if (all_var) | 277 | else if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) |
| 284 | symbol_end = (unsigned long)_end; | 278 | symbol_end = (unsigned long)_end; |
| 285 | else | 279 | else |
| 286 | symbol_end = (unsigned long)_etext; | 280 | symbol_end = (unsigned long)_etext; |
diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 3a47fa998fe0..ea34ed8bb952 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c | |||
| @@ -11,6 +11,10 @@ | |||
| 11 | #include <linux/bug.h> | 11 | #include <linux/bug.h> |
| 12 | #include <linux/err.h> | 12 | #include <linux/err.h> |
| 13 | #include <linux/kcmp.h> | 13 | #include <linux/kcmp.h> |
| 14 | #include <linux/capability.h> | ||
| 15 | #include <linux/list.h> | ||
| 16 | #include <linux/eventpoll.h> | ||
| 17 | #include <linux/file.h> | ||
| 14 | 18 | ||
| 15 | #include <asm/unistd.h> | 19 | #include <asm/unistd.h> |
| 16 | 20 | ||
| @@ -94,6 +98,56 @@ static int kcmp_lock(struct mutex *m1, struct mutex *m2) | |||
| 94 | return err; | 98 | return err; |
| 95 | } | 99 | } |
| 96 | 100 | ||
| 101 | #ifdef CONFIG_EPOLL | ||
| 102 | static int kcmp_epoll_target(struct task_struct *task1, | ||
| 103 | struct task_struct *task2, | ||
| 104 | unsigned long idx1, | ||
| 105 | struct kcmp_epoll_slot __user *uslot) | ||
| 106 | { | ||
| 107 | struct file *filp, *filp_epoll, *filp_tgt; | ||
| 108 | struct kcmp_epoll_slot slot; | ||
| 109 | struct files_struct *files; | ||
| 110 | |||
| 111 | if (copy_from_user(&slot, uslot, sizeof(slot))) | ||
| 112 | return -EFAULT; | ||
| 113 | |||
| 114 | filp = get_file_raw_ptr(task1, idx1); | ||
| 115 | if (!filp) | ||
| 116 | return -EBADF; | ||
| 117 | |||
| 118 | files = get_files_struct(task2); | ||
| 119 | if (!files) | ||
| 120 | return -EBADF; | ||
| 121 | |||
| 122 | spin_lock(&files->file_lock); | ||
| 123 | filp_epoll = fcheck_files(files, slot.efd); | ||
| 124 | if (filp_epoll) | ||
| 125 | get_file(filp_epoll); | ||
| 126 | else | ||
| 127 | filp_tgt = ERR_PTR(-EBADF); | ||
| 128 | spin_unlock(&files->file_lock); | ||
| 129 | put_files_struct(files); | ||
| 130 | |||
| 131 | if (filp_epoll) { | ||
| 132 | filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); | ||
| 133 | fput(filp_epoll); | ||
| 134 | } else | ||
| 135 | |||
| 136 | if (IS_ERR(filp_tgt)) | ||
| 137 | return PTR_ERR(filp_tgt); | ||
| 138 | |||
| 139 | return kcmp_ptr(filp, filp_tgt, KCMP_FILE); | ||
| 140 | } | ||
| 141 | #else | ||
| 142 | static int kcmp_epoll_target(struct task_struct *task1, | ||
| 143 | struct task_struct *task2, | ||
| 144 | unsigned long idx1, | ||
| 145 | struct kcmp_epoll_slot __user *uslot) | ||
| 146 | { | ||
| 147 | return -EOPNOTSUPP; | ||
| 148 | } | ||
| 149 | #endif | ||
| 150 | |||
| 97 | SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, | 151 | SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, |
| 98 | unsigned long, idx1, unsigned long, idx2) | 152 | unsigned long, idx1, unsigned long, idx2) |
| 99 | { | 153 | { |
| @@ -165,6 +219,9 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, | |||
| 165 | ret = -EOPNOTSUPP; | 219 | ret = -EOPNOTSUPP; |
| 166 | #endif | 220 | #endif |
| 167 | break; | 221 | break; |
| 222 | case KCMP_EPOLL_TFD: | ||
| 223 | ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2); | ||
| 224 | break; | ||
| 168 | default: | 225 | default: |
| 169 | ret = -EINVAL; | 226 | ret = -EINVAL; |
| 170 | break; | 227 | break; |
diff --git a/kernel/kexec.c b/kernel/kexec.c index 980936a90ee6..e62ec4dc6620 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -144,6 +144,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, | |||
| 144 | if (ret) | 144 | if (ret) |
| 145 | goto out; | 145 | goto out; |
| 146 | 146 | ||
| 147 | /* | ||
| 148 | * Some architecture(like S390) may touch the crash memory before | ||
| 149 | * machine_kexec_prepare(), we must copy vmcoreinfo data after it. | ||
| 150 | */ | ||
| 151 | ret = kimage_crash_copy_vmcoreinfo(image); | ||
| 152 | if (ret) | ||
| 153 | goto out; | ||
| 154 | |||
| 147 | for (i = 0; i < nr_segments; i++) { | 155 | for (i = 0; i < nr_segments; i++) { |
| 148 | ret = kimage_load_segment(image, &image->segment[i]); | 156 | ret = kimage_load_segment(image, &image->segment[i]); |
| 149 | if (ret) | 157 | if (ret) |
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index ae1a3ba24df5..1ae7c41c33c1 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c | |||
| @@ -38,6 +38,7 @@ | |||
| 38 | #include <linux/syscore_ops.h> | 38 | #include <linux/syscore_ops.h> |
| 39 | #include <linux/compiler.h> | 39 | #include <linux/compiler.h> |
| 40 | #include <linux/hugetlb.h> | 40 | #include <linux/hugetlb.h> |
| 41 | #include <linux/frame.h> | ||
| 41 | 42 | ||
| 42 | #include <asm/page.h> | 43 | #include <asm/page.h> |
| 43 | #include <asm/sections.h> | 44 | #include <asm/sections.h> |
| @@ -481,6 +482,40 @@ struct page *kimage_alloc_control_pages(struct kimage *image, | |||
| 481 | return pages; | 482 | return pages; |
| 482 | } | 483 | } |
| 483 | 484 | ||
| 485 | int kimage_crash_copy_vmcoreinfo(struct kimage *image) | ||
| 486 | { | ||
| 487 | struct page *vmcoreinfo_page; | ||
| 488 | void *safecopy; | ||
| 489 | |||
| 490 | if (image->type != KEXEC_TYPE_CRASH) | ||
| 491 | return 0; | ||
| 492 | |||
| 493 | /* | ||
| 494 | * For kdump, allocate one vmcoreinfo safe copy from the | ||
| 495 | * crash memory. as we have arch_kexec_protect_crashkres() | ||
| 496 | * after kexec syscall, we naturally protect it from write | ||
| 497 | * (even read) access under kernel direct mapping. But on | ||
| 498 | * the other hand, we still need to operate it when crash | ||
| 499 | * happens to generate vmcoreinfo note, hereby we rely on | ||
| 500 | * vmap for this purpose. | ||
| 501 | */ | ||
| 502 | vmcoreinfo_page = kimage_alloc_control_pages(image, 0); | ||
| 503 | if (!vmcoreinfo_page) { | ||
| 504 | pr_warn("Could not allocate vmcoreinfo buffer\n"); | ||
| 505 | return -ENOMEM; | ||
| 506 | } | ||
| 507 | safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); | ||
| 508 | if (!safecopy) { | ||
| 509 | pr_warn("Could not vmap vmcoreinfo buffer\n"); | ||
| 510 | return -ENOMEM; | ||
| 511 | } | ||
| 512 | |||
| 513 | image->vmcoreinfo_data_copy = safecopy; | ||
| 514 | crash_update_vmcoreinfo_safecopy(safecopy); | ||
| 515 | |||
| 516 | return 0; | ||
| 517 | } | ||
| 518 | |||
| 484 | static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) | 519 | static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) |
| 485 | { | 520 | { |
| 486 | if (*image->entry != 0) | 521 | if (*image->entry != 0) |
| @@ -568,6 +603,11 @@ void kimage_free(struct kimage *image) | |||
| 568 | if (!image) | 603 | if (!image) |
| 569 | return; | 604 | return; |
| 570 | 605 | ||
| 606 | if (image->vmcoreinfo_data_copy) { | ||
| 607 | crash_update_vmcoreinfo_safecopy(NULL); | ||
| 608 | vunmap(image->vmcoreinfo_data_copy); | ||
| 609 | } | ||
| 610 | |||
| 571 | kimage_free_extra_pages(image); | 611 | kimage_free_extra_pages(image); |
| 572 | for_each_kimage_entry(image, ptr, entry) { | 612 | for_each_kimage_entry(image, ptr, entry) { |
| 573 | if (entry & IND_INDIRECTION) { | 613 | if (entry & IND_INDIRECTION) { |
| @@ -874,7 +914,7 @@ int kexec_load_disabled; | |||
| 874 | * only when panic_cpu holds the current CPU number; this is the only CPU | 914 | * only when panic_cpu holds the current CPU number; this is the only CPU |
| 875 | * which processes crash_kexec routines. | 915 | * which processes crash_kexec routines. |
| 876 | */ | 916 | */ |
| 877 | void __crash_kexec(struct pt_regs *regs) | 917 | void __noclone __crash_kexec(struct pt_regs *regs) |
| 878 | { | 918 | { |
| 879 | /* Take the kexec_mutex here to prevent sys_kexec_load | 919 | /* Take the kexec_mutex here to prevent sys_kexec_load |
| 880 | * running on one cpu from replacing the crash kernel | 920 | * running on one cpu from replacing the crash kernel |
| @@ -896,6 +936,7 @@ void __crash_kexec(struct pt_regs *regs) | |||
| 896 | mutex_unlock(&kexec_mutex); | 936 | mutex_unlock(&kexec_mutex); |
| 897 | } | 937 | } |
| 898 | } | 938 | } |
| 939 | STACK_FRAME_NON_STANDARD(__crash_kexec); | ||
| 899 | 940 | ||
| 900 | void crash_kexec(struct pt_regs *regs) | 941 | void crash_kexec(struct pt_regs *regs) |
| 901 | { | 942 | { |
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b118735fea9d..9f48f4412297 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c | |||
| @@ -26,13 +26,6 @@ | |||
| 26 | #include <linux/vmalloc.h> | 26 | #include <linux/vmalloc.h> |
| 27 | #include "kexec_internal.h" | 27 | #include "kexec_internal.h" |
| 28 | 28 | ||
| 29 | /* | ||
| 30 | * Declare these symbols weak so that if architecture provides a purgatory, | ||
| 31 | * these will be overridden. | ||
| 32 | */ | ||
| 33 | char __weak kexec_purgatory[0]; | ||
| 34 | size_t __weak kexec_purgatory_size = 0; | ||
| 35 | |||
| 36 | static int kexec_calculate_store_digests(struct kimage *image); | 29 | static int kexec_calculate_store_digests(struct kimage *image); |
| 37 | 30 | ||
| 38 | /* Architectures can provide this probe function */ | 31 | /* Architectures can provide this probe function */ |
| @@ -162,16 +155,10 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, | |||
| 162 | } | 155 | } |
| 163 | 156 | ||
| 164 | if (cmdline_len) { | 157 | if (cmdline_len) { |
| 165 | image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); | 158 | image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); |
| 166 | if (!image->cmdline_buf) { | 159 | if (IS_ERR(image->cmdline_buf)) { |
| 167 | ret = -ENOMEM; | 160 | ret = PTR_ERR(image->cmdline_buf); |
| 168 | goto out; | 161 | image->cmdline_buf = NULL; |
| 169 | } | ||
| 170 | |||
| 171 | ret = copy_from_user(image->cmdline_buf, cmdline_ptr, | ||
| 172 | cmdline_len); | ||
| 173 | if (ret) { | ||
| 174 | ret = -EFAULT; | ||
| 175 | goto out; | 162 | goto out; |
| 176 | } | 163 | } |
| 177 | 164 | ||
| @@ -304,6 +291,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, | |||
| 304 | if (ret) | 291 | if (ret) |
| 305 | goto out; | 292 | goto out; |
| 306 | 293 | ||
| 294 | /* | ||
| 295 | * Some architecture(like S390) may touch the crash memory before | ||
| 296 | * machine_kexec_prepare(), we must copy vmcoreinfo data after it. | ||
| 297 | */ | ||
| 298 | ret = kimage_crash_copy_vmcoreinfo(image); | ||
| 299 | if (ret) | ||
| 300 | goto out; | ||
| 301 | |||
| 307 | ret = kexec_calculate_store_digests(image); | 302 | ret = kexec_calculate_store_digests(image); |
| 308 | if (ret) | 303 | if (ret) |
| 309 | goto out; | 304 | goto out; |
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 799a8a452187..50dfcb039a41 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h | |||
| @@ -17,6 +17,8 @@ extern struct mutex kexec_mutex; | |||
| 17 | #ifdef CONFIG_KEXEC_FILE | 17 | #ifdef CONFIG_KEXEC_FILE |
| 18 | #include <linux/purgatory.h> | 18 | #include <linux/purgatory.h> |
| 19 | void kimage_file_post_load_cleanup(struct kimage *image); | 19 | void kimage_file_post_load_cleanup(struct kimage *image); |
| 20 | extern char kexec_purgatory[]; | ||
| 21 | extern size_t kexec_purgatory_size; | ||
| 20 | #else /* CONFIG_KEXEC_FILE */ | 22 | #else /* CONFIG_KEXEC_FILE */ |
| 21 | static inline void kimage_file_post_load_cleanup(struct kimage *image) { } | 23 | static inline void kimage_file_post_load_cleanup(struct kimage *image) { } |
| 22 | #endif /* CONFIG_KEXEC_FILE */ | 24 | #endif /* CONFIG_KEXEC_FILE */ |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 563f97e2be36..6d016c5d97c8 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
| @@ -45,8 +45,6 @@ | |||
| 45 | 45 | ||
| 46 | #include <trace/events/module.h> | 46 | #include <trace/events/module.h> |
| 47 | 47 | ||
| 48 | extern int max_threads; | ||
| 49 | |||
| 50 | #define CAP_BSET (void *)1 | 48 | #define CAP_BSET (void *)1 |
| 51 | #define CAP_PI (void *)2 | 49 | #define CAP_PI (void *)2 |
| 52 | 50 | ||
| @@ -56,6 +54,21 @@ static DEFINE_SPINLOCK(umh_sysctl_lock); | |||
| 56 | static DECLARE_RWSEM(umhelper_sem); | 54 | static DECLARE_RWSEM(umhelper_sem); |
| 57 | 55 | ||
| 58 | #ifdef CONFIG_MODULES | 56 | #ifdef CONFIG_MODULES |
| 57 | /* | ||
| 58 | * Assuming: | ||
| 59 | * | ||
| 60 | * threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, | ||
| 61 | * (u64) THREAD_SIZE * 8UL); | ||
| 62 | * | ||
| 63 | * If you need less than 50 threads would mean we're dealing with systems | ||
| 64 | * smaller than 3200 pages. This assuems you are capable of having ~13M memory, | ||
| 65 | * and this would only be an be an upper limit, after which the OOM killer | ||
| 66 | * would take effect. Systems like these are very unlikely if modules are | ||
| 67 | * enabled. | ||
| 68 | */ | ||
| 69 | #define MAX_KMOD_CONCURRENT 50 | ||
| 70 | static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); | ||
| 71 | static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); | ||
| 59 | 72 | ||
| 60 | /* | 73 | /* |
| 61 | modprobe_path is set via /proc/sys. | 74 | modprobe_path is set via /proc/sys. |
| @@ -127,11 +140,7 @@ int __request_module(bool wait, const char *fmt, ...) | |||
| 127 | { | 140 | { |
| 128 | va_list args; | 141 | va_list args; |
| 129 | char module_name[MODULE_NAME_LEN]; | 142 | char module_name[MODULE_NAME_LEN]; |
| 130 | unsigned int max_modprobes; | ||
| 131 | int ret; | 143 | int ret; |
| 132 | static atomic_t kmod_concurrent = ATOMIC_INIT(0); | ||
| 133 | #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ | ||
| 134 | static int kmod_loop_msg; | ||
| 135 | 144 | ||
| 136 | /* | 145 | /* |
| 137 | * We don't allow synchronous module loading from async. Module | 146 | * We don't allow synchronous module loading from async. Module |
| @@ -154,40 +163,25 @@ int __request_module(bool wait, const char *fmt, ...) | |||
| 154 | if (ret) | 163 | if (ret) |
| 155 | return ret; | 164 | return ret; |
| 156 | 165 | ||
| 157 | /* If modprobe needs a service that is in a module, we get a recursive | 166 | if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) { |
| 158 | * loop. Limit the number of running kmod threads to max_threads/2 or | 167 | pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...", |
| 159 | * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method | 168 | atomic_read(&kmod_concurrent_max), |
| 160 | * would be to run the parents of this process, counting how many times | 169 | MAX_KMOD_CONCURRENT, module_name); |
| 161 | * kmod was invoked. That would mean accessing the internals of the | 170 | wait_event_interruptible(kmod_wq, |
| 162 | * process tables to get the command line, proc_pid_cmdline is static | 171 | atomic_dec_if_positive(&kmod_concurrent_max) >= 0); |
| 163 | * and it is not worth changing the proc code just to handle this case. | ||
| 164 | * KAO. | ||
| 165 | * | ||
| 166 | * "trace the ppid" is simple, but will fail if someone's | ||
| 167 | * parent exits. I think this is as good as it gets. --RR | ||
| 168 | */ | ||
| 169 | max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT); | ||
| 170 | atomic_inc(&kmod_concurrent); | ||
| 171 | if (atomic_read(&kmod_concurrent) > max_modprobes) { | ||
| 172 | /* We may be blaming an innocent here, but unlikely */ | ||
| 173 | if (kmod_loop_msg < 5) { | ||
| 174 | printk(KERN_ERR | ||
| 175 | "request_module: runaway loop modprobe %s\n", | ||
| 176 | module_name); | ||
| 177 | kmod_loop_msg++; | ||
| 178 | } | ||
| 179 | atomic_dec(&kmod_concurrent); | ||
| 180 | return -ENOMEM; | ||
| 181 | } | 172 | } |
| 182 | 173 | ||
| 183 | trace_module_request(module_name, wait, _RET_IP_); | 174 | trace_module_request(module_name, wait, _RET_IP_); |
| 184 | 175 | ||
| 185 | ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); | 176 | ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); |
| 186 | 177 | ||
| 187 | atomic_dec(&kmod_concurrent); | 178 | atomic_inc(&kmod_concurrent_max); |
| 179 | wake_up(&kmod_wq); | ||
| 180 | |||
| 188 | return ret; | 181 | return ret; |
| 189 | } | 182 | } |
| 190 | EXPORT_SYMBOL(__request_module); | 183 | EXPORT_SYMBOL(__request_module); |
| 184 | |||
| 191 | #endif /* CONFIG_MODULES */ | 185 | #endif /* CONFIG_MODULES */ |
| 192 | 186 | ||
| 193 | static void call_usermodehelper_freeinfo(struct subprocess_info *info) | 187 | static void call_usermodehelper_freeinfo(struct subprocess_info *info) |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2d2d3a568e4e..a1606a4224e1 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -122,7 +122,7 @@ static void *alloc_insn_page(void) | |||
| 122 | return module_alloc(PAGE_SIZE); | 122 | return module_alloc(PAGE_SIZE); |
| 123 | } | 123 | } |
| 124 | 124 | ||
| 125 | static void free_insn_page(void *page) | 125 | void __weak free_insn_page(void *page) |
| 126 | { | 126 | { |
| 127 | module_memfree(page); | 127 | module_memfree(page); |
| 128 | } | 128 | } |
| @@ -483,11 +483,6 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); | |||
| 483 | */ | 483 | */ |
| 484 | static void do_optimize_kprobes(void) | 484 | static void do_optimize_kprobes(void) |
| 485 | { | 485 | { |
| 486 | /* Optimization never be done when disarmed */ | ||
| 487 | if (kprobes_all_disarmed || !kprobes_allow_optimization || | ||
| 488 | list_empty(&optimizing_list)) | ||
| 489 | return; | ||
| 490 | |||
| 491 | /* | 486 | /* |
| 492 | * The optimization/unoptimization refers online_cpus via | 487 | * The optimization/unoptimization refers online_cpus via |
| 493 | * stop_machine() and cpu-hotplug modifies online_cpus. | 488 | * stop_machine() and cpu-hotplug modifies online_cpus. |
| @@ -495,14 +490,19 @@ static void do_optimize_kprobes(void) | |||
| 495 | * This combination can cause a deadlock (cpu-hotplug try to lock | 490 | * This combination can cause a deadlock (cpu-hotplug try to lock |
| 496 | * text_mutex but stop_machine can not be done because online_cpus | 491 | * text_mutex but stop_machine can not be done because online_cpus |
| 497 | * has been changed) | 492 | * has been changed) |
| 498 | * To avoid this deadlock, we need to call get_online_cpus() | 493 | * To avoid this deadlock, caller must have locked cpu hotplug |
| 499 | * for preventing cpu-hotplug outside of text_mutex locking. | 494 | * for preventing cpu-hotplug outside of text_mutex locking. |
| 500 | */ | 495 | */ |
| 501 | get_online_cpus(); | 496 | lockdep_assert_cpus_held(); |
| 497 | |||
| 498 | /* Optimization never be done when disarmed */ | ||
| 499 | if (kprobes_all_disarmed || !kprobes_allow_optimization || | ||
| 500 | list_empty(&optimizing_list)) | ||
| 501 | return; | ||
| 502 | |||
| 502 | mutex_lock(&text_mutex); | 503 | mutex_lock(&text_mutex); |
| 503 | arch_optimize_kprobes(&optimizing_list); | 504 | arch_optimize_kprobes(&optimizing_list); |
| 504 | mutex_unlock(&text_mutex); | 505 | mutex_unlock(&text_mutex); |
| 505 | put_online_cpus(); | ||
| 506 | } | 506 | } |
| 507 | 507 | ||
| 508 | /* | 508 | /* |
| @@ -513,12 +513,13 @@ static void do_unoptimize_kprobes(void) | |||
| 513 | { | 513 | { |
| 514 | struct optimized_kprobe *op, *tmp; | 514 | struct optimized_kprobe *op, *tmp; |
| 515 | 515 | ||
| 516 | /* See comment in do_optimize_kprobes() */ | ||
| 517 | lockdep_assert_cpus_held(); | ||
| 518 | |||
| 516 | /* Unoptimization must be done anytime */ | 519 | /* Unoptimization must be done anytime */ |
| 517 | if (list_empty(&unoptimizing_list)) | 520 | if (list_empty(&unoptimizing_list)) |
| 518 | return; | 521 | return; |
| 519 | 522 | ||
| 520 | /* Ditto to do_optimize_kprobes */ | ||
| 521 | get_online_cpus(); | ||
| 522 | mutex_lock(&text_mutex); | 523 | mutex_lock(&text_mutex); |
| 523 | arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); | 524 | arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); |
| 524 | /* Loop free_list for disarming */ | 525 | /* Loop free_list for disarming */ |
| @@ -537,7 +538,6 @@ static void do_unoptimize_kprobes(void) | |||
| 537 | list_del_init(&op->list); | 538 | list_del_init(&op->list); |
| 538 | } | 539 | } |
| 539 | mutex_unlock(&text_mutex); | 540 | mutex_unlock(&text_mutex); |
| 540 | put_online_cpus(); | ||
| 541 | } | 541 | } |
| 542 | 542 | ||
| 543 | /* Reclaim all kprobes on the free_list */ | 543 | /* Reclaim all kprobes on the free_list */ |
| @@ -562,6 +562,7 @@ static void kick_kprobe_optimizer(void) | |||
| 562 | static void kprobe_optimizer(struct work_struct *work) | 562 | static void kprobe_optimizer(struct work_struct *work) |
| 563 | { | 563 | { |
| 564 | mutex_lock(&kprobe_mutex); | 564 | mutex_lock(&kprobe_mutex); |
| 565 | cpus_read_lock(); | ||
| 565 | /* Lock modules while optimizing kprobes */ | 566 | /* Lock modules while optimizing kprobes */ |
| 566 | mutex_lock(&module_mutex); | 567 | mutex_lock(&module_mutex); |
| 567 | 568 | ||
| @@ -587,6 +588,7 @@ static void kprobe_optimizer(struct work_struct *work) | |||
| 587 | do_free_cleaned_kprobes(); | 588 | do_free_cleaned_kprobes(); |
| 588 | 589 | ||
| 589 | mutex_unlock(&module_mutex); | 590 | mutex_unlock(&module_mutex); |
| 591 | cpus_read_unlock(); | ||
| 590 | mutex_unlock(&kprobe_mutex); | 592 | mutex_unlock(&kprobe_mutex); |
| 591 | 593 | ||
| 592 | /* Step 5: Kick optimizer again if needed */ | 594 | /* Step 5: Kick optimizer again if needed */ |
| @@ -650,9 +652,8 @@ static void optimize_kprobe(struct kprobe *p) | |||
| 650 | /* Short cut to direct unoptimizing */ | 652 | /* Short cut to direct unoptimizing */ |
| 651 | static void force_unoptimize_kprobe(struct optimized_kprobe *op) | 653 | static void force_unoptimize_kprobe(struct optimized_kprobe *op) |
| 652 | { | 654 | { |
| 653 | get_online_cpus(); | 655 | lockdep_assert_cpus_held(); |
| 654 | arch_unoptimize_kprobe(op); | 656 | arch_unoptimize_kprobe(op); |
| 655 | put_online_cpus(); | ||
| 656 | if (kprobe_disabled(&op->kp)) | 657 | if (kprobe_disabled(&op->kp)) |
| 657 | arch_disarm_kprobe(&op->kp); | 658 | arch_disarm_kprobe(&op->kp); |
| 658 | } | 659 | } |
| @@ -791,6 +792,7 @@ static void try_to_optimize_kprobe(struct kprobe *p) | |||
| 791 | return; | 792 | return; |
| 792 | 793 | ||
| 793 | /* For preparing optimization, jump_label_text_reserved() is called */ | 794 | /* For preparing optimization, jump_label_text_reserved() is called */ |
| 795 | cpus_read_lock(); | ||
| 794 | jump_label_lock(); | 796 | jump_label_lock(); |
| 795 | mutex_lock(&text_mutex); | 797 | mutex_lock(&text_mutex); |
| 796 | 798 | ||
| @@ -812,6 +814,7 @@ static void try_to_optimize_kprobe(struct kprobe *p) | |||
| 812 | out: | 814 | out: |
| 813 | mutex_unlock(&text_mutex); | 815 | mutex_unlock(&text_mutex); |
| 814 | jump_label_unlock(); | 816 | jump_label_unlock(); |
| 817 | cpus_read_unlock(); | ||
| 815 | } | 818 | } |
| 816 | 819 | ||
| 817 | #ifdef CONFIG_SYSCTL | 820 | #ifdef CONFIG_SYSCTL |
| @@ -826,6 +829,7 @@ static void optimize_all_kprobes(void) | |||
| 826 | if (kprobes_allow_optimization) | 829 | if (kprobes_allow_optimization) |
| 827 | goto out; | 830 | goto out; |
| 828 | 831 | ||
| 832 | cpus_read_lock(); | ||
| 829 | kprobes_allow_optimization = true; | 833 | kprobes_allow_optimization = true; |
| 830 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 834 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
| 831 | head = &kprobe_table[i]; | 835 | head = &kprobe_table[i]; |
| @@ -833,6 +837,7 @@ static void optimize_all_kprobes(void) | |||
| 833 | if (!kprobe_disabled(p)) | 837 | if (!kprobe_disabled(p)) |
| 834 | optimize_kprobe(p); | 838 | optimize_kprobe(p); |
| 835 | } | 839 | } |
| 840 | cpus_read_unlock(); | ||
| 836 | printk(KERN_INFO "Kprobes globally optimized\n"); | 841 | printk(KERN_INFO "Kprobes globally optimized\n"); |
| 837 | out: | 842 | out: |
| 838 | mutex_unlock(&kprobe_mutex); | 843 | mutex_unlock(&kprobe_mutex); |
| @@ -851,6 +856,7 @@ static void unoptimize_all_kprobes(void) | |||
| 851 | return; | 856 | return; |
| 852 | } | 857 | } |
| 853 | 858 | ||
| 859 | cpus_read_lock(); | ||
| 854 | kprobes_allow_optimization = false; | 860 | kprobes_allow_optimization = false; |
| 855 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 861 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
| 856 | head = &kprobe_table[i]; | 862 | head = &kprobe_table[i]; |
| @@ -859,6 +865,7 @@ static void unoptimize_all_kprobes(void) | |||
| 859 | unoptimize_kprobe(p, false); | 865 | unoptimize_kprobe(p, false); |
| 860 | } | 866 | } |
| 861 | } | 867 | } |
| 868 | cpus_read_unlock(); | ||
| 862 | mutex_unlock(&kprobe_mutex); | 869 | mutex_unlock(&kprobe_mutex); |
| 863 | 870 | ||
| 864 | /* Wait for unoptimizing completion */ | 871 | /* Wait for unoptimizing completion */ |
| @@ -1010,14 +1017,11 @@ static void arm_kprobe(struct kprobe *kp) | |||
| 1010 | arm_kprobe_ftrace(kp); | 1017 | arm_kprobe_ftrace(kp); |
| 1011 | return; | 1018 | return; |
| 1012 | } | 1019 | } |
| 1013 | /* | 1020 | cpus_read_lock(); |
| 1014 | * Here, since __arm_kprobe() doesn't use stop_machine(), | ||
| 1015 | * this doesn't cause deadlock on text_mutex. So, we don't | ||
| 1016 | * need get_online_cpus(). | ||
| 1017 | */ | ||
| 1018 | mutex_lock(&text_mutex); | 1021 | mutex_lock(&text_mutex); |
| 1019 | __arm_kprobe(kp); | 1022 | __arm_kprobe(kp); |
| 1020 | mutex_unlock(&text_mutex); | 1023 | mutex_unlock(&text_mutex); |
| 1024 | cpus_read_unlock(); | ||
| 1021 | } | 1025 | } |
| 1022 | 1026 | ||
| 1023 | /* Disarm a kprobe with text_mutex */ | 1027 | /* Disarm a kprobe with text_mutex */ |
| @@ -1027,10 +1031,12 @@ static void disarm_kprobe(struct kprobe *kp, bool reopt) | |||
| 1027 | disarm_kprobe_ftrace(kp); | 1031 | disarm_kprobe_ftrace(kp); |
| 1028 | return; | 1032 | return; |
| 1029 | } | 1033 | } |
| 1030 | /* Ditto */ | 1034 | |
| 1035 | cpus_read_lock(); | ||
| 1031 | mutex_lock(&text_mutex); | 1036 | mutex_lock(&text_mutex); |
| 1032 | __disarm_kprobe(kp, reopt); | 1037 | __disarm_kprobe(kp, reopt); |
| 1033 | mutex_unlock(&text_mutex); | 1038 | mutex_unlock(&text_mutex); |
| 1039 | cpus_read_unlock(); | ||
| 1034 | } | 1040 | } |
| 1035 | 1041 | ||
| 1036 | /* | 1042 | /* |
| @@ -1298,13 +1304,10 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) | |||
| 1298 | int ret = 0; | 1304 | int ret = 0; |
| 1299 | struct kprobe *ap = orig_p; | 1305 | struct kprobe *ap = orig_p; |
| 1300 | 1306 | ||
| 1307 | cpus_read_lock(); | ||
| 1308 | |||
| 1301 | /* For preparing optimization, jump_label_text_reserved() is called */ | 1309 | /* For preparing optimization, jump_label_text_reserved() is called */ |
| 1302 | jump_label_lock(); | 1310 | jump_label_lock(); |
| 1303 | /* | ||
| 1304 | * Get online CPUs to avoid text_mutex deadlock.with stop machine, | ||
| 1305 | * which is invoked by unoptimize_kprobe() in add_new_kprobe() | ||
| 1306 | */ | ||
| 1307 | get_online_cpus(); | ||
| 1308 | mutex_lock(&text_mutex); | 1311 | mutex_lock(&text_mutex); |
| 1309 | 1312 | ||
| 1310 | if (!kprobe_aggrprobe(orig_p)) { | 1313 | if (!kprobe_aggrprobe(orig_p)) { |
| @@ -1352,8 +1355,8 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) | |||
| 1352 | 1355 | ||
| 1353 | out: | 1356 | out: |
| 1354 | mutex_unlock(&text_mutex); | 1357 | mutex_unlock(&text_mutex); |
| 1355 | put_online_cpus(); | ||
| 1356 | jump_label_unlock(); | 1358 | jump_label_unlock(); |
| 1359 | cpus_read_unlock(); | ||
| 1357 | 1360 | ||
| 1358 | if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { | 1361 | if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { |
| 1359 | ap->flags &= ~KPROBE_FLAG_DISABLED; | 1362 | ap->flags &= ~KPROBE_FLAG_DISABLED; |
| @@ -1555,9 +1558,12 @@ int register_kprobe(struct kprobe *p) | |||
| 1555 | goto out; | 1558 | goto out; |
| 1556 | } | 1559 | } |
| 1557 | 1560 | ||
| 1558 | mutex_lock(&text_mutex); /* Avoiding text modification */ | 1561 | cpus_read_lock(); |
| 1562 | /* Prevent text modification */ | ||
| 1563 | mutex_lock(&text_mutex); | ||
| 1559 | ret = prepare_kprobe(p); | 1564 | ret = prepare_kprobe(p); |
| 1560 | mutex_unlock(&text_mutex); | 1565 | mutex_unlock(&text_mutex); |
| 1566 | cpus_read_unlock(); | ||
| 1561 | if (ret) | 1567 | if (ret) |
| 1562 | goto out; | 1568 | goto out; |
| 1563 | 1569 | ||
| @@ -1570,7 +1576,6 @@ int register_kprobe(struct kprobe *p) | |||
| 1570 | 1576 | ||
| 1571 | /* Try to optimize kprobe */ | 1577 | /* Try to optimize kprobe */ |
| 1572 | try_to_optimize_kprobe(p); | 1578 | try_to_optimize_kprobe(p); |
| 1573 | |||
| 1574 | out: | 1579 | out: |
| 1575 | mutex_unlock(&kprobe_mutex); | 1580 | mutex_unlock(&kprobe_mutex); |
| 1576 | 1581 | ||
| @@ -1766,24 +1771,13 @@ unsigned long __weak arch_deref_entry_point(void *entry) | |||
| 1766 | 1771 | ||
| 1767 | int register_jprobes(struct jprobe **jps, int num) | 1772 | int register_jprobes(struct jprobe **jps, int num) |
| 1768 | { | 1773 | { |
| 1769 | struct jprobe *jp; | ||
| 1770 | int ret = 0, i; | 1774 | int ret = 0, i; |
| 1771 | 1775 | ||
| 1772 | if (num <= 0) | 1776 | if (num <= 0) |
| 1773 | return -EINVAL; | 1777 | return -EINVAL; |
| 1778 | |||
| 1774 | for (i = 0; i < num; i++) { | 1779 | for (i = 0; i < num; i++) { |
| 1775 | unsigned long addr, offset; | 1780 | ret = register_jprobe(jps[i]); |
| 1776 | jp = jps[i]; | ||
| 1777 | addr = arch_deref_entry_point(jp->entry); | ||
| 1778 | |||
| 1779 | /* Verify probepoint is a function entry point */ | ||
| 1780 | if (kallsyms_lookup_size_offset(addr, NULL, &offset) && | ||
| 1781 | offset == 0) { | ||
| 1782 | jp->kp.pre_handler = setjmp_pre_handler; | ||
| 1783 | jp->kp.break_handler = longjmp_break_handler; | ||
| 1784 | ret = register_kprobe(&jp->kp); | ||
| 1785 | } else | ||
| 1786 | ret = -EINVAL; | ||
| 1787 | 1781 | ||
| 1788 | if (ret < 0) { | 1782 | if (ret < 0) { |
| 1789 | if (i > 0) | 1783 | if (i > 0) |
| @@ -1791,13 +1785,30 @@ int register_jprobes(struct jprobe **jps, int num) | |||
| 1791 | break; | 1785 | break; |
| 1792 | } | 1786 | } |
| 1793 | } | 1787 | } |
| 1788 | |||
| 1794 | return ret; | 1789 | return ret; |
| 1795 | } | 1790 | } |
| 1796 | EXPORT_SYMBOL_GPL(register_jprobes); | 1791 | EXPORT_SYMBOL_GPL(register_jprobes); |
| 1797 | 1792 | ||
| 1798 | int register_jprobe(struct jprobe *jp) | 1793 | int register_jprobe(struct jprobe *jp) |
| 1799 | { | 1794 | { |
| 1800 | return register_jprobes(&jp, 1); | 1795 | unsigned long addr, offset; |
| 1796 | struct kprobe *kp = &jp->kp; | ||
| 1797 | |||
| 1798 | /* | ||
| 1799 | * Verify probepoint as well as the jprobe handler are | ||
| 1800 | * valid function entry points. | ||
| 1801 | */ | ||
| 1802 | addr = arch_deref_entry_point(jp->entry); | ||
| 1803 | |||
| 1804 | if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0 && | ||
| 1805 | kprobe_on_func_entry(kp->addr, kp->symbol_name, kp->offset)) { | ||
| 1806 | kp->pre_handler = setjmp_pre_handler; | ||
| 1807 | kp->break_handler = longjmp_break_handler; | ||
| 1808 | return register_kprobe(kp); | ||
| 1809 | } | ||
| 1810 | |||
| 1811 | return -EINVAL; | ||
| 1801 | } | 1812 | } |
| 1802 | EXPORT_SYMBOL_GPL(register_jprobe); | 1813 | EXPORT_SYMBOL_GPL(register_jprobe); |
| 1803 | 1814 | ||
| @@ -1883,12 +1894,12 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) | |||
| 1883 | } | 1894 | } |
| 1884 | NOKPROBE_SYMBOL(pre_handler_kretprobe); | 1895 | NOKPROBE_SYMBOL(pre_handler_kretprobe); |
| 1885 | 1896 | ||
| 1886 | bool __weak arch_function_offset_within_entry(unsigned long offset) | 1897 | bool __weak arch_kprobe_on_func_entry(unsigned long offset) |
| 1887 | { | 1898 | { |
| 1888 | return !offset; | 1899 | return !offset; |
| 1889 | } | 1900 | } |
| 1890 | 1901 | ||
| 1891 | bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) | 1902 | bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) |
| 1892 | { | 1903 | { |
| 1893 | kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset); | 1904 | kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset); |
| 1894 | 1905 | ||
| @@ -1896,7 +1907,7 @@ bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsign | |||
| 1896 | return false; | 1907 | return false; |
| 1897 | 1908 | ||
| 1898 | if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) || | 1909 | if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) || |
| 1899 | !arch_function_offset_within_entry(offset)) | 1910 | !arch_kprobe_on_func_entry(offset)) |
| 1900 | return false; | 1911 | return false; |
| 1901 | 1912 | ||
| 1902 | return true; | 1913 | return true; |
| @@ -1909,7 +1920,7 @@ int register_kretprobe(struct kretprobe *rp) | |||
| 1909 | int i; | 1920 | int i; |
| 1910 | void *addr; | 1921 | void *addr; |
| 1911 | 1922 | ||
| 1912 | if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset)) | 1923 | if (!kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset)) |
| 1913 | return -EINVAL; | 1924 | return -EINVAL; |
| 1914 | 1925 | ||
| 1915 | if (kretprobe_blacklist_size) { | 1926 | if (kretprobe_blacklist_size) { |
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 23cd70651238..46ba853656f6 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
| @@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, | |||
| 134 | { | 134 | { |
| 135 | phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); | 135 | phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); |
| 136 | return sprintf(buf, "%pa %x\n", &vmcore_base, | 136 | return sprintf(buf, "%pa %x\n", &vmcore_base, |
| 137 | (unsigned int)sizeof(vmcoreinfo_note)); | 137 | (unsigned int)VMCOREINFO_NOTE_SIZE); |
| 138 | } | 138 | } |
| 139 | KERNEL_ATTR_RO(vmcoreinfo); | 139 | KERNEL_ATTR_RO(vmcoreinfo); |
| 140 | 140 | ||
| @@ -234,7 +234,7 @@ static struct attribute * kernel_attrs[] = { | |||
| 234 | NULL | 234 | NULL |
| 235 | }; | 235 | }; |
| 236 | 236 | ||
| 237 | static struct attribute_group kernel_attr_group = { | 237 | static const struct attribute_group kernel_attr_group = { |
| 238 | .attrs = kernel_attrs, | 238 | .attrs = kernel_attrs, |
| 239 | }; | 239 | }; |
| 240 | 240 | ||
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 045022557936..ec4565122e65 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig | |||
| @@ -10,6 +10,7 @@ config LIVEPATCH | |||
| 10 | depends on SYSFS | 10 | depends on SYSFS |
| 11 | depends on KALLSYMS_ALL | 11 | depends on KALLSYMS_ALL |
| 12 | depends on HAVE_LIVEPATCH | 12 | depends on HAVE_LIVEPATCH |
| 13 | depends on !TRIM_UNUSED_KSYMS | ||
| 13 | help | 14 | help |
| 14 | Say Y here if you want to support kernel live patching. | 15 | Say Y here if you want to support kernel live patching. |
| 15 | This option has no runtime impact until a kernel "patch" | 16 | This option has no runtime impact until a kernel "patch" |
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index f8269036bf0b..52c4e907c14b 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c | |||
| @@ -59,7 +59,11 @@ static void notrace klp_ftrace_handler(unsigned long ip, | |||
| 59 | 59 | ||
| 60 | ops = container_of(fops, struct klp_ops, fops); | 60 | ops = container_of(fops, struct klp_ops, fops); |
| 61 | 61 | ||
| 62 | rcu_read_lock(); | 62 | /* |
| 63 | * A variant of synchronize_sched() is used to allow patching functions | ||
| 64 | * where RCU is not watching, see klp_synchronize_transition(). | ||
| 65 | */ | ||
| 66 | preempt_disable_notrace(); | ||
| 63 | 67 | ||
| 64 | func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, | 68 | func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, |
| 65 | stack_node); | 69 | stack_node); |
| @@ -115,7 +119,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, | |||
| 115 | 119 | ||
| 116 | klp_arch_set_pc(regs, (unsigned long)func->new_func); | 120 | klp_arch_set_pc(regs, (unsigned long)func->new_func); |
| 117 | unlock: | 121 | unlock: |
| 118 | rcu_read_unlock(); | 122 | preempt_enable_notrace(); |
| 119 | } | 123 | } |
| 120 | 124 | ||
| 121 | /* | 125 | /* |
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index adc0cc64aa4b..b004a1fb6032 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c | |||
| @@ -49,6 +49,28 @@ static void klp_transition_work_fn(struct work_struct *work) | |||
| 49 | static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn); | 49 | static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn); |
| 50 | 50 | ||
| 51 | /* | 51 | /* |
| 52 | * This function is just a stub to implement a hard force | ||
| 53 | * of synchronize_sched(). This requires synchronizing | ||
| 54 | * tasks even in userspace and idle. | ||
| 55 | */ | ||
| 56 | static void klp_sync(struct work_struct *work) | ||
| 57 | { | ||
| 58 | } | ||
| 59 | |||
| 60 | /* | ||
| 61 | * We allow to patch also functions where RCU is not watching, | ||
| 62 | * e.g. before user_exit(). We can not rely on the RCU infrastructure | ||
| 63 | * to do the synchronization. Instead hard force the sched synchronization. | ||
| 64 | * | ||
| 65 | * This approach allows to use RCU functions for manipulating func_stack | ||
| 66 | * safely. | ||
| 67 | */ | ||
| 68 | static void klp_synchronize_transition(void) | ||
| 69 | { | ||
| 70 | schedule_on_each_cpu(klp_sync); | ||
| 71 | } | ||
| 72 | |||
| 73 | /* | ||
| 52 | * The transition to the target patch state is complete. Clean up the data | 74 | * The transition to the target patch state is complete. Clean up the data |
| 53 | * structures. | 75 | * structures. |
| 54 | */ | 76 | */ |
| @@ -73,7 +95,7 @@ static void klp_complete_transition(void) | |||
| 73 | * func->transition gets cleared, the handler may choose a | 95 | * func->transition gets cleared, the handler may choose a |
| 74 | * removed function. | 96 | * removed function. |
| 75 | */ | 97 | */ |
| 76 | synchronize_rcu(); | 98 | klp_synchronize_transition(); |
| 77 | } | 99 | } |
| 78 | 100 | ||
| 79 | if (klp_transition_patch->immediate) | 101 | if (klp_transition_patch->immediate) |
| @@ -92,7 +114,7 @@ static void klp_complete_transition(void) | |||
| 92 | 114 | ||
| 93 | /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ | 115 | /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ |
| 94 | if (klp_target_state == KLP_PATCHED) | 116 | if (klp_target_state == KLP_PATCHED) |
| 95 | synchronize_rcu(); | 117 | klp_synchronize_transition(); |
| 96 | 118 | ||
| 97 | read_lock(&tasklist_lock); | 119 | read_lock(&tasklist_lock); |
| 98 | for_each_process_thread(g, task) { | 120 | for_each_process_thread(g, task) { |
| @@ -136,7 +158,11 @@ void klp_cancel_transition(void) | |||
| 136 | */ | 158 | */ |
| 137 | void klp_update_patch_state(struct task_struct *task) | 159 | void klp_update_patch_state(struct task_struct *task) |
| 138 | { | 160 | { |
| 139 | rcu_read_lock(); | 161 | /* |
| 162 | * A variant of synchronize_sched() is used to allow patching functions | ||
| 163 | * where RCU is not watching, see klp_synchronize_transition(). | ||
| 164 | */ | ||
| 165 | preempt_disable_notrace(); | ||
| 140 | 166 | ||
| 141 | /* | 167 | /* |
| 142 | * This test_and_clear_tsk_thread_flag() call also serves as a read | 168 | * This test_and_clear_tsk_thread_flag() call also serves as a read |
| @@ -153,7 +179,7 @@ void klp_update_patch_state(struct task_struct *task) | |||
| 153 | if (test_and_clear_tsk_thread_flag(task, TIF_PATCH_PENDING)) | 179 | if (test_and_clear_tsk_thread_flag(task, TIF_PATCH_PENDING)) |
| 154 | task->patch_state = READ_ONCE(klp_target_state); | 180 | task->patch_state = READ_ONCE(klp_target_state); |
| 155 | 181 | ||
| 156 | rcu_read_unlock(); | 182 | preempt_enable_notrace(); |
| 157 | } | 183 | } |
| 158 | 184 | ||
| 159 | /* | 185 | /* |
| @@ -539,7 +565,7 @@ void klp_reverse_transition(void) | |||
| 539 | clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING); | 565 | clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING); |
| 540 | 566 | ||
| 541 | /* Let any remaining calls to klp_update_patch_state() complete */ | 567 | /* Let any remaining calls to klp_update_patch_state() complete */ |
| 542 | synchronize_rcu(); | 568 | klp_synchronize_transition(); |
| 543 | 569 | ||
| 544 | klp_start_transition(); | 570 | klp_start_transition(); |
| 545 | } | 571 | } |
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c0e31bfee25c..7d2499bec5fe 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
| @@ -1157,18 +1157,18 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, | |||
| 1157 | if (debug_locks_silent) | 1157 | if (debug_locks_silent) |
| 1158 | return 0; | 1158 | return 0; |
| 1159 | 1159 | ||
| 1160 | printk("\n"); | 1160 | pr_warn("\n"); |
| 1161 | pr_warn("======================================================\n"); | 1161 | pr_warn("======================================================\n"); |
| 1162 | pr_warn("WARNING: possible circular locking dependency detected\n"); | 1162 | pr_warn("WARNING: possible circular locking dependency detected\n"); |
| 1163 | print_kernel_ident(); | 1163 | print_kernel_ident(); |
| 1164 | pr_warn("------------------------------------------------------\n"); | 1164 | pr_warn("------------------------------------------------------\n"); |
| 1165 | printk("%s/%d is trying to acquire lock:\n", | 1165 | pr_warn("%s/%d is trying to acquire lock:\n", |
| 1166 | curr->comm, task_pid_nr(curr)); | 1166 | curr->comm, task_pid_nr(curr)); |
| 1167 | print_lock(check_src); | 1167 | print_lock(check_src); |
| 1168 | printk("\nbut task is already holding lock:\n"); | 1168 | pr_warn("\nbut task is already holding lock:\n"); |
| 1169 | print_lock(check_tgt); | 1169 | print_lock(check_tgt); |
| 1170 | printk("\nwhich lock already depends on the new lock.\n\n"); | 1170 | pr_warn("\nwhich lock already depends on the new lock.\n\n"); |
| 1171 | printk("\nthe existing dependency chain (in reverse order) is:\n"); | 1171 | pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); |
| 1172 | 1172 | ||
| 1173 | print_circular_bug_entry(entry, depth); | 1173 | print_circular_bug_entry(entry, depth); |
| 1174 | 1174 | ||
| @@ -1495,13 +1495,13 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
| 1495 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 1495 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 1496 | return 0; | 1496 | return 0; |
| 1497 | 1497 | ||
| 1498 | printk("\n"); | 1498 | pr_warn("\n"); |
| 1499 | pr_warn("=====================================================\n"); | 1499 | pr_warn("=====================================================\n"); |
| 1500 | pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", | 1500 | pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", |
| 1501 | irqclass, irqclass); | 1501 | irqclass, irqclass); |
| 1502 | print_kernel_ident(); | 1502 | print_kernel_ident(); |
| 1503 | pr_warn("-----------------------------------------------------\n"); | 1503 | pr_warn("-----------------------------------------------------\n"); |
| 1504 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", | 1504 | pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", |
| 1505 | curr->comm, task_pid_nr(curr), | 1505 | curr->comm, task_pid_nr(curr), |
| 1506 | curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, | 1506 | curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, |
| 1507 | curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, | 1507 | curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, |
| @@ -1509,46 +1509,46 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
| 1509 | curr->softirqs_enabled); | 1509 | curr->softirqs_enabled); |
| 1510 | print_lock(next); | 1510 | print_lock(next); |
| 1511 | 1511 | ||
| 1512 | printk("\nand this task is already holding:\n"); | 1512 | pr_warn("\nand this task is already holding:\n"); |
| 1513 | print_lock(prev); | 1513 | print_lock(prev); |
| 1514 | printk("which would create a new lock dependency:\n"); | 1514 | pr_warn("which would create a new lock dependency:\n"); |
| 1515 | print_lock_name(hlock_class(prev)); | 1515 | print_lock_name(hlock_class(prev)); |
| 1516 | printk(KERN_CONT " ->"); | 1516 | pr_cont(" ->"); |
| 1517 | print_lock_name(hlock_class(next)); | 1517 | print_lock_name(hlock_class(next)); |
| 1518 | printk(KERN_CONT "\n"); | 1518 | pr_cont("\n"); |
| 1519 | 1519 | ||
| 1520 | printk("\nbut this new dependency connects a %s-irq-safe lock:\n", | 1520 | pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n", |
| 1521 | irqclass); | 1521 | irqclass); |
| 1522 | print_lock_name(backwards_entry->class); | 1522 | print_lock_name(backwards_entry->class); |
| 1523 | printk("\n... which became %s-irq-safe at:\n", irqclass); | 1523 | pr_warn("\n... which became %s-irq-safe at:\n", irqclass); |
| 1524 | 1524 | ||
| 1525 | print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); | 1525 | print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); |
| 1526 | 1526 | ||
| 1527 | printk("\nto a %s-irq-unsafe lock:\n", irqclass); | 1527 | pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); |
| 1528 | print_lock_name(forwards_entry->class); | 1528 | print_lock_name(forwards_entry->class); |
| 1529 | printk("\n... which became %s-irq-unsafe at:\n", irqclass); | 1529 | pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); |
| 1530 | printk("..."); | 1530 | pr_warn("..."); |
| 1531 | 1531 | ||
| 1532 | print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); | 1532 | print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); |
| 1533 | 1533 | ||
| 1534 | printk("\nother info that might help us debug this:\n\n"); | 1534 | pr_warn("\nother info that might help us debug this:\n\n"); |
| 1535 | print_irq_lock_scenario(backwards_entry, forwards_entry, | 1535 | print_irq_lock_scenario(backwards_entry, forwards_entry, |
| 1536 | hlock_class(prev), hlock_class(next)); | 1536 | hlock_class(prev), hlock_class(next)); |
| 1537 | 1537 | ||
| 1538 | lockdep_print_held_locks(curr); | 1538 | lockdep_print_held_locks(curr); |
| 1539 | 1539 | ||
| 1540 | printk("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); | 1540 | pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); |
| 1541 | if (!save_trace(&prev_root->trace)) | 1541 | if (!save_trace(&prev_root->trace)) |
| 1542 | return 0; | 1542 | return 0; |
| 1543 | print_shortest_lock_dependencies(backwards_entry, prev_root); | 1543 | print_shortest_lock_dependencies(backwards_entry, prev_root); |
| 1544 | 1544 | ||
| 1545 | printk("\nthe dependencies between the lock to be acquired"); | 1545 | pr_warn("\nthe dependencies between the lock to be acquired"); |
| 1546 | printk(" and %s-irq-unsafe lock:\n", irqclass); | 1546 | pr_warn(" and %s-irq-unsafe lock:\n", irqclass); |
| 1547 | if (!save_trace(&next_root->trace)) | 1547 | if (!save_trace(&next_root->trace)) |
| 1548 | return 0; | 1548 | return 0; |
| 1549 | print_shortest_lock_dependencies(forwards_entry, next_root); | 1549 | print_shortest_lock_dependencies(forwards_entry, next_root); |
| 1550 | 1550 | ||
| 1551 | printk("\nstack backtrace:\n"); | 1551 | pr_warn("\nstack backtrace:\n"); |
| 1552 | dump_stack(); | 1552 | dump_stack(); |
| 1553 | 1553 | ||
| 1554 | return 0; | 1554 | return 0; |
| @@ -1724,22 +1724,22 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | |||
| 1724 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 1724 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 1725 | return 0; | 1725 | return 0; |
| 1726 | 1726 | ||
| 1727 | printk("\n"); | 1727 | pr_warn("\n"); |
| 1728 | pr_warn("============================================\n"); | 1728 | pr_warn("============================================\n"); |
| 1729 | pr_warn("WARNING: possible recursive locking detected\n"); | 1729 | pr_warn("WARNING: possible recursive locking detected\n"); |
| 1730 | print_kernel_ident(); | 1730 | print_kernel_ident(); |
| 1731 | pr_warn("--------------------------------------------\n"); | 1731 | pr_warn("--------------------------------------------\n"); |
| 1732 | printk("%s/%d is trying to acquire lock:\n", | 1732 | pr_warn("%s/%d is trying to acquire lock:\n", |
| 1733 | curr->comm, task_pid_nr(curr)); | 1733 | curr->comm, task_pid_nr(curr)); |
| 1734 | print_lock(next); | 1734 | print_lock(next); |
| 1735 | printk("\nbut task is already holding lock:\n"); | 1735 | pr_warn("\nbut task is already holding lock:\n"); |
| 1736 | print_lock(prev); | 1736 | print_lock(prev); |
| 1737 | 1737 | ||
| 1738 | printk("\nother info that might help us debug this:\n"); | 1738 | pr_warn("\nother info that might help us debug this:\n"); |
| 1739 | print_deadlock_scenario(next, prev); | 1739 | print_deadlock_scenario(next, prev); |
| 1740 | lockdep_print_held_locks(curr); | 1740 | lockdep_print_held_locks(curr); |
| 1741 | 1741 | ||
| 1742 | printk("\nstack backtrace:\n"); | 1742 | pr_warn("\nstack backtrace:\n"); |
| 1743 | dump_stack(); | 1743 | dump_stack(); |
| 1744 | 1744 | ||
| 1745 | return 0; | 1745 | return 0; |
| @@ -2074,21 +2074,21 @@ static void print_collision(struct task_struct *curr, | |||
| 2074 | struct held_lock *hlock_next, | 2074 | struct held_lock *hlock_next, |
| 2075 | struct lock_chain *chain) | 2075 | struct lock_chain *chain) |
| 2076 | { | 2076 | { |
| 2077 | printk("\n"); | 2077 | pr_warn("\n"); |
| 2078 | pr_warn("============================\n"); | 2078 | pr_warn("============================\n"); |
| 2079 | pr_warn("WARNING: chain_key collision\n"); | 2079 | pr_warn("WARNING: chain_key collision\n"); |
| 2080 | print_kernel_ident(); | 2080 | print_kernel_ident(); |
| 2081 | pr_warn("----------------------------\n"); | 2081 | pr_warn("----------------------------\n"); |
| 2082 | printk("%s/%d: ", current->comm, task_pid_nr(current)); | 2082 | pr_warn("%s/%d: ", current->comm, task_pid_nr(current)); |
| 2083 | printk("Hash chain already cached but the contents don't match!\n"); | 2083 | pr_warn("Hash chain already cached but the contents don't match!\n"); |
| 2084 | 2084 | ||
| 2085 | printk("Held locks:"); | 2085 | pr_warn("Held locks:"); |
| 2086 | print_chain_keys_held_locks(curr, hlock_next); | 2086 | print_chain_keys_held_locks(curr, hlock_next); |
| 2087 | 2087 | ||
| 2088 | printk("Locks in cached chain:"); | 2088 | pr_warn("Locks in cached chain:"); |
| 2089 | print_chain_keys_chain(chain); | 2089 | print_chain_keys_chain(chain); |
| 2090 | 2090 | ||
| 2091 | printk("\nstack backtrace:\n"); | 2091 | pr_warn("\nstack backtrace:\n"); |
| 2092 | dump_stack(); | 2092 | dump_stack(); |
| 2093 | } | 2093 | } |
| 2094 | #endif | 2094 | #endif |
| @@ -2373,16 +2373,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, | |||
| 2373 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 2373 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 2374 | return 0; | 2374 | return 0; |
| 2375 | 2375 | ||
| 2376 | printk("\n"); | 2376 | pr_warn("\n"); |
| 2377 | pr_warn("================================\n"); | 2377 | pr_warn("================================\n"); |
| 2378 | pr_warn("WARNING: inconsistent lock state\n"); | 2378 | pr_warn("WARNING: inconsistent lock state\n"); |
| 2379 | print_kernel_ident(); | 2379 | print_kernel_ident(); |
| 2380 | pr_warn("--------------------------------\n"); | 2380 | pr_warn("--------------------------------\n"); |
| 2381 | 2381 | ||
| 2382 | printk("inconsistent {%s} -> {%s} usage.\n", | 2382 | pr_warn("inconsistent {%s} -> {%s} usage.\n", |
| 2383 | usage_str[prev_bit], usage_str[new_bit]); | 2383 | usage_str[prev_bit], usage_str[new_bit]); |
| 2384 | 2384 | ||
| 2385 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", | 2385 | pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", |
| 2386 | curr->comm, task_pid_nr(curr), | 2386 | curr->comm, task_pid_nr(curr), |
| 2387 | trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, | 2387 | trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, |
| 2388 | trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, | 2388 | trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, |
| @@ -2390,16 +2390,16 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, | |||
| 2390 | trace_softirqs_enabled(curr)); | 2390 | trace_softirqs_enabled(curr)); |
| 2391 | print_lock(this); | 2391 | print_lock(this); |
| 2392 | 2392 | ||
| 2393 | printk("{%s} state was registered at:\n", usage_str[prev_bit]); | 2393 | pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); |
| 2394 | print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); | 2394 | print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); |
| 2395 | 2395 | ||
| 2396 | print_irqtrace_events(curr); | 2396 | print_irqtrace_events(curr); |
| 2397 | printk("\nother info that might help us debug this:\n"); | 2397 | pr_warn("\nother info that might help us debug this:\n"); |
| 2398 | print_usage_bug_scenario(this); | 2398 | print_usage_bug_scenario(this); |
| 2399 | 2399 | ||
| 2400 | lockdep_print_held_locks(curr); | 2400 | lockdep_print_held_locks(curr); |
| 2401 | 2401 | ||
| 2402 | printk("\nstack backtrace:\n"); | 2402 | pr_warn("\nstack backtrace:\n"); |
| 2403 | dump_stack(); | 2403 | dump_stack(); |
| 2404 | 2404 | ||
| 2405 | return 0; | 2405 | return 0; |
| @@ -2438,28 +2438,28 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
| 2438 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 2438 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 2439 | return 0; | 2439 | return 0; |
| 2440 | 2440 | ||
| 2441 | printk("\n"); | 2441 | pr_warn("\n"); |
| 2442 | pr_warn("========================================================\n"); | 2442 | pr_warn("========================================================\n"); |
| 2443 | pr_warn("WARNING: possible irq lock inversion dependency detected\n"); | 2443 | pr_warn("WARNING: possible irq lock inversion dependency detected\n"); |
| 2444 | print_kernel_ident(); | 2444 | print_kernel_ident(); |
| 2445 | pr_warn("--------------------------------------------------------\n"); | 2445 | pr_warn("--------------------------------------------------------\n"); |
| 2446 | printk("%s/%d just changed the state of lock:\n", | 2446 | pr_warn("%s/%d just changed the state of lock:\n", |
| 2447 | curr->comm, task_pid_nr(curr)); | 2447 | curr->comm, task_pid_nr(curr)); |
| 2448 | print_lock(this); | 2448 | print_lock(this); |
| 2449 | if (forwards) | 2449 | if (forwards) |
| 2450 | printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); | 2450 | pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass); |
| 2451 | else | 2451 | else |
| 2452 | printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); | 2452 | pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); |
| 2453 | print_lock_name(other->class); | 2453 | print_lock_name(other->class); |
| 2454 | printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); | 2454 | pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n"); |
| 2455 | 2455 | ||
| 2456 | printk("\nother info that might help us debug this:\n"); | 2456 | pr_warn("\nother info that might help us debug this:\n"); |
| 2457 | 2457 | ||
| 2458 | /* Find a middle lock (if one exists) */ | 2458 | /* Find a middle lock (if one exists) */ |
| 2459 | depth = get_lock_depth(other); | 2459 | depth = get_lock_depth(other); |
| 2460 | do { | 2460 | do { |
| 2461 | if (depth == 0 && (entry != root)) { | 2461 | if (depth == 0 && (entry != root)) { |
| 2462 | printk("lockdep:%s bad path found in chain graph\n", __func__); | 2462 | pr_warn("lockdep:%s bad path found in chain graph\n", __func__); |
| 2463 | break; | 2463 | break; |
| 2464 | } | 2464 | } |
| 2465 | middle = entry; | 2465 | middle = entry; |
| @@ -2475,12 +2475,12 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
| 2475 | 2475 | ||
| 2476 | lockdep_print_held_locks(curr); | 2476 | lockdep_print_held_locks(curr); |
| 2477 | 2477 | ||
| 2478 | printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); | 2478 | pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); |
| 2479 | if (!save_trace(&root->trace)) | 2479 | if (!save_trace(&root->trace)) |
| 2480 | return 0; | 2480 | return 0; |
| 2481 | print_shortest_lock_dependencies(other, root); | 2481 | print_shortest_lock_dependencies(other, root); |
| 2482 | 2482 | ||
| 2483 | printk("\nstack backtrace:\n"); | 2483 | pr_warn("\nstack backtrace:\n"); |
| 2484 | dump_stack(); | 2484 | dump_stack(); |
| 2485 | 2485 | ||
| 2486 | return 0; | 2486 | return 0; |
| @@ -3189,25 +3189,25 @@ print_lock_nested_lock_not_held(struct task_struct *curr, | |||
| 3189 | if (debug_locks_silent) | 3189 | if (debug_locks_silent) |
| 3190 | return 0; | 3190 | return 0; |
| 3191 | 3191 | ||
| 3192 | printk("\n"); | 3192 | pr_warn("\n"); |
| 3193 | pr_warn("==================================\n"); | 3193 | pr_warn("==================================\n"); |
| 3194 | pr_warn("WARNING: Nested lock was not taken\n"); | 3194 | pr_warn("WARNING: Nested lock was not taken\n"); |
| 3195 | print_kernel_ident(); | 3195 | print_kernel_ident(); |
| 3196 | pr_warn("----------------------------------\n"); | 3196 | pr_warn("----------------------------------\n"); |
| 3197 | 3197 | ||
| 3198 | printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); | 3198 | pr_warn("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); |
| 3199 | print_lock(hlock); | 3199 | print_lock(hlock); |
| 3200 | 3200 | ||
| 3201 | printk("\nbut this task is not holding:\n"); | 3201 | pr_warn("\nbut this task is not holding:\n"); |
| 3202 | printk("%s\n", hlock->nest_lock->name); | 3202 | pr_warn("%s\n", hlock->nest_lock->name); |
| 3203 | 3203 | ||
| 3204 | printk("\nstack backtrace:\n"); | 3204 | pr_warn("\nstack backtrace:\n"); |
| 3205 | dump_stack(); | 3205 | dump_stack(); |
| 3206 | 3206 | ||
| 3207 | printk("\nother info that might help us debug this:\n"); | 3207 | pr_warn("\nother info that might help us debug this:\n"); |
| 3208 | lockdep_print_held_locks(curr); | 3208 | lockdep_print_held_locks(curr); |
| 3209 | 3209 | ||
| 3210 | printk("\nstack backtrace:\n"); | 3210 | pr_warn("\nstack backtrace:\n"); |
| 3211 | dump_stack(); | 3211 | dump_stack(); |
| 3212 | 3212 | ||
| 3213 | return 0; | 3213 | return 0; |
| @@ -3402,21 +3402,21 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, | |||
| 3402 | if (debug_locks_silent) | 3402 | if (debug_locks_silent) |
| 3403 | return 0; | 3403 | return 0; |
| 3404 | 3404 | ||
| 3405 | printk("\n"); | 3405 | pr_warn("\n"); |
| 3406 | pr_warn("=====================================\n"); | 3406 | pr_warn("=====================================\n"); |
| 3407 | pr_warn("WARNING: bad unlock balance detected!\n"); | 3407 | pr_warn("WARNING: bad unlock balance detected!\n"); |
| 3408 | print_kernel_ident(); | 3408 | print_kernel_ident(); |
| 3409 | pr_warn("-------------------------------------\n"); | 3409 | pr_warn("-------------------------------------\n"); |
| 3410 | printk("%s/%d is trying to release lock (", | 3410 | pr_warn("%s/%d is trying to release lock (", |
| 3411 | curr->comm, task_pid_nr(curr)); | 3411 | curr->comm, task_pid_nr(curr)); |
| 3412 | print_lockdep_cache(lock); | 3412 | print_lockdep_cache(lock); |
| 3413 | printk(KERN_CONT ") at:\n"); | 3413 | pr_cont(") at:\n"); |
| 3414 | print_ip_sym(ip); | 3414 | print_ip_sym(ip); |
| 3415 | printk("but there are no more locks to release!\n"); | 3415 | pr_warn("but there are no more locks to release!\n"); |
| 3416 | printk("\nother info that might help us debug this:\n"); | 3416 | pr_warn("\nother info that might help us debug this:\n"); |
| 3417 | lockdep_print_held_locks(curr); | 3417 | lockdep_print_held_locks(curr); |
| 3418 | 3418 | ||
| 3419 | printk("\nstack backtrace:\n"); | 3419 | pr_warn("\nstack backtrace:\n"); |
| 3420 | dump_stack(); | 3420 | dump_stack(); |
| 3421 | 3421 | ||
| 3422 | return 0; | 3422 | return 0; |
| @@ -3974,21 +3974,21 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, | |||
| 3974 | if (debug_locks_silent) | 3974 | if (debug_locks_silent) |
| 3975 | return 0; | 3975 | return 0; |
| 3976 | 3976 | ||
| 3977 | printk("\n"); | 3977 | pr_warn("\n"); |
| 3978 | pr_warn("=================================\n"); | 3978 | pr_warn("=================================\n"); |
| 3979 | pr_warn("WARNING: bad contention detected!\n"); | 3979 | pr_warn("WARNING: bad contention detected!\n"); |
| 3980 | print_kernel_ident(); | 3980 | print_kernel_ident(); |
| 3981 | pr_warn("---------------------------------\n"); | 3981 | pr_warn("---------------------------------\n"); |
| 3982 | printk("%s/%d is trying to contend lock (", | 3982 | pr_warn("%s/%d is trying to contend lock (", |
| 3983 | curr->comm, task_pid_nr(curr)); | 3983 | curr->comm, task_pid_nr(curr)); |
| 3984 | print_lockdep_cache(lock); | 3984 | print_lockdep_cache(lock); |
| 3985 | printk(KERN_CONT ") at:\n"); | 3985 | pr_cont(") at:\n"); |
| 3986 | print_ip_sym(ip); | 3986 | print_ip_sym(ip); |
| 3987 | printk("but there are no locks held!\n"); | 3987 | pr_warn("but there are no locks held!\n"); |
| 3988 | printk("\nother info that might help us debug this:\n"); | 3988 | pr_warn("\nother info that might help us debug this:\n"); |
| 3989 | lockdep_print_held_locks(curr); | 3989 | lockdep_print_held_locks(curr); |
| 3990 | 3990 | ||
| 3991 | printk("\nstack backtrace:\n"); | 3991 | pr_warn("\nstack backtrace:\n"); |
| 3992 | dump_stack(); | 3992 | dump_stack(); |
| 3993 | 3993 | ||
| 3994 | return 0; | 3994 | return 0; |
| @@ -4318,17 +4318,17 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, | |||
| 4318 | if (debug_locks_silent) | 4318 | if (debug_locks_silent) |
| 4319 | return; | 4319 | return; |
| 4320 | 4320 | ||
| 4321 | printk("\n"); | 4321 | pr_warn("\n"); |
| 4322 | pr_warn("=========================\n"); | 4322 | pr_warn("=========================\n"); |
| 4323 | pr_warn("WARNING: held lock freed!\n"); | 4323 | pr_warn("WARNING: held lock freed!\n"); |
| 4324 | print_kernel_ident(); | 4324 | print_kernel_ident(); |
| 4325 | pr_warn("-------------------------\n"); | 4325 | pr_warn("-------------------------\n"); |
| 4326 | printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", | 4326 | pr_warn("%s/%d is freeing memory %p-%p, with a lock still held there!\n", |
| 4327 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); | 4327 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); |
| 4328 | print_lock(hlock); | 4328 | print_lock(hlock); |
| 4329 | lockdep_print_held_locks(curr); | 4329 | lockdep_print_held_locks(curr); |
| 4330 | 4330 | ||
| 4331 | printk("\nstack backtrace:\n"); | 4331 | pr_warn("\nstack backtrace:\n"); |
| 4332 | dump_stack(); | 4332 | dump_stack(); |
| 4333 | } | 4333 | } |
| 4334 | 4334 | ||
| @@ -4376,14 +4376,14 @@ static void print_held_locks_bug(void) | |||
| 4376 | if (debug_locks_silent) | 4376 | if (debug_locks_silent) |
| 4377 | return; | 4377 | return; |
| 4378 | 4378 | ||
| 4379 | printk("\n"); | 4379 | pr_warn("\n"); |
| 4380 | pr_warn("====================================\n"); | 4380 | pr_warn("====================================\n"); |
| 4381 | pr_warn("WARNING: %s/%d still has locks held!\n", | 4381 | pr_warn("WARNING: %s/%d still has locks held!\n", |
| 4382 | current->comm, task_pid_nr(current)); | 4382 | current->comm, task_pid_nr(current)); |
| 4383 | print_kernel_ident(); | 4383 | print_kernel_ident(); |
| 4384 | pr_warn("------------------------------------\n"); | 4384 | pr_warn("------------------------------------\n"); |
| 4385 | lockdep_print_held_locks(current); | 4385 | lockdep_print_held_locks(current); |
| 4386 | printk("\nstack backtrace:\n"); | 4386 | pr_warn("\nstack backtrace:\n"); |
| 4387 | dump_stack(); | 4387 | dump_stack(); |
| 4388 | } | 4388 | } |
| 4389 | 4389 | ||
| @@ -4402,10 +4402,10 @@ void debug_show_all_locks(void) | |||
| 4402 | int unlock = 1; | 4402 | int unlock = 1; |
| 4403 | 4403 | ||
| 4404 | if (unlikely(!debug_locks)) { | 4404 | if (unlikely(!debug_locks)) { |
| 4405 | printk("INFO: lockdep is turned off.\n"); | 4405 | pr_warn("INFO: lockdep is turned off.\n"); |
| 4406 | return; | 4406 | return; |
| 4407 | } | 4407 | } |
| 4408 | printk("\nShowing all locks held in the system:\n"); | 4408 | pr_warn("\nShowing all locks held in the system:\n"); |
| 4409 | 4409 | ||
| 4410 | /* | 4410 | /* |
| 4411 | * Here we try to get the tasklist_lock as hard as possible, | 4411 | * Here we try to get the tasklist_lock as hard as possible, |
| @@ -4416,18 +4416,18 @@ void debug_show_all_locks(void) | |||
| 4416 | retry: | 4416 | retry: |
| 4417 | if (!read_trylock(&tasklist_lock)) { | 4417 | if (!read_trylock(&tasklist_lock)) { |
| 4418 | if (count == 10) | 4418 | if (count == 10) |
| 4419 | printk("hm, tasklist_lock locked, retrying... "); | 4419 | pr_warn("hm, tasklist_lock locked, retrying... "); |
| 4420 | if (count) { | 4420 | if (count) { |
| 4421 | count--; | 4421 | count--; |
| 4422 | printk(" #%d", 10-count); | 4422 | pr_cont(" #%d", 10-count); |
| 4423 | mdelay(200); | 4423 | mdelay(200); |
| 4424 | goto retry; | 4424 | goto retry; |
| 4425 | } | 4425 | } |
| 4426 | printk(" ignoring it.\n"); | 4426 | pr_cont(" ignoring it.\n"); |
| 4427 | unlock = 0; | 4427 | unlock = 0; |
| 4428 | } else { | 4428 | } else { |
| 4429 | if (count != 10) | 4429 | if (count != 10) |
| 4430 | printk(KERN_CONT " locked it.\n"); | 4430 | pr_cont(" locked it.\n"); |
| 4431 | } | 4431 | } |
| 4432 | 4432 | ||
| 4433 | do_each_thread(g, p) { | 4433 | do_each_thread(g, p) { |
| @@ -4445,7 +4445,7 @@ retry: | |||
| 4445 | unlock = 1; | 4445 | unlock = 1; |
| 4446 | } while_each_thread(g, p); | 4446 | } while_each_thread(g, p); |
| 4447 | 4447 | ||
| 4448 | printk("\n"); | 4448 | pr_warn("\n"); |
| 4449 | pr_warn("=============================================\n\n"); | 4449 | pr_warn("=============================================\n\n"); |
| 4450 | 4450 | ||
| 4451 | if (unlock) | 4451 | if (unlock) |
| @@ -4475,12 +4475,12 @@ asmlinkage __visible void lockdep_sys_exit(void) | |||
| 4475 | if (unlikely(curr->lockdep_depth)) { | 4475 | if (unlikely(curr->lockdep_depth)) { |
| 4476 | if (!debug_locks_off()) | 4476 | if (!debug_locks_off()) |
| 4477 | return; | 4477 | return; |
| 4478 | printk("\n"); | 4478 | pr_warn("\n"); |
| 4479 | pr_warn("================================================\n"); | 4479 | pr_warn("================================================\n"); |
| 4480 | pr_warn("WARNING: lock held when returning to user space!\n"); | 4480 | pr_warn("WARNING: lock held when returning to user space!\n"); |
| 4481 | print_kernel_ident(); | 4481 | print_kernel_ident(); |
| 4482 | pr_warn("------------------------------------------------\n"); | 4482 | pr_warn("------------------------------------------------\n"); |
| 4483 | printk("%s/%d is leaving the kernel with locks still held!\n", | 4483 | pr_warn("%s/%d is leaving the kernel with locks still held!\n", |
| 4484 | curr->comm, curr->pid); | 4484 | curr->comm, curr->pid); |
| 4485 | lockdep_print_held_locks(curr); | 4485 | lockdep_print_held_locks(curr); |
| 4486 | } | 4486 | } |
| @@ -4490,19 +4490,15 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) | |||
| 4490 | { | 4490 | { |
| 4491 | struct task_struct *curr = current; | 4491 | struct task_struct *curr = current; |
| 4492 | 4492 | ||
| 4493 | #ifndef CONFIG_PROVE_RCU_REPEATEDLY | ||
| 4494 | if (!debug_locks_off()) | ||
| 4495 | return; | ||
| 4496 | #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ | ||
| 4497 | /* Note: the following can be executed concurrently, so be careful. */ | 4493 | /* Note: the following can be executed concurrently, so be careful. */ |
| 4498 | printk("\n"); | 4494 | pr_warn("\n"); |
| 4499 | pr_warn("=============================\n"); | 4495 | pr_warn("=============================\n"); |
| 4500 | pr_warn("WARNING: suspicious RCU usage\n"); | 4496 | pr_warn("WARNING: suspicious RCU usage\n"); |
| 4501 | print_kernel_ident(); | 4497 | print_kernel_ident(); |
| 4502 | pr_warn("-----------------------------\n"); | 4498 | pr_warn("-----------------------------\n"); |
| 4503 | printk("%s:%d %s!\n", file, line, s); | 4499 | pr_warn("%s:%d %s!\n", file, line, s); |
| 4504 | printk("\nother info that might help us debug this:\n\n"); | 4500 | pr_warn("\nother info that might help us debug this:\n\n"); |
| 4505 | printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", | 4501 | pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n", |
| 4506 | !rcu_lockdep_current_cpu_online() | 4502 | !rcu_lockdep_current_cpu_online() |
| 4507 | ? "RCU used illegally from offline CPU!\n" | 4503 | ? "RCU used illegally from offline CPU!\n" |
| 4508 | : !rcu_is_watching() | 4504 | : !rcu_is_watching() |
| @@ -4529,10 +4525,10 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) | |||
| 4529 | * rcu_read_lock_bh() and so on from extended quiescent states. | 4525 | * rcu_read_lock_bh() and so on from extended quiescent states. |
| 4530 | */ | 4526 | */ |
| 4531 | if (!rcu_is_watching()) | 4527 | if (!rcu_is_watching()) |
| 4532 | printk("RCU used illegally from extended quiescent state!\n"); | 4528 | pr_warn("RCU used illegally from extended quiescent state!\n"); |
| 4533 | 4529 | ||
| 4534 | lockdep_print_held_locks(curr); | 4530 | lockdep_print_held_locks(curr); |
| 4535 | printk("\nstack backtrace:\n"); | 4531 | pr_warn("\nstack backtrace:\n"); |
| 4536 | dump_stack(); | 4532 | dump_stack(); |
| 4537 | } | 4533 | } |
| 4538 | EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); | 4534 | EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); |
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 198527a62149..858a07590e39 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c | |||
| @@ -227,9 +227,9 @@ static void __sched __mutex_lock_slowpath(struct mutex *lock); | |||
| 227 | * (or statically defined) before it can be locked. memset()-ing | 227 | * (or statically defined) before it can be locked. memset()-ing |
| 228 | * the mutex to 0 is not allowed. | 228 | * the mutex to 0 is not allowed. |
| 229 | * | 229 | * |
| 230 | * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging | 230 | * (The CONFIG_DEBUG_MUTEXES .config option turns on debugging |
| 231 | * checks that will enforce the restrictions and will also do | 231 | * checks that will enforce the restrictions and will also do |
| 232 | * deadlock debugging. ) | 232 | * deadlock debugging) |
| 233 | * | 233 | * |
| 234 | * This function is similar to (but not equivalent to) down(). | 234 | * This function is similar to (but not equivalent to) down(). |
| 235 | */ | 235 | */ |
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index cc3ed0ccdfa2..2655f26ec882 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/cpumask.h> | 20 | #include <linux/cpumask.h> |
| 21 | #include <linux/percpu.h> | 21 | #include <linux/percpu.h> |
| 22 | #include <linux/hardirq.h> | 22 | #include <linux/hardirq.h> |
| 23 | #include <linux/spinlock.h> | ||
| 23 | #include <asm/qrwlock.h> | 24 | #include <asm/qrwlock.h> |
| 24 | 25 | ||
| 25 | /* | 26 | /* |
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index b2caec7315af..fd24153e8a48 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c | |||
| @@ -28,6 +28,7 @@ | |||
| 28 | #include <linux/percpu.h> | 28 | #include <linux/percpu.h> |
| 29 | #include <linux/hardirq.h> | 29 | #include <linux/hardirq.h> |
| 30 | #include <linux/mutex.h> | 30 | #include <linux/mutex.h> |
| 31 | #include <linux/prefetch.h> | ||
| 31 | #include <asm/byteorder.h> | 32 | #include <asm/byteorder.h> |
| 32 | #include <asm/qspinlock.h> | 33 | #include <asm/qspinlock.h> |
| 33 | 34 | ||
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index e6b2f7ad3e51..4ccfcaae5b89 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h | |||
| @@ -193,7 +193,8 @@ void __init __pv_init_lock_hash(void) | |||
| 193 | */ | 193 | */ |
| 194 | pv_lock_hash = alloc_large_system_hash("PV qspinlock", | 194 | pv_lock_hash = alloc_large_system_hash("PV qspinlock", |
| 195 | sizeof(struct pv_hash_entry), | 195 | sizeof(struct pv_hash_entry), |
| 196 | pv_hash_size, 0, HASH_EARLY, | 196 | pv_hash_size, 0, |
| 197 | HASH_EARLY | HASH_ZERO, | ||
| 197 | &pv_lock_hash_bits, NULL, | 198 | &pv_lock_hash_bits, NULL, |
| 198 | pv_hash_size, pv_hash_size); | 199 | pv_hash_size, pv_hash_size); |
| 199 | } | 200 | } |
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 58e366ad36f4..ac35e648b0e5 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c | |||
| @@ -166,12 +166,16 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) | |||
| 166 | memset(waiter, 0x22, sizeof(*waiter)); | 166 | memset(waiter, 0x22, sizeof(*waiter)); |
| 167 | } | 167 | } |
| 168 | 168 | ||
| 169 | void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) | 169 | void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key) |
| 170 | { | 170 | { |
| 171 | /* | 171 | /* |
| 172 | * Make sure we are not reinitializing a held lock: | 172 | * Make sure we are not reinitializing a held lock: |
| 173 | */ | 173 | */ |
| 174 | debug_check_no_locks_freed((void *)lock, sizeof(*lock)); | 174 | debug_check_no_locks_freed((void *)lock, sizeof(*lock)); |
| 175 | lock->name = name; | 175 | lock->name = name; |
| 176 | |||
| 177 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 178 | lockdep_init_map(&lock->dep_map, name, key, 0); | ||
| 179 | #endif | ||
| 176 | } | 180 | } |
| 177 | 181 | ||
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index b585af9a1b50..5078c6ddf4a5 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h | |||
| @@ -11,7 +11,7 @@ | |||
| 11 | 11 | ||
| 12 | extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); | 12 | extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); |
| 13 | extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); | 13 | extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); |
| 14 | extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); | 14 | extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key); |
| 15 | extern void debug_rt_mutex_lock(struct rt_mutex *lock); | 15 | extern void debug_rt_mutex_lock(struct rt_mutex *lock); |
| 16 | extern void debug_rt_mutex_unlock(struct rt_mutex *lock); | 16 | extern void debug_rt_mutex_unlock(struct rt_mutex *lock); |
| 17 | extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, | 17 | extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, |
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index b95509416909..649dc9d3951a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
| @@ -963,7 +963,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
| 963 | return -EDEADLK; | 963 | return -EDEADLK; |
| 964 | 964 | ||
| 965 | raw_spin_lock(&task->pi_lock); | 965 | raw_spin_lock(&task->pi_lock); |
| 966 | rt_mutex_adjust_prio(task); | ||
| 967 | waiter->task = task; | 966 | waiter->task = task; |
| 968 | waiter->lock = lock; | 967 | waiter->lock = lock; |
| 969 | waiter->prio = task->prio; | 968 | waiter->prio = task->prio; |
| @@ -1481,6 +1480,7 @@ void __sched rt_mutex_lock(struct rt_mutex *lock) | |||
| 1481 | { | 1480 | { |
| 1482 | might_sleep(); | 1481 | might_sleep(); |
| 1483 | 1482 | ||
| 1483 | mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); | ||
| 1484 | rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); | 1484 | rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); |
| 1485 | } | 1485 | } |
| 1486 | EXPORT_SYMBOL_GPL(rt_mutex_lock); | 1486 | EXPORT_SYMBOL_GPL(rt_mutex_lock); |
| @@ -1496,9 +1496,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock); | |||
| 1496 | */ | 1496 | */ |
| 1497 | int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) | 1497 | int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) |
| 1498 | { | 1498 | { |
| 1499 | int ret; | ||
| 1500 | |||
| 1499 | might_sleep(); | 1501 | might_sleep(); |
| 1500 | 1502 | ||
| 1501 | return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); | 1503 | mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
| 1504 | ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); | ||
| 1505 | if (ret) | ||
| 1506 | mutex_release(&lock->dep_map, 1, _RET_IP_); | ||
| 1507 | |||
| 1508 | return ret; | ||
| 1502 | } | 1509 | } |
| 1503 | EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); | 1510 | EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); |
| 1504 | 1511 | ||
| @@ -1526,11 +1533,18 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) | |||
| 1526 | int | 1533 | int |
| 1527 | rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) | 1534 | rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) |
| 1528 | { | 1535 | { |
| 1536 | int ret; | ||
| 1537 | |||
| 1529 | might_sleep(); | 1538 | might_sleep(); |
| 1530 | 1539 | ||
| 1531 | return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, | 1540 | mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
| 1541 | ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, | ||
| 1532 | RT_MUTEX_MIN_CHAINWALK, | 1542 | RT_MUTEX_MIN_CHAINWALK, |
| 1533 | rt_mutex_slowlock); | 1543 | rt_mutex_slowlock); |
| 1544 | if (ret) | ||
| 1545 | mutex_release(&lock->dep_map, 1, _RET_IP_); | ||
| 1546 | |||
| 1547 | return ret; | ||
| 1534 | } | 1548 | } |
| 1535 | EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); | 1549 | EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); |
| 1536 | 1550 | ||
| @@ -1547,10 +1561,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); | |||
| 1547 | */ | 1561 | */ |
| 1548 | int __sched rt_mutex_trylock(struct rt_mutex *lock) | 1562 | int __sched rt_mutex_trylock(struct rt_mutex *lock) |
| 1549 | { | 1563 | { |
| 1564 | int ret; | ||
| 1565 | |||
| 1550 | if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) | 1566 | if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) |
| 1551 | return 0; | 1567 | return 0; |
| 1552 | 1568 | ||
| 1553 | return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); | 1569 | ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); |
| 1570 | if (ret) | ||
| 1571 | mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); | ||
| 1572 | |||
| 1573 | return ret; | ||
| 1554 | } | 1574 | } |
| 1555 | EXPORT_SYMBOL_GPL(rt_mutex_trylock); | 1575 | EXPORT_SYMBOL_GPL(rt_mutex_trylock); |
| 1556 | 1576 | ||
| @@ -1561,6 +1581,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock); | |||
| 1561 | */ | 1581 | */ |
| 1562 | void __sched rt_mutex_unlock(struct rt_mutex *lock) | 1582 | void __sched rt_mutex_unlock(struct rt_mutex *lock) |
| 1563 | { | 1583 | { |
| 1584 | mutex_release(&lock->dep_map, 1, _RET_IP_); | ||
| 1564 | rt_mutex_fastunlock(lock, rt_mutex_slowunlock); | 1585 | rt_mutex_fastunlock(lock, rt_mutex_slowunlock); |
| 1565 | } | 1586 | } |
| 1566 | EXPORT_SYMBOL_GPL(rt_mutex_unlock); | 1587 | EXPORT_SYMBOL_GPL(rt_mutex_unlock); |
| @@ -1620,7 +1641,6 @@ void rt_mutex_destroy(struct rt_mutex *lock) | |||
| 1620 | lock->magic = NULL; | 1641 | lock->magic = NULL; |
| 1621 | #endif | 1642 | #endif |
| 1622 | } | 1643 | } |
| 1623 | |||
| 1624 | EXPORT_SYMBOL_GPL(rt_mutex_destroy); | 1644 | EXPORT_SYMBOL_GPL(rt_mutex_destroy); |
| 1625 | 1645 | ||
| 1626 | /** | 1646 | /** |
| @@ -1632,14 +1652,16 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy); | |||
| 1632 | * | 1652 | * |
| 1633 | * Initializing of a locked rt lock is not allowed | 1653 | * Initializing of a locked rt lock is not allowed |
| 1634 | */ | 1654 | */ |
| 1635 | void __rt_mutex_init(struct rt_mutex *lock, const char *name) | 1655 | void __rt_mutex_init(struct rt_mutex *lock, const char *name, |
| 1656 | struct lock_class_key *key) | ||
| 1636 | { | 1657 | { |
| 1637 | lock->owner = NULL; | 1658 | lock->owner = NULL; |
| 1638 | raw_spin_lock_init(&lock->wait_lock); | 1659 | raw_spin_lock_init(&lock->wait_lock); |
| 1639 | lock->waiters = RB_ROOT; | 1660 | lock->waiters = RB_ROOT; |
| 1640 | lock->waiters_leftmost = NULL; | 1661 | lock->waiters_leftmost = NULL; |
| 1641 | 1662 | ||
| 1642 | debug_rt_mutex_init(lock, name); | 1663 | if (name && key) |
| 1664 | debug_rt_mutex_init(lock, name, key); | ||
| 1643 | } | 1665 | } |
| 1644 | EXPORT_SYMBOL_GPL(__rt_mutex_init); | 1666 | EXPORT_SYMBOL_GPL(__rt_mutex_init); |
| 1645 | 1667 | ||
| @@ -1660,7 +1682,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); | |||
| 1660 | void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | 1682 | void rt_mutex_init_proxy_locked(struct rt_mutex *lock, |
| 1661 | struct task_struct *proxy_owner) | 1683 | struct task_struct *proxy_owner) |
| 1662 | { | 1684 | { |
| 1663 | __rt_mutex_init(lock, NULL); | 1685 | __rt_mutex_init(lock, NULL, NULL); |
| 1664 | debug_rt_mutex_proxy_lock(lock, proxy_owner); | 1686 | debug_rt_mutex_proxy_lock(lock, proxy_owner); |
| 1665 | rt_mutex_set_owner(lock, proxy_owner); | 1687 | rt_mutex_set_owner(lock, proxy_owner); |
| 1666 | } | 1688 | } |
| @@ -1785,12 +1807,14 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, | |||
| 1785 | int ret; | 1807 | int ret; |
| 1786 | 1808 | ||
| 1787 | raw_spin_lock_irq(&lock->wait_lock); | 1809 | raw_spin_lock_irq(&lock->wait_lock); |
| 1788 | |||
| 1789 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 1790 | |||
| 1791 | /* sleep on the mutex */ | 1810 | /* sleep on the mutex */ |
| 1811 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 1792 | ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); | 1812 | ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); |
| 1793 | 1813 | /* | |
| 1814 | * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might | ||
| 1815 | * have to fix that up. | ||
| 1816 | */ | ||
| 1817 | fixup_rt_mutex_waiters(lock); | ||
| 1794 | raw_spin_unlock_irq(&lock->wait_lock); | 1818 | raw_spin_unlock_irq(&lock->wait_lock); |
| 1795 | 1819 | ||
| 1796 | return ret; | 1820 | return ret; |
| @@ -1822,15 +1846,25 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, | |||
| 1822 | 1846 | ||
| 1823 | raw_spin_lock_irq(&lock->wait_lock); | 1847 | raw_spin_lock_irq(&lock->wait_lock); |
| 1824 | /* | 1848 | /* |
| 1849 | * Do an unconditional try-lock, this deals with the lock stealing | ||
| 1850 | * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter() | ||
| 1851 | * sets a NULL owner. | ||
| 1852 | * | ||
| 1853 | * We're not interested in the return value, because the subsequent | ||
| 1854 | * test on rt_mutex_owner() will infer that. If the trylock succeeded, | ||
| 1855 | * we will own the lock and it will have removed the waiter. If we | ||
| 1856 | * failed the trylock, we're still not owner and we need to remove | ||
| 1857 | * ourselves. | ||
| 1858 | */ | ||
| 1859 | try_to_take_rt_mutex(lock, current, waiter); | ||
| 1860 | /* | ||
| 1825 | * Unless we're the owner; we're still enqueued on the wait_list. | 1861 | * Unless we're the owner; we're still enqueued on the wait_list. |
| 1826 | * So check if we became owner, if not, take us off the wait_list. | 1862 | * So check if we became owner, if not, take us off the wait_list. |
| 1827 | */ | 1863 | */ |
| 1828 | if (rt_mutex_owner(lock) != current) { | 1864 | if (rt_mutex_owner(lock) != current) { |
| 1829 | remove_waiter(lock, waiter); | 1865 | remove_waiter(lock, waiter); |
| 1830 | fixup_rt_mutex_waiters(lock); | ||
| 1831 | cleanup = true; | 1866 | cleanup = true; |
| 1832 | } | 1867 | } |
| 1833 | |||
| 1834 | /* | 1868 | /* |
| 1835 | * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might | 1869 | * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might |
| 1836 | * have to fix that up. | 1870 | * have to fix that up. |
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index 6607802efa8b..5c253caffe91 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h | |||
| @@ -17,7 +17,7 @@ | |||
| 17 | #define debug_rt_mutex_proxy_lock(l,p) do { } while (0) | 17 | #define debug_rt_mutex_proxy_lock(l,p) do { } while (0) |
| 18 | #define debug_rt_mutex_proxy_unlock(l) do { } while (0) | 18 | #define debug_rt_mutex_proxy_unlock(l) do { } while (0) |
| 19 | #define debug_rt_mutex_unlock(l) do { } while (0) | 19 | #define debug_rt_mutex_unlock(l) do { } while (0) |
| 20 | #define debug_rt_mutex_init(m, n) do { } while (0) | 20 | #define debug_rt_mutex_init(m, n, k) do { } while (0) |
| 21 | #define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) | 21 | #define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) |
| 22 | #define debug_rt_mutex_print_deadlock(w) do { } while (0) | 22 | #define debug_rt_mutex_print_deadlock(w) do { } while (0) |
| 23 | #define debug_rt_mutex_reset_waiter(w) do { } while (0) | 23 | #define debug_rt_mutex_reset_waiter(w) do { } while (0) |
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index c65f7989f850..20819df98125 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c | |||
| @@ -231,8 +231,8 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state) | |||
| 231 | 231 | ||
| 232 | out_nolock: | 232 | out_nolock: |
| 233 | list_del(&waiter.list); | 233 | list_del(&waiter.list); |
| 234 | if (!list_empty(&sem->wait_list)) | 234 | if (!list_empty(&sem->wait_list) && sem->count >= 0) |
| 235 | __rwsem_do_wake(sem, 1); | 235 | __rwsem_do_wake(sem, 0); |
| 236 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 236 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
| 237 | 237 | ||
| 238 | return -EINTR; | 238 | return -EINTR; |
diff --git a/kernel/memremap.c b/kernel/memremap.c index 23a6483c3666..124bed776532 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c | |||
| @@ -358,7 +358,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
| 358 | goto err_pfn_remap; | 358 | goto err_pfn_remap; |
| 359 | 359 | ||
| 360 | mem_hotplug_begin(); | 360 | mem_hotplug_begin(); |
| 361 | error = arch_add_memory(nid, align_start, align_size, true); | 361 | error = arch_add_memory(nid, align_start, align_size, false); |
| 362 | if (!error) | ||
| 363 | move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], | ||
| 364 | align_start >> PAGE_SHIFT, | ||
| 365 | align_size >> PAGE_SHIFT); | ||
| 362 | mem_hotplug_done(); | 366 | mem_hotplug_done(); |
| 363 | if (error) | 367 | if (error) |
| 364 | goto err_add_memory; | 368 | goto err_add_memory; |
diff --git a/kernel/module.c b/kernel/module.c index 4a3665f8f837..40f983cbea81 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -49,9 +49,7 @@ | |||
| 49 | #include <linux/rculist.h> | 49 | #include <linux/rculist.h> |
| 50 | #include <linux/uaccess.h> | 50 | #include <linux/uaccess.h> |
| 51 | #include <asm/cacheflush.h> | 51 | #include <asm/cacheflush.h> |
| 52 | #ifdef CONFIG_STRICT_MODULE_RWX | 52 | #include <linux/set_memory.h> |
| 53 | #include <asm/set_memory.h> | ||
| 54 | #endif | ||
| 55 | #include <asm/mmu_context.h> | 53 | #include <asm/mmu_context.h> |
| 56 | #include <linux/license.h> | 54 | #include <linux/license.h> |
| 57 | #include <asm/sections.h> | 55 | #include <asm/sections.h> |
| @@ -302,6 +300,7 @@ int unregister_module_notifier(struct notifier_block *nb) | |||
| 302 | EXPORT_SYMBOL(unregister_module_notifier); | 300 | EXPORT_SYMBOL(unregister_module_notifier); |
| 303 | 301 | ||
| 304 | struct load_info { | 302 | struct load_info { |
| 303 | const char *name; | ||
| 305 | Elf_Ehdr *hdr; | 304 | Elf_Ehdr *hdr; |
| 306 | unsigned long len; | 305 | unsigned long len; |
| 307 | Elf_Shdr *sechdrs; | 306 | Elf_Shdr *sechdrs; |
| @@ -602,7 +601,7 @@ static struct module *find_module_all(const char *name, size_t len, | |||
| 602 | 601 | ||
| 603 | module_assert_mutex_or_preempt(); | 602 | module_assert_mutex_or_preempt(); |
| 604 | 603 | ||
| 605 | list_for_each_entry(mod, &modules, list) { | 604 | list_for_each_entry_rcu(mod, &modules, list) { |
| 606 | if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) | 605 | if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) |
| 607 | continue; | 606 | continue; |
| 608 | if (strlen(mod->name) == len && !memcmp(mod->name, name, len)) | 607 | if (strlen(mod->name) == len && !memcmp(mod->name, name, len)) |
| @@ -1202,10 +1201,7 @@ static ssize_t store_uevent(struct module_attribute *mattr, | |||
| 1202 | struct module_kobject *mk, | 1201 | struct module_kobject *mk, |
| 1203 | const char *buffer, size_t count) | 1202 | const char *buffer, size_t count) |
| 1204 | { | 1203 | { |
| 1205 | enum kobject_action action; | 1204 | kobject_synth_uevent(&mk->kobj, buffer, count); |
| 1206 | |||
| 1207 | if (kobject_action_type(buffer, count, &action) == 0) | ||
| 1208 | kobject_uevent(&mk->kobj, action); | ||
| 1209 | return count; | 1205 | return count; |
| 1210 | } | 1206 | } |
| 1211 | 1207 | ||
| @@ -1278,12 +1274,13 @@ static u32 resolve_rel_crc(const s32 *crc) | |||
| 1278 | return *(u32 *)((void *)crc + *crc); | 1274 | return *(u32 *)((void *)crc + *crc); |
| 1279 | } | 1275 | } |
| 1280 | 1276 | ||
| 1281 | static int check_version(Elf_Shdr *sechdrs, | 1277 | static int check_version(const struct load_info *info, |
| 1282 | unsigned int versindex, | ||
| 1283 | const char *symname, | 1278 | const char *symname, |
| 1284 | struct module *mod, | 1279 | struct module *mod, |
| 1285 | const s32 *crc) | 1280 | const s32 *crc) |
| 1286 | { | 1281 | { |
| 1282 | Elf_Shdr *sechdrs = info->sechdrs; | ||
| 1283 | unsigned int versindex = info->index.vers; | ||
| 1287 | unsigned int i, num_versions; | 1284 | unsigned int i, num_versions; |
| 1288 | struct modversion_info *versions; | 1285 | struct modversion_info *versions; |
| 1289 | 1286 | ||
| @@ -1317,17 +1314,16 @@ static int check_version(Elf_Shdr *sechdrs, | |||
| 1317 | } | 1314 | } |
| 1318 | 1315 | ||
| 1319 | /* Broken toolchain. Warn once, then let it go.. */ | 1316 | /* Broken toolchain. Warn once, then let it go.. */ |
| 1320 | pr_warn_once("%s: no symbol version for %s\n", mod->name, symname); | 1317 | pr_warn_once("%s: no symbol version for %s\n", info->name, symname); |
| 1321 | return 1; | 1318 | return 1; |
| 1322 | 1319 | ||
| 1323 | bad_version: | 1320 | bad_version: |
| 1324 | pr_warn("%s: disagrees about version of symbol %s\n", | 1321 | pr_warn("%s: disagrees about version of symbol %s\n", |
| 1325 | mod->name, symname); | 1322 | info->name, symname); |
| 1326 | return 0; | 1323 | return 0; |
| 1327 | } | 1324 | } |
| 1328 | 1325 | ||
| 1329 | static inline int check_modstruct_version(Elf_Shdr *sechdrs, | 1326 | static inline int check_modstruct_version(const struct load_info *info, |
| 1330 | unsigned int versindex, | ||
| 1331 | struct module *mod) | 1327 | struct module *mod) |
| 1332 | { | 1328 | { |
| 1333 | const s32 *crc; | 1329 | const s32 *crc; |
| @@ -1343,8 +1339,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, | |||
| 1343 | BUG(); | 1339 | BUG(); |
| 1344 | } | 1340 | } |
| 1345 | preempt_enable(); | 1341 | preempt_enable(); |
| 1346 | return check_version(sechdrs, versindex, | 1342 | return check_version(info, VMLINUX_SYMBOL_STR(module_layout), |
| 1347 | VMLINUX_SYMBOL_STR(module_layout), mod, crc); | 1343 | mod, crc); |
| 1348 | } | 1344 | } |
| 1349 | 1345 | ||
| 1350 | /* First part is kernel version, which we ignore if module has crcs. */ | 1346 | /* First part is kernel version, which we ignore if module has crcs. */ |
| @@ -1358,8 +1354,7 @@ static inline int same_magic(const char *amagic, const char *bmagic, | |||
| 1358 | return strcmp(amagic, bmagic) == 0; | 1354 | return strcmp(amagic, bmagic) == 0; |
| 1359 | } | 1355 | } |
| 1360 | #else | 1356 | #else |
| 1361 | static inline int check_version(Elf_Shdr *sechdrs, | 1357 | static inline int check_version(const struct load_info *info, |
| 1362 | unsigned int versindex, | ||
| 1363 | const char *symname, | 1358 | const char *symname, |
| 1364 | struct module *mod, | 1359 | struct module *mod, |
| 1365 | const s32 *crc) | 1360 | const s32 *crc) |
| @@ -1367,8 +1362,7 @@ static inline int check_version(Elf_Shdr *sechdrs, | |||
| 1367 | return 1; | 1362 | return 1; |
| 1368 | } | 1363 | } |
| 1369 | 1364 | ||
| 1370 | static inline int check_modstruct_version(Elf_Shdr *sechdrs, | 1365 | static inline int check_modstruct_version(const struct load_info *info, |
| 1371 | unsigned int versindex, | ||
| 1372 | struct module *mod) | 1366 | struct module *mod) |
| 1373 | { | 1367 | { |
| 1374 | return 1; | 1368 | return 1; |
| @@ -1404,7 +1398,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, | |||
| 1404 | if (!sym) | 1398 | if (!sym) |
| 1405 | goto unlock; | 1399 | goto unlock; |
| 1406 | 1400 | ||
| 1407 | if (!check_version(info->sechdrs, info->index.vers, name, mod, crc)) { | 1401 | if (!check_version(info, name, mod, crc)) { |
| 1408 | sym = ERR_PTR(-EINVAL); | 1402 | sym = ERR_PTR(-EINVAL); |
| 1409 | goto getname; | 1403 | goto getname; |
| 1410 | } | 1404 | } |
| @@ -1667,31 +1661,36 @@ static inline void remove_notes_attrs(struct module *mod) | |||
| 1667 | } | 1661 | } |
| 1668 | #endif /* CONFIG_KALLSYMS */ | 1662 | #endif /* CONFIG_KALLSYMS */ |
| 1669 | 1663 | ||
| 1670 | static void add_usage_links(struct module *mod) | 1664 | static void del_usage_links(struct module *mod) |
| 1671 | { | 1665 | { |
| 1672 | #ifdef CONFIG_MODULE_UNLOAD | 1666 | #ifdef CONFIG_MODULE_UNLOAD |
| 1673 | struct module_use *use; | 1667 | struct module_use *use; |
| 1674 | int nowarn; | ||
| 1675 | 1668 | ||
| 1676 | mutex_lock(&module_mutex); | 1669 | mutex_lock(&module_mutex); |
| 1677 | list_for_each_entry(use, &mod->target_list, target_list) { | 1670 | list_for_each_entry(use, &mod->target_list, target_list) |
| 1678 | nowarn = sysfs_create_link(use->target->holders_dir, | 1671 | sysfs_remove_link(use->target->holders_dir, mod->name); |
| 1679 | &mod->mkobj.kobj, mod->name); | ||
| 1680 | } | ||
| 1681 | mutex_unlock(&module_mutex); | 1672 | mutex_unlock(&module_mutex); |
| 1682 | #endif | 1673 | #endif |
| 1683 | } | 1674 | } |
| 1684 | 1675 | ||
| 1685 | static void del_usage_links(struct module *mod) | 1676 | static int add_usage_links(struct module *mod) |
| 1686 | { | 1677 | { |
| 1678 | int ret = 0; | ||
| 1687 | #ifdef CONFIG_MODULE_UNLOAD | 1679 | #ifdef CONFIG_MODULE_UNLOAD |
| 1688 | struct module_use *use; | 1680 | struct module_use *use; |
| 1689 | 1681 | ||
| 1690 | mutex_lock(&module_mutex); | 1682 | mutex_lock(&module_mutex); |
| 1691 | list_for_each_entry(use, &mod->target_list, target_list) | 1683 | list_for_each_entry(use, &mod->target_list, target_list) { |
| 1692 | sysfs_remove_link(use->target->holders_dir, mod->name); | 1684 | ret = sysfs_create_link(use->target->holders_dir, |
| 1685 | &mod->mkobj.kobj, mod->name); | ||
| 1686 | if (ret) | ||
| 1687 | break; | ||
| 1688 | } | ||
| 1693 | mutex_unlock(&module_mutex); | 1689 | mutex_unlock(&module_mutex); |
| 1690 | if (ret) | ||
| 1691 | del_usage_links(mod); | ||
| 1694 | #endif | 1692 | #endif |
| 1693 | return ret; | ||
| 1695 | } | 1694 | } |
| 1696 | 1695 | ||
| 1697 | static int module_add_modinfo_attrs(struct module *mod) | 1696 | static int module_add_modinfo_attrs(struct module *mod) |
| @@ -1802,13 +1801,18 @@ static int mod_sysfs_setup(struct module *mod, | |||
| 1802 | if (err) | 1801 | if (err) |
| 1803 | goto out_unreg_param; | 1802 | goto out_unreg_param; |
| 1804 | 1803 | ||
| 1805 | add_usage_links(mod); | 1804 | err = add_usage_links(mod); |
| 1805 | if (err) | ||
| 1806 | goto out_unreg_modinfo_attrs; | ||
| 1807 | |||
| 1806 | add_sect_attrs(mod, info); | 1808 | add_sect_attrs(mod, info); |
| 1807 | add_notes_attrs(mod, info); | 1809 | add_notes_attrs(mod, info); |
| 1808 | 1810 | ||
| 1809 | kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); | 1811 | kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); |
| 1810 | return 0; | 1812 | return 0; |
| 1811 | 1813 | ||
| 1814 | out_unreg_modinfo_attrs: | ||
| 1815 | module_remove_modinfo_attrs(mod); | ||
| 1812 | out_unreg_param: | 1816 | out_unreg_param: |
| 1813 | module_param_sysfs_remove(mod); | 1817 | module_param_sysfs_remove(mod); |
| 1814 | out_unreg_holders: | 1818 | out_unreg_holders: |
| @@ -2915,9 +2919,15 @@ static int rewrite_section_headers(struct load_info *info, int flags) | |||
| 2915 | info->index.vers = 0; /* Pretend no __versions section! */ | 2919 | info->index.vers = 0; /* Pretend no __versions section! */ |
| 2916 | else | 2920 | else |
| 2917 | info->index.vers = find_sec(info, "__versions"); | 2921 | info->index.vers = find_sec(info, "__versions"); |
| 2922 | info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; | ||
| 2923 | |||
| 2918 | info->index.info = find_sec(info, ".modinfo"); | 2924 | info->index.info = find_sec(info, ".modinfo"); |
| 2925 | if (!info->index.info) | ||
| 2926 | info->name = "(missing .modinfo section)"; | ||
| 2927 | else | ||
| 2928 | info->name = get_modinfo(info, "name"); | ||
| 2919 | info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; | 2929 | info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; |
| 2920 | info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; | 2930 | |
| 2921 | return 0; | 2931 | return 0; |
| 2922 | } | 2932 | } |
| 2923 | 2933 | ||
| @@ -2957,21 +2967,29 @@ static struct module *setup_load_info(struct load_info *info, int flags) | |||
| 2957 | 2967 | ||
| 2958 | info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); | 2968 | info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); |
| 2959 | if (!info->index.mod) { | 2969 | if (!info->index.mod) { |
| 2960 | pr_warn("No module found in object\n"); | 2970 | pr_warn("%s: No module found in object\n", |
| 2971 | info->name ?: "(missing .modinfo name field)"); | ||
| 2961 | return ERR_PTR(-ENOEXEC); | 2972 | return ERR_PTR(-ENOEXEC); |
| 2962 | } | 2973 | } |
| 2963 | /* This is temporary: point mod into copy of data. */ | 2974 | /* This is temporary: point mod into copy of data. */ |
| 2964 | mod = (void *)info->sechdrs[info->index.mod].sh_addr; | 2975 | mod = (void *)info->sechdrs[info->index.mod].sh_addr; |
| 2965 | 2976 | ||
| 2977 | /* | ||
| 2978 | * If we didn't load the .modinfo 'name' field, fall back to | ||
| 2979 | * on-disk struct mod 'name' field. | ||
| 2980 | */ | ||
| 2981 | if (!info->name) | ||
| 2982 | info->name = mod->name; | ||
| 2983 | |||
| 2966 | if (info->index.sym == 0) { | 2984 | if (info->index.sym == 0) { |
| 2967 | pr_warn("%s: module has no symbols (stripped?)\n", mod->name); | 2985 | pr_warn("%s: module has no symbols (stripped?)\n", info->name); |
| 2968 | return ERR_PTR(-ENOEXEC); | 2986 | return ERR_PTR(-ENOEXEC); |
| 2969 | } | 2987 | } |
| 2970 | 2988 | ||
| 2971 | info->index.pcpu = find_pcpusec(info); | 2989 | info->index.pcpu = find_pcpusec(info); |
| 2972 | 2990 | ||
| 2973 | /* Check module struct version now, before we try to use module. */ | 2991 | /* Check module struct version now, before we try to use module. */ |
| 2974 | if (!check_modstruct_version(info->sechdrs, info->index.vers, mod)) | 2992 | if (!check_modstruct_version(info, mod)) |
| 2975 | return ERR_PTR(-ENOEXEC); | 2993 | return ERR_PTR(-ENOEXEC); |
| 2976 | 2994 | ||
| 2977 | return mod; | 2995 | return mod; |
| @@ -2992,7 +3010,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) | |||
| 2992 | return err; | 3010 | return err; |
| 2993 | } else if (!same_magic(modmagic, vermagic, info->index.vers)) { | 3011 | } else if (!same_magic(modmagic, vermagic, info->index.vers)) { |
| 2994 | pr_err("%s: version magic '%s' should be '%s'\n", | 3012 | pr_err("%s: version magic '%s' should be '%s'\n", |
| 2995 | mod->name, modmagic, vermagic); | 3013 | info->name, modmagic, vermagic); |
| 2996 | return -ENOEXEC; | 3014 | return -ENOEXEC; |
| 2997 | } | 3015 | } |
| 2998 | 3016 | ||
| @@ -3077,9 +3095,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) | |||
| 3077 | mod->trace_events = section_objs(info, "_ftrace_events", | 3095 | mod->trace_events = section_objs(info, "_ftrace_events", |
| 3078 | sizeof(*mod->trace_events), | 3096 | sizeof(*mod->trace_events), |
| 3079 | &mod->num_trace_events); | 3097 | &mod->num_trace_events); |
| 3080 | mod->trace_enums = section_objs(info, "_ftrace_enum_map", | 3098 | mod->trace_evals = section_objs(info, "_ftrace_eval_map", |
| 3081 | sizeof(*mod->trace_enums), | 3099 | sizeof(*mod->trace_evals), |
| 3082 | &mod->num_trace_enums); | 3100 | &mod->num_trace_evals); |
| 3083 | #endif | 3101 | #endif |
| 3084 | #ifdef CONFIG_TRACING | 3102 | #ifdef CONFIG_TRACING |
| 3085 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", | 3103 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", |
| @@ -3242,7 +3260,7 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr, | |||
| 3242 | 3260 | ||
| 3243 | /* module_blacklist is a comma-separated list of module names */ | 3261 | /* module_blacklist is a comma-separated list of module names */ |
| 3244 | static char *module_blacklist; | 3262 | static char *module_blacklist; |
| 3245 | static bool blacklisted(char *module_name) | 3263 | static bool blacklisted(const char *module_name) |
| 3246 | { | 3264 | { |
| 3247 | const char *p; | 3265 | const char *p; |
| 3248 | size_t len; | 3266 | size_t len; |
| @@ -3272,7 +3290,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) | |||
| 3272 | if (IS_ERR(mod)) | 3290 | if (IS_ERR(mod)) |
| 3273 | return mod; | 3291 | return mod; |
| 3274 | 3292 | ||
| 3275 | if (blacklisted(mod->name)) | 3293 | if (blacklisted(info->name)) |
| 3276 | return ERR_PTR(-EPERM); | 3294 | return ERR_PTR(-EPERM); |
| 3277 | 3295 | ||
| 3278 | err = check_modinfo(mod, info, flags); | 3296 | err = check_modinfo(mod, info, flags); |
| @@ -4201,7 +4219,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) | |||
| 4201 | goto out; | 4219 | goto out; |
| 4202 | 4220 | ||
| 4203 | e = search_extable(mod->extable, | 4221 | e = search_extable(mod->extable, |
| 4204 | mod->extable + mod->num_exentries - 1, | 4222 | mod->num_exentries, |
| 4205 | addr); | 4223 | addr); |
| 4206 | out: | 4224 | out: |
| 4207 | preempt_enable(); | 4225 | preempt_enable(); |
diff --git a/kernel/padata.c b/kernel/padata.c index ac8f1e524836..868f947166d7 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
| @@ -934,29 +934,18 @@ static struct kobj_type padata_attr_type = { | |||
| 934 | }; | 934 | }; |
| 935 | 935 | ||
| 936 | /** | 936 | /** |
| 937 | * padata_alloc_possible - Allocate and initialize padata instance. | ||
| 938 | * Use the cpu_possible_mask for serial and | ||
| 939 | * parallel workers. | ||
| 940 | * | ||
| 941 | * @wq: workqueue to use for the allocated padata instance | ||
| 942 | */ | ||
| 943 | struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) | ||
| 944 | { | ||
| 945 | return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); | ||
| 946 | } | ||
| 947 | EXPORT_SYMBOL(padata_alloc_possible); | ||
| 948 | |||
| 949 | /** | ||
| 950 | * padata_alloc - allocate and initialize a padata instance and specify | 937 | * padata_alloc - allocate and initialize a padata instance and specify |
| 951 | * cpumasks for serial and parallel workers. | 938 | * cpumasks for serial and parallel workers. |
| 952 | * | 939 | * |
| 953 | * @wq: workqueue to use for the allocated padata instance | 940 | * @wq: workqueue to use for the allocated padata instance |
| 954 | * @pcpumask: cpumask that will be used for padata parallelization | 941 | * @pcpumask: cpumask that will be used for padata parallelization |
| 955 | * @cbcpumask: cpumask that will be used for padata serialization | 942 | * @cbcpumask: cpumask that will be used for padata serialization |
| 943 | * | ||
| 944 | * Must be called from a cpus_read_lock() protected region | ||
| 956 | */ | 945 | */ |
| 957 | struct padata_instance *padata_alloc(struct workqueue_struct *wq, | 946 | static struct padata_instance *padata_alloc(struct workqueue_struct *wq, |
| 958 | const struct cpumask *pcpumask, | 947 | const struct cpumask *pcpumask, |
| 959 | const struct cpumask *cbcpumask) | 948 | const struct cpumask *cbcpumask) |
| 960 | { | 949 | { |
| 961 | struct padata_instance *pinst; | 950 | struct padata_instance *pinst; |
| 962 | struct parallel_data *pd = NULL; | 951 | struct parallel_data *pd = NULL; |
| @@ -965,7 +954,6 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq, | |||
| 965 | if (!pinst) | 954 | if (!pinst) |
| 966 | goto err; | 955 | goto err; |
| 967 | 956 | ||
| 968 | get_online_cpus(); | ||
| 969 | if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) | 957 | if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) |
| 970 | goto err_free_inst; | 958 | goto err_free_inst; |
| 971 | if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { | 959 | if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { |
| @@ -989,14 +977,12 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq, | |||
| 989 | 977 | ||
| 990 | pinst->flags = 0; | 978 | pinst->flags = 0; |
| 991 | 979 | ||
| 992 | put_online_cpus(); | ||
| 993 | |||
| 994 | BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); | 980 | BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); |
| 995 | kobject_init(&pinst->kobj, &padata_attr_type); | 981 | kobject_init(&pinst->kobj, &padata_attr_type); |
| 996 | mutex_init(&pinst->lock); | 982 | mutex_init(&pinst->lock); |
| 997 | 983 | ||
| 998 | #ifdef CONFIG_HOTPLUG_CPU | 984 | #ifdef CONFIG_HOTPLUG_CPU |
| 999 | cpuhp_state_add_instance_nocalls(hp_online, &pinst->node); | 985 | cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node); |
| 1000 | #endif | 986 | #endif |
| 1001 | return pinst; | 987 | return pinst; |
| 1002 | 988 | ||
| @@ -1005,12 +991,27 @@ err_free_masks: | |||
| 1005 | free_cpumask_var(pinst->cpumask.cbcpu); | 991 | free_cpumask_var(pinst->cpumask.cbcpu); |
| 1006 | err_free_inst: | 992 | err_free_inst: |
| 1007 | kfree(pinst); | 993 | kfree(pinst); |
| 1008 | put_online_cpus(); | ||
| 1009 | err: | 994 | err: |
| 1010 | return NULL; | 995 | return NULL; |
| 1011 | } | 996 | } |
| 1012 | 997 | ||
| 1013 | /** | 998 | /** |
| 999 | * padata_alloc_possible - Allocate and initialize padata instance. | ||
| 1000 | * Use the cpu_possible_mask for serial and | ||
| 1001 | * parallel workers. | ||
| 1002 | * | ||
| 1003 | * @wq: workqueue to use for the allocated padata instance | ||
| 1004 | * | ||
| 1005 | * Must be called from a cpus_read_lock() protected region | ||
| 1006 | */ | ||
| 1007 | struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) | ||
| 1008 | { | ||
| 1009 | lockdep_assert_cpus_held(); | ||
| 1010 | return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); | ||
| 1011 | } | ||
| 1012 | EXPORT_SYMBOL(padata_alloc_possible); | ||
| 1013 | |||
| 1014 | /** | ||
| 1014 | * padata_free - free a padata instance | 1015 | * padata_free - free a padata instance |
| 1015 | * | 1016 | * |
| 1016 | * @padata_inst: padata instance to free | 1017 | * @padata_inst: padata instance to free |
diff --git a/kernel/pid.c b/kernel/pid.c index fd1cde1e4576..731c4e528f4e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -575,16 +575,13 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) | |||
| 575 | */ | 575 | */ |
| 576 | void __init pidhash_init(void) | 576 | void __init pidhash_init(void) |
| 577 | { | 577 | { |
| 578 | unsigned int i, pidhash_size; | 578 | unsigned int pidhash_size; |
| 579 | 579 | ||
| 580 | pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, | 580 | pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, |
| 581 | HASH_EARLY | HASH_SMALL, | 581 | HASH_EARLY | HASH_SMALL | HASH_ZERO, |
| 582 | &pidhash_shift, NULL, | 582 | &pidhash_shift, NULL, |
| 583 | 0, 4096); | 583 | 0, 4096); |
| 584 | pidhash_size = 1U << pidhash_shift; | 584 | pidhash_size = 1U << pidhash_shift; |
| 585 | |||
| 586 | for (i = 0; i < pidhash_size; i++) | ||
| 587 | INIT_HLIST_HEAD(&pid_hash[i]); | ||
| 588 | } | 585 | } |
| 589 | 586 | ||
| 590 | void __init pidmap_init(void) | 587 | void __init pidmap_init(void) |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a8b978c35a6a..e1914c7b85b1 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
| @@ -1108,7 +1108,7 @@ static struct attribute * g[] = { | |||
| 1108 | }; | 1108 | }; |
| 1109 | 1109 | ||
| 1110 | 1110 | ||
| 1111 | static struct attribute_group attr_group = { | 1111 | static const struct attribute_group attr_group = { |
| 1112 | .attrs = g, | 1112 | .attrs = g, |
| 1113 | }; | 1113 | }; |
| 1114 | 1114 | ||
diff --git a/kernel/power/main.c b/kernel/power/main.c index d401c21136d1..42bd800a6755 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -705,7 +705,7 @@ static struct attribute * g[] = { | |||
| 705 | NULL, | 705 | NULL, |
| 706 | }; | 706 | }; |
| 707 | 707 | ||
| 708 | static struct attribute_group attr_group = { | 708 | static const struct attribute_group attr_group = { |
| 709 | .attrs = g, | 709 | .attrs = g, |
| 710 | }; | 710 | }; |
| 711 | 711 | ||
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3b1e0f3ad07f..222317721c5a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -30,19 +30,17 @@ | |||
| 30 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
| 31 | #include <linux/compiler.h> | 31 | #include <linux/compiler.h> |
| 32 | #include <linux/ktime.h> | 32 | #include <linux/ktime.h> |
| 33 | #include <linux/set_memory.h> | ||
| 33 | 34 | ||
| 34 | #include <linux/uaccess.h> | 35 | #include <linux/uaccess.h> |
| 35 | #include <asm/mmu_context.h> | 36 | #include <asm/mmu_context.h> |
| 36 | #include <asm/pgtable.h> | 37 | #include <asm/pgtable.h> |
| 37 | #include <asm/tlbflush.h> | 38 | #include <asm/tlbflush.h> |
| 38 | #include <asm/io.h> | 39 | #include <asm/io.h> |
| 39 | #ifdef CONFIG_STRICT_KERNEL_RWX | ||
| 40 | #include <asm/set_memory.h> | ||
| 41 | #endif | ||
| 42 | 40 | ||
| 43 | #include "power.h" | 41 | #include "power.h" |
| 44 | 42 | ||
| 45 | #ifdef CONFIG_STRICT_KERNEL_RWX | 43 | #if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_ARCH_HAS_SET_MEMORY) |
| 46 | static bool hibernate_restore_protection; | 44 | static bool hibernate_restore_protection; |
| 47 | static bool hibernate_restore_protection_active; | 45 | static bool hibernate_restore_protection_active; |
| 48 | 46 | ||
| @@ -77,7 +75,7 @@ static inline void hibernate_restore_protection_begin(void) {} | |||
| 77 | static inline void hibernate_restore_protection_end(void) {} | 75 | static inline void hibernate_restore_protection_end(void) {} |
| 78 | static inline void hibernate_restore_protect_page(void *page_address) {} | 76 | static inline void hibernate_restore_protect_page(void *page_address) {} |
| 79 | static inline void hibernate_restore_unprotect_page(void *page_address) {} | 77 | static inline void hibernate_restore_unprotect_page(void *page_address) {} |
| 80 | #endif /* CONFIG_STRICT_KERNEL_RWX */ | 78 | #endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */ |
| 81 | 79 | ||
| 82 | static int swsusp_page_is_free(struct page *); | 80 | static int swsusp_page_is_free(struct page *); |
| 83 | static void swsusp_set_page_forbidden(struct page *); | 81 | static void swsusp_set_page_forbidden(struct page *); |
| @@ -1425,7 +1423,7 @@ static unsigned int nr_meta_pages; | |||
| 1425 | * Numbers of normal and highmem page frames allocated for hibernation image | 1423 | * Numbers of normal and highmem page frames allocated for hibernation image |
| 1426 | * before suspending devices. | 1424 | * before suspending devices. |
| 1427 | */ | 1425 | */ |
| 1428 | unsigned int alloc_normal, alloc_highmem; | 1426 | static unsigned int alloc_normal, alloc_highmem; |
| 1429 | /* | 1427 | /* |
| 1430 | * Memory bitmap used for marking saveable pages (during hibernation) or | 1428 | * Memory bitmap used for marking saveable pages (during hibernation) or |
| 1431 | * hibernation image pages (during restore) | 1429 | * hibernation image pages (during restore) |
| @@ -1929,8 +1927,7 @@ static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, | |||
| 1929 | * also be located in the high memory, because of the way in which | 1927 | * also be located in the high memory, because of the way in which |
| 1930 | * copy_data_pages() works. | 1928 | * copy_data_pages() works. |
| 1931 | */ | 1929 | */ |
| 1932 | static int swsusp_alloc(struct memory_bitmap *orig_bm, | 1930 | static int swsusp_alloc(struct memory_bitmap *copy_bm, |
| 1933 | struct memory_bitmap *copy_bm, | ||
| 1934 | unsigned int nr_pages, unsigned int nr_highmem) | 1931 | unsigned int nr_pages, unsigned int nr_highmem) |
| 1935 | { | 1932 | { |
| 1936 | if (nr_highmem > 0) { | 1933 | if (nr_highmem > 0) { |
| @@ -1976,7 +1973,7 @@ asmlinkage __visible int swsusp_save(void) | |||
| 1976 | return -ENOMEM; | 1973 | return -ENOMEM; |
| 1977 | } | 1974 | } |
| 1978 | 1975 | ||
| 1979 | if (swsusp_alloc(&orig_bm, ©_bm, nr_pages, nr_highmem)) { | 1976 | if (swsusp_alloc(©_bm, nr_pages, nr_highmem)) { |
| 1980 | printk(KERN_ERR "PM: Memory allocation failed\n"); | 1977 | printk(KERN_ERR "PM: Memory allocation failed\n"); |
| 1981 | return -ENOMEM; | 1978 | return -ENOMEM; |
| 1982 | } | 1979 | } |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c0248c74d6d4..3ecf275d7e44 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -86,11 +86,9 @@ static void freeze_enter(void) | |||
| 86 | 86 | ||
| 87 | /* Push all the CPUs into the idle loop. */ | 87 | /* Push all the CPUs into the idle loop. */ |
| 88 | wake_up_all_idle_cpus(); | 88 | wake_up_all_idle_cpus(); |
| 89 | pr_debug("PM: suspend-to-idle\n"); | ||
| 90 | /* Make the current CPU wait so it can enter the idle loop too. */ | 89 | /* Make the current CPU wait so it can enter the idle loop too. */ |
| 91 | wait_event(suspend_freeze_wait_head, | 90 | wait_event(suspend_freeze_wait_head, |
| 92 | suspend_freeze_state == FREEZE_STATE_WAKE); | 91 | suspend_freeze_state == FREEZE_STATE_WAKE); |
| 93 | pr_debug("PM: resume from suspend-to-idle\n"); | ||
| 94 | 92 | ||
| 95 | cpuidle_pause(); | 93 | cpuidle_pause(); |
| 96 | put_online_cpus(); | 94 | put_online_cpus(); |
| @@ -106,6 +104,8 @@ static void freeze_enter(void) | |||
| 106 | 104 | ||
| 107 | static void s2idle_loop(void) | 105 | static void s2idle_loop(void) |
| 108 | { | 106 | { |
| 107 | pr_debug("PM: suspend-to-idle\n"); | ||
| 108 | |||
| 109 | do { | 109 | do { |
| 110 | freeze_enter(); | 110 | freeze_enter(); |
| 111 | 111 | ||
| @@ -121,6 +121,8 @@ static void s2idle_loop(void) | |||
| 121 | 121 | ||
| 122 | pm_wakeup_clear(false); | 122 | pm_wakeup_clear(false); |
| 123 | } while (!dpm_suspend_noirq(PMSG_SUSPEND)); | 123 | } while (!dpm_suspend_noirq(PMSG_SUSPEND)); |
| 124 | |||
| 125 | pr_debug("PM: resume from suspend-to-idle\n"); | ||
| 124 | } | 126 | } |
| 125 | 127 | ||
| 126 | void freeze_wake(void) | 128 | void freeze_wake(void) |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index f80fd33639e0..57d22571f306 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
| @@ -225,14 +225,14 @@ static struct block_device *hib_resume_bdev; | |||
| 225 | struct hib_bio_batch { | 225 | struct hib_bio_batch { |
| 226 | atomic_t count; | 226 | atomic_t count; |
| 227 | wait_queue_head_t wait; | 227 | wait_queue_head_t wait; |
| 228 | int error; | 228 | blk_status_t error; |
| 229 | }; | 229 | }; |
| 230 | 230 | ||
| 231 | static void hib_init_batch(struct hib_bio_batch *hb) | 231 | static void hib_init_batch(struct hib_bio_batch *hb) |
| 232 | { | 232 | { |
| 233 | atomic_set(&hb->count, 0); | 233 | atomic_set(&hb->count, 0); |
| 234 | init_waitqueue_head(&hb->wait); | 234 | init_waitqueue_head(&hb->wait); |
| 235 | hb->error = 0; | 235 | hb->error = BLK_STS_OK; |
| 236 | } | 236 | } |
| 237 | 237 | ||
| 238 | static void hib_end_io(struct bio *bio) | 238 | static void hib_end_io(struct bio *bio) |
| @@ -240,7 +240,7 @@ static void hib_end_io(struct bio *bio) | |||
| 240 | struct hib_bio_batch *hb = bio->bi_private; | 240 | struct hib_bio_batch *hb = bio->bi_private; |
| 241 | struct page *page = bio->bi_io_vec[0].bv_page; | 241 | struct page *page = bio->bi_io_vec[0].bv_page; |
| 242 | 242 | ||
| 243 | if (bio->bi_error) { | 243 | if (bio->bi_status) { |
| 244 | printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", | 244 | printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", |
| 245 | imajor(bio->bi_bdev->bd_inode), | 245 | imajor(bio->bi_bdev->bd_inode), |
| 246 | iminor(bio->bi_bdev->bd_inode), | 246 | iminor(bio->bi_bdev->bd_inode), |
| @@ -253,8 +253,8 @@ static void hib_end_io(struct bio *bio) | |||
| 253 | flush_icache_range((unsigned long)page_address(page), | 253 | flush_icache_range((unsigned long)page_address(page), |
| 254 | (unsigned long)page_address(page) + PAGE_SIZE); | 254 | (unsigned long)page_address(page) + PAGE_SIZE); |
| 255 | 255 | ||
| 256 | if (bio->bi_error && !hb->error) | 256 | if (bio->bi_status && !hb->error) |
| 257 | hb->error = bio->bi_error; | 257 | hb->error = bio->bi_status; |
| 258 | if (atomic_dec_and_test(&hb->count)) | 258 | if (atomic_dec_and_test(&hb->count)) |
| 259 | wake_up(&hb->wait); | 259 | wake_up(&hb->wait); |
| 260 | 260 | ||
| @@ -293,10 +293,10 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, | |||
| 293 | return error; | 293 | return error; |
| 294 | } | 294 | } |
| 295 | 295 | ||
| 296 | static int hib_wait_io(struct hib_bio_batch *hb) | 296 | static blk_status_t hib_wait_io(struct hib_bio_batch *hb) |
| 297 | { | 297 | { |
| 298 | wait_event(hb->wait, atomic_read(&hb->count) == 0); | 298 | wait_event(hb->wait, atomic_read(&hb->count) == 0); |
| 299 | return hb->error; | 299 | return blk_status_to_errno(hb->error); |
| 300 | } | 300 | } |
| 301 | 301 | ||
| 302 | /* | 302 | /* |
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 1db044f808b7..2a7d04049af4 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h | |||
| @@ -18,12 +18,14 @@ | |||
| 18 | 18 | ||
| 19 | #ifdef CONFIG_PRINTK | 19 | #ifdef CONFIG_PRINTK |
| 20 | 20 | ||
| 21 | #define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff | 21 | #define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff |
| 22 | #define PRINTK_NMI_CONTEXT_MASK 0x80000000 | 22 | #define PRINTK_NMI_DEFERRED_CONTEXT_MASK 0x40000000 |
| 23 | #define PRINTK_NMI_CONTEXT_MASK 0x80000000 | ||
| 23 | 24 | ||
| 24 | extern raw_spinlock_t logbuf_lock; | 25 | extern raw_spinlock_t logbuf_lock; |
| 25 | 26 | ||
| 26 | __printf(1, 0) int vprintk_default(const char *fmt, va_list args); | 27 | __printf(1, 0) int vprintk_default(const char *fmt, va_list args); |
| 28 | __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); | ||
| 27 | __printf(1, 0) int vprintk_func(const char *fmt, va_list args); | 29 | __printf(1, 0) int vprintk_func(const char *fmt, va_list args); |
| 28 | void __printk_safe_enter(void); | 30 | void __printk_safe_enter(void); |
| 29 | void __printk_safe_exit(void); | 31 | void __printk_safe_exit(void); |
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a1aecf44ab07..fc47863f629c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
| @@ -269,7 +269,6 @@ static struct console *exclusive_console; | |||
| 269 | #define MAX_CMDLINECONSOLES 8 | 269 | #define MAX_CMDLINECONSOLES 8 |
| 270 | 270 | ||
| 271 | static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; | 271 | static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; |
| 272 | static int console_cmdline_cnt; | ||
| 273 | 272 | ||
| 274 | static int preferred_console = -1; | 273 | static int preferred_console = -1; |
| 275 | int console_set_on_cmdline; | 274 | int console_set_on_cmdline; |
| @@ -1176,7 +1175,7 @@ static void boot_delay_msec(int level) | |||
| 1176 | unsigned long long k; | 1175 | unsigned long long k; |
| 1177 | unsigned long timeout; | 1176 | unsigned long timeout; |
| 1178 | 1177 | ||
| 1179 | if ((boot_delay == 0 || system_state != SYSTEM_BOOTING) | 1178 | if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) |
| 1180 | || suppress_message_printing(level)) { | 1179 | || suppress_message_printing(level)) { |
| 1181 | return; | 1180 | return; |
| 1182 | } | 1181 | } |
| @@ -1906,25 +1905,12 @@ static int __add_preferred_console(char *name, int idx, char *options, | |||
| 1906 | * See if this tty is not yet registered, and | 1905 | * See if this tty is not yet registered, and |
| 1907 | * if we have a slot free. | 1906 | * if we have a slot free. |
| 1908 | */ | 1907 | */ |
| 1909 | for (i = 0, c = console_cmdline; i < console_cmdline_cnt; i++, c++) { | 1908 | for (i = 0, c = console_cmdline; |
| 1909 | i < MAX_CMDLINECONSOLES && c->name[0]; | ||
| 1910 | i++, c++) { | ||
| 1910 | if (strcmp(c->name, name) == 0 && c->index == idx) { | 1911 | if (strcmp(c->name, name) == 0 && c->index == idx) { |
| 1911 | if (brl_options) | 1912 | if (!brl_options) |
| 1912 | return 0; | 1913 | preferred_console = i; |
| 1913 | |||
| 1914 | /* | ||
| 1915 | * Maintain an invariant that will help to find if | ||
| 1916 | * the matching console is preferred, see | ||
| 1917 | * register_console(): | ||
| 1918 | * | ||
| 1919 | * The last non-braille console is always | ||
| 1920 | * the preferred one. | ||
| 1921 | */ | ||
| 1922 | if (i != console_cmdline_cnt - 1) | ||
| 1923 | swap(console_cmdline[i], | ||
| 1924 | console_cmdline[console_cmdline_cnt - 1]); | ||
| 1925 | |||
| 1926 | preferred_console = console_cmdline_cnt - 1; | ||
| 1927 | |||
| 1928 | return 0; | 1914 | return 0; |
| 1929 | } | 1915 | } |
| 1930 | } | 1916 | } |
| @@ -1937,7 +1923,6 @@ static int __add_preferred_console(char *name, int idx, char *options, | |||
| 1937 | braille_set_options(c, brl_options); | 1923 | braille_set_options(c, brl_options); |
| 1938 | 1924 | ||
| 1939 | c->index = idx; | 1925 | c->index = idx; |
| 1940 | console_cmdline_cnt++; | ||
| 1941 | return 0; | 1926 | return 0; |
| 1942 | } | 1927 | } |
| 1943 | /* | 1928 | /* |
| @@ -2477,23 +2462,12 @@ void register_console(struct console *newcon) | |||
| 2477 | } | 2462 | } |
| 2478 | 2463 | ||
| 2479 | /* | 2464 | /* |
| 2480 | * See if this console matches one we selected on the command line. | 2465 | * See if this console matches one we selected on |
| 2481 | * | 2466 | * the command line. |
| 2482 | * There may be several entries in the console_cmdline array matching | ||
| 2483 | * with the same console, one with newcon->match(), another by | ||
| 2484 | * name/index: | ||
| 2485 | * | ||
| 2486 | * pl011,mmio,0x87e024000000,115200 -- added from SPCR | ||
| 2487 | * ttyAMA0 -- added from command line | ||
| 2488 | * | ||
| 2489 | * Traverse the console_cmdline array in reverse order to be | ||
| 2490 | * sure that if this console is preferred then it will be the first | ||
| 2491 | * matching entry. We use the invariant that is maintained in | ||
| 2492 | * __add_preferred_console(). | ||
| 2493 | */ | 2467 | */ |
| 2494 | for (i = console_cmdline_cnt - 1; i >= 0; i--) { | 2468 | for (i = 0, c = console_cmdline; |
| 2495 | c = console_cmdline + i; | 2469 | i < MAX_CMDLINECONSOLES && c->name[0]; |
| 2496 | 2470 | i++, c++) { | |
| 2497 | if (!newcon->match || | 2471 | if (!newcon->match || |
| 2498 | newcon->match(newcon, c->name, c->index, c->options) != 0) { | 2472 | newcon->match(newcon, c->name, c->index, c->options) != 0) { |
| 2499 | /* default matching */ | 2473 | /* default matching */ |
| @@ -2746,16 +2720,13 @@ void wake_up_klogd(void) | |||
| 2746 | preempt_enable(); | 2720 | preempt_enable(); |
| 2747 | } | 2721 | } |
| 2748 | 2722 | ||
| 2749 | int printk_deferred(const char *fmt, ...) | 2723 | int vprintk_deferred(const char *fmt, va_list args) |
| 2750 | { | 2724 | { |
| 2751 | va_list args; | ||
| 2752 | int r; | 2725 | int r; |
| 2753 | 2726 | ||
| 2754 | preempt_disable(); | ||
| 2755 | va_start(args, fmt); | ||
| 2756 | r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); | 2727 | r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); |
| 2757 | va_end(args); | ||
| 2758 | 2728 | ||
| 2729 | preempt_disable(); | ||
| 2759 | __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); | 2730 | __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); |
| 2760 | irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); | 2731 | irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); |
| 2761 | preempt_enable(); | 2732 | preempt_enable(); |
| @@ -2763,6 +2734,18 @@ int printk_deferred(const char *fmt, ...) | |||
| 2763 | return r; | 2734 | return r; |
| 2764 | } | 2735 | } |
| 2765 | 2736 | ||
| 2737 | int printk_deferred(const char *fmt, ...) | ||
| 2738 | { | ||
| 2739 | va_list args; | ||
| 2740 | int r; | ||
| 2741 | |||
| 2742 | va_start(args, fmt); | ||
| 2743 | r = vprintk_deferred(fmt, args); | ||
| 2744 | va_end(args); | ||
| 2745 | |||
| 2746 | return r; | ||
| 2747 | } | ||
| 2748 | |||
| 2766 | /* | 2749 | /* |
| 2767 | * printk rate limiting, lifted from the networking subsystem. | 2750 | * printk rate limiting, lifted from the networking subsystem. |
| 2768 | * | 2751 | * |
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 033e50a7d706..3cdaeaef9ce1 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c | |||
| @@ -80,8 +80,8 @@ static void queue_flush_work(struct printk_safe_seq_buf *s) | |||
| 80 | * happen, printk_safe_log_store() will notice the buffer->len mismatch | 80 | * happen, printk_safe_log_store() will notice the buffer->len mismatch |
| 81 | * and repeat the write. | 81 | * and repeat the write. |
| 82 | */ | 82 | */ |
| 83 | static int printk_safe_log_store(struct printk_safe_seq_buf *s, | 83 | static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s, |
| 84 | const char *fmt, va_list args) | 84 | const char *fmt, va_list args) |
| 85 | { | 85 | { |
| 86 | int add; | 86 | int add; |
| 87 | size_t len; | 87 | size_t len; |
| @@ -299,7 +299,7 @@ void printk_safe_flush_on_panic(void) | |||
| 299 | * one writer running. But the buffer might get flushed from another | 299 | * one writer running. But the buffer might get flushed from another |
| 300 | * CPU, so we need to be careful. | 300 | * CPU, so we need to be careful. |
| 301 | */ | 301 | */ |
| 302 | static int vprintk_nmi(const char *fmt, va_list args) | 302 | static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) |
| 303 | { | 303 | { |
| 304 | struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); | 304 | struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); |
| 305 | 305 | ||
| @@ -308,17 +308,29 @@ static int vprintk_nmi(const char *fmt, va_list args) | |||
| 308 | 308 | ||
| 309 | void printk_nmi_enter(void) | 309 | void printk_nmi_enter(void) |
| 310 | { | 310 | { |
| 311 | this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); | 311 | /* |
| 312 | * The size of the extra per-CPU buffer is limited. Use it only when | ||
| 313 | * the main one is locked. If this CPU is not in the safe context, | ||
| 314 | * the lock must be taken on another CPU and we could wait for it. | ||
| 315 | */ | ||
| 316 | if ((this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) && | ||
| 317 | raw_spin_is_locked(&logbuf_lock)) { | ||
| 318 | this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); | ||
| 319 | } else { | ||
| 320 | this_cpu_or(printk_context, PRINTK_NMI_DEFERRED_CONTEXT_MASK); | ||
| 321 | } | ||
| 312 | } | 322 | } |
| 313 | 323 | ||
| 314 | void printk_nmi_exit(void) | 324 | void printk_nmi_exit(void) |
| 315 | { | 325 | { |
| 316 | this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); | 326 | this_cpu_and(printk_context, |
| 327 | ~(PRINTK_NMI_CONTEXT_MASK | | ||
| 328 | PRINTK_NMI_DEFERRED_CONTEXT_MASK)); | ||
| 317 | } | 329 | } |
| 318 | 330 | ||
| 319 | #else | 331 | #else |
| 320 | 332 | ||
| 321 | static int vprintk_nmi(const char *fmt, va_list args) | 333 | static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) |
| 322 | { | 334 | { |
| 323 | return 0; | 335 | return 0; |
| 324 | } | 336 | } |
| @@ -330,7 +342,7 @@ static int vprintk_nmi(const char *fmt, va_list args) | |||
| 330 | * into itself. It uses a per-CPU buffer to store the message, just like | 342 | * into itself. It uses a per-CPU buffer to store the message, just like |
| 331 | * NMI. | 343 | * NMI. |
| 332 | */ | 344 | */ |
| 333 | static int vprintk_safe(const char *fmt, va_list args) | 345 | static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args) |
| 334 | { | 346 | { |
| 335 | struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); | 347 | struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); |
| 336 | 348 | ||
| @@ -351,12 +363,22 @@ void __printk_safe_exit(void) | |||
| 351 | 363 | ||
| 352 | __printf(1, 0) int vprintk_func(const char *fmt, va_list args) | 364 | __printf(1, 0) int vprintk_func(const char *fmt, va_list args) |
| 353 | { | 365 | { |
| 366 | /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */ | ||
| 354 | if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) | 367 | if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) |
| 355 | return vprintk_nmi(fmt, args); | 368 | return vprintk_nmi(fmt, args); |
| 356 | 369 | ||
| 370 | /* Use extra buffer to prevent a recursion deadlock in safe mode. */ | ||
| 357 | if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) | 371 | if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) |
| 358 | return vprintk_safe(fmt, args); | 372 | return vprintk_safe(fmt, args); |
| 359 | 373 | ||
| 374 | /* | ||
| 375 | * Use the main logbuf when logbuf_lock is available in NMI. | ||
| 376 | * But avoid calling console drivers that might have their own locks. | ||
| 377 | */ | ||
| 378 | if (this_cpu_read(printk_context) & PRINTK_NMI_DEFERRED_CONTEXT_MASK) | ||
| 379 | return vprintk_deferred(fmt, args); | ||
| 380 | |||
| 381 | /* No obstacles. */ | ||
| 360 | return vprintk_default(fmt, args); | 382 | return vprintk_default(fmt, args); |
| 361 | } | 383 | } |
| 362 | 384 | ||
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 266ddcc1d8bb..60f356d91060 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -60,19 +60,25 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, | |||
| 60 | } | 60 | } |
| 61 | 61 | ||
| 62 | 62 | ||
| 63 | void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, | ||
| 64 | const struct cred *ptracer_cred) | ||
| 65 | { | ||
| 66 | BUG_ON(!list_empty(&child->ptrace_entry)); | ||
| 67 | list_add(&child->ptrace_entry, &new_parent->ptraced); | ||
| 68 | child->parent = new_parent; | ||
| 69 | child->ptracer_cred = get_cred(ptracer_cred); | ||
| 70 | } | ||
| 71 | |||
| 63 | /* | 72 | /* |
| 64 | * ptrace a task: make the debugger its new parent and | 73 | * ptrace a task: make the debugger its new parent and |
| 65 | * move it to the ptrace list. | 74 | * move it to the ptrace list. |
| 66 | * | 75 | * |
| 67 | * Must be called with the tasklist lock write-held. | 76 | * Must be called with the tasklist lock write-held. |
| 68 | */ | 77 | */ |
| 69 | void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) | 78 | static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) |
| 70 | { | 79 | { |
| 71 | BUG_ON(!list_empty(&child->ptrace_entry)); | ||
| 72 | list_add(&child->ptrace_entry, &new_parent->ptraced); | ||
| 73 | child->parent = new_parent; | ||
| 74 | rcu_read_lock(); | 80 | rcu_read_lock(); |
| 75 | child->ptracer_cred = get_cred(__task_cred(new_parent)); | 81 | __ptrace_link(child, new_parent, __task_cred(new_parent)); |
| 76 | rcu_read_unlock(); | 82 | rcu_read_unlock(); |
| 77 | } | 83 | } |
| 78 | 84 | ||
| @@ -386,7 +392,7 @@ static int ptrace_attach(struct task_struct *task, long request, | |||
| 386 | flags |= PT_SEIZED; | 392 | flags |= PT_SEIZED; |
| 387 | task->ptrace = flags; | 393 | task->ptrace = flags; |
| 388 | 394 | ||
| 389 | __ptrace_link(task, current); | 395 | ptrace_link(task, current); |
| 390 | 396 | ||
| 391 | /* SEIZE doesn't trap tracee on attach */ | 397 | /* SEIZE doesn't trap tracee on attach */ |
| 392 | if (!seize) | 398 | if (!seize) |
| @@ -459,7 +465,7 @@ static int ptrace_traceme(void) | |||
| 459 | */ | 465 | */ |
| 460 | if (!ret && !(current->real_parent->flags & PF_EXITING)) { | 466 | if (!ret && !(current->real_parent->flags & PF_EXITING)) { |
| 461 | current->ptrace = PT_PTRACED; | 467 | current->ptrace = PT_PTRACED; |
| 462 | __ptrace_link(current, current->real_parent); | 468 | ptrace_link(current, current->real_parent); |
| 463 | } | 469 | } |
| 464 | } | 470 | } |
| 465 | write_unlock_irq(&tasklist_lock); | 471 | write_unlock_irq(&tasklist_lock); |
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig new file mode 100644 index 000000000000..be90c945063f --- /dev/null +++ b/kernel/rcu/Kconfig | |||
| @@ -0,0 +1,242 @@ | |||
| 1 | # | ||
| 2 | # RCU-related configuration options | ||
| 3 | # | ||
| 4 | |||
| 5 | menu "RCU Subsystem" | ||
| 6 | |||
| 7 | config TREE_RCU | ||
| 8 | bool | ||
| 9 | default y if !PREEMPT && SMP | ||
| 10 | help | ||
| 11 | This option selects the RCU implementation that is | ||
| 12 | designed for very large SMP system with hundreds or | ||
| 13 | thousands of CPUs. It also scales down nicely to | ||
| 14 | smaller systems. | ||
| 15 | |||
| 16 | config PREEMPT_RCU | ||
| 17 | bool | ||
| 18 | default y if PREEMPT | ||
| 19 | help | ||
| 20 | This option selects the RCU implementation that is | ||
| 21 | designed for very large SMP systems with hundreds or | ||
| 22 | thousands of CPUs, but for which real-time response | ||
| 23 | is also required. It also scales down nicely to | ||
| 24 | smaller systems. | ||
| 25 | |||
| 26 | Select this option if you are unsure. | ||
| 27 | |||
| 28 | config TINY_RCU | ||
| 29 | bool | ||
| 30 | default y if !PREEMPT && !SMP | ||
| 31 | help | ||
| 32 | This option selects the RCU implementation that is | ||
| 33 | designed for UP systems from which real-time response | ||
| 34 | is not required. This option greatly reduces the | ||
| 35 | memory footprint of RCU. | ||
| 36 | |||
| 37 | config RCU_EXPERT | ||
| 38 | bool "Make expert-level adjustments to RCU configuration" | ||
| 39 | default n | ||
| 40 | help | ||
| 41 | This option needs to be enabled if you wish to make | ||
| 42 | expert-level adjustments to RCU configuration. By default, | ||
| 43 | no such adjustments can be made, which has the often-beneficial | ||
| 44 | side-effect of preventing "make oldconfig" from asking you all | ||
| 45 | sorts of detailed questions about how you would like numerous | ||
| 46 | obscure RCU options to be set up. | ||
| 47 | |||
| 48 | Say Y if you need to make expert-level adjustments to RCU. | ||
| 49 | |||
| 50 | Say N if you are unsure. | ||
| 51 | |||
| 52 | config SRCU | ||
| 53 | bool | ||
| 54 | help | ||
| 55 | This option selects the sleepable version of RCU. This version | ||
| 56 | permits arbitrary sleeping or blocking within RCU read-side critical | ||
| 57 | sections. | ||
| 58 | |||
| 59 | config TINY_SRCU | ||
| 60 | bool | ||
| 61 | default y if SRCU && TINY_RCU | ||
| 62 | help | ||
| 63 | This option selects the single-CPU non-preemptible version of SRCU. | ||
| 64 | |||
| 65 | config TREE_SRCU | ||
| 66 | bool | ||
| 67 | default y if SRCU && !TINY_RCU | ||
| 68 | help | ||
| 69 | This option selects the full-fledged version of SRCU. | ||
| 70 | |||
| 71 | config TASKS_RCU | ||
| 72 | bool | ||
| 73 | default n | ||
| 74 | select SRCU | ||
| 75 | help | ||
| 76 | This option enables a task-based RCU implementation that uses | ||
| 77 | only voluntary context switch (not preemption!), idle, and | ||
| 78 | user-mode execution as quiescent states. | ||
| 79 | |||
| 80 | config RCU_STALL_COMMON | ||
| 81 | def_bool ( TREE_RCU || PREEMPT_RCU ) | ||
| 82 | help | ||
| 83 | This option enables RCU CPU stall code that is common between | ||
| 84 | the TINY and TREE variants of RCU. The purpose is to allow | ||
| 85 | the tiny variants to disable RCU CPU stall warnings, while | ||
| 86 | making these warnings mandatory for the tree variants. | ||
| 87 | |||
| 88 | config RCU_NEED_SEGCBLIST | ||
| 89 | def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU ) | ||
| 90 | |||
| 91 | config CONTEXT_TRACKING | ||
| 92 | bool | ||
| 93 | |||
| 94 | config CONTEXT_TRACKING_FORCE | ||
| 95 | bool "Force context tracking" | ||
| 96 | depends on CONTEXT_TRACKING | ||
| 97 | default y if !NO_HZ_FULL | ||
| 98 | help | ||
| 99 | The major pre-requirement for full dynticks to work is to | ||
| 100 | support the context tracking subsystem. But there are also | ||
| 101 | other dependencies to provide in order to make the full | ||
| 102 | dynticks working. | ||
| 103 | |||
| 104 | This option stands for testing when an arch implements the | ||
| 105 | context tracking backend but doesn't yet fullfill all the | ||
| 106 | requirements to make the full dynticks feature working. | ||
| 107 | Without the full dynticks, there is no way to test the support | ||
| 108 | for context tracking and the subsystems that rely on it: RCU | ||
| 109 | userspace extended quiescent state and tickless cputime | ||
| 110 | accounting. This option copes with the absence of the full | ||
| 111 | dynticks subsystem by forcing the context tracking on all | ||
| 112 | CPUs in the system. | ||
| 113 | |||
| 114 | Say Y only if you're working on the development of an | ||
| 115 | architecture backend for the context tracking. | ||
| 116 | |||
| 117 | Say N otherwise, this option brings an overhead that you | ||
| 118 | don't want in production. | ||
| 119 | |||
| 120 | |||
| 121 | config RCU_FANOUT | ||
| 122 | int "Tree-based hierarchical RCU fanout value" | ||
| 123 | range 2 64 if 64BIT | ||
| 124 | range 2 32 if !64BIT | ||
| 125 | depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT | ||
| 126 | default 64 if 64BIT | ||
| 127 | default 32 if !64BIT | ||
| 128 | help | ||
| 129 | This option controls the fanout of hierarchical implementations | ||
| 130 | of RCU, allowing RCU to work efficiently on machines with | ||
| 131 | large numbers of CPUs. This value must be at least the fourth | ||
| 132 | root of NR_CPUS, which allows NR_CPUS to be insanely large. | ||
| 133 | The default value of RCU_FANOUT should be used for production | ||
| 134 | systems, but if you are stress-testing the RCU implementation | ||
| 135 | itself, small RCU_FANOUT values allow you to test large-system | ||
| 136 | code paths on small(er) systems. | ||
| 137 | |||
| 138 | Select a specific number if testing RCU itself. | ||
| 139 | Take the default if unsure. | ||
| 140 | |||
| 141 | config RCU_FANOUT_LEAF | ||
| 142 | int "Tree-based hierarchical RCU leaf-level fanout value" | ||
| 143 | range 2 64 if 64BIT | ||
| 144 | range 2 32 if !64BIT | ||
| 145 | depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT | ||
| 146 | default 16 | ||
| 147 | help | ||
| 148 | This option controls the leaf-level fanout of hierarchical | ||
| 149 | implementations of RCU, and allows trading off cache misses | ||
| 150 | against lock contention. Systems that synchronize their | ||
| 151 | scheduling-clock interrupts for energy-efficiency reasons will | ||
| 152 | want the default because the smaller leaf-level fanout keeps | ||
| 153 | lock contention levels acceptably low. Very large systems | ||
| 154 | (hundreds or thousands of CPUs) will instead want to set this | ||
| 155 | value to the maximum value possible in order to reduce the | ||
| 156 | number of cache misses incurred during RCU's grace-period | ||
| 157 | initialization. These systems tend to run CPU-bound, and thus | ||
| 158 | are not helped by synchronized interrupts, and thus tend to | ||
| 159 | skew them, which reduces lock contention enough that large | ||
| 160 | leaf-level fanouts work well. That said, setting leaf-level | ||
| 161 | fanout to a large number will likely cause problematic | ||
| 162 | lock contention on the leaf-level rcu_node structures unless | ||
| 163 | you boot with the skew_tick kernel parameter. | ||
| 164 | |||
| 165 | Select a specific number if testing RCU itself. | ||
| 166 | |||
| 167 | Select the maximum permissible value for large systems, but | ||
| 168 | please understand that you may also need to set the skew_tick | ||
| 169 | kernel boot parameter to avoid contention on the rcu_node | ||
| 170 | structure's locks. | ||
| 171 | |||
| 172 | Take the default if unsure. | ||
| 173 | |||
| 174 | config RCU_FAST_NO_HZ | ||
| 175 | bool "Accelerate last non-dyntick-idle CPU's grace periods" | ||
| 176 | depends on NO_HZ_COMMON && SMP && RCU_EXPERT | ||
| 177 | default n | ||
| 178 | help | ||
| 179 | This option permits CPUs to enter dynticks-idle state even if | ||
| 180 | they have RCU callbacks queued, and prevents RCU from waking | ||
| 181 | these CPUs up more than roughly once every four jiffies (by | ||
| 182 | default, you can adjust this using the rcutree.rcu_idle_gp_delay | ||
| 183 | parameter), thus improving energy efficiency. On the other | ||
| 184 | hand, this option increases the duration of RCU grace periods, | ||
| 185 | for example, slowing down synchronize_rcu(). | ||
| 186 | |||
| 187 | Say Y if energy efficiency is critically important, and you | ||
| 188 | don't care about increased grace-period durations. | ||
| 189 | |||
| 190 | Say N if you are unsure. | ||
| 191 | |||
| 192 | config RCU_BOOST | ||
| 193 | bool "Enable RCU priority boosting" | ||
| 194 | depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT | ||
| 195 | default n | ||
| 196 | help | ||
| 197 | This option boosts the priority of preempted RCU readers that | ||
| 198 | block the current preemptible RCU grace period for too long. | ||
| 199 | This option also prevents heavy loads from blocking RCU | ||
| 200 | callback invocation for all flavors of RCU. | ||
| 201 | |||
| 202 | Say Y here if you are working with real-time apps or heavy loads | ||
| 203 | Say N here if you are unsure. | ||
| 204 | |||
| 205 | config RCU_BOOST_DELAY | ||
| 206 | int "Milliseconds to delay boosting after RCU grace-period start" | ||
| 207 | range 0 3000 | ||
| 208 | depends on RCU_BOOST | ||
| 209 | default 500 | ||
| 210 | help | ||
| 211 | This option specifies the time to wait after the beginning of | ||
| 212 | a given grace period before priority-boosting preempted RCU | ||
| 213 | readers blocking that grace period. Note that any RCU reader | ||
| 214 | blocking an expedited RCU grace period is boosted immediately. | ||
| 215 | |||
| 216 | Accept the default if unsure. | ||
| 217 | |||
| 218 | config RCU_NOCB_CPU | ||
| 219 | bool "Offload RCU callback processing from boot-selected CPUs" | ||
| 220 | depends on TREE_RCU || PREEMPT_RCU | ||
| 221 | depends on RCU_EXPERT || NO_HZ_FULL | ||
| 222 | default n | ||
| 223 | help | ||
| 224 | Use this option to reduce OS jitter for aggressive HPC or | ||
| 225 | real-time workloads. It can also be used to offload RCU | ||
| 226 | callback invocation to energy-efficient CPUs in battery-powered | ||
| 227 | asymmetric multiprocessors. | ||
| 228 | |||
| 229 | This option offloads callback invocation from the set of | ||
| 230 | CPUs specified at boot time by the rcu_nocbs parameter. | ||
| 231 | For each such CPU, a kthread ("rcuox/N") will be created to | ||
| 232 | invoke callbacks, where the "N" is the CPU being offloaded, | ||
| 233 | and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and | ||
| 234 | "s" for RCU-sched. Nothing prevents this kthread from running | ||
| 235 | on the specified CPUs, but (1) the kthreads may be preempted | ||
| 236 | between each callback, and (2) affinity or cgroups can be used | ||
| 237 | to force the kthreads to run on whatever set of CPUs is desired. | ||
| 238 | |||
| 239 | Say Y here if you want to help to debug reduced OS jitter. | ||
| 240 | Say N here if you are unsure. | ||
| 241 | |||
| 242 | endmenu # "RCU Subsystem" | ||
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug new file mode 100644 index 000000000000..0ec7d1d33a14 --- /dev/null +++ b/kernel/rcu/Kconfig.debug | |||
| @@ -0,0 +1,82 @@ | |||
| 1 | # | ||
| 2 | # RCU-related debugging configuration options | ||
| 3 | # | ||
| 4 | |||
| 5 | menu "RCU Debugging" | ||
| 6 | |||
| 7 | config PROVE_RCU | ||
| 8 | def_bool PROVE_LOCKING | ||
| 9 | |||
| 10 | config TORTURE_TEST | ||
| 11 | tristate | ||
| 12 | default n | ||
| 13 | |||
| 14 | config RCU_PERF_TEST | ||
| 15 | tristate "performance tests for RCU" | ||
| 16 | depends on DEBUG_KERNEL | ||
| 17 | select TORTURE_TEST | ||
| 18 | select SRCU | ||
| 19 | select TASKS_RCU | ||
| 20 | default n | ||
| 21 | help | ||
| 22 | This option provides a kernel module that runs performance | ||
| 23 | tests on the RCU infrastructure. The kernel module may be built | ||
| 24 | after the fact on the running kernel to be tested, if desired. | ||
| 25 | |||
| 26 | Say Y here if you want RCU performance tests to be built into | ||
| 27 | the kernel. | ||
| 28 | Say M if you want the RCU performance tests to build as a module. | ||
| 29 | Say N if you are unsure. | ||
| 30 | |||
| 31 | config RCU_TORTURE_TEST | ||
| 32 | tristate "torture tests for RCU" | ||
| 33 | depends on DEBUG_KERNEL | ||
| 34 | select TORTURE_TEST | ||
| 35 | select SRCU | ||
| 36 | select TASKS_RCU | ||
| 37 | default n | ||
| 38 | help | ||
| 39 | This option provides a kernel module that runs torture tests | ||
| 40 | on the RCU infrastructure. The kernel module may be built | ||
| 41 | after the fact on the running kernel to be tested, if desired. | ||
| 42 | |||
| 43 | Say Y here if you want RCU torture tests to be built into | ||
| 44 | the kernel. | ||
| 45 | Say M if you want the RCU torture tests to build as a module. | ||
| 46 | Say N if you are unsure. | ||
| 47 | |||
| 48 | config RCU_CPU_STALL_TIMEOUT | ||
| 49 | int "RCU CPU stall timeout in seconds" | ||
| 50 | depends on RCU_STALL_COMMON | ||
| 51 | range 3 300 | ||
| 52 | default 21 | ||
| 53 | help | ||
| 54 | If a given RCU grace period extends more than the specified | ||
| 55 | number of seconds, a CPU stall warning is printed. If the | ||
| 56 | RCU grace period persists, additional CPU stall warnings are | ||
| 57 | printed at more widely spaced intervals. | ||
| 58 | |||
| 59 | config RCU_TRACE | ||
| 60 | bool "Enable tracing for RCU" | ||
| 61 | depends on DEBUG_KERNEL | ||
| 62 | default y if TREE_RCU | ||
| 63 | select TRACE_CLOCK | ||
| 64 | help | ||
| 65 | This option enables additional tracepoints for ftrace-style | ||
| 66 | event tracing. | ||
| 67 | |||
| 68 | Say Y here if you want to enable RCU tracing | ||
| 69 | Say N if you are unsure. | ||
| 70 | |||
| 71 | config RCU_EQS_DEBUG | ||
| 72 | bool "Provide debugging asserts for adding NO_HZ support to an arch" | ||
| 73 | depends on DEBUG_KERNEL | ||
| 74 | help | ||
| 75 | This option provides consistency checks in RCU's handling of | ||
| 76 | NO_HZ. These checks have proven quite helpful in detecting | ||
| 77 | bugs in arch-specific NO_HZ code. | ||
| 78 | |||
| 79 | Say N here if you need ultimate kernel/user switch latencies | ||
| 80 | Say Y if you are unsure | ||
| 81 | |||
| 82 | endmenu # "RCU Debugging" | ||
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 23803c7d5180..13c0fc852767 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile | |||
| @@ -3,13 +3,11 @@ | |||
| 3 | KCOV_INSTRUMENT := n | 3 | KCOV_INSTRUMENT := n |
| 4 | 4 | ||
| 5 | obj-y += update.o sync.o | 5 | obj-y += update.o sync.o |
| 6 | obj-$(CONFIG_CLASSIC_SRCU) += srcu.o | ||
| 7 | obj-$(CONFIG_TREE_SRCU) += srcutree.o | 6 | obj-$(CONFIG_TREE_SRCU) += srcutree.o |
| 8 | obj-$(CONFIG_TINY_SRCU) += srcutiny.o | 7 | obj-$(CONFIG_TINY_SRCU) += srcutiny.o |
| 9 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 8 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
| 10 | obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o | 9 | obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o |
| 11 | obj-$(CONFIG_TREE_RCU) += tree.o | 10 | obj-$(CONFIG_TREE_RCU) += tree.o |
| 12 | obj-$(CONFIG_PREEMPT_RCU) += tree.o | 11 | obj-$(CONFIG_PREEMPT_RCU) += tree.o |
| 13 | obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o | ||
| 14 | obj-$(CONFIG_TINY_RCU) += tiny.o | 12 | obj-$(CONFIG_TINY_RCU) += tiny.o |
| 15 | obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o | 13 | obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o |
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 73e16ec4054b..808b8c85f626 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h | |||
| @@ -212,6 +212,18 @@ int rcu_jiffies_till_stall_check(void); | |||
| 212 | */ | 212 | */ |
| 213 | #define TPS(x) tracepoint_string(x) | 213 | #define TPS(x) tracepoint_string(x) |
| 214 | 214 | ||
| 215 | /* | ||
| 216 | * Dump the ftrace buffer, but only one time per callsite per boot. | ||
| 217 | */ | ||
| 218 | #define rcu_ftrace_dump(oops_dump_mode) \ | ||
| 219 | do { \ | ||
| 220 | static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \ | ||
| 221 | \ | ||
| 222 | if (!atomic_read(&___rfd_beenhere) && \ | ||
| 223 | !atomic_xchg(&___rfd_beenhere, 1)) \ | ||
| 224 | ftrace_dump(oops_dump_mode); \ | ||
| 225 | } while (0) | ||
| 226 | |||
| 215 | void rcu_early_boot_tests(void); | 227 | void rcu_early_boot_tests(void); |
| 216 | void rcu_test_sync_prims(void); | 228 | void rcu_test_sync_prims(void); |
| 217 | 229 | ||
| @@ -291,6 +303,271 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) | |||
| 291 | cpu <= rnp->grphi; \ | 303 | cpu <= rnp->grphi; \ |
| 292 | cpu = cpumask_next((cpu), cpu_possible_mask)) | 304 | cpu = cpumask_next((cpu), cpu_possible_mask)) |
| 293 | 305 | ||
| 306 | /* | ||
| 307 | * Wrappers for the rcu_node::lock acquire and release. | ||
| 308 | * | ||
| 309 | * Because the rcu_nodes form a tree, the tree traversal locking will observe | ||
| 310 | * different lock values, this in turn means that an UNLOCK of one level | ||
| 311 | * followed by a LOCK of another level does not imply a full memory barrier; | ||
| 312 | * and most importantly transitivity is lost. | ||
| 313 | * | ||
| 314 | * In order to restore full ordering between tree levels, augment the regular | ||
| 315 | * lock acquire functions with smp_mb__after_unlock_lock(). | ||
| 316 | * | ||
| 317 | * As ->lock of struct rcu_node is a __private field, therefore one should use | ||
| 318 | * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. | ||
| 319 | */ | ||
| 320 | #define raw_spin_lock_rcu_node(p) \ | ||
| 321 | do { \ | ||
| 322 | raw_spin_lock(&ACCESS_PRIVATE(p, lock)); \ | ||
| 323 | smp_mb__after_unlock_lock(); \ | ||
| 324 | } while (0) | ||
| 325 | |||
| 326 | #define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock)) | ||
| 327 | |||
| 328 | #define raw_spin_lock_irq_rcu_node(p) \ | ||
| 329 | do { \ | ||
| 330 | raw_spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ | ||
| 331 | smp_mb__after_unlock_lock(); \ | ||
| 332 | } while (0) | ||
| 333 | |||
| 334 | #define raw_spin_unlock_irq_rcu_node(p) \ | ||
| 335 | raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) | ||
| 336 | |||
| 337 | #define raw_spin_lock_irqsave_rcu_node(p, flags) \ | ||
| 338 | do { \ | ||
| 339 | raw_spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ | ||
| 340 | smp_mb__after_unlock_lock(); \ | ||
| 341 | } while (0) | ||
| 342 | |||
| 343 | #define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ | ||
| 344 | raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ | ||
| 345 | |||
| 346 | #define raw_spin_trylock_rcu_node(p) \ | ||
| 347 | ({ \ | ||
| 348 | bool ___locked = raw_spin_trylock(&ACCESS_PRIVATE(p, lock)); \ | ||
| 349 | \ | ||
| 350 | if (___locked) \ | ||
| 351 | smp_mb__after_unlock_lock(); \ | ||
| 352 | ___locked; \ | ||
| 353 | }) | ||
| 354 | |||
| 294 | #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ | 355 | #endif /* #if defined(SRCU) || !defined(TINY_RCU) */ |
| 295 | 356 | ||
| 357 | #ifdef CONFIG_TINY_RCU | ||
| 358 | /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ | ||
| 359 | static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ | ||
| 360 | { | ||
| 361 | return true; | ||
| 362 | } | ||
| 363 | static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ | ||
| 364 | { | ||
| 365 | return false; | ||
| 366 | } | ||
| 367 | |||
| 368 | static inline void rcu_expedite_gp(void) | ||
| 369 | { | ||
| 370 | } | ||
| 371 | |||
| 372 | static inline void rcu_unexpedite_gp(void) | ||
| 373 | { | ||
| 374 | } | ||
| 375 | #else /* #ifdef CONFIG_TINY_RCU */ | ||
| 376 | bool rcu_gp_is_normal(void); /* Internal RCU use. */ | ||
| 377 | bool rcu_gp_is_expedited(void); /* Internal RCU use. */ | ||
| 378 | void rcu_expedite_gp(void); | ||
| 379 | void rcu_unexpedite_gp(void); | ||
| 380 | void rcupdate_announce_bootup_oddness(void); | ||
| 381 | #endif /* #else #ifdef CONFIG_TINY_RCU */ | ||
| 382 | |||
| 383 | #define RCU_SCHEDULER_INACTIVE 0 | ||
| 384 | #define RCU_SCHEDULER_INIT 1 | ||
| 385 | #define RCU_SCHEDULER_RUNNING 2 | ||
| 386 | |||
| 387 | #ifdef CONFIG_TINY_RCU | ||
| 388 | static inline void rcu_request_urgent_qs_task(struct task_struct *t) { } | ||
| 389 | #else /* #ifdef CONFIG_TINY_RCU */ | ||
| 390 | void rcu_request_urgent_qs_task(struct task_struct *t); | ||
| 391 | #endif /* #else #ifdef CONFIG_TINY_RCU */ | ||
| 392 | |||
| 393 | enum rcutorture_type { | ||
| 394 | RCU_FLAVOR, | ||
| 395 | RCU_BH_FLAVOR, | ||
| 396 | RCU_SCHED_FLAVOR, | ||
| 397 | RCU_TASKS_FLAVOR, | ||
| 398 | SRCU_FLAVOR, | ||
| 399 | INVALID_RCU_FLAVOR | ||
| 400 | }; | ||
| 401 | |||
| 402 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) | ||
| 403 | void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, | ||
| 404 | unsigned long *gpnum, unsigned long *completed); | ||
| 405 | void rcutorture_record_test_transition(void); | ||
| 406 | void rcutorture_record_progress(unsigned long vernum); | ||
| 407 | void do_trace_rcu_torture_read(const char *rcutorturename, | ||
| 408 | struct rcu_head *rhp, | ||
| 409 | unsigned long secs, | ||
| 410 | unsigned long c_old, | ||
| 411 | unsigned long c); | ||
| 412 | #else | ||
| 413 | static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, | ||
| 414 | int *flags, | ||
| 415 | unsigned long *gpnum, | ||
| 416 | unsigned long *completed) | ||
| 417 | { | ||
| 418 | *flags = 0; | ||
| 419 | *gpnum = 0; | ||
| 420 | *completed = 0; | ||
| 421 | } | ||
| 422 | static inline void rcutorture_record_test_transition(void) | ||
| 423 | { | ||
| 424 | } | ||
| 425 | static inline void rcutorture_record_progress(unsigned long vernum) | ||
| 426 | { | ||
| 427 | } | ||
| 428 | #ifdef CONFIG_RCU_TRACE | ||
| 429 | void do_trace_rcu_torture_read(const char *rcutorturename, | ||
| 430 | struct rcu_head *rhp, | ||
| 431 | unsigned long secs, | ||
| 432 | unsigned long c_old, | ||
| 433 | unsigned long c); | ||
| 434 | #else | ||
| 435 | #define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ | ||
| 436 | do { } while (0) | ||
| 437 | #endif | ||
| 438 | #endif | ||
| 439 | |||
| 440 | #ifdef CONFIG_TINY_SRCU | ||
| 441 | |||
| 442 | static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, | ||
| 443 | struct srcu_struct *sp, int *flags, | ||
| 444 | unsigned long *gpnum, | ||
| 445 | unsigned long *completed) | ||
| 446 | { | ||
| 447 | if (test_type != SRCU_FLAVOR) | ||
| 448 | return; | ||
| 449 | *flags = 0; | ||
| 450 | *completed = sp->srcu_idx; | ||
| 451 | *gpnum = *completed; | ||
| 452 | } | ||
| 453 | |||
| 454 | #elif defined(CONFIG_TREE_SRCU) | ||
| 455 | |||
| 456 | void srcutorture_get_gp_data(enum rcutorture_type test_type, | ||
| 457 | struct srcu_struct *sp, int *flags, | ||
| 458 | unsigned long *gpnum, unsigned long *completed); | ||
| 459 | |||
| 460 | #endif | ||
| 461 | |||
| 462 | #ifdef CONFIG_TINY_RCU | ||
| 463 | |||
| 464 | /* | ||
| 465 | * Return the number of grace periods started. | ||
| 466 | */ | ||
| 467 | static inline unsigned long rcu_batches_started(void) | ||
| 468 | { | ||
| 469 | return 0; | ||
| 470 | } | ||
| 471 | |||
| 472 | /* | ||
| 473 | * Return the number of bottom-half grace periods started. | ||
| 474 | */ | ||
| 475 | static inline unsigned long rcu_batches_started_bh(void) | ||
| 476 | { | ||
| 477 | return 0; | ||
| 478 | } | ||
| 479 | |||
| 480 | /* | ||
| 481 | * Return the number of sched grace periods started. | ||
| 482 | */ | ||
| 483 | static inline unsigned long rcu_batches_started_sched(void) | ||
| 484 | { | ||
| 485 | return 0; | ||
| 486 | } | ||
| 487 | |||
| 488 | /* | ||
| 489 | * Return the number of grace periods completed. | ||
| 490 | */ | ||
| 491 | static inline unsigned long rcu_batches_completed(void) | ||
| 492 | { | ||
| 493 | return 0; | ||
| 494 | } | ||
| 495 | |||
| 496 | /* | ||
| 497 | * Return the number of bottom-half grace periods completed. | ||
| 498 | */ | ||
| 499 | static inline unsigned long rcu_batches_completed_bh(void) | ||
| 500 | { | ||
| 501 | return 0; | ||
| 502 | } | ||
| 503 | |||
| 504 | /* | ||
| 505 | * Return the number of sched grace periods completed. | ||
| 506 | */ | ||
| 507 | static inline unsigned long rcu_batches_completed_sched(void) | ||
| 508 | { | ||
| 509 | return 0; | ||
| 510 | } | ||
| 511 | |||
| 512 | /* | ||
| 513 | * Return the number of expedited grace periods completed. | ||
| 514 | */ | ||
| 515 | static inline unsigned long rcu_exp_batches_completed(void) | ||
| 516 | { | ||
| 517 | return 0; | ||
| 518 | } | ||
| 519 | |||
| 520 | /* | ||
| 521 | * Return the number of expedited sched grace periods completed. | ||
| 522 | */ | ||
| 523 | static inline unsigned long rcu_exp_batches_completed_sched(void) | ||
| 524 | { | ||
| 525 | return 0; | ||
| 526 | } | ||
| 527 | |||
| 528 | static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) | ||
| 529 | { | ||
| 530 | return 0; | ||
| 531 | } | ||
| 532 | |||
| 533 | static inline void rcu_force_quiescent_state(void) | ||
| 534 | { | ||
| 535 | } | ||
| 536 | |||
| 537 | static inline void rcu_bh_force_quiescent_state(void) | ||
| 538 | { | ||
| 539 | } | ||
| 540 | |||
| 541 | static inline void rcu_sched_force_quiescent_state(void) | ||
| 542 | { | ||
| 543 | } | ||
| 544 | |||
| 545 | static inline void show_rcu_gp_kthreads(void) | ||
| 546 | { | ||
| 547 | } | ||
| 548 | |||
| 549 | #else /* #ifdef CONFIG_TINY_RCU */ | ||
| 550 | extern unsigned long rcutorture_testseq; | ||
| 551 | extern unsigned long rcutorture_vernum; | ||
| 552 | unsigned long rcu_batches_started(void); | ||
| 553 | unsigned long rcu_batches_started_bh(void); | ||
| 554 | unsigned long rcu_batches_started_sched(void); | ||
| 555 | unsigned long rcu_batches_completed(void); | ||
| 556 | unsigned long rcu_batches_completed_bh(void); | ||
| 557 | unsigned long rcu_batches_completed_sched(void); | ||
| 558 | unsigned long rcu_exp_batches_completed(void); | ||
| 559 | unsigned long rcu_exp_batches_completed_sched(void); | ||
| 560 | unsigned long srcu_batches_completed(struct srcu_struct *sp); | ||
| 561 | void show_rcu_gp_kthreads(void); | ||
| 562 | void rcu_force_quiescent_state(void); | ||
| 563 | void rcu_bh_force_quiescent_state(void); | ||
| 564 | void rcu_sched_force_quiescent_state(void); | ||
| 565 | #endif /* #else #ifdef CONFIG_TINY_RCU */ | ||
| 566 | |||
| 567 | #ifdef CONFIG_RCU_NOCB_CPU | ||
| 568 | bool rcu_is_nocb_cpu(int cpu); | ||
| 569 | #else | ||
| 570 | static inline bool rcu_is_nocb_cpu(int cpu) { return false; } | ||
| 571 | #endif | ||
| 572 | |||
| 296 | #endif /* __LINUX_RCU_H */ | 573 | #endif /* __LINUX_RCU_H */ |
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index a4a86fb47e4a..3cc18110b612 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c | |||
| @@ -48,6 +48,8 @@ | |||
| 48 | #include <linux/torture.h> | 48 | #include <linux/torture.h> |
| 49 | #include <linux/vmalloc.h> | 49 | #include <linux/vmalloc.h> |
| 50 | 50 | ||
| 51 | #include "rcu.h" | ||
| 52 | |||
| 51 | MODULE_LICENSE("GPL"); | 53 | MODULE_LICENSE("GPL"); |
| 52 | MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); | 54 | MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); |
| 53 | 55 | ||
| @@ -59,12 +61,16 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); | |||
| 59 | #define VERBOSE_PERFOUT_ERRSTRING(s) \ | 61 | #define VERBOSE_PERFOUT_ERRSTRING(s) \ |
| 60 | do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) | 62 | do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) |
| 61 | 63 | ||
| 64 | torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); | ||
| 65 | torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); | ||
| 62 | torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); | 66 | torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); |
| 63 | torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); | 67 | torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); |
| 64 | torture_param(int, nreaders, -1, "Number of RCU reader threads"); | 68 | torture_param(int, nreaders, 0, "Number of RCU reader threads"); |
| 65 | torture_param(int, nwriters, -1, "Number of RCU updater threads"); | 69 | torture_param(int, nwriters, -1, "Number of RCU updater threads"); |
| 66 | torture_param(bool, shutdown, false, "Shutdown at end of performance tests."); | 70 | torture_param(bool, shutdown, !IS_ENABLED(MODULE), |
| 71 | "Shutdown at end of performance tests."); | ||
| 67 | torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); | 72 | torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); |
| 73 | torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); | ||
| 68 | 74 | ||
| 69 | static char *perf_type = "rcu"; | 75 | static char *perf_type = "rcu"; |
| 70 | module_param(perf_type, charp, 0444); | 76 | module_param(perf_type, charp, 0444); |
| @@ -86,13 +92,16 @@ static u64 t_rcu_perf_writer_started; | |||
| 86 | static u64 t_rcu_perf_writer_finished; | 92 | static u64 t_rcu_perf_writer_finished; |
| 87 | static unsigned long b_rcu_perf_writer_started; | 93 | static unsigned long b_rcu_perf_writer_started; |
| 88 | static unsigned long b_rcu_perf_writer_finished; | 94 | static unsigned long b_rcu_perf_writer_finished; |
| 95 | static DEFINE_PER_CPU(atomic_t, n_async_inflight); | ||
| 89 | 96 | ||
| 90 | static int rcu_perf_writer_state; | 97 | static int rcu_perf_writer_state; |
| 91 | #define RTWS_INIT 0 | 98 | #define RTWS_INIT 0 |
| 92 | #define RTWS_EXP_SYNC 1 | 99 | #define RTWS_ASYNC 1 |
| 93 | #define RTWS_SYNC 2 | 100 | #define RTWS_BARRIER 2 |
| 94 | #define RTWS_IDLE 2 | 101 | #define RTWS_EXP_SYNC 3 |
| 95 | #define RTWS_STOPPING 3 | 102 | #define RTWS_SYNC 4 |
| 103 | #define RTWS_IDLE 5 | ||
| 104 | #define RTWS_STOPPING 6 | ||
| 96 | 105 | ||
| 97 | #define MAX_MEAS 10000 | 106 | #define MAX_MEAS 10000 |
| 98 | #define MIN_MEAS 100 | 107 | #define MIN_MEAS 100 |
| @@ -114,6 +123,8 @@ struct rcu_perf_ops { | |||
| 114 | unsigned long (*started)(void); | 123 | unsigned long (*started)(void); |
| 115 | unsigned long (*completed)(void); | 124 | unsigned long (*completed)(void); |
| 116 | unsigned long (*exp_completed)(void); | 125 | unsigned long (*exp_completed)(void); |
| 126 | void (*async)(struct rcu_head *head, rcu_callback_t func); | ||
| 127 | void (*gp_barrier)(void); | ||
| 117 | void (*sync)(void); | 128 | void (*sync)(void); |
| 118 | void (*exp_sync)(void); | 129 | void (*exp_sync)(void); |
| 119 | const char *name; | 130 | const char *name; |
| @@ -153,6 +164,8 @@ static struct rcu_perf_ops rcu_ops = { | |||
| 153 | .started = rcu_batches_started, | 164 | .started = rcu_batches_started, |
| 154 | .completed = rcu_batches_completed, | 165 | .completed = rcu_batches_completed, |
| 155 | .exp_completed = rcu_exp_batches_completed, | 166 | .exp_completed = rcu_exp_batches_completed, |
| 167 | .async = call_rcu, | ||
| 168 | .gp_barrier = rcu_barrier, | ||
| 156 | .sync = synchronize_rcu, | 169 | .sync = synchronize_rcu, |
| 157 | .exp_sync = synchronize_rcu_expedited, | 170 | .exp_sync = synchronize_rcu_expedited, |
| 158 | .name = "rcu" | 171 | .name = "rcu" |
| @@ -181,6 +194,8 @@ static struct rcu_perf_ops rcu_bh_ops = { | |||
| 181 | .started = rcu_batches_started_bh, | 194 | .started = rcu_batches_started_bh, |
| 182 | .completed = rcu_batches_completed_bh, | 195 | .completed = rcu_batches_completed_bh, |
| 183 | .exp_completed = rcu_exp_batches_completed_sched, | 196 | .exp_completed = rcu_exp_batches_completed_sched, |
| 197 | .async = call_rcu_bh, | ||
| 198 | .gp_barrier = rcu_barrier_bh, | ||
| 184 | .sync = synchronize_rcu_bh, | 199 | .sync = synchronize_rcu_bh, |
| 185 | .exp_sync = synchronize_rcu_bh_expedited, | 200 | .exp_sync = synchronize_rcu_bh_expedited, |
| 186 | .name = "rcu_bh" | 201 | .name = "rcu_bh" |
| @@ -208,6 +223,16 @@ static unsigned long srcu_perf_completed(void) | |||
| 208 | return srcu_batches_completed(srcu_ctlp); | 223 | return srcu_batches_completed(srcu_ctlp); |
| 209 | } | 224 | } |
| 210 | 225 | ||
| 226 | static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func) | ||
| 227 | { | ||
| 228 | call_srcu(srcu_ctlp, head, func); | ||
| 229 | } | ||
| 230 | |||
| 231 | static void srcu_rcu_barrier(void) | ||
| 232 | { | ||
| 233 | srcu_barrier(srcu_ctlp); | ||
| 234 | } | ||
| 235 | |||
| 211 | static void srcu_perf_synchronize(void) | 236 | static void srcu_perf_synchronize(void) |
| 212 | { | 237 | { |
| 213 | synchronize_srcu(srcu_ctlp); | 238 | synchronize_srcu(srcu_ctlp); |
| @@ -226,11 +251,42 @@ static struct rcu_perf_ops srcu_ops = { | |||
| 226 | .started = NULL, | 251 | .started = NULL, |
| 227 | .completed = srcu_perf_completed, | 252 | .completed = srcu_perf_completed, |
| 228 | .exp_completed = srcu_perf_completed, | 253 | .exp_completed = srcu_perf_completed, |
| 254 | .async = srcu_call_rcu, | ||
| 255 | .gp_barrier = srcu_rcu_barrier, | ||
| 229 | .sync = srcu_perf_synchronize, | 256 | .sync = srcu_perf_synchronize, |
| 230 | .exp_sync = srcu_perf_synchronize_expedited, | 257 | .exp_sync = srcu_perf_synchronize_expedited, |
| 231 | .name = "srcu" | 258 | .name = "srcu" |
| 232 | }; | 259 | }; |
| 233 | 260 | ||
| 261 | static struct srcu_struct srcud; | ||
| 262 | |||
| 263 | static void srcu_sync_perf_init(void) | ||
| 264 | { | ||
| 265 | srcu_ctlp = &srcud; | ||
| 266 | init_srcu_struct(srcu_ctlp); | ||
| 267 | } | ||
| 268 | |||
| 269 | static void srcu_sync_perf_cleanup(void) | ||
| 270 | { | ||
| 271 | cleanup_srcu_struct(srcu_ctlp); | ||
| 272 | } | ||
| 273 | |||
| 274 | static struct rcu_perf_ops srcud_ops = { | ||
| 275 | .ptype = SRCU_FLAVOR, | ||
| 276 | .init = srcu_sync_perf_init, | ||
| 277 | .cleanup = srcu_sync_perf_cleanup, | ||
| 278 | .readlock = srcu_perf_read_lock, | ||
| 279 | .readunlock = srcu_perf_read_unlock, | ||
| 280 | .started = NULL, | ||
| 281 | .completed = srcu_perf_completed, | ||
| 282 | .exp_completed = srcu_perf_completed, | ||
| 283 | .async = srcu_call_rcu, | ||
| 284 | .gp_barrier = srcu_rcu_barrier, | ||
| 285 | .sync = srcu_perf_synchronize, | ||
| 286 | .exp_sync = srcu_perf_synchronize_expedited, | ||
| 287 | .name = "srcud" | ||
| 288 | }; | ||
| 289 | |||
| 234 | /* | 290 | /* |
| 235 | * Definitions for sched perf testing. | 291 | * Definitions for sched perf testing. |
| 236 | */ | 292 | */ |
| @@ -254,6 +310,8 @@ static struct rcu_perf_ops sched_ops = { | |||
| 254 | .started = rcu_batches_started_sched, | 310 | .started = rcu_batches_started_sched, |
| 255 | .completed = rcu_batches_completed_sched, | 311 | .completed = rcu_batches_completed_sched, |
| 256 | .exp_completed = rcu_exp_batches_completed_sched, | 312 | .exp_completed = rcu_exp_batches_completed_sched, |
| 313 | .async = call_rcu_sched, | ||
| 314 | .gp_barrier = rcu_barrier_sched, | ||
| 257 | .sync = synchronize_sched, | 315 | .sync = synchronize_sched, |
| 258 | .exp_sync = synchronize_sched_expedited, | 316 | .exp_sync = synchronize_sched_expedited, |
| 259 | .name = "sched" | 317 | .name = "sched" |
| @@ -281,6 +339,8 @@ static struct rcu_perf_ops tasks_ops = { | |||
| 281 | .readunlock = tasks_perf_read_unlock, | 339 | .readunlock = tasks_perf_read_unlock, |
| 282 | .started = rcu_no_completed, | 340 | .started = rcu_no_completed, |
| 283 | .completed = rcu_no_completed, | 341 | .completed = rcu_no_completed, |
| 342 | .async = call_rcu_tasks, | ||
| 343 | .gp_barrier = rcu_barrier_tasks, | ||
| 284 | .sync = synchronize_rcu_tasks, | 344 | .sync = synchronize_rcu_tasks, |
| 285 | .exp_sync = synchronize_rcu_tasks, | 345 | .exp_sync = synchronize_rcu_tasks, |
| 286 | .name = "tasks" | 346 | .name = "tasks" |
| @@ -344,6 +404,15 @@ rcu_perf_reader(void *arg) | |||
| 344 | } | 404 | } |
| 345 | 405 | ||
| 346 | /* | 406 | /* |
| 407 | * Callback function for asynchronous grace periods from rcu_perf_writer(). | ||
| 408 | */ | ||
| 409 | static void rcu_perf_async_cb(struct rcu_head *rhp) | ||
| 410 | { | ||
| 411 | atomic_dec(this_cpu_ptr(&n_async_inflight)); | ||
| 412 | kfree(rhp); | ||
| 413 | } | ||
| 414 | |||
| 415 | /* | ||
| 347 | * RCU perf writer kthread. Repeatedly does a grace period. | 416 | * RCU perf writer kthread. Repeatedly does a grace period. |
| 348 | */ | 417 | */ |
| 349 | static int | 418 | static int |
| @@ -352,6 +421,7 @@ rcu_perf_writer(void *arg) | |||
| 352 | int i = 0; | 421 | int i = 0; |
| 353 | int i_max; | 422 | int i_max; |
| 354 | long me = (long)arg; | 423 | long me = (long)arg; |
| 424 | struct rcu_head *rhp = NULL; | ||
| 355 | struct sched_param sp; | 425 | struct sched_param sp; |
| 356 | bool started = false, done = false, alldone = false; | 426 | bool started = false, done = false, alldone = false; |
| 357 | u64 t; | 427 | u64 t; |
| @@ -380,9 +450,27 @@ rcu_perf_writer(void *arg) | |||
| 380 | } | 450 | } |
| 381 | 451 | ||
| 382 | do { | 452 | do { |
| 453 | if (writer_holdoff) | ||
| 454 | udelay(writer_holdoff); | ||
| 383 | wdp = &wdpp[i]; | 455 | wdp = &wdpp[i]; |
| 384 | *wdp = ktime_get_mono_fast_ns(); | 456 | *wdp = ktime_get_mono_fast_ns(); |
| 385 | if (gp_exp) { | 457 | if (gp_async) { |
| 458 | retry: | ||
| 459 | if (!rhp) | ||
| 460 | rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); | ||
| 461 | if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { | ||
| 462 | rcu_perf_writer_state = RTWS_ASYNC; | ||
| 463 | atomic_inc(this_cpu_ptr(&n_async_inflight)); | ||
| 464 | cur_ops->async(rhp, rcu_perf_async_cb); | ||
| 465 | rhp = NULL; | ||
| 466 | } else if (!kthread_should_stop()) { | ||
| 467 | rcu_perf_writer_state = RTWS_BARRIER; | ||
| 468 | cur_ops->gp_barrier(); | ||
| 469 | goto retry; | ||
| 470 | } else { | ||
| 471 | kfree(rhp); /* Because we are stopping. */ | ||
| 472 | } | ||
| 473 | } else if (gp_exp) { | ||
| 386 | rcu_perf_writer_state = RTWS_EXP_SYNC; | 474 | rcu_perf_writer_state = RTWS_EXP_SYNC; |
| 387 | cur_ops->exp_sync(); | 475 | cur_ops->exp_sync(); |
| 388 | } else { | 476 | } else { |
| @@ -429,6 +517,10 @@ rcu_perf_writer(void *arg) | |||
| 429 | i++; | 517 | i++; |
| 430 | rcu_perf_wait_shutdown(); | 518 | rcu_perf_wait_shutdown(); |
| 431 | } while (!torture_must_stop()); | 519 | } while (!torture_must_stop()); |
| 520 | if (gp_async) { | ||
| 521 | rcu_perf_writer_state = RTWS_BARRIER; | ||
| 522 | cur_ops->gp_barrier(); | ||
| 523 | } | ||
| 432 | rcu_perf_writer_state = RTWS_STOPPING; | 524 | rcu_perf_writer_state = RTWS_STOPPING; |
| 433 | writer_n_durations[me] = i_max; | 525 | writer_n_durations[me] = i_max; |
| 434 | torture_kthread_stopping("rcu_perf_writer"); | 526 | torture_kthread_stopping("rcu_perf_writer"); |
| @@ -452,6 +544,17 @@ rcu_perf_cleanup(void) | |||
| 452 | u64 *wdp; | 544 | u64 *wdp; |
| 453 | u64 *wdpp; | 545 | u64 *wdpp; |
| 454 | 546 | ||
| 547 | /* | ||
| 548 | * Would like warning at start, but everything is expedited | ||
| 549 | * during the mid-boot phase, so have to wait till the end. | ||
| 550 | */ | ||
| 551 | if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) | ||
| 552 | VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); | ||
| 553 | if (rcu_gp_is_normal() && gp_exp) | ||
| 554 | VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); | ||
| 555 | if (gp_exp && gp_async) | ||
| 556 | VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!"); | ||
| 557 | |||
| 455 | if (torture_cleanup_begin()) | 558 | if (torture_cleanup_begin()) |
| 456 | return; | 559 | return; |
| 457 | 560 | ||
| @@ -554,7 +657,7 @@ rcu_perf_init(void) | |||
| 554 | long i; | 657 | long i; |
| 555 | int firsterr = 0; | 658 | int firsterr = 0; |
| 556 | static struct rcu_perf_ops *perf_ops[] = { | 659 | static struct rcu_perf_ops *perf_ops[] = { |
| 557 | &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, | 660 | &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops, |
| 558 | RCUPERF_TASKS_OPS | 661 | RCUPERF_TASKS_OPS |
| 559 | }; | 662 | }; |
| 560 | 663 | ||
| @@ -624,16 +727,6 @@ rcu_perf_init(void) | |||
| 624 | firsterr = -ENOMEM; | 727 | firsterr = -ENOMEM; |
| 625 | goto unwind; | 728 | goto unwind; |
| 626 | } | 729 | } |
| 627 | if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) { | ||
| 628 | VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); | ||
| 629 | firsterr = -EINVAL; | ||
| 630 | goto unwind; | ||
| 631 | } | ||
| 632 | if (rcu_gp_is_normal() && gp_exp) { | ||
| 633 | VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); | ||
| 634 | firsterr = -EINVAL; | ||
| 635 | goto unwind; | ||
| 636 | } | ||
| 637 | for (i = 0; i < nrealwriters; i++) { | 730 | for (i = 0; i < nrealwriters; i++) { |
| 638 | writer_durations[i] = | 731 | writer_durations[i] = |
| 639 | kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), | 732 | kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), |
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ae6e574d4cf5..b8f7f8ce8575 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
| @@ -52,6 +52,8 @@ | |||
| 52 | #include <linux/torture.h> | 52 | #include <linux/torture.h> |
| 53 | #include <linux/vmalloc.h> | 53 | #include <linux/vmalloc.h> |
| 54 | 54 | ||
| 55 | #include "rcu.h" | ||
| 56 | |||
| 55 | MODULE_LICENSE("GPL"); | 57 | MODULE_LICENSE("GPL"); |
| 56 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); | 58 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); |
| 57 | 59 | ||
| @@ -562,31 +564,19 @@ static void srcu_torture_stats(void) | |||
| 562 | int __maybe_unused cpu; | 564 | int __maybe_unused cpu; |
| 563 | int idx; | 565 | int idx; |
| 564 | 566 | ||
| 565 | #if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU) | ||
| 566 | #ifdef CONFIG_TREE_SRCU | 567 | #ifdef CONFIG_TREE_SRCU |
| 567 | idx = srcu_ctlp->srcu_idx & 0x1; | 568 | idx = srcu_ctlp->srcu_idx & 0x1; |
| 568 | #else /* #ifdef CONFIG_TREE_SRCU */ | ||
| 569 | idx = srcu_ctlp->completed & 0x1; | ||
| 570 | #endif /* #else #ifdef CONFIG_TREE_SRCU */ | ||
| 571 | pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", | 569 | pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", |
| 572 | torture_type, TORTURE_FLAG, idx); | 570 | torture_type, TORTURE_FLAG, idx); |
| 573 | for_each_possible_cpu(cpu) { | 571 | for_each_possible_cpu(cpu) { |
| 574 | unsigned long l0, l1; | 572 | unsigned long l0, l1; |
| 575 | unsigned long u0, u1; | 573 | unsigned long u0, u1; |
| 576 | long c0, c1; | 574 | long c0, c1; |
| 577 | #ifdef CONFIG_TREE_SRCU | ||
| 578 | struct srcu_data *counts; | 575 | struct srcu_data *counts; |
| 579 | 576 | ||
| 580 | counts = per_cpu_ptr(srcu_ctlp->sda, cpu); | 577 | counts = per_cpu_ptr(srcu_ctlp->sda, cpu); |
| 581 | u0 = counts->srcu_unlock_count[!idx]; | 578 | u0 = counts->srcu_unlock_count[!idx]; |
| 582 | u1 = counts->srcu_unlock_count[idx]; | 579 | u1 = counts->srcu_unlock_count[idx]; |
| 583 | #else /* #ifdef CONFIG_TREE_SRCU */ | ||
| 584 | struct srcu_array *counts; | ||
| 585 | |||
| 586 | counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); | ||
| 587 | u0 = counts->unlock_count[!idx]; | ||
| 588 | u1 = counts->unlock_count[idx]; | ||
| 589 | #endif /* #else #ifdef CONFIG_TREE_SRCU */ | ||
| 590 | 580 | ||
| 591 | /* | 581 | /* |
| 592 | * Make sure that a lock is always counted if the corresponding | 582 | * Make sure that a lock is always counted if the corresponding |
| @@ -594,13 +584,8 @@ static void srcu_torture_stats(void) | |||
| 594 | */ | 584 | */ |
| 595 | smp_rmb(); | 585 | smp_rmb(); |
| 596 | 586 | ||
| 597 | #ifdef CONFIG_TREE_SRCU | ||
| 598 | l0 = counts->srcu_lock_count[!idx]; | 587 | l0 = counts->srcu_lock_count[!idx]; |
| 599 | l1 = counts->srcu_lock_count[idx]; | 588 | l1 = counts->srcu_lock_count[idx]; |
| 600 | #else /* #ifdef CONFIG_TREE_SRCU */ | ||
| 601 | l0 = counts->lock_count[!idx]; | ||
| 602 | l1 = counts->lock_count[idx]; | ||
| 603 | #endif /* #else #ifdef CONFIG_TREE_SRCU */ | ||
| 604 | 589 | ||
| 605 | c0 = l0 - u0; | 590 | c0 = l0 - u0; |
| 606 | c1 = l1 - u1; | 591 | c1 = l1 - u1; |
| @@ -609,7 +594,7 @@ static void srcu_torture_stats(void) | |||
| 609 | pr_cont("\n"); | 594 | pr_cont("\n"); |
| 610 | #elif defined(CONFIG_TINY_SRCU) | 595 | #elif defined(CONFIG_TINY_SRCU) |
| 611 | idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; | 596 | idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; |
| 612 | pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n", | 597 | pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", |
| 613 | torture_type, TORTURE_FLAG, idx, | 598 | torture_type, TORTURE_FLAG, idx, |
| 614 | READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), | 599 | READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), |
| 615 | READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); | 600 | READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); |
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c deleted file mode 100644 index 584d8a983883..000000000000 --- a/kernel/rcu/srcu.c +++ /dev/null | |||
| @@ -1,662 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Sleepable Read-Copy Update mechanism for mutual exclusion. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, you can access it online at | ||
| 16 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
| 17 | * | ||
| 18 | * Copyright (C) IBM Corporation, 2006 | ||
| 19 | * Copyright (C) Fujitsu, 2012 | ||
| 20 | * | ||
| 21 | * Author: Paul McKenney <paulmck@us.ibm.com> | ||
| 22 | * Lai Jiangshan <laijs@cn.fujitsu.com> | ||
| 23 | * | ||
| 24 | * For detailed explanation of Read-Copy Update mechanism see - | ||
| 25 | * Documentation/RCU/ *.txt | ||
| 26 | * | ||
| 27 | */ | ||
| 28 | |||
| 29 | #include <linux/export.h> | ||
| 30 | #include <linux/mutex.h> | ||
| 31 | #include <linux/percpu.h> | ||
| 32 | #include <linux/preempt.h> | ||
| 33 | #include <linux/rcupdate_wait.h> | ||
| 34 | #include <linux/sched.h> | ||
| 35 | #include <linux/smp.h> | ||
| 36 | #include <linux/delay.h> | ||
| 37 | #include <linux/srcu.h> | ||
| 38 | |||
| 39 | #include "rcu.h" | ||
| 40 | |||
| 41 | /* | ||
| 42 | * Initialize an rcu_batch structure to empty. | ||
| 43 | */ | ||
| 44 | static inline void rcu_batch_init(struct rcu_batch *b) | ||
| 45 | { | ||
| 46 | b->head = NULL; | ||
| 47 | b->tail = &b->head; | ||
| 48 | } | ||
| 49 | |||
| 50 | /* | ||
| 51 | * Enqueue a callback onto the tail of the specified rcu_batch structure. | ||
| 52 | */ | ||
| 53 | static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) | ||
| 54 | { | ||
| 55 | *b->tail = head; | ||
| 56 | b->tail = &head->next; | ||
| 57 | } | ||
| 58 | |||
| 59 | /* | ||
| 60 | * Is the specified rcu_batch structure empty? | ||
| 61 | */ | ||
| 62 | static inline bool rcu_batch_empty(struct rcu_batch *b) | ||
| 63 | { | ||
| 64 | return b->tail == &b->head; | ||
| 65 | } | ||
| 66 | |||
| 67 | /* | ||
| 68 | * Remove the callback at the head of the specified rcu_batch structure | ||
| 69 | * and return a pointer to it, or return NULL if the structure is empty. | ||
| 70 | */ | ||
| 71 | static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) | ||
| 72 | { | ||
| 73 | struct rcu_head *head; | ||
| 74 | |||
| 75 | if (rcu_batch_empty(b)) | ||
| 76 | return NULL; | ||
| 77 | |||
| 78 | head = b->head; | ||
| 79 | b->head = head->next; | ||
| 80 | if (b->tail == &head->next) | ||
| 81 | rcu_batch_init(b); | ||
| 82 | |||
| 83 | return head; | ||
| 84 | } | ||
| 85 | |||
| 86 | /* | ||
| 87 | * Move all callbacks from the rcu_batch structure specified by "from" to | ||
| 88 | * the structure specified by "to". | ||
| 89 | */ | ||
| 90 | static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) | ||
| 91 | { | ||
| 92 | if (!rcu_batch_empty(from)) { | ||
| 93 | *to->tail = from->head; | ||
| 94 | to->tail = from->tail; | ||
| 95 | rcu_batch_init(from); | ||
| 96 | } | ||
| 97 | } | ||
| 98 | |||
| 99 | static int init_srcu_struct_fields(struct srcu_struct *sp) | ||
| 100 | { | ||
| 101 | sp->completed = 0; | ||
| 102 | spin_lock_init(&sp->queue_lock); | ||
| 103 | sp->running = false; | ||
| 104 | rcu_batch_init(&sp->batch_queue); | ||
| 105 | rcu_batch_init(&sp->batch_check0); | ||
| 106 | rcu_batch_init(&sp->batch_check1); | ||
| 107 | rcu_batch_init(&sp->batch_done); | ||
| 108 | INIT_DELAYED_WORK(&sp->work, process_srcu); | ||
| 109 | sp->per_cpu_ref = alloc_percpu(struct srcu_array); | ||
| 110 | return sp->per_cpu_ref ? 0 : -ENOMEM; | ||
| 111 | } | ||
| 112 | |||
| 113 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 114 | |||
| 115 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, | ||
| 116 | struct lock_class_key *key) | ||
| 117 | { | ||
| 118 | /* Don't re-initialize a lock while it is held. */ | ||
| 119 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); | ||
| 120 | lockdep_init_map(&sp->dep_map, name, key, 0); | ||
| 121 | return init_srcu_struct_fields(sp); | ||
| 122 | } | ||
| 123 | EXPORT_SYMBOL_GPL(__init_srcu_struct); | ||
| 124 | |||
| 125 | #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
| 126 | |||
| 127 | /** | ||
| 128 | * init_srcu_struct - initialize a sleep-RCU structure | ||
| 129 | * @sp: structure to initialize. | ||
| 130 | * | ||
| 131 | * Must invoke this on a given srcu_struct before passing that srcu_struct | ||
| 132 | * to any other function. Each srcu_struct represents a separate domain | ||
| 133 | * of SRCU protection. | ||
| 134 | */ | ||
| 135 | int init_srcu_struct(struct srcu_struct *sp) | ||
| 136 | { | ||
| 137 | return init_srcu_struct_fields(sp); | ||
| 138 | } | ||
| 139 | EXPORT_SYMBOL_GPL(init_srcu_struct); | ||
| 140 | |||
| 141 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
| 142 | |||
| 143 | /* | ||
| 144 | * Returns approximate total of the readers' ->lock_count[] values for the | ||
| 145 | * rank of per-CPU counters specified by idx. | ||
| 146 | */ | ||
| 147 | static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) | ||
| 148 | { | ||
| 149 | int cpu; | ||
| 150 | unsigned long sum = 0; | ||
| 151 | |||
| 152 | for_each_possible_cpu(cpu) { | ||
| 153 | struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); | ||
| 154 | |||
| 155 | sum += READ_ONCE(cpuc->lock_count[idx]); | ||
| 156 | } | ||
| 157 | return sum; | ||
| 158 | } | ||
| 159 | |||
| 160 | /* | ||
| 161 | * Returns approximate total of the readers' ->unlock_count[] values for the | ||
| 162 | * rank of per-CPU counters specified by idx. | ||
| 163 | */ | ||
| 164 | static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) | ||
| 165 | { | ||
| 166 | int cpu; | ||
| 167 | unsigned long sum = 0; | ||
| 168 | |||
| 169 | for_each_possible_cpu(cpu) { | ||
| 170 | struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); | ||
| 171 | |||
| 172 | sum += READ_ONCE(cpuc->unlock_count[idx]); | ||
| 173 | } | ||
| 174 | return sum; | ||
| 175 | } | ||
| 176 | |||
| 177 | /* | ||
| 178 | * Return true if the number of pre-existing readers is determined to | ||
| 179 | * be zero. | ||
| 180 | */ | ||
| 181 | static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) | ||
| 182 | { | ||
| 183 | unsigned long unlocks; | ||
| 184 | |||
| 185 | unlocks = srcu_readers_unlock_idx(sp, idx); | ||
| 186 | |||
| 187 | /* | ||
| 188 | * Make sure that a lock is always counted if the corresponding unlock | ||
| 189 | * is counted. Needs to be a smp_mb() as the read side may contain a | ||
| 190 | * read from a variable that is written to before the synchronize_srcu() | ||
| 191 | * in the write side. In this case smp_mb()s A and B act like the store | ||
| 192 | * buffering pattern. | ||
| 193 | * | ||
| 194 | * This smp_mb() also pairs with smp_mb() C to prevent accesses after the | ||
| 195 | * synchronize_srcu() from being executed before the grace period ends. | ||
| 196 | */ | ||
| 197 | smp_mb(); /* A */ | ||
| 198 | |||
| 199 | /* | ||
| 200 | * If the locks are the same as the unlocks, then there must have | ||
| 201 | * been no readers on this index at some time in between. This does not | ||
| 202 | * mean that there are no more readers, as one could have read the | ||
| 203 | * current index but not have incremented the lock counter yet. | ||
| 204 | * | ||
| 205 | * Possible bug: There is no guarantee that there haven't been ULONG_MAX | ||
| 206 | * increments of ->lock_count[] since the unlocks were counted, meaning | ||
| 207 | * that this could return true even if there are still active readers. | ||
| 208 | * Since there are no memory barriers around srcu_flip(), the CPU is not | ||
| 209 | * required to increment ->completed before running | ||
| 210 | * srcu_readers_unlock_idx(), which means that there could be an | ||
| 211 | * arbitrarily large number of critical sections that execute after | ||
| 212 | * srcu_readers_unlock_idx() but use the old value of ->completed. | ||
| 213 | */ | ||
| 214 | return srcu_readers_lock_idx(sp, idx) == unlocks; | ||
| 215 | } | ||
| 216 | |||
| 217 | /** | ||
| 218 | * srcu_readers_active - returns true if there are readers. and false | ||
| 219 | * otherwise | ||
| 220 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). | ||
| 221 | * | ||
| 222 | * Note that this is not an atomic primitive, and can therefore suffer | ||
| 223 | * severe errors when invoked on an active srcu_struct. That said, it | ||
| 224 | * can be useful as an error check at cleanup time. | ||
| 225 | */ | ||
| 226 | static bool srcu_readers_active(struct srcu_struct *sp) | ||
| 227 | { | ||
| 228 | int cpu; | ||
| 229 | unsigned long sum = 0; | ||
| 230 | |||
| 231 | for_each_possible_cpu(cpu) { | ||
| 232 | struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); | ||
| 233 | |||
| 234 | sum += READ_ONCE(cpuc->lock_count[0]); | ||
| 235 | sum += READ_ONCE(cpuc->lock_count[1]); | ||
| 236 | sum -= READ_ONCE(cpuc->unlock_count[0]); | ||
| 237 | sum -= READ_ONCE(cpuc->unlock_count[1]); | ||
| 238 | } | ||
| 239 | return sum; | ||
| 240 | } | ||
| 241 | |||
| 242 | /** | ||
| 243 | * cleanup_srcu_struct - deconstruct a sleep-RCU structure | ||
| 244 | * @sp: structure to clean up. | ||
| 245 | * | ||
| 246 | * Must invoke this only after you are finished using a given srcu_struct | ||
| 247 | * that was initialized via init_srcu_struct(). This code does some | ||
| 248 | * probabalistic checking, spotting late uses of srcu_read_lock(), | ||
| 249 | * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu(). | ||
| 250 | * If any such late uses are detected, the per-CPU memory associated with | ||
| 251 | * the srcu_struct is simply leaked and WARN_ON() is invoked. If the | ||
| 252 | * caller frees the srcu_struct itself, a use-after-free crash will likely | ||
| 253 | * ensue, but at least there will be a warning printed. | ||
| 254 | */ | ||
| 255 | void cleanup_srcu_struct(struct srcu_struct *sp) | ||
| 256 | { | ||
| 257 | if (WARN_ON(srcu_readers_active(sp))) | ||
| 258 | return; /* Leakage unless caller handles error. */ | ||
| 259 | free_percpu(sp->per_cpu_ref); | ||
| 260 | sp->per_cpu_ref = NULL; | ||
| 261 | } | ||
| 262 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); | ||
| 263 | |||
| 264 | /* | ||
| 265 | * Counts the new reader in the appropriate per-CPU element of the | ||
| 266 | * srcu_struct. Must be called from process context. | ||
| 267 | * Returns an index that must be passed to the matching srcu_read_unlock(). | ||
| 268 | */ | ||
| 269 | int __srcu_read_lock(struct srcu_struct *sp) | ||
| 270 | { | ||
| 271 | int idx; | ||
| 272 | |||
| 273 | idx = READ_ONCE(sp->completed) & 0x1; | ||
| 274 | __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); | ||
| 275 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ | ||
| 276 | return idx; | ||
| 277 | } | ||
| 278 | EXPORT_SYMBOL_GPL(__srcu_read_lock); | ||
| 279 | |||
| 280 | /* | ||
| 281 | * Removes the count for the old reader from the appropriate per-CPU | ||
| 282 | * element of the srcu_struct. Note that this may well be a different | ||
| 283 | * CPU than that which was incremented by the corresponding srcu_read_lock(). | ||
| 284 | * Must be called from process context. | ||
| 285 | */ | ||
| 286 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) | ||
| 287 | { | ||
| 288 | smp_mb(); /* C */ /* Avoid leaking the critical section. */ | ||
| 289 | this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]); | ||
| 290 | } | ||
| 291 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | ||
| 292 | |||
| 293 | /* | ||
| 294 | * We use an adaptive strategy for synchronize_srcu() and especially for | ||
| 295 | * synchronize_srcu_expedited(). We spin for a fixed time period | ||
| 296 | * (defined below) to allow SRCU readers to exit their read-side critical | ||
| 297 | * sections. If there are still some readers after 10 microseconds, | ||
| 298 | * we repeatedly block for 1-millisecond time periods. This approach | ||
| 299 | * has done well in testing, so there is no need for a config parameter. | ||
| 300 | */ | ||
| 301 | #define SRCU_RETRY_CHECK_DELAY 5 | ||
| 302 | #define SYNCHRONIZE_SRCU_TRYCOUNT 2 | ||
| 303 | #define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 | ||
| 304 | |||
| 305 | /* | ||
| 306 | * @@@ Wait until all pre-existing readers complete. Such readers | ||
| 307 | * will have used the index specified by "idx". | ||
| 308 | * the caller should ensures the ->completed is not changed while checking | ||
| 309 | * and idx = (->completed & 1) ^ 1 | ||
| 310 | */ | ||
| 311 | static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) | ||
| 312 | { | ||
| 313 | for (;;) { | ||
| 314 | if (srcu_readers_active_idx_check(sp, idx)) | ||
| 315 | return true; | ||
| 316 | if (--trycount <= 0) | ||
| 317 | return false; | ||
| 318 | udelay(SRCU_RETRY_CHECK_DELAY); | ||
| 319 | } | ||
| 320 | } | ||
| 321 | |||
| 322 | /* | ||
| 323 | * Increment the ->completed counter so that future SRCU readers will | ||
| 324 | * use the other rank of the ->(un)lock_count[] arrays. This allows | ||
| 325 | * us to wait for pre-existing readers in a starvation-free manner. | ||
| 326 | */ | ||
| 327 | static void srcu_flip(struct srcu_struct *sp) | ||
| 328 | { | ||
| 329 | WRITE_ONCE(sp->completed, sp->completed + 1); | ||
| 330 | |||
| 331 | /* | ||
| 332 | * Ensure that if the updater misses an __srcu_read_unlock() | ||
| 333 | * increment, that task's next __srcu_read_lock() will see the | ||
| 334 | * above counter update. Note that both this memory barrier | ||
| 335 | * and the one in srcu_readers_active_idx_check() provide the | ||
| 336 | * guarantee for __srcu_read_lock(). | ||
| 337 | */ | ||
| 338 | smp_mb(); /* D */ /* Pairs with C. */ | ||
| 339 | } | ||
| 340 | |||
| 341 | /* | ||
| 342 | * Enqueue an SRCU callback on the specified srcu_struct structure, | ||
| 343 | * initiating grace-period processing if it is not already running. | ||
| 344 | * | ||
| 345 | * Note that all CPUs must agree that the grace period extended beyond | ||
| 346 | * all pre-existing SRCU read-side critical section. On systems with | ||
| 347 | * more than one CPU, this means that when "func()" is invoked, each CPU | ||
| 348 | * is guaranteed to have executed a full memory barrier since the end of | ||
| 349 | * its last corresponding SRCU read-side critical section whose beginning | ||
| 350 | * preceded the call to call_rcu(). It also means that each CPU executing | ||
| 351 | * an SRCU read-side critical section that continues beyond the start of | ||
| 352 | * "func()" must have executed a memory barrier after the call_rcu() | ||
| 353 | * but before the beginning of that SRCU read-side critical section. | ||
| 354 | * Note that these guarantees include CPUs that are offline, idle, or | ||
| 355 | * executing in user mode, as well as CPUs that are executing in the kernel. | ||
| 356 | * | ||
| 357 | * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the | ||
| 358 | * resulting SRCU callback function "func()", then both CPU A and CPU | ||
| 359 | * B are guaranteed to execute a full memory barrier during the time | ||
| 360 | * interval between the call to call_rcu() and the invocation of "func()". | ||
| 361 | * This guarantee applies even if CPU A and CPU B are the same CPU (but | ||
| 362 | * again only if the system has more than one CPU). | ||
| 363 | * | ||
| 364 | * Of course, these guarantees apply only for invocations of call_srcu(), | ||
| 365 | * srcu_read_lock(), and srcu_read_unlock() that are all passed the same | ||
| 366 | * srcu_struct structure. | ||
| 367 | */ | ||
| 368 | void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | ||
| 369 | rcu_callback_t func) | ||
| 370 | { | ||
| 371 | unsigned long flags; | ||
| 372 | |||
| 373 | head->next = NULL; | ||
| 374 | head->func = func; | ||
| 375 | spin_lock_irqsave(&sp->queue_lock, flags); | ||
| 376 | smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ | ||
| 377 | rcu_batch_queue(&sp->batch_queue, head); | ||
| 378 | if (!sp->running) { | ||
| 379 | sp->running = true; | ||
| 380 | queue_delayed_work(system_power_efficient_wq, &sp->work, 0); | ||
| 381 | } | ||
| 382 | spin_unlock_irqrestore(&sp->queue_lock, flags); | ||
| 383 | } | ||
| 384 | EXPORT_SYMBOL_GPL(call_srcu); | ||
| 385 | |||
| 386 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount); | ||
| 387 | static void srcu_reschedule(struct srcu_struct *sp); | ||
| 388 | |||
| 389 | /* | ||
| 390 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | ||
| 391 | */ | ||
| 392 | static void __synchronize_srcu(struct srcu_struct *sp, int trycount) | ||
| 393 | { | ||
| 394 | struct rcu_synchronize rcu; | ||
| 395 | struct rcu_head *head = &rcu.head; | ||
| 396 | bool done = false; | ||
| 397 | |||
| 398 | RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || | ||
| 399 | lock_is_held(&rcu_bh_lock_map) || | ||
| 400 | lock_is_held(&rcu_lock_map) || | ||
| 401 | lock_is_held(&rcu_sched_lock_map), | ||
| 402 | "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); | ||
| 403 | |||
| 404 | might_sleep(); | ||
| 405 | init_completion(&rcu.completion); | ||
| 406 | |||
| 407 | head->next = NULL; | ||
| 408 | head->func = wakeme_after_rcu; | ||
| 409 | spin_lock_irq(&sp->queue_lock); | ||
| 410 | smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ | ||
| 411 | if (!sp->running) { | ||
| 412 | /* steal the processing owner */ | ||
| 413 | sp->running = true; | ||
| 414 | rcu_batch_queue(&sp->batch_check0, head); | ||
| 415 | spin_unlock_irq(&sp->queue_lock); | ||
| 416 | |||
| 417 | srcu_advance_batches(sp, trycount); | ||
| 418 | if (!rcu_batch_empty(&sp->batch_done)) { | ||
| 419 | BUG_ON(sp->batch_done.head != head); | ||
| 420 | rcu_batch_dequeue(&sp->batch_done); | ||
| 421 | done = true; | ||
| 422 | } | ||
| 423 | /* give the processing owner to work_struct */ | ||
| 424 | srcu_reschedule(sp); | ||
| 425 | } else { | ||
| 426 | rcu_batch_queue(&sp->batch_queue, head); | ||
| 427 | spin_unlock_irq(&sp->queue_lock); | ||
| 428 | } | ||
| 429 | |||
| 430 | if (!done) { | ||
| 431 | wait_for_completion(&rcu.completion); | ||
| 432 | smp_mb(); /* Caller's later accesses after GP. */ | ||
| 433 | } | ||
| 434 | |||
| 435 | } | ||
| 436 | |||
| 437 | /** | ||
| 438 | * synchronize_srcu - wait for prior SRCU read-side critical-section completion | ||
| 439 | * @sp: srcu_struct with which to synchronize. | ||
| 440 | * | ||
| 441 | * Wait for the count to drain to zero of both indexes. To avoid the | ||
| 442 | * possible starvation of synchronize_srcu(), it waits for the count of | ||
| 443 | * the index=((->completed & 1) ^ 1) to drain to zero at first, | ||
| 444 | * and then flip the completed and wait for the count of the other index. | ||
| 445 | * | ||
| 446 | * Can block; must be called from process context. | ||
| 447 | * | ||
| 448 | * Note that it is illegal to call synchronize_srcu() from the corresponding | ||
| 449 | * SRCU read-side critical section; doing so will result in deadlock. | ||
| 450 | * However, it is perfectly legal to call synchronize_srcu() on one | ||
| 451 | * srcu_struct from some other srcu_struct's read-side critical section, | ||
| 452 | * as long as the resulting graph of srcu_structs is acyclic. | ||
| 453 | * | ||
| 454 | * There are memory-ordering constraints implied by synchronize_srcu(). | ||
| 455 | * On systems with more than one CPU, when synchronize_srcu() returns, | ||
| 456 | * each CPU is guaranteed to have executed a full memory barrier since | ||
| 457 | * the end of its last corresponding SRCU-sched read-side critical section | ||
| 458 | * whose beginning preceded the call to synchronize_srcu(). In addition, | ||
| 459 | * each CPU having an SRCU read-side critical section that extends beyond | ||
| 460 | * the return from synchronize_srcu() is guaranteed to have executed a | ||
| 461 | * full memory barrier after the beginning of synchronize_srcu() and before | ||
| 462 | * the beginning of that SRCU read-side critical section. Note that these | ||
| 463 | * guarantees include CPUs that are offline, idle, or executing in user mode, | ||
| 464 | * as well as CPUs that are executing in the kernel. | ||
| 465 | * | ||
| 466 | * Furthermore, if CPU A invoked synchronize_srcu(), which returned | ||
| 467 | * to its caller on CPU B, then both CPU A and CPU B are guaranteed | ||
| 468 | * to have executed a full memory barrier during the execution of | ||
| 469 | * synchronize_srcu(). This guarantee applies even if CPU A and CPU B | ||
| 470 | * are the same CPU, but again only if the system has more than one CPU. | ||
| 471 | * | ||
| 472 | * Of course, these memory-ordering guarantees apply only when | ||
| 473 | * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are | ||
| 474 | * passed the same srcu_struct structure. | ||
| 475 | */ | ||
| 476 | void synchronize_srcu(struct srcu_struct *sp) | ||
| 477 | { | ||
| 478 | __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) | ||
| 479 | ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT | ||
| 480 | : SYNCHRONIZE_SRCU_TRYCOUNT); | ||
| 481 | } | ||
| 482 | EXPORT_SYMBOL_GPL(synchronize_srcu); | ||
| 483 | |||
| 484 | /** | ||
| 485 | * synchronize_srcu_expedited - Brute-force SRCU grace period | ||
| 486 | * @sp: srcu_struct with which to synchronize. | ||
| 487 | * | ||
| 488 | * Wait for an SRCU grace period to elapse, but be more aggressive about | ||
| 489 | * spinning rather than blocking when waiting. | ||
| 490 | * | ||
| 491 | * Note that synchronize_srcu_expedited() has the same deadlock and | ||
| 492 | * memory-ordering properties as does synchronize_srcu(). | ||
| 493 | */ | ||
| 494 | void synchronize_srcu_expedited(struct srcu_struct *sp) | ||
| 495 | { | ||
| 496 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); | ||
| 497 | } | ||
| 498 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); | ||
| 499 | |||
| 500 | /** | ||
| 501 | * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. | ||
| 502 | * @sp: srcu_struct on which to wait for in-flight callbacks. | ||
| 503 | */ | ||
| 504 | void srcu_barrier(struct srcu_struct *sp) | ||
| 505 | { | ||
| 506 | synchronize_srcu(sp); | ||
| 507 | } | ||
| 508 | EXPORT_SYMBOL_GPL(srcu_barrier); | ||
| 509 | |||
| 510 | /** | ||
| 511 | * srcu_batches_completed - return batches completed. | ||
| 512 | * @sp: srcu_struct on which to report batch completion. | ||
| 513 | * | ||
| 514 | * Report the number of batches, correlated with, but not necessarily | ||
| 515 | * precisely the same as, the number of grace periods that have elapsed. | ||
| 516 | */ | ||
| 517 | unsigned long srcu_batches_completed(struct srcu_struct *sp) | ||
| 518 | { | ||
| 519 | return sp->completed; | ||
| 520 | } | ||
| 521 | EXPORT_SYMBOL_GPL(srcu_batches_completed); | ||
| 522 | |||
| 523 | #define SRCU_CALLBACK_BATCH 10 | ||
| 524 | #define SRCU_INTERVAL 1 | ||
| 525 | |||
| 526 | /* | ||
| 527 | * Move any new SRCU callbacks to the first stage of the SRCU grace | ||
| 528 | * period pipeline. | ||
| 529 | */ | ||
| 530 | static void srcu_collect_new(struct srcu_struct *sp) | ||
| 531 | { | ||
| 532 | if (!rcu_batch_empty(&sp->batch_queue)) { | ||
| 533 | spin_lock_irq(&sp->queue_lock); | ||
| 534 | rcu_batch_move(&sp->batch_check0, &sp->batch_queue); | ||
| 535 | spin_unlock_irq(&sp->queue_lock); | ||
| 536 | } | ||
| 537 | } | ||
| 538 | |||
| 539 | /* | ||
| 540 | * Core SRCU state machine. Advance callbacks from ->batch_check0 to | ||
| 541 | * ->batch_check1 and then to ->batch_done as readers drain. | ||
| 542 | */ | ||
| 543 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount) | ||
| 544 | { | ||
| 545 | int idx = 1 ^ (sp->completed & 1); | ||
| 546 | |||
| 547 | /* | ||
| 548 | * Because readers might be delayed for an extended period after | ||
| 549 | * fetching ->completed for their index, at any point in time there | ||
| 550 | * might well be readers using both idx=0 and idx=1. We therefore | ||
| 551 | * need to wait for readers to clear from both index values before | ||
| 552 | * invoking a callback. | ||
| 553 | */ | ||
| 554 | |||
| 555 | if (rcu_batch_empty(&sp->batch_check0) && | ||
| 556 | rcu_batch_empty(&sp->batch_check1)) | ||
| 557 | return; /* no callbacks need to be advanced */ | ||
| 558 | |||
| 559 | if (!try_check_zero(sp, idx, trycount)) | ||
| 560 | return; /* failed to advance, will try after SRCU_INTERVAL */ | ||
| 561 | |||
| 562 | /* | ||
| 563 | * The callbacks in ->batch_check1 have already done with their | ||
| 564 | * first zero check and flip back when they were enqueued on | ||
| 565 | * ->batch_check0 in a previous invocation of srcu_advance_batches(). | ||
| 566 | * (Presumably try_check_zero() returned false during that | ||
| 567 | * invocation, leaving the callbacks stranded on ->batch_check1.) | ||
| 568 | * They are therefore ready to invoke, so move them to ->batch_done. | ||
| 569 | */ | ||
| 570 | rcu_batch_move(&sp->batch_done, &sp->batch_check1); | ||
| 571 | |||
| 572 | if (rcu_batch_empty(&sp->batch_check0)) | ||
| 573 | return; /* no callbacks need to be advanced */ | ||
| 574 | srcu_flip(sp); | ||
| 575 | |||
| 576 | /* | ||
| 577 | * The callbacks in ->batch_check0 just finished their | ||
| 578 | * first check zero and flip, so move them to ->batch_check1 | ||
| 579 | * for future checking on the other idx. | ||
| 580 | */ | ||
| 581 | rcu_batch_move(&sp->batch_check1, &sp->batch_check0); | ||
| 582 | |||
| 583 | /* | ||
| 584 | * SRCU read-side critical sections are normally short, so check | ||
| 585 | * at least twice in quick succession after a flip. | ||
| 586 | */ | ||
| 587 | trycount = trycount < 2 ? 2 : trycount; | ||
| 588 | if (!try_check_zero(sp, idx^1, trycount)) | ||
| 589 | return; /* failed to advance, will try after SRCU_INTERVAL */ | ||
| 590 | |||
| 591 | /* | ||
| 592 | * The callbacks in ->batch_check1 have now waited for all | ||
| 593 | * pre-existing readers using both idx values. They are therefore | ||
| 594 | * ready to invoke, so move them to ->batch_done. | ||
| 595 | */ | ||
| 596 | rcu_batch_move(&sp->batch_done, &sp->batch_check1); | ||
| 597 | } | ||
| 598 | |||
| 599 | /* | ||
| 600 | * Invoke a limited number of SRCU callbacks that have passed through | ||
| 601 | * their grace period. If there are more to do, SRCU will reschedule | ||
| 602 | * the workqueue. Note that needed memory barriers have been executed | ||
| 603 | * in this task's context by srcu_readers_active_idx_check(). | ||
| 604 | */ | ||
| 605 | static void srcu_invoke_callbacks(struct srcu_struct *sp) | ||
| 606 | { | ||
| 607 | int i; | ||
| 608 | struct rcu_head *head; | ||
| 609 | |||
| 610 | for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { | ||
| 611 | head = rcu_batch_dequeue(&sp->batch_done); | ||
| 612 | if (!head) | ||
| 613 | break; | ||
| 614 | local_bh_disable(); | ||
| 615 | head->func(head); | ||
| 616 | local_bh_enable(); | ||
| 617 | } | ||
| 618 | } | ||
| 619 | |||
| 620 | /* | ||
| 621 | * Finished one round of SRCU grace period. Start another if there are | ||
| 622 | * more SRCU callbacks queued, otherwise put SRCU into not-running state. | ||
| 623 | */ | ||
| 624 | static void srcu_reschedule(struct srcu_struct *sp) | ||
| 625 | { | ||
| 626 | bool pending = true; | ||
| 627 | |||
| 628 | if (rcu_batch_empty(&sp->batch_done) && | ||
| 629 | rcu_batch_empty(&sp->batch_check1) && | ||
| 630 | rcu_batch_empty(&sp->batch_check0) && | ||
| 631 | rcu_batch_empty(&sp->batch_queue)) { | ||
| 632 | spin_lock_irq(&sp->queue_lock); | ||
| 633 | if (rcu_batch_empty(&sp->batch_done) && | ||
| 634 | rcu_batch_empty(&sp->batch_check1) && | ||
| 635 | rcu_batch_empty(&sp->batch_check0) && | ||
| 636 | rcu_batch_empty(&sp->batch_queue)) { | ||
| 637 | sp->running = false; | ||
| 638 | pending = false; | ||
| 639 | } | ||
| 640 | spin_unlock_irq(&sp->queue_lock); | ||
| 641 | } | ||
| 642 | |||
| 643 | if (pending) | ||
| 644 | queue_delayed_work(system_power_efficient_wq, | ||
| 645 | &sp->work, SRCU_INTERVAL); | ||
| 646 | } | ||
| 647 | |||
| 648 | /* | ||
| 649 | * This is the work-queue function that handles SRCU grace periods. | ||
| 650 | */ | ||
| 651 | void process_srcu(struct work_struct *work) | ||
| 652 | { | ||
| 653 | struct srcu_struct *sp; | ||
| 654 | |||
| 655 | sp = container_of(work, struct srcu_struct, work.work); | ||
| 656 | |||
| 657 | srcu_collect_new(sp); | ||
| 658 | srcu_advance_batches(sp, 1); | ||
| 659 | srcu_invoke_callbacks(sp); | ||
| 660 | srcu_reschedule(sp); | ||
| 661 | } | ||
| 662 | EXPORT_SYMBOL_GPL(process_srcu); | ||
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 36e1f82faed1..1a1c1047d2ed 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c | |||
| @@ -38,8 +38,8 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) | |||
| 38 | sp->srcu_lock_nesting[0] = 0; | 38 | sp->srcu_lock_nesting[0] = 0; |
| 39 | sp->srcu_lock_nesting[1] = 0; | 39 | sp->srcu_lock_nesting[1] = 0; |
| 40 | init_swait_queue_head(&sp->srcu_wq); | 40 | init_swait_queue_head(&sp->srcu_wq); |
| 41 | sp->srcu_gp_seq = 0; | 41 | sp->srcu_cb_head = NULL; |
| 42 | rcu_segcblist_init(&sp->srcu_cblist); | 42 | sp->srcu_cb_tail = &sp->srcu_cb_head; |
| 43 | sp->srcu_gp_running = false; | 43 | sp->srcu_gp_running = false; |
| 44 | sp->srcu_gp_waiting = false; | 44 | sp->srcu_gp_waiting = false; |
| 45 | sp->srcu_idx = 0; | 45 | sp->srcu_idx = 0; |
| @@ -88,31 +88,16 @@ void cleanup_srcu_struct(struct srcu_struct *sp) | |||
| 88 | { | 88 | { |
| 89 | WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); | 89 | WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); |
| 90 | flush_work(&sp->srcu_work); | 90 | flush_work(&sp->srcu_work); |
| 91 | WARN_ON(rcu_seq_state(sp->srcu_gp_seq)); | ||
| 92 | WARN_ON(sp->srcu_gp_running); | 91 | WARN_ON(sp->srcu_gp_running); |
| 93 | WARN_ON(sp->srcu_gp_waiting); | 92 | WARN_ON(sp->srcu_gp_waiting); |
| 94 | WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist)); | 93 | WARN_ON(sp->srcu_cb_head); |
| 94 | WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail); | ||
| 95 | } | 95 | } |
| 96 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); | 96 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); |
| 97 | 97 | ||
| 98 | /* | 98 | /* |
| 99 | * Counts the new reader in the appropriate per-CPU element of the | ||
| 100 | * srcu_struct. Must be called from process context. | ||
| 101 | * Returns an index that must be passed to the matching srcu_read_unlock(). | ||
| 102 | */ | ||
| 103 | int __srcu_read_lock(struct srcu_struct *sp) | ||
| 104 | { | ||
| 105 | int idx; | ||
| 106 | |||
| 107 | idx = READ_ONCE(sp->srcu_idx); | ||
| 108 | WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1); | ||
| 109 | return idx; | ||
| 110 | } | ||
| 111 | EXPORT_SYMBOL_GPL(__srcu_read_lock); | ||
| 112 | |||
| 113 | /* | ||
| 114 | * Removes the count for the old reader from the appropriate element of | 99 | * Removes the count for the old reader from the appropriate element of |
| 115 | * the srcu_struct. Must be called from process context. | 100 | * the srcu_struct. |
| 116 | */ | 101 | */ |
| 117 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) | 102 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) |
| 118 | { | 103 | { |
| @@ -132,52 +117,44 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); | |||
| 132 | void srcu_drive_gp(struct work_struct *wp) | 117 | void srcu_drive_gp(struct work_struct *wp) |
| 133 | { | 118 | { |
| 134 | int idx; | 119 | int idx; |
| 135 | struct rcu_cblist ready_cbs; | 120 | struct rcu_head *lh; |
| 136 | struct srcu_struct *sp; | ||
| 137 | struct rcu_head *rhp; | 121 | struct rcu_head *rhp; |
| 122 | struct srcu_struct *sp; | ||
| 138 | 123 | ||
| 139 | sp = container_of(wp, struct srcu_struct, srcu_work); | 124 | sp = container_of(wp, struct srcu_struct, srcu_work); |
| 140 | if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist)) | 125 | if (sp->srcu_gp_running || !READ_ONCE(sp->srcu_cb_head)) |
| 141 | return; /* Already running or nothing to do. */ | 126 | return; /* Already running or nothing to do. */ |
| 142 | 127 | ||
| 143 | /* Tag recently arrived callbacks and wait for readers. */ | 128 | /* Remove recently arrived callbacks and wait for readers. */ |
| 144 | WRITE_ONCE(sp->srcu_gp_running, true); | 129 | WRITE_ONCE(sp->srcu_gp_running, true); |
| 145 | rcu_segcblist_accelerate(&sp->srcu_cblist, | 130 | local_irq_disable(); |
| 146 | rcu_seq_snap(&sp->srcu_gp_seq)); | 131 | lh = sp->srcu_cb_head; |
| 147 | rcu_seq_start(&sp->srcu_gp_seq); | 132 | sp->srcu_cb_head = NULL; |
| 133 | sp->srcu_cb_tail = &sp->srcu_cb_head; | ||
| 134 | local_irq_enable(); | ||
| 148 | idx = sp->srcu_idx; | 135 | idx = sp->srcu_idx; |
| 149 | WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); | 136 | WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); |
| 150 | WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ | 137 | WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ |
| 151 | swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); | 138 | swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); |
| 152 | WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ | 139 | WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ |
| 153 | rcu_seq_end(&sp->srcu_gp_seq); | 140 | |
| 154 | 141 | /* Invoke the callbacks we removed above. */ | |
| 155 | /* Update callback list based on GP, and invoke ready callbacks. */ | 142 | while (lh) { |
| 156 | rcu_segcblist_advance(&sp->srcu_cblist, | 143 | rhp = lh; |
| 157 | rcu_seq_current(&sp->srcu_gp_seq)); | 144 | lh = lh->next; |
| 158 | if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { | 145 | local_bh_disable(); |
| 159 | rcu_cblist_init(&ready_cbs); | 146 | rhp->func(rhp); |
| 160 | local_irq_disable(); | 147 | local_bh_enable(); |
| 161 | rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); | ||
| 162 | local_irq_enable(); | ||
| 163 | rhp = rcu_cblist_dequeue(&ready_cbs); | ||
| 164 | for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { | ||
| 165 | local_bh_disable(); | ||
| 166 | rhp->func(rhp); | ||
| 167 | local_bh_enable(); | ||
| 168 | } | ||
| 169 | local_irq_disable(); | ||
| 170 | rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); | ||
| 171 | local_irq_enable(); | ||
| 172 | } | 148 | } |
| 173 | WRITE_ONCE(sp->srcu_gp_running, false); | ||
| 174 | 149 | ||
| 175 | /* | 150 | /* |
| 176 | * If more callbacks, reschedule ourselves. This can race with | 151 | * Enable rescheduling, and if there are more callbacks, |
| 177 | * a call_srcu() at interrupt level, but the ->srcu_gp_running | 152 | * reschedule ourselves. This can race with a call_srcu() |
| 178 | * checks will straighten that out. | 153 | * at interrupt level, but the ->srcu_gp_running checks will |
| 154 | * straighten that out. | ||
| 179 | */ | 155 | */ |
| 180 | if (!rcu_segcblist_empty(&sp->srcu_cblist)) | 156 | WRITE_ONCE(sp->srcu_gp_running, false); |
| 157 | if (READ_ONCE(sp->srcu_cb_head)) | ||
| 181 | schedule_work(&sp->srcu_work); | 158 | schedule_work(&sp->srcu_work); |
| 182 | } | 159 | } |
| 183 | EXPORT_SYMBOL_GPL(srcu_drive_gp); | 160 | EXPORT_SYMBOL_GPL(srcu_drive_gp); |
| @@ -186,14 +163,16 @@ EXPORT_SYMBOL_GPL(srcu_drive_gp); | |||
| 186 | * Enqueue an SRCU callback on the specified srcu_struct structure, | 163 | * Enqueue an SRCU callback on the specified srcu_struct structure, |
| 187 | * initiating grace-period processing if it is not already running. | 164 | * initiating grace-period processing if it is not already running. |
| 188 | */ | 165 | */ |
| 189 | void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | 166 | void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, |
| 190 | rcu_callback_t func) | 167 | rcu_callback_t func) |
| 191 | { | 168 | { |
| 192 | unsigned long flags; | 169 | unsigned long flags; |
| 193 | 170 | ||
| 194 | head->func = func; | 171 | rhp->func = func; |
| 172 | rhp->next = NULL; | ||
| 195 | local_irq_save(flags); | 173 | local_irq_save(flags); |
| 196 | rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); | 174 | *sp->srcu_cb_tail = rhp; |
| 175 | sp->srcu_cb_tail = &rhp->next; | ||
| 197 | local_irq_restore(flags); | 176 | local_irq_restore(flags); |
| 198 | if (!READ_ONCE(sp->srcu_gp_running)) | 177 | if (!READ_ONCE(sp->srcu_gp_running)) |
| 199 | schedule_work(&sp->srcu_work); | 178 | schedule_work(&sp->srcu_work); |
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 3ae8474557df..d0ca524bf042 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c | |||
| @@ -40,9 +40,15 @@ | |||
| 40 | #include "rcu.h" | 40 | #include "rcu.h" |
| 41 | #include "rcu_segcblist.h" | 41 | #include "rcu_segcblist.h" |
| 42 | 42 | ||
| 43 | ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */ | 43 | /* Holdoff in nanoseconds for auto-expediting. */ |
| 44 | #define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) | ||
| 45 | static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; | ||
| 44 | module_param(exp_holdoff, ulong, 0444); | 46 | module_param(exp_holdoff, ulong, 0444); |
| 45 | 47 | ||
| 48 | /* Overflow-check frequency. N bits roughly says every 2**N grace periods. */ | ||
| 49 | static ulong counter_wrap_check = (ULONG_MAX >> 2); | ||
| 50 | module_param(counter_wrap_check, ulong, 0444); | ||
| 51 | |||
| 46 | static void srcu_invoke_callbacks(struct work_struct *work); | 52 | static void srcu_invoke_callbacks(struct work_struct *work); |
| 47 | static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); | 53 | static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); |
| 48 | 54 | ||
| @@ -70,7 +76,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) | |||
| 70 | 76 | ||
| 71 | /* Each pass through this loop initializes one srcu_node structure. */ | 77 | /* Each pass through this loop initializes one srcu_node structure. */ |
| 72 | rcu_for_each_node_breadth_first(sp, snp) { | 78 | rcu_for_each_node_breadth_first(sp, snp) { |
| 73 | spin_lock_init(&snp->lock); | 79 | raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock)); |
| 74 | WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != | 80 | WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != |
| 75 | ARRAY_SIZE(snp->srcu_data_have_cbs)); | 81 | ARRAY_SIZE(snp->srcu_data_have_cbs)); |
| 76 | for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { | 82 | for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { |
| @@ -104,7 +110,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) | |||
| 104 | snp_first = sp->level[level]; | 110 | snp_first = sp->level[level]; |
| 105 | for_each_possible_cpu(cpu) { | 111 | for_each_possible_cpu(cpu) { |
| 106 | sdp = per_cpu_ptr(sp->sda, cpu); | 112 | sdp = per_cpu_ptr(sp->sda, cpu); |
| 107 | spin_lock_init(&sdp->lock); | 113 | raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); |
| 108 | rcu_segcblist_init(&sdp->srcu_cblist); | 114 | rcu_segcblist_init(&sdp->srcu_cblist); |
| 109 | sdp->srcu_cblist_invoking = false; | 115 | sdp->srcu_cblist_invoking = false; |
| 110 | sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; | 116 | sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; |
| @@ -163,7 +169,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name, | |||
| 163 | /* Don't re-initialize a lock while it is held. */ | 169 | /* Don't re-initialize a lock while it is held. */ |
| 164 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); | 170 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); |
| 165 | lockdep_init_map(&sp->dep_map, name, key, 0); | 171 | lockdep_init_map(&sp->dep_map, name, key, 0); |
| 166 | spin_lock_init(&sp->gp_lock); | 172 | raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); |
| 167 | return init_srcu_struct_fields(sp, false); | 173 | return init_srcu_struct_fields(sp, false); |
| 168 | } | 174 | } |
| 169 | EXPORT_SYMBOL_GPL(__init_srcu_struct); | 175 | EXPORT_SYMBOL_GPL(__init_srcu_struct); |
| @@ -180,7 +186,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); | |||
| 180 | */ | 186 | */ |
| 181 | int init_srcu_struct(struct srcu_struct *sp) | 187 | int init_srcu_struct(struct srcu_struct *sp) |
| 182 | { | 188 | { |
| 183 | spin_lock_init(&sp->gp_lock); | 189 | raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); |
| 184 | return init_srcu_struct_fields(sp, false); | 190 | return init_srcu_struct_fields(sp, false); |
| 185 | } | 191 | } |
| 186 | EXPORT_SYMBOL_GPL(init_srcu_struct); | 192 | EXPORT_SYMBOL_GPL(init_srcu_struct); |
| @@ -191,7 +197,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); | |||
| 191 | * First-use initialization of statically allocated srcu_struct | 197 | * First-use initialization of statically allocated srcu_struct |
| 192 | * structure. Wiring up the combining tree is more than can be | 198 | * structure. Wiring up the combining tree is more than can be |
| 193 | * done with compile-time initialization, so this check is added | 199 | * done with compile-time initialization, so this check is added |
| 194 | * to each update-side SRCU primitive. Use ->gp_lock, which -is- | 200 | * to each update-side SRCU primitive. Use sp->lock, which -is- |
| 195 | * compile-time initialized, to resolve races involving multiple | 201 | * compile-time initialized, to resolve races involving multiple |
| 196 | * CPUs trying to garner first-use privileges. | 202 | * CPUs trying to garner first-use privileges. |
| 197 | */ | 203 | */ |
| @@ -203,13 +209,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp) | |||
| 203 | /* The smp_load_acquire() pairs with the smp_store_release(). */ | 209 | /* The smp_load_acquire() pairs with the smp_store_release(). */ |
| 204 | if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ | 210 | if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ |
| 205 | return; /* Already initialized. */ | 211 | return; /* Already initialized. */ |
| 206 | spin_lock_irqsave(&sp->gp_lock, flags); | 212 | raw_spin_lock_irqsave_rcu_node(sp, flags); |
| 207 | if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { | 213 | if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { |
| 208 | spin_unlock_irqrestore(&sp->gp_lock, flags); | 214 | raw_spin_unlock_irqrestore_rcu_node(sp, flags); |
| 209 | return; | 215 | return; |
| 210 | } | 216 | } |
| 211 | init_srcu_struct_fields(sp, true); | 217 | init_srcu_struct_fields(sp, true); |
| 212 | spin_unlock_irqrestore(&sp->gp_lock, flags); | 218 | raw_spin_unlock_irqrestore_rcu_node(sp, flags); |
| 213 | } | 219 | } |
| 214 | 220 | ||
| 215 | /* | 221 | /* |
| @@ -275,15 +281,20 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) | |||
| 275 | * not mean that there are no more readers, as one could have read | 281 | * not mean that there are no more readers, as one could have read |
| 276 | * the current index but not have incremented the lock counter yet. | 282 | * the current index but not have incremented the lock counter yet. |
| 277 | * | 283 | * |
| 278 | * Possible bug: There is no guarantee that there haven't been | 284 | * So suppose that the updater is preempted here for so long |
| 279 | * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were | 285 | * that more than ULONG_MAX non-nested readers come and go in |
| 280 | * counted, meaning that this could return true even if there are | 286 | * the meantime. It turns out that this cannot result in overflow |
| 281 | * still active readers. Since there are no memory barriers around | 287 | * because if a reader modifies its unlock count after we read it |
| 282 | * srcu_flip(), the CPU is not required to increment ->srcu_idx | 288 | * above, then that reader's next load of ->srcu_idx is guaranteed |
| 283 | * before running srcu_readers_unlock_idx(), which means that there | 289 | * to get the new value, which will cause it to operate on the |
| 284 | * could be an arbitrarily large number of critical sections that | 290 | * other bank of counters, where it cannot contribute to the |
| 285 | * execute after srcu_readers_unlock_idx() but use the old value | 291 | * overflow of these counters. This means that there is a maximum |
| 286 | * of ->srcu_idx. | 292 | * of 2*NR_CPUS increments, which cannot overflow given current |
| 293 | * systems, especially not on 64-bit systems. | ||
| 294 | * | ||
| 295 | * OK, how about nesting? This does impose a limit on nesting | ||
| 296 | * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient, | ||
| 297 | * especially on 64-bit systems. | ||
| 287 | */ | 298 | */ |
| 288 | return srcu_readers_lock_idx(sp, idx) == unlocks; | 299 | return srcu_readers_lock_idx(sp, idx) == unlocks; |
| 289 | } | 300 | } |
| @@ -357,7 +368,7 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct); | |||
| 357 | 368 | ||
| 358 | /* | 369 | /* |
| 359 | * Counts the new reader in the appropriate per-CPU element of the | 370 | * Counts the new reader in the appropriate per-CPU element of the |
| 360 | * srcu_struct. Must be called from process context. | 371 | * srcu_struct. |
| 361 | * Returns an index that must be passed to the matching srcu_read_unlock(). | 372 | * Returns an index that must be passed to the matching srcu_read_unlock(). |
| 362 | */ | 373 | */ |
| 363 | int __srcu_read_lock(struct srcu_struct *sp) | 374 | int __srcu_read_lock(struct srcu_struct *sp) |
| @@ -365,7 +376,7 @@ int __srcu_read_lock(struct srcu_struct *sp) | |||
| 365 | int idx; | 376 | int idx; |
| 366 | 377 | ||
| 367 | idx = READ_ONCE(sp->srcu_idx) & 0x1; | 378 | idx = READ_ONCE(sp->srcu_idx) & 0x1; |
| 368 | __this_cpu_inc(sp->sda->srcu_lock_count[idx]); | 379 | this_cpu_inc(sp->sda->srcu_lock_count[idx]); |
| 369 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ | 380 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ |
| 370 | return idx; | 381 | return idx; |
| 371 | } | 382 | } |
| @@ -375,7 +386,6 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); | |||
| 375 | * Removes the count for the old reader from the appropriate per-CPU | 386 | * Removes the count for the old reader from the appropriate per-CPU |
| 376 | * element of the srcu_struct. Note that this may well be a different | 387 | * element of the srcu_struct. Note that this may well be a different |
| 377 | * CPU than that which was incremented by the corresponding srcu_read_lock(). | 388 | * CPU than that which was incremented by the corresponding srcu_read_lock(). |
| 378 | * Must be called from process context. | ||
| 379 | */ | 389 | */ |
| 380 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) | 390 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) |
| 381 | { | 391 | { |
| @@ -401,8 +411,7 @@ static void srcu_gp_start(struct srcu_struct *sp) | |||
| 401 | struct srcu_data *sdp = this_cpu_ptr(sp->sda); | 411 | struct srcu_data *sdp = this_cpu_ptr(sp->sda); |
| 402 | int state; | 412 | int state; |
| 403 | 413 | ||
| 404 | RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock), | 414 | lockdep_assert_held(&sp->lock); |
| 405 | "Invoked srcu_gp_start() without ->gp_lock!"); | ||
| 406 | WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); | 415 | WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); |
| 407 | rcu_segcblist_advance(&sdp->srcu_cblist, | 416 | rcu_segcblist_advance(&sdp->srcu_cblist, |
| 408 | rcu_seq_current(&sp->srcu_gp_seq)); | 417 | rcu_seq_current(&sp->srcu_gp_seq)); |
| @@ -490,17 +499,20 @@ static void srcu_gp_end(struct srcu_struct *sp) | |||
| 490 | { | 499 | { |
| 491 | unsigned long cbdelay; | 500 | unsigned long cbdelay; |
| 492 | bool cbs; | 501 | bool cbs; |
| 502 | int cpu; | ||
| 503 | unsigned long flags; | ||
| 493 | unsigned long gpseq; | 504 | unsigned long gpseq; |
| 494 | int idx; | 505 | int idx; |
| 495 | int idxnext; | 506 | int idxnext; |
| 496 | unsigned long mask; | 507 | unsigned long mask; |
| 508 | struct srcu_data *sdp; | ||
| 497 | struct srcu_node *snp; | 509 | struct srcu_node *snp; |
| 498 | 510 | ||
| 499 | /* Prevent more than one additional grace period. */ | 511 | /* Prevent more than one additional grace period. */ |
| 500 | mutex_lock(&sp->srcu_cb_mutex); | 512 | mutex_lock(&sp->srcu_cb_mutex); |
| 501 | 513 | ||
| 502 | /* End the current grace period. */ | 514 | /* End the current grace period. */ |
| 503 | spin_lock_irq(&sp->gp_lock); | 515 | raw_spin_lock_irq_rcu_node(sp); |
| 504 | idx = rcu_seq_state(sp->srcu_gp_seq); | 516 | idx = rcu_seq_state(sp->srcu_gp_seq); |
| 505 | WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); | 517 | WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); |
| 506 | cbdelay = srcu_get_delay(sp); | 518 | cbdelay = srcu_get_delay(sp); |
| @@ -509,7 +521,7 @@ static void srcu_gp_end(struct srcu_struct *sp) | |||
| 509 | gpseq = rcu_seq_current(&sp->srcu_gp_seq); | 521 | gpseq = rcu_seq_current(&sp->srcu_gp_seq); |
| 510 | if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) | 522 | if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) |
| 511 | sp->srcu_gp_seq_needed_exp = gpseq; | 523 | sp->srcu_gp_seq_needed_exp = gpseq; |
| 512 | spin_unlock_irq(&sp->gp_lock); | 524 | raw_spin_unlock_irq_rcu_node(sp); |
| 513 | mutex_unlock(&sp->srcu_gp_mutex); | 525 | mutex_unlock(&sp->srcu_gp_mutex); |
| 514 | /* A new grace period can start at this point. But only one. */ | 526 | /* A new grace period can start at this point. But only one. */ |
| 515 | 527 | ||
| @@ -517,7 +529,7 @@ static void srcu_gp_end(struct srcu_struct *sp) | |||
| 517 | idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); | 529 | idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); |
| 518 | idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); | 530 | idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); |
| 519 | rcu_for_each_node_breadth_first(sp, snp) { | 531 | rcu_for_each_node_breadth_first(sp, snp) { |
| 520 | spin_lock_irq(&snp->lock); | 532 | raw_spin_lock_irq_rcu_node(snp); |
| 521 | cbs = false; | 533 | cbs = false; |
| 522 | if (snp >= sp->level[rcu_num_lvls - 1]) | 534 | if (snp >= sp->level[rcu_num_lvls - 1]) |
| 523 | cbs = snp->srcu_have_cbs[idx] == gpseq; | 535 | cbs = snp->srcu_have_cbs[idx] == gpseq; |
| @@ -527,28 +539,37 @@ static void srcu_gp_end(struct srcu_struct *sp) | |||
| 527 | snp->srcu_gp_seq_needed_exp = gpseq; | 539 | snp->srcu_gp_seq_needed_exp = gpseq; |
| 528 | mask = snp->srcu_data_have_cbs[idx]; | 540 | mask = snp->srcu_data_have_cbs[idx]; |
| 529 | snp->srcu_data_have_cbs[idx] = 0; | 541 | snp->srcu_data_have_cbs[idx] = 0; |
| 530 | spin_unlock_irq(&snp->lock); | 542 | raw_spin_unlock_irq_rcu_node(snp); |
| 531 | if (cbs) { | 543 | if (cbs) |
| 532 | smp_mb(); /* GP end before CB invocation. */ | ||
| 533 | srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); | 544 | srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); |
| 534 | } | 545 | |
| 546 | /* Occasionally prevent srcu_data counter wrap. */ | ||
| 547 | if (!(gpseq & counter_wrap_check)) | ||
| 548 | for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { | ||
| 549 | sdp = per_cpu_ptr(sp->sda, cpu); | ||
| 550 | raw_spin_lock_irqsave_rcu_node(sdp, flags); | ||
| 551 | if (ULONG_CMP_GE(gpseq, | ||
| 552 | sdp->srcu_gp_seq_needed + 100)) | ||
| 553 | sdp->srcu_gp_seq_needed = gpseq; | ||
| 554 | raw_spin_unlock_irqrestore_rcu_node(sdp, flags); | ||
| 555 | } | ||
| 535 | } | 556 | } |
| 536 | 557 | ||
| 537 | /* Callback initiation done, allow grace periods after next. */ | 558 | /* Callback initiation done, allow grace periods after next. */ |
| 538 | mutex_unlock(&sp->srcu_cb_mutex); | 559 | mutex_unlock(&sp->srcu_cb_mutex); |
| 539 | 560 | ||
| 540 | /* Start a new grace period if needed. */ | 561 | /* Start a new grace period if needed. */ |
| 541 | spin_lock_irq(&sp->gp_lock); | 562 | raw_spin_lock_irq_rcu_node(sp); |
| 542 | gpseq = rcu_seq_current(&sp->srcu_gp_seq); | 563 | gpseq = rcu_seq_current(&sp->srcu_gp_seq); |
| 543 | if (!rcu_seq_state(gpseq) && | 564 | if (!rcu_seq_state(gpseq) && |
| 544 | ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { | 565 | ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { |
| 545 | srcu_gp_start(sp); | 566 | srcu_gp_start(sp); |
| 546 | spin_unlock_irq(&sp->gp_lock); | 567 | raw_spin_unlock_irq_rcu_node(sp); |
| 547 | /* Throttle expedited grace periods: Should be rare! */ | 568 | /* Throttle expedited grace periods: Should be rare! */ |
| 548 | srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff | 569 | srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff |
| 549 | ? 0 : SRCU_INTERVAL); | 570 | ? 0 : SRCU_INTERVAL); |
| 550 | } else { | 571 | } else { |
| 551 | spin_unlock_irq(&sp->gp_lock); | 572 | raw_spin_unlock_irq_rcu_node(sp); |
| 552 | } | 573 | } |
| 553 | } | 574 | } |
| 554 | 575 | ||
| @@ -568,18 +589,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, | |||
| 568 | if (rcu_seq_done(&sp->srcu_gp_seq, s) || | 589 | if (rcu_seq_done(&sp->srcu_gp_seq, s) || |
| 569 | ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) | 590 | ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) |
| 570 | return; | 591 | return; |
| 571 | spin_lock_irqsave(&snp->lock, flags); | 592 | raw_spin_lock_irqsave_rcu_node(snp, flags); |
| 572 | if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { | 593 | if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { |
| 573 | spin_unlock_irqrestore(&snp->lock, flags); | 594 | raw_spin_unlock_irqrestore_rcu_node(snp, flags); |
| 574 | return; | 595 | return; |
| 575 | } | 596 | } |
| 576 | WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); | 597 | WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); |
| 577 | spin_unlock_irqrestore(&snp->lock, flags); | 598 | raw_spin_unlock_irqrestore_rcu_node(snp, flags); |
| 578 | } | 599 | } |
| 579 | spin_lock_irqsave(&sp->gp_lock, flags); | 600 | raw_spin_lock_irqsave_rcu_node(sp, flags); |
| 580 | if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) | 601 | if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) |
| 581 | sp->srcu_gp_seq_needed_exp = s; | 602 | sp->srcu_gp_seq_needed_exp = s; |
| 582 | spin_unlock_irqrestore(&sp->gp_lock, flags); | 603 | raw_spin_unlock_irqrestore_rcu_node(sp, flags); |
| 583 | } | 604 | } |
| 584 | 605 | ||
| 585 | /* | 606 | /* |
| @@ -601,14 +622,13 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, | |||
| 601 | for (; snp != NULL; snp = snp->srcu_parent) { | 622 | for (; snp != NULL; snp = snp->srcu_parent) { |
| 602 | if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) | 623 | if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) |
| 603 | return; /* GP already done and CBs recorded. */ | 624 | return; /* GP already done and CBs recorded. */ |
| 604 | spin_lock_irqsave(&snp->lock, flags); | 625 | raw_spin_lock_irqsave_rcu_node(snp, flags); |
| 605 | if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { | 626 | if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { |
| 606 | snp_seq = snp->srcu_have_cbs[idx]; | 627 | snp_seq = snp->srcu_have_cbs[idx]; |
| 607 | if (snp == sdp->mynode && snp_seq == s) | 628 | if (snp == sdp->mynode && snp_seq == s) |
| 608 | snp->srcu_data_have_cbs[idx] |= sdp->grpmask; | 629 | snp->srcu_data_have_cbs[idx] |= sdp->grpmask; |
| 609 | spin_unlock_irqrestore(&snp->lock, flags); | 630 | raw_spin_unlock_irqrestore_rcu_node(snp, flags); |
| 610 | if (snp == sdp->mynode && snp_seq != s) { | 631 | if (snp == sdp->mynode && snp_seq != s) { |
| 611 | smp_mb(); /* CBs after GP! */ | ||
| 612 | srcu_schedule_cbs_sdp(sdp, do_norm | 632 | srcu_schedule_cbs_sdp(sdp, do_norm |
| 613 | ? SRCU_INTERVAL | 633 | ? SRCU_INTERVAL |
| 614 | : 0); | 634 | : 0); |
| @@ -623,11 +643,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, | |||
| 623 | snp->srcu_data_have_cbs[idx] |= sdp->grpmask; | 643 | snp->srcu_data_have_cbs[idx] |= sdp->grpmask; |
| 624 | if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) | 644 | if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) |
| 625 | snp->srcu_gp_seq_needed_exp = s; | 645 | snp->srcu_gp_seq_needed_exp = s; |
| 626 | spin_unlock_irqrestore(&snp->lock, flags); | 646 | raw_spin_unlock_irqrestore_rcu_node(snp, flags); |
| 627 | } | 647 | } |
| 628 | 648 | ||
| 629 | /* Top of tree, must ensure the grace period will be started. */ | 649 | /* Top of tree, must ensure the grace period will be started. */ |
| 630 | spin_lock_irqsave(&sp->gp_lock, flags); | 650 | raw_spin_lock_irqsave_rcu_node(sp, flags); |
| 631 | if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { | 651 | if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { |
| 632 | /* | 652 | /* |
| 633 | * Record need for grace period s. Pair with load | 653 | * Record need for grace period s. Pair with load |
| @@ -646,7 +666,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, | |||
| 646 | queue_delayed_work(system_power_efficient_wq, &sp->work, | 666 | queue_delayed_work(system_power_efficient_wq, &sp->work, |
| 647 | srcu_get_delay(sp)); | 667 | srcu_get_delay(sp)); |
| 648 | } | 668 | } |
| 649 | spin_unlock_irqrestore(&sp->gp_lock, flags); | 669 | raw_spin_unlock_irqrestore_rcu_node(sp, flags); |
| 650 | } | 670 | } |
| 651 | 671 | ||
| 652 | /* | 672 | /* |
| @@ -672,6 +692,16 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) | |||
| 672 | */ | 692 | */ |
| 673 | static void srcu_flip(struct srcu_struct *sp) | 693 | static void srcu_flip(struct srcu_struct *sp) |
| 674 | { | 694 | { |
| 695 | /* | ||
| 696 | * Ensure that if this updater saw a given reader's increment | ||
| 697 | * from __srcu_read_lock(), that reader was using an old value | ||
| 698 | * of ->srcu_idx. Also ensure that if a given reader sees the | ||
| 699 | * new value of ->srcu_idx, this updater's earlier scans cannot | ||
| 700 | * have seen that reader's increments (which is OK, because this | ||
| 701 | * grace period need not wait on that reader). | ||
| 702 | */ | ||
| 703 | smp_mb(); /* E */ /* Pairs with B and C. */ | ||
| 704 | |||
| 675 | WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1); | 705 | WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1); |
| 676 | 706 | ||
| 677 | /* | 707 | /* |
| @@ -746,6 +776,13 @@ static bool srcu_might_be_idle(struct srcu_struct *sp) | |||
| 746 | } | 776 | } |
| 747 | 777 | ||
| 748 | /* | 778 | /* |
| 779 | * SRCU callback function to leak a callback. | ||
| 780 | */ | ||
| 781 | static void srcu_leak_callback(struct rcu_head *rhp) | ||
| 782 | { | ||
| 783 | } | ||
| 784 | |||
| 785 | /* | ||
| 749 | * Enqueue an SRCU callback on the srcu_data structure associated with | 786 | * Enqueue an SRCU callback on the srcu_data structure associated with |
| 750 | * the current CPU and the specified srcu_struct structure, initiating | 787 | * the current CPU and the specified srcu_struct structure, initiating |
| 751 | * grace-period processing if it is not already running. | 788 | * grace-period processing if it is not already running. |
| @@ -783,10 +820,16 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, | |||
| 783 | struct srcu_data *sdp; | 820 | struct srcu_data *sdp; |
| 784 | 821 | ||
| 785 | check_init_srcu_struct(sp); | 822 | check_init_srcu_struct(sp); |
| 823 | if (debug_rcu_head_queue(rhp)) { | ||
| 824 | /* Probable double call_srcu(), so leak the callback. */ | ||
| 825 | WRITE_ONCE(rhp->func, srcu_leak_callback); | ||
| 826 | WARN_ONCE(1, "call_srcu(): Leaked duplicate callback\n"); | ||
| 827 | return; | ||
| 828 | } | ||
| 786 | rhp->func = func; | 829 | rhp->func = func; |
| 787 | local_irq_save(flags); | 830 | local_irq_save(flags); |
| 788 | sdp = this_cpu_ptr(sp->sda); | 831 | sdp = this_cpu_ptr(sp->sda); |
| 789 | spin_lock(&sdp->lock); | 832 | raw_spin_lock_rcu_node(sdp); |
| 790 | rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); | 833 | rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); |
| 791 | rcu_segcblist_advance(&sdp->srcu_cblist, | 834 | rcu_segcblist_advance(&sdp->srcu_cblist, |
| 792 | rcu_seq_current(&sp->srcu_gp_seq)); | 835 | rcu_seq_current(&sp->srcu_gp_seq)); |
| @@ -800,13 +843,30 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, | |||
| 800 | sdp->srcu_gp_seq_needed_exp = s; | 843 | sdp->srcu_gp_seq_needed_exp = s; |
| 801 | needexp = true; | 844 | needexp = true; |
| 802 | } | 845 | } |
| 803 | spin_unlock_irqrestore(&sdp->lock, flags); | 846 | raw_spin_unlock_irqrestore_rcu_node(sdp, flags); |
| 804 | if (needgp) | 847 | if (needgp) |
| 805 | srcu_funnel_gp_start(sp, sdp, s, do_norm); | 848 | srcu_funnel_gp_start(sp, sdp, s, do_norm); |
| 806 | else if (needexp) | 849 | else if (needexp) |
| 807 | srcu_funnel_exp_start(sp, sdp->mynode, s); | 850 | srcu_funnel_exp_start(sp, sdp->mynode, s); |
| 808 | } | 851 | } |
| 809 | 852 | ||
| 853 | /** | ||
| 854 | * call_srcu() - Queue a callback for invocation after an SRCU grace period | ||
| 855 | * @sp: srcu_struct in queue the callback | ||
| 856 | * @head: structure to be used for queueing the SRCU callback. | ||
| 857 | * @func: function to be invoked after the SRCU grace period | ||
| 858 | * | ||
| 859 | * The callback function will be invoked some time after a full SRCU | ||
| 860 | * grace period elapses, in other words after all pre-existing SRCU | ||
| 861 | * read-side critical sections have completed. However, the callback | ||
| 862 | * function might well execute concurrently with other SRCU read-side | ||
| 863 | * critical sections that started after call_srcu() was invoked. SRCU | ||
| 864 | * read-side critical sections are delimited by srcu_read_lock() and | ||
| 865 | * srcu_read_unlock(), and may be nested. | ||
| 866 | * | ||
| 867 | * The callback will be invoked from process context, but must nevertheless | ||
| 868 | * be fast and must not block. | ||
| 869 | */ | ||
| 810 | void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, | 870 | void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, |
| 811 | rcu_callback_t func) | 871 | rcu_callback_t func) |
| 812 | { | 872 | { |
| @@ -954,13 +1014,16 @@ void srcu_barrier(struct srcu_struct *sp) | |||
| 954 | */ | 1014 | */ |
| 955 | for_each_possible_cpu(cpu) { | 1015 | for_each_possible_cpu(cpu) { |
| 956 | sdp = per_cpu_ptr(sp->sda, cpu); | 1016 | sdp = per_cpu_ptr(sp->sda, cpu); |
| 957 | spin_lock_irq(&sdp->lock); | 1017 | raw_spin_lock_irq_rcu_node(sdp); |
| 958 | atomic_inc(&sp->srcu_barrier_cpu_cnt); | 1018 | atomic_inc(&sp->srcu_barrier_cpu_cnt); |
| 959 | sdp->srcu_barrier_head.func = srcu_barrier_cb; | 1019 | sdp->srcu_barrier_head.func = srcu_barrier_cb; |
| 1020 | debug_rcu_head_queue(&sdp->srcu_barrier_head); | ||
| 960 | if (!rcu_segcblist_entrain(&sdp->srcu_cblist, | 1021 | if (!rcu_segcblist_entrain(&sdp->srcu_cblist, |
| 961 | &sdp->srcu_barrier_head, 0)) | 1022 | &sdp->srcu_barrier_head, 0)) { |
| 1023 | debug_rcu_head_unqueue(&sdp->srcu_barrier_head); | ||
| 962 | atomic_dec(&sp->srcu_barrier_cpu_cnt); | 1024 | atomic_dec(&sp->srcu_barrier_cpu_cnt); |
| 963 | spin_unlock_irq(&sdp->lock); | 1025 | } |
| 1026 | raw_spin_unlock_irq_rcu_node(sdp); | ||
| 964 | } | 1027 | } |
| 965 | 1028 | ||
| 966 | /* Remove the initial count, at which point reaching zero can happen. */ | 1029 | /* Remove the initial count, at which point reaching zero can happen. */ |
| @@ -1009,17 +1072,17 @@ static void srcu_advance_state(struct srcu_struct *sp) | |||
| 1009 | */ | 1072 | */ |
| 1010 | idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ | 1073 | idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ |
| 1011 | if (idx == SRCU_STATE_IDLE) { | 1074 | if (idx == SRCU_STATE_IDLE) { |
| 1012 | spin_lock_irq(&sp->gp_lock); | 1075 | raw_spin_lock_irq_rcu_node(sp); |
| 1013 | if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { | 1076 | if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { |
| 1014 | WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); | 1077 | WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); |
| 1015 | spin_unlock_irq(&sp->gp_lock); | 1078 | raw_spin_unlock_irq_rcu_node(sp); |
| 1016 | mutex_unlock(&sp->srcu_gp_mutex); | 1079 | mutex_unlock(&sp->srcu_gp_mutex); |
| 1017 | return; | 1080 | return; |
| 1018 | } | 1081 | } |
| 1019 | idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); | 1082 | idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); |
| 1020 | if (idx == SRCU_STATE_IDLE) | 1083 | if (idx == SRCU_STATE_IDLE) |
| 1021 | srcu_gp_start(sp); | 1084 | srcu_gp_start(sp); |
| 1022 | spin_unlock_irq(&sp->gp_lock); | 1085 | raw_spin_unlock_irq_rcu_node(sp); |
| 1023 | if (idx != SRCU_STATE_IDLE) { | 1086 | if (idx != SRCU_STATE_IDLE) { |
| 1024 | mutex_unlock(&sp->srcu_gp_mutex); | 1087 | mutex_unlock(&sp->srcu_gp_mutex); |
| 1025 | return; /* Someone else started the grace period. */ | 1088 | return; /* Someone else started the grace period. */ |
| @@ -1068,22 +1131,22 @@ static void srcu_invoke_callbacks(struct work_struct *work) | |||
| 1068 | sdp = container_of(work, struct srcu_data, work.work); | 1131 | sdp = container_of(work, struct srcu_data, work.work); |
| 1069 | sp = sdp->sp; | 1132 | sp = sdp->sp; |
| 1070 | rcu_cblist_init(&ready_cbs); | 1133 | rcu_cblist_init(&ready_cbs); |
| 1071 | spin_lock_irq(&sdp->lock); | 1134 | raw_spin_lock_irq_rcu_node(sdp); |
| 1072 | smp_mb(); /* Old grace periods before callback invocation! */ | ||
| 1073 | rcu_segcblist_advance(&sdp->srcu_cblist, | 1135 | rcu_segcblist_advance(&sdp->srcu_cblist, |
| 1074 | rcu_seq_current(&sp->srcu_gp_seq)); | 1136 | rcu_seq_current(&sp->srcu_gp_seq)); |
| 1075 | if (sdp->srcu_cblist_invoking || | 1137 | if (sdp->srcu_cblist_invoking || |
| 1076 | !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { | 1138 | !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { |
| 1077 | spin_unlock_irq(&sdp->lock); | 1139 | raw_spin_unlock_irq_rcu_node(sdp); |
| 1078 | return; /* Someone else on the job or nothing to do. */ | 1140 | return; /* Someone else on the job or nothing to do. */ |
| 1079 | } | 1141 | } |
| 1080 | 1142 | ||
| 1081 | /* We are on the job! Extract and invoke ready callbacks. */ | 1143 | /* We are on the job! Extract and invoke ready callbacks. */ |
| 1082 | sdp->srcu_cblist_invoking = true; | 1144 | sdp->srcu_cblist_invoking = true; |
| 1083 | rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); | 1145 | rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); |
| 1084 | spin_unlock_irq(&sdp->lock); | 1146 | raw_spin_unlock_irq_rcu_node(sdp); |
| 1085 | rhp = rcu_cblist_dequeue(&ready_cbs); | 1147 | rhp = rcu_cblist_dequeue(&ready_cbs); |
| 1086 | for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { | 1148 | for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { |
| 1149 | debug_rcu_head_unqueue(rhp); | ||
| 1087 | local_bh_disable(); | 1150 | local_bh_disable(); |
| 1088 | rhp->func(rhp); | 1151 | rhp->func(rhp); |
| 1089 | local_bh_enable(); | 1152 | local_bh_enable(); |
| @@ -1093,13 +1156,13 @@ static void srcu_invoke_callbacks(struct work_struct *work) | |||
| 1093 | * Update counts, accelerate new callbacks, and if needed, | 1156 | * Update counts, accelerate new callbacks, and if needed, |
| 1094 | * schedule another round of callback invocation. | 1157 | * schedule another round of callback invocation. |
| 1095 | */ | 1158 | */ |
| 1096 | spin_lock_irq(&sdp->lock); | 1159 | raw_spin_lock_irq_rcu_node(sdp); |
| 1097 | rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); | 1160 | rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); |
| 1098 | (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, | 1161 | (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, |
| 1099 | rcu_seq_snap(&sp->srcu_gp_seq)); | 1162 | rcu_seq_snap(&sp->srcu_gp_seq)); |
| 1100 | sdp->srcu_cblist_invoking = false; | 1163 | sdp->srcu_cblist_invoking = false; |
| 1101 | more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); | 1164 | more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); |
| 1102 | spin_unlock_irq(&sdp->lock); | 1165 | raw_spin_unlock_irq_rcu_node(sdp); |
| 1103 | if (more) | 1166 | if (more) |
| 1104 | srcu_schedule_cbs_sdp(sdp, 0); | 1167 | srcu_schedule_cbs_sdp(sdp, 0); |
| 1105 | } | 1168 | } |
| @@ -1112,7 +1175,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) | |||
| 1112 | { | 1175 | { |
| 1113 | bool pushgp = true; | 1176 | bool pushgp = true; |
| 1114 | 1177 | ||
| 1115 | spin_lock_irq(&sp->gp_lock); | 1178 | raw_spin_lock_irq_rcu_node(sp); |
| 1116 | if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { | 1179 | if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { |
| 1117 | if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { | 1180 | if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { |
| 1118 | /* All requests fulfilled, time to go idle. */ | 1181 | /* All requests fulfilled, time to go idle. */ |
| @@ -1122,7 +1185,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) | |||
| 1122 | /* Outstanding request and no GP. Start one. */ | 1185 | /* Outstanding request and no GP. Start one. */ |
| 1123 | srcu_gp_start(sp); | 1186 | srcu_gp_start(sp); |
| 1124 | } | 1187 | } |
| 1125 | spin_unlock_irq(&sp->gp_lock); | 1188 | raw_spin_unlock_irq_rcu_node(sp); |
| 1126 | 1189 | ||
| 1127 | if (pushgp) | 1190 | if (pushgp) |
| 1128 | queue_delayed_work(system_power_efficient_wq, &sp->work, delay); | 1191 | queue_delayed_work(system_power_efficient_wq, &sp->work, delay); |
| @@ -1153,3 +1216,12 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, | |||
| 1153 | *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed); | 1216 | *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed); |
| 1154 | } | 1217 | } |
| 1155 | EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); | 1218 | EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); |
| 1219 | |||
| 1220 | static int __init srcu_bootup_announce(void) | ||
| 1221 | { | ||
| 1222 | pr_info("Hierarchical SRCU implementation.\n"); | ||
| 1223 | if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF) | ||
| 1224 | pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff); | ||
| 1225 | return 0; | ||
| 1226 | } | ||
| 1227 | early_initcall(srcu_bootup_announce); | ||
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index e5385731e391..f8488965250f 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c | |||
| @@ -35,15 +35,26 @@ | |||
| 35 | #include <linux/time.h> | 35 | #include <linux/time.h> |
| 36 | #include <linux/cpu.h> | 36 | #include <linux/cpu.h> |
| 37 | #include <linux/prefetch.h> | 37 | #include <linux/prefetch.h> |
| 38 | #include <linux/trace_events.h> | ||
| 39 | 38 | ||
| 40 | #include "rcu.h" | 39 | #include "rcu.h" |
| 41 | 40 | ||
| 42 | /* Forward declarations for tiny_plugin.h. */ | 41 | /* Global control variables for rcupdate callback mechanism. */ |
| 43 | struct rcu_ctrlblk; | 42 | struct rcu_ctrlblk { |
| 44 | static void __call_rcu(struct rcu_head *head, | 43 | struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ |
| 45 | rcu_callback_t func, | 44 | struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ |
| 46 | struct rcu_ctrlblk *rcp); | 45 | struct rcu_head **curtail; /* ->next pointer of last CB. */ |
| 46 | }; | ||
| 47 | |||
| 48 | /* Definition for rcupdate control block. */ | ||
| 49 | static struct rcu_ctrlblk rcu_sched_ctrlblk = { | ||
| 50 | .donetail = &rcu_sched_ctrlblk.rcucblist, | ||
| 51 | .curtail = &rcu_sched_ctrlblk.rcucblist, | ||
| 52 | }; | ||
| 53 | |||
| 54 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
| 55 | .donetail = &rcu_bh_ctrlblk.rcucblist, | ||
| 56 | .curtail = &rcu_bh_ctrlblk.rcucblist, | ||
| 57 | }; | ||
| 47 | 58 | ||
| 48 | #include "tiny_plugin.h" | 59 | #include "tiny_plugin.h" |
| 49 | 60 | ||
| @@ -59,19 +70,6 @@ void rcu_barrier_sched(void) | |||
| 59 | } | 70 | } |
| 60 | EXPORT_SYMBOL(rcu_barrier_sched); | 71 | EXPORT_SYMBOL(rcu_barrier_sched); |
| 61 | 72 | ||
| 62 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) | ||
| 63 | |||
| 64 | /* | ||
| 65 | * Test whether RCU thinks that the current CPU is idle. | ||
| 66 | */ | ||
| 67 | bool notrace __rcu_is_watching(void) | ||
| 68 | { | ||
| 69 | return true; | ||
| 70 | } | ||
| 71 | EXPORT_SYMBOL(__rcu_is_watching); | ||
| 72 | |||
| 73 | #endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ | ||
| 74 | |||
| 75 | /* | 73 | /* |
| 76 | * Helper function for rcu_sched_qs() and rcu_bh_qs(). | 74 | * Helper function for rcu_sched_qs() and rcu_bh_qs(). |
| 77 | * Also irqs are disabled to avoid confusion due to interrupt handlers | 75 | * Also irqs are disabled to avoid confusion due to interrupt handlers |
| @@ -79,7 +77,6 @@ EXPORT_SYMBOL(__rcu_is_watching); | |||
| 79 | */ | 77 | */ |
| 80 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | 78 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) |
| 81 | { | 79 | { |
| 82 | RCU_TRACE(reset_cpu_stall_ticks(rcp);) | ||
| 83 | if (rcp->donetail != rcp->curtail) { | 80 | if (rcp->donetail != rcp->curtail) { |
| 84 | rcp->donetail = rcp->curtail; | 81 | rcp->donetail = rcp->curtail; |
| 85 | return 1; | 82 | return 1; |
| @@ -125,7 +122,6 @@ void rcu_bh_qs(void) | |||
| 125 | */ | 122 | */ |
| 126 | void rcu_check_callbacks(int user) | 123 | void rcu_check_callbacks(int user) |
| 127 | { | 124 | { |
| 128 | RCU_TRACE(check_cpu_stalls();) | ||
| 129 | if (user) | 125 | if (user) |
| 130 | rcu_sched_qs(); | 126 | rcu_sched_qs(); |
| 131 | else if (!in_softirq()) | 127 | else if (!in_softirq()) |
| @@ -140,10 +136,8 @@ void rcu_check_callbacks(int user) | |||
| 140 | */ | 136 | */ |
| 141 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | 137 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) |
| 142 | { | 138 | { |
| 143 | const char *rn = NULL; | ||
| 144 | struct rcu_head *next, *list; | 139 | struct rcu_head *next, *list; |
| 145 | unsigned long flags; | 140 | unsigned long flags; |
| 146 | RCU_TRACE(int cb_count = 0;) | ||
| 147 | 141 | ||
| 148 | /* Move the ready-to-invoke callbacks to a local list. */ | 142 | /* Move the ready-to-invoke callbacks to a local list. */ |
| 149 | local_irq_save(flags); | 143 | local_irq_save(flags); |
| @@ -152,7 +146,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
| 152 | local_irq_restore(flags); | 146 | local_irq_restore(flags); |
| 153 | return; | 147 | return; |
| 154 | } | 148 | } |
| 155 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);) | ||
| 156 | list = rcp->rcucblist; | 149 | list = rcp->rcucblist; |
| 157 | rcp->rcucblist = *rcp->donetail; | 150 | rcp->rcucblist = *rcp->donetail; |
| 158 | *rcp->donetail = NULL; | 151 | *rcp->donetail = NULL; |
| @@ -162,22 +155,15 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
| 162 | local_irq_restore(flags); | 155 | local_irq_restore(flags); |
| 163 | 156 | ||
| 164 | /* Invoke the callbacks on the local list. */ | 157 | /* Invoke the callbacks on the local list. */ |
| 165 | RCU_TRACE(rn = rcp->name;) | ||
| 166 | while (list) { | 158 | while (list) { |
| 167 | next = list->next; | 159 | next = list->next; |
| 168 | prefetch(next); | 160 | prefetch(next); |
| 169 | debug_rcu_head_unqueue(list); | 161 | debug_rcu_head_unqueue(list); |
| 170 | local_bh_disable(); | 162 | local_bh_disable(); |
| 171 | __rcu_reclaim(rn, list); | 163 | __rcu_reclaim("", list); |
| 172 | local_bh_enable(); | 164 | local_bh_enable(); |
| 173 | list = next; | 165 | list = next; |
| 174 | RCU_TRACE(cb_count++;) | ||
| 175 | } | 166 | } |
| 176 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);) | ||
| 177 | RCU_TRACE(trace_rcu_batch_end(rcp->name, | ||
| 178 | cb_count, 0, need_resched(), | ||
| 179 | is_idle_task(current), | ||
| 180 | false)); | ||
| 181 | } | 167 | } |
| 182 | 168 | ||
| 183 | static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) | 169 | static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) |
| @@ -221,7 +207,6 @@ static void __call_rcu(struct rcu_head *head, | |||
| 221 | local_irq_save(flags); | 207 | local_irq_save(flags); |
| 222 | *rcp->curtail = head; | 208 | *rcp->curtail = head; |
| 223 | rcp->curtail = &head->next; | 209 | rcp->curtail = &head->next; |
| 224 | RCU_TRACE(rcp->qlen++;) | ||
| 225 | local_irq_restore(flags); | 210 | local_irq_restore(flags); |
| 226 | 211 | ||
| 227 | if (unlikely(is_idle_task(current))) { | 212 | if (unlikely(is_idle_task(current))) { |
| @@ -254,8 +239,5 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); | |||
| 254 | void __init rcu_init(void) | 239 | void __init rcu_init(void) |
| 255 | { | 240 | { |
| 256 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 241 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
| 257 | RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);) | ||
| 258 | RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);) | ||
| 259 | |||
| 260 | rcu_early_boot_tests(); | 242 | rcu_early_boot_tests(); |
| 261 | } | 243 | } |
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 371034e77f87..f0a01b2a3062 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h | |||
| @@ -22,36 +22,6 @@ | |||
| 22 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 22 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> |
| 23 | */ | 23 | */ |
| 24 | 24 | ||
| 25 | #include <linux/kthread.h> | ||
| 26 | #include <linux/init.h> | ||
| 27 | #include <linux/debugfs.h> | ||
| 28 | #include <linux/seq_file.h> | ||
| 29 | |||
| 30 | /* Global control variables for rcupdate callback mechanism. */ | ||
| 31 | struct rcu_ctrlblk { | ||
| 32 | struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ | ||
| 33 | struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ | ||
| 34 | struct rcu_head **curtail; /* ->next pointer of last CB. */ | ||
| 35 | RCU_TRACE(long qlen); /* Number of pending CBs. */ | ||
| 36 | RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ | ||
| 37 | RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ | ||
| 38 | RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ | ||
| 39 | RCU_TRACE(const char *name); /* Name of RCU type. */ | ||
| 40 | }; | ||
| 41 | |||
| 42 | /* Definition for rcupdate control block. */ | ||
| 43 | static struct rcu_ctrlblk rcu_sched_ctrlblk = { | ||
| 44 | .donetail = &rcu_sched_ctrlblk.rcucblist, | ||
| 45 | .curtail = &rcu_sched_ctrlblk.rcucblist, | ||
| 46 | RCU_TRACE(.name = "rcu_sched") | ||
| 47 | }; | ||
| 48 | |||
| 49 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
| 50 | .donetail = &rcu_bh_ctrlblk.rcucblist, | ||
| 51 | .curtail = &rcu_bh_ctrlblk.rcucblist, | ||
| 52 | RCU_TRACE(.name = "rcu_bh") | ||
| 53 | }; | ||
| 54 | |||
| 55 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) | 25 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) |
| 56 | #include <linux/kernel_stat.h> | 26 | #include <linux/kernel_stat.h> |
| 57 | 27 | ||
| @@ -75,96 +45,3 @@ void __init rcu_scheduler_starting(void) | |||
| 75 | } | 45 | } |
| 76 | 46 | ||
| 77 | #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ | 47 | #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ |
| 78 | |||
| 79 | #ifdef CONFIG_RCU_TRACE | ||
| 80 | |||
| 81 | static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) | ||
| 82 | { | ||
| 83 | unsigned long flags; | ||
| 84 | |||
| 85 | local_irq_save(flags); | ||
| 86 | rcp->qlen -= n; | ||
| 87 | local_irq_restore(flags); | ||
| 88 | } | ||
| 89 | |||
| 90 | /* | ||
| 91 | * Dump statistics for TINY_RCU, such as they are. | ||
| 92 | */ | ||
| 93 | static int show_tiny_stats(struct seq_file *m, void *unused) | ||
| 94 | { | ||
| 95 | seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); | ||
| 96 | seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); | ||
| 97 | return 0; | ||
| 98 | } | ||
| 99 | |||
| 100 | static int show_tiny_stats_open(struct inode *inode, struct file *file) | ||
| 101 | { | ||
| 102 | return single_open(file, show_tiny_stats, NULL); | ||
| 103 | } | ||
| 104 | |||
| 105 | static const struct file_operations show_tiny_stats_fops = { | ||
| 106 | .owner = THIS_MODULE, | ||
| 107 | .open = show_tiny_stats_open, | ||
| 108 | .read = seq_read, | ||
| 109 | .llseek = seq_lseek, | ||
| 110 | .release = single_release, | ||
| 111 | }; | ||
| 112 | |||
| 113 | static struct dentry *rcudir; | ||
| 114 | |||
| 115 | static int __init rcutiny_trace_init(void) | ||
| 116 | { | ||
| 117 | struct dentry *retval; | ||
| 118 | |||
| 119 | rcudir = debugfs_create_dir("rcu", NULL); | ||
| 120 | if (!rcudir) | ||
| 121 | goto free_out; | ||
| 122 | retval = debugfs_create_file("rcudata", 0444, rcudir, | ||
| 123 | NULL, &show_tiny_stats_fops); | ||
| 124 | if (!retval) | ||
| 125 | goto free_out; | ||
| 126 | return 0; | ||
| 127 | free_out: | ||
| 128 | debugfs_remove_recursive(rcudir); | ||
| 129 | return 1; | ||
| 130 | } | ||
| 131 | device_initcall(rcutiny_trace_init); | ||
| 132 | |||
| 133 | static void check_cpu_stall(struct rcu_ctrlblk *rcp) | ||
| 134 | { | ||
| 135 | unsigned long j; | ||
| 136 | unsigned long js; | ||
| 137 | |||
| 138 | if (rcu_cpu_stall_suppress) | ||
| 139 | return; | ||
| 140 | rcp->ticks_this_gp++; | ||
| 141 | j = jiffies; | ||
| 142 | js = READ_ONCE(rcp->jiffies_stall); | ||
| 143 | if (rcp->rcucblist && ULONG_CMP_GE(j, js)) { | ||
| 144 | pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", | ||
| 145 | rcp->name, rcp->ticks_this_gp, DYNTICK_TASK_EXIT_IDLE, | ||
| 146 | jiffies - rcp->gp_start, rcp->qlen); | ||
| 147 | dump_stack(); | ||
| 148 | WRITE_ONCE(rcp->jiffies_stall, | ||
| 149 | jiffies + 3 * rcu_jiffies_till_stall_check() + 3); | ||
| 150 | } else if (ULONG_CMP_GE(j, js)) { | ||
| 151 | WRITE_ONCE(rcp->jiffies_stall, | ||
| 152 | jiffies + rcu_jiffies_till_stall_check()); | ||
| 153 | } | ||
| 154 | } | ||
| 155 | |||
| 156 | static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) | ||
| 157 | { | ||
| 158 | rcp->ticks_this_gp = 0; | ||
| 159 | rcp->gp_start = jiffies; | ||
| 160 | WRITE_ONCE(rcp->jiffies_stall, | ||
| 161 | jiffies + rcu_jiffies_till_stall_check()); | ||
| 162 | } | ||
| 163 | |||
| 164 | static void check_cpu_stalls(void) | ||
| 165 | { | ||
| 166 | RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);) | ||
| 167 | RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);) | ||
| 168 | } | ||
| 169 | |||
| 170 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e354e475e645..51d4c3acf32d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
| @@ -168,35 +168,17 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, | |||
| 168 | static void sync_sched_exp_online_cleanup(int cpu); | 168 | static void sync_sched_exp_online_cleanup(int cpu); |
| 169 | 169 | ||
| 170 | /* rcuc/rcub kthread realtime priority */ | 170 | /* rcuc/rcub kthread realtime priority */ |
| 171 | #ifdef CONFIG_RCU_KTHREAD_PRIO | ||
| 172 | static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; | ||
| 173 | #else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */ | ||
| 174 | static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; | 171 | static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; |
| 175 | #endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */ | ||
| 176 | module_param(kthread_prio, int, 0644); | 172 | module_param(kthread_prio, int, 0644); |
| 177 | 173 | ||
| 178 | /* Delay in jiffies for grace-period initialization delays, debug only. */ | 174 | /* Delay in jiffies for grace-period initialization delays, debug only. */ |
| 179 | 175 | ||
| 180 | #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT | 176 | static int gp_preinit_delay; |
| 181 | static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY; | 177 | module_param(gp_preinit_delay, int, 0444); |
| 182 | module_param(gp_preinit_delay, int, 0644); | 178 | static int gp_init_delay; |
| 183 | #else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */ | 179 | module_param(gp_init_delay, int, 0444); |
| 184 | static const int gp_preinit_delay; | 180 | static int gp_cleanup_delay; |
| 185 | #endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */ | 181 | module_param(gp_cleanup_delay, int, 0444); |
| 186 | |||
| 187 | #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT | ||
| 188 | static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY; | ||
| 189 | module_param(gp_init_delay, int, 0644); | ||
| 190 | #else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */ | ||
| 191 | static const int gp_init_delay; | ||
| 192 | #endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */ | ||
| 193 | |||
| 194 | #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP | ||
| 195 | static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY; | ||
| 196 | module_param(gp_cleanup_delay, int, 0644); | ||
| 197 | #else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */ | ||
| 198 | static const int gp_cleanup_delay; | ||
| 199 | #endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */ | ||
| 200 | 182 | ||
| 201 | /* | 183 | /* |
| 202 | * Number of grace periods between delays, normalized by the duration of | 184 | * Number of grace periods between delays, normalized by the duration of |
| @@ -250,6 +232,7 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) | |||
| 250 | */ | 232 | */ |
| 251 | void rcu_sched_qs(void) | 233 | void rcu_sched_qs(void) |
| 252 | { | 234 | { |
| 235 | RCU_LOCKDEP_WARN(preemptible(), "rcu_sched_qs() invoked with preemption enabled!!!"); | ||
| 253 | if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) | 236 | if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) |
| 254 | return; | 237 | return; |
| 255 | trace_rcu_grace_period(TPS("rcu_sched"), | 238 | trace_rcu_grace_period(TPS("rcu_sched"), |
| @@ -265,6 +248,7 @@ void rcu_sched_qs(void) | |||
| 265 | 248 | ||
| 266 | void rcu_bh_qs(void) | 249 | void rcu_bh_qs(void) |
| 267 | { | 250 | { |
| 251 | RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!"); | ||
| 268 | if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { | 252 | if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { |
| 269 | trace_rcu_grace_period(TPS("rcu_bh"), | 253 | trace_rcu_grace_period(TPS("rcu_bh"), |
| 270 | __this_cpu_read(rcu_bh_data.gpnum), | 254 | __this_cpu_read(rcu_bh_data.gpnum), |
| @@ -286,10 +270,6 @@ void rcu_bh_qs(void) | |||
| 286 | static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 270 | static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
| 287 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | 271 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, |
| 288 | .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), | 272 | .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), |
| 289 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
| 290 | .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, | ||
| 291 | .dynticks_idle = ATOMIC_INIT(1), | ||
| 292 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
| 293 | }; | 273 | }; |
| 294 | 274 | ||
| 295 | /* | 275 | /* |
| @@ -478,7 +458,7 @@ void rcu_note_context_switch(bool preempt) | |||
| 478 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ | 458 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ |
| 479 | trace_rcu_utilization(TPS("Start context switch")); | 459 | trace_rcu_utilization(TPS("Start context switch")); |
| 480 | rcu_sched_qs(); | 460 | rcu_sched_qs(); |
| 481 | rcu_preempt_note_context_switch(); | 461 | rcu_preempt_note_context_switch(preempt); |
| 482 | /* Load rcu_urgent_qs before other flags. */ | 462 | /* Load rcu_urgent_qs before other flags. */ |
| 483 | if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) | 463 | if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) |
| 484 | goto out; | 464 | goto out; |
| @@ -534,9 +514,12 @@ void rcu_all_qs(void) | |||
| 534 | } | 514 | } |
| 535 | EXPORT_SYMBOL_GPL(rcu_all_qs); | 515 | EXPORT_SYMBOL_GPL(rcu_all_qs); |
| 536 | 516 | ||
| 537 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ | 517 | #define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ |
| 538 | static long qhimark = 10000; /* If this many pending, ignore blimit. */ | 518 | static long blimit = DEFAULT_RCU_BLIMIT; |
| 539 | static long qlowmark = 100; /* Once only this many pending, use blimit. */ | 519 | #define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ |
| 520 | static long qhimark = DEFAULT_RCU_QHIMARK; | ||
| 521 | #define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */ | ||
| 522 | static long qlowmark = DEFAULT_RCU_QLOMARK; | ||
| 540 | 523 | ||
| 541 | module_param(blimit, long, 0444); | 524 | module_param(blimit, long, 0444); |
| 542 | module_param(qhimark, long, 0444); | 525 | module_param(qhimark, long, 0444); |
| @@ -559,10 +542,7 @@ module_param(jiffies_till_sched_qs, ulong, 0644); | |||
| 559 | 542 | ||
| 560 | static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | 543 | static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, |
| 561 | struct rcu_data *rdp); | 544 | struct rcu_data *rdp); |
| 562 | static void force_qs_rnp(struct rcu_state *rsp, | 545 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)); |
| 563 | int (*f)(struct rcu_data *rsp, bool *isidle, | ||
| 564 | unsigned long *maxj), | ||
| 565 | bool *isidle, unsigned long *maxj); | ||
| 566 | static void force_quiescent_state(struct rcu_state *rsp); | 546 | static void force_quiescent_state(struct rcu_state *rsp); |
| 567 | static int rcu_pending(void); | 547 | static int rcu_pending(void); |
| 568 | 548 | ||
| @@ -757,6 +737,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) | |||
| 757 | int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; | 737 | int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; |
| 758 | int *fp = &rnp->need_future_gp[idx]; | 738 | int *fp = &rnp->need_future_gp[idx]; |
| 759 | 739 | ||
| 740 | RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!"); | ||
| 760 | return READ_ONCE(*fp); | 741 | return READ_ONCE(*fp); |
| 761 | } | 742 | } |
| 762 | 743 | ||
| @@ -768,6 +749,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) | |||
| 768 | static bool | 749 | static bool |
| 769 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | 750 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) |
| 770 | { | 751 | { |
| 752 | RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!"); | ||
| 771 | if (rcu_gp_in_progress(rsp)) | 753 | if (rcu_gp_in_progress(rsp)) |
| 772 | return false; /* No, a grace period is already in progress. */ | 754 | return false; /* No, a grace period is already in progress. */ |
| 773 | if (rcu_future_needs_gp(rsp)) | 755 | if (rcu_future_needs_gp(rsp)) |
| @@ -794,6 +776,7 @@ static void rcu_eqs_enter_common(bool user) | |||
| 794 | struct rcu_data *rdp; | 776 | struct rcu_data *rdp; |
| 795 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 777 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
| 796 | 778 | ||
| 779 | RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!"); | ||
| 797 | trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); | 780 | trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); |
| 798 | if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && | 781 | if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && |
| 799 | !user && !is_idle_task(current)) { | 782 | !user && !is_idle_task(current)) { |
| @@ -864,7 +847,6 @@ void rcu_idle_enter(void) | |||
| 864 | 847 | ||
| 865 | local_irq_save(flags); | 848 | local_irq_save(flags); |
| 866 | rcu_eqs_enter(false); | 849 | rcu_eqs_enter(false); |
| 867 | rcu_sysidle_enter(0); | ||
| 868 | local_irq_restore(flags); | 850 | local_irq_restore(flags); |
| 869 | } | 851 | } |
| 870 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | 852 | EXPORT_SYMBOL_GPL(rcu_idle_enter); |
| @@ -914,7 +896,6 @@ void rcu_irq_exit(void) | |||
| 914 | trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1); | 896 | trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1); |
| 915 | rdtp->dynticks_nesting--; | 897 | rdtp->dynticks_nesting--; |
| 916 | } | 898 | } |
| 917 | rcu_sysidle_enter(1); | ||
| 918 | } | 899 | } |
| 919 | 900 | ||
| 920 | /* | 901 | /* |
| @@ -967,6 +948,7 @@ static void rcu_eqs_exit(bool user) | |||
| 967 | struct rcu_dynticks *rdtp; | 948 | struct rcu_dynticks *rdtp; |
| 968 | long long oldval; | 949 | long long oldval; |
| 969 | 950 | ||
| 951 | RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!"); | ||
| 970 | rdtp = this_cpu_ptr(&rcu_dynticks); | 952 | rdtp = this_cpu_ptr(&rcu_dynticks); |
| 971 | oldval = rdtp->dynticks_nesting; | 953 | oldval = rdtp->dynticks_nesting; |
| 972 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); | 954 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); |
| @@ -995,7 +977,6 @@ void rcu_idle_exit(void) | |||
| 995 | 977 | ||
| 996 | local_irq_save(flags); | 978 | local_irq_save(flags); |
| 997 | rcu_eqs_exit(false); | 979 | rcu_eqs_exit(false); |
| 998 | rcu_sysidle_exit(0); | ||
| 999 | local_irq_restore(flags); | 980 | local_irq_restore(flags); |
| 1000 | } | 981 | } |
| 1001 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | 982 | EXPORT_SYMBOL_GPL(rcu_idle_exit); |
| @@ -1047,7 +1028,6 @@ void rcu_irq_enter(void) | |||
| 1047 | trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); | 1028 | trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); |
| 1048 | else | 1029 | else |
| 1049 | rcu_eqs_exit_common(oldval, true); | 1030 | rcu_eqs_exit_common(oldval, true); |
| 1050 | rcu_sysidle_exit(1); | ||
| 1051 | } | 1031 | } |
| 1052 | 1032 | ||
| 1053 | /* | 1033 | /* |
| @@ -1130,22 +1110,11 @@ void rcu_nmi_exit(void) | |||
| 1130 | } | 1110 | } |
| 1131 | 1111 | ||
| 1132 | /** | 1112 | /** |
| 1133 | * __rcu_is_watching - are RCU read-side critical sections safe? | ||
| 1134 | * | ||
| 1135 | * Return true if RCU is watching the running CPU, which means that | ||
| 1136 | * this CPU can safely enter RCU read-side critical sections. Unlike | ||
| 1137 | * rcu_is_watching(), the caller of __rcu_is_watching() must have at | ||
| 1138 | * least disabled preemption. | ||
| 1139 | */ | ||
| 1140 | bool notrace __rcu_is_watching(void) | ||
| 1141 | { | ||
| 1142 | return !rcu_dynticks_curr_cpu_in_eqs(); | ||
| 1143 | } | ||
| 1144 | |||
| 1145 | /** | ||
| 1146 | * rcu_is_watching - see if RCU thinks that the current CPU is idle | 1113 | * rcu_is_watching - see if RCU thinks that the current CPU is idle |
| 1147 | * | 1114 | * |
| 1148 | * If the current CPU is in its idle loop and is neither in an interrupt | 1115 | * Return true if RCU is watching the running CPU, which means that this |
| 1116 | * CPU can safely enter RCU read-side critical sections. In other words, | ||
| 1117 | * if the current CPU is in its idle loop and is neither in an interrupt | ||
| 1149 | * or NMI handler, return true. | 1118 | * or NMI handler, return true. |
| 1150 | */ | 1119 | */ |
| 1151 | bool notrace rcu_is_watching(void) | 1120 | bool notrace rcu_is_watching(void) |
| @@ -1153,7 +1122,7 @@ bool notrace rcu_is_watching(void) | |||
| 1153 | bool ret; | 1122 | bool ret; |
| 1154 | 1123 | ||
| 1155 | preempt_disable_notrace(); | 1124 | preempt_disable_notrace(); |
| 1156 | ret = __rcu_is_watching(); | 1125 | ret = !rcu_dynticks_curr_cpu_in_eqs(); |
| 1157 | preempt_enable_notrace(); | 1126 | preempt_enable_notrace(); |
| 1158 | return ret; | 1127 | return ret; |
| 1159 | } | 1128 | } |
| @@ -1237,11 +1206,9 @@ static int rcu_is_cpu_rrupt_from_idle(void) | |||
| 1237 | * credit them with an implicit quiescent state. Return 1 if this CPU | 1206 | * credit them with an implicit quiescent state. Return 1 if this CPU |
| 1238 | * is in dynticks idle mode, which is an extended quiescent state. | 1207 | * is in dynticks idle mode, which is an extended quiescent state. |
| 1239 | */ | 1208 | */ |
| 1240 | static int dyntick_save_progress_counter(struct rcu_data *rdp, | 1209 | static int dyntick_save_progress_counter(struct rcu_data *rdp) |
| 1241 | bool *isidle, unsigned long *maxj) | ||
| 1242 | { | 1210 | { |
| 1243 | rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); | 1211 | rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); |
| 1244 | rcu_sysidle_check_cpu(rdp, isidle, maxj); | ||
| 1245 | if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { | 1212 | if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { |
| 1246 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); | 1213 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); |
| 1247 | if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, | 1214 | if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, |
| @@ -1258,8 +1225,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, | |||
| 1258 | * idle state since the last call to dyntick_save_progress_counter() | 1225 | * idle state since the last call to dyntick_save_progress_counter() |
| 1259 | * for this same CPU, or by virtue of having been offline. | 1226 | * for this same CPU, or by virtue of having been offline. |
| 1260 | */ | 1227 | */ |
| 1261 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | 1228 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) |
| 1262 | bool *isidle, unsigned long *maxj) | ||
| 1263 | { | 1229 | { |
| 1264 | unsigned long jtsq; | 1230 | unsigned long jtsq; |
| 1265 | bool *rnhqp; | 1231 | bool *rnhqp; |
| @@ -1674,6 +1640,8 @@ void rcu_cpu_stall_reset(void) | |||
| 1674 | static unsigned long rcu_cbs_completed(struct rcu_state *rsp, | 1640 | static unsigned long rcu_cbs_completed(struct rcu_state *rsp, |
| 1675 | struct rcu_node *rnp) | 1641 | struct rcu_node *rnp) |
| 1676 | { | 1642 | { |
| 1643 | lockdep_assert_held(&rnp->lock); | ||
| 1644 | |||
| 1677 | /* | 1645 | /* |
| 1678 | * If RCU is idle, we just wait for the next grace period. | 1646 | * If RCU is idle, we just wait for the next grace period. |
| 1679 | * But we can only be sure that RCU is idle if we are looking | 1647 | * But we can only be sure that RCU is idle if we are looking |
| @@ -1719,6 +1687,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | |||
| 1719 | bool ret = false; | 1687 | bool ret = false; |
| 1720 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); | 1688 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); |
| 1721 | 1689 | ||
| 1690 | lockdep_assert_held(&rnp->lock); | ||
| 1691 | |||
| 1722 | /* | 1692 | /* |
| 1723 | * Pick up grace-period number for new callbacks. If this | 1693 | * Pick up grace-period number for new callbacks. If this |
| 1724 | * grace period is already marked as needed, return to the caller. | 1694 | * grace period is already marked as needed, return to the caller. |
| @@ -1845,6 +1815,8 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1845 | { | 1815 | { |
| 1846 | bool ret = false; | 1816 | bool ret = false; |
| 1847 | 1817 | ||
| 1818 | lockdep_assert_held(&rnp->lock); | ||
| 1819 | |||
| 1848 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ | 1820 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ |
| 1849 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) | 1821 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) |
| 1850 | return false; | 1822 | return false; |
| @@ -1883,6 +1855,8 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1883 | static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | 1855 | static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, |
| 1884 | struct rcu_data *rdp) | 1856 | struct rcu_data *rdp) |
| 1885 | { | 1857 | { |
| 1858 | lockdep_assert_held(&rnp->lock); | ||
| 1859 | |||
| 1886 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ | 1860 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ |
| 1887 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) | 1861 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) |
| 1888 | return false; | 1862 | return false; |
| @@ -1909,6 +1883,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1909 | bool ret; | 1883 | bool ret; |
| 1910 | bool need_gp; | 1884 | bool need_gp; |
| 1911 | 1885 | ||
| 1886 | lockdep_assert_held(&rnp->lock); | ||
| 1887 | |||
| 1912 | /* Handle the ends of any preceding grace periods first. */ | 1888 | /* Handle the ends of any preceding grace periods first. */ |
| 1913 | if (rdp->completed == rnp->completed && | 1889 | if (rdp->completed == rnp->completed && |
| 1914 | !unlikely(READ_ONCE(rdp->gpwrap))) { | 1890 | !unlikely(READ_ONCE(rdp->gpwrap))) { |
| @@ -2115,25 +2091,16 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) | |||
| 2115 | */ | 2091 | */ |
| 2116 | static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) | 2092 | static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) |
| 2117 | { | 2093 | { |
| 2118 | bool isidle = false; | ||
| 2119 | unsigned long maxj; | ||
| 2120 | struct rcu_node *rnp = rcu_get_root(rsp); | 2094 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 2121 | 2095 | ||
| 2122 | WRITE_ONCE(rsp->gp_activity, jiffies); | 2096 | WRITE_ONCE(rsp->gp_activity, jiffies); |
| 2123 | rsp->n_force_qs++; | 2097 | rsp->n_force_qs++; |
| 2124 | if (first_time) { | 2098 | if (first_time) { |
| 2125 | /* Collect dyntick-idle snapshots. */ | 2099 | /* Collect dyntick-idle snapshots. */ |
| 2126 | if (is_sysidle_rcu_state(rsp)) { | 2100 | force_qs_rnp(rsp, dyntick_save_progress_counter); |
| 2127 | isidle = true; | ||
| 2128 | maxj = jiffies - ULONG_MAX / 4; | ||
| 2129 | } | ||
| 2130 | force_qs_rnp(rsp, dyntick_save_progress_counter, | ||
| 2131 | &isidle, &maxj); | ||
| 2132 | rcu_sysidle_report_gp(rsp, isidle, maxj); | ||
| 2133 | } else { | 2101 | } else { |
| 2134 | /* Handle dyntick-idle and offline CPUs. */ | 2102 | /* Handle dyntick-idle and offline CPUs. */ |
| 2135 | isidle = true; | 2103 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs); |
| 2136 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); | ||
| 2137 | } | 2104 | } |
| 2138 | /* Clear flag to prevent immediate re-entry. */ | 2105 | /* Clear flag to prevent immediate re-entry. */ |
| 2139 | if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | 2106 | if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { |
| @@ -2341,6 +2308,7 @@ static bool | |||
| 2341 | rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | 2308 | rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, |
| 2342 | struct rcu_data *rdp) | 2309 | struct rcu_data *rdp) |
| 2343 | { | 2310 | { |
| 2311 | lockdep_assert_held(&rnp->lock); | ||
| 2344 | if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { | 2312 | if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { |
| 2345 | /* | 2313 | /* |
| 2346 | * Either we have not yet spawned the grace-period | 2314 | * Either we have not yet spawned the grace-period |
| @@ -2402,6 +2370,7 @@ static bool rcu_start_gp(struct rcu_state *rsp) | |||
| 2402 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | 2370 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) |
| 2403 | __releases(rcu_get_root(rsp)->lock) | 2371 | __releases(rcu_get_root(rsp)->lock) |
| 2404 | { | 2372 | { |
| 2373 | lockdep_assert_held(&rcu_get_root(rsp)->lock); | ||
| 2405 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | 2374 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); |
| 2406 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); | 2375 | WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); |
| 2407 | raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); | 2376 | raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); |
| @@ -2426,6 +2395,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
| 2426 | unsigned long oldmask = 0; | 2395 | unsigned long oldmask = 0; |
| 2427 | struct rcu_node *rnp_c; | 2396 | struct rcu_node *rnp_c; |
| 2428 | 2397 | ||
| 2398 | lockdep_assert_held(&rnp->lock); | ||
| 2399 | |||
| 2429 | /* Walk up the rcu_node hierarchy. */ | 2400 | /* Walk up the rcu_node hierarchy. */ |
| 2430 | for (;;) { | 2401 | for (;;) { |
| 2431 | if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { | 2402 | if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { |
| @@ -2486,6 +2457,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, | |||
| 2486 | unsigned long mask; | 2457 | unsigned long mask; |
| 2487 | struct rcu_node *rnp_p; | 2458 | struct rcu_node *rnp_p; |
| 2488 | 2459 | ||
| 2460 | lockdep_assert_held(&rnp->lock); | ||
| 2489 | if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || | 2461 | if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || |
| 2490 | rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { | 2462 | rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { |
| 2491 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 2463 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
| @@ -2599,6 +2571,8 @@ static void | |||
| 2599 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | 2571 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, |
| 2600 | struct rcu_node *rnp, struct rcu_data *rdp) | 2572 | struct rcu_node *rnp, struct rcu_data *rdp) |
| 2601 | { | 2573 | { |
| 2574 | lockdep_assert_held(&rsp->orphan_lock); | ||
| 2575 | |||
| 2602 | /* No-CBs CPUs do not have orphanable callbacks. */ | 2576 | /* No-CBs CPUs do not have orphanable callbacks. */ |
| 2603 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) | 2577 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) |
| 2604 | return; | 2578 | return; |
| @@ -2639,6 +2613,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) | |||
| 2639 | { | 2613 | { |
| 2640 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); | 2614 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); |
| 2641 | 2615 | ||
| 2616 | lockdep_assert_held(&rsp->orphan_lock); | ||
| 2617 | |||
| 2642 | /* No-CBs CPUs are handled specially. */ | 2618 | /* No-CBs CPUs are handled specially. */ |
| 2643 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || | 2619 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || |
| 2644 | rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) | 2620 | rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) |
| @@ -2705,6 +2681,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) | |||
| 2705 | long mask; | 2681 | long mask; |
| 2706 | struct rcu_node *rnp = rnp_leaf; | 2682 | struct rcu_node *rnp = rnp_leaf; |
| 2707 | 2683 | ||
| 2684 | lockdep_assert_held(&rnp->lock); | ||
| 2708 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || | 2685 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || |
| 2709 | rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) | 2686 | rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) |
| 2710 | return; | 2687 | return; |
| @@ -2895,10 +2872,7 @@ void rcu_check_callbacks(int user) | |||
| 2895 | * | 2872 | * |
| 2896 | * The caller must have suppressed start of new grace periods. | 2873 | * The caller must have suppressed start of new grace periods. |
| 2897 | */ | 2874 | */ |
| 2898 | static void force_qs_rnp(struct rcu_state *rsp, | 2875 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) |
| 2899 | int (*f)(struct rcu_data *rsp, bool *isidle, | ||
| 2900 | unsigned long *maxj), | ||
| 2901 | bool *isidle, unsigned long *maxj) | ||
| 2902 | { | 2876 | { |
| 2903 | int cpu; | 2877 | int cpu; |
| 2904 | unsigned long flags; | 2878 | unsigned long flags; |
| @@ -2937,7 +2911,7 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
| 2937 | for_each_leaf_node_possible_cpu(rnp, cpu) { | 2911 | for_each_leaf_node_possible_cpu(rnp, cpu) { |
| 2938 | unsigned long bit = leaf_node_cpu_bit(rnp, cpu); | 2912 | unsigned long bit = leaf_node_cpu_bit(rnp, cpu); |
| 2939 | if ((rnp->qsmask & bit) != 0) { | 2913 | if ((rnp->qsmask & bit) != 0) { |
| 2940 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) | 2914 | if (f(per_cpu_ptr(rsp->rda, cpu))) |
| 2941 | mask |= bit; | 2915 | mask |= bit; |
| 2942 | } | 2916 | } |
| 2943 | } | 2917 | } |
| @@ -3143,9 +3117,14 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, | |||
| 3143 | WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); | 3117 | WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); |
| 3144 | 3118 | ||
| 3145 | if (debug_rcu_head_queue(head)) { | 3119 | if (debug_rcu_head_queue(head)) { |
| 3146 | /* Probable double call_rcu(), so leak the callback. */ | 3120 | /* |
| 3121 | * Probable double call_rcu(), so leak the callback. | ||
| 3122 | * Use rcu:rcu_callback trace event to find the previous | ||
| 3123 | * time callback was passed to __call_rcu(). | ||
| 3124 | */ | ||
| 3125 | WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", | ||
| 3126 | head, head->func); | ||
| 3147 | WRITE_ONCE(head->func, rcu_leak_callback); | 3127 | WRITE_ONCE(head->func, rcu_leak_callback); |
| 3148 | WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); | ||
| 3149 | return; | 3128 | return; |
| 3150 | } | 3129 | } |
| 3151 | head->func = func; | 3130 | head->func = func; |
| @@ -3194,8 +3173,24 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, | |||
| 3194 | local_irq_restore(flags); | 3173 | local_irq_restore(flags); |
| 3195 | } | 3174 | } |
| 3196 | 3175 | ||
| 3197 | /* | 3176 | /** |
| 3198 | * Queue an RCU-sched callback for invocation after a grace period. | 3177 | * call_rcu_sched() - Queue an RCU for invocation after sched grace period. |
| 3178 | * @head: structure to be used for queueing the RCU updates. | ||
| 3179 | * @func: actual callback function to be invoked after the grace period | ||
| 3180 | * | ||
| 3181 | * The callback function will be invoked some time after a full grace | ||
| 3182 | * period elapses, in other words after all currently executing RCU | ||
| 3183 | * read-side critical sections have completed. call_rcu_sched() assumes | ||
| 3184 | * that the read-side critical sections end on enabling of preemption | ||
| 3185 | * or on voluntary preemption. | ||
| 3186 | * RCU read-side critical sections are delimited by : | ||
| 3187 | * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR | ||
| 3188 | * - anything that disables preemption. | ||
| 3189 | * | ||
| 3190 | * These may be nested. | ||
| 3191 | * | ||
| 3192 | * See the description of call_rcu() for more detailed information on | ||
| 3193 | * memory ordering guarantees. | ||
| 3199 | */ | 3194 | */ |
| 3200 | void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) | 3195 | void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) |
| 3201 | { | 3196 | { |
| @@ -3203,8 +3198,26 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) | |||
| 3203 | } | 3198 | } |
| 3204 | EXPORT_SYMBOL_GPL(call_rcu_sched); | 3199 | EXPORT_SYMBOL_GPL(call_rcu_sched); |
| 3205 | 3200 | ||
| 3206 | /* | 3201 | /** |
| 3207 | * Queue an RCU callback for invocation after a quicker grace period. | 3202 | * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. |
| 3203 | * @head: structure to be used for queueing the RCU updates. | ||
| 3204 | * @func: actual callback function to be invoked after the grace period | ||
| 3205 | * | ||
| 3206 | * The callback function will be invoked some time after a full grace | ||
| 3207 | * period elapses, in other words after all currently executing RCU | ||
| 3208 | * read-side critical sections have completed. call_rcu_bh() assumes | ||
| 3209 | * that the read-side critical sections end on completion of a softirq | ||
| 3210 | * handler. This means that read-side critical sections in process | ||
| 3211 | * context must not be interrupted by softirqs. This interface is to be | ||
| 3212 | * used when most of the read-side critical sections are in softirq context. | ||
| 3213 | * RCU read-side critical sections are delimited by : | ||
| 3214 | * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context. | ||
| 3215 | * OR | ||
| 3216 | * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. | ||
| 3217 | * These may be nested. | ||
| 3218 | * | ||
| 3219 | * See the description of call_rcu() for more detailed information on | ||
| 3220 | * memory ordering guarantees. | ||
| 3208 | */ | 3221 | */ |
| 3209 | void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) | 3222 | void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) |
| 3210 | { | 3223 | { |
| @@ -3280,12 +3293,6 @@ static inline int rcu_blocking_is_gp(void) | |||
| 3280 | * to have executed a full memory barrier during the execution of | 3293 | * to have executed a full memory barrier during the execution of |
| 3281 | * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but | 3294 | * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but |
| 3282 | * again only if the system has more than one CPU). | 3295 | * again only if the system has more than one CPU). |
| 3283 | * | ||
| 3284 | * This primitive provides the guarantees made by the (now removed) | ||
| 3285 | * synchronize_kernel() API. In contrast, synchronize_rcu() only | ||
| 3286 | * guarantees that rcu_read_lock() sections will have completed. | ||
| 3287 | * In "classic RCU", these two guarantees happen to be one and | ||
| 3288 | * the same, but can differ in realtime RCU implementations. | ||
| 3289 | */ | 3296 | */ |
| 3290 | void synchronize_sched(void) | 3297 | void synchronize_sched(void) |
| 3291 | { | 3298 | { |
| @@ -3578,8 +3585,14 @@ static void rcu_barrier_func(void *type) | |||
| 3578 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); | 3585 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); |
| 3579 | 3586 | ||
| 3580 | _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); | 3587 | _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); |
| 3581 | atomic_inc(&rsp->barrier_cpu_count); | 3588 | rdp->barrier_head.func = rcu_barrier_callback; |
| 3582 | rsp->call(&rdp->barrier_head, rcu_barrier_callback); | 3589 | debug_rcu_head_queue(&rdp->barrier_head); |
| 3590 | if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { | ||
| 3591 | atomic_inc(&rsp->barrier_cpu_count); | ||
| 3592 | } else { | ||
| 3593 | debug_rcu_head_unqueue(&rdp->barrier_head); | ||
| 3594 | _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence); | ||
| 3595 | } | ||
| 3583 | } | 3596 | } |
| 3584 | 3597 | ||
| 3585 | /* | 3598 | /* |
| @@ -3698,6 +3711,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) | |||
| 3698 | long mask; | 3711 | long mask; |
| 3699 | struct rcu_node *rnp = rnp_leaf; | 3712 | struct rcu_node *rnp = rnp_leaf; |
| 3700 | 3713 | ||
| 3714 | lockdep_assert_held(&rnp->lock); | ||
| 3701 | for (;;) { | 3715 | for (;;) { |
| 3702 | mask = rnp->grpmask; | 3716 | mask = rnp->grpmask; |
| 3703 | rnp = rnp->parent; | 3717 | rnp = rnp->parent; |
| @@ -3753,7 +3767,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 3753 | !init_nocb_callback_list(rdp)) | 3767 | !init_nocb_callback_list(rdp)) |
| 3754 | rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ | 3768 | rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ |
| 3755 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 3769 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
| 3756 | rcu_sysidle_init_percpu_data(rdp->dynticks); | ||
| 3757 | rcu_dynticks_eqs_online(); | 3770 | rcu_dynticks_eqs_online(); |
| 3758 | raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ | 3771 | raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ |
| 3759 | 3772 | ||
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ba38262c3554..9af0f31d6847 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
| @@ -45,14 +45,6 @@ struct rcu_dynticks { | |||
| 45 | bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ | 45 | bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ |
| 46 | unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ | 46 | unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ |
| 47 | bool rcu_urgent_qs; /* GP old need light quiescent state. */ | 47 | bool rcu_urgent_qs; /* GP old need light quiescent state. */ |
| 48 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
| 49 | long long dynticks_idle_nesting; | ||
| 50 | /* irq/process nesting level from idle. */ | ||
| 51 | atomic_t dynticks_idle; /* Even value for idle, else odd. */ | ||
| 52 | /* "Idle" excludes userspace execution. */ | ||
| 53 | unsigned long dynticks_idle_jiffies; | ||
| 54 | /* End of last non-NMI non-idle period. */ | ||
| 55 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
| 56 | #ifdef CONFIG_RCU_FAST_NO_HZ | 48 | #ifdef CONFIG_RCU_FAST_NO_HZ |
| 57 | bool all_lazy; /* Are all CPU's CBs lazy? */ | 49 | bool all_lazy; /* Are all CPU's CBs lazy? */ |
| 58 | unsigned long nonlazy_posted; | 50 | unsigned long nonlazy_posted; |
| @@ -160,19 +152,6 @@ struct rcu_node { | |||
| 160 | /* Number of tasks boosted for expedited GP. */ | 152 | /* Number of tasks boosted for expedited GP. */ |
| 161 | unsigned long n_normal_boosts; | 153 | unsigned long n_normal_boosts; |
| 162 | /* Number of tasks boosted for normal GP. */ | 154 | /* Number of tasks boosted for normal GP. */ |
| 163 | unsigned long n_balk_blkd_tasks; | ||
| 164 | /* Refused to boost: no blocked tasks. */ | ||
| 165 | unsigned long n_balk_exp_gp_tasks; | ||
| 166 | /* Refused to boost: nothing blocking GP. */ | ||
| 167 | unsigned long n_balk_boost_tasks; | ||
| 168 | /* Refused to boost: already boosting. */ | ||
| 169 | unsigned long n_balk_notblocked; | ||
| 170 | /* Refused to boost: RCU RS CS still running. */ | ||
| 171 | unsigned long n_balk_notyet; | ||
| 172 | /* Refused to boost: not yet time. */ | ||
| 173 | unsigned long n_balk_nos; | ||
| 174 | /* Refused to boost: not sure why, though. */ | ||
| 175 | /* This can happen due to race conditions. */ | ||
| 176 | #ifdef CONFIG_RCU_NOCB_CPU | 155 | #ifdef CONFIG_RCU_NOCB_CPU |
| 177 | struct swait_queue_head nocb_gp_wq[2]; | 156 | struct swait_queue_head nocb_gp_wq[2]; |
| 178 | /* Place for rcu_nocb_kthread() to wait GP. */ | 157 | /* Place for rcu_nocb_kthread() to wait GP. */ |
| @@ -312,9 +291,9 @@ struct rcu_data { | |||
| 312 | }; | 291 | }; |
| 313 | 292 | ||
| 314 | /* Values for nocb_defer_wakeup field in struct rcu_data. */ | 293 | /* Values for nocb_defer_wakeup field in struct rcu_data. */ |
| 315 | #define RCU_NOGP_WAKE_NOT 0 | 294 | #define RCU_NOCB_WAKE_NOT 0 |
| 316 | #define RCU_NOGP_WAKE 1 | 295 | #define RCU_NOCB_WAKE 1 |
| 317 | #define RCU_NOGP_WAKE_FORCE 2 | 296 | #define RCU_NOCB_WAKE_FORCE 2 |
| 318 | 297 | ||
| 319 | #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) | 298 | #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) |
| 320 | /* For jiffies_till_first_fqs and */ | 299 | /* For jiffies_till_first_fqs and */ |
| @@ -477,7 +456,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); | |||
| 477 | 456 | ||
| 478 | /* Forward declarations for rcutree_plugin.h */ | 457 | /* Forward declarations for rcutree_plugin.h */ |
| 479 | static void rcu_bootup_announce(void); | 458 | static void rcu_bootup_announce(void); |
| 480 | static void rcu_preempt_note_context_switch(void); | 459 | static void rcu_preempt_note_context_switch(bool preempt); |
| 481 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); | 460 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); |
| 482 | #ifdef CONFIG_HOTPLUG_CPU | 461 | #ifdef CONFIG_HOTPLUG_CPU |
| 483 | static bool rcu_preempt_has_tasks(struct rcu_node *rnp); | 462 | static bool rcu_preempt_has_tasks(struct rcu_node *rnp); |
| @@ -529,15 +508,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); | |||
| 529 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | 508 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ |
| 530 | static void __maybe_unused rcu_kick_nohz_cpu(int cpu); | 509 | static void __maybe_unused rcu_kick_nohz_cpu(int cpu); |
| 531 | static bool init_nocb_callback_list(struct rcu_data *rdp); | 510 | static bool init_nocb_callback_list(struct rcu_data *rdp); |
| 532 | static void rcu_sysidle_enter(int irq); | ||
| 533 | static void rcu_sysidle_exit(int irq); | ||
| 534 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
| 535 | unsigned long *maxj); | ||
| 536 | static bool is_sysidle_rcu_state(struct rcu_state *rsp); | ||
| 537 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
| 538 | unsigned long maxj); | ||
| 539 | static void rcu_bind_gp_kthread(void); | 511 | static void rcu_bind_gp_kthread(void); |
| 540 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); | ||
| 541 | static bool rcu_nohz_full_cpu(struct rcu_state *rsp); | 512 | static bool rcu_nohz_full_cpu(struct rcu_state *rsp); |
| 542 | static void rcu_dynticks_task_enter(void); | 513 | static void rcu_dynticks_task_enter(void); |
| 543 | static void rcu_dynticks_task_exit(void); | 514 | static void rcu_dynticks_task_exit(void); |
| @@ -551,75 +522,3 @@ void srcu_offline_cpu(unsigned int cpu) { } | |||
| 551 | #endif /* #else #ifdef CONFIG_SRCU */ | 522 | #endif /* #else #ifdef CONFIG_SRCU */ |
| 552 | 523 | ||
| 553 | #endif /* #ifndef RCU_TREE_NONCORE */ | 524 | #endif /* #ifndef RCU_TREE_NONCORE */ |
| 554 | |||
| 555 | #ifdef CONFIG_RCU_TRACE | ||
| 556 | /* Read out queue lengths for tracing. */ | ||
| 557 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) | ||
| 558 | { | ||
| 559 | #ifdef CONFIG_RCU_NOCB_CPU | ||
| 560 | *ql = atomic_long_read(&rdp->nocb_q_count); | ||
| 561 | *qll = atomic_long_read(&rdp->nocb_q_count_lazy); | ||
| 562 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
| 563 | *ql = 0; | ||
| 564 | *qll = 0; | ||
| 565 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | ||
| 566 | } | ||
| 567 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
| 568 | |||
| 569 | /* | ||
| 570 | * Wrappers for the rcu_node::lock acquire and release. | ||
| 571 | * | ||
| 572 | * Because the rcu_nodes form a tree, the tree traversal locking will observe | ||
| 573 | * different lock values, this in turn means that an UNLOCK of one level | ||
| 574 | * followed by a LOCK of another level does not imply a full memory barrier; | ||
| 575 | * and most importantly transitivity is lost. | ||
| 576 | * | ||
| 577 | * In order to restore full ordering between tree levels, augment the regular | ||
| 578 | * lock acquire functions with smp_mb__after_unlock_lock(). | ||
| 579 | * | ||
| 580 | * As ->lock of struct rcu_node is a __private field, therefore one should use | ||
| 581 | * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. | ||
| 582 | */ | ||
| 583 | static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) | ||
| 584 | { | ||
| 585 | raw_spin_lock(&ACCESS_PRIVATE(rnp, lock)); | ||
| 586 | smp_mb__after_unlock_lock(); | ||
| 587 | } | ||
| 588 | |||
| 589 | static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp) | ||
| 590 | { | ||
| 591 | raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock)); | ||
| 592 | } | ||
| 593 | |||
| 594 | static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) | ||
| 595 | { | ||
| 596 | raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock)); | ||
| 597 | smp_mb__after_unlock_lock(); | ||
| 598 | } | ||
| 599 | |||
| 600 | static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp) | ||
| 601 | { | ||
| 602 | raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock)); | ||
| 603 | } | ||
| 604 | |||
| 605 | #define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ | ||
| 606 | do { \ | ||
| 607 | typecheck(unsigned long, flags); \ | ||
| 608 | raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \ | ||
| 609 | smp_mb__after_unlock_lock(); \ | ||
| 610 | } while (0) | ||
| 611 | |||
| 612 | #define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \ | ||
| 613 | do { \ | ||
| 614 | typecheck(unsigned long, flags); \ | ||
| 615 | raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ | ||
| 616 | } while (0) | ||
| 617 | |||
| 618 | static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) | ||
| 619 | { | ||
| 620 | bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock)); | ||
| 621 | |||
| 622 | if (locked) | ||
| 623 | smp_mb__after_unlock_lock(); | ||
| 624 | return locked; | ||
| 625 | } | ||
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e513b4ab1197..dd21ca47e4b4 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h | |||
| @@ -147,7 +147,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) | |||
| 147 | * | 147 | * |
| 148 | * Caller must hold the rcu_state's exp_mutex. | 148 | * Caller must hold the rcu_state's exp_mutex. |
| 149 | */ | 149 | */ |
| 150 | static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) | 150 | static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) |
| 151 | { | 151 | { |
| 152 | return rnp->exp_tasks == NULL && | 152 | return rnp->exp_tasks == NULL && |
| 153 | READ_ONCE(rnp->expmask) == 0; | 153 | READ_ONCE(rnp->expmask) == 0; |
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c9a48657512a..908b309d60d7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -70,7 +70,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ | |||
| 70 | static void __init rcu_bootup_announce_oddness(void) | 70 | static void __init rcu_bootup_announce_oddness(void) |
| 71 | { | 71 | { |
| 72 | if (IS_ENABLED(CONFIG_RCU_TRACE)) | 72 | if (IS_ENABLED(CONFIG_RCU_TRACE)) |
| 73 | pr_info("\tRCU debugfs-based tracing is enabled.\n"); | 73 | pr_info("\tRCU event tracing is enabled.\n"); |
| 74 | if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) || | 74 | if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) || |
| 75 | (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32)) | 75 | (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32)) |
| 76 | pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", | 76 | pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", |
| @@ -90,8 +90,32 @@ static void __init rcu_bootup_announce_oddness(void) | |||
| 90 | pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); | 90 | pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); |
| 91 | if (nr_cpu_ids != NR_CPUS) | 91 | if (nr_cpu_ids != NR_CPUS) |
| 92 | pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | 92 | pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); |
| 93 | if (IS_ENABLED(CONFIG_RCU_BOOST)) | 93 | #ifdef CONFIG_RCU_BOOST |
| 94 | pr_info("\tRCU kthread priority: %d.\n", kthread_prio); | 94 | pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY); |
| 95 | #endif | ||
| 96 | if (blimit != DEFAULT_RCU_BLIMIT) | ||
| 97 | pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit); | ||
| 98 | if (qhimark != DEFAULT_RCU_QHIMARK) | ||
| 99 | pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark); | ||
| 100 | if (qlowmark != DEFAULT_RCU_QLOMARK) | ||
| 101 | pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark); | ||
| 102 | if (jiffies_till_first_fqs != ULONG_MAX) | ||
| 103 | pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs); | ||
| 104 | if (jiffies_till_next_fqs != ULONG_MAX) | ||
| 105 | pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs); | ||
| 106 | if (rcu_kick_kthreads) | ||
| 107 | pr_info("\tKick kthreads if too-long grace period.\n"); | ||
| 108 | if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) | ||
| 109 | pr_info("\tRCU callback double-/use-after-free debug enabled.\n"); | ||
| 110 | if (gp_preinit_delay) | ||
| 111 | pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay); | ||
| 112 | if (gp_init_delay) | ||
| 113 | pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); | ||
| 114 | if (gp_cleanup_delay) | ||
| 115 | pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); | ||
| 116 | if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) | ||
| 117 | pr_info("\tRCU debug extended QS entry/exit.\n"); | ||
| 118 | rcupdate_announce_bootup_oddness(); | ||
| 95 | } | 119 | } |
| 96 | 120 | ||
| 97 | #ifdef CONFIG_PREEMPT_RCU | 121 | #ifdef CONFIG_PREEMPT_RCU |
| @@ -155,6 +179,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) | |||
| 155 | (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); | 179 | (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); |
| 156 | struct task_struct *t = current; | 180 | struct task_struct *t = current; |
| 157 | 181 | ||
| 182 | lockdep_assert_held(&rnp->lock); | ||
| 183 | |||
| 158 | /* | 184 | /* |
| 159 | * Decide where to queue the newly blocked task. In theory, | 185 | * Decide where to queue the newly blocked task. In theory, |
| 160 | * this could be an if-statement. In practice, when I tried | 186 | * this could be an if-statement. In practice, when I tried |
| @@ -263,6 +289,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) | |||
| 263 | */ | 289 | */ |
| 264 | static void rcu_preempt_qs(void) | 290 | static void rcu_preempt_qs(void) |
| 265 | { | 291 | { |
| 292 | RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_qs() invoked with preemption enabled!!!\n"); | ||
| 266 | if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { | 293 | if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { |
| 267 | trace_rcu_grace_period(TPS("rcu_preempt"), | 294 | trace_rcu_grace_period(TPS("rcu_preempt"), |
| 268 | __this_cpu_read(rcu_data_p->gpnum), | 295 | __this_cpu_read(rcu_data_p->gpnum), |
| @@ -286,12 +313,14 @@ static void rcu_preempt_qs(void) | |||
| 286 | * | 313 | * |
| 287 | * Caller must disable interrupts. | 314 | * Caller must disable interrupts. |
| 288 | */ | 315 | */ |
| 289 | static void rcu_preempt_note_context_switch(void) | 316 | static void rcu_preempt_note_context_switch(bool preempt) |
| 290 | { | 317 | { |
| 291 | struct task_struct *t = current; | 318 | struct task_struct *t = current; |
| 292 | struct rcu_data *rdp; | 319 | struct rcu_data *rdp; |
| 293 | struct rcu_node *rnp; | 320 | struct rcu_node *rnp; |
| 294 | 321 | ||
| 322 | RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n"); | ||
| 323 | WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); | ||
| 295 | if (t->rcu_read_lock_nesting > 0 && | 324 | if (t->rcu_read_lock_nesting > 0 && |
| 296 | !t->rcu_read_unlock_special.b.blocked) { | 325 | !t->rcu_read_unlock_special.b.blocked) { |
| 297 | 326 | ||
| @@ -607,6 +636,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) | |||
| 607 | */ | 636 | */ |
| 608 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | 637 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) |
| 609 | { | 638 | { |
| 639 | RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); | ||
| 610 | WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); | 640 | WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); |
| 611 | if (rcu_preempt_has_tasks(rnp)) | 641 | if (rcu_preempt_has_tasks(rnp)) |
| 612 | rnp->gp_tasks = rnp->blkd_tasks.next; | 642 | rnp->gp_tasks = rnp->blkd_tasks.next; |
| @@ -643,8 +673,37 @@ static void rcu_preempt_do_callbacks(void) | |||
| 643 | 673 | ||
| 644 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 674 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 645 | 675 | ||
| 646 | /* | 676 | /** |
| 647 | * Queue a preemptible-RCU callback for invocation after a grace period. | 677 | * call_rcu() - Queue an RCU callback for invocation after a grace period. |
| 678 | * @head: structure to be used for queueing the RCU updates. | ||
| 679 | * @func: actual callback function to be invoked after the grace period | ||
| 680 | * | ||
| 681 | * The callback function will be invoked some time after a full grace | ||
| 682 | * period elapses, in other words after all pre-existing RCU read-side | ||
| 683 | * critical sections have completed. However, the callback function | ||
| 684 | * might well execute concurrently with RCU read-side critical sections | ||
| 685 | * that started after call_rcu() was invoked. RCU read-side critical | ||
| 686 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
| 687 | * and may be nested. | ||
| 688 | * | ||
| 689 | * Note that all CPUs must agree that the grace period extended beyond | ||
| 690 | * all pre-existing RCU read-side critical section. On systems with more | ||
| 691 | * than one CPU, this means that when "func()" is invoked, each CPU is | ||
| 692 | * guaranteed to have executed a full memory barrier since the end of its | ||
| 693 | * last RCU read-side critical section whose beginning preceded the call | ||
| 694 | * to call_rcu(). It also means that each CPU executing an RCU read-side | ||
| 695 | * critical section that continues beyond the start of "func()" must have | ||
| 696 | * executed a memory barrier after the call_rcu() but before the beginning | ||
| 697 | * of that RCU read-side critical section. Note that these guarantees | ||
| 698 | * include CPUs that are offline, idle, or executing in user mode, as | ||
| 699 | * well as CPUs that are executing in the kernel. | ||
| 700 | * | ||
| 701 | * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the | ||
| 702 | * resulting RCU callback function "func()", then both CPU A and CPU B are | ||
| 703 | * guaranteed to execute a full memory barrier during the time interval | ||
| 704 | * between the call to call_rcu() and the invocation of "func()" -- even | ||
| 705 | * if CPU A and CPU B are the same CPU (but again only if the system has | ||
| 706 | * more than one CPU). | ||
| 648 | */ | 707 | */ |
| 649 | void call_rcu(struct rcu_head *head, rcu_callback_t func) | 708 | void call_rcu(struct rcu_head *head, rcu_callback_t func) |
| 650 | { | 709 | { |
| @@ -663,8 +722,13 @@ EXPORT_SYMBOL_GPL(call_rcu); | |||
| 663 | * synchronize_rcu() was waiting. RCU read-side critical sections are | 722 | * synchronize_rcu() was waiting. RCU read-side critical sections are |
| 664 | * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. | 723 | * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. |
| 665 | * | 724 | * |
| 666 | * See the description of synchronize_sched() for more detailed information | 725 | * See the description of synchronize_sched() for more detailed |
| 667 | * on memory ordering guarantees. | 726 | * information on memory-ordering guarantees. However, please note |
| 727 | * that -only- the memory-ordering guarantees apply. For example, | ||
| 728 | * synchronize_rcu() is -not- guaranteed to wait on things like code | ||
| 729 | * protected by preempt_disable(), instead, synchronize_rcu() is -only- | ||
| 730 | * guaranteed to wait on RCU read-side critical sections, that is, sections | ||
| 731 | * of code protected by rcu_read_lock(). | ||
| 668 | */ | 732 | */ |
| 669 | void synchronize_rcu(void) | 733 | void synchronize_rcu(void) |
| 670 | { | 734 | { |
| @@ -738,7 +802,7 @@ static void __init rcu_bootup_announce(void) | |||
| 738 | * Because preemptible RCU does not exist, we never have to check for | 802 | * Because preemptible RCU does not exist, we never have to check for |
| 739 | * CPUs being in quiescent states. | 803 | * CPUs being in quiescent states. |
| 740 | */ | 804 | */ |
| 741 | static void rcu_preempt_note_context_switch(void) | 805 | static void rcu_preempt_note_context_switch(bool preempt) |
| 742 | { | 806 | { |
| 743 | } | 807 | } |
| 744 | 808 | ||
| @@ -835,33 +899,6 @@ void exit_rcu(void) | |||
| 835 | 899 | ||
| 836 | #include "../locking/rtmutex_common.h" | 900 | #include "../locking/rtmutex_common.h" |
| 837 | 901 | ||
| 838 | #ifdef CONFIG_RCU_TRACE | ||
| 839 | |||
| 840 | static void rcu_initiate_boost_trace(struct rcu_node *rnp) | ||
| 841 | { | ||
| 842 | if (!rcu_preempt_has_tasks(rnp)) | ||
| 843 | rnp->n_balk_blkd_tasks++; | ||
| 844 | else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) | ||
| 845 | rnp->n_balk_exp_gp_tasks++; | ||
| 846 | else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) | ||
| 847 | rnp->n_balk_boost_tasks++; | ||
| 848 | else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) | ||
| 849 | rnp->n_balk_notblocked++; | ||
| 850 | else if (rnp->gp_tasks != NULL && | ||
| 851 | ULONG_CMP_LT(jiffies, rnp->boost_time)) | ||
| 852 | rnp->n_balk_notyet++; | ||
| 853 | else | ||
| 854 | rnp->n_balk_nos++; | ||
| 855 | } | ||
| 856 | |||
| 857 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
| 858 | |||
| 859 | static void rcu_initiate_boost_trace(struct rcu_node *rnp) | ||
| 860 | { | ||
| 861 | } | ||
| 862 | |||
| 863 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
| 864 | |||
| 865 | static void rcu_wake_cond(struct task_struct *t, int status) | 902 | static void rcu_wake_cond(struct task_struct *t, int status) |
| 866 | { | 903 | { |
| 867 | /* | 904 | /* |
| @@ -992,8 +1029,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | |||
| 992 | { | 1029 | { |
| 993 | struct task_struct *t; | 1030 | struct task_struct *t; |
| 994 | 1031 | ||
| 1032 | lockdep_assert_held(&rnp->lock); | ||
| 995 | if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { | 1033 | if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { |
| 996 | rnp->n_balk_exp_gp_tasks++; | ||
| 997 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 1034 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
| 998 | return; | 1035 | return; |
| 999 | } | 1036 | } |
| @@ -1009,7 +1046,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | |||
| 1009 | if (t) | 1046 | if (t) |
| 1010 | rcu_wake_cond(t, rnp->boost_kthread_status); | 1047 | rcu_wake_cond(t, rnp->boost_kthread_status); |
| 1011 | } else { | 1048 | } else { |
| 1012 | rcu_initiate_boost_trace(rnp); | ||
| 1013 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 1049 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
| 1014 | } | 1050 | } |
| 1015 | } | 1051 | } |
| @@ -1260,8 +1296,7 @@ static void rcu_prepare_kthreads(int cpu) | |||
| 1260 | int rcu_needs_cpu(u64 basemono, u64 *nextevt) | 1296 | int rcu_needs_cpu(u64 basemono, u64 *nextevt) |
| 1261 | { | 1297 | { |
| 1262 | *nextevt = KTIME_MAX; | 1298 | *nextevt = KTIME_MAX; |
| 1263 | return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) | 1299 | return rcu_cpu_has_callbacks(NULL); |
| 1264 | ? 0 : rcu_cpu_has_callbacks(NULL); | ||
| 1265 | } | 1300 | } |
| 1266 | 1301 | ||
| 1267 | /* | 1302 | /* |
| @@ -1372,10 +1407,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) | |||
| 1372 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | 1407 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
| 1373 | unsigned long dj; | 1408 | unsigned long dj; |
| 1374 | 1409 | ||
| 1375 | if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) { | 1410 | RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!"); |
| 1376 | *nextevt = KTIME_MAX; | ||
| 1377 | return 0; | ||
| 1378 | } | ||
| 1379 | 1411 | ||
| 1380 | /* Snapshot to detect later posting of non-lazy callback. */ | 1412 | /* Snapshot to detect later posting of non-lazy callback. */ |
| 1381 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; | 1413 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; |
| @@ -1424,8 +1456,8 @@ static void rcu_prepare_for_idle(void) | |||
| 1424 | struct rcu_state *rsp; | 1456 | struct rcu_state *rsp; |
| 1425 | int tne; | 1457 | int tne; |
| 1426 | 1458 | ||
| 1427 | if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || | 1459 | RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!"); |
| 1428 | rcu_is_nocb_cpu(smp_processor_id())) | 1460 | if (rcu_is_nocb_cpu(smp_processor_id())) |
| 1429 | return; | 1461 | return; |
| 1430 | 1462 | ||
| 1431 | /* Handle nohz enablement switches conservatively. */ | 1463 | /* Handle nohz enablement switches conservatively. */ |
| @@ -1479,8 +1511,8 @@ static void rcu_prepare_for_idle(void) | |||
| 1479 | */ | 1511 | */ |
| 1480 | static void rcu_cleanup_after_idle(void) | 1512 | static void rcu_cleanup_after_idle(void) |
| 1481 | { | 1513 | { |
| 1482 | if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || | 1514 | RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!"); |
| 1483 | rcu_is_nocb_cpu(smp_processor_id())) | 1515 | if (rcu_is_nocb_cpu(smp_processor_id())) |
| 1484 | return; | 1516 | return; |
| 1485 | if (rcu_try_advance_all_cbs()) | 1517 | if (rcu_try_advance_all_cbs()) |
| 1486 | invoke_rcu_core(); | 1518 | invoke_rcu_core(); |
| @@ -1747,7 +1779,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) | |||
| 1747 | init_swait_queue_head(&rnp->nocb_gp_wq[1]); | 1779 | init_swait_queue_head(&rnp->nocb_gp_wq[1]); |
| 1748 | } | 1780 | } |
| 1749 | 1781 | ||
| 1750 | #ifndef CONFIG_RCU_NOCB_CPU_ALL | ||
| 1751 | /* Is the specified CPU a no-CBs CPU? */ | 1782 | /* Is the specified CPU a no-CBs CPU? */ |
| 1752 | bool rcu_is_nocb_cpu(int cpu) | 1783 | bool rcu_is_nocb_cpu(int cpu) |
| 1753 | { | 1784 | { |
| @@ -1755,7 +1786,6 @@ bool rcu_is_nocb_cpu(int cpu) | |||
| 1755 | return cpumask_test_cpu(cpu, rcu_nocb_mask); | 1786 | return cpumask_test_cpu(cpu, rcu_nocb_mask); |
| 1756 | return false; | 1787 | return false; |
| 1757 | } | 1788 | } |
| 1758 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ | ||
| 1759 | 1789 | ||
| 1760 | /* | 1790 | /* |
| 1761 | * Kick the leader kthread for this NOCB group. | 1791 | * Kick the leader kthread for this NOCB group. |
| @@ -1769,6 +1799,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) | |||
| 1769 | if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { | 1799 | if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { |
| 1770 | /* Prior smp_mb__after_atomic() orders against prior enqueue. */ | 1800 | /* Prior smp_mb__after_atomic() orders against prior enqueue. */ |
| 1771 | WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); | 1801 | WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); |
| 1802 | smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ | ||
| 1772 | swake_up(&rdp_leader->nocb_wq); | 1803 | swake_up(&rdp_leader->nocb_wq); |
| 1773 | } | 1804 | } |
| 1774 | } | 1805 | } |
| @@ -1860,7 +1891,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
| 1860 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 1891 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
| 1861 | TPS("WakeEmpty")); | 1892 | TPS("WakeEmpty")); |
| 1862 | } else { | 1893 | } else { |
| 1863 | WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE); | 1894 | WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE); |
| 1864 | /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ | 1895 | /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ |
| 1865 | smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); | 1896 | smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); |
| 1866 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 1897 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
| @@ -1874,7 +1905,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
| 1874 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 1905 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
| 1875 | TPS("WakeOvf")); | 1906 | TPS("WakeOvf")); |
| 1876 | } else { | 1907 | } else { |
| 1877 | WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE); | 1908 | WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE); |
| 1878 | /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ | 1909 | /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ |
| 1879 | smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); | 1910 | smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); |
| 1880 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | 1911 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, |
| @@ -2023,6 +2054,7 @@ wait_again: | |||
| 2023 | * nocb_gp_head, where they await a grace period. | 2054 | * nocb_gp_head, where they await a grace period. |
| 2024 | */ | 2055 | */ |
| 2025 | gotcbs = false; | 2056 | gotcbs = false; |
| 2057 | smp_mb(); /* wakeup before ->nocb_head reads. */ | ||
| 2026 | for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { | 2058 | for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { |
| 2027 | rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); | 2059 | rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); |
| 2028 | if (!rdp->nocb_gp_head) | 2060 | if (!rdp->nocb_gp_head) |
| @@ -2201,8 +2233,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) | |||
| 2201 | if (!rcu_nocb_need_deferred_wakeup(rdp)) | 2233 | if (!rcu_nocb_need_deferred_wakeup(rdp)) |
| 2202 | return; | 2234 | return; |
| 2203 | ndw = READ_ONCE(rdp->nocb_defer_wakeup); | 2235 | ndw = READ_ONCE(rdp->nocb_defer_wakeup); |
| 2204 | WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_NOT); | 2236 | WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); |
| 2205 | wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE); | 2237 | wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE); |
| 2206 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); | 2238 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); |
| 2207 | } | 2239 | } |
| 2208 | 2240 | ||
| @@ -2212,10 +2244,6 @@ void __init rcu_init_nohz(void) | |||
| 2212 | bool need_rcu_nocb_mask = true; | 2244 | bool need_rcu_nocb_mask = true; |
| 2213 | struct rcu_state *rsp; | 2245 | struct rcu_state *rsp; |
| 2214 | 2246 | ||
| 2215 | #ifdef CONFIG_RCU_NOCB_CPU_NONE | ||
| 2216 | need_rcu_nocb_mask = false; | ||
| 2217 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ | ||
| 2218 | |||
| 2219 | #if defined(CONFIG_NO_HZ_FULL) | 2247 | #if defined(CONFIG_NO_HZ_FULL) |
| 2220 | if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) | 2248 | if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) |
| 2221 | need_rcu_nocb_mask = true; | 2249 | need_rcu_nocb_mask = true; |
| @@ -2231,14 +2259,6 @@ void __init rcu_init_nohz(void) | |||
| 2231 | if (!have_rcu_nocb_mask) | 2259 | if (!have_rcu_nocb_mask) |
| 2232 | return; | 2260 | return; |
| 2233 | 2261 | ||
| 2234 | #ifdef CONFIG_RCU_NOCB_CPU_ZERO | ||
| 2235 | pr_info("\tOffload RCU callbacks from CPU 0\n"); | ||
| 2236 | cpumask_set_cpu(0, rcu_nocb_mask); | ||
| 2237 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ | ||
| 2238 | #ifdef CONFIG_RCU_NOCB_CPU_ALL | ||
| 2239 | pr_info("\tOffload RCU callbacks from all CPUs\n"); | ||
| 2240 | cpumask_copy(rcu_nocb_mask, cpu_possible_mask); | ||
| 2241 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ | ||
| 2242 | #if defined(CONFIG_NO_HZ_FULL) | 2262 | #if defined(CONFIG_NO_HZ_FULL) |
| 2243 | if (tick_nohz_full_running) | 2263 | if (tick_nohz_full_running) |
| 2244 | cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); | 2264 | cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); |
| @@ -2491,421 +2511,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu) | |||
| 2491 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ | 2511 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ |
| 2492 | } | 2512 | } |
| 2493 | 2513 | ||
| 2494 | |||
| 2495 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
| 2496 | |||
| 2497 | static int full_sysidle_state; /* Current system-idle state. */ | ||
| 2498 | #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ | ||
| 2499 | #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ | ||
| 2500 | #define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ | ||
| 2501 | #define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ | ||
| 2502 | #define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ | ||
| 2503 | |||
| 2504 | /* | ||
| 2505 | * Invoked to note exit from irq or task transition to idle. Note that | ||
| 2506 | * usermode execution does -not- count as idle here! After all, we want | ||
| 2507 | * to detect full-system idle states, not RCU quiescent states and grace | ||
| 2508 | * periods. The caller must have disabled interrupts. | ||
| 2509 | */ | ||
| 2510 | static void rcu_sysidle_enter(int irq) | ||
| 2511 | { | ||
| 2512 | unsigned long j; | ||
| 2513 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 2514 | |||
| 2515 | /* If there are no nohz_full= CPUs, no need to track this. */ | ||
| 2516 | if (!tick_nohz_full_enabled()) | ||
| 2517 | return; | ||
| 2518 | |||
| 2519 | /* Adjust nesting, check for fully idle. */ | ||
| 2520 | if (irq) { | ||
| 2521 | rdtp->dynticks_idle_nesting--; | ||
| 2522 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); | ||
| 2523 | if (rdtp->dynticks_idle_nesting != 0) | ||
| 2524 | return; /* Still not fully idle. */ | ||
| 2525 | } else { | ||
| 2526 | if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == | ||
| 2527 | DYNTICK_TASK_NEST_VALUE) { | ||
| 2528 | rdtp->dynticks_idle_nesting = 0; | ||
| 2529 | } else { | ||
| 2530 | rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; | ||
| 2531 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); | ||
| 2532 | return; /* Still not fully idle. */ | ||
| 2533 | } | ||
| 2534 | } | ||
| 2535 | |||
| 2536 | /* Record start of fully idle period. */ | ||
| 2537 | j = jiffies; | ||
| 2538 | WRITE_ONCE(rdtp->dynticks_idle_jiffies, j); | ||
| 2539 | smp_mb__before_atomic(); | ||
| 2540 | atomic_inc(&rdtp->dynticks_idle); | ||
| 2541 | smp_mb__after_atomic(); | ||
| 2542 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); | ||
| 2543 | } | ||
| 2544 | |||
| 2545 | /* | ||
| 2546 | * Unconditionally force exit from full system-idle state. This is | ||
| 2547 | * invoked when a normal CPU exits idle, but must be called separately | ||
| 2548 | * for the timekeeping CPU (tick_do_timer_cpu). The reason for this | ||
| 2549 | * is that the timekeeping CPU is permitted to take scheduling-clock | ||
| 2550 | * interrupts while the system is in system-idle state, and of course | ||
| 2551 | * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock | ||
| 2552 | * interrupt from any other type of interrupt. | ||
| 2553 | */ | ||
| 2554 | void rcu_sysidle_force_exit(void) | ||
| 2555 | { | ||
| 2556 | int oldstate = READ_ONCE(full_sysidle_state); | ||
| 2557 | int newoldstate; | ||
| 2558 | |||
| 2559 | /* | ||
| 2560 | * Each pass through the following loop attempts to exit full | ||
| 2561 | * system-idle state. If contention proves to be a problem, | ||
| 2562 | * a trylock-based contention tree could be used here. | ||
| 2563 | */ | ||
| 2564 | while (oldstate > RCU_SYSIDLE_SHORT) { | ||
| 2565 | newoldstate = cmpxchg(&full_sysidle_state, | ||
| 2566 | oldstate, RCU_SYSIDLE_NOT); | ||
| 2567 | if (oldstate == newoldstate && | ||
| 2568 | oldstate == RCU_SYSIDLE_FULL_NOTED) { | ||
| 2569 | rcu_kick_nohz_cpu(tick_do_timer_cpu); | ||
| 2570 | return; /* We cleared it, done! */ | ||
| 2571 | } | ||
| 2572 | oldstate = newoldstate; | ||
| 2573 | } | ||
| 2574 | smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ | ||
| 2575 | } | ||
| 2576 | |||
| 2577 | /* | ||
| 2578 | * Invoked to note entry to irq or task transition from idle. Note that | ||
| 2579 | * usermode execution does -not- count as idle here! The caller must | ||
| 2580 | * have disabled interrupts. | ||
| 2581 | */ | ||
| 2582 | static void rcu_sysidle_exit(int irq) | ||
| 2583 | { | ||
| 2584 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 2585 | |||
| 2586 | /* If there are no nohz_full= CPUs, no need to track this. */ | ||
| 2587 | if (!tick_nohz_full_enabled()) | ||
| 2588 | return; | ||
| 2589 | |||
| 2590 | /* Adjust nesting, check for already non-idle. */ | ||
| 2591 | if (irq) { | ||
| 2592 | rdtp->dynticks_idle_nesting++; | ||
| 2593 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); | ||
| 2594 | if (rdtp->dynticks_idle_nesting != 1) | ||
| 2595 | return; /* Already non-idle. */ | ||
| 2596 | } else { | ||
| 2597 | /* | ||
| 2598 | * Allow for irq misnesting. Yes, it really is possible | ||
| 2599 | * to enter an irq handler then never leave it, and maybe | ||
| 2600 | * also vice versa. Handle both possibilities. | ||
| 2601 | */ | ||
| 2602 | if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { | ||
| 2603 | rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; | ||
| 2604 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); | ||
| 2605 | return; /* Already non-idle. */ | ||
| 2606 | } else { | ||
| 2607 | rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
| 2608 | } | ||
| 2609 | } | ||
| 2610 | |||
| 2611 | /* Record end of idle period. */ | ||
| 2612 | smp_mb__before_atomic(); | ||
| 2613 | atomic_inc(&rdtp->dynticks_idle); | ||
| 2614 | smp_mb__after_atomic(); | ||
| 2615 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); | ||
| 2616 | |||
| 2617 | /* | ||
| 2618 | * If we are the timekeeping CPU, we are permitted to be non-idle | ||
| 2619 | * during a system-idle state. This must be the case, because | ||
| 2620 | * the timekeeping CPU has to take scheduling-clock interrupts | ||
| 2621 | * during the time that the system is transitioning to full | ||
| 2622 | * system-idle state. This means that the timekeeping CPU must | ||
| 2623 | * invoke rcu_sysidle_force_exit() directly if it does anything | ||
| 2624 | * more than take a scheduling-clock interrupt. | ||
| 2625 | */ | ||
| 2626 | if (smp_processor_id() == tick_do_timer_cpu) | ||
| 2627 | return; | ||
| 2628 | |||
| 2629 | /* Update system-idle state: We are clearly no longer fully idle! */ | ||
| 2630 | rcu_sysidle_force_exit(); | ||
| 2631 | } | ||
| 2632 | |||
| 2633 | /* | ||
| 2634 | * Check to see if the current CPU is idle. Note that usermode execution | ||
| 2635 | * does not count as idle. The caller must have disabled interrupts, | ||
| 2636 | * and must be running on tick_do_timer_cpu. | ||
| 2637 | */ | ||
| 2638 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
| 2639 | unsigned long *maxj) | ||
| 2640 | { | ||
| 2641 | int cur; | ||
| 2642 | unsigned long j; | ||
| 2643 | struct rcu_dynticks *rdtp = rdp->dynticks; | ||
| 2644 | |||
| 2645 | /* If there are no nohz_full= CPUs, don't check system-wide idleness. */ | ||
| 2646 | if (!tick_nohz_full_enabled()) | ||
| 2647 | return; | ||
| 2648 | |||
| 2649 | /* | ||
| 2650 | * If some other CPU has already reported non-idle, if this is | ||
| 2651 | * not the flavor of RCU that tracks sysidle state, or if this | ||
| 2652 | * is an offline or the timekeeping CPU, nothing to do. | ||
| 2653 | */ | ||
| 2654 | if (!*isidle || rdp->rsp != rcu_state_p || | ||
| 2655 | cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) | ||
| 2656 | return; | ||
| 2657 | /* Verify affinity of current kthread. */ | ||
| 2658 | WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); | ||
| 2659 | |||
| 2660 | /* Pick up current idle and NMI-nesting counter and check. */ | ||
| 2661 | cur = atomic_read(&rdtp->dynticks_idle); | ||
| 2662 | if (cur & 0x1) { | ||
| 2663 | *isidle = false; /* We are not idle! */ | ||
| 2664 | return; | ||
| 2665 | } | ||
| 2666 | smp_mb(); /* Read counters before timestamps. */ | ||
| 2667 | |||
| 2668 | /* Pick up timestamps. */ | ||
| 2669 | j = READ_ONCE(rdtp->dynticks_idle_jiffies); | ||
| 2670 | /* If this CPU entered idle more recently, update maxj timestamp. */ | ||
| 2671 | if (ULONG_CMP_LT(*maxj, j)) | ||
| 2672 | *maxj = j; | ||
| 2673 | } | ||
| 2674 | |||
| 2675 | /* | ||
| 2676 | * Is this the flavor of RCU that is handling full-system idle? | ||
| 2677 | */ | ||
| 2678 | static bool is_sysidle_rcu_state(struct rcu_state *rsp) | ||
| 2679 | { | ||
| 2680 | return rsp == rcu_state_p; | ||
| 2681 | } | ||
| 2682 | |||
| 2683 | /* | ||
| 2684 | * Return a delay in jiffies based on the number of CPUs, rcu_node | ||
| 2685 | * leaf fanout, and jiffies tick rate. The idea is to allow larger | ||
| 2686 | * systems more time to transition to full-idle state in order to | ||
| 2687 | * avoid the cache thrashing that otherwise occur on the state variable. | ||
| 2688 | * Really small systems (less than a couple of tens of CPUs) should | ||
| 2689 | * instead use a single global atomically incremented counter, and later | ||
| 2690 | * versions of this will automatically reconfigure themselves accordingly. | ||
| 2691 | */ | ||
| 2692 | static unsigned long rcu_sysidle_delay(void) | ||
| 2693 | { | ||
| 2694 | if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) | ||
| 2695 | return 0; | ||
| 2696 | return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); | ||
| 2697 | } | ||
| 2698 | |||
| 2699 | /* | ||
| 2700 | * Advance the full-system-idle state. This is invoked when all of | ||
| 2701 | * the non-timekeeping CPUs are idle. | ||
| 2702 | */ | ||
| 2703 | static void rcu_sysidle(unsigned long j) | ||
| 2704 | { | ||
| 2705 | /* Check the current state. */ | ||
| 2706 | switch (READ_ONCE(full_sysidle_state)) { | ||
| 2707 | case RCU_SYSIDLE_NOT: | ||
| 2708 | |||
| 2709 | /* First time all are idle, so note a short idle period. */ | ||
| 2710 | WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_SHORT); | ||
| 2711 | break; | ||
| 2712 | |||
| 2713 | case RCU_SYSIDLE_SHORT: | ||
| 2714 | |||
| 2715 | /* | ||
| 2716 | * Idle for a bit, time to advance to next state? | ||
| 2717 | * cmpxchg failure means race with non-idle, let them win. | ||
| 2718 | */ | ||
| 2719 | if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) | ||
| 2720 | (void)cmpxchg(&full_sysidle_state, | ||
| 2721 | RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); | ||
| 2722 | break; | ||
| 2723 | |||
| 2724 | case RCU_SYSIDLE_LONG: | ||
| 2725 | |||
| 2726 | /* | ||
| 2727 | * Do an additional check pass before advancing to full. | ||
| 2728 | * cmpxchg failure means race with non-idle, let them win. | ||
| 2729 | */ | ||
| 2730 | if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) | ||
| 2731 | (void)cmpxchg(&full_sysidle_state, | ||
| 2732 | RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); | ||
| 2733 | break; | ||
| 2734 | |||
| 2735 | default: | ||
| 2736 | break; | ||
| 2737 | } | ||
| 2738 | } | ||
| 2739 | |||
| 2740 | /* | ||
| 2741 | * Found a non-idle non-timekeeping CPU, so kick the system-idle state | ||
| 2742 | * back to the beginning. | ||
| 2743 | */ | ||
| 2744 | static void rcu_sysidle_cancel(void) | ||
| 2745 | { | ||
| 2746 | smp_mb(); | ||
| 2747 | if (full_sysidle_state > RCU_SYSIDLE_SHORT) | ||
| 2748 | WRITE_ONCE(full_sysidle_state, RCU_SYSIDLE_NOT); | ||
| 2749 | } | ||
| 2750 | |||
| 2751 | /* | ||
| 2752 | * Update the sysidle state based on the results of a force-quiescent-state | ||
| 2753 | * scan of the CPUs' dyntick-idle state. | ||
| 2754 | */ | ||
| 2755 | static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, | ||
| 2756 | unsigned long maxj, bool gpkt) | ||
| 2757 | { | ||
| 2758 | if (rsp != rcu_state_p) | ||
| 2759 | return; /* Wrong flavor, ignore. */ | ||
| 2760 | if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) | ||
| 2761 | return; /* Running state machine from timekeeping CPU. */ | ||
| 2762 | if (isidle) | ||
| 2763 | rcu_sysidle(maxj); /* More idle! */ | ||
| 2764 | else | ||
| 2765 | rcu_sysidle_cancel(); /* Idle is over. */ | ||
| 2766 | } | ||
| 2767 | |||
| 2768 | /* | ||
| 2769 | * Wrapper for rcu_sysidle_report() when called from the grace-period | ||
| 2770 | * kthread's context. | ||
| 2771 | */ | ||
| 2772 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
| 2773 | unsigned long maxj) | ||
| 2774 | { | ||
| 2775 | /* If there are no nohz_full= CPUs, no need to track this. */ | ||
| 2776 | if (!tick_nohz_full_enabled()) | ||
| 2777 | return; | ||
| 2778 | |||
| 2779 | rcu_sysidle_report(rsp, isidle, maxj, true); | ||
| 2780 | } | ||
| 2781 | |||
| 2782 | /* Callback and function for forcing an RCU grace period. */ | ||
| 2783 | struct rcu_sysidle_head { | ||
| 2784 | struct rcu_head rh; | ||
| 2785 | int inuse; | ||
| 2786 | }; | ||
| 2787 | |||
| 2788 | static void rcu_sysidle_cb(struct rcu_head *rhp) | ||
| 2789 | { | ||
| 2790 | struct rcu_sysidle_head *rshp; | ||
| 2791 | |||
| 2792 | /* | ||
| 2793 | * The following memory barrier is needed to replace the | ||
| 2794 | * memory barriers that would normally be in the memory | ||
| 2795 | * allocator. | ||
| 2796 | */ | ||
| 2797 | smp_mb(); /* grace period precedes setting inuse. */ | ||
| 2798 | |||
| 2799 | rshp = container_of(rhp, struct rcu_sysidle_head, rh); | ||
| 2800 | WRITE_ONCE(rshp->inuse, 0); | ||
| 2801 | } | ||
| 2802 | |||
| 2803 | /* | ||
| 2804 | * Check to see if the system is fully idle, other than the timekeeping CPU. | ||
| 2805 | * The caller must have disabled interrupts. This is not intended to be | ||
| 2806 | * called unless tick_nohz_full_enabled(). | ||
| 2807 | */ | ||
| 2808 | bool rcu_sys_is_idle(void) | ||
| 2809 | { | ||
| 2810 | static struct rcu_sysidle_head rsh; | ||
| 2811 | int rss = READ_ONCE(full_sysidle_state); | ||
| 2812 | |||
| 2813 | if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) | ||
| 2814 | return false; | ||
| 2815 | |||
| 2816 | /* Handle small-system case by doing a full scan of CPUs. */ | ||
| 2817 | if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { | ||
| 2818 | int oldrss = rss - 1; | ||
| 2819 | |||
| 2820 | /* | ||
| 2821 | * One pass to advance to each state up to _FULL. | ||
| 2822 | * Give up if any pass fails to advance the state. | ||
| 2823 | */ | ||
| 2824 | while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { | ||
| 2825 | int cpu; | ||
| 2826 | bool isidle = true; | ||
| 2827 | unsigned long maxj = jiffies - ULONG_MAX / 4; | ||
| 2828 | struct rcu_data *rdp; | ||
| 2829 | |||
| 2830 | /* Scan all the CPUs looking for nonidle CPUs. */ | ||
| 2831 | for_each_possible_cpu(cpu) { | ||
| 2832 | rdp = per_cpu_ptr(rcu_state_p->rda, cpu); | ||
| 2833 | rcu_sysidle_check_cpu(rdp, &isidle, &maxj); | ||
| 2834 | if (!isidle) | ||
| 2835 | break; | ||
| 2836 | } | ||
| 2837 | rcu_sysidle_report(rcu_state_p, isidle, maxj, false); | ||
| 2838 | oldrss = rss; | ||
| 2839 | rss = READ_ONCE(full_sysidle_state); | ||
| 2840 | } | ||
| 2841 | } | ||
| 2842 | |||
| 2843 | /* If this is the first observation of an idle period, record it. */ | ||
| 2844 | if (rss == RCU_SYSIDLE_FULL) { | ||
| 2845 | rss = cmpxchg(&full_sysidle_state, | ||
| 2846 | RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); | ||
| 2847 | return rss == RCU_SYSIDLE_FULL; | ||
| 2848 | } | ||
| 2849 | |||
| 2850 | smp_mb(); /* ensure rss load happens before later caller actions. */ | ||
| 2851 | |||
| 2852 | /* If already fully idle, tell the caller (in case of races). */ | ||
| 2853 | if (rss == RCU_SYSIDLE_FULL_NOTED) | ||
| 2854 | return true; | ||
| 2855 | |||
| 2856 | /* | ||
| 2857 | * If we aren't there yet, and a grace period is not in flight, | ||
| 2858 | * initiate a grace period. Either way, tell the caller that | ||
| 2859 | * we are not there yet. We use an xchg() rather than an assignment | ||
| 2860 | * to make up for the memory barriers that would otherwise be | ||
| 2861 | * provided by the memory allocator. | ||
| 2862 | */ | ||
| 2863 | if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && | ||
| 2864 | !rcu_gp_in_progress(rcu_state_p) && | ||
| 2865 | !rsh.inuse && xchg(&rsh.inuse, 1) == 0) | ||
| 2866 | call_rcu(&rsh.rh, rcu_sysidle_cb); | ||
| 2867 | return false; | ||
| 2868 | } | ||
| 2869 | |||
| 2870 | /* | ||
| 2871 | * Initialize dynticks sysidle state for CPUs coming online. | ||
| 2872 | */ | ||
| 2873 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) | ||
| 2874 | { | ||
| 2875 | rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; | ||
| 2876 | } | ||
| 2877 | |||
| 2878 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
| 2879 | |||
| 2880 | static void rcu_sysidle_enter(int irq) | ||
| 2881 | { | ||
| 2882 | } | ||
| 2883 | |||
| 2884 | static void rcu_sysidle_exit(int irq) | ||
| 2885 | { | ||
| 2886 | } | ||
| 2887 | |||
| 2888 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
| 2889 | unsigned long *maxj) | ||
| 2890 | { | ||
| 2891 | } | ||
| 2892 | |||
| 2893 | static bool is_sysidle_rcu_state(struct rcu_state *rsp) | ||
| 2894 | { | ||
| 2895 | return false; | ||
| 2896 | } | ||
| 2897 | |||
| 2898 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
| 2899 | unsigned long maxj) | ||
| 2900 | { | ||
| 2901 | } | ||
| 2902 | |||
| 2903 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) | ||
| 2904 | { | ||
| 2905 | } | ||
| 2906 | |||
| 2907 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
| 2908 | |||
| 2909 | /* | 2514 | /* |
| 2910 | * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the | 2515 | * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the |
| 2911 | * grace-period kthread will do force_quiescent_state() processing? | 2516 | * grace-period kthread will do force_quiescent_state() processing? |
| @@ -2936,13 +2541,7 @@ static void rcu_bind_gp_kthread(void) | |||
| 2936 | 2541 | ||
| 2937 | if (!tick_nohz_full_enabled()) | 2542 | if (!tick_nohz_full_enabled()) |
| 2938 | return; | 2543 | return; |
| 2939 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
| 2940 | cpu = tick_do_timer_cpu; | ||
| 2941 | if (cpu >= 0 && cpu < nr_cpu_ids) | ||
| 2942 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
| 2943 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
| 2944 | housekeeping_affine(current); | 2544 | housekeeping_affine(current); |
| 2945 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
| 2946 | } | 2545 | } |
| 2947 | 2546 | ||
| 2948 | /* Record the current task on dyntick-idle entry. */ | 2547 | /* Record the current task on dyntick-idle entry. */ |
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c deleted file mode 100644 index 6cea17a1ea30..000000000000 --- a/kernel/rcu/tree_trace.c +++ /dev/null | |||
| @@ -1,494 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Read-Copy Update tracing for hierarchical implementation. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, you can access it online at | ||
| 16 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
| 17 | * | ||
| 18 | * Copyright IBM Corporation, 2008 | ||
| 19 | * Author: Paul E. McKenney | ||
| 20 | * | ||
| 21 | * Papers: http://www.rdrop.com/users/paulmck/RCU | ||
| 22 | * | ||
| 23 | * For detailed explanation of Read-Copy Update mechanism see - | ||
| 24 | * Documentation/RCU | ||
| 25 | * | ||
| 26 | */ | ||
| 27 | #include <linux/types.h> | ||
| 28 | #include <linux/kernel.h> | ||
| 29 | #include <linux/init.h> | ||
| 30 | #include <linux/spinlock.h> | ||
| 31 | #include <linux/smp.h> | ||
| 32 | #include <linux/rcupdate.h> | ||
| 33 | #include <linux/interrupt.h> | ||
| 34 | #include <linux/sched.h> | ||
| 35 | #include <linux/atomic.h> | ||
| 36 | #include <linux/bitops.h> | ||
| 37 | #include <linux/completion.h> | ||
| 38 | #include <linux/percpu.h> | ||
| 39 | #include <linux/notifier.h> | ||
| 40 | #include <linux/cpu.h> | ||
| 41 | #include <linux/mutex.h> | ||
| 42 | #include <linux/debugfs.h> | ||
| 43 | #include <linux/seq_file.h> | ||
| 44 | #include <linux/prefetch.h> | ||
| 45 | |||
| 46 | #define RCU_TREE_NONCORE | ||
| 47 | #include "tree.h" | ||
| 48 | #include "rcu.h" | ||
| 49 | |||
| 50 | static int r_open(struct inode *inode, struct file *file, | ||
| 51 | const struct seq_operations *op) | ||
| 52 | { | ||
| 53 | int ret = seq_open(file, op); | ||
| 54 | if (!ret) { | ||
| 55 | struct seq_file *m = (struct seq_file *)file->private_data; | ||
| 56 | m->private = inode->i_private; | ||
| 57 | } | ||
| 58 | return ret; | ||
| 59 | } | ||
| 60 | |||
| 61 | static void *r_start(struct seq_file *m, loff_t *pos) | ||
| 62 | { | ||
| 63 | struct rcu_state *rsp = (struct rcu_state *)m->private; | ||
| 64 | *pos = cpumask_next(*pos - 1, cpu_possible_mask); | ||
| 65 | if ((*pos) < nr_cpu_ids) | ||
| 66 | return per_cpu_ptr(rsp->rda, *pos); | ||
| 67 | return NULL; | ||
| 68 | } | ||
| 69 | |||
| 70 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) | ||
| 71 | { | ||
| 72 | (*pos)++; | ||
| 73 | return r_start(m, pos); | ||
| 74 | } | ||
| 75 | |||
| 76 | static void r_stop(struct seq_file *m, void *v) | ||
| 77 | { | ||
| 78 | } | ||
| 79 | |||
| 80 | static int show_rcubarrier(struct seq_file *m, void *v) | ||
| 81 | { | ||
| 82 | struct rcu_state *rsp = (struct rcu_state *)m->private; | ||
| 83 | seq_printf(m, "bcc: %d bseq: %lu\n", | ||
| 84 | atomic_read(&rsp->barrier_cpu_count), | ||
| 85 | rsp->barrier_sequence); | ||
| 86 | return 0; | ||
| 87 | } | ||
| 88 | |||
/* Open the per-flavor "rcubarrier" debugfs file (single-record output). */
static int rcubarrier_open(struct inode *inode, struct file *file)
{
	return single_open(file, show_rcubarrier, inode->i_private);
}

/* Read-only debugfs file operations for "rcubarrier". */
static const struct file_operations rcubarrier_fops = {
	.owner = THIS_MODULE,
	.open = rcubarrier_open,
	.read = seq_read,
	.llseek = no_llseek,
	.release = single_release,
};
| 101 | |||
| 102 | #ifdef CONFIG_RCU_BOOST | ||
| 103 | |||
| 104 | static char convert_kthread_status(unsigned int kthread_status) | ||
| 105 | { | ||
| 106 | if (kthread_status > RCU_KTHREAD_MAX) | ||
| 107 | return '?'; | ||
| 108 | return "SRWOY"[kthread_status]; | ||
| 109 | } | ||
| 110 | |||
| 111 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 112 | |||
/*
 * Print one line of per-CPU rcu_data diagnostics for the "rcudata"
 * debugfs file: grace-period numbers, quiescent-state status, dyntick
 * state, FQS counts, callback-list segment occupancy, boost-kthread
 * state (if CONFIG_RCU_BOOST), batch limit, and callback statistics.
 * CPUs that have never been online are skipped.
 */
static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
{
	long ql, qll;

	if (!rdp->beenonline)
		return;
	/* '!' after the CPU number flags a currently-offline CPU. */
	seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d",
		rdp->cpu,
		cpu_is_offline(rdp->cpu) ? '!' : ' ',
		ulong2long(rdp->completed), ulong2long(rdp->gpnum),
		rdp->cpu_no_qs.b.norm,
		rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu),
		rdp->core_needs_qs);
	seq_printf(m, " dt=%d/%llx/%d df=%lu",
		rcu_dynticks_snap(rdp->dynticks),
		rdp->dynticks->dynticks_nesting,
		rdp->dynticks->dynticks_nmi_nesting,
		rdp->dynticks_fqs);
	seq_printf(m, " of=%lu", rdp->offline_fqs);
	/* Fold no-CBs queue lengths into the segmented-list totals. */
	rcu_nocb_q_lengths(rdp, &ql, &qll);
	qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist);
	ql += rcu_segcblist_n_cbs(&rdp->cblist);
	/* One flag character per non-empty callback-list segment. */
	seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
		qll, ql,
		".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)],
		".R"[!rcu_segcblist_segempty(&rdp->cblist,
			RCU_NEXT_READY_TAIL)],
		".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)],
		".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]);
#ifdef CONFIG_RCU_BOOST
	seq_printf(m, " kt=%d/%c ktl=%x",
		per_cpu(rcu_cpu_has_work, rdp->cpu),
		convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
					rdp->cpu)),
		per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
#endif /* #ifdef CONFIG_RCU_BOOST */
	seq_printf(m, " b=%ld", rdp->blimit);
	seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n",
		rdp->n_cbs_invoked, rdp->n_nocbs_invoked,
		rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
}
| 154 | |||
/* seq_file ->show for "rcudata": v is the rcu_data from r_start()/r_next(). */
static int show_rcudata(struct seq_file *m, void *v)
{
	print_one_rcu_data(m, (struct rcu_data *)v);
	return 0;
}
| 160 | |||
| 161 | static const struct seq_operations rcudate_op = { | ||
| 162 | .start = r_start, | ||
| 163 | .next = r_next, | ||
| 164 | .stop = r_stop, | ||
| 165 | .show = show_rcudata, | ||
| 166 | }; | ||
| 167 | |||
| 168 | static int rcudata_open(struct inode *inode, struct file *file) | ||
| 169 | { | ||
| 170 | return r_open(inode, file, &rcudate_op); | ||
| 171 | } | ||
| 172 | |||
/* Read-only debugfs file operations for "rcudata" (multi-record seq_file). */
static const struct file_operations rcudata_fops = {
	.owner = THIS_MODULE,
	.open = rcudata_open,
	.read = seq_read,
	.llseek = no_llseek,
	.release = seq_release,
};
| 180 | |||
| 181 | static int show_rcuexp(struct seq_file *m, void *v) | ||
| 182 | { | ||
| 183 | int cpu; | ||
| 184 | struct rcu_state *rsp = (struct rcu_state *)m->private; | ||
| 185 | struct rcu_data *rdp; | ||
| 186 | unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; | ||
| 187 | |||
| 188 | for_each_possible_cpu(cpu) { | ||
| 189 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 190 | s0 += atomic_long_read(&rdp->exp_workdone0); | ||
| 191 | s1 += atomic_long_read(&rdp->exp_workdone1); | ||
| 192 | s2 += atomic_long_read(&rdp->exp_workdone2); | ||
| 193 | s3 += atomic_long_read(&rdp->exp_workdone3); | ||
| 194 | } | ||
| 195 | seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu enq=%d sc=%lu\n", | ||
| 196 | rsp->expedited_sequence, s0, s1, s2, s3, | ||
| 197 | atomic_read(&rsp->expedited_need_qs), | ||
| 198 | rsp->expedited_sequence / 2); | ||
| 199 | return 0; | ||
| 200 | } | ||
| 201 | |||
/* Open the per-flavor "rcuexp" debugfs file (single-record output). */
static int rcuexp_open(struct inode *inode, struct file *file)
{
	return single_open(file, show_rcuexp, inode->i_private);
}

/* Read-only debugfs file operations for "rcuexp". */
static const struct file_operations rcuexp_fops = {
	.owner = THIS_MODULE,
	.open = rcuexp_open,
	.read = seq_read,
	.llseek = no_llseek,
	.release = single_release,
};
| 214 | |||
| 215 | #ifdef CONFIG_RCU_BOOST | ||
| 216 | |||
/*
 * Print RCU priority-boosting diagnostics for one leaf rcu_node:
 * blocked/boost task-list state, boost-kthread status, boost counts,
 * timing (low 16 bits of jiffies vs. boost_time), and the "balk"
 * counters recording why boosting was declined.
 */
static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
{
	/* Flag characters: first string char if condition is 0, second if 1. */
	seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ",
		   rnp->grplo, rnp->grphi,
		   "T."[list_empty(&rnp->blkd_tasks)],
		   "N."[!rnp->gp_tasks],
		   "E."[!rnp->exp_tasks],
		   "B."[!rnp->boost_tasks],
		   convert_kthread_status(rnp->boost_kthread_status),
		   rnp->n_tasks_boosted, rnp->n_exp_boosts,
		   rnp->n_normal_boosts);
	seq_printf(m, "j=%04x bt=%04x\n",
		   (int)(jiffies & 0xffff),
		   (int)(rnp->boost_time & 0xffff));
	seq_printf(m, "    balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
		   rnp->n_balk_blkd_tasks,
		   rnp->n_balk_exp_gp_tasks,
		   rnp->n_balk_boost_tasks,
		   rnp->n_balk_notblocked,
		   rnp->n_balk_notyet,
		   rnp->n_balk_nos);
}
| 239 | |||
/*
 * seq_file show for "rcuboost": dump boosting state for every leaf
 * rcu_node of the preemptible-RCU flavor (the only flavor that boosts).
 */
static int show_rcu_node_boost(struct seq_file *m, void *unused)
{
	struct rcu_node *rnp;

	rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
		print_one_rcu_node_boost(m, rnp);
	return 0;
}
| 248 | |||
/* Open the "rcuboost" debugfs file (no per-flavor private data needed). */
static int rcu_node_boost_open(struct inode *inode, struct file *file)
{
	return single_open(file, show_rcu_node_boost, NULL);
}

/* Read-only debugfs file operations for "rcuboost". */
static const struct file_operations rcu_node_boost_fops = {
	.owner = THIS_MODULE,
	.open = rcu_node_boost_open,
	.read = seq_read,
	.llseek = no_llseek,
	.release = single_release,
};
| 261 | |||
| 262 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 263 | |||
/*
 * Print the state of one RCU flavor for the "rcuhier" debugfs file:
 * a summary line (grace-period numbers, GP state, force-quiescent-state
 * deadline and counts, orphaned-callback queue lengths) followed by one
 * entry per rcu_node, with a line break at each change of tree level.
 */
static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
{
	unsigned long gpnum;
	int level = 0;
	struct rcu_node *rnp;

	gpnum = rsp->gpnum;
	seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
		   ulong2long(rsp->completed), ulong2long(gpnum),
		   rsp->gp_state,
		   (long)(rsp->jiffies_force_qs - jiffies),
		   (int)(jiffies & 0xffff));
	seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
		   rsp->n_force_qs, rsp->n_force_qs_ngp,
		   rsp->n_force_qs - rsp->n_force_qs_ngp,
		   READ_ONCE(rsp->n_force_qs_lh),
		   rsp->orphan_done.len_lazy,
		   rsp->orphan_done.len);
	/* Walk the flat node array; rows of the tree share a level value. */
	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
		if (rnp->level != level) {
			seq_puts(m, "\n");
			level = rnp->level;
		}
		seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ",
			   rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext,
			   ".G"[rnp->gp_tasks != NULL],
			   ".E"[rnp->exp_tasks != NULL],
			   ".T"[!list_empty(&rnp->blkd_tasks)],
			   rnp->grplo, rnp->grphi, rnp->grpnum);
	}
	seq_puts(m, "\n");
}
| 296 | |||
| 297 | static int show_rcuhier(struct seq_file *m, void *v) | ||
| 298 | { | ||
| 299 | struct rcu_state *rsp = (struct rcu_state *)m->private; | ||
| 300 | print_one_rcu_state(m, rsp); | ||
| 301 | return 0; | ||
| 302 | } | ||
| 303 | |||
/* Open the per-flavor "rcuhier" debugfs file (single-record output). */
static int rcuhier_open(struct inode *inode, struct file *file)
{
	return single_open(file, show_rcuhier, inode->i_private);
}

/* Read-only debugfs file operations for "rcuhier". */
static const struct file_operations rcuhier_fops = {
	.owner = THIS_MODULE,
	.open = rcuhier_open,
	.read = seq_read,
	.llseek = no_llseek,
	.release = single_release,
};
| 316 | |||
| 317 | static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) | ||
| 318 | { | ||
| 319 | unsigned long flags; | ||
| 320 | unsigned long completed; | ||
| 321 | unsigned long gpnum; | ||
| 322 | unsigned long gpage; | ||
| 323 | unsigned long gpmax; | ||
| 324 | struct rcu_node *rnp = &rsp->node[0]; | ||
| 325 | |||
| 326 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | ||
| 327 | completed = READ_ONCE(rsp->completed); | ||
| 328 | gpnum = READ_ONCE(rsp->gpnum); | ||
| 329 | if (completed == gpnum) | ||
| 330 | gpage = 0; | ||
| 331 | else | ||
| 332 | gpage = jiffies - rsp->gp_start; | ||
| 333 | gpmax = rsp->gp_max; | ||
| 334 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 335 | seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n", | ||
| 336 | ulong2long(completed), ulong2long(gpnum), gpage, gpmax); | ||
| 337 | } | ||
| 338 | |||
/* seq_file show for "rcugp": flavor pointer supplied via single_open(). */
static int show_rcugp(struct seq_file *m, void *v)
{
	struct rcu_state *rsp = (struct rcu_state *)m->private;
	show_one_rcugp(m, rsp);
	return 0;
}

/* Open the per-flavor "rcugp" debugfs file (single-record output). */
static int rcugp_open(struct inode *inode, struct file *file)
{
	return single_open(file, show_rcugp, inode->i_private);
}

/* Read-only debugfs file operations for "rcugp". */
static const struct file_operations rcugp_fops = {
	.owner = THIS_MODULE,
	.open = rcugp_open,
	.read = seq_read,
	.llseek = no_llseek,
	.release = single_release,
};
| 358 | |||
| 359 | static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) | ||
| 360 | { | ||
| 361 | if (!rdp->beenonline) | ||
| 362 | return; | ||
| 363 | seq_printf(m, "%3d%cnp=%ld ", | ||
| 364 | rdp->cpu, | ||
| 365 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | ||
| 366 | rdp->n_rcu_pending); | ||
| 367 | seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", | ||
| 368 | rdp->n_rp_core_needs_qs, | ||
| 369 | rdp->n_rp_report_qs, | ||
| 370 | rdp->n_rp_cb_ready, | ||
| 371 | rdp->n_rp_cpu_needs_gp); | ||
| 372 | seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n", | ||
| 373 | rdp->n_rp_gp_completed, | ||
| 374 | rdp->n_rp_gp_started, | ||
| 375 | rdp->n_rp_nocb_defer_wakeup, | ||
| 376 | rdp->n_rp_need_nothing); | ||
| 377 | } | ||
| 378 | |||
/* seq_file ->show for "rcu_pending": v is the rcu_data from r_start(). */
static int show_rcu_pending(struct seq_file *m, void *v)
{
	print_one_rcu_pending(m, (struct rcu_data *)v);
	return 0;
}

/* seq_file operations for "rcu_pending": iterate over all possible CPUs. */
static const struct seq_operations rcu_pending_op = {
	.start = r_start,
	.next = r_next,
	.stop = r_stop,
	.show = show_rcu_pending,
};
| 391 | |||
/* Open the per-flavor "rcu_pending" debugfs file. */
static int rcu_pending_open(struct inode *inode, struct file *file)
{
	return r_open(inode, file, &rcu_pending_op);
}

/* Read-only debugfs file operations for "rcu_pending" (multi-record). */
static const struct file_operations rcu_pending_fops = {
	.owner = THIS_MODULE,
	.open = rcu_pending_open,
	.read = seq_read,
	.llseek = no_llseek,
	.release = seq_release,
};
| 404 | |||
| 405 | static int show_rcutorture(struct seq_file *m, void *unused) | ||
| 406 | { | ||
| 407 | seq_printf(m, "rcutorture test sequence: %lu %s\n", | ||
| 408 | rcutorture_testseq >> 1, | ||
| 409 | (rcutorture_testseq & 0x1) ? "(test in progress)" : ""); | ||
| 410 | seq_printf(m, "rcutorture update version number: %lu\n", | ||
| 411 | rcutorture_vernum); | ||
| 412 | return 0; | ||
| 413 | } | ||
| 414 | |||
/* Open the top-level "rcutorture" debugfs file (single-record output). */
static int rcutorture_open(struct inode *inode, struct file *file)
{
	return single_open(file, show_rcutorture, NULL);
}

/*
 * Read-only debugfs file operations for "rcutorture".
 * NOTE(review): uses seq_lseek where the sibling fops in this file use
 * no_llseek -- looks like an oversight rather than intent; confirm
 * before changing, since it alters userspace-visible seek behavior.
 */
static const struct file_operations rcutorture_fops = {
	.owner = THIS_MODULE,
	.open = rcutorture_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
| 427 | |||
/* Root of the "rcu" debugfs directory tree; NULL until init succeeds. */
static struct dentry *rcudir;

/*
 * Create the RCU tracing debugfs hierarchy: a top-level "rcu" directory
 * containing one subdirectory per RCU flavor (each with rcudata, rcuexp,
 * rcu_pending, rcubarrier, rcugp, rcuhier, and -- for the preemptible
 * flavor with CONFIG_RCU_BOOST -- rcuboost), plus a flavor-independent
 * "rcutorture" file.  On any failure the entire tree is torn down and a
 * nonzero value is returned.
 */
static int __init rcutree_trace_init(void)
{
	struct rcu_state *rsp;
	struct dentry *retval;
	struct dentry *rspdir;

	rcudir = debugfs_create_dir("rcu", NULL);
	if (!rcudir)
		goto free_out;

	for_each_rcu_flavor(rsp) {
		rspdir = debugfs_create_dir(rsp->name, rcudir);
		if (!rspdir)
			goto free_out;

		/* Per-flavor files get the rcu_state via inode->i_private. */
		retval = debugfs_create_file("rcudata", 0444,
				rspdir, rsp, &rcudata_fops);
		if (!retval)
			goto free_out;

		retval = debugfs_create_file("rcuexp", 0444,
				rspdir, rsp, &rcuexp_fops);
		if (!retval)
			goto free_out;

		retval = debugfs_create_file("rcu_pending", 0444,
				rspdir, rsp, &rcu_pending_fops);
		if (!retval)
			goto free_out;

		retval = debugfs_create_file("rcubarrier", 0444,
				rspdir, rsp, &rcubarrier_fops);
		if (!retval)
			goto free_out;

#ifdef CONFIG_RCU_BOOST
		/* Only the preemptible flavor does priority boosting. */
		if (rsp == &rcu_preempt_state) {
			retval = debugfs_create_file("rcuboost", 0444,
				rspdir, NULL, &rcu_node_boost_fops);
			if (!retval)
				goto free_out;
		}
#endif

		retval = debugfs_create_file("rcugp", 0444,
				rspdir, rsp, &rcugp_fops);
		if (!retval)
			goto free_out;

		retval = debugfs_create_file("rcuhier", 0444,
				rspdir, rsp, &rcuhier_fops);
		if (!retval)
			goto free_out;
	}

	retval = debugfs_create_file("rcutorture", 0444, rcudir,
						NULL, &rcutorture_fops);
	if (!retval)
		goto free_out;
	return 0;
free_out:
	/* debugfs_remove_recursive() tolerates a NULL/partial tree. */
	debugfs_remove_recursive(rcudir);
	return 1;
}
device_initcall(rcutree_trace_init);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 273e869ca21d..00e77c470017 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
| @@ -62,7 +62,9 @@ | |||
| 62 | #define MODULE_PARAM_PREFIX "rcupdate." | 62 | #define MODULE_PARAM_PREFIX "rcupdate." |
| 63 | 63 | ||
| 64 | #ifndef CONFIG_TINY_RCU | 64 | #ifndef CONFIG_TINY_RCU |
| 65 | extern int rcu_expedited; /* from sysctl */ | ||
| 65 | module_param(rcu_expedited, int, 0); | 66 | module_param(rcu_expedited, int, 0); |
| 67 | extern int rcu_normal; /* from sysctl */ | ||
| 66 | module_param(rcu_normal, int, 0); | 68 | module_param(rcu_normal, int, 0); |
| 67 | static int rcu_normal_after_boot; | 69 | static int rcu_normal_after_boot; |
| 68 | module_param(rcu_normal_after_boot, int, 0); | 70 | module_param(rcu_normal_after_boot, int, 0); |
| @@ -379,6 +381,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, | |||
| 379 | struct rcu_synchronize *rs_array) | 381 | struct rcu_synchronize *rs_array) |
| 380 | { | 382 | { |
| 381 | int i; | 383 | int i; |
| 384 | int j; | ||
| 382 | 385 | ||
| 383 | /* Initialize and register callbacks for each flavor specified. */ | 386 | /* Initialize and register callbacks for each flavor specified. */ |
| 384 | for (i = 0; i < n; i++) { | 387 | for (i = 0; i < n; i++) { |
| @@ -390,7 +393,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, | |||
| 390 | } | 393 | } |
| 391 | init_rcu_head_on_stack(&rs_array[i].head); | 394 | init_rcu_head_on_stack(&rs_array[i].head); |
| 392 | init_completion(&rs_array[i].completion); | 395 | init_completion(&rs_array[i].completion); |
| 393 | (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); | 396 | for (j = 0; j < i; j++) |
| 397 | if (crcu_array[j] == crcu_array[i]) | ||
| 398 | break; | ||
| 399 | if (j == i) | ||
| 400 | (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); | ||
| 394 | } | 401 | } |
| 395 | 402 | ||
| 396 | /* Wait for all callbacks to be invoked. */ | 403 | /* Wait for all callbacks to be invoked. */ |
| @@ -399,7 +406,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, | |||
| 399 | (crcu_array[i] == call_rcu || | 406 | (crcu_array[i] == call_rcu || |
| 400 | crcu_array[i] == call_rcu_bh)) | 407 | crcu_array[i] == call_rcu_bh)) |
| 401 | continue; | 408 | continue; |
| 402 | wait_for_completion(&rs_array[i].completion); | 409 | for (j = 0; j < i; j++) |
| 410 | if (crcu_array[j] == crcu_array[i]) | ||
| 411 | break; | ||
| 412 | if (j == i) | ||
| 413 | wait_for_completion(&rs_array[i].completion); | ||
| 403 | destroy_rcu_head_on_stack(&rs_array[i].head); | 414 | destroy_rcu_head_on_stack(&rs_array[i].head); |
| 404 | } | 415 | } |
| 405 | } | 416 | } |
| @@ -560,15 +571,30 @@ static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); | |||
| 560 | DEFINE_SRCU(tasks_rcu_exit_srcu); | 571 | DEFINE_SRCU(tasks_rcu_exit_srcu); |
| 561 | 572 | ||
| 562 | /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ | 573 | /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ |
| 563 | static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; | 574 | #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) |
| 575 | static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; | ||
| 564 | module_param(rcu_task_stall_timeout, int, 0644); | 576 | module_param(rcu_task_stall_timeout, int, 0644); |
| 565 | 577 | ||
| 566 | static void rcu_spawn_tasks_kthread(void); | 578 | static void rcu_spawn_tasks_kthread(void); |
| 567 | static struct task_struct *rcu_tasks_kthread_ptr; | 579 | static struct task_struct *rcu_tasks_kthread_ptr; |
| 568 | 580 | ||
| 569 | /* | 581 | /** |
| 570 | * Post an RCU-tasks callback. First call must be from process context | 582 | * call_rcu_tasks() - Queue an RCU for invocation task-based grace period |
| 571 | * after the scheduler if fully operational. | 583 | * @rhp: structure to be used for queueing the RCU updates. |
| 584 | * @func: actual callback function to be invoked after the grace period | ||
| 585 | * | ||
| 586 | * The callback function will be invoked some time after a full grace | ||
| 587 | * period elapses, in other words after all currently executing RCU | ||
| 588 | * read-side critical sections have completed. call_rcu_tasks() assumes | ||
| 589 | * that the read-side critical sections end at a voluntary context | ||
| 590 | * switch (not a preemption!), entry into idle, or transition to usermode | ||
| 591 | * execution. As such, there are no read-side primitives analogous to | ||
| 592 | * rcu_read_lock() and rcu_read_unlock() because this primitive is intended | ||
| 593 | * to determine that all tasks have passed through a safe state, not so | ||
| 594 | * much for data-strcuture synchronization. | ||
| 595 | * | ||
| 596 | * See the description of call_rcu() for more detailed information on | ||
| 597 | * memory ordering guarantees. | ||
| 572 | */ | 598 | */ |
| 573 | void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) | 599 | void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) |
| 574 | { | 600 | { |
| @@ -851,6 +877,23 @@ static void rcu_spawn_tasks_kthread(void) | |||
| 851 | 877 | ||
| 852 | #endif /* #ifdef CONFIG_TASKS_RCU */ | 878 | #endif /* #ifdef CONFIG_TASKS_RCU */ |
| 853 | 879 | ||
| 880 | #ifndef CONFIG_TINY_RCU | ||
| 881 | |||
| 882 | /* | ||
| 883 | * Print any non-default Tasks RCU settings. | ||
| 884 | */ | ||
| 885 | static void __init rcu_tasks_bootup_oddness(void) | ||
| 886 | { | ||
| 887 | #ifdef CONFIG_TASKS_RCU | ||
| 888 | if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT) | ||
| 889 | pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout); | ||
| 890 | else | ||
| 891 | pr_info("\tTasks RCU enabled.\n"); | ||
| 892 | #endif /* #ifdef CONFIG_TASKS_RCU */ | ||
| 893 | } | ||
| 894 | |||
| 895 | #endif /* #ifndef CONFIG_TINY_RCU */ | ||
| 896 | |||
| 854 | #ifdef CONFIG_PROVE_RCU | 897 | #ifdef CONFIG_PROVE_RCU |
| 855 | 898 | ||
| 856 | /* | 899 | /* |
| @@ -935,3 +978,25 @@ late_initcall(rcu_verify_early_boot_tests); | |||
| 935 | #else | 978 | #else |
| 936 | void rcu_early_boot_tests(void) {} | 979 | void rcu_early_boot_tests(void) {} |
| 937 | #endif /* CONFIG_PROVE_RCU */ | 980 | #endif /* CONFIG_PROVE_RCU */ |
| 981 | |||
| 982 | #ifndef CONFIG_TINY_RCU | ||
| 983 | |||
| 984 | /* | ||
| 985 | * Print any significant non-default boot-time settings. | ||
| 986 | */ | ||
| 987 | void __init rcupdate_announce_bootup_oddness(void) | ||
| 988 | { | ||
| 989 | if (rcu_normal) | ||
| 990 | pr_info("\tNo expedited grace period (rcu_normal).\n"); | ||
| 991 | else if (rcu_normal_after_boot) | ||
| 992 | pr_info("\tNo expedited grace period (rcu_normal_after_boot).\n"); | ||
| 993 | else if (rcu_expedited) | ||
| 994 | pr_info("\tAll grace periods are expedited (rcu_expedited).\n"); | ||
| 995 | if (rcu_cpu_stall_suppress) | ||
| 996 | pr_info("\tRCU CPU stall warnings suppressed (rcu_cpu_stall_suppress).\n"); | ||
| 997 | if (rcu_cpu_stall_timeout != CONFIG_RCU_CPU_STALL_TIMEOUT) | ||
| 998 | pr_info("\tRCU CPU stall warnings timeout set to %d (rcu_cpu_stall_timeout).\n", rcu_cpu_stall_timeout); | ||
| 999 | rcu_tasks_bootup_oddness(); | ||
| 1000 | } | ||
| 1001 | |||
| 1002 | #endif /* #ifndef CONFIG_TINY_RCU */ | ||
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 89ab6758667b..53f0164ed362 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
| @@ -16,9 +16,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | |||
| 16 | endif | 16 | endif |
| 17 | 17 | ||
| 18 | obj-y += core.o loadavg.o clock.o cputime.o | 18 | obj-y += core.o loadavg.o clock.o cputime.o |
| 19 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o | 19 | obj-y += idle_task.o fair.o rt.o deadline.o |
| 20 | obj-y += wait.o swait.o completion.o idle.o | 20 | obj-y += wait.o wait_bit.o swait.o completion.o idle.o |
| 21 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o | 21 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o |
| 22 | obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o | 22 | obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o |
| 23 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 23 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
| 24 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 24 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 00a45c45beca..ca0f8fc945c6 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c | |||
| @@ -64,6 +64,7 @@ | |||
| 64 | #include <linux/workqueue.h> | 64 | #include <linux/workqueue.h> |
| 65 | #include <linux/compiler.h> | 65 | #include <linux/compiler.h> |
| 66 | #include <linux/tick.h> | 66 | #include <linux/tick.h> |
| 67 | #include <linux/init.h> | ||
| 67 | 68 | ||
| 68 | /* | 69 | /* |
| 69 | * Scheduler clock - returns current time in nanosec units. | 70 | * Scheduler clock - returns current time in nanosec units. |
| @@ -124,14 +125,27 @@ int sched_clock_stable(void) | |||
| 124 | return static_branch_likely(&__sched_clock_stable); | 125 | return static_branch_likely(&__sched_clock_stable); |
| 125 | } | 126 | } |
| 126 | 127 | ||
| 128 | static void __scd_stamp(struct sched_clock_data *scd) | ||
| 129 | { | ||
| 130 | scd->tick_gtod = ktime_get_ns(); | ||
| 131 | scd->tick_raw = sched_clock(); | ||
| 132 | } | ||
| 133 | |||
| 127 | static void __set_sched_clock_stable(void) | 134 | static void __set_sched_clock_stable(void) |
| 128 | { | 135 | { |
| 129 | struct sched_clock_data *scd = this_scd(); | 136 | struct sched_clock_data *scd; |
| 130 | 137 | ||
| 131 | /* | 138 | /* |
| 139 | * Since we're still unstable and the tick is already running, we have | ||
| 140 | * to disable IRQs in order to get a consistent scd->tick* reading. | ||
| 141 | */ | ||
| 142 | local_irq_disable(); | ||
| 143 | scd = this_scd(); | ||
| 144 | /* | ||
| 132 | * Attempt to make the (initial) unstable->stable transition continuous. | 145 | * Attempt to make the (initial) unstable->stable transition continuous. |
| 133 | */ | 146 | */ |
| 134 | __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw); | 147 | __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw); |
| 148 | local_irq_enable(); | ||
| 135 | 149 | ||
| 136 | printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", | 150 | printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", |
| 137 | scd->tick_gtod, __gtod_offset, | 151 | scd->tick_gtod, __gtod_offset, |
| @@ -141,8 +155,38 @@ static void __set_sched_clock_stable(void) | |||
| 141 | tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); | 155 | tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); |
| 142 | } | 156 | } |
| 143 | 157 | ||
| 158 | /* | ||
| 159 | * If we ever get here, we're screwed, because we found out -- typically after | ||
| 160 | * the fact -- that TSC wasn't good. This means all our clocksources (including | ||
| 161 | * ktime) could have reported wrong values. | ||
| 162 | * | ||
| 163 | * What we do here is an attempt to fix up and continue sort of where we left | ||
| 164 | * off in a coherent manner. | ||
| 165 | * | ||
| 166 | * The only way to fully avoid random clock jumps is to boot with: | ||
| 167 | * "tsc=unstable". | ||
| 168 | */ | ||
| 144 | static void __sched_clock_work(struct work_struct *work) | 169 | static void __sched_clock_work(struct work_struct *work) |
| 145 | { | 170 | { |
| 171 | struct sched_clock_data *scd; | ||
| 172 | int cpu; | ||
| 173 | |||
| 174 | /* take a current timestamp and set 'now' */ | ||
| 175 | preempt_disable(); | ||
| 176 | scd = this_scd(); | ||
| 177 | __scd_stamp(scd); | ||
| 178 | scd->clock = scd->tick_gtod + __gtod_offset; | ||
| 179 | preempt_enable(); | ||
| 180 | |||
| 181 | /* clone to all CPUs */ | ||
| 182 | for_each_possible_cpu(cpu) | ||
| 183 | per_cpu(sched_clock_data, cpu) = *scd; | ||
| 184 | |||
| 185 | printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n"); | ||
| 186 | printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", | ||
| 187 | scd->tick_gtod, __gtod_offset, | ||
| 188 | scd->tick_raw, __sched_clock_offset); | ||
| 189 | |||
| 146 | static_branch_disable(&__sched_clock_stable); | 190 | static_branch_disable(&__sched_clock_stable); |
| 147 | } | 191 | } |
| 148 | 192 | ||
| @@ -150,27 +194,11 @@ static DECLARE_WORK(sched_clock_work, __sched_clock_work); | |||
| 150 | 194 | ||
| 151 | static void __clear_sched_clock_stable(void) | 195 | static void __clear_sched_clock_stable(void) |
| 152 | { | 196 | { |
| 153 | struct sched_clock_data *scd = this_scd(); | 197 | if (!sched_clock_stable()) |
| 154 | 198 | return; | |
| 155 | /* | ||
| 156 | * Attempt to make the stable->unstable transition continuous. | ||
| 157 | * | ||
| 158 | * Trouble is, this is typically called from the TSC watchdog | ||
| 159 | * timer, which is late per definition. This means the tick | ||
| 160 | * values can already be screwy. | ||
| 161 | * | ||
| 162 | * Still do what we can. | ||
| 163 | */ | ||
| 164 | __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod); | ||
| 165 | |||
| 166 | printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", | ||
| 167 | scd->tick_gtod, __gtod_offset, | ||
| 168 | scd->tick_raw, __sched_clock_offset); | ||
| 169 | 199 | ||
| 170 | tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); | 200 | tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); |
| 171 | 201 | schedule_work(&sched_clock_work); | |
| 172 | if (sched_clock_stable()) | ||
| 173 | schedule_work(&sched_clock_work); | ||
| 174 | } | 202 | } |
| 175 | 203 | ||
| 176 | void clear_sched_clock_stable(void) | 204 | void clear_sched_clock_stable(void) |
| @@ -183,7 +211,11 @@ void clear_sched_clock_stable(void) | |||
| 183 | __clear_sched_clock_stable(); | 211 | __clear_sched_clock_stable(); |
| 184 | } | 212 | } |
| 185 | 213 | ||
| 186 | void sched_clock_init_late(void) | 214 | /* |
| 215 | * We run this as late_initcall() such that it runs after all built-in drivers, | ||
| 216 | * notably: acpi_processor and intel_idle, which can mark the TSC as unstable. | ||
| 217 | */ | ||
| 218 | static int __init sched_clock_init_late(void) | ||
| 187 | { | 219 | { |
| 188 | sched_clock_running = 2; | 220 | sched_clock_running = 2; |
| 189 | /* | 221 | /* |
| @@ -197,7 +229,10 @@ void sched_clock_init_late(void) | |||
| 197 | 229 | ||
| 198 | if (__sched_clock_stable_early) | 230 | if (__sched_clock_stable_early) |
| 199 | __set_sched_clock_stable(); | 231 | __set_sched_clock_stable(); |
| 232 | |||
| 233 | return 0; | ||
| 200 | } | 234 | } |
| 235 | late_initcall(sched_clock_init_late); | ||
| 201 | 236 | ||
| 202 | /* | 237 | /* |
| 203 | * min, max except they take wrapping into account | 238 | * min, max except they take wrapping into account |
| @@ -347,21 +382,38 @@ void sched_clock_tick(void) | |||
| 347 | { | 382 | { |
| 348 | struct sched_clock_data *scd; | 383 | struct sched_clock_data *scd; |
| 349 | 384 | ||
| 385 | if (sched_clock_stable()) | ||
| 386 | return; | ||
| 387 | |||
| 388 | if (unlikely(!sched_clock_running)) | ||
| 389 | return; | ||
| 390 | |||
| 350 | WARN_ON_ONCE(!irqs_disabled()); | 391 | WARN_ON_ONCE(!irqs_disabled()); |
| 351 | 392 | ||
| 393 | scd = this_scd(); | ||
| 394 | __scd_stamp(scd); | ||
| 395 | sched_clock_local(scd); | ||
| 396 | } | ||
| 397 | |||
| 398 | void sched_clock_tick_stable(void) | ||
| 399 | { | ||
| 400 | u64 gtod, clock; | ||
| 401 | |||
| 402 | if (!sched_clock_stable()) | ||
| 403 | return; | ||
| 404 | |||
| 352 | /* | 405 | /* |
| 353 | * Update these values even if sched_clock_stable(), because it can | 406 | * Called under watchdog_lock. |
| 354 | * become unstable at any point in time at which point we need some | ||
| 355 | * values to fall back on. | ||
| 356 | * | 407 | * |
| 357 | * XXX arguably we can skip this if we expose tsc_clocksource_reliable | 408 | * The watchdog just found this TSC to (still) be stable, so now is a |
| 409 | * good moment to update our __gtod_offset. Because once we find the | ||
| 410 | * TSC to be unstable, any computation will be computing crap. | ||
| 358 | */ | 411 | */ |
| 359 | scd = this_scd(); | 412 | local_irq_disable(); |
| 360 | scd->tick_raw = sched_clock(); | 413 | gtod = ktime_get_ns(); |
| 361 | scd->tick_gtod = ktime_get_ns(); | 414 | clock = sched_clock(); |
| 362 | 415 | __gtod_offset = (clock + __sched_clock_offset) - gtod; | |
| 363 | if (!sched_clock_stable() && likely(sched_clock_running)) | 416 | local_irq_enable(); |
| 364 | sched_clock_local(scd); | ||
| 365 | } | 417 | } |
| 366 | 418 | ||
| 367 | /* | 419 | /* |
| @@ -374,15 +426,21 @@ void sched_clock_idle_sleep_event(void) | |||
| 374 | EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); | 426 | EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); |
| 375 | 427 | ||
| 376 | /* | 428 | /* |
| 377 | * We just idled delta nanoseconds (called with irqs disabled): | 429 | * We just idled; resync with ktime. |
| 378 | */ | 430 | */ |
| 379 | void sched_clock_idle_wakeup_event(u64 delta_ns) | 431 | void sched_clock_idle_wakeup_event(void) |
| 380 | { | 432 | { |
| 381 | if (timekeeping_suspended) | 433 | unsigned long flags; |
| 434 | |||
| 435 | if (sched_clock_stable()) | ||
| 436 | return; | ||
| 437 | |||
| 438 | if (unlikely(timekeeping_suspended)) | ||
| 382 | return; | 439 | return; |
| 383 | 440 | ||
| 441 | local_irq_save(flags); | ||
| 384 | sched_clock_tick(); | 442 | sched_clock_tick(); |
| 385 | touch_softlockup_watchdog_sched(); | 443 | local_irq_restore(flags); |
| 386 | } | 444 | } |
| 387 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | 445 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); |
| 388 | 446 | ||
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 53f9558fa925..13fc5ae9bf2f 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c | |||
| @@ -66,7 +66,7 @@ do_wait_for_common(struct completion *x, | |||
| 66 | if (!x->done) { | 66 | if (!x->done) { |
| 67 | DECLARE_WAITQUEUE(wait, current); | 67 | DECLARE_WAITQUEUE(wait, current); |
| 68 | 68 | ||
| 69 | __add_wait_queue_tail_exclusive(&x->wait, &wait); | 69 | __add_wait_queue_entry_tail_exclusive(&x->wait, &wait); |
| 70 | do { | 70 | do { |
| 71 | if (signal_pending_state(state, current)) { | 71 | if (signal_pending_state(state, current)) { |
| 72 | timeout = -ERESTARTSYS; | 72 | timeout = -ERESTARTSYS; |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 803c3bc274c4..17c667b427b4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -10,6 +10,7 @@ | |||
| 10 | #include <uapi/linux/sched/types.h> | 10 | #include <uapi/linux/sched/types.h> |
| 11 | #include <linux/sched/loadavg.h> | 11 | #include <linux/sched/loadavg.h> |
| 12 | #include <linux/sched/hotplug.h> | 12 | #include <linux/sched/hotplug.h> |
| 13 | #include <linux/wait_bit.h> | ||
| 13 | #include <linux/cpuset.h> | 14 | #include <linux/cpuset.h> |
| 14 | #include <linux/delayacct.h> | 15 | #include <linux/delayacct.h> |
| 15 | #include <linux/init_task.h> | 16 | #include <linux/init_task.h> |
| @@ -788,36 +789,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
| 788 | dequeue_task(rq, p, flags); | 789 | dequeue_task(rq, p, flags); |
| 789 | } | 790 | } |
| 790 | 791 | ||
| 791 | void sched_set_stop_task(int cpu, struct task_struct *stop) | ||
| 792 | { | ||
| 793 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | ||
| 794 | struct task_struct *old_stop = cpu_rq(cpu)->stop; | ||
| 795 | |||
| 796 | if (stop) { | ||
| 797 | /* | ||
| 798 | * Make it appear like a SCHED_FIFO task, its something | ||
| 799 | * userspace knows about and won't get confused about. | ||
| 800 | * | ||
| 801 | * Also, it will make PI more or less work without too | ||
| 802 | * much confusion -- but then, stop work should not | ||
| 803 | * rely on PI working anyway. | ||
| 804 | */ | ||
| 805 | sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); | ||
| 806 | |||
| 807 | stop->sched_class = &stop_sched_class; | ||
| 808 | } | ||
| 809 | |||
| 810 | cpu_rq(cpu)->stop = stop; | ||
| 811 | |||
| 812 | if (old_stop) { | ||
| 813 | /* | ||
| 814 | * Reset it back to a normal scheduling class so that | ||
| 815 | * it can die in pieces. | ||
| 816 | */ | ||
| 817 | old_stop->sched_class = &rt_sched_class; | ||
| 818 | } | ||
| 819 | } | ||
| 820 | |||
| 821 | /* | 792 | /* |
| 822 | * __normal_prio - return the priority that is based on the static prio | 793 | * __normal_prio - return the priority that is based on the static prio |
| 823 | */ | 794 | */ |
| @@ -1588,6 +1559,36 @@ static void update_avg(u64 *avg, u64 sample) | |||
| 1588 | *avg += diff >> 3; | 1559 | *avg += diff >> 3; |
| 1589 | } | 1560 | } |
| 1590 | 1561 | ||
| 1562 | void sched_set_stop_task(int cpu, struct task_struct *stop) | ||
| 1563 | { | ||
| 1564 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | ||
| 1565 | struct task_struct *old_stop = cpu_rq(cpu)->stop; | ||
| 1566 | |||
| 1567 | if (stop) { | ||
| 1568 | /* | ||
| 1569 | * Make it appear like a SCHED_FIFO task, its something | ||
| 1570 | * userspace knows about and won't get confused about. | ||
| 1571 | * | ||
| 1572 | * Also, it will make PI more or less work without too | ||
| 1573 | * much confusion -- but then, stop work should not | ||
| 1574 | * rely on PI working anyway. | ||
| 1575 | */ | ||
| 1576 | sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); | ||
| 1577 | |||
| 1578 | stop->sched_class = &stop_sched_class; | ||
| 1579 | } | ||
| 1580 | |||
| 1581 | cpu_rq(cpu)->stop = stop; | ||
| 1582 | |||
| 1583 | if (old_stop) { | ||
| 1584 | /* | ||
| 1585 | * Reset it back to a normal scheduling class so that | ||
| 1586 | * it can die in pieces. | ||
| 1587 | */ | ||
| 1588 | old_stop->sched_class = &rt_sched_class; | ||
| 1589 | } | ||
| 1590 | } | ||
| 1591 | |||
| 1591 | #else | 1592 | #else |
| 1592 | 1593 | ||
| 1593 | static inline int __set_cpus_allowed_ptr(struct task_struct *p, | 1594 | static inline int __set_cpus_allowed_ptr(struct task_struct *p, |
| @@ -1731,7 +1732,7 @@ void sched_ttwu_pending(void) | |||
| 1731 | { | 1732 | { |
| 1732 | struct rq *rq = this_rq(); | 1733 | struct rq *rq = this_rq(); |
| 1733 | struct llist_node *llist = llist_del_all(&rq->wake_list); | 1734 | struct llist_node *llist = llist_del_all(&rq->wake_list); |
| 1734 | struct task_struct *p; | 1735 | struct task_struct *p, *t; |
| 1735 | struct rq_flags rf; | 1736 | struct rq_flags rf; |
| 1736 | 1737 | ||
| 1737 | if (!llist) | 1738 | if (!llist) |
| @@ -1740,17 +1741,8 @@ void sched_ttwu_pending(void) | |||
| 1740 | rq_lock_irqsave(rq, &rf); | 1741 | rq_lock_irqsave(rq, &rf); |
| 1741 | update_rq_clock(rq); | 1742 | update_rq_clock(rq); |
| 1742 | 1743 | ||
| 1743 | while (llist) { | 1744 | llist_for_each_entry_safe(p, t, llist, wake_entry) |
| 1744 | int wake_flags = 0; | 1745 | ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); |
| 1745 | |||
| 1746 | p = llist_entry(llist, struct task_struct, wake_entry); | ||
| 1747 | llist = llist_next(llist); | ||
| 1748 | |||
| 1749 | if (p->sched_remote_wakeup) | ||
| 1750 | wake_flags = WF_MIGRATED; | ||
| 1751 | |||
| 1752 | ttwu_do_activate(rq, p, wake_flags, &rf); | ||
| 1753 | } | ||
| 1754 | 1746 | ||
| 1755 | rq_unlock_irqrestore(rq, &rf); | 1747 | rq_unlock_irqrestore(rq, &rf); |
| 1756 | } | 1748 | } |
| @@ -2148,23 +2140,6 @@ int wake_up_state(struct task_struct *p, unsigned int state) | |||
| 2148 | } | 2140 | } |
| 2149 | 2141 | ||
| 2150 | /* | 2142 | /* |
| 2151 | * This function clears the sched_dl_entity static params. | ||
| 2152 | */ | ||
| 2153 | void __dl_clear_params(struct task_struct *p) | ||
| 2154 | { | ||
| 2155 | struct sched_dl_entity *dl_se = &p->dl; | ||
| 2156 | |||
| 2157 | dl_se->dl_runtime = 0; | ||
| 2158 | dl_se->dl_deadline = 0; | ||
| 2159 | dl_se->dl_period = 0; | ||
| 2160 | dl_se->flags = 0; | ||
| 2161 | dl_se->dl_bw = 0; | ||
| 2162 | |||
| 2163 | dl_se->dl_throttled = 0; | ||
| 2164 | dl_se->dl_yielded = 0; | ||
| 2165 | } | ||
| 2166 | |||
| 2167 | /* | ||
| 2168 | * Perform scheduler related setup for a newly forked process p. | 2143 | * Perform scheduler related setup for a newly forked process p. |
| 2169 | * p is forked by current. | 2144 | * p is forked by current. |
| 2170 | * | 2145 | * |
| @@ -2193,6 +2168,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 2193 | 2168 | ||
| 2194 | RB_CLEAR_NODE(&p->dl.rb_node); | 2169 | RB_CLEAR_NODE(&p->dl.rb_node); |
| 2195 | init_dl_task_timer(&p->dl); | 2170 | init_dl_task_timer(&p->dl); |
| 2171 | init_dl_inactive_task_timer(&p->dl); | ||
| 2196 | __dl_clear_params(p); | 2172 | __dl_clear_params(p); |
| 2197 | 2173 | ||
| 2198 | INIT_LIST_HEAD(&p->rt.run_list); | 2174 | INIT_LIST_HEAD(&p->rt.run_list); |
| @@ -2430,7 +2406,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 2430 | unsigned long to_ratio(u64 period, u64 runtime) | 2406 | unsigned long to_ratio(u64 period, u64 runtime) |
| 2431 | { | 2407 | { |
| 2432 | if (runtime == RUNTIME_INF) | 2408 | if (runtime == RUNTIME_INF) |
| 2433 | return 1ULL << 20; | 2409 | return BW_UNIT; |
| 2434 | 2410 | ||
| 2435 | /* | 2411 | /* |
| 2436 | * Doing this here saves a lot of checks in all | 2412 | * Doing this here saves a lot of checks in all |
| @@ -2440,93 +2416,9 @@ unsigned long to_ratio(u64 period, u64 runtime) | |||
| 2440 | if (period == 0) | 2416 | if (period == 0) |
| 2441 | return 0; | 2417 | return 0; |
| 2442 | 2418 | ||
| 2443 | return div64_u64(runtime << 20, period); | 2419 | return div64_u64(runtime << BW_SHIFT, period); |
| 2444 | } | ||
| 2445 | |||
| 2446 | #ifdef CONFIG_SMP | ||
| 2447 | inline struct dl_bw *dl_bw_of(int i) | ||
| 2448 | { | ||
| 2449 | RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), | ||
| 2450 | "sched RCU must be held"); | ||
| 2451 | return &cpu_rq(i)->rd->dl_bw; | ||
| 2452 | } | ||
| 2453 | |||
| 2454 | static inline int dl_bw_cpus(int i) | ||
| 2455 | { | ||
| 2456 | struct root_domain *rd = cpu_rq(i)->rd; | ||
| 2457 | int cpus = 0; | ||
| 2458 | |||
| 2459 | RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), | ||
| 2460 | "sched RCU must be held"); | ||
| 2461 | for_each_cpu_and(i, rd->span, cpu_active_mask) | ||
| 2462 | cpus++; | ||
| 2463 | |||
| 2464 | return cpus; | ||
| 2465 | } | ||
| 2466 | #else | ||
| 2467 | inline struct dl_bw *dl_bw_of(int i) | ||
| 2468 | { | ||
| 2469 | return &cpu_rq(i)->dl.dl_bw; | ||
| 2470 | } | ||
| 2471 | |||
| 2472 | static inline int dl_bw_cpus(int i) | ||
| 2473 | { | ||
| 2474 | return 1; | ||
| 2475 | } | ||
| 2476 | #endif | ||
| 2477 | |||
| 2478 | /* | ||
| 2479 | * We must be sure that accepting a new task (or allowing changing the | ||
| 2480 | * parameters of an existing one) is consistent with the bandwidth | ||
| 2481 | * constraints. If yes, this function also accordingly updates the currently | ||
| 2482 | * allocated bandwidth to reflect the new situation. | ||
| 2483 | * | ||
| 2484 | * This function is called while holding p's rq->lock. | ||
| 2485 | * | ||
| 2486 | * XXX we should delay bw change until the task's 0-lag point, see | ||
| 2487 | * __setparam_dl(). | ||
| 2488 | */ | ||
| 2489 | static int dl_overflow(struct task_struct *p, int policy, | ||
| 2490 | const struct sched_attr *attr) | ||
| 2491 | { | ||
| 2492 | |||
| 2493 | struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); | ||
| 2494 | u64 period = attr->sched_period ?: attr->sched_deadline; | ||
| 2495 | u64 runtime = attr->sched_runtime; | ||
| 2496 | u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; | ||
| 2497 | int cpus, err = -1; | ||
| 2498 | |||
| 2499 | /* !deadline task may carry old deadline bandwidth */ | ||
| 2500 | if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) | ||
| 2501 | return 0; | ||
| 2502 | |||
| 2503 | /* | ||
| 2504 | * Either if a task, enters, leave, or stays -deadline but changes | ||
| 2505 | * its parameters, we may need to update accordingly the total | ||
| 2506 | * allocated bandwidth of the container. | ||
| 2507 | */ | ||
| 2508 | raw_spin_lock(&dl_b->lock); | ||
| 2509 | cpus = dl_bw_cpus(task_cpu(p)); | ||
| 2510 | if (dl_policy(policy) && !task_has_dl_policy(p) && | ||
| 2511 | !__dl_overflow(dl_b, cpus, 0, new_bw)) { | ||
| 2512 | __dl_add(dl_b, new_bw); | ||
| 2513 | err = 0; | ||
| 2514 | } else if (dl_policy(policy) && task_has_dl_policy(p) && | ||
| 2515 | !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { | ||
| 2516 | __dl_clear(dl_b, p->dl.dl_bw); | ||
| 2517 | __dl_add(dl_b, new_bw); | ||
| 2518 | err = 0; | ||
| 2519 | } else if (!dl_policy(policy) && task_has_dl_policy(p)) { | ||
| 2520 | __dl_clear(dl_b, p->dl.dl_bw); | ||
| 2521 | err = 0; | ||
| 2522 | } | ||
| 2523 | raw_spin_unlock(&dl_b->lock); | ||
| 2524 | |||
| 2525 | return err; | ||
| 2526 | } | 2420 | } |
| 2527 | 2421 | ||
| 2528 | extern void init_dl_bw(struct dl_bw *dl_b); | ||
| 2529 | |||
| 2530 | /* | 2422 | /* |
| 2531 | * wake_up_new_task - wake up a newly created task for the first time. | 2423 | * wake_up_new_task - wake up a newly created task for the first time. |
| 2532 | * | 2424 | * |
| @@ -3687,7 +3579,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) | |||
| 3687 | exception_exit(prev_state); | 3579 | exception_exit(prev_state); |
| 3688 | } | 3580 | } |
| 3689 | 3581 | ||
| 3690 | int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, | 3582 | int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, |
| 3691 | void *key) | 3583 | void *key) |
| 3692 | { | 3584 | { |
| 3693 | return try_to_wake_up(curr->private, mode, wake_flags); | 3585 | return try_to_wake_up(curr->private, mode, wake_flags); |
| @@ -4009,46 +3901,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) | |||
| 4009 | } | 3901 | } |
| 4010 | 3902 | ||
| 4011 | /* | 3903 | /* |
| 4012 | * This function initializes the sched_dl_entity of a newly becoming | ||
| 4013 | * SCHED_DEADLINE task. | ||
| 4014 | * | ||
| 4015 | * Only the static values are considered here, the actual runtime and the | ||
| 4016 | * absolute deadline will be properly calculated when the task is enqueued | ||
| 4017 | * for the first time with its new policy. | ||
| 4018 | */ | ||
| 4019 | static void | ||
| 4020 | __setparam_dl(struct task_struct *p, const struct sched_attr *attr) | ||
| 4021 | { | ||
| 4022 | struct sched_dl_entity *dl_se = &p->dl; | ||
| 4023 | |||
| 4024 | dl_se->dl_runtime = attr->sched_runtime; | ||
| 4025 | dl_se->dl_deadline = attr->sched_deadline; | ||
| 4026 | dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; | ||
| 4027 | dl_se->flags = attr->sched_flags; | ||
| 4028 | dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); | ||
| 4029 | |||
| 4030 | /* | ||
| 4031 | * Changing the parameters of a task is 'tricky' and we're not doing | ||
| 4032 | * the correct thing -- also see task_dead_dl() and switched_from_dl(). | ||
| 4033 | * | ||
| 4034 | * What we SHOULD do is delay the bandwidth release until the 0-lag | ||
| 4035 | * point. This would include retaining the task_struct until that time | ||
| 4036 | * and change dl_overflow() to not immediately decrement the current | ||
| 4037 | * amount. | ||
| 4038 | * | ||
| 4039 | * Instead we retain the current runtime/deadline and let the new | ||
| 4040 | * parameters take effect after the current reservation period lapses. | ||
| 4041 | * This is safe (albeit pessimistic) because the 0-lag point is always | ||
| 4042 | * before the current scheduling deadline. | ||
| 4043 | * | ||
| 4044 | * We can still have temporary overloads because we do not delay the | ||
| 4045 | * change in bandwidth until that time; so admission control is | ||
| 4046 | * not on the safe side. It does however guarantee tasks will never | ||
| 4047 | * consume more than promised. | ||
| 4048 | */ | ||
| 4049 | } | ||
| 4050 | |||
| 4051 | /* | ||
| 4052 | * sched_setparam() passes in -1 for its policy, to let the functions | 3904 | * sched_setparam() passes in -1 for its policy, to let the functions |
| 4053 | * it calls know not to change it. | 3905 | * it calls know not to change it. |
| 4054 | */ | 3906 | */ |
| @@ -4101,59 +3953,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, | |||
| 4101 | p->sched_class = &fair_sched_class; | 3953 | p->sched_class = &fair_sched_class; |
| 4102 | } | 3954 | } |
| 4103 | 3955 | ||
| 4104 | static void | ||
| 4105 | __getparam_dl(struct task_struct *p, struct sched_attr *attr) | ||
| 4106 | { | ||
| 4107 | struct sched_dl_entity *dl_se = &p->dl; | ||
| 4108 | |||
| 4109 | attr->sched_priority = p->rt_priority; | ||
| 4110 | attr->sched_runtime = dl_se->dl_runtime; | ||
| 4111 | attr->sched_deadline = dl_se->dl_deadline; | ||
| 4112 | attr->sched_period = dl_se->dl_period; | ||
| 4113 | attr->sched_flags = dl_se->flags; | ||
| 4114 | } | ||
| 4115 | |||
| 4116 | /* | ||
| 4117 | * This function validates the new parameters of a -deadline task. | ||
| 4118 | * We ask for the deadline not being zero, and greater or equal | ||
| 4119 | * than the runtime, as well as the period of being zero or | ||
| 4120 | * greater than deadline. Furthermore, we have to be sure that | ||
| 4121 | * user parameters are above the internal resolution of 1us (we | ||
| 4122 | * check sched_runtime only since it is always the smaller one) and | ||
| 4123 | * below 2^63 ns (we have to check both sched_deadline and | ||
| 4124 | * sched_period, as the latter can be zero). | ||
| 4125 | */ | ||
| 4126 | static bool | ||
| 4127 | __checkparam_dl(const struct sched_attr *attr) | ||
| 4128 | { | ||
| 4129 | /* deadline != 0 */ | ||
| 4130 | if (attr->sched_deadline == 0) | ||
| 4131 | return false; | ||
| 4132 | |||
| 4133 | /* | ||
| 4134 | * Since we truncate DL_SCALE bits, make sure we're at least | ||
| 4135 | * that big. | ||
| 4136 | */ | ||
| 4137 | if (attr->sched_runtime < (1ULL << DL_SCALE)) | ||
| 4138 | return false; | ||
| 4139 | |||
| 4140 | /* | ||
| 4141 | * Since we use the MSB for wrap-around and sign issues, make | ||
| 4142 | * sure it's not set (mind that period can be equal to zero). | ||
| 4143 | */ | ||
| 4144 | if (attr->sched_deadline & (1ULL << 63) || | ||
| 4145 | attr->sched_period & (1ULL << 63)) | ||
| 4146 | return false; | ||
| 4147 | |||
| 4148 | /* runtime <= deadline <= period (if period != 0) */ | ||
| 4149 | if ((attr->sched_period != 0 && | ||
| 4150 | attr->sched_period < attr->sched_deadline) || | ||
| 4151 | attr->sched_deadline < attr->sched_runtime) | ||
| 4152 | return false; | ||
| 4153 | |||
| 4154 | return true; | ||
| 4155 | } | ||
| 4156 | |||
| 4157 | /* | 3956 | /* |
| 4158 | * Check the target process has a UID that matches the current process's: | 3957 | * Check the target process has a UID that matches the current process's: |
| 4159 | */ | 3958 | */ |
| @@ -4170,19 +3969,6 @@ static bool check_same_owner(struct task_struct *p) | |||
| 4170 | return match; | 3969 | return match; |
| 4171 | } | 3970 | } |
| 4172 | 3971 | ||
| 4173 | static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) | ||
| 4174 | { | ||
| 4175 | struct sched_dl_entity *dl_se = &p->dl; | ||
| 4176 | |||
| 4177 | if (dl_se->dl_runtime != attr->sched_runtime || | ||
| 4178 | dl_se->dl_deadline != attr->sched_deadline || | ||
| 4179 | dl_se->dl_period != attr->sched_period || | ||
| 4180 | dl_se->flags != attr->sched_flags) | ||
| 4181 | return true; | ||
| 4182 | |||
| 4183 | return false; | ||
| 4184 | } | ||
| 4185 | |||
| 4186 | static int __sched_setscheduler(struct task_struct *p, | 3972 | static int __sched_setscheduler(struct task_struct *p, |
| 4187 | const struct sched_attr *attr, | 3973 | const struct sched_attr *attr, |
| 4188 | bool user, bool pi) | 3974 | bool user, bool pi) |
| @@ -4197,8 +3983,8 @@ static int __sched_setscheduler(struct task_struct *p, | |||
| 4197 | int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; | 3983 | int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; |
| 4198 | struct rq *rq; | 3984 | struct rq *rq; |
| 4199 | 3985 | ||
| 4200 | /* May grab non-irq protected spin_locks: */ | 3986 | /* The pi code expects interrupts enabled */ |
| 4201 | BUG_ON(in_interrupt()); | 3987 | BUG_ON(pi && in_interrupt()); |
| 4202 | recheck: | 3988 | recheck: |
| 4203 | /* Double check policy once rq lock held: */ | 3989 | /* Double check policy once rq lock held: */ |
| 4204 | if (policy < 0) { | 3990 | if (policy < 0) { |
| @@ -4211,7 +3997,8 @@ recheck: | |||
| 4211 | return -EINVAL; | 3997 | return -EINVAL; |
| 4212 | } | 3998 | } |
| 4213 | 3999 | ||
| 4214 | if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) | 4000 | if (attr->sched_flags & |
| 4001 | ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM)) | ||
| 4215 | return -EINVAL; | 4002 | return -EINVAL; |
| 4216 | 4003 | ||
| 4217 | /* | 4004 | /* |
| @@ -4362,7 +4149,7 @@ change: | |||
| 4362 | * of a SCHED_DEADLINE task) we need to check if enough bandwidth | 4149 | * of a SCHED_DEADLINE task) we need to check if enough bandwidth |
| 4363 | * is available. | 4150 | * is available. |
| 4364 | */ | 4151 | */ |
| 4365 | if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { | 4152 | if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { |
| 4366 | task_rq_unlock(rq, p, &rf); | 4153 | task_rq_unlock(rq, p, &rf); |
| 4367 | return -EBUSY; | 4154 | return -EBUSY; |
| 4368 | } | 4155 | } |
| @@ -5463,26 +5250,17 @@ void init_idle(struct task_struct *idle, int cpu) | |||
| 5463 | #endif | 5250 | #endif |
| 5464 | } | 5251 | } |
| 5465 | 5252 | ||
| 5253 | #ifdef CONFIG_SMP | ||
| 5254 | |||
| 5466 | int cpuset_cpumask_can_shrink(const struct cpumask *cur, | 5255 | int cpuset_cpumask_can_shrink(const struct cpumask *cur, |
| 5467 | const struct cpumask *trial) | 5256 | const struct cpumask *trial) |
| 5468 | { | 5257 | { |
| 5469 | int ret = 1, trial_cpus; | 5258 | int ret = 1; |
| 5470 | struct dl_bw *cur_dl_b; | ||
| 5471 | unsigned long flags; | ||
| 5472 | 5259 | ||
| 5473 | if (!cpumask_weight(cur)) | 5260 | if (!cpumask_weight(cur)) |
| 5474 | return ret; | 5261 | return ret; |
| 5475 | 5262 | ||
| 5476 | rcu_read_lock_sched(); | 5263 | ret = dl_cpuset_cpumask_can_shrink(cur, trial); |
| 5477 | cur_dl_b = dl_bw_of(cpumask_any(cur)); | ||
| 5478 | trial_cpus = cpumask_weight(trial); | ||
| 5479 | |||
| 5480 | raw_spin_lock_irqsave(&cur_dl_b->lock, flags); | ||
| 5481 | if (cur_dl_b->bw != -1 && | ||
| 5482 | cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) | ||
| 5483 | ret = 0; | ||
| 5484 | raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); | ||
| 5485 | rcu_read_unlock_sched(); | ||
| 5486 | 5264 | ||
| 5487 | return ret; | 5265 | return ret; |
| 5488 | } | 5266 | } |
| @@ -5506,43 +5284,14 @@ int task_can_attach(struct task_struct *p, | |||
| 5506 | goto out; | 5284 | goto out; |
| 5507 | } | 5285 | } |
| 5508 | 5286 | ||
| 5509 | #ifdef CONFIG_SMP | ||
| 5510 | if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, | 5287 | if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, |
| 5511 | cs_cpus_allowed)) { | 5288 | cs_cpus_allowed)) |
| 5512 | unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, | 5289 | ret = dl_task_can_attach(p, cs_cpus_allowed); |
| 5513 | cs_cpus_allowed); | ||
| 5514 | struct dl_bw *dl_b; | ||
| 5515 | bool overflow; | ||
| 5516 | int cpus; | ||
| 5517 | unsigned long flags; | ||
| 5518 | |||
| 5519 | rcu_read_lock_sched(); | ||
| 5520 | dl_b = dl_bw_of(dest_cpu); | ||
| 5521 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 5522 | cpus = dl_bw_cpus(dest_cpu); | ||
| 5523 | overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); | ||
| 5524 | if (overflow) | ||
| 5525 | ret = -EBUSY; | ||
| 5526 | else { | ||
| 5527 | /* | ||
| 5528 | * We reserve space for this task in the destination | ||
| 5529 | * root_domain, as we can't fail after this point. | ||
| 5530 | * We will free resources in the source root_domain | ||
| 5531 | * later on (see set_cpus_allowed_dl()). | ||
| 5532 | */ | ||
| 5533 | __dl_add(dl_b, p->dl.dl_bw); | ||
| 5534 | } | ||
| 5535 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 5536 | rcu_read_unlock_sched(); | ||
| 5537 | 5290 | ||
| 5538 | } | ||
| 5539 | #endif | ||
| 5540 | out: | 5291 | out: |
| 5541 | return ret; | 5292 | return ret; |
| 5542 | } | 5293 | } |
| 5543 | 5294 | ||
| 5544 | #ifdef CONFIG_SMP | ||
| 5545 | |||
| 5546 | bool sched_smp_initialized __read_mostly; | 5295 | bool sched_smp_initialized __read_mostly; |
| 5547 | 5296 | ||
| 5548 | #ifdef CONFIG_NUMA_BALANCING | 5297 | #ifdef CONFIG_NUMA_BALANCING |
| @@ -5605,7 +5354,7 @@ void idle_task_exit(void) | |||
| 5605 | BUG_ON(cpu_online(smp_processor_id())); | 5354 | BUG_ON(cpu_online(smp_processor_id())); |
| 5606 | 5355 | ||
| 5607 | if (mm != &init_mm) { | 5356 | if (mm != &init_mm) { |
| 5608 | switch_mm_irqs_off(mm, &init_mm, current); | 5357 | switch_mm(mm, &init_mm, current); |
| 5609 | finish_arch_post_lock_switch(); | 5358 | finish_arch_post_lock_switch(); |
| 5610 | } | 5359 | } |
| 5611 | mmdrop(mm); | 5360 | mmdrop(mm); |
| @@ -5805,23 +5554,8 @@ static void cpuset_cpu_active(void) | |||
| 5805 | 5554 | ||
| 5806 | static int cpuset_cpu_inactive(unsigned int cpu) | 5555 | static int cpuset_cpu_inactive(unsigned int cpu) |
| 5807 | { | 5556 | { |
| 5808 | unsigned long flags; | ||
| 5809 | struct dl_bw *dl_b; | ||
| 5810 | bool overflow; | ||
| 5811 | int cpus; | ||
| 5812 | |||
| 5813 | if (!cpuhp_tasks_frozen) { | 5557 | if (!cpuhp_tasks_frozen) { |
| 5814 | rcu_read_lock_sched(); | 5558 | if (dl_cpu_busy(cpu)) |
| 5815 | dl_b = dl_bw_of(cpu); | ||
| 5816 | |||
| 5817 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 5818 | cpus = dl_bw_cpus(cpu); | ||
| 5819 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
| 5820 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 5821 | |||
| 5822 | rcu_read_unlock_sched(); | ||
| 5823 | |||
| 5824 | if (overflow) | ||
| 5825 | return -EBUSY; | 5559 | return -EBUSY; |
| 5826 | cpuset_update_active_cpus(); | 5560 | cpuset_update_active_cpus(); |
| 5827 | } else { | 5561 | } else { |
| @@ -5874,15 +5608,9 @@ int sched_cpu_deactivate(unsigned int cpu) | |||
| 5874 | * users of this state to go away such that all new such users will | 5608 | * users of this state to go away such that all new such users will |
| 5875 | * observe it. | 5609 | * observe it. |
| 5876 | * | 5610 | * |
| 5877 | * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might | ||
| 5878 | * not imply sync_sched(), so wait for both. | ||
| 5879 | * | ||
| 5880 | * Do sync before park smpboot threads to take care the rcu boost case. | 5611 | * Do sync before park smpboot threads to take care the rcu boost case. |
| 5881 | */ | 5612 | */ |
| 5882 | if (IS_ENABLED(CONFIG_PREEMPT)) | 5613 | synchronize_rcu_mult(call_rcu, call_rcu_sched); |
| 5883 | synchronize_rcu_mult(call_rcu, call_rcu_sched); | ||
| 5884 | else | ||
| 5885 | synchronize_rcu(); | ||
| 5886 | 5614 | ||
| 5887 | if (!sched_smp_initialized) | 5615 | if (!sched_smp_initialized) |
| 5888 | return 0; | 5616 | return 0; |
| @@ -5958,7 +5686,6 @@ void __init sched_init_smp(void) | |||
| 5958 | cpumask_var_t non_isolated_cpus; | 5686 | cpumask_var_t non_isolated_cpus; |
| 5959 | 5687 | ||
| 5960 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 5688 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
| 5961 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | ||
| 5962 | 5689 | ||
| 5963 | sched_init_numa(); | 5690 | sched_init_numa(); |
| 5964 | 5691 | ||
| @@ -5968,7 +5695,7 @@ void __init sched_init_smp(void) | |||
| 5968 | * happen. | 5695 | * happen. |
| 5969 | */ | 5696 | */ |
| 5970 | mutex_lock(&sched_domains_mutex); | 5697 | mutex_lock(&sched_domains_mutex); |
| 5971 | init_sched_domains(cpu_active_mask); | 5698 | sched_init_domains(cpu_active_mask); |
| 5972 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | 5699 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); |
| 5973 | if (cpumask_empty(non_isolated_cpus)) | 5700 | if (cpumask_empty(non_isolated_cpus)) |
| 5974 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); | 5701 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); |
| @@ -5984,7 +5711,6 @@ void __init sched_init_smp(void) | |||
| 5984 | init_sched_dl_class(); | 5711 | init_sched_dl_class(); |
| 5985 | 5712 | ||
| 5986 | sched_init_smt(); | 5713 | sched_init_smt(); |
| 5987 | sched_clock_init_late(); | ||
| 5988 | 5714 | ||
| 5989 | sched_smp_initialized = true; | 5715 | sched_smp_initialized = true; |
| 5990 | } | 5716 | } |
| @@ -6000,7 +5726,6 @@ early_initcall(migration_init); | |||
| 6000 | void __init sched_init_smp(void) | 5726 | void __init sched_init_smp(void) |
| 6001 | { | 5727 | { |
| 6002 | sched_init_granularity(); | 5728 | sched_init_granularity(); |
| 6003 | sched_clock_init_late(); | ||
| 6004 | } | 5729 | } |
| 6005 | #endif /* CONFIG_SMP */ | 5730 | #endif /* CONFIG_SMP */ |
| 6006 | 5731 | ||
| @@ -6026,28 +5751,13 @@ static struct kmem_cache *task_group_cache __read_mostly; | |||
| 6026 | DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); | 5751 | DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); |
| 6027 | DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); | 5752 | DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); |
| 6028 | 5753 | ||
| 6029 | #define WAIT_TABLE_BITS 8 | ||
| 6030 | #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) | ||
| 6031 | static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; | ||
| 6032 | |||
| 6033 | wait_queue_head_t *bit_waitqueue(void *word, int bit) | ||
| 6034 | { | ||
| 6035 | const int shift = BITS_PER_LONG == 32 ? 5 : 6; | ||
| 6036 | unsigned long val = (unsigned long)word << shift | bit; | ||
| 6037 | |||
| 6038 | return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); | ||
| 6039 | } | ||
| 6040 | EXPORT_SYMBOL(bit_waitqueue); | ||
| 6041 | |||
| 6042 | void __init sched_init(void) | 5754 | void __init sched_init(void) |
| 6043 | { | 5755 | { |
| 6044 | int i, j; | 5756 | int i, j; |
| 6045 | unsigned long alloc_size = 0, ptr; | 5757 | unsigned long alloc_size = 0, ptr; |
| 6046 | 5758 | ||
| 6047 | sched_clock_init(); | 5759 | sched_clock_init(); |
| 6048 | 5760 | wait_bit_init(); | |
| 6049 | for (i = 0; i < WAIT_TABLE_SIZE; i++) | ||
| 6050 | init_waitqueue_head(bit_wait_table + i); | ||
| 6051 | 5761 | ||
| 6052 | #ifdef CONFIG_FAIR_GROUP_SCHED | 5762 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 6053 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); | 5763 | alloc_size += 2 * nr_cpu_ids * sizeof(void **); |
| @@ -6199,7 +5909,6 @@ void __init sched_init(void) | |||
| 6199 | calc_load_update = jiffies + LOAD_FREQ; | 5909 | calc_load_update = jiffies + LOAD_FREQ; |
| 6200 | 5910 | ||
| 6201 | #ifdef CONFIG_SMP | 5911 | #ifdef CONFIG_SMP |
| 6202 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); | ||
| 6203 | /* May be allocated at isolcpus cmdline parse time */ | 5912 | /* May be allocated at isolcpus cmdline parse time */ |
| 6204 | if (cpu_isolated_map == NULL) | 5913 | if (cpu_isolated_map == NULL) |
| 6205 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 5914 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
| @@ -6251,8 +5960,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset) | |||
| 6251 | 5960 | ||
| 6252 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && | 5961 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && |
| 6253 | !is_idle_task(current)) || | 5962 | !is_idle_task(current)) || |
| 6254 | system_state != SYSTEM_RUNNING || oops_in_progress) | 5963 | system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || |
| 5964 | oops_in_progress) | ||
| 6255 | return; | 5965 | return; |
| 5966 | |||
| 6256 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 5967 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
| 6257 | return; | 5968 | return; |
| 6258 | prev_jiffy = jiffies; | 5969 | prev_jiffy = jiffies; |
| @@ -6507,385 +6218,6 @@ void sched_move_task(struct task_struct *tsk) | |||
| 6507 | 6218 | ||
| 6508 | task_rq_unlock(rq, tsk, &rf); | 6219 | task_rq_unlock(rq, tsk, &rf); |
| 6509 | } | 6220 | } |
| 6510 | #endif /* CONFIG_CGROUP_SCHED */ | ||
| 6511 | |||
| 6512 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 6513 | /* | ||
| 6514 | * Ensure that the real time constraints are schedulable. | ||
| 6515 | */ | ||
| 6516 | static DEFINE_MUTEX(rt_constraints_mutex); | ||
| 6517 | |||
| 6518 | /* Must be called with tasklist_lock held */ | ||
| 6519 | static inline int tg_has_rt_tasks(struct task_group *tg) | ||
| 6520 | { | ||
| 6521 | struct task_struct *g, *p; | ||
| 6522 | |||
| 6523 | /* | ||
| 6524 | * Autogroups do not have RT tasks; see autogroup_create(). | ||
| 6525 | */ | ||
| 6526 | if (task_group_is_autogroup(tg)) | ||
| 6527 | return 0; | ||
| 6528 | |||
| 6529 | for_each_process_thread(g, p) { | ||
| 6530 | if (rt_task(p) && task_group(p) == tg) | ||
| 6531 | return 1; | ||
| 6532 | } | ||
| 6533 | |||
| 6534 | return 0; | ||
| 6535 | } | ||
| 6536 | |||
| 6537 | struct rt_schedulable_data { | ||
| 6538 | struct task_group *tg; | ||
| 6539 | u64 rt_period; | ||
| 6540 | u64 rt_runtime; | ||
| 6541 | }; | ||
| 6542 | |||
| 6543 | static int tg_rt_schedulable(struct task_group *tg, void *data) | ||
| 6544 | { | ||
| 6545 | struct rt_schedulable_data *d = data; | ||
| 6546 | struct task_group *child; | ||
| 6547 | unsigned long total, sum = 0; | ||
| 6548 | u64 period, runtime; | ||
| 6549 | |||
| 6550 | period = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
| 6551 | runtime = tg->rt_bandwidth.rt_runtime; | ||
| 6552 | |||
| 6553 | if (tg == d->tg) { | ||
| 6554 | period = d->rt_period; | ||
| 6555 | runtime = d->rt_runtime; | ||
| 6556 | } | ||
| 6557 | |||
| 6558 | /* | ||
| 6559 | * Cannot have more runtime than the period. | ||
| 6560 | */ | ||
| 6561 | if (runtime > period && runtime != RUNTIME_INF) | ||
| 6562 | return -EINVAL; | ||
| 6563 | |||
| 6564 | /* | ||
| 6565 | * Ensure we don't starve existing RT tasks. | ||
| 6566 | */ | ||
| 6567 | if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) | ||
| 6568 | return -EBUSY; | ||
| 6569 | |||
| 6570 | total = to_ratio(period, runtime); | ||
| 6571 | |||
| 6572 | /* | ||
| 6573 | * Nobody can have more than the global setting allows. | ||
| 6574 | */ | ||
| 6575 | if (total > to_ratio(global_rt_period(), global_rt_runtime())) | ||
| 6576 | return -EINVAL; | ||
| 6577 | |||
| 6578 | /* | ||
| 6579 | * The sum of our children's runtime should not exceed our own. | ||
| 6580 | */ | ||
| 6581 | list_for_each_entry_rcu(child, &tg->children, siblings) { | ||
| 6582 | period = ktime_to_ns(child->rt_bandwidth.rt_period); | ||
| 6583 | runtime = child->rt_bandwidth.rt_runtime; | ||
| 6584 | |||
| 6585 | if (child == d->tg) { | ||
| 6586 | period = d->rt_period; | ||
| 6587 | runtime = d->rt_runtime; | ||
| 6588 | } | ||
| 6589 | |||
| 6590 | sum += to_ratio(period, runtime); | ||
| 6591 | } | ||
| 6592 | |||
| 6593 | if (sum > total) | ||
| 6594 | return -EINVAL; | ||
| 6595 | |||
| 6596 | return 0; | ||
| 6597 | } | ||
| 6598 | |||
| 6599 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | ||
| 6600 | { | ||
| 6601 | int ret; | ||
| 6602 | |||
| 6603 | struct rt_schedulable_data data = { | ||
| 6604 | .tg = tg, | ||
| 6605 | .rt_period = period, | ||
| 6606 | .rt_runtime = runtime, | ||
| 6607 | }; | ||
| 6608 | |||
| 6609 | rcu_read_lock(); | ||
| 6610 | ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); | ||
| 6611 | rcu_read_unlock(); | ||
| 6612 | |||
| 6613 | return ret; | ||
| 6614 | } | ||
| 6615 | |||
| 6616 | static int tg_set_rt_bandwidth(struct task_group *tg, | ||
| 6617 | u64 rt_period, u64 rt_runtime) | ||
| 6618 | { | ||
| 6619 | int i, err = 0; | ||
| 6620 | |||
| 6621 | /* | ||
| 6622 | * Disallowing the root group RT runtime is BAD, it would disallow the | ||
| 6623 | * kernel creating (and or operating) RT threads. | ||
| 6624 | */ | ||
| 6625 | if (tg == &root_task_group && rt_runtime == 0) | ||
| 6626 | return -EINVAL; | ||
| 6627 | |||
| 6628 | /* No period doesn't make any sense. */ | ||
| 6629 | if (rt_period == 0) | ||
| 6630 | return -EINVAL; | ||
| 6631 | |||
| 6632 | mutex_lock(&rt_constraints_mutex); | ||
| 6633 | read_lock(&tasklist_lock); | ||
| 6634 | err = __rt_schedulable(tg, rt_period, rt_runtime); | ||
| 6635 | if (err) | ||
| 6636 | goto unlock; | ||
| 6637 | |||
| 6638 | raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); | ||
| 6639 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); | ||
| 6640 | tg->rt_bandwidth.rt_runtime = rt_runtime; | ||
| 6641 | |||
| 6642 | for_each_possible_cpu(i) { | ||
| 6643 | struct rt_rq *rt_rq = tg->rt_rq[i]; | ||
| 6644 | |||
| 6645 | raw_spin_lock(&rt_rq->rt_runtime_lock); | ||
| 6646 | rt_rq->rt_runtime = rt_runtime; | ||
| 6647 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 6648 | } | ||
| 6649 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); | ||
| 6650 | unlock: | ||
| 6651 | read_unlock(&tasklist_lock); | ||
| 6652 | mutex_unlock(&rt_constraints_mutex); | ||
| 6653 | |||
| 6654 | return err; | ||
| 6655 | } | ||
| 6656 | |||
| 6657 | static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | ||
| 6658 | { | ||
| 6659 | u64 rt_runtime, rt_period; | ||
| 6660 | |||
| 6661 | rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
| 6662 | rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; | ||
| 6663 | if (rt_runtime_us < 0) | ||
| 6664 | rt_runtime = RUNTIME_INF; | ||
| 6665 | |||
| 6666 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); | ||
| 6667 | } | ||
| 6668 | |||
| 6669 | static long sched_group_rt_runtime(struct task_group *tg) | ||
| 6670 | { | ||
| 6671 | u64 rt_runtime_us; | ||
| 6672 | |||
| 6673 | if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) | ||
| 6674 | return -1; | ||
| 6675 | |||
| 6676 | rt_runtime_us = tg->rt_bandwidth.rt_runtime; | ||
| 6677 | do_div(rt_runtime_us, NSEC_PER_USEC); | ||
| 6678 | return rt_runtime_us; | ||
| 6679 | } | ||
| 6680 | |||
| 6681 | static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) | ||
| 6682 | { | ||
| 6683 | u64 rt_runtime, rt_period; | ||
| 6684 | |||
| 6685 | rt_period = rt_period_us * NSEC_PER_USEC; | ||
| 6686 | rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
| 6687 | |||
| 6688 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); | ||
| 6689 | } | ||
| 6690 | |||
| 6691 | static long sched_group_rt_period(struct task_group *tg) | ||
| 6692 | { | ||
| 6693 | u64 rt_period_us; | ||
| 6694 | |||
| 6695 | rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
| 6696 | do_div(rt_period_us, NSEC_PER_USEC); | ||
| 6697 | return rt_period_us; | ||
| 6698 | } | ||
| 6699 | #endif /* CONFIG_RT_GROUP_SCHED */ | ||
| 6700 | |||
| 6701 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 6702 | static int sched_rt_global_constraints(void) | ||
| 6703 | { | ||
| 6704 | int ret = 0; | ||
| 6705 | |||
| 6706 | mutex_lock(&rt_constraints_mutex); | ||
| 6707 | read_lock(&tasklist_lock); | ||
| 6708 | ret = __rt_schedulable(NULL, 0, 0); | ||
| 6709 | read_unlock(&tasklist_lock); | ||
| 6710 | mutex_unlock(&rt_constraints_mutex); | ||
| 6711 | |||
| 6712 | return ret; | ||
| 6713 | } | ||
| 6714 | |||
| 6715 | static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) | ||
| 6716 | { | ||
| 6717 | /* Don't accept realtime tasks when there is no way for them to run */ | ||
| 6718 | if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) | ||
| 6719 | return 0; | ||
| 6720 | |||
| 6721 | return 1; | ||
| 6722 | } | ||
| 6723 | |||
| 6724 | #else /* !CONFIG_RT_GROUP_SCHED */ | ||
| 6725 | static int sched_rt_global_constraints(void) | ||
| 6726 | { | ||
| 6727 | unsigned long flags; | ||
| 6728 | int i; | ||
| 6729 | |||
| 6730 | raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); | ||
| 6731 | for_each_possible_cpu(i) { | ||
| 6732 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; | ||
| 6733 | |||
| 6734 | raw_spin_lock(&rt_rq->rt_runtime_lock); | ||
| 6735 | rt_rq->rt_runtime = global_rt_runtime(); | ||
| 6736 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 6737 | } | ||
| 6738 | raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); | ||
| 6739 | |||
| 6740 | return 0; | ||
| 6741 | } | ||
| 6742 | #endif /* CONFIG_RT_GROUP_SCHED */ | ||
| 6743 | |||
| 6744 | static int sched_dl_global_validate(void) | ||
| 6745 | { | ||
| 6746 | u64 runtime = global_rt_runtime(); | ||
| 6747 | u64 period = global_rt_period(); | ||
| 6748 | u64 new_bw = to_ratio(period, runtime); | ||
| 6749 | struct dl_bw *dl_b; | ||
| 6750 | int cpu, ret = 0; | ||
| 6751 | unsigned long flags; | ||
| 6752 | |||
| 6753 | /* | ||
| 6754 | * Here we want to check the bandwidth not being set to some | ||
| 6755 | * value smaller than the currently allocated bandwidth in | ||
| 6756 | * any of the root_domains. | ||
| 6757 | * | ||
| 6758 | * FIXME: Cycling on all the CPUs is overdoing, but simpler than | ||
| 6759 | * cycling on root_domains... Discussion on different/better | ||
| 6760 | * solutions is welcome! | ||
| 6761 | */ | ||
| 6762 | for_each_possible_cpu(cpu) { | ||
| 6763 | rcu_read_lock_sched(); | ||
| 6764 | dl_b = dl_bw_of(cpu); | ||
| 6765 | |||
| 6766 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 6767 | if (new_bw < dl_b->total_bw) | ||
| 6768 | ret = -EBUSY; | ||
| 6769 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 6770 | |||
| 6771 | rcu_read_unlock_sched(); | ||
| 6772 | |||
| 6773 | if (ret) | ||
| 6774 | break; | ||
| 6775 | } | ||
| 6776 | |||
| 6777 | return ret; | ||
| 6778 | } | ||
| 6779 | |||
| 6780 | static void sched_dl_do_global(void) | ||
| 6781 | { | ||
| 6782 | u64 new_bw = -1; | ||
| 6783 | struct dl_bw *dl_b; | ||
| 6784 | int cpu; | ||
| 6785 | unsigned long flags; | ||
| 6786 | |||
| 6787 | def_dl_bandwidth.dl_period = global_rt_period(); | ||
| 6788 | def_dl_bandwidth.dl_runtime = global_rt_runtime(); | ||
| 6789 | |||
| 6790 | if (global_rt_runtime() != RUNTIME_INF) | ||
| 6791 | new_bw = to_ratio(global_rt_period(), global_rt_runtime()); | ||
| 6792 | |||
| 6793 | /* | ||
| 6794 | * FIXME: As above... | ||
| 6795 | */ | ||
| 6796 | for_each_possible_cpu(cpu) { | ||
| 6797 | rcu_read_lock_sched(); | ||
| 6798 | dl_b = dl_bw_of(cpu); | ||
| 6799 | |||
| 6800 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 6801 | dl_b->bw = new_bw; | ||
| 6802 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 6803 | |||
| 6804 | rcu_read_unlock_sched(); | ||
| 6805 | } | ||
| 6806 | } | ||
| 6807 | |||
| 6808 | static int sched_rt_global_validate(void) | ||
| 6809 | { | ||
| 6810 | if (sysctl_sched_rt_period <= 0) | ||
| 6811 | return -EINVAL; | ||
| 6812 | |||
| 6813 | if ((sysctl_sched_rt_runtime != RUNTIME_INF) && | ||
| 6814 | (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) | ||
| 6815 | return -EINVAL; | ||
| 6816 | |||
| 6817 | return 0; | ||
| 6818 | } | ||
| 6819 | |||
| 6820 | static void sched_rt_do_global(void) | ||
| 6821 | { | ||
| 6822 | def_rt_bandwidth.rt_runtime = global_rt_runtime(); | ||
| 6823 | def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); | ||
| 6824 | } | ||
| 6825 | |||
| 6826 | int sched_rt_handler(struct ctl_table *table, int write, | ||
| 6827 | void __user *buffer, size_t *lenp, | ||
| 6828 | loff_t *ppos) | ||
| 6829 | { | ||
| 6830 | int old_period, old_runtime; | ||
| 6831 | static DEFINE_MUTEX(mutex); | ||
| 6832 | int ret; | ||
| 6833 | |||
| 6834 | mutex_lock(&mutex); | ||
| 6835 | old_period = sysctl_sched_rt_period; | ||
| 6836 | old_runtime = sysctl_sched_rt_runtime; | ||
| 6837 | |||
| 6838 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
| 6839 | |||
| 6840 | if (!ret && write) { | ||
| 6841 | ret = sched_rt_global_validate(); | ||
| 6842 | if (ret) | ||
| 6843 | goto undo; | ||
| 6844 | |||
| 6845 | ret = sched_dl_global_validate(); | ||
| 6846 | if (ret) | ||
| 6847 | goto undo; | ||
| 6848 | |||
| 6849 | ret = sched_rt_global_constraints(); | ||
| 6850 | if (ret) | ||
| 6851 | goto undo; | ||
| 6852 | |||
| 6853 | sched_rt_do_global(); | ||
| 6854 | sched_dl_do_global(); | ||
| 6855 | } | ||
| 6856 | if (0) { | ||
| 6857 | undo: | ||
| 6858 | sysctl_sched_rt_period = old_period; | ||
| 6859 | sysctl_sched_rt_runtime = old_runtime; | ||
| 6860 | } | ||
| 6861 | mutex_unlock(&mutex); | ||
| 6862 | |||
| 6863 | return ret; | ||
| 6864 | } | ||
| 6865 | |||
| 6866 | int sched_rr_handler(struct ctl_table *table, int write, | ||
| 6867 | void __user *buffer, size_t *lenp, | ||
| 6868 | loff_t *ppos) | ||
| 6869 | { | ||
| 6870 | int ret; | ||
| 6871 | static DEFINE_MUTEX(mutex); | ||
| 6872 | |||
| 6873 | mutex_lock(&mutex); | ||
| 6874 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
| 6875 | /* | ||
| 6876 | * Make sure that internally we keep jiffies. | ||
| 6877 | * Also, writing zero resets the timeslice to default: | ||
| 6878 | */ | ||
| 6879 | if (!ret && write) { | ||
| 6880 | sched_rr_timeslice = | ||
| 6881 | sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : | ||
| 6882 | msecs_to_jiffies(sysctl_sched_rr_timeslice); | ||
| 6883 | } | ||
| 6884 | mutex_unlock(&mutex); | ||
| 6885 | return ret; | ||
| 6886 | } | ||
| 6887 | |||
| 6888 | #ifdef CONFIG_CGROUP_SCHED | ||
| 6889 | 6221 | ||
| 6890 | static inline struct task_group *css_tg(struct cgroup_subsys_state *css) | 6222 | static inline struct task_group *css_tg(struct cgroup_subsys_state *css) |
| 6891 | { | 6223 | { |
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 76877a62b5fa..29a397067ffa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c | |||
| @@ -101,9 +101,6 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, | |||
| 101 | if (sg_policy->next_freq == next_freq) | 101 | if (sg_policy->next_freq == next_freq) |
| 102 | return; | 102 | return; |
| 103 | 103 | ||
| 104 | if (sg_policy->next_freq > next_freq) | ||
| 105 | next_freq = (sg_policy->next_freq + next_freq) >> 1; | ||
| 106 | |||
| 107 | sg_policy->next_freq = next_freq; | 104 | sg_policy->next_freq = next_freq; |
| 108 | sg_policy->last_freq_update_time = time; | 105 | sg_policy->last_freq_update_time = time; |
| 109 | 106 | ||
| @@ -245,11 +242,10 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, | |||
| 245 | sugov_update_commit(sg_policy, time, next_f); | 242 | sugov_update_commit(sg_policy, time, next_f); |
| 246 | } | 243 | } |
| 247 | 244 | ||
| 248 | static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) | 245 | static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) |
| 249 | { | 246 | { |
| 250 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 247 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
| 251 | struct cpufreq_policy *policy = sg_policy->policy; | 248 | struct cpufreq_policy *policy = sg_policy->policy; |
| 252 | u64 last_freq_update_time = sg_policy->last_freq_update_time; | ||
| 253 | unsigned long util = 0, max = 1; | 249 | unsigned long util = 0, max = 1; |
| 254 | unsigned int j; | 250 | unsigned int j; |
| 255 | 251 | ||
| @@ -265,7 +261,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) | |||
| 265 | * enough, don't take the CPU into account as it probably is | 261 | * enough, don't take the CPU into account as it probably is |
| 266 | * idle now (and clear iowait_boost for it). | 262 | * idle now (and clear iowait_boost for it). |
| 267 | */ | 263 | */ |
| 268 | delta_ns = last_freq_update_time - j_sg_cpu->last_update; | 264 | delta_ns = time - j_sg_cpu->last_update; |
| 269 | if (delta_ns > TICK_NSEC) { | 265 | if (delta_ns > TICK_NSEC) { |
| 270 | j_sg_cpu->iowait_boost = 0; | 266 | j_sg_cpu->iowait_boost = 0; |
| 271 | continue; | 267 | continue; |
| @@ -309,7 +305,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, | |||
| 309 | if (flags & SCHED_CPUFREQ_RT_DL) | 305 | if (flags & SCHED_CPUFREQ_RT_DL) |
| 310 | next_f = sg_policy->policy->cpuinfo.max_freq; | 306 | next_f = sg_policy->policy->cpuinfo.max_freq; |
| 311 | else | 307 | else |
| 312 | next_f = sugov_next_freq_shared(sg_cpu); | 308 | next_f = sugov_next_freq_shared(sg_cpu, time); |
| 313 | 309 | ||
| 314 | sugov_update_commit(sg_policy, time, next_f); | 310 | sugov_update_commit(sg_policy, time, next_f); |
| 315 | } | 311 | } |
| @@ -614,6 +610,11 @@ static int sugov_start(struct cpufreq_policy *policy) | |||
| 614 | sg_cpu->sg_policy = sg_policy; | 610 | sg_cpu->sg_policy = sg_policy; |
| 615 | sg_cpu->flags = SCHED_CPUFREQ_RT; | 611 | sg_cpu->flags = SCHED_CPUFREQ_RT; |
| 616 | sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; | 612 | sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; |
| 613 | } | ||
| 614 | |||
| 615 | for_each_cpu(cpu, policy->cpus) { | ||
| 616 | struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); | ||
| 617 | |||
| 617 | cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, | 618 | cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, |
| 618 | policy_is_shared(policy) ? | 619 | policy_is_shared(policy) ? |
| 619 | sugov_update_shared : | 620 | sugov_update_shared : |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index aea3135c5d90..14d2dbf97c53 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
| @@ -611,9 +611,9 @@ static void cputime_adjust(struct task_cputime *curr, | |||
| 611 | utime = curr->utime; | 611 | utime = curr->utime; |
| 612 | 612 | ||
| 613 | /* | 613 | /* |
| 614 | * If either stime or both stime and utime are 0, assume all runtime is | 614 | * If either stime or utime are 0, assume all runtime is userspace. |
| 615 | * userspace. Once a task gets some ticks, the monotonicy code at | 615 | * Once a task gets some ticks, the monotonicy code at 'update:' |
| 616 | * 'update' will ensure things converge to the observed ratio. | 616 | * will ensure things converge to the observed ratio. |
| 617 | */ | 617 | */ |
| 618 | if (stime == 0) { | 618 | if (stime == 0) { |
| 619 | utime = rtime; | 619 | utime = rtime; |
| @@ -679,20 +679,21 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) | |||
| 679 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | 679 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ |
| 680 | 680 | ||
| 681 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 681 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
| 682 | static u64 vtime_delta(struct task_struct *tsk) | 682 | static u64 vtime_delta(struct vtime *vtime) |
| 683 | { | 683 | { |
| 684 | unsigned long now = READ_ONCE(jiffies); | 684 | unsigned long long clock; |
| 685 | 685 | ||
| 686 | if (time_before(now, (unsigned long)tsk->vtime_snap)) | 686 | clock = sched_clock(); |
| 687 | if (clock < vtime->starttime) | ||
| 687 | return 0; | 688 | return 0; |
| 688 | 689 | ||
| 689 | return jiffies_to_nsecs(now - tsk->vtime_snap); | 690 | return clock - vtime->starttime; |
| 690 | } | 691 | } |
| 691 | 692 | ||
| 692 | static u64 get_vtime_delta(struct task_struct *tsk) | 693 | static u64 get_vtime_delta(struct vtime *vtime) |
| 693 | { | 694 | { |
| 694 | unsigned long now = READ_ONCE(jiffies); | 695 | u64 delta = vtime_delta(vtime); |
| 695 | u64 delta, other; | 696 | u64 other; |
| 696 | 697 | ||
| 697 | /* | 698 | /* |
| 698 | * Unlike tick based timing, vtime based timing never has lost | 699 | * Unlike tick based timing, vtime based timing never has lost |
| @@ -701,104 +702,138 @@ static u64 get_vtime_delta(struct task_struct *tsk) | |||
| 701 | * elapsed time. Limit account_other_time to prevent rounding | 702 | * elapsed time. Limit account_other_time to prevent rounding |
| 702 | * errors from causing elapsed vtime to go negative. | 703 | * errors from causing elapsed vtime to go negative. |
| 703 | */ | 704 | */ |
| 704 | delta = jiffies_to_nsecs(now - tsk->vtime_snap); | ||
| 705 | other = account_other_time(delta); | 705 | other = account_other_time(delta); |
| 706 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); | 706 | WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); |
| 707 | tsk->vtime_snap = now; | 707 | vtime->starttime += delta; |
| 708 | 708 | ||
| 709 | return delta - other; | 709 | return delta - other; |
| 710 | } | 710 | } |
| 711 | 711 | ||
| 712 | static void __vtime_account_system(struct task_struct *tsk) | 712 | static void __vtime_account_system(struct task_struct *tsk, |
| 713 | struct vtime *vtime) | ||
| 713 | { | 714 | { |
| 714 | account_system_time(tsk, irq_count(), get_vtime_delta(tsk)); | 715 | vtime->stime += get_vtime_delta(vtime); |
| 716 | if (vtime->stime >= TICK_NSEC) { | ||
| 717 | account_system_time(tsk, irq_count(), vtime->stime); | ||
| 718 | vtime->stime = 0; | ||
| 719 | } | ||
| 720 | } | ||
| 721 | |||
| 722 | static void vtime_account_guest(struct task_struct *tsk, | ||
| 723 | struct vtime *vtime) | ||
| 724 | { | ||
| 725 | vtime->gtime += get_vtime_delta(vtime); | ||
| 726 | if (vtime->gtime >= TICK_NSEC) { | ||
| 727 | account_guest_time(tsk, vtime->gtime); | ||
| 728 | vtime->gtime = 0; | ||
| 729 | } | ||
| 715 | } | 730 | } |
| 716 | 731 | ||
| 717 | void vtime_account_system(struct task_struct *tsk) | 732 | void vtime_account_system(struct task_struct *tsk) |
| 718 | { | 733 | { |
| 719 | if (!vtime_delta(tsk)) | 734 | struct vtime *vtime = &tsk->vtime; |
| 735 | |||
| 736 | if (!vtime_delta(vtime)) | ||
| 720 | return; | 737 | return; |
| 721 | 738 | ||
| 722 | write_seqcount_begin(&tsk->vtime_seqcount); | 739 | write_seqcount_begin(&vtime->seqcount); |
| 723 | __vtime_account_system(tsk); | 740 | /* We might have scheduled out from guest path */ |
| 724 | write_seqcount_end(&tsk->vtime_seqcount); | 741 | if (current->flags & PF_VCPU) |
| 742 | vtime_account_guest(tsk, vtime); | ||
| 743 | else | ||
| 744 | __vtime_account_system(tsk, vtime); | ||
| 745 | write_seqcount_end(&vtime->seqcount); | ||
| 725 | } | 746 | } |
| 726 | 747 | ||
| 727 | void vtime_account_user(struct task_struct *tsk) | 748 | void vtime_user_enter(struct task_struct *tsk) |
| 728 | { | 749 | { |
| 729 | write_seqcount_begin(&tsk->vtime_seqcount); | 750 | struct vtime *vtime = &tsk->vtime; |
| 730 | tsk->vtime_snap_whence = VTIME_SYS; | 751 | |
| 731 | if (vtime_delta(tsk)) | 752 | write_seqcount_begin(&vtime->seqcount); |
| 732 | account_user_time(tsk, get_vtime_delta(tsk)); | 753 | __vtime_account_system(tsk, vtime); |
| 733 | write_seqcount_end(&tsk->vtime_seqcount); | 754 | vtime->state = VTIME_USER; |
| 755 | write_seqcount_end(&vtime->seqcount); | ||
| 734 | } | 756 | } |
| 735 | 757 | ||
| 736 | void vtime_user_enter(struct task_struct *tsk) | 758 | void vtime_user_exit(struct task_struct *tsk) |
| 737 | { | 759 | { |
| 738 | write_seqcount_begin(&tsk->vtime_seqcount); | 760 | struct vtime *vtime = &tsk->vtime; |
| 739 | if (vtime_delta(tsk)) | 761 | |
| 740 | __vtime_account_system(tsk); | 762 | write_seqcount_begin(&vtime->seqcount); |
| 741 | tsk->vtime_snap_whence = VTIME_USER; | 763 | vtime->utime += get_vtime_delta(vtime); |
| 742 | write_seqcount_end(&tsk->vtime_seqcount); | 764 | if (vtime->utime >= TICK_NSEC) { |
| 765 | account_user_time(tsk, vtime->utime); | ||
| 766 | vtime->utime = 0; | ||
| 767 | } | ||
| 768 | vtime->state = VTIME_SYS; | ||
| 769 | write_seqcount_end(&vtime->seqcount); | ||
| 743 | } | 770 | } |
| 744 | 771 | ||
| 745 | void vtime_guest_enter(struct task_struct *tsk) | 772 | void vtime_guest_enter(struct task_struct *tsk) |
| 746 | { | 773 | { |
| 774 | struct vtime *vtime = &tsk->vtime; | ||
| 747 | /* | 775 | /* |
| 748 | * The flags must be updated under the lock with | 776 | * The flags must be updated under the lock with |
| 749 | * the vtime_snap flush and update. | 777 | * the vtime_starttime flush and update. |
| 750 | * That enforces a right ordering and update sequence | 778 | * That enforces a right ordering and update sequence |
| 751 | * synchronization against the reader (task_gtime()) | 779 | * synchronization against the reader (task_gtime()) |
| 752 | * that can thus safely catch up with a tickless delta. | 780 | * that can thus safely catch up with a tickless delta. |
| 753 | */ | 781 | */ |
| 754 | write_seqcount_begin(&tsk->vtime_seqcount); | 782 | write_seqcount_begin(&vtime->seqcount); |
| 755 | if (vtime_delta(tsk)) | 783 | __vtime_account_system(tsk, vtime); |
| 756 | __vtime_account_system(tsk); | ||
| 757 | current->flags |= PF_VCPU; | 784 | current->flags |= PF_VCPU; |
| 758 | write_seqcount_end(&tsk->vtime_seqcount); | 785 | write_seqcount_end(&vtime->seqcount); |
| 759 | } | 786 | } |
| 760 | EXPORT_SYMBOL_GPL(vtime_guest_enter); | 787 | EXPORT_SYMBOL_GPL(vtime_guest_enter); |
| 761 | 788 | ||
| 762 | void vtime_guest_exit(struct task_struct *tsk) | 789 | void vtime_guest_exit(struct task_struct *tsk) |
| 763 | { | 790 | { |
| 764 | write_seqcount_begin(&tsk->vtime_seqcount); | 791 | struct vtime *vtime = &tsk->vtime; |
| 765 | __vtime_account_system(tsk); | 792 | |
| 793 | write_seqcount_begin(&vtime->seqcount); | ||
| 794 | vtime_account_guest(tsk, vtime); | ||
| 766 | current->flags &= ~PF_VCPU; | 795 | current->flags &= ~PF_VCPU; |
| 767 | write_seqcount_end(&tsk->vtime_seqcount); | 796 | write_seqcount_end(&vtime->seqcount); |
| 768 | } | 797 | } |
| 769 | EXPORT_SYMBOL_GPL(vtime_guest_exit); | 798 | EXPORT_SYMBOL_GPL(vtime_guest_exit); |
| 770 | 799 | ||
| 771 | void vtime_account_idle(struct task_struct *tsk) | 800 | void vtime_account_idle(struct task_struct *tsk) |
| 772 | { | 801 | { |
| 773 | account_idle_time(get_vtime_delta(tsk)); | 802 | account_idle_time(get_vtime_delta(&tsk->vtime)); |
| 774 | } | 803 | } |
| 775 | 804 | ||
| 776 | void arch_vtime_task_switch(struct task_struct *prev) | 805 | void arch_vtime_task_switch(struct task_struct *prev) |
| 777 | { | 806 | { |
| 778 | write_seqcount_begin(&prev->vtime_seqcount); | 807 | struct vtime *vtime = &prev->vtime; |
| 779 | prev->vtime_snap_whence = VTIME_INACTIVE; | ||
| 780 | write_seqcount_end(&prev->vtime_seqcount); | ||
| 781 | 808 | ||
| 782 | write_seqcount_begin(¤t->vtime_seqcount); | 809 | write_seqcount_begin(&vtime->seqcount); |
| 783 | current->vtime_snap_whence = VTIME_SYS; | 810 | vtime->state = VTIME_INACTIVE; |
| 784 | current->vtime_snap = jiffies; | 811 | write_seqcount_end(&vtime->seqcount); |
| 785 | write_seqcount_end(¤t->vtime_seqcount); | 812 | |
| 813 | vtime = ¤t->vtime; | ||
| 814 | |||
| 815 | write_seqcount_begin(&vtime->seqcount); | ||
| 816 | vtime->state = VTIME_SYS; | ||
| 817 | vtime->starttime = sched_clock(); | ||
| 818 | write_seqcount_end(&vtime->seqcount); | ||
| 786 | } | 819 | } |
| 787 | 820 | ||
| 788 | void vtime_init_idle(struct task_struct *t, int cpu) | 821 | void vtime_init_idle(struct task_struct *t, int cpu) |
| 789 | { | 822 | { |
| 823 | struct vtime *vtime = &t->vtime; | ||
| 790 | unsigned long flags; | 824 | unsigned long flags; |
| 791 | 825 | ||
| 792 | local_irq_save(flags); | 826 | local_irq_save(flags); |
| 793 | write_seqcount_begin(&t->vtime_seqcount); | 827 | write_seqcount_begin(&vtime->seqcount); |
| 794 | t->vtime_snap_whence = VTIME_SYS; | 828 | vtime->state = VTIME_SYS; |
| 795 | t->vtime_snap = jiffies; | 829 | vtime->starttime = sched_clock(); |
| 796 | write_seqcount_end(&t->vtime_seqcount); | 830 | write_seqcount_end(&vtime->seqcount); |
| 797 | local_irq_restore(flags); | 831 | local_irq_restore(flags); |
| 798 | } | 832 | } |
| 799 | 833 | ||
| 800 | u64 task_gtime(struct task_struct *t) | 834 | u64 task_gtime(struct task_struct *t) |
| 801 | { | 835 | { |
| 836 | struct vtime *vtime = &t->vtime; | ||
| 802 | unsigned int seq; | 837 | unsigned int seq; |
| 803 | u64 gtime; | 838 | u64 gtime; |
| 804 | 839 | ||
| @@ -806,13 +841,13 @@ u64 task_gtime(struct task_struct *t) | |||
| 806 | return t->gtime; | 841 | return t->gtime; |
| 807 | 842 | ||
| 808 | do { | 843 | do { |
| 809 | seq = read_seqcount_begin(&t->vtime_seqcount); | 844 | seq = read_seqcount_begin(&vtime->seqcount); |
| 810 | 845 | ||
| 811 | gtime = t->gtime; | 846 | gtime = t->gtime; |
| 812 | if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) | 847 | if (vtime->state == VTIME_SYS && t->flags & PF_VCPU) |
| 813 | gtime += vtime_delta(t); | 848 | gtime += vtime->gtime + vtime_delta(vtime); |
| 814 | 849 | ||
| 815 | } while (read_seqcount_retry(&t->vtime_seqcount, seq)); | 850 | } while (read_seqcount_retry(&vtime->seqcount, seq)); |
| 816 | 851 | ||
| 817 | return gtime; | 852 | return gtime; |
| 818 | } | 853 | } |
| @@ -824,8 +859,9 @@ u64 task_gtime(struct task_struct *t) | |||
| 824 | */ | 859 | */ |
| 825 | void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) | 860 | void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) |
| 826 | { | 861 | { |
| 827 | u64 delta; | 862 | struct vtime *vtime = &t->vtime; |
| 828 | unsigned int seq; | 863 | unsigned int seq; |
| 864 | u64 delta; | ||
| 829 | 865 | ||
| 830 | if (!vtime_accounting_enabled()) { | 866 | if (!vtime_accounting_enabled()) { |
| 831 | *utime = t->utime; | 867 | *utime = t->utime; |
| @@ -834,25 +870,25 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) | |||
| 834 | } | 870 | } |
| 835 | 871 | ||
| 836 | do { | 872 | do { |
| 837 | seq = read_seqcount_begin(&t->vtime_seqcount); | 873 | seq = read_seqcount_begin(&vtime->seqcount); |
| 838 | 874 | ||
| 839 | *utime = t->utime; | 875 | *utime = t->utime; |
| 840 | *stime = t->stime; | 876 | *stime = t->stime; |
| 841 | 877 | ||
| 842 | /* Task is sleeping, nothing to add */ | 878 | /* Task is sleeping, nothing to add */ |
| 843 | if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) | 879 | if (vtime->state == VTIME_INACTIVE || is_idle_task(t)) |
| 844 | continue; | 880 | continue; |
| 845 | 881 | ||
| 846 | delta = vtime_delta(t); | 882 | delta = vtime_delta(vtime); |
| 847 | 883 | ||
| 848 | /* | 884 | /* |
| 849 | * Task runs either in user or kernel space, add pending nohz time to | 885 | * Task runs either in user or kernel space, add pending nohz time to |
| 850 | * the right place. | 886 | * the right place. |
| 851 | */ | 887 | */ |
| 852 | if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) | 888 | if (vtime->state == VTIME_USER || t->flags & PF_VCPU) |
| 853 | *utime += delta; | 889 | *utime += vtime->utime + delta; |
| 854 | else if (t->vtime_snap_whence == VTIME_SYS) | 890 | else if (vtime->state == VTIME_SYS) |
| 855 | *stime += delta; | 891 | *stime += vtime->stime + delta; |
| 856 | } while (read_seqcount_retry(&t->vtime_seqcount, seq)); | 892 | } while (read_seqcount_retry(&vtime->seqcount, seq)); |
| 857 | } | 893 | } |
| 858 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ | 894 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a2ce59015642..755bd3f1a1a9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
| @@ -17,6 +17,7 @@ | |||
| 17 | #include "sched.h" | 17 | #include "sched.h" |
| 18 | 18 | ||
| 19 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
| 20 | #include <uapi/linux/sched/types.h> | ||
| 20 | 21 | ||
| 21 | struct dl_bandwidth def_dl_bandwidth; | 22 | struct dl_bandwidth def_dl_bandwidth; |
| 22 | 23 | ||
| @@ -43,6 +44,254 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) | |||
| 43 | return !RB_EMPTY_NODE(&dl_se->rb_node); | 44 | return !RB_EMPTY_NODE(&dl_se->rb_node); |
| 44 | } | 45 | } |
| 45 | 46 | ||
| 47 | #ifdef CONFIG_SMP | ||
| 48 | static inline struct dl_bw *dl_bw_of(int i) | ||
| 49 | { | ||
| 50 | RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), | ||
| 51 | "sched RCU must be held"); | ||
| 52 | return &cpu_rq(i)->rd->dl_bw; | ||
| 53 | } | ||
| 54 | |||
| 55 | static inline int dl_bw_cpus(int i) | ||
| 56 | { | ||
| 57 | struct root_domain *rd = cpu_rq(i)->rd; | ||
| 58 | int cpus = 0; | ||
| 59 | |||
| 60 | RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), | ||
| 61 | "sched RCU must be held"); | ||
| 62 | for_each_cpu_and(i, rd->span, cpu_active_mask) | ||
| 63 | cpus++; | ||
| 64 | |||
| 65 | return cpus; | ||
| 66 | } | ||
| 67 | #else | ||
| 68 | static inline struct dl_bw *dl_bw_of(int i) | ||
| 69 | { | ||
| 70 | return &cpu_rq(i)->dl.dl_bw; | ||
| 71 | } | ||
| 72 | |||
| 73 | static inline int dl_bw_cpus(int i) | ||
| 74 | { | ||
| 75 | return 1; | ||
| 76 | } | ||
| 77 | #endif | ||
| 78 | |||
| 79 | static inline | ||
| 80 | void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) | ||
| 81 | { | ||
| 82 | u64 old = dl_rq->running_bw; | ||
| 83 | |||
| 84 | lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); | ||
| 85 | dl_rq->running_bw += dl_bw; | ||
| 86 | SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ | ||
| 87 | SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); | ||
| 88 | } | ||
| 89 | |||
| 90 | static inline | ||
| 91 | void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) | ||
| 92 | { | ||
| 93 | u64 old = dl_rq->running_bw; | ||
| 94 | |||
| 95 | lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); | ||
| 96 | dl_rq->running_bw -= dl_bw; | ||
| 97 | SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ | ||
| 98 | if (dl_rq->running_bw > old) | ||
| 99 | dl_rq->running_bw = 0; | ||
| 100 | } | ||
| 101 | |||
| 102 | static inline | ||
| 103 | void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) | ||
| 104 | { | ||
| 105 | u64 old = dl_rq->this_bw; | ||
| 106 | |||
| 107 | lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); | ||
| 108 | dl_rq->this_bw += dl_bw; | ||
| 109 | SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ | ||
| 110 | } | ||
| 111 | |||
| 112 | static inline | ||
| 113 | void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) | ||
| 114 | { | ||
| 115 | u64 old = dl_rq->this_bw; | ||
| 116 | |||
| 117 | lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock); | ||
| 118 | dl_rq->this_bw -= dl_bw; | ||
| 119 | SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ | ||
| 120 | if (dl_rq->this_bw > old) | ||
| 121 | dl_rq->this_bw = 0; | ||
| 122 | SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); | ||
| 123 | } | ||
| 124 | |||
| 125 | void dl_change_utilization(struct task_struct *p, u64 new_bw) | ||
| 126 | { | ||
| 127 | struct rq *rq; | ||
| 128 | |||
| 129 | if (task_on_rq_queued(p)) | ||
| 130 | return; | ||
| 131 | |||
| 132 | rq = task_rq(p); | ||
| 133 | if (p->dl.dl_non_contending) { | ||
| 134 | sub_running_bw(p->dl.dl_bw, &rq->dl); | ||
| 135 | p->dl.dl_non_contending = 0; | ||
| 136 | /* | ||
| 137 | * If the timer handler is currently running and the | ||
| 138 | * timer cannot be cancelled, inactive_task_timer() | ||
| 139 | * will see that dl_not_contending is not set, and | ||
| 140 | * will not touch the rq's active utilization, | ||
| 141 | * so we are still safe. | ||
| 142 | */ | ||
| 143 | if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) | ||
| 144 | put_task_struct(p); | ||
| 145 | } | ||
| 146 | sub_rq_bw(p->dl.dl_bw, &rq->dl); | ||
| 147 | add_rq_bw(new_bw, &rq->dl); | ||
| 148 | } | ||
| 149 | |||
| 150 | /* | ||
| 151 | * The utilization of a task cannot be immediately removed from | ||
| 152 | * the rq active utilization (running_bw) when the task blocks. | ||
| 153 | * Instead, we have to wait for the so called "0-lag time". | ||
| 154 | * | ||
| 155 | * If a task blocks before the "0-lag time", a timer (the inactive | ||
| 156 | * timer) is armed, and running_bw is decreased when the timer | ||
| 157 | * fires. | ||
| 158 | * | ||
| 159 | * If the task wakes up again before the inactive timer fires, | ||
| 160 | * the timer is cancelled, whereas if the task wakes up after the | ||
| 161 | * inactive timer fired (and running_bw has been decreased) the | ||
| 162 | * task's utilization has to be added to running_bw again. | ||
| 163 | * A flag in the deadline scheduling entity (dl_non_contending) | ||
| 164 | * is used to avoid race conditions between the inactive timer handler | ||
| 165 | * and task wakeups. | ||
| 166 | * | ||
| 167 | * The following diagram shows how running_bw is updated. A task is | ||
| 168 | * "ACTIVE" when its utilization contributes to running_bw; an | ||
| 169 | * "ACTIVE contending" task is in the TASK_RUNNING state, while an | ||
| 170 | * "ACTIVE non contending" task is a blocked task for which the "0-lag time" | ||
| 171 | * has not passed yet. An "INACTIVE" task is a task for which the "0-lag" | ||
| 172 | * time already passed, which does not contribute to running_bw anymore. | ||
| 173 | * +------------------+ | ||
| 174 | * wakeup | ACTIVE | | ||
| 175 | * +------------------>+ contending | | ||
| 176 | * | add_running_bw | | | ||
| 177 | * | +----+------+------+ | ||
| 178 | * | | ^ | ||
| 179 | * | dequeue | | | ||
| 180 | * +--------+-------+ | | | ||
| 181 | * | | t >= 0-lag | | wakeup | ||
| 182 | * | INACTIVE |<---------------+ | | ||
| 183 | * | | sub_running_bw | | | ||
| 184 | * +--------+-------+ | | | ||
| 185 | * ^ | | | ||
| 186 | * | t < 0-lag | | | ||
| 187 | * | | | | ||
| 188 | * | V | | ||
| 189 | * | +----+------+------+ | ||
| 190 | * | sub_running_bw | ACTIVE | | ||
| 191 | * +-------------------+ | | ||
| 192 | * inactive timer | non contending | | ||
| 193 | * fired +------------------+ | ||
| 194 | * | ||
| 195 | * The task_non_contending() function is invoked when a task | ||
| 196 | * blocks, and checks if the 0-lag time already passed or | ||
| 197 | * not (in the first case, it directly updates running_bw; | ||
| 198 | * in the second case, it arms the inactive timer). | ||
| 199 | * | ||
| 200 | * The task_contending() function is invoked when a task wakes | ||
| 201 | * up, and checks if the task is still in the "ACTIVE non contending" | ||
| 202 | * state or not (in the second case, it updates running_bw). | ||
| 203 | */ | ||
| 204 | static void task_non_contending(struct task_struct *p) | ||
| 205 | { | ||
| 206 | struct sched_dl_entity *dl_se = &p->dl; | ||
| 207 | struct hrtimer *timer = &dl_se->inactive_timer; | ||
| 208 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
| 209 | struct rq *rq = rq_of_dl_rq(dl_rq); | ||
| 210 | s64 zerolag_time; | ||
| 211 | |||
| 212 | /* | ||
| 213 | * If this is a non-deadline task that has been boosted, | ||
| 214 | * do nothing | ||
| 215 | */ | ||
| 216 | if (dl_se->dl_runtime == 0) | ||
| 217 | return; | ||
| 218 | |||
| 219 | WARN_ON(hrtimer_active(&dl_se->inactive_timer)); | ||
| 220 | WARN_ON(dl_se->dl_non_contending); | ||
| 221 | |||
| 222 | zerolag_time = dl_se->deadline - | ||
| 223 | div64_long((dl_se->runtime * dl_se->dl_period), | ||
| 224 | dl_se->dl_runtime); | ||
| 225 | |||
| 226 | /* | ||
| 227 | * Using relative times instead of the absolute "0-lag time" | ||
| 228 | * allows to simplify the code | ||
| 229 | */ | ||
| 230 | zerolag_time -= rq_clock(rq); | ||
| 231 | |||
| 232 | /* | ||
| 233 | * If the "0-lag time" already passed, decrease the active | ||
| 234 | * utilization now, instead of starting a timer | ||
| 235 | */ | ||
| 236 | if (zerolag_time < 0) { | ||
| 237 | if (dl_task(p)) | ||
| 238 | sub_running_bw(dl_se->dl_bw, dl_rq); | ||
| 239 | if (!dl_task(p) || p->state == TASK_DEAD) { | ||
| 240 | struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); | ||
| 241 | |||
| 242 | if (p->state == TASK_DEAD) | ||
| 243 | sub_rq_bw(p->dl.dl_bw, &rq->dl); | ||
| 244 | raw_spin_lock(&dl_b->lock); | ||
| 245 | __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); | ||
| 246 | __dl_clear_params(p); | ||
| 247 | raw_spin_unlock(&dl_b->lock); | ||
| 248 | } | ||
| 249 | |||
| 250 | return; | ||
| 251 | } | ||
| 252 | |||
| 253 | dl_se->dl_non_contending = 1; | ||
| 254 | get_task_struct(p); | ||
| 255 | hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL); | ||
| 256 | } | ||
| 257 | |||
| 258 | static void task_contending(struct sched_dl_entity *dl_se, int flags) | ||
| 259 | { | ||
| 260 | struct dl_rq *dl_rq = dl_rq_of_se(dl_se); | ||
| 261 | |||
| 262 | /* | ||
| 263 | * If this is a non-deadline task that has been boosted, | ||
| 264 | * do nothing | ||
| 265 | */ | ||
| 266 | if (dl_se->dl_runtime == 0) | ||
| 267 | return; | ||
| 268 | |||
| 269 | if (flags & ENQUEUE_MIGRATED) | ||
| 270 | add_rq_bw(dl_se->dl_bw, dl_rq); | ||
| 271 | |||
| 272 | if (dl_se->dl_non_contending) { | ||
| 273 | dl_se->dl_non_contending = 0; | ||
| 274 | /* | ||
| 275 | * If the timer handler is currently running and the | ||
| 276 | * timer cannot be cancelled, inactive_task_timer() | ||
| 277 | * will see that dl_not_contending is not set, and | ||
| 278 | * will not touch the rq's active utilization, | ||
| 279 | * so we are still safe. | ||
| 280 | */ | ||
| 281 | if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) | ||
| 282 | put_task_struct(dl_task_of(dl_se)); | ||
| 283 | } else { | ||
| 284 | /* | ||
| 285 | * Since "dl_non_contending" is not set, the | ||
| 286 | * task's utilization has already been removed from | ||
| 287 | * active utilization (either when the task blocked, | ||
| 288 | * when the "inactive timer" fired). | ||
| 289 | * So, add it back. | ||
| 290 | */ | ||
| 291 | add_running_bw(dl_se->dl_bw, dl_rq); | ||
| 292 | } | ||
| 293 | } | ||
| 294 | |||
| 46 | static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) | 295 | static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) |
| 47 | { | 296 | { |
| 48 | struct sched_dl_entity *dl_se = &p->dl; | 297 | struct sched_dl_entity *dl_se = &p->dl; |
| @@ -83,6 +332,10 @@ void init_dl_rq(struct dl_rq *dl_rq) | |||
| 83 | #else | 332 | #else |
| 84 | init_dl_bw(&dl_rq->dl_bw); | 333 | init_dl_bw(&dl_rq->dl_bw); |
| 85 | #endif | 334 | #endif |
| 335 | |||
| 336 | dl_rq->running_bw = 0; | ||
| 337 | dl_rq->this_bw = 0; | ||
| 338 | init_dl_rq_bw_ratio(dl_rq); | ||
| 86 | } | 339 | } |
| 87 | 340 | ||
| 88 | #ifdef CONFIG_SMP | 341 | #ifdef CONFIG_SMP |
| @@ -484,13 +737,84 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, | |||
| 484 | } | 737 | } |
| 485 | 738 | ||
| 486 | /* | 739 | /* |
| 487 | * When a -deadline entity is queued back on the runqueue, its runtime and | 740 | * Revised wakeup rule [1]: For self-suspending tasks, rather then |
| 488 | * deadline might need updating. | 741 | * re-initializing task's runtime and deadline, the revised wakeup |
| 742 | * rule adjusts the task's runtime to avoid the task to overrun its | ||
| 743 | * density. | ||
| 744 | * | ||
| 745 | * Reasoning: a task may overrun the density if: | ||
| 746 | * runtime / (deadline - t) > dl_runtime / dl_deadline | ||
| 747 | * | ||
| 748 | * Therefore, runtime can be adjusted to: | ||
| 749 | * runtime = (dl_runtime / dl_deadline) * (deadline - t) | ||
| 750 | * | ||
| 751 | * In such way that runtime will be equal to the maximum density | ||
| 752 | * the task can use without breaking any rule. | ||
| 753 | * | ||
| 754 | * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant | ||
| 755 | * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24. | ||
| 756 | */ | ||
| 757 | static void | ||
| 758 | update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq) | ||
| 759 | { | ||
| 760 | u64 laxity = dl_se->deadline - rq_clock(rq); | ||
| 761 | |||
| 762 | /* | ||
| 763 | * If the task has deadline < period, and the deadline is in the past, | ||
| 764 | * it should already be throttled before this check. | ||
| 765 | * | ||
| 766 | * See update_dl_entity() comments for further details. | ||
| 767 | */ | ||
| 768 | WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq))); | ||
| 769 | |||
| 770 | dl_se->runtime = (dl_se->dl_density * laxity) >> BW_SHIFT; | ||
| 771 | } | ||
| 772 | |||
| 773 | /* | ||
| 774 | * Regarding the deadline, a task with implicit deadline has a relative | ||
| 775 | * deadline == relative period. A task with constrained deadline has a | ||
| 776 | * relative deadline <= relative period. | ||
| 777 | * | ||
| 778 | * We support constrained deadline tasks. However, there are some restrictions | ||
| 779 | * applied only for tasks which do not have an implicit deadline. See | ||
| 780 | * update_dl_entity() to know more about such restrictions. | ||
| 781 | * | ||
| 782 | * The dl_is_implicit() returns true if the task has an implicit deadline. | ||
| 783 | */ | ||
| 784 | static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) | ||
| 785 | { | ||
| 786 | return dl_se->dl_deadline == dl_se->dl_period; | ||
| 787 | } | ||
| 788 | |||
| 789 | /* | ||
| 790 | * When a deadline entity is placed in the runqueue, its runtime and deadline | ||
| 791 | * might need to be updated. This is done by a CBS wake up rule. There are two | ||
| 792 | * different rules: 1) the original CBS; and 2) the Revisited CBS. | ||
| 793 | * | ||
| 794 | * When the task is starting a new period, the Original CBS is used. In this | ||
| 795 | * case, the runtime is replenished and a new absolute deadline is set. | ||
| 796 | * | ||
| 797 | * When a task is queued before the begin of the next period, using the | ||
| 798 | * remaining runtime and deadline could make the entity to overflow, see | ||
| 799 | * dl_entity_overflow() to find more about runtime overflow. When such case | ||
| 800 | * is detected, the runtime and deadline need to be updated. | ||
| 801 | * | ||
| 802 | * If the task has an implicit deadline, i.e., deadline == period, the Original | ||
| 803 | * CBS is applied. the runtime is replenished and a new absolute deadline is | ||
| 804 | * set, as in the previous cases. | ||
| 805 | * | ||
| 806 | * However, the Original CBS does not work properly for tasks with | ||
| 807 | * deadline < period, which are said to have a constrained deadline. By | ||
| 808 | * applying the Original CBS, a constrained deadline task would be able to run | ||
| 809 | * runtime/deadline in a period. With deadline < period, the task would | ||
| 810 | * overrun the runtime/period allowed bandwidth, breaking the admission test. | ||
| 489 | * | 811 | * |
| 490 | * The policy here is that we update the deadline of the entity only if: | 812 | * In order to prevent this misbehave, the Revisited CBS is used for |
| 491 | * - the current deadline is in the past, | 813 | * constrained deadline tasks when a runtime overflow is detected. In the |
| 492 | * - using the remaining runtime with the current deadline would make | 814 | * Revisited CBS, rather than replenishing & setting a new absolute deadline, |
| 493 | * the entity exceed its bandwidth. | 815 | * the remaining runtime of the task is reduced to avoid runtime overflow. |
| 816 | * Please refer to the comments update_dl_revised_wakeup() function to find | ||
| 817 | * more about the Revised CBS rule. | ||
| 494 | */ | 818 | */ |
| 495 | static void update_dl_entity(struct sched_dl_entity *dl_se, | 819 | static void update_dl_entity(struct sched_dl_entity *dl_se, |
| 496 | struct sched_dl_entity *pi_se) | 820 | struct sched_dl_entity *pi_se) |
| @@ -500,6 +824,14 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, | |||
| 500 | 824 | ||
| 501 | if (dl_time_before(dl_se->deadline, rq_clock(rq)) || | 825 | if (dl_time_before(dl_se->deadline, rq_clock(rq)) || |
| 502 | dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { | 826 | dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { |
| 827 | |||
| 828 | if (unlikely(!dl_is_implicit(dl_se) && | ||
| 829 | !dl_time_before(dl_se->deadline, rq_clock(rq)) && | ||
| 830 | !dl_se->dl_boosted)){ | ||
| 831 | update_dl_revised_wakeup(dl_se, rq); | ||
| 832 | return; | ||
| 833 | } | ||
| 834 | |||
| 503 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | 835 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; |
| 504 | dl_se->runtime = pi_se->dl_runtime; | 836 | dl_se->runtime = pi_se->dl_runtime; |
| 505 | } | 837 | } |
| @@ -593,10 +925,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 593 | * The task might have changed its scheduling policy to something | 925 | * The task might have changed its scheduling policy to something |
| 594 | * different than SCHED_DEADLINE (through switched_from_dl()). | 926 | * different than SCHED_DEADLINE (through switched_from_dl()). |
| 595 | */ | 927 | */ |
| 596 | if (!dl_task(p)) { | 928 | if (!dl_task(p)) |
| 597 | __dl_clear_params(p); | ||
| 598 | goto unlock; | 929 | goto unlock; |
| 599 | } | ||
| 600 | 930 | ||
| 601 | /* | 931 | /* |
| 602 | * The task might have been boosted by someone else and might be in the | 932 | * The task might have been boosted by someone else and might be in the |
| @@ -723,6 +1053,8 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) | |||
| 723 | if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) | 1053 | if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) |
| 724 | return; | 1054 | return; |
| 725 | dl_se->dl_throttled = 1; | 1055 | dl_se->dl_throttled = 1; |
| 1056 | if (dl_se->runtime > 0) | ||
| 1057 | dl_se->runtime = 0; | ||
| 726 | } | 1058 | } |
| 727 | } | 1059 | } |
| 728 | 1060 | ||
| @@ -735,6 +1067,47 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) | |||
| 735 | extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); | 1067 | extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); |
| 736 | 1068 | ||
| 737 | /* | 1069 | /* |
| 1070 | * This function implements the GRUB accounting rule: | ||
| 1071 | * according to the GRUB reclaiming algorithm, the runtime is | ||
| 1072 | * not decreased as "dq = -dt", but as | ||
| 1073 | * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt", | ||
| 1074 | * where u is the utilization of the task, Umax is the maximum reclaimable | ||
| 1075 | * utilization, Uinact is the (per-runqueue) inactive utilization, computed | ||
| 1076 | * as the difference between the "total runqueue utilization" and the | ||
| 1077 | * runqueue active utilization, and Uextra is the (per runqueue) extra | ||
| 1078 | * reclaimable utilization. | ||
| 1079 | * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations | ||
| 1080 | * multiplied by 2^BW_SHIFT, the result has to be shifted right by | ||
| 1081 | * BW_SHIFT. | ||
| 1082 | * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT, | ||
| 1083 | * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT. | ||
| 1084 | * Since delta is a 64 bit variable, to have an overflow its value | ||
| 1085 | * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. | ||
| 1086 | * So, overflow is not an issue here. | ||
| 1087 | */ | ||
| 1088 | u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) | ||
| 1089 | { | ||
| 1090 | u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ | ||
| 1091 | u64 u_act; | ||
| 1092 | u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT; | ||
| 1093 | |||
| 1094 | /* | ||
| 1095 | * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)}, | ||
| 1096 | * we compare u_inact + rq->dl.extra_bw with | ||
| 1097 | * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because | ||
| 1098 | * u_inact + rq->dl.extra_bw can be larger than | ||
| 1099 | * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative | ||
| 1100 | * leading to wrong results) | ||
| 1101 | */ | ||
| 1102 | if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min) | ||
| 1103 | u_act = u_act_min; | ||
| 1104 | else | ||
| 1105 | u_act = BW_UNIT - u_inact - rq->dl.extra_bw; | ||
| 1106 | |||
| 1107 | return (delta * u_act) >> BW_SHIFT; | ||
| 1108 | } | ||
| 1109 | |||
| 1110 | /* | ||
| 738 | * Update the current task's runtime statistics (provided it is still | 1111 | * Update the current task's runtime statistics (provided it is still |
| 739 | * a -deadline task and has not been removed from the dl_rq). | 1112 | * a -deadline task and has not been removed from the dl_rq). |
| 740 | */ | 1113 | */ |
| @@ -776,6 +1149,8 @@ static void update_curr_dl(struct rq *rq) | |||
| 776 | 1149 | ||
| 777 | sched_rt_avg_update(rq, delta_exec); | 1150 | sched_rt_avg_update(rq, delta_exec); |
| 778 | 1151 | ||
| 1152 | if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) | ||
| 1153 | delta_exec = grub_reclaim(delta_exec, rq, &curr->dl); | ||
| 779 | dl_se->runtime -= delta_exec; | 1154 | dl_se->runtime -= delta_exec; |
| 780 | 1155 | ||
| 781 | throttle: | 1156 | throttle: |
| @@ -815,6 +1190,56 @@ throttle: | |||
| 815 | } | 1190 | } |
| 816 | } | 1191 | } |
| 817 | 1192 | ||
| 1193 | static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) | ||
| 1194 | { | ||
| 1195 | struct sched_dl_entity *dl_se = container_of(timer, | ||
| 1196 | struct sched_dl_entity, | ||
| 1197 | inactive_timer); | ||
| 1198 | struct task_struct *p = dl_task_of(dl_se); | ||
| 1199 | struct rq_flags rf; | ||
| 1200 | struct rq *rq; | ||
| 1201 | |||
| 1202 | rq = task_rq_lock(p, &rf); | ||
| 1203 | |||
| 1204 | if (!dl_task(p) || p->state == TASK_DEAD) { | ||
| 1205 | struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); | ||
| 1206 | |||
| 1207 | if (p->state == TASK_DEAD && dl_se->dl_non_contending) { | ||
| 1208 | sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); | ||
| 1209 | sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); | ||
| 1210 | dl_se->dl_non_contending = 0; | ||
| 1211 | } | ||
| 1212 | |||
| 1213 | raw_spin_lock(&dl_b->lock); | ||
| 1214 | __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); | ||
| 1215 | raw_spin_unlock(&dl_b->lock); | ||
| 1216 | __dl_clear_params(p); | ||
| 1217 | |||
| 1218 | goto unlock; | ||
| 1219 | } | ||
| 1220 | if (dl_se->dl_non_contending == 0) | ||
| 1221 | goto unlock; | ||
| 1222 | |||
| 1223 | sched_clock_tick(); | ||
| 1224 | update_rq_clock(rq); | ||
| 1225 | |||
| 1226 | sub_running_bw(dl_se->dl_bw, &rq->dl); | ||
| 1227 | dl_se->dl_non_contending = 0; | ||
| 1228 | unlock: | ||
| 1229 | task_rq_unlock(rq, p, &rf); | ||
| 1230 | put_task_struct(p); | ||
| 1231 | |||
| 1232 | return HRTIMER_NORESTART; | ||
| 1233 | } | ||
| 1234 | |||
| 1235 | void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) | ||
| 1236 | { | ||
| 1237 | struct hrtimer *timer = &dl_se->inactive_timer; | ||
| 1238 | |||
| 1239 | hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 1240 | timer->function = inactive_task_timer; | ||
| 1241 | } | ||
| 1242 | |||
| 818 | #ifdef CONFIG_SMP | 1243 | #ifdef CONFIG_SMP |
| 819 | 1244 | ||
| 820 | static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) | 1245 | static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) |
| @@ -946,10 +1371,12 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, | |||
| 946 | * parameters of the task might need updating. Otherwise, | 1371 | * parameters of the task might need updating. Otherwise, |
| 947 | * we want a replenishment of its runtime. | 1372 | * we want a replenishment of its runtime. |
| 948 | */ | 1373 | */ |
| 949 | if (flags & ENQUEUE_WAKEUP) | 1374 | if (flags & ENQUEUE_WAKEUP) { |
| 1375 | task_contending(dl_se, flags); | ||
| 950 | update_dl_entity(dl_se, pi_se); | 1376 | update_dl_entity(dl_se, pi_se); |
| 951 | else if (flags & ENQUEUE_REPLENISH) | 1377 | } else if (flags & ENQUEUE_REPLENISH) { |
| 952 | replenish_dl_entity(dl_se, pi_se); | 1378 | replenish_dl_entity(dl_se, pi_se); |
| 1379 | } | ||
| 953 | 1380 | ||
| 954 | __enqueue_dl_entity(dl_se); | 1381 | __enqueue_dl_entity(dl_se); |
| 955 | } | 1382 | } |
| @@ -959,28 +1386,25 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) | |||
| 959 | __dequeue_dl_entity(dl_se); | 1386 | __dequeue_dl_entity(dl_se); |
| 960 | } | 1387 | } |
| 961 | 1388 | ||
| 962 | static inline bool dl_is_constrained(struct sched_dl_entity *dl_se) | ||
| 963 | { | ||
| 964 | return dl_se->dl_deadline < dl_se->dl_period; | ||
| 965 | } | ||
| 966 | |||
| 967 | static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) | 1389 | static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) |
| 968 | { | 1390 | { |
| 969 | struct task_struct *pi_task = rt_mutex_get_top_task(p); | 1391 | struct task_struct *pi_task = rt_mutex_get_top_task(p); |
| 970 | struct sched_dl_entity *pi_se = &p->dl; | 1392 | struct sched_dl_entity *pi_se = &p->dl; |
| 971 | 1393 | ||
| 972 | /* | 1394 | /* |
| 973 | * Use the scheduling parameters of the top pi-waiter | 1395 | * Use the scheduling parameters of the top pi-waiter task if: |
| 974 | * task if we have one and its (absolute) deadline is | 1396 | * - we have a top pi-waiter which is a SCHED_DEADLINE task AND |
| 975 | * smaller than our one... OTW we keep our runtime and | 1397 | * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is |
| 976 | * deadline. | 1398 | * smaller than our deadline OR we are a !SCHED_DEADLINE task getting |
| 1399 | * boosted due to a SCHED_DEADLINE pi-waiter). | ||
| 1400 | * Otherwise we keep our runtime and deadline. | ||
| 977 | */ | 1401 | */ |
| 978 | if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { | 1402 | if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { |
| 979 | pi_se = &pi_task->dl; | 1403 | pi_se = &pi_task->dl; |
| 980 | } else if (!dl_prio(p->normal_prio)) { | 1404 | } else if (!dl_prio(p->normal_prio)) { |
| 981 | /* | 1405 | /* |
| 982 | * Special case in which we have a !SCHED_DEADLINE task | 1406 | * Special case in which we have a !SCHED_DEADLINE task |
| 983 | * that is going to be deboosted, but exceedes its | 1407 | * that is going to be deboosted, but exceeds its |
| 984 | * runtime while doing so. No point in replenishing | 1408 | * runtime while doing so. No point in replenishing |
| 985 | * it, as it's going to return back to its original | 1409 | * it, as it's going to return back to its original |
| 986 | * scheduling class after this. | 1410 | * scheduling class after this. |
| @@ -995,17 +1419,32 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) | |||
| 995 | * If that is the case, the task will be throttled and | 1419 | * If that is the case, the task will be throttled and |
| 996 | * the replenishment timer will be set to the next period. | 1420 | * the replenishment timer will be set to the next period. |
| 997 | */ | 1421 | */ |
| 998 | if (!p->dl.dl_throttled && dl_is_constrained(&p->dl)) | 1422 | if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl)) |
| 999 | dl_check_constrained_dl(&p->dl); | 1423 | dl_check_constrained_dl(&p->dl); |
| 1000 | 1424 | ||
| 1425 | if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { | ||
| 1426 | add_rq_bw(p->dl.dl_bw, &rq->dl); | ||
| 1427 | add_running_bw(p->dl.dl_bw, &rq->dl); | ||
| 1428 | } | ||
| 1429 | |||
| 1001 | /* | 1430 | /* |
| 1002 | * If p is throttled, we do nothing. In fact, if it exhausted | 1431 | * If p is throttled, we do not enqueue it. In fact, if it exhausted |
| 1003 | * its budget it needs a replenishment and, since it now is on | 1432 | * its budget it needs a replenishment and, since it now is on |
| 1004 | * its rq, the bandwidth timer callback (which clearly has not | 1433 | * its rq, the bandwidth timer callback (which clearly has not |
| 1005 | * run yet) will take care of this. | 1434 | * run yet) will take care of this. |
| 1435 | * However, the active utilization does not depend on the fact | ||
| 1436 | * that the task is on the runqueue or not (but depends on the | ||
| 1437 | * task's state - in GRUB parlance, "inactive" vs "active contending"). | ||
| 1438 | * In other words, even if a task is throttled its utilization must | ||
| 1439 | * be counted in the active utilization; hence, we need to call | ||
| 1440 | * add_running_bw(). | ||
| 1006 | */ | 1441 | */ |
| 1007 | if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) | 1442 | if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) { |
| 1443 | if (flags & ENQUEUE_WAKEUP) | ||
| 1444 | task_contending(&p->dl, flags); | ||
| 1445 | |||
| 1008 | return; | 1446 | return; |
| 1447 | } | ||
| 1009 | 1448 | ||
| 1010 | enqueue_dl_entity(&p->dl, pi_se, flags); | 1449 | enqueue_dl_entity(&p->dl, pi_se, flags); |
| 1011 | 1450 | ||
| @@ -1023,6 +1462,23 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) | |||
| 1023 | { | 1462 | { |
| 1024 | update_curr_dl(rq); | 1463 | update_curr_dl(rq); |
| 1025 | __dequeue_task_dl(rq, p, flags); | 1464 | __dequeue_task_dl(rq, p, flags); |
| 1465 | |||
| 1466 | if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { | ||
| 1467 | sub_running_bw(p->dl.dl_bw, &rq->dl); | ||
| 1468 | sub_rq_bw(p->dl.dl_bw, &rq->dl); | ||
| 1469 | } | ||
| 1470 | |||
| 1471 | /* | ||
| 1472 | * This check allows to start the inactive timer (or to immediately | ||
| 1473 | * decrease the active utilization, if needed) in two cases: | ||
| 1474 | * when the task blocks and when it is terminating | ||
| 1475 | * (p->state == TASK_DEAD). We can handle the two cases in the same | ||
| 1476 | * way, because from GRUB's point of view the same thing is happening | ||
| 1477 | * (the task moves from "active contending" to "active non contending" | ||
| 1478 | * or "inactive") | ||
| 1479 | */ | ||
| 1480 | if (flags & DEQUEUE_SLEEP) | ||
| 1481 | task_non_contending(p); | ||
| 1026 | } | 1482 | } |
| 1027 | 1483 | ||
| 1028 | /* | 1484 | /* |
| @@ -1100,6 +1556,37 @@ out: | |||
| 1100 | return cpu; | 1556 | return cpu; |
| 1101 | } | 1557 | } |
| 1102 | 1558 | ||
| 1559 | static void migrate_task_rq_dl(struct task_struct *p) | ||
| 1560 | { | ||
| 1561 | struct rq *rq; | ||
| 1562 | |||
| 1563 | if (p->state != TASK_WAKING) | ||
| 1564 | return; | ||
| 1565 | |||
| 1566 | rq = task_rq(p); | ||
| 1567 | /* | ||
| 1568 | * Since p->state == TASK_WAKING, set_task_cpu() has been called | ||
| 1569 | * from try_to_wake_up(). Hence, p->pi_lock is locked, but | ||
| 1570 | * rq->lock is not... So, lock it | ||
| 1571 | */ | ||
| 1572 | raw_spin_lock(&rq->lock); | ||
| 1573 | if (p->dl.dl_non_contending) { | ||
| 1574 | sub_running_bw(p->dl.dl_bw, &rq->dl); | ||
| 1575 | p->dl.dl_non_contending = 0; | ||
| 1576 | /* | ||
| 1577 | * If the timer handler is currently running and the | ||
| 1578 | * timer cannot be cancelled, inactive_task_timer() | ||
| 1579 | * will see that dl_not_contending is not set, and | ||
| 1580 | * will not touch the rq's active utilization, | ||
| 1581 | * so we are still safe. | ||
| 1582 | */ | ||
| 1583 | if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) | ||
| 1584 | put_task_struct(p); | ||
| 1585 | } | ||
| 1586 | sub_rq_bw(p->dl.dl_bw, &rq->dl); | ||
| 1587 | raw_spin_unlock(&rq->lock); | ||
| 1588 | } | ||
| 1589 | |||
| 1103 | static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) | 1590 | static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) |
| 1104 | { | 1591 | { |
| 1105 | /* | 1592 | /* |
| @@ -1255,19 +1742,6 @@ static void task_fork_dl(struct task_struct *p) | |||
| 1255 | */ | 1742 | */ |
| 1256 | } | 1743 | } |
| 1257 | 1744 | ||
| 1258 | static void task_dead_dl(struct task_struct *p) | ||
| 1259 | { | ||
| 1260 | struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); | ||
| 1261 | |||
| 1262 | /* | ||
| 1263 | * Since we are TASK_DEAD we won't slip out of the domain! | ||
| 1264 | */ | ||
| 1265 | raw_spin_lock_irq(&dl_b->lock); | ||
| 1266 | /* XXX we should retain the bw until 0-lag */ | ||
| 1267 | dl_b->total_bw -= p->dl.dl_bw; | ||
| 1268 | raw_spin_unlock_irq(&dl_b->lock); | ||
| 1269 | } | ||
| 1270 | |||
| 1271 | static void set_curr_task_dl(struct rq *rq) | 1745 | static void set_curr_task_dl(struct rq *rq) |
| 1272 | { | 1746 | { |
| 1273 | struct task_struct *p = rq->curr; | 1747 | struct task_struct *p = rq->curr; |
| @@ -1533,7 +2007,7 @@ retry: | |||
| 1533 | * then possible that next_task has migrated. | 2007 | * then possible that next_task has migrated. |
| 1534 | */ | 2008 | */ |
| 1535 | task = pick_next_pushable_dl_task(rq); | 2009 | task = pick_next_pushable_dl_task(rq); |
| 1536 | if (task_cpu(next_task) == rq->cpu && task == next_task) { | 2010 | if (task == next_task) { |
| 1537 | /* | 2011 | /* |
| 1538 | * The task is still there. We don't try | 2012 | * The task is still there. We don't try |
| 1539 | * again, some other cpu will pull it when ready. | 2013 | * again, some other cpu will pull it when ready. |
| @@ -1551,7 +2025,11 @@ retry: | |||
| 1551 | } | 2025 | } |
| 1552 | 2026 | ||
| 1553 | deactivate_task(rq, next_task, 0); | 2027 | deactivate_task(rq, next_task, 0); |
| 2028 | sub_running_bw(next_task->dl.dl_bw, &rq->dl); | ||
| 2029 | sub_rq_bw(next_task->dl.dl_bw, &rq->dl); | ||
| 1554 | set_task_cpu(next_task, later_rq->cpu); | 2030 | set_task_cpu(next_task, later_rq->cpu); |
| 2031 | add_rq_bw(next_task->dl.dl_bw, &later_rq->dl); | ||
| 2032 | add_running_bw(next_task->dl.dl_bw, &later_rq->dl); | ||
| 1555 | activate_task(later_rq, next_task, 0); | 2033 | activate_task(later_rq, next_task, 0); |
| 1556 | ret = 1; | 2034 | ret = 1; |
| 1557 | 2035 | ||
| @@ -1639,7 +2117,11 @@ static void pull_dl_task(struct rq *this_rq) | |||
| 1639 | resched = true; | 2117 | resched = true; |
| 1640 | 2118 | ||
| 1641 | deactivate_task(src_rq, p, 0); | 2119 | deactivate_task(src_rq, p, 0); |
| 2120 | sub_running_bw(p->dl.dl_bw, &src_rq->dl); | ||
| 2121 | sub_rq_bw(p->dl.dl_bw, &src_rq->dl); | ||
| 1642 | set_task_cpu(p, this_cpu); | 2122 | set_task_cpu(p, this_cpu); |
| 2123 | add_rq_bw(p->dl.dl_bw, &this_rq->dl); | ||
| 2124 | add_running_bw(p->dl.dl_bw, &this_rq->dl); | ||
| 1643 | activate_task(this_rq, p, 0); | 2125 | activate_task(this_rq, p, 0); |
| 1644 | dmin = p->dl.deadline; | 2126 | dmin = p->dl.deadline; |
| 1645 | 2127 | ||
| @@ -1695,7 +2177,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, | |||
| 1695 | * until we complete the update. | 2177 | * until we complete the update. |
| 1696 | */ | 2178 | */ |
| 1697 | raw_spin_lock(&src_dl_b->lock); | 2179 | raw_spin_lock(&src_dl_b->lock); |
| 1698 | __dl_clear(src_dl_b, p->dl.dl_bw); | 2180 | __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); |
| 1699 | raw_spin_unlock(&src_dl_b->lock); | 2181 | raw_spin_unlock(&src_dl_b->lock); |
| 1700 | } | 2182 | } |
| 1701 | 2183 | ||
| @@ -1737,13 +2219,26 @@ void __init init_sched_dl_class(void) | |||
| 1737 | static void switched_from_dl(struct rq *rq, struct task_struct *p) | 2219 | static void switched_from_dl(struct rq *rq, struct task_struct *p) |
| 1738 | { | 2220 | { |
| 1739 | /* | 2221 | /* |
| 1740 | * Start the deadline timer; if we switch back to dl before this we'll | 2222 | * task_non_contending() can start the "inactive timer" (if the 0-lag |
| 1741 | * continue consuming our current CBS slice. If we stay outside of | 2223 | * time is in the future). If the task switches back to dl before |
| 1742 | * SCHED_DEADLINE until the deadline passes, the timer will reset the | 2224 | * the "inactive timer" fires, it can continue to consume its current |
| 1743 | * task. | 2225 | * runtime using its current deadline. If it stays outside of |
| 2226 | * SCHED_DEADLINE until the 0-lag time passes, inactive_task_timer() | ||
| 2227 | * will reset the task parameters. | ||
| 1744 | */ | 2228 | */ |
| 1745 | if (!start_dl_timer(p)) | 2229 | if (task_on_rq_queued(p) && p->dl.dl_runtime) |
| 1746 | __dl_clear_params(p); | 2230 | task_non_contending(p); |
| 2231 | |||
| 2232 | if (!task_on_rq_queued(p)) | ||
| 2233 | sub_rq_bw(p->dl.dl_bw, &rq->dl); | ||
| 2234 | |||
| 2235 | /* | ||
| 2236 | * We cannot use inactive_task_timer() to invoke sub_running_bw() | ||
| 2237 | * at the 0-lag time, because the task could have been migrated | ||
| 2238 | * while SCHED_OTHER in the meanwhile. | ||
| 2239 | */ | ||
| 2240 | if (p->dl.dl_non_contending) | ||
| 2241 | p->dl.dl_non_contending = 0; | ||
| 1747 | 2242 | ||
| 1748 | /* | 2243 | /* |
| 1749 | * Since this might be the only -deadline task on the rq, | 2244 | * Since this might be the only -deadline task on the rq, |
| @@ -1762,11 +2257,15 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) | |||
| 1762 | */ | 2257 | */ |
| 1763 | static void switched_to_dl(struct rq *rq, struct task_struct *p) | 2258 | static void switched_to_dl(struct rq *rq, struct task_struct *p) |
| 1764 | { | 2259 | { |
| 2260 | if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) | ||
| 2261 | put_task_struct(p); | ||
| 1765 | 2262 | ||
| 1766 | /* If p is not queued we will update its parameters at next wakeup. */ | 2263 | /* If p is not queued we will update its parameters at next wakeup. */ |
| 1767 | if (!task_on_rq_queued(p)) | 2264 | if (!task_on_rq_queued(p)) { |
| 1768 | return; | 2265 | add_rq_bw(p->dl.dl_bw, &rq->dl); |
| 1769 | 2266 | ||
| 2267 | return; | ||
| 2268 | } | ||
| 1770 | /* | 2269 | /* |
| 1771 | * If p is boosted we already updated its params in | 2270 | * If p is boosted we already updated its params in |
| 1772 | * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), | 2271 | * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), |
| @@ -1836,6 +2335,7 @@ const struct sched_class dl_sched_class = { | |||
| 1836 | 2335 | ||
| 1837 | #ifdef CONFIG_SMP | 2336 | #ifdef CONFIG_SMP |
| 1838 | .select_task_rq = select_task_rq_dl, | 2337 | .select_task_rq = select_task_rq_dl, |
| 2338 | .migrate_task_rq = migrate_task_rq_dl, | ||
| 1839 | .set_cpus_allowed = set_cpus_allowed_dl, | 2339 | .set_cpus_allowed = set_cpus_allowed_dl, |
| 1840 | .rq_online = rq_online_dl, | 2340 | .rq_online = rq_online_dl, |
| 1841 | .rq_offline = rq_offline_dl, | 2341 | .rq_offline = rq_offline_dl, |
| @@ -1845,7 +2345,6 @@ const struct sched_class dl_sched_class = { | |||
| 1845 | .set_curr_task = set_curr_task_dl, | 2345 | .set_curr_task = set_curr_task_dl, |
| 1846 | .task_tick = task_tick_dl, | 2346 | .task_tick = task_tick_dl, |
| 1847 | .task_fork = task_fork_dl, | 2347 | .task_fork = task_fork_dl, |
| 1848 | .task_dead = task_dead_dl, | ||
| 1849 | 2348 | ||
| 1850 | .prio_changed = prio_changed_dl, | 2349 | .prio_changed = prio_changed_dl, |
| 1851 | .switched_from = switched_from_dl, | 2350 | .switched_from = switched_from_dl, |
| @@ -1854,6 +2353,317 @@ const struct sched_class dl_sched_class = { | |||
| 1854 | .update_curr = update_curr_dl, | 2353 | .update_curr = update_curr_dl, |
| 1855 | }; | 2354 | }; |
| 1856 | 2355 | ||
| 2356 | int sched_dl_global_validate(void) | ||
| 2357 | { | ||
| 2358 | u64 runtime = global_rt_runtime(); | ||
| 2359 | u64 period = global_rt_period(); | ||
| 2360 | u64 new_bw = to_ratio(period, runtime); | ||
| 2361 | struct dl_bw *dl_b; | ||
| 2362 | int cpu, ret = 0; | ||
| 2363 | unsigned long flags; | ||
| 2364 | |||
| 2365 | /* | ||
| 2366 | * Here we want to check the bandwidth not being set to some | ||
| 2367 | * value smaller than the currently allocated bandwidth in | ||
| 2368 | * any of the root_domains. | ||
| 2369 | * | ||
| 2370 | * FIXME: Cycling on all the CPUs is overdoing, but simpler than | ||
| 2371 | * cycling on root_domains... Discussion on different/better | ||
| 2372 | * solutions is welcome! | ||
| 2373 | */ | ||
| 2374 | for_each_possible_cpu(cpu) { | ||
| 2375 | rcu_read_lock_sched(); | ||
| 2376 | dl_b = dl_bw_of(cpu); | ||
| 2377 | |||
| 2378 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 2379 | if (new_bw < dl_b->total_bw) | ||
| 2380 | ret = -EBUSY; | ||
| 2381 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 2382 | |||
| 2383 | rcu_read_unlock_sched(); | ||
| 2384 | |||
| 2385 | if (ret) | ||
| 2386 | break; | ||
| 2387 | } | ||
| 2388 | |||
| 2389 | return ret; | ||
| 2390 | } | ||
| 2391 | |||
| 2392 | void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) | ||
| 2393 | { | ||
| 2394 | if (global_rt_runtime() == RUNTIME_INF) { | ||
| 2395 | dl_rq->bw_ratio = 1 << RATIO_SHIFT; | ||
| 2396 | dl_rq->extra_bw = 1 << BW_SHIFT; | ||
| 2397 | } else { | ||
| 2398 | dl_rq->bw_ratio = to_ratio(global_rt_runtime(), | ||
| 2399 | global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT); | ||
| 2400 | dl_rq->extra_bw = to_ratio(global_rt_period(), | ||
| 2401 | global_rt_runtime()); | ||
| 2402 | } | ||
| 2403 | } | ||
| 2404 | |||
| 2405 | void sched_dl_do_global(void) | ||
| 2406 | { | ||
| 2407 | u64 new_bw = -1; | ||
| 2408 | struct dl_bw *dl_b; | ||
| 2409 | int cpu; | ||
| 2410 | unsigned long flags; | ||
| 2411 | |||
| 2412 | def_dl_bandwidth.dl_period = global_rt_period(); | ||
| 2413 | def_dl_bandwidth.dl_runtime = global_rt_runtime(); | ||
| 2414 | |||
| 2415 | if (global_rt_runtime() != RUNTIME_INF) | ||
| 2416 | new_bw = to_ratio(global_rt_period(), global_rt_runtime()); | ||
| 2417 | |||
| 2418 | /* | ||
| 2419 | * FIXME: As above... | ||
| 2420 | */ | ||
| 2421 | for_each_possible_cpu(cpu) { | ||
| 2422 | rcu_read_lock_sched(); | ||
| 2423 | dl_b = dl_bw_of(cpu); | ||
| 2424 | |||
| 2425 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 2426 | dl_b->bw = new_bw; | ||
| 2427 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 2428 | |||
| 2429 | rcu_read_unlock_sched(); | ||
| 2430 | init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); | ||
| 2431 | } | ||
| 2432 | } | ||
| 2433 | |||
| 2434 | /* | ||
| 2435 | * We must be sure that accepting a new task (or allowing changing the | ||
| 2436 | * parameters of an existing one) is consistent with the bandwidth | ||
| 2437 | * constraints. If yes, this function also accordingly updates the currently | ||
| 2438 | * allocated bandwidth to reflect the new situation. | ||
| 2439 | * | ||
| 2440 | * This function is called while holding p's rq->lock. | ||
| 2441 | */ | ||
| 2442 | int sched_dl_overflow(struct task_struct *p, int policy, | ||
| 2443 | const struct sched_attr *attr) | ||
| 2444 | { | ||
| 2445 | struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); | ||
| 2446 | u64 period = attr->sched_period ?: attr->sched_deadline; | ||
| 2447 | u64 runtime = attr->sched_runtime; | ||
| 2448 | u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; | ||
| 2449 | int cpus, err = -1; | ||
| 2450 | |||
| 2451 | /* !deadline task may carry old deadline bandwidth */ | ||
| 2452 | if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) | ||
| 2453 | return 0; | ||
| 2454 | |||
| 2455 | /* | ||
| 2456 | * Either if a task, enters, leave, or stays -deadline but changes | ||
| 2457 | * its parameters, we may need to update accordingly the total | ||
| 2458 | * allocated bandwidth of the container. | ||
| 2459 | */ | ||
| 2460 | raw_spin_lock(&dl_b->lock); | ||
| 2461 | cpus = dl_bw_cpus(task_cpu(p)); | ||
| 2462 | if (dl_policy(policy) && !task_has_dl_policy(p) && | ||
| 2463 | !__dl_overflow(dl_b, cpus, 0, new_bw)) { | ||
| 2464 | if (hrtimer_active(&p->dl.inactive_timer)) | ||
| 2465 | __dl_clear(dl_b, p->dl.dl_bw, cpus); | ||
| 2466 | __dl_add(dl_b, new_bw, cpus); | ||
| 2467 | err = 0; | ||
| 2468 | } else if (dl_policy(policy) && task_has_dl_policy(p) && | ||
| 2469 | !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { | ||
| 2470 | /* | ||
| 2471 | * XXX this is slightly incorrect: when the task | ||
| 2472 | * utilization decreases, we should delay the total | ||
| 2473 | * utilization change until the task's 0-lag point. | ||
| 2474 | * But this would require to set the task's "inactive | ||
| 2475 | * timer" when the task is not inactive. | ||
| 2476 | */ | ||
| 2477 | __dl_clear(dl_b, p->dl.dl_bw, cpus); | ||
| 2478 | __dl_add(dl_b, new_bw, cpus); | ||
| 2479 | dl_change_utilization(p, new_bw); | ||
| 2480 | err = 0; | ||
| 2481 | } else if (!dl_policy(policy) && task_has_dl_policy(p)) { | ||
| 2482 | /* | ||
| 2483 | * Do not decrease the total deadline utilization here, | ||
| 2484 | * switched_from_dl() will take care to do it at the correct | ||
| 2485 | * (0-lag) time. | ||
| 2486 | */ | ||
| 2487 | err = 0; | ||
| 2488 | } | ||
| 2489 | raw_spin_unlock(&dl_b->lock); | ||
| 2490 | |||
| 2491 | return err; | ||
| 2492 | } | ||
| 2493 | |||
| 2494 | /* | ||
| 2495 | * This function initializes the sched_dl_entity of a newly becoming | ||
| 2496 | * SCHED_DEADLINE task. | ||
| 2497 | * | ||
| 2498 | * Only the static values are considered here, the actual runtime and the | ||
| 2499 | * absolute deadline will be properly calculated when the task is enqueued | ||
| 2500 | * for the first time with its new policy. | ||
| 2501 | */ | ||
| 2502 | void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) | ||
| 2503 | { | ||
| 2504 | struct sched_dl_entity *dl_se = &p->dl; | ||
| 2505 | |||
| 2506 | dl_se->dl_runtime = attr->sched_runtime; | ||
| 2507 | dl_se->dl_deadline = attr->sched_deadline; | ||
| 2508 | dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; | ||
| 2509 | dl_se->flags = attr->sched_flags; | ||
| 2510 | dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); | ||
| 2511 | dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); | ||
| 2512 | } | ||
| 2513 | |||
| 2514 | void __getparam_dl(struct task_struct *p, struct sched_attr *attr) | ||
| 2515 | { | ||
| 2516 | struct sched_dl_entity *dl_se = &p->dl; | ||
| 2517 | |||
| 2518 | attr->sched_priority = p->rt_priority; | ||
| 2519 | attr->sched_runtime = dl_se->dl_runtime; | ||
| 2520 | attr->sched_deadline = dl_se->dl_deadline; | ||
| 2521 | attr->sched_period = dl_se->dl_period; | ||
| 2522 | attr->sched_flags = dl_se->flags; | ||
| 2523 | } | ||
| 2524 | |||
| 2525 | /* | ||
| 2526 | * This function validates the new parameters of a -deadline task. | ||
| 2527 | * We ask for the deadline not being zero, and greater or equal | ||
| 2528 | * than the runtime, as well as the period of being zero or | ||
| 2529 | * greater than deadline. Furthermore, we have to be sure that | ||
| 2530 | * user parameters are above the internal resolution of 1us (we | ||
| 2531 | * check sched_runtime only since it is always the smaller one) and | ||
| 2532 | * below 2^63 ns (we have to check both sched_deadline and | ||
| 2533 | * sched_period, as the latter can be zero). | ||
| 2534 | */ | ||
| 2535 | bool __checkparam_dl(const struct sched_attr *attr) | ||
| 2536 | { | ||
| 2537 | /* deadline != 0 */ | ||
| 2538 | if (attr->sched_deadline == 0) | ||
| 2539 | return false; | ||
| 2540 | |||
| 2541 | /* | ||
| 2542 | * Since we truncate DL_SCALE bits, make sure we're at least | ||
| 2543 | * that big. | ||
| 2544 | */ | ||
| 2545 | if (attr->sched_runtime < (1ULL << DL_SCALE)) | ||
| 2546 | return false; | ||
| 2547 | |||
| 2548 | /* | ||
| 2549 | * Since we use the MSB for wrap-around and sign issues, make | ||
| 2550 | * sure it's not set (mind that period can be equal to zero). | ||
| 2551 | */ | ||
| 2552 | if (attr->sched_deadline & (1ULL << 63) || | ||
| 2553 | attr->sched_period & (1ULL << 63)) | ||
| 2554 | return false; | ||
| 2555 | |||
| 2556 | /* runtime <= deadline <= period (if period != 0) */ | ||
| 2557 | if ((attr->sched_period != 0 && | ||
| 2558 | attr->sched_period < attr->sched_deadline) || | ||
| 2559 | attr->sched_deadline < attr->sched_runtime) | ||
| 2560 | return false; | ||
| 2561 | |||
| 2562 | return true; | ||
| 2563 | } | ||
| 2564 | |||
| 2565 | /* | ||
| 2566 | * This function clears the sched_dl_entity static params. | ||
| 2567 | */ | ||
| 2568 | void __dl_clear_params(struct task_struct *p) | ||
| 2569 | { | ||
| 2570 | struct sched_dl_entity *dl_se = &p->dl; | ||
| 2571 | |||
| 2572 | dl_se->dl_runtime = 0; | ||
| 2573 | dl_se->dl_deadline = 0; | ||
| 2574 | dl_se->dl_period = 0; | ||
| 2575 | dl_se->flags = 0; | ||
| 2576 | dl_se->dl_bw = 0; | ||
| 2577 | dl_se->dl_density = 0; | ||
| 2578 | |||
| 2579 | dl_se->dl_throttled = 0; | ||
| 2580 | dl_se->dl_yielded = 0; | ||
| 2581 | dl_se->dl_non_contending = 0; | ||
| 2582 | } | ||
| 2583 | |||
| 2584 | bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) | ||
| 2585 | { | ||
| 2586 | struct sched_dl_entity *dl_se = &p->dl; | ||
| 2587 | |||
| 2588 | if (dl_se->dl_runtime != attr->sched_runtime || | ||
| 2589 | dl_se->dl_deadline != attr->sched_deadline || | ||
| 2590 | dl_se->dl_period != attr->sched_period || | ||
| 2591 | dl_se->flags != attr->sched_flags) | ||
| 2592 | return true; | ||
| 2593 | |||
| 2594 | return false; | ||
| 2595 | } | ||
| 2596 | |||
| 2597 | #ifdef CONFIG_SMP | ||
| 2598 | int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) | ||
| 2599 | { | ||
| 2600 | unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, | ||
| 2601 | cs_cpus_allowed); | ||
| 2602 | struct dl_bw *dl_b; | ||
| 2603 | bool overflow; | ||
| 2604 | int cpus, ret; | ||
| 2605 | unsigned long flags; | ||
| 2606 | |||
| 2607 | rcu_read_lock_sched(); | ||
| 2608 | dl_b = dl_bw_of(dest_cpu); | ||
| 2609 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 2610 | cpus = dl_bw_cpus(dest_cpu); | ||
| 2611 | overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); | ||
| 2612 | if (overflow) | ||
| 2613 | ret = -EBUSY; | ||
| 2614 | else { | ||
| 2615 | /* | ||
| 2616 | * We reserve space for this task in the destination | ||
| 2617 | * root_domain, as we can't fail after this point. | ||
| 2618 | * We will free resources in the source root_domain | ||
| 2619 | * later on (see set_cpus_allowed_dl()). | ||
| 2620 | */ | ||
| 2621 | __dl_add(dl_b, p->dl.dl_bw, cpus); | ||
| 2622 | ret = 0; | ||
| 2623 | } | ||
| 2624 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 2625 | rcu_read_unlock_sched(); | ||
| 2626 | return ret; | ||
| 2627 | } | ||
| 2628 | |||
| 2629 | int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, | ||
| 2630 | const struct cpumask *trial) | ||
| 2631 | { | ||
| 2632 | int ret = 1, trial_cpus; | ||
| 2633 | struct dl_bw *cur_dl_b; | ||
| 2634 | unsigned long flags; | ||
| 2635 | |||
| 2636 | rcu_read_lock_sched(); | ||
| 2637 | cur_dl_b = dl_bw_of(cpumask_any(cur)); | ||
| 2638 | trial_cpus = cpumask_weight(trial); | ||
| 2639 | |||
| 2640 | raw_spin_lock_irqsave(&cur_dl_b->lock, flags); | ||
| 2641 | if (cur_dl_b->bw != -1 && | ||
| 2642 | cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) | ||
| 2643 | ret = 0; | ||
| 2644 | raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); | ||
| 2645 | rcu_read_unlock_sched(); | ||
| 2646 | return ret; | ||
| 2647 | } | ||
| 2648 | |||
| 2649 | bool dl_cpu_busy(unsigned int cpu) | ||
| 2650 | { | ||
| 2651 | unsigned long flags; | ||
| 2652 | struct dl_bw *dl_b; | ||
| 2653 | bool overflow; | ||
| 2654 | int cpus; | ||
| 2655 | |||
| 2656 | rcu_read_lock_sched(); | ||
| 2657 | dl_b = dl_bw_of(cpu); | ||
| 2658 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 2659 | cpus = dl_bw_cpus(cpu); | ||
| 2660 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
| 2661 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 2662 | rcu_read_unlock_sched(); | ||
| 2663 | return overflow; | ||
| 2664 | } | ||
| 2665 | #endif | ||
| 2666 | |||
| 1857 | #ifdef CONFIG_SCHED_DEBUG | 2667 | #ifdef CONFIG_SCHED_DEBUG |
| 1858 | extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); | 2668 | extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); |
| 1859 | 2669 | ||
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 38f019324f1a..4fa66de52bd6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -552,15 +552,21 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) | |||
| 552 | 552 | ||
| 553 | #define P(x) \ | 553 | #define P(x) \ |
| 554 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) | 554 | SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) |
| 555 | #define PU(x) \ | ||
| 556 | SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(rt_rq->x)) | ||
| 555 | #define PN(x) \ | 557 | #define PN(x) \ |
| 556 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) | 558 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) |
| 557 | 559 | ||
| 558 | P(rt_nr_running); | 560 | PU(rt_nr_running); |
| 561 | #ifdef CONFIG_SMP | ||
| 562 | PU(rt_nr_migratory); | ||
| 563 | #endif | ||
| 559 | P(rt_throttled); | 564 | P(rt_throttled); |
| 560 | PN(rt_time); | 565 | PN(rt_time); |
| 561 | PN(rt_runtime); | 566 | PN(rt_runtime); |
| 562 | 567 | ||
| 563 | #undef PN | 568 | #undef PN |
| 569 | #undef PU | ||
| 564 | #undef P | 570 | #undef P |
| 565 | } | 571 | } |
| 566 | 572 | ||
| @@ -569,14 +575,21 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) | |||
| 569 | struct dl_bw *dl_bw; | 575 | struct dl_bw *dl_bw; |
| 570 | 576 | ||
| 571 | SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); | 577 | SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); |
| 572 | SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); | 578 | |
| 579 | #define PU(x) \ | ||
| 580 | SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x)) | ||
| 581 | |||
| 582 | PU(dl_nr_running); | ||
| 573 | #ifdef CONFIG_SMP | 583 | #ifdef CONFIG_SMP |
| 584 | PU(dl_nr_migratory); | ||
| 574 | dl_bw = &cpu_rq(cpu)->rd->dl_bw; | 585 | dl_bw = &cpu_rq(cpu)->rd->dl_bw; |
| 575 | #else | 586 | #else |
| 576 | dl_bw = &dl_rq->dl_bw; | 587 | dl_bw = &dl_rq->dl_bw; |
| 577 | #endif | 588 | #endif |
| 578 | SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); | 589 | SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); |
| 579 | SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); | 590 | SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); |
| 591 | |||
| 592 | #undef PU | ||
| 580 | } | 593 | } |
| 581 | 594 | ||
| 582 | extern __read_mostly int sched_clock_running; | 595 | extern __read_mostly int sched_clock_running; |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d71109321841..c95880e216f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -369,8 +369,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) | |||
| 369 | } | 369 | } |
| 370 | 370 | ||
| 371 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | 371 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ |
| 372 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 372 | #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ |
| 373 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | 373 | list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ |
| 374 | leaf_cfs_rq_list) | ||
| 374 | 375 | ||
| 375 | /* Do the two (enqueued) entities belong to the same group ? */ | 376 | /* Do the two (enqueued) entities belong to the same group ? */ |
| 376 | static inline struct cfs_rq * | 377 | static inline struct cfs_rq * |
| @@ -463,8 +464,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) | |||
| 463 | { | 464 | { |
| 464 | } | 465 | } |
| 465 | 466 | ||
| 466 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 467 | #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ |
| 467 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | 468 | for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos) |
| 468 | 469 | ||
| 469 | static inline struct sched_entity *parent_entity(struct sched_entity *se) | 470 | static inline struct sched_entity *parent_entity(struct sched_entity *se) |
| 470 | { | 471 | { |
| @@ -1381,7 +1382,6 @@ static unsigned long weighted_cpuload(const int cpu); | |||
| 1381 | static unsigned long source_load(int cpu, int type); | 1382 | static unsigned long source_load(int cpu, int type); |
| 1382 | static unsigned long target_load(int cpu, int type); | 1383 | static unsigned long target_load(int cpu, int type); |
| 1383 | static unsigned long capacity_of(int cpu); | 1384 | static unsigned long capacity_of(int cpu); |
| 1384 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg); | ||
| 1385 | 1385 | ||
| 1386 | /* Cached statistics for all CPUs within a node */ | 1386 | /* Cached statistics for all CPUs within a node */ |
| 1387 | struct numa_stats { | 1387 | struct numa_stats { |
| @@ -2469,7 +2469,8 @@ void task_numa_work(struct callback_head *work) | |||
| 2469 | return; | 2469 | return; |
| 2470 | 2470 | ||
| 2471 | 2471 | ||
| 2472 | down_read(&mm->mmap_sem); | 2472 | if (!down_read_trylock(&mm->mmap_sem)) |
| 2473 | return; | ||
| 2473 | vma = find_vma(mm, start); | 2474 | vma = find_vma(mm, start); |
| 2474 | if (!vma) { | 2475 | if (!vma) { |
| 2475 | reset_ptenuma_scan(p); | 2476 | reset_ptenuma_scan(p); |
| @@ -2584,6 +2585,60 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) | |||
| 2584 | } | 2585 | } |
| 2585 | } | 2586 | } |
| 2586 | } | 2587 | } |
| 2588 | |||
| 2589 | /* | ||
| 2590 | * Can a task be moved from prev_cpu to this_cpu without causing a load | ||
| 2591 | * imbalance that would trigger the load balancer? | ||
| 2592 | */ | ||
| 2593 | static inline bool numa_wake_affine(struct sched_domain *sd, | ||
| 2594 | struct task_struct *p, int this_cpu, | ||
| 2595 | int prev_cpu, int sync) | ||
| 2596 | { | ||
| 2597 | struct numa_stats prev_load, this_load; | ||
| 2598 | s64 this_eff_load, prev_eff_load; | ||
| 2599 | |||
| 2600 | update_numa_stats(&prev_load, cpu_to_node(prev_cpu)); | ||
| 2601 | update_numa_stats(&this_load, cpu_to_node(this_cpu)); | ||
| 2602 | |||
| 2603 | /* | ||
| 2604 | * If sync wakeup then subtract the (maximum possible) | ||
| 2605 | * effect of the currently running task from the load | ||
| 2606 | * of the current CPU: | ||
| 2607 | */ | ||
| 2608 | if (sync) { | ||
| 2609 | unsigned long current_load = task_h_load(current); | ||
| 2610 | |||
| 2611 | if (this_load.load > current_load) | ||
| 2612 | this_load.load -= current_load; | ||
| 2613 | else | ||
| 2614 | this_load.load = 0; | ||
| 2615 | } | ||
| 2616 | |||
| 2617 | /* | ||
| 2618 | * In low-load situations, where this_cpu's node is idle due to the | ||
| 2619 | * sync cause above having dropped this_load.load to 0, move the task. | ||
| 2620 | * Moving to an idle socket will not create a bad imbalance. | ||
| 2621 | * | ||
| 2622 | * Otherwise check if the nodes are near enough in load to allow this | ||
| 2623 | * task to be woken on this_cpu's node. | ||
| 2624 | */ | ||
| 2625 | if (this_load.load > 0) { | ||
| 2626 | unsigned long task_load = task_h_load(p); | ||
| 2627 | |||
| 2628 | this_eff_load = 100; | ||
| 2629 | this_eff_load *= prev_load.compute_capacity; | ||
| 2630 | |||
| 2631 | prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; | ||
| 2632 | prev_eff_load *= this_load.compute_capacity; | ||
| 2633 | |||
| 2634 | this_eff_load *= this_load.load + task_load; | ||
| 2635 | prev_eff_load *= prev_load.load - task_load; | ||
| 2636 | |||
| 2637 | return this_eff_load <= prev_eff_load; | ||
| 2638 | } | ||
| 2639 | |||
| 2640 | return true; | ||
| 2641 | } | ||
| 2587 | #else | 2642 | #else |
| 2588 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) | 2643 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) |
| 2589 | { | 2644 | { |
| @@ -2596,6 +2651,15 @@ static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) | |||
| 2596 | static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) | 2651 | static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) |
| 2597 | { | 2652 | { |
| 2598 | } | 2653 | } |
| 2654 | |||
| 2655 | #ifdef CONFIG_SMP | ||
| 2656 | static inline bool numa_wake_affine(struct sched_domain *sd, | ||
| 2657 | struct task_struct *p, int this_cpu, | ||
| 2658 | int prev_cpu, int sync) | ||
| 2659 | { | ||
| 2660 | return true; | ||
| 2661 | } | ||
| 2662 | #endif /* !SMP */ | ||
| 2599 | #endif /* CONFIG_NUMA_BALANCING */ | 2663 | #endif /* CONFIG_NUMA_BALANCING */ |
| 2600 | 2664 | ||
| 2601 | static void | 2665 | static void |
| @@ -2916,12 +2980,12 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, | |||
| 2916 | /* | 2980 | /* |
| 2917 | * Step 2: update *_avg. | 2981 | * Step 2: update *_avg. |
| 2918 | */ | 2982 | */ |
| 2919 | sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); | 2983 | sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); |
| 2920 | if (cfs_rq) { | 2984 | if (cfs_rq) { |
| 2921 | cfs_rq->runnable_load_avg = | 2985 | cfs_rq->runnable_load_avg = |
| 2922 | div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); | 2986 | div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib); |
| 2923 | } | 2987 | } |
| 2924 | sa->util_avg = sa->util_sum / LOAD_AVG_MAX; | 2988 | sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib); |
| 2925 | 2989 | ||
| 2926 | return 1; | 2990 | return 1; |
| 2927 | } | 2991 | } |
| @@ -2982,8 +3046,7 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) | |||
| 2982 | * differential update where we store the last value we propagated. This in | 3046 | * differential update where we store the last value we propagated. This in |
| 2983 | * turn allows skipping updates if the differential is 'small'. | 3047 | * turn allows skipping updates if the differential is 'small'. |
| 2984 | * | 3048 | * |
| 2985 | * Updating tg's load_avg is necessary before update_cfs_share() (which is | 3049 | * Updating tg's load_avg is necessary before update_cfs_share(). |
| 2986 | * done) and effective_load() (which is not done because it is too costly). | ||
| 2987 | */ | 3050 | */ |
| 2988 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) | 3051 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) |
| 2989 | { | 3052 | { |
| @@ -3563,7 +3626,7 @@ static inline void check_schedstat_required(void) | |||
| 3563 | trace_sched_stat_runtime_enabled()) { | 3626 | trace_sched_stat_runtime_enabled()) { |
| 3564 | printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " | 3627 | printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " |
| 3565 | "stat_blocked and stat_runtime require the " | 3628 | "stat_blocked and stat_runtime require the " |
| 3566 | "kernel parameter schedstats=enabled or " | 3629 | "kernel parameter schedstats=enable or " |
| 3567 | "kernel.sched_schedstats=1\n"); | 3630 | "kernel.sched_schedstats=1\n"); |
| 3568 | } | 3631 | } |
| 3569 | #endif | 3632 | #endif |
| @@ -4642,24 +4705,43 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | |||
| 4642 | hrtimer_cancel(&cfs_b->slack_timer); | 4705 | hrtimer_cancel(&cfs_b->slack_timer); |
| 4643 | } | 4706 | } |
| 4644 | 4707 | ||
| 4708 | /* | ||
| 4709 | * Both these cpu hotplug callbacks race against unregister_fair_sched_group() | ||
| 4710 | * | ||
| 4711 | * The race is harmless, since modifying bandwidth settings of unhooked group | ||
| 4712 | * bits doesn't do much. | ||
| 4713 | */ | ||
| 4714 | |||
| 4715 | /* cpu online calback */ | ||
| 4645 | static void __maybe_unused update_runtime_enabled(struct rq *rq) | 4716 | static void __maybe_unused update_runtime_enabled(struct rq *rq) |
| 4646 | { | 4717 | { |
| 4647 | struct cfs_rq *cfs_rq; | 4718 | struct task_group *tg; |
| 4719 | |||
| 4720 | lockdep_assert_held(&rq->lock); | ||
| 4648 | 4721 | ||
| 4649 | for_each_leaf_cfs_rq(rq, cfs_rq) { | 4722 | rcu_read_lock(); |
| 4650 | struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; | 4723 | list_for_each_entry_rcu(tg, &task_groups, list) { |
| 4724 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; | ||
| 4725 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | ||
| 4651 | 4726 | ||
| 4652 | raw_spin_lock(&cfs_b->lock); | 4727 | raw_spin_lock(&cfs_b->lock); |
| 4653 | cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; | 4728 | cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; |
| 4654 | raw_spin_unlock(&cfs_b->lock); | 4729 | raw_spin_unlock(&cfs_b->lock); |
| 4655 | } | 4730 | } |
| 4731 | rcu_read_unlock(); | ||
| 4656 | } | 4732 | } |
| 4657 | 4733 | ||
| 4734 | /* cpu offline callback */ | ||
| 4658 | static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) | 4735 | static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) |
| 4659 | { | 4736 | { |
| 4660 | struct cfs_rq *cfs_rq; | 4737 | struct task_group *tg; |
| 4738 | |||
| 4739 | lockdep_assert_held(&rq->lock); | ||
| 4740 | |||
| 4741 | rcu_read_lock(); | ||
| 4742 | list_for_each_entry_rcu(tg, &task_groups, list) { | ||
| 4743 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | ||
| 4661 | 4744 | ||
| 4662 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
| 4663 | if (!cfs_rq->runtime_enabled) | 4745 | if (!cfs_rq->runtime_enabled) |
| 4664 | continue; | 4746 | continue; |
| 4665 | 4747 | ||
| @@ -4677,6 +4759,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) | |||
| 4677 | if (cfs_rq_throttled(cfs_rq)) | 4759 | if (cfs_rq_throttled(cfs_rq)) |
| 4678 | unthrottle_cfs_rq(cfs_rq); | 4760 | unthrottle_cfs_rq(cfs_rq); |
| 4679 | } | 4761 | } |
| 4762 | rcu_read_unlock(); | ||
| 4680 | } | 4763 | } |
| 4681 | 4764 | ||
| 4682 | #else /* CONFIG_CFS_BANDWIDTH */ | 4765 | #else /* CONFIG_CFS_BANDWIDTH */ |
| @@ -5215,126 +5298,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
| 5215 | return 0; | 5298 | return 0; |
| 5216 | } | 5299 | } |
| 5217 | 5300 | ||
| 5218 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 5219 | /* | ||
| 5220 | * effective_load() calculates the load change as seen from the root_task_group | ||
| 5221 | * | ||
| 5222 | * Adding load to a group doesn't make a group heavier, but can cause movement | ||
| 5223 | * of group shares between cpus. Assuming the shares were perfectly aligned one | ||
| 5224 | * can calculate the shift in shares. | ||
| 5225 | * | ||
| 5226 | * Calculate the effective load difference if @wl is added (subtracted) to @tg | ||
| 5227 | * on this @cpu and results in a total addition (subtraction) of @wg to the | ||
| 5228 | * total group weight. | ||
| 5229 | * | ||
| 5230 | * Given a runqueue weight distribution (rw_i) we can compute a shares | ||
| 5231 | * distribution (s_i) using: | ||
| 5232 | * | ||
| 5233 | * s_i = rw_i / \Sum rw_j (1) | ||
| 5234 | * | ||
| 5235 | * Suppose we have 4 CPUs and our @tg is a direct child of the root group and | ||
| 5236 | * has 7 equal weight tasks, distributed as below (rw_i), with the resulting | ||
| 5237 | * shares distribution (s_i): | ||
| 5238 | * | ||
| 5239 | * rw_i = { 2, 4, 1, 0 } | ||
| 5240 | * s_i = { 2/7, 4/7, 1/7, 0 } | ||
| 5241 | * | ||
| 5242 | * As per wake_affine() we're interested in the load of two CPUs (the CPU the | ||
| 5243 | * task used to run on and the CPU the waker is running on), we need to | ||
| 5244 | * compute the effect of waking a task on either CPU and, in case of a sync | ||
| 5245 | * wakeup, compute the effect of the current task going to sleep. | ||
| 5246 | * | ||
| 5247 | * So for a change of @wl to the local @cpu with an overall group weight change | ||
| 5248 | * of @wl we can compute the new shares distribution (s'_i) using: | ||
| 5249 | * | ||
| 5250 | * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2) | ||
| 5251 | * | ||
| 5252 | * Suppose we're interested in CPUs 0 and 1, and want to compute the load | ||
| 5253 | * differences in waking a task to CPU 0. The additional task changes the | ||
| 5254 | * weight and shares distributions like: | ||
| 5255 | * | ||
| 5256 | * rw'_i = { 3, 4, 1, 0 } | ||
| 5257 | * s'_i = { 3/8, 4/8, 1/8, 0 } | ||
| 5258 | * | ||
| 5259 | * We can then compute the difference in effective weight by using: | ||
| 5260 | * | ||
| 5261 | * dw_i = S * (s'_i - s_i) (3) | ||
| 5262 | * | ||
| 5263 | * Where 'S' is the group weight as seen by its parent. | ||
| 5264 | * | ||
| 5265 | * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7) | ||
| 5266 | * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 - | ||
| 5267 | * 4/7) times the weight of the group. | ||
| 5268 | */ | ||
| 5269 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | ||
| 5270 | { | ||
| 5271 | struct sched_entity *se = tg->se[cpu]; | ||
| 5272 | |||
| 5273 | if (!tg->parent) /* the trivial, non-cgroup case */ | ||
| 5274 | return wl; | ||
| 5275 | |||
| 5276 | for_each_sched_entity(se) { | ||
| 5277 | struct cfs_rq *cfs_rq = se->my_q; | ||
| 5278 | long W, w = cfs_rq_load_avg(cfs_rq); | ||
| 5279 | |||
| 5280 | tg = cfs_rq->tg; | ||
| 5281 | |||
| 5282 | /* | ||
| 5283 | * W = @wg + \Sum rw_j | ||
| 5284 | */ | ||
| 5285 | W = wg + atomic_long_read(&tg->load_avg); | ||
| 5286 | |||
| 5287 | /* Ensure \Sum rw_j >= rw_i */ | ||
| 5288 | W -= cfs_rq->tg_load_avg_contrib; | ||
| 5289 | W += w; | ||
| 5290 | |||
| 5291 | /* | ||
| 5292 | * w = rw_i + @wl | ||
| 5293 | */ | ||
| 5294 | w += wl; | ||
| 5295 | |||
| 5296 | /* | ||
| 5297 | * wl = S * s'_i; see (2) | ||
| 5298 | */ | ||
| 5299 | if (W > 0 && w < W) | ||
| 5300 | wl = (w * (long)scale_load_down(tg->shares)) / W; | ||
| 5301 | else | ||
| 5302 | wl = scale_load_down(tg->shares); | ||
| 5303 | |||
| 5304 | /* | ||
| 5305 | * Per the above, wl is the new se->load.weight value; since | ||
| 5306 | * those are clipped to [MIN_SHARES, ...) do so now. See | ||
| 5307 | * calc_cfs_shares(). | ||
| 5308 | */ | ||
| 5309 | if (wl < MIN_SHARES) | ||
| 5310 | wl = MIN_SHARES; | ||
| 5311 | |||
| 5312 | /* | ||
| 5313 | * wl = dw_i = S * (s'_i - s_i); see (3) | ||
| 5314 | */ | ||
| 5315 | wl -= se->avg.load_avg; | ||
| 5316 | |||
| 5317 | /* | ||
| 5318 | * Recursively apply this logic to all parent groups to compute | ||
| 5319 | * the final effective load change on the root group. Since | ||
| 5320 | * only the @tg group gets extra weight, all parent groups can | ||
| 5321 | * only redistribute existing shares. @wl is the shift in shares | ||
| 5322 | * resulting from this level per the above. | ||
| 5323 | */ | ||
| 5324 | wg = 0; | ||
| 5325 | } | ||
| 5326 | |||
| 5327 | return wl; | ||
| 5328 | } | ||
| 5329 | #else | ||
| 5330 | |||
| 5331 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | ||
| 5332 | { | ||
| 5333 | return wl; | ||
| 5334 | } | ||
| 5335 | |||
| 5336 | #endif | ||
| 5337 | |||
| 5338 | static void record_wakee(struct task_struct *p) | 5301 | static void record_wakee(struct task_struct *p) |
| 5339 | { | 5302 | { |
| 5340 | /* | 5303 | /* |
| @@ -5385,67 +5348,25 @@ static int wake_wide(struct task_struct *p) | |||
| 5385 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, | 5348 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, |
| 5386 | int prev_cpu, int sync) | 5349 | int prev_cpu, int sync) |
| 5387 | { | 5350 | { |
| 5388 | s64 this_load, load; | 5351 | int this_cpu = smp_processor_id(); |
| 5389 | s64 this_eff_load, prev_eff_load; | 5352 | bool affine = false; |
| 5390 | int idx, this_cpu; | ||
| 5391 | struct task_group *tg; | ||
| 5392 | unsigned long weight; | ||
| 5393 | int balanced; | ||
| 5394 | |||
| 5395 | idx = sd->wake_idx; | ||
| 5396 | this_cpu = smp_processor_id(); | ||
| 5397 | load = source_load(prev_cpu, idx); | ||
| 5398 | this_load = target_load(this_cpu, idx); | ||
| 5399 | 5353 | ||
| 5400 | /* | 5354 | /* |
| 5401 | * If sync wakeup then subtract the (maximum possible) | 5355 | * Common case: CPUs are in the same socket, and select_idle_sibling() |
| 5402 | * effect of the currently running task from the load | 5356 | * will do its thing regardless of what we return: |
| 5403 | * of the current CPU: | ||
| 5404 | */ | ||
| 5405 | if (sync) { | ||
| 5406 | tg = task_group(current); | ||
| 5407 | weight = current->se.avg.load_avg; | ||
| 5408 | |||
| 5409 | this_load += effective_load(tg, this_cpu, -weight, -weight); | ||
| 5410 | load += effective_load(tg, prev_cpu, 0, -weight); | ||
| 5411 | } | ||
| 5412 | |||
| 5413 | tg = task_group(p); | ||
| 5414 | weight = p->se.avg.load_avg; | ||
| 5415 | |||
| 5416 | /* | ||
| 5417 | * In low-load situations, where prev_cpu is idle and this_cpu is idle | ||
| 5418 | * due to the sync cause above having dropped this_load to 0, we'll | ||
| 5419 | * always have an imbalance, but there's really nothing you can do | ||
| 5420 | * about that, so that's good too. | ||
| 5421 | * | ||
| 5422 | * Otherwise check if either cpus are near enough in load to allow this | ||
| 5423 | * task to be woken on this_cpu. | ||
| 5424 | */ | 5357 | */ |
| 5425 | this_eff_load = 100; | 5358 | if (cpus_share_cache(prev_cpu, this_cpu)) |
| 5426 | this_eff_load *= capacity_of(prev_cpu); | 5359 | affine = true; |
| 5427 | 5360 | else | |
| 5428 | prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; | 5361 | affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync); |
| 5429 | prev_eff_load *= capacity_of(this_cpu); | ||
| 5430 | |||
| 5431 | if (this_load > 0) { | ||
| 5432 | this_eff_load *= this_load + | ||
| 5433 | effective_load(tg, this_cpu, weight, weight); | ||
| 5434 | |||
| 5435 | prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); | ||
| 5436 | } | ||
| 5437 | |||
| 5438 | balanced = this_eff_load <= prev_eff_load; | ||
| 5439 | 5362 | ||
| 5440 | schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); | 5363 | schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); |
| 5364 | if (affine) { | ||
| 5365 | schedstat_inc(sd->ttwu_move_affine); | ||
| 5366 | schedstat_inc(p->se.statistics.nr_wakeups_affine); | ||
| 5367 | } | ||
| 5441 | 5368 | ||
| 5442 | if (!balanced) | 5369 | return affine; |
| 5443 | return 0; | ||
| 5444 | |||
| 5445 | schedstat_inc(sd->ttwu_move_affine); | ||
| 5446 | schedstat_inc(p->se.statistics.nr_wakeups_affine); | ||
| 5447 | |||
| 5448 | return 1; | ||
| 5449 | } | 5370 | } |
| 5450 | 5371 | ||
| 5451 | static inline int task_util(struct task_struct *p); | 5372 | static inline int task_util(struct task_struct *p); |
| @@ -5484,12 +5405,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
| 5484 | int i; | 5405 | int i; |
| 5485 | 5406 | ||
| 5486 | /* Skip over this group if it has no CPUs allowed */ | 5407 | /* Skip over this group if it has no CPUs allowed */ |
| 5487 | if (!cpumask_intersects(sched_group_cpus(group), | 5408 | if (!cpumask_intersects(sched_group_span(group), |
| 5488 | &p->cpus_allowed)) | 5409 | &p->cpus_allowed)) |
| 5489 | continue; | 5410 | continue; |
| 5490 | 5411 | ||
| 5491 | local_group = cpumask_test_cpu(this_cpu, | 5412 | local_group = cpumask_test_cpu(this_cpu, |
| 5492 | sched_group_cpus(group)); | 5413 | sched_group_span(group)); |
| 5493 | 5414 | ||
| 5494 | /* | 5415 | /* |
| 5495 | * Tally up the load of all CPUs in the group and find | 5416 | * Tally up the load of all CPUs in the group and find |
| @@ -5499,7 +5420,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
| 5499 | runnable_load = 0; | 5420 | runnable_load = 0; |
| 5500 | max_spare_cap = 0; | 5421 | max_spare_cap = 0; |
| 5501 | 5422 | ||
| 5502 | for_each_cpu(i, sched_group_cpus(group)) { | 5423 | for_each_cpu(i, sched_group_span(group)) { |
| 5503 | /* Bias balancing toward cpus of our domain */ | 5424 | /* Bias balancing toward cpus of our domain */ |
| 5504 | if (local_group) | 5425 | if (local_group) |
| 5505 | load = source_load(i, load_idx); | 5426 | load = source_load(i, load_idx); |
| @@ -5602,10 +5523,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
| 5602 | 5523 | ||
| 5603 | /* Check if we have any choice: */ | 5524 | /* Check if we have any choice: */ |
| 5604 | if (group->group_weight == 1) | 5525 | if (group->group_weight == 1) |
| 5605 | return cpumask_first(sched_group_cpus(group)); | 5526 | return cpumask_first(sched_group_span(group)); |
| 5606 | 5527 | ||
| 5607 | /* Traverse only the allowed CPUs */ | 5528 | /* Traverse only the allowed CPUs */ |
| 5608 | for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { | 5529 | for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { |
| 5609 | if (idle_cpu(i)) { | 5530 | if (idle_cpu(i)) { |
| 5610 | struct rq *rq = cpu_rq(i); | 5531 | struct rq *rq = cpu_rq(i); |
| 5611 | struct cpuidle_state *idle = idle_get_state(rq); | 5532 | struct cpuidle_state *idle = idle_get_state(rq); |
| @@ -5640,43 +5561,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
| 5640 | return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; | 5561 | return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; |
| 5641 | } | 5562 | } |
| 5642 | 5563 | ||
| 5643 | /* | ||
| 5644 | * Implement a for_each_cpu() variant that starts the scan at a given cpu | ||
| 5645 | * (@start), and wraps around. | ||
| 5646 | * | ||
| 5647 | * This is used to scan for idle CPUs; such that not all CPUs looking for an | ||
| 5648 | * idle CPU find the same CPU. The down-side is that tasks tend to cycle | ||
| 5649 | * through the LLC domain. | ||
| 5650 | * | ||
| 5651 | * Especially tbench is found sensitive to this. | ||
| 5652 | */ | ||
| 5653 | |||
| 5654 | static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped) | ||
| 5655 | { | ||
| 5656 | int next; | ||
| 5657 | |||
| 5658 | again: | ||
| 5659 | next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1); | ||
| 5660 | |||
| 5661 | if (*wrapped) { | ||
| 5662 | if (next >= start) | ||
| 5663 | return nr_cpumask_bits; | ||
| 5664 | } else { | ||
| 5665 | if (next >= nr_cpumask_bits) { | ||
| 5666 | *wrapped = 1; | ||
| 5667 | n = -1; | ||
| 5668 | goto again; | ||
| 5669 | } | ||
| 5670 | } | ||
| 5671 | |||
| 5672 | return next; | ||
| 5673 | } | ||
| 5674 | |||
| 5675 | #define for_each_cpu_wrap(cpu, mask, start, wrap) \ | ||
| 5676 | for ((wrap) = 0, (cpu) = (start)-1; \ | ||
| 5677 | (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \ | ||
| 5678 | (cpu) < nr_cpumask_bits; ) | ||
| 5679 | |||
| 5680 | #ifdef CONFIG_SCHED_SMT | 5564 | #ifdef CONFIG_SCHED_SMT |
| 5681 | 5565 | ||
| 5682 | static inline void set_idle_cores(int cpu, int val) | 5566 | static inline void set_idle_cores(int cpu, int val) |
| @@ -5736,7 +5620,7 @@ unlock: | |||
| 5736 | static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) | 5620 | static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) |
| 5737 | { | 5621 | { |
| 5738 | struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); | 5622 | struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); |
| 5739 | int core, cpu, wrap; | 5623 | int core, cpu; |
| 5740 | 5624 | ||
| 5741 | if (!static_branch_likely(&sched_smt_present)) | 5625 | if (!static_branch_likely(&sched_smt_present)) |
| 5742 | return -1; | 5626 | return -1; |
| @@ -5746,7 +5630,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int | |||
| 5746 | 5630 | ||
| 5747 | cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); | 5631 | cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); |
| 5748 | 5632 | ||
| 5749 | for_each_cpu_wrap(core, cpus, target, wrap) { | 5633 | for_each_cpu_wrap(core, cpus, target) { |
| 5750 | bool idle = true; | 5634 | bool idle = true; |
| 5751 | 5635 | ||
| 5752 | for_each_cpu(cpu, cpu_smt_mask(core)) { | 5636 | for_each_cpu(cpu, cpu_smt_mask(core)) { |
| @@ -5809,27 +5693,38 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd | |||
| 5809 | static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) | 5693 | static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) |
| 5810 | { | 5694 | { |
| 5811 | struct sched_domain *this_sd; | 5695 | struct sched_domain *this_sd; |
| 5812 | u64 avg_cost, avg_idle = this_rq()->avg_idle; | 5696 | u64 avg_cost, avg_idle; |
| 5813 | u64 time, cost; | 5697 | u64 time, cost; |
| 5814 | s64 delta; | 5698 | s64 delta; |
| 5815 | int cpu, wrap; | 5699 | int cpu, nr = INT_MAX; |
| 5816 | 5700 | ||
| 5817 | this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); | 5701 | this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); |
| 5818 | if (!this_sd) | 5702 | if (!this_sd) |
| 5819 | return -1; | 5703 | return -1; |
| 5820 | 5704 | ||
| 5821 | avg_cost = this_sd->avg_scan_cost; | ||
| 5822 | |||
| 5823 | /* | 5705 | /* |
| 5824 | * Due to large variance we need a large fuzz factor; hackbench in | 5706 | * Due to large variance we need a large fuzz factor; hackbench in |
| 5825 | * particularly is sensitive here. | 5707 | * particularly is sensitive here. |
| 5826 | */ | 5708 | */ |
| 5827 | if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost) | 5709 | avg_idle = this_rq()->avg_idle / 512; |
| 5710 | avg_cost = this_sd->avg_scan_cost + 1; | ||
| 5711 | |||
| 5712 | if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost) | ||
| 5828 | return -1; | 5713 | return -1; |
| 5829 | 5714 | ||
| 5715 | if (sched_feat(SIS_PROP)) { | ||
| 5716 | u64 span_avg = sd->span_weight * avg_idle; | ||
| 5717 | if (span_avg > 4*avg_cost) | ||
| 5718 | nr = div_u64(span_avg, avg_cost); | ||
| 5719 | else | ||
| 5720 | nr = 4; | ||
| 5721 | } | ||
| 5722 | |||
| 5830 | time = local_clock(); | 5723 | time = local_clock(); |
| 5831 | 5724 | ||
| 5832 | for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { | 5725 | for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { |
| 5726 | if (!--nr) | ||
| 5727 | return -1; | ||
| 5833 | if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) | 5728 | if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) |
| 5834 | continue; | 5729 | continue; |
| 5835 | if (idle_cpu(cpu)) | 5730 | if (idle_cpu(cpu)) |
| @@ -6011,11 +5906,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 6011 | 5906 | ||
| 6012 | if (affine_sd) { | 5907 | if (affine_sd) { |
| 6013 | sd = NULL; /* Prefer wake_affine over balance flags */ | 5908 | sd = NULL; /* Prefer wake_affine over balance flags */ |
| 6014 | if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) | 5909 | if (cpu == prev_cpu) |
| 5910 | goto pick_cpu; | ||
| 5911 | |||
| 5912 | if (wake_affine(affine_sd, p, prev_cpu, sync)) | ||
| 6015 | new_cpu = cpu; | 5913 | new_cpu = cpu; |
| 6016 | } | 5914 | } |
| 6017 | 5915 | ||
| 6018 | if (!sd) { | 5916 | if (!sd) { |
| 5917 | pick_cpu: | ||
| 6019 | if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ | 5918 | if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ |
| 6020 | new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); | 5919 | new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); |
| 6021 | 5920 | ||
| @@ -6168,8 +6067,11 @@ static void set_last_buddy(struct sched_entity *se) | |||
| 6168 | if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) | 6067 | if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) |
| 6169 | return; | 6068 | return; |
| 6170 | 6069 | ||
| 6171 | for_each_sched_entity(se) | 6070 | for_each_sched_entity(se) { |
| 6071 | if (SCHED_WARN_ON(!se->on_rq)) | ||
| 6072 | return; | ||
| 6172 | cfs_rq_of(se)->last = se; | 6073 | cfs_rq_of(se)->last = se; |
| 6074 | } | ||
| 6173 | } | 6075 | } |
| 6174 | 6076 | ||
| 6175 | static void set_next_buddy(struct sched_entity *se) | 6077 | static void set_next_buddy(struct sched_entity *se) |
| @@ -6177,8 +6079,11 @@ static void set_next_buddy(struct sched_entity *se) | |||
| 6177 | if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) | 6079 | if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) |
| 6178 | return; | 6080 | return; |
| 6179 | 6081 | ||
| 6180 | for_each_sched_entity(se) | 6082 | for_each_sched_entity(se) { |
| 6083 | if (SCHED_WARN_ON(!se->on_rq)) | ||
| 6084 | return; | ||
| 6181 | cfs_rq_of(se)->next = se; | 6085 | cfs_rq_of(se)->next = se; |
| 6086 | } | ||
| 6182 | } | 6087 | } |
| 6183 | 6088 | ||
| 6184 | static void set_skip_buddy(struct sched_entity *se) | 6089 | static void set_skip_buddy(struct sched_entity *se) |
| @@ -6686,6 +6591,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | |||
| 6686 | if (dst_nid == p->numa_preferred_nid) | 6591 | if (dst_nid == p->numa_preferred_nid) |
| 6687 | return 0; | 6592 | return 0; |
| 6688 | 6593 | ||
| 6594 | /* Leaving a core idle is often worse than degrading locality. */ | ||
| 6595 | if (env->idle != CPU_NOT_IDLE) | ||
| 6596 | return -1; | ||
| 6597 | |||
| 6689 | if (numa_group) { | 6598 | if (numa_group) { |
| 6690 | src_faults = group_faults(p, src_nid); | 6599 | src_faults = group_faults(p, src_nid); |
| 6691 | dst_faults = group_faults(p, dst_nid); | 6600 | dst_faults = group_faults(p, dst_nid); |
| @@ -6737,10 +6646,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 6737 | * our sched_group. We may want to revisit it if we couldn't | 6646 | * our sched_group. We may want to revisit it if we couldn't |
| 6738 | * meet load balance goals by pulling other tasks on src_cpu. | 6647 | * meet load balance goals by pulling other tasks on src_cpu. |
| 6739 | * | 6648 | * |
| 6740 | * Also avoid computing new_dst_cpu if we have already computed | 6649 | * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have |
| 6741 | * one in current iteration. | 6650 | * already computed one in current iteration. |
| 6742 | */ | 6651 | */ |
| 6743 | if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) | 6652 | if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) |
| 6744 | return 0; | 6653 | return 0; |
| 6745 | 6654 | ||
| 6746 | /* Prevent to re-select dst_cpu via env's cpus */ | 6655 | /* Prevent to re-select dst_cpu via env's cpus */ |
| @@ -6970,10 +6879,28 @@ static void attach_tasks(struct lb_env *env) | |||
| 6970 | } | 6879 | } |
| 6971 | 6880 | ||
| 6972 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6881 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 6882 | |||
| 6883 | static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) | ||
| 6884 | { | ||
| 6885 | if (cfs_rq->load.weight) | ||
| 6886 | return false; | ||
| 6887 | |||
| 6888 | if (cfs_rq->avg.load_sum) | ||
| 6889 | return false; | ||
| 6890 | |||
| 6891 | if (cfs_rq->avg.util_sum) | ||
| 6892 | return false; | ||
| 6893 | |||
| 6894 | if (cfs_rq->runnable_load_sum) | ||
| 6895 | return false; | ||
| 6896 | |||
| 6897 | return true; | ||
| 6898 | } | ||
| 6899 | |||
| 6973 | static void update_blocked_averages(int cpu) | 6900 | static void update_blocked_averages(int cpu) |
| 6974 | { | 6901 | { |
| 6975 | struct rq *rq = cpu_rq(cpu); | 6902 | struct rq *rq = cpu_rq(cpu); |
| 6976 | struct cfs_rq *cfs_rq; | 6903 | struct cfs_rq *cfs_rq, *pos; |
| 6977 | struct rq_flags rf; | 6904 | struct rq_flags rf; |
| 6978 | 6905 | ||
| 6979 | rq_lock_irqsave(rq, &rf); | 6906 | rq_lock_irqsave(rq, &rf); |
| @@ -6983,7 +6910,7 @@ static void update_blocked_averages(int cpu) | |||
| 6983 | * Iterates the task_group tree in a bottom up fashion, see | 6910 | * Iterates the task_group tree in a bottom up fashion, see |
| 6984 | * list_add_leaf_cfs_rq() for details. | 6911 | * list_add_leaf_cfs_rq() for details. |
| 6985 | */ | 6912 | */ |
| 6986 | for_each_leaf_cfs_rq(rq, cfs_rq) { | 6913 | for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { |
| 6987 | struct sched_entity *se; | 6914 | struct sched_entity *se; |
| 6988 | 6915 | ||
| 6989 | /* throttled entities do not contribute to load */ | 6916 | /* throttled entities do not contribute to load */ |
| @@ -6997,6 +6924,13 @@ static void update_blocked_averages(int cpu) | |||
| 6997 | se = cfs_rq->tg->se[cpu]; | 6924 | se = cfs_rq->tg->se[cpu]; |
| 6998 | if (se && !skip_blocked_update(se)) | 6925 | if (se && !skip_blocked_update(se)) |
| 6999 | update_load_avg(se, 0); | 6926 | update_load_avg(se, 0); |
| 6927 | |||
| 6928 | /* | ||
| 6929 | * There can be a lot of idle CPU cgroups. Don't let fully | ||
| 6930 | * decayed cfs_rqs linger on the list. | ||
| 6931 | */ | ||
| 6932 | if (cfs_rq_is_decayed(cfs_rq)) | ||
| 6933 | list_del_leaf_cfs_rq(cfs_rq); | ||
| 7000 | } | 6934 | } |
| 7001 | rq_unlock_irqrestore(rq, &rf); | 6935 | rq_unlock_irqrestore(rq, &rf); |
| 7002 | } | 6936 | } |
| @@ -7229,7 +7163,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 7229 | * span the current group. | 7163 | * span the current group. |
| 7230 | */ | 7164 | */ |
| 7231 | 7165 | ||
| 7232 | for_each_cpu(cpu, sched_group_cpus(sdg)) { | 7166 | for_each_cpu(cpu, sched_group_span(sdg)) { |
| 7233 | struct sched_group_capacity *sgc; | 7167 | struct sched_group_capacity *sgc; |
| 7234 | struct rq *rq = cpu_rq(cpu); | 7168 | struct rq *rq = cpu_rq(cpu); |
| 7235 | 7169 | ||
| @@ -7408,7 +7342,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 7408 | 7342 | ||
| 7409 | memset(sgs, 0, sizeof(*sgs)); | 7343 | memset(sgs, 0, sizeof(*sgs)); |
| 7410 | 7344 | ||
| 7411 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 7345 | for_each_cpu_and(i, sched_group_span(group), env->cpus) { |
| 7412 | struct rq *rq = cpu_rq(i); | 7346 | struct rq *rq = cpu_rq(i); |
| 7413 | 7347 | ||
| 7414 | /* Bias balancing toward cpus of our domain */ | 7348 | /* Bias balancing toward cpus of our domain */ |
| @@ -7572,7 +7506,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
| 7572 | struct sg_lb_stats *sgs = &tmp_sgs; | 7506 | struct sg_lb_stats *sgs = &tmp_sgs; |
| 7573 | int local_group; | 7507 | int local_group; |
| 7574 | 7508 | ||
| 7575 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); | 7509 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg)); |
| 7576 | if (local_group) { | 7510 | if (local_group) { |
| 7577 | sds->local = sg; | 7511 | sds->local = sg; |
| 7578 | sgs = local; | 7512 | sgs = local; |
| @@ -7927,7 +7861,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 7927 | unsigned long busiest_load = 0, busiest_capacity = 1; | 7861 | unsigned long busiest_load = 0, busiest_capacity = 1; |
| 7928 | int i; | 7862 | int i; |
| 7929 | 7863 | ||
| 7930 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 7864 | for_each_cpu_and(i, sched_group_span(group), env->cpus) { |
| 7931 | unsigned long capacity, wl; | 7865 | unsigned long capacity, wl; |
| 7932 | enum fbq_type rt; | 7866 | enum fbq_type rt; |
| 7933 | 7867 | ||
| @@ -8033,7 +7967,6 @@ static int active_load_balance_cpu_stop(void *data); | |||
| 8033 | static int should_we_balance(struct lb_env *env) | 7967 | static int should_we_balance(struct lb_env *env) |
| 8034 | { | 7968 | { |
| 8035 | struct sched_group *sg = env->sd->groups; | 7969 | struct sched_group *sg = env->sd->groups; |
| 8036 | struct cpumask *sg_cpus, *sg_mask; | ||
| 8037 | int cpu, balance_cpu = -1; | 7970 | int cpu, balance_cpu = -1; |
| 8038 | 7971 | ||
| 8039 | /* | 7972 | /* |
| @@ -8043,11 +7976,9 @@ static int should_we_balance(struct lb_env *env) | |||
| 8043 | if (env->idle == CPU_NEWLY_IDLE) | 7976 | if (env->idle == CPU_NEWLY_IDLE) |
| 8044 | return 1; | 7977 | return 1; |
| 8045 | 7978 | ||
| 8046 | sg_cpus = sched_group_cpus(sg); | ||
| 8047 | sg_mask = sched_group_mask(sg); | ||
| 8048 | /* Try to find first idle cpu */ | 7979 | /* Try to find first idle cpu */ |
| 8049 | for_each_cpu_and(cpu, sg_cpus, env->cpus) { | 7980 | for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { |
| 8050 | if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) | 7981 | if (!idle_cpu(cpu)) |
| 8051 | continue; | 7982 | continue; |
| 8052 | 7983 | ||
| 8053 | balance_cpu = cpu; | 7984 | balance_cpu = cpu; |
| @@ -8083,7 +8014,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 8083 | .sd = sd, | 8014 | .sd = sd, |
| 8084 | .dst_cpu = this_cpu, | 8015 | .dst_cpu = this_cpu, |
| 8085 | .dst_rq = this_rq, | 8016 | .dst_rq = this_rq, |
| 8086 | .dst_grpmask = sched_group_cpus(sd->groups), | 8017 | .dst_grpmask = sched_group_span(sd->groups), |
| 8087 | .idle = idle, | 8018 | .idle = idle, |
| 8088 | .loop_break = sched_nr_migrate_break, | 8019 | .loop_break = sched_nr_migrate_break, |
| 8089 | .cpus = cpus, | 8020 | .cpus = cpus, |
| @@ -8091,14 +8022,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 8091 | .tasks = LIST_HEAD_INIT(env.tasks), | 8022 | .tasks = LIST_HEAD_INIT(env.tasks), |
| 8092 | }; | 8023 | }; |
| 8093 | 8024 | ||
| 8094 | /* | 8025 | cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); |
| 8095 | * For NEWLY_IDLE load_balancing, we don't need to consider | ||
| 8096 | * other cpus in our group | ||
| 8097 | */ | ||
| 8098 | if (idle == CPU_NEWLY_IDLE) | ||
| 8099 | env.dst_grpmask = NULL; | ||
| 8100 | |||
| 8101 | cpumask_copy(cpus, cpu_active_mask); | ||
| 8102 | 8026 | ||
| 8103 | schedstat_inc(sd->lb_count[idle]); | 8027 | schedstat_inc(sd->lb_count[idle]); |
| 8104 | 8028 | ||
| @@ -8220,7 +8144,15 @@ more_balance: | |||
| 8220 | /* All tasks on this runqueue were pinned by CPU affinity */ | 8144 | /* All tasks on this runqueue were pinned by CPU affinity */ |
| 8221 | if (unlikely(env.flags & LBF_ALL_PINNED)) { | 8145 | if (unlikely(env.flags & LBF_ALL_PINNED)) { |
| 8222 | cpumask_clear_cpu(cpu_of(busiest), cpus); | 8146 | cpumask_clear_cpu(cpu_of(busiest), cpus); |
| 8223 | if (!cpumask_empty(cpus)) { | 8147 | /* |
| 8148 | * Attempting to continue load balancing at the current | ||
| 8149 | * sched_domain level only makes sense if there are | ||
| 8150 | * active CPUs remaining as possible busiest CPUs to | ||
| 8151 | * pull load from which are not contained within the | ||
| 8152 | * destination group that is receiving any migrated | ||
| 8153 | * load. | ||
| 8154 | */ | ||
| 8155 | if (!cpumask_subset(cpus, env.dst_grpmask)) { | ||
| 8224 | env.loop = 0; | 8156 | env.loop = 0; |
| 8225 | env.loop_break = sched_nr_migrate_break; | 8157 | env.loop_break = sched_nr_migrate_break; |
| 8226 | goto redo; | 8158 | goto redo; |
| @@ -8516,6 +8448,13 @@ static int active_load_balance_cpu_stop(void *data) | |||
| 8516 | .src_cpu = busiest_rq->cpu, | 8448 | .src_cpu = busiest_rq->cpu, |
| 8517 | .src_rq = busiest_rq, | 8449 | .src_rq = busiest_rq, |
| 8518 | .idle = CPU_IDLE, | 8450 | .idle = CPU_IDLE, |
| 8451 | /* | ||
| 8452 | * can_migrate_task() doesn't need to compute new_dst_cpu | ||
| 8453 | * for active balancing. Since we have CPU_IDLE, but no | ||
| 8454 | * @dst_grpmask we need to make that test go away with lying | ||
| 8455 | * about DST_PINNED. | ||
| 8456 | */ | ||
| 8457 | .flags = LBF_DST_PINNED, | ||
| 8519 | }; | 8458 | }; |
| 8520 | 8459 | ||
| 8521 | schedstat_inc(sd->alb_count); | 8460 | schedstat_inc(sd->alb_count); |
| @@ -8659,6 +8598,10 @@ void nohz_balance_enter_idle(int cpu) | |||
| 8659 | if (!cpu_active(cpu)) | 8598 | if (!cpu_active(cpu)) |
| 8660 | return; | 8599 | return; |
| 8661 | 8600 | ||
| 8601 | /* Spare idle load balancing on CPUs that don't want to be disturbed: */ | ||
| 8602 | if (!is_housekeeping_cpu(cpu)) | ||
| 8603 | return; | ||
| 8604 | |||
| 8662 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) | 8605 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) |
| 8663 | return; | 8606 | return; |
| 8664 | 8607 | ||
| @@ -9523,10 +9466,10 @@ const struct sched_class fair_sched_class = { | |||
| 9523 | #ifdef CONFIG_SCHED_DEBUG | 9466 | #ifdef CONFIG_SCHED_DEBUG |
| 9524 | void print_cfs_stats(struct seq_file *m, int cpu) | 9467 | void print_cfs_stats(struct seq_file *m, int cpu) |
| 9525 | { | 9468 | { |
| 9526 | struct cfs_rq *cfs_rq; | 9469 | struct cfs_rq *cfs_rq, *pos; |
| 9527 | 9470 | ||
| 9528 | rcu_read_lock(); | 9471 | rcu_read_lock(); |
| 9529 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) | 9472 | for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos) |
| 9530 | print_cfs_rq(m, cpu, cfs_rq); | 9473 | print_cfs_rq(m, cpu, cfs_rq); |
| 9531 | rcu_read_unlock(); | 9474 | rcu_read_unlock(); |
| 9532 | } | 9475 | } |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 11192e0cb122..d3fb15555291 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
| @@ -55,6 +55,7 @@ SCHED_FEAT(TTWU_QUEUE, true) | |||
| 55 | * When doing wakeups, attempt to limit superfluous scans of the LLC domain. | 55 | * When doing wakeups, attempt to limit superfluous scans of the LLC domain. |
| 56 | */ | 56 | */ |
| 57 | SCHED_FEAT(SIS_AVG_CPU, false) | 57 | SCHED_FEAT(SIS_AVG_CPU, false) |
| 58 | SCHED_FEAT(SIS_PROP, true) | ||
| 58 | 59 | ||
| 59 | /* | 60 | /* |
| 60 | * Issue a WARN when we do multiple update_rq_clock() calls | 61 | * Issue a WARN when we do multiple update_rq_clock() calls |
| @@ -76,7 +77,6 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false) | |||
| 76 | SCHED_FEAT(RT_PUSH_IPI, true) | 77 | SCHED_FEAT(RT_PUSH_IPI, true) |
| 77 | #endif | 78 | #endif |
| 78 | 79 | ||
| 79 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | ||
| 80 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 80 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
| 81 | SCHED_FEAT(LB_MIN, false) | 81 | SCHED_FEAT(LB_MIN, false) |
| 82 | SCHED_FEAT(ATTACH_AGE_LOAD, true) | 82 | SCHED_FEAT(ATTACH_AGE_LOAD, true) |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index ef63adce0c9c..6c23e30c0e5c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
| @@ -219,6 +219,7 @@ static void do_idle(void) | |||
| 219 | */ | 219 | */ |
| 220 | 220 | ||
| 221 | __current_set_polling(); | 221 | __current_set_polling(); |
| 222 | quiet_vmstat(); | ||
| 222 | tick_nohz_idle_enter(); | 223 | tick_nohz_idle_enter(); |
| 223 | 224 | ||
| 224 | while (!need_resched()) { | 225 | while (!need_resched()) { |
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index f15fb2bdbc0d..f14716a3522f 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c | |||
| @@ -117,7 +117,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
| 117 | * load-average relies on per-cpu sampling from the tick, it is affected by | 117 | * load-average relies on per-cpu sampling from the tick, it is affected by |
| 118 | * NO_HZ. | 118 | * NO_HZ. |
| 119 | * | 119 | * |
| 120 | * The basic idea is to fold the nr_active delta into a global idle-delta upon | 120 | * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon |
| 121 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | 121 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta |
| 122 | * when we read the global state. | 122 | * when we read the global state. |
| 123 | * | 123 | * |
| @@ -126,7 +126,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
| 126 | * - When we go NO_HZ idle during the window, we can negate our sample | 126 | * - When we go NO_HZ idle during the window, we can negate our sample |
| 127 | * contribution, causing under-accounting. | 127 | * contribution, causing under-accounting. |
| 128 | * | 128 | * |
| 129 | * We avoid this by keeping two idle-delta counters and flipping them | 129 | * We avoid this by keeping two NO_HZ-delta counters and flipping them |
| 130 | * when the window starts, thus separating old and new NO_HZ load. | 130 | * when the window starts, thus separating old and new NO_HZ load. |
| 131 | * | 131 | * |
| 132 | * The only trick is the slight shift in index flip for read vs write. | 132 | * The only trick is the slight shift in index flip for read vs write. |
| @@ -137,22 +137,22 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
| 137 | * r:0 0 1 1 0 0 1 1 0 | 137 | * r:0 0 1 1 0 0 1 1 0 |
| 138 | * w:0 1 1 0 0 1 1 0 0 | 138 | * w:0 1 1 0 0 1 1 0 0 |
| 139 | * | 139 | * |
| 140 | * This ensures we'll fold the old idle contribution in this window while | 140 | * This ensures we'll fold the old NO_HZ contribution in this window while |
| 141 | * accumlating the new one. | 141 | * accumlating the new one. |
| 142 | * | 142 | * |
| 143 | * - When we wake up from NO_HZ idle during the window, we push up our | 143 | * - When we wake up from NO_HZ during the window, we push up our |
| 144 | * contribution, since we effectively move our sample point to a known | 144 | * contribution, since we effectively move our sample point to a known |
| 145 | * busy state. | 145 | * busy state. |
| 146 | * | 146 | * |
| 147 | * This is solved by pushing the window forward, and thus skipping the | 147 | * This is solved by pushing the window forward, and thus skipping the |
| 148 | * sample, for this cpu (effectively using the idle-delta for this cpu which | 148 | * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which |
| 149 | * was in effect at the time the window opened). This also solves the issue | 149 | * was in effect at the time the window opened). This also solves the issue |
| 150 | * of having to deal with a cpu having been in NOHZ idle for multiple | 150 | * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ |
| 151 | * LOAD_FREQ intervals. | 151 | * intervals. |
| 152 | * | 152 | * |
| 153 | * When making the ILB scale, we should try to pull this in as well. | 153 | * When making the ILB scale, we should try to pull this in as well. |
| 154 | */ | 154 | */ |
| 155 | static atomic_long_t calc_load_idle[2]; | 155 | static atomic_long_t calc_load_nohz[2]; |
| 156 | static int calc_load_idx; | 156 | static int calc_load_idx; |
| 157 | 157 | ||
| 158 | static inline int calc_load_write_idx(void) | 158 | static inline int calc_load_write_idx(void) |
| @@ -167,7 +167,7 @@ static inline int calc_load_write_idx(void) | |||
| 167 | 167 | ||
| 168 | /* | 168 | /* |
| 169 | * If the folding window started, make sure we start writing in the | 169 | * If the folding window started, make sure we start writing in the |
| 170 | * next idle-delta. | 170 | * next NO_HZ-delta. |
| 171 | */ | 171 | */ |
| 172 | if (!time_before(jiffies, READ_ONCE(calc_load_update))) | 172 | if (!time_before(jiffies, READ_ONCE(calc_load_update))) |
| 173 | idx++; | 173 | idx++; |
| @@ -180,24 +180,24 @@ static inline int calc_load_read_idx(void) | |||
| 180 | return calc_load_idx & 1; | 180 | return calc_load_idx & 1; |
| 181 | } | 181 | } |
| 182 | 182 | ||
| 183 | void calc_load_enter_idle(void) | 183 | void calc_load_nohz_start(void) |
| 184 | { | 184 | { |
| 185 | struct rq *this_rq = this_rq(); | 185 | struct rq *this_rq = this_rq(); |
| 186 | long delta; | 186 | long delta; |
| 187 | 187 | ||
| 188 | /* | 188 | /* |
| 189 | * We're going into NOHZ mode, if there's any pending delta, fold it | 189 | * We're going into NO_HZ mode, if there's any pending delta, fold it |
| 190 | * into the pending idle delta. | 190 | * into the pending NO_HZ delta. |
| 191 | */ | 191 | */ |
| 192 | delta = calc_load_fold_active(this_rq, 0); | 192 | delta = calc_load_fold_active(this_rq, 0); |
| 193 | if (delta) { | 193 | if (delta) { |
| 194 | int idx = calc_load_write_idx(); | 194 | int idx = calc_load_write_idx(); |
| 195 | 195 | ||
| 196 | atomic_long_add(delta, &calc_load_idle[idx]); | 196 | atomic_long_add(delta, &calc_load_nohz[idx]); |
| 197 | } | 197 | } |
| 198 | } | 198 | } |
| 199 | 199 | ||
| 200 | void calc_load_exit_idle(void) | 200 | void calc_load_nohz_stop(void) |
| 201 | { | 201 | { |
| 202 | struct rq *this_rq = this_rq(); | 202 | struct rq *this_rq = this_rq(); |
| 203 | 203 | ||
| @@ -217,13 +217,13 @@ void calc_load_exit_idle(void) | |||
| 217 | this_rq->calc_load_update += LOAD_FREQ; | 217 | this_rq->calc_load_update += LOAD_FREQ; |
| 218 | } | 218 | } |
| 219 | 219 | ||
| 220 | static long calc_load_fold_idle(void) | 220 | static long calc_load_nohz_fold(void) |
| 221 | { | 221 | { |
| 222 | int idx = calc_load_read_idx(); | 222 | int idx = calc_load_read_idx(); |
| 223 | long delta = 0; | 223 | long delta = 0; |
| 224 | 224 | ||
| 225 | if (atomic_long_read(&calc_load_idle[idx])) | 225 | if (atomic_long_read(&calc_load_nohz[idx])) |
| 226 | delta = atomic_long_xchg(&calc_load_idle[idx], 0); | 226 | delta = atomic_long_xchg(&calc_load_nohz[idx], 0); |
| 227 | 227 | ||
| 228 | return delta; | 228 | return delta; |
| 229 | } | 229 | } |
| @@ -299,9 +299,9 @@ calc_load_n(unsigned long load, unsigned long exp, | |||
| 299 | 299 | ||
| 300 | /* | 300 | /* |
| 301 | * NO_HZ can leave us missing all per-cpu ticks calling | 301 | * NO_HZ can leave us missing all per-cpu ticks calling |
| 302 | * calc_load_account_active(), but since an idle CPU folds its delta into | 302 | * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into |
| 303 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold | 303 | * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold |
| 304 | * in the pending idle delta if our idle period crossed a load cycle boundary. | 304 | * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. |
| 305 | * | 305 | * |
| 306 | * Once we've updated the global active value, we need to apply the exponential | 306 | * Once we've updated the global active value, we need to apply the exponential |
| 307 | * weights adjusted to the number of cycles missed. | 307 | * weights adjusted to the number of cycles missed. |
| @@ -330,7 +330,7 @@ static void calc_global_nohz(void) | |||
| 330 | } | 330 | } |
| 331 | 331 | ||
| 332 | /* | 332 | /* |
| 333 | * Flip the idle index... | 333 | * Flip the NO_HZ index... |
| 334 | * | 334 | * |
| 335 | * Make sure we first write the new time then flip the index, so that | 335 | * Make sure we first write the new time then flip the index, so that |
| 336 | * calc_load_write_idx() will see the new time when it reads the new | 336 | * calc_load_write_idx() will see the new time when it reads the new |
| @@ -341,7 +341,7 @@ static void calc_global_nohz(void) | |||
| 341 | } | 341 | } |
| 342 | #else /* !CONFIG_NO_HZ_COMMON */ | 342 | #else /* !CONFIG_NO_HZ_COMMON */ |
| 343 | 343 | ||
| 344 | static inline long calc_load_fold_idle(void) { return 0; } | 344 | static inline long calc_load_nohz_fold(void) { return 0; } |
| 345 | static inline void calc_global_nohz(void) { } | 345 | static inline void calc_global_nohz(void) { } |
| 346 | 346 | ||
| 347 | #endif /* CONFIG_NO_HZ_COMMON */ | 347 | #endif /* CONFIG_NO_HZ_COMMON */ |
| @@ -362,9 +362,9 @@ void calc_global_load(unsigned long ticks) | |||
| 362 | return; | 362 | return; |
| 363 | 363 | ||
| 364 | /* | 364 | /* |
| 365 | * Fold the 'old' idle-delta to include all NO_HZ cpus. | 365 | * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. |
| 366 | */ | 366 | */ |
| 367 | delta = calc_load_fold_idle(); | 367 | delta = calc_load_nohz_fold(); |
| 368 | if (delta) | 368 | if (delta) |
| 369 | atomic_long_add(delta, &calc_load_tasks); | 369 | atomic_long_add(delta, &calc_load_tasks); |
| 370 | 370 | ||
| @@ -378,7 +378,8 @@ void calc_global_load(unsigned long ticks) | |||
| 378 | WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ); | 378 | WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ); |
| 379 | 379 | ||
| 380 | /* | 380 | /* |
| 381 | * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. | 381 | * In case we went to NO_HZ for multiple LOAD_FREQ intervals |
| 382 | * catch up in bulk. | ||
| 382 | */ | 383 | */ |
| 383 | calc_global_nohz(); | 384 | calc_global_nohz(); |
| 384 | } | 385 | } |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 979b7341008a..45caf937ef90 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -840,6 +840,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
| 840 | int enqueue = 0; | 840 | int enqueue = 0; |
| 841 | struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); | 841 | struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); |
| 842 | struct rq *rq = rq_of_rt_rq(rt_rq); | 842 | struct rq *rq = rq_of_rt_rq(rt_rq); |
| 843 | int skip; | ||
| 844 | |||
| 845 | /* | ||
| 846 | * When span == cpu_online_mask, taking each rq->lock | ||
| 847 | * can be time-consuming. Try to avoid it when possible. | ||
| 848 | */ | ||
| 849 | raw_spin_lock(&rt_rq->rt_runtime_lock); | ||
| 850 | skip = !rt_rq->rt_time && !rt_rq->rt_nr_running; | ||
| 851 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 852 | if (skip) | ||
| 853 | continue; | ||
| 843 | 854 | ||
| 844 | raw_spin_lock(&rq->lock); | 855 | raw_spin_lock(&rq->lock); |
| 845 | if (rt_rq->rt_time) { | 856 | if (rt_rq->rt_time) { |
| @@ -1819,7 +1830,7 @@ retry: | |||
| 1819 | * pushing. | 1830 | * pushing. |
| 1820 | */ | 1831 | */ |
| 1821 | task = pick_next_pushable_task(rq); | 1832 | task = pick_next_pushable_task(rq); |
| 1822 | if (task_cpu(next_task) == rq->cpu && task == next_task) { | 1833 | if (task == next_task) { |
| 1823 | /* | 1834 | /* |
| 1824 | * The task hasn't migrated, and is still the next | 1835 | * The task hasn't migrated, and is still the next |
| 1825 | * eligible task, but we failed to find a run-queue | 1836 | * eligible task, but we failed to find a run-queue |
| @@ -2438,6 +2449,316 @@ const struct sched_class rt_sched_class = { | |||
| 2438 | .update_curr = update_curr_rt, | 2449 | .update_curr = update_curr_rt, |
| 2439 | }; | 2450 | }; |
| 2440 | 2451 | ||
| 2452 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 2453 | /* | ||
| 2454 | * Ensure that the real time constraints are schedulable. | ||
| 2455 | */ | ||
| 2456 | static DEFINE_MUTEX(rt_constraints_mutex); | ||
| 2457 | |||
| 2458 | /* Must be called with tasklist_lock held */ | ||
| 2459 | static inline int tg_has_rt_tasks(struct task_group *tg) | ||
| 2460 | { | ||
| 2461 | struct task_struct *g, *p; | ||
| 2462 | |||
| 2463 | /* | ||
| 2464 | * Autogroups do not have RT tasks; see autogroup_create(). | ||
| 2465 | */ | ||
| 2466 | if (task_group_is_autogroup(tg)) | ||
| 2467 | return 0; | ||
| 2468 | |||
| 2469 | for_each_process_thread(g, p) { | ||
| 2470 | if (rt_task(p) && task_group(p) == tg) | ||
| 2471 | return 1; | ||
| 2472 | } | ||
| 2473 | |||
| 2474 | return 0; | ||
| 2475 | } | ||
| 2476 | |||
| 2477 | struct rt_schedulable_data { | ||
| 2478 | struct task_group *tg; | ||
| 2479 | u64 rt_period; | ||
| 2480 | u64 rt_runtime; | ||
| 2481 | }; | ||
| 2482 | |||
| 2483 | static int tg_rt_schedulable(struct task_group *tg, void *data) | ||
| 2484 | { | ||
| 2485 | struct rt_schedulable_data *d = data; | ||
| 2486 | struct task_group *child; | ||
| 2487 | unsigned long total, sum = 0; | ||
| 2488 | u64 period, runtime; | ||
| 2489 | |||
| 2490 | period = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
| 2491 | runtime = tg->rt_bandwidth.rt_runtime; | ||
| 2492 | |||
| 2493 | if (tg == d->tg) { | ||
| 2494 | period = d->rt_period; | ||
| 2495 | runtime = d->rt_runtime; | ||
| 2496 | } | ||
| 2497 | |||
| 2498 | /* | ||
| 2499 | * Cannot have more runtime than the period. | ||
| 2500 | */ | ||
| 2501 | if (runtime > period && runtime != RUNTIME_INF) | ||
| 2502 | return -EINVAL; | ||
| 2503 | |||
| 2504 | /* | ||
| 2505 | * Ensure we don't starve existing RT tasks. | ||
| 2506 | */ | ||
| 2507 | if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) | ||
| 2508 | return -EBUSY; | ||
| 2509 | |||
| 2510 | total = to_ratio(period, runtime); | ||
| 2511 | |||
| 2512 | /* | ||
| 2513 | * Nobody can have more than the global setting allows. | ||
| 2514 | */ | ||
| 2515 | if (total > to_ratio(global_rt_period(), global_rt_runtime())) | ||
| 2516 | return -EINVAL; | ||
| 2517 | |||
| 2518 | /* | ||
| 2519 | * The sum of our children's runtime should not exceed our own. | ||
| 2520 | */ | ||
| 2521 | list_for_each_entry_rcu(child, &tg->children, siblings) { | ||
| 2522 | period = ktime_to_ns(child->rt_bandwidth.rt_period); | ||
| 2523 | runtime = child->rt_bandwidth.rt_runtime; | ||
| 2524 | |||
| 2525 | if (child == d->tg) { | ||
| 2526 | period = d->rt_period; | ||
| 2527 | runtime = d->rt_runtime; | ||
| 2528 | } | ||
| 2529 | |||
| 2530 | sum += to_ratio(period, runtime); | ||
| 2531 | } | ||
| 2532 | |||
| 2533 | if (sum > total) | ||
| 2534 | return -EINVAL; | ||
| 2535 | |||
| 2536 | return 0; | ||
| 2537 | } | ||
| 2538 | |||
| 2539 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | ||
| 2540 | { | ||
| 2541 | int ret; | ||
| 2542 | |||
| 2543 | struct rt_schedulable_data data = { | ||
| 2544 | .tg = tg, | ||
| 2545 | .rt_period = period, | ||
| 2546 | .rt_runtime = runtime, | ||
| 2547 | }; | ||
| 2548 | |||
| 2549 | rcu_read_lock(); | ||
| 2550 | ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); | ||
| 2551 | rcu_read_unlock(); | ||
| 2552 | |||
| 2553 | return ret; | ||
| 2554 | } | ||
| 2555 | |||
| 2556 | static int tg_set_rt_bandwidth(struct task_group *tg, | ||
| 2557 | u64 rt_period, u64 rt_runtime) | ||
| 2558 | { | ||
| 2559 | int i, err = 0; | ||
| 2560 | |||
| 2561 | /* | ||
| 2562 | * Disallowing the root group RT runtime is BAD, it would disallow the | ||
| 2563 | * kernel creating (and or operating) RT threads. | ||
| 2564 | */ | ||
| 2565 | if (tg == &root_task_group && rt_runtime == 0) | ||
| 2566 | return -EINVAL; | ||
| 2567 | |||
| 2568 | /* No period doesn't make any sense. */ | ||
| 2569 | if (rt_period == 0) | ||
| 2570 | return -EINVAL; | ||
| 2571 | |||
| 2572 | mutex_lock(&rt_constraints_mutex); | ||
| 2573 | read_lock(&tasklist_lock); | ||
| 2574 | err = __rt_schedulable(tg, rt_period, rt_runtime); | ||
| 2575 | if (err) | ||
| 2576 | goto unlock; | ||
| 2577 | |||
| 2578 | raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); | ||
| 2579 | tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); | ||
| 2580 | tg->rt_bandwidth.rt_runtime = rt_runtime; | ||
| 2581 | |||
| 2582 | for_each_possible_cpu(i) { | ||
| 2583 | struct rt_rq *rt_rq = tg->rt_rq[i]; | ||
| 2584 | |||
| 2585 | raw_spin_lock(&rt_rq->rt_runtime_lock); | ||
| 2586 | rt_rq->rt_runtime = rt_runtime; | ||
| 2587 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 2588 | } | ||
| 2589 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); | ||
| 2590 | unlock: | ||
| 2591 | read_unlock(&tasklist_lock); | ||
| 2592 | mutex_unlock(&rt_constraints_mutex); | ||
| 2593 | |||
| 2594 | return err; | ||
| 2595 | } | ||
| 2596 | |||
| 2597 | int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | ||
| 2598 | { | ||
| 2599 | u64 rt_runtime, rt_period; | ||
| 2600 | |||
| 2601 | rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
| 2602 | rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; | ||
| 2603 | if (rt_runtime_us < 0) | ||
| 2604 | rt_runtime = RUNTIME_INF; | ||
| 2605 | |||
| 2606 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); | ||
| 2607 | } | ||
| 2608 | |||
| 2609 | long sched_group_rt_runtime(struct task_group *tg) | ||
| 2610 | { | ||
| 2611 | u64 rt_runtime_us; | ||
| 2612 | |||
| 2613 | if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) | ||
| 2614 | return -1; | ||
| 2615 | |||
| 2616 | rt_runtime_us = tg->rt_bandwidth.rt_runtime; | ||
| 2617 | do_div(rt_runtime_us, NSEC_PER_USEC); | ||
| 2618 | return rt_runtime_us; | ||
| 2619 | } | ||
| 2620 | |||
| 2621 | int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) | ||
| 2622 | { | ||
| 2623 | u64 rt_runtime, rt_period; | ||
| 2624 | |||
| 2625 | rt_period = rt_period_us * NSEC_PER_USEC; | ||
| 2626 | rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
| 2627 | |||
| 2628 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); | ||
| 2629 | } | ||
| 2630 | |||
| 2631 | long sched_group_rt_period(struct task_group *tg) | ||
| 2632 | { | ||
| 2633 | u64 rt_period_us; | ||
| 2634 | |||
| 2635 | rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); | ||
| 2636 | do_div(rt_period_us, NSEC_PER_USEC); | ||
| 2637 | return rt_period_us; | ||
| 2638 | } | ||
| 2639 | |||
| 2640 | static int sched_rt_global_constraints(void) | ||
| 2641 | { | ||
| 2642 | int ret = 0; | ||
| 2643 | |||
| 2644 | mutex_lock(&rt_constraints_mutex); | ||
| 2645 | read_lock(&tasklist_lock); | ||
| 2646 | ret = __rt_schedulable(NULL, 0, 0); | ||
| 2647 | read_unlock(&tasklist_lock); | ||
| 2648 | mutex_unlock(&rt_constraints_mutex); | ||
| 2649 | |||
| 2650 | return ret; | ||
| 2651 | } | ||
| 2652 | |||
| 2653 | int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) | ||
| 2654 | { | ||
| 2655 | /* Don't accept realtime tasks when there is no way for them to run */ | ||
| 2656 | if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) | ||
| 2657 | return 0; | ||
| 2658 | |||
| 2659 | return 1; | ||
| 2660 | } | ||
| 2661 | |||
| 2662 | #else /* !CONFIG_RT_GROUP_SCHED */ | ||
| 2663 | static int sched_rt_global_constraints(void) | ||
| 2664 | { | ||
| 2665 | unsigned long flags; | ||
| 2666 | int i; | ||
| 2667 | |||
| 2668 | raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); | ||
| 2669 | for_each_possible_cpu(i) { | ||
| 2670 | struct rt_rq *rt_rq = &cpu_rq(i)->rt; | ||
| 2671 | |||
| 2672 | raw_spin_lock(&rt_rq->rt_runtime_lock); | ||
| 2673 | rt_rq->rt_runtime = global_rt_runtime(); | ||
| 2674 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | ||
| 2675 | } | ||
| 2676 | raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); | ||
| 2677 | |||
| 2678 | return 0; | ||
| 2679 | } | ||
| 2680 | #endif /* CONFIG_RT_GROUP_SCHED */ | ||
| 2681 | |||
| 2682 | static int sched_rt_global_validate(void) | ||
| 2683 | { | ||
| 2684 | if (sysctl_sched_rt_period <= 0) | ||
| 2685 | return -EINVAL; | ||
| 2686 | |||
| 2687 | if ((sysctl_sched_rt_runtime != RUNTIME_INF) && | ||
| 2688 | (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) | ||
| 2689 | return -EINVAL; | ||
| 2690 | |||
| 2691 | return 0; | ||
| 2692 | } | ||
| 2693 | |||
| 2694 | static void sched_rt_do_global(void) | ||
| 2695 | { | ||
| 2696 | def_rt_bandwidth.rt_runtime = global_rt_runtime(); | ||
| 2697 | def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); | ||
| 2698 | } | ||
| 2699 | |||
| 2700 | int sched_rt_handler(struct ctl_table *table, int write, | ||
| 2701 | void __user *buffer, size_t *lenp, | ||
| 2702 | loff_t *ppos) | ||
| 2703 | { | ||
| 2704 | int old_period, old_runtime; | ||
| 2705 | static DEFINE_MUTEX(mutex); | ||
| 2706 | int ret; | ||
| 2707 | |||
| 2708 | mutex_lock(&mutex); | ||
| 2709 | old_period = sysctl_sched_rt_period; | ||
| 2710 | old_runtime = sysctl_sched_rt_runtime; | ||
| 2711 | |||
| 2712 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
| 2713 | |||
| 2714 | if (!ret && write) { | ||
| 2715 | ret = sched_rt_global_validate(); | ||
| 2716 | if (ret) | ||
| 2717 | goto undo; | ||
| 2718 | |||
| 2719 | ret = sched_dl_global_validate(); | ||
| 2720 | if (ret) | ||
| 2721 | goto undo; | ||
| 2722 | |||
| 2723 | ret = sched_rt_global_constraints(); | ||
| 2724 | if (ret) | ||
| 2725 | goto undo; | ||
| 2726 | |||
| 2727 | sched_rt_do_global(); | ||
| 2728 | sched_dl_do_global(); | ||
| 2729 | } | ||
| 2730 | if (0) { | ||
| 2731 | undo: | ||
| 2732 | sysctl_sched_rt_period = old_period; | ||
| 2733 | sysctl_sched_rt_runtime = old_runtime; | ||
| 2734 | } | ||
| 2735 | mutex_unlock(&mutex); | ||
| 2736 | |||
| 2737 | return ret; | ||
| 2738 | } | ||
| 2739 | |||
| 2740 | int sched_rr_handler(struct ctl_table *table, int write, | ||
| 2741 | void __user *buffer, size_t *lenp, | ||
| 2742 | loff_t *ppos) | ||
| 2743 | { | ||
| 2744 | int ret; | ||
| 2745 | static DEFINE_MUTEX(mutex); | ||
| 2746 | |||
| 2747 | mutex_lock(&mutex); | ||
| 2748 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
| 2749 | /* | ||
| 2750 | * Make sure that internally we keep jiffies. | ||
| 2751 | * Also, writing zero resets the timeslice to default: | ||
| 2752 | */ | ||
| 2753 | if (!ret && write) { | ||
| 2754 | sched_rr_timeslice = | ||
| 2755 | sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : | ||
| 2756 | msecs_to_jiffies(sysctl_sched_rr_timeslice); | ||
| 2757 | } | ||
| 2758 | mutex_unlock(&mutex); | ||
| 2759 | return ret; | ||
| 2760 | } | ||
| 2761 | |||
| 2441 | #ifdef CONFIG_SCHED_DEBUG | 2762 | #ifdef CONFIG_SCHED_DEBUG |
| 2442 | extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); | 2763 | extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); |
| 2443 | 2764 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6dda2aab731e..eeef1a3086d1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -39,9 +39,9 @@ | |||
| 39 | #include "cpuacct.h" | 39 | #include "cpuacct.h" |
| 40 | 40 | ||
| 41 | #ifdef CONFIG_SCHED_DEBUG | 41 | #ifdef CONFIG_SCHED_DEBUG |
| 42 | #define SCHED_WARN_ON(x) WARN_ONCE(x, #x) | 42 | # define SCHED_WARN_ON(x) WARN_ONCE(x, #x) |
| 43 | #else | 43 | #else |
| 44 | #define SCHED_WARN_ON(x) ((void)(x)) | 44 | # define SCHED_WARN_ON(x) ({ (void)(x), 0; }) |
| 45 | #endif | 45 | #endif |
| 46 | 46 | ||
| 47 | struct rq; | 47 | struct rq; |
| @@ -218,23 +218,25 @@ static inline int dl_bandwidth_enabled(void) | |||
| 218 | return sysctl_sched_rt_runtime >= 0; | 218 | return sysctl_sched_rt_runtime >= 0; |
| 219 | } | 219 | } |
| 220 | 220 | ||
| 221 | extern struct dl_bw *dl_bw_of(int i); | ||
| 222 | |||
| 223 | struct dl_bw { | 221 | struct dl_bw { |
| 224 | raw_spinlock_t lock; | 222 | raw_spinlock_t lock; |
| 225 | u64 bw, total_bw; | 223 | u64 bw, total_bw; |
| 226 | }; | 224 | }; |
| 227 | 225 | ||
| 226 | static inline void __dl_update(struct dl_bw *dl_b, s64 bw); | ||
| 227 | |||
| 228 | static inline | 228 | static inline |
| 229 | void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) | 229 | void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus) |
| 230 | { | 230 | { |
| 231 | dl_b->total_bw -= tsk_bw; | 231 | dl_b->total_bw -= tsk_bw; |
| 232 | __dl_update(dl_b, (s32)tsk_bw / cpus); | ||
| 232 | } | 233 | } |
| 233 | 234 | ||
| 234 | static inline | 235 | static inline |
| 235 | void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) | 236 | void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) |
| 236 | { | 237 | { |
| 237 | dl_b->total_bw += tsk_bw; | 238 | dl_b->total_bw += tsk_bw; |
| 239 | __dl_update(dl_b, -((s32)tsk_bw / cpus)); | ||
| 238 | } | 240 | } |
| 239 | 241 | ||
| 240 | static inline | 242 | static inline |
| @@ -244,7 +246,22 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | |||
| 244 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | 246 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; |
| 245 | } | 247 | } |
| 246 | 248 | ||
| 249 | void dl_change_utilization(struct task_struct *p, u64 new_bw); | ||
| 247 | extern void init_dl_bw(struct dl_bw *dl_b); | 250 | extern void init_dl_bw(struct dl_bw *dl_b); |
| 251 | extern int sched_dl_global_validate(void); | ||
| 252 | extern void sched_dl_do_global(void); | ||
| 253 | extern int sched_dl_overflow(struct task_struct *p, int policy, | ||
| 254 | const struct sched_attr *attr); | ||
| 255 | extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); | ||
| 256 | extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); | ||
| 257 | extern bool __checkparam_dl(const struct sched_attr *attr); | ||
| 258 | extern void __dl_clear_params(struct task_struct *p); | ||
| 259 | extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); | ||
| 260 | extern int dl_task_can_attach(struct task_struct *p, | ||
| 261 | const struct cpumask *cs_cpus_allowed); | ||
| 262 | extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, | ||
| 263 | const struct cpumask *trial); | ||
| 264 | extern bool dl_cpu_busy(unsigned int cpu); | ||
| 248 | 265 | ||
| 249 | #ifdef CONFIG_CGROUP_SCHED | 266 | #ifdef CONFIG_CGROUP_SCHED |
| 250 | 267 | ||
| @@ -366,6 +383,11 @@ extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent | |||
| 366 | extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | 383 | extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, |
| 367 | struct sched_rt_entity *rt_se, int cpu, | 384 | struct sched_rt_entity *rt_se, int cpu, |
| 368 | struct sched_rt_entity *parent); | 385 | struct sched_rt_entity *parent); |
| 386 | extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us); | ||
| 387 | extern int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us); | ||
| 388 | extern long sched_group_rt_runtime(struct task_group *tg); | ||
| 389 | extern long sched_group_rt_period(struct task_group *tg); | ||
| 390 | extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); | ||
| 369 | 391 | ||
| 370 | extern struct task_group *sched_create_group(struct task_group *parent); | 392 | extern struct task_group *sched_create_group(struct task_group *parent); |
| 371 | extern void sched_online_group(struct task_group *tg, | 393 | extern void sched_online_group(struct task_group *tg, |
| @@ -558,6 +580,30 @@ struct dl_rq { | |||
| 558 | #else | 580 | #else |
| 559 | struct dl_bw dl_bw; | 581 | struct dl_bw dl_bw; |
| 560 | #endif | 582 | #endif |
| 583 | /* | ||
| 584 | * "Active utilization" for this runqueue: increased when a | ||
| 585 | * task wakes up (becomes TASK_RUNNING) and decreased when a | ||
| 586 | * task blocks | ||
| 587 | */ | ||
| 588 | u64 running_bw; | ||
| 589 | |||
| 590 | /* | ||
| 591 | * Utilization of the tasks "assigned" to this runqueue (including | ||
| 592 | * the tasks that are in runqueue and the tasks that executed on this | ||
| 593 | * CPU and blocked). Increased when a task moves to this runqueue, and | ||
| 594 | * decreased when the task moves away (migrates, changes scheduling | ||
| 595 | * policy, or terminates). | ||
| 596 | * This is needed to compute the "inactive utilization" for the | ||
| 597 | * runqueue (inactive utilization = this_bw - running_bw). | ||
| 598 | */ | ||
| 599 | u64 this_bw; | ||
| 600 | u64 extra_bw; | ||
| 601 | |||
| 602 | /* | ||
| 603 | * Inverse of the fraction of CPU utilization that can be reclaimed | ||
| 604 | * by the GRUB algorithm. | ||
| 605 | */ | ||
| 606 | u64 bw_ratio; | ||
| 561 | }; | 607 | }; |
| 562 | 608 | ||
| 563 | #ifdef CONFIG_SMP | 609 | #ifdef CONFIG_SMP |
| @@ -606,11 +652,9 @@ struct root_domain { | |||
| 606 | 652 | ||
| 607 | extern struct root_domain def_root_domain; | 653 | extern struct root_domain def_root_domain; |
| 608 | extern struct mutex sched_domains_mutex; | 654 | extern struct mutex sched_domains_mutex; |
| 609 | extern cpumask_var_t fallback_doms; | ||
| 610 | extern cpumask_var_t sched_domains_tmpmask; | ||
| 611 | 655 | ||
| 612 | extern void init_defrootdomain(void); | 656 | extern void init_defrootdomain(void); |
| 613 | extern int init_sched_domains(const struct cpumask *cpu_map); | 657 | extern int sched_init_domains(const struct cpumask *cpu_map); |
| 614 | extern void rq_attach_root(struct rq *rq, struct root_domain *rd); | 658 | extern void rq_attach_root(struct rq *rq, struct root_domain *rd); |
| 615 | 659 | ||
| 616 | #endif /* CONFIG_SMP */ | 660 | #endif /* CONFIG_SMP */ |
| @@ -1025,7 +1069,11 @@ struct sched_group_capacity { | |||
| 1025 | unsigned long next_update; | 1069 | unsigned long next_update; |
| 1026 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 1070 | int imbalance; /* XXX unrelated to capacity but shared group state */ |
| 1027 | 1071 | ||
| 1028 | unsigned long cpumask[0]; /* iteration mask */ | 1072 | #ifdef CONFIG_SCHED_DEBUG |
| 1073 | int id; | ||
| 1074 | #endif | ||
| 1075 | |||
| 1076 | unsigned long cpumask[0]; /* balance mask */ | ||
| 1029 | }; | 1077 | }; |
| 1030 | 1078 | ||
| 1031 | struct sched_group { | 1079 | struct sched_group { |
| @@ -1046,16 +1094,15 @@ struct sched_group { | |||
| 1046 | unsigned long cpumask[0]; | 1094 | unsigned long cpumask[0]; |
| 1047 | }; | 1095 | }; |
| 1048 | 1096 | ||
| 1049 | static inline struct cpumask *sched_group_cpus(struct sched_group *sg) | 1097 | static inline struct cpumask *sched_group_span(struct sched_group *sg) |
| 1050 | { | 1098 | { |
| 1051 | return to_cpumask(sg->cpumask); | 1099 | return to_cpumask(sg->cpumask); |
| 1052 | } | 1100 | } |
| 1053 | 1101 | ||
| 1054 | /* | 1102 | /* |
| 1055 | * cpumask masking which cpus in the group are allowed to iterate up the domain | 1103 | * See build_balance_mask(). |
| 1056 | * tree. | ||
| 1057 | */ | 1104 | */ |
| 1058 | static inline struct cpumask *sched_group_mask(struct sched_group *sg) | 1105 | static inline struct cpumask *group_balance_mask(struct sched_group *sg) |
| 1059 | { | 1106 | { |
| 1060 | return to_cpumask(sg->sgc->cpumask); | 1107 | return to_cpumask(sg->sgc->cpumask); |
| 1061 | } | 1108 | } |
| @@ -1066,7 +1113,7 @@ static inline struct cpumask *sched_group_mask(struct sched_group *sg) | |||
| 1066 | */ | 1113 | */ |
| 1067 | static inline unsigned int group_first_cpu(struct sched_group *group) | 1114 | static inline unsigned int group_first_cpu(struct sched_group *group) |
| 1068 | { | 1115 | { |
| 1069 | return cpumask_first(sched_group_cpus(group)); | 1116 | return cpumask_first(sched_group_span(group)); |
| 1070 | } | 1117 | } |
| 1071 | 1118 | ||
| 1072 | extern int group_balance_cpu(struct sched_group *sg); | 1119 | extern int group_balance_cpu(struct sched_group *sg); |
| @@ -1422,7 +1469,11 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr) | |||
| 1422 | curr->sched_class->set_curr_task(rq); | 1469 | curr->sched_class->set_curr_task(rq); |
| 1423 | } | 1470 | } |
| 1424 | 1471 | ||
| 1472 | #ifdef CONFIG_SMP | ||
| 1425 | #define sched_class_highest (&stop_sched_class) | 1473 | #define sched_class_highest (&stop_sched_class) |
| 1474 | #else | ||
| 1475 | #define sched_class_highest (&dl_sched_class) | ||
| 1476 | #endif | ||
| 1426 | #define for_each_class(class) \ | 1477 | #define for_each_class(class) \ |
| 1427 | for (class = sched_class_highest; class; class = class->next) | 1478 | for (class = sched_class_highest; class; class = class->next) |
| 1428 | 1479 | ||
| @@ -1486,7 +1537,12 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime | |||
| 1486 | extern struct dl_bandwidth def_dl_bandwidth; | 1537 | extern struct dl_bandwidth def_dl_bandwidth; |
| 1487 | extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); | 1538 | extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); |
| 1488 | extern void init_dl_task_timer(struct sched_dl_entity *dl_se); | 1539 | extern void init_dl_task_timer(struct sched_dl_entity *dl_se); |
| 1540 | extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); | ||
| 1541 | extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); | ||
| 1489 | 1542 | ||
| 1543 | #define BW_SHIFT 20 | ||
| 1544 | #define BW_UNIT (1 << BW_SHIFT) | ||
| 1545 | #define RATIO_SHIFT 8 | ||
| 1490 | unsigned long to_ratio(u64 period, u64 runtime); | 1546 | unsigned long to_ratio(u64 period, u64 runtime); |
| 1491 | 1547 | ||
| 1492 | extern void init_entity_runnable_average(struct sched_entity *se); | 1548 | extern void init_entity_runnable_average(struct sched_entity *se); |
| @@ -1928,6 +1984,33 @@ extern void nohz_balance_exit_idle(unsigned int cpu); | |||
| 1928 | static inline void nohz_balance_exit_idle(unsigned int cpu) { } | 1984 | static inline void nohz_balance_exit_idle(unsigned int cpu) { } |
| 1929 | #endif | 1985 | #endif |
| 1930 | 1986 | ||
| 1987 | |||
| 1988 | #ifdef CONFIG_SMP | ||
| 1989 | static inline | ||
| 1990 | void __dl_update(struct dl_bw *dl_b, s64 bw) | ||
| 1991 | { | ||
| 1992 | struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw); | ||
| 1993 | int i; | ||
| 1994 | |||
| 1995 | RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), | ||
| 1996 | "sched RCU must be held"); | ||
| 1997 | for_each_cpu_and(i, rd->span, cpu_active_mask) { | ||
| 1998 | struct rq *rq = cpu_rq(i); | ||
| 1999 | |||
| 2000 | rq->dl.extra_bw += bw; | ||
| 2001 | } | ||
| 2002 | } | ||
| 2003 | #else | ||
| 2004 | static inline | ||
| 2005 | void __dl_update(struct dl_bw *dl_b, s64 bw) | ||
| 2006 | { | ||
| 2007 | struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); | ||
| 2008 | |||
| 2009 | dl->extra_bw += bw; | ||
| 2010 | } | ||
| 2011 | #endif | ||
| 2012 | |||
| 2013 | |||
| 1931 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 2014 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
| 1932 | struct irqtime { | 2015 | struct irqtime { |
| 1933 | u64 total; | 2016 | u64 total; |
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 1b0b4fb12837..79895aec281e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c | |||
| @@ -10,6 +10,7 @@ DEFINE_MUTEX(sched_domains_mutex); | |||
| 10 | 10 | ||
| 11 | /* Protected by sched_domains_mutex: */ | 11 | /* Protected by sched_domains_mutex: */ |
| 12 | cpumask_var_t sched_domains_tmpmask; | 12 | cpumask_var_t sched_domains_tmpmask; |
| 13 | cpumask_var_t sched_domains_tmpmask2; | ||
| 13 | 14 | ||
| 14 | #ifdef CONFIG_SCHED_DEBUG | 15 | #ifdef CONFIG_SCHED_DEBUG |
| 15 | 16 | ||
| @@ -35,7 +36,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 35 | 36 | ||
| 36 | cpumask_clear(groupmask); | 37 | cpumask_clear(groupmask); |
| 37 | 38 | ||
| 38 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); | 39 | printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); |
| 39 | 40 | ||
| 40 | if (!(sd->flags & SD_LOAD_BALANCE)) { | 41 | if (!(sd->flags & SD_LOAD_BALANCE)) { |
| 41 | printk("does not load-balance\n"); | 42 | printk("does not load-balance\n"); |
| @@ -45,14 +46,14 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 45 | return -1; | 46 | return -1; |
| 46 | } | 47 | } |
| 47 | 48 | ||
| 48 | printk(KERN_CONT "span %*pbl level %s\n", | 49 | printk(KERN_CONT "span=%*pbl level=%s\n", |
| 49 | cpumask_pr_args(sched_domain_span(sd)), sd->name); | 50 | cpumask_pr_args(sched_domain_span(sd)), sd->name); |
| 50 | 51 | ||
| 51 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { | 52 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { |
| 52 | printk(KERN_ERR "ERROR: domain->span does not contain " | 53 | printk(KERN_ERR "ERROR: domain->span does not contain " |
| 53 | "CPU%d\n", cpu); | 54 | "CPU%d\n", cpu); |
| 54 | } | 55 | } |
| 55 | if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { | 56 | if (!cpumask_test_cpu(cpu, sched_group_span(group))) { |
| 56 | printk(KERN_ERR "ERROR: domain->groups does not contain" | 57 | printk(KERN_ERR "ERROR: domain->groups does not contain" |
| 57 | " CPU%d\n", cpu); | 58 | " CPU%d\n", cpu); |
| 58 | } | 59 | } |
| @@ -65,29 +66,47 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 65 | break; | 66 | break; |
| 66 | } | 67 | } |
| 67 | 68 | ||
| 68 | if (!cpumask_weight(sched_group_cpus(group))) { | 69 | if (!cpumask_weight(sched_group_span(group))) { |
| 69 | printk(KERN_CONT "\n"); | 70 | printk(KERN_CONT "\n"); |
| 70 | printk(KERN_ERR "ERROR: empty group\n"); | 71 | printk(KERN_ERR "ERROR: empty group\n"); |
| 71 | break; | 72 | break; |
| 72 | } | 73 | } |
| 73 | 74 | ||
| 74 | if (!(sd->flags & SD_OVERLAP) && | 75 | if (!(sd->flags & SD_OVERLAP) && |
| 75 | cpumask_intersects(groupmask, sched_group_cpus(group))) { | 76 | cpumask_intersects(groupmask, sched_group_span(group))) { |
| 76 | printk(KERN_CONT "\n"); | 77 | printk(KERN_CONT "\n"); |
| 77 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 78 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
| 78 | break; | 79 | break; |
| 79 | } | 80 | } |
| 80 | 81 | ||
| 81 | cpumask_or(groupmask, groupmask, sched_group_cpus(group)); | 82 | cpumask_or(groupmask, groupmask, sched_group_span(group)); |
| 82 | 83 | ||
| 83 | printk(KERN_CONT " %*pbl", | 84 | printk(KERN_CONT " %d:{ span=%*pbl", |
| 84 | cpumask_pr_args(sched_group_cpus(group))); | 85 | group->sgc->id, |
| 85 | if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { | 86 | cpumask_pr_args(sched_group_span(group))); |
| 86 | printk(KERN_CONT " (cpu_capacity = %lu)", | 87 | |
| 87 | group->sgc->capacity); | 88 | if ((sd->flags & SD_OVERLAP) && |
| 89 | !cpumask_equal(group_balance_mask(group), sched_group_span(group))) { | ||
| 90 | printk(KERN_CONT " mask=%*pbl", | ||
| 91 | cpumask_pr_args(group_balance_mask(group))); | ||
| 92 | } | ||
| 93 | |||
| 94 | if (group->sgc->capacity != SCHED_CAPACITY_SCALE) | ||
| 95 | printk(KERN_CONT " cap=%lu", group->sgc->capacity); | ||
| 96 | |||
| 97 | if (group == sd->groups && sd->child && | ||
| 98 | !cpumask_equal(sched_domain_span(sd->child), | ||
| 99 | sched_group_span(group))) { | ||
| 100 | printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n"); | ||
| 88 | } | 101 | } |
| 89 | 102 | ||
| 103 | printk(KERN_CONT " }"); | ||
| 104 | |||
| 90 | group = group->next; | 105 | group = group->next; |
| 106 | |||
| 107 | if (group != sd->groups) | ||
| 108 | printk(KERN_CONT ","); | ||
| 109 | |||
| 91 | } while (group != sd->groups); | 110 | } while (group != sd->groups); |
| 92 | printk(KERN_CONT "\n"); | 111 | printk(KERN_CONT "\n"); |
| 93 | 112 | ||
| @@ -113,7 +132,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 113 | return; | 132 | return; |
| 114 | } | 133 | } |
| 115 | 134 | ||
| 116 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 135 | printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu); |
| 117 | 136 | ||
| 118 | for (;;) { | 137 | for (;;) { |
| 119 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) | 138 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) |
| @@ -477,46 +496,214 @@ enum s_alloc { | |||
| 477 | }; | 496 | }; |
| 478 | 497 | ||
| 479 | /* | 498 | /* |
| 480 | * Build an iteration mask that can exclude certain CPUs from the upwards | 499 | * Return the canonical balance CPU for this group, this is the first CPU |
| 481 | * domain traversal. | 500 | * of this group that's also in the balance mask. |
| 482 | * | 501 | * |
| 483 | * Asymmetric node setups can result in situations where the domain tree is of | 502 | * The balance mask are all those CPUs that could actually end up at this |
| 484 | * unequal depth, make sure to skip domains that already cover the entire | 503 | * group. See build_balance_mask(). |
| 485 | * range. | ||
| 486 | * | 504 | * |
| 487 | * In that case build_sched_domains() will have terminated the iteration early | 505 | * Also see should_we_balance(). |
| 488 | * and our sibling sd spans will be empty. Domains should always include the | ||
| 489 | * CPU they're built on, so check that. | ||
| 490 | */ | 506 | */ |
| 491 | static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) | 507 | int group_balance_cpu(struct sched_group *sg) |
| 492 | { | 508 | { |
| 493 | const struct cpumask *span = sched_domain_span(sd); | 509 | return cpumask_first(group_balance_mask(sg)); |
| 510 | } | ||
| 511 | |||
| 512 | |||
| 513 | /* | ||
| 514 | * NUMA topology (first read the regular topology blurb below) | ||
| 515 | * | ||
| 516 | * Given a node-distance table, for example: | ||
| 517 | * | ||
| 518 | * node 0 1 2 3 | ||
| 519 | * 0: 10 20 30 20 | ||
| 520 | * 1: 20 10 20 30 | ||
| 521 | * 2: 30 20 10 20 | ||
| 522 | * 3: 20 30 20 10 | ||
| 523 | * | ||
| 524 | * which represents a 4 node ring topology like: | ||
| 525 | * | ||
| 526 | * 0 ----- 1 | ||
| 527 | * | | | ||
| 528 | * | | | ||
| 529 | * | | | ||
| 530 | * 3 ----- 2 | ||
| 531 | * | ||
| 532 | * We want to construct domains and groups to represent this. The way we go | ||
| 533 | * about doing this is to build the domains on 'hops'. For each NUMA level we | ||
| 534 | * construct the mask of all nodes reachable in @level hops. | ||
| 535 | * | ||
| 536 | * For the above NUMA topology that gives 3 levels: | ||
| 537 | * | ||
| 538 | * NUMA-2 0-3 0-3 0-3 0-3 | ||
| 539 | * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2} | ||
| 540 | * | ||
| 541 | * NUMA-1 0-1,3 0-2 1-3 0,2-3 | ||
| 542 | * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3} | ||
| 543 | * | ||
| 544 | * NUMA-0 0 1 2 3 | ||
| 545 | * | ||
| 546 | * | ||
| 547 | * As can be seen; things don't nicely line up as with the regular topology. | ||
| 548 | * When we iterate a domain in child domain chunks some nodes can be | ||
| 549 | * represented multiple times -- hence the "overlap" naming for this part of | ||
| 550 | * the topology. | ||
| 551 | * | ||
| 552 | * In order to minimize this overlap, we only build enough groups to cover the | ||
| 553 | * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3. | ||
| 554 | * | ||
| 555 | * Because: | ||
| 556 | * | ||
| 557 | * - the first group of each domain is its child domain; this | ||
| 558 | * gets us the first 0-1,3 | ||
| 559 | * - the only uncovered node is 2, who's child domain is 1-3. | ||
| 560 | * | ||
| 561 | * However, because of the overlap, computing a unique CPU for each group is | ||
| 562 | * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both | ||
| 563 | * groups include the CPUs of Node-0, while those CPUs would not in fact ever | ||
| 564 | * end up at those groups (they would end up in group: 0-1,3). | ||
| 565 | * | ||
| 566 | * To correct this we have to introduce the group balance mask. This mask | ||
| 567 | * will contain those CPUs in the group that can reach this group given the | ||
| 568 | * (child) domain tree. | ||
| 569 | * | ||
| 570 | * With this we can once again compute balance_cpu and sched_group_capacity | ||
| 571 | * relations. | ||
| 572 | * | ||
| 573 | * XXX include words on how balance_cpu is unique and therefore can be | ||
| 574 | * used for sched_group_capacity links. | ||
| 575 | * | ||
| 576 | * | ||
| 577 | * Another 'interesting' topology is: | ||
| 578 | * | ||
| 579 | * node 0 1 2 3 | ||
| 580 | * 0: 10 20 20 30 | ||
| 581 | * 1: 20 10 20 20 | ||
| 582 | * 2: 20 20 10 20 | ||
| 583 | * 3: 30 20 20 10 | ||
| 584 | * | ||
| 585 | * Which looks a little like: | ||
| 586 | * | ||
| 587 | * 0 ----- 1 | ||
| 588 | * | / | | ||
| 589 | * | / | | ||
| 590 | * | / | | ||
| 591 | * 2 ----- 3 | ||
| 592 | * | ||
| 593 | * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3 | ||
| 594 | * are not. | ||
| 595 | * | ||
| 596 | * This leads to a few particularly weird cases where the sched_domain's are | ||
| 597 | * not of the same number for each cpu. Consider: | ||
| 598 | * | ||
| 599 | * NUMA-2 0-3 0-3 | ||
| 600 | * groups: {0-2},{1-3} {1-3},{0-2} | ||
| 601 | * | ||
| 602 | * NUMA-1 0-2 0-3 0-3 1-3 | ||
| 603 | * | ||
| 604 | * NUMA-0 0 1 2 3 | ||
| 605 | * | ||
| 606 | */ | ||
| 607 | |||
| 608 | |||
| 609 | /* | ||
| 610 | * Build the balance mask; it contains only those CPUs that can arrive at this | ||
| 611 | * group and should be considered to continue balancing. | ||
| 612 | * | ||
| 613 | * We do this during the group creation pass, therefore the group information | ||
| 614 | * isn't complete yet, however since each group represents a (child) domain we | ||
| 615 | * can fully construct this using the sched_domain bits (which are already | ||
| 616 | * complete). | ||
| 617 | */ | ||
| 618 | static void | ||
| 619 | build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask) | ||
| 620 | { | ||
| 621 | const struct cpumask *sg_span = sched_group_span(sg); | ||
| 494 | struct sd_data *sdd = sd->private; | 622 | struct sd_data *sdd = sd->private; |
| 495 | struct sched_domain *sibling; | 623 | struct sched_domain *sibling; |
| 496 | int i; | 624 | int i; |
| 497 | 625 | ||
| 498 | for_each_cpu(i, span) { | 626 | cpumask_clear(mask); |
| 627 | |||
| 628 | for_each_cpu(i, sg_span) { | ||
| 499 | sibling = *per_cpu_ptr(sdd->sd, i); | 629 | sibling = *per_cpu_ptr(sdd->sd, i); |
| 500 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | 630 | |
| 631 | /* | ||
| 632 | * Can happen in the asymmetric case, where these siblings are | ||
| 633 | * unused. The mask will not be empty because those CPUs that | ||
| 634 | * do have the top domain _should_ span the domain. | ||
| 635 | */ | ||
| 636 | if (!sibling->child) | ||
| 501 | continue; | 637 | continue; |
| 502 | 638 | ||
| 503 | cpumask_set_cpu(i, sched_group_mask(sg)); | 639 | /* If we would not end up here, we can't continue from here */ |
| 640 | if (!cpumask_equal(sg_span, sched_domain_span(sibling->child))) | ||
| 641 | continue; | ||
| 642 | |||
| 643 | cpumask_set_cpu(i, mask); | ||
| 504 | } | 644 | } |
| 645 | |||
| 646 | /* We must not have empty masks here */ | ||
| 647 | WARN_ON_ONCE(cpumask_empty(mask)); | ||
| 505 | } | 648 | } |
| 506 | 649 | ||
| 507 | /* | 650 | /* |
| 508 | * Return the canonical balance CPU for this group, this is the first CPU | 651 | * XXX: This creates per-node group entries; since the load-balancer will |
| 509 | * of this group that's also in the iteration mask. | 652 | * immediately access remote memory to construct this group's load-balance |
| 653 | * statistics having the groups node local is of dubious benefit. | ||
| 510 | */ | 654 | */ |
| 511 | int group_balance_cpu(struct sched_group *sg) | 655 | static struct sched_group * |
| 656 | build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) | ||
| 512 | { | 657 | { |
| 513 | return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); | 658 | struct sched_group *sg; |
| 659 | struct cpumask *sg_span; | ||
| 660 | |||
| 661 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
| 662 | GFP_KERNEL, cpu_to_node(cpu)); | ||
| 663 | |||
| 664 | if (!sg) | ||
| 665 | return NULL; | ||
| 666 | |||
| 667 | sg_span = sched_group_span(sg); | ||
| 668 | if (sd->child) | ||
| 669 | cpumask_copy(sg_span, sched_domain_span(sd->child)); | ||
| 670 | else | ||
| 671 | cpumask_copy(sg_span, sched_domain_span(sd)); | ||
| 672 | |||
| 673 | return sg; | ||
| 674 | } | ||
| 675 | |||
| 676 | static void init_overlap_sched_group(struct sched_domain *sd, | ||
| 677 | struct sched_group *sg) | ||
| 678 | { | ||
| 679 | struct cpumask *mask = sched_domains_tmpmask2; | ||
| 680 | struct sd_data *sdd = sd->private; | ||
| 681 | struct cpumask *sg_span; | ||
| 682 | int cpu; | ||
| 683 | |||
| 684 | build_balance_mask(sd, sg, mask); | ||
| 685 | cpu = cpumask_first_and(sched_group_span(sg), mask); | ||
| 686 | |||
| 687 | sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); | ||
| 688 | if (atomic_inc_return(&sg->sgc->ref) == 1) | ||
| 689 | cpumask_copy(group_balance_mask(sg), mask); | ||
| 690 | else | ||
| 691 | WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask)); | ||
| 692 | |||
| 693 | /* | ||
| 694 | * Initialize sgc->capacity such that even if we mess up the | ||
| 695 | * domains and no possible iteration will get us here, we won't | ||
| 696 | * die on a /0 trap. | ||
| 697 | */ | ||
| 698 | sg_span = sched_group_span(sg); | ||
| 699 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | ||
| 700 | sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; | ||
| 514 | } | 701 | } |
| 515 | 702 | ||
| 516 | static int | 703 | static int |
| 517 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) | 704 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) |
| 518 | { | 705 | { |
| 519 | struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; | 706 | struct sched_group *first = NULL, *last = NULL, *sg; |
| 520 | const struct cpumask *span = sched_domain_span(sd); | 707 | const struct cpumask *span = sched_domain_span(sd); |
| 521 | struct cpumask *covered = sched_domains_tmpmask; | 708 | struct cpumask *covered = sched_domains_tmpmask; |
| 522 | struct sd_data *sdd = sd->private; | 709 | struct sd_data *sdd = sd->private; |
| @@ -525,7 +712,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 525 | 712 | ||
| 526 | cpumask_clear(covered); | 713 | cpumask_clear(covered); |
| 527 | 714 | ||
| 528 | for_each_cpu(i, span) { | 715 | for_each_cpu_wrap(i, span, cpu) { |
| 529 | struct cpumask *sg_span; | 716 | struct cpumask *sg_span; |
| 530 | 717 | ||
| 531 | if (cpumask_test_cpu(i, covered)) | 718 | if (cpumask_test_cpu(i, covered)) |
| @@ -533,44 +720,27 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 533 | 720 | ||
| 534 | sibling = *per_cpu_ptr(sdd->sd, i); | 721 | sibling = *per_cpu_ptr(sdd->sd, i); |
| 535 | 722 | ||
| 536 | /* See the comment near build_group_mask(). */ | 723 | /* |
| 724 | * Asymmetric node setups can result in situations where the | ||
| 725 | * domain tree is of unequal depth, make sure to skip domains | ||
| 726 | * that already cover the entire range. | ||
| 727 | * | ||
| 728 | * In that case build_sched_domains() will have terminated the | ||
| 729 | * iteration early and our sibling sd spans will be empty. | ||
| 730 | * Domains should always include the CPU they're built on, so | ||
| 731 | * check that. | ||
| 732 | */ | ||
| 537 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | 733 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) |
| 538 | continue; | 734 | continue; |
| 539 | 735 | ||
| 540 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 736 | sg = build_group_from_child_sched_domain(sibling, cpu); |
| 541 | GFP_KERNEL, cpu_to_node(cpu)); | ||
| 542 | |||
| 543 | if (!sg) | 737 | if (!sg) |
| 544 | goto fail; | 738 | goto fail; |
| 545 | 739 | ||
| 546 | sg_span = sched_group_cpus(sg); | 740 | sg_span = sched_group_span(sg); |
| 547 | if (sibling->child) | ||
| 548 | cpumask_copy(sg_span, sched_domain_span(sibling->child)); | ||
| 549 | else | ||
| 550 | cpumask_set_cpu(i, sg_span); | ||
| 551 | |||
| 552 | cpumask_or(covered, covered, sg_span); | 741 | cpumask_or(covered, covered, sg_span); |
| 553 | 742 | ||
| 554 | sg->sgc = *per_cpu_ptr(sdd->sgc, i); | 743 | init_overlap_sched_group(sd, sg); |
| 555 | if (atomic_inc_return(&sg->sgc->ref) == 1) | ||
| 556 | build_group_mask(sd, sg); | ||
| 557 | |||
| 558 | /* | ||
| 559 | * Initialize sgc->capacity such that even if we mess up the | ||
| 560 | * domains and no possible iteration will get us here, we won't | ||
| 561 | * die on a /0 trap. | ||
| 562 | */ | ||
| 563 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | ||
| 564 | sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; | ||
| 565 | |||
| 566 | /* | ||
| 567 | * Make sure the first group of this domain contains the | ||
| 568 | * canonical balance CPU. Otherwise the sched_domain iteration | ||
| 569 | * breaks. See update_sg_lb_stats(). | ||
| 570 | */ | ||
| 571 | if ((!groups && cpumask_test_cpu(cpu, sg_span)) || | ||
| 572 | group_balance_cpu(sg) == cpu) | ||
| 573 | groups = sg; | ||
| 574 | 744 | ||
| 575 | if (!first) | 745 | if (!first) |
| 576 | first = sg; | 746 | first = sg; |
| @@ -579,7 +749,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 579 | last = sg; | 749 | last = sg; |
| 580 | last->next = first; | 750 | last->next = first; |
| 581 | } | 751 | } |
| 582 | sd->groups = groups; | 752 | sd->groups = first; |
| 583 | 753 | ||
| 584 | return 0; | 754 | return 0; |
| 585 | 755 | ||
| @@ -589,23 +759,106 @@ fail: | |||
| 589 | return -ENOMEM; | 759 | return -ENOMEM; |
| 590 | } | 760 | } |
| 591 | 761 | ||
| 592 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) | 762 | |
| 763 | /* | ||
| 764 | * Package topology (also see the load-balance blurb in fair.c) | ||
| 765 | * | ||
| 766 | * The scheduler builds a tree structure to represent a number of important | ||
| 767 | * topology features. By default (default_topology[]) these include: | ||
| 768 | * | ||
| 769 | * - Simultaneous multithreading (SMT) | ||
| 770 | * - Multi-Core Cache (MC) | ||
| 771 | * - Package (DIE) | ||
| 772 | * | ||
| 773 | * Where the last one more or less denotes everything up to a NUMA node. | ||
| 774 | * | ||
| 775 | * The tree consists of 3 primary data structures: | ||
| 776 | * | ||
| 777 | * sched_domain -> sched_group -> sched_group_capacity | ||
| 778 | * ^ ^ ^ ^ | ||
| 779 | * `-' `-' | ||
| 780 | * | ||
| 781 | * The sched_domains are per-cpu and have a two way link (parent & child) and | ||
| 782 | * denote the ever growing mask of CPUs belonging to that level of topology. | ||
| 783 | * | ||
| 784 | * Each sched_domain has a circular (double) linked list of sched_group's, each | ||
| 785 | * denoting the domains of the level below (or individual CPUs in case of the | ||
| 786 | * first domain level). The sched_group linked by a sched_domain includes the | ||
| 787 | * CPU of that sched_domain [*]. | ||
| 788 | * | ||
| 789 | * Take for instance a 2 threaded, 2 core, 2 cache cluster part: | ||
| 790 | * | ||
| 791 | * CPU 0 1 2 3 4 5 6 7 | ||
| 792 | * | ||
| 793 | * DIE [ ] | ||
| 794 | * MC [ ] [ ] | ||
| 795 | * SMT [ ] [ ] [ ] [ ] | ||
| 796 | * | ||
| 797 | * - or - | ||
| 798 | * | ||
| 799 | * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 | ||
| 800 | * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7 | ||
| 801 | * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7 | ||
| 802 | * | ||
| 803 | * CPU 0 1 2 3 4 5 6 7 | ||
| 804 | * | ||
| 805 | * One way to think about it is: sched_domain moves you up and down among these | ||
| 806 | * topology levels, while sched_group moves you sideways through it, at child | ||
| 807 | * domain granularity. | ||
| 808 | * | ||
| 809 | * sched_group_capacity ensures each unique sched_group has shared storage. | ||
| 810 | * | ||
| 811 | * There are two related construction problems, both require a CPU that | ||
| 812 | * uniquely identify each group (for a given domain): | ||
| 813 | * | ||
| 814 | * - The first is the balance_cpu (see should_we_balance() and the | ||
| 815 | * load-balance blub in fair.c); for each group we only want 1 CPU to | ||
| 816 | * continue balancing at a higher domain. | ||
| 817 | * | ||
| 818 | * - The second is the sched_group_capacity; we want all identical groups | ||
| 819 | * to share a single sched_group_capacity. | ||
| 820 | * | ||
| 821 | * Since these topologies are exclusive by construction. That is, its | ||
| 822 | * impossible for an SMT thread to belong to multiple cores, and cores to | ||
| 823 | * be part of multiple caches. There is a very clear and unique location | ||
| 824 | * for each CPU in the hierarchy. | ||
| 825 | * | ||
| 826 | * Therefore computing a unique CPU for each group is trivial (the iteration | ||
| 827 | * mask is redundant and set all 1s; all CPUs in a group will end up at _that_ | ||
| 828 | * group), we can simply pick the first CPU in each group. | ||
| 829 | * | ||
| 830 | * | ||
| 831 | * [*] in other words, the first group of each domain is its child domain. | ||
| 832 | */ | ||
| 833 | |||
| 834 | static struct sched_group *get_group(int cpu, struct sd_data *sdd) | ||
| 593 | { | 835 | { |
| 594 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | 836 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
| 595 | struct sched_domain *child = sd->child; | 837 | struct sched_domain *child = sd->child; |
| 838 | struct sched_group *sg; | ||
| 596 | 839 | ||
| 597 | if (child) | 840 | if (child) |
| 598 | cpu = cpumask_first(sched_domain_span(child)); | 841 | cpu = cpumask_first(sched_domain_span(child)); |
| 599 | 842 | ||
| 600 | if (sg) { | 843 | sg = *per_cpu_ptr(sdd->sg, cpu); |
| 601 | *sg = *per_cpu_ptr(sdd->sg, cpu); | 844 | sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); |
| 602 | (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); | 845 | |
| 846 | /* For claim_allocations: */ | ||
| 847 | atomic_inc(&sg->ref); | ||
| 848 | atomic_inc(&sg->sgc->ref); | ||
| 603 | 849 | ||
| 604 | /* For claim_allocations: */ | 850 | if (child) { |
| 605 | atomic_set(&(*sg)->sgc->ref, 1); | 851 | cpumask_copy(sched_group_span(sg), sched_domain_span(child)); |
| 852 | cpumask_copy(group_balance_mask(sg), sched_group_span(sg)); | ||
| 853 | } else { | ||
| 854 | cpumask_set_cpu(cpu, sched_group_span(sg)); | ||
| 855 | cpumask_set_cpu(cpu, group_balance_mask(sg)); | ||
| 606 | } | 856 | } |
| 607 | 857 | ||
| 608 | return cpu; | 858 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg)); |
| 859 | sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; | ||
| 860 | |||
| 861 | return sg; | ||
| 609 | } | 862 | } |
| 610 | 863 | ||
| 611 | /* | 864 | /* |
| @@ -624,34 +877,20 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
| 624 | struct cpumask *covered; | 877 | struct cpumask *covered; |
| 625 | int i; | 878 | int i; |
| 626 | 879 | ||
| 627 | get_group(cpu, sdd, &sd->groups); | ||
| 628 | atomic_inc(&sd->groups->ref); | ||
| 629 | |||
| 630 | if (cpu != cpumask_first(span)) | ||
| 631 | return 0; | ||
| 632 | |||
| 633 | lockdep_assert_held(&sched_domains_mutex); | 880 | lockdep_assert_held(&sched_domains_mutex); |
| 634 | covered = sched_domains_tmpmask; | 881 | covered = sched_domains_tmpmask; |
| 635 | 882 | ||
| 636 | cpumask_clear(covered); | 883 | cpumask_clear(covered); |
| 637 | 884 | ||
| 638 | for_each_cpu(i, span) { | 885 | for_each_cpu_wrap(i, span, cpu) { |
| 639 | struct sched_group *sg; | 886 | struct sched_group *sg; |
| 640 | int group, j; | ||
| 641 | 887 | ||
| 642 | if (cpumask_test_cpu(i, covered)) | 888 | if (cpumask_test_cpu(i, covered)) |
| 643 | continue; | 889 | continue; |
| 644 | 890 | ||
| 645 | group = get_group(i, sdd, &sg); | 891 | sg = get_group(i, sdd); |
| 646 | cpumask_setall(sched_group_mask(sg)); | ||
| 647 | 892 | ||
| 648 | for_each_cpu(j, span) { | 893 | cpumask_or(covered, covered, sched_group_span(sg)); |
| 649 | if (get_group(j, sdd, NULL) != group) | ||
| 650 | continue; | ||
| 651 | |||
| 652 | cpumask_set_cpu(j, covered); | ||
| 653 | cpumask_set_cpu(j, sched_group_cpus(sg)); | ||
| 654 | } | ||
| 655 | 894 | ||
| 656 | if (!first) | 895 | if (!first) |
| 657 | first = sg; | 896 | first = sg; |
| @@ -660,6 +899,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
| 660 | last = sg; | 899 | last = sg; |
| 661 | } | 900 | } |
| 662 | last->next = first; | 901 | last->next = first; |
| 902 | sd->groups = first; | ||
| 663 | 903 | ||
| 664 | return 0; | 904 | return 0; |
| 665 | } | 905 | } |
| @@ -683,12 +923,12 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) | |||
| 683 | do { | 923 | do { |
| 684 | int cpu, max_cpu = -1; | 924 | int cpu, max_cpu = -1; |
| 685 | 925 | ||
| 686 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); | 926 | sg->group_weight = cpumask_weight(sched_group_span(sg)); |
| 687 | 927 | ||
| 688 | if (!(sd->flags & SD_ASYM_PACKING)) | 928 | if (!(sd->flags & SD_ASYM_PACKING)) |
| 689 | goto next; | 929 | goto next; |
| 690 | 930 | ||
| 691 | for_each_cpu(cpu, sched_group_cpus(sg)) { | 931 | for_each_cpu(cpu, sched_group_span(sg)) { |
| 692 | if (max_cpu < 0) | 932 | if (max_cpu < 0) |
| 693 | max_cpu = cpu; | 933 | max_cpu = cpu; |
| 694 | else if (sched_asym_prefer(cpu, max_cpu)) | 934 | else if (sched_asym_prefer(cpu, max_cpu)) |
| @@ -1308,6 +1548,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
| 1308 | if (!sgc) | 1548 | if (!sgc) |
| 1309 | return -ENOMEM; | 1549 | return -ENOMEM; |
| 1310 | 1550 | ||
| 1551 | #ifdef CONFIG_SCHED_DEBUG | ||
| 1552 | sgc->id = j; | ||
| 1553 | #endif | ||
| 1554 | |||
| 1311 | *per_cpu_ptr(sdd->sgc, j) = sgc; | 1555 | *per_cpu_ptr(sdd->sgc, j) = sgc; |
| 1312 | } | 1556 | } |
| 1313 | } | 1557 | } |
| @@ -1407,7 +1651,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att | |||
| 1407 | sd = build_sched_domain(tl, cpu_map, attr, sd, i); | 1651 | sd = build_sched_domain(tl, cpu_map, attr, sd, i); |
| 1408 | if (tl == sched_domain_topology) | 1652 | if (tl == sched_domain_topology) |
| 1409 | *per_cpu_ptr(d.sd, i) = sd; | 1653 | *per_cpu_ptr(d.sd, i) = sd; |
| 1410 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) | 1654 | if (tl->flags & SDTL_OVERLAP) |
| 1411 | sd->flags |= SD_OVERLAP; | 1655 | sd->flags |= SD_OVERLAP; |
| 1412 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) | 1656 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) |
| 1413 | break; | 1657 | break; |
| @@ -1478,7 +1722,7 @@ static struct sched_domain_attr *dattr_cur; | |||
| 1478 | * cpumask) fails, then fallback to a single sched domain, | 1722 | * cpumask) fails, then fallback to a single sched domain, |
| 1479 | * as determined by the single cpumask fallback_doms. | 1723 | * as determined by the single cpumask fallback_doms. |
| 1480 | */ | 1724 | */ |
| 1481 | cpumask_var_t fallback_doms; | 1725 | static cpumask_var_t fallback_doms; |
| 1482 | 1726 | ||
| 1483 | /* | 1727 | /* |
| 1484 | * arch_update_cpu_topology lets virtualized architectures update the | 1728 | * arch_update_cpu_topology lets virtualized architectures update the |
| @@ -1520,10 +1764,14 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | |||
| 1520 | * For now this just excludes isolated CPUs, but could be used to | 1764 | * For now this just excludes isolated CPUs, but could be used to |
| 1521 | * exclude other special cases in the future. | 1765 | * exclude other special cases in the future. |
| 1522 | */ | 1766 | */ |
| 1523 | int init_sched_domains(const struct cpumask *cpu_map) | 1767 | int sched_init_domains(const struct cpumask *cpu_map) |
| 1524 | { | 1768 | { |
| 1525 | int err; | 1769 | int err; |
| 1526 | 1770 | ||
| 1771 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL); | ||
| 1772 | zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL); | ||
| 1773 | zalloc_cpumask_var(&fallback_doms, GFP_KERNEL); | ||
| 1774 | |||
| 1527 | arch_update_cpu_topology(); | 1775 | arch_update_cpu_topology(); |
| 1528 | ndoms_cur = 1; | 1776 | ndoms_cur = 1; |
| 1529 | doms_cur = alloc_sched_domains(ndoms_cur); | 1777 | doms_cur = alloc_sched_domains(ndoms_cur); |
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index b8c84c6dee64..17f11c6b0a9f 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
| @@ -12,44 +12,44 @@ | |||
| 12 | #include <linux/hash.h> | 12 | #include <linux/hash.h> |
| 13 | #include <linux/kthread.h> | 13 | #include <linux/kthread.h> |
| 14 | 14 | ||
| 15 | void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) | 15 | void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) |
| 16 | { | 16 | { |
| 17 | spin_lock_init(&q->lock); | 17 | spin_lock_init(&wq_head->lock); |
| 18 | lockdep_set_class_and_name(&q->lock, key, name); | 18 | lockdep_set_class_and_name(&wq_head->lock, key, name); |
| 19 | INIT_LIST_HEAD(&q->task_list); | 19 | INIT_LIST_HEAD(&wq_head->head); |
| 20 | } | 20 | } |
| 21 | 21 | ||
| 22 | EXPORT_SYMBOL(__init_waitqueue_head); | 22 | EXPORT_SYMBOL(__init_waitqueue_head); |
| 23 | 23 | ||
| 24 | void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) | 24 | void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) |
| 25 | { | 25 | { |
| 26 | unsigned long flags; | 26 | unsigned long flags; |
| 27 | 27 | ||
| 28 | wait->flags &= ~WQ_FLAG_EXCLUSIVE; | 28 | wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; |
| 29 | spin_lock_irqsave(&q->lock, flags); | 29 | spin_lock_irqsave(&wq_head->lock, flags); |
| 30 | __add_wait_queue(q, wait); | 30 | __add_wait_queue_entry_tail(wq_head, wq_entry); |
| 31 | spin_unlock_irqrestore(&q->lock, flags); | 31 | spin_unlock_irqrestore(&wq_head->lock, flags); |
| 32 | } | 32 | } |
| 33 | EXPORT_SYMBOL(add_wait_queue); | 33 | EXPORT_SYMBOL(add_wait_queue); |
| 34 | 34 | ||
| 35 | void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) | 35 | void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) |
| 36 | { | 36 | { |
| 37 | unsigned long flags; | 37 | unsigned long flags; |
| 38 | 38 | ||
| 39 | wait->flags |= WQ_FLAG_EXCLUSIVE; | 39 | wq_entry->flags |= WQ_FLAG_EXCLUSIVE; |
| 40 | spin_lock_irqsave(&q->lock, flags); | 40 | spin_lock_irqsave(&wq_head->lock, flags); |
| 41 | __add_wait_queue_tail(q, wait); | 41 | __add_wait_queue_entry_tail(wq_head, wq_entry); |
| 42 | spin_unlock_irqrestore(&q->lock, flags); | 42 | spin_unlock_irqrestore(&wq_head->lock, flags); |
| 43 | } | 43 | } |
| 44 | EXPORT_SYMBOL(add_wait_queue_exclusive); | 44 | EXPORT_SYMBOL(add_wait_queue_exclusive); |
| 45 | 45 | ||
| 46 | void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) | 46 | void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) |
| 47 | { | 47 | { |
| 48 | unsigned long flags; | 48 | unsigned long flags; |
| 49 | 49 | ||
| 50 | spin_lock_irqsave(&q->lock, flags); | 50 | spin_lock_irqsave(&wq_head->lock, flags); |
| 51 | __remove_wait_queue(q, wait); | 51 | __remove_wait_queue(wq_head, wq_entry); |
| 52 | spin_unlock_irqrestore(&q->lock, flags); | 52 | spin_unlock_irqrestore(&wq_head->lock, flags); |
| 53 | } | 53 | } |
| 54 | EXPORT_SYMBOL(remove_wait_queue); | 54 | EXPORT_SYMBOL(remove_wait_queue); |
| 55 | 55 | ||
| @@ -63,12 +63,12 @@ EXPORT_SYMBOL(remove_wait_queue); | |||
| 63 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns | 63 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns |
| 64 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | 64 | * zero in this (rare) case, and we handle it by continuing to scan the queue. |
| 65 | */ | 65 | */ |
| 66 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | 66 | static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, |
| 67 | int nr_exclusive, int wake_flags, void *key) | 67 | int nr_exclusive, int wake_flags, void *key) |
| 68 | { | 68 | { |
| 69 | wait_queue_t *curr, *next; | 69 | wait_queue_entry_t *curr, *next; |
| 70 | 70 | ||
| 71 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { | 71 | list_for_each_entry_safe(curr, next, &wq_head->head, entry) { |
| 72 | unsigned flags = curr->flags; | 72 | unsigned flags = curr->flags; |
| 73 | 73 | ||
| 74 | if (curr->func(curr, mode, wake_flags, key) && | 74 | if (curr->func(curr, mode, wake_flags, key) && |
| @@ -79,7 +79,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | |||
| 79 | 79 | ||
| 80 | /** | 80 | /** |
| 81 | * __wake_up - wake up threads blocked on a waitqueue. | 81 | * __wake_up - wake up threads blocked on a waitqueue. |
| 82 | * @q: the waitqueue | 82 | * @wq_head: the waitqueue |
| 83 | * @mode: which threads | 83 | * @mode: which threads |
| 84 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | 84 | * @nr_exclusive: how many wake-one or wake-many threads to wake up |
| 85 | * @key: is directly passed to the wakeup function | 85 | * @key: is directly passed to the wakeup function |
| @@ -87,35 +87,35 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | |||
| 87 | * It may be assumed that this function implies a write memory barrier before | 87 | * It may be assumed that this function implies a write memory barrier before |
| 88 | * changing the task state if and only if any tasks are woken up. | 88 | * changing the task state if and only if any tasks are woken up. |
| 89 | */ | 89 | */ |
| 90 | void __wake_up(wait_queue_head_t *q, unsigned int mode, | 90 | void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, |
| 91 | int nr_exclusive, void *key) | 91 | int nr_exclusive, void *key) |
| 92 | { | 92 | { |
| 93 | unsigned long flags; | 93 | unsigned long flags; |
| 94 | 94 | ||
| 95 | spin_lock_irqsave(&q->lock, flags); | 95 | spin_lock_irqsave(&wq_head->lock, flags); |
| 96 | __wake_up_common(q, mode, nr_exclusive, 0, key); | 96 | __wake_up_common(wq_head, mode, nr_exclusive, 0, key); |
| 97 | spin_unlock_irqrestore(&q->lock, flags); | 97 | spin_unlock_irqrestore(&wq_head->lock, flags); |
| 98 | } | 98 | } |
| 99 | EXPORT_SYMBOL(__wake_up); | 99 | EXPORT_SYMBOL(__wake_up); |
| 100 | 100 | ||
| 101 | /* | 101 | /* |
| 102 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. | 102 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. |
| 103 | */ | 103 | */ |
| 104 | void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) | 104 | void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr) |
| 105 | { | 105 | { |
| 106 | __wake_up_common(q, mode, nr, 0, NULL); | 106 | __wake_up_common(wq_head, mode, nr, 0, NULL); |
| 107 | } | 107 | } |
| 108 | EXPORT_SYMBOL_GPL(__wake_up_locked); | 108 | EXPORT_SYMBOL_GPL(__wake_up_locked); |
| 109 | 109 | ||
| 110 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) | 110 | void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) |
| 111 | { | 111 | { |
| 112 | __wake_up_common(q, mode, 1, 0, key); | 112 | __wake_up_common(wq_head, mode, 1, 0, key); |
| 113 | } | 113 | } |
| 114 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); | 114 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); |
| 115 | 115 | ||
| 116 | /** | 116 | /** |
| 117 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. | 117 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. |
| 118 | * @q: the waitqueue | 118 | * @wq_head: the waitqueue |
| 119 | * @mode: which threads | 119 | * @mode: which threads |
| 120 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | 120 | * @nr_exclusive: how many wake-one or wake-many threads to wake up |
| 121 | * @key: opaque value to be passed to wakeup targets | 121 | * @key: opaque value to be passed to wakeup targets |
| @@ -130,30 +130,30 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key); | |||
| 130 | * It may be assumed that this function implies a write memory barrier before | 130 | * It may be assumed that this function implies a write memory barrier before |
| 131 | * changing the task state if and only if any tasks are woken up. | 131 | * changing the task state if and only if any tasks are woken up. |
| 132 | */ | 132 | */ |
| 133 | void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | 133 | void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, |
| 134 | int nr_exclusive, void *key) | 134 | int nr_exclusive, void *key) |
| 135 | { | 135 | { |
| 136 | unsigned long flags; | 136 | unsigned long flags; |
| 137 | int wake_flags = 1; /* XXX WF_SYNC */ | 137 | int wake_flags = 1; /* XXX WF_SYNC */ |
| 138 | 138 | ||
| 139 | if (unlikely(!q)) | 139 | if (unlikely(!wq_head)) |
| 140 | return; | 140 | return; |
| 141 | 141 | ||
| 142 | if (unlikely(nr_exclusive != 1)) | 142 | if (unlikely(nr_exclusive != 1)) |
| 143 | wake_flags = 0; | 143 | wake_flags = 0; |
| 144 | 144 | ||
| 145 | spin_lock_irqsave(&q->lock, flags); | 145 | spin_lock_irqsave(&wq_head->lock, flags); |
| 146 | __wake_up_common(q, mode, nr_exclusive, wake_flags, key); | 146 | __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key); |
| 147 | spin_unlock_irqrestore(&q->lock, flags); | 147 | spin_unlock_irqrestore(&wq_head->lock, flags); |
| 148 | } | 148 | } |
| 149 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); | 149 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); |
| 150 | 150 | ||
| 151 | /* | 151 | /* |
| 152 | * __wake_up_sync - see __wake_up_sync_key() | 152 | * __wake_up_sync - see __wake_up_sync_key() |
| 153 | */ | 153 | */ |
| 154 | void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | 154 | void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive) |
| 155 | { | 155 | { |
| 156 | __wake_up_sync_key(q, mode, nr_exclusive, NULL); | 156 | __wake_up_sync_key(wq_head, mode, nr_exclusive, NULL); |
| 157 | } | 157 | } |
| 158 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | 158 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ |
| 159 | 159 | ||
| @@ -170,48 +170,48 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | |||
| 170 | * loads to move into the critical region). | 170 | * loads to move into the critical region). |
| 171 | */ | 171 | */ |
| 172 | void | 172 | void |
| 173 | prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) | 173 | prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) |
| 174 | { | 174 | { |
| 175 | unsigned long flags; | 175 | unsigned long flags; |
| 176 | 176 | ||
| 177 | wait->flags &= ~WQ_FLAG_EXCLUSIVE; | 177 | wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; |
| 178 | spin_lock_irqsave(&q->lock, flags); | 178 | spin_lock_irqsave(&wq_head->lock, flags); |
| 179 | if (list_empty(&wait->task_list)) | 179 | if (list_empty(&wq_entry->entry)) |
| 180 | __add_wait_queue(q, wait); | 180 | __add_wait_queue(wq_head, wq_entry); |
| 181 | set_current_state(state); | 181 | set_current_state(state); |
| 182 | spin_unlock_irqrestore(&q->lock, flags); | 182 | spin_unlock_irqrestore(&wq_head->lock, flags); |
| 183 | } | 183 | } |
| 184 | EXPORT_SYMBOL(prepare_to_wait); | 184 | EXPORT_SYMBOL(prepare_to_wait); |
| 185 | 185 | ||
| 186 | void | 186 | void |
| 187 | prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) | 187 | prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) |
| 188 | { | 188 | { |
| 189 | unsigned long flags; | 189 | unsigned long flags; |
| 190 | 190 | ||
| 191 | wait->flags |= WQ_FLAG_EXCLUSIVE; | 191 | wq_entry->flags |= WQ_FLAG_EXCLUSIVE; |
| 192 | spin_lock_irqsave(&q->lock, flags); | 192 | spin_lock_irqsave(&wq_head->lock, flags); |
| 193 | if (list_empty(&wait->task_list)) | 193 | if (list_empty(&wq_entry->entry)) |
| 194 | __add_wait_queue_tail(q, wait); | 194 | __add_wait_queue_entry_tail(wq_head, wq_entry); |
| 195 | set_current_state(state); | 195 | set_current_state(state); |
| 196 | spin_unlock_irqrestore(&q->lock, flags); | 196 | spin_unlock_irqrestore(&wq_head->lock, flags); |
| 197 | } | 197 | } |
| 198 | EXPORT_SYMBOL(prepare_to_wait_exclusive); | 198 | EXPORT_SYMBOL(prepare_to_wait_exclusive); |
| 199 | 199 | ||
| 200 | void init_wait_entry(wait_queue_t *wait, int flags) | 200 | void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) |
| 201 | { | 201 | { |
| 202 | wait->flags = flags; | 202 | wq_entry->flags = flags; |
| 203 | wait->private = current; | 203 | wq_entry->private = current; |
| 204 | wait->func = autoremove_wake_function; | 204 | wq_entry->func = autoremove_wake_function; |
| 205 | INIT_LIST_HEAD(&wait->task_list); | 205 | INIT_LIST_HEAD(&wq_entry->entry); |
| 206 | } | 206 | } |
| 207 | EXPORT_SYMBOL(init_wait_entry); | 207 | EXPORT_SYMBOL(init_wait_entry); |
| 208 | 208 | ||
| 209 | long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) | 209 | long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) |
| 210 | { | 210 | { |
| 211 | unsigned long flags; | 211 | unsigned long flags; |
| 212 | long ret = 0; | 212 | long ret = 0; |
| 213 | 213 | ||
| 214 | spin_lock_irqsave(&q->lock, flags); | 214 | spin_lock_irqsave(&wq_head->lock, flags); |
| 215 | if (unlikely(signal_pending_state(state, current))) { | 215 | if (unlikely(signal_pending_state(state, current))) { |
| 216 | /* | 216 | /* |
| 217 | * Exclusive waiter must not fail if it was selected by wakeup, | 217 | * Exclusive waiter must not fail if it was selected by wakeup, |
| @@ -219,24 +219,24 @@ long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) | |||
| 219 | * | 219 | * |
| 220 | * The caller will recheck the condition and return success if | 220 | * The caller will recheck the condition and return success if |
| 221 | * we were already woken up, we can not miss the event because | 221 | * we were already woken up, we can not miss the event because |
| 222 | * wakeup locks/unlocks the same q->lock. | 222 | * wakeup locks/unlocks the same wq_head->lock. |
| 223 | * | 223 | * |
| 224 | * But we need to ensure that set-condition + wakeup after that | 224 | * But we need to ensure that set-condition + wakeup after that |
| 225 | * can't see us, it should wake up another exclusive waiter if | 225 | * can't see us, it should wake up another exclusive waiter if |
| 226 | * we fail. | 226 | * we fail. |
| 227 | */ | 227 | */ |
| 228 | list_del_init(&wait->task_list); | 228 | list_del_init(&wq_entry->entry); |
| 229 | ret = -ERESTARTSYS; | 229 | ret = -ERESTARTSYS; |
| 230 | } else { | 230 | } else { |
| 231 | if (list_empty(&wait->task_list)) { | 231 | if (list_empty(&wq_entry->entry)) { |
| 232 | if (wait->flags & WQ_FLAG_EXCLUSIVE) | 232 | if (wq_entry->flags & WQ_FLAG_EXCLUSIVE) |
| 233 | __add_wait_queue_tail(q, wait); | 233 | __add_wait_queue_entry_tail(wq_head, wq_entry); |
| 234 | else | 234 | else |
| 235 | __add_wait_queue(q, wait); | 235 | __add_wait_queue(wq_head, wq_entry); |
| 236 | } | 236 | } |
| 237 | set_current_state(state); | 237 | set_current_state(state); |
| 238 | } | 238 | } |
| 239 | spin_unlock_irqrestore(&q->lock, flags); | 239 | spin_unlock_irqrestore(&wq_head->lock, flags); |
| 240 | 240 | ||
| 241 | return ret; | 241 | return ret; |
| 242 | } | 242 | } |
| @@ -249,10 +249,10 @@ EXPORT_SYMBOL(prepare_to_wait_event); | |||
| 249 | * condition in the caller before they add the wait | 249 | * condition in the caller before they add the wait |
| 250 | * entry to the wake queue. | 250 | * entry to the wake queue. |
| 251 | */ | 251 | */ |
| 252 | int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait) | 252 | int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait) |
| 253 | { | 253 | { |
| 254 | if (likely(list_empty(&wait->task_list))) | 254 | if (likely(list_empty(&wait->entry))) |
| 255 | __add_wait_queue_tail(wq, wait); | 255 | __add_wait_queue_entry_tail(wq, wait); |
| 256 | 256 | ||
| 257 | set_current_state(TASK_INTERRUPTIBLE); | 257 | set_current_state(TASK_INTERRUPTIBLE); |
| 258 | if (signal_pending(current)) | 258 | if (signal_pending(current)) |
| @@ -265,10 +265,10 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_t *wait) | |||
| 265 | } | 265 | } |
| 266 | EXPORT_SYMBOL(do_wait_intr); | 266 | EXPORT_SYMBOL(do_wait_intr); |
| 267 | 267 | ||
| 268 | int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_t *wait) | 268 | int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait) |
| 269 | { | 269 | { |
| 270 | if (likely(list_empty(&wait->task_list))) | 270 | if (likely(list_empty(&wait->entry))) |
| 271 | __add_wait_queue_tail(wq, wait); | 271 | __add_wait_queue_entry_tail(wq, wait); |
| 272 | 272 | ||
| 273 | set_current_state(TASK_INTERRUPTIBLE); | 273 | set_current_state(TASK_INTERRUPTIBLE); |
| 274 | if (signal_pending(current)) | 274 | if (signal_pending(current)) |
| @@ -283,14 +283,14 @@ EXPORT_SYMBOL(do_wait_intr_irq); | |||
| 283 | 283 | ||
| 284 | /** | 284 | /** |
| 285 | * finish_wait - clean up after waiting in a queue | 285 | * finish_wait - clean up after waiting in a queue |
| 286 | * @q: waitqueue waited on | 286 | * @wq_head: waitqueue waited on |
| 287 | * @wait: wait descriptor | 287 | * @wq_entry: wait descriptor |
| 288 | * | 288 | * |
| 289 | * Sets current thread back to running state and removes | 289 | * Sets current thread back to running state and removes |
| 290 | * the wait descriptor from the given waitqueue if still | 290 | * the wait descriptor from the given waitqueue if still |
| 291 | * queued. | 291 | * queued. |
| 292 | */ | 292 | */ |
| 293 | void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) | 293 | void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) |
| 294 | { | 294 | { |
| 295 | unsigned long flags; | 295 | unsigned long flags; |
| 296 | 296 | ||
| @@ -308,20 +308,20 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) | |||
| 308 | * have _one_ other CPU that looks at or modifies | 308 | * have _one_ other CPU that looks at or modifies |
| 309 | * the list). | 309 | * the list). |
| 310 | */ | 310 | */ |
| 311 | if (!list_empty_careful(&wait->task_list)) { | 311 | if (!list_empty_careful(&wq_entry->entry)) { |
| 312 | spin_lock_irqsave(&q->lock, flags); | 312 | spin_lock_irqsave(&wq_head->lock, flags); |
| 313 | list_del_init(&wait->task_list); | 313 | list_del_init(&wq_entry->entry); |
| 314 | spin_unlock_irqrestore(&q->lock, flags); | 314 | spin_unlock_irqrestore(&wq_head->lock, flags); |
| 315 | } | 315 | } |
| 316 | } | 316 | } |
| 317 | EXPORT_SYMBOL(finish_wait); | 317 | EXPORT_SYMBOL(finish_wait); |
| 318 | 318 | ||
| 319 | int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) | 319 | int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key) |
| 320 | { | 320 | { |
| 321 | int ret = default_wake_function(wait, mode, sync, key); | 321 | int ret = default_wake_function(wq_entry, mode, sync, key); |
| 322 | 322 | ||
| 323 | if (ret) | 323 | if (ret) |
| 324 | list_del_init(&wait->task_list); | 324 | list_del_init(&wq_entry->entry); |
| 325 | return ret; | 325 | return ret; |
| 326 | } | 326 | } |
| 327 | EXPORT_SYMBOL(autoremove_wake_function); | 327 | EXPORT_SYMBOL(autoremove_wake_function); |
| @@ -334,24 +334,24 @@ static inline bool is_kthread_should_stop(void) | |||
| 334 | /* | 334 | /* |
| 335 | * DEFINE_WAIT_FUNC(wait, woken_wake_func); | 335 | * DEFINE_WAIT_FUNC(wait, woken_wake_func); |
| 336 | * | 336 | * |
| 337 | * add_wait_queue(&wq, &wait); | 337 | * add_wait_queue(&wq_head, &wait); |
| 338 | * for (;;) { | 338 | * for (;;) { |
| 339 | * if (condition) | 339 | * if (condition) |
| 340 | * break; | 340 | * break; |
| 341 | * | 341 | * |
| 342 | * p->state = mode; condition = true; | 342 | * p->state = mode; condition = true; |
| 343 | * smp_mb(); // A smp_wmb(); // C | 343 | * smp_mb(); // A smp_wmb(); // C |
| 344 | * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN; | 344 | * if (!wq_entry->flags & WQ_FLAG_WOKEN) wq_entry->flags |= WQ_FLAG_WOKEN; |
| 345 | * schedule() try_to_wake_up(); | 345 | * schedule() try_to_wake_up(); |
| 346 | * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ | 346 | * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ |
| 347 | * wait->flags &= ~WQ_FLAG_WOKEN; condition = true; | 347 | * wq_entry->flags &= ~WQ_FLAG_WOKEN; condition = true; |
| 348 | * smp_mb() // B smp_wmb(); // C | 348 | * smp_mb() // B smp_wmb(); // C |
| 349 | * wait->flags |= WQ_FLAG_WOKEN; | 349 | * wq_entry->flags |= WQ_FLAG_WOKEN; |
| 350 | * } | 350 | * } |
| 351 | * remove_wait_queue(&wq, &wait); | 351 | * remove_wait_queue(&wq_head, &wait); |
| 352 | * | 352 | * |
| 353 | */ | 353 | */ |
| 354 | long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) | 354 | long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout) |
| 355 | { | 355 | { |
| 356 | set_current_state(mode); /* A */ | 356 | set_current_state(mode); /* A */ |
| 357 | /* | 357 | /* |
| @@ -359,7 +359,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) | |||
| 359 | * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must | 359 | * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must |
| 360 | * also observe all state before the wakeup. | 360 | * also observe all state before the wakeup. |
| 361 | */ | 361 | */ |
| 362 | if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) | 362 | if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) |
| 363 | timeout = schedule_timeout(timeout); | 363 | timeout = schedule_timeout(timeout); |
| 364 | __set_current_state(TASK_RUNNING); | 364 | __set_current_state(TASK_RUNNING); |
| 365 | 365 | ||
| @@ -369,13 +369,13 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) | |||
| 369 | * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss | 369 | * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss |
| 370 | * an event. | 370 | * an event. |
| 371 | */ | 371 | */ |
| 372 | smp_store_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ | 372 | smp_store_mb(wq_entry->flags, wq_entry->flags & ~WQ_FLAG_WOKEN); /* B */ |
| 373 | 373 | ||
| 374 | return timeout; | 374 | return timeout; |
| 375 | } | 375 | } |
| 376 | EXPORT_SYMBOL(wait_woken); | 376 | EXPORT_SYMBOL(wait_woken); |
| 377 | 377 | ||
| 378 | int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) | 378 | int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key) |
| 379 | { | 379 | { |
| 380 | /* | 380 | /* |
| 381 | * Although this function is called under waitqueue lock, LOCK | 381 | * Although this function is called under waitqueue lock, LOCK |
| @@ -385,267 +385,8 @@ int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) | |||
| 385 | * and is paired with smp_store_mb() in wait_woken(). | 385 | * and is paired with smp_store_mb() in wait_woken(). |
| 386 | */ | 386 | */ |
| 387 | smp_wmb(); /* C */ | 387 | smp_wmb(); /* C */ |
| 388 | wait->flags |= WQ_FLAG_WOKEN; | 388 | wq_entry->flags |= WQ_FLAG_WOKEN; |
| 389 | 389 | ||
| 390 | return default_wake_function(wait, mode, sync, key); | 390 | return default_wake_function(wq_entry, mode, sync, key); |
| 391 | } | 391 | } |
| 392 | EXPORT_SYMBOL(woken_wake_function); | 392 | EXPORT_SYMBOL(woken_wake_function); |
| 393 | |||
| 394 | int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) | ||
| 395 | { | ||
| 396 | struct wait_bit_key *key = arg; | ||
| 397 | struct wait_bit_queue *wait_bit | ||
| 398 | = container_of(wait, struct wait_bit_queue, wait); | ||
| 399 | |||
| 400 | if (wait_bit->key.flags != key->flags || | ||
| 401 | wait_bit->key.bit_nr != key->bit_nr || | ||
| 402 | test_bit(key->bit_nr, key->flags)) | ||
| 403 | return 0; | ||
| 404 | else | ||
| 405 | return autoremove_wake_function(wait, mode, sync, key); | ||
| 406 | } | ||
| 407 | EXPORT_SYMBOL(wake_bit_function); | ||
| 408 | |||
| 409 | /* | ||
| 410 | * To allow interruptible waiting and asynchronous (i.e. nonblocking) | ||
| 411 | * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are | ||
| 412 | * permitted return codes. Nonzero return codes halt waiting and return. | ||
| 413 | */ | ||
| 414 | int __sched | ||
| 415 | __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, | ||
| 416 | wait_bit_action_f *action, unsigned mode) | ||
| 417 | { | ||
| 418 | int ret = 0; | ||
| 419 | |||
| 420 | do { | ||
| 421 | prepare_to_wait(wq, &q->wait, mode); | ||
| 422 | if (test_bit(q->key.bit_nr, q->key.flags)) | ||
| 423 | ret = (*action)(&q->key, mode); | ||
| 424 | } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); | ||
| 425 | finish_wait(wq, &q->wait); | ||
| 426 | return ret; | ||
| 427 | } | ||
| 428 | EXPORT_SYMBOL(__wait_on_bit); | ||
| 429 | |||
| 430 | int __sched out_of_line_wait_on_bit(void *word, int bit, | ||
| 431 | wait_bit_action_f *action, unsigned mode) | ||
| 432 | { | ||
| 433 | wait_queue_head_t *wq = bit_waitqueue(word, bit); | ||
| 434 | DEFINE_WAIT_BIT(wait, word, bit); | ||
| 435 | |||
| 436 | return __wait_on_bit(wq, &wait, action, mode); | ||
| 437 | } | ||
| 438 | EXPORT_SYMBOL(out_of_line_wait_on_bit); | ||
| 439 | |||
| 440 | int __sched out_of_line_wait_on_bit_timeout( | ||
| 441 | void *word, int bit, wait_bit_action_f *action, | ||
| 442 | unsigned mode, unsigned long timeout) | ||
| 443 | { | ||
| 444 | wait_queue_head_t *wq = bit_waitqueue(word, bit); | ||
| 445 | DEFINE_WAIT_BIT(wait, word, bit); | ||
| 446 | |||
| 447 | wait.key.timeout = jiffies + timeout; | ||
| 448 | return __wait_on_bit(wq, &wait, action, mode); | ||
| 449 | } | ||
| 450 | EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); | ||
| 451 | |||
| 452 | int __sched | ||
| 453 | __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, | ||
| 454 | wait_bit_action_f *action, unsigned mode) | ||
| 455 | { | ||
| 456 | int ret = 0; | ||
| 457 | |||
| 458 | for (;;) { | ||
| 459 | prepare_to_wait_exclusive(wq, &q->wait, mode); | ||
| 460 | if (test_bit(q->key.bit_nr, q->key.flags)) { | ||
| 461 | ret = action(&q->key, mode); | ||
| 462 | /* | ||
| 463 | * See the comment in prepare_to_wait_event(). | ||
| 464 | * finish_wait() does not necessarily takes wq->lock, | ||
| 465 | * but test_and_set_bit() implies mb() which pairs with | ||
| 466 | * smp_mb__after_atomic() before wake_up_page(). | ||
| 467 | */ | ||
| 468 | if (ret) | ||
| 469 | finish_wait(wq, &q->wait); | ||
| 470 | } | ||
| 471 | if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) { | ||
| 472 | if (!ret) | ||
| 473 | finish_wait(wq, &q->wait); | ||
| 474 | return 0; | ||
| 475 | } else if (ret) { | ||
| 476 | return ret; | ||
| 477 | } | ||
| 478 | } | ||
| 479 | } | ||
| 480 | EXPORT_SYMBOL(__wait_on_bit_lock); | ||
| 481 | |||
| 482 | int __sched out_of_line_wait_on_bit_lock(void *word, int bit, | ||
| 483 | wait_bit_action_f *action, unsigned mode) | ||
| 484 | { | ||
| 485 | wait_queue_head_t *wq = bit_waitqueue(word, bit); | ||
| 486 | DEFINE_WAIT_BIT(wait, word, bit); | ||
| 487 | |||
| 488 | return __wait_on_bit_lock(wq, &wait, action, mode); | ||
| 489 | } | ||
| 490 | EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); | ||
| 491 | |||
| 492 | void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) | ||
| 493 | { | ||
| 494 | struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); | ||
| 495 | if (waitqueue_active(wq)) | ||
| 496 | __wake_up(wq, TASK_NORMAL, 1, &key); | ||
| 497 | } | ||
| 498 | EXPORT_SYMBOL(__wake_up_bit); | ||
| 499 | |||
| 500 | /** | ||
| 501 | * wake_up_bit - wake up a waiter on a bit | ||
| 502 | * @word: the word being waited on, a kernel virtual address | ||
| 503 | * @bit: the bit of the word being waited on | ||
| 504 | * | ||
| 505 | * There is a standard hashed waitqueue table for generic use. This | ||
| 506 | * is the part of the hashtable's accessor API that wakes up waiters | ||
| 507 | * on a bit. For instance, if one were to have waiters on a bitflag, | ||
| 508 | * one would call wake_up_bit() after clearing the bit. | ||
| 509 | * | ||
| 510 | * In order for this to function properly, as it uses waitqueue_active() | ||
| 511 | * internally, some kind of memory barrier must be done prior to calling | ||
| 512 | * this. Typically, this will be smp_mb__after_atomic(), but in some | ||
| 513 | * cases where bitflags are manipulated non-atomically under a lock, one | ||
| 514 | * may need to use a less regular barrier, such fs/inode.c's smp_mb(), | ||
| 515 | * because spin_unlock() does not guarantee a memory barrier. | ||
| 516 | */ | ||
| 517 | void wake_up_bit(void *word, int bit) | ||
| 518 | { | ||
| 519 | __wake_up_bit(bit_waitqueue(word, bit), word, bit); | ||
| 520 | } | ||
| 521 | EXPORT_SYMBOL(wake_up_bit); | ||
| 522 | |||
| 523 | /* | ||
| 524 | * Manipulate the atomic_t address to produce a better bit waitqueue table hash | ||
| 525 | * index (we're keying off bit -1, but that would produce a horrible hash | ||
| 526 | * value). | ||
| 527 | */ | ||
| 528 | static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) | ||
| 529 | { | ||
| 530 | if (BITS_PER_LONG == 64) { | ||
| 531 | unsigned long q = (unsigned long)p; | ||
| 532 | return bit_waitqueue((void *)(q & ~1), q & 1); | ||
| 533 | } | ||
| 534 | return bit_waitqueue(p, 0); | ||
| 535 | } | ||
| 536 | |||
| 537 | static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, | ||
| 538 | void *arg) | ||
| 539 | { | ||
| 540 | struct wait_bit_key *key = arg; | ||
| 541 | struct wait_bit_queue *wait_bit | ||
| 542 | = container_of(wait, struct wait_bit_queue, wait); | ||
| 543 | atomic_t *val = key->flags; | ||
| 544 | |||
| 545 | if (wait_bit->key.flags != key->flags || | ||
| 546 | wait_bit->key.bit_nr != key->bit_nr || | ||
| 547 | atomic_read(val) != 0) | ||
| 548 | return 0; | ||
| 549 | return autoremove_wake_function(wait, mode, sync, key); | ||
| 550 | } | ||
| 551 | |||
| 552 | /* | ||
| 553 | * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, | ||
| 554 | * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero | ||
| 555 | * return codes halt waiting and return. | ||
| 556 | */ | ||
| 557 | static __sched | ||
| 558 | int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, | ||
| 559 | int (*action)(atomic_t *), unsigned mode) | ||
| 560 | { | ||
| 561 | atomic_t *val; | ||
| 562 | int ret = 0; | ||
| 563 | |||
| 564 | do { | ||
| 565 | prepare_to_wait(wq, &q->wait, mode); | ||
| 566 | val = q->key.flags; | ||
| 567 | if (atomic_read(val) == 0) | ||
| 568 | break; | ||
| 569 | ret = (*action)(val); | ||
| 570 | } while (!ret && atomic_read(val) != 0); | ||
| 571 | finish_wait(wq, &q->wait); | ||
| 572 | return ret; | ||
| 573 | } | ||
| 574 | |||
| 575 | #define DEFINE_WAIT_ATOMIC_T(name, p) \ | ||
| 576 | struct wait_bit_queue name = { \ | ||
| 577 | .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ | ||
| 578 | .wait = { \ | ||
| 579 | .private = current, \ | ||
| 580 | .func = wake_atomic_t_function, \ | ||
| 581 | .task_list = \ | ||
| 582 | LIST_HEAD_INIT((name).wait.task_list), \ | ||
| 583 | }, \ | ||
| 584 | } | ||
| 585 | |||
| 586 | __sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), | ||
| 587 | unsigned mode) | ||
| 588 | { | ||
| 589 | wait_queue_head_t *wq = atomic_t_waitqueue(p); | ||
| 590 | DEFINE_WAIT_ATOMIC_T(wait, p); | ||
| 591 | |||
| 592 | return __wait_on_atomic_t(wq, &wait, action, mode); | ||
| 593 | } | ||
| 594 | EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); | ||
| 595 | |||
| 596 | /** | ||
| 597 | * wake_up_atomic_t - Wake up a waiter on a atomic_t | ||
| 598 | * @p: The atomic_t being waited on, a kernel virtual address | ||
| 599 | * | ||
| 600 | * Wake up anyone waiting for the atomic_t to go to zero. | ||
| 601 | * | ||
| 602 | * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t | ||
| 603 | * check is done by the waiter's wake function, not the by the waker itself). | ||
| 604 | */ | ||
| 605 | void wake_up_atomic_t(atomic_t *p) | ||
| 606 | { | ||
| 607 | __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); | ||
| 608 | } | ||
| 609 | EXPORT_SYMBOL(wake_up_atomic_t); | ||
| 610 | |||
| 611 | __sched int bit_wait(struct wait_bit_key *word, int mode) | ||
| 612 | { | ||
| 613 | schedule(); | ||
| 614 | if (signal_pending_state(mode, current)) | ||
| 615 | return -EINTR; | ||
| 616 | return 0; | ||
| 617 | } | ||
| 618 | EXPORT_SYMBOL(bit_wait); | ||
| 619 | |||
| 620 | __sched int bit_wait_io(struct wait_bit_key *word, int mode) | ||
| 621 | { | ||
| 622 | io_schedule(); | ||
| 623 | if (signal_pending_state(mode, current)) | ||
| 624 | return -EINTR; | ||
| 625 | return 0; | ||
| 626 | } | ||
| 627 | EXPORT_SYMBOL(bit_wait_io); | ||
| 628 | |||
| 629 | __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) | ||
| 630 | { | ||
| 631 | unsigned long now = READ_ONCE(jiffies); | ||
| 632 | if (time_after_eq(now, word->timeout)) | ||
| 633 | return -EAGAIN; | ||
| 634 | schedule_timeout(word->timeout - now); | ||
| 635 | if (signal_pending_state(mode, current)) | ||
| 636 | return -EINTR; | ||
| 637 | return 0; | ||
| 638 | } | ||
| 639 | EXPORT_SYMBOL_GPL(bit_wait_timeout); | ||
| 640 | |||
| 641 | __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) | ||
| 642 | { | ||
| 643 | unsigned long now = READ_ONCE(jiffies); | ||
| 644 | if (time_after_eq(now, word->timeout)) | ||
| 645 | return -EAGAIN; | ||
| 646 | io_schedule_timeout(word->timeout - now); | ||
| 647 | if (signal_pending_state(mode, current)) | ||
| 648 | return -EINTR; | ||
| 649 | return 0; | ||
| 650 | } | ||
| 651 | EXPORT_SYMBOL_GPL(bit_wait_io_timeout); | ||
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c new file mode 100644 index 000000000000..f8159698aa4d --- /dev/null +++ b/kernel/sched/wait_bit.c | |||
| @@ -0,0 +1,286 @@ | |||
| 1 | /* | ||
| 2 | * The implementation of the wait_bit*() and related waiting APIs: | ||
| 3 | */ | ||
| 4 | #include <linux/wait_bit.h> | ||
| 5 | #include <linux/sched/signal.h> | ||
| 6 | #include <linux/sched/debug.h> | ||
| 7 | #include <linux/hash.h> | ||
| 8 | |||
| 9 | #define WAIT_TABLE_BITS 8 | ||
| 10 | #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) | ||
| 11 | |||
| 12 | static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; | ||
| 13 | |||
| 14 | wait_queue_head_t *bit_waitqueue(void *word, int bit) | ||
| 15 | { | ||
| 16 | const int shift = BITS_PER_LONG == 32 ? 5 : 6; | ||
| 17 | unsigned long val = (unsigned long)word << shift | bit; | ||
| 18 | |||
| 19 | return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); | ||
| 20 | } | ||
| 21 | EXPORT_SYMBOL(bit_waitqueue); | ||
| 22 | |||
| 23 | int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *arg) | ||
| 24 | { | ||
| 25 | struct wait_bit_key *key = arg; | ||
| 26 | struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); | ||
| 27 | |||
| 28 | if (wait_bit->key.flags != key->flags || | ||
| 29 | wait_bit->key.bit_nr != key->bit_nr || | ||
| 30 | test_bit(key->bit_nr, key->flags)) | ||
| 31 | return 0; | ||
| 32 | else | ||
| 33 | return autoremove_wake_function(wq_entry, mode, sync, key); | ||
| 34 | } | ||
| 35 | EXPORT_SYMBOL(wake_bit_function); | ||
| 36 | |||
| 37 | /* | ||
| 38 | * To allow interruptible waiting and asynchronous (i.e. nonblocking) | ||
| 39 | * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are | ||
| 40 | * permitted return codes. Nonzero return codes halt waiting and return. | ||
| 41 | */ | ||
| 42 | int __sched | ||
| 43 | __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, | ||
| 44 | wait_bit_action_f *action, unsigned mode) | ||
| 45 | { | ||
| 46 | int ret = 0; | ||
| 47 | |||
| 48 | do { | ||
| 49 | prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); | ||
| 50 | if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) | ||
| 51 | ret = (*action)(&wbq_entry->key, mode); | ||
| 52 | } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); | ||
| 53 | finish_wait(wq_head, &wbq_entry->wq_entry); | ||
| 54 | return ret; | ||
| 55 | } | ||
| 56 | EXPORT_SYMBOL(__wait_on_bit); | ||
| 57 | |||
| 58 | int __sched out_of_line_wait_on_bit(void *word, int bit, | ||
| 59 | wait_bit_action_f *action, unsigned mode) | ||
| 60 | { | ||
| 61 | struct wait_queue_head *wq_head = bit_waitqueue(word, bit); | ||
| 62 | DEFINE_WAIT_BIT(wq_entry, word, bit); | ||
| 63 | |||
| 64 | return __wait_on_bit(wq_head, &wq_entry, action, mode); | ||
| 65 | } | ||
| 66 | EXPORT_SYMBOL(out_of_line_wait_on_bit); | ||
| 67 | |||
| 68 | int __sched out_of_line_wait_on_bit_timeout( | ||
| 69 | void *word, int bit, wait_bit_action_f *action, | ||
| 70 | unsigned mode, unsigned long timeout) | ||
| 71 | { | ||
| 72 | struct wait_queue_head *wq_head = bit_waitqueue(word, bit); | ||
| 73 | DEFINE_WAIT_BIT(wq_entry, word, bit); | ||
| 74 | |||
| 75 | wq_entry.key.timeout = jiffies + timeout; | ||
| 76 | return __wait_on_bit(wq_head, &wq_entry, action, mode); | ||
| 77 | } | ||
| 78 | EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); | ||
| 79 | |||
| 80 | int __sched | ||
| 81 | __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, | ||
| 82 | wait_bit_action_f *action, unsigned mode) | ||
| 83 | { | ||
| 84 | int ret = 0; | ||
| 85 | |||
| 86 | for (;;) { | ||
| 87 | prepare_to_wait_exclusive(wq_head, &wbq_entry->wq_entry, mode); | ||
| 88 | if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) { | ||
| 89 | ret = action(&wbq_entry->key, mode); | ||
| 90 | /* | ||
| 91 | * See the comment in prepare_to_wait_event(). | ||
| 92 | * finish_wait() does not necessarily takes wwq_head->lock, | ||
| 93 | * but test_and_set_bit() implies mb() which pairs with | ||
| 94 | * smp_mb__after_atomic() before wake_up_page(). | ||
| 95 | */ | ||
| 96 | if (ret) | ||
| 97 | finish_wait(wq_head, &wbq_entry->wq_entry); | ||
| 98 | } | ||
| 99 | if (!test_and_set_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) { | ||
| 100 | if (!ret) | ||
| 101 | finish_wait(wq_head, &wbq_entry->wq_entry); | ||
| 102 | return 0; | ||
| 103 | } else if (ret) { | ||
| 104 | return ret; | ||
| 105 | } | ||
| 106 | } | ||
| 107 | } | ||
| 108 | EXPORT_SYMBOL(__wait_on_bit_lock); | ||
| 109 | |||
| 110 | int __sched out_of_line_wait_on_bit_lock(void *word, int bit, | ||
| 111 | wait_bit_action_f *action, unsigned mode) | ||
| 112 | { | ||
| 113 | struct wait_queue_head *wq_head = bit_waitqueue(word, bit); | ||
| 114 | DEFINE_WAIT_BIT(wq_entry, word, bit); | ||
| 115 | |||
| 116 | return __wait_on_bit_lock(wq_head, &wq_entry, action, mode); | ||
| 117 | } | ||
| 118 | EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); | ||
| 119 | |||
| 120 | void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) | ||
| 121 | { | ||
| 122 | struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); | ||
| 123 | if (waitqueue_active(wq_head)) | ||
| 124 | __wake_up(wq_head, TASK_NORMAL, 1, &key); | ||
| 125 | } | ||
| 126 | EXPORT_SYMBOL(__wake_up_bit); | ||
| 127 | |||
| 128 | /** | ||
| 129 | * wake_up_bit - wake up a waiter on a bit | ||
| 130 | * @word: the word being waited on, a kernel virtual address | ||
| 131 | * @bit: the bit of the word being waited on | ||
| 132 | * | ||
| 133 | * There is a standard hashed waitqueue table for generic use. This | ||
| 134 | * is the part of the hashtable's accessor API that wakes up waiters | ||
| 135 | * on a bit. For instance, if one were to have waiters on a bitflag, | ||
| 136 | * one would call wake_up_bit() after clearing the bit. | ||
| 137 | * | ||
| 138 | * In order for this to function properly, as it uses waitqueue_active() | ||
| 139 | * internally, some kind of memory barrier must be done prior to calling | ||
| 140 | * this. Typically, this will be smp_mb__after_atomic(), but in some | ||
| 141 | * cases where bitflags are manipulated non-atomically under a lock, one | ||
| 142 | * may need to use a less regular barrier, such fs/inode.c's smp_mb(), | ||
| 143 | * because spin_unlock() does not guarantee a memory barrier. | ||
| 144 | */ | ||
| 145 | void wake_up_bit(void *word, int bit) | ||
| 146 | { | ||
| 147 | __wake_up_bit(bit_waitqueue(word, bit), word, bit); | ||
| 148 | } | ||
| 149 | EXPORT_SYMBOL(wake_up_bit); | ||
| 150 | |||
| 151 | /* | ||
| 152 | * Manipulate the atomic_t address to produce a better bit waitqueue table hash | ||
| 153 | * index (we're keying off bit -1, but that would produce a horrible hash | ||
| 154 | * value). | ||
| 155 | */ | ||
| 156 | static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) | ||
| 157 | { | ||
| 158 | if (BITS_PER_LONG == 64) { | ||
| 159 | unsigned long q = (unsigned long)p; | ||
| 160 | return bit_waitqueue((void *)(q & ~1), q & 1); | ||
| 161 | } | ||
| 162 | return bit_waitqueue(p, 0); | ||
| 163 | } | ||
| 164 | |||
| 165 | static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, | ||
| 166 | void *arg) | ||
| 167 | { | ||
| 168 | struct wait_bit_key *key = arg; | ||
| 169 | struct wait_bit_queue_entry *wait_bit = container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); | ||
| 170 | atomic_t *val = key->flags; | ||
| 171 | |||
| 172 | if (wait_bit->key.flags != key->flags || | ||
| 173 | wait_bit->key.bit_nr != key->bit_nr || | ||
| 174 | atomic_read(val) != 0) | ||
| 175 | return 0; | ||
| 176 | return autoremove_wake_function(wq_entry, mode, sync, key); | ||
| 177 | } | ||
| 178 | |||
| 179 | /* | ||
| 180 | * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, | ||
| 181 | * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero | ||
| 182 | * return codes halt waiting and return. | ||
| 183 | */ | ||
| 184 | static __sched | ||
| 185 | int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, | ||
| 186 | int (*action)(atomic_t *), unsigned mode) | ||
| 187 | { | ||
| 188 | atomic_t *val; | ||
| 189 | int ret = 0; | ||
| 190 | |||
| 191 | do { | ||
| 192 | prepare_to_wait(wq_head, &wbq_entry->wq_entry, mode); | ||
| 193 | val = wbq_entry->key.flags; | ||
| 194 | if (atomic_read(val) == 0) | ||
| 195 | break; | ||
| 196 | ret = (*action)(val); | ||
| 197 | } while (!ret && atomic_read(val) != 0); | ||
| 198 | finish_wait(wq_head, &wbq_entry->wq_entry); | ||
| 199 | return ret; | ||
| 200 | } | ||
| 201 | |||
| 202 | #define DEFINE_WAIT_ATOMIC_T(name, p) \ | ||
| 203 | struct wait_bit_queue_entry name = { \ | ||
| 204 | .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ | ||
| 205 | .wq_entry = { \ | ||
| 206 | .private = current, \ | ||
| 207 | .func = wake_atomic_t_function, \ | ||
| 208 | .entry = \ | ||
| 209 | LIST_HEAD_INIT((name).wq_entry.entry), \ | ||
| 210 | }, \ | ||
| 211 | } | ||
| 212 | |||
| 213 | __sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), | ||
| 214 | unsigned mode) | ||
| 215 | { | ||
| 216 | struct wait_queue_head *wq_head = atomic_t_waitqueue(p); | ||
| 217 | DEFINE_WAIT_ATOMIC_T(wq_entry, p); | ||
| 218 | |||
| 219 | return __wait_on_atomic_t(wq_head, &wq_entry, action, mode); | ||
| 220 | } | ||
| 221 | EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); | ||
| 222 | |||
| 223 | /** | ||
| 224 | * wake_up_atomic_t - Wake up a waiter on a atomic_t | ||
| 225 | * @p: The atomic_t being waited on, a kernel virtual address | ||
| 226 | * | ||
| 227 | * Wake up anyone waiting for the atomic_t to go to zero. | ||
| 228 | * | ||
| 229 | * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t | ||
| 230 | * check is done by the waiter's wake function, not the by the waker itself). | ||
| 231 | */ | ||
| 232 | void wake_up_atomic_t(atomic_t *p) | ||
| 233 | { | ||
| 234 | __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); | ||
| 235 | } | ||
| 236 | EXPORT_SYMBOL(wake_up_atomic_t); | ||
| 237 | |||
| 238 | __sched int bit_wait(struct wait_bit_key *word, int mode) | ||
| 239 | { | ||
| 240 | schedule(); | ||
| 241 | if (signal_pending_state(mode, current)) | ||
| 242 | return -EINTR; | ||
| 243 | return 0; | ||
| 244 | } | ||
| 245 | EXPORT_SYMBOL(bit_wait); | ||
| 246 | |||
| 247 | __sched int bit_wait_io(struct wait_bit_key *word, int mode) | ||
| 248 | { | ||
| 249 | io_schedule(); | ||
| 250 | if (signal_pending_state(mode, current)) | ||
| 251 | return -EINTR; | ||
| 252 | return 0; | ||
| 253 | } | ||
| 254 | EXPORT_SYMBOL(bit_wait_io); | ||
| 255 | |||
| 256 | __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) | ||
| 257 | { | ||
| 258 | unsigned long now = READ_ONCE(jiffies); | ||
| 259 | if (time_after_eq(now, word->timeout)) | ||
| 260 | return -EAGAIN; | ||
| 261 | schedule_timeout(word->timeout - now); | ||
| 262 | if (signal_pending_state(mode, current)) | ||
| 263 | return -EINTR; | ||
| 264 | return 0; | ||
| 265 | } | ||
| 266 | EXPORT_SYMBOL_GPL(bit_wait_timeout); | ||
| 267 | |||
| 268 | __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) | ||
| 269 | { | ||
| 270 | unsigned long now = READ_ONCE(jiffies); | ||
| 271 | if (time_after_eq(now, word->timeout)) | ||
| 272 | return -EAGAIN; | ||
| 273 | io_schedule_timeout(word->timeout - now); | ||
| 274 | if (signal_pending_state(mode, current)) | ||
| 275 | return -EINTR; | ||
| 276 | return 0; | ||
| 277 | } | ||
| 278 | EXPORT_SYMBOL_GPL(bit_wait_io_timeout); | ||
| 279 | |||
| 280 | void __init wait_bit_init(void) | ||
| 281 | { | ||
| 282 | int i; | ||
| 283 | |||
| 284 | for (i = 0; i < WAIT_TABLE_SIZE; i++) | ||
| 285 | init_waitqueue_head(bit_wait_table + i); | ||
| 286 | } | ||
diff --git a/kernel/signal.c b/kernel/signal.c index ca92bcfeb322..caed9133ae52 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -39,6 +39,7 @@ | |||
| 39 | #include <linux/compat.h> | 39 | #include <linux/compat.h> |
| 40 | #include <linux/cn_proc.h> | 40 | #include <linux/cn_proc.h> |
| 41 | #include <linux/compiler.h> | 41 | #include <linux/compiler.h> |
| 42 | #include <linux/posix-timers.h> | ||
| 42 | 43 | ||
| 43 | #define CREATE_TRACE_POINTS | 44 | #define CREATE_TRACE_POINTS |
| 44 | #include <trace/events/signal.h> | 45 | #include <trace/events/signal.h> |
| @@ -510,7 +511,8 @@ int unhandled_signal(struct task_struct *tsk, int sig) | |||
| 510 | return !tsk->ptrace; | 511 | return !tsk->ptrace; |
| 511 | } | 512 | } |
| 512 | 513 | ||
| 513 | static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) | 514 | static void collect_signal(int sig, struct sigpending *list, siginfo_t *info, |
| 515 | bool *resched_timer) | ||
| 514 | { | 516 | { |
| 515 | struct sigqueue *q, *first = NULL; | 517 | struct sigqueue *q, *first = NULL; |
| 516 | 518 | ||
| @@ -532,6 +534,12 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) | |||
| 532 | still_pending: | 534 | still_pending: |
| 533 | list_del_init(&first->list); | 535 | list_del_init(&first->list); |
| 534 | copy_siginfo(info, &first->info); | 536 | copy_siginfo(info, &first->info); |
| 537 | |||
| 538 | *resched_timer = | ||
| 539 | (first->flags & SIGQUEUE_PREALLOC) && | ||
| 540 | (info->si_code == SI_TIMER) && | ||
| 541 | (info->si_sys_private); | ||
| 542 | |||
| 535 | __sigqueue_free(first); | 543 | __sigqueue_free(first); |
| 536 | } else { | 544 | } else { |
| 537 | /* | 545 | /* |
| @@ -548,12 +556,12 @@ still_pending: | |||
| 548 | } | 556 | } |
| 549 | 557 | ||
| 550 | static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, | 558 | static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, |
| 551 | siginfo_t *info) | 559 | siginfo_t *info, bool *resched_timer) |
| 552 | { | 560 | { |
| 553 | int sig = next_signal(pending, mask); | 561 | int sig = next_signal(pending, mask); |
| 554 | 562 | ||
| 555 | if (sig) | 563 | if (sig) |
| 556 | collect_signal(sig, pending, info); | 564 | collect_signal(sig, pending, info, resched_timer); |
| 557 | return sig; | 565 | return sig; |
| 558 | } | 566 | } |
| 559 | 567 | ||
| @@ -565,15 +573,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, | |||
| 565 | */ | 573 | */ |
| 566 | int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | 574 | int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) |
| 567 | { | 575 | { |
| 576 | bool resched_timer = false; | ||
| 568 | int signr; | 577 | int signr; |
| 569 | 578 | ||
| 570 | /* We only dequeue private signals from ourselves, we don't let | 579 | /* We only dequeue private signals from ourselves, we don't let |
| 571 | * signalfd steal them | 580 | * signalfd steal them |
| 572 | */ | 581 | */ |
| 573 | signr = __dequeue_signal(&tsk->pending, mask, info); | 582 | signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer); |
| 574 | if (!signr) { | 583 | if (!signr) { |
| 575 | signr = __dequeue_signal(&tsk->signal->shared_pending, | 584 | signr = __dequeue_signal(&tsk->signal->shared_pending, |
| 576 | mask, info); | 585 | mask, info, &resched_timer); |
| 577 | #ifdef CONFIG_POSIX_TIMERS | 586 | #ifdef CONFIG_POSIX_TIMERS |
| 578 | /* | 587 | /* |
| 579 | * itimer signal ? | 588 | * itimer signal ? |
| @@ -621,7 +630,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
| 621 | current->jobctl |= JOBCTL_STOP_DEQUEUED; | 630 | current->jobctl |= JOBCTL_STOP_DEQUEUED; |
| 622 | } | 631 | } |
| 623 | #ifdef CONFIG_POSIX_TIMERS | 632 | #ifdef CONFIG_POSIX_TIMERS |
| 624 | if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { | 633 | if (resched_timer) { |
| 625 | /* | 634 | /* |
| 626 | * Release the siglock to ensure proper locking order | 635 | * Release the siglock to ensure proper locking order |
| 627 | * of timer locks outside of siglocks. Note, we leave | 636 | * of timer locks outside of siglocks. Note, we leave |
| @@ -629,7 +638,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
| 629 | * about to disable them again anyway. | 638 | * about to disable them again anyway. |
| 630 | */ | 639 | */ |
| 631 | spin_unlock(&tsk->sighand->siglock); | 640 | spin_unlock(&tsk->sighand->siglock); |
| 632 | do_schedule_next_timer(info); | 641 | posixtimer_rearm(info); |
| 633 | spin_lock(&tsk->sighand->siglock); | 642 | spin_lock(&tsk->sighand->siglock); |
| 634 | } | 643 | } |
| 635 | #endif | 644 | #endif |
| @@ -1393,6 +1402,10 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) | |||
| 1393 | return ret; | 1402 | return ret; |
| 1394 | } | 1403 | } |
| 1395 | 1404 | ||
| 1405 | /* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */ | ||
| 1406 | if (pid == INT_MIN) | ||
| 1407 | return -ESRCH; | ||
| 1408 | |||
| 1396 | read_lock(&tasklist_lock); | 1409 | read_lock(&tasklist_lock); |
| 1397 | if (pid != -1) { | 1410 | if (pid != -1) { |
| 1398 | ret = __kill_pgrp_info(sig, info, | 1411 | ret = __kill_pgrp_info(sig, info, |
| @@ -2092,7 +2105,6 @@ static void do_jobctl_trap(void) | |||
| 2092 | 2105 | ||
| 2093 | static int ptrace_signal(int signr, siginfo_t *info) | 2106 | static int ptrace_signal(int signr, siginfo_t *info) |
| 2094 | { | 2107 | { |
| 2095 | ptrace_signal_deliver(); | ||
| 2096 | /* | 2108 | /* |
| 2097 | * We do not check sig_kernel_stop(signr) but set this marker | 2109 | * We do not check sig_kernel_stop(signr) but set this marker |
| 2098 | * unconditionally because we do not know whether debugger will | 2110 | * unconditionally because we do not know whether debugger will |
| @@ -2768,7 +2780,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) | |||
| 2768 | * @info: if non-null, the signal's siginfo is returned here | 2780 | * @info: if non-null, the signal's siginfo is returned here |
| 2769 | * @ts: upper bound on process time suspension | 2781 | * @ts: upper bound on process time suspension |
| 2770 | */ | 2782 | */ |
| 2771 | int do_sigtimedwait(const sigset_t *which, siginfo_t *info, | 2783 | static int do_sigtimedwait(const sigset_t *which, siginfo_t *info, |
| 2772 | const struct timespec *ts) | 2784 | const struct timespec *ts) |
| 2773 | { | 2785 | { |
| 2774 | ktime_t *to = NULL, timeout = KTIME_MAX; | 2786 | ktime_t *to = NULL, timeout = KTIME_MAX; |
| @@ -2857,6 +2869,40 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, | |||
| 2857 | return ret; | 2869 | return ret; |
| 2858 | } | 2870 | } |
| 2859 | 2871 | ||
| 2872 | #ifdef CONFIG_COMPAT | ||
| 2873 | COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, | ||
| 2874 | struct compat_siginfo __user *, uinfo, | ||
| 2875 | struct compat_timespec __user *, uts, compat_size_t, sigsetsize) | ||
| 2876 | { | ||
| 2877 | compat_sigset_t s32; | ||
| 2878 | sigset_t s; | ||
| 2879 | struct timespec t; | ||
| 2880 | siginfo_t info; | ||
| 2881 | long ret; | ||
| 2882 | |||
| 2883 | if (sigsetsize != sizeof(sigset_t)) | ||
| 2884 | return -EINVAL; | ||
| 2885 | |||
| 2886 | if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) | ||
| 2887 | return -EFAULT; | ||
| 2888 | sigset_from_compat(&s, &s32); | ||
| 2889 | |||
| 2890 | if (uts) { | ||
| 2891 | if (compat_get_timespec(&t, uts)) | ||
| 2892 | return -EFAULT; | ||
| 2893 | } | ||
| 2894 | |||
| 2895 | ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); | ||
| 2896 | |||
| 2897 | if (ret > 0 && uinfo) { | ||
| 2898 | if (copy_siginfo_to_user32(uinfo, &info)) | ||
| 2899 | ret = -EFAULT; | ||
| 2900 | } | ||
| 2901 | |||
| 2902 | return ret; | ||
| 2903 | } | ||
| 2904 | #endif | ||
| 2905 | |||
| 2860 | /** | 2906 | /** |
| 2861 | * sys_kill - send a signal to a process | 2907 | * sys_kill - send a signal to a process |
| 2862 | * @pid: the PID of the process | 2908 | * @pid: the PID of the process |
| @@ -3113,78 +3159,68 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | |||
| 3113 | } | 3159 | } |
| 3114 | 3160 | ||
| 3115 | static int | 3161 | static int |
| 3116 | do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) | 3162 | do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp) |
| 3117 | { | 3163 | { |
| 3118 | stack_t oss; | 3164 | struct task_struct *t = current; |
| 3119 | int error; | ||
| 3120 | 3165 | ||
| 3121 | oss.ss_sp = (void __user *) current->sas_ss_sp; | 3166 | if (oss) { |
| 3122 | oss.ss_size = current->sas_ss_size; | 3167 | memset(oss, 0, sizeof(stack_t)); |
| 3123 | oss.ss_flags = sas_ss_flags(sp) | | 3168 | oss->ss_sp = (void __user *) t->sas_ss_sp; |
| 3124 | (current->sas_ss_flags & SS_FLAG_BITS); | 3169 | oss->ss_size = t->sas_ss_size; |
| 3170 | oss->ss_flags = sas_ss_flags(sp) | | ||
| 3171 | (current->sas_ss_flags & SS_FLAG_BITS); | ||
| 3172 | } | ||
| 3125 | 3173 | ||
| 3126 | if (uss) { | 3174 | if (ss) { |
| 3127 | void __user *ss_sp; | 3175 | void __user *ss_sp = ss->ss_sp; |
| 3128 | size_t ss_size; | 3176 | size_t ss_size = ss->ss_size; |
| 3129 | unsigned ss_flags; | 3177 | unsigned ss_flags = ss->ss_flags; |
| 3130 | int ss_mode; | 3178 | int ss_mode; |
| 3131 | 3179 | ||
| 3132 | error = -EFAULT; | 3180 | if (unlikely(on_sig_stack(sp))) |
| 3133 | if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) | 3181 | return -EPERM; |
| 3134 | goto out; | ||
| 3135 | error = __get_user(ss_sp, &uss->ss_sp) | | ||
| 3136 | __get_user(ss_flags, &uss->ss_flags) | | ||
| 3137 | __get_user(ss_size, &uss->ss_size); | ||
| 3138 | if (error) | ||
| 3139 | goto out; | ||
| 3140 | |||
| 3141 | error = -EPERM; | ||
| 3142 | if (on_sig_stack(sp)) | ||
| 3143 | goto out; | ||
| 3144 | 3182 | ||
| 3145 | ss_mode = ss_flags & ~SS_FLAG_BITS; | 3183 | ss_mode = ss_flags & ~SS_FLAG_BITS; |
| 3146 | error = -EINVAL; | 3184 | if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && |
| 3147 | if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && | 3185 | ss_mode != 0)) |
| 3148 | ss_mode != 0) | 3186 | return -EINVAL; |
| 3149 | goto out; | ||
| 3150 | 3187 | ||
| 3151 | if (ss_mode == SS_DISABLE) { | 3188 | if (ss_mode == SS_DISABLE) { |
| 3152 | ss_size = 0; | 3189 | ss_size = 0; |
| 3153 | ss_sp = NULL; | 3190 | ss_sp = NULL; |
| 3154 | } else { | 3191 | } else { |
| 3155 | error = -ENOMEM; | 3192 | if (unlikely(ss_size < MINSIGSTKSZ)) |
| 3156 | if (ss_size < MINSIGSTKSZ) | 3193 | return -ENOMEM; |
| 3157 | goto out; | ||
| 3158 | } | 3194 | } |
| 3159 | 3195 | ||
| 3160 | current->sas_ss_sp = (unsigned long) ss_sp; | 3196 | t->sas_ss_sp = (unsigned long) ss_sp; |
| 3161 | current->sas_ss_size = ss_size; | 3197 | t->sas_ss_size = ss_size; |
| 3162 | current->sas_ss_flags = ss_flags; | 3198 | t->sas_ss_flags = ss_flags; |
| 3163 | } | 3199 | } |
| 3164 | 3200 | return 0; | |
| 3165 | error = 0; | ||
| 3166 | if (uoss) { | ||
| 3167 | error = -EFAULT; | ||
| 3168 | if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss))) | ||
| 3169 | goto out; | ||
| 3170 | error = __put_user(oss.ss_sp, &uoss->ss_sp) | | ||
| 3171 | __put_user(oss.ss_size, &uoss->ss_size) | | ||
| 3172 | __put_user(oss.ss_flags, &uoss->ss_flags); | ||
| 3173 | } | ||
| 3174 | |||
| 3175 | out: | ||
| 3176 | return error; | ||
| 3177 | } | 3201 | } |
| 3202 | |||
| 3178 | SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) | 3203 | SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) |
| 3179 | { | 3204 | { |
| 3180 | return do_sigaltstack(uss, uoss, current_user_stack_pointer()); | 3205 | stack_t new, old; |
| 3206 | int err; | ||
| 3207 | if (uss && copy_from_user(&new, uss, sizeof(stack_t))) | ||
| 3208 | return -EFAULT; | ||
| 3209 | err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL, | ||
| 3210 | current_user_stack_pointer()); | ||
| 3211 | if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t))) | ||
| 3212 | err = -EFAULT; | ||
| 3213 | return err; | ||
| 3181 | } | 3214 | } |
| 3182 | 3215 | ||
| 3183 | int restore_altstack(const stack_t __user *uss) | 3216 | int restore_altstack(const stack_t __user *uss) |
| 3184 | { | 3217 | { |
| 3185 | int err = do_sigaltstack(uss, NULL, current_user_stack_pointer()); | 3218 | stack_t new; |
| 3219 | if (copy_from_user(&new, uss, sizeof(stack_t))) | ||
| 3220 | return -EFAULT; | ||
| 3221 | (void)do_sigaltstack(&new, NULL, current_user_stack_pointer()); | ||
| 3186 | /* squash all but EFAULT for now */ | 3222 | /* squash all but EFAULT for now */ |
| 3187 | return err == -EFAULT ? err : 0; | 3223 | return 0; |
| 3188 | } | 3224 | } |
| 3189 | 3225 | ||
| 3190 | int __save_altstack(stack_t __user *uss, unsigned long sp) | 3226 | int __save_altstack(stack_t __user *uss, unsigned long sp) |
| @@ -3207,29 +3243,24 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack, | |||
| 3207 | { | 3243 | { |
| 3208 | stack_t uss, uoss; | 3244 | stack_t uss, uoss; |
| 3209 | int ret; | 3245 | int ret; |
| 3210 | mm_segment_t seg; | ||
| 3211 | 3246 | ||
| 3212 | if (uss_ptr) { | 3247 | if (uss_ptr) { |
| 3213 | compat_stack_t uss32; | 3248 | compat_stack_t uss32; |
| 3214 | |||
| 3215 | memset(&uss, 0, sizeof(stack_t)); | ||
| 3216 | if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t))) | 3249 | if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t))) |
| 3217 | return -EFAULT; | 3250 | return -EFAULT; |
| 3218 | uss.ss_sp = compat_ptr(uss32.ss_sp); | 3251 | uss.ss_sp = compat_ptr(uss32.ss_sp); |
| 3219 | uss.ss_flags = uss32.ss_flags; | 3252 | uss.ss_flags = uss32.ss_flags; |
| 3220 | uss.ss_size = uss32.ss_size; | 3253 | uss.ss_size = uss32.ss_size; |
| 3221 | } | 3254 | } |
| 3222 | seg = get_fs(); | 3255 | ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, |
| 3223 | set_fs(KERNEL_DS); | ||
| 3224 | ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL), | ||
| 3225 | (stack_t __force __user *) &uoss, | ||
| 3226 | compat_user_stack_pointer()); | 3256 | compat_user_stack_pointer()); |
| 3227 | set_fs(seg); | ||
| 3228 | if (ret >= 0 && uoss_ptr) { | 3257 | if (ret >= 0 && uoss_ptr) { |
| 3229 | if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) || | 3258 | compat_stack_t old; |
| 3230 | __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || | 3259 | memset(&old, 0, sizeof(old)); |
| 3231 | __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || | 3260 | old.ss_sp = ptr_to_compat(uoss.ss_sp); |
| 3232 | __put_user(uoss.ss_size, &uoss_ptr->ss_size)) | 3261 | old.ss_flags = uoss.ss_flags; |
| 3262 | old.ss_size = uoss.ss_size; | ||
| 3263 | if (copy_to_user(uoss_ptr, &old, sizeof(compat_stack_t))) | ||
| 3233 | ret = -EFAULT; | 3264 | ret = -EFAULT; |
| 3234 | } | 3265 | } |
| 3235 | return ret; | 3266 | return ret; |
| @@ -3269,6 +3300,18 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) | |||
| 3269 | return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); | 3300 | return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); |
| 3270 | } | 3301 | } |
| 3271 | 3302 | ||
| 3303 | #ifdef CONFIG_COMPAT | ||
| 3304 | COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) | ||
| 3305 | { | ||
| 3306 | sigset_t set; | ||
| 3307 | int err = do_sigpending(&set, sizeof(old_sigset_t)); | ||
| 3308 | if (err == 0) | ||
| 3309 | if (copy_to_user(set32, &set, sizeof(old_sigset_t))) | ||
| 3310 | err = -EFAULT; | ||
| 3311 | return err; | ||
| 3312 | } | ||
| 3313 | #endif | ||
| 3314 | |||
| 3272 | #endif | 3315 | #endif |
| 3273 | 3316 | ||
| 3274 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK | 3317 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK |
diff --git a/kernel/smp.c b/kernel/smp.c index a817769b53c0..3061483cb3ad 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -30,6 +30,7 @@ enum { | |||
| 30 | struct call_function_data { | 30 | struct call_function_data { |
| 31 | struct call_single_data __percpu *csd; | 31 | struct call_single_data __percpu *csd; |
| 32 | cpumask_var_t cpumask; | 32 | cpumask_var_t cpumask; |
| 33 | cpumask_var_t cpumask_ipi; | ||
| 33 | }; | 34 | }; |
| 34 | 35 | ||
| 35 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); | 36 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); |
| @@ -45,9 +46,15 @@ int smpcfd_prepare_cpu(unsigned int cpu) | |||
| 45 | if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, | 46 | if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, |
| 46 | cpu_to_node(cpu))) | 47 | cpu_to_node(cpu))) |
| 47 | return -ENOMEM; | 48 | return -ENOMEM; |
| 49 | if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, | ||
| 50 | cpu_to_node(cpu))) { | ||
| 51 | free_cpumask_var(cfd->cpumask); | ||
| 52 | return -ENOMEM; | ||
| 53 | } | ||
| 48 | cfd->csd = alloc_percpu(struct call_single_data); | 54 | cfd->csd = alloc_percpu(struct call_single_data); |
| 49 | if (!cfd->csd) { | 55 | if (!cfd->csd) { |
| 50 | free_cpumask_var(cfd->cpumask); | 56 | free_cpumask_var(cfd->cpumask); |
| 57 | free_cpumask_var(cfd->cpumask_ipi); | ||
| 51 | return -ENOMEM; | 58 | return -ENOMEM; |
| 52 | } | 59 | } |
| 53 | 60 | ||
| @@ -59,6 +66,7 @@ int smpcfd_dead_cpu(unsigned int cpu) | |||
| 59 | struct call_function_data *cfd = &per_cpu(cfd_data, cpu); | 66 | struct call_function_data *cfd = &per_cpu(cfd_data, cpu); |
| 60 | 67 | ||
| 61 | free_cpumask_var(cfd->cpumask); | 68 | free_cpumask_var(cfd->cpumask); |
| 69 | free_cpumask_var(cfd->cpumask_ipi); | ||
| 62 | free_percpu(cfd->csd); | 70 | free_percpu(cfd->csd); |
| 63 | return 0; | 71 | return 0; |
| 64 | } | 72 | } |
| @@ -428,12 +436,13 @@ void smp_call_function_many(const struct cpumask *mask, | |||
| 428 | cfd = this_cpu_ptr(&cfd_data); | 436 | cfd = this_cpu_ptr(&cfd_data); |
| 429 | 437 | ||
| 430 | cpumask_and(cfd->cpumask, mask, cpu_online_mask); | 438 | cpumask_and(cfd->cpumask, mask, cpu_online_mask); |
| 431 | cpumask_clear_cpu(this_cpu, cfd->cpumask); | 439 | __cpumask_clear_cpu(this_cpu, cfd->cpumask); |
| 432 | 440 | ||
| 433 | /* Some callers race with other cpus changing the passed mask */ | 441 | /* Some callers race with other cpus changing the passed mask */ |
| 434 | if (unlikely(!cpumask_weight(cfd->cpumask))) | 442 | if (unlikely(!cpumask_weight(cfd->cpumask))) |
| 435 | return; | 443 | return; |
| 436 | 444 | ||
| 445 | cpumask_clear(cfd->cpumask_ipi); | ||
| 437 | for_each_cpu(cpu, cfd->cpumask) { | 446 | for_each_cpu(cpu, cfd->cpumask) { |
| 438 | struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); | 447 | struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); |
| 439 | 448 | ||
| @@ -442,11 +451,12 @@ void smp_call_function_many(const struct cpumask *mask, | |||
| 442 | csd->flags |= CSD_FLAG_SYNCHRONOUS; | 451 | csd->flags |= CSD_FLAG_SYNCHRONOUS; |
| 443 | csd->func = func; | 452 | csd->func = func; |
| 444 | csd->info = info; | 453 | csd->info = info; |
| 445 | llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)); | 454 | if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) |
| 455 | __cpumask_set_cpu(cpu, cfd->cpumask_ipi); | ||
| 446 | } | 456 | } |
| 447 | 457 | ||
| 448 | /* Send a message to all CPUs in the map */ | 458 | /* Send a message to all CPUs in the map */ |
| 449 | arch_send_call_function_ipi_mask(cfd->cpumask); | 459 | arch_send_call_function_ipi_mask(cfd->cpumask_ipi); |
| 450 | 460 | ||
| 451 | if (wait) { | 461 | if (wait) { |
| 452 | for_each_cpu(cpu, cfd->cpumask) { | 462 | for_each_cpu(cpu, cfd->cpumask) { |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 1eb82661ecdb..b7591261652d 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -552,7 +552,8 @@ static int __init cpu_stop_init(void) | |||
| 552 | } | 552 | } |
| 553 | early_initcall(cpu_stop_init); | 553 | early_initcall(cpu_stop_init); |
| 554 | 554 | ||
| 555 | static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) | 555 | int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, |
| 556 | const struct cpumask *cpus) | ||
| 556 | { | 557 | { |
| 557 | struct multi_stop_data msdata = { | 558 | struct multi_stop_data msdata = { |
| 558 | .fn = fn, | 559 | .fn = fn, |
| @@ -561,6 +562,8 @@ static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cp | |||
| 561 | .active_cpus = cpus, | 562 | .active_cpus = cpus, |
| 562 | }; | 563 | }; |
| 563 | 564 | ||
| 565 | lockdep_assert_cpus_held(); | ||
| 566 | |||
| 564 | if (!stop_machine_initialized) { | 567 | if (!stop_machine_initialized) { |
| 565 | /* | 568 | /* |
| 566 | * Handle the case where stop_machine() is called | 569 | * Handle the case where stop_machine() is called |
| @@ -590,9 +593,9 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) | |||
| 590 | int ret; | 593 | int ret; |
| 591 | 594 | ||
| 592 | /* No CPUs can come up or down during this. */ | 595 | /* No CPUs can come up or down during this. */ |
| 593 | get_online_cpus(); | 596 | cpus_read_lock(); |
| 594 | ret = __stop_machine(fn, data, cpus); | 597 | ret = stop_machine_cpuslocked(fn, data, cpus); |
| 595 | put_online_cpus(); | 598 | cpus_read_unlock(); |
| 596 | return ret; | 599 | return ret; |
| 597 | } | 600 | } |
| 598 | EXPORT_SYMBOL_GPL(stop_machine); | 601 | EXPORT_SYMBOL_GPL(stop_machine); |
diff --git a/kernel/sys.c b/kernel/sys.c index 8a94b4eabcaa..2855ee73acd0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -886,7 +886,7 @@ SYSCALL_DEFINE0(getegid) | |||
| 886 | return from_kgid_munged(current_user_ns(), current_egid()); | 886 | return from_kgid_munged(current_user_ns(), current_egid()); |
| 887 | } | 887 | } |
| 888 | 888 | ||
| 889 | void do_sys_times(struct tms *tms) | 889 | static void do_sys_times(struct tms *tms) |
| 890 | { | 890 | { |
| 891 | u64 tgutime, tgstime, cutime, cstime; | 891 | u64 tgutime, tgstime, cutime, cstime; |
| 892 | 892 | ||
| @@ -912,6 +912,32 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf) | |||
| 912 | return (long) jiffies_64_to_clock_t(get_jiffies_64()); | 912 | return (long) jiffies_64_to_clock_t(get_jiffies_64()); |
| 913 | } | 913 | } |
| 914 | 914 | ||
| 915 | #ifdef CONFIG_COMPAT | ||
| 916 | static compat_clock_t clock_t_to_compat_clock_t(clock_t x) | ||
| 917 | { | ||
| 918 | return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); | ||
| 919 | } | ||
| 920 | |||
| 921 | COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) | ||
| 922 | { | ||
| 923 | if (tbuf) { | ||
| 924 | struct tms tms; | ||
| 925 | struct compat_tms tmp; | ||
| 926 | |||
| 927 | do_sys_times(&tms); | ||
| 928 | /* Convert our struct tms to the compat version. */ | ||
| 929 | tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); | ||
| 930 | tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); | ||
| 931 | tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); | ||
| 932 | tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); | ||
| 933 | if (copy_to_user(tbuf, &tmp, sizeof(tmp))) | ||
| 934 | return -EFAULT; | ||
| 935 | } | ||
| 936 | force_successful_syscall_return(); | ||
| 937 | return compat_jiffies_to_clock_t(jiffies); | ||
| 938 | } | ||
| 939 | #endif | ||
| 940 | |||
| 915 | /* | 941 | /* |
| 916 | * This needs some heavy checking ... | 942 | * This needs some heavy checking ... |
| 917 | * I just haven't the stomach for it. I also don't fully | 943 | * I just haven't the stomach for it. I also don't fully |
| @@ -1306,6 +1332,54 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) | |||
| 1306 | return ret; | 1332 | return ret; |
| 1307 | } | 1333 | } |
| 1308 | 1334 | ||
| 1335 | #ifdef CONFIG_COMPAT | ||
| 1336 | |||
| 1337 | COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, | ||
| 1338 | struct compat_rlimit __user *, rlim) | ||
| 1339 | { | ||
| 1340 | struct rlimit r; | ||
| 1341 | struct compat_rlimit r32; | ||
| 1342 | |||
| 1343 | if (copy_from_user(&r32, rlim, sizeof(struct compat_rlimit))) | ||
| 1344 | return -EFAULT; | ||
| 1345 | |||
| 1346 | if (r32.rlim_cur == COMPAT_RLIM_INFINITY) | ||
| 1347 | r.rlim_cur = RLIM_INFINITY; | ||
| 1348 | else | ||
| 1349 | r.rlim_cur = r32.rlim_cur; | ||
| 1350 | if (r32.rlim_max == COMPAT_RLIM_INFINITY) | ||
| 1351 | r.rlim_max = RLIM_INFINITY; | ||
| 1352 | else | ||
| 1353 | r.rlim_max = r32.rlim_max; | ||
| 1354 | return do_prlimit(current, resource, &r, NULL); | ||
| 1355 | } | ||
| 1356 | |||
| 1357 | COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, | ||
| 1358 | struct compat_rlimit __user *, rlim) | ||
| 1359 | { | ||
| 1360 | struct rlimit r; | ||
| 1361 | int ret; | ||
| 1362 | |||
| 1363 | ret = do_prlimit(current, resource, NULL, &r); | ||
| 1364 | if (!ret) { | ||
| 1365 | struct compat_rlimit r32; | ||
| 1366 | if (r.rlim_cur > COMPAT_RLIM_INFINITY) | ||
| 1367 | r32.rlim_cur = COMPAT_RLIM_INFINITY; | ||
| 1368 | else | ||
| 1369 | r32.rlim_cur = r.rlim_cur; | ||
| 1370 | if (r.rlim_max > COMPAT_RLIM_INFINITY) | ||
| 1371 | r32.rlim_max = COMPAT_RLIM_INFINITY; | ||
| 1372 | else | ||
| 1373 | r32.rlim_max = r.rlim_max; | ||
| 1374 | |||
| 1375 | if (copy_to_user(rlim, &r32, sizeof(struct compat_rlimit))) | ||
| 1376 | return -EFAULT; | ||
| 1377 | } | ||
| 1378 | return ret; | ||
| 1379 | } | ||
| 1380 | |||
| 1381 | #endif | ||
| 1382 | |||
| 1309 | #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT | 1383 | #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT |
| 1310 | 1384 | ||
| 1311 | /* | 1385 | /* |
| @@ -1328,6 +1402,30 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, | |||
| 1328 | return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; | 1402 | return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; |
| 1329 | } | 1403 | } |
| 1330 | 1404 | ||
| 1405 | #ifdef CONFIG_COMPAT | ||
| 1406 | COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, | ||
| 1407 | struct compat_rlimit __user *, rlim) | ||
| 1408 | { | ||
| 1409 | struct rlimit r; | ||
| 1410 | |||
| 1411 | if (resource >= RLIM_NLIMITS) | ||
| 1412 | return -EINVAL; | ||
| 1413 | |||
| 1414 | task_lock(current->group_leader); | ||
| 1415 | r = current->signal->rlim[resource]; | ||
| 1416 | task_unlock(current->group_leader); | ||
| 1417 | if (r.rlim_cur > 0x7FFFFFFF) | ||
| 1418 | r.rlim_cur = 0x7FFFFFFF; | ||
| 1419 | if (r.rlim_max > 0x7FFFFFFF) | ||
| 1420 | r.rlim_max = 0x7FFFFFFF; | ||
| 1421 | |||
| 1422 | if (put_user(r.rlim_cur, &rlim->rlim_cur) || | ||
| 1423 | put_user(r.rlim_max, &rlim->rlim_max)) | ||
| 1424 | return -EFAULT; | ||
| 1425 | return 0; | ||
| 1426 | } | ||
| 1427 | #endif | ||
| 1428 | |||
| 1331 | #endif | 1429 | #endif |
| 1332 | 1430 | ||
| 1333 | static inline bool rlim64_is_infinity(__u64 rlim64) | 1431 | static inline bool rlim64_is_infinity(__u64 rlim64) |
| @@ -1552,7 +1650,7 @@ static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) | |||
| 1552 | r->ru_oublock += task_io_get_oublock(t); | 1650 | r->ru_oublock += task_io_get_oublock(t); |
| 1553 | } | 1651 | } |
| 1554 | 1652 | ||
| 1555 | static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | 1653 | void getrusage(struct task_struct *p, int who, struct rusage *r) |
| 1556 | { | 1654 | { |
| 1557 | struct task_struct *t; | 1655 | struct task_struct *t; |
| 1558 | unsigned long flags; | 1656 | unsigned long flags; |
| @@ -1626,20 +1724,16 @@ out: | |||
| 1626 | r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ | 1724 | r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ |
| 1627 | } | 1725 | } |
| 1628 | 1726 | ||
| 1629 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) | 1727 | SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) |
| 1630 | { | 1728 | { |
| 1631 | struct rusage r; | 1729 | struct rusage r; |
| 1632 | 1730 | ||
| 1633 | k_getrusage(p, who, &r); | ||
| 1634 | return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; | ||
| 1635 | } | ||
| 1636 | |||
| 1637 | SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) | ||
| 1638 | { | ||
| 1639 | if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && | 1731 | if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && |
| 1640 | who != RUSAGE_THREAD) | 1732 | who != RUSAGE_THREAD) |
| 1641 | return -EINVAL; | 1733 | return -EINVAL; |
| 1642 | return getrusage(current, who, ru); | 1734 | |
| 1735 | getrusage(current, who, &r); | ||
| 1736 | return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; | ||
| 1643 | } | 1737 | } |
| 1644 | 1738 | ||
| 1645 | #ifdef CONFIG_COMPAT | 1739 | #ifdef CONFIG_COMPAT |
| @@ -1651,7 +1745,7 @@ COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru) | |||
| 1651 | who != RUSAGE_THREAD) | 1745 | who != RUSAGE_THREAD) |
| 1652 | return -EINVAL; | 1746 | return -EINVAL; |
| 1653 | 1747 | ||
| 1654 | k_getrusage(current, who, &r); | 1748 | getrusage(current, who, &r); |
| 1655 | return put_compat_rusage(&r, ru); | 1749 | return put_compat_rusage(&r, ru); |
| 1656 | } | 1750 | } |
| 1657 | #endif | 1751 | #endif |
| @@ -2266,7 +2360,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
| 2266 | case PR_GET_THP_DISABLE: | 2360 | case PR_GET_THP_DISABLE: |
| 2267 | if (arg2 || arg3 || arg4 || arg5) | 2361 | if (arg2 || arg3 || arg4 || arg5) |
| 2268 | return -EINVAL; | 2362 | return -EINVAL; |
| 2269 | error = !!(me->mm->def_flags & VM_NOHUGEPAGE); | 2363 | error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags); |
| 2270 | break; | 2364 | break; |
| 2271 | case PR_SET_THP_DISABLE: | 2365 | case PR_SET_THP_DISABLE: |
| 2272 | if (arg3 || arg4 || arg5) | 2366 | if (arg3 || arg4 || arg5) |
| @@ -2274,9 +2368,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
| 2274 | if (down_write_killable(&me->mm->mmap_sem)) | 2368 | if (down_write_killable(&me->mm->mmap_sem)) |
| 2275 | return -EINTR; | 2369 | return -EINTR; |
| 2276 | if (arg2) | 2370 | if (arg2) |
| 2277 | me->mm->def_flags |= VM_NOHUGEPAGE; | 2371 | set_bit(MMF_DISABLE_THP, &me->mm->flags); |
| 2278 | else | 2372 | else |
| 2279 | me->mm->def_flags &= ~VM_NOHUGEPAGE; | 2373 | clear_bit(MMF_DISABLE_THP, &me->mm->flags); |
| 2280 | up_write(&me->mm->mmap_sem); | 2374 | up_write(&me->mm->mmap_sem); |
| 2281 | break; | 2375 | break; |
| 2282 | case PR_MPX_ENABLE_MANAGEMENT: | 2376 | case PR_MPX_ENABLE_MANAGEMENT: |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4dfba1a76cc3..6648fbbb8157 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -174,11 +174,32 @@ extern int no_unaligned_warning; | |||
| 174 | 174 | ||
| 175 | #ifdef CONFIG_PROC_SYSCTL | 175 | #ifdef CONFIG_PROC_SYSCTL |
| 176 | 176 | ||
| 177 | #define SYSCTL_WRITES_LEGACY -1 | 177 | /** |
| 178 | #define SYSCTL_WRITES_WARN 0 | 178 | * enum sysctl_writes_mode - supported sysctl write modes |
| 179 | #define SYSCTL_WRITES_STRICT 1 | 179 | * |
| 180 | * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value | ||
| 181 | * to be written, and multiple writes on the same sysctl file descriptor | ||
| 182 | * will rewrite the sysctl value, regardless of file position. No warning | ||
| 183 | * is issued when the initial position is not 0. | ||
| 184 | * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is | ||
| 185 | * not 0. | ||
| 186 | * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at | ||
| 187 | * file position 0 and the value must be fully contained in the buffer | ||
| 188 | * sent to the write syscall. If dealing with strings respect the file | ||
| 189 | * position, but restrict this to the max length of the buffer, anything | ||
| 190 | * passed the max lenght will be ignored. Multiple writes will append | ||
| 191 | * to the buffer. | ||
| 192 | * | ||
| 193 | * These write modes control how current file position affects the behavior of | ||
| 194 | * updating sysctl values through the proc interface on each write. | ||
| 195 | */ | ||
| 196 | enum sysctl_writes_mode { | ||
| 197 | SYSCTL_WRITES_LEGACY = -1, | ||
| 198 | SYSCTL_WRITES_WARN = 0, | ||
| 199 | SYSCTL_WRITES_STRICT = 1, | ||
| 200 | }; | ||
| 180 | 201 | ||
| 181 | static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; | 202 | static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; |
| 182 | 203 | ||
| 183 | static int proc_do_cad_pid(struct ctl_table *table, int write, | 204 | static int proc_do_cad_pid(struct ctl_table *table, int write, |
| 184 | void __user *buffer, size_t *lenp, loff_t *ppos); | 205 | void __user *buffer, size_t *lenp, loff_t *ppos); |
| @@ -880,6 +901,14 @@ static struct ctl_table kern_table[] = { | |||
| 880 | #endif | 901 | #endif |
| 881 | }, | 902 | }, |
| 882 | { | 903 | { |
| 904 | .procname = "watchdog_cpumask", | ||
| 905 | .data = &watchdog_cpumask_bits, | ||
| 906 | .maxlen = NR_CPUS, | ||
| 907 | .mode = 0644, | ||
| 908 | .proc_handler = proc_watchdog_cpumask, | ||
| 909 | }, | ||
| 910 | #ifdef CONFIG_SOFTLOCKUP_DETECTOR | ||
| 911 | { | ||
| 883 | .procname = "soft_watchdog", | 912 | .procname = "soft_watchdog", |
| 884 | .data = &soft_watchdog_enabled, | 913 | .data = &soft_watchdog_enabled, |
| 885 | .maxlen = sizeof (int), | 914 | .maxlen = sizeof (int), |
| @@ -889,13 +918,6 @@ static struct ctl_table kern_table[] = { | |||
| 889 | .extra2 = &one, | 918 | .extra2 = &one, |
| 890 | }, | 919 | }, |
| 891 | { | 920 | { |
| 892 | .procname = "watchdog_cpumask", | ||
| 893 | .data = &watchdog_cpumask_bits, | ||
| 894 | .maxlen = NR_CPUS, | ||
| 895 | .mode = 0644, | ||
| 896 | .proc_handler = proc_watchdog_cpumask, | ||
| 897 | }, | ||
| 898 | { | ||
| 899 | .procname = "softlockup_panic", | 921 | .procname = "softlockup_panic", |
| 900 | .data = &softlockup_panic, | 922 | .data = &softlockup_panic, |
| 901 | .maxlen = sizeof(int), | 923 | .maxlen = sizeof(int), |
| @@ -904,27 +926,29 @@ static struct ctl_table kern_table[] = { | |||
| 904 | .extra1 = &zero, | 926 | .extra1 = &zero, |
| 905 | .extra2 = &one, | 927 | .extra2 = &one, |
| 906 | }, | 928 | }, |
| 907 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 929 | #ifdef CONFIG_SMP |
| 908 | { | 930 | { |
| 909 | .procname = "hardlockup_panic", | 931 | .procname = "softlockup_all_cpu_backtrace", |
| 910 | .data = &hardlockup_panic, | 932 | .data = &sysctl_softlockup_all_cpu_backtrace, |
| 911 | .maxlen = sizeof(int), | 933 | .maxlen = sizeof(int), |
| 912 | .mode = 0644, | 934 | .mode = 0644, |
| 913 | .proc_handler = proc_dointvec_minmax, | 935 | .proc_handler = proc_dointvec_minmax, |
| 914 | .extra1 = &zero, | 936 | .extra1 = &zero, |
| 915 | .extra2 = &one, | 937 | .extra2 = &one, |
| 916 | }, | 938 | }, |
| 939 | #endif /* CONFIG_SMP */ | ||
| 917 | #endif | 940 | #endif |
| 918 | #ifdef CONFIG_SMP | 941 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
| 919 | { | 942 | { |
| 920 | .procname = "softlockup_all_cpu_backtrace", | 943 | .procname = "hardlockup_panic", |
| 921 | .data = &sysctl_softlockup_all_cpu_backtrace, | 944 | .data = &hardlockup_panic, |
| 922 | .maxlen = sizeof(int), | 945 | .maxlen = sizeof(int), |
| 923 | .mode = 0644, | 946 | .mode = 0644, |
| 924 | .proc_handler = proc_dointvec_minmax, | 947 | .proc_handler = proc_dointvec_minmax, |
| 925 | .extra1 = &zero, | 948 | .extra1 = &zero, |
| 926 | .extra2 = &one, | 949 | .extra2 = &one, |
| 927 | }, | 950 | }, |
| 951 | #ifdef CONFIG_SMP | ||
| 928 | { | 952 | { |
| 929 | .procname = "hardlockup_all_cpu_backtrace", | 953 | .procname = "hardlockup_all_cpu_backtrace", |
| 930 | .data = &sysctl_hardlockup_all_cpu_backtrace, | 954 | .data = &sysctl_hardlockup_all_cpu_backtrace, |
| @@ -936,6 +960,8 @@ static struct ctl_table kern_table[] = { | |||
| 936 | }, | 960 | }, |
| 937 | #endif /* CONFIG_SMP */ | 961 | #endif /* CONFIG_SMP */ |
| 938 | #endif | 962 | #endif |
| 963 | #endif | ||
| 964 | |||
| 939 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 965 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
| 940 | { | 966 | { |
| 941 | .procname = "unknown_nmi_panic", | 967 | .procname = "unknown_nmi_panic", |
| @@ -1950,6 +1976,32 @@ static void warn_sysctl_write(struct ctl_table *table) | |||
| 1950 | } | 1976 | } |
| 1951 | 1977 | ||
| 1952 | /** | 1978 | /** |
| 1979 | * proc_first_pos_non_zero_ignore - check if firs position is allowed | ||
| 1980 | * @ppos: file position | ||
| 1981 | * @table: the sysctl table | ||
| 1982 | * | ||
| 1983 | * Returns true if the first position is non-zero and the sysctl_writes_strict | ||
| 1984 | * mode indicates this is not allowed for numeric input types. String proc | ||
| 1985 | * hadlers can ignore the return value. | ||
| 1986 | */ | ||
| 1987 | static bool proc_first_pos_non_zero_ignore(loff_t *ppos, | ||
| 1988 | struct ctl_table *table) | ||
| 1989 | { | ||
| 1990 | if (!*ppos) | ||
| 1991 | return false; | ||
| 1992 | |||
| 1993 | switch (sysctl_writes_strict) { | ||
| 1994 | case SYSCTL_WRITES_STRICT: | ||
| 1995 | return true; | ||
| 1996 | case SYSCTL_WRITES_WARN: | ||
| 1997 | warn_sysctl_write(table); | ||
| 1998 | return false; | ||
| 1999 | default: | ||
| 2000 | return false; | ||
| 2001 | } | ||
| 2002 | } | ||
| 2003 | |||
| 2004 | /** | ||
| 1953 | * proc_dostring - read a string sysctl | 2005 | * proc_dostring - read a string sysctl |
| 1954 | * @table: the sysctl table | 2006 | * @table: the sysctl table |
| 1955 | * @write: %TRUE if this is a write to the sysctl file | 2007 | * @write: %TRUE if this is a write to the sysctl file |
| @@ -1969,8 +2021,8 @@ static void warn_sysctl_write(struct ctl_table *table) | |||
| 1969 | int proc_dostring(struct ctl_table *table, int write, | 2021 | int proc_dostring(struct ctl_table *table, int write, |
| 1970 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2022 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 1971 | { | 2023 | { |
| 1972 | if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) | 2024 | if (write) |
| 1973 | warn_sysctl_write(table); | 2025 | proc_first_pos_non_zero_ignore(ppos, table); |
| 1974 | 2026 | ||
| 1975 | return _proc_do_string((char *)(table->data), table->maxlen, write, | 2027 | return _proc_do_string((char *)(table->data), table->maxlen, write, |
| 1976 | (char __user *)buffer, lenp, ppos); | 2028 | (char __user *)buffer, lenp, ppos); |
| @@ -2128,19 +2180,18 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, | |||
| 2128 | return 0; | 2180 | return 0; |
| 2129 | } | 2181 | } |
| 2130 | 2182 | ||
| 2131 | static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, | 2183 | static int do_proc_douintvec_conv(unsigned long *lvalp, |
| 2132 | int *valp, | 2184 | unsigned int *valp, |
| 2133 | int write, void *data) | 2185 | int write, void *data) |
| 2134 | { | 2186 | { |
| 2135 | if (write) { | 2187 | if (write) { |
| 2136 | if (*negp) | 2188 | if (*lvalp > UINT_MAX) |
| 2137 | return -EINVAL; | 2189 | return -EINVAL; |
| 2138 | if (*lvalp > UINT_MAX) | 2190 | if (*lvalp > UINT_MAX) |
| 2139 | return -EINVAL; | 2191 | return -EINVAL; |
| 2140 | *valp = *lvalp; | 2192 | *valp = *lvalp; |
| 2141 | } else { | 2193 | } else { |
| 2142 | unsigned int val = *valp; | 2194 | unsigned int val = *valp; |
| 2143 | *negp = false; | ||
| 2144 | *lvalp = (unsigned long)val; | 2195 | *lvalp = (unsigned long)val; |
| 2145 | } | 2196 | } |
| 2146 | return 0; | 2197 | return 0; |
| @@ -2172,17 +2223,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, | |||
| 2172 | conv = do_proc_dointvec_conv; | 2223 | conv = do_proc_dointvec_conv; |
| 2173 | 2224 | ||
| 2174 | if (write) { | 2225 | if (write) { |
| 2175 | if (*ppos) { | 2226 | if (proc_first_pos_non_zero_ignore(ppos, table)) |
| 2176 | switch (sysctl_writes_strict) { | 2227 | goto out; |
| 2177 | case SYSCTL_WRITES_STRICT: | ||
| 2178 | goto out; | ||
| 2179 | case SYSCTL_WRITES_WARN: | ||
| 2180 | warn_sysctl_write(table); | ||
| 2181 | break; | ||
| 2182 | default: | ||
| 2183 | break; | ||
| 2184 | } | ||
| 2185 | } | ||
| 2186 | 2228 | ||
| 2187 | if (left > PAGE_SIZE - 1) | 2229 | if (left > PAGE_SIZE - 1) |
| 2188 | left = PAGE_SIZE - 1; | 2230 | left = PAGE_SIZE - 1; |
| @@ -2249,6 +2291,146 @@ static int do_proc_dointvec(struct ctl_table *table, int write, | |||
| 2249 | buffer, lenp, ppos, conv, data); | 2291 | buffer, lenp, ppos, conv, data); |
| 2250 | } | 2292 | } |
| 2251 | 2293 | ||
| 2294 | static int do_proc_douintvec_w(unsigned int *tbl_data, | ||
| 2295 | struct ctl_table *table, | ||
| 2296 | void __user *buffer, | ||
| 2297 | size_t *lenp, loff_t *ppos, | ||
| 2298 | int (*conv)(unsigned long *lvalp, | ||
| 2299 | unsigned int *valp, | ||
| 2300 | int write, void *data), | ||
| 2301 | void *data) | ||
| 2302 | { | ||
| 2303 | unsigned long lval; | ||
| 2304 | int err = 0; | ||
| 2305 | size_t left; | ||
| 2306 | bool neg; | ||
| 2307 | char *kbuf = NULL, *p; | ||
| 2308 | |||
| 2309 | left = *lenp; | ||
| 2310 | |||
| 2311 | if (proc_first_pos_non_zero_ignore(ppos, table)) | ||
| 2312 | goto bail_early; | ||
| 2313 | |||
| 2314 | if (left > PAGE_SIZE - 1) | ||
| 2315 | left = PAGE_SIZE - 1; | ||
| 2316 | |||
| 2317 | p = kbuf = memdup_user_nul(buffer, left); | ||
| 2318 | if (IS_ERR(kbuf)) | ||
| 2319 | return -EINVAL; | ||
| 2320 | |||
| 2321 | left -= proc_skip_spaces(&p); | ||
| 2322 | if (!left) { | ||
| 2323 | err = -EINVAL; | ||
| 2324 | goto out_free; | ||
| 2325 | } | ||
| 2326 | |||
| 2327 | err = proc_get_long(&p, &left, &lval, &neg, | ||
| 2328 | proc_wspace_sep, | ||
| 2329 | sizeof(proc_wspace_sep), NULL); | ||
| 2330 | if (err || neg) { | ||
| 2331 | err = -EINVAL; | ||
| 2332 | goto out_free; | ||
| 2333 | } | ||
| 2334 | |||
| 2335 | if (conv(&lval, tbl_data, 1, data)) { | ||
| 2336 | err = -EINVAL; | ||
| 2337 | goto out_free; | ||
| 2338 | } | ||
| 2339 | |||
| 2340 | if (!err && left) | ||
| 2341 | left -= proc_skip_spaces(&p); | ||
| 2342 | |||
| 2343 | out_free: | ||
| 2344 | kfree(kbuf); | ||
| 2345 | if (err) | ||
| 2346 | return -EINVAL; | ||
| 2347 | |||
| 2348 | return 0; | ||
| 2349 | |||
| 2350 | /* This is in keeping with old __do_proc_dointvec() */ | ||
| 2351 | bail_early: | ||
| 2352 | *ppos += *lenp; | ||
| 2353 | return err; | ||
| 2354 | } | ||
| 2355 | |||
| 2356 | static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, | ||
| 2357 | size_t *lenp, loff_t *ppos, | ||
| 2358 | int (*conv)(unsigned long *lvalp, | ||
| 2359 | unsigned int *valp, | ||
| 2360 | int write, void *data), | ||
| 2361 | void *data) | ||
| 2362 | { | ||
| 2363 | unsigned long lval; | ||
| 2364 | int err = 0; | ||
| 2365 | size_t left; | ||
| 2366 | |||
| 2367 | left = *lenp; | ||
| 2368 | |||
| 2369 | if (conv(&lval, tbl_data, 0, data)) { | ||
| 2370 | err = -EINVAL; | ||
| 2371 | goto out; | ||
| 2372 | } | ||
| 2373 | |||
| 2374 | err = proc_put_long(&buffer, &left, lval, false); | ||
| 2375 | if (err || !left) | ||
| 2376 | goto out; | ||
| 2377 | |||
| 2378 | err = proc_put_char(&buffer, &left, '\n'); | ||
| 2379 | |||
| 2380 | out: | ||
| 2381 | *lenp -= left; | ||
| 2382 | *ppos += *lenp; | ||
| 2383 | |||
| 2384 | return err; | ||
| 2385 | } | ||
| 2386 | |||
| 2387 | static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, | ||
| 2388 | int write, void __user *buffer, | ||
| 2389 | size_t *lenp, loff_t *ppos, | ||
| 2390 | int (*conv)(unsigned long *lvalp, | ||
| 2391 | unsigned int *valp, | ||
| 2392 | int write, void *data), | ||
| 2393 | void *data) | ||
| 2394 | { | ||
| 2395 | unsigned int *i, vleft; | ||
| 2396 | |||
| 2397 | if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { | ||
| 2398 | *lenp = 0; | ||
| 2399 | return 0; | ||
| 2400 | } | ||
| 2401 | |||
| 2402 | i = (unsigned int *) tbl_data; | ||
| 2403 | vleft = table->maxlen / sizeof(*i); | ||
| 2404 | |||
| 2405 | /* | ||
| 2406 | * Arrays are not supported, keep this simple. *Do not* add | ||
| 2407 | * support for them. | ||
| 2408 | */ | ||
| 2409 | if (vleft != 1) { | ||
| 2410 | *lenp = 0; | ||
| 2411 | return -EINVAL; | ||
| 2412 | } | ||
| 2413 | |||
| 2414 | if (!conv) | ||
| 2415 | conv = do_proc_douintvec_conv; | ||
| 2416 | |||
| 2417 | if (write) | ||
| 2418 | return do_proc_douintvec_w(i, table, buffer, lenp, ppos, | ||
| 2419 | conv, data); | ||
| 2420 | return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); | ||
| 2421 | } | ||
| 2422 | |||
| 2423 | static int do_proc_douintvec(struct ctl_table *table, int write, | ||
| 2424 | void __user *buffer, size_t *lenp, loff_t *ppos, | ||
| 2425 | int (*conv)(unsigned long *lvalp, | ||
| 2426 | unsigned int *valp, | ||
| 2427 | int write, void *data), | ||
| 2428 | void *data) | ||
| 2429 | { | ||
| 2430 | return __do_proc_douintvec(table->data, table, write, | ||
| 2431 | buffer, lenp, ppos, conv, data); | ||
| 2432 | } | ||
| 2433 | |||
| 2252 | /** | 2434 | /** |
| 2253 | * proc_dointvec - read a vector of integers | 2435 | * proc_dointvec - read a vector of integers |
| 2254 | * @table: the sysctl table | 2436 | * @table: the sysctl table |
| @@ -2284,8 +2466,8 @@ int proc_dointvec(struct ctl_table *table, int write, | |||
| 2284 | int proc_douintvec(struct ctl_table *table, int write, | 2466 | int proc_douintvec(struct ctl_table *table, int write, |
| 2285 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2467 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 2286 | { | 2468 | { |
| 2287 | return do_proc_dointvec(table, write, buffer, lenp, ppos, | 2469 | return do_proc_douintvec(table, write, buffer, lenp, ppos, |
| 2288 | do_proc_douintvec_conv, NULL); | 2470 | do_proc_douintvec_conv, NULL); |
| 2289 | } | 2471 | } |
| 2290 | 2472 | ||
| 2291 | /* | 2473 | /* |
| @@ -2390,6 +2572,65 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, | |||
| 2390 | do_proc_dointvec_minmax_conv, ¶m); | 2572 | do_proc_dointvec_minmax_conv, ¶m); |
| 2391 | } | 2573 | } |
| 2392 | 2574 | ||
| 2575 | struct do_proc_douintvec_minmax_conv_param { | ||
| 2576 | unsigned int *min; | ||
| 2577 | unsigned int *max; | ||
| 2578 | }; | ||
| 2579 | |||
| 2580 | static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, | ||
| 2581 | unsigned int *valp, | ||
| 2582 | int write, void *data) | ||
| 2583 | { | ||
| 2584 | struct do_proc_douintvec_minmax_conv_param *param = data; | ||
| 2585 | |||
| 2586 | if (write) { | ||
| 2587 | unsigned int val = *lvalp; | ||
| 2588 | |||
| 2589 | if ((param->min && *param->min > val) || | ||
| 2590 | (param->max && *param->max < val)) | ||
| 2591 | return -ERANGE; | ||
| 2592 | |||
| 2593 | if (*lvalp > UINT_MAX) | ||
| 2594 | return -EINVAL; | ||
| 2595 | *valp = val; | ||
| 2596 | } else { | ||
| 2597 | unsigned int val = *valp; | ||
| 2598 | *lvalp = (unsigned long) val; | ||
| 2599 | } | ||
| 2600 | |||
| 2601 | return 0; | ||
| 2602 | } | ||
| 2603 | |||
| 2604 | /** | ||
| 2605 | * proc_douintvec_minmax - read a vector of unsigned ints with min/max values | ||
| 2606 | * @table: the sysctl table | ||
| 2607 | * @write: %TRUE if this is a write to the sysctl file | ||
| 2608 | * @buffer: the user buffer | ||
| 2609 | * @lenp: the size of the user buffer | ||
| 2610 | * @ppos: file position | ||
| 2611 | * | ||
| 2612 | * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer | ||
| 2613 | * values from/to the user buffer, treated as an ASCII string. Negative | ||
| 2614 | * strings are not allowed. | ||
| 2615 | * | ||
| 2616 | * This routine will ensure the values are within the range specified by | ||
| 2617 | * table->extra1 (min) and table->extra2 (max). There is a final sanity | ||
| 2618 | * check for UINT_MAX to avoid having to support wrap around uses from | ||
| 2619 | * userspace. | ||
| 2620 | * | ||
| 2621 | * Returns 0 on success. | ||
| 2622 | */ | ||
| 2623 | int proc_douintvec_minmax(struct ctl_table *table, int write, | ||
| 2624 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2625 | { | ||
| 2626 | struct do_proc_douintvec_minmax_conv_param param = { | ||
| 2627 | .min = (unsigned int *) table->extra1, | ||
| 2628 | .max = (unsigned int *) table->extra2, | ||
| 2629 | }; | ||
| 2630 | return do_proc_douintvec(table, write, buffer, lenp, ppos, | ||
| 2631 | do_proc_douintvec_minmax_conv, ¶m); | ||
| 2632 | } | ||
| 2633 | |||
| 2393 | static void validate_coredump_safety(void) | 2634 | static void validate_coredump_safety(void) |
| 2394 | { | 2635 | { |
| 2395 | #ifdef CONFIG_COREDUMP | 2636 | #ifdef CONFIG_COREDUMP |
| @@ -2447,17 +2688,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int | |||
| 2447 | left = *lenp; | 2688 | left = *lenp; |
| 2448 | 2689 | ||
| 2449 | if (write) { | 2690 | if (write) { |
| 2450 | if (*ppos) { | 2691 | if (proc_first_pos_non_zero_ignore(ppos, table)) |
| 2451 | switch (sysctl_writes_strict) { | 2692 | goto out; |
| 2452 | case SYSCTL_WRITES_STRICT: | ||
| 2453 | goto out; | ||
| 2454 | case SYSCTL_WRITES_WARN: | ||
| 2455 | warn_sysctl_write(table); | ||
| 2456 | break; | ||
| 2457 | default: | ||
| 2458 | break; | ||
| 2459 | } | ||
| 2460 | } | ||
| 2461 | 2693 | ||
| 2462 | if (left > PAGE_SIZE - 1) | 2694 | if (left > PAGE_SIZE - 1) |
| 2463 | left = PAGE_SIZE - 1; | 2695 | left = PAGE_SIZE - 1; |
| @@ -2898,6 +3130,12 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, | |||
| 2898 | return -ENOSYS; | 3130 | return -ENOSYS; |
| 2899 | } | 3131 | } |
| 2900 | 3132 | ||
| 3133 | int proc_douintvec_minmax(struct ctl_table *table, int write, | ||
| 3134 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 3135 | { | ||
| 3136 | return -ENOSYS; | ||
| 3137 | } | ||
| 3138 | |||
| 2901 | int proc_dointvec_jiffies(struct ctl_table *table, int write, | 3139 | int proc_dointvec_jiffies(struct ctl_table *table, int write, |
| 2902 | void __user *buffer, size_t *lenp, loff_t *ppos) | 3140 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 2903 | { | 3141 | { |
| @@ -2940,6 +3178,7 @@ EXPORT_SYMBOL(proc_dointvec); | |||
| 2940 | EXPORT_SYMBOL(proc_douintvec); | 3178 | EXPORT_SYMBOL(proc_douintvec); |
| 2941 | EXPORT_SYMBOL(proc_dointvec_jiffies); | 3179 | EXPORT_SYMBOL(proc_dointvec_jiffies); |
| 2942 | EXPORT_SYMBOL(proc_dointvec_minmax); | 3180 | EXPORT_SYMBOL(proc_dointvec_minmax); |
| 3181 | EXPORT_SYMBOL_GPL(proc_douintvec_minmax); | ||
| 2943 | EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); | 3182 | EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); |
| 2944 | EXPORT_SYMBOL(proc_dointvec_ms_jiffies); | 3183 | EXPORT_SYMBOL(proc_dointvec_ms_jiffies); |
| 2945 | EXPORT_SYMBOL(proc_dostring); | 3184 | EXPORT_SYMBOL(proc_dostring); |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index ece4b177052b..02e1859f2ca8 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
| @@ -1119,7 +1119,7 @@ static ssize_t bin_uuid(struct file *file, | |||
| 1119 | /* Only supports reads */ | 1119 | /* Only supports reads */ |
| 1120 | if (oldval && oldlen) { | 1120 | if (oldval && oldlen) { |
| 1121 | char buf[UUID_STRING_LEN + 1]; | 1121 | char buf[UUID_STRING_LEN + 1]; |
| 1122 | uuid_be uuid; | 1122 | uuid_t uuid; |
| 1123 | 1123 | ||
| 1124 | result = kernel_read(file, 0, buf, sizeof(buf) - 1); | 1124 | result = kernel_read(file, 0, buf, sizeof(buf) - 1); |
| 1125 | if (result < 0) | 1125 | if (result < 0) |
| @@ -1128,7 +1128,7 @@ static ssize_t bin_uuid(struct file *file, | |||
| 1128 | buf[result] = '\0'; | 1128 | buf[result] = '\0'; |
| 1129 | 1129 | ||
| 1130 | result = -EIO; | 1130 | result = -EIO; |
| 1131 | if (uuid_be_to_bin(buf, &uuid)) | 1131 | if (uuid_parse(buf, &uuid)) |
| 1132 | goto out; | 1132 | goto out; |
| 1133 | 1133 | ||
| 1134 | if (oldlen > 16) | 1134 | if (oldlen > 16) |
| @@ -1346,7 +1346,7 @@ static void deprecated_sysctl_warning(const int *name, int nlen) | |||
| 1346 | * CTL_KERN/KERN_VERSION is used by older glibc and cannot | 1346 | * CTL_KERN/KERN_VERSION is used by older glibc and cannot |
| 1347 | * ever go away. | 1347 | * ever go away. |
| 1348 | */ | 1348 | */ |
| 1349 | if (name[0] == CTL_KERN && name[1] == KERN_VERSION) | 1349 | if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION) |
| 1350 | return; | 1350 | return; |
| 1351 | 1351 | ||
| 1352 | if (printk_ratelimit()) { | 1352 | if (printk_ratelimit()) { |
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 4008d9f95dd7..ac09bc29eb08 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
| @@ -126,56 +126,6 @@ config NO_HZ_FULL_ALL | |||
| 126 | Note the boot CPU will still be kept outside the range to | 126 | Note the boot CPU will still be kept outside the range to |
| 127 | handle the timekeeping duty. | 127 | handle the timekeeping duty. |
| 128 | 128 | ||
| 129 | config NO_HZ_FULL_SYSIDLE | ||
| 130 | bool "Detect full-system idle state for full dynticks system" | ||
| 131 | depends on NO_HZ_FULL | ||
| 132 | default n | ||
| 133 | help | ||
| 134 | At least one CPU must keep the scheduling-clock tick running for | ||
| 135 | timekeeping purposes whenever there is a non-idle CPU, where | ||
| 136 | "non-idle" also includes dynticks CPUs as long as they are | ||
| 137 | running non-idle tasks. Because the underlying adaptive-tick | ||
| 138 | support cannot distinguish between all CPUs being idle and | ||
| 139 | all CPUs each running a single task in dynticks mode, the | ||
| 140 | underlying support simply ensures that there is always a CPU | ||
| 141 | handling the scheduling-clock tick, whether or not all CPUs | ||
| 142 | are idle. This Kconfig option enables scalable detection of | ||
| 143 | the all-CPUs-idle state, thus allowing the scheduling-clock | ||
| 144 | tick to be disabled when all CPUs are idle. Note that scalable | ||
| 145 | detection of the all-CPUs-idle state means that larger systems | ||
| 146 | will be slower to declare the all-CPUs-idle state. | ||
| 147 | |||
| 148 | Say Y if you would like to help debug all-CPUs-idle detection. | ||
| 149 | |||
| 150 | Say N if you are unsure. | ||
| 151 | |||
| 152 | config NO_HZ_FULL_SYSIDLE_SMALL | ||
| 153 | int "Number of CPUs above which large-system approach is used" | ||
| 154 | depends on NO_HZ_FULL_SYSIDLE | ||
| 155 | range 1 NR_CPUS | ||
| 156 | default 8 | ||
| 157 | help | ||
| 158 | The full-system idle detection mechanism takes a lazy approach | ||
| 159 | on large systems, as is required to attain decent scalability. | ||
| 160 | However, on smaller systems, scalability is not anywhere near as | ||
| 161 | large a concern as is energy efficiency. The sysidle subsystem | ||
| 162 | therefore uses a fast but non-scalable algorithm for small | ||
| 163 | systems and a lazier but scalable algorithm for large systems. | ||
| 164 | This Kconfig parameter defines the number of CPUs in the largest | ||
| 165 | system that will be considered to be "small". | ||
| 166 | |||
| 167 | The default value will be fine in most cases. Battery-powered | ||
| 168 | systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger | ||
| 169 | numbers of CPUs, and (3) are suffering from battery-lifetime | ||
| 170 | problems due to long sysidle latencies might wish to experiment | ||
| 171 | with larger values for this Kconfig parameter. On the other | ||
| 172 | hand, they might be even better served by disabling NO_HZ_FULL | ||
| 173 | entirely, given that NO_HZ_FULL is intended for HPC and | ||
| 174 | real-time workloads that at present do not tend to be run on | ||
| 175 | battery-powered systems. | ||
| 176 | |||
| 177 | Take the default if you are unsure. | ||
| 178 | |||
| 179 | config NO_HZ | 129 | config NO_HZ |
| 180 | bool "Old Idle dynticks config" | 130 | bool "Old Idle dynticks config" |
| 181 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS | 131 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 5cb5b0008d97..0b8ff7d257ea 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
| @@ -27,6 +27,9 @@ | |||
| 27 | #include <linux/posix-timers.h> | 27 | #include <linux/posix-timers.h> |
| 28 | #include <linux/workqueue.h> | 28 | #include <linux/workqueue.h> |
| 29 | #include <linux/freezer.h> | 29 | #include <linux/freezer.h> |
| 30 | #include <linux/compat.h> | ||
| 31 | |||
| 32 | #include "posix-timers.h" | ||
| 30 | 33 | ||
| 31 | #define CREATE_TRACE_POINTS | 34 | #define CREATE_TRACE_POINTS |
| 32 | #include <trace/events/alarmtimer.h> | 35 | #include <trace/events/alarmtimer.h> |
| @@ -45,11 +48,13 @@ static struct alarm_base { | |||
| 45 | clockid_t base_clockid; | 48 | clockid_t base_clockid; |
| 46 | } alarm_bases[ALARM_NUMTYPE]; | 49 | } alarm_bases[ALARM_NUMTYPE]; |
| 47 | 50 | ||
| 51 | #if defined(CONFIG_POSIX_TIMERS) || defined(CONFIG_RTC_CLASS) | ||
| 48 | /* freezer information to handle clock_nanosleep triggered wakeups */ | 52 | /* freezer information to handle clock_nanosleep triggered wakeups */ |
| 49 | static enum alarmtimer_type freezer_alarmtype; | 53 | static enum alarmtimer_type freezer_alarmtype; |
| 50 | static ktime_t freezer_expires; | 54 | static ktime_t freezer_expires; |
| 51 | static ktime_t freezer_delta; | 55 | static ktime_t freezer_delta; |
| 52 | static DEFINE_SPINLOCK(freezer_delta_lock); | 56 | static DEFINE_SPINLOCK(freezer_delta_lock); |
| 57 | #endif | ||
| 53 | 58 | ||
| 54 | static struct wakeup_source *ws; | 59 | static struct wakeup_source *ws; |
| 55 | 60 | ||
| @@ -307,38 +312,6 @@ static int alarmtimer_resume(struct device *dev) | |||
| 307 | } | 312 | } |
| 308 | #endif | 313 | #endif |
| 309 | 314 | ||
| 310 | static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) | ||
| 311 | { | ||
| 312 | struct alarm_base *base; | ||
| 313 | unsigned long flags; | ||
| 314 | ktime_t delta; | ||
| 315 | |||
| 316 | switch(type) { | ||
| 317 | case ALARM_REALTIME: | ||
| 318 | base = &alarm_bases[ALARM_REALTIME]; | ||
| 319 | type = ALARM_REALTIME_FREEZER; | ||
| 320 | break; | ||
| 321 | case ALARM_BOOTTIME: | ||
| 322 | base = &alarm_bases[ALARM_BOOTTIME]; | ||
| 323 | type = ALARM_BOOTTIME_FREEZER; | ||
| 324 | break; | ||
| 325 | default: | ||
| 326 | WARN_ONCE(1, "Invalid alarm type: %d\n", type); | ||
| 327 | return; | ||
| 328 | } | ||
| 329 | |||
| 330 | delta = ktime_sub(absexp, base->gettime()); | ||
| 331 | |||
| 332 | spin_lock_irqsave(&freezer_delta_lock, flags); | ||
| 333 | if (!freezer_delta || (delta < freezer_delta)) { | ||
| 334 | freezer_delta = delta; | ||
| 335 | freezer_expires = absexp; | ||
| 336 | freezer_alarmtype = type; | ||
| 337 | } | ||
| 338 | spin_unlock_irqrestore(&freezer_delta_lock, flags); | ||
| 339 | } | ||
| 340 | |||
| 341 | |||
| 342 | /** | 315 | /** |
| 343 | * alarm_init - Initialize an alarm structure | 316 | * alarm_init - Initialize an alarm structure |
| 344 | * @alarm: ptr to alarm to be initialized | 317 | * @alarm: ptr to alarm to be initialized |
| @@ -387,7 +360,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start) | |||
| 387 | { | 360 | { |
| 388 | struct alarm_base *base = &alarm_bases[alarm->type]; | 361 | struct alarm_base *base = &alarm_bases[alarm->type]; |
| 389 | 362 | ||
| 390 | start = ktime_add(start, base->gettime()); | 363 | start = ktime_add_safe(start, base->gettime()); |
| 391 | alarm_start(alarm, start); | 364 | alarm_start(alarm, start); |
| 392 | } | 365 | } |
| 393 | EXPORT_SYMBOL_GPL(alarm_start_relative); | 366 | EXPORT_SYMBOL_GPL(alarm_start_relative); |
| @@ -475,7 +448,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) | |||
| 475 | overrun++; | 448 | overrun++; |
| 476 | } | 449 | } |
| 477 | 450 | ||
| 478 | alarm->node.expires = ktime_add(alarm->node.expires, interval); | 451 | alarm->node.expires = ktime_add_safe(alarm->node.expires, interval); |
| 479 | return overrun; | 452 | return overrun; |
| 480 | } | 453 | } |
| 481 | EXPORT_SYMBOL_GPL(alarm_forward); | 454 | EXPORT_SYMBOL_GPL(alarm_forward); |
| @@ -488,6 +461,38 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) | |||
| 488 | } | 461 | } |
| 489 | EXPORT_SYMBOL_GPL(alarm_forward_now); | 462 | EXPORT_SYMBOL_GPL(alarm_forward_now); |
| 490 | 463 | ||
| 464 | #ifdef CONFIG_POSIX_TIMERS | ||
| 465 | |||
| 466 | static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) | ||
| 467 | { | ||
| 468 | struct alarm_base *base; | ||
| 469 | unsigned long flags; | ||
| 470 | ktime_t delta; | ||
| 471 | |||
| 472 | switch(type) { | ||
| 473 | case ALARM_REALTIME: | ||
| 474 | base = &alarm_bases[ALARM_REALTIME]; | ||
| 475 | type = ALARM_REALTIME_FREEZER; | ||
| 476 | break; | ||
| 477 | case ALARM_BOOTTIME: | ||
| 478 | base = &alarm_bases[ALARM_BOOTTIME]; | ||
| 479 | type = ALARM_BOOTTIME_FREEZER; | ||
| 480 | break; | ||
| 481 | default: | ||
| 482 | WARN_ONCE(1, "Invalid alarm type: %d\n", type); | ||
| 483 | return; | ||
| 484 | } | ||
| 485 | |||
| 486 | delta = ktime_sub(absexp, base->gettime()); | ||
| 487 | |||
| 488 | spin_lock_irqsave(&freezer_delta_lock, flags); | ||
| 489 | if (!freezer_delta || (delta < freezer_delta)) { | ||
| 490 | freezer_delta = delta; | ||
| 491 | freezer_expires = absexp; | ||
| 492 | freezer_alarmtype = type; | ||
| 493 | } | ||
| 494 | spin_unlock_irqrestore(&freezer_delta_lock, flags); | ||
| 495 | } | ||
| 491 | 496 | ||
| 492 | /** | 497 | /** |
| 493 | * clock2alarm - helper that converts from clockid to alarmtypes | 498 | * clock2alarm - helper that converts from clockid to alarmtypes |
| @@ -511,22 +516,26 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) | |||
| 511 | static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, | 516 | static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, |
| 512 | ktime_t now) | 517 | ktime_t now) |
| 513 | { | 518 | { |
| 514 | unsigned long flags; | ||
| 515 | struct k_itimer *ptr = container_of(alarm, struct k_itimer, | 519 | struct k_itimer *ptr = container_of(alarm, struct k_itimer, |
| 516 | it.alarm.alarmtimer); | 520 | it.alarm.alarmtimer); |
| 517 | enum alarmtimer_restart result = ALARMTIMER_NORESTART; | 521 | enum alarmtimer_restart result = ALARMTIMER_NORESTART; |
| 522 | unsigned long flags; | ||
| 523 | int si_private = 0; | ||
| 518 | 524 | ||
| 519 | spin_lock_irqsave(&ptr->it_lock, flags); | 525 | spin_lock_irqsave(&ptr->it_lock, flags); |
| 520 | if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { | ||
| 521 | if (IS_ENABLED(CONFIG_POSIX_TIMERS) && | ||
| 522 | posix_timer_event(ptr, 0) != 0) | ||
| 523 | ptr->it_overrun++; | ||
| 524 | } | ||
| 525 | 526 | ||
| 526 | /* Re-add periodic timers */ | 527 | ptr->it_active = 0; |
| 527 | if (ptr->it.alarm.interval) { | 528 | if (ptr->it_interval) |
| 528 | ptr->it_overrun += alarm_forward(alarm, now, | 529 | si_private = ++ptr->it_requeue_pending; |
| 529 | ptr->it.alarm.interval); | 530 | |
| 531 | if (posix_timer_event(ptr, si_private) && ptr->it_interval) { | ||
| 532 | /* | ||
| 533 | * Handle ignored signals and rearm the timer. This will go | ||
| 534 | * away once we handle ignored signals proper. | ||
| 535 | */ | ||
| 536 | ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval); | ||
| 537 | ++ptr->it_requeue_pending; | ||
| 538 | ptr->it_active = 1; | ||
| 530 | result = ALARMTIMER_RESTART; | 539 | result = ALARMTIMER_RESTART; |
| 531 | } | 540 | } |
| 532 | spin_unlock_irqrestore(&ptr->it_lock, flags); | 541 | spin_unlock_irqrestore(&ptr->it_lock, flags); |
| @@ -535,6 +544,72 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, | |||
| 535 | } | 544 | } |
| 536 | 545 | ||
| 537 | /** | 546 | /** |
| 547 | * alarm_timer_rearm - Posix timer callback for rearming timer | ||
| 548 | * @timr: Pointer to the posixtimer data struct | ||
| 549 | */ | ||
| 550 | static void alarm_timer_rearm(struct k_itimer *timr) | ||
| 551 | { | ||
| 552 | struct alarm *alarm = &timr->it.alarm.alarmtimer; | ||
| 553 | |||
| 554 | timr->it_overrun += alarm_forward_now(alarm, timr->it_interval); | ||
| 555 | alarm_start(alarm, alarm->node.expires); | ||
| 556 | } | ||
| 557 | |||
| 558 | /** | ||
| 559 | * alarm_timer_forward - Posix timer callback for forwarding timer | ||
| 560 | * @timr: Pointer to the posixtimer data struct | ||
| 561 | * @now: Current time to forward the timer against | ||
| 562 | */ | ||
| 563 | static int alarm_timer_forward(struct k_itimer *timr, ktime_t now) | ||
| 564 | { | ||
| 565 | struct alarm *alarm = &timr->it.alarm.alarmtimer; | ||
| 566 | |||
| 567 | return (int) alarm_forward(alarm, timr->it_interval, now); | ||
| 568 | } | ||
| 569 | |||
| 570 | /** | ||
| 571 | * alarm_timer_remaining - Posix timer callback to retrieve remaining time | ||
| 572 | * @timr: Pointer to the posixtimer data struct | ||
| 573 | * @now: Current time to calculate against | ||
| 574 | */ | ||
| 575 | static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now) | ||
| 576 | { | ||
| 577 | struct alarm *alarm = &timr->it.alarm.alarmtimer; | ||
| 578 | |||
| 579 | return ktime_sub(now, alarm->node.expires); | ||
| 580 | } | ||
| 581 | |||
| 582 | /** | ||
| 583 | * alarm_timer_try_to_cancel - Posix timer callback to cancel a timer | ||
| 584 | * @timr: Pointer to the posixtimer data struct | ||
| 585 | */ | ||
| 586 | static int alarm_timer_try_to_cancel(struct k_itimer *timr) | ||
| 587 | { | ||
| 588 | return alarm_try_to_cancel(&timr->it.alarm.alarmtimer); | ||
| 589 | } | ||
| 590 | |||
| 591 | /** | ||
| 592 | * alarm_timer_arm - Posix timer callback to arm a timer | ||
| 593 | * @timr: Pointer to the posixtimer data struct | ||
| 594 | * @expires: The new expiry time | ||
| 595 | * @absolute: Expiry value is absolute time | ||
| 596 | * @sigev_none: Posix timer does not deliver signals | ||
| 597 | */ | ||
| 598 | static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires, | ||
| 599 | bool absolute, bool sigev_none) | ||
| 600 | { | ||
| 601 | struct alarm *alarm = &timr->it.alarm.alarmtimer; | ||
| 602 | struct alarm_base *base = &alarm_bases[alarm->type]; | ||
| 603 | |||
| 604 | if (!absolute) | ||
| 605 | expires = ktime_add_safe(expires, base->gettime()); | ||
| 606 | if (sigev_none) | ||
| 607 | alarm->node.expires = expires; | ||
| 608 | else | ||
| 609 | alarm_start(&timr->it.alarm.alarmtimer, expires); | ||
| 610 | } | ||
| 611 | |||
| 612 | /** | ||
| 538 | * alarm_clock_getres - posix getres interface | 613 | * alarm_clock_getres - posix getres interface |
| 539 | * @which_clock: clockid | 614 | * @which_clock: clockid |
| 540 | * @tp: timespec to fill | 615 | * @tp: timespec to fill |
| @@ -591,89 +666,6 @@ static int alarm_timer_create(struct k_itimer *new_timer) | |||
| 591 | } | 666 | } |
| 592 | 667 | ||
| 593 | /** | 668 | /** |
| 594 | * alarm_timer_get - posix timer_get interface | ||
| 595 | * @new_timer: k_itimer pointer | ||
| 596 | * @cur_setting: itimerspec data to fill | ||
| 597 | * | ||
| 598 | * Copies out the current itimerspec data | ||
| 599 | */ | ||
| 600 | static void alarm_timer_get(struct k_itimer *timr, | ||
| 601 | struct itimerspec64 *cur_setting) | ||
| 602 | { | ||
| 603 | ktime_t relative_expiry_time = | ||
| 604 | alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); | ||
| 605 | |||
| 606 | if (ktime_to_ns(relative_expiry_time) > 0) { | ||
| 607 | cur_setting->it_value = ktime_to_timespec64(relative_expiry_time); | ||
| 608 | } else { | ||
| 609 | cur_setting->it_value.tv_sec = 0; | ||
| 610 | cur_setting->it_value.tv_nsec = 0; | ||
| 611 | } | ||
| 612 | |||
| 613 | cur_setting->it_interval = ktime_to_timespec64(timr->it.alarm.interval); | ||
| 614 | } | ||
| 615 | |||
| 616 | /** | ||
| 617 | * alarm_timer_del - posix timer_del interface | ||
| 618 | * @timr: k_itimer pointer to be deleted | ||
| 619 | * | ||
| 620 | * Cancels any programmed alarms for the given timer. | ||
| 621 | */ | ||
| 622 | static int alarm_timer_del(struct k_itimer *timr) | ||
| 623 | { | ||
| 624 | if (!rtcdev) | ||
| 625 | return -ENOTSUPP; | ||
| 626 | |||
| 627 | if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) | ||
| 628 | return TIMER_RETRY; | ||
| 629 | |||
| 630 | return 0; | ||
| 631 | } | ||
| 632 | |||
| 633 | /** | ||
| 634 | * alarm_timer_set - posix timer_set interface | ||
| 635 | * @timr: k_itimer pointer to be deleted | ||
| 636 | * @flags: timer flags | ||
| 637 | * @new_setting: itimerspec to be used | ||
| 638 | * @old_setting: itimerspec being replaced | ||
| 639 | * | ||
| 640 | * Sets the timer to new_setting, and starts the timer. | ||
| 641 | */ | ||
| 642 | static int alarm_timer_set(struct k_itimer *timr, int flags, | ||
| 643 | struct itimerspec64 *new_setting, | ||
| 644 | struct itimerspec64 *old_setting) | ||
| 645 | { | ||
| 646 | ktime_t exp; | ||
| 647 | |||
| 648 | if (!rtcdev) | ||
| 649 | return -ENOTSUPP; | ||
| 650 | |||
| 651 | if (flags & ~TIMER_ABSTIME) | ||
| 652 | return -EINVAL; | ||
| 653 | |||
| 654 | if (old_setting) | ||
| 655 | alarm_timer_get(timr, old_setting); | ||
| 656 | |||
| 657 | /* If the timer was already set, cancel it */ | ||
| 658 | if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) | ||
| 659 | return TIMER_RETRY; | ||
| 660 | |||
| 661 | /* start the timer */ | ||
| 662 | timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval); | ||
| 663 | exp = timespec64_to_ktime(new_setting->it_value); | ||
| 664 | /* Convert (if necessary) to absolute time */ | ||
| 665 | if (flags != TIMER_ABSTIME) { | ||
| 666 | ktime_t now; | ||
| 667 | |||
| 668 | now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime(); | ||
| 669 | exp = ktime_add(now, exp); | ||
| 670 | } | ||
| 671 | |||
| 672 | alarm_start(&timr->it.alarm.alarmtimer, exp); | ||
| 673 | return 0; | ||
| 674 | } | ||
| 675 | |||
| 676 | /** | ||
| 677 | * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep | 669 | * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep |
| 678 | * @alarm: ptr to alarm that fired | 670 | * @alarm: ptr to alarm that fired |
| 679 | * | 671 | * |
| @@ -697,8 +689,10 @@ static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, | |||
| 697 | * | 689 | * |
| 698 | * Sets the alarm timer and sleeps until it is fired or interrupted. | 690 | * Sets the alarm timer and sleeps until it is fired or interrupted. |
| 699 | */ | 691 | */ |
| 700 | static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) | 692 | static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, |
| 693 | enum alarmtimer_type type) | ||
| 701 | { | 694 | { |
| 695 | struct restart_block *restart; | ||
| 702 | alarm->data = (void *)current; | 696 | alarm->data = (void *)current; |
| 703 | do { | 697 | do { |
| 704 | set_current_state(TASK_INTERRUPTIBLE); | 698 | set_current_state(TASK_INTERRUPTIBLE); |
| @@ -711,36 +705,25 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) | |||
| 711 | 705 | ||
| 712 | __set_current_state(TASK_RUNNING); | 706 | __set_current_state(TASK_RUNNING); |
| 713 | 707 | ||
| 714 | return (alarm->data == NULL); | 708 | if (!alarm->data) |
| 715 | } | ||
| 716 | |||
| 717 | |||
| 718 | /** | ||
| 719 | * update_rmtp - Update remaining timespec value | ||
| 720 | * @exp: expiration time | ||
| 721 | * @type: timer type | ||
| 722 | * @rmtp: user pointer to remaining timepsec value | ||
| 723 | * | ||
| 724 | * Helper function that fills in rmtp value with time between | ||
| 725 | * now and the exp value | ||
| 726 | */ | ||
| 727 | static int update_rmtp(ktime_t exp, enum alarmtimer_type type, | ||
| 728 | struct timespec __user *rmtp) | ||
| 729 | { | ||
| 730 | struct timespec rmt; | ||
| 731 | ktime_t rem; | ||
| 732 | |||
| 733 | rem = ktime_sub(exp, alarm_bases[type].gettime()); | ||
| 734 | |||
| 735 | if (rem <= 0) | ||
| 736 | return 0; | 709 | return 0; |
| 737 | rmt = ktime_to_timespec(rem); | ||
| 738 | 710 | ||
| 739 | if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) | 711 | if (freezing(current)) |
| 740 | return -EFAULT; | 712 | alarmtimer_freezerset(absexp, type); |
| 713 | restart = ¤t->restart_block; | ||
| 714 | if (restart->nanosleep.type != TT_NONE) { | ||
| 715 | struct timespec64 rmt; | ||
| 716 | ktime_t rem; | ||
| 717 | |||
| 718 | rem = ktime_sub(absexp, alarm_bases[type].gettime()); | ||
| 741 | 719 | ||
| 742 | return 1; | 720 | if (rem <= 0) |
| 721 | return 0; | ||
| 722 | rmt = ktime_to_timespec64(rem); | ||
| 743 | 723 | ||
| 724 | return nanosleep_copyout(restart, &rmt); | ||
| 725 | } | ||
| 726 | return -ERESTART_RESTARTBLOCK; | ||
| 744 | } | 727 | } |
| 745 | 728 | ||
| 746 | /** | 729 | /** |
| @@ -752,32 +735,12 @@ static int update_rmtp(ktime_t exp, enum alarmtimer_type type, | |||
| 752 | static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) | 735 | static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) |
| 753 | { | 736 | { |
| 754 | enum alarmtimer_type type = restart->nanosleep.clockid; | 737 | enum alarmtimer_type type = restart->nanosleep.clockid; |
| 755 | ktime_t exp; | 738 | ktime_t exp = restart->nanosleep.expires; |
| 756 | struct timespec __user *rmtp; | ||
| 757 | struct alarm alarm; | 739 | struct alarm alarm; |
| 758 | int ret = 0; | ||
| 759 | 740 | ||
| 760 | exp = restart->nanosleep.expires; | ||
| 761 | alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); | 741 | alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); |
| 762 | 742 | ||
| 763 | if (alarmtimer_do_nsleep(&alarm, exp)) | 743 | return alarmtimer_do_nsleep(&alarm, exp, type); |
| 764 | goto out; | ||
| 765 | |||
| 766 | if (freezing(current)) | ||
| 767 | alarmtimer_freezerset(exp, type); | ||
| 768 | |||
| 769 | rmtp = restart->nanosleep.rmtp; | ||
| 770 | if (rmtp) { | ||
| 771 | ret = update_rmtp(exp, type, rmtp); | ||
| 772 | if (ret <= 0) | ||
| 773 | goto out; | ||
| 774 | } | ||
| 775 | |||
| 776 | |||
| 777 | /* The other values in restart are already filled in */ | ||
| 778 | ret = -ERESTART_RESTARTBLOCK; | ||
| 779 | out: | ||
| 780 | return ret; | ||
| 781 | } | 744 | } |
| 782 | 745 | ||
| 783 | /** | 746 | /** |
| @@ -790,11 +753,10 @@ out: | |||
| 790 | * Handles clock_nanosleep calls against _ALARM clockids | 753 | * Handles clock_nanosleep calls against _ALARM clockids |
| 791 | */ | 754 | */ |
| 792 | static int alarm_timer_nsleep(const clockid_t which_clock, int flags, | 755 | static int alarm_timer_nsleep(const clockid_t which_clock, int flags, |
| 793 | struct timespec64 *tsreq, | 756 | const struct timespec64 *tsreq) |
| 794 | struct timespec __user *rmtp) | ||
| 795 | { | 757 | { |
| 796 | enum alarmtimer_type type = clock2alarm(which_clock); | 758 | enum alarmtimer_type type = clock2alarm(which_clock); |
| 797 | struct restart_block *restart; | 759 | struct restart_block *restart = ¤t->restart_block; |
| 798 | struct alarm alarm; | 760 | struct alarm alarm; |
| 799 | ktime_t exp; | 761 | ktime_t exp; |
| 800 | int ret = 0; | 762 | int ret = 0; |
| @@ -817,35 +779,36 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, | |||
| 817 | exp = ktime_add(now, exp); | 779 | exp = ktime_add(now, exp); |
| 818 | } | 780 | } |
| 819 | 781 | ||
| 820 | if (alarmtimer_do_nsleep(&alarm, exp)) | 782 | ret = alarmtimer_do_nsleep(&alarm, exp, type); |
| 821 | goto out; | 783 | if (ret != -ERESTART_RESTARTBLOCK) |
| 822 | 784 | return ret; | |
| 823 | if (freezing(current)) | ||
| 824 | alarmtimer_freezerset(exp, type); | ||
| 825 | 785 | ||
| 826 | /* abs timers don't set remaining time or restart */ | 786 | /* abs timers don't set remaining time or restart */ |
| 827 | if (flags == TIMER_ABSTIME) { | 787 | if (flags == TIMER_ABSTIME) |
| 828 | ret = -ERESTARTNOHAND; | 788 | return -ERESTARTNOHAND; |
| 829 | goto out; | ||
| 830 | } | ||
| 831 | 789 | ||
| 832 | if (rmtp) { | ||
| 833 | ret = update_rmtp(exp, type, rmtp); | ||
| 834 | if (ret <= 0) | ||
| 835 | goto out; | ||
| 836 | } | ||
| 837 | |||
| 838 | restart = ¤t->restart_block; | ||
| 839 | restart->fn = alarm_timer_nsleep_restart; | 790 | restart->fn = alarm_timer_nsleep_restart; |
| 840 | restart->nanosleep.clockid = type; | 791 | restart->nanosleep.clockid = type; |
| 841 | restart->nanosleep.expires = exp; | 792 | restart->nanosleep.expires = exp; |
| 842 | restart->nanosleep.rmtp = rmtp; | ||
| 843 | ret = -ERESTART_RESTARTBLOCK; | ||
| 844 | |||
| 845 | out: | ||
| 846 | return ret; | 793 | return ret; |
| 847 | } | 794 | } |
| 848 | 795 | ||
| 796 | const struct k_clock alarm_clock = { | ||
| 797 | .clock_getres = alarm_clock_getres, | ||
| 798 | .clock_get = alarm_clock_get, | ||
| 799 | .timer_create = alarm_timer_create, | ||
| 800 | .timer_set = common_timer_set, | ||
| 801 | .timer_del = common_timer_del, | ||
| 802 | .timer_get = common_timer_get, | ||
| 803 | .timer_arm = alarm_timer_arm, | ||
| 804 | .timer_rearm = alarm_timer_rearm, | ||
| 805 | .timer_forward = alarm_timer_forward, | ||
| 806 | .timer_remaining = alarm_timer_remaining, | ||
| 807 | .timer_try_to_cancel = alarm_timer_try_to_cancel, | ||
| 808 | .nsleep = alarm_timer_nsleep, | ||
| 809 | }; | ||
| 810 | #endif /* CONFIG_POSIX_TIMERS */ | ||
| 811 | |||
| 849 | 812 | ||
| 850 | /* Suspend hook structures */ | 813 | /* Suspend hook structures */ |
| 851 | static const struct dev_pm_ops alarmtimer_pm_ops = { | 814 | static const struct dev_pm_ops alarmtimer_pm_ops = { |
| @@ -871,23 +834,9 @@ static int __init alarmtimer_init(void) | |||
| 871 | struct platform_device *pdev; | 834 | struct platform_device *pdev; |
| 872 | int error = 0; | 835 | int error = 0; |
| 873 | int i; | 836 | int i; |
| 874 | struct k_clock alarm_clock = { | ||
| 875 | .clock_getres = alarm_clock_getres, | ||
| 876 | .clock_get = alarm_clock_get, | ||
| 877 | .timer_create = alarm_timer_create, | ||
| 878 | .timer_set = alarm_timer_set, | ||
| 879 | .timer_del = alarm_timer_del, | ||
| 880 | .timer_get = alarm_timer_get, | ||
| 881 | .nsleep = alarm_timer_nsleep, | ||
| 882 | }; | ||
| 883 | 837 | ||
| 884 | alarmtimer_rtc_timer_init(); | 838 | alarmtimer_rtc_timer_init(); |
| 885 | 839 | ||
| 886 | if (IS_ENABLED(CONFIG_POSIX_TIMERS)) { | ||
| 887 | posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); | ||
| 888 | posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); | ||
| 889 | } | ||
| 890 | |||
| 891 | /* Initialize alarm bases */ | 840 | /* Initialize alarm bases */ |
| 892 | alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; | 841 | alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; |
| 893 | alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; | 842 | alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 93621ae718d3..03918a19cf2d 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -233,6 +233,9 @@ static void clocksource_watchdog(unsigned long data) | |||
| 233 | continue; | 233 | continue; |
| 234 | } | 234 | } |
| 235 | 235 | ||
| 236 | if (cs == curr_clocksource && cs->tick_stable) | ||
| 237 | cs->tick_stable(cs); | ||
| 238 | |||
| 236 | if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && | 239 | if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && |
| 237 | (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && | 240 | (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && |
| 238 | (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { | 241 | (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { |
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index ac053bb5296e..88f75f92ef36 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
| @@ -51,6 +51,7 @@ | |||
| 51 | #include <linux/sched/debug.h> | 51 | #include <linux/sched/debug.h> |
| 52 | #include <linux/timer.h> | 52 | #include <linux/timer.h> |
| 53 | #include <linux/freezer.h> | 53 | #include <linux/freezer.h> |
| 54 | #include <linux/compat.h> | ||
| 54 | 55 | ||
| 55 | #include <linux/uaccess.h> | 56 | #include <linux/uaccess.h> |
| 56 | 57 | ||
| @@ -1439,8 +1440,29 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) | |||
| 1439 | } | 1440 | } |
| 1440 | EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); | 1441 | EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); |
| 1441 | 1442 | ||
| 1443 | int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) | ||
| 1444 | { | ||
| 1445 | switch(restart->nanosleep.type) { | ||
| 1446 | #ifdef CONFIG_COMPAT | ||
| 1447 | case TT_COMPAT: | ||
| 1448 | if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp)) | ||
| 1449 | return -EFAULT; | ||
| 1450 | break; | ||
| 1451 | #endif | ||
| 1452 | case TT_NATIVE: | ||
| 1453 | if (put_timespec64(ts, restart->nanosleep.rmtp)) | ||
| 1454 | return -EFAULT; | ||
| 1455 | break; | ||
| 1456 | default: | ||
| 1457 | BUG(); | ||
| 1458 | } | ||
| 1459 | return -ERESTART_RESTARTBLOCK; | ||
| 1460 | } | ||
| 1461 | |||
| 1442 | static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) | 1462 | static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) |
| 1443 | { | 1463 | { |
| 1464 | struct restart_block *restart; | ||
| 1465 | |||
| 1444 | hrtimer_init_sleeper(t, current); | 1466 | hrtimer_init_sleeper(t, current); |
| 1445 | 1467 | ||
| 1446 | do { | 1468 | do { |
| @@ -1457,53 +1479,38 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod | |||
| 1457 | 1479 | ||
| 1458 | __set_current_state(TASK_RUNNING); | 1480 | __set_current_state(TASK_RUNNING); |
| 1459 | 1481 | ||
| 1460 | return t->task == NULL; | 1482 | if (!t->task) |
| 1461 | } | ||
| 1462 | |||
| 1463 | static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp) | ||
| 1464 | { | ||
| 1465 | struct timespec rmt; | ||
| 1466 | ktime_t rem; | ||
| 1467 | |||
| 1468 | rem = hrtimer_expires_remaining(timer); | ||
| 1469 | if (rem <= 0) | ||
| 1470 | return 0; | 1483 | return 0; |
| 1471 | rmt = ktime_to_timespec(rem); | ||
| 1472 | 1484 | ||
| 1473 | if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) | 1485 | restart = ¤t->restart_block; |
| 1474 | return -EFAULT; | 1486 | if (restart->nanosleep.type != TT_NONE) { |
| 1487 | ktime_t rem = hrtimer_expires_remaining(&t->timer); | ||
| 1488 | struct timespec64 rmt; | ||
| 1489 | |||
| 1490 | if (rem <= 0) | ||
| 1491 | return 0; | ||
| 1492 | rmt = ktime_to_timespec64(rem); | ||
| 1475 | 1493 | ||
| 1476 | return 1; | 1494 | return nanosleep_copyout(restart, &rmt); |
| 1495 | } | ||
| 1496 | return -ERESTART_RESTARTBLOCK; | ||
| 1477 | } | 1497 | } |
| 1478 | 1498 | ||
| 1479 | long __sched hrtimer_nanosleep_restart(struct restart_block *restart) | 1499 | static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) |
| 1480 | { | 1500 | { |
| 1481 | struct hrtimer_sleeper t; | 1501 | struct hrtimer_sleeper t; |
| 1482 | struct timespec __user *rmtp; | 1502 | int ret; |
| 1483 | int ret = 0; | ||
| 1484 | 1503 | ||
| 1485 | hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, | 1504 | hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, |
| 1486 | HRTIMER_MODE_ABS); | 1505 | HRTIMER_MODE_ABS); |
| 1487 | hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); | 1506 | hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); |
| 1488 | 1507 | ||
| 1489 | if (do_nanosleep(&t, HRTIMER_MODE_ABS)) | 1508 | ret = do_nanosleep(&t, HRTIMER_MODE_ABS); |
| 1490 | goto out; | ||
| 1491 | |||
| 1492 | rmtp = restart->nanosleep.rmtp; | ||
| 1493 | if (rmtp) { | ||
| 1494 | ret = update_rmtp(&t.timer, rmtp); | ||
| 1495 | if (ret <= 0) | ||
| 1496 | goto out; | ||
| 1497 | } | ||
| 1498 | |||
| 1499 | /* The other values in restart are already filled in */ | ||
| 1500 | ret = -ERESTART_RESTARTBLOCK; | ||
| 1501 | out: | ||
| 1502 | destroy_hrtimer_on_stack(&t.timer); | 1509 | destroy_hrtimer_on_stack(&t.timer); |
| 1503 | return ret; | 1510 | return ret; |
| 1504 | } | 1511 | } |
| 1505 | 1512 | ||
| 1506 | long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp, | 1513 | long hrtimer_nanosleep(const struct timespec64 *rqtp, |
| 1507 | const enum hrtimer_mode mode, const clockid_t clockid) | 1514 | const enum hrtimer_mode mode, const clockid_t clockid) |
| 1508 | { | 1515 | { |
| 1509 | struct restart_block *restart; | 1516 | struct restart_block *restart; |
| @@ -1517,7 +1524,8 @@ long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp, | |||
| 1517 | 1524 | ||
| 1518 | hrtimer_init_on_stack(&t.timer, clockid, mode); | 1525 | hrtimer_init_on_stack(&t.timer, clockid, mode); |
| 1519 | hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); | 1526 | hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); |
| 1520 | if (do_nanosleep(&t, mode)) | 1527 | ret = do_nanosleep(&t, mode); |
| 1528 | if (ret != -ERESTART_RESTARTBLOCK) | ||
| 1521 | goto out; | 1529 | goto out; |
| 1522 | 1530 | ||
| 1523 | /* Absolute timers do not update the rmtp value and restart: */ | 1531 | /* Absolute timers do not update the rmtp value and restart: */ |
| @@ -1526,19 +1534,10 @@ long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp, | |||
| 1526 | goto out; | 1534 | goto out; |
| 1527 | } | 1535 | } |
| 1528 | 1536 | ||
| 1529 | if (rmtp) { | ||
| 1530 | ret = update_rmtp(&t.timer, rmtp); | ||
| 1531 | if (ret <= 0) | ||
| 1532 | goto out; | ||
| 1533 | } | ||
| 1534 | |||
| 1535 | restart = ¤t->restart_block; | 1537 | restart = ¤t->restart_block; |
| 1536 | restart->fn = hrtimer_nanosleep_restart; | 1538 | restart->fn = hrtimer_nanosleep_restart; |
| 1537 | restart->nanosleep.clockid = t.timer.base->clockid; | 1539 | restart->nanosleep.clockid = t.timer.base->clockid; |
| 1538 | restart->nanosleep.rmtp = rmtp; | ||
| 1539 | restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); | 1540 | restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); |
| 1540 | |||
| 1541 | ret = -ERESTART_RESTARTBLOCK; | ||
| 1542 | out: | 1541 | out: |
| 1543 | destroy_hrtimer_on_stack(&t.timer); | 1542 | destroy_hrtimer_on_stack(&t.timer); |
| 1544 | return ret; | 1543 | return ret; |
| @@ -1547,18 +1546,37 @@ out: | |||
| 1547 | SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, | 1546 | SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, |
| 1548 | struct timespec __user *, rmtp) | 1547 | struct timespec __user *, rmtp) |
| 1549 | { | 1548 | { |
| 1550 | struct timespec64 tu64; | 1549 | struct timespec64 tu; |
| 1551 | struct timespec tu; | 1550 | |
| 1551 | if (get_timespec64(&tu, rqtp)) | ||
| 1552 | return -EFAULT; | ||
| 1553 | |||
| 1554 | if (!timespec64_valid(&tu)) | ||
| 1555 | return -EINVAL; | ||
| 1556 | |||
| 1557 | current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; | ||
| 1558 | current->restart_block.nanosleep.rmtp = rmtp; | ||
| 1559 | return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); | ||
| 1560 | } | ||
| 1561 | |||
| 1562 | #ifdef CONFIG_COMPAT | ||
| 1563 | |||
| 1564 | COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, | ||
| 1565 | struct compat_timespec __user *, rmtp) | ||
| 1566 | { | ||
| 1567 | struct timespec64 tu; | ||
| 1552 | 1568 | ||
| 1553 | if (copy_from_user(&tu, rqtp, sizeof(tu))) | 1569 | if (compat_get_timespec64(&tu, rqtp)) |
| 1554 | return -EFAULT; | 1570 | return -EFAULT; |
| 1555 | 1571 | ||
| 1556 | tu64 = timespec_to_timespec64(tu); | 1572 | if (!timespec64_valid(&tu)) |
| 1557 | if (!timespec64_valid(&tu64)) | ||
| 1558 | return -EINVAL; | 1573 | return -EINVAL; |
| 1559 | 1574 | ||
| 1560 | return hrtimer_nanosleep(&tu64, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); | 1575 | current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; |
| 1576 | current->restart_block.nanosleep.compat_rmtp = rmtp; | ||
| 1577 | return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); | ||
| 1561 | } | 1578 | } |
| 1579 | #endif | ||
| 1562 | 1580 | ||
| 1563 | /* | 1581 | /* |
| 1564 | * Functions related to boot-time initialization: | 1582 | * Functions related to boot-time initialization: |
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 087d6a1279b8..2ef98a02376a 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/posix-timers.h> | 15 | #include <linux/posix-timers.h> |
| 16 | #include <linux/hrtimer.h> | 16 | #include <linux/hrtimer.h> |
| 17 | #include <trace/events/timer.h> | 17 | #include <trace/events/timer.h> |
| 18 | #include <linux/compat.h> | ||
| 18 | 19 | ||
| 19 | #include <linux/uaccess.h> | 20 | #include <linux/uaccess.h> |
| 20 | 21 | ||
| @@ -116,6 +117,19 @@ SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) | |||
| 116 | return error; | 117 | return error; |
| 117 | } | 118 | } |
| 118 | 119 | ||
| 120 | #ifdef CONFIG_COMPAT | ||
| 121 | COMPAT_SYSCALL_DEFINE2(getitimer, int, which, | ||
| 122 | struct compat_itimerval __user *, it) | ||
| 123 | { | ||
| 124 | struct itimerval kit; | ||
| 125 | int error = do_getitimer(which, &kit); | ||
| 126 | |||
| 127 | if (!error && put_compat_itimerval(it, &kit)) | ||
| 128 | error = -EFAULT; | ||
| 129 | return error; | ||
| 130 | } | ||
| 131 | #endif | ||
| 132 | |||
| 119 | 133 | ||
| 120 | /* | 134 | /* |
| 121 | * The timer is automagically restarted, when interval != 0 | 135 | * The timer is automagically restarted, when interval != 0 |
| @@ -138,8 +152,12 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, | |||
| 138 | u64 oval, nval, ointerval, ninterval; | 152 | u64 oval, nval, ointerval, ninterval; |
| 139 | struct cpu_itimer *it = &tsk->signal->it[clock_id]; | 153 | struct cpu_itimer *it = &tsk->signal->it[clock_id]; |
| 140 | 154 | ||
| 141 | nval = timeval_to_ns(&value->it_value); | 155 | /* |
| 142 | ninterval = timeval_to_ns(&value->it_interval); | 156 | * Use the to_ktime conversion because that clamps the maximum |
| 157 | * value to KTIME_MAX and avoid multiplication overflows. | ||
| 158 | */ | ||
| 159 | nval = ktime_to_ns(timeval_to_ktime(value->it_value)); | ||
| 160 | ninterval = ktime_to_ns(timeval_to_ktime(value->it_interval)); | ||
| 143 | 161 | ||
| 144 | spin_lock_irq(&tsk->sighand->siglock); | 162 | spin_lock_irq(&tsk->sighand->siglock); |
| 145 | 163 | ||
| @@ -294,3 +312,27 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, | |||
| 294 | return -EFAULT; | 312 | return -EFAULT; |
| 295 | return 0; | 313 | return 0; |
| 296 | } | 314 | } |
| 315 | |||
| 316 | #ifdef CONFIG_COMPAT | ||
| 317 | COMPAT_SYSCALL_DEFINE3(setitimer, int, which, | ||
| 318 | struct compat_itimerval __user *, in, | ||
| 319 | struct compat_itimerval __user *, out) | ||
| 320 | { | ||
| 321 | struct itimerval kin, kout; | ||
| 322 | int error; | ||
| 323 | |||
| 324 | if (in) { | ||
| 325 | if (get_compat_itimerval(&kin, in)) | ||
| 326 | return -EFAULT; | ||
| 327 | } else { | ||
| 328 | memset(&kin, 0, sizeof(kin)); | ||
| 329 | } | ||
| 330 | |||
| 331 | error = do_setitimer(which, &kin, out ? &kout : NULL); | ||
| 332 | if (error || !out) | ||
| 333 | return error; | ||
| 334 | if (put_compat_itimerval(out, &kout)) | ||
| 335 | return -EFAULT; | ||
| 336 | return 0; | ||
| 337 | } | ||
| 338 | #endif | ||
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 31d588d37a17..17cdc554c9fe 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c | |||
| @@ -25,6 +25,8 @@ | |||
| 25 | #include <linux/syscalls.h> | 25 | #include <linux/syscalls.h> |
| 26 | #include <linux/uaccess.h> | 26 | #include <linux/uaccess.h> |
| 27 | 27 | ||
| 28 | #include "posix-timers.h" | ||
| 29 | |||
| 28 | static void delete_clock(struct kref *kref); | 30 | static void delete_clock(struct kref *kref); |
| 29 | 31 | ||
| 30 | /* | 32 | /* |
| @@ -82,38 +84,6 @@ static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) | |||
| 82 | return result; | 84 | return result; |
| 83 | } | 85 | } |
| 84 | 86 | ||
| 85 | static int posix_clock_fasync(int fd, struct file *fp, int on) | ||
| 86 | { | ||
| 87 | struct posix_clock *clk = get_posix_clock(fp); | ||
| 88 | int err = 0; | ||
| 89 | |||
| 90 | if (!clk) | ||
| 91 | return -ENODEV; | ||
| 92 | |||
| 93 | if (clk->ops.fasync) | ||
| 94 | err = clk->ops.fasync(clk, fd, fp, on); | ||
| 95 | |||
| 96 | put_posix_clock(clk); | ||
| 97 | |||
| 98 | return err; | ||
| 99 | } | ||
| 100 | |||
| 101 | static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma) | ||
| 102 | { | ||
| 103 | struct posix_clock *clk = get_posix_clock(fp); | ||
| 104 | int err = -ENODEV; | ||
| 105 | |||
| 106 | if (!clk) | ||
| 107 | return -ENODEV; | ||
| 108 | |||
| 109 | if (clk->ops.mmap) | ||
| 110 | err = clk->ops.mmap(clk, vma); | ||
| 111 | |||
| 112 | put_posix_clock(clk); | ||
| 113 | |||
| 114 | return err; | ||
| 115 | } | ||
| 116 | |||
| 117 | static long posix_clock_ioctl(struct file *fp, | 87 | static long posix_clock_ioctl(struct file *fp, |
| 118 | unsigned int cmd, unsigned long arg) | 88 | unsigned int cmd, unsigned long arg) |
| 119 | { | 89 | { |
| @@ -199,8 +169,6 @@ static const struct file_operations posix_clock_file_operations = { | |||
| 199 | .unlocked_ioctl = posix_clock_ioctl, | 169 | .unlocked_ioctl = posix_clock_ioctl, |
| 200 | .open = posix_clock_open, | 170 | .open = posix_clock_open, |
| 201 | .release = posix_clock_release, | 171 | .release = posix_clock_release, |
| 202 | .fasync = posix_clock_fasync, | ||
| 203 | .mmap = posix_clock_mmap, | ||
| 204 | #ifdef CONFIG_COMPAT | 172 | #ifdef CONFIG_COMPAT |
| 205 | .compat_ioctl = posix_clock_compat_ioctl, | 173 | .compat_ioctl = posix_clock_compat_ioctl, |
| 206 | #endif | 174 | #endif |
| @@ -359,88 +327,9 @@ out: | |||
| 359 | return err; | 327 | return err; |
| 360 | } | 328 | } |
| 361 | 329 | ||
| 362 | static int pc_timer_create(struct k_itimer *kit) | 330 | const struct k_clock clock_posix_dynamic = { |
| 363 | { | ||
| 364 | clockid_t id = kit->it_clock; | ||
| 365 | struct posix_clock_desc cd; | ||
| 366 | int err; | ||
| 367 | |||
| 368 | err = get_clock_desc(id, &cd); | ||
| 369 | if (err) | ||
| 370 | return err; | ||
| 371 | |||
| 372 | if (cd.clk->ops.timer_create) | ||
| 373 | err = cd.clk->ops.timer_create(cd.clk, kit); | ||
| 374 | else | ||
| 375 | err = -EOPNOTSUPP; | ||
| 376 | |||
| 377 | put_clock_desc(&cd); | ||
| 378 | |||
| 379 | return err; | ||
| 380 | } | ||
| 381 | |||
| 382 | static int pc_timer_delete(struct k_itimer *kit) | ||
| 383 | { | ||
| 384 | clockid_t id = kit->it_clock; | ||
| 385 | struct posix_clock_desc cd; | ||
| 386 | int err; | ||
| 387 | |||
| 388 | err = get_clock_desc(id, &cd); | ||
| 389 | if (err) | ||
| 390 | return err; | ||
| 391 | |||
| 392 | if (cd.clk->ops.timer_delete) | ||
| 393 | err = cd.clk->ops.timer_delete(cd.clk, kit); | ||
| 394 | else | ||
| 395 | err = -EOPNOTSUPP; | ||
| 396 | |||
| 397 | put_clock_desc(&cd); | ||
| 398 | |||
| 399 | return err; | ||
| 400 | } | ||
| 401 | |||
| 402 | static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec64 *ts) | ||
| 403 | { | ||
| 404 | clockid_t id = kit->it_clock; | ||
| 405 | struct posix_clock_desc cd; | ||
| 406 | |||
| 407 | if (get_clock_desc(id, &cd)) | ||
| 408 | return; | ||
| 409 | |||
| 410 | if (cd.clk->ops.timer_gettime) | ||
| 411 | cd.clk->ops.timer_gettime(cd.clk, kit, ts); | ||
| 412 | |||
| 413 | put_clock_desc(&cd); | ||
| 414 | } | ||
| 415 | |||
| 416 | static int pc_timer_settime(struct k_itimer *kit, int flags, | ||
| 417 | struct itimerspec64 *ts, struct itimerspec64 *old) | ||
| 418 | { | ||
| 419 | clockid_t id = kit->it_clock; | ||
| 420 | struct posix_clock_desc cd; | ||
| 421 | int err; | ||
| 422 | |||
| 423 | err = get_clock_desc(id, &cd); | ||
| 424 | if (err) | ||
| 425 | return err; | ||
| 426 | |||
| 427 | if (cd.clk->ops.timer_settime) | ||
| 428 | err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); | ||
| 429 | else | ||
| 430 | err = -EOPNOTSUPP; | ||
| 431 | |||
| 432 | put_clock_desc(&cd); | ||
| 433 | |||
| 434 | return err; | ||
| 435 | } | ||
| 436 | |||
| 437 | struct k_clock clock_posix_dynamic = { | ||
| 438 | .clock_getres = pc_clock_getres, | 331 | .clock_getres = pc_clock_getres, |
| 439 | .clock_set = pc_clock_settime, | 332 | .clock_set = pc_clock_settime, |
| 440 | .clock_get = pc_clock_gettime, | 333 | .clock_get = pc_clock_gettime, |
| 441 | .clock_adj = pc_clock_adjtime, | 334 | .clock_adj = pc_clock_adjtime, |
| 442 | .timer_create = pc_timer_create, | ||
| 443 | .timer_set = pc_timer_settime, | ||
| 444 | .timer_del = pc_timer_delete, | ||
| 445 | .timer_get = pc_timer_gettime, | ||
| 446 | }; | 335 | }; |
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1370f067fb51..a3bd5dbe0dc4 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c | |||
| @@ -12,6 +12,11 @@ | |||
| 12 | #include <trace/events/timer.h> | 12 | #include <trace/events/timer.h> |
| 13 | #include <linux/tick.h> | 13 | #include <linux/tick.h> |
| 14 | #include <linux/workqueue.h> | 14 | #include <linux/workqueue.h> |
| 15 | #include <linux/compat.h> | ||
| 16 | |||
| 17 | #include "posix-timers.h" | ||
| 18 | |||
| 19 | static void posix_cpu_timer_rearm(struct k_itimer *timer); | ||
| 15 | 20 | ||
| 16 | /* | 21 | /* |
| 17 | * Called after updating RLIMIT_CPU to run cpu timer and update | 22 | * Called after updating RLIMIT_CPU to run cpu timer and update |
| @@ -322,6 +327,8 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
| 322 | if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) | 327 | if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) |
| 323 | return -EINVAL; | 328 | return -EINVAL; |
| 324 | 329 | ||
| 330 | new_timer->kclock = &clock_posix_cpu; | ||
| 331 | |||
| 325 | INIT_LIST_HEAD(&new_timer->it.cpu.entry); | 332 | INIT_LIST_HEAD(&new_timer->it.cpu.entry); |
| 326 | 333 | ||
| 327 | rcu_read_lock(); | 334 | rcu_read_lock(); |
| @@ -524,7 +531,8 @@ static void cpu_timer_fire(struct k_itimer *timer) | |||
| 524 | * reload the timer. But we need to keep it | 531 | * reload the timer. But we need to keep it |
| 525 | * ticking in case the signal is deliverable next time. | 532 | * ticking in case the signal is deliverable next time. |
| 526 | */ | 533 | */ |
| 527 | posix_cpu_timer_schedule(timer); | 534 | posix_cpu_timer_rearm(timer); |
| 535 | ++timer->it_requeue_pending; | ||
| 528 | } | 536 | } |
| 529 | } | 537 | } |
| 530 | 538 | ||
| @@ -572,7 +580,11 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, | |||
| 572 | 580 | ||
| 573 | WARN_ON_ONCE(p == NULL); | 581 | WARN_ON_ONCE(p == NULL); |
| 574 | 582 | ||
| 575 | new_expires = timespec64_to_ns(&new->it_value); | 583 | /* |
| 584 | * Use the to_ktime conversion because that clamps the maximum | ||
| 585 | * value to KTIME_MAX and avoid multiplication overflows. | ||
| 586 | */ | ||
| 587 | new_expires = ktime_to_ns(timespec64_to_ktime(new->it_value)); | ||
| 576 | 588 | ||
| 577 | /* | 589 | /* |
| 578 | * Protect against sighand release/switch in exit/exec and p->cpu_timers | 590 | * Protect against sighand release/switch in exit/exec and p->cpu_timers |
| @@ -712,10 +724,8 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp | |||
| 712 | */ | 724 | */ |
| 713 | itp->it_interval = ns_to_timespec64(timer->it.cpu.incr); | 725 | itp->it_interval = ns_to_timespec64(timer->it.cpu.incr); |
| 714 | 726 | ||
| 715 | if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ | 727 | if (!timer->it.cpu.expires) |
| 716 | itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; | ||
| 717 | return; | 728 | return; |
| 718 | } | ||
| 719 | 729 | ||
| 720 | /* | 730 | /* |
| 721 | * Sample the clock to take the difference with the expiry time. | 731 | * Sample the clock to take the difference with the expiry time. |
| @@ -739,7 +749,6 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp | |||
| 739 | * Call the timer disarmed, nothing else to do. | 749 | * Call the timer disarmed, nothing else to do. |
| 740 | */ | 750 | */ |
| 741 | timer->it.cpu.expires = 0; | 751 | timer->it.cpu.expires = 0; |
| 742 | itp->it_value = ns_to_timespec64(timer->it.cpu.expires); | ||
| 743 | return; | 752 | return; |
| 744 | } else { | 753 | } else { |
| 745 | cpu_timer_sample_group(timer->it_clock, p, &now); | 754 | cpu_timer_sample_group(timer->it_clock, p, &now); |
| @@ -825,8 +834,10 @@ static void check_thread_timers(struct task_struct *tsk, | |||
| 825 | * At the hard limit, we just die. | 834 | * At the hard limit, we just die. |
| 826 | * No need to calculate anything else now. | 835 | * No need to calculate anything else now. |
| 827 | */ | 836 | */ |
| 828 | pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", | 837 | if (print_fatal_signals) { |
| 829 | tsk->comm, task_pid_nr(tsk)); | 838 | pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", |
| 839 | tsk->comm, task_pid_nr(tsk)); | ||
| 840 | } | ||
| 830 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); | 841 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); |
| 831 | return; | 842 | return; |
| 832 | } | 843 | } |
| @@ -838,8 +849,10 @@ static void check_thread_timers(struct task_struct *tsk, | |||
| 838 | soft += USEC_PER_SEC; | 849 | soft += USEC_PER_SEC; |
| 839 | sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; | 850 | sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; |
| 840 | } | 851 | } |
| 841 | pr_info("RT Watchdog Timeout (soft): %s[%d]\n", | 852 | if (print_fatal_signals) { |
| 842 | tsk->comm, task_pid_nr(tsk)); | 853 | pr_info("RT Watchdog Timeout (soft): %s[%d]\n", |
| 854 | tsk->comm, task_pid_nr(tsk)); | ||
| 855 | } | ||
| 843 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); | 856 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); |
| 844 | } | 857 | } |
| 845 | } | 858 | } |
| @@ -936,8 +949,10 @@ static void check_process_timers(struct task_struct *tsk, | |||
| 936 | * At the hard limit, we just die. | 949 | * At the hard limit, we just die. |
| 937 | * No need to calculate anything else now. | 950 | * No need to calculate anything else now. |
| 938 | */ | 951 | */ |
| 939 | pr_info("RT Watchdog Timeout (hard): %s[%d]\n", | 952 | if (print_fatal_signals) { |
| 940 | tsk->comm, task_pid_nr(tsk)); | 953 | pr_info("RT Watchdog Timeout (hard): %s[%d]\n", |
| 954 | tsk->comm, task_pid_nr(tsk)); | ||
| 955 | } | ||
| 941 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); | 956 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); |
| 942 | return; | 957 | return; |
| 943 | } | 958 | } |
| @@ -945,8 +960,10 @@ static void check_process_timers(struct task_struct *tsk, | |||
| 945 | /* | 960 | /* |
| 946 | * At the soft limit, send a SIGXCPU every second. | 961 | * At the soft limit, send a SIGXCPU every second. |
| 947 | */ | 962 | */ |
| 948 | pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", | 963 | if (print_fatal_signals) { |
| 949 | tsk->comm, task_pid_nr(tsk)); | 964 | pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", |
| 965 | tsk->comm, task_pid_nr(tsk)); | ||
| 966 | } | ||
| 950 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); | 967 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); |
| 951 | if (soft < hard) { | 968 | if (soft < hard) { |
| 952 | soft++; | 969 | soft++; |
| @@ -968,10 +985,10 @@ static void check_process_timers(struct task_struct *tsk, | |||
| 968 | } | 985 | } |
| 969 | 986 | ||
| 970 | /* | 987 | /* |
| 971 | * This is called from the signal code (via do_schedule_next_timer) | 988 | * This is called from the signal code (via posixtimer_rearm) |
| 972 | * when the last timer signal was delivered and we have to reload the timer. | 989 | * when the last timer signal was delivered and we have to reload the timer. |
| 973 | */ | 990 | */ |
| 974 | void posix_cpu_timer_schedule(struct k_itimer *timer) | 991 | static void posix_cpu_timer_rearm(struct k_itimer *timer) |
| 975 | { | 992 | { |
| 976 | struct sighand_struct *sighand; | 993 | struct sighand_struct *sighand; |
| 977 | unsigned long flags; | 994 | unsigned long flags; |
| @@ -987,12 +1004,12 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
| 987 | cpu_clock_sample(timer->it_clock, p, &now); | 1004 | cpu_clock_sample(timer->it_clock, p, &now); |
| 988 | bump_cpu_timer(timer, now); | 1005 | bump_cpu_timer(timer, now); |
| 989 | if (unlikely(p->exit_state)) | 1006 | if (unlikely(p->exit_state)) |
| 990 | goto out; | 1007 | return; |
| 991 | 1008 | ||
| 992 | /* Protect timer list r/w in arm_timer() */ | 1009 | /* Protect timer list r/w in arm_timer() */ |
| 993 | sighand = lock_task_sighand(p, &flags); | 1010 | sighand = lock_task_sighand(p, &flags); |
| 994 | if (!sighand) | 1011 | if (!sighand) |
| 995 | goto out; | 1012 | return; |
| 996 | } else { | 1013 | } else { |
| 997 | /* | 1014 | /* |
| 998 | * Protect arm_timer() and timer sampling in case of call to | 1015 | * Protect arm_timer() and timer sampling in case of call to |
| @@ -1005,11 +1022,10 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
| 1005 | * We can't even collect a sample any more. | 1022 | * We can't even collect a sample any more. |
| 1006 | */ | 1023 | */ |
| 1007 | timer->it.cpu.expires = 0; | 1024 | timer->it.cpu.expires = 0; |
| 1008 | goto out; | 1025 | return; |
| 1009 | } else if (unlikely(p->exit_state) && thread_group_empty(p)) { | 1026 | } else if (unlikely(p->exit_state) && thread_group_empty(p)) { |
| 1010 | unlock_task_sighand(p, &flags); | 1027 | /* If the process is dying, no need to rearm */ |
| 1011 | /* Optimizations: if the process is dying, no need to rearm */ | 1028 | goto unlock; |
| 1012 | goto out; | ||
| 1013 | } | 1029 | } |
| 1014 | cpu_timer_sample_group(timer->it_clock, p, &now); | 1030 | cpu_timer_sample_group(timer->it_clock, p, &now); |
| 1015 | bump_cpu_timer(timer, now); | 1031 | bump_cpu_timer(timer, now); |
| @@ -1021,12 +1037,8 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
| 1021 | */ | 1037 | */ |
| 1022 | WARN_ON_ONCE(!irqs_disabled()); | 1038 | WARN_ON_ONCE(!irqs_disabled()); |
| 1023 | arm_timer(timer); | 1039 | arm_timer(timer); |
| 1040 | unlock: | ||
| 1024 | unlock_task_sighand(p, &flags); | 1041 | unlock_task_sighand(p, &flags); |
| 1025 | |||
| 1026 | out: | ||
| 1027 | timer->it_overrun_last = timer->it_overrun; | ||
| 1028 | timer->it_overrun = -1; | ||
| 1029 | ++timer->it_requeue_pending; | ||
| 1030 | } | 1042 | } |
| 1031 | 1043 | ||
| 1032 | /** | 1044 | /** |
| @@ -1219,9 +1231,11 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | |||
| 1219 | } | 1231 | } |
| 1220 | 1232 | ||
| 1221 | static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | 1233 | static int do_cpu_nanosleep(const clockid_t which_clock, int flags, |
| 1222 | struct timespec64 *rqtp, struct itimerspec64 *it) | 1234 | const struct timespec64 *rqtp) |
| 1223 | { | 1235 | { |
| 1236 | struct itimerspec64 it; | ||
| 1224 | struct k_itimer timer; | 1237 | struct k_itimer timer; |
| 1238 | u64 expires; | ||
| 1225 | int error; | 1239 | int error; |
| 1226 | 1240 | ||
| 1227 | /* | 1241 | /* |
| @@ -1235,12 +1249,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
| 1235 | timer.it_process = current; | 1249 | timer.it_process = current; |
| 1236 | if (!error) { | 1250 | if (!error) { |
| 1237 | static struct itimerspec64 zero_it; | 1251 | static struct itimerspec64 zero_it; |
| 1252 | struct restart_block *restart; | ||
| 1238 | 1253 | ||
| 1239 | memset(it, 0, sizeof *it); | 1254 | memset(&it, 0, sizeof(it)); |
| 1240 | it->it_value = *rqtp; | 1255 | it.it_value = *rqtp; |
| 1241 | 1256 | ||
| 1242 | spin_lock_irq(&timer.it_lock); | 1257 | spin_lock_irq(&timer.it_lock); |
| 1243 | error = posix_cpu_timer_set(&timer, flags, it, NULL); | 1258 | error = posix_cpu_timer_set(&timer, flags, &it, NULL); |
| 1244 | if (error) { | 1259 | if (error) { |
| 1245 | spin_unlock_irq(&timer.it_lock); | 1260 | spin_unlock_irq(&timer.it_lock); |
| 1246 | return error; | 1261 | return error; |
| @@ -1269,8 +1284,8 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
| 1269 | /* | 1284 | /* |
| 1270 | * We were interrupted by a signal. | 1285 | * We were interrupted by a signal. |
| 1271 | */ | 1286 | */ |
| 1272 | *rqtp = ns_to_timespec64(timer.it.cpu.expires); | 1287 | expires = timer.it.cpu.expires; |
| 1273 | error = posix_cpu_timer_set(&timer, 0, &zero_it, it); | 1288 | error = posix_cpu_timer_set(&timer, 0, &zero_it, &it); |
| 1274 | if (!error) { | 1289 | if (!error) { |
| 1275 | /* | 1290 | /* |
| 1276 | * Timer is now unarmed, deletion can not fail. | 1291 | * Timer is now unarmed, deletion can not fail. |
| @@ -1290,7 +1305,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
| 1290 | spin_unlock_irq(&timer.it_lock); | 1305 | spin_unlock_irq(&timer.it_lock); |
| 1291 | } | 1306 | } |
| 1292 | 1307 | ||
| 1293 | if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { | 1308 | if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { |
| 1294 | /* | 1309 | /* |
| 1295 | * It actually did fire already. | 1310 | * It actually did fire already. |
| 1296 | */ | 1311 | */ |
| @@ -1298,6 +1313,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
| 1298 | } | 1313 | } |
| 1299 | 1314 | ||
| 1300 | error = -ERESTART_RESTARTBLOCK; | 1315 | error = -ERESTART_RESTARTBLOCK; |
| 1316 | /* | ||
| 1317 | * Report back to the user the time still remaining. | ||
| 1318 | */ | ||
| 1319 | restart = ¤t->restart_block; | ||
| 1320 | restart->nanosleep.expires = expires; | ||
| 1321 | if (restart->nanosleep.type != TT_NONE) | ||
| 1322 | error = nanosleep_copyout(restart, &it.it_value); | ||
| 1301 | } | 1323 | } |
| 1302 | 1324 | ||
| 1303 | return error; | 1325 | return error; |
| @@ -1306,11 +1328,9 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
| 1306 | static long posix_cpu_nsleep_restart(struct restart_block *restart_block); | 1328 | static long posix_cpu_nsleep_restart(struct restart_block *restart_block); |
| 1307 | 1329 | ||
| 1308 | static int posix_cpu_nsleep(const clockid_t which_clock, int flags, | 1330 | static int posix_cpu_nsleep(const clockid_t which_clock, int flags, |
| 1309 | struct timespec64 *rqtp, struct timespec __user *rmtp) | 1331 | const struct timespec64 *rqtp) |
| 1310 | { | 1332 | { |
| 1311 | struct restart_block *restart_block = ¤t->restart_block; | 1333 | struct restart_block *restart_block = ¤t->restart_block; |
| 1312 | struct itimerspec64 it; | ||
| 1313 | struct timespec ts; | ||
| 1314 | int error; | 1334 | int error; |
| 1315 | 1335 | ||
| 1316 | /* | 1336 | /* |
| @@ -1321,23 +1341,15 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, | |||
| 1321 | CPUCLOCK_PID(which_clock) == task_pid_vnr(current))) | 1341 | CPUCLOCK_PID(which_clock) == task_pid_vnr(current))) |
| 1322 | return -EINVAL; | 1342 | return -EINVAL; |
| 1323 | 1343 | ||
| 1324 | error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); | 1344 | error = do_cpu_nanosleep(which_clock, flags, rqtp); |
| 1325 | 1345 | ||
| 1326 | if (error == -ERESTART_RESTARTBLOCK) { | 1346 | if (error == -ERESTART_RESTARTBLOCK) { |
| 1327 | 1347 | ||
| 1328 | if (flags & TIMER_ABSTIME) | 1348 | if (flags & TIMER_ABSTIME) |
| 1329 | return -ERESTARTNOHAND; | 1349 | return -ERESTARTNOHAND; |
| 1330 | /* | ||
| 1331 | * Report back to the user the time still remaining. | ||
| 1332 | */ | ||
| 1333 | ts = timespec64_to_timespec(it.it_value); | ||
| 1334 | if (rmtp && copy_to_user(rmtp, &ts, sizeof(*rmtp))) | ||
| 1335 | return -EFAULT; | ||
| 1336 | 1350 | ||
| 1337 | restart_block->fn = posix_cpu_nsleep_restart; | 1351 | restart_block->fn = posix_cpu_nsleep_restart; |
| 1338 | restart_block->nanosleep.clockid = which_clock; | 1352 | restart_block->nanosleep.clockid = which_clock; |
| 1339 | restart_block->nanosleep.rmtp = rmtp; | ||
| 1340 | restart_block->nanosleep.expires = timespec64_to_ns(rqtp); | ||
| 1341 | } | 1353 | } |
| 1342 | return error; | 1354 | return error; |
| 1343 | } | 1355 | } |
| @@ -1345,28 +1357,11 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, | |||
| 1345 | static long posix_cpu_nsleep_restart(struct restart_block *restart_block) | 1357 | static long posix_cpu_nsleep_restart(struct restart_block *restart_block) |
| 1346 | { | 1358 | { |
| 1347 | clockid_t which_clock = restart_block->nanosleep.clockid; | 1359 | clockid_t which_clock = restart_block->nanosleep.clockid; |
| 1348 | struct itimerspec64 it; | ||
| 1349 | struct timespec64 t; | 1360 | struct timespec64 t; |
| 1350 | struct timespec tmp; | ||
| 1351 | int error; | ||
| 1352 | 1361 | ||
| 1353 | t = ns_to_timespec64(restart_block->nanosleep.expires); | 1362 | t = ns_to_timespec64(restart_block->nanosleep.expires); |
| 1354 | 1363 | ||
| 1355 | error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); | 1364 | return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t); |
| 1356 | |||
| 1357 | if (error == -ERESTART_RESTARTBLOCK) { | ||
| 1358 | struct timespec __user *rmtp = restart_block->nanosleep.rmtp; | ||
| 1359 | /* | ||
| 1360 | * Report back to the user the time still remaining. | ||
| 1361 | */ | ||
| 1362 | tmp = timespec64_to_timespec(it.it_value); | ||
| 1363 | if (rmtp && copy_to_user(rmtp, &tmp, sizeof(*rmtp))) | ||
| 1364 | return -EFAULT; | ||
| 1365 | |||
| 1366 | restart_block->nanosleep.expires = timespec64_to_ns(&t); | ||
| 1367 | } | ||
| 1368 | return error; | ||
| 1369 | |||
| 1370 | } | 1365 | } |
| 1371 | 1366 | ||
| 1372 | #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) | 1367 | #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) |
| @@ -1388,14 +1383,9 @@ static int process_cpu_timer_create(struct k_itimer *timer) | |||
| 1388 | return posix_cpu_timer_create(timer); | 1383 | return posix_cpu_timer_create(timer); |
| 1389 | } | 1384 | } |
| 1390 | static int process_cpu_nsleep(const clockid_t which_clock, int flags, | 1385 | static int process_cpu_nsleep(const clockid_t which_clock, int flags, |
| 1391 | struct timespec64 *rqtp, | 1386 | const struct timespec64 *rqtp) |
| 1392 | struct timespec __user *rmtp) | ||
| 1393 | { | ||
| 1394 | return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); | ||
| 1395 | } | ||
| 1396 | static long process_cpu_nsleep_restart(struct restart_block *restart_block) | ||
| 1397 | { | 1387 | { |
| 1398 | return -EINVAL; | 1388 | return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); |
| 1399 | } | 1389 | } |
| 1400 | static int thread_cpu_clock_getres(const clockid_t which_clock, | 1390 | static int thread_cpu_clock_getres(const clockid_t which_clock, |
| 1401 | struct timespec64 *tp) | 1391 | struct timespec64 *tp) |
| @@ -1413,36 +1403,27 @@ static int thread_cpu_timer_create(struct k_itimer *timer) | |||
| 1413 | return posix_cpu_timer_create(timer); | 1403 | return posix_cpu_timer_create(timer); |
| 1414 | } | 1404 | } |
| 1415 | 1405 | ||
| 1416 | struct k_clock clock_posix_cpu = { | 1406 | const struct k_clock clock_posix_cpu = { |
| 1417 | .clock_getres = posix_cpu_clock_getres, | 1407 | .clock_getres = posix_cpu_clock_getres, |
| 1418 | .clock_set = posix_cpu_clock_set, | 1408 | .clock_set = posix_cpu_clock_set, |
| 1419 | .clock_get = posix_cpu_clock_get, | 1409 | .clock_get = posix_cpu_clock_get, |
| 1420 | .timer_create = posix_cpu_timer_create, | 1410 | .timer_create = posix_cpu_timer_create, |
| 1421 | .nsleep = posix_cpu_nsleep, | 1411 | .nsleep = posix_cpu_nsleep, |
| 1422 | .nsleep_restart = posix_cpu_nsleep_restart, | ||
| 1423 | .timer_set = posix_cpu_timer_set, | 1412 | .timer_set = posix_cpu_timer_set, |
| 1424 | .timer_del = posix_cpu_timer_del, | 1413 | .timer_del = posix_cpu_timer_del, |
| 1425 | .timer_get = posix_cpu_timer_get, | 1414 | .timer_get = posix_cpu_timer_get, |
| 1415 | .timer_rearm = posix_cpu_timer_rearm, | ||
| 1426 | }; | 1416 | }; |
| 1427 | 1417 | ||
| 1428 | static __init int init_posix_cpu_timers(void) | 1418 | const struct k_clock clock_process = { |
| 1429 | { | 1419 | .clock_getres = process_cpu_clock_getres, |
| 1430 | struct k_clock process = { | 1420 | .clock_get = process_cpu_clock_get, |
| 1431 | .clock_getres = process_cpu_clock_getres, | 1421 | .timer_create = process_cpu_timer_create, |
| 1432 | .clock_get = process_cpu_clock_get, | 1422 | .nsleep = process_cpu_nsleep, |
| 1433 | .timer_create = process_cpu_timer_create, | 1423 | }; |
| 1434 | .nsleep = process_cpu_nsleep, | ||
| 1435 | .nsleep_restart = process_cpu_nsleep_restart, | ||
| 1436 | }; | ||
| 1437 | struct k_clock thread = { | ||
| 1438 | .clock_getres = thread_cpu_clock_getres, | ||
| 1439 | .clock_get = thread_cpu_clock_get, | ||
| 1440 | .timer_create = thread_cpu_timer_create, | ||
| 1441 | }; | ||
| 1442 | |||
| 1443 | posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); | ||
| 1444 | posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); | ||
| 1445 | 1424 | ||
| 1446 | return 0; | 1425 | const struct k_clock clock_thread = { |
| 1447 | } | 1426 | .clock_getres = thread_cpu_clock_getres, |
| 1448 | __initcall(init_posix_cpu_timers); | 1427 | .clock_get = thread_cpu_clock_get, |
| 1428 | .timer_create = thread_cpu_timer_create, | ||
| 1429 | }; | ||
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index c0cd53eb018a..06f34feb635e 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c | |||
| @@ -17,6 +17,7 @@ | |||
| 17 | #include <linux/ktime.h> | 17 | #include <linux/ktime.h> |
| 18 | #include <linux/timekeeping.h> | 18 | #include <linux/timekeeping.h> |
| 19 | #include <linux/posix-timers.h> | 19 | #include <linux/posix-timers.h> |
| 20 | #include <linux/compat.h> | ||
| 20 | 21 | ||
| 21 | asmlinkage long sys_ni_posix_timers(void) | 22 | asmlinkage long sys_ni_posix_timers(void) |
| 22 | { | 23 | { |
| @@ -27,6 +28,7 @@ asmlinkage long sys_ni_posix_timers(void) | |||
| 27 | } | 28 | } |
| 28 | 29 | ||
| 29 | #define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) | 30 | #define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) |
| 31 | #define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers) | ||
| 30 | 32 | ||
| 31 | SYS_NI(timer_create); | 33 | SYS_NI(timer_create); |
| 32 | SYS_NI(timer_gettime); | 34 | SYS_NI(timer_gettime); |
| @@ -49,40 +51,52 @@ SYS_NI(alarm); | |||
| 49 | SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, | 51 | SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, |
| 50 | const struct timespec __user *, tp) | 52 | const struct timespec __user *, tp) |
| 51 | { | 53 | { |
| 52 | struct timespec64 new_tp64; | 54 | struct timespec64 new_tp; |
| 53 | struct timespec new_tp; | ||
| 54 | 55 | ||
| 55 | if (which_clock != CLOCK_REALTIME) | 56 | if (which_clock != CLOCK_REALTIME) |
| 56 | return -EINVAL; | 57 | return -EINVAL; |
| 57 | if (copy_from_user(&new_tp, tp, sizeof (*tp))) | 58 | if (get_timespec64(&new_tp, tp)) |
| 58 | return -EFAULT; | 59 | return -EFAULT; |
| 59 | 60 | ||
| 60 | new_tp64 = timespec_to_timespec64(new_tp); | 61 | return do_sys_settimeofday64(&new_tp, NULL); |
| 61 | return do_sys_settimeofday64(&new_tp64, NULL); | ||
| 62 | } | 62 | } |
| 63 | 63 | ||
| 64 | SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, | 64 | int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) |
| 65 | struct timespec __user *,tp) | ||
| 66 | { | 65 | { |
| 67 | struct timespec64 kernel_tp64; | ||
| 68 | struct timespec kernel_tp; | ||
| 69 | |||
| 70 | switch (which_clock) { | 66 | switch (which_clock) { |
| 71 | case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; | 67 | case CLOCK_REALTIME: |
| 72 | case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; | 68 | ktime_get_real_ts64(tp); |
| 73 | case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; | 69 | break; |
| 74 | default: return -EINVAL; | 70 | case CLOCK_MONOTONIC: |
| 71 | ktime_get_ts64(tp); | ||
| 72 | break; | ||
| 73 | case CLOCK_BOOTTIME: | ||
| 74 | get_monotonic_boottime64(tp); | ||
| 75 | break; | ||
| 76 | default: | ||
| 77 | return -EINVAL; | ||
| 75 | } | 78 | } |
| 76 | 79 | ||
| 77 | kernel_tp = timespec64_to_timespec(kernel_tp64); | 80 | return 0; |
| 78 | if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) | 81 | } |
| 82 | SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, | ||
| 83 | struct timespec __user *, tp) | ||
| 84 | { | ||
| 85 | int ret; | ||
| 86 | struct timespec64 kernel_tp; | ||
| 87 | |||
| 88 | ret = do_clock_gettime(which_clock, &kernel_tp); | ||
| 89 | if (ret) | ||
| 90 | return ret; | ||
| 91 | |||
| 92 | if (put_timespec64(&kernel_tp, tp)) | ||
| 79 | return -EFAULT; | 93 | return -EFAULT; |
| 80 | return 0; | 94 | return 0; |
| 81 | } | 95 | } |
| 82 | 96 | ||
| 83 | SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) | 97 | SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) |
| 84 | { | 98 | { |
| 85 | struct timespec rtn_tp = { | 99 | struct timespec64 rtn_tp = { |
| 86 | .tv_sec = 0, | 100 | .tv_sec = 0, |
| 87 | .tv_nsec = hrtimer_resolution, | 101 | .tv_nsec = hrtimer_resolution, |
| 88 | }; | 102 | }; |
| @@ -91,7 +105,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __us | |||
| 91 | case CLOCK_REALTIME: | 105 | case CLOCK_REALTIME: |
| 92 | case CLOCK_MONOTONIC: | 106 | case CLOCK_MONOTONIC: |
| 93 | case CLOCK_BOOTTIME: | 107 | case CLOCK_BOOTTIME: |
| 94 | if (copy_to_user(tp, &rtn_tp, sizeof(rtn_tp))) | 108 | if (put_timespec64(&rtn_tp, tp)) |
| 95 | return -EFAULT; | 109 | return -EFAULT; |
| 96 | return 0; | 110 | return 0; |
| 97 | default: | 111 | default: |
| @@ -110,22 +124,108 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, | |||
| 110 | case CLOCK_REALTIME: | 124 | case CLOCK_REALTIME: |
| 111 | case CLOCK_MONOTONIC: | 125 | case CLOCK_MONOTONIC: |
| 112 | case CLOCK_BOOTTIME: | 126 | case CLOCK_BOOTTIME: |
| 113 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) | 127 | break; |
| 114 | return -EFAULT; | ||
| 115 | t64 = timespec_to_timespec64(t); | ||
| 116 | if (!timespec64_valid(&t64)) | ||
| 117 | return -EINVAL; | ||
| 118 | return hrtimer_nanosleep(&t64, rmtp, flags & TIMER_ABSTIME ? | ||
| 119 | HRTIMER_MODE_ABS : HRTIMER_MODE_REL, | ||
| 120 | which_clock); | ||
| 121 | default: | 128 | default: |
| 122 | return -EINVAL; | 129 | return -EINVAL; |
| 123 | } | 130 | } |
| 131 | |||
| 132 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) | ||
| 133 | return -EFAULT; | ||
| 134 | t64 = timespec_to_timespec64(t); | ||
| 135 | if (!timespec64_valid(&t64)) | ||
| 136 | return -EINVAL; | ||
| 137 | if (flags & TIMER_ABSTIME) | ||
| 138 | rmtp = NULL; | ||
| 139 | current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; | ||
| 140 | current->restart_block.nanosleep.rmtp = rmtp; | ||
| 141 | return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ? | ||
| 142 | HRTIMER_MODE_ABS : HRTIMER_MODE_REL, | ||
| 143 | which_clock); | ||
| 124 | } | 144 | } |
| 125 | 145 | ||
| 126 | #ifdef CONFIG_COMPAT | 146 | #ifdef CONFIG_COMPAT |
| 127 | long clock_nanosleep_restart(struct restart_block *restart_block) | 147 | COMPAT_SYS_NI(timer_create); |
| 148 | COMPAT_SYS_NI(clock_adjtime); | ||
| 149 | COMPAT_SYS_NI(timer_settime); | ||
| 150 | COMPAT_SYS_NI(timer_gettime); | ||
| 151 | COMPAT_SYS_NI(getitimer); | ||
| 152 | COMPAT_SYS_NI(setitimer); | ||
| 153 | |||
| 154 | COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, | ||
| 155 | struct compat_timespec __user *, tp) | ||
| 128 | { | 156 | { |
| 129 | return hrtimer_nanosleep_restart(restart_block); | 157 | struct timespec64 new_tp; |
| 158 | |||
| 159 | if (which_clock != CLOCK_REALTIME) | ||
| 160 | return -EINVAL; | ||
| 161 | if (compat_get_timespec64(&new_tp, tp)) | ||
| 162 | return -EFAULT; | ||
| 163 | |||
| 164 | return do_sys_settimeofday64(&new_tp, NULL); | ||
| 165 | } | ||
| 166 | |||
| 167 | COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, | ||
| 168 | struct compat_timespec __user *, tp) | ||
| 169 | { | ||
| 170 | int ret; | ||
| 171 | struct timespec64 kernel_tp; | ||
| 172 | |||
| 173 | ret = do_clock_gettime(which_clock, &kernel_tp); | ||
| 174 | if (ret) | ||
| 175 | return ret; | ||
| 176 | |||
| 177 | if (compat_put_timespec64(&kernel_tp, tp)) | ||
| 178 | return -EFAULT; | ||
| 179 | return 0; | ||
| 180 | } | ||
| 181 | |||
| 182 | COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, | ||
| 183 | struct compat_timespec __user *, tp) | ||
| 184 | { | ||
| 185 | struct timespec64 rtn_tp = { | ||
| 186 | .tv_sec = 0, | ||
| 187 | .tv_nsec = hrtimer_resolution, | ||
| 188 | }; | ||
| 189 | |||
| 190 | switch (which_clock) { | ||
| 191 | case CLOCK_REALTIME: | ||
| 192 | case CLOCK_MONOTONIC: | ||
| 193 | case CLOCK_BOOTTIME: | ||
| 194 | if (compat_put_timespec64(&rtn_tp, tp)) | ||
| 195 | return -EFAULT; | ||
| 196 | return 0; | ||
| 197 | default: | ||
| 198 | return -EINVAL; | ||
| 199 | } | ||
| 200 | } | ||
| 201 | |||
| 202 | COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, | ||
| 203 | struct compat_timespec __user *, rqtp, | ||
| 204 | struct compat_timespec __user *, rmtp) | ||
| 205 | { | ||
| 206 | struct timespec64 t64; | ||
| 207 | struct timespec t; | ||
| 208 | |||
| 209 | switch (which_clock) { | ||
| 210 | case CLOCK_REALTIME: | ||
| 211 | case CLOCK_MONOTONIC: | ||
| 212 | case CLOCK_BOOTTIME: | ||
| 213 | break; | ||
| 214 | default: | ||
| 215 | return -EINVAL; | ||
| 216 | } | ||
| 217 | |||
| 218 | if (compat_get_timespec(&t, rqtp)) | ||
| 219 | return -EFAULT; | ||
| 220 | t64 = timespec_to_timespec64(t); | ||
| 221 | if (!timespec64_valid(&t64)) | ||
| 222 | return -EINVAL; | ||
| 223 | if (flags & TIMER_ABSTIME) | ||
| 224 | rmtp = NULL; | ||
| 225 | current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; | ||
| 226 | current->restart_block.nanosleep.compat_rmtp = rmtp; | ||
| 227 | return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ? | ||
| 228 | HRTIMER_MODE_ABS : HRTIMER_MODE_REL, | ||
| 229 | which_clock); | ||
| 130 | } | 230 | } |
| 131 | #endif | 231 | #endif |
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 4d7b2ce09c27..13d6881f908b 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c | |||
| @@ -49,8 +49,10 @@ | |||
| 49 | #include <linux/workqueue.h> | 49 | #include <linux/workqueue.h> |
| 50 | #include <linux/export.h> | 50 | #include <linux/export.h> |
| 51 | #include <linux/hashtable.h> | 51 | #include <linux/hashtable.h> |
| 52 | #include <linux/compat.h> | ||
| 52 | 53 | ||
| 53 | #include "timekeeping.h" | 54 | #include "timekeeping.h" |
| 55 | #include "posix-timers.h" | ||
| 54 | 56 | ||
| 55 | /* | 57 | /* |
| 56 | * Management arrays for POSIX timers. Timers are now kept in static hash table | 58 | * Management arrays for POSIX timers. Timers are now kept in static hash table |
| @@ -69,6 +71,10 @@ static struct kmem_cache *posix_timers_cache; | |||
| 69 | static DEFINE_HASHTABLE(posix_timers_hashtable, 9); | 71 | static DEFINE_HASHTABLE(posix_timers_hashtable, 9); |
| 70 | static DEFINE_SPINLOCK(hash_lock); | 72 | static DEFINE_SPINLOCK(hash_lock); |
| 71 | 73 | ||
| 74 | static const struct k_clock * const posix_clocks[]; | ||
| 75 | static const struct k_clock *clockid_to_kclock(const clockid_t id); | ||
| 76 | static const struct k_clock clock_realtime, clock_monotonic; | ||
| 77 | |||
| 72 | /* | 78 | /* |
| 73 | * we assume that the new SIGEV_THREAD_ID shares no bits with the other | 79 | * we assume that the new SIGEV_THREAD_ID shares no bits with the other |
| 74 | * SIGEV values. Here we put out an error if this assumption fails. | 80 | * SIGEV values. Here we put out an error if this assumption fails. |
| @@ -124,22 +130,6 @@ static DEFINE_SPINLOCK(hash_lock); | |||
| 124 | * have is CLOCK_REALTIME and its high res counter part, both of | 130 | * have is CLOCK_REALTIME and its high res counter part, both of |
| 125 | * which we beg off on and pass to do_sys_settimeofday(). | 131 | * which we beg off on and pass to do_sys_settimeofday(). |
| 126 | */ | 132 | */ |
| 127 | |||
| 128 | static struct k_clock posix_clocks[MAX_CLOCKS]; | ||
| 129 | |||
| 130 | /* | ||
| 131 | * These ones are defined below. | ||
| 132 | */ | ||
| 133 | static int common_nsleep(const clockid_t, int flags, struct timespec64 *t, | ||
| 134 | struct timespec __user *rmtp); | ||
| 135 | static int common_timer_create(struct k_itimer *new_timer); | ||
| 136 | static void common_timer_get(struct k_itimer *, struct itimerspec64 *); | ||
| 137 | static int common_timer_set(struct k_itimer *, int, | ||
| 138 | struct itimerspec64 *, struct itimerspec64 *); | ||
| 139 | static int common_timer_del(struct k_itimer *timer); | ||
| 140 | |||
| 141 | static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); | ||
| 142 | |||
| 143 | static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); | 133 | static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); |
| 144 | 134 | ||
| 145 | #define lock_timer(tid, flags) \ | 135 | #define lock_timer(tid, flags) \ |
| @@ -285,91 +275,23 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) | |||
| 285 | */ | 275 | */ |
| 286 | static __init int init_posix_timers(void) | 276 | static __init int init_posix_timers(void) |
| 287 | { | 277 | { |
| 288 | struct k_clock clock_realtime = { | ||
| 289 | .clock_getres = posix_get_hrtimer_res, | ||
| 290 | .clock_get = posix_clock_realtime_get, | ||
| 291 | .clock_set = posix_clock_realtime_set, | ||
| 292 | .clock_adj = posix_clock_realtime_adj, | ||
| 293 | .nsleep = common_nsleep, | ||
| 294 | .nsleep_restart = hrtimer_nanosleep_restart, | ||
| 295 | .timer_create = common_timer_create, | ||
| 296 | .timer_set = common_timer_set, | ||
| 297 | .timer_get = common_timer_get, | ||
| 298 | .timer_del = common_timer_del, | ||
| 299 | }; | ||
| 300 | struct k_clock clock_monotonic = { | ||
| 301 | .clock_getres = posix_get_hrtimer_res, | ||
| 302 | .clock_get = posix_ktime_get_ts, | ||
| 303 | .nsleep = common_nsleep, | ||
| 304 | .nsleep_restart = hrtimer_nanosleep_restart, | ||
| 305 | .timer_create = common_timer_create, | ||
| 306 | .timer_set = common_timer_set, | ||
| 307 | .timer_get = common_timer_get, | ||
| 308 | .timer_del = common_timer_del, | ||
| 309 | }; | ||
| 310 | struct k_clock clock_monotonic_raw = { | ||
| 311 | .clock_getres = posix_get_hrtimer_res, | ||
| 312 | .clock_get = posix_get_monotonic_raw, | ||
| 313 | }; | ||
| 314 | struct k_clock clock_realtime_coarse = { | ||
| 315 | .clock_getres = posix_get_coarse_res, | ||
| 316 | .clock_get = posix_get_realtime_coarse, | ||
| 317 | }; | ||
| 318 | struct k_clock clock_monotonic_coarse = { | ||
| 319 | .clock_getres = posix_get_coarse_res, | ||
| 320 | .clock_get = posix_get_monotonic_coarse, | ||
| 321 | }; | ||
| 322 | struct k_clock clock_tai = { | ||
| 323 | .clock_getres = posix_get_hrtimer_res, | ||
| 324 | .clock_get = posix_get_tai, | ||
| 325 | .nsleep = common_nsleep, | ||
| 326 | .nsleep_restart = hrtimer_nanosleep_restart, | ||
| 327 | .timer_create = common_timer_create, | ||
| 328 | .timer_set = common_timer_set, | ||
| 329 | .timer_get = common_timer_get, | ||
| 330 | .timer_del = common_timer_del, | ||
| 331 | }; | ||
| 332 | struct k_clock clock_boottime = { | ||
| 333 | .clock_getres = posix_get_hrtimer_res, | ||
| 334 | .clock_get = posix_get_boottime, | ||
| 335 | .nsleep = common_nsleep, | ||
| 336 | .nsleep_restart = hrtimer_nanosleep_restart, | ||
| 337 | .timer_create = common_timer_create, | ||
| 338 | .timer_set = common_timer_set, | ||
| 339 | .timer_get = common_timer_get, | ||
| 340 | .timer_del = common_timer_del, | ||
| 341 | }; | ||
| 342 | |||
| 343 | posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); | ||
| 344 | posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); | ||
| 345 | posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); | ||
| 346 | posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); | ||
| 347 | posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); | ||
| 348 | posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); | ||
| 349 | posix_timers_register_clock(CLOCK_TAI, &clock_tai); | ||
| 350 | |||
| 351 | posix_timers_cache = kmem_cache_create("posix_timers_cache", | 278 | posix_timers_cache = kmem_cache_create("posix_timers_cache", |
| 352 | sizeof (struct k_itimer), 0, SLAB_PANIC, | 279 | sizeof (struct k_itimer), 0, SLAB_PANIC, |
| 353 | NULL); | 280 | NULL); |
| 354 | return 0; | 281 | return 0; |
| 355 | } | 282 | } |
| 356 | |||
| 357 | __initcall(init_posix_timers); | 283 | __initcall(init_posix_timers); |
| 358 | 284 | ||
| 359 | static void schedule_next_timer(struct k_itimer *timr) | 285 | static void common_hrtimer_rearm(struct k_itimer *timr) |
| 360 | { | 286 | { |
| 361 | struct hrtimer *timer = &timr->it.real.timer; | 287 | struct hrtimer *timer = &timr->it.real.timer; |
| 362 | 288 | ||
| 363 | if (timr->it.real.interval == 0) | 289 | if (!timr->it_interval) |
| 364 | return; | 290 | return; |
| 365 | 291 | ||
| 366 | timr->it_overrun += (unsigned int) hrtimer_forward(timer, | 292 | timr->it_overrun += (unsigned int) hrtimer_forward(timer, |
| 367 | timer->base->get_time(), | 293 | timer->base->get_time(), |
| 368 | timr->it.real.interval); | 294 | timr->it_interval); |
| 369 | |||
| 370 | timr->it_overrun_last = timr->it_overrun; | ||
| 371 | timr->it_overrun = -1; | ||
| 372 | ++timr->it_requeue_pending; | ||
| 373 | hrtimer_restart(timer); | 295 | hrtimer_restart(timer); |
| 374 | } | 296 | } |
| 375 | 297 | ||
| @@ -384,24 +306,27 @@ static void schedule_next_timer(struct k_itimer *timr) | |||
| 384 | * To protect against the timer going away while the interrupt is queued, | 306 | * To protect against the timer going away while the interrupt is queued, |
| 385 | * we require that the it_requeue_pending flag be set. | 307 | * we require that the it_requeue_pending flag be set. |
| 386 | */ | 308 | */ |
| 387 | void do_schedule_next_timer(struct siginfo *info) | 309 | void posixtimer_rearm(struct siginfo *info) |
| 388 | { | 310 | { |
| 389 | struct k_itimer *timr; | 311 | struct k_itimer *timr; |
| 390 | unsigned long flags; | 312 | unsigned long flags; |
| 391 | 313 | ||
| 392 | timr = lock_timer(info->si_tid, &flags); | 314 | timr = lock_timer(info->si_tid, &flags); |
| 315 | if (!timr) | ||
| 316 | return; | ||
| 317 | |||
| 318 | if (timr->it_requeue_pending == info->si_sys_private) { | ||
| 319 | timr->kclock->timer_rearm(timr); | ||
| 393 | 320 | ||
| 394 | if (timr && timr->it_requeue_pending == info->si_sys_private) { | 321 | timr->it_active = 1; |
| 395 | if (timr->it_clock < 0) | 322 | timr->it_overrun_last = timr->it_overrun; |
| 396 | posix_cpu_timer_schedule(timr); | 323 | timr->it_overrun = -1; |
| 397 | else | 324 | ++timr->it_requeue_pending; |
| 398 | schedule_next_timer(timr); | ||
| 399 | 325 | ||
| 400 | info->si_overrun += timr->it_overrun_last; | 326 | info->si_overrun += timr->it_overrun_last; |
| 401 | } | 327 | } |
| 402 | 328 | ||
| 403 | if (timr) | 329 | unlock_timer(timr, flags); |
| 404 | unlock_timer(timr, flags); | ||
| 405 | } | 330 | } |
| 406 | 331 | ||
| 407 | int posix_timer_event(struct k_itimer *timr, int si_private) | 332 | int posix_timer_event(struct k_itimer *timr, int si_private) |
| @@ -410,12 +335,12 @@ int posix_timer_event(struct k_itimer *timr, int si_private) | |||
| 410 | int shared, ret = -1; | 335 | int shared, ret = -1; |
| 411 | /* | 336 | /* |
| 412 | * FIXME: if ->sigq is queued we can race with | 337 | * FIXME: if ->sigq is queued we can race with |
| 413 | * dequeue_signal()->do_schedule_next_timer(). | 338 | * dequeue_signal()->posixtimer_rearm(). |
| 414 | * | 339 | * |
| 415 | * If dequeue_signal() sees the "right" value of | 340 | * If dequeue_signal() sees the "right" value of |
| 416 | * si_sys_private it calls do_schedule_next_timer(). | 341 | * si_sys_private it calls posixtimer_rearm(). |
| 417 | * We re-queue ->sigq and drop ->it_lock(). | 342 | * We re-queue ->sigq and drop ->it_lock(). |
| 418 | * do_schedule_next_timer() locks the timer | 343 | * posixtimer_rearm() locks the timer |
| 419 | * and re-schedules it while ->sigq is pending. | 344 | * and re-schedules it while ->sigq is pending. |
| 420 | * Not really bad, but not that we want. | 345 | * Not really bad, but not that we want. |
| 421 | */ | 346 | */ |
| @@ -431,7 +356,6 @@ int posix_timer_event(struct k_itimer *timr, int si_private) | |||
| 431 | /* If we failed to send the signal the timer stops. */ | 356 | /* If we failed to send the signal the timer stops. */ |
| 432 | return ret > 0; | 357 | return ret > 0; |
| 433 | } | 358 | } |
| 434 | EXPORT_SYMBOL_GPL(posix_timer_event); | ||
| 435 | 359 | ||
| 436 | /* | 360 | /* |
| 437 | * This function gets called when a POSIX.1b interval timer expires. It | 361 | * This function gets called when a POSIX.1b interval timer expires. It |
| @@ -450,7 +374,8 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) | |||
| 450 | timr = container_of(timer, struct k_itimer, it.real.timer); | 374 | timr = container_of(timer, struct k_itimer, it.real.timer); |
| 451 | spin_lock_irqsave(&timr->it_lock, flags); | 375 | spin_lock_irqsave(&timr->it_lock, flags); |
| 452 | 376 | ||
| 453 | if (timr->it.real.interval != 0) | 377 | timr->it_active = 0; |
| 378 | if (timr->it_interval != 0) | ||
| 454 | si_private = ++timr->it_requeue_pending; | 379 | si_private = ++timr->it_requeue_pending; |
| 455 | 380 | ||
| 456 | if (posix_timer_event(timr, si_private)) { | 381 | if (posix_timer_event(timr, si_private)) { |
| @@ -459,7 +384,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) | |||
| 459 | * we will not get a call back to restart it AND | 384 | * we will not get a call back to restart it AND |
| 460 | * it should be restarted. | 385 | * it should be restarted. |
| 461 | */ | 386 | */ |
| 462 | if (timr->it.real.interval != 0) { | 387 | if (timr->it_interval != 0) { |
| 463 | ktime_t now = hrtimer_cb_get_time(timer); | 388 | ktime_t now = hrtimer_cb_get_time(timer); |
| 464 | 389 | ||
| 465 | /* | 390 | /* |
| @@ -488,15 +413,16 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) | |||
| 488 | { | 413 | { |
| 489 | ktime_t kj = NSEC_PER_SEC / HZ; | 414 | ktime_t kj = NSEC_PER_SEC / HZ; |
| 490 | 415 | ||
| 491 | if (timr->it.real.interval < kj) | 416 | if (timr->it_interval < kj) |
| 492 | now = ktime_add(now, kj); | 417 | now = ktime_add(now, kj); |
| 493 | } | 418 | } |
| 494 | #endif | 419 | #endif |
| 495 | timr->it_overrun += (unsigned int) | 420 | timr->it_overrun += (unsigned int) |
| 496 | hrtimer_forward(timer, now, | 421 | hrtimer_forward(timer, now, |
| 497 | timr->it.real.interval); | 422 | timr->it_interval); |
| 498 | ret = HRTIMER_RESTART; | 423 | ret = HRTIMER_RESTART; |
| 499 | ++timr->it_requeue_pending; | 424 | ++timr->it_requeue_pending; |
| 425 | timr->it_active = 1; | ||
| 500 | } | 426 | } |
| 501 | } | 427 | } |
| 502 | 428 | ||
| @@ -521,30 +447,6 @@ static struct pid *good_sigevent(sigevent_t * event) | |||
| 521 | return task_pid(rtn); | 447 | return task_pid(rtn); |
| 522 | } | 448 | } |
| 523 | 449 | ||
| 524 | void posix_timers_register_clock(const clockid_t clock_id, | ||
| 525 | struct k_clock *new_clock) | ||
| 526 | { | ||
| 527 | if ((unsigned) clock_id >= MAX_CLOCKS) { | ||
| 528 | printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", | ||
| 529 | clock_id); | ||
| 530 | return; | ||
| 531 | } | ||
| 532 | |||
| 533 | if (!new_clock->clock_get) { | ||
| 534 | printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", | ||
| 535 | clock_id); | ||
| 536 | return; | ||
| 537 | } | ||
| 538 | if (!new_clock->clock_getres) { | ||
| 539 | printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", | ||
| 540 | clock_id); | ||
| 541 | return; | ||
| 542 | } | ||
| 543 | |||
| 544 | posix_clocks[clock_id] = *new_clock; | ||
| 545 | } | ||
| 546 | EXPORT_SYMBOL_GPL(posix_timers_register_clock); | ||
| 547 | |||
| 548 | static struct k_itimer * alloc_posix_timer(void) | 450 | static struct k_itimer * alloc_posix_timer(void) |
| 549 | { | 451 | { |
| 550 | struct k_itimer *tmr; | 452 | struct k_itimer *tmr; |
| @@ -581,17 +483,6 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) | |||
| 581 | call_rcu(&tmr->it.rcu, k_itimer_rcu_free); | 483 | call_rcu(&tmr->it.rcu, k_itimer_rcu_free); |
| 582 | } | 484 | } |
| 583 | 485 | ||
| 584 | static struct k_clock *clockid_to_kclock(const clockid_t id) | ||
| 585 | { | ||
| 586 | if (id < 0) | ||
| 587 | return (id & CLOCKFD_MASK) == CLOCKFD ? | ||
| 588 | &clock_posix_dynamic : &clock_posix_cpu; | ||
| 589 | |||
| 590 | if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) | ||
| 591 | return NULL; | ||
| 592 | return &posix_clocks[id]; | ||
| 593 | } | ||
| 594 | |||
| 595 | static int common_timer_create(struct k_itimer *new_timer) | 486 | static int common_timer_create(struct k_itimer *new_timer) |
| 596 | { | 487 | { |
| 597 | hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); | 488 | hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); |
| @@ -599,15 +490,12 @@ static int common_timer_create(struct k_itimer *new_timer) | |||
| 599 | } | 490 | } |
| 600 | 491 | ||
| 601 | /* Create a POSIX.1b interval timer. */ | 492 | /* Create a POSIX.1b interval timer. */ |
| 602 | 493 | static int do_timer_create(clockid_t which_clock, struct sigevent *event, | |
| 603 | SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | 494 | timer_t __user *created_timer_id) |
| 604 | struct sigevent __user *, timer_event_spec, | ||
| 605 | timer_t __user *, created_timer_id) | ||
| 606 | { | 495 | { |
| 607 | struct k_clock *kc = clockid_to_kclock(which_clock); | 496 | const struct k_clock *kc = clockid_to_kclock(which_clock); |
| 608 | struct k_itimer *new_timer; | 497 | struct k_itimer *new_timer; |
| 609 | int error, new_timer_id; | 498 | int error, new_timer_id; |
| 610 | sigevent_t event; | ||
| 611 | int it_id_set = IT_ID_NOT_SET; | 499 | int it_id_set = IT_ID_NOT_SET; |
| 612 | 500 | ||
| 613 | if (!kc) | 501 | if (!kc) |
| @@ -629,31 +517,28 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | |||
| 629 | it_id_set = IT_ID_SET; | 517 | it_id_set = IT_ID_SET; |
| 630 | new_timer->it_id = (timer_t) new_timer_id; | 518 | new_timer->it_id = (timer_t) new_timer_id; |
| 631 | new_timer->it_clock = which_clock; | 519 | new_timer->it_clock = which_clock; |
| 520 | new_timer->kclock = kc; | ||
| 632 | new_timer->it_overrun = -1; | 521 | new_timer->it_overrun = -1; |
| 633 | 522 | ||
| 634 | if (timer_event_spec) { | 523 | if (event) { |
| 635 | if (copy_from_user(&event, timer_event_spec, sizeof (event))) { | ||
| 636 | error = -EFAULT; | ||
| 637 | goto out; | ||
| 638 | } | ||
| 639 | rcu_read_lock(); | 524 | rcu_read_lock(); |
| 640 | new_timer->it_pid = get_pid(good_sigevent(&event)); | 525 | new_timer->it_pid = get_pid(good_sigevent(event)); |
| 641 | rcu_read_unlock(); | 526 | rcu_read_unlock(); |
| 642 | if (!new_timer->it_pid) { | 527 | if (!new_timer->it_pid) { |
| 643 | error = -EINVAL; | 528 | error = -EINVAL; |
| 644 | goto out; | 529 | goto out; |
| 645 | } | 530 | } |
| 531 | new_timer->it_sigev_notify = event->sigev_notify; | ||
| 532 | new_timer->sigq->info.si_signo = event->sigev_signo; | ||
| 533 | new_timer->sigq->info.si_value = event->sigev_value; | ||
| 646 | } else { | 534 | } else { |
| 647 | memset(&event.sigev_value, 0, sizeof(event.sigev_value)); | 535 | new_timer->it_sigev_notify = SIGEV_SIGNAL; |
| 648 | event.sigev_notify = SIGEV_SIGNAL; | 536 | new_timer->sigq->info.si_signo = SIGALRM; |
| 649 | event.sigev_signo = SIGALRM; | 537 | memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t)); |
| 650 | event.sigev_value.sival_int = new_timer->it_id; | 538 | new_timer->sigq->info.si_value.sival_int = new_timer->it_id; |
| 651 | new_timer->it_pid = get_pid(task_tgid(current)); | 539 | new_timer->it_pid = get_pid(task_tgid(current)); |
| 652 | } | 540 | } |
| 653 | 541 | ||
| 654 | new_timer->it_sigev_notify = event.sigev_notify; | ||
| 655 | new_timer->sigq->info.si_signo = event.sigev_signo; | ||
| 656 | new_timer->sigq->info.si_value = event.sigev_value; | ||
| 657 | new_timer->sigq->info.si_tid = new_timer->it_id; | 542 | new_timer->sigq->info.si_tid = new_timer->it_id; |
| 658 | new_timer->sigq->info.si_code = SI_TIMER; | 543 | new_timer->sigq->info.si_code = SI_TIMER; |
| 659 | 544 | ||
| @@ -684,6 +569,36 @@ out: | |||
| 684 | return error; | 569 | return error; |
| 685 | } | 570 | } |
| 686 | 571 | ||
| 572 | SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, | ||
| 573 | struct sigevent __user *, timer_event_spec, | ||
| 574 | timer_t __user *, created_timer_id) | ||
| 575 | { | ||
| 576 | if (timer_event_spec) { | ||
| 577 | sigevent_t event; | ||
| 578 | |||
| 579 | if (copy_from_user(&event, timer_event_spec, sizeof (event))) | ||
| 580 | return -EFAULT; | ||
| 581 | return do_timer_create(which_clock, &event, created_timer_id); | ||
| 582 | } | ||
| 583 | return do_timer_create(which_clock, NULL, created_timer_id); | ||
| 584 | } | ||
| 585 | |||
| 586 | #ifdef CONFIG_COMPAT | ||
| 587 | COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, | ||
| 588 | struct compat_sigevent __user *, timer_event_spec, | ||
| 589 | timer_t __user *, created_timer_id) | ||
| 590 | { | ||
| 591 | if (timer_event_spec) { | ||
| 592 | sigevent_t event; | ||
| 593 | |||
| 594 | if (get_compat_sigevent(&event, timer_event_spec)) | ||
| 595 | return -EFAULT; | ||
| 596 | return do_timer_create(which_clock, &event, created_timer_id); | ||
| 597 | } | ||
| 598 | return do_timer_create(which_clock, NULL, created_timer_id); | ||
| 599 | } | ||
| 600 | #endif | ||
| 601 | |||
| 687 | /* | 602 | /* |
| 688 | * Locking issues: We need to protect the result of the id look up until | 603 | * Locking issues: We need to protect the result of the id look up until |
| 689 | * we get the timer locked down so it is not deleted under us. The | 604 | * we get the timer locked down so it is not deleted under us. The |
| @@ -717,6 +632,20 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) | |||
| 717 | return NULL; | 632 | return NULL; |
| 718 | } | 633 | } |
| 719 | 634 | ||
| 635 | static ktime_t common_hrtimer_remaining(struct k_itimer *timr, ktime_t now) | ||
| 636 | { | ||
| 637 | struct hrtimer *timer = &timr->it.real.timer; | ||
| 638 | |||
| 639 | return __hrtimer_expires_remaining_adjusted(timer, now); | ||
| 640 | } | ||
| 641 | |||
| 642 | static int common_hrtimer_forward(struct k_itimer *timr, ktime_t now) | ||
| 643 | { | ||
| 644 | struct hrtimer *timer = &timr->it.real.timer; | ||
| 645 | |||
| 646 | return (int)hrtimer_forward(timer, now, timr->it_interval); | ||
| 647 | } | ||
| 648 | |||
| 720 | /* | 649 | /* |
| 721 | * Get the time remaining on a POSIX.1b interval timer. This function | 650 | * Get the time remaining on a POSIX.1b interval timer. This function |
| 722 | * is ALWAYS called with spin_lock_irq on the timer, thus it must not | 651 | * is ALWAYS called with spin_lock_irq on the timer, thus it must not |
| @@ -733,55 +662,61 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) | |||
| 733 | * it is the same as a requeue pending timer WRT to what we should | 662 | * it is the same as a requeue pending timer WRT to what we should |
| 734 | * report. | 663 | * report. |
| 735 | */ | 664 | */ |
| 736 | static void | 665 | void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) |
| 737 | common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) | ||
| 738 | { | 666 | { |
| 667 | const struct k_clock *kc = timr->kclock; | ||
| 739 | ktime_t now, remaining, iv; | 668 | ktime_t now, remaining, iv; |
| 740 | struct hrtimer *timer = &timr->it.real.timer; | 669 | struct timespec64 ts64; |
| 741 | 670 | bool sig_none; | |
| 742 | memset(cur_setting, 0, sizeof(*cur_setting)); | ||
| 743 | 671 | ||
| 744 | iv = timr->it.real.interval; | 672 | sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; |
| 673 | iv = timr->it_interval; | ||
| 745 | 674 | ||
| 746 | /* interval timer ? */ | 675 | /* interval timer ? */ |
| 747 | if (iv) | 676 | if (iv) { |
| 748 | cur_setting->it_interval = ktime_to_timespec64(iv); | 677 | cur_setting->it_interval = ktime_to_timespec64(iv); |
| 749 | else if (!hrtimer_active(timer) && | 678 | } else if (!timr->it_active) { |
| 750 | (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) | 679 | /* |
| 751 | return; | 680 | * SIGEV_NONE oneshot timers are never queued. Check them |
| 681 | * below. | ||
| 682 | */ | ||
| 683 | if (!sig_none) | ||
| 684 | return; | ||
| 685 | } | ||
| 752 | 686 | ||
| 753 | now = timer->base->get_time(); | 687 | /* |
| 688 | * The timespec64 based conversion is suboptimal, but it's not | ||
| 689 | * worth to implement yet another callback. | ||
| 690 | */ | ||
| 691 | kc->clock_get(timr->it_clock, &ts64); | ||
| 692 | now = timespec64_to_ktime(ts64); | ||
| 754 | 693 | ||
| 755 | /* | 694 | /* |
| 756 | * When a requeue is pending or this is a SIGEV_NONE | 695 | * When a requeue is pending or this is a SIGEV_NONE timer move the |
| 757 | * timer move the expiry time forward by intervals, so | 696 | * expiry time forward by intervals, so expiry is > now. |
| 758 | * expiry is > now. | ||
| 759 | */ | 697 | */ |
| 760 | if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || | 698 | if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none)) |
| 761 | (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) | 699 | timr->it_overrun += kc->timer_forward(timr, now); |
| 762 | timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); | ||
| 763 | 700 | ||
| 764 | remaining = __hrtimer_expires_remaining_adjusted(timer, now); | 701 | remaining = kc->timer_remaining(timr, now); |
| 765 | /* Return 0 only, when the timer is expired and not pending */ | 702 | /* Return 0 only, when the timer is expired and not pending */ |
| 766 | if (remaining <= 0) { | 703 | if (remaining <= 0) { |
| 767 | /* | 704 | /* |
| 768 | * A single shot SIGEV_NONE timer must return 0, when | 705 | * A single shot SIGEV_NONE timer must return 0, when |
| 769 | * it is expired ! | 706 | * it is expired ! |
| 770 | */ | 707 | */ |
| 771 | if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) | 708 | if (!sig_none) |
| 772 | cur_setting->it_value.tv_nsec = 1; | 709 | cur_setting->it_value.tv_nsec = 1; |
| 773 | } else | 710 | } else { |
| 774 | cur_setting->it_value = ktime_to_timespec64(remaining); | 711 | cur_setting->it_value = ktime_to_timespec64(remaining); |
| 712 | } | ||
| 775 | } | 713 | } |
| 776 | 714 | ||
| 777 | /* Get the time remaining on a POSIX.1b interval timer. */ | 715 | /* Get the time remaining on a POSIX.1b interval timer. */ |
| 778 | SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, | 716 | static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) |
| 779 | struct itimerspec __user *, setting) | ||
| 780 | { | 717 | { |
| 781 | struct itimerspec64 cur_setting64; | ||
| 782 | struct itimerspec cur_setting; | ||
| 783 | struct k_itimer *timr; | 718 | struct k_itimer *timr; |
| 784 | struct k_clock *kc; | 719 | const struct k_clock *kc; |
| 785 | unsigned long flags; | 720 | unsigned long flags; |
| 786 | int ret = 0; | 721 | int ret = 0; |
| 787 | 722 | ||
| @@ -789,20 +724,45 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, | |||
| 789 | if (!timr) | 724 | if (!timr) |
| 790 | return -EINVAL; | 725 | return -EINVAL; |
| 791 | 726 | ||
| 792 | kc = clockid_to_kclock(timr->it_clock); | 727 | memset(setting, 0, sizeof(*setting)); |
| 728 | kc = timr->kclock; | ||
| 793 | if (WARN_ON_ONCE(!kc || !kc->timer_get)) | 729 | if (WARN_ON_ONCE(!kc || !kc->timer_get)) |
| 794 | ret = -EINVAL; | 730 | ret = -EINVAL; |
| 795 | else | 731 | else |
| 796 | kc->timer_get(timr, &cur_setting64); | 732 | kc->timer_get(timr, setting); |
| 797 | 733 | ||
| 798 | unlock_timer(timr, flags); | 734 | unlock_timer(timr, flags); |
| 735 | return ret; | ||
| 736 | } | ||
| 799 | 737 | ||
| 800 | cur_setting = itimerspec64_to_itimerspec(&cur_setting64); | 738 | /* Get the time remaining on a POSIX.1b interval timer. */ |
| 801 | if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) | 739 | SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, |
| 802 | return -EFAULT; | 740 | struct itimerspec __user *, setting) |
| 741 | { | ||
| 742 | struct itimerspec64 cur_setting; | ||
| 743 | |||
| 744 | int ret = do_timer_gettime(timer_id, &cur_setting); | ||
| 745 | if (!ret) { | ||
| 746 | if (put_itimerspec64(&cur_setting, setting)) | ||
| 747 | ret = -EFAULT; | ||
| 748 | } | ||
| 749 | return ret; | ||
| 750 | } | ||
| 751 | |||
| 752 | #ifdef CONFIG_COMPAT | ||
| 753 | COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, | ||
| 754 | struct compat_itimerspec __user *, setting) | ||
| 755 | { | ||
| 756 | struct itimerspec64 cur_setting; | ||
| 803 | 757 | ||
| 758 | int ret = do_timer_gettime(timer_id, &cur_setting); | ||
| 759 | if (!ret) { | ||
| 760 | if (put_compat_itimerspec64(&cur_setting, setting)) | ||
| 761 | ret = -EFAULT; | ||
| 762 | } | ||
| 804 | return ret; | 763 | return ret; |
| 805 | } | 764 | } |
| 765 | #endif | ||
| 806 | 766 | ||
| 807 | /* | 767 | /* |
| 808 | * Get the number of overruns of a POSIX.1b interval timer. This is to | 768 | * Get the number of overruns of a POSIX.1b interval timer. This is to |
| @@ -810,7 +770,7 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, | |||
| 810 | * accumulating overruns on the next timer. The overrun is frozen when | 770 | * accumulating overruns on the next timer. The overrun is frozen when |
| 811 | * the signal is delivered, either at the notify time (if the info block | 771 | * the signal is delivered, either at the notify time (if the info block |
| 812 | * is not queued) or at the actual delivery time (as we are informed by | 772 | * is not queued) or at the actual delivery time (as we are informed by |
| 813 | * the call back to do_schedule_next_timer(). So all we need to do is | 773 | * the call back to posixtimer_rearm(). So all we need to do is |
| 814 | * to pick up the frozen overrun. | 774 | * to pick up the frozen overrun. |
| 815 | */ | 775 | */ |
| 816 | SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) | 776 | SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) |
| @@ -829,117 +789,175 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) | |||
| 829 | return overrun; | 789 | return overrun; |
| 830 | } | 790 | } |
| 831 | 791 | ||
| 832 | /* Set a POSIX.1b interval timer. */ | 792 | static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, |
| 833 | /* timr->it_lock is taken. */ | 793 | bool absolute, bool sigev_none) |
| 834 | static int | ||
| 835 | common_timer_set(struct k_itimer *timr, int flags, | ||
| 836 | struct itimerspec64 *new_setting, struct itimerspec64 *old_setting) | ||
| 837 | { | 794 | { |
| 838 | struct hrtimer *timer = &timr->it.real.timer; | 795 | struct hrtimer *timer = &timr->it.real.timer; |
| 839 | enum hrtimer_mode mode; | 796 | enum hrtimer_mode mode; |
| 840 | 797 | ||
| 798 | mode = absolute ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; | ||
| 799 | /* | ||
| 800 | * Posix magic: Relative CLOCK_REALTIME timers are not affected by | ||
| 801 | * clock modifications, so they become CLOCK_MONOTONIC based under the | ||
| 802 | * hood. See hrtimer_init(). Update timr->kclock, so the generic | ||
| 803 | * functions which use timr->kclock->clock_get() work. | ||
| 804 | * | ||
| 805 | * Note: it_clock stays unmodified, because the next timer_set() might | ||
| 806 | * use ABSTIME, so it needs to switch back. | ||
| 807 | */ | ||
| 808 | if (timr->it_clock == CLOCK_REALTIME) | ||
| 809 | timr->kclock = absolute ? &clock_realtime : &clock_monotonic; | ||
| 810 | |||
| 811 | hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); | ||
| 812 | timr->it.real.timer.function = posix_timer_fn; | ||
| 813 | |||
| 814 | if (!absolute) | ||
| 815 | expires = ktime_add_safe(expires, timer->base->get_time()); | ||
| 816 | hrtimer_set_expires(timer, expires); | ||
| 817 | |||
| 818 | if (!sigev_none) | ||
| 819 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS); | ||
| 820 | } | ||
| 821 | |||
| 822 | static int common_hrtimer_try_to_cancel(struct k_itimer *timr) | ||
| 823 | { | ||
| 824 | return hrtimer_try_to_cancel(&timr->it.real.timer); | ||
| 825 | } | ||
| 826 | |||
| 827 | /* Set a POSIX.1b interval timer. */ | ||
| 828 | int common_timer_set(struct k_itimer *timr, int flags, | ||
| 829 | struct itimerspec64 *new_setting, | ||
| 830 | struct itimerspec64 *old_setting) | ||
| 831 | { | ||
| 832 | const struct k_clock *kc = timr->kclock; | ||
| 833 | bool sigev_none; | ||
| 834 | ktime_t expires; | ||
| 835 | |||
| 841 | if (old_setting) | 836 | if (old_setting) |
| 842 | common_timer_get(timr, old_setting); | 837 | common_timer_get(timr, old_setting); |
| 843 | 838 | ||
| 844 | /* disable the timer */ | 839 | /* Prevent rearming by clearing the interval */ |
| 845 | timr->it.real.interval = 0; | 840 | timr->it_interval = 0; |
| 846 | /* | 841 | /* |
| 847 | * careful here. If smp we could be in the "fire" routine which will | 842 | * Careful here. On SMP systems the timer expiry function could be |
| 848 | * be spinning as we hold the lock. But this is ONLY an SMP issue. | 843 | * active and spinning on timr->it_lock. |
| 849 | */ | 844 | */ |
| 850 | if (hrtimer_try_to_cancel(timer) < 0) | 845 | if (kc->timer_try_to_cancel(timr) < 0) |
| 851 | return TIMER_RETRY; | 846 | return TIMER_RETRY; |
| 852 | 847 | ||
| 853 | timr->it_requeue_pending = (timr->it_requeue_pending + 2) & | 848 | timr->it_active = 0; |
| 849 | timr->it_requeue_pending = (timr->it_requeue_pending + 2) & | ||
| 854 | ~REQUEUE_PENDING; | 850 | ~REQUEUE_PENDING; |
| 855 | timr->it_overrun_last = 0; | 851 | timr->it_overrun_last = 0; |
| 856 | 852 | ||
| 857 | /* switch off the timer when it_value is zero */ | 853 | /* Switch off the timer when it_value is zero */ |
| 858 | if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) | 854 | if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) |
| 859 | return 0; | 855 | return 0; |
| 860 | 856 | ||
| 861 | mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; | 857 | timr->it_interval = timespec64_to_ktime(new_setting->it_interval); |
| 862 | hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); | 858 | expires = timespec64_to_ktime(new_setting->it_value); |
| 863 | timr->it.real.timer.function = posix_timer_fn; | 859 | sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; |
| 864 | |||
| 865 | hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value)); | ||
| 866 | |||
| 867 | /* Convert interval */ | ||
| 868 | timr->it.real.interval = timespec64_to_ktime(new_setting->it_interval); | ||
| 869 | |||
| 870 | /* SIGEV_NONE timers are not queued ! See common_timer_get */ | ||
| 871 | if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { | ||
| 872 | /* Setup correct expiry time for relative timers */ | ||
| 873 | if (mode == HRTIMER_MODE_REL) { | ||
| 874 | hrtimer_add_expires(timer, timer->base->get_time()); | ||
| 875 | } | ||
| 876 | return 0; | ||
| 877 | } | ||
| 878 | 860 | ||
| 879 | hrtimer_start_expires(timer, mode); | 861 | kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); |
| 862 | timr->it_active = !sigev_none; | ||
| 880 | return 0; | 863 | return 0; |
| 881 | } | 864 | } |
| 882 | 865 | ||
| 883 | /* Set a POSIX.1b interval timer */ | 866 | static int do_timer_settime(timer_t timer_id, int flags, |
| 884 | SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, | 867 | struct itimerspec64 *new_spec64, |
| 885 | const struct itimerspec __user *, new_setting, | 868 | struct itimerspec64 *old_spec64) |
| 886 | struct itimerspec __user *, old_setting) | ||
| 887 | { | 869 | { |
| 888 | struct itimerspec64 new_spec64, old_spec64; | 870 | const struct k_clock *kc; |
| 889 | struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL; | ||
| 890 | struct itimerspec new_spec, old_spec; | ||
| 891 | struct k_itimer *timr; | 871 | struct k_itimer *timr; |
| 892 | unsigned long flag; | 872 | unsigned long flag; |
| 893 | struct k_clock *kc; | ||
| 894 | int error = 0; | 873 | int error = 0; |
| 895 | 874 | ||
| 896 | if (!new_setting) | 875 | if (!timespec64_valid(&new_spec64->it_interval) || |
| 876 | !timespec64_valid(&new_spec64->it_value)) | ||
| 897 | return -EINVAL; | 877 | return -EINVAL; |
| 898 | 878 | ||
| 899 | if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) | 879 | if (old_spec64) |
| 900 | return -EFAULT; | 880 | memset(old_spec64, 0, sizeof(*old_spec64)); |
| 901 | new_spec64 = itimerspec_to_itimerspec64(&new_spec); | ||
| 902 | |||
| 903 | if (!timespec64_valid(&new_spec64.it_interval) || | ||
| 904 | !timespec64_valid(&new_spec64.it_value)) | ||
| 905 | return -EINVAL; | ||
| 906 | retry: | 881 | retry: |
| 907 | timr = lock_timer(timer_id, &flag); | 882 | timr = lock_timer(timer_id, &flag); |
| 908 | if (!timr) | 883 | if (!timr) |
| 909 | return -EINVAL; | 884 | return -EINVAL; |
| 910 | 885 | ||
| 911 | kc = clockid_to_kclock(timr->it_clock); | 886 | kc = timr->kclock; |
| 912 | if (WARN_ON_ONCE(!kc || !kc->timer_set)) | 887 | if (WARN_ON_ONCE(!kc || !kc->timer_set)) |
| 913 | error = -EINVAL; | 888 | error = -EINVAL; |
| 914 | else | 889 | else |
| 915 | error = kc->timer_set(timr, flags, &new_spec64, rtn); | 890 | error = kc->timer_set(timr, flags, new_spec64, old_spec64); |
| 916 | 891 | ||
| 917 | unlock_timer(timr, flag); | 892 | unlock_timer(timr, flag); |
| 918 | if (error == TIMER_RETRY) { | 893 | if (error == TIMER_RETRY) { |
| 919 | rtn = NULL; // We already got the old time... | 894 | old_spec64 = NULL; // We already got the old time... |
| 920 | goto retry; | 895 | goto retry; |
| 921 | } | 896 | } |
| 922 | 897 | ||
| 923 | old_spec = itimerspec64_to_itimerspec(&old_spec64); | 898 | return error; |
| 924 | if (old_setting && !error && | 899 | } |
| 925 | copy_to_user(old_setting, &old_spec, sizeof (old_spec))) | 900 | |
| 926 | error = -EFAULT; | 901 | /* Set a POSIX.1b interval timer */ |
| 902 | SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, | ||
| 903 | const struct itimerspec __user *, new_setting, | ||
| 904 | struct itimerspec __user *, old_setting) | ||
| 905 | { | ||
| 906 | struct itimerspec64 new_spec, old_spec; | ||
| 907 | struct itimerspec64 *rtn = old_setting ? &old_spec : NULL; | ||
| 908 | int error = 0; | ||
| 909 | |||
| 910 | if (!new_setting) | ||
| 911 | return -EINVAL; | ||
| 927 | 912 | ||
| 913 | if (get_itimerspec64(&new_spec, new_setting)) | ||
| 914 | return -EFAULT; | ||
| 915 | |||
| 916 | error = do_timer_settime(timer_id, flags, &new_spec, rtn); | ||
| 917 | if (!error && old_setting) { | ||
| 918 | if (put_itimerspec64(&old_spec, old_setting)) | ||
| 919 | error = -EFAULT; | ||
| 920 | } | ||
| 921 | return error; | ||
| 922 | } | ||
| 923 | |||
| 924 | #ifdef CONFIG_COMPAT | ||
| 925 | COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, | ||
| 926 | struct compat_itimerspec __user *, new, | ||
| 927 | struct compat_itimerspec __user *, old) | ||
| 928 | { | ||
| 929 | struct itimerspec64 new_spec, old_spec; | ||
| 930 | struct itimerspec64 *rtn = old ? &old_spec : NULL; | ||
| 931 | int error = 0; | ||
| 932 | |||
| 933 | if (!new) | ||
| 934 | return -EINVAL; | ||
| 935 | if (get_compat_itimerspec64(&new_spec, new)) | ||
| 936 | return -EFAULT; | ||
| 937 | |||
| 938 | error = do_timer_settime(timer_id, flags, &new_spec, rtn); | ||
| 939 | if (!error && old) { | ||
| 940 | if (put_compat_itimerspec64(&old_spec, old)) | ||
| 941 | error = -EFAULT; | ||
| 942 | } | ||
| 928 | return error; | 943 | return error; |
| 929 | } | 944 | } |
| 945 | #endif | ||
| 930 | 946 | ||
| 931 | static int common_timer_del(struct k_itimer *timer) | 947 | int common_timer_del(struct k_itimer *timer) |
| 932 | { | 948 | { |
| 933 | timer->it.real.interval = 0; | 949 | const struct k_clock *kc = timer->kclock; |
| 934 | 950 | ||
| 935 | if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) | 951 | timer->it_interval = 0; |
| 952 | if (kc->timer_try_to_cancel(timer) < 0) | ||
| 936 | return TIMER_RETRY; | 953 | return TIMER_RETRY; |
| 954 | timer->it_active = 0; | ||
| 937 | return 0; | 955 | return 0; |
| 938 | } | 956 | } |
| 939 | 957 | ||
| 940 | static inline int timer_delete_hook(struct k_itimer *timer) | 958 | static inline int timer_delete_hook(struct k_itimer *timer) |
| 941 | { | 959 | { |
| 942 | struct k_clock *kc = clockid_to_kclock(timer->it_clock); | 960 | const struct k_clock *kc = timer->kclock; |
| 943 | 961 | ||
| 944 | if (WARN_ON_ONCE(!kc || !kc->timer_del)) | 962 | if (WARN_ON_ONCE(!kc || !kc->timer_del)) |
| 945 | return -EINVAL; | 963 | return -EINVAL; |
| @@ -1018,35 +1036,31 @@ void exit_itimers(struct signal_struct *sig) | |||
| 1018 | SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, | 1036 | SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, |
| 1019 | const struct timespec __user *, tp) | 1037 | const struct timespec __user *, tp) |
| 1020 | { | 1038 | { |
| 1021 | struct k_clock *kc = clockid_to_kclock(which_clock); | 1039 | const struct k_clock *kc = clockid_to_kclock(which_clock); |
| 1022 | struct timespec64 new_tp64; | 1040 | struct timespec64 new_tp; |
| 1023 | struct timespec new_tp; | ||
| 1024 | 1041 | ||
| 1025 | if (!kc || !kc->clock_set) | 1042 | if (!kc || !kc->clock_set) |
| 1026 | return -EINVAL; | 1043 | return -EINVAL; |
| 1027 | 1044 | ||
| 1028 | if (copy_from_user(&new_tp, tp, sizeof (*tp))) | 1045 | if (get_timespec64(&new_tp, tp)) |
| 1029 | return -EFAULT; | 1046 | return -EFAULT; |
| 1030 | new_tp64 = timespec_to_timespec64(new_tp); | ||
| 1031 | 1047 | ||
| 1032 | return kc->clock_set(which_clock, &new_tp64); | 1048 | return kc->clock_set(which_clock, &new_tp); |
| 1033 | } | 1049 | } |
| 1034 | 1050 | ||
| 1035 | SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, | 1051 | SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, |
| 1036 | struct timespec __user *,tp) | 1052 | struct timespec __user *,tp) |
| 1037 | { | 1053 | { |
| 1038 | struct k_clock *kc = clockid_to_kclock(which_clock); | 1054 | const struct k_clock *kc = clockid_to_kclock(which_clock); |
| 1039 | struct timespec64 kernel_tp64; | 1055 | struct timespec64 kernel_tp; |
| 1040 | struct timespec kernel_tp; | ||
| 1041 | int error; | 1056 | int error; |
| 1042 | 1057 | ||
| 1043 | if (!kc) | 1058 | if (!kc) |
| 1044 | return -EINVAL; | 1059 | return -EINVAL; |
| 1045 | 1060 | ||
| 1046 | error = kc->clock_get(which_clock, &kernel_tp64); | 1061 | error = kc->clock_get(which_clock, &kernel_tp); |
| 1047 | kernel_tp = timespec64_to_timespec(kernel_tp64); | ||
| 1048 | 1062 | ||
| 1049 | if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) | 1063 | if (!error && put_timespec64(&kernel_tp, tp)) |
| 1050 | error = -EFAULT; | 1064 | error = -EFAULT; |
| 1051 | 1065 | ||
| 1052 | return error; | 1066 | return error; |
| @@ -1055,7 +1069,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, | |||
| 1055 | SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, | 1069 | SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, |
| 1056 | struct timex __user *, utx) | 1070 | struct timex __user *, utx) |
| 1057 | { | 1071 | { |
| 1058 | struct k_clock *kc = clockid_to_kclock(which_clock); | 1072 | const struct k_clock *kc = clockid_to_kclock(which_clock); |
| 1059 | struct timex ktx; | 1073 | struct timex ktx; |
| 1060 | int err; | 1074 | int err; |
| 1061 | 1075 | ||
| @@ -1078,30 +1092,106 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, | |||
| 1078 | SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, | 1092 | SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, |
| 1079 | struct timespec __user *, tp) | 1093 | struct timespec __user *, tp) |
| 1080 | { | 1094 | { |
| 1081 | struct k_clock *kc = clockid_to_kclock(which_clock); | 1095 | const struct k_clock *kc = clockid_to_kclock(which_clock); |
| 1082 | struct timespec64 rtn_tp64; | 1096 | struct timespec64 rtn_tp; |
| 1083 | struct timespec rtn_tp; | ||
| 1084 | int error; | 1097 | int error; |
| 1085 | 1098 | ||
| 1086 | if (!kc) | 1099 | if (!kc) |
| 1087 | return -EINVAL; | 1100 | return -EINVAL; |
| 1088 | 1101 | ||
| 1089 | error = kc->clock_getres(which_clock, &rtn_tp64); | 1102 | error = kc->clock_getres(which_clock, &rtn_tp); |
| 1090 | rtn_tp = timespec64_to_timespec(rtn_tp64); | ||
| 1091 | 1103 | ||
| 1092 | if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) | 1104 | if (!error && tp && put_timespec64(&rtn_tp, tp)) |
| 1093 | error = -EFAULT; | 1105 | error = -EFAULT; |
| 1094 | 1106 | ||
| 1095 | return error; | 1107 | return error; |
| 1096 | } | 1108 | } |
| 1097 | 1109 | ||
| 1110 | #ifdef CONFIG_COMPAT | ||
| 1111 | |||
| 1112 | COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, | ||
| 1113 | struct compat_timespec __user *, tp) | ||
| 1114 | { | ||
| 1115 | const struct k_clock *kc = clockid_to_kclock(which_clock); | ||
| 1116 | struct timespec64 ts; | ||
| 1117 | |||
| 1118 | if (!kc || !kc->clock_set) | ||
| 1119 | return -EINVAL; | ||
| 1120 | |||
| 1121 | if (compat_get_timespec64(&ts, tp)) | ||
| 1122 | return -EFAULT; | ||
| 1123 | |||
| 1124 | return kc->clock_set(which_clock, &ts); | ||
| 1125 | } | ||
| 1126 | |||
| 1127 | COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, | ||
| 1128 | struct compat_timespec __user *, tp) | ||
| 1129 | { | ||
| 1130 | const struct k_clock *kc = clockid_to_kclock(which_clock); | ||
| 1131 | struct timespec64 ts; | ||
| 1132 | int err; | ||
| 1133 | |||
| 1134 | if (!kc) | ||
| 1135 | return -EINVAL; | ||
| 1136 | |||
| 1137 | err = kc->clock_get(which_clock, &ts); | ||
| 1138 | |||
| 1139 | if (!err && compat_put_timespec64(&ts, tp)) | ||
| 1140 | err = -EFAULT; | ||
| 1141 | |||
| 1142 | return err; | ||
| 1143 | } | ||
| 1144 | |||
| 1145 | COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, | ||
| 1146 | struct compat_timex __user *, utp) | ||
| 1147 | { | ||
| 1148 | const struct k_clock *kc = clockid_to_kclock(which_clock); | ||
| 1149 | struct timex ktx; | ||
| 1150 | int err; | ||
| 1151 | |||
| 1152 | if (!kc) | ||
| 1153 | return -EINVAL; | ||
| 1154 | if (!kc->clock_adj) | ||
| 1155 | return -EOPNOTSUPP; | ||
| 1156 | |||
| 1157 | err = compat_get_timex(&ktx, utp); | ||
| 1158 | if (err) | ||
| 1159 | return err; | ||
| 1160 | |||
| 1161 | err = kc->clock_adj(which_clock, &ktx); | ||
| 1162 | |||
| 1163 | if (err >= 0) | ||
| 1164 | err = compat_put_timex(utp, &ktx); | ||
| 1165 | |||
| 1166 | return err; | ||
| 1167 | } | ||
| 1168 | |||
| 1169 | COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, | ||
| 1170 | struct compat_timespec __user *, tp) | ||
| 1171 | { | ||
| 1172 | const struct k_clock *kc = clockid_to_kclock(which_clock); | ||
| 1173 | struct timespec64 ts; | ||
| 1174 | int err; | ||
| 1175 | |||
| 1176 | if (!kc) | ||
| 1177 | return -EINVAL; | ||
| 1178 | |||
| 1179 | err = kc->clock_getres(which_clock, &ts); | ||
| 1180 | if (!err && tp && compat_put_timespec64(&ts, tp)) | ||
| 1181 | return -EFAULT; | ||
| 1182 | |||
| 1183 | return err; | ||
| 1184 | } | ||
| 1185 | |||
| 1186 | #endif | ||
| 1187 | |||
| 1098 | /* | 1188 | /* |
| 1099 | * nanosleep for monotonic and realtime clocks | 1189 | * nanosleep for monotonic and realtime clocks |
| 1100 | */ | 1190 | */ |
| 1101 | static int common_nsleep(const clockid_t which_clock, int flags, | 1191 | static int common_nsleep(const clockid_t which_clock, int flags, |
| 1102 | struct timespec64 *tsave, struct timespec __user *rmtp) | 1192 | const struct timespec64 *rqtp) |
| 1103 | { | 1193 | { |
| 1104 | return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? | 1194 | return hrtimer_nanosleep(rqtp, flags & TIMER_ABSTIME ? |
| 1105 | HRTIMER_MODE_ABS : HRTIMER_MODE_REL, | 1195 | HRTIMER_MODE_ABS : HRTIMER_MODE_REL, |
| 1106 | which_clock); | 1196 | which_clock); |
| 1107 | } | 1197 | } |
| @@ -1110,36 +1200,152 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, | |||
| 1110 | const struct timespec __user *, rqtp, | 1200 | const struct timespec __user *, rqtp, |
| 1111 | struct timespec __user *, rmtp) | 1201 | struct timespec __user *, rmtp) |
| 1112 | { | 1202 | { |
| 1113 | struct k_clock *kc = clockid_to_kclock(which_clock); | 1203 | const struct k_clock *kc = clockid_to_kclock(which_clock); |
| 1114 | struct timespec64 t64; | 1204 | struct timespec64 t; |
| 1115 | struct timespec t; | ||
| 1116 | 1205 | ||
| 1117 | if (!kc) | 1206 | if (!kc) |
| 1118 | return -EINVAL; | 1207 | return -EINVAL; |
| 1119 | if (!kc->nsleep) | 1208 | if (!kc->nsleep) |
| 1120 | return -ENANOSLEEP_NOTSUP; | 1209 | return -ENANOSLEEP_NOTSUP; |
| 1121 | 1210 | ||
| 1122 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) | 1211 | if (get_timespec64(&t, rqtp)) |
| 1123 | return -EFAULT; | 1212 | return -EFAULT; |
| 1124 | 1213 | ||
| 1125 | t64 = timespec_to_timespec64(t); | 1214 | if (!timespec64_valid(&t)) |
| 1126 | if (!timespec64_valid(&t64)) | ||
| 1127 | return -EINVAL; | 1215 | return -EINVAL; |
| 1216 | if (flags & TIMER_ABSTIME) | ||
| 1217 | rmtp = NULL; | ||
| 1218 | current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; | ||
| 1219 | current->restart_block.nanosleep.rmtp = rmtp; | ||
| 1128 | 1220 | ||
| 1129 | return kc->nsleep(which_clock, flags, &t64, rmtp); | 1221 | return kc->nsleep(which_clock, flags, &t); |
| 1130 | } | 1222 | } |
| 1131 | 1223 | ||
| 1132 | /* | 1224 | #ifdef CONFIG_COMPAT |
| 1133 | * This will restart clock_nanosleep. This is required only by | 1225 | COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, |
| 1134 | * compat_clock_nanosleep_restart for now. | 1226 | struct compat_timespec __user *, rqtp, |
| 1135 | */ | 1227 | struct compat_timespec __user *, rmtp) |
| 1136 | long clock_nanosleep_restart(struct restart_block *restart_block) | ||
| 1137 | { | 1228 | { |
| 1138 | clockid_t which_clock = restart_block->nanosleep.clockid; | 1229 | const struct k_clock *kc = clockid_to_kclock(which_clock); |
| 1139 | struct k_clock *kc = clockid_to_kclock(which_clock); | 1230 | struct timespec64 t; |
| 1231 | |||
| 1232 | if (!kc) | ||
| 1233 | return -EINVAL; | ||
| 1234 | if (!kc->nsleep) | ||
| 1235 | return -ENANOSLEEP_NOTSUP; | ||
| 1140 | 1236 | ||
| 1141 | if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) | 1237 | if (compat_get_timespec64(&t, rqtp)) |
| 1238 | return -EFAULT; | ||
| 1239 | |||
| 1240 | if (!timespec64_valid(&t)) | ||
| 1142 | return -EINVAL; | 1241 | return -EINVAL; |
| 1242 | if (flags & TIMER_ABSTIME) | ||
| 1243 | rmtp = NULL; | ||
| 1244 | current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; | ||
| 1245 | current->restart_block.nanosleep.compat_rmtp = rmtp; | ||
| 1246 | |||
| 1247 | return kc->nsleep(which_clock, flags, &t); | ||
| 1248 | } | ||
| 1249 | #endif | ||
| 1143 | 1250 | ||
| 1144 | return kc->nsleep_restart(restart_block); | 1251 | static const struct k_clock clock_realtime = { |
| 1252 | .clock_getres = posix_get_hrtimer_res, | ||
| 1253 | .clock_get = posix_clock_realtime_get, | ||
| 1254 | .clock_set = posix_clock_realtime_set, | ||
| 1255 | .clock_adj = posix_clock_realtime_adj, | ||
| 1256 | .nsleep = common_nsleep, | ||
| 1257 | .timer_create = common_timer_create, | ||
| 1258 | .timer_set = common_timer_set, | ||
| 1259 | .timer_get = common_timer_get, | ||
| 1260 | .timer_del = common_timer_del, | ||
| 1261 | .timer_rearm = common_hrtimer_rearm, | ||
| 1262 | .timer_forward = common_hrtimer_forward, | ||
| 1263 | .timer_remaining = common_hrtimer_remaining, | ||
| 1264 | .timer_try_to_cancel = common_hrtimer_try_to_cancel, | ||
| 1265 | .timer_arm = common_hrtimer_arm, | ||
| 1266 | }; | ||
| 1267 | |||
| 1268 | static const struct k_clock clock_monotonic = { | ||
| 1269 | .clock_getres = posix_get_hrtimer_res, | ||
| 1270 | .clock_get = posix_ktime_get_ts, | ||
| 1271 | .nsleep = common_nsleep, | ||
| 1272 | .timer_create = common_timer_create, | ||
| 1273 | .timer_set = common_timer_set, | ||
| 1274 | .timer_get = common_timer_get, | ||
| 1275 | .timer_del = common_timer_del, | ||
| 1276 | .timer_rearm = common_hrtimer_rearm, | ||
| 1277 | .timer_forward = common_hrtimer_forward, | ||
| 1278 | .timer_remaining = common_hrtimer_remaining, | ||
| 1279 | .timer_try_to_cancel = common_hrtimer_try_to_cancel, | ||
| 1280 | .timer_arm = common_hrtimer_arm, | ||
| 1281 | }; | ||
| 1282 | |||
| 1283 | static const struct k_clock clock_monotonic_raw = { | ||
| 1284 | .clock_getres = posix_get_hrtimer_res, | ||
| 1285 | .clock_get = posix_get_monotonic_raw, | ||
| 1286 | }; | ||
| 1287 | |||
| 1288 | static const struct k_clock clock_realtime_coarse = { | ||
| 1289 | .clock_getres = posix_get_coarse_res, | ||
| 1290 | .clock_get = posix_get_realtime_coarse, | ||
| 1291 | }; | ||
| 1292 | |||
| 1293 | static const struct k_clock clock_monotonic_coarse = { | ||
| 1294 | .clock_getres = posix_get_coarse_res, | ||
| 1295 | .clock_get = posix_get_monotonic_coarse, | ||
| 1296 | }; | ||
| 1297 | |||
| 1298 | static const struct k_clock clock_tai = { | ||
| 1299 | .clock_getres = posix_get_hrtimer_res, | ||
| 1300 | .clock_get = posix_get_tai, | ||
| 1301 | .nsleep = common_nsleep, | ||
| 1302 | .timer_create = common_timer_create, | ||
| 1303 | .timer_set = common_timer_set, | ||
| 1304 | .timer_get = common_timer_get, | ||
| 1305 | .timer_del = common_timer_del, | ||
| 1306 | .timer_rearm = common_hrtimer_rearm, | ||
| 1307 | .timer_forward = common_hrtimer_forward, | ||
| 1308 | .timer_remaining = common_hrtimer_remaining, | ||
| 1309 | .timer_try_to_cancel = common_hrtimer_try_to_cancel, | ||
| 1310 | .timer_arm = common_hrtimer_arm, | ||
| 1311 | }; | ||
| 1312 | |||
| 1313 | static const struct k_clock clock_boottime = { | ||
| 1314 | .clock_getres = posix_get_hrtimer_res, | ||
| 1315 | .clock_get = posix_get_boottime, | ||
| 1316 | .nsleep = common_nsleep, | ||
| 1317 | .timer_create = common_timer_create, | ||
| 1318 | .timer_set = common_timer_set, | ||
| 1319 | .timer_get = common_timer_get, | ||
| 1320 | .timer_del = common_timer_del, | ||
| 1321 | .timer_rearm = common_hrtimer_rearm, | ||
| 1322 | .timer_forward = common_hrtimer_forward, | ||
| 1323 | .timer_remaining = common_hrtimer_remaining, | ||
| 1324 | .timer_try_to_cancel = common_hrtimer_try_to_cancel, | ||
| 1325 | .timer_arm = common_hrtimer_arm, | ||
| 1326 | }; | ||
| 1327 | |||
| 1328 | static const struct k_clock * const posix_clocks[] = { | ||
| 1329 | [CLOCK_REALTIME] = &clock_realtime, | ||
| 1330 | [CLOCK_MONOTONIC] = &clock_monotonic, | ||
| 1331 | [CLOCK_PROCESS_CPUTIME_ID] = &clock_process, | ||
| 1332 | [CLOCK_THREAD_CPUTIME_ID] = &clock_thread, | ||
| 1333 | [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, | ||
| 1334 | [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, | ||
| 1335 | [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, | ||
| 1336 | [CLOCK_BOOTTIME] = &clock_boottime, | ||
| 1337 | [CLOCK_REALTIME_ALARM] = &alarm_clock, | ||
| 1338 | [CLOCK_BOOTTIME_ALARM] = &alarm_clock, | ||
| 1339 | [CLOCK_TAI] = &clock_tai, | ||
| 1340 | }; | ||
| 1341 | |||
| 1342 | static const struct k_clock *clockid_to_kclock(const clockid_t id) | ||
| 1343 | { | ||
| 1344 | if (id < 0) | ||
| 1345 | return (id & CLOCKFD_MASK) == CLOCKFD ? | ||
| 1346 | &clock_posix_dynamic : &clock_posix_cpu; | ||
| 1347 | |||
| 1348 | if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id]) | ||
| 1349 | return NULL; | ||
| 1350 | return posix_clocks[id]; | ||
| 1145 | } | 1351 | } |
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h new file mode 100644 index 000000000000..fb303c3be4d3 --- /dev/null +++ b/kernel/time/posix-timers.h | |||
| @@ -0,0 +1,40 @@ | |||
| 1 | #define TIMER_RETRY 1 | ||
| 2 | |||
| 3 | struct k_clock { | ||
| 4 | int (*clock_getres)(const clockid_t which_clock, | ||
| 5 | struct timespec64 *tp); | ||
| 6 | int (*clock_set)(const clockid_t which_clock, | ||
| 7 | const struct timespec64 *tp); | ||
| 8 | int (*clock_get)(const clockid_t which_clock, | ||
| 9 | struct timespec64 *tp); | ||
| 10 | int (*clock_adj)(const clockid_t which_clock, struct timex *tx); | ||
| 11 | int (*timer_create)(struct k_itimer *timer); | ||
| 12 | int (*nsleep)(const clockid_t which_clock, int flags, | ||
| 13 | const struct timespec64 *); | ||
| 14 | int (*timer_set)(struct k_itimer *timr, int flags, | ||
| 15 | struct itimerspec64 *new_setting, | ||
| 16 | struct itimerspec64 *old_setting); | ||
| 17 | int (*timer_del)(struct k_itimer *timr); | ||
| 18 | void (*timer_get)(struct k_itimer *timr, | ||
| 19 | struct itimerspec64 *cur_setting); | ||
| 20 | void (*timer_rearm)(struct k_itimer *timr); | ||
| 21 | int (*timer_forward)(struct k_itimer *timr, ktime_t now); | ||
| 22 | ktime_t (*timer_remaining)(struct k_itimer *timr, ktime_t now); | ||
| 23 | int (*timer_try_to_cancel)(struct k_itimer *timr); | ||
| 24 | void (*timer_arm)(struct k_itimer *timr, ktime_t expires, | ||
| 25 | bool absolute, bool sigev_none); | ||
| 26 | }; | ||
| 27 | |||
| 28 | extern const struct k_clock clock_posix_cpu; | ||
| 29 | extern const struct k_clock clock_posix_dynamic; | ||
| 30 | extern const struct k_clock clock_process; | ||
| 31 | extern const struct k_clock clock_thread; | ||
| 32 | extern const struct k_clock alarm_clock; | ||
| 33 | |||
| 34 | int posix_timer_event(struct k_itimer *timr, int si_private); | ||
| 35 | |||
| 36 | void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting); | ||
| 37 | int common_timer_set(struct k_itimer *timr, int flags, | ||
| 38 | struct itimerspec64 *new_setting, | ||
| 39 | struct itimerspec64 *old_setting); | ||
| 40 | int common_timer_del(struct k_itimer *timer); | ||
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 987e496bb51a..b398c2ea69b2 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
| @@ -37,9 +37,11 @@ static int tick_broadcast_forced; | |||
| 37 | static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); | 37 | static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); |
| 38 | 38 | ||
| 39 | #ifdef CONFIG_TICK_ONESHOT | 39 | #ifdef CONFIG_TICK_ONESHOT |
| 40 | static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); | ||
| 40 | static void tick_broadcast_clear_oneshot(int cpu); | 41 | static void tick_broadcast_clear_oneshot(int cpu); |
| 41 | static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); | 42 | static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); |
| 42 | #else | 43 | #else |
| 44 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } | ||
| 43 | static inline void tick_broadcast_clear_oneshot(int cpu) { } | 45 | static inline void tick_broadcast_clear_oneshot(int cpu) { } |
| 44 | static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } | 46 | static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } |
| 45 | #endif | 47 | #endif |
| @@ -867,7 +869,7 @@ static void tick_broadcast_init_next_event(struct cpumask *mask, | |||
| 867 | /** | 869 | /** |
| 868 | * tick_broadcast_setup_oneshot - setup the broadcast device | 870 | * tick_broadcast_setup_oneshot - setup the broadcast device |
| 869 | */ | 871 | */ |
| 870 | void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | 872 | static void tick_broadcast_setup_oneshot(struct clock_event_device *bc) |
| 871 | { | 873 | { |
| 872 | int cpu = smp_processor_id(); | 874 | int cpu = smp_processor_id(); |
| 873 | 875 | ||
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f738251000fe..be0ac01f2e12 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
| @@ -126,7 +126,6 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } | |||
| 126 | 126 | ||
| 127 | /* Functions related to oneshot broadcasting */ | 127 | /* Functions related to oneshot broadcasting */ |
| 128 | #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) | 128 | #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) |
| 129 | extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); | ||
| 130 | extern void tick_broadcast_switch_to_oneshot(void); | 129 | extern void tick_broadcast_switch_to_oneshot(void); |
| 131 | extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); | 130 | extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); |
| 132 | extern int tick_broadcast_oneshot_active(void); | 131 | extern int tick_broadcast_oneshot_active(void); |
| @@ -134,7 +133,6 @@ extern void tick_check_oneshot_broadcast_this_cpu(void); | |||
| 134 | bool tick_broadcast_oneshot_available(void); | 133 | bool tick_broadcast_oneshot_available(void); |
| 135 | extern struct cpumask *tick_get_broadcast_oneshot_mask(void); | 134 | extern struct cpumask *tick_get_broadcast_oneshot_mask(void); |
| 136 | #else /* !(BROADCAST && ONESHOT): */ | 135 | #else /* !(BROADCAST && ONESHOT): */ |
| 137 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } | ||
| 138 | static inline void tick_broadcast_switch_to_oneshot(void) { } | 136 | static inline void tick_broadcast_switch_to_oneshot(void) { } |
| 139 | static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } | 137 | static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } |
| 140 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | 138 | static inline int tick_broadcast_oneshot_active(void) { return 0; } |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 64c97fc130c4..c7a899c5ce64 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -150,6 +150,12 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) | |||
| 150 | touch_softlockup_watchdog_sched(); | 150 | touch_softlockup_watchdog_sched(); |
| 151 | if (is_idle_task(current)) | 151 | if (is_idle_task(current)) |
| 152 | ts->idle_jiffies++; | 152 | ts->idle_jiffies++; |
| 153 | /* | ||
| 154 | * In case the current tick fired too early past its expected | ||
| 155 | * expiration, make sure we don't bypass the next clock reprogramming | ||
| 156 | * to the same deadline. | ||
| 157 | */ | ||
| 158 | ts->next_tick = 0; | ||
| 153 | } | 159 | } |
| 154 | #endif | 160 | #endif |
| 155 | update_process_times(user_mode(regs)); | 161 | update_process_times(user_mode(regs)); |
| @@ -554,7 +560,7 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) | |||
| 554 | update_ts_time_stats(smp_processor_id(), ts, now, NULL); | 560 | update_ts_time_stats(smp_processor_id(), ts, now, NULL); |
| 555 | ts->idle_active = 0; | 561 | ts->idle_active = 0; |
| 556 | 562 | ||
| 557 | sched_clock_idle_wakeup_event(0); | 563 | sched_clock_idle_wakeup_event(); |
| 558 | } | 564 | } |
| 559 | 565 | ||
| 560 | static ktime_t tick_nohz_start_idle(struct tick_sched *ts) | 566 | static ktime_t tick_nohz_start_idle(struct tick_sched *ts) |
| @@ -660,6 +666,12 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) | |||
| 660 | hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); | 666 | hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); |
| 661 | else | 667 | else |
| 662 | tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); | 668 | tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); |
| 669 | |||
| 670 | /* | ||
| 671 | * Reset to make sure next tick stop doesn't get fooled by past | ||
| 672 | * cached clock deadline. | ||
| 673 | */ | ||
| 674 | ts->next_tick = 0; | ||
| 663 | } | 675 | } |
| 664 | 676 | ||
| 665 | static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | 677 | static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, |
| @@ -701,8 +713,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
| 701 | */ | 713 | */ |
| 702 | delta = next_tick - basemono; | 714 | delta = next_tick - basemono; |
| 703 | if (delta <= (u64)TICK_NSEC) { | 715 | if (delta <= (u64)TICK_NSEC) { |
| 704 | tick = 0; | ||
| 705 | |||
| 706 | /* | 716 | /* |
| 707 | * Tell the timer code that the base is not idle, i.e. undo | 717 | * Tell the timer code that the base is not idle, i.e. undo |
| 708 | * the effect of get_next_timer_interrupt(): | 718 | * the effect of get_next_timer_interrupt(): |
| @@ -712,23 +722,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
| 712 | * We've not stopped the tick yet, and there's a timer in the | 722 | * We've not stopped the tick yet, and there's a timer in the |
| 713 | * next period, so no point in stopping it either, bail. | 723 | * next period, so no point in stopping it either, bail. |
| 714 | */ | 724 | */ |
| 715 | if (!ts->tick_stopped) | 725 | if (!ts->tick_stopped) { |
| 716 | goto out; | 726 | tick = 0; |
| 717 | |||
| 718 | /* | ||
| 719 | * If, OTOH, we did stop it, but there's a pending (expired) | ||
| 720 | * timer reprogram the timer hardware to fire now. | ||
| 721 | * | ||
| 722 | * We will not restart the tick proper, just prod the timer | ||
| 723 | * hardware into firing an interrupt to process the pending | ||
| 724 | * timers. Just like tick_irq_exit() will not restart the tick | ||
| 725 | * for 'normal' interrupts. | ||
| 726 | * | ||
| 727 | * Only once we exit the idle loop will we re-enable the tick, | ||
| 728 | * see tick_nohz_idle_exit(). | ||
| 729 | */ | ||
| 730 | if (delta == 0) { | ||
| 731 | tick_nohz_restart(ts, now); | ||
| 732 | goto out; | 727 | goto out; |
| 733 | } | 728 | } |
| 734 | } | 729 | } |
| @@ -771,8 +766,16 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
| 771 | tick = expires; | 766 | tick = expires; |
| 772 | 767 | ||
| 773 | /* Skip reprogram of event if its not changed */ | 768 | /* Skip reprogram of event if its not changed */ |
| 774 | if (ts->tick_stopped && (expires == dev->next_event)) | 769 | if (ts->tick_stopped && (expires == ts->next_tick)) { |
| 775 | goto out; | 770 | /* Sanity check: make sure clockevent is actually programmed */ |
| 771 | if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) | ||
| 772 | goto out; | ||
| 773 | |||
| 774 | WARN_ON_ONCE(1); | ||
| 775 | printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", | ||
| 776 | basemono, ts->next_tick, dev->next_event, | ||
| 777 | hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); | ||
| 778 | } | ||
| 776 | 779 | ||
| 777 | /* | 780 | /* |
| 778 | * nohz_stop_sched_tick can be called several times before | 781 | * nohz_stop_sched_tick can be called several times before |
| @@ -782,8 +785,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
| 782 | * the scheduler tick in nohz_restart_sched_tick. | 785 | * the scheduler tick in nohz_restart_sched_tick. |
| 783 | */ | 786 | */ |
| 784 | if (!ts->tick_stopped) { | 787 | if (!ts->tick_stopped) { |
| 785 | nohz_balance_enter_idle(cpu); | 788 | calc_load_nohz_start(); |
| 786 | calc_load_enter_idle(); | ||
| 787 | cpu_load_update_nohz_start(); | 789 | cpu_load_update_nohz_start(); |
| 788 | 790 | ||
| 789 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); | 791 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); |
| @@ -791,6 +793,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
| 791 | trace_tick_stop(1, TICK_DEP_MASK_NONE); | 793 | trace_tick_stop(1, TICK_DEP_MASK_NONE); |
| 792 | } | 794 | } |
| 793 | 795 | ||
| 796 | ts->next_tick = tick; | ||
| 797 | |||
| 794 | /* | 798 | /* |
| 795 | * If the expiration time == KTIME_MAX, then we simply stop | 799 | * If the expiration time == KTIME_MAX, then we simply stop |
| 796 | * the tick timer. | 800 | * the tick timer. |
| @@ -801,12 +805,17 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
| 801 | goto out; | 805 | goto out; |
| 802 | } | 806 | } |
| 803 | 807 | ||
| 808 | hrtimer_set_expires(&ts->sched_timer, tick); | ||
| 809 | |||
| 804 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) | 810 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) |
| 805 | hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); | 811 | hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); |
| 806 | else | 812 | else |
| 807 | tick_program_event(tick, 1); | 813 | tick_program_event(tick, 1); |
| 808 | out: | 814 | out: |
| 809 | /* Update the estimated sleep length */ | 815 | /* |
| 816 | * Update the estimated sleep length until the next timer | ||
| 817 | * (not only the tick). | ||
| 818 | */ | ||
| 810 | ts->sleep_length = ktime_sub(dev->next_event, now); | 819 | ts->sleep_length = ktime_sub(dev->next_event, now); |
| 811 | return tick; | 820 | return tick; |
| 812 | } | 821 | } |
| @@ -823,7 +832,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) | |||
| 823 | */ | 832 | */ |
| 824 | timer_clear_idle(); | 833 | timer_clear_idle(); |
| 825 | 834 | ||
| 826 | calc_load_exit_idle(); | 835 | calc_load_nohz_stop(); |
| 827 | touch_softlockup_watchdog_sched(); | 836 | touch_softlockup_watchdog_sched(); |
| 828 | /* | 837 | /* |
| 829 | * Cancel the scheduled timer and restore the tick | 838 | * Cancel the scheduled timer and restore the tick |
| @@ -864,6 +873,11 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | |||
| 864 | if (unlikely(!cpu_online(cpu))) { | 873 | if (unlikely(!cpu_online(cpu))) { |
| 865 | if (cpu == tick_do_timer_cpu) | 874 | if (cpu == tick_do_timer_cpu) |
| 866 | tick_do_timer_cpu = TICK_DO_TIMER_NONE; | 875 | tick_do_timer_cpu = TICK_DO_TIMER_NONE; |
| 876 | /* | ||
| 877 | * Make sure the CPU doesn't get fooled by obsolete tick | ||
| 878 | * deadline if it comes back online later. | ||
| 879 | */ | ||
| 880 | ts->next_tick = 0; | ||
| 867 | return false; | 881 | return false; |
| 868 | } | 882 | } |
| 869 | 883 | ||
| @@ -923,8 +937,10 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) | |||
| 923 | ts->idle_expires = expires; | 937 | ts->idle_expires = expires; |
| 924 | } | 938 | } |
| 925 | 939 | ||
| 926 | if (!was_stopped && ts->tick_stopped) | 940 | if (!was_stopped && ts->tick_stopped) { |
| 927 | ts->idle_jiffies = ts->last_jiffies; | 941 | ts->idle_jiffies = ts->last_jiffies; |
| 942 | nohz_balance_enter_idle(cpu); | ||
| 943 | } | ||
| 928 | } | 944 | } |
| 929 | } | 945 | } |
| 930 | 946 | ||
| @@ -1172,6 +1188,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
| 1172 | */ | 1188 | */ |
| 1173 | if (regs) | 1189 | if (regs) |
| 1174 | tick_sched_handle(ts, regs); | 1190 | tick_sched_handle(ts, regs); |
| 1191 | else | ||
| 1192 | ts->next_tick = 0; | ||
| 1175 | 1193 | ||
| 1176 | /* No need to reprogram if we are in idle or full dynticks mode */ | 1194 | /* No need to reprogram if we are in idle or full dynticks mode */ |
| 1177 | if (unlikely(ts->tick_stopped)) | 1195 | if (unlikely(ts->tick_stopped)) |
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index bf38226e5c17..075444e3d48e 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h | |||
| @@ -27,6 +27,7 @@ enum tick_nohz_mode { | |||
| 27 | * timer is modified for nohz sleeps. This is necessary | 27 | * timer is modified for nohz sleeps. This is necessary |
| 28 | * to resume the tick timer operation in the timeline | 28 | * to resume the tick timer operation in the timeline |
| 29 | * when the CPU returns from nohz sleep. | 29 | * when the CPU returns from nohz sleep. |
| 30 | * @next_tick: Next tick to be fired when in dynticks mode. | ||
| 30 | * @tick_stopped: Indicator that the idle tick has been stopped | 31 | * @tick_stopped: Indicator that the idle tick has been stopped |
| 31 | * @idle_jiffies: jiffies at the entry to idle for idle time accounting | 32 | * @idle_jiffies: jiffies at the entry to idle for idle time accounting |
| 32 | * @idle_calls: Total number of idle calls | 33 | * @idle_calls: Total number of idle calls |
| @@ -44,6 +45,7 @@ struct tick_sched { | |||
| 44 | unsigned long check_clocks; | 45 | unsigned long check_clocks; |
| 45 | enum tick_nohz_mode nohz_mode; | 46 | enum tick_nohz_mode nohz_mode; |
| 46 | ktime_t last_tick; | 47 | ktime_t last_tick; |
| 48 | ktime_t next_tick; | ||
| 47 | int inidle; | 49 | int inidle; |
| 48 | int tick_stopped; | 50 | int tick_stopped; |
| 49 | unsigned long idle_jiffies; | 51 | unsigned long idle_jiffies; |
diff --git a/kernel/time/time.c b/kernel/time/time.c index 49c73c6ed648..44a8c1402133 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c | |||
| @@ -39,6 +39,7 @@ | |||
| 39 | #include <linux/ptrace.h> | 39 | #include <linux/ptrace.h> |
| 40 | 40 | ||
| 41 | #include <linux/uaccess.h> | 41 | #include <linux/uaccess.h> |
| 42 | #include <linux/compat.h> | ||
| 42 | #include <asm/unistd.h> | 43 | #include <asm/unistd.h> |
| 43 | 44 | ||
| 44 | #include <generated/timeconst.h> | 45 | #include <generated/timeconst.h> |
| @@ -99,6 +100,47 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr) | |||
| 99 | 100 | ||
| 100 | #endif /* __ARCH_WANT_SYS_TIME */ | 101 | #endif /* __ARCH_WANT_SYS_TIME */ |
| 101 | 102 | ||
| 103 | #ifdef CONFIG_COMPAT | ||
| 104 | #ifdef __ARCH_WANT_COMPAT_SYS_TIME | ||
| 105 | |||
| 106 | /* compat_time_t is a 32 bit "long" and needs to get converted. */ | ||
| 107 | COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) | ||
| 108 | { | ||
| 109 | struct timeval tv; | ||
| 110 | compat_time_t i; | ||
| 111 | |||
| 112 | do_gettimeofday(&tv); | ||
| 113 | i = tv.tv_sec; | ||
| 114 | |||
| 115 | if (tloc) { | ||
| 116 | if (put_user(i,tloc)) | ||
| 117 | return -EFAULT; | ||
| 118 | } | ||
| 119 | force_successful_syscall_return(); | ||
| 120 | return i; | ||
| 121 | } | ||
| 122 | |||
| 123 | COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) | ||
| 124 | { | ||
| 125 | struct timespec tv; | ||
| 126 | int err; | ||
| 127 | |||
| 128 | if (get_user(tv.tv_sec, tptr)) | ||
| 129 | return -EFAULT; | ||
| 130 | |||
| 131 | tv.tv_nsec = 0; | ||
| 132 | |||
| 133 | err = security_settime(&tv, NULL); | ||
| 134 | if (err) | ||
| 135 | return err; | ||
| 136 | |||
| 137 | do_settimeofday(&tv); | ||
| 138 | return 0; | ||
| 139 | } | ||
| 140 | |||
| 141 | #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ | ||
| 142 | #endif | ||
| 143 | |||
| 102 | SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, | 144 | SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, |
| 103 | struct timezone __user *, tz) | 145 | struct timezone __user *, tz) |
| 104 | { | 146 | { |
| @@ -215,6 +257,47 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, | |||
| 215 | return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); | 257 | return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); |
| 216 | } | 258 | } |
| 217 | 259 | ||
| 260 | #ifdef CONFIG_COMPAT | ||
| 261 | COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, | ||
| 262 | struct timezone __user *, tz) | ||
| 263 | { | ||
| 264 | if (tv) { | ||
| 265 | struct timeval ktv; | ||
| 266 | |||
| 267 | do_gettimeofday(&ktv); | ||
| 268 | if (compat_put_timeval(&ktv, tv)) | ||
| 269 | return -EFAULT; | ||
| 270 | } | ||
| 271 | if (tz) { | ||
| 272 | if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) | ||
| 273 | return -EFAULT; | ||
| 274 | } | ||
| 275 | |||
| 276 | return 0; | ||
| 277 | } | ||
| 278 | |||
| 279 | COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, | ||
| 280 | struct timezone __user *, tz) | ||
| 281 | { | ||
| 282 | struct timespec64 new_ts; | ||
| 283 | struct timeval user_tv; | ||
| 284 | struct timezone new_tz; | ||
| 285 | |||
| 286 | if (tv) { | ||
| 287 | if (compat_get_timeval(&user_tv, tv)) | ||
| 288 | return -EFAULT; | ||
| 289 | new_ts.tv_sec = user_tv.tv_sec; | ||
| 290 | new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; | ||
| 291 | } | ||
| 292 | if (tz) { | ||
| 293 | if (copy_from_user(&new_tz, tz, sizeof(*tz))) | ||
| 294 | return -EFAULT; | ||
| 295 | } | ||
| 296 | |||
| 297 | return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); | ||
| 298 | } | ||
| 299 | #endif | ||
| 300 | |||
| 218 | SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) | 301 | SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) |
| 219 | { | 302 | { |
| 220 | struct timex txc; /* Local copy of parameter */ | 303 | struct timex txc; /* Local copy of parameter */ |
| @@ -224,12 +307,33 @@ SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) | |||
| 224 | * structure. But bear in mind that the structures | 307 | * structure. But bear in mind that the structures |
| 225 | * may change | 308 | * may change |
| 226 | */ | 309 | */ |
| 227 | if(copy_from_user(&txc, txc_p, sizeof(struct timex))) | 310 | if (copy_from_user(&txc, txc_p, sizeof(struct timex))) |
| 228 | return -EFAULT; | 311 | return -EFAULT; |
| 229 | ret = do_adjtimex(&txc); | 312 | ret = do_adjtimex(&txc); |
| 230 | return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; | 313 | return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; |
| 231 | } | 314 | } |
| 232 | 315 | ||
| 316 | #ifdef CONFIG_COMPAT | ||
| 317 | |||
| 318 | COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) | ||
| 319 | { | ||
| 320 | struct timex txc; | ||
| 321 | int err, ret; | ||
| 322 | |||
| 323 | err = compat_get_timex(&txc, utp); | ||
| 324 | if (err) | ||
| 325 | return err; | ||
| 326 | |||
| 327 | ret = do_adjtimex(&txc); | ||
| 328 | |||
| 329 | err = compat_put_timex(utp, &txc); | ||
| 330 | if (err) | ||
| 331 | return err; | ||
| 332 | |||
| 333 | return ret; | ||
| 334 | } | ||
| 335 | #endif | ||
| 336 | |||
| 233 | /* | 337 | /* |
| 234 | * Convert jiffies to milliseconds and back. | 338 | * Convert jiffies to milliseconds and back. |
| 235 | * | 339 | * |
| @@ -786,3 +890,61 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, | |||
| 786 | 890 | ||
| 787 | return res; | 891 | return res; |
| 788 | } | 892 | } |
| 893 | |||
| 894 | int get_timespec64(struct timespec64 *ts, | ||
| 895 | const struct timespec __user *uts) | ||
| 896 | { | ||
| 897 | struct timespec kts; | ||
| 898 | int ret; | ||
| 899 | |||
| 900 | ret = copy_from_user(&kts, uts, sizeof(kts)); | ||
| 901 | if (ret) | ||
| 902 | return -EFAULT; | ||
| 903 | |||
| 904 | ts->tv_sec = kts.tv_sec; | ||
| 905 | ts->tv_nsec = kts.tv_nsec; | ||
| 906 | |||
| 907 | return 0; | ||
| 908 | } | ||
| 909 | EXPORT_SYMBOL_GPL(get_timespec64); | ||
| 910 | |||
| 911 | int put_timespec64(const struct timespec64 *ts, | ||
| 912 | struct timespec __user *uts) | ||
| 913 | { | ||
| 914 | struct timespec kts = { | ||
| 915 | .tv_sec = ts->tv_sec, | ||
| 916 | .tv_nsec = ts->tv_nsec | ||
| 917 | }; | ||
| 918 | return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0; | ||
| 919 | } | ||
| 920 | EXPORT_SYMBOL_GPL(put_timespec64); | ||
| 921 | |||
| 922 | int get_itimerspec64(struct itimerspec64 *it, | ||
| 923 | const struct itimerspec __user *uit) | ||
| 924 | { | ||
| 925 | int ret; | ||
| 926 | |||
| 927 | ret = get_timespec64(&it->it_interval, &uit->it_interval); | ||
| 928 | if (ret) | ||
| 929 | return ret; | ||
| 930 | |||
| 931 | ret = get_timespec64(&it->it_value, &uit->it_value); | ||
| 932 | |||
| 933 | return ret; | ||
| 934 | } | ||
| 935 | EXPORT_SYMBOL_GPL(get_itimerspec64); | ||
| 936 | |||
| 937 | int put_itimerspec64(const struct itimerspec64 *it, | ||
| 938 | struct itimerspec __user *uit) | ||
| 939 | { | ||
| 940 | int ret; | ||
| 941 | |||
| 942 | ret = put_timespec64(&it->it_interval, &uit->it_interval); | ||
| 943 | if (ret) | ||
| 944 | return ret; | ||
| 945 | |||
| 946 | ret = put_timespec64(&it->it_value, &uit->it_value); | ||
| 947 | |||
| 948 | return ret; | ||
| 949 | } | ||
| 950 | EXPORT_SYMBOL_GPL(put_itimerspec64); | ||
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9652bc57fd09..cedafa008de5 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -72,6 +72,10 @@ static inline void tk_normalize_xtime(struct timekeeper *tk) | |||
| 72 | tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; | 72 | tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; |
| 73 | tk->xtime_sec++; | 73 | tk->xtime_sec++; |
| 74 | } | 74 | } |
| 75 | while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) { | ||
| 76 | tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift; | ||
| 77 | tk->raw_sec++; | ||
| 78 | } | ||
| 75 | } | 79 | } |
| 76 | 80 | ||
| 77 | static inline struct timespec64 tk_xtime(struct timekeeper *tk) | 81 | static inline struct timespec64 tk_xtime(struct timekeeper *tk) |
| @@ -118,6 +122,26 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) | |||
| 118 | tk->offs_boot = ktime_add(tk->offs_boot, delta); | 122 | tk->offs_boot = ktime_add(tk->offs_boot, delta); |
| 119 | } | 123 | } |
| 120 | 124 | ||
| 125 | /* | ||
| 126 | * tk_clock_read - atomic clocksource read() helper | ||
| 127 | * | ||
| 128 | * This helper is necessary to use in the read paths because, while the | ||
| 129 | * seqlock ensures we don't return a bad value while structures are updated, | ||
| 130 | * it doesn't protect from potential crashes. There is the possibility that | ||
| 131 | * the tkr's clocksource may change between the read reference, and the | ||
| 132 | * clock reference passed to the read function. This can cause crashes if | ||
| 133 | * the wrong clocksource is passed to the wrong read function. | ||
| 134 | * This isn't necessary to use when holding the timekeeper_lock or doing | ||
| 135 | * a read of the fast-timekeeper tkrs (which is protected by its own locking | ||
| 136 | * and update logic). | ||
| 137 | */ | ||
| 138 | static inline u64 tk_clock_read(struct tk_read_base *tkr) | ||
| 139 | { | ||
| 140 | struct clocksource *clock = READ_ONCE(tkr->clock); | ||
| 141 | |||
| 142 | return clock->read(clock); | ||
| 143 | } | ||
| 144 | |||
| 121 | #ifdef CONFIG_DEBUG_TIMEKEEPING | 145 | #ifdef CONFIG_DEBUG_TIMEKEEPING |
| 122 | #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ | 146 | #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ |
| 123 | 147 | ||
| @@ -175,7 +199,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr) | |||
| 175 | */ | 199 | */ |
| 176 | do { | 200 | do { |
| 177 | seq = read_seqcount_begin(&tk_core.seq); | 201 | seq = read_seqcount_begin(&tk_core.seq); |
| 178 | now = tkr->read(tkr->clock); | 202 | now = tk_clock_read(tkr); |
| 179 | last = tkr->cycle_last; | 203 | last = tkr->cycle_last; |
| 180 | mask = tkr->mask; | 204 | mask = tkr->mask; |
| 181 | max = tkr->clock->max_cycles; | 205 | max = tkr->clock->max_cycles; |
| @@ -209,7 +233,7 @@ static inline u64 timekeeping_get_delta(struct tk_read_base *tkr) | |||
| 209 | u64 cycle_now, delta; | 233 | u64 cycle_now, delta; |
| 210 | 234 | ||
| 211 | /* read clocksource */ | 235 | /* read clocksource */ |
| 212 | cycle_now = tkr->read(tkr->clock); | 236 | cycle_now = tk_clock_read(tkr); |
| 213 | 237 | ||
| 214 | /* calculate the delta since the last update_wall_time */ | 238 | /* calculate the delta since the last update_wall_time */ |
| 215 | delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); | 239 | delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); |
| @@ -238,12 +262,10 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
| 238 | ++tk->cs_was_changed_seq; | 262 | ++tk->cs_was_changed_seq; |
| 239 | old_clock = tk->tkr_mono.clock; | 263 | old_clock = tk->tkr_mono.clock; |
| 240 | tk->tkr_mono.clock = clock; | 264 | tk->tkr_mono.clock = clock; |
| 241 | tk->tkr_mono.read = clock->read; | ||
| 242 | tk->tkr_mono.mask = clock->mask; | 265 | tk->tkr_mono.mask = clock->mask; |
| 243 | tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); | 266 | tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono); |
| 244 | 267 | ||
| 245 | tk->tkr_raw.clock = clock; | 268 | tk->tkr_raw.clock = clock; |
| 246 | tk->tkr_raw.read = clock->read; | ||
| 247 | tk->tkr_raw.mask = clock->mask; | 269 | tk->tkr_raw.mask = clock->mask; |
| 248 | tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; | 270 | tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; |
| 249 | 271 | ||
| @@ -262,17 +284,19 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
| 262 | /* Go back from cycles -> shifted ns */ | 284 | /* Go back from cycles -> shifted ns */ |
| 263 | tk->xtime_interval = interval * clock->mult; | 285 | tk->xtime_interval = interval * clock->mult; |
| 264 | tk->xtime_remainder = ntpinterval - tk->xtime_interval; | 286 | tk->xtime_remainder = ntpinterval - tk->xtime_interval; |
| 265 | tk->raw_interval = (interval * clock->mult) >> clock->shift; | 287 | tk->raw_interval = interval * clock->mult; |
| 266 | 288 | ||
| 267 | /* if changing clocks, convert xtime_nsec shift units */ | 289 | /* if changing clocks, convert xtime_nsec shift units */ |
| 268 | if (old_clock) { | 290 | if (old_clock) { |
| 269 | int shift_change = clock->shift - old_clock->shift; | 291 | int shift_change = clock->shift - old_clock->shift; |
| 270 | if (shift_change < 0) | 292 | if (shift_change < 0) { |
| 271 | tk->tkr_mono.xtime_nsec >>= -shift_change; | 293 | tk->tkr_mono.xtime_nsec >>= -shift_change; |
| 272 | else | 294 | tk->tkr_raw.xtime_nsec >>= -shift_change; |
| 295 | } else { | ||
| 273 | tk->tkr_mono.xtime_nsec <<= shift_change; | 296 | tk->tkr_mono.xtime_nsec <<= shift_change; |
| 297 | tk->tkr_raw.xtime_nsec <<= shift_change; | ||
| 298 | } | ||
| 274 | } | 299 | } |
| 275 | tk->tkr_raw.xtime_nsec = 0; | ||
| 276 | 300 | ||
| 277 | tk->tkr_mono.shift = clock->shift; | 301 | tk->tkr_mono.shift = clock->shift; |
| 278 | tk->tkr_raw.shift = clock->shift; | 302 | tk->tkr_raw.shift = clock->shift; |
| @@ -404,7 +428,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) | |||
| 404 | 428 | ||
| 405 | now += timekeeping_delta_to_ns(tkr, | 429 | now += timekeeping_delta_to_ns(tkr, |
| 406 | clocksource_delta( | 430 | clocksource_delta( |
| 407 | tkr->read(tkr->clock), | 431 | tk_clock_read(tkr), |
| 408 | tkr->cycle_last, | 432 | tkr->cycle_last, |
| 409 | tkr->mask)); | 433 | tkr->mask)); |
| 410 | } while (read_seqcount_retry(&tkf->seq, seq)); | 434 | } while (read_seqcount_retry(&tkf->seq, seq)); |
| @@ -461,6 +485,10 @@ static u64 dummy_clock_read(struct clocksource *cs) | |||
| 461 | return cycles_at_suspend; | 485 | return cycles_at_suspend; |
| 462 | } | 486 | } |
| 463 | 487 | ||
| 488 | static struct clocksource dummy_clock = { | ||
| 489 | .read = dummy_clock_read, | ||
| 490 | }; | ||
| 491 | |||
| 464 | /** | 492 | /** |
| 465 | * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. | 493 | * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. |
| 466 | * @tk: Timekeeper to snapshot. | 494 | * @tk: Timekeeper to snapshot. |
| @@ -477,17 +505,18 @@ static void halt_fast_timekeeper(struct timekeeper *tk) | |||
| 477 | struct tk_read_base *tkr = &tk->tkr_mono; | 505 | struct tk_read_base *tkr = &tk->tkr_mono; |
| 478 | 506 | ||
| 479 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); | 507 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); |
| 480 | cycles_at_suspend = tkr->read(tkr->clock); | 508 | cycles_at_suspend = tk_clock_read(tkr); |
| 481 | tkr_dummy.read = dummy_clock_read; | 509 | tkr_dummy.clock = &dummy_clock; |
| 482 | update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); | 510 | update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); |
| 483 | 511 | ||
| 484 | tkr = &tk->tkr_raw; | 512 | tkr = &tk->tkr_raw; |
| 485 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); | 513 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); |
| 486 | tkr_dummy.read = dummy_clock_read; | 514 | tkr_dummy.clock = &dummy_clock; |
| 487 | update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); | 515 | update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); |
| 488 | } | 516 | } |
| 489 | 517 | ||
| 490 | #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD | 518 | #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD |
| 519 | #warning Please contact your maintainers, as GENERIC_TIME_VSYSCALL_OLD compatibity will disappear soon. | ||
| 491 | 520 | ||
| 492 | static inline void update_vsyscall(struct timekeeper *tk) | 521 | static inline void update_vsyscall(struct timekeeper *tk) |
| 493 | { | 522 | { |
| @@ -597,9 +626,6 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) | |||
| 597 | nsec = (u32) tk->wall_to_monotonic.tv_nsec; | 626 | nsec = (u32) tk->wall_to_monotonic.tv_nsec; |
| 598 | tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); | 627 | tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); |
| 599 | 628 | ||
| 600 | /* Update the monotonic raw base */ | ||
| 601 | tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time); | ||
| 602 | |||
| 603 | /* | 629 | /* |
| 604 | * The sum of the nanoseconds portions of xtime and | 630 | * The sum of the nanoseconds portions of xtime and |
| 605 | * wall_to_monotonic can be greater/equal one second. Take | 631 | * wall_to_monotonic can be greater/equal one second. Take |
| @@ -609,6 +635,11 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) | |||
| 609 | if (nsec >= NSEC_PER_SEC) | 635 | if (nsec >= NSEC_PER_SEC) |
| 610 | seconds++; | 636 | seconds++; |
| 611 | tk->ktime_sec = seconds; | 637 | tk->ktime_sec = seconds; |
| 638 | |||
| 639 | /* Update the monotonic raw base */ | ||
| 640 | seconds = tk->raw_sec; | ||
| 641 | nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift); | ||
| 642 | tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); | ||
| 612 | } | 643 | } |
| 613 | 644 | ||
| 614 | /* must hold timekeeper_lock */ | 645 | /* must hold timekeeper_lock */ |
| @@ -649,11 +680,9 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) | |||
| 649 | */ | 680 | */ |
| 650 | static void timekeeping_forward_now(struct timekeeper *tk) | 681 | static void timekeeping_forward_now(struct timekeeper *tk) |
| 651 | { | 682 | { |
| 652 | struct clocksource *clock = tk->tkr_mono.clock; | ||
| 653 | u64 cycle_now, delta; | 683 | u64 cycle_now, delta; |
| 654 | u64 nsec; | ||
| 655 | 684 | ||
| 656 | cycle_now = tk->tkr_mono.read(clock); | 685 | cycle_now = tk_clock_read(&tk->tkr_mono); |
| 657 | delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); | 686 | delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); |
| 658 | tk->tkr_mono.cycle_last = cycle_now; | 687 | tk->tkr_mono.cycle_last = cycle_now; |
| 659 | tk->tkr_raw.cycle_last = cycle_now; | 688 | tk->tkr_raw.cycle_last = cycle_now; |
| @@ -663,10 +692,13 @@ static void timekeeping_forward_now(struct timekeeper *tk) | |||
| 663 | /* If arch requires, add in get_arch_timeoffset() */ | 692 | /* If arch requires, add in get_arch_timeoffset() */ |
| 664 | tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; | 693 | tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; |
| 665 | 694 | ||
| 666 | tk_normalize_xtime(tk); | ||
| 667 | 695 | ||
| 668 | nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift); | 696 | tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult; |
| 669 | timespec64_add_ns(&tk->raw_time, nsec); | 697 | |
| 698 | /* If arch requires, add in get_arch_timeoffset() */ | ||
| 699 | tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift; | ||
| 700 | |||
| 701 | tk_normalize_xtime(tk); | ||
| 670 | } | 702 | } |
| 671 | 703 | ||
| 672 | /** | 704 | /** |
| @@ -929,8 +961,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) | |||
| 929 | 961 | ||
| 930 | do { | 962 | do { |
| 931 | seq = read_seqcount_begin(&tk_core.seq); | 963 | seq = read_seqcount_begin(&tk_core.seq); |
| 932 | 964 | now = tk_clock_read(&tk->tkr_mono); | |
| 933 | now = tk->tkr_mono.read(tk->tkr_mono.clock); | ||
| 934 | systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; | 965 | systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; |
| 935 | systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; | 966 | systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; |
| 936 | base_real = ktime_add(tk->tkr_mono.base, | 967 | base_real = ktime_add(tk->tkr_mono.base, |
| @@ -1108,7 +1139,7 @@ int get_device_system_crosststamp(int (*get_time_fn) | |||
| 1108 | * Check whether the system counter value provided by the | 1139 | * Check whether the system counter value provided by the |
| 1109 | * device driver is on the current timekeeping interval. | 1140 | * device driver is on the current timekeeping interval. |
| 1110 | */ | 1141 | */ |
| 1111 | now = tk->tkr_mono.read(tk->tkr_mono.clock); | 1142 | now = tk_clock_read(&tk->tkr_mono); |
| 1112 | interval_start = tk->tkr_mono.cycle_last; | 1143 | interval_start = tk->tkr_mono.cycle_last; |
| 1113 | if (!cycle_between(interval_start, cycles, now)) { | 1144 | if (!cycle_between(interval_start, cycles, now)) { |
| 1114 | clock_was_set_seq = tk->clock_was_set_seq; | 1145 | clock_was_set_seq = tk->clock_was_set_seq; |
| @@ -1353,19 +1384,18 @@ int timekeeping_notify(struct clocksource *clock) | |||
| 1353 | void getrawmonotonic64(struct timespec64 *ts) | 1384 | void getrawmonotonic64(struct timespec64 *ts) |
| 1354 | { | 1385 | { |
| 1355 | struct timekeeper *tk = &tk_core.timekeeper; | 1386 | struct timekeeper *tk = &tk_core.timekeeper; |
| 1356 | struct timespec64 ts64; | ||
| 1357 | unsigned long seq; | 1387 | unsigned long seq; |
| 1358 | u64 nsecs; | 1388 | u64 nsecs; |
| 1359 | 1389 | ||
| 1360 | do { | 1390 | do { |
| 1361 | seq = read_seqcount_begin(&tk_core.seq); | 1391 | seq = read_seqcount_begin(&tk_core.seq); |
| 1392 | ts->tv_sec = tk->raw_sec; | ||
| 1362 | nsecs = timekeeping_get_ns(&tk->tkr_raw); | 1393 | nsecs = timekeeping_get_ns(&tk->tkr_raw); |
| 1363 | ts64 = tk->raw_time; | ||
| 1364 | 1394 | ||
| 1365 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 1395 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
| 1366 | 1396 | ||
| 1367 | timespec64_add_ns(&ts64, nsecs); | 1397 | ts->tv_nsec = 0; |
| 1368 | *ts = ts64; | 1398 | timespec64_add_ns(ts, nsecs); |
| 1369 | } | 1399 | } |
| 1370 | EXPORT_SYMBOL(getrawmonotonic64); | 1400 | EXPORT_SYMBOL(getrawmonotonic64); |
| 1371 | 1401 | ||
| @@ -1489,8 +1519,7 @@ void __init timekeeping_init(void) | |||
| 1489 | tk_setup_internals(tk, clock); | 1519 | tk_setup_internals(tk, clock); |
| 1490 | 1520 | ||
| 1491 | tk_set_xtime(tk, &now); | 1521 | tk_set_xtime(tk, &now); |
| 1492 | tk->raw_time.tv_sec = 0; | 1522 | tk->raw_sec = 0; |
| 1493 | tk->raw_time.tv_nsec = 0; | ||
| 1494 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) | 1523 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) |
| 1495 | boot = tk_xtime(tk); | 1524 | boot = tk_xtime(tk); |
| 1496 | 1525 | ||
| @@ -1629,7 +1658,7 @@ void timekeeping_resume(void) | |||
| 1629 | * The less preferred source will only be tried if there is no better | 1658 | * The less preferred source will only be tried if there is no better |
| 1630 | * usable source. The rtc part is handled separately in rtc core code. | 1659 | * usable source. The rtc part is handled separately in rtc core code. |
| 1631 | */ | 1660 | */ |
| 1632 | cycle_now = tk->tkr_mono.read(clock); | 1661 | cycle_now = tk_clock_read(&tk->tkr_mono); |
| 1633 | if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && | 1662 | if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && |
| 1634 | cycle_now > tk->tkr_mono.cycle_last) { | 1663 | cycle_now > tk->tkr_mono.cycle_last) { |
| 1635 | u64 nsec, cyc_delta; | 1664 | u64 nsec, cyc_delta; |
| @@ -1976,7 +2005,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, | |||
| 1976 | u32 shift, unsigned int *clock_set) | 2005 | u32 shift, unsigned int *clock_set) |
| 1977 | { | 2006 | { |
| 1978 | u64 interval = tk->cycle_interval << shift; | 2007 | u64 interval = tk->cycle_interval << shift; |
| 1979 | u64 raw_nsecs; | 2008 | u64 snsec_per_sec; |
| 1980 | 2009 | ||
| 1981 | /* If the offset is smaller than a shifted interval, do nothing */ | 2010 | /* If the offset is smaller than a shifted interval, do nothing */ |
| 1982 | if (offset < interval) | 2011 | if (offset < interval) |
| @@ -1991,14 +2020,12 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, | |||
| 1991 | *clock_set |= accumulate_nsecs_to_secs(tk); | 2020 | *clock_set |= accumulate_nsecs_to_secs(tk); |
| 1992 | 2021 | ||
| 1993 | /* Accumulate raw time */ | 2022 | /* Accumulate raw time */ |
| 1994 | raw_nsecs = (u64)tk->raw_interval << shift; | 2023 | tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; |
| 1995 | raw_nsecs += tk->raw_time.tv_nsec; | 2024 | snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; |
| 1996 | if (raw_nsecs >= NSEC_PER_SEC) { | 2025 | while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { |
| 1997 | u64 raw_secs = raw_nsecs; | 2026 | tk->tkr_raw.xtime_nsec -= snsec_per_sec; |
| 1998 | raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); | 2027 | tk->raw_sec++; |
| 1999 | tk->raw_time.tv_sec += raw_secs; | ||
| 2000 | } | 2028 | } |
| 2001 | tk->raw_time.tv_nsec = raw_nsecs; | ||
| 2002 | 2029 | ||
| 2003 | /* Accumulate error between NTP and clock interval */ | 2030 | /* Accumulate error between NTP and clock interval */ |
| 2004 | tk->ntp_error += tk->ntp_tick << shift; | 2031 | tk->ntp_error += tk->ntp_tick << shift; |
| @@ -2030,7 +2057,7 @@ void update_wall_time(void) | |||
| 2030 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET | 2057 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET |
| 2031 | offset = real_tk->cycle_interval; | 2058 | offset = real_tk->cycle_interval; |
| 2032 | #else | 2059 | #else |
| 2033 | offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock), | 2060 | offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), |
| 2034 | tk->tkr_mono.cycle_last, tk->tkr_mono.mask); | 2061 | tk->tkr_mono.cycle_last, tk->tkr_mono.mask); |
| 2035 | #endif | 2062 | #endif |
| 2036 | 2063 | ||
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 152a706ef8b8..71ce3f4eead3 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
| @@ -195,7 +195,7 @@ EXPORT_SYMBOL(jiffies_64); | |||
| 195 | #endif | 195 | #endif |
| 196 | 196 | ||
| 197 | struct timer_base { | 197 | struct timer_base { |
| 198 | spinlock_t lock; | 198 | raw_spinlock_t lock; |
| 199 | struct timer_list *running_timer; | 199 | struct timer_list *running_timer; |
| 200 | unsigned long clk; | 200 | unsigned long clk; |
| 201 | unsigned long next_expiry; | 201 | unsigned long next_expiry; |
| @@ -913,10 +913,10 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, | |||
| 913 | 913 | ||
| 914 | if (!(tf & TIMER_MIGRATING)) { | 914 | if (!(tf & TIMER_MIGRATING)) { |
| 915 | base = get_timer_base(tf); | 915 | base = get_timer_base(tf); |
| 916 | spin_lock_irqsave(&base->lock, *flags); | 916 | raw_spin_lock_irqsave(&base->lock, *flags); |
| 917 | if (timer->flags == tf) | 917 | if (timer->flags == tf) |
| 918 | return base; | 918 | return base; |
| 919 | spin_unlock_irqrestore(&base->lock, *flags); | 919 | raw_spin_unlock_irqrestore(&base->lock, *flags); |
| 920 | } | 920 | } |
| 921 | cpu_relax(); | 921 | cpu_relax(); |
| 922 | } | 922 | } |
| @@ -986,9 +986,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) | |||
| 986 | /* See the comment in lock_timer_base() */ | 986 | /* See the comment in lock_timer_base() */ |
| 987 | timer->flags |= TIMER_MIGRATING; | 987 | timer->flags |= TIMER_MIGRATING; |
| 988 | 988 | ||
| 989 | spin_unlock(&base->lock); | 989 | raw_spin_unlock(&base->lock); |
| 990 | base = new_base; | 990 | base = new_base; |
| 991 | spin_lock(&base->lock); | 991 | raw_spin_lock(&base->lock); |
| 992 | WRITE_ONCE(timer->flags, | 992 | WRITE_ONCE(timer->flags, |
| 993 | (timer->flags & ~TIMER_BASEMASK) | base->cpu); | 993 | (timer->flags & ~TIMER_BASEMASK) | base->cpu); |
| 994 | } | 994 | } |
| @@ -1013,7 +1013,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) | |||
| 1013 | } | 1013 | } |
| 1014 | 1014 | ||
| 1015 | out_unlock: | 1015 | out_unlock: |
| 1016 | spin_unlock_irqrestore(&base->lock, flags); | 1016 | raw_spin_unlock_irqrestore(&base->lock, flags); |
| 1017 | 1017 | ||
| 1018 | return ret; | 1018 | return ret; |
| 1019 | } | 1019 | } |
| @@ -1106,16 +1106,16 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
| 1106 | if (base != new_base) { | 1106 | if (base != new_base) { |
| 1107 | timer->flags |= TIMER_MIGRATING; | 1107 | timer->flags |= TIMER_MIGRATING; |
| 1108 | 1108 | ||
| 1109 | spin_unlock(&base->lock); | 1109 | raw_spin_unlock(&base->lock); |
| 1110 | base = new_base; | 1110 | base = new_base; |
| 1111 | spin_lock(&base->lock); | 1111 | raw_spin_lock(&base->lock); |
| 1112 | WRITE_ONCE(timer->flags, | 1112 | WRITE_ONCE(timer->flags, |
| 1113 | (timer->flags & ~TIMER_BASEMASK) | cpu); | 1113 | (timer->flags & ~TIMER_BASEMASK) | cpu); |
| 1114 | } | 1114 | } |
| 1115 | 1115 | ||
| 1116 | debug_activate(timer, timer->expires); | 1116 | debug_activate(timer, timer->expires); |
| 1117 | internal_add_timer(base, timer); | 1117 | internal_add_timer(base, timer); |
| 1118 | spin_unlock_irqrestore(&base->lock, flags); | 1118 | raw_spin_unlock_irqrestore(&base->lock, flags); |
| 1119 | } | 1119 | } |
| 1120 | EXPORT_SYMBOL_GPL(add_timer_on); | 1120 | EXPORT_SYMBOL_GPL(add_timer_on); |
| 1121 | 1121 | ||
| @@ -1141,7 +1141,7 @@ int del_timer(struct timer_list *timer) | |||
| 1141 | if (timer_pending(timer)) { | 1141 | if (timer_pending(timer)) { |
| 1142 | base = lock_timer_base(timer, &flags); | 1142 | base = lock_timer_base(timer, &flags); |
| 1143 | ret = detach_if_pending(timer, base, true); | 1143 | ret = detach_if_pending(timer, base, true); |
| 1144 | spin_unlock_irqrestore(&base->lock, flags); | 1144 | raw_spin_unlock_irqrestore(&base->lock, flags); |
| 1145 | } | 1145 | } |
| 1146 | 1146 | ||
| 1147 | return ret; | 1147 | return ret; |
| @@ -1150,7 +1150,7 @@ EXPORT_SYMBOL(del_timer); | |||
| 1150 | 1150 | ||
| 1151 | /** | 1151 | /** |
| 1152 | * try_to_del_timer_sync - Try to deactivate a timer | 1152 | * try_to_del_timer_sync - Try to deactivate a timer |
| 1153 | * @timer: timer do del | 1153 | * @timer: timer to delete |
| 1154 | * | 1154 | * |
| 1155 | * This function tries to deactivate a timer. Upon successful (ret >= 0) | 1155 | * This function tries to deactivate a timer. Upon successful (ret >= 0) |
| 1156 | * exit the timer is not queued and the handler is not running on any CPU. | 1156 | * exit the timer is not queued and the handler is not running on any CPU. |
| @@ -1168,7 +1168,7 @@ int try_to_del_timer_sync(struct timer_list *timer) | |||
| 1168 | if (base->running_timer != timer) | 1168 | if (base->running_timer != timer) |
| 1169 | ret = detach_if_pending(timer, base, true); | 1169 | ret = detach_if_pending(timer, base, true); |
| 1170 | 1170 | ||
| 1171 | spin_unlock_irqrestore(&base->lock, flags); | 1171 | raw_spin_unlock_irqrestore(&base->lock, flags); |
| 1172 | 1172 | ||
| 1173 | return ret; | 1173 | return ret; |
| 1174 | } | 1174 | } |
| @@ -1299,13 +1299,13 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) | |||
| 1299 | data = timer->data; | 1299 | data = timer->data; |
| 1300 | 1300 | ||
| 1301 | if (timer->flags & TIMER_IRQSAFE) { | 1301 | if (timer->flags & TIMER_IRQSAFE) { |
| 1302 | spin_unlock(&base->lock); | 1302 | raw_spin_unlock(&base->lock); |
| 1303 | call_timer_fn(timer, fn, data); | 1303 | call_timer_fn(timer, fn, data); |
| 1304 | spin_lock(&base->lock); | 1304 | raw_spin_lock(&base->lock); |
| 1305 | } else { | 1305 | } else { |
| 1306 | spin_unlock_irq(&base->lock); | 1306 | raw_spin_unlock_irq(&base->lock); |
| 1307 | call_timer_fn(timer, fn, data); | 1307 | call_timer_fn(timer, fn, data); |
| 1308 | spin_lock_irq(&base->lock); | 1308 | raw_spin_lock_irq(&base->lock); |
| 1309 | } | 1309 | } |
| 1310 | } | 1310 | } |
| 1311 | } | 1311 | } |
| @@ -1474,7 +1474,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) | |||
| 1474 | if (cpu_is_offline(smp_processor_id())) | 1474 | if (cpu_is_offline(smp_processor_id())) |
| 1475 | return expires; | 1475 | return expires; |
| 1476 | 1476 | ||
| 1477 | spin_lock(&base->lock); | 1477 | raw_spin_lock(&base->lock); |
| 1478 | nextevt = __next_timer_interrupt(base); | 1478 | nextevt = __next_timer_interrupt(base); |
| 1479 | is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); | 1479 | is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); |
| 1480 | base->next_expiry = nextevt; | 1480 | base->next_expiry = nextevt; |
| @@ -1502,7 +1502,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) | |||
| 1502 | if ((expires - basem) > TICK_NSEC) | 1502 | if ((expires - basem) > TICK_NSEC) |
| 1503 | base->is_idle = true; | 1503 | base->is_idle = true; |
| 1504 | } | 1504 | } |
| 1505 | spin_unlock(&base->lock); | 1505 | raw_spin_unlock(&base->lock); |
| 1506 | 1506 | ||
| 1507 | return cmp_next_hrtimer_event(basem, expires); | 1507 | return cmp_next_hrtimer_event(basem, expires); |
| 1508 | } | 1508 | } |
| @@ -1590,7 +1590,7 @@ static inline void __run_timers(struct timer_base *base) | |||
| 1590 | if (!time_after_eq(jiffies, base->clk)) | 1590 | if (!time_after_eq(jiffies, base->clk)) |
| 1591 | return; | 1591 | return; |
| 1592 | 1592 | ||
| 1593 | spin_lock_irq(&base->lock); | 1593 | raw_spin_lock_irq(&base->lock); |
| 1594 | 1594 | ||
| 1595 | while (time_after_eq(jiffies, base->clk)) { | 1595 | while (time_after_eq(jiffies, base->clk)) { |
| 1596 | 1596 | ||
| @@ -1601,7 +1601,7 @@ static inline void __run_timers(struct timer_base *base) | |||
| 1601 | expire_timers(base, heads + levels); | 1601 | expire_timers(base, heads + levels); |
| 1602 | } | 1602 | } |
| 1603 | base->running_timer = NULL; | 1603 | base->running_timer = NULL; |
| 1604 | spin_unlock_irq(&base->lock); | 1604 | raw_spin_unlock_irq(&base->lock); |
| 1605 | } | 1605 | } |
| 1606 | 1606 | ||
| 1607 | /* | 1607 | /* |
| @@ -1786,16 +1786,16 @@ int timers_dead_cpu(unsigned int cpu) | |||
| 1786 | * The caller is globally serialized and nobody else | 1786 | * The caller is globally serialized and nobody else |
| 1787 | * takes two locks at once, deadlock is not possible. | 1787 | * takes two locks at once, deadlock is not possible. |
| 1788 | */ | 1788 | */ |
| 1789 | spin_lock_irq(&new_base->lock); | 1789 | raw_spin_lock_irq(&new_base->lock); |
| 1790 | spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); | 1790 | raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); |
| 1791 | 1791 | ||
| 1792 | BUG_ON(old_base->running_timer); | 1792 | BUG_ON(old_base->running_timer); |
| 1793 | 1793 | ||
| 1794 | for (i = 0; i < WHEEL_SIZE; i++) | 1794 | for (i = 0; i < WHEEL_SIZE; i++) |
| 1795 | migrate_timer_list(new_base, old_base->vectors + i); | 1795 | migrate_timer_list(new_base, old_base->vectors + i); |
| 1796 | 1796 | ||
| 1797 | spin_unlock(&old_base->lock); | 1797 | raw_spin_unlock(&old_base->lock); |
| 1798 | spin_unlock_irq(&new_base->lock); | 1798 | raw_spin_unlock_irq(&new_base->lock); |
| 1799 | put_cpu_ptr(&timer_bases); | 1799 | put_cpu_ptr(&timer_bases); |
| 1800 | } | 1800 | } |
| 1801 | return 0; | 1801 | return 0; |
| @@ -1811,7 +1811,7 @@ static void __init init_timer_cpu(int cpu) | |||
| 1811 | for (i = 0; i < NR_BASES; i++) { | 1811 | for (i = 0; i < NR_BASES; i++) { |
| 1812 | base = per_cpu_ptr(&timer_bases[i], cpu); | 1812 | base = per_cpu_ptr(&timer_bases[i], cpu); |
| 1813 | base->cpu = cpu; | 1813 | base->cpu = cpu; |
| 1814 | spin_lock_init(&base->lock); | 1814 | raw_spin_lock_init(&base->lock); |
| 1815 | base->clk = jiffies; | 1815 | base->clk = jiffies; |
| 1816 | } | 1816 | } |
| 1817 | } | 1817 | } |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 7e06f04e98fe..434c840e2d82 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
| @@ -667,30 +667,30 @@ config RING_BUFFER_STARTUP_TEST | |||
| 667 | 667 | ||
| 668 | If unsure, say N | 668 | If unsure, say N |
| 669 | 669 | ||
| 670 | config TRACE_ENUM_MAP_FILE | 670 | config TRACE_EVAL_MAP_FILE |
| 671 | bool "Show enum mappings for trace events" | 671 | bool "Show eval mappings for trace events" |
| 672 | depends on TRACING | 672 | depends on TRACING |
| 673 | help | 673 | help |
| 674 | The "print fmt" of the trace events will show the enum names instead | 674 | The "print fmt" of the trace events will show the enum/sizeof names |
| 675 | of their values. This can cause problems for user space tools that | 675 | instead of their values. This can cause problems for user space tools |
| 676 | use this string to parse the raw data as user space does not know | 676 | that use this string to parse the raw data as user space does not know |
| 677 | how to convert the string to its value. | 677 | how to convert the string to its value. |
| 678 | 678 | ||
| 679 | To fix this, there's a special macro in the kernel that can be used | 679 | To fix this, there's a special macro in the kernel that can be used |
| 680 | to convert the enum into its value. If this macro is used, then the | 680 | to convert an enum/sizeof into its value. If this macro is used, then |
| 681 | print fmt strings will have the enums converted to their values. | 681 | the print fmt strings will be converted to their values. |
| 682 | 682 | ||
| 683 | If something does not get converted properly, this option can be | 683 | If something does not get converted properly, this option can be |
| 684 | used to show what enums the kernel tried to convert. | 684 | used to show what enums/sizeof the kernel tried to convert. |
| 685 | 685 | ||
| 686 | This option is for debugging the enum conversions. A file is created | 686 | This option is for debugging the conversions. A file is created |
| 687 | in the tracing directory called "enum_map" that will show the enum | 687 | in the tracing directory called "eval_map" that will show the |
| 688 | names matched with their values and what trace event system they | 688 | names matched with their values and what trace event system they |
| 689 | belong too. | 689 | belong too. |
| 690 | 690 | ||
| 691 | Normally, the mapping of the strings to values will be freed after | 691 | Normally, the mapping of the strings to values will be freed after |
| 692 | boot up or module load. With this option, they will not be freed, as | 692 | boot up or module load. With this option, they will not be freed, as |
| 693 | they are needed for the "enum_map" file. Enabling this option will | 693 | they are needed for the "eval_map" file. Enabling this option will |
| 694 | increase the memory footprint of the running kernel. | 694 | increase the memory footprint of the running kernel. |
| 695 | 695 | ||
| 696 | If unsure, say N | 696 | If unsure, say N |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 193c5f5e3f79..bc364f86100a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
| @@ -867,7 +867,7 @@ static void blk_add_trace_split(void *ignore, | |||
| 867 | 867 | ||
| 868 | __blk_add_trace(bt, bio->bi_iter.bi_sector, | 868 | __blk_add_trace(bt, bio->bi_iter.bi_sector, |
| 869 | bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, | 869 | bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, |
| 870 | BLK_TA_SPLIT, bio->bi_error, sizeof(rpdu), | 870 | BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), |
| 871 | &rpdu); | 871 | &rpdu); |
| 872 | } | 872 | } |
| 873 | } | 873 | } |
| @@ -900,7 +900,7 @@ static void blk_add_trace_bio_remap(void *ignore, | |||
| 900 | r.sector_from = cpu_to_be64(from); | 900 | r.sector_from = cpu_to_be64(from); |
| 901 | 901 | ||
| 902 | __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, | 902 | __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, |
| 903 | bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_error, | 903 | bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, |
| 904 | sizeof(r), &r); | 904 | sizeof(r), &r); |
| 905 | } | 905 | } |
| 906 | 906 | ||
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 460a031c77e5..37385193a608 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
| @@ -122,8 +122,8 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) | |||
| 122 | } | 122 | } |
| 123 | 123 | ||
| 124 | /* | 124 | /* |
| 125 | * limited trace_printk() | 125 | * Only limited trace_printk() conversion specifiers allowed: |
| 126 | * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed | 126 | * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s |
| 127 | */ | 127 | */ |
| 128 | BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, | 128 | BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, |
| 129 | u64, arg2, u64, arg3) | 129 | u64, arg2, u64, arg3) |
| @@ -198,7 +198,8 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, | |||
| 198 | i++; | 198 | i++; |
| 199 | } | 199 | } |
| 200 | 200 | ||
| 201 | if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') | 201 | if (fmt[i] != 'i' && fmt[i] != 'd' && |
| 202 | fmt[i] != 'u' && fmt[i] != 'x') | ||
| 202 | return -EINVAL; | 203 | return -EINVAL; |
| 203 | fmt_cnt++; | 204 | fmt_cnt++; |
| 204 | } | 205 | } |
| @@ -234,7 +235,8 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) | |||
| 234 | unsigned int cpu = smp_processor_id(); | 235 | unsigned int cpu = smp_processor_id(); |
| 235 | u64 index = flags & BPF_F_INDEX_MASK; | 236 | u64 index = flags & BPF_F_INDEX_MASK; |
| 236 | struct bpf_event_entry *ee; | 237 | struct bpf_event_entry *ee; |
| 237 | struct perf_event *event; | 238 | u64 value = 0; |
| 239 | int err; | ||
| 238 | 240 | ||
| 239 | if (unlikely(flags & ~(BPF_F_INDEX_MASK))) | 241 | if (unlikely(flags & ~(BPF_F_INDEX_MASK))) |
| 240 | return -EINVAL; | 242 | return -EINVAL; |
| @@ -247,21 +249,14 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) | |||
| 247 | if (!ee) | 249 | if (!ee) |
| 248 | return -ENOENT; | 250 | return -ENOENT; |
| 249 | 251 | ||
| 250 | event = ee->event; | 252 | err = perf_event_read_local(ee->event, &value); |
| 251 | if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && | ||
| 252 | event->attr.type != PERF_TYPE_RAW)) | ||
| 253 | return -EINVAL; | ||
| 254 | |||
| 255 | /* make sure event is local and doesn't have pmu::count */ | ||
| 256 | if (unlikely(event->oncpu != cpu || event->pmu->count)) | ||
| 257 | return -EINVAL; | ||
| 258 | |||
| 259 | /* | 253 | /* |
| 260 | * we don't know if the function is run successfully by the | 254 | * this api is ugly since we miss [-22..-2] range of valid |
| 261 | * return value. It can be judged in other places, such as | 255 | * counter values, but that's uapi |
| 262 | * eBPF programs. | ||
| 263 | */ | 256 | */ |
| 264 | return perf_event_read_local(event); | 257 | if (err) |
| 258 | return err; | ||
| 259 | return value; | ||
| 265 | } | 260 | } |
| 266 | 261 | ||
| 267 | static const struct bpf_func_proto bpf_perf_event_read_proto = { | 262 | static const struct bpf_func_proto bpf_perf_event_read_proto = { |
| @@ -272,14 +267,16 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { | |||
| 272 | .arg2_type = ARG_ANYTHING, | 267 | .arg2_type = ARG_ANYTHING, |
| 273 | }; | 268 | }; |
| 274 | 269 | ||
| 270 | static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); | ||
| 271 | |||
| 275 | static __always_inline u64 | 272 | static __always_inline u64 |
| 276 | __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, | 273 | __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, |
| 277 | u64 flags, struct perf_raw_record *raw) | 274 | u64 flags, struct perf_raw_record *raw) |
| 278 | { | 275 | { |
| 279 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 276 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
| 277 | struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd); | ||
| 280 | unsigned int cpu = smp_processor_id(); | 278 | unsigned int cpu = smp_processor_id(); |
| 281 | u64 index = flags & BPF_F_INDEX_MASK; | 279 | u64 index = flags & BPF_F_INDEX_MASK; |
| 282 | struct perf_sample_data sample_data; | ||
| 283 | struct bpf_event_entry *ee; | 280 | struct bpf_event_entry *ee; |
| 284 | struct perf_event *event; | 281 | struct perf_event *event; |
| 285 | 282 | ||
| @@ -300,9 +297,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, | |||
| 300 | if (unlikely(event->oncpu != cpu)) | 297 | if (unlikely(event->oncpu != cpu)) |
| 301 | return -EOPNOTSUPP; | 298 | return -EOPNOTSUPP; |
| 302 | 299 | ||
| 303 | perf_sample_data_init(&sample_data, 0, 0); | 300 | perf_sample_data_init(sd, 0, 0); |
| 304 | sample_data.raw = raw; | 301 | sd->raw = raw; |
| 305 | perf_event_output(event, &sample_data, regs); | 302 | perf_event_output(event, sd, regs); |
| 306 | return 0; | 303 | return 0; |
| 307 | } | 304 | } |
| 308 | 305 | ||
| @@ -483,7 +480,7 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func | |||
| 483 | 480 | ||
| 484 | /* bpf+kprobe programs can access fields of 'struct pt_regs' */ | 481 | /* bpf+kprobe programs can access fields of 'struct pt_regs' */ |
| 485 | static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, | 482 | static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, |
| 486 | enum bpf_reg_type *reg_type) | 483 | struct bpf_insn_access_aux *info) |
| 487 | { | 484 | { |
| 488 | if (off < 0 || off >= sizeof(struct pt_regs)) | 485 | if (off < 0 || off >= sizeof(struct pt_regs)) |
| 489 | return false; | 486 | return false; |
| @@ -566,7 +563,7 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) | |||
| 566 | } | 563 | } |
| 567 | 564 | ||
| 568 | static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, | 565 | static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, |
| 569 | enum bpf_reg_type *reg_type) | 566 | struct bpf_insn_access_aux *info) |
| 570 | { | 567 | { |
| 571 | if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) | 568 | if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) |
| 572 | return false; | 569 | return false; |
| @@ -585,40 +582,47 @@ const struct bpf_verifier_ops tracepoint_prog_ops = { | |||
| 585 | }; | 582 | }; |
| 586 | 583 | ||
| 587 | static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, | 584 | static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, |
| 588 | enum bpf_reg_type *reg_type) | 585 | struct bpf_insn_access_aux *info) |
| 589 | { | 586 | { |
| 587 | const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data, | ||
| 588 | sample_period); | ||
| 589 | |||
| 590 | if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) | 590 | if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) |
| 591 | return false; | 591 | return false; |
| 592 | if (type != BPF_READ) | 592 | if (type != BPF_READ) |
| 593 | return false; | 593 | return false; |
| 594 | if (off % size != 0) | 594 | if (off % size != 0) |
| 595 | return false; | 595 | return false; |
| 596 | if (off == offsetof(struct bpf_perf_event_data, sample_period)) { | 596 | |
| 597 | if (size != sizeof(u64)) | 597 | switch (off) { |
| 598 | case bpf_ctx_range(struct bpf_perf_event_data, sample_period): | ||
| 599 | bpf_ctx_record_field_size(info, size_sp); | ||
| 600 | if (!bpf_ctx_narrow_access_ok(off, size, size_sp)) | ||
| 598 | return false; | 601 | return false; |
| 599 | } else { | 602 | break; |
| 603 | default: | ||
| 600 | if (size != sizeof(long)) | 604 | if (size != sizeof(long)) |
| 601 | return false; | 605 | return false; |
| 602 | } | 606 | } |
| 607 | |||
| 603 | return true; | 608 | return true; |
| 604 | } | 609 | } |
| 605 | 610 | ||
| 606 | static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, | 611 | static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, |
| 607 | const struct bpf_insn *si, | 612 | const struct bpf_insn *si, |
| 608 | struct bpf_insn *insn_buf, | 613 | struct bpf_insn *insn_buf, |
| 609 | struct bpf_prog *prog) | 614 | struct bpf_prog *prog, u32 *target_size) |
| 610 | { | 615 | { |
| 611 | struct bpf_insn *insn = insn_buf; | 616 | struct bpf_insn *insn = insn_buf; |
| 612 | 617 | ||
| 613 | switch (si->off) { | 618 | switch (si->off) { |
| 614 | case offsetof(struct bpf_perf_event_data, sample_period): | 619 | case offsetof(struct bpf_perf_event_data, sample_period): |
| 615 | BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); | ||
| 616 | |||
| 617 | *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, | 620 | *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, |
| 618 | data), si->dst_reg, si->src_reg, | 621 | data), si->dst_reg, si->src_reg, |
| 619 | offsetof(struct bpf_perf_event_data_kern, data)); | 622 | offsetof(struct bpf_perf_event_data_kern, data)); |
| 620 | *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, | 623 | *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, |
| 621 | offsetof(struct perf_sample_data, period)); | 624 | bpf_target_off(struct perf_sample_data, period, 8, |
| 625 | target_size)); | ||
| 622 | break; | 626 | break; |
| 623 | default: | 627 | default: |
| 624 | *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, | 628 | *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 74fdfe9ed3db..02004ae91860 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -113,7 +113,7 @@ static int ftrace_disabled __read_mostly; | |||
| 113 | 113 | ||
| 114 | static DEFINE_MUTEX(ftrace_lock); | 114 | static DEFINE_MUTEX(ftrace_lock); |
| 115 | 115 | ||
| 116 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; | 116 | static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; |
| 117 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; | 117 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; |
| 118 | static struct ftrace_ops global_ops; | 118 | static struct ftrace_ops global_ops; |
| 119 | 119 | ||
| @@ -169,8 +169,11 @@ int ftrace_nr_registered_ops(void) | |||
| 169 | 169 | ||
| 170 | mutex_lock(&ftrace_lock); | 170 | mutex_lock(&ftrace_lock); |
| 171 | 171 | ||
| 172 | for (ops = ftrace_ops_list; | 172 | for (ops = rcu_dereference_protected(ftrace_ops_list, |
| 173 | ops != &ftrace_list_end; ops = ops->next) | 173 | lockdep_is_held(&ftrace_lock)); |
| 174 | ops != &ftrace_list_end; | ||
| 175 | ops = rcu_dereference_protected(ops->next, | ||
| 176 | lockdep_is_held(&ftrace_lock))) | ||
| 174 | cnt++; | 177 | cnt++; |
| 175 | 178 | ||
| 176 | mutex_unlock(&ftrace_lock); | 179 | mutex_unlock(&ftrace_lock); |
| @@ -275,10 +278,11 @@ static void update_ftrace_function(void) | |||
| 275 | * If there's only one ftrace_ops registered, the ftrace_ops_list | 278 | * If there's only one ftrace_ops registered, the ftrace_ops_list |
| 276 | * will point to the ops we want. | 279 | * will point to the ops we want. |
| 277 | */ | 280 | */ |
| 278 | set_function_trace_op = ftrace_ops_list; | 281 | set_function_trace_op = rcu_dereference_protected(ftrace_ops_list, |
| 282 | lockdep_is_held(&ftrace_lock)); | ||
| 279 | 283 | ||
| 280 | /* If there's no ftrace_ops registered, just call the stub function */ | 284 | /* If there's no ftrace_ops registered, just call the stub function */ |
| 281 | if (ftrace_ops_list == &ftrace_list_end) { | 285 | if (set_function_trace_op == &ftrace_list_end) { |
| 282 | func = ftrace_stub; | 286 | func = ftrace_stub; |
| 283 | 287 | ||
| 284 | /* | 288 | /* |
| @@ -286,7 +290,8 @@ static void update_ftrace_function(void) | |||
| 286 | * recursion safe and not dynamic and the arch supports passing ops, | 290 | * recursion safe and not dynamic and the arch supports passing ops, |
| 287 | * then have the mcount trampoline call the function directly. | 291 | * then have the mcount trampoline call the function directly. |
| 288 | */ | 292 | */ |
| 289 | } else if (ftrace_ops_list->next == &ftrace_list_end) { | 293 | } else if (rcu_dereference_protected(ftrace_ops_list->next, |
| 294 | lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { | ||
| 290 | func = ftrace_ops_get_list_func(ftrace_ops_list); | 295 | func = ftrace_ops_get_list_func(ftrace_ops_list); |
| 291 | 296 | ||
| 292 | } else { | 297 | } else { |
| @@ -348,9 +353,11 @@ int using_ftrace_ops_list_func(void) | |||
| 348 | return ftrace_trace_function == ftrace_ops_list_func; | 353 | return ftrace_trace_function == ftrace_ops_list_func; |
| 349 | } | 354 | } |
| 350 | 355 | ||
| 351 | static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) | 356 | static void add_ftrace_ops(struct ftrace_ops __rcu **list, |
| 357 | struct ftrace_ops *ops) | ||
| 352 | { | 358 | { |
| 353 | ops->next = *list; | 359 | rcu_assign_pointer(ops->next, *list); |
| 360 | |||
| 354 | /* | 361 | /* |
| 355 | * We are entering ops into the list but another | 362 | * We are entering ops into the list but another |
| 356 | * CPU might be walking that list. We need to make sure | 363 | * CPU might be walking that list. We need to make sure |
| @@ -360,7 +367,8 @@ static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) | |||
| 360 | rcu_assign_pointer(*list, ops); | 367 | rcu_assign_pointer(*list, ops); |
| 361 | } | 368 | } |
| 362 | 369 | ||
| 363 | static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) | 370 | static int remove_ftrace_ops(struct ftrace_ops __rcu **list, |
| 371 | struct ftrace_ops *ops) | ||
| 364 | { | 372 | { |
| 365 | struct ftrace_ops **p; | 373 | struct ftrace_ops **p; |
| 366 | 374 | ||
| @@ -368,7 +376,10 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) | |||
| 368 | * If we are removing the last function, then simply point | 376 | * If we are removing the last function, then simply point |
| 369 | * to the ftrace_stub. | 377 | * to the ftrace_stub. |
| 370 | */ | 378 | */ |
| 371 | if (*list == ops && ops->next == &ftrace_list_end) { | 379 | if (rcu_dereference_protected(*list, |
| 380 | lockdep_is_held(&ftrace_lock)) == ops && | ||
| 381 | rcu_dereference_protected(ops->next, | ||
| 382 | lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { | ||
| 372 | *list = &ftrace_list_end; | 383 | *list = &ftrace_list_end; |
| 373 | return 0; | 384 | return 0; |
| 374 | } | 385 | } |
| @@ -1293,6 +1304,28 @@ static void ftrace_hash_clear(struct ftrace_hash *hash) | |||
| 1293 | FTRACE_WARN_ON(hash->count); | 1304 | FTRACE_WARN_ON(hash->count); |
| 1294 | } | 1305 | } |
| 1295 | 1306 | ||
| 1307 | static void free_ftrace_mod(struct ftrace_mod_load *ftrace_mod) | ||
| 1308 | { | ||
| 1309 | list_del(&ftrace_mod->list); | ||
| 1310 | kfree(ftrace_mod->module); | ||
| 1311 | kfree(ftrace_mod->func); | ||
| 1312 | kfree(ftrace_mod); | ||
| 1313 | } | ||
| 1314 | |||
| 1315 | static void clear_ftrace_mod_list(struct list_head *head) | ||
| 1316 | { | ||
| 1317 | struct ftrace_mod_load *p, *n; | ||
| 1318 | |||
| 1319 | /* stack tracer isn't supported yet */ | ||
| 1320 | if (!head) | ||
| 1321 | return; | ||
| 1322 | |||
| 1323 | mutex_lock(&ftrace_lock); | ||
| 1324 | list_for_each_entry_safe(p, n, head, list) | ||
| 1325 | free_ftrace_mod(p); | ||
| 1326 | mutex_unlock(&ftrace_lock); | ||
| 1327 | } | ||
| 1328 | |||
| 1296 | static void free_ftrace_hash(struct ftrace_hash *hash) | 1329 | static void free_ftrace_hash(struct ftrace_hash *hash) |
| 1297 | { | 1330 | { |
| 1298 | if (!hash || hash == EMPTY_HASH) | 1331 | if (!hash || hash == EMPTY_HASH) |
| @@ -1346,6 +1379,35 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits) | |||
| 1346 | return hash; | 1379 | return hash; |
| 1347 | } | 1380 | } |
| 1348 | 1381 | ||
| 1382 | |||
| 1383 | static int ftrace_add_mod(struct trace_array *tr, | ||
| 1384 | const char *func, const char *module, | ||
| 1385 | int enable) | ||
| 1386 | { | ||
| 1387 | struct ftrace_mod_load *ftrace_mod; | ||
| 1388 | struct list_head *mod_head = enable ? &tr->mod_trace : &tr->mod_notrace; | ||
| 1389 | |||
| 1390 | ftrace_mod = kzalloc(sizeof(*ftrace_mod), GFP_KERNEL); | ||
| 1391 | if (!ftrace_mod) | ||
| 1392 | return -ENOMEM; | ||
| 1393 | |||
| 1394 | ftrace_mod->func = kstrdup(func, GFP_KERNEL); | ||
| 1395 | ftrace_mod->module = kstrdup(module, GFP_KERNEL); | ||
| 1396 | ftrace_mod->enable = enable; | ||
| 1397 | |||
| 1398 | if (!ftrace_mod->func || !ftrace_mod->module) | ||
| 1399 | goto out_free; | ||
| 1400 | |||
| 1401 | list_add(&ftrace_mod->list, mod_head); | ||
| 1402 | |||
| 1403 | return 0; | ||
| 1404 | |||
| 1405 | out_free: | ||
| 1406 | free_ftrace_mod(ftrace_mod); | ||
| 1407 | |||
| 1408 | return -ENOMEM; | ||
| 1409 | } | ||
| 1410 | |||
| 1349 | static struct ftrace_hash * | 1411 | static struct ftrace_hash * |
| 1350 | alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) | 1412 | alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) |
| 1351 | { | 1413 | { |
| @@ -1359,6 +1421,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) | |||
| 1359 | if (!new_hash) | 1421 | if (!new_hash) |
| 1360 | return NULL; | 1422 | return NULL; |
| 1361 | 1423 | ||
| 1424 | if (hash) | ||
| 1425 | new_hash->flags = hash->flags; | ||
| 1426 | |||
| 1362 | /* Empty hash? */ | 1427 | /* Empty hash? */ |
| 1363 | if (ftrace_hash_empty(hash)) | 1428 | if (ftrace_hash_empty(hash)) |
| 1364 | return new_hash; | 1429 | return new_hash; |
| @@ -1403,7 +1468,7 @@ __ftrace_hash_move(struct ftrace_hash *src) | |||
| 1403 | /* | 1468 | /* |
| 1404 | * If the new source is empty, just return the empty_hash. | 1469 | * If the new source is empty, just return the empty_hash. |
| 1405 | */ | 1470 | */ |
| 1406 | if (!src->count) | 1471 | if (ftrace_hash_empty(src)) |
| 1407 | return EMPTY_HASH; | 1472 | return EMPTY_HASH; |
| 1408 | 1473 | ||
| 1409 | /* | 1474 | /* |
| @@ -1420,6 +1485,8 @@ __ftrace_hash_move(struct ftrace_hash *src) | |||
| 1420 | if (!new_hash) | 1485 | if (!new_hash) |
| 1421 | return NULL; | 1486 | return NULL; |
| 1422 | 1487 | ||
| 1488 | new_hash->flags = src->flags; | ||
| 1489 | |||
| 1423 | size = 1 << src->size_bits; | 1490 | size = 1 << src->size_bits; |
| 1424 | for (i = 0; i < size; i++) { | 1491 | for (i = 0; i < size; i++) { |
| 1425 | hhd = &src->buckets[i]; | 1492 | hhd = &src->buckets[i]; |
| @@ -1513,8 +1580,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) | |||
| 1513 | return 0; | 1580 | return 0; |
| 1514 | #endif | 1581 | #endif |
| 1515 | 1582 | ||
| 1516 | hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); | 1583 | rcu_assign_pointer(hash.filter_hash, ops->func_hash->filter_hash); |
| 1517 | hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); | 1584 | rcu_assign_pointer(hash.notrace_hash, ops->func_hash->notrace_hash); |
| 1518 | 1585 | ||
| 1519 | if (hash_contains_ip(ip, &hash)) | 1586 | if (hash_contains_ip(ip, &hash)) |
| 1520 | ret = 1; | 1587 | ret = 1; |
| @@ -1650,7 +1717,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
| 1650 | struct dyn_ftrace *rec; | 1717 | struct dyn_ftrace *rec; |
| 1651 | bool update = false; | 1718 | bool update = false; |
| 1652 | int count = 0; | 1719 | int count = 0; |
| 1653 | int all = 0; | 1720 | int all = false; |
| 1654 | 1721 | ||
| 1655 | /* Only update if the ops has been registered */ | 1722 | /* Only update if the ops has been registered */ |
| 1656 | if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) | 1723 | if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) |
| @@ -1671,7 +1738,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
| 1671 | hash = ops->func_hash->filter_hash; | 1738 | hash = ops->func_hash->filter_hash; |
| 1672 | other_hash = ops->func_hash->notrace_hash; | 1739 | other_hash = ops->func_hash->notrace_hash; |
| 1673 | if (ftrace_hash_empty(hash)) | 1740 | if (ftrace_hash_empty(hash)) |
| 1674 | all = 1; | 1741 | all = true; |
| 1675 | } else { | 1742 | } else { |
| 1676 | inc = !inc; | 1743 | inc = !inc; |
| 1677 | hash = ops->func_hash->notrace_hash; | 1744 | hash = ops->func_hash->notrace_hash; |
| @@ -2784,7 +2851,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
| 2784 | * If there's no more ops registered with ftrace, run a | 2851 | * If there's no more ops registered with ftrace, run a |
| 2785 | * sanity check to make sure all rec flags are cleared. | 2852 | * sanity check to make sure all rec flags are cleared. |
| 2786 | */ | 2853 | */ |
| 2787 | if (ftrace_ops_list == &ftrace_list_end) { | 2854 | if (rcu_dereference_protected(ftrace_ops_list, |
| 2855 | lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { | ||
| 2788 | struct ftrace_page *pg; | 2856 | struct ftrace_page *pg; |
| 2789 | struct dyn_ftrace *rec; | 2857 | struct dyn_ftrace *rec; |
| 2790 | 2858 | ||
| @@ -3061,6 +3129,7 @@ ftrace_allocate_pages(unsigned long num_to_init) | |||
| 3061 | struct ftrace_iterator { | 3129 | struct ftrace_iterator { |
| 3062 | loff_t pos; | 3130 | loff_t pos; |
| 3063 | loff_t func_pos; | 3131 | loff_t func_pos; |
| 3132 | loff_t mod_pos; | ||
| 3064 | struct ftrace_page *pg; | 3133 | struct ftrace_page *pg; |
| 3065 | struct dyn_ftrace *func; | 3134 | struct dyn_ftrace *func; |
| 3066 | struct ftrace_func_probe *probe; | 3135 | struct ftrace_func_probe *probe; |
| @@ -3068,6 +3137,8 @@ struct ftrace_iterator { | |||
| 3068 | struct trace_parser parser; | 3137 | struct trace_parser parser; |
| 3069 | struct ftrace_hash *hash; | 3138 | struct ftrace_hash *hash; |
| 3070 | struct ftrace_ops *ops; | 3139 | struct ftrace_ops *ops; |
| 3140 | struct trace_array *tr; | ||
| 3141 | struct list_head *mod_list; | ||
| 3071 | int pidx; | 3142 | int pidx; |
| 3072 | int idx; | 3143 | int idx; |
| 3073 | unsigned flags; | 3144 | unsigned flags; |
| @@ -3152,13 +3223,13 @@ static void *t_probe_start(struct seq_file *m, loff_t *pos) | |||
| 3152 | if (!(iter->flags & FTRACE_ITER_DO_PROBES)) | 3223 | if (!(iter->flags & FTRACE_ITER_DO_PROBES)) |
| 3153 | return NULL; | 3224 | return NULL; |
| 3154 | 3225 | ||
| 3155 | if (iter->func_pos > *pos) | 3226 | if (iter->mod_pos > *pos) |
| 3156 | return NULL; | 3227 | return NULL; |
| 3157 | 3228 | ||
| 3158 | iter->probe = NULL; | 3229 | iter->probe = NULL; |
| 3159 | iter->probe_entry = NULL; | 3230 | iter->probe_entry = NULL; |
| 3160 | iter->pidx = 0; | 3231 | iter->pidx = 0; |
| 3161 | for (l = 0; l <= (*pos - iter->func_pos); ) { | 3232 | for (l = 0; l <= (*pos - iter->mod_pos); ) { |
| 3162 | p = t_probe_next(m, &l); | 3233 | p = t_probe_next(m, &l); |
| 3163 | if (!p) | 3234 | if (!p) |
| 3164 | break; | 3235 | break; |
| @@ -3197,6 +3268,82 @@ t_probe_show(struct seq_file *m, struct ftrace_iterator *iter) | |||
| 3197 | } | 3268 | } |
| 3198 | 3269 | ||
| 3199 | static void * | 3270 | static void * |
| 3271 | t_mod_next(struct seq_file *m, loff_t *pos) | ||
| 3272 | { | ||
| 3273 | struct ftrace_iterator *iter = m->private; | ||
| 3274 | struct trace_array *tr = iter->tr; | ||
| 3275 | |||
| 3276 | (*pos)++; | ||
| 3277 | iter->pos = *pos; | ||
| 3278 | |||
| 3279 | iter->mod_list = iter->mod_list->next; | ||
| 3280 | |||
| 3281 | if (iter->mod_list == &tr->mod_trace || | ||
| 3282 | iter->mod_list == &tr->mod_notrace) { | ||
| 3283 | iter->flags &= ~FTRACE_ITER_MOD; | ||
| 3284 | return NULL; | ||
| 3285 | } | ||
| 3286 | |||
| 3287 | iter->mod_pos = *pos; | ||
| 3288 | |||
| 3289 | return iter; | ||
| 3290 | } | ||
| 3291 | |||
| 3292 | static void *t_mod_start(struct seq_file *m, loff_t *pos) | ||
| 3293 | { | ||
| 3294 | struct ftrace_iterator *iter = m->private; | ||
| 3295 | void *p = NULL; | ||
| 3296 | loff_t l; | ||
| 3297 | |||
| 3298 | if (iter->func_pos > *pos) | ||
| 3299 | return NULL; | ||
| 3300 | |||
| 3301 | iter->mod_pos = iter->func_pos; | ||
| 3302 | |||
| 3303 | /* probes are only available if tr is set */ | ||
| 3304 | if (!iter->tr) | ||
| 3305 | return NULL; | ||
| 3306 | |||
| 3307 | for (l = 0; l <= (*pos - iter->func_pos); ) { | ||
| 3308 | p = t_mod_next(m, &l); | ||
| 3309 | if (!p) | ||
| 3310 | break; | ||
| 3311 | } | ||
| 3312 | if (!p) { | ||
| 3313 | iter->flags &= ~FTRACE_ITER_MOD; | ||
| 3314 | return t_probe_start(m, pos); | ||
| 3315 | } | ||
| 3316 | |||
| 3317 | /* Only set this if we have an item */ | ||
| 3318 | iter->flags |= FTRACE_ITER_MOD; | ||
| 3319 | |||
| 3320 | return iter; | ||
| 3321 | } | ||
| 3322 | |||
| 3323 | static int | ||
| 3324 | t_mod_show(struct seq_file *m, struct ftrace_iterator *iter) | ||
| 3325 | { | ||
| 3326 | struct ftrace_mod_load *ftrace_mod; | ||
| 3327 | struct trace_array *tr = iter->tr; | ||
| 3328 | |||
| 3329 | if (WARN_ON_ONCE(!iter->mod_list) || | ||
| 3330 | iter->mod_list == &tr->mod_trace || | ||
| 3331 | iter->mod_list == &tr->mod_notrace) | ||
| 3332 | return -EIO; | ||
| 3333 | |||
| 3334 | ftrace_mod = list_entry(iter->mod_list, struct ftrace_mod_load, list); | ||
| 3335 | |||
| 3336 | if (ftrace_mod->func) | ||
| 3337 | seq_printf(m, "%s", ftrace_mod->func); | ||
| 3338 | else | ||
| 3339 | seq_putc(m, '*'); | ||
| 3340 | |||
| 3341 | seq_printf(m, ":mod:%s\n", ftrace_mod->module); | ||
| 3342 | |||
| 3343 | return 0; | ||
| 3344 | } | ||
| 3345 | |||
| 3346 | static void * | ||
| 3200 | t_func_next(struct seq_file *m, loff_t *pos) | 3347 | t_func_next(struct seq_file *m, loff_t *pos) |
| 3201 | { | 3348 | { |
| 3202 | struct ftrace_iterator *iter = m->private; | 3349 | struct ftrace_iterator *iter = m->private; |
| @@ -3237,7 +3384,7 @@ static void * | |||
| 3237 | t_next(struct seq_file *m, void *v, loff_t *pos) | 3384 | t_next(struct seq_file *m, void *v, loff_t *pos) |
| 3238 | { | 3385 | { |
| 3239 | struct ftrace_iterator *iter = m->private; | 3386 | struct ftrace_iterator *iter = m->private; |
| 3240 | loff_t l = *pos; /* t_hash_start() must use original pos */ | 3387 | loff_t l = *pos; /* t_probe_start() must use original pos */ |
| 3241 | void *ret; | 3388 | void *ret; |
| 3242 | 3389 | ||
| 3243 | if (unlikely(ftrace_disabled)) | 3390 | if (unlikely(ftrace_disabled)) |
| @@ -3246,16 +3393,19 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
| 3246 | if (iter->flags & FTRACE_ITER_PROBE) | 3393 | if (iter->flags & FTRACE_ITER_PROBE) |
| 3247 | return t_probe_next(m, pos); | 3394 | return t_probe_next(m, pos); |
| 3248 | 3395 | ||
| 3396 | if (iter->flags & FTRACE_ITER_MOD) | ||
| 3397 | return t_mod_next(m, pos); | ||
| 3398 | |||
| 3249 | if (iter->flags & FTRACE_ITER_PRINTALL) { | 3399 | if (iter->flags & FTRACE_ITER_PRINTALL) { |
| 3250 | /* next must increment pos, and t_probe_start does not */ | 3400 | /* next must increment pos, and t_probe_start does not */ |
| 3251 | (*pos)++; | 3401 | (*pos)++; |
| 3252 | return t_probe_start(m, &l); | 3402 | return t_mod_start(m, &l); |
| 3253 | } | 3403 | } |
| 3254 | 3404 | ||
| 3255 | ret = t_func_next(m, pos); | 3405 | ret = t_func_next(m, pos); |
| 3256 | 3406 | ||
| 3257 | if (!ret) | 3407 | if (!ret) |
| 3258 | return t_probe_start(m, &l); | 3408 | return t_mod_start(m, &l); |
| 3259 | 3409 | ||
| 3260 | return ret; | 3410 | return ret; |
| 3261 | } | 3411 | } |
| @@ -3264,7 +3414,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) | |||
| 3264 | { | 3414 | { |
| 3265 | iter->pos = 0; | 3415 | iter->pos = 0; |
| 3266 | iter->func_pos = 0; | 3416 | iter->func_pos = 0; |
| 3267 | iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE); | 3417 | iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE | FTRACE_ITER_MOD); |
| 3268 | } | 3418 | } |
| 3269 | 3419 | ||
| 3270 | static void *t_start(struct seq_file *m, loff_t *pos) | 3420 | static void *t_start(struct seq_file *m, loff_t *pos) |
| @@ -3293,15 +3443,15 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
| 3293 | ftrace_hash_empty(iter->hash)) { | 3443 | ftrace_hash_empty(iter->hash)) { |
| 3294 | iter->func_pos = 1; /* Account for the message */ | 3444 | iter->func_pos = 1; /* Account for the message */ |
| 3295 | if (*pos > 0) | 3445 | if (*pos > 0) |
| 3296 | return t_probe_start(m, pos); | 3446 | return t_mod_start(m, pos); |
| 3297 | iter->flags |= FTRACE_ITER_PRINTALL; | 3447 | iter->flags |= FTRACE_ITER_PRINTALL; |
| 3298 | /* reset in case of seek/pread */ | 3448 | /* reset in case of seek/pread */ |
| 3299 | iter->flags &= ~FTRACE_ITER_PROBE; | 3449 | iter->flags &= ~FTRACE_ITER_PROBE; |
| 3300 | return iter; | 3450 | return iter; |
| 3301 | } | 3451 | } |
| 3302 | 3452 | ||
| 3303 | if (iter->flags & FTRACE_ITER_PROBE) | 3453 | if (iter->flags & FTRACE_ITER_MOD) |
| 3304 | return t_probe_start(m, pos); | 3454 | return t_mod_start(m, pos); |
| 3305 | 3455 | ||
| 3306 | /* | 3456 | /* |
| 3307 | * Unfortunately, we need to restart at ftrace_pages_start | 3457 | * Unfortunately, we need to restart at ftrace_pages_start |
| @@ -3317,7 +3467,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
| 3317 | } | 3467 | } |
| 3318 | 3468 | ||
| 3319 | if (!p) | 3469 | if (!p) |
| 3320 | return t_probe_start(m, pos); | 3470 | return t_mod_start(m, pos); |
| 3321 | 3471 | ||
| 3322 | return iter; | 3472 | return iter; |
| 3323 | } | 3473 | } |
| @@ -3351,6 +3501,9 @@ static int t_show(struct seq_file *m, void *v) | |||
| 3351 | if (iter->flags & FTRACE_ITER_PROBE) | 3501 | if (iter->flags & FTRACE_ITER_PROBE) |
| 3352 | return t_probe_show(m, iter); | 3502 | return t_probe_show(m, iter); |
| 3353 | 3503 | ||
| 3504 | if (iter->flags & FTRACE_ITER_MOD) | ||
| 3505 | return t_mod_show(m, iter); | ||
| 3506 | |||
| 3354 | if (iter->flags & FTRACE_ITER_PRINTALL) { | 3507 | if (iter->flags & FTRACE_ITER_PRINTALL) { |
| 3355 | if (iter->flags & FTRACE_ITER_NOTRACE) | 3508 | if (iter->flags & FTRACE_ITER_NOTRACE) |
| 3356 | seq_puts(m, "#### no functions disabled ####\n"); | 3509 | seq_puts(m, "#### no functions disabled ####\n"); |
| @@ -3457,6 +3610,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, | |||
| 3457 | { | 3610 | { |
| 3458 | struct ftrace_iterator *iter; | 3611 | struct ftrace_iterator *iter; |
| 3459 | struct ftrace_hash *hash; | 3612 | struct ftrace_hash *hash; |
| 3613 | struct list_head *mod_head; | ||
| 3614 | struct trace_array *tr = ops->private; | ||
| 3460 | int ret = 0; | 3615 | int ret = 0; |
| 3461 | 3616 | ||
| 3462 | ftrace_ops_init(ops); | 3617 | ftrace_ops_init(ops); |
| @@ -3475,21 +3630,29 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, | |||
| 3475 | 3630 | ||
| 3476 | iter->ops = ops; | 3631 | iter->ops = ops; |
| 3477 | iter->flags = flag; | 3632 | iter->flags = flag; |
| 3633 | iter->tr = tr; | ||
| 3478 | 3634 | ||
| 3479 | mutex_lock(&ops->func_hash->regex_lock); | 3635 | mutex_lock(&ops->func_hash->regex_lock); |
| 3480 | 3636 | ||
| 3481 | if (flag & FTRACE_ITER_NOTRACE) | 3637 | if (flag & FTRACE_ITER_NOTRACE) { |
| 3482 | hash = ops->func_hash->notrace_hash; | 3638 | hash = ops->func_hash->notrace_hash; |
| 3483 | else | 3639 | mod_head = tr ? &tr->mod_notrace : NULL; |
| 3640 | } else { | ||
| 3484 | hash = ops->func_hash->filter_hash; | 3641 | hash = ops->func_hash->filter_hash; |
| 3642 | mod_head = tr ? &tr->mod_trace : NULL; | ||
| 3643 | } | ||
| 3644 | |||
| 3645 | iter->mod_list = mod_head; | ||
| 3485 | 3646 | ||
| 3486 | if (file->f_mode & FMODE_WRITE) { | 3647 | if (file->f_mode & FMODE_WRITE) { |
| 3487 | const int size_bits = FTRACE_HASH_DEFAULT_BITS; | 3648 | const int size_bits = FTRACE_HASH_DEFAULT_BITS; |
| 3488 | 3649 | ||
| 3489 | if (file->f_flags & O_TRUNC) | 3650 | if (file->f_flags & O_TRUNC) { |
| 3490 | iter->hash = alloc_ftrace_hash(size_bits); | 3651 | iter->hash = alloc_ftrace_hash(size_bits); |
| 3491 | else | 3652 | clear_ftrace_mod_list(mod_head); |
| 3653 | } else { | ||
| 3492 | iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); | 3654 | iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); |
| 3655 | } | ||
| 3493 | 3656 | ||
| 3494 | if (!iter->hash) { | 3657 | if (!iter->hash) { |
| 3495 | trace_parser_put(&iter->parser); | 3658 | trace_parser_put(&iter->parser); |
| @@ -3665,7 +3828,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) | |||
| 3665 | int exclude_mod = 0; | 3828 | int exclude_mod = 0; |
| 3666 | int found = 0; | 3829 | int found = 0; |
| 3667 | int ret; | 3830 | int ret; |
| 3668 | int clear_filter; | 3831 | int clear_filter = 0; |
| 3669 | 3832 | ||
| 3670 | if (func) { | 3833 | if (func) { |
| 3671 | func_g.type = filter_parse_regex(func, len, &func_g.search, | 3834 | func_g.type = filter_parse_regex(func, len, &func_g.search, |
| @@ -3761,6 +3924,165 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, | |||
| 3761 | return ret; | 3924 | return ret; |
| 3762 | } | 3925 | } |
| 3763 | 3926 | ||
| 3927 | static bool module_exists(const char *module) | ||
| 3928 | { | ||
| 3929 | /* All modules have the symbol __this_module */ | ||
| 3930 | const char this_mod[] = "__this_module"; | ||
| 3931 | const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1; | ||
| 3932 | char modname[modname_size + 1]; | ||
| 3933 | unsigned long val; | ||
| 3934 | int n; | ||
| 3935 | |||
| 3936 | n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod); | ||
| 3937 | |||
| 3938 | if (n > modname_size) | ||
| 3939 | return false; | ||
| 3940 | |||
| 3941 | val = module_kallsyms_lookup_name(modname); | ||
| 3942 | return val != 0; | ||
| 3943 | } | ||
| 3944 | |||
| 3945 | static int cache_mod(struct trace_array *tr, | ||
| 3946 | const char *func, char *module, int enable) | ||
| 3947 | { | ||
| 3948 | struct ftrace_mod_load *ftrace_mod, *n; | ||
| 3949 | struct list_head *head = enable ? &tr->mod_trace : &tr->mod_notrace; | ||
| 3950 | int ret; | ||
| 3951 | |||
| 3952 | mutex_lock(&ftrace_lock); | ||
| 3953 | |||
| 3954 | /* We do not cache inverse filters */ | ||
| 3955 | if (func[0] == '!') { | ||
| 3956 | func++; | ||
| 3957 | ret = -EINVAL; | ||
| 3958 | |||
| 3959 | /* Look to remove this hash */ | ||
| 3960 | list_for_each_entry_safe(ftrace_mod, n, head, list) { | ||
| 3961 | if (strcmp(ftrace_mod->module, module) != 0) | ||
| 3962 | continue; | ||
| 3963 | |||
| 3964 | /* no func matches all */ | ||
| 3965 | if (strcmp(func, "*") == 0 || | ||
| 3966 | (ftrace_mod->func && | ||
| 3967 | strcmp(ftrace_mod->func, func) == 0)) { | ||
| 3968 | ret = 0; | ||
| 3969 | free_ftrace_mod(ftrace_mod); | ||
| 3970 | continue; | ||
| 3971 | } | ||
| 3972 | } | ||
| 3973 | goto out; | ||
| 3974 | } | ||
| 3975 | |||
| 3976 | ret = -EINVAL; | ||
| 3977 | /* We only care about modules that have not been loaded yet */ | ||
| 3978 | if (module_exists(module)) | ||
| 3979 | goto out; | ||
| 3980 | |||
| 3981 | /* Save this string off, and execute it when the module is loaded */ | ||
| 3982 | ret = ftrace_add_mod(tr, func, module, enable); | ||
| 3983 | out: | ||
| 3984 | mutex_unlock(&ftrace_lock); | ||
| 3985 | |||
| 3986 | return ret; | ||
| 3987 | } | ||
| 3988 | |||
| 3989 | static int | ||
| 3990 | ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, | ||
| 3991 | int reset, int enable); | ||
| 3992 | |||
| 3993 | #ifdef CONFIG_MODULES | ||
| 3994 | static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, | ||
| 3995 | char *mod, bool enable) | ||
| 3996 | { | ||
| 3997 | struct ftrace_mod_load *ftrace_mod, *n; | ||
| 3998 | struct ftrace_hash **orig_hash, *new_hash; | ||
| 3999 | LIST_HEAD(process_mods); | ||
| 4000 | char *func; | ||
| 4001 | int ret; | ||
| 4002 | |||
| 4003 | mutex_lock(&ops->func_hash->regex_lock); | ||
| 4004 | |||
| 4005 | if (enable) | ||
| 4006 | orig_hash = &ops->func_hash->filter_hash; | ||
| 4007 | else | ||
| 4008 | orig_hash = &ops->func_hash->notrace_hash; | ||
| 4009 | |||
| 4010 | new_hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, | ||
| 4011 | *orig_hash); | ||
| 4012 | if (!new_hash) | ||
| 4013 | goto out; /* warn? */ | ||
| 4014 | |||
| 4015 | mutex_lock(&ftrace_lock); | ||
| 4016 | |||
| 4017 | list_for_each_entry_safe(ftrace_mod, n, head, list) { | ||
| 4018 | |||
| 4019 | if (strcmp(ftrace_mod->module, mod) != 0) | ||
| 4020 | continue; | ||
| 4021 | |||
| 4022 | if (ftrace_mod->func) | ||
| 4023 | func = kstrdup(ftrace_mod->func, GFP_KERNEL); | ||
| 4024 | else | ||
| 4025 | func = kstrdup("*", GFP_KERNEL); | ||
| 4026 | |||
| 4027 | if (!func) /* warn? */ | ||
| 4028 | continue; | ||
| 4029 | |||
| 4030 | list_del(&ftrace_mod->list); | ||
| 4031 | list_add(&ftrace_mod->list, &process_mods); | ||
| 4032 | |||
| 4033 | /* Use the newly allocated func, as it may be "*" */ | ||
| 4034 | kfree(ftrace_mod->func); | ||
| 4035 | ftrace_mod->func = func; | ||
| 4036 | } | ||
| 4037 | |||
| 4038 | mutex_unlock(&ftrace_lock); | ||
| 4039 | |||
| 4040 | list_for_each_entry_safe(ftrace_mod, n, &process_mods, list) { | ||
| 4041 | |||
| 4042 | func = ftrace_mod->func; | ||
| 4043 | |||
| 4044 | /* Grabs ftrace_lock, which is why we have this extra step */ | ||
| 4045 | match_records(new_hash, func, strlen(func), mod); | ||
| 4046 | free_ftrace_mod(ftrace_mod); | ||
| 4047 | } | ||
| 4048 | |||
| 4049 | if (enable && list_empty(head)) | ||
| 4050 | new_hash->flags &= ~FTRACE_HASH_FL_MOD; | ||
| 4051 | |||
| 4052 | mutex_lock(&ftrace_lock); | ||
| 4053 | |||
| 4054 | ret = ftrace_hash_move_and_update_ops(ops, orig_hash, | ||
| 4055 | new_hash, enable); | ||
| 4056 | mutex_unlock(&ftrace_lock); | ||
| 4057 | |||
| 4058 | out: | ||
| 4059 | mutex_unlock(&ops->func_hash->regex_lock); | ||
| 4060 | |||
| 4061 | free_ftrace_hash(new_hash); | ||
| 4062 | } | ||
| 4063 | |||
| 4064 | static void process_cached_mods(const char *mod_name) | ||
| 4065 | { | ||
| 4066 | struct trace_array *tr; | ||
| 4067 | char *mod; | ||
| 4068 | |||
| 4069 | mod = kstrdup(mod_name, GFP_KERNEL); | ||
| 4070 | if (!mod) | ||
| 4071 | return; | ||
| 4072 | |||
| 4073 | mutex_lock(&trace_types_lock); | ||
| 4074 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { | ||
| 4075 | if (!list_empty(&tr->mod_trace)) | ||
| 4076 | process_mod_list(&tr->mod_trace, tr->ops, mod, true); | ||
| 4077 | if (!list_empty(&tr->mod_notrace)) | ||
| 4078 | process_mod_list(&tr->mod_notrace, tr->ops, mod, false); | ||
| 4079 | } | ||
| 4080 | mutex_unlock(&trace_types_lock); | ||
| 4081 | |||
| 4082 | kfree(mod); | ||
| 4083 | } | ||
| 4084 | #endif | ||
| 4085 | |||
| 3764 | /* | 4086 | /* |
| 3765 | * We register the module command as a template to show others how | 4087 | * We register the module command as a template to show others how |
| 3766 | * to register the a command as well. | 4088 | * to register the a command as well. |
| @@ -3768,10 +4090,16 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, | |||
| 3768 | 4090 | ||
| 3769 | static int | 4091 | static int |
| 3770 | ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, | 4092 | ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, |
| 3771 | char *func, char *cmd, char *module, int enable) | 4093 | char *func_orig, char *cmd, char *module, int enable) |
| 3772 | { | 4094 | { |
| 4095 | char *func; | ||
| 3773 | int ret; | 4096 | int ret; |
| 3774 | 4097 | ||
| 4098 | /* match_records() modifies func, and we need the original */ | ||
| 4099 | func = kstrdup(func_orig, GFP_KERNEL); | ||
| 4100 | if (!func) | ||
| 4101 | return -ENOMEM; | ||
| 4102 | |||
| 3775 | /* | 4103 | /* |
| 3776 | * cmd == 'mod' because we only registered this func | 4104 | * cmd == 'mod' because we only registered this func |
| 3777 | * for the 'mod' ftrace_func_command. | 4105 | * for the 'mod' ftrace_func_command. |
| @@ -3780,8 +4108,10 @@ ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, | |||
| 3780 | * parameter. | 4108 | * parameter. |
| 3781 | */ | 4109 | */ |
| 3782 | ret = match_records(hash, func, strlen(func), module); | 4110 | ret = match_records(hash, func, strlen(func), module); |
| 4111 | kfree(func); | ||
| 4112 | |||
| 3783 | if (!ret) | 4113 | if (!ret) |
| 3784 | return -EINVAL; | 4114 | return cache_mod(tr, func_orig, module, enable); |
| 3785 | if (ret < 0) | 4115 | if (ret < 0) |
| 3786 | return ret; | 4116 | return ret; |
| 3787 | return 0; | 4117 | return 0; |
| @@ -4337,9 +4667,6 @@ static int ftrace_process_regex(struct ftrace_iterator *iter, | |||
| 4337 | 4667 | ||
| 4338 | command = strsep(&next, ":"); | 4668 | command = strsep(&next, ":"); |
| 4339 | 4669 | ||
| 4340 | if (WARN_ON_ONCE(!tr)) | ||
| 4341 | return -EINVAL; | ||
| 4342 | |||
| 4343 | mutex_lock(&ftrace_cmd_mutex); | 4670 | mutex_lock(&ftrace_cmd_mutex); |
| 4344 | list_for_each_entry(p, &ftrace_commands, list) { | 4671 | list_for_each_entry(p, &ftrace_commands, list) { |
| 4345 | if (strcmp(p->name, command) == 0) { | 4672 | if (strcmp(p->name, command) == 0) { |
| @@ -4728,9 +5055,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file) | |||
| 4728 | if (file->f_mode & FMODE_WRITE) { | 5055 | if (file->f_mode & FMODE_WRITE) { |
| 4729 | filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); | 5056 | filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); |
| 4730 | 5057 | ||
| 4731 | if (filter_hash) | 5058 | if (filter_hash) { |
| 4732 | orig_hash = &iter->ops->func_hash->filter_hash; | 5059 | orig_hash = &iter->ops->func_hash->filter_hash; |
| 4733 | else | 5060 | if (iter->tr && !list_empty(&iter->tr->mod_trace)) |
| 5061 | iter->hash->flags |= FTRACE_HASH_FL_MOD; | ||
| 5062 | } else | ||
| 4734 | orig_hash = &iter->ops->func_hash->notrace_hash; | 5063 | orig_hash = &iter->ops->func_hash->notrace_hash; |
| 4735 | 5064 | ||
| 4736 | mutex_lock(&ftrace_lock); | 5065 | mutex_lock(&ftrace_lock); |
| @@ -5063,7 +5392,7 @@ ftrace_graph_release(struct inode *inode, struct file *file) | |||
| 5063 | } | 5392 | } |
| 5064 | 5393 | ||
| 5065 | out: | 5394 | out: |
| 5066 | kfree(fgd->new_hash); | 5395 | free_ftrace_hash(fgd->new_hash); |
| 5067 | kfree(fgd); | 5396 | kfree(fgd); |
| 5068 | 5397 | ||
| 5069 | return ret; | 5398 | return ret; |
| @@ -5388,6 +5717,7 @@ void ftrace_release_mod(struct module *mod) | |||
| 5388 | if (pg == ftrace_pages) | 5717 | if (pg == ftrace_pages) |
| 5389 | ftrace_pages = next_to_ftrace_page(last_pg); | 5718 | ftrace_pages = next_to_ftrace_page(last_pg); |
| 5390 | 5719 | ||
| 5720 | ftrace_update_tot_cnt -= pg->index; | ||
| 5391 | *last_pg = pg->next; | 5721 | *last_pg = pg->next; |
| 5392 | order = get_count_order(pg->size / ENTRIES_PER_PAGE); | 5722 | order = get_count_order(pg->size / ENTRIES_PER_PAGE); |
| 5393 | free_pages((unsigned long)pg->records, order); | 5723 | free_pages((unsigned long)pg->records, order); |
| @@ -5466,6 +5796,8 @@ void ftrace_module_enable(struct module *mod) | |||
| 5466 | 5796 | ||
| 5467 | out_unlock: | 5797 | out_unlock: |
| 5468 | mutex_unlock(&ftrace_lock); | 5798 | mutex_unlock(&ftrace_lock); |
| 5799 | |||
| 5800 | process_cached_mods(mod->name); | ||
| 5469 | } | 5801 | } |
| 5470 | 5802 | ||
| 5471 | void ftrace_module_init(struct module *mod) | 5803 | void ftrace_module_init(struct module *mod) |
| @@ -5504,6 +5836,7 @@ void __init ftrace_free_init_mem(void) | |||
| 5504 | if (!rec) | 5836 | if (!rec) |
| 5505 | continue; | 5837 | continue; |
| 5506 | pg->index--; | 5838 | pg->index--; |
| 5839 | ftrace_update_tot_cnt--; | ||
| 5507 | if (!pg->index) { | 5840 | if (!pg->index) { |
| 5508 | *last_pg = pg->next; | 5841 | *last_pg = pg->next; |
| 5509 | order = get_count_order(pg->size / ENTRIES_PER_PAGE); | 5842 | order = get_count_order(pg->size / ENTRIES_PER_PAGE); |
| @@ -5570,6 +5903,8 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) | |||
| 5570 | void ftrace_init_trace_array(struct trace_array *tr) | 5903 | void ftrace_init_trace_array(struct trace_array *tr) |
| 5571 | { | 5904 | { |
| 5572 | INIT_LIST_HEAD(&tr->func_probes); | 5905 | INIT_LIST_HEAD(&tr->func_probes); |
| 5906 | INIT_LIST_HEAD(&tr->mod_trace); | ||
| 5907 | INIT_LIST_HEAD(&tr->mod_notrace); | ||
| 5573 | } | 5908 | } |
| 5574 | #else | 5909 | #else |
| 5575 | 5910 | ||
| @@ -6130,7 +6465,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, | |||
| 6130 | if (ftrace_enabled) { | 6465 | if (ftrace_enabled) { |
| 6131 | 6466 | ||
| 6132 | /* we are starting ftrace again */ | 6467 | /* we are starting ftrace again */ |
| 6133 | if (ftrace_ops_list != &ftrace_list_end) | 6468 | if (rcu_dereference_protected(ftrace_ops_list, |
| 6469 | lockdep_is_held(&ftrace_lock)) != &ftrace_list_end) | ||
| 6134 | update_ftrace_function(); | 6470 | update_ftrace_function(); |
| 6135 | 6471 | ||
| 6136 | ftrace_startup_sysctl(); | 6472 | ftrace_startup_sysctl(); |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 4ae268e687fe..529cc50d7243 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -1136,12 +1136,12 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) | |||
| 1136 | for (i = 0; i < nr_pages; i++) { | 1136 | for (i = 0; i < nr_pages; i++) { |
| 1137 | struct page *page; | 1137 | struct page *page; |
| 1138 | /* | 1138 | /* |
| 1139 | * __GFP_NORETRY flag makes sure that the allocation fails | 1139 | * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails |
| 1140 | * gracefully without invoking oom-killer and the system is | 1140 | * gracefully without invoking oom-killer and the system is not |
| 1141 | * not destabilized. | 1141 | * destabilized. |
| 1142 | */ | 1142 | */ |
| 1143 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 1143 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
| 1144 | GFP_KERNEL | __GFP_NORETRY, | 1144 | GFP_KERNEL | __GFP_RETRY_MAYFAIL, |
| 1145 | cpu_to_node(cpu)); | 1145 | cpu_to_node(cpu)); |
| 1146 | if (!bpage) | 1146 | if (!bpage) |
| 1147 | goto free_pages; | 1147 | goto free_pages; |
| @@ -1149,7 +1149,7 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) | |||
| 1149 | list_add(&bpage->list, pages); | 1149 | list_add(&bpage->list, pages); |
| 1150 | 1150 | ||
| 1151 | page = alloc_pages_node(cpu_to_node(cpu), | 1151 | page = alloc_pages_node(cpu_to_node(cpu), |
| 1152 | GFP_KERNEL | __GFP_NORETRY, 0); | 1152 | GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0); |
| 1153 | if (!page) | 1153 | if (!page) |
| 1154 | goto free_pages; | 1154 | goto free_pages; |
| 1155 | bpage->page = page_address(page); | 1155 | bpage->page = page_address(page); |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1122f151466f..42b9355033d4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -87,7 +87,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) | |||
| 87 | * tracing is active, only save the comm when a trace event | 87 | * tracing is active, only save the comm when a trace event |
| 88 | * occurred. | 88 | * occurred. |
| 89 | */ | 89 | */ |
| 90 | static DEFINE_PER_CPU(bool, trace_cmdline_save); | 90 | static DEFINE_PER_CPU(bool, trace_taskinfo_save); |
| 91 | 91 | ||
| 92 | /* | 92 | /* |
| 93 | * Kill all tracing for good (never come back). | 93 | * Kill all tracing for good (never come back). |
| @@ -120,41 +120,41 @@ enum ftrace_dump_mode ftrace_dump_on_oops; | |||
| 120 | /* When set, tracing will stop when a WARN*() is hit */ | 120 | /* When set, tracing will stop when a WARN*() is hit */ |
| 121 | int __disable_trace_on_warning; | 121 | int __disable_trace_on_warning; |
| 122 | 122 | ||
| 123 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE | 123 | #ifdef CONFIG_TRACE_EVAL_MAP_FILE |
| 124 | /* Map of enums to their values, for "enum_map" file */ | 124 | /* Map of enums to their values, for "eval_map" file */ |
| 125 | struct trace_enum_map_head { | 125 | struct trace_eval_map_head { |
| 126 | struct module *mod; | 126 | struct module *mod; |
| 127 | unsigned long length; | 127 | unsigned long length; |
| 128 | }; | 128 | }; |
| 129 | 129 | ||
| 130 | union trace_enum_map_item; | 130 | union trace_eval_map_item; |
| 131 | 131 | ||
| 132 | struct trace_enum_map_tail { | 132 | struct trace_eval_map_tail { |
| 133 | /* | 133 | /* |
| 134 | * "end" is first and points to NULL as it must be different | 134 | * "end" is first and points to NULL as it must be different |
| 135 | * than "mod" or "enum_string" | 135 | * than "mod" or "eval_string" |
| 136 | */ | 136 | */ |
| 137 | union trace_enum_map_item *next; | 137 | union trace_eval_map_item *next; |
| 138 | const char *end; /* points to NULL */ | 138 | const char *end; /* points to NULL */ |
| 139 | }; | 139 | }; |
| 140 | 140 | ||
| 141 | static DEFINE_MUTEX(trace_enum_mutex); | 141 | static DEFINE_MUTEX(trace_eval_mutex); |
| 142 | 142 | ||
| 143 | /* | 143 | /* |
| 144 | * The trace_enum_maps are saved in an array with two extra elements, | 144 | * The trace_eval_maps are saved in an array with two extra elements, |
| 145 | * one at the beginning, and one at the end. The beginning item contains | 145 | * one at the beginning, and one at the end. The beginning item contains |
| 146 | * the count of the saved maps (head.length), and the module they | 146 | * the count of the saved maps (head.length), and the module they |
| 147 | * belong to if not built in (head.mod). The ending item contains a | 147 | * belong to if not built in (head.mod). The ending item contains a |
| 148 | * pointer to the next array of saved enum_map items. | 148 | * pointer to the next array of saved eval_map items. |
| 149 | */ | 149 | */ |
| 150 | union trace_enum_map_item { | 150 | union trace_eval_map_item { |
| 151 | struct trace_enum_map map; | 151 | struct trace_eval_map map; |
| 152 | struct trace_enum_map_head head; | 152 | struct trace_eval_map_head head; |
| 153 | struct trace_enum_map_tail tail; | 153 | struct trace_eval_map_tail tail; |
| 154 | }; | 154 | }; |
| 155 | 155 | ||
| 156 | static union trace_enum_map_item *trace_enum_maps; | 156 | static union trace_eval_map_item *trace_eval_maps; |
| 157 | #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ | 157 | #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ |
| 158 | 158 | ||
| 159 | static int tracing_set_tracer(struct trace_array *tr, const char *buf); | 159 | static int tracing_set_tracer(struct trace_array *tr, const char *buf); |
| 160 | 160 | ||
| @@ -790,7 +790,7 @@ EXPORT_SYMBOL_GPL(tracing_on); | |||
| 790 | static __always_inline void | 790 | static __always_inline void |
| 791 | __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) | 791 | __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) |
| 792 | { | 792 | { |
| 793 | __this_cpu_write(trace_cmdline_save, true); | 793 | __this_cpu_write(trace_taskinfo_save, true); |
| 794 | 794 | ||
| 795 | /* If this is the temp buffer, we need to commit fully */ | 795 | /* If this is the temp buffer, we need to commit fully */ |
| 796 | if (this_cpu_read(trace_buffered_event) == event) { | 796 | if (this_cpu_read(trace_buffered_event) == event) { |
| @@ -1141,9 +1141,9 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) | |||
| 1141 | 1141 | ||
| 1142 | /* | 1142 | /* |
| 1143 | * TRACE_FLAGS is defined as a tuple matching bit masks with strings. | 1143 | * TRACE_FLAGS is defined as a tuple matching bit masks with strings. |
| 1144 | * It uses C(a, b) where 'a' is the enum name and 'b' is the string that | 1144 | * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that |
| 1145 | * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list | 1145 | * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list |
| 1146 | * of strings in the order that the enums were defined. | 1146 | * of strings in the order that the evals (enum) were defined. |
| 1147 | */ | 1147 | */ |
| 1148 | #undef C | 1148 | #undef C |
| 1149 | #define C(a, b) b | 1149 | #define C(a, b) b |
| @@ -1709,6 +1709,8 @@ void tracing_reset_all_online_cpus(void) | |||
| 1709 | } | 1709 | } |
| 1710 | } | 1710 | } |
| 1711 | 1711 | ||
| 1712 | static int *tgid_map; | ||
| 1713 | |||
| 1712 | #define SAVED_CMDLINES_DEFAULT 128 | 1714 | #define SAVED_CMDLINES_DEFAULT 128 |
| 1713 | #define NO_CMDLINE_MAP UINT_MAX | 1715 | #define NO_CMDLINE_MAP UINT_MAX |
| 1714 | static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; | 1716 | static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; |
| @@ -1722,7 +1724,7 @@ struct saved_cmdlines_buffer { | |||
| 1722 | static struct saved_cmdlines_buffer *savedcmd; | 1724 | static struct saved_cmdlines_buffer *savedcmd; |
| 1723 | 1725 | ||
| 1724 | /* temporary disable recording */ | 1726 | /* temporary disable recording */ |
| 1725 | static atomic_t trace_record_cmdline_disabled __read_mostly; | 1727 | static atomic_t trace_record_taskinfo_disabled __read_mostly; |
| 1726 | 1728 | ||
| 1727 | static inline char *get_saved_cmdlines(int idx) | 1729 | static inline char *get_saved_cmdlines(int idx) |
| 1728 | { | 1730 | { |
| @@ -1910,13 +1912,15 @@ static void tracing_stop_tr(struct trace_array *tr) | |||
| 1910 | raw_spin_unlock_irqrestore(&tr->start_lock, flags); | 1912 | raw_spin_unlock_irqrestore(&tr->start_lock, flags); |
| 1911 | } | 1913 | } |
| 1912 | 1914 | ||
| 1913 | void trace_stop_cmdline_recording(void); | ||
| 1914 | |||
| 1915 | static int trace_save_cmdline(struct task_struct *tsk) | 1915 | static int trace_save_cmdline(struct task_struct *tsk) |
| 1916 | { | 1916 | { |
| 1917 | unsigned pid, idx; | 1917 | unsigned pid, idx; |
| 1918 | 1918 | ||
| 1919 | if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) | 1919 | /* treat recording of idle task as a success */ |
| 1920 | if (!tsk->pid) | ||
| 1921 | return 1; | ||
| 1922 | |||
| 1923 | if (unlikely(tsk->pid > PID_MAX_DEFAULT)) | ||
| 1920 | return 0; | 1924 | return 0; |
| 1921 | 1925 | ||
| 1922 | /* | 1926 | /* |
| @@ -1992,16 +1996,107 @@ void trace_find_cmdline(int pid, char comm[]) | |||
| 1992 | preempt_enable(); | 1996 | preempt_enable(); |
| 1993 | } | 1997 | } |
| 1994 | 1998 | ||
| 1995 | void tracing_record_cmdline(struct task_struct *tsk) | 1999 | int trace_find_tgid(int pid) |
| 2000 | { | ||
| 2001 | if (unlikely(!tgid_map || !pid || pid > PID_MAX_DEFAULT)) | ||
| 2002 | return 0; | ||
| 2003 | |||
| 2004 | return tgid_map[pid]; | ||
| 2005 | } | ||
| 2006 | |||
| 2007 | static int trace_save_tgid(struct task_struct *tsk) | ||
| 2008 | { | ||
| 2009 | /* treat recording of idle task as a success */ | ||
| 2010 | if (!tsk->pid) | ||
| 2011 | return 1; | ||
| 2012 | |||
| 2013 | if (unlikely(!tgid_map || tsk->pid > PID_MAX_DEFAULT)) | ||
| 2014 | return 0; | ||
| 2015 | |||
| 2016 | tgid_map[tsk->pid] = tsk->tgid; | ||
| 2017 | return 1; | ||
| 2018 | } | ||
| 2019 | |||
| 2020 | static bool tracing_record_taskinfo_skip(int flags) | ||
| 2021 | { | ||
| 2022 | if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) | ||
| 2023 | return true; | ||
| 2024 | if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on()) | ||
| 2025 | return true; | ||
| 2026 | if (!__this_cpu_read(trace_taskinfo_save)) | ||
| 2027 | return true; | ||
| 2028 | return false; | ||
| 2029 | } | ||
| 2030 | |||
| 2031 | /** | ||
| 2032 | * tracing_record_taskinfo - record the task info of a task | ||
| 2033 | * | ||
| 2034 | * @task - task to record | ||
| 2035 | * @flags - TRACE_RECORD_CMDLINE for recording comm | ||
| 2036 | * - TRACE_RECORD_TGID for recording tgid | ||
| 2037 | */ | ||
| 2038 | void tracing_record_taskinfo(struct task_struct *task, int flags) | ||
| 1996 | { | 2039 | { |
| 1997 | if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) | 2040 | bool done; |
| 2041 | |||
| 2042 | if (tracing_record_taskinfo_skip(flags)) | ||
| 1998 | return; | 2043 | return; |
| 1999 | 2044 | ||
| 2000 | if (!__this_cpu_read(trace_cmdline_save)) | 2045 | /* |
| 2046 | * Record as much task information as possible. If some fail, continue | ||
| 2047 | * to try to record the others. | ||
| 2048 | */ | ||
| 2049 | done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); | ||
| 2050 | done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); | ||
| 2051 | |||
| 2052 | /* If recording any information failed, retry again soon. */ | ||
| 2053 | if (!done) | ||
| 2001 | return; | 2054 | return; |
| 2002 | 2055 | ||
| 2003 | if (trace_save_cmdline(tsk)) | 2056 | __this_cpu_write(trace_taskinfo_save, false); |
| 2004 | __this_cpu_write(trace_cmdline_save, false); | 2057 | } |
| 2058 | |||
| 2059 | /** | ||
| 2060 | * tracing_record_taskinfo_sched_switch - record task info for sched_switch | ||
| 2061 | * | ||
| 2062 | * @prev - previous task during sched_switch | ||
| 2063 | * @next - next task during sched_switch | ||
| 2064 | * @flags - TRACE_RECORD_CMDLINE for recording comm | ||
| 2065 | * TRACE_RECORD_TGID for recording tgid | ||
| 2066 | */ | ||
| 2067 | void tracing_record_taskinfo_sched_switch(struct task_struct *prev, | ||
| 2068 | struct task_struct *next, int flags) | ||
| 2069 | { | ||
| 2070 | bool done; | ||
| 2071 | |||
| 2072 | if (tracing_record_taskinfo_skip(flags)) | ||
| 2073 | return; | ||
| 2074 | |||
| 2075 | /* | ||
| 2076 | * Record as much task information as possible. If some fail, continue | ||
| 2077 | * to try to record the others. | ||
| 2078 | */ | ||
| 2079 | done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); | ||
| 2080 | done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); | ||
| 2081 | done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); | ||
| 2082 | done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); | ||
| 2083 | |||
| 2084 | /* If recording any information failed, retry again soon. */ | ||
| 2085 | if (!done) | ||
| 2086 | return; | ||
| 2087 | |||
| 2088 | __this_cpu_write(trace_taskinfo_save, false); | ||
| 2089 | } | ||
| 2090 | |||
| 2091 | /* Helpers to record a specific task information */ | ||
| 2092 | void tracing_record_cmdline(struct task_struct *task) | ||
| 2093 | { | ||
| 2094 | tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); | ||
| 2095 | } | ||
| 2096 | |||
| 2097 | void tracing_record_tgid(struct task_struct *task) | ||
| 2098 | { | ||
| 2099 | tracing_record_taskinfo(task, TRACE_RECORD_TGID); | ||
| 2005 | } | 2100 | } |
| 2006 | 2101 | ||
| 2007 | /* | 2102 | /* |
| @@ -3146,7 +3241,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
| 3146 | #endif | 3241 | #endif |
| 3147 | 3242 | ||
| 3148 | if (!iter->snapshot) | 3243 | if (!iter->snapshot) |
| 3149 | atomic_inc(&trace_record_cmdline_disabled); | 3244 | atomic_inc(&trace_record_taskinfo_disabled); |
| 3150 | 3245 | ||
| 3151 | if (*pos != iter->pos) { | 3246 | if (*pos != iter->pos) { |
| 3152 | iter->ent = NULL; | 3247 | iter->ent = NULL; |
| @@ -3191,7 +3286,7 @@ static void s_stop(struct seq_file *m, void *p) | |||
| 3191 | #endif | 3286 | #endif |
| 3192 | 3287 | ||
| 3193 | if (!iter->snapshot) | 3288 | if (!iter->snapshot) |
| 3194 | atomic_dec(&trace_record_cmdline_disabled); | 3289 | atomic_dec(&trace_record_taskinfo_disabled); |
| 3195 | 3290 | ||
| 3196 | trace_access_unlock(iter->cpu_file); | 3291 | trace_access_unlock(iter->cpu_file); |
| 3197 | trace_event_read_unlock(); | 3292 | trace_event_read_unlock(); |
| @@ -3248,23 +3343,38 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m) | |||
| 3248 | seq_puts(m, "#\n"); | 3343 | seq_puts(m, "#\n"); |
| 3249 | } | 3344 | } |
| 3250 | 3345 | ||
| 3251 | static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) | 3346 | static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m, |
| 3347 | unsigned int flags) | ||
| 3252 | { | 3348 | { |
| 3349 | bool tgid = flags & TRACE_ITER_RECORD_TGID; | ||
| 3350 | |||
| 3253 | print_event_info(buf, m); | 3351 | print_event_info(buf, m); |
| 3254 | seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n" | 3352 | |
| 3255 | "# | | | | |\n"); | 3353 | seq_printf(m, "# TASK-PID CPU# %s TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); |
| 3354 | seq_printf(m, "# | | | %s | |\n", tgid ? " | " : ""); | ||
| 3256 | } | 3355 | } |
| 3257 | 3356 | ||
| 3258 | static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) | 3357 | static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m, |
| 3358 | unsigned int flags) | ||
| 3259 | { | 3359 | { |
| 3260 | print_event_info(buf, m); | 3360 | bool tgid = flags & TRACE_ITER_RECORD_TGID; |
| 3261 | seq_puts(m, "# _-----=> irqs-off\n" | 3361 | const char tgid_space[] = " "; |
| 3262 | "# / _----=> need-resched\n" | 3362 | const char space[] = " "; |
| 3263 | "# | / _---=> hardirq/softirq\n" | 3363 | |
| 3264 | "# || / _--=> preempt-depth\n" | 3364 | seq_printf(m, "# %s _-----=> irqs-off\n", |
| 3265 | "# ||| / delay\n" | 3365 | tgid ? tgid_space : space); |
| 3266 | "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n" | 3366 | seq_printf(m, "# %s / _----=> need-resched\n", |
| 3267 | "# | | | |||| | |\n"); | 3367 | tgid ? tgid_space : space); |
| 3368 | seq_printf(m, "# %s| / _---=> hardirq/softirq\n", | ||
| 3369 | tgid ? tgid_space : space); | ||
| 3370 | seq_printf(m, "# %s|| / _--=> preempt-depth\n", | ||
| 3371 | tgid ? tgid_space : space); | ||
| 3372 | seq_printf(m, "# %s||| / delay\n", | ||
| 3373 | tgid ? tgid_space : space); | ||
| 3374 | seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", | ||
| 3375 | tgid ? " TGID " : space); | ||
| 3376 | seq_printf(m, "# | | | %s|||| | |\n", | ||
| 3377 | tgid ? " | " : space); | ||
| 3268 | } | 3378 | } |
| 3269 | 3379 | ||
| 3270 | void | 3380 | void |
| @@ -3580,9 +3690,11 @@ void trace_default_header(struct seq_file *m) | |||
| 3580 | } else { | 3690 | } else { |
| 3581 | if (!(trace_flags & TRACE_ITER_VERBOSE)) { | 3691 | if (!(trace_flags & TRACE_ITER_VERBOSE)) { |
| 3582 | if (trace_flags & TRACE_ITER_IRQ_INFO) | 3692 | if (trace_flags & TRACE_ITER_IRQ_INFO) |
| 3583 | print_func_help_header_irq(iter->trace_buffer, m); | 3693 | print_func_help_header_irq(iter->trace_buffer, |
| 3694 | m, trace_flags); | ||
| 3584 | else | 3695 | else |
| 3585 | print_func_help_header(iter->trace_buffer, m); | 3696 | print_func_help_header(iter->trace_buffer, m, |
| 3697 | trace_flags); | ||
| 3586 | } | 3698 | } |
| 3587 | } | 3699 | } |
| 3588 | } | 3700 | } |
| @@ -4238,6 +4350,18 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) | |||
| 4238 | if (mask == TRACE_ITER_RECORD_CMD) | 4350 | if (mask == TRACE_ITER_RECORD_CMD) |
| 4239 | trace_event_enable_cmd_record(enabled); | 4351 | trace_event_enable_cmd_record(enabled); |
| 4240 | 4352 | ||
| 4353 | if (mask == TRACE_ITER_RECORD_TGID) { | ||
| 4354 | if (!tgid_map) | ||
| 4355 | tgid_map = kzalloc((PID_MAX_DEFAULT + 1) * sizeof(*tgid_map), | ||
| 4356 | GFP_KERNEL); | ||
| 4357 | if (!tgid_map) { | ||
| 4358 | tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; | ||
| 4359 | return -ENOMEM; | ||
| 4360 | } | ||
| 4361 | |||
| 4362 | trace_event_enable_tgid_record(enabled); | ||
| 4363 | } | ||
| 4364 | |||
| 4241 | if (mask == TRACE_ITER_EVENT_FORK) | 4365 | if (mask == TRACE_ITER_EVENT_FORK) |
| 4242 | trace_event_follow_fork(tr, enabled); | 4366 | trace_event_follow_fork(tr, enabled); |
| 4243 | 4367 | ||
| @@ -4473,7 +4597,8 @@ static const char readme_msg[] = | |||
| 4473 | #endif | 4597 | #endif |
| 4474 | #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) | 4598 | #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) |
| 4475 | "\t accepts: event-definitions (one definition per line)\n" | 4599 | "\t accepts: event-definitions (one definition per line)\n" |
| 4476 | "\t Format: p|r[:[<group>/]<event>] <place> [<args>]\n" | 4600 | "\t Format: p[:[<group>/]<event>] <place> [<args>]\n" |
| 4601 | "\t r[maxactive][:[<group>/]<event>] <place> [<args>]\n" | ||
| 4477 | "\t -:[<group>/]<event>\n" | 4602 | "\t -:[<group>/]<event>\n" |
| 4478 | #ifdef CONFIG_KPROBE_EVENTS | 4603 | #ifdef CONFIG_KPROBE_EVENTS |
| 4479 | "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" | 4604 | "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" |
| @@ -4597,6 +4722,76 @@ static const struct file_operations tracing_readme_fops = { | |||
| 4597 | .llseek = generic_file_llseek, | 4722 | .llseek = generic_file_llseek, |
| 4598 | }; | 4723 | }; |
| 4599 | 4724 | ||
| 4725 | static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) | ||
| 4726 | { | ||
| 4727 | int *ptr = v; | ||
| 4728 | |||
| 4729 | if (*pos || m->count) | ||
| 4730 | ptr++; | ||
| 4731 | |||
| 4732 | (*pos)++; | ||
| 4733 | |||
| 4734 | for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) { | ||
| 4735 | if (trace_find_tgid(*ptr)) | ||
| 4736 | return ptr; | ||
| 4737 | } | ||
| 4738 | |||
| 4739 | return NULL; | ||
| 4740 | } | ||
| 4741 | |||
| 4742 | static void *saved_tgids_start(struct seq_file *m, loff_t *pos) | ||
| 4743 | { | ||
| 4744 | void *v; | ||
| 4745 | loff_t l = 0; | ||
| 4746 | |||
| 4747 | if (!tgid_map) | ||
| 4748 | return NULL; | ||
| 4749 | |||
| 4750 | v = &tgid_map[0]; | ||
| 4751 | while (l <= *pos) { | ||
| 4752 | v = saved_tgids_next(m, v, &l); | ||
| 4753 | if (!v) | ||
| 4754 | return NULL; | ||
| 4755 | } | ||
| 4756 | |||
| 4757 | return v; | ||
| 4758 | } | ||
| 4759 | |||
| 4760 | static void saved_tgids_stop(struct seq_file *m, void *v) | ||
| 4761 | { | ||
| 4762 | } | ||
| 4763 | |||
| 4764 | static int saved_tgids_show(struct seq_file *m, void *v) | ||
| 4765 | { | ||
| 4766 | int pid = (int *)v - tgid_map; | ||
| 4767 | |||
| 4768 | seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid)); | ||
| 4769 | return 0; | ||
| 4770 | } | ||
| 4771 | |||
| 4772 | static const struct seq_operations tracing_saved_tgids_seq_ops = { | ||
| 4773 | .start = saved_tgids_start, | ||
| 4774 | .stop = saved_tgids_stop, | ||
| 4775 | .next = saved_tgids_next, | ||
| 4776 | .show = saved_tgids_show, | ||
| 4777 | }; | ||
| 4778 | |||
| 4779 | static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) | ||
| 4780 | { | ||
| 4781 | if (tracing_disabled) | ||
| 4782 | return -ENODEV; | ||
| 4783 | |||
| 4784 | return seq_open(filp, &tracing_saved_tgids_seq_ops); | ||
| 4785 | } | ||
| 4786 | |||
| 4787 | |||
| 4788 | static const struct file_operations tracing_saved_tgids_fops = { | ||
| 4789 | .open = tracing_saved_tgids_open, | ||
| 4790 | .read = seq_read, | ||
| 4791 | .llseek = seq_lseek, | ||
| 4792 | .release = seq_release, | ||
| 4793 | }; | ||
| 4794 | |||
| 4600 | static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) | 4795 | static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) |
| 4601 | { | 4796 | { |
| 4602 | unsigned int *ptr = v; | 4797 | unsigned int *ptr = v; |
| @@ -4746,11 +4941,11 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { | |||
| 4746 | .write = tracing_saved_cmdlines_size_write, | 4941 | .write = tracing_saved_cmdlines_size_write, |
| 4747 | }; | 4942 | }; |
| 4748 | 4943 | ||
| 4749 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE | 4944 | #ifdef CONFIG_TRACE_EVAL_MAP_FILE |
| 4750 | static union trace_enum_map_item * | 4945 | static union trace_eval_map_item * |
| 4751 | update_enum_map(union trace_enum_map_item *ptr) | 4946 | update_eval_map(union trace_eval_map_item *ptr) |
| 4752 | { | 4947 | { |
| 4753 | if (!ptr->map.enum_string) { | 4948 | if (!ptr->map.eval_string) { |
| 4754 | if (ptr->tail.next) { | 4949 | if (ptr->tail.next) { |
| 4755 | ptr = ptr->tail.next; | 4950 | ptr = ptr->tail.next; |
| 4756 | /* Set ptr to the next real item (skip head) */ | 4951 | /* Set ptr to the next real item (skip head) */ |
| @@ -4761,15 +4956,15 @@ update_enum_map(union trace_enum_map_item *ptr) | |||
| 4761 | return ptr; | 4956 | return ptr; |
| 4762 | } | 4957 | } |
| 4763 | 4958 | ||
| 4764 | static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) | 4959 | static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos) |
| 4765 | { | 4960 | { |
| 4766 | union trace_enum_map_item *ptr = v; | 4961 | union trace_eval_map_item *ptr = v; |
| 4767 | 4962 | ||
| 4768 | /* | 4963 | /* |
| 4769 | * Paranoid! If ptr points to end, we don't want to increment past it. | 4964 | * Paranoid! If ptr points to end, we don't want to increment past it. |
| 4770 | * This really should never happen. | 4965 | * This really should never happen. |
| 4771 | */ | 4966 | */ |
| 4772 | ptr = update_enum_map(ptr); | 4967 | ptr = update_eval_map(ptr); |
| 4773 | if (WARN_ON_ONCE(!ptr)) | 4968 | if (WARN_ON_ONCE(!ptr)) |
| 4774 | return NULL; | 4969 | return NULL; |
| 4775 | 4970 | ||
| @@ -4777,104 +4972,104 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) | |||
| 4777 | 4972 | ||
| 4778 | (*pos)++; | 4973 | (*pos)++; |
| 4779 | 4974 | ||
| 4780 | ptr = update_enum_map(ptr); | 4975 | ptr = update_eval_map(ptr); |
| 4781 | 4976 | ||
| 4782 | return ptr; | 4977 | return ptr; |
| 4783 | } | 4978 | } |
| 4784 | 4979 | ||
| 4785 | static void *enum_map_start(struct seq_file *m, loff_t *pos) | 4980 | static void *eval_map_start(struct seq_file *m, loff_t *pos) |
| 4786 | { | 4981 | { |
| 4787 | union trace_enum_map_item *v; | 4982 | union trace_eval_map_item *v; |
| 4788 | loff_t l = 0; | 4983 | loff_t l = 0; |
| 4789 | 4984 | ||
| 4790 | mutex_lock(&trace_enum_mutex); | 4985 | mutex_lock(&trace_eval_mutex); |
| 4791 | 4986 | ||
| 4792 | v = trace_enum_maps; | 4987 | v = trace_eval_maps; |
| 4793 | if (v) | 4988 | if (v) |
| 4794 | v++; | 4989 | v++; |
| 4795 | 4990 | ||
| 4796 | while (v && l < *pos) { | 4991 | while (v && l < *pos) { |
| 4797 | v = enum_map_next(m, v, &l); | 4992 | v = eval_map_next(m, v, &l); |
| 4798 | } | 4993 | } |
| 4799 | 4994 | ||
| 4800 | return v; | 4995 | return v; |
| 4801 | } | 4996 | } |
| 4802 | 4997 | ||
| 4803 | static void enum_map_stop(struct seq_file *m, void *v) | 4998 | static void eval_map_stop(struct seq_file *m, void *v) |
| 4804 | { | 4999 | { |
| 4805 | mutex_unlock(&trace_enum_mutex); | 5000 | mutex_unlock(&trace_eval_mutex); |
| 4806 | } | 5001 | } |
| 4807 | 5002 | ||
| 4808 | static int enum_map_show(struct seq_file *m, void *v) | 5003 | static int eval_map_show(struct seq_file *m, void *v) |
| 4809 | { | 5004 | { |
| 4810 | union trace_enum_map_item *ptr = v; | 5005 | union trace_eval_map_item *ptr = v; |
| 4811 | 5006 | ||
| 4812 | seq_printf(m, "%s %ld (%s)\n", | 5007 | seq_printf(m, "%s %ld (%s)\n", |
| 4813 | ptr->map.enum_string, ptr->map.enum_value, | 5008 | ptr->map.eval_string, ptr->map.eval_value, |
| 4814 | ptr->map.system); | 5009 | ptr->map.system); |
| 4815 | 5010 | ||
| 4816 | return 0; | 5011 | return 0; |
| 4817 | } | 5012 | } |
| 4818 | 5013 | ||
| 4819 | static const struct seq_operations tracing_enum_map_seq_ops = { | 5014 | static const struct seq_operations tracing_eval_map_seq_ops = { |
| 4820 | .start = enum_map_start, | 5015 | .start = eval_map_start, |
| 4821 | .next = enum_map_next, | 5016 | .next = eval_map_next, |
| 4822 | .stop = enum_map_stop, | 5017 | .stop = eval_map_stop, |
| 4823 | .show = enum_map_show, | 5018 | .show = eval_map_show, |
| 4824 | }; | 5019 | }; |
| 4825 | 5020 | ||
| 4826 | static int tracing_enum_map_open(struct inode *inode, struct file *filp) | 5021 | static int tracing_eval_map_open(struct inode *inode, struct file *filp) |
| 4827 | { | 5022 | { |
| 4828 | if (tracing_disabled) | 5023 | if (tracing_disabled) |
| 4829 | return -ENODEV; | 5024 | return -ENODEV; |
| 4830 | 5025 | ||
| 4831 | return seq_open(filp, &tracing_enum_map_seq_ops); | 5026 | return seq_open(filp, &tracing_eval_map_seq_ops); |
| 4832 | } | 5027 | } |
| 4833 | 5028 | ||
| 4834 | static const struct file_operations tracing_enum_map_fops = { | 5029 | static const struct file_operations tracing_eval_map_fops = { |
| 4835 | .open = tracing_enum_map_open, | 5030 | .open = tracing_eval_map_open, |
| 4836 | .read = seq_read, | 5031 | .read = seq_read, |
| 4837 | .llseek = seq_lseek, | 5032 | .llseek = seq_lseek, |
| 4838 | .release = seq_release, | 5033 | .release = seq_release, |
| 4839 | }; | 5034 | }; |
| 4840 | 5035 | ||
| 4841 | static inline union trace_enum_map_item * | 5036 | static inline union trace_eval_map_item * |
| 4842 | trace_enum_jmp_to_tail(union trace_enum_map_item *ptr) | 5037 | trace_eval_jmp_to_tail(union trace_eval_map_item *ptr) |
| 4843 | { | 5038 | { |
| 4844 | /* Return tail of array given the head */ | 5039 | /* Return tail of array given the head */ |
| 4845 | return ptr + ptr->head.length + 1; | 5040 | return ptr + ptr->head.length + 1; |
| 4846 | } | 5041 | } |
| 4847 | 5042 | ||
| 4848 | static void | 5043 | static void |
| 4849 | trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, | 5044 | trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, |
| 4850 | int len) | 5045 | int len) |
| 4851 | { | 5046 | { |
| 4852 | struct trace_enum_map **stop; | 5047 | struct trace_eval_map **stop; |
| 4853 | struct trace_enum_map **map; | 5048 | struct trace_eval_map **map; |
| 4854 | union trace_enum_map_item *map_array; | 5049 | union trace_eval_map_item *map_array; |
| 4855 | union trace_enum_map_item *ptr; | 5050 | union trace_eval_map_item *ptr; |
| 4856 | 5051 | ||
| 4857 | stop = start + len; | 5052 | stop = start + len; |
| 4858 | 5053 | ||
| 4859 | /* | 5054 | /* |
| 4860 | * The trace_enum_maps contains the map plus a head and tail item, | 5055 | * The trace_eval_maps contains the map plus a head and tail item, |
| 4861 | * where the head holds the module and length of array, and the | 5056 | * where the head holds the module and length of array, and the |
| 4862 | * tail holds a pointer to the next list. | 5057 | * tail holds a pointer to the next list. |
| 4863 | */ | 5058 | */ |
| 4864 | map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); | 5059 | map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); |
| 4865 | if (!map_array) { | 5060 | if (!map_array) { |
| 4866 | pr_warn("Unable to allocate trace enum mapping\n"); | 5061 | pr_warn("Unable to allocate trace eval mapping\n"); |
| 4867 | return; | 5062 | return; |
| 4868 | } | 5063 | } |
| 4869 | 5064 | ||
| 4870 | mutex_lock(&trace_enum_mutex); | 5065 | mutex_lock(&trace_eval_mutex); |
| 4871 | 5066 | ||
| 4872 | if (!trace_enum_maps) | 5067 | if (!trace_eval_maps) |
| 4873 | trace_enum_maps = map_array; | 5068 | trace_eval_maps = map_array; |
| 4874 | else { | 5069 | else { |
| 4875 | ptr = trace_enum_maps; | 5070 | ptr = trace_eval_maps; |
| 4876 | for (;;) { | 5071 | for (;;) { |
| 4877 | ptr = trace_enum_jmp_to_tail(ptr); | 5072 | ptr = trace_eval_jmp_to_tail(ptr); |
| 4878 | if (!ptr->tail.next) | 5073 | if (!ptr->tail.next) |
| 4879 | break; | 5074 | break; |
| 4880 | ptr = ptr->tail.next; | 5075 | ptr = ptr->tail.next; |
| @@ -4892,34 +5087,34 @@ trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, | |||
| 4892 | } | 5087 | } |
| 4893 | memset(map_array, 0, sizeof(*map_array)); | 5088 | memset(map_array, 0, sizeof(*map_array)); |
| 4894 | 5089 | ||
| 4895 | mutex_unlock(&trace_enum_mutex); | 5090 | mutex_unlock(&trace_eval_mutex); |
| 4896 | } | 5091 | } |
| 4897 | 5092 | ||
| 4898 | static void trace_create_enum_file(struct dentry *d_tracer) | 5093 | static void trace_create_eval_file(struct dentry *d_tracer) |
| 4899 | { | 5094 | { |
| 4900 | trace_create_file("enum_map", 0444, d_tracer, | 5095 | trace_create_file("eval_map", 0444, d_tracer, |
| 4901 | NULL, &tracing_enum_map_fops); | 5096 | NULL, &tracing_eval_map_fops); |
| 4902 | } | 5097 | } |
| 4903 | 5098 | ||
| 4904 | #else /* CONFIG_TRACE_ENUM_MAP_FILE */ | 5099 | #else /* CONFIG_TRACE_EVAL_MAP_FILE */ |
| 4905 | static inline void trace_create_enum_file(struct dentry *d_tracer) { } | 5100 | static inline void trace_create_eval_file(struct dentry *d_tracer) { } |
| 4906 | static inline void trace_insert_enum_map_file(struct module *mod, | 5101 | static inline void trace_insert_eval_map_file(struct module *mod, |
| 4907 | struct trace_enum_map **start, int len) { } | 5102 | struct trace_eval_map **start, int len) { } |
| 4908 | #endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ | 5103 | #endif /* !CONFIG_TRACE_EVAL_MAP_FILE */ |
| 4909 | 5104 | ||
| 4910 | static void trace_insert_enum_map(struct module *mod, | 5105 | static void trace_insert_eval_map(struct module *mod, |
| 4911 | struct trace_enum_map **start, int len) | 5106 | struct trace_eval_map **start, int len) |
| 4912 | { | 5107 | { |
| 4913 | struct trace_enum_map **map; | 5108 | struct trace_eval_map **map; |
| 4914 | 5109 | ||
| 4915 | if (len <= 0) | 5110 | if (len <= 0) |
| 4916 | return; | 5111 | return; |
| 4917 | 5112 | ||
| 4918 | map = start; | 5113 | map = start; |
| 4919 | 5114 | ||
| 4920 | trace_event_enum_update(map, len); | 5115 | trace_event_eval_update(map, len); |
| 4921 | 5116 | ||
| 4922 | trace_insert_enum_map_file(mod, start, len); | 5117 | trace_insert_eval_map_file(mod, start, len); |
| 4923 | } | 5118 | } |
| 4924 | 5119 | ||
| 4925 | static ssize_t | 5120 | static ssize_t |
| @@ -6739,33 +6934,18 @@ static const struct file_operations tracing_stats_fops = { | |||
| 6739 | 6934 | ||
| 6740 | #ifdef CONFIG_DYNAMIC_FTRACE | 6935 | #ifdef CONFIG_DYNAMIC_FTRACE |
| 6741 | 6936 | ||
| 6742 | int __weak ftrace_arch_read_dyn_info(char *buf, int size) | ||
| 6743 | { | ||
| 6744 | return 0; | ||
| 6745 | } | ||
| 6746 | |||
| 6747 | static ssize_t | 6937 | static ssize_t |
| 6748 | tracing_read_dyn_info(struct file *filp, char __user *ubuf, | 6938 | tracing_read_dyn_info(struct file *filp, char __user *ubuf, |
| 6749 | size_t cnt, loff_t *ppos) | 6939 | size_t cnt, loff_t *ppos) |
| 6750 | { | 6940 | { |
| 6751 | static char ftrace_dyn_info_buffer[1024]; | ||
| 6752 | static DEFINE_MUTEX(dyn_info_mutex); | ||
| 6753 | unsigned long *p = filp->private_data; | 6941 | unsigned long *p = filp->private_data; |
| 6754 | char *buf = ftrace_dyn_info_buffer; | 6942 | char buf[64]; /* Not too big for a shallow stack */ |
| 6755 | int size = ARRAY_SIZE(ftrace_dyn_info_buffer); | ||
| 6756 | int r; | 6943 | int r; |
| 6757 | 6944 | ||
| 6758 | mutex_lock(&dyn_info_mutex); | 6945 | r = scnprintf(buf, 63, "%ld", *p); |
| 6759 | r = sprintf(buf, "%ld ", *p); | ||
| 6760 | |||
| 6761 | r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r); | ||
| 6762 | buf[r++] = '\n'; | 6946 | buf[r++] = '\n'; |
| 6763 | 6947 | ||
| 6764 | r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | 6948 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); |
| 6765 | |||
| 6766 | mutex_unlock(&dyn_info_mutex); | ||
| 6767 | |||
| 6768 | return r; | ||
| 6769 | } | 6949 | } |
| 6770 | 6950 | ||
| 6771 | static const struct file_operations tracing_dyn_info_fops = { | 6951 | static const struct file_operations tracing_dyn_info_fops = { |
| @@ -6881,6 +7061,9 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, | |||
| 6881 | char *number; | 7061 | char *number; |
| 6882 | int ret; | 7062 | int ret; |
| 6883 | 7063 | ||
| 7064 | if (!tr) | ||
| 7065 | return -ENODEV; | ||
| 7066 | |||
| 6884 | /* hash funcs only work with set_ftrace_filter */ | 7067 | /* hash funcs only work with set_ftrace_filter */ |
| 6885 | if (!enable) | 7068 | if (!enable) |
| 6886 | return -EINVAL; | 7069 | return -EINVAL; |
| @@ -7591,6 +7774,7 @@ static int instance_rmdir(const char *name) | |||
| 7591 | } | 7774 | } |
| 7592 | kfree(tr->topts); | 7775 | kfree(tr->topts); |
| 7593 | 7776 | ||
| 7777 | free_cpumask_var(tr->tracing_cpumask); | ||
| 7594 | kfree(tr->name); | 7778 | kfree(tr->name); |
| 7595 | kfree(tr); | 7779 | kfree(tr); |
| 7596 | 7780 | ||
| @@ -7734,21 +7918,21 @@ struct dentry *tracing_init_dentry(void) | |||
| 7734 | return NULL; | 7918 | return NULL; |
| 7735 | } | 7919 | } |
| 7736 | 7920 | ||
| 7737 | extern struct trace_enum_map *__start_ftrace_enum_maps[]; | 7921 | extern struct trace_eval_map *__start_ftrace_eval_maps[]; |
| 7738 | extern struct trace_enum_map *__stop_ftrace_enum_maps[]; | 7922 | extern struct trace_eval_map *__stop_ftrace_eval_maps[]; |
| 7739 | 7923 | ||
| 7740 | static void __init trace_enum_init(void) | 7924 | static void __init trace_eval_init(void) |
| 7741 | { | 7925 | { |
| 7742 | int len; | 7926 | int len; |
| 7743 | 7927 | ||
| 7744 | len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps; | 7928 | len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; |
| 7745 | trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len); | 7929 | trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len); |
| 7746 | } | 7930 | } |
| 7747 | 7931 | ||
| 7748 | #ifdef CONFIG_MODULES | 7932 | #ifdef CONFIG_MODULES |
| 7749 | static void trace_module_add_enums(struct module *mod) | 7933 | static void trace_module_add_evals(struct module *mod) |
| 7750 | { | 7934 | { |
| 7751 | if (!mod->num_trace_enums) | 7935 | if (!mod->num_trace_evals) |
| 7752 | return; | 7936 | return; |
| 7753 | 7937 | ||
| 7754 | /* | 7938 | /* |
| @@ -7758,40 +7942,40 @@ static void trace_module_add_enums(struct module *mod) | |||
| 7758 | if (trace_module_has_bad_taint(mod)) | 7942 | if (trace_module_has_bad_taint(mod)) |
| 7759 | return; | 7943 | return; |
| 7760 | 7944 | ||
| 7761 | trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums); | 7945 | trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals); |
| 7762 | } | 7946 | } |
| 7763 | 7947 | ||
| 7764 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE | 7948 | #ifdef CONFIG_TRACE_EVAL_MAP_FILE |
| 7765 | static void trace_module_remove_enums(struct module *mod) | 7949 | static void trace_module_remove_evals(struct module *mod) |
| 7766 | { | 7950 | { |
| 7767 | union trace_enum_map_item *map; | 7951 | union trace_eval_map_item *map; |
| 7768 | union trace_enum_map_item **last = &trace_enum_maps; | 7952 | union trace_eval_map_item **last = &trace_eval_maps; |
| 7769 | 7953 | ||
| 7770 | if (!mod->num_trace_enums) | 7954 | if (!mod->num_trace_evals) |
| 7771 | return; | 7955 | return; |
| 7772 | 7956 | ||
| 7773 | mutex_lock(&trace_enum_mutex); | 7957 | mutex_lock(&trace_eval_mutex); |
| 7774 | 7958 | ||
| 7775 | map = trace_enum_maps; | 7959 | map = trace_eval_maps; |
| 7776 | 7960 | ||
| 7777 | while (map) { | 7961 | while (map) { |
| 7778 | if (map->head.mod == mod) | 7962 | if (map->head.mod == mod) |
| 7779 | break; | 7963 | break; |
| 7780 | map = trace_enum_jmp_to_tail(map); | 7964 | map = trace_eval_jmp_to_tail(map); |
| 7781 | last = &map->tail.next; | 7965 | last = &map->tail.next; |
| 7782 | map = map->tail.next; | 7966 | map = map->tail.next; |
| 7783 | } | 7967 | } |
| 7784 | if (!map) | 7968 | if (!map) |
| 7785 | goto out; | 7969 | goto out; |
| 7786 | 7970 | ||
| 7787 | *last = trace_enum_jmp_to_tail(map)->tail.next; | 7971 | *last = trace_eval_jmp_to_tail(map)->tail.next; |
| 7788 | kfree(map); | 7972 | kfree(map); |
| 7789 | out: | 7973 | out: |
| 7790 | mutex_unlock(&trace_enum_mutex); | 7974 | mutex_unlock(&trace_eval_mutex); |
| 7791 | } | 7975 | } |
| 7792 | #else | 7976 | #else |
| 7793 | static inline void trace_module_remove_enums(struct module *mod) { } | 7977 | static inline void trace_module_remove_evals(struct module *mod) { } |
| 7794 | #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ | 7978 | #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ |
| 7795 | 7979 | ||
| 7796 | static int trace_module_notify(struct notifier_block *self, | 7980 | static int trace_module_notify(struct notifier_block *self, |
| 7797 | unsigned long val, void *data) | 7981 | unsigned long val, void *data) |
| @@ -7800,10 +7984,10 @@ static int trace_module_notify(struct notifier_block *self, | |||
| 7800 | 7984 | ||
| 7801 | switch (val) { | 7985 | switch (val) { |
| 7802 | case MODULE_STATE_COMING: | 7986 | case MODULE_STATE_COMING: |
| 7803 | trace_module_add_enums(mod); | 7987 | trace_module_add_evals(mod); |
| 7804 | break; | 7988 | break; |
| 7805 | case MODULE_STATE_GOING: | 7989 | case MODULE_STATE_GOING: |
| 7806 | trace_module_remove_enums(mod); | 7990 | trace_module_remove_evals(mod); |
| 7807 | break; | 7991 | break; |
| 7808 | } | 7992 | } |
| 7809 | 7993 | ||
| @@ -7841,9 +8025,12 @@ static __init int tracer_init_tracefs(void) | |||
| 7841 | trace_create_file("saved_cmdlines_size", 0644, d_tracer, | 8025 | trace_create_file("saved_cmdlines_size", 0644, d_tracer, |
| 7842 | NULL, &tracing_saved_cmdlines_size_fops); | 8026 | NULL, &tracing_saved_cmdlines_size_fops); |
| 7843 | 8027 | ||
| 7844 | trace_enum_init(); | 8028 | trace_create_file("saved_tgids", 0444, d_tracer, |
| 8029 | NULL, &tracing_saved_tgids_fops); | ||
| 8030 | |||
| 8031 | trace_eval_init(); | ||
| 7845 | 8032 | ||
| 7846 | trace_create_enum_file(d_tracer); | 8033 | trace_create_eval_file(d_tracer); |
| 7847 | 8034 | ||
| 7848 | #ifdef CONFIG_MODULES | 8035 | #ifdef CONFIG_MODULES |
| 7849 | register_module_notifier(&trace_module_nb); | 8036 | register_module_notifier(&trace_module_nb); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 39fd77330aab..490ba229931d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -263,7 +263,10 @@ struct trace_array { | |||
| 263 | struct ftrace_ops *ops; | 263 | struct ftrace_ops *ops; |
| 264 | struct trace_pid_list __rcu *function_pids; | 264 | struct trace_pid_list __rcu *function_pids; |
| 265 | #ifdef CONFIG_DYNAMIC_FTRACE | 265 | #ifdef CONFIG_DYNAMIC_FTRACE |
| 266 | /* All of these are protected by the ftrace_lock */ | ||
| 266 | struct list_head func_probes; | 267 | struct list_head func_probes; |
| 268 | struct list_head mod_trace; | ||
| 269 | struct list_head mod_notrace; | ||
| 267 | #endif | 270 | #endif |
| 268 | /* function tracing enabled */ | 271 | /* function tracing enabled */ |
| 269 | int function_enabled; | 272 | int function_enabled; |
| @@ -637,6 +640,9 @@ void set_graph_array(struct trace_array *tr); | |||
| 637 | 640 | ||
| 638 | void tracing_start_cmdline_record(void); | 641 | void tracing_start_cmdline_record(void); |
| 639 | void tracing_stop_cmdline_record(void); | 642 | void tracing_stop_cmdline_record(void); |
| 643 | void tracing_start_tgid_record(void); | ||
| 644 | void tracing_stop_tgid_record(void); | ||
| 645 | |||
| 640 | int register_tracer(struct tracer *type); | 646 | int register_tracer(struct tracer *type); |
| 641 | int is_tracing_stopped(void); | 647 | int is_tracing_stopped(void); |
| 642 | 648 | ||
| @@ -697,6 +703,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags, | |||
| 697 | extern u64 ftrace_now(int cpu); | 703 | extern u64 ftrace_now(int cpu); |
| 698 | 704 | ||
| 699 | extern void trace_find_cmdline(int pid, char comm[]); | 705 | extern void trace_find_cmdline(int pid, char comm[]); |
| 706 | extern int trace_find_tgid(int pid); | ||
| 700 | extern void trace_event_follow_fork(struct trace_array *tr, bool enable); | 707 | extern void trace_event_follow_fork(struct trace_array *tr, bool enable); |
| 701 | 708 | ||
| 702 | #ifdef CONFIG_DYNAMIC_FTRACE | 709 | #ifdef CONFIG_DYNAMIC_FTRACE |
| @@ -761,10 +768,24 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); | |||
| 761 | 768 | ||
| 762 | extern char trace_find_mark(unsigned long long duration); | 769 | extern char trace_find_mark(unsigned long long duration); |
| 763 | 770 | ||
| 771 | struct ftrace_hash; | ||
| 772 | |||
| 773 | struct ftrace_mod_load { | ||
| 774 | struct list_head list; | ||
| 775 | char *func; | ||
| 776 | char *module; | ||
| 777 | int enable; | ||
| 778 | }; | ||
| 779 | |||
| 780 | enum { | ||
| 781 | FTRACE_HASH_FL_MOD = (1 << 0), | ||
| 782 | }; | ||
| 783 | |||
| 764 | struct ftrace_hash { | 784 | struct ftrace_hash { |
| 765 | unsigned long size_bits; | 785 | unsigned long size_bits; |
| 766 | struct hlist_head *buckets; | 786 | struct hlist_head *buckets; |
| 767 | unsigned long count; | 787 | unsigned long count; |
| 788 | unsigned long flags; | ||
| 768 | struct rcu_head rcu; | 789 | struct rcu_head rcu; |
| 769 | }; | 790 | }; |
| 770 | 791 | ||
| @@ -773,7 +794,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip); | |||
| 773 | 794 | ||
| 774 | static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) | 795 | static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) |
| 775 | { | 796 | { |
| 776 | return !hash || !hash->count; | 797 | return !hash || !(hash->count || (hash->flags & FTRACE_HASH_FL_MOD)); |
| 777 | } | 798 | } |
| 778 | 799 | ||
| 779 | /* Standard output formatting function used for function return traces */ | 800 | /* Standard output formatting function used for function return traces */ |
| @@ -1107,6 +1128,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, | |||
| 1107 | C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \ | 1128 | C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \ |
| 1108 | C(LATENCY_FMT, "latency-format"), \ | 1129 | C(LATENCY_FMT, "latency-format"), \ |
| 1109 | C(RECORD_CMD, "record-cmd"), \ | 1130 | C(RECORD_CMD, "record-cmd"), \ |
| 1131 | C(RECORD_TGID, "record-tgid"), \ | ||
| 1110 | C(OVERWRITE, "overwrite"), \ | 1132 | C(OVERWRITE, "overwrite"), \ |
| 1111 | C(STOP_ON_FREE, "disable_on_free"), \ | 1133 | C(STOP_ON_FREE, "disable_on_free"), \ |
| 1112 | C(IRQ_INFO, "irq-info"), \ | 1134 | C(IRQ_INFO, "irq-info"), \ |
| @@ -1188,9 +1210,9 @@ struct ftrace_event_field { | |||
| 1188 | struct event_filter { | 1210 | struct event_filter { |
| 1189 | int n_preds; /* Number assigned */ | 1211 | int n_preds; /* Number assigned */ |
| 1190 | int a_preds; /* allocated */ | 1212 | int a_preds; /* allocated */ |
| 1191 | struct filter_pred *preds; | 1213 | struct filter_pred __rcu *preds; |
| 1192 | struct filter_pred *root; | 1214 | struct filter_pred __rcu *root; |
| 1193 | char *filter_string; | 1215 | char *filter_string; |
| 1194 | }; | 1216 | }; |
| 1195 | 1217 | ||
| 1196 | struct event_subsystem { | 1218 | struct event_subsystem { |
| @@ -1423,6 +1445,8 @@ struct ftrace_event_field * | |||
| 1423 | trace_find_event_field(struct trace_event_call *call, char *name); | 1445 | trace_find_event_field(struct trace_event_call *call, char *name); |
| 1424 | 1446 | ||
| 1425 | extern void trace_event_enable_cmd_record(bool enable); | 1447 | extern void trace_event_enable_cmd_record(bool enable); |
| 1448 | extern void trace_event_enable_tgid_record(bool enable); | ||
| 1449 | |||
| 1426 | extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); | 1450 | extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); |
| 1427 | extern int event_trace_del_tracer(struct trace_array *tr); | 1451 | extern int event_trace_del_tracer(struct trace_array *tr); |
| 1428 | 1452 | ||
| @@ -1773,10 +1797,10 @@ static inline const char *get_syscall_name(int syscall) | |||
| 1773 | 1797 | ||
| 1774 | #ifdef CONFIG_EVENT_TRACING | 1798 | #ifdef CONFIG_EVENT_TRACING |
| 1775 | void trace_event_init(void); | 1799 | void trace_event_init(void); |
| 1776 | void trace_event_enum_update(struct trace_enum_map **map, int len); | 1800 | void trace_event_eval_update(struct trace_eval_map **map, int len); |
| 1777 | #else | 1801 | #else |
| 1778 | static inline void __init trace_event_init(void) { } | 1802 | static inline void __init trace_event_init(void) { } |
| 1779 | static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { } | 1803 | static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { } |
| 1780 | #endif | 1804 | #endif |
| 1781 | 1805 | ||
| 1782 | extern struct trace_iterator *tracepoint_print_iter; | 1806 | extern struct trace_iterator *tracepoint_print_iter; |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e7973e10398c..36132f9280e6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -343,6 +343,28 @@ void trace_event_enable_cmd_record(bool enable) | |||
| 343 | mutex_unlock(&event_mutex); | 343 | mutex_unlock(&event_mutex); |
| 344 | } | 344 | } |
| 345 | 345 | ||
| 346 | void trace_event_enable_tgid_record(bool enable) | ||
| 347 | { | ||
| 348 | struct trace_event_file *file; | ||
| 349 | struct trace_array *tr; | ||
| 350 | |||
| 351 | mutex_lock(&event_mutex); | ||
| 352 | do_for_each_event_file(tr, file) { | ||
| 353 | if (!(file->flags & EVENT_FILE_FL_ENABLED)) | ||
| 354 | continue; | ||
| 355 | |||
| 356 | if (enable) { | ||
| 357 | tracing_start_tgid_record(); | ||
| 358 | set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); | ||
| 359 | } else { | ||
| 360 | tracing_stop_tgid_record(); | ||
| 361 | clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, | ||
| 362 | &file->flags); | ||
| 363 | } | ||
| 364 | } while_for_each_event_file(); | ||
| 365 | mutex_unlock(&event_mutex); | ||
| 366 | } | ||
| 367 | |||
| 346 | static int __ftrace_event_enable_disable(struct trace_event_file *file, | 368 | static int __ftrace_event_enable_disable(struct trace_event_file *file, |
| 347 | int enable, int soft_disable) | 369 | int enable, int soft_disable) |
| 348 | { | 370 | { |
| @@ -381,6 +403,12 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, | |||
| 381 | tracing_stop_cmdline_record(); | 403 | tracing_stop_cmdline_record(); |
| 382 | clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); | 404 | clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); |
| 383 | } | 405 | } |
| 406 | |||
| 407 | if (file->flags & EVENT_FILE_FL_RECORDED_TGID) { | ||
| 408 | tracing_stop_tgid_record(); | ||
| 409 | clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); | ||
| 410 | } | ||
| 411 | |||
| 384 | call->class->reg(call, TRACE_REG_UNREGISTER, file); | 412 | call->class->reg(call, TRACE_REG_UNREGISTER, file); |
| 385 | } | 413 | } |
| 386 | /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ | 414 | /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ |
| @@ -407,18 +435,30 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, | |||
| 407 | } | 435 | } |
| 408 | 436 | ||
| 409 | if (!(file->flags & EVENT_FILE_FL_ENABLED)) { | 437 | if (!(file->flags & EVENT_FILE_FL_ENABLED)) { |
| 438 | bool cmd = false, tgid = false; | ||
| 410 | 439 | ||
| 411 | /* Keep the event disabled, when going to SOFT_MODE. */ | 440 | /* Keep the event disabled, when going to SOFT_MODE. */ |
| 412 | if (soft_disable) | 441 | if (soft_disable) |
| 413 | set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); | 442 | set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); |
| 414 | 443 | ||
| 415 | if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { | 444 | if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { |
| 445 | cmd = true; | ||
| 416 | tracing_start_cmdline_record(); | 446 | tracing_start_cmdline_record(); |
| 417 | set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); | 447 | set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); |
| 418 | } | 448 | } |
| 449 | |||
| 450 | if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { | ||
| 451 | tgid = true; | ||
| 452 | tracing_start_tgid_record(); | ||
| 453 | set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); | ||
| 454 | } | ||
| 455 | |||
| 419 | ret = call->class->reg(call, TRACE_REG_REGISTER, file); | 456 | ret = call->class->reg(call, TRACE_REG_REGISTER, file); |
| 420 | if (ret) { | 457 | if (ret) { |
| 421 | tracing_stop_cmdline_record(); | 458 | if (cmd) |
| 459 | tracing_stop_cmdline_record(); | ||
| 460 | if (tgid) | ||
| 461 | tracing_stop_tgid_record(); | ||
| 422 | pr_info("event trace: Could not enable event " | 462 | pr_info("event trace: Could not enable event " |
| 423 | "%s\n", trace_event_name(call)); | 463 | "%s\n", trace_event_name(call)); |
| 424 | break; | 464 | break; |
| @@ -2067,18 +2107,18 @@ __register_event(struct trace_event_call *call, struct module *mod) | |||
| 2067 | return 0; | 2107 | return 0; |
| 2068 | } | 2108 | } |
| 2069 | 2109 | ||
| 2070 | static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) | 2110 | static char *eval_replace(char *ptr, struct trace_eval_map *map, int len) |
| 2071 | { | 2111 | { |
| 2072 | int rlen; | 2112 | int rlen; |
| 2073 | int elen; | 2113 | int elen; |
| 2074 | 2114 | ||
| 2075 | /* Find the length of the enum value as a string */ | 2115 | /* Find the length of the eval value as a string */ |
| 2076 | elen = snprintf(ptr, 0, "%ld", map->enum_value); | 2116 | elen = snprintf(ptr, 0, "%ld", map->eval_value); |
| 2077 | /* Make sure there's enough room to replace the string with the value */ | 2117 | /* Make sure there's enough room to replace the string with the value */ |
| 2078 | if (len < elen) | 2118 | if (len < elen) |
| 2079 | return NULL; | 2119 | return NULL; |
| 2080 | 2120 | ||
| 2081 | snprintf(ptr, elen + 1, "%ld", map->enum_value); | 2121 | snprintf(ptr, elen + 1, "%ld", map->eval_value); |
| 2082 | 2122 | ||
| 2083 | /* Get the rest of the string of ptr */ | 2123 | /* Get the rest of the string of ptr */ |
| 2084 | rlen = strlen(ptr + len); | 2124 | rlen = strlen(ptr + len); |
| @@ -2090,11 +2130,11 @@ static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) | |||
| 2090 | } | 2130 | } |
| 2091 | 2131 | ||
| 2092 | static void update_event_printk(struct trace_event_call *call, | 2132 | static void update_event_printk(struct trace_event_call *call, |
| 2093 | struct trace_enum_map *map) | 2133 | struct trace_eval_map *map) |
| 2094 | { | 2134 | { |
| 2095 | char *ptr; | 2135 | char *ptr; |
| 2096 | int quote = 0; | 2136 | int quote = 0; |
| 2097 | int len = strlen(map->enum_string); | 2137 | int len = strlen(map->eval_string); |
| 2098 | 2138 | ||
| 2099 | for (ptr = call->print_fmt; *ptr; ptr++) { | 2139 | for (ptr = call->print_fmt; *ptr; ptr++) { |
| 2100 | if (*ptr == '\\') { | 2140 | if (*ptr == '\\') { |
| @@ -2125,16 +2165,16 @@ static void update_event_printk(struct trace_event_call *call, | |||
| 2125 | continue; | 2165 | continue; |
| 2126 | } | 2166 | } |
| 2127 | if (isalpha(*ptr) || *ptr == '_') { | 2167 | if (isalpha(*ptr) || *ptr == '_') { |
| 2128 | if (strncmp(map->enum_string, ptr, len) == 0 && | 2168 | if (strncmp(map->eval_string, ptr, len) == 0 && |
| 2129 | !isalnum(ptr[len]) && ptr[len] != '_') { | 2169 | !isalnum(ptr[len]) && ptr[len] != '_') { |
| 2130 | ptr = enum_replace(ptr, map, len); | 2170 | ptr = eval_replace(ptr, map, len); |
| 2131 | /* Hmm, enum string smaller than value */ | 2171 | /* enum/sizeof string smaller than value */ |
| 2132 | if (WARN_ON_ONCE(!ptr)) | 2172 | if (WARN_ON_ONCE(!ptr)) |
| 2133 | return; | 2173 | return; |
| 2134 | /* | 2174 | /* |
| 2135 | * No need to decrement here, as enum_replace() | 2175 | * No need to decrement here, as eval_replace() |
| 2136 | * returns the pointer to the character passed | 2176 | * returns the pointer to the character passed |
| 2137 | * the enum, and two enums can not be placed | 2177 | * the eval, and two evals can not be placed |
| 2138 | * back to back without something in between. | 2178 | * back to back without something in between. |
| 2139 | * We can skip that something in between. | 2179 | * We can skip that something in between. |
| 2140 | */ | 2180 | */ |
| @@ -2165,7 +2205,7 @@ static void update_event_printk(struct trace_event_call *call, | |||
| 2165 | } | 2205 | } |
| 2166 | } | 2206 | } |
| 2167 | 2207 | ||
| 2168 | void trace_event_enum_update(struct trace_enum_map **map, int len) | 2208 | void trace_event_eval_update(struct trace_eval_map **map, int len) |
| 2169 | { | 2209 | { |
| 2170 | struct trace_event_call *call, *p; | 2210 | struct trace_event_call *call, *p; |
| 2171 | const char *last_system = NULL; | 2211 | const char *last_system = NULL; |
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index a3bddbfd0874..a0910c0cdf2e 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
| @@ -654,6 +654,9 @@ ftrace_trace_onoff_callback(struct trace_array *tr, struct ftrace_hash *hash, | |||
| 654 | { | 654 | { |
| 655 | struct ftrace_probe_ops *ops; | 655 | struct ftrace_probe_ops *ops; |
| 656 | 656 | ||
| 657 | if (!tr) | ||
| 658 | return -ENODEV; | ||
| 659 | |||
| 657 | /* we register both traceon and traceoff to this callback */ | 660 | /* we register both traceon and traceoff to this callback */ |
| 658 | if (strcmp(cmd, "traceon") == 0) | 661 | if (strcmp(cmd, "traceon") == 0) |
| 659 | ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; | 662 | ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; |
| @@ -670,6 +673,9 @@ ftrace_stacktrace_callback(struct trace_array *tr, struct ftrace_hash *hash, | |||
| 670 | { | 673 | { |
| 671 | struct ftrace_probe_ops *ops; | 674 | struct ftrace_probe_ops *ops; |
| 672 | 675 | ||
| 676 | if (!tr) | ||
| 677 | return -ENODEV; | ||
| 678 | |||
| 673 | ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; | 679 | ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; |
| 674 | 680 | ||
| 675 | return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, | 681 | return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, |
| @@ -682,6 +688,9 @@ ftrace_dump_callback(struct trace_array *tr, struct ftrace_hash *hash, | |||
| 682 | { | 688 | { |
| 683 | struct ftrace_probe_ops *ops; | 689 | struct ftrace_probe_ops *ops; |
| 684 | 690 | ||
| 691 | if (!tr) | ||
| 692 | return -ENODEV; | ||
| 693 | |||
| 685 | ops = &dump_probe_ops; | 694 | ops = &dump_probe_ops; |
| 686 | 695 | ||
| 687 | /* Only dump once. */ | 696 | /* Only dump once. */ |
| @@ -695,6 +704,9 @@ ftrace_cpudump_callback(struct trace_array *tr, struct ftrace_hash *hash, | |||
| 695 | { | 704 | { |
| 696 | struct ftrace_probe_ops *ops; | 705 | struct ftrace_probe_ops *ops; |
| 697 | 706 | ||
| 707 | if (!tr) | ||
| 708 | return -ENODEV; | ||
| 709 | |||
| 698 | ops = &cpudump_probe_ops; | 710 | ops = &cpudump_probe_ops; |
| 699 | 711 | ||
| 700 | /* Only dump once. */ | 712 | /* Only dump once. */ |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c129fca6ec99..c9b5aa10fbf9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -598,6 +598,14 @@ static struct notifier_block trace_kprobe_module_nb = { | |||
| 598 | .priority = 1 /* Invoked after kprobe module callback */ | 598 | .priority = 1 /* Invoked after kprobe module callback */ |
| 599 | }; | 599 | }; |
| 600 | 600 | ||
| 601 | /* Convert certain expected symbols into '_' when generating event names */ | ||
| 602 | static inline void sanitize_event_name(char *name) | ||
| 603 | { | ||
| 604 | while (*name++ != '\0') | ||
| 605 | if (*name == ':' || *name == '.') | ||
| 606 | *name = '_'; | ||
| 607 | } | ||
| 608 | |||
| 601 | static int create_trace_kprobe(int argc, char **argv) | 609 | static int create_trace_kprobe(int argc, char **argv) |
| 602 | { | 610 | { |
| 603 | /* | 611 | /* |
| @@ -707,24 +715,20 @@ static int create_trace_kprobe(int argc, char **argv) | |||
| 707 | pr_info("Probe point is not specified.\n"); | 715 | pr_info("Probe point is not specified.\n"); |
| 708 | return -EINVAL; | 716 | return -EINVAL; |
| 709 | } | 717 | } |
| 710 | if (isdigit(argv[1][0])) { | 718 | |
| 711 | /* an address specified */ | 719 | /* try to parse an address. if that fails, try to read the |
| 712 | ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); | 720 | * input as a symbol. */ |
| 713 | if (ret) { | 721 | if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { |
| 714 | pr_info("Failed to parse address.\n"); | ||
| 715 | return ret; | ||
| 716 | } | ||
| 717 | } else { | ||
| 718 | /* a symbol specified */ | 722 | /* a symbol specified */ |
| 719 | symbol = argv[1]; | 723 | symbol = argv[1]; |
| 720 | /* TODO: support .init module functions */ | 724 | /* TODO: support .init module functions */ |
| 721 | ret = traceprobe_split_symbol_offset(symbol, &offset); | 725 | ret = traceprobe_split_symbol_offset(symbol, &offset); |
| 722 | if (ret) { | 726 | if (ret) { |
| 723 | pr_info("Failed to parse symbol.\n"); | 727 | pr_info("Failed to parse either an address or a symbol.\n"); |
| 724 | return ret; | 728 | return ret; |
| 725 | } | 729 | } |
| 726 | if (offset && is_return && | 730 | if (offset && is_return && |
| 727 | !function_offset_within_entry(NULL, symbol, offset)) { | 731 | !kprobe_on_func_entry(NULL, symbol, offset)) { |
| 728 | pr_info("Given offset is not valid for return probe.\n"); | 732 | pr_info("Given offset is not valid for return probe.\n"); |
| 729 | return -EINVAL; | 733 | return -EINVAL; |
| 730 | } | 734 | } |
| @@ -740,6 +744,7 @@ static int create_trace_kprobe(int argc, char **argv) | |||
| 740 | else | 744 | else |
| 741 | snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", | 745 | snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", |
| 742 | is_return ? 'r' : 'p', addr); | 746 | is_return ? 'r' : 'p', addr); |
| 747 | sanitize_event_name(buf); | ||
| 743 | event = buf; | 748 | event = buf; |
| 744 | } | 749 | } |
| 745 | tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, | 750 | tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 08f9bab8089e..bac629af2285 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
| @@ -340,31 +340,41 @@ static inline const char *kretprobed(const char *name) | |||
| 340 | static void | 340 | static void |
| 341 | seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) | 341 | seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) |
| 342 | { | 342 | { |
| 343 | #ifdef CONFIG_KALLSYMS | ||
| 344 | char str[KSYM_SYMBOL_LEN]; | 343 | char str[KSYM_SYMBOL_LEN]; |
| 344 | #ifdef CONFIG_KALLSYMS | ||
| 345 | const char *name; | 345 | const char *name; |
| 346 | 346 | ||
| 347 | kallsyms_lookup(address, NULL, NULL, NULL, str); | 347 | kallsyms_lookup(address, NULL, NULL, NULL, str); |
| 348 | 348 | ||
| 349 | name = kretprobed(str); | 349 | name = kretprobed(str); |
| 350 | 350 | ||
| 351 | trace_seq_printf(s, fmt, name); | 351 | if (name && strlen(name)) { |
| 352 | trace_seq_printf(s, fmt, name); | ||
| 353 | return; | ||
| 354 | } | ||
| 352 | #endif | 355 | #endif |
| 356 | snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address); | ||
| 357 | trace_seq_printf(s, fmt, str); | ||
| 353 | } | 358 | } |
| 354 | 359 | ||
| 355 | static void | 360 | static void |
| 356 | seq_print_sym_offset(struct trace_seq *s, const char *fmt, | 361 | seq_print_sym_offset(struct trace_seq *s, const char *fmt, |
| 357 | unsigned long address) | 362 | unsigned long address) |
| 358 | { | 363 | { |
| 359 | #ifdef CONFIG_KALLSYMS | ||
| 360 | char str[KSYM_SYMBOL_LEN]; | 364 | char str[KSYM_SYMBOL_LEN]; |
| 365 | #ifdef CONFIG_KALLSYMS | ||
| 361 | const char *name; | 366 | const char *name; |
| 362 | 367 | ||
| 363 | sprint_symbol(str, address); | 368 | sprint_symbol(str, address); |
| 364 | name = kretprobed(str); | 369 | name = kretprobed(str); |
| 365 | 370 | ||
| 366 | trace_seq_printf(s, fmt, name); | 371 | if (name && strlen(name)) { |
| 372 | trace_seq_printf(s, fmt, name); | ||
| 373 | return; | ||
| 374 | } | ||
| 367 | #endif | 375 | #endif |
| 376 | snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address); | ||
| 377 | trace_seq_printf(s, fmt, str); | ||
| 368 | } | 378 | } |
| 369 | 379 | ||
| 370 | #ifndef CONFIG_64BIT | 380 | #ifndef CONFIG_64BIT |
| @@ -587,6 +597,15 @@ int trace_print_context(struct trace_iterator *iter) | |||
| 587 | trace_seq_printf(s, "%16s-%-5d [%03d] ", | 597 | trace_seq_printf(s, "%16s-%-5d [%03d] ", |
| 588 | comm, entry->pid, iter->cpu); | 598 | comm, entry->pid, iter->cpu); |
| 589 | 599 | ||
| 600 | if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { | ||
| 601 | unsigned int tgid = trace_find_tgid(entry->pid); | ||
| 602 | |||
| 603 | if (!tgid) | ||
| 604 | trace_seq_printf(s, "(-----) "); | ||
| 605 | else | ||
| 606 | trace_seq_printf(s, "(%5d) ", tgid); | ||
| 607 | } | ||
| 608 | |||
| 590 | if (tr->trace_flags & TRACE_ITER_IRQ_INFO) | 609 | if (tr->trace_flags & TRACE_ITER_IRQ_INFO) |
| 591 | trace_print_lat_fmt(s, entry); | 610 | trace_print_lat_fmt(s, entry); |
| 592 | 611 | ||
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 4c896a0101bd..b341c02730be 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c | |||
| @@ -12,27 +12,38 @@ | |||
| 12 | 12 | ||
| 13 | #include "trace.h" | 13 | #include "trace.h" |
| 14 | 14 | ||
| 15 | static int sched_ref; | 15 | #define RECORD_CMDLINE 1 |
| 16 | #define RECORD_TGID 2 | ||
| 17 | |||
| 18 | static int sched_cmdline_ref; | ||
| 19 | static int sched_tgid_ref; | ||
| 16 | static DEFINE_MUTEX(sched_register_mutex); | 20 | static DEFINE_MUTEX(sched_register_mutex); |
| 17 | 21 | ||
| 18 | static void | 22 | static void |
| 19 | probe_sched_switch(void *ignore, bool preempt, | 23 | probe_sched_switch(void *ignore, bool preempt, |
| 20 | struct task_struct *prev, struct task_struct *next) | 24 | struct task_struct *prev, struct task_struct *next) |
| 21 | { | 25 | { |
| 22 | if (unlikely(!sched_ref)) | 26 | int flags; |
| 23 | return; | 27 | |
| 28 | flags = (RECORD_TGID * !!sched_tgid_ref) + | ||
| 29 | (RECORD_CMDLINE * !!sched_cmdline_ref); | ||
| 24 | 30 | ||
| 25 | tracing_record_cmdline(prev); | 31 | if (!flags) |
| 26 | tracing_record_cmdline(next); | 32 | return; |
| 33 | tracing_record_taskinfo_sched_switch(prev, next, flags); | ||
| 27 | } | 34 | } |
| 28 | 35 | ||
| 29 | static void | 36 | static void |
| 30 | probe_sched_wakeup(void *ignore, struct task_struct *wakee) | 37 | probe_sched_wakeup(void *ignore, struct task_struct *wakee) |
| 31 | { | 38 | { |
| 32 | if (unlikely(!sched_ref)) | 39 | int flags; |
| 33 | return; | 40 | |
| 41 | flags = (RECORD_TGID * !!sched_tgid_ref) + | ||
| 42 | (RECORD_CMDLINE * !!sched_cmdline_ref); | ||
| 34 | 43 | ||
| 35 | tracing_record_cmdline(current); | 44 | if (!flags) |
| 45 | return; | ||
| 46 | tracing_record_taskinfo(current, flags); | ||
| 36 | } | 47 | } |
| 37 | 48 | ||
| 38 | static int tracing_sched_register(void) | 49 | static int tracing_sched_register(void) |
| @@ -75,28 +86,61 @@ static void tracing_sched_unregister(void) | |||
| 75 | unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); | 86 | unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); |
| 76 | } | 87 | } |
| 77 | 88 | ||
| 78 | static void tracing_start_sched_switch(void) | 89 | static void tracing_start_sched_switch(int ops) |
| 79 | { | 90 | { |
| 91 | bool sched_register = (!sched_cmdline_ref && !sched_tgid_ref); | ||
| 80 | mutex_lock(&sched_register_mutex); | 92 | mutex_lock(&sched_register_mutex); |
| 81 | if (!(sched_ref++)) | 93 | |
| 94 | switch (ops) { | ||
| 95 | case RECORD_CMDLINE: | ||
| 96 | sched_cmdline_ref++; | ||
| 97 | break; | ||
| 98 | |||
| 99 | case RECORD_TGID: | ||
| 100 | sched_tgid_ref++; | ||
| 101 | break; | ||
| 102 | } | ||
| 103 | |||
| 104 | if (sched_register && (sched_cmdline_ref || sched_tgid_ref)) | ||
| 82 | tracing_sched_register(); | 105 | tracing_sched_register(); |
| 83 | mutex_unlock(&sched_register_mutex); | 106 | mutex_unlock(&sched_register_mutex); |
| 84 | } | 107 | } |
| 85 | 108 | ||
| 86 | static void tracing_stop_sched_switch(void) | 109 | static void tracing_stop_sched_switch(int ops) |
| 87 | { | 110 | { |
| 88 | mutex_lock(&sched_register_mutex); | 111 | mutex_lock(&sched_register_mutex); |
| 89 | if (!(--sched_ref)) | 112 | |
| 113 | switch (ops) { | ||
| 114 | case RECORD_CMDLINE: | ||
| 115 | sched_cmdline_ref--; | ||
| 116 | break; | ||
| 117 | |||
| 118 | case RECORD_TGID: | ||
| 119 | sched_tgid_ref--; | ||
| 120 | break; | ||
| 121 | } | ||
| 122 | |||
| 123 | if (!sched_cmdline_ref && !sched_tgid_ref) | ||
| 90 | tracing_sched_unregister(); | 124 | tracing_sched_unregister(); |
| 91 | mutex_unlock(&sched_register_mutex); | 125 | mutex_unlock(&sched_register_mutex); |
| 92 | } | 126 | } |
| 93 | 127 | ||
| 94 | void tracing_start_cmdline_record(void) | 128 | void tracing_start_cmdline_record(void) |
| 95 | { | 129 | { |
| 96 | tracing_start_sched_switch(); | 130 | tracing_start_sched_switch(RECORD_CMDLINE); |
| 97 | } | 131 | } |
| 98 | 132 | ||
| 99 | void tracing_stop_cmdline_record(void) | 133 | void tracing_stop_cmdline_record(void) |
| 100 | { | 134 | { |
| 101 | tracing_stop_sched_switch(); | 135 | tracing_stop_sched_switch(RECORD_CMDLINE); |
| 136 | } | ||
| 137 | |||
| 138 | void tracing_start_tgid_record(void) | ||
| 139 | { | ||
| 140 | tracing_start_sched_switch(RECORD_TGID); | ||
| 141 | } | ||
| 142 | |||
| 143 | void tracing_stop_tgid_record(void) | ||
| 144 | { | ||
| 145 | tracing_stop_sched_switch(RECORD_TGID); | ||
| 102 | } | 146 | } |
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 76aa04d4c925..a4df67cbc711 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
| @@ -406,10 +406,14 @@ static const struct file_operations stack_trace_fops = { | |||
| 406 | .release = seq_release, | 406 | .release = seq_release, |
| 407 | }; | 407 | }; |
| 408 | 408 | ||
| 409 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
| 410 | |||
| 409 | static int | 411 | static int |
| 410 | stack_trace_filter_open(struct inode *inode, struct file *file) | 412 | stack_trace_filter_open(struct inode *inode, struct file *file) |
| 411 | { | 413 | { |
| 412 | return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER, | 414 | struct ftrace_ops *ops = inode->i_private; |
| 415 | |||
| 416 | return ftrace_regex_open(ops, FTRACE_ITER_FILTER, | ||
| 413 | inode, file); | 417 | inode, file); |
| 414 | } | 418 | } |
| 415 | 419 | ||
| @@ -421,6 +425,8 @@ static const struct file_operations stack_trace_filter_fops = { | |||
| 421 | .release = ftrace_regex_release, | 425 | .release = ftrace_regex_release, |
| 422 | }; | 426 | }; |
| 423 | 427 | ||
| 428 | #endif /* CONFIG_DYNAMIC_FTRACE */ | ||
| 429 | |||
| 424 | int | 430 | int |
| 425 | stack_trace_sysctl(struct ctl_table *table, int write, | 431 | stack_trace_sysctl(struct ctl_table *table, int write, |
| 426 | void __user *buffer, size_t *lenp, | 432 | void __user *buffer, size_t *lenp, |
| @@ -475,8 +481,10 @@ static __init int stack_trace_init(void) | |||
| 475 | trace_create_file("stack_trace", 0444, d_tracer, | 481 | trace_create_file("stack_trace", 0444, d_tracer, |
| 476 | NULL, &stack_trace_fops); | 482 | NULL, &stack_trace_fops); |
| 477 | 483 | ||
| 484 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
| 478 | trace_create_file("stack_trace_filter", 0444, d_tracer, | 485 | trace_create_file("stack_trace_filter", 0444, d_tracer, |
| 479 | NULL, &stack_trace_filter_fops); | 486 | &trace_ops, &stack_trace_filter_fops); |
| 487 | #endif | ||
| 480 | 488 | ||
| 481 | if (stack_trace_filter_buf[0]) | 489 | if (stack_trace_filter_buf[0]) |
| 482 | ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); | 490 | ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 03e0b69bb5bf..06d3389bca0d 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
| @@ -9,7 +9,7 @@ | |||
| 9 | * to those contributors as well. | 9 | * to those contributors as well. |
| 10 | */ | 10 | */ |
| 11 | 11 | ||
| 12 | #define pr_fmt(fmt) "NMI watchdog: " fmt | 12 | #define pr_fmt(fmt) "watchdog: " fmt |
| 13 | 13 | ||
| 14 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
| 15 | #include <linux/cpu.h> | 15 | #include <linux/cpu.h> |
| @@ -29,15 +29,58 @@ | |||
| 29 | #include <linux/kvm_para.h> | 29 | #include <linux/kvm_para.h> |
| 30 | #include <linux/kthread.h> | 30 | #include <linux/kthread.h> |
| 31 | 31 | ||
| 32 | /* Watchdog configuration */ | ||
| 32 | static DEFINE_MUTEX(watchdog_proc_mutex); | 33 | static DEFINE_MUTEX(watchdog_proc_mutex); |
| 33 | 34 | ||
| 34 | #if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) | 35 | int __read_mostly nmi_watchdog_enabled; |
| 35 | unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; | 36 | |
| 37 | #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) | ||
| 38 | unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | | ||
| 39 | NMI_WATCHDOG_ENABLED; | ||
| 36 | #else | 40 | #else |
| 37 | unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; | 41 | unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; |
| 38 | #endif | 42 | #endif |
| 39 | int __read_mostly nmi_watchdog_enabled; | 43 | |
| 44 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 45 | /* boot commands */ | ||
| 46 | /* | ||
| 47 | * Should we panic when a soft-lockup or hard-lockup occurs: | ||
| 48 | */ | ||
| 49 | unsigned int __read_mostly hardlockup_panic = | ||
| 50 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | ||
| 51 | /* | ||
| 52 | * We may not want to enable hard lockup detection by default in all cases, | ||
| 53 | * for example when running the kernel as a guest on a hypervisor. In these | ||
| 54 | * cases this function can be called to disable hard lockup detection. This | ||
| 55 | * function should only be executed once by the boot processor before the | ||
| 56 | * kernel command line parameters are parsed, because otherwise it is not | ||
| 57 | * possible to override this in hardlockup_panic_setup(). | ||
| 58 | */ | ||
| 59 | void hardlockup_detector_disable(void) | ||
| 60 | { | ||
| 61 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | ||
| 62 | } | ||
| 63 | |||
| 64 | static int __init hardlockup_panic_setup(char *str) | ||
| 65 | { | ||
| 66 | if (!strncmp(str, "panic", 5)) | ||
| 67 | hardlockup_panic = 1; | ||
| 68 | else if (!strncmp(str, "nopanic", 7)) | ||
| 69 | hardlockup_panic = 0; | ||
| 70 | else if (!strncmp(str, "0", 1)) | ||
| 71 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | ||
| 72 | else if (!strncmp(str, "1", 1)) | ||
| 73 | watchdog_enabled |= NMI_WATCHDOG_ENABLED; | ||
| 74 | return 1; | ||
| 75 | } | ||
| 76 | __setup("nmi_watchdog=", hardlockup_panic_setup); | ||
| 77 | |||
| 78 | #endif | ||
| 79 | |||
| 80 | #ifdef CONFIG_SOFTLOCKUP_DETECTOR | ||
| 40 | int __read_mostly soft_watchdog_enabled; | 81 | int __read_mostly soft_watchdog_enabled; |
| 82 | #endif | ||
| 83 | |||
| 41 | int __read_mostly watchdog_user_enabled; | 84 | int __read_mostly watchdog_user_enabled; |
| 42 | int __read_mostly watchdog_thresh = 10; | 85 | int __read_mostly watchdog_thresh = 10; |
| 43 | 86 | ||
| @@ -45,15 +88,9 @@ int __read_mostly watchdog_thresh = 10; | |||
| 45 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; | 88 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; |
| 46 | int __read_mostly sysctl_hardlockup_all_cpu_backtrace; | 89 | int __read_mostly sysctl_hardlockup_all_cpu_backtrace; |
| 47 | #endif | 90 | #endif |
| 48 | static struct cpumask watchdog_cpumask __read_mostly; | 91 | struct cpumask watchdog_cpumask __read_mostly; |
| 49 | unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); | 92 | unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); |
| 50 | 93 | ||
| 51 | /* Helper for online, unparked cpus. */ | ||
| 52 | #define for_each_watchdog_cpu(cpu) \ | ||
| 53 | for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) | ||
| 54 | |||
| 55 | atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); | ||
| 56 | |||
| 57 | /* | 94 | /* |
| 58 | * The 'watchdog_running' variable is set to 1 when the watchdog threads | 95 | * The 'watchdog_running' variable is set to 1 when the watchdog threads |
| 59 | * are registered/started and is set to 0 when the watchdog threads are | 96 | * are registered/started and is set to 0 when the watchdog threads are |
| @@ -72,7 +109,47 @@ static int __read_mostly watchdog_running; | |||
| 72 | * of 'watchdog_running' cannot change while the watchdog is deactivated | 109 | * of 'watchdog_running' cannot change while the watchdog is deactivated |
| 73 | * temporarily (see related code in 'proc' handlers). | 110 | * temporarily (see related code in 'proc' handlers). |
| 74 | */ | 111 | */ |
| 75 | static int __read_mostly watchdog_suspended; | 112 | int __read_mostly watchdog_suspended; |
| 113 | |||
| 114 | /* | ||
| 115 | * These functions can be overridden if an architecture implements its | ||
| 116 | * own hardlockup detector. | ||
| 117 | * | ||
| 118 | * watchdog_nmi_enable/disable can be implemented to start and stop when | ||
| 119 | * softlockup watchdog threads start and stop. The arch must select the | ||
| 120 | * SOFTLOCKUP_DETECTOR Kconfig. | ||
| 121 | */ | ||
| 122 | int __weak watchdog_nmi_enable(unsigned int cpu) | ||
| 123 | { | ||
| 124 | return 0; | ||
| 125 | } | ||
| 126 | void __weak watchdog_nmi_disable(unsigned int cpu) | ||
| 127 | { | ||
| 128 | } | ||
| 129 | |||
| 130 | /* | ||
| 131 | * watchdog_nmi_reconfigure can be implemented to be notified after any | ||
| 132 | * watchdog configuration change. The arch hardlockup watchdog should | ||
| 133 | * respond to the following variables: | ||
| 134 | * - nmi_watchdog_enabled | ||
| 135 | * - watchdog_thresh | ||
| 136 | * - watchdog_cpumask | ||
| 137 | * - sysctl_hardlockup_all_cpu_backtrace | ||
| 138 | * - hardlockup_panic | ||
| 139 | * - watchdog_suspended | ||
| 140 | */ | ||
| 141 | void __weak watchdog_nmi_reconfigure(void) | ||
| 142 | { | ||
| 143 | } | ||
| 144 | |||
| 145 | |||
| 146 | #ifdef CONFIG_SOFTLOCKUP_DETECTOR | ||
| 147 | |||
| 148 | /* Helper for online, unparked cpus. */ | ||
| 149 | #define for_each_watchdog_cpu(cpu) \ | ||
| 150 | for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) | ||
| 151 | |||
| 152 | atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); | ||
| 76 | 153 | ||
| 77 | static u64 __read_mostly sample_period; | 154 | static u64 __read_mostly sample_period; |
| 78 | 155 | ||
| @@ -120,6 +197,7 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str) | |||
| 120 | return 1; | 197 | return 1; |
| 121 | } | 198 | } |
| 122 | __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); | 199 | __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); |
| 200 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 123 | static int __init hardlockup_all_cpu_backtrace_setup(char *str) | 201 | static int __init hardlockup_all_cpu_backtrace_setup(char *str) |
| 124 | { | 202 | { |
| 125 | sysctl_hardlockup_all_cpu_backtrace = | 203 | sysctl_hardlockup_all_cpu_backtrace = |
| @@ -128,6 +206,7 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str) | |||
| 128 | } | 206 | } |
| 129 | __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); | 207 | __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); |
| 130 | #endif | 208 | #endif |
| 209 | #endif | ||
| 131 | 210 | ||
| 132 | /* | 211 | /* |
| 133 | * Hard-lockup warnings should be triggered after just a few seconds. Soft- | 212 | * Hard-lockup warnings should be triggered after just a few seconds. Soft- |
| @@ -213,18 +292,6 @@ void touch_softlockup_watchdog_sync(void) | |||
| 213 | __this_cpu_write(watchdog_touch_ts, 0); | 292 | __this_cpu_write(watchdog_touch_ts, 0); |
| 214 | } | 293 | } |
| 215 | 294 | ||
| 216 | /* watchdog detector functions */ | ||
| 217 | bool is_hardlockup(void) | ||
| 218 | { | ||
| 219 | unsigned long hrint = __this_cpu_read(hrtimer_interrupts); | ||
| 220 | |||
| 221 | if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) | ||
| 222 | return true; | ||
| 223 | |||
| 224 | __this_cpu_write(hrtimer_interrupts_saved, hrint); | ||
| 225 | return false; | ||
| 226 | } | ||
| 227 | |||
| 228 | static int is_softlockup(unsigned long touch_ts) | 295 | static int is_softlockup(unsigned long touch_ts) |
| 229 | { | 296 | { |
| 230 | unsigned long now = get_timestamp(); | 297 | unsigned long now = get_timestamp(); |
| @@ -237,21 +304,21 @@ static int is_softlockup(unsigned long touch_ts) | |||
| 237 | return 0; | 304 | return 0; |
| 238 | } | 305 | } |
| 239 | 306 | ||
| 240 | static void watchdog_interrupt_count(void) | 307 | /* watchdog detector functions */ |
| 308 | bool is_hardlockup(void) | ||
| 241 | { | 309 | { |
| 242 | __this_cpu_inc(hrtimer_interrupts); | 310 | unsigned long hrint = __this_cpu_read(hrtimer_interrupts); |
| 243 | } | ||
| 244 | 311 | ||
| 245 | /* | 312 | if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) |
| 246 | * These two functions are mostly architecture specific | 313 | return true; |
| 247 | * defining them as weak here. | 314 | |
| 248 | */ | 315 | __this_cpu_write(hrtimer_interrupts_saved, hrint); |
| 249 | int __weak watchdog_nmi_enable(unsigned int cpu) | 316 | return false; |
| 250 | { | ||
| 251 | return 0; | ||
| 252 | } | 317 | } |
| 253 | void __weak watchdog_nmi_disable(unsigned int cpu) | 318 | |
| 319 | static void watchdog_interrupt_count(void) | ||
| 254 | { | 320 | { |
| 321 | __this_cpu_inc(hrtimer_interrupts); | ||
| 255 | } | 322 | } |
| 256 | 323 | ||
| 257 | static int watchdog_enable_all_cpus(void); | 324 | static int watchdog_enable_all_cpus(void); |
| @@ -502,57 +569,6 @@ static void watchdog_unpark_threads(void) | |||
| 502 | kthread_unpark(per_cpu(softlockup_watchdog, cpu)); | 569 | kthread_unpark(per_cpu(softlockup_watchdog, cpu)); |
| 503 | } | 570 | } |
| 504 | 571 | ||
| 505 | /* | ||
| 506 | * Suspend the hard and soft lockup detector by parking the watchdog threads. | ||
| 507 | */ | ||
| 508 | int lockup_detector_suspend(void) | ||
| 509 | { | ||
| 510 | int ret = 0; | ||
| 511 | |||
| 512 | get_online_cpus(); | ||
| 513 | mutex_lock(&watchdog_proc_mutex); | ||
| 514 | /* | ||
| 515 | * Multiple suspend requests can be active in parallel (counted by | ||
| 516 | * the 'watchdog_suspended' variable). If the watchdog threads are | ||
| 517 | * running, the first caller takes care that they will be parked. | ||
| 518 | * The state of 'watchdog_running' cannot change while a suspend | ||
| 519 | * request is active (see related code in 'proc' handlers). | ||
| 520 | */ | ||
| 521 | if (watchdog_running && !watchdog_suspended) | ||
| 522 | ret = watchdog_park_threads(); | ||
| 523 | |||
| 524 | if (ret == 0) | ||
| 525 | watchdog_suspended++; | ||
| 526 | else { | ||
| 527 | watchdog_disable_all_cpus(); | ||
| 528 | pr_err("Failed to suspend lockup detectors, disabled\n"); | ||
| 529 | watchdog_enabled = 0; | ||
| 530 | } | ||
| 531 | |||
| 532 | mutex_unlock(&watchdog_proc_mutex); | ||
| 533 | |||
| 534 | return ret; | ||
| 535 | } | ||
| 536 | |||
| 537 | /* | ||
| 538 | * Resume the hard and soft lockup detector by unparking the watchdog threads. | ||
| 539 | */ | ||
| 540 | void lockup_detector_resume(void) | ||
| 541 | { | ||
| 542 | mutex_lock(&watchdog_proc_mutex); | ||
| 543 | |||
| 544 | watchdog_suspended--; | ||
| 545 | /* | ||
| 546 | * The watchdog threads are unparked if they were previously running | ||
| 547 | * and if there is no more active suspend request. | ||
| 548 | */ | ||
| 549 | if (watchdog_running && !watchdog_suspended) | ||
| 550 | watchdog_unpark_threads(); | ||
| 551 | |||
| 552 | mutex_unlock(&watchdog_proc_mutex); | ||
| 553 | put_online_cpus(); | ||
| 554 | } | ||
| 555 | |||
| 556 | static int update_watchdog_all_cpus(void) | 572 | static int update_watchdog_all_cpus(void) |
| 557 | { | 573 | { |
| 558 | int ret; | 574 | int ret; |
| @@ -605,6 +621,100 @@ static void watchdog_disable_all_cpus(void) | |||
| 605 | } | 621 | } |
| 606 | 622 | ||
| 607 | #ifdef CONFIG_SYSCTL | 623 | #ifdef CONFIG_SYSCTL |
| 624 | static int watchdog_update_cpus(void) | ||
| 625 | { | ||
| 626 | return smpboot_update_cpumask_percpu_thread( | ||
| 627 | &watchdog_threads, &watchdog_cpumask); | ||
| 628 | } | ||
| 629 | #endif | ||
| 630 | |||
| 631 | #else /* SOFTLOCKUP */ | ||
| 632 | static int watchdog_park_threads(void) | ||
| 633 | { | ||
| 634 | return 0; | ||
| 635 | } | ||
| 636 | |||
| 637 | static void watchdog_unpark_threads(void) | ||
| 638 | { | ||
| 639 | } | ||
| 640 | |||
| 641 | static int watchdog_enable_all_cpus(void) | ||
| 642 | { | ||
| 643 | return 0; | ||
| 644 | } | ||
| 645 | |||
| 646 | static void watchdog_disable_all_cpus(void) | ||
| 647 | { | ||
| 648 | } | ||
| 649 | |||
| 650 | #ifdef CONFIG_SYSCTL | ||
| 651 | static int watchdog_update_cpus(void) | ||
| 652 | { | ||
| 653 | return 0; | ||
| 654 | } | ||
| 655 | #endif | ||
| 656 | |||
| 657 | static void set_sample_period(void) | ||
| 658 | { | ||
| 659 | } | ||
| 660 | #endif /* SOFTLOCKUP */ | ||
| 661 | |||
| 662 | /* | ||
| 663 | * Suspend the hard and soft lockup detector by parking the watchdog threads. | ||
| 664 | */ | ||
| 665 | int lockup_detector_suspend(void) | ||
| 666 | { | ||
| 667 | int ret = 0; | ||
| 668 | |||
| 669 | get_online_cpus(); | ||
| 670 | mutex_lock(&watchdog_proc_mutex); | ||
| 671 | /* | ||
| 672 | * Multiple suspend requests can be active in parallel (counted by | ||
| 673 | * the 'watchdog_suspended' variable). If the watchdog threads are | ||
| 674 | * running, the first caller takes care that they will be parked. | ||
| 675 | * The state of 'watchdog_running' cannot change while a suspend | ||
| 676 | * request is active (see related code in 'proc' handlers). | ||
| 677 | */ | ||
| 678 | if (watchdog_running && !watchdog_suspended) | ||
| 679 | ret = watchdog_park_threads(); | ||
| 680 | |||
| 681 | if (ret == 0) | ||
| 682 | watchdog_suspended++; | ||
| 683 | else { | ||
| 684 | watchdog_disable_all_cpus(); | ||
| 685 | pr_err("Failed to suspend lockup detectors, disabled\n"); | ||
| 686 | watchdog_enabled = 0; | ||
| 687 | } | ||
| 688 | |||
| 689 | watchdog_nmi_reconfigure(); | ||
| 690 | |||
| 691 | mutex_unlock(&watchdog_proc_mutex); | ||
| 692 | |||
| 693 | return ret; | ||
| 694 | } | ||
| 695 | |||
| 696 | /* | ||
| 697 | * Resume the hard and soft lockup detector by unparking the watchdog threads. | ||
| 698 | */ | ||
| 699 | void lockup_detector_resume(void) | ||
| 700 | { | ||
| 701 | mutex_lock(&watchdog_proc_mutex); | ||
| 702 | |||
| 703 | watchdog_suspended--; | ||
| 704 | /* | ||
| 705 | * The watchdog threads are unparked if they were previously running | ||
| 706 | * and if there is no more active suspend request. | ||
| 707 | */ | ||
| 708 | if (watchdog_running && !watchdog_suspended) | ||
| 709 | watchdog_unpark_threads(); | ||
| 710 | |||
| 711 | watchdog_nmi_reconfigure(); | ||
| 712 | |||
| 713 | mutex_unlock(&watchdog_proc_mutex); | ||
| 714 | put_online_cpus(); | ||
| 715 | } | ||
| 716 | |||
| 717 | #ifdef CONFIG_SYSCTL | ||
| 608 | 718 | ||
| 609 | /* | 719 | /* |
| 610 | * Update the run state of the lockup detectors. | 720 | * Update the run state of the lockup detectors. |
| @@ -625,6 +735,8 @@ static int proc_watchdog_update(void) | |||
| 625 | else | 735 | else |
| 626 | watchdog_disable_all_cpus(); | 736 | watchdog_disable_all_cpus(); |
| 627 | 737 | ||
| 738 | watchdog_nmi_reconfigure(); | ||
| 739 | |||
| 628 | return err; | 740 | return err; |
| 629 | 741 | ||
| 630 | } | 742 | } |
| @@ -810,10 +922,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, | |||
| 810 | * a temporary cpumask, so we are likely not in a | 922 | * a temporary cpumask, so we are likely not in a |
| 811 | * position to do much else to make things better. | 923 | * position to do much else to make things better. |
| 812 | */ | 924 | */ |
| 813 | if (smpboot_update_cpumask_percpu_thread( | 925 | if (watchdog_update_cpus() != 0) |
| 814 | &watchdog_threads, &watchdog_cpumask) != 0) | ||
| 815 | pr_err("cpumask update failed\n"); | 926 | pr_err("cpumask update failed\n"); |
| 816 | } | 927 | } |
| 928 | |||
| 929 | watchdog_nmi_reconfigure(); | ||
| 817 | } | 930 | } |
| 818 | out: | 931 | out: |
| 819 | mutex_unlock(&watchdog_proc_mutex); | 932 | mutex_unlock(&watchdog_proc_mutex); |
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 54a427d1f344..295a0d84934c 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c | |||
| @@ -22,41 +22,9 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); | |||
| 22 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); | 22 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); |
| 23 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | 23 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); |
| 24 | 24 | ||
| 25 | /* boot commands */ | ||
| 26 | /* | ||
| 27 | * Should we panic when a soft-lockup or hard-lockup occurs: | ||
| 28 | */ | ||
| 29 | unsigned int __read_mostly hardlockup_panic = | ||
| 30 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | ||
| 31 | static unsigned long hardlockup_allcpu_dumped; | 25 | static unsigned long hardlockup_allcpu_dumped; |
| 32 | /* | ||
| 33 | * We may not want to enable hard lockup detection by default in all cases, | ||
| 34 | * for example when running the kernel as a guest on a hypervisor. In these | ||
| 35 | * cases this function can be called to disable hard lockup detection. This | ||
| 36 | * function should only be executed once by the boot processor before the | ||
| 37 | * kernel command line parameters are parsed, because otherwise it is not | ||
| 38 | * possible to override this in hardlockup_panic_setup(). | ||
| 39 | */ | ||
| 40 | void hardlockup_detector_disable(void) | ||
| 41 | { | ||
| 42 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | ||
| 43 | } | ||
| 44 | |||
| 45 | static int __init hardlockup_panic_setup(char *str) | ||
| 46 | { | ||
| 47 | if (!strncmp(str, "panic", 5)) | ||
| 48 | hardlockup_panic = 1; | ||
| 49 | else if (!strncmp(str, "nopanic", 7)) | ||
| 50 | hardlockup_panic = 0; | ||
| 51 | else if (!strncmp(str, "0", 1)) | ||
| 52 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | ||
| 53 | else if (!strncmp(str, "1", 1)) | ||
| 54 | watchdog_enabled |= NMI_WATCHDOG_ENABLED; | ||
| 55 | return 1; | ||
| 56 | } | ||
| 57 | __setup("nmi_watchdog=", hardlockup_panic_setup); | ||
| 58 | 26 | ||
| 59 | void touch_nmi_watchdog(void) | 27 | void arch_touch_nmi_watchdog(void) |
| 60 | { | 28 | { |
| 61 | /* | 29 | /* |
| 62 | * Using __raw here because some code paths have | 30 | * Using __raw here because some code paths have |
| @@ -66,9 +34,8 @@ void touch_nmi_watchdog(void) | |||
| 66 | * going off. | 34 | * going off. |
| 67 | */ | 35 | */ |
| 68 | raw_cpu_write(watchdog_nmi_touch, true); | 36 | raw_cpu_write(watchdog_nmi_touch, true); |
| 69 | touch_softlockup_watchdog(); | ||
| 70 | } | 37 | } |
| 71 | EXPORT_SYMBOL(touch_nmi_watchdog); | 38 | EXPORT_SYMBOL(arch_touch_nmi_watchdog); |
| 72 | 39 | ||
| 73 | static struct perf_event_attr wd_hw_attr = { | 40 | static struct perf_event_attr wd_hw_attr = { |
| 74 | .type = PERF_TYPE_HARDWARE, | 41 | .type = PERF_TYPE_HARDWARE, |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c74bf39ef764..a86688fabc55 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -2864,11 +2864,11 @@ bool flush_work(struct work_struct *work) | |||
| 2864 | EXPORT_SYMBOL_GPL(flush_work); | 2864 | EXPORT_SYMBOL_GPL(flush_work); |
| 2865 | 2865 | ||
| 2866 | struct cwt_wait { | 2866 | struct cwt_wait { |
| 2867 | wait_queue_t wait; | 2867 | wait_queue_entry_t wait; |
| 2868 | struct work_struct *work; | 2868 | struct work_struct *work; |
| 2869 | }; | 2869 | }; |
| 2870 | 2870 | ||
| 2871 | static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key) | 2871 | static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) |
| 2872 | { | 2872 | { |
| 2873 | struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); | 2873 | struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); |
| 2874 | 2874 | ||
