diff options
| author | Ingo Molnar <mingo@kernel.org> | 2017-07-30 05:15:13 -0400 |
|---|---|---|
| committer | Ingo Molnar <mingo@kernel.org> | 2017-07-30 05:15:13 -0400 |
| commit | f5db340f19f14a8df9dfd22d71fba1513e9f1f7e (patch) | |
| tree | 131d3345bc987aee3c922624de816492e7f323a4 /kernel | |
| parent | ee438ec8f33c5af0d4a4ffb935c5b9272e8c2680 (diff) | |
| parent | 38115f2f8cec8087d558c062e779c443a01f87d6 (diff) | |
Merge branch 'perf/urgent' into perf/core, to pick up latest fixes and refresh the tree
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
92 files changed, 4246 insertions, 1884 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 72aa080f91f0..4cb8e8b23c6e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
| @@ -82,7 +82,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o | |||
| 82 | obj-$(CONFIG_KGDB) += debug/ | 82 | obj-$(CONFIG_KGDB) += debug/ |
| 83 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o | 83 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o |
| 84 | obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o | 84 | obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o |
| 85 | obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o | 85 | obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o |
| 86 | obj-$(CONFIG_SECCOMP) += seccomp.o | 86 | obj-$(CONFIG_SECCOMP) += seccomp.o |
| 87 | obj-$(CONFIG_RELAY) += relay.o | 87 | obj-$(CONFIG_RELAY) += relay.o |
| 88 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o | 88 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o |
diff --git a/kernel/audit.c b/kernel/audit.c index 4b7d49868ce1..6dd556931739 100644 --- a/kernel/audit.c +++ b/kernel/audit.c | |||
| @@ -575,12 +575,16 @@ static void kauditd_retry_skb(struct sk_buff *skb) | |||
| 575 | 575 | ||
| 576 | /** | 576 | /** |
| 577 | * auditd_reset - Disconnect the auditd connection | 577 | * auditd_reset - Disconnect the auditd connection |
| 578 | * @ac: auditd connection state | ||
| 578 | * | 579 | * |
| 579 | * Description: | 580 | * Description: |
| 580 | * Break the auditd/kauditd connection and move all the queued records into the | 581 | * Break the auditd/kauditd connection and move all the queued records into the |
| 581 | * hold queue in case auditd reconnects. | 582 | * hold queue in case auditd reconnects. It is important to note that the @ac |
| 583 | * pointer should never be dereferenced inside this function as it may be NULL | ||
| 584 | * or invalid, you can only compare the memory address! If @ac is NULL then | ||
| 585 | * the connection will always be reset. | ||
| 582 | */ | 586 | */ |
| 583 | static void auditd_reset(void) | 587 | static void auditd_reset(const struct auditd_connection *ac) |
| 584 | { | 588 | { |
| 585 | unsigned long flags; | 589 | unsigned long flags; |
| 586 | struct sk_buff *skb; | 590 | struct sk_buff *skb; |
| @@ -590,17 +594,21 @@ static void auditd_reset(void) | |||
| 590 | spin_lock_irqsave(&auditd_conn_lock, flags); | 594 | spin_lock_irqsave(&auditd_conn_lock, flags); |
| 591 | ac_old = rcu_dereference_protected(auditd_conn, | 595 | ac_old = rcu_dereference_protected(auditd_conn, |
| 592 | lockdep_is_held(&auditd_conn_lock)); | 596 | lockdep_is_held(&auditd_conn_lock)); |
| 597 | if (ac && ac != ac_old) { | ||
| 598 | /* someone already registered a new auditd connection */ | ||
| 599 | spin_unlock_irqrestore(&auditd_conn_lock, flags); | ||
| 600 | return; | ||
| 601 | } | ||
| 593 | rcu_assign_pointer(auditd_conn, NULL); | 602 | rcu_assign_pointer(auditd_conn, NULL); |
| 594 | spin_unlock_irqrestore(&auditd_conn_lock, flags); | 603 | spin_unlock_irqrestore(&auditd_conn_lock, flags); |
| 595 | 604 | ||
| 596 | if (ac_old) | 605 | if (ac_old) |
| 597 | call_rcu(&ac_old->rcu, auditd_conn_free); | 606 | call_rcu(&ac_old->rcu, auditd_conn_free); |
| 598 | 607 | ||
| 599 | /* flush all of the main and retry queues to the hold queue */ | 608 | /* flush the retry queue to the hold queue, but don't touch the main |
| 609 | * queue since we need to process that normally for multicast */ | ||
| 600 | while ((skb = skb_dequeue(&audit_retry_queue))) | 610 | while ((skb = skb_dequeue(&audit_retry_queue))) |
| 601 | kauditd_hold_skb(skb); | 611 | kauditd_hold_skb(skb); |
| 602 | while ((skb = skb_dequeue(&audit_queue))) | ||
| 603 | kauditd_hold_skb(skb); | ||
| 604 | } | 612 | } |
| 605 | 613 | ||
| 606 | /** | 614 | /** |
| @@ -633,6 +641,7 @@ static int auditd_send_unicast_skb(struct sk_buff *skb) | |||
| 633 | ac = rcu_dereference(auditd_conn); | 641 | ac = rcu_dereference(auditd_conn); |
| 634 | if (!ac) { | 642 | if (!ac) { |
| 635 | rcu_read_unlock(); | 643 | rcu_read_unlock(); |
| 644 | kfree_skb(skb); | ||
| 636 | rc = -ECONNREFUSED; | 645 | rc = -ECONNREFUSED; |
| 637 | goto err; | 646 | goto err; |
| 638 | } | 647 | } |
| @@ -649,8 +658,8 @@ static int auditd_send_unicast_skb(struct sk_buff *skb) | |||
| 649 | return rc; | 658 | return rc; |
| 650 | 659 | ||
| 651 | err: | 660 | err: |
| 652 | if (rc == -ECONNREFUSED) | 661 | if (ac && rc == -ECONNREFUSED) |
| 653 | auditd_reset(); | 662 | auditd_reset(ac); |
| 654 | return rc; | 663 | return rc; |
| 655 | } | 664 | } |
| 656 | 665 | ||
| @@ -795,9 +804,9 @@ static int kauditd_thread(void *dummy) | |||
| 795 | rc = kauditd_send_queue(sk, portid, | 804 | rc = kauditd_send_queue(sk, portid, |
| 796 | &audit_hold_queue, UNICAST_RETRIES, | 805 | &audit_hold_queue, UNICAST_RETRIES, |
| 797 | NULL, kauditd_rehold_skb); | 806 | NULL, kauditd_rehold_skb); |
| 798 | if (rc < 0) { | 807 | if (ac && rc < 0) { |
| 799 | sk = NULL; | 808 | sk = NULL; |
| 800 | auditd_reset(); | 809 | auditd_reset(ac); |
| 801 | goto main_queue; | 810 | goto main_queue; |
| 802 | } | 811 | } |
| 803 | 812 | ||
| @@ -805,9 +814,9 @@ static int kauditd_thread(void *dummy) | |||
| 805 | rc = kauditd_send_queue(sk, portid, | 814 | rc = kauditd_send_queue(sk, portid, |
| 806 | &audit_retry_queue, UNICAST_RETRIES, | 815 | &audit_retry_queue, UNICAST_RETRIES, |
| 807 | NULL, kauditd_hold_skb); | 816 | NULL, kauditd_hold_skb); |
| 808 | if (rc < 0) { | 817 | if (ac && rc < 0) { |
| 809 | sk = NULL; | 818 | sk = NULL; |
| 810 | auditd_reset(); | 819 | auditd_reset(ac); |
| 811 | goto main_queue; | 820 | goto main_queue; |
| 812 | } | 821 | } |
| 813 | 822 | ||
| @@ -815,12 +824,13 @@ main_queue: | |||
| 815 | /* process the main queue - do the multicast send and attempt | 824 | /* process the main queue - do the multicast send and attempt |
| 816 | * unicast, dump failed record sends to the retry queue; if | 825 | * unicast, dump failed record sends to the retry queue; if |
| 817 | * sk == NULL due to previous failures we will just do the | 826 | * sk == NULL due to previous failures we will just do the |
| 818 | * multicast send and move the record to the retry queue */ | 827 | * multicast send and move the record to the hold queue */ |
| 819 | rc = kauditd_send_queue(sk, portid, &audit_queue, 1, | 828 | rc = kauditd_send_queue(sk, portid, &audit_queue, 1, |
| 820 | kauditd_send_multicast_skb, | 829 | kauditd_send_multicast_skb, |
| 821 | kauditd_retry_skb); | 830 | (sk ? |
| 822 | if (sk == NULL || rc < 0) | 831 | kauditd_retry_skb : kauditd_hold_skb)); |
| 823 | auditd_reset(); | 832 | if (ac && rc < 0) |
| 833 | auditd_reset(ac); | ||
| 824 | sk = NULL; | 834 | sk = NULL; |
| 825 | 835 | ||
| 826 | /* drop our netns reference, no auditd sends past this line */ | 836 | /* drop our netns reference, no auditd sends past this line */ |
| @@ -1230,7 +1240,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 1230 | auditd_pid, 1); | 1240 | auditd_pid, 1); |
| 1231 | 1241 | ||
| 1232 | /* unregister the auditd connection */ | 1242 | /* unregister the auditd connection */ |
| 1233 | auditd_reset(); | 1243 | auditd_reset(NULL); |
| 1234 | } | 1244 | } |
| 1235 | } | 1245 | } |
| 1236 | if (s.mask & AUDIT_STATUS_RATE_LIMIT) { | 1246 | if (s.mask & AUDIT_STATUS_RATE_LIMIT) { |
| @@ -1999,22 +2009,10 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) | |||
| 1999 | 2009 | ||
| 2000 | static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) | 2010 | static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) |
| 2001 | { | 2011 | { |
| 2002 | kernel_cap_t *perm = &name->fcap.permitted; | 2012 | audit_log_cap(ab, "cap_fp", &name->fcap.permitted); |
| 2003 | kernel_cap_t *inh = &name->fcap.inheritable; | 2013 | audit_log_cap(ab, "cap_fi", &name->fcap.inheritable); |
| 2004 | int log = 0; | 2014 | audit_log_format(ab, " cap_fe=%d cap_fver=%x", |
| 2005 | 2015 | name->fcap.fE, name->fcap_ver); | |
| 2006 | if (!cap_isclear(*perm)) { | ||
| 2007 | audit_log_cap(ab, "cap_fp", perm); | ||
| 2008 | log = 1; | ||
| 2009 | } | ||
| 2010 | if (!cap_isclear(*inh)) { | ||
| 2011 | audit_log_cap(ab, "cap_fi", inh); | ||
| 2012 | log = 1; | ||
| 2013 | } | ||
| 2014 | |||
| 2015 | if (log) | ||
| 2016 | audit_log_format(ab, " cap_fe=%d cap_fver=%x", | ||
| 2017 | name->fcap.fE, name->fcap_ver); | ||
| 2018 | } | 2016 | } |
| 2019 | 2017 | ||
| 2020 | static inline int audit_copy_fcaps(struct audit_names *name, | 2018 | static inline int audit_copy_fcaps(struct audit_names *name, |
diff --git a/kernel/audit.h b/kernel/audit.h index ddfce2ea4891..b331d9b83f63 100644 --- a/kernel/audit.h +++ b/kernel/audit.h | |||
| @@ -68,6 +68,7 @@ struct audit_cap_data { | |||
| 68 | unsigned int fE; /* effective bit of file cap */ | 68 | unsigned int fE; /* effective bit of file cap */ |
| 69 | kernel_cap_t effective; /* effective set of process */ | 69 | kernel_cap_t effective; /* effective set of process */ |
| 70 | }; | 70 | }; |
| 71 | kernel_cap_t ambient; | ||
| 71 | }; | 72 | }; |
| 72 | 73 | ||
| 73 | /* When fs/namei.c:getname() is called, we store the pointer in name and bump | 74 | /* When fs/namei.c:getname() is called, we store the pointer in name and bump |
| @@ -247,13 +248,13 @@ struct audit_netlink_list { | |||
| 247 | struct sk_buff_head q; | 248 | struct sk_buff_head q; |
| 248 | }; | 249 | }; |
| 249 | 250 | ||
| 250 | int audit_send_list(void *); | 251 | int audit_send_list(void *_dest); |
| 251 | 252 | ||
| 252 | extern int selinux_audit_rule_update(void); | 253 | extern int selinux_audit_rule_update(void); |
| 253 | 254 | ||
| 254 | extern struct mutex audit_filter_mutex; | 255 | extern struct mutex audit_filter_mutex; |
| 255 | extern int audit_del_rule(struct audit_entry *); | 256 | extern int audit_del_rule(struct audit_entry *entry); |
| 256 | extern void audit_free_rule_rcu(struct rcu_head *); | 257 | extern void audit_free_rule_rcu(struct rcu_head *head); |
| 257 | extern struct list_head audit_filter_list[]; | 258 | extern struct list_head audit_filter_list[]; |
| 258 | 259 | ||
| 259 | extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); | 260 | extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); |
| @@ -301,17 +302,17 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark | |||
| 301 | #endif /* CONFIG_AUDIT_WATCH */ | 302 | #endif /* CONFIG_AUDIT_WATCH */ |
| 302 | 303 | ||
| 303 | #ifdef CONFIG_AUDIT_TREE | 304 | #ifdef CONFIG_AUDIT_TREE |
| 304 | extern struct audit_chunk *audit_tree_lookup(const struct inode *); | 305 | extern struct audit_chunk *audit_tree_lookup(const struct inode *inode); |
| 305 | extern void audit_put_chunk(struct audit_chunk *); | 306 | extern void audit_put_chunk(struct audit_chunk *chunk); |
| 306 | extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *); | 307 | extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree); |
| 307 | extern int audit_make_tree(struct audit_krule *, char *, u32); | 308 | extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op); |
| 308 | extern int audit_add_tree_rule(struct audit_krule *); | 309 | extern int audit_add_tree_rule(struct audit_krule *rule); |
| 309 | extern int audit_remove_tree_rule(struct audit_krule *); | 310 | extern int audit_remove_tree_rule(struct audit_krule *rule); |
| 310 | extern void audit_trim_trees(void); | 311 | extern void audit_trim_trees(void); |
| 311 | extern int audit_tag_tree(char *old, char *new); | 312 | extern int audit_tag_tree(char *old, char *new); |
| 312 | extern const char *audit_tree_path(struct audit_tree *); | 313 | extern const char *audit_tree_path(struct audit_tree *tree); |
| 313 | extern void audit_put_tree(struct audit_tree *); | 314 | extern void audit_put_tree(struct audit_tree *tree); |
| 314 | extern void audit_kill_trees(struct list_head *); | 315 | extern void audit_kill_trees(struct list_head *list); |
| 315 | #else | 316 | #else |
| 316 | #define audit_remove_tree_rule(rule) BUG() | 317 | #define audit_remove_tree_rule(rule) BUG() |
| 317 | #define audit_add_tree_rule(rule) -EINVAL | 318 | #define audit_add_tree_rule(rule) -EINVAL |
| @@ -323,7 +324,7 @@ extern void audit_kill_trees(struct list_head *); | |||
| 323 | #define audit_kill_trees(list) BUG() | 324 | #define audit_kill_trees(list) BUG() |
| 324 | #endif | 325 | #endif |
| 325 | 326 | ||
| 326 | extern char *audit_unpack_string(void **, size_t *, size_t); | 327 | extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); |
| 327 | 328 | ||
| 328 | extern pid_t audit_sig_pid; | 329 | extern pid_t audit_sig_pid; |
| 329 | extern kuid_t audit_sig_uid; | 330 | extern kuid_t audit_sig_uid; |
| @@ -333,7 +334,7 @@ extern int audit_filter(int msgtype, unsigned int listtype); | |||
| 333 | 334 | ||
| 334 | #ifdef CONFIG_AUDITSYSCALL | 335 | #ifdef CONFIG_AUDITSYSCALL |
| 335 | extern int audit_signal_info(int sig, struct task_struct *t); | 336 | extern int audit_signal_info(int sig, struct task_struct *t); |
| 336 | extern void audit_filter_inodes(struct task_struct *, struct audit_context *); | 337 | extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); |
| 337 | extern struct list_head *audit_killed_trees(void); | 338 | extern struct list_head *audit_killed_trees(void); |
| 338 | #else | 339 | #else |
| 339 | #define audit_signal_info(s,t) AUDIT_DISABLED | 340 | #define audit_signal_info(s,t) AUDIT_DISABLED |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index bb724baa7ac9..3260ba2312a9 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
| @@ -1261,6 +1261,7 @@ static void show_special(struct audit_context *context, int *call_panic) | |||
| 1261 | audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); | 1261 | audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); |
| 1262 | audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); | 1262 | audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); |
| 1263 | audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); | 1263 | audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); |
| 1264 | audit_log_cap(ab, "cap_pa", &context->capset.cap.ambient); | ||
| 1264 | break; | 1265 | break; |
| 1265 | case AUDIT_MMAP: | 1266 | case AUDIT_MMAP: |
| 1266 | audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, | 1267 | audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, |
| @@ -1382,9 +1383,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
| 1382 | audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted); | 1383 | audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted); |
| 1383 | audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable); | 1384 | audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable); |
| 1384 | audit_log_cap(ab, "old_pe", &axs->old_pcap.effective); | 1385 | audit_log_cap(ab, "old_pe", &axs->old_pcap.effective); |
| 1385 | audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted); | 1386 | audit_log_cap(ab, "old_pa", &axs->old_pcap.ambient); |
| 1386 | audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable); | 1387 | audit_log_cap(ab, "pp", &axs->new_pcap.permitted); |
| 1387 | audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); | 1388 | audit_log_cap(ab, "pi", &axs->new_pcap.inheritable); |
| 1389 | audit_log_cap(ab, "pe", &axs->new_pcap.effective); | ||
| 1390 | audit_log_cap(ab, "pa", &axs->new_pcap.ambient); | ||
| 1388 | break; } | 1391 | break; } |
| 1389 | 1392 | ||
| 1390 | } | 1393 | } |
| @@ -2342,10 +2345,12 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, | |||
| 2342 | ax->old_pcap.permitted = old->cap_permitted; | 2345 | ax->old_pcap.permitted = old->cap_permitted; |
| 2343 | ax->old_pcap.inheritable = old->cap_inheritable; | 2346 | ax->old_pcap.inheritable = old->cap_inheritable; |
| 2344 | ax->old_pcap.effective = old->cap_effective; | 2347 | ax->old_pcap.effective = old->cap_effective; |
| 2348 | ax->old_pcap.ambient = old->cap_ambient; | ||
| 2345 | 2349 | ||
| 2346 | ax->new_pcap.permitted = new->cap_permitted; | 2350 | ax->new_pcap.permitted = new->cap_permitted; |
| 2347 | ax->new_pcap.inheritable = new->cap_inheritable; | 2351 | ax->new_pcap.inheritable = new->cap_inheritable; |
| 2348 | ax->new_pcap.effective = new->cap_effective; | 2352 | ax->new_pcap.effective = new->cap_effective; |
| 2353 | ax->new_pcap.ambient = new->cap_ambient; | ||
| 2349 | return 0; | 2354 | return 0; |
| 2350 | } | 2355 | } |
| 2351 | 2356 | ||
| @@ -2364,6 +2369,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old) | |||
| 2364 | context->capset.cap.effective = new->cap_effective; | 2369 | context->capset.cap.effective = new->cap_effective; |
| 2365 | context->capset.cap.inheritable = new->cap_effective; | 2370 | context->capset.cap.inheritable = new->cap_effective; |
| 2366 | context->capset.cap.permitted = new->cap_permitted; | 2371 | context->capset.cap.permitted = new->cap_permitted; |
| 2372 | context->capset.cap.ambient = new->cap_ambient; | ||
| 2367 | context->type = AUDIT_CAPSET; | 2373 | context->type = AUDIT_CAPSET; |
| 2368 | } | 2374 | } |
| 2369 | 2375 | ||
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 172dc8ee0e3b..d771a3872500 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c | |||
| @@ -335,6 +335,26 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) | |||
| 335 | } | 335 | } |
| 336 | 336 | ||
| 337 | /* only called from syscall */ | 337 | /* only called from syscall */ |
| 338 | int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) | ||
| 339 | { | ||
| 340 | void **elem, *ptr; | ||
| 341 | int ret = 0; | ||
| 342 | |||
| 343 | if (!map->ops->map_fd_sys_lookup_elem) | ||
| 344 | return -ENOTSUPP; | ||
| 345 | |||
| 346 | rcu_read_lock(); | ||
| 347 | elem = array_map_lookup_elem(map, key); | ||
| 348 | if (elem && (ptr = READ_ONCE(*elem))) | ||
| 349 | *value = map->ops->map_fd_sys_lookup_elem(ptr); | ||
| 350 | else | ||
| 351 | ret = -ENOENT; | ||
| 352 | rcu_read_unlock(); | ||
| 353 | |||
| 354 | return ret; | ||
| 355 | } | ||
| 356 | |||
| 357 | /* only called from syscall */ | ||
| 338 | int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, | 358 | int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, |
| 339 | void *key, void *value, u64 map_flags) | 359 | void *key, void *value, u64 map_flags) |
| 340 | { | 360 | { |
| @@ -400,6 +420,11 @@ static void prog_fd_array_put_ptr(void *ptr) | |||
| 400 | bpf_prog_put(ptr); | 420 | bpf_prog_put(ptr); |
| 401 | } | 421 | } |
| 402 | 422 | ||
| 423 | static u32 prog_fd_array_sys_lookup_elem(void *ptr) | ||
| 424 | { | ||
| 425 | return ((struct bpf_prog *)ptr)->aux->id; | ||
| 426 | } | ||
| 427 | |||
| 403 | /* decrement refcnt of all bpf_progs that are stored in this map */ | 428 | /* decrement refcnt of all bpf_progs that are stored in this map */ |
| 404 | void bpf_fd_array_map_clear(struct bpf_map *map) | 429 | void bpf_fd_array_map_clear(struct bpf_map *map) |
| 405 | { | 430 | { |
| @@ -418,6 +443,7 @@ const struct bpf_map_ops prog_array_map_ops = { | |||
| 418 | .map_delete_elem = fd_array_map_delete_elem, | 443 | .map_delete_elem = fd_array_map_delete_elem, |
| 419 | .map_fd_get_ptr = prog_fd_array_get_ptr, | 444 | .map_fd_get_ptr = prog_fd_array_get_ptr, |
| 420 | .map_fd_put_ptr = prog_fd_array_put_ptr, | 445 | .map_fd_put_ptr = prog_fd_array_put_ptr, |
| 446 | .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, | ||
| 421 | }; | 447 | }; |
| 422 | 448 | ||
| 423 | static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, | 449 | static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, |
| @@ -452,38 +478,24 @@ static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) | |||
| 452 | static void *perf_event_fd_array_get_ptr(struct bpf_map *map, | 478 | static void *perf_event_fd_array_get_ptr(struct bpf_map *map, |
| 453 | struct file *map_file, int fd) | 479 | struct file *map_file, int fd) |
| 454 | { | 480 | { |
| 455 | const struct perf_event_attr *attr; | ||
| 456 | struct bpf_event_entry *ee; | 481 | struct bpf_event_entry *ee; |
| 457 | struct perf_event *event; | 482 | struct perf_event *event; |
| 458 | struct file *perf_file; | 483 | struct file *perf_file; |
| 484 | u64 value; | ||
| 459 | 485 | ||
| 460 | perf_file = perf_event_get(fd); | 486 | perf_file = perf_event_get(fd); |
| 461 | if (IS_ERR(perf_file)) | 487 | if (IS_ERR(perf_file)) |
| 462 | return perf_file; | 488 | return perf_file; |
| 463 | 489 | ||
| 490 | ee = ERR_PTR(-EOPNOTSUPP); | ||
| 464 | event = perf_file->private_data; | 491 | event = perf_file->private_data; |
| 465 | ee = ERR_PTR(-EINVAL); | 492 | if (perf_event_read_local(event, &value) == -EOPNOTSUPP) |
| 466 | |||
| 467 | attr = perf_event_attrs(event); | ||
| 468 | if (IS_ERR(attr) || attr->inherit) | ||
| 469 | goto err_out; | 493 | goto err_out; |
| 470 | 494 | ||
| 471 | switch (attr->type) { | 495 | ee = bpf_event_entry_gen(perf_file, map_file); |
| 472 | case PERF_TYPE_SOFTWARE: | 496 | if (ee) |
| 473 | if (attr->config != PERF_COUNT_SW_BPF_OUTPUT) | 497 | return ee; |
| 474 | goto err_out; | 498 | ee = ERR_PTR(-ENOMEM); |
| 475 | /* fall-through */ | ||
| 476 | case PERF_TYPE_RAW: | ||
| 477 | case PERF_TYPE_HARDWARE: | ||
| 478 | ee = bpf_event_entry_gen(perf_file, map_file); | ||
| 479 | if (ee) | ||
| 480 | return ee; | ||
| 481 | ee = ERR_PTR(-ENOMEM); | ||
| 482 | /* fall-through */ | ||
| 483 | default: | ||
| 484 | break; | ||
| 485 | } | ||
| 486 | |||
| 487 | err_out: | 499 | err_out: |
| 488 | fput(perf_file); | 500 | fput(perf_file); |
| 489 | return ee; | 501 | return ee; |
| @@ -599,4 +611,5 @@ const struct bpf_map_ops array_of_maps_map_ops = { | |||
| 599 | .map_delete_elem = fd_array_map_delete_elem, | 611 | .map_delete_elem = fd_array_map_delete_elem, |
| 600 | .map_fd_get_ptr = bpf_map_fd_get_ptr, | 612 | .map_fd_get_ptr = bpf_map_fd_get_ptr, |
| 601 | .map_fd_put_ptr = bpf_map_fd_put_ptr, | 613 | .map_fd_put_ptr = bpf_map_fd_put_ptr, |
| 614 | .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, | ||
| 602 | }; | 615 | }; |
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ea6033cba947..546113430049 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c | |||
| @@ -236,3 +236,40 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, | |||
| 236 | return ret; | 236 | return ret; |
| 237 | } | 237 | } |
| 238 | EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); | 238 | EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); |
| 239 | |||
| 240 | /** | ||
| 241 | * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock | ||
| 242 | * @sk: socket to get cgroup from | ||
| 243 | * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains | ||
| 244 | * sk with connection information (IP addresses, etc.) May not contain | ||
| 245 | * cgroup info if it is a req sock. | ||
| 246 | * @type: The type of program to be exectuted | ||
| 247 | * | ||
| 248 | * socket passed is expected to be of type INET or INET6. | ||
| 249 | * | ||
| 250 | * The program type passed in via @type must be suitable for sock_ops | ||
| 251 | * filtering. No further check is performed to assert that. | ||
| 252 | * | ||
| 253 | * This function will return %-EPERM if any if an attached program was found | ||
| 254 | * and if it returned != 1 during execution. In all other cases, 0 is returned. | ||
| 255 | */ | ||
| 256 | int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, | ||
| 257 | struct bpf_sock_ops_kern *sock_ops, | ||
| 258 | enum bpf_attach_type type) | ||
| 259 | { | ||
| 260 | struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); | ||
| 261 | struct bpf_prog *prog; | ||
| 262 | int ret = 0; | ||
| 263 | |||
| 264 | |||
| 265 | rcu_read_lock(); | ||
| 266 | |||
| 267 | prog = rcu_dereference(cgrp->bpf.effective[type]); | ||
| 268 | if (prog) | ||
| 269 | ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM; | ||
| 270 | |||
| 271 | rcu_read_unlock(); | ||
| 272 | |||
| 273 | return ret; | ||
| 274 | } | ||
| 275 | EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); | ||
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index dedf367f59bb..ad5f55922a13 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c | |||
| @@ -763,10 +763,10 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); | |||
| 763 | * | 763 | * |
| 764 | * Decode and execute eBPF instructions. | 764 | * Decode and execute eBPF instructions. |
| 765 | */ | 765 | */ |
| 766 | static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) | 766 | static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, |
| 767 | u64 *stack) | ||
| 767 | { | 768 | { |
| 768 | u64 stack[MAX_BPF_STACK / sizeof(u64)]; | 769 | u64 tmp; |
| 769 | u64 regs[MAX_BPF_REG], tmp; | ||
| 770 | static const void *jumptable[256] = { | 770 | static const void *jumptable[256] = { |
| 771 | [0 ... 255] = &&default_label, | 771 | [0 ... 255] = &&default_label, |
| 772 | /* Now overwrite non-defaults ... */ | 772 | /* Now overwrite non-defaults ... */ |
| @@ -824,7 +824,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) | |||
| 824 | [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, | 824 | [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, |
| 825 | /* Call instruction */ | 825 | /* Call instruction */ |
| 826 | [BPF_JMP | BPF_CALL] = &&JMP_CALL, | 826 | [BPF_JMP | BPF_CALL] = &&JMP_CALL, |
| 827 | [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL, | 827 | [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, |
| 828 | /* Jumps */ | 828 | /* Jumps */ |
| 829 | [BPF_JMP | BPF_JA] = &&JMP_JA, | 829 | [BPF_JMP | BPF_JA] = &&JMP_JA, |
| 830 | [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, | 830 | [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, |
| @@ -874,9 +874,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) | |||
| 874 | #define CONT ({ insn++; goto select_insn; }) | 874 | #define CONT ({ insn++; goto select_insn; }) |
| 875 | #define CONT_JMP ({ insn++; goto select_insn; }) | 875 | #define CONT_JMP ({ insn++; goto select_insn; }) |
| 876 | 876 | ||
| 877 | FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; | ||
| 878 | ARG1 = (u64) (unsigned long) ctx; | ||
| 879 | |||
| 880 | select_insn: | 877 | select_insn: |
| 881 | goto *jumptable[insn->code]; | 878 | goto *jumptable[insn->code]; |
| 882 | 879 | ||
| @@ -1219,7 +1216,39 @@ load_byte: | |||
| 1219 | WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); | 1216 | WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); |
| 1220 | return 0; | 1217 | return 0; |
| 1221 | } | 1218 | } |
| 1222 | STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */ | 1219 | STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ |
| 1220 | |||
| 1221 | #define PROG_NAME(stack_size) __bpf_prog_run##stack_size | ||
| 1222 | #define DEFINE_BPF_PROG_RUN(stack_size) \ | ||
| 1223 | static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ | ||
| 1224 | { \ | ||
| 1225 | u64 stack[stack_size / sizeof(u64)]; \ | ||
| 1226 | u64 regs[MAX_BPF_REG]; \ | ||
| 1227 | \ | ||
| 1228 | FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ | ||
| 1229 | ARG1 = (u64) (unsigned long) ctx; \ | ||
| 1230 | return ___bpf_prog_run(regs, insn, stack); \ | ||
| 1231 | } | ||
| 1232 | |||
| 1233 | #define EVAL1(FN, X) FN(X) | ||
| 1234 | #define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) | ||
| 1235 | #define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y) | ||
| 1236 | #define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y) | ||
| 1237 | #define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y) | ||
| 1238 | #define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y) | ||
| 1239 | |||
| 1240 | EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); | ||
| 1241 | EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); | ||
| 1242 | EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); | ||
| 1243 | |||
| 1244 | #define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), | ||
| 1245 | |||
| 1246 | static unsigned int (*interpreters[])(const void *ctx, | ||
| 1247 | const struct bpf_insn *insn) = { | ||
| 1248 | EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) | ||
| 1249 | EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) | ||
| 1250 | EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) | ||
| 1251 | }; | ||
| 1223 | 1252 | ||
| 1224 | bool bpf_prog_array_compatible(struct bpf_array *array, | 1253 | bool bpf_prog_array_compatible(struct bpf_array *array, |
| 1225 | const struct bpf_prog *fp) | 1254 | const struct bpf_prog *fp) |
| @@ -1268,7 +1297,9 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) | |||
| 1268 | */ | 1297 | */ |
| 1269 | struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) | 1298 | struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) |
| 1270 | { | 1299 | { |
| 1271 | fp->bpf_func = (void *) __bpf_prog_run; | 1300 | u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); |
| 1301 | |||
| 1302 | fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; | ||
| 1272 | 1303 | ||
| 1273 | /* eBPF JITs can rewrite the program in case constant | 1304 | /* eBPF JITs can rewrite the program in case constant |
| 1274 | * blinding is active. However, in case of error during | 1305 | * blinding is active. However, in case of error during |
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 004334ea13ba..4fb463172aa8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c | |||
| @@ -1244,6 +1244,26 @@ static void fd_htab_map_free(struct bpf_map *map) | |||
| 1244 | } | 1244 | } |
| 1245 | 1245 | ||
| 1246 | /* only called from syscall */ | 1246 | /* only called from syscall */ |
| 1247 | int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) | ||
| 1248 | { | ||
| 1249 | void **ptr; | ||
| 1250 | int ret = 0; | ||
| 1251 | |||
| 1252 | if (!map->ops->map_fd_sys_lookup_elem) | ||
| 1253 | return -ENOTSUPP; | ||
| 1254 | |||
| 1255 | rcu_read_lock(); | ||
| 1256 | ptr = htab_map_lookup_elem(map, key); | ||
| 1257 | if (ptr) | ||
| 1258 | *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); | ||
| 1259 | else | ||
| 1260 | ret = -ENOENT; | ||
| 1261 | rcu_read_unlock(); | ||
| 1262 | |||
| 1263 | return ret; | ||
| 1264 | } | ||
| 1265 | |||
| 1266 | /* only called from syscall */ | ||
| 1247 | int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, | 1267 | int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, |
| 1248 | void *key, void *value, u64 map_flags) | 1268 | void *key, void *value, u64 map_flags) |
| 1249 | { | 1269 | { |
| @@ -1305,4 +1325,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = { | |||
| 1305 | .map_delete_elem = htab_map_delete_elem, | 1325 | .map_delete_elem = htab_map_delete_elem, |
| 1306 | .map_fd_get_ptr = bpf_map_fd_get_ptr, | 1326 | .map_fd_get_ptr = bpf_map_fd_get_ptr, |
| 1307 | .map_fd_put_ptr = bpf_map_fd_put_ptr, | 1327 | .map_fd_put_ptr = bpf_map_fd_put_ptr, |
| 1328 | .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, | ||
| 1308 | }; | 1329 | }; |
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9bbd33497d3d..e833ed914358 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c | |||
| @@ -377,10 +377,22 @@ static void bpf_evict_inode(struct inode *inode) | |||
| 377 | bpf_any_put(inode->i_private, type); | 377 | bpf_any_put(inode->i_private, type); |
| 378 | } | 378 | } |
| 379 | 379 | ||
| 380 | /* | ||
| 381 | * Display the mount options in /proc/mounts. | ||
| 382 | */ | ||
| 383 | static int bpf_show_options(struct seq_file *m, struct dentry *root) | ||
| 384 | { | ||
| 385 | umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; | ||
| 386 | |||
| 387 | if (mode != S_IRWXUGO) | ||
| 388 | seq_printf(m, ",mode=%o", mode); | ||
| 389 | return 0; | ||
| 390 | } | ||
| 391 | |||
| 380 | static const struct super_operations bpf_super_ops = { | 392 | static const struct super_operations bpf_super_ops = { |
| 381 | .statfs = simple_statfs, | 393 | .statfs = simple_statfs, |
| 382 | .drop_inode = generic_delete_inode, | 394 | .drop_inode = generic_delete_inode, |
| 383 | .show_options = generic_show_options, | 395 | .show_options = bpf_show_options, |
| 384 | .evict_inode = bpf_evict_inode, | 396 | .evict_inode = bpf_evict_inode, |
| 385 | }; | 397 | }; |
| 386 | 398 | ||
| @@ -434,8 +446,6 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) | |||
| 434 | struct inode *inode; | 446 | struct inode *inode; |
| 435 | int ret; | 447 | int ret; |
| 436 | 448 | ||
| 437 | save_mount_options(sb, data); | ||
| 438 | |||
| 439 | ret = bpf_parse_options(data, &opts); | 449 | ret = bpf_parse_options(data, &opts); |
| 440 | if (ret) | 450 | if (ret) |
| 441 | return ret; | 451 | return ret; |
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 59bcdf821ae4..1da574612bea 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c | |||
| @@ -95,3 +95,8 @@ void bpf_map_fd_put_ptr(void *ptr) | |||
| 95 | */ | 95 | */ |
| 96 | bpf_map_put(ptr); | 96 | bpf_map_put(ptr); |
| 97 | } | 97 | } |
| 98 | |||
| 99 | u32 bpf_map_fd_sys_lookup_elem(void *ptr) | ||
| 100 | { | ||
| 101 | return ((struct bpf_map *)ptr)->id; | ||
| 102 | } | ||
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index 177fadb689dc..6183db9ec08c 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h | |||
| @@ -19,5 +19,6 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, | |||
| 19 | void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, | 19 | void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, |
| 20 | int ufd); | 20 | int ufd); |
| 21 | void bpf_map_fd_put_ptr(void *ptr); | 21 | void bpf_map_fd_put_ptr(void *ptr); |
| 22 | u32 bpf_map_fd_sys_lookup_elem(void *ptr); | ||
| 22 | 23 | ||
| 23 | #endif | 24 | #endif |
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 265a0d854e33..045646da97cc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c | |||
| @@ -22,8 +22,20 @@ | |||
| 22 | #include <linux/filter.h> | 22 | #include <linux/filter.h> |
| 23 | #include <linux/version.h> | 23 | #include <linux/version.h> |
| 24 | #include <linux/kernel.h> | 24 | #include <linux/kernel.h> |
| 25 | #include <linux/idr.h> | ||
| 26 | |||
| 27 | #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ | ||
| 28 | (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ | ||
| 29 | (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ | ||
| 30 | (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) | ||
| 31 | #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) | ||
| 32 | #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map)) | ||
| 25 | 33 | ||
| 26 | DEFINE_PER_CPU(int, bpf_prog_active); | 34 | DEFINE_PER_CPU(int, bpf_prog_active); |
| 35 | static DEFINE_IDR(prog_idr); | ||
| 36 | static DEFINE_SPINLOCK(prog_idr_lock); | ||
| 37 | static DEFINE_IDR(map_idr); | ||
| 38 | static DEFINE_SPINLOCK(map_idr_lock); | ||
| 27 | 39 | ||
| 28 | int sysctl_unprivileged_bpf_disabled __read_mostly; | 40 | int sysctl_unprivileged_bpf_disabled __read_mostly; |
| 29 | 41 | ||
| @@ -114,6 +126,37 @@ static void bpf_map_uncharge_memlock(struct bpf_map *map) | |||
| 114 | free_uid(user); | 126 | free_uid(user); |
| 115 | } | 127 | } |
| 116 | 128 | ||
| 129 | static int bpf_map_alloc_id(struct bpf_map *map) | ||
| 130 | { | ||
| 131 | int id; | ||
| 132 | |||
| 133 | spin_lock_bh(&map_idr_lock); | ||
| 134 | id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); | ||
| 135 | if (id > 0) | ||
| 136 | map->id = id; | ||
| 137 | spin_unlock_bh(&map_idr_lock); | ||
| 138 | |||
| 139 | if (WARN_ON_ONCE(!id)) | ||
| 140 | return -ENOSPC; | ||
| 141 | |||
| 142 | return id > 0 ? 0 : id; | ||
| 143 | } | ||
| 144 | |||
| 145 | static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) | ||
| 146 | { | ||
| 147 | if (do_idr_lock) | ||
| 148 | spin_lock_bh(&map_idr_lock); | ||
| 149 | else | ||
| 150 | __acquire(&map_idr_lock); | ||
| 151 | |||
| 152 | idr_remove(&map_idr, map->id); | ||
| 153 | |||
| 154 | if (do_idr_lock) | ||
| 155 | spin_unlock_bh(&map_idr_lock); | ||
| 156 | else | ||
| 157 | __release(&map_idr_lock); | ||
| 158 | } | ||
| 159 | |||
| 117 | /* called from workqueue */ | 160 | /* called from workqueue */ |
| 118 | static void bpf_map_free_deferred(struct work_struct *work) | 161 | static void bpf_map_free_deferred(struct work_struct *work) |
| 119 | { | 162 | { |
| @@ -135,14 +178,21 @@ static void bpf_map_put_uref(struct bpf_map *map) | |||
| 135 | /* decrement map refcnt and schedule it for freeing via workqueue | 178 | /* decrement map refcnt and schedule it for freeing via workqueue |
| 136 | * (unrelying map implementation ops->map_free() might sleep) | 179 | * (unrelying map implementation ops->map_free() might sleep) |
| 137 | */ | 180 | */ |
| 138 | void bpf_map_put(struct bpf_map *map) | 181 | static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) |
| 139 | { | 182 | { |
| 140 | if (atomic_dec_and_test(&map->refcnt)) { | 183 | if (atomic_dec_and_test(&map->refcnt)) { |
| 184 | /* bpf_map_free_id() must be called first */ | ||
| 185 | bpf_map_free_id(map, do_idr_lock); | ||
| 141 | INIT_WORK(&map->work, bpf_map_free_deferred); | 186 | INIT_WORK(&map->work, bpf_map_free_deferred); |
| 142 | schedule_work(&map->work); | 187 | schedule_work(&map->work); |
| 143 | } | 188 | } |
| 144 | } | 189 | } |
| 145 | 190 | ||
| 191 | void bpf_map_put(struct bpf_map *map) | ||
| 192 | { | ||
| 193 | __bpf_map_put(map, true); | ||
| 194 | } | ||
| 195 | |||
| 146 | void bpf_map_put_with_uref(struct bpf_map *map) | 196 | void bpf_map_put_with_uref(struct bpf_map *map) |
| 147 | { | 197 | { |
| 148 | bpf_map_put_uref(map); | 198 | bpf_map_put_uref(map); |
| @@ -166,10 +216,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) | |||
| 166 | const struct bpf_map *map = filp->private_data; | 216 | const struct bpf_map *map = filp->private_data; |
| 167 | const struct bpf_array *array; | 217 | const struct bpf_array *array; |
| 168 | u32 owner_prog_type = 0; | 218 | u32 owner_prog_type = 0; |
| 219 | u32 owner_jited = 0; | ||
| 169 | 220 | ||
| 170 | if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { | 221 | if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { |
| 171 | array = container_of(map, struct bpf_array, map); | 222 | array = container_of(map, struct bpf_array, map); |
| 172 | owner_prog_type = array->owner_prog_type; | 223 | owner_prog_type = array->owner_prog_type; |
| 224 | owner_jited = array->owner_jited; | ||
| 173 | } | 225 | } |
| 174 | 226 | ||
| 175 | seq_printf(m, | 227 | seq_printf(m, |
| @@ -186,9 +238,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) | |||
| 186 | map->map_flags, | 238 | map->map_flags, |
| 187 | map->pages * 1ULL << PAGE_SHIFT); | 239 | map->pages * 1ULL << PAGE_SHIFT); |
| 188 | 240 | ||
| 189 | if (owner_prog_type) | 241 | if (owner_prog_type) { |
| 190 | seq_printf(m, "owner_prog_type:\t%u\n", | 242 | seq_printf(m, "owner_prog_type:\t%u\n", |
| 191 | owner_prog_type); | 243 | owner_prog_type); |
| 244 | seq_printf(m, "owner_jited:\t%u\n", | ||
| 245 | owner_jited); | ||
| 246 | } | ||
| 192 | } | 247 | } |
| 193 | #endif | 248 | #endif |
| 194 | 249 | ||
| @@ -236,11 +291,22 @@ static int map_create(union bpf_attr *attr) | |||
| 236 | if (err) | 291 | if (err) |
| 237 | goto free_map_nouncharge; | 292 | goto free_map_nouncharge; |
| 238 | 293 | ||
| 239 | err = bpf_map_new_fd(map); | 294 | err = bpf_map_alloc_id(map); |
| 240 | if (err < 0) | 295 | if (err) |
| 241 | /* failed to allocate fd */ | ||
| 242 | goto free_map; | 296 | goto free_map; |
| 243 | 297 | ||
| 298 | err = bpf_map_new_fd(map); | ||
| 299 | if (err < 0) { | ||
| 300 | /* failed to allocate fd. | ||
| 301 | * bpf_map_put() is needed because the above | ||
| 302 | * bpf_map_alloc_id() has published the map | ||
| 303 | * to the userspace and the userspace may | ||
| 304 | * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. | ||
| 305 | */ | ||
| 306 | bpf_map_put(map); | ||
| 307 | return err; | ||
| 308 | } | ||
| 309 | |||
| 244 | trace_bpf_map_create(map, err); | 310 | trace_bpf_map_create(map, err); |
| 245 | return err; | 311 | return err; |
| 246 | 312 | ||
| @@ -295,6 +361,28 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) | |||
| 295 | return map; | 361 | return map; |
| 296 | } | 362 | } |
| 297 | 363 | ||
| 364 | /* map_idr_lock should have been held */ | ||
| 365 | static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, | ||
| 366 | bool uref) | ||
| 367 | { | ||
| 368 | int refold; | ||
| 369 | |||
| 370 | refold = __atomic_add_unless(&map->refcnt, 1, 0); | ||
| 371 | |||
| 372 | if (refold >= BPF_MAX_REFCNT) { | ||
| 373 | __bpf_map_put(map, false); | ||
| 374 | return ERR_PTR(-EBUSY); | ||
| 375 | } | ||
| 376 | |||
| 377 | if (!refold) | ||
| 378 | return ERR_PTR(-ENOENT); | ||
| 379 | |||
| 380 | if (uref) | ||
| 381 | atomic_inc(&map->usercnt); | ||
| 382 | |||
| 383 | return map; | ||
| 384 | } | ||
| 385 | |||
| 298 | int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) | 386 | int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) |
| 299 | { | 387 | { |
| 300 | return -ENOTSUPP; | 388 | return -ENOTSUPP; |
| @@ -322,19 +410,18 @@ static int map_lookup_elem(union bpf_attr *attr) | |||
| 322 | if (IS_ERR(map)) | 410 | if (IS_ERR(map)) |
| 323 | return PTR_ERR(map); | 411 | return PTR_ERR(map); |
| 324 | 412 | ||
| 325 | err = -ENOMEM; | 413 | key = memdup_user(ukey, map->key_size); |
| 326 | key = kmalloc(map->key_size, GFP_USER); | 414 | if (IS_ERR(key)) { |
| 327 | if (!key) | 415 | err = PTR_ERR(key); |
| 328 | goto err_put; | 416 | goto err_put; |
| 329 | 417 | } | |
| 330 | err = -EFAULT; | ||
| 331 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 332 | goto free_key; | ||
| 333 | 418 | ||
| 334 | if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || | 419 | if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || |
| 335 | map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || | 420 | map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || |
| 336 | map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) | 421 | map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) |
| 337 | value_size = round_up(map->value_size, 8) * num_possible_cpus(); | 422 | value_size = round_up(map->value_size, 8) * num_possible_cpus(); |
| 423 | else if (IS_FD_MAP(map)) | ||
| 424 | value_size = sizeof(u32); | ||
| 338 | else | 425 | else |
| 339 | value_size = map->value_size; | 426 | value_size = map->value_size; |
| 340 | 427 | ||
| @@ -350,9 +437,10 @@ static int map_lookup_elem(union bpf_attr *attr) | |||
| 350 | err = bpf_percpu_array_copy(map, key, value); | 437 | err = bpf_percpu_array_copy(map, key, value); |
| 351 | } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { | 438 | } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { |
| 352 | err = bpf_stackmap_copy(map, key, value); | 439 | err = bpf_stackmap_copy(map, key, value); |
| 353 | } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || | 440 | } else if (IS_FD_ARRAY(map)) { |
| 354 | map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { | 441 | err = bpf_fd_array_map_lookup_elem(map, key, value); |
| 355 | err = -ENOTSUPP; | 442 | } else if (IS_FD_HASH(map)) { |
| 443 | err = bpf_fd_htab_map_lookup_elem(map, key, value); | ||
| 356 | } else { | 444 | } else { |
| 357 | rcu_read_lock(); | 445 | rcu_read_lock(); |
| 358 | ptr = map->ops->map_lookup_elem(map, key); | 446 | ptr = map->ops->map_lookup_elem(map, key); |
| @@ -402,14 +490,11 @@ static int map_update_elem(union bpf_attr *attr) | |||
| 402 | if (IS_ERR(map)) | 490 | if (IS_ERR(map)) |
| 403 | return PTR_ERR(map); | 491 | return PTR_ERR(map); |
| 404 | 492 | ||
| 405 | err = -ENOMEM; | 493 | key = memdup_user(ukey, map->key_size); |
| 406 | key = kmalloc(map->key_size, GFP_USER); | 494 | if (IS_ERR(key)) { |
| 407 | if (!key) | 495 | err = PTR_ERR(key); |
| 408 | goto err_put; | 496 | goto err_put; |
| 409 | 497 | } | |
| 410 | err = -EFAULT; | ||
| 411 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 412 | goto free_key; | ||
| 413 | 498 | ||
| 414 | if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || | 499 | if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || |
| 415 | map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || | 500 | map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || |
| @@ -488,14 +573,11 @@ static int map_delete_elem(union bpf_attr *attr) | |||
| 488 | if (IS_ERR(map)) | 573 | if (IS_ERR(map)) |
| 489 | return PTR_ERR(map); | 574 | return PTR_ERR(map); |
| 490 | 575 | ||
| 491 | err = -ENOMEM; | 576 | key = memdup_user(ukey, map->key_size); |
| 492 | key = kmalloc(map->key_size, GFP_USER); | 577 | if (IS_ERR(key)) { |
| 493 | if (!key) | 578 | err = PTR_ERR(key); |
| 494 | goto err_put; | 579 | goto err_put; |
| 495 | 580 | } | |
| 496 | err = -EFAULT; | ||
| 497 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 498 | goto free_key; | ||
| 499 | 581 | ||
| 500 | preempt_disable(); | 582 | preempt_disable(); |
| 501 | __this_cpu_inc(bpf_prog_active); | 583 | __this_cpu_inc(bpf_prog_active); |
| @@ -507,7 +589,6 @@ static int map_delete_elem(union bpf_attr *attr) | |||
| 507 | 589 | ||
| 508 | if (!err) | 590 | if (!err) |
| 509 | trace_bpf_map_delete_elem(map, ufd, key); | 591 | trace_bpf_map_delete_elem(map, ufd, key); |
| 510 | free_key: | ||
| 511 | kfree(key); | 592 | kfree(key); |
| 512 | err_put: | 593 | err_put: |
| 513 | fdput(f); | 594 | fdput(f); |
| @@ -536,14 +617,11 @@ static int map_get_next_key(union bpf_attr *attr) | |||
| 536 | return PTR_ERR(map); | 617 | return PTR_ERR(map); |
| 537 | 618 | ||
| 538 | if (ukey) { | 619 | if (ukey) { |
| 539 | err = -ENOMEM; | 620 | key = memdup_user(ukey, map->key_size); |
| 540 | key = kmalloc(map->key_size, GFP_USER); | 621 | if (IS_ERR(key)) { |
| 541 | if (!key) | 622 | err = PTR_ERR(key); |
| 542 | goto err_put; | 623 | goto err_put; |
| 543 | 624 | } | |
| 544 | err = -EFAULT; | ||
| 545 | if (copy_from_user(key, ukey, map->key_size) != 0) | ||
| 546 | goto free_key; | ||
| 547 | } else { | 625 | } else { |
| 548 | key = NULL; | 626 | key = NULL; |
| 549 | } | 627 | } |
| @@ -650,6 +728,42 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) | |||
| 650 | free_uid(user); | 728 | free_uid(user); |
| 651 | } | 729 | } |
| 652 | 730 | ||
| 731 | static int bpf_prog_alloc_id(struct bpf_prog *prog) | ||
| 732 | { | ||
| 733 | int id; | ||
| 734 | |||
| 735 | spin_lock_bh(&prog_idr_lock); | ||
| 736 | id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); | ||
| 737 | if (id > 0) | ||
| 738 | prog->aux->id = id; | ||
| 739 | spin_unlock_bh(&prog_idr_lock); | ||
| 740 | |||
| 741 | /* id is in [1, INT_MAX) */ | ||
| 742 | if (WARN_ON_ONCE(!id)) | ||
| 743 | return -ENOSPC; | ||
| 744 | |||
| 745 | return id > 0 ? 0 : id; | ||
| 746 | } | ||
| 747 | |||
| 748 | static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) | ||
| 749 | { | ||
| 750 | /* cBPF to eBPF migrations are currently not in the idr store. */ | ||
| 751 | if (!prog->aux->id) | ||
| 752 | return; | ||
| 753 | |||
| 754 | if (do_idr_lock) | ||
| 755 | spin_lock_bh(&prog_idr_lock); | ||
| 756 | else | ||
| 757 | __acquire(&prog_idr_lock); | ||
| 758 | |||
| 759 | idr_remove(&prog_idr, prog->aux->id); | ||
| 760 | |||
| 761 | if (do_idr_lock) | ||
| 762 | spin_unlock_bh(&prog_idr_lock); | ||
| 763 | else | ||
| 764 | __release(&prog_idr_lock); | ||
| 765 | } | ||
| 766 | |||
| 653 | static void __bpf_prog_put_rcu(struct rcu_head *rcu) | 767 | static void __bpf_prog_put_rcu(struct rcu_head *rcu) |
| 654 | { | 768 | { |
| 655 | struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); | 769 | struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); |
| @@ -659,14 +773,21 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) | |||
| 659 | bpf_prog_free(aux->prog); | 773 | bpf_prog_free(aux->prog); |
| 660 | } | 774 | } |
| 661 | 775 | ||
| 662 | void bpf_prog_put(struct bpf_prog *prog) | 776 | static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) |
| 663 | { | 777 | { |
| 664 | if (atomic_dec_and_test(&prog->aux->refcnt)) { | 778 | if (atomic_dec_and_test(&prog->aux->refcnt)) { |
| 665 | trace_bpf_prog_put_rcu(prog); | 779 | trace_bpf_prog_put_rcu(prog); |
| 780 | /* bpf_prog_free_id() must be called first */ | ||
| 781 | bpf_prog_free_id(prog, do_idr_lock); | ||
| 666 | bpf_prog_kallsyms_del(prog); | 782 | bpf_prog_kallsyms_del(prog); |
| 667 | call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); | 783 | call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); |
| 668 | } | 784 | } |
| 669 | } | 785 | } |
| 786 | |||
| 787 | void bpf_prog_put(struct bpf_prog *prog) | ||
| 788 | { | ||
| 789 | __bpf_prog_put(prog, true); | ||
| 790 | } | ||
| 670 | EXPORT_SYMBOL_GPL(bpf_prog_put); | 791 | EXPORT_SYMBOL_GPL(bpf_prog_put); |
| 671 | 792 | ||
| 672 | static int bpf_prog_release(struct inode *inode, struct file *filp) | 793 | static int bpf_prog_release(struct inode *inode, struct file *filp) |
| @@ -748,6 +869,24 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) | |||
| 748 | } | 869 | } |
| 749 | EXPORT_SYMBOL_GPL(bpf_prog_inc); | 870 | EXPORT_SYMBOL_GPL(bpf_prog_inc); |
| 750 | 871 | ||
| 872 | /* prog_idr_lock should have been held */ | ||
| 873 | static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) | ||
| 874 | { | ||
| 875 | int refold; | ||
| 876 | |||
| 877 | refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0); | ||
| 878 | |||
| 879 | if (refold >= BPF_MAX_REFCNT) { | ||
| 880 | __bpf_prog_put(prog, false); | ||
| 881 | return ERR_PTR(-EBUSY); | ||
| 882 | } | ||
| 883 | |||
| 884 | if (!refold) | ||
| 885 | return ERR_PTR(-ENOENT); | ||
| 886 | |||
| 887 | return prog; | ||
| 888 | } | ||
| 889 | |||
| 751 | static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) | 890 | static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) |
| 752 | { | 891 | { |
| 753 | struct fd f = fdget(ufd); | 892 | struct fd f = fdget(ufd); |
| @@ -815,7 +954,9 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
| 815 | attr->kern_version != LINUX_VERSION_CODE) | 954 | attr->kern_version != LINUX_VERSION_CODE) |
| 816 | return -EINVAL; | 955 | return -EINVAL; |
| 817 | 956 | ||
| 818 | if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) | 957 | if (type != BPF_PROG_TYPE_SOCKET_FILTER && |
| 958 | type != BPF_PROG_TYPE_CGROUP_SKB && | ||
| 959 | !capable(CAP_SYS_ADMIN)) | ||
| 819 | return -EPERM; | 960 | return -EPERM; |
| 820 | 961 | ||
| 821 | /* plain bpf_prog allocation */ | 962 | /* plain bpf_prog allocation */ |
| @@ -855,11 +996,22 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
| 855 | if (err < 0) | 996 | if (err < 0) |
| 856 | goto free_used_maps; | 997 | goto free_used_maps; |
| 857 | 998 | ||
| 858 | err = bpf_prog_new_fd(prog); | 999 | err = bpf_prog_alloc_id(prog); |
| 859 | if (err < 0) | 1000 | if (err) |
| 860 | /* failed to allocate fd */ | ||
| 861 | goto free_used_maps; | 1001 | goto free_used_maps; |
| 862 | 1002 | ||
| 1003 | err = bpf_prog_new_fd(prog); | ||
| 1004 | if (err < 0) { | ||
| 1005 | /* failed to allocate fd. | ||
| 1006 | * bpf_prog_put() is needed because the above | ||
| 1007 | * bpf_prog_alloc_id() has published the prog | ||
| 1008 | * to the userspace and the userspace may | ||
| 1009 | * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID. | ||
| 1010 | */ | ||
| 1011 | bpf_prog_put(prog); | ||
| 1012 | return err; | ||
| 1013 | } | ||
| 1014 | |||
| 863 | bpf_prog_kallsyms_add(prog); | 1015 | bpf_prog_kallsyms_add(prog); |
| 864 | trace_bpf_prog_load(prog, err); | 1016 | trace_bpf_prog_load(prog, err); |
| 865 | return err; | 1017 | return err; |
| @@ -919,6 +1071,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) | |||
| 919 | case BPF_CGROUP_INET_SOCK_CREATE: | 1071 | case BPF_CGROUP_INET_SOCK_CREATE: |
| 920 | ptype = BPF_PROG_TYPE_CGROUP_SOCK; | 1072 | ptype = BPF_PROG_TYPE_CGROUP_SOCK; |
| 921 | break; | 1073 | break; |
| 1074 | case BPF_CGROUP_SOCK_OPS: | ||
| 1075 | ptype = BPF_PROG_TYPE_SOCK_OPS; | ||
| 1076 | break; | ||
| 922 | default: | 1077 | default: |
| 923 | return -EINVAL; | 1078 | return -EINVAL; |
| 924 | } | 1079 | } |
| @@ -959,6 +1114,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) | |||
| 959 | case BPF_CGROUP_INET_INGRESS: | 1114 | case BPF_CGROUP_INET_INGRESS: |
| 960 | case BPF_CGROUP_INET_EGRESS: | 1115 | case BPF_CGROUP_INET_EGRESS: |
| 961 | case BPF_CGROUP_INET_SOCK_CREATE: | 1116 | case BPF_CGROUP_INET_SOCK_CREATE: |
| 1117 | case BPF_CGROUP_SOCK_OPS: | ||
| 962 | cgrp = cgroup_get_from_fd(attr->target_fd); | 1118 | cgrp = cgroup_get_from_fd(attr->target_fd); |
| 963 | if (IS_ERR(cgrp)) | 1119 | if (IS_ERR(cgrp)) |
| 964 | return PTR_ERR(cgrp); | 1120 | return PTR_ERR(cgrp); |
| @@ -973,6 +1129,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) | |||
| 973 | 1129 | ||
| 974 | return ret; | 1130 | return ret; |
| 975 | } | 1131 | } |
| 1132 | |||
| 976 | #endif /* CONFIG_CGROUP_BPF */ | 1133 | #endif /* CONFIG_CGROUP_BPF */ |
| 977 | 1134 | ||
| 978 | #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration | 1135 | #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration |
| @@ -997,6 +1154,237 @@ static int bpf_prog_test_run(const union bpf_attr *attr, | |||
| 997 | return ret; | 1154 | return ret; |
| 998 | } | 1155 | } |
| 999 | 1156 | ||
| 1157 | #define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id | ||
| 1158 | |||
| 1159 | static int bpf_obj_get_next_id(const union bpf_attr *attr, | ||
| 1160 | union bpf_attr __user *uattr, | ||
| 1161 | struct idr *idr, | ||
| 1162 | spinlock_t *lock) | ||
| 1163 | { | ||
| 1164 | u32 next_id = attr->start_id; | ||
| 1165 | int err = 0; | ||
| 1166 | |||
| 1167 | if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) | ||
| 1168 | return -EINVAL; | ||
| 1169 | |||
| 1170 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1171 | return -EPERM; | ||
| 1172 | |||
| 1173 | next_id++; | ||
| 1174 | spin_lock_bh(lock); | ||
| 1175 | if (!idr_get_next(idr, &next_id)) | ||
| 1176 | err = -ENOENT; | ||
| 1177 | spin_unlock_bh(lock); | ||
| 1178 | |||
| 1179 | if (!err) | ||
| 1180 | err = put_user(next_id, &uattr->next_id); | ||
| 1181 | |||
| 1182 | return err; | ||
| 1183 | } | ||
| 1184 | |||
| 1185 | #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id | ||
| 1186 | |||
| 1187 | static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) | ||
| 1188 | { | ||
| 1189 | struct bpf_prog *prog; | ||
| 1190 | u32 id = attr->prog_id; | ||
| 1191 | int fd; | ||
| 1192 | |||
| 1193 | if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) | ||
| 1194 | return -EINVAL; | ||
| 1195 | |||
| 1196 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1197 | return -EPERM; | ||
| 1198 | |||
| 1199 | spin_lock_bh(&prog_idr_lock); | ||
| 1200 | prog = idr_find(&prog_idr, id); | ||
| 1201 | if (prog) | ||
| 1202 | prog = bpf_prog_inc_not_zero(prog); | ||
| 1203 | else | ||
| 1204 | prog = ERR_PTR(-ENOENT); | ||
| 1205 | spin_unlock_bh(&prog_idr_lock); | ||
| 1206 | |||
| 1207 | if (IS_ERR(prog)) | ||
| 1208 | return PTR_ERR(prog); | ||
| 1209 | |||
| 1210 | fd = bpf_prog_new_fd(prog); | ||
| 1211 | if (fd < 0) | ||
| 1212 | bpf_prog_put(prog); | ||
| 1213 | |||
| 1214 | return fd; | ||
| 1215 | } | ||
| 1216 | |||
| 1217 | #define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id | ||
| 1218 | |||
| 1219 | static int bpf_map_get_fd_by_id(const union bpf_attr *attr) | ||
| 1220 | { | ||
| 1221 | struct bpf_map *map; | ||
| 1222 | u32 id = attr->map_id; | ||
| 1223 | int fd; | ||
| 1224 | |||
| 1225 | if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID)) | ||
| 1226 | return -EINVAL; | ||
| 1227 | |||
| 1228 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1229 | return -EPERM; | ||
| 1230 | |||
| 1231 | spin_lock_bh(&map_idr_lock); | ||
| 1232 | map = idr_find(&map_idr, id); | ||
| 1233 | if (map) | ||
| 1234 | map = bpf_map_inc_not_zero(map, true); | ||
| 1235 | else | ||
| 1236 | map = ERR_PTR(-ENOENT); | ||
| 1237 | spin_unlock_bh(&map_idr_lock); | ||
| 1238 | |||
| 1239 | if (IS_ERR(map)) | ||
| 1240 | return PTR_ERR(map); | ||
| 1241 | |||
| 1242 | fd = bpf_map_new_fd(map); | ||
| 1243 | if (fd < 0) | ||
| 1244 | bpf_map_put(map); | ||
| 1245 | |||
| 1246 | return fd; | ||
| 1247 | } | ||
| 1248 | |||
| 1249 | static int check_uarg_tail_zero(void __user *uaddr, | ||
| 1250 | size_t expected_size, | ||
| 1251 | size_t actual_size) | ||
| 1252 | { | ||
| 1253 | unsigned char __user *addr; | ||
| 1254 | unsigned char __user *end; | ||
| 1255 | unsigned char val; | ||
| 1256 | int err; | ||
| 1257 | |||
| 1258 | if (actual_size <= expected_size) | ||
| 1259 | return 0; | ||
| 1260 | |||
| 1261 | addr = uaddr + expected_size; | ||
| 1262 | end = uaddr + actual_size; | ||
| 1263 | |||
| 1264 | for (; addr < end; addr++) { | ||
| 1265 | err = get_user(val, addr); | ||
| 1266 | if (err) | ||
| 1267 | return err; | ||
| 1268 | if (val) | ||
| 1269 | return -E2BIG; | ||
| 1270 | } | ||
| 1271 | |||
| 1272 | return 0; | ||
| 1273 | } | ||
| 1274 | |||
| 1275 | static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, | ||
| 1276 | const union bpf_attr *attr, | ||
| 1277 | union bpf_attr __user *uattr) | ||
| 1278 | { | ||
| 1279 | struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); | ||
| 1280 | struct bpf_prog_info info = {}; | ||
| 1281 | u32 info_len = attr->info.info_len; | ||
| 1282 | char __user *uinsns; | ||
| 1283 | u32 ulen; | ||
| 1284 | int err; | ||
| 1285 | |||
| 1286 | err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); | ||
| 1287 | if (err) | ||
| 1288 | return err; | ||
| 1289 | info_len = min_t(u32, sizeof(info), info_len); | ||
| 1290 | |||
| 1291 | if (copy_from_user(&info, uinfo, info_len)) | ||
| 1292 | return err; | ||
| 1293 | |||
| 1294 | info.type = prog->type; | ||
| 1295 | info.id = prog->aux->id; | ||
| 1296 | |||
| 1297 | memcpy(info.tag, prog->tag, sizeof(prog->tag)); | ||
| 1298 | |||
| 1299 | if (!capable(CAP_SYS_ADMIN)) { | ||
| 1300 | info.jited_prog_len = 0; | ||
| 1301 | info.xlated_prog_len = 0; | ||
| 1302 | goto done; | ||
| 1303 | } | ||
| 1304 | |||
| 1305 | ulen = info.jited_prog_len; | ||
| 1306 | info.jited_prog_len = prog->jited_len; | ||
| 1307 | if (info.jited_prog_len && ulen) { | ||
| 1308 | uinsns = u64_to_user_ptr(info.jited_prog_insns); | ||
| 1309 | ulen = min_t(u32, info.jited_prog_len, ulen); | ||
| 1310 | if (copy_to_user(uinsns, prog->bpf_func, ulen)) | ||
| 1311 | return -EFAULT; | ||
| 1312 | } | ||
| 1313 | |||
| 1314 | ulen = info.xlated_prog_len; | ||
| 1315 | info.xlated_prog_len = bpf_prog_size(prog->len); | ||
| 1316 | if (info.xlated_prog_len && ulen) { | ||
| 1317 | uinsns = u64_to_user_ptr(info.xlated_prog_insns); | ||
| 1318 | ulen = min_t(u32, info.xlated_prog_len, ulen); | ||
| 1319 | if (copy_to_user(uinsns, prog->insnsi, ulen)) | ||
| 1320 | return -EFAULT; | ||
| 1321 | } | ||
| 1322 | |||
| 1323 | done: | ||
| 1324 | if (copy_to_user(uinfo, &info, info_len) || | ||
| 1325 | put_user(info_len, &uattr->info.info_len)) | ||
| 1326 | return -EFAULT; | ||
| 1327 | |||
| 1328 | return 0; | ||
| 1329 | } | ||
| 1330 | |||
| 1331 | static int bpf_map_get_info_by_fd(struct bpf_map *map, | ||
| 1332 | const union bpf_attr *attr, | ||
| 1333 | union bpf_attr __user *uattr) | ||
| 1334 | { | ||
| 1335 | struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); | ||
| 1336 | struct bpf_map_info info = {}; | ||
| 1337 | u32 info_len = attr->info.info_len; | ||
| 1338 | int err; | ||
| 1339 | |||
| 1340 | err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); | ||
| 1341 | if (err) | ||
| 1342 | return err; | ||
| 1343 | info_len = min_t(u32, sizeof(info), info_len); | ||
| 1344 | |||
| 1345 | info.type = map->map_type; | ||
| 1346 | info.id = map->id; | ||
| 1347 | info.key_size = map->key_size; | ||
| 1348 | info.value_size = map->value_size; | ||
| 1349 | info.max_entries = map->max_entries; | ||
| 1350 | info.map_flags = map->map_flags; | ||
| 1351 | |||
| 1352 | if (copy_to_user(uinfo, &info, info_len) || | ||
| 1353 | put_user(info_len, &uattr->info.info_len)) | ||
| 1354 | return -EFAULT; | ||
| 1355 | |||
| 1356 | return 0; | ||
| 1357 | } | ||
| 1358 | |||
| 1359 | #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info | ||
| 1360 | |||
| 1361 | static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, | ||
| 1362 | union bpf_attr __user *uattr) | ||
| 1363 | { | ||
| 1364 | int ufd = attr->info.bpf_fd; | ||
| 1365 | struct fd f; | ||
| 1366 | int err; | ||
| 1367 | |||
| 1368 | if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) | ||
| 1369 | return -EINVAL; | ||
| 1370 | |||
| 1371 | f = fdget(ufd); | ||
| 1372 | if (!f.file) | ||
| 1373 | return -EBADFD; | ||
| 1374 | |||
| 1375 | if (f.file->f_op == &bpf_prog_fops) | ||
| 1376 | err = bpf_prog_get_info_by_fd(f.file->private_data, attr, | ||
| 1377 | uattr); | ||
| 1378 | else if (f.file->f_op == &bpf_map_fops) | ||
| 1379 | err = bpf_map_get_info_by_fd(f.file->private_data, attr, | ||
| 1380 | uattr); | ||
| 1381 | else | ||
| 1382 | err = -EINVAL; | ||
| 1383 | |||
| 1384 | fdput(f); | ||
| 1385 | return err; | ||
| 1386 | } | ||
| 1387 | |||
| 1000 | SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) | 1388 | SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) |
| 1001 | { | 1389 | { |
| 1002 | union bpf_attr attr = {}; | 1390 | union bpf_attr attr = {}; |
| @@ -1016,23 +1404,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz | |||
| 1016 | * user-space does not rely on any kernel feature | 1404 | * user-space does not rely on any kernel feature |
| 1017 | * extensions we dont know about yet. | 1405 | * extensions we dont know about yet. |
| 1018 | */ | 1406 | */ |
| 1019 | if (size > sizeof(attr)) { | 1407 | err = check_uarg_tail_zero(uattr, sizeof(attr), size); |
| 1020 | unsigned char __user *addr; | 1408 | if (err) |
| 1021 | unsigned char __user *end; | 1409 | return err; |
| 1022 | unsigned char val; | 1410 | size = min_t(u32, size, sizeof(attr)); |
| 1023 | |||
| 1024 | addr = (void __user *)uattr + sizeof(attr); | ||
| 1025 | end = (void __user *)uattr + size; | ||
| 1026 | |||
| 1027 | for (; addr < end; addr++) { | ||
| 1028 | err = get_user(val, addr); | ||
| 1029 | if (err) | ||
| 1030 | return err; | ||
| 1031 | if (val) | ||
| 1032 | return -E2BIG; | ||
| 1033 | } | ||
| 1034 | size = sizeof(attr); | ||
| 1035 | } | ||
| 1036 | 1411 | ||
| 1037 | /* copy attributes from user space, may be less than sizeof(bpf_attr) */ | 1412 | /* copy attributes from user space, may be less than sizeof(bpf_attr) */ |
| 1038 | if (copy_from_user(&attr, uattr, size) != 0) | 1413 | if (copy_from_user(&attr, uattr, size) != 0) |
| @@ -1074,6 +1449,23 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz | |||
| 1074 | case BPF_PROG_TEST_RUN: | 1449 | case BPF_PROG_TEST_RUN: |
| 1075 | err = bpf_prog_test_run(&attr, uattr); | 1450 | err = bpf_prog_test_run(&attr, uattr); |
| 1076 | break; | 1451 | break; |
| 1452 | case BPF_PROG_GET_NEXT_ID: | ||
| 1453 | err = bpf_obj_get_next_id(&attr, uattr, | ||
| 1454 | &prog_idr, &prog_idr_lock); | ||
| 1455 | break; | ||
| 1456 | case BPF_MAP_GET_NEXT_ID: | ||
| 1457 | err = bpf_obj_get_next_id(&attr, uattr, | ||
| 1458 | &map_idr, &map_idr_lock); | ||
| 1459 | break; | ||
| 1460 | case BPF_PROG_GET_FD_BY_ID: | ||
| 1461 | err = bpf_prog_get_fd_by_id(&attr); | ||
| 1462 | break; | ||
| 1463 | case BPF_MAP_GET_FD_BY_ID: | ||
| 1464 | err = bpf_map_get_fd_by_id(&attr); | ||
| 1465 | break; | ||
| 1466 | case BPF_OBJ_GET_INFO_BY_FD: | ||
| 1467 | err = bpf_obj_get_info_by_fd(&attr, uattr); | ||
| 1468 | break; | ||
| 1077 | default: | 1469 | default: |
| 1078 | err = -EINVAL; | 1470 | err = -EINVAL; |
| 1079 | break; | 1471 | break; |
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a8a725697bed..af9e84a4944e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c | |||
| @@ -504,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) | |||
| 504 | { | 504 | { |
| 505 | regs[regno].min_value = BPF_REGISTER_MIN_RANGE; | 505 | regs[regno].min_value = BPF_REGISTER_MIN_RANGE; |
| 506 | regs[regno].max_value = BPF_REGISTER_MAX_RANGE; | 506 | regs[regno].max_value = BPF_REGISTER_MAX_RANGE; |
| 507 | regs[regno].value_from_signed = false; | ||
| 507 | regs[regno].min_align = 0; | 508 | regs[regno].min_align = 0; |
| 508 | } | 509 | } |
| 509 | 510 | ||
| @@ -546,20 +547,6 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno, | |||
| 546 | return 0; | 547 | return 0; |
| 547 | } | 548 | } |
| 548 | 549 | ||
| 549 | static int bpf_size_to_bytes(int bpf_size) | ||
| 550 | { | ||
| 551 | if (bpf_size == BPF_W) | ||
| 552 | return 4; | ||
| 553 | else if (bpf_size == BPF_H) | ||
| 554 | return 2; | ||
| 555 | else if (bpf_size == BPF_B) | ||
| 556 | return 1; | ||
| 557 | else if (bpf_size == BPF_DW) | ||
| 558 | return 8; | ||
| 559 | else | ||
| 560 | return -EINVAL; | ||
| 561 | } | ||
| 562 | |||
| 563 | static bool is_spillable_regtype(enum bpf_reg_type type) | 550 | static bool is_spillable_regtype(enum bpf_reg_type type) |
| 564 | { | 551 | { |
| 565 | switch (type) { | 552 | switch (type) { |
| @@ -758,15 +745,29 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, | |||
| 758 | } | 745 | } |
| 759 | 746 | ||
| 760 | /* check access to 'struct bpf_context' fields */ | 747 | /* check access to 'struct bpf_context' fields */ |
| 761 | static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, | 748 | static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, |
| 762 | enum bpf_access_type t, enum bpf_reg_type *reg_type) | 749 | enum bpf_access_type t, enum bpf_reg_type *reg_type) |
| 763 | { | 750 | { |
| 751 | struct bpf_insn_access_aux info = { | ||
| 752 | .reg_type = *reg_type, | ||
| 753 | }; | ||
| 754 | |||
| 764 | /* for analyzer ctx accesses are already validated and converted */ | 755 | /* for analyzer ctx accesses are already validated and converted */ |
| 765 | if (env->analyzer_ops) | 756 | if (env->analyzer_ops) |
| 766 | return 0; | 757 | return 0; |
| 767 | 758 | ||
| 768 | if (env->prog->aux->ops->is_valid_access && | 759 | if (env->prog->aux->ops->is_valid_access && |
| 769 | env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { | 760 | env->prog->aux->ops->is_valid_access(off, size, t, &info)) { |
| 761 | /* A non zero info.ctx_field_size indicates that this field is a | ||
| 762 | * candidate for later verifier transformation to load the whole | ||
| 763 | * field and then apply a mask when accessed with a narrower | ||
| 764 | * access than actual ctx access size. A zero info.ctx_field_size | ||
| 765 | * will only allow for whole field access and rejects any other | ||
| 766 | * type of narrower access. | ||
| 767 | */ | ||
| 768 | env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; | ||
| 769 | *reg_type = info.reg_type; | ||
| 770 | |||
| 770 | /* remember the offset of last byte accessed in ctx */ | 771 | /* remember the offset of last byte accessed in ctx */ |
| 771 | if (env->prog->aux->max_ctx_offset < off + size) | 772 | if (env->prog->aux->max_ctx_offset < off + size) |
| 772 | env->prog->aux->max_ctx_offset = off + size; | 773 | env->prog->aux->max_ctx_offset = off + size; |
| @@ -777,12 +778,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int off, int size, | |||
| 777 | return -EACCES; | 778 | return -EACCES; |
| 778 | } | 779 | } |
| 779 | 780 | ||
| 780 | static bool is_pointer_value(struct bpf_verifier_env *env, int regno) | 781 | static bool __is_pointer_value(bool allow_ptr_leaks, |
| 782 | const struct bpf_reg_state *reg) | ||
| 781 | { | 783 | { |
| 782 | if (env->allow_ptr_leaks) | 784 | if (allow_ptr_leaks) |
| 783 | return false; | 785 | return false; |
| 784 | 786 | ||
| 785 | switch (env->cur_state.regs[regno].type) { | 787 | switch (reg->type) { |
| 786 | case UNKNOWN_VALUE: | 788 | case UNKNOWN_VALUE: |
| 787 | case CONST_IMM: | 789 | case CONST_IMM: |
| 788 | return false; | 790 | return false; |
| @@ -791,6 +793,11 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) | |||
| 791 | } | 793 | } |
| 792 | } | 794 | } |
| 793 | 795 | ||
| 796 | static bool is_pointer_value(struct bpf_verifier_env *env, int regno) | ||
| 797 | { | ||
| 798 | return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); | ||
| 799 | } | ||
| 800 | |||
| 794 | static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, | 801 | static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, |
| 795 | int off, int size, bool strict) | 802 | int off, int size, bool strict) |
| 796 | { | 803 | { |
| @@ -868,7 +875,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, | |||
| 868 | * if t==write && value_regno==-1, some unknown value is stored into memory | 875 | * if t==write && value_regno==-1, some unknown value is stored into memory |
| 869 | * if t==read && value_regno==-1, don't care what we read from memory | 876 | * if t==read && value_regno==-1, don't care what we read from memory |
| 870 | */ | 877 | */ |
| 871 | static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, | 878 | static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, |
| 872 | int bpf_size, enum bpf_access_type t, | 879 | int bpf_size, enum bpf_access_type t, |
| 873 | int value_regno) | 880 | int value_regno) |
| 874 | { | 881 | { |
| @@ -911,7 +918,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, | |||
| 911 | verbose("R%d leaks addr into ctx\n", value_regno); | 918 | verbose("R%d leaks addr into ctx\n", value_regno); |
| 912 | return -EACCES; | 919 | return -EACCES; |
| 913 | } | 920 | } |
| 914 | err = check_ctx_access(env, off, size, t, ®_type); | 921 | err = check_ctx_access(env, insn_idx, off, size, t, ®_type); |
| 915 | if (!err && t == BPF_READ && value_regno >= 0) { | 922 | if (!err && t == BPF_READ && value_regno >= 0) { |
| 916 | mark_reg_unknown_value_and_range(state->regs, | 923 | mark_reg_unknown_value_and_range(state->regs, |
| 917 | value_regno); | 924 | value_regno); |
| @@ -926,6 +933,10 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, | |||
| 926 | verbose("invalid stack off=%d size=%d\n", off, size); | 933 | verbose("invalid stack off=%d size=%d\n", off, size); |
| 927 | return -EACCES; | 934 | return -EACCES; |
| 928 | } | 935 | } |
| 936 | |||
| 937 | if (env->prog->aux->stack_depth < -off) | ||
| 938 | env->prog->aux->stack_depth = -off; | ||
| 939 | |||
| 929 | if (t == BPF_WRITE) { | 940 | if (t == BPF_WRITE) { |
| 930 | if (!env->allow_ptr_leaks && | 941 | if (!env->allow_ptr_leaks && |
| 931 | state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && | 942 | state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && |
| @@ -968,7 +979,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, | |||
| 968 | return err; | 979 | return err; |
| 969 | } | 980 | } |
| 970 | 981 | ||
| 971 | static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) | 982 | static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) |
| 972 | { | 983 | { |
| 973 | struct bpf_reg_state *regs = env->cur_state.regs; | 984 | struct bpf_reg_state *regs = env->cur_state.regs; |
| 974 | int err; | 985 | int err; |
| @@ -995,13 +1006,13 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
| 995 | } | 1006 | } |
| 996 | 1007 | ||
| 997 | /* check whether atomic_add can read the memory */ | 1008 | /* check whether atomic_add can read the memory */ |
| 998 | err = check_mem_access(env, insn->dst_reg, insn->off, | 1009 | err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, |
| 999 | BPF_SIZE(insn->code), BPF_READ, -1); | 1010 | BPF_SIZE(insn->code), BPF_READ, -1); |
| 1000 | if (err) | 1011 | if (err) |
| 1001 | return err; | 1012 | return err; |
| 1002 | 1013 | ||
| 1003 | /* check whether atomic_add can write into the same memory */ | 1014 | /* check whether atomic_add can write into the same memory */ |
| 1004 | return check_mem_access(env, insn->dst_reg, insn->off, | 1015 | return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, |
| 1005 | BPF_SIZE(insn->code), BPF_WRITE, -1); | 1016 | BPF_SIZE(insn->code), BPF_WRITE, -1); |
| 1006 | } | 1017 | } |
| 1007 | 1018 | ||
| @@ -1037,6 +1048,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, | |||
| 1037 | return -EACCES; | 1048 | return -EACCES; |
| 1038 | } | 1049 | } |
| 1039 | 1050 | ||
| 1051 | if (env->prog->aux->stack_depth < -off) | ||
| 1052 | env->prog->aux->stack_depth = -off; | ||
| 1053 | |||
| 1040 | if (meta && meta->raw_mode) { | 1054 | if (meta && meta->raw_mode) { |
| 1041 | meta->access_size = access_size; | 1055 | meta->access_size = access_size; |
| 1042 | meta->regno = regno; | 1056 | meta->regno = regno; |
| @@ -1344,8 +1358,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) | |||
| 1344 | if (reg->type != PTR_TO_PACKET && | 1358 | if (reg->type != PTR_TO_PACKET && |
| 1345 | reg->type != PTR_TO_PACKET_END) | 1359 | reg->type != PTR_TO_PACKET_END) |
| 1346 | continue; | 1360 | continue; |
| 1347 | reg->type = UNKNOWN_VALUE; | 1361 | __mark_reg_unknown_value(state->spilled_regs, |
| 1348 | reg->imm = 0; | 1362 | i / BPF_REG_SIZE); |
| 1349 | } | 1363 | } |
| 1350 | } | 1364 | } |
| 1351 | 1365 | ||
| @@ -1414,7 +1428,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) | |||
| 1414 | * is inferred from register state. | 1428 | * is inferred from register state. |
| 1415 | */ | 1429 | */ |
| 1416 | for (i = 0; i < meta.access_size; i++) { | 1430 | for (i = 0; i < meta.access_size; i++) { |
| 1417 | err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1); | 1431 | err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1); |
| 1418 | if (err) | 1432 | if (err) |
| 1419 | return err; | 1433 | return err; |
| 1420 | } | 1434 | } |
| @@ -1650,6 +1664,65 @@ static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
| 1650 | return 0; | 1664 | return 0; |
| 1651 | } | 1665 | } |
| 1652 | 1666 | ||
| 1667 | static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env, | ||
| 1668 | struct bpf_insn *insn) | ||
| 1669 | { | ||
| 1670 | struct bpf_reg_state *regs = env->cur_state.regs; | ||
| 1671 | struct bpf_reg_state *dst_reg = ®s[insn->dst_reg]; | ||
| 1672 | struct bpf_reg_state *src_reg = ®s[insn->src_reg]; | ||
| 1673 | u8 opcode = BPF_OP(insn->code); | ||
| 1674 | s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm); | ||
| 1675 | |||
| 1676 | /* BPF_X code with src_reg->type UNKNOWN_VALUE here. */ | ||
| 1677 | if (src_reg->imm > 0 && dst_reg->imm) { | ||
| 1678 | switch (opcode) { | ||
| 1679 | case BPF_ADD: | ||
| 1680 | /* dreg += sreg | ||
| 1681 | * where both have zero upper bits. Adding them | ||
| 1682 | * can only result making one more bit non-zero | ||
| 1683 | * in the larger value. | ||
| 1684 | * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47) | ||
| 1685 | * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47) | ||
| 1686 | */ | ||
| 1687 | dst_reg->imm = min(src_reg->imm, 63 - imm_log2); | ||
| 1688 | dst_reg->imm--; | ||
| 1689 | break; | ||
| 1690 | case BPF_AND: | ||
| 1691 | /* dreg &= sreg | ||
| 1692 | * AND can not extend zero bits only shrink | ||
| 1693 | * Ex. 0x00..00ffffff | ||
| 1694 | * & 0x0f..ffffffff | ||
| 1695 | * ---------------- | ||
| 1696 | * 0x00..00ffffff | ||
| 1697 | */ | ||
| 1698 | dst_reg->imm = max(src_reg->imm, 63 - imm_log2); | ||
| 1699 | break; | ||
| 1700 | case BPF_OR: | ||
| 1701 | /* dreg |= sreg | ||
| 1702 | * OR can only extend zero bits | ||
| 1703 | * Ex. 0x00..00ffffff | ||
| 1704 | * | 0x0f..ffffffff | ||
| 1705 | * ---------------- | ||
| 1706 | * 0x0f..00ffffff | ||
| 1707 | */ | ||
| 1708 | dst_reg->imm = min(src_reg->imm, 63 - imm_log2); | ||
| 1709 | break; | ||
| 1710 | case BPF_SUB: | ||
| 1711 | case BPF_MUL: | ||
| 1712 | case BPF_RSH: | ||
| 1713 | case BPF_LSH: | ||
| 1714 | /* These may be flushed out later */ | ||
| 1715 | default: | ||
| 1716 | mark_reg_unknown_value(regs, insn->dst_reg); | ||
| 1717 | } | ||
| 1718 | } else { | ||
| 1719 | mark_reg_unknown_value(regs, insn->dst_reg); | ||
| 1720 | } | ||
| 1721 | |||
| 1722 | dst_reg->type = UNKNOWN_VALUE; | ||
| 1723 | return 0; | ||
| 1724 | } | ||
| 1725 | |||
| 1653 | static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, | 1726 | static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, |
| 1654 | struct bpf_insn *insn) | 1727 | struct bpf_insn *insn) |
| 1655 | { | 1728 | { |
| @@ -1659,6 +1732,9 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, | |||
| 1659 | u8 opcode = BPF_OP(insn->code); | 1732 | u8 opcode = BPF_OP(insn->code); |
| 1660 | u64 dst_imm = dst_reg->imm; | 1733 | u64 dst_imm = dst_reg->imm; |
| 1661 | 1734 | ||
| 1735 | if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE) | ||
| 1736 | return evaluate_reg_imm_alu_unknown(env, insn); | ||
| 1737 | |||
| 1662 | /* dst_reg->type == CONST_IMM here. Simulate execution of insns | 1738 | /* dst_reg->type == CONST_IMM here. Simulate execution of insns |
| 1663 | * containing ALU ops. Don't care about overflow or negative | 1739 | * containing ALU ops. Don't care about overflow or negative |
| 1664 | * values, just add/sub/... them; registers are in u64. | 1740 | * values, just add/sub/... them; registers are in u64. |
| @@ -1763,10 +1839,24 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, | |||
| 1763 | dst_align = dst_reg->min_align; | 1839 | dst_align = dst_reg->min_align; |
| 1764 | 1840 | ||
| 1765 | /* We don't know anything about what was done to this register, mark it | 1841 | /* We don't know anything about what was done to this register, mark it |
| 1766 | * as unknown. | 1842 | * as unknown. Also, if both derived bounds came from signed/unsigned |
| 1843 | * mixed compares and one side is unbounded, we cannot really do anything | ||
| 1844 | * with them as boundaries cannot be trusted. Thus, arithmetic of two | ||
| 1845 | * regs of such kind will get invalidated bounds on the dst side. | ||
| 1767 | */ | 1846 | */ |
| 1768 | if (min_val == BPF_REGISTER_MIN_RANGE && | 1847 | if ((min_val == BPF_REGISTER_MIN_RANGE && |
| 1769 | max_val == BPF_REGISTER_MAX_RANGE) { | 1848 | max_val == BPF_REGISTER_MAX_RANGE) || |
| 1849 | (BPF_SRC(insn->code) == BPF_X && | ||
| 1850 | ((min_val != BPF_REGISTER_MIN_RANGE && | ||
| 1851 | max_val == BPF_REGISTER_MAX_RANGE) || | ||
| 1852 | (min_val == BPF_REGISTER_MIN_RANGE && | ||
| 1853 | max_val != BPF_REGISTER_MAX_RANGE) || | ||
| 1854 | (dst_reg->min_value != BPF_REGISTER_MIN_RANGE && | ||
| 1855 | dst_reg->max_value == BPF_REGISTER_MAX_RANGE) || | ||
| 1856 | (dst_reg->min_value == BPF_REGISTER_MIN_RANGE && | ||
| 1857 | dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) && | ||
| 1858 | regs[insn->dst_reg].value_from_signed != | ||
| 1859 | regs[insn->src_reg].value_from_signed)) { | ||
| 1770 | reset_reg_range_values(regs, insn->dst_reg); | 1860 | reset_reg_range_values(regs, insn->dst_reg); |
| 1771 | return; | 1861 | return; |
| 1772 | } | 1862 | } |
| @@ -1950,9 +2040,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
| 1950 | */ | 2040 | */ |
| 1951 | regs[insn->dst_reg].type = CONST_IMM; | 2041 | regs[insn->dst_reg].type = CONST_IMM; |
| 1952 | regs[insn->dst_reg].imm = insn->imm; | 2042 | regs[insn->dst_reg].imm = insn->imm; |
| 2043 | regs[insn->dst_reg].id = 0; | ||
| 1953 | regs[insn->dst_reg].max_value = insn->imm; | 2044 | regs[insn->dst_reg].max_value = insn->imm; |
| 1954 | regs[insn->dst_reg].min_value = insn->imm; | 2045 | regs[insn->dst_reg].min_value = insn->imm; |
| 1955 | regs[insn->dst_reg].min_align = calc_align(insn->imm); | 2046 | regs[insn->dst_reg].min_align = calc_align(insn->imm); |
| 2047 | regs[insn->dst_reg].value_from_signed = false; | ||
| 1956 | } | 2048 | } |
| 1957 | 2049 | ||
| 1958 | } else if (opcode > BPF_END) { | 2050 | } else if (opcode > BPF_END) { |
| @@ -2128,40 +2220,63 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, | |||
| 2128 | struct bpf_reg_state *false_reg, u64 val, | 2220 | struct bpf_reg_state *false_reg, u64 val, |
| 2129 | u8 opcode) | 2221 | u8 opcode) |
| 2130 | { | 2222 | { |
| 2223 | bool value_from_signed = true; | ||
| 2224 | bool is_range = true; | ||
| 2225 | |||
| 2131 | switch (opcode) { | 2226 | switch (opcode) { |
| 2132 | case BPF_JEQ: | 2227 | case BPF_JEQ: |
| 2133 | /* If this is false then we know nothing Jon Snow, but if it is | 2228 | /* If this is false then we know nothing Jon Snow, but if it is |
| 2134 | * true then we know for sure. | 2229 | * true then we know for sure. |
| 2135 | */ | 2230 | */ |
| 2136 | true_reg->max_value = true_reg->min_value = val; | 2231 | true_reg->max_value = true_reg->min_value = val; |
| 2232 | is_range = false; | ||
| 2137 | break; | 2233 | break; |
| 2138 | case BPF_JNE: | 2234 | case BPF_JNE: |
| 2139 | /* If this is true we know nothing Jon Snow, but if it is false | 2235 | /* If this is true we know nothing Jon Snow, but if it is false |
| 2140 | * we know the value for sure; | 2236 | * we know the value for sure; |
| 2141 | */ | 2237 | */ |
| 2142 | false_reg->max_value = false_reg->min_value = val; | 2238 | false_reg->max_value = false_reg->min_value = val; |
| 2239 | is_range = false; | ||
| 2143 | break; | 2240 | break; |
| 2144 | case BPF_JGT: | 2241 | case BPF_JGT: |
| 2145 | /* Unsigned comparison, the minimum value is 0. */ | 2242 | value_from_signed = false; |
| 2146 | false_reg->min_value = 0; | ||
| 2147 | /* fallthrough */ | 2243 | /* fallthrough */ |
| 2148 | case BPF_JSGT: | 2244 | case BPF_JSGT: |
| 2245 | if (true_reg->value_from_signed != value_from_signed) | ||
| 2246 | reset_reg_range_values(true_reg, 0); | ||
| 2247 | if (false_reg->value_from_signed != value_from_signed) | ||
| 2248 | reset_reg_range_values(false_reg, 0); | ||
| 2249 | if (opcode == BPF_JGT) { | ||
| 2250 | /* Unsigned comparison, the minimum value is 0. */ | ||
| 2251 | false_reg->min_value = 0; | ||
| 2252 | } | ||
| 2149 | /* If this is false then we know the maximum val is val, | 2253 | /* If this is false then we know the maximum val is val, |
| 2150 | * otherwise we know the min val is val+1. | 2254 | * otherwise we know the min val is val+1. |
| 2151 | */ | 2255 | */ |
| 2152 | false_reg->max_value = val; | 2256 | false_reg->max_value = val; |
| 2257 | false_reg->value_from_signed = value_from_signed; | ||
| 2153 | true_reg->min_value = val + 1; | 2258 | true_reg->min_value = val + 1; |
| 2259 | true_reg->value_from_signed = value_from_signed; | ||
| 2154 | break; | 2260 | break; |
| 2155 | case BPF_JGE: | 2261 | case BPF_JGE: |
| 2156 | /* Unsigned comparison, the minimum value is 0. */ | 2262 | value_from_signed = false; |
| 2157 | false_reg->min_value = 0; | ||
| 2158 | /* fallthrough */ | 2263 | /* fallthrough */ |
| 2159 | case BPF_JSGE: | 2264 | case BPF_JSGE: |
| 2265 | if (true_reg->value_from_signed != value_from_signed) | ||
| 2266 | reset_reg_range_values(true_reg, 0); | ||
| 2267 | if (false_reg->value_from_signed != value_from_signed) | ||
| 2268 | reset_reg_range_values(false_reg, 0); | ||
| 2269 | if (opcode == BPF_JGE) { | ||
| 2270 | /* Unsigned comparison, the minimum value is 0. */ | ||
| 2271 | false_reg->min_value = 0; | ||
| 2272 | } | ||
| 2160 | /* If this is false then we know the maximum value is val - 1, | 2273 | /* If this is false then we know the maximum value is val - 1, |
| 2161 | * otherwise we know the minimum value is val. | 2274 | * otherwise we know the minimum value is val. |
| 2162 | */ | 2275 | */ |
| 2163 | false_reg->max_value = val - 1; | 2276 | false_reg->max_value = val - 1; |
| 2277 | false_reg->value_from_signed = value_from_signed; | ||
| 2164 | true_reg->min_value = val; | 2278 | true_reg->min_value = val; |
| 2279 | true_reg->value_from_signed = value_from_signed; | ||
| 2165 | break; | 2280 | break; |
| 2166 | default: | 2281 | default: |
| 2167 | break; | 2282 | break; |
| @@ -2169,6 +2284,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, | |||
| 2169 | 2284 | ||
| 2170 | check_reg_overflow(false_reg); | 2285 | check_reg_overflow(false_reg); |
| 2171 | check_reg_overflow(true_reg); | 2286 | check_reg_overflow(true_reg); |
| 2287 | if (is_range) { | ||
| 2288 | if (__is_pointer_value(false, false_reg)) | ||
| 2289 | reset_reg_range_values(false_reg, 0); | ||
| 2290 | if (__is_pointer_value(false, true_reg)) | ||
| 2291 | reset_reg_range_values(true_reg, 0); | ||
| 2292 | } | ||
| 2172 | } | 2293 | } |
| 2173 | 2294 | ||
| 2174 | /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg | 2295 | /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg |
| @@ -2178,41 +2299,64 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, | |||
| 2178 | struct bpf_reg_state *false_reg, u64 val, | 2299 | struct bpf_reg_state *false_reg, u64 val, |
| 2179 | u8 opcode) | 2300 | u8 opcode) |
| 2180 | { | 2301 | { |
| 2302 | bool value_from_signed = true; | ||
| 2303 | bool is_range = true; | ||
| 2304 | |||
| 2181 | switch (opcode) { | 2305 | switch (opcode) { |
| 2182 | case BPF_JEQ: | 2306 | case BPF_JEQ: |
| 2183 | /* If this is false then we know nothing Jon Snow, but if it is | 2307 | /* If this is false then we know nothing Jon Snow, but if it is |
| 2184 | * true then we know for sure. | 2308 | * true then we know for sure. |
| 2185 | */ | 2309 | */ |
| 2186 | true_reg->max_value = true_reg->min_value = val; | 2310 | true_reg->max_value = true_reg->min_value = val; |
| 2311 | is_range = false; | ||
| 2187 | break; | 2312 | break; |
| 2188 | case BPF_JNE: | 2313 | case BPF_JNE: |
| 2189 | /* If this is true we know nothing Jon Snow, but if it is false | 2314 | /* If this is true we know nothing Jon Snow, but if it is false |
| 2190 | * we know the value for sure; | 2315 | * we know the value for sure; |
| 2191 | */ | 2316 | */ |
| 2192 | false_reg->max_value = false_reg->min_value = val; | 2317 | false_reg->max_value = false_reg->min_value = val; |
| 2318 | is_range = false; | ||
| 2193 | break; | 2319 | break; |
| 2194 | case BPF_JGT: | 2320 | case BPF_JGT: |
| 2195 | /* Unsigned comparison, the minimum value is 0. */ | 2321 | value_from_signed = false; |
| 2196 | true_reg->min_value = 0; | ||
| 2197 | /* fallthrough */ | 2322 | /* fallthrough */ |
| 2198 | case BPF_JSGT: | 2323 | case BPF_JSGT: |
| 2324 | if (true_reg->value_from_signed != value_from_signed) | ||
| 2325 | reset_reg_range_values(true_reg, 0); | ||
| 2326 | if (false_reg->value_from_signed != value_from_signed) | ||
| 2327 | reset_reg_range_values(false_reg, 0); | ||
| 2328 | if (opcode == BPF_JGT) { | ||
| 2329 | /* Unsigned comparison, the minimum value is 0. */ | ||
| 2330 | true_reg->min_value = 0; | ||
| 2331 | } | ||
| 2199 | /* | 2332 | /* |
| 2200 | * If this is false, then the val is <= the register, if it is | 2333 | * If this is false, then the val is <= the register, if it is |
| 2201 | * true the register <= to the val. | 2334 | * true the register <= to the val. |
| 2202 | */ | 2335 | */ |
| 2203 | false_reg->min_value = val; | 2336 | false_reg->min_value = val; |
| 2337 | false_reg->value_from_signed = value_from_signed; | ||
| 2204 | true_reg->max_value = val - 1; | 2338 | true_reg->max_value = val - 1; |
| 2339 | true_reg->value_from_signed = value_from_signed; | ||
| 2205 | break; | 2340 | break; |
| 2206 | case BPF_JGE: | 2341 | case BPF_JGE: |
| 2207 | /* Unsigned comparison, the minimum value is 0. */ | 2342 | value_from_signed = false; |
| 2208 | true_reg->min_value = 0; | ||
| 2209 | /* fallthrough */ | 2343 | /* fallthrough */ |
| 2210 | case BPF_JSGE: | 2344 | case BPF_JSGE: |
| 2345 | if (true_reg->value_from_signed != value_from_signed) | ||
| 2346 | reset_reg_range_values(true_reg, 0); | ||
| 2347 | if (false_reg->value_from_signed != value_from_signed) | ||
| 2348 | reset_reg_range_values(false_reg, 0); | ||
| 2349 | if (opcode == BPF_JGE) { | ||
| 2350 | /* Unsigned comparison, the minimum value is 0. */ | ||
| 2351 | true_reg->min_value = 0; | ||
| 2352 | } | ||
| 2211 | /* If this is false then constant < register, if it is true then | 2353 | /* If this is false then constant < register, if it is true then |
| 2212 | * the register < constant. | 2354 | * the register < constant. |
| 2213 | */ | 2355 | */ |
| 2214 | false_reg->min_value = val + 1; | 2356 | false_reg->min_value = val + 1; |
| 2357 | false_reg->value_from_signed = value_from_signed; | ||
| 2215 | true_reg->max_value = val; | 2358 | true_reg->max_value = val; |
| 2359 | true_reg->value_from_signed = value_from_signed; | ||
| 2216 | break; | 2360 | break; |
| 2217 | default: | 2361 | default: |
| 2218 | break; | 2362 | break; |
| @@ -2220,6 +2364,12 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, | |||
| 2220 | 2364 | ||
| 2221 | check_reg_overflow(false_reg); | 2365 | check_reg_overflow(false_reg); |
| 2222 | check_reg_overflow(true_reg); | 2366 | check_reg_overflow(true_reg); |
| 2367 | if (is_range) { | ||
| 2368 | if (__is_pointer_value(false, false_reg)) | ||
| 2369 | reset_reg_range_values(false_reg, 0); | ||
| 2370 | if (__is_pointer_value(false, true_reg)) | ||
| 2371 | reset_reg_range_values(true_reg, 0); | ||
| 2372 | } | ||
| 2223 | } | 2373 | } |
| 2224 | 2374 | ||
| 2225 | static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, | 2375 | static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, |
| @@ -2407,6 +2557,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
| 2407 | 2557 | ||
| 2408 | regs[insn->dst_reg].type = CONST_IMM; | 2558 | regs[insn->dst_reg].type = CONST_IMM; |
| 2409 | regs[insn->dst_reg].imm = imm; | 2559 | regs[insn->dst_reg].imm = imm; |
| 2560 | regs[insn->dst_reg].id = 0; | ||
| 2410 | return 0; | 2561 | return 0; |
| 2411 | } | 2562 | } |
| 2412 | 2563 | ||
| @@ -2826,6 +2977,8 @@ static bool states_equal(struct bpf_verifier_env *env, | |||
| 2826 | return false; | 2977 | return false; |
| 2827 | if (i % BPF_REG_SIZE) | 2978 | if (i % BPF_REG_SIZE) |
| 2828 | continue; | 2979 | continue; |
| 2980 | if (old->stack_slot_type[i] != STACK_SPILL) | ||
| 2981 | continue; | ||
| 2829 | if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], | 2982 | if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], |
| 2830 | &cur->spilled_regs[i / BPF_REG_SIZE], | 2983 | &cur->spilled_regs[i / BPF_REG_SIZE], |
| 2831 | sizeof(old->spilled_regs[0]))) | 2984 | sizeof(old->spilled_regs[0]))) |
| @@ -2987,18 +3140,12 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 2987 | /* check that memory (src_reg + off) is readable, | 3140 | /* check that memory (src_reg + off) is readable, |
| 2988 | * the state of dst_reg will be updated by this func | 3141 | * the state of dst_reg will be updated by this func |
| 2989 | */ | 3142 | */ |
| 2990 | err = check_mem_access(env, insn->src_reg, insn->off, | 3143 | err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, |
| 2991 | BPF_SIZE(insn->code), BPF_READ, | 3144 | BPF_SIZE(insn->code), BPF_READ, |
| 2992 | insn->dst_reg); | 3145 | insn->dst_reg); |
| 2993 | if (err) | 3146 | if (err) |
| 2994 | return err; | 3147 | return err; |
| 2995 | 3148 | ||
| 2996 | if (BPF_SIZE(insn->code) != BPF_W && | ||
| 2997 | BPF_SIZE(insn->code) != BPF_DW) { | ||
| 2998 | insn_idx++; | ||
| 2999 | continue; | ||
| 3000 | } | ||
| 3001 | |||
| 3002 | prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; | 3149 | prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; |
| 3003 | 3150 | ||
| 3004 | if (*prev_src_type == NOT_INIT) { | 3151 | if (*prev_src_type == NOT_INIT) { |
| @@ -3026,7 +3173,7 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 3026 | enum bpf_reg_type *prev_dst_type, dst_reg_type; | 3173 | enum bpf_reg_type *prev_dst_type, dst_reg_type; |
| 3027 | 3174 | ||
| 3028 | if (BPF_MODE(insn->code) == BPF_XADD) { | 3175 | if (BPF_MODE(insn->code) == BPF_XADD) { |
| 3029 | err = check_xadd(env, insn); | 3176 | err = check_xadd(env, insn_idx, insn); |
| 3030 | if (err) | 3177 | if (err) |
| 3031 | return err; | 3178 | return err; |
| 3032 | insn_idx++; | 3179 | insn_idx++; |
| @@ -3045,7 +3192,7 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 3045 | dst_reg_type = regs[insn->dst_reg].type; | 3192 | dst_reg_type = regs[insn->dst_reg].type; |
| 3046 | 3193 | ||
| 3047 | /* check that memory (dst_reg + off) is writeable */ | 3194 | /* check that memory (dst_reg + off) is writeable */ |
| 3048 | err = check_mem_access(env, insn->dst_reg, insn->off, | 3195 | err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, |
| 3049 | BPF_SIZE(insn->code), BPF_WRITE, | 3196 | BPF_SIZE(insn->code), BPF_WRITE, |
| 3050 | insn->src_reg); | 3197 | insn->src_reg); |
| 3051 | if (err) | 3198 | if (err) |
| @@ -3074,7 +3221,7 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 3074 | return err; | 3221 | return err; |
| 3075 | 3222 | ||
| 3076 | /* check that memory (dst_reg + off) is writeable */ | 3223 | /* check that memory (dst_reg + off) is writeable */ |
| 3077 | err = check_mem_access(env, insn->dst_reg, insn->off, | 3224 | err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, |
| 3078 | BPF_SIZE(insn->code), BPF_WRITE, | 3225 | BPF_SIZE(insn->code), BPF_WRITE, |
| 3079 | -1); | 3226 | -1); |
| 3080 | if (err) | 3227 | if (err) |
| @@ -3172,7 +3319,8 @@ process_bpf_exit: | |||
| 3172 | insn_idx++; | 3319 | insn_idx++; |
| 3173 | } | 3320 | } |
| 3174 | 3321 | ||
| 3175 | verbose("processed %d insns\n", insn_processed); | 3322 | verbose("processed %d insns, stack depth %d\n", |
| 3323 | insn_processed, env->prog->aux->stack_depth); | ||
| 3176 | return 0; | 3324 | return 0; |
| 3177 | } | 3325 | } |
| 3178 | 3326 | ||
| @@ -3372,11 +3520,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of | |||
| 3372 | static int convert_ctx_accesses(struct bpf_verifier_env *env) | 3520 | static int convert_ctx_accesses(struct bpf_verifier_env *env) |
| 3373 | { | 3521 | { |
| 3374 | const struct bpf_verifier_ops *ops = env->prog->aux->ops; | 3522 | const struct bpf_verifier_ops *ops = env->prog->aux->ops; |
| 3523 | int i, cnt, size, ctx_field_size, delta = 0; | ||
| 3375 | const int insn_cnt = env->prog->len; | 3524 | const int insn_cnt = env->prog->len; |
| 3376 | struct bpf_insn insn_buf[16], *insn; | 3525 | struct bpf_insn insn_buf[16], *insn; |
| 3377 | struct bpf_prog *new_prog; | 3526 | struct bpf_prog *new_prog; |
| 3378 | enum bpf_access_type type; | 3527 | enum bpf_access_type type; |
| 3379 | int i, cnt, delta = 0; | 3528 | bool is_narrower_load; |
| 3529 | u32 target_size; | ||
| 3380 | 3530 | ||
| 3381 | if (ops->gen_prologue) { | 3531 | if (ops->gen_prologue) { |
| 3382 | cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, | 3532 | cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, |
| @@ -3416,12 +3566,52 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) | |||
| 3416 | if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) | 3566 | if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) |
| 3417 | continue; | 3567 | continue; |
| 3418 | 3568 | ||
| 3419 | cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); | 3569 | ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; |
| 3420 | if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { | 3570 | size = BPF_LDST_BYTES(insn); |
| 3571 | |||
| 3572 | /* If the read access is a narrower load of the field, | ||
| 3573 | * convert to a 4/8-byte load, to minimum program type specific | ||
| 3574 | * convert_ctx_access changes. If conversion is successful, | ||
| 3575 | * we will apply proper mask to the result. | ||
| 3576 | */ | ||
| 3577 | is_narrower_load = size < ctx_field_size; | ||
| 3578 | if (is_narrower_load) { | ||
| 3579 | u32 off = insn->off; | ||
| 3580 | u8 size_code; | ||
| 3581 | |||
| 3582 | if (type == BPF_WRITE) { | ||
| 3583 | verbose("bpf verifier narrow ctx access misconfigured\n"); | ||
| 3584 | return -EINVAL; | ||
| 3585 | } | ||
| 3586 | |||
| 3587 | size_code = BPF_H; | ||
| 3588 | if (ctx_field_size == 4) | ||
| 3589 | size_code = BPF_W; | ||
| 3590 | else if (ctx_field_size == 8) | ||
| 3591 | size_code = BPF_DW; | ||
| 3592 | |||
| 3593 | insn->off = off & ~(ctx_field_size - 1); | ||
| 3594 | insn->code = BPF_LDX | BPF_MEM | size_code; | ||
| 3595 | } | ||
| 3596 | |||
| 3597 | target_size = 0; | ||
| 3598 | cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog, | ||
| 3599 | &target_size); | ||
| 3600 | if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || | ||
| 3601 | (ctx_field_size && !target_size)) { | ||
| 3421 | verbose("bpf verifier is misconfigured\n"); | 3602 | verbose("bpf verifier is misconfigured\n"); |
| 3422 | return -EINVAL; | 3603 | return -EINVAL; |
| 3423 | } | 3604 | } |
| 3424 | 3605 | ||
| 3606 | if (is_narrower_load && size < target_size) { | ||
| 3607 | if (ctx_field_size <= 4) | ||
| 3608 | insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, | ||
| 3609 | (1 << size * 8) - 1); | ||
| 3610 | else | ||
| 3611 | insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, | ||
| 3612 | (1 << size * 8) - 1); | ||
| 3613 | } | ||
| 3614 | |||
| 3425 | new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); | 3615 | new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); |
| 3426 | if (!new_prog) | 3616 | if (!new_prog) |
| 3427 | return -ENOMEM; | 3617 | return -ENOMEM; |
| @@ -3467,6 +3657,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) | |||
| 3467 | * the program array. | 3657 | * the program array. |
| 3468 | */ | 3658 | */ |
| 3469 | prog->cb_access = 1; | 3659 | prog->cb_access = 1; |
| 3660 | env->prog->aux->stack_depth = MAX_BPF_STACK; | ||
| 3470 | 3661 | ||
| 3471 | /* mark bpf_tail_call as different opcode to avoid | 3662 | /* mark bpf_tail_call as different opcode to avoid |
| 3472 | * conditional branch in the interpeter for every normal | 3663 | * conditional branch in the interpeter for every normal |
| @@ -3474,7 +3665,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) | |||
| 3474 | * that doesn't support bpf_tail_call yet | 3665 | * that doesn't support bpf_tail_call yet |
| 3475 | */ | 3666 | */ |
| 3476 | insn->imm = 0; | 3667 | insn->imm = 0; |
| 3477 | insn->code |= BPF_X; | 3668 | insn->code = BPF_JMP | BPF_TAIL_CALL; |
| 3478 | continue; | 3669 | continue; |
| 3479 | } | 3670 | } |
| 3480 | 3671 | ||
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 387348a40c64..ce693ccb8c58 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile | |||
| @@ -4,3 +4,4 @@ obj-$(CONFIG_CGROUP_FREEZER) += freezer.o | |||
| 4 | obj-$(CONFIG_CGROUP_PIDS) += pids.o | 4 | obj-$(CONFIG_CGROUP_PIDS) += pids.o |
| 5 | obj-$(CONFIG_CGROUP_RDMA) += rdma.o | 5 | obj-$(CONFIG_CGROUP_RDMA) += rdma.o |
| 6 | obj-$(CONFIG_CPUSETS) += cpuset.o | 6 | obj-$(CONFIG_CPUSETS) += cpuset.o |
| 7 | obj-$(CONFIG_CGROUP_DEBUG) += debug.o | ||
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 00f4d6bf048f..793565c05742 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h | |||
| @@ -192,6 +192,8 @@ int cgroup_rmdir(struct kernfs_node *kn); | |||
| 192 | int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, | 192 | int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, |
| 193 | struct kernfs_root *kf_root); | 193 | struct kernfs_root *kf_root); |
| 194 | 194 | ||
| 195 | int cgroup_task_count(const struct cgroup *cgrp); | ||
| 196 | |||
| 195 | /* | 197 | /* |
| 196 | * namespace.c | 198 | * namespace.c |
| 197 | */ | 199 | */ |
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 85d75152402d..7bf4b1533f34 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c | |||
| @@ -334,19 +334,15 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, | |||
| 334 | /** | 334 | /** |
| 335 | * cgroup_task_count - count the number of tasks in a cgroup. | 335 | * cgroup_task_count - count the number of tasks in a cgroup. |
| 336 | * @cgrp: the cgroup in question | 336 | * @cgrp: the cgroup in question |
| 337 | * | ||
| 338 | * Return the number of tasks in the cgroup. The returned number can be | ||
| 339 | * higher than the actual number of tasks due to css_set references from | ||
| 340 | * namespace roots and temporary usages. | ||
| 341 | */ | 337 | */ |
| 342 | static int cgroup_task_count(const struct cgroup *cgrp) | 338 | int cgroup_task_count(const struct cgroup *cgrp) |
| 343 | { | 339 | { |
| 344 | int count = 0; | 340 | int count = 0; |
| 345 | struct cgrp_cset_link *link; | 341 | struct cgrp_cset_link *link; |
| 346 | 342 | ||
| 347 | spin_lock_irq(&css_set_lock); | 343 | spin_lock_irq(&css_set_lock); |
| 348 | list_for_each_entry(link, &cgrp->cset_links, cset_link) | 344 | list_for_each_entry(link, &cgrp->cset_links, cset_link) |
| 349 | count += refcount_read(&link->cset->refcount); | 345 | count += link->cset->nr_tasks; |
| 350 | spin_unlock_irq(&css_set_lock); | 346 | spin_unlock_irq(&css_set_lock); |
| 351 | return count; | 347 | return count; |
| 352 | } | 348 | } |
| @@ -1263,150 +1259,3 @@ static int __init cgroup_no_v1(char *str) | |||
| 1263 | return 1; | 1259 | return 1; |
| 1264 | } | 1260 | } |
| 1265 | __setup("cgroup_no_v1=", cgroup_no_v1); | 1261 | __setup("cgroup_no_v1=", cgroup_no_v1); |
| 1266 | |||
| 1267 | |||
| 1268 | #ifdef CONFIG_CGROUP_DEBUG | ||
| 1269 | static struct cgroup_subsys_state * | ||
| 1270 | debug_css_alloc(struct cgroup_subsys_state *parent_css) | ||
| 1271 | { | ||
| 1272 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | ||
| 1273 | |||
| 1274 | if (!css) | ||
| 1275 | return ERR_PTR(-ENOMEM); | ||
| 1276 | |||
| 1277 | return css; | ||
| 1278 | } | ||
| 1279 | |||
| 1280 | static void debug_css_free(struct cgroup_subsys_state *css) | ||
| 1281 | { | ||
| 1282 | kfree(css); | ||
| 1283 | } | ||
| 1284 | |||
| 1285 | static u64 debug_taskcount_read(struct cgroup_subsys_state *css, | ||
| 1286 | struct cftype *cft) | ||
| 1287 | { | ||
| 1288 | return cgroup_task_count(css->cgroup); | ||
| 1289 | } | ||
| 1290 | |||
| 1291 | static u64 current_css_set_read(struct cgroup_subsys_state *css, | ||
| 1292 | struct cftype *cft) | ||
| 1293 | { | ||
| 1294 | return (u64)(unsigned long)current->cgroups; | ||
| 1295 | } | ||
| 1296 | |||
| 1297 | static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, | ||
| 1298 | struct cftype *cft) | ||
| 1299 | { | ||
| 1300 | u64 count; | ||
| 1301 | |||
| 1302 | rcu_read_lock(); | ||
| 1303 | count = refcount_read(&task_css_set(current)->refcount); | ||
| 1304 | rcu_read_unlock(); | ||
| 1305 | return count; | ||
| 1306 | } | ||
| 1307 | |||
| 1308 | static int current_css_set_cg_links_read(struct seq_file *seq, void *v) | ||
| 1309 | { | ||
| 1310 | struct cgrp_cset_link *link; | ||
| 1311 | struct css_set *cset; | ||
| 1312 | char *name_buf; | ||
| 1313 | |||
| 1314 | name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); | ||
| 1315 | if (!name_buf) | ||
| 1316 | return -ENOMEM; | ||
| 1317 | |||
| 1318 | spin_lock_irq(&css_set_lock); | ||
| 1319 | rcu_read_lock(); | ||
| 1320 | cset = rcu_dereference(current->cgroups); | ||
| 1321 | list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { | ||
| 1322 | struct cgroup *c = link->cgrp; | ||
| 1323 | |||
| 1324 | cgroup_name(c, name_buf, NAME_MAX + 1); | ||
| 1325 | seq_printf(seq, "Root %d group %s\n", | ||
| 1326 | c->root->hierarchy_id, name_buf); | ||
| 1327 | } | ||
| 1328 | rcu_read_unlock(); | ||
| 1329 | spin_unlock_irq(&css_set_lock); | ||
| 1330 | kfree(name_buf); | ||
| 1331 | return 0; | ||
| 1332 | } | ||
| 1333 | |||
| 1334 | #define MAX_TASKS_SHOWN_PER_CSS 25 | ||
| 1335 | static int cgroup_css_links_read(struct seq_file *seq, void *v) | ||
| 1336 | { | ||
| 1337 | struct cgroup_subsys_state *css = seq_css(seq); | ||
| 1338 | struct cgrp_cset_link *link; | ||
| 1339 | |||
| 1340 | spin_lock_irq(&css_set_lock); | ||
| 1341 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { | ||
| 1342 | struct css_set *cset = link->cset; | ||
| 1343 | struct task_struct *task; | ||
| 1344 | int count = 0; | ||
| 1345 | |||
| 1346 | seq_printf(seq, "css_set %pK\n", cset); | ||
| 1347 | |||
| 1348 | list_for_each_entry(task, &cset->tasks, cg_list) { | ||
| 1349 | if (count++ > MAX_TASKS_SHOWN_PER_CSS) | ||
| 1350 | goto overflow; | ||
| 1351 | seq_printf(seq, " task %d\n", task_pid_vnr(task)); | ||
| 1352 | } | ||
| 1353 | |||
| 1354 | list_for_each_entry(task, &cset->mg_tasks, cg_list) { | ||
| 1355 | if (count++ > MAX_TASKS_SHOWN_PER_CSS) | ||
| 1356 | goto overflow; | ||
| 1357 | seq_printf(seq, " task %d\n", task_pid_vnr(task)); | ||
| 1358 | } | ||
| 1359 | continue; | ||
| 1360 | overflow: | ||
| 1361 | seq_puts(seq, " ...\n"); | ||
| 1362 | } | ||
| 1363 | spin_unlock_irq(&css_set_lock); | ||
| 1364 | return 0; | ||
| 1365 | } | ||
| 1366 | |||
| 1367 | static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) | ||
| 1368 | { | ||
| 1369 | return (!cgroup_is_populated(css->cgroup) && | ||
| 1370 | !css_has_online_children(&css->cgroup->self)); | ||
| 1371 | } | ||
| 1372 | |||
| 1373 | static struct cftype debug_files[] = { | ||
| 1374 | { | ||
| 1375 | .name = "taskcount", | ||
| 1376 | .read_u64 = debug_taskcount_read, | ||
| 1377 | }, | ||
| 1378 | |||
| 1379 | { | ||
| 1380 | .name = "current_css_set", | ||
| 1381 | .read_u64 = current_css_set_read, | ||
| 1382 | }, | ||
| 1383 | |||
| 1384 | { | ||
| 1385 | .name = "current_css_set_refcount", | ||
| 1386 | .read_u64 = current_css_set_refcount_read, | ||
| 1387 | }, | ||
| 1388 | |||
| 1389 | { | ||
| 1390 | .name = "current_css_set_cg_links", | ||
| 1391 | .seq_show = current_css_set_cg_links_read, | ||
| 1392 | }, | ||
| 1393 | |||
| 1394 | { | ||
| 1395 | .name = "cgroup_css_links", | ||
| 1396 | .seq_show = cgroup_css_links_read, | ||
| 1397 | }, | ||
| 1398 | |||
| 1399 | { | ||
| 1400 | .name = "releasable", | ||
| 1401 | .read_u64 = releasable_read, | ||
| 1402 | }, | ||
| 1403 | |||
| 1404 | { } /* terminate */ | ||
| 1405 | }; | ||
| 1406 | |||
| 1407 | struct cgroup_subsys debug_cgrp_subsys = { | ||
| 1408 | .css_alloc = debug_css_alloc, | ||
| 1409 | .css_free = debug_css_free, | ||
| 1410 | .legacy_cftypes = debug_files, | ||
| 1411 | }; | ||
| 1412 | #endif /* CONFIG_CGROUP_DEBUG */ | ||
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8d4e85eae42c..620794a20a33 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c | |||
| @@ -573,6 +573,11 @@ static int css_set_count = 1; /* 1 for init_css_set */ | |||
| 573 | /** | 573 | /** |
| 574 | * css_set_populated - does a css_set contain any tasks? | 574 | * css_set_populated - does a css_set contain any tasks? |
| 575 | * @cset: target css_set | 575 | * @cset: target css_set |
| 576 | * | ||
| 577 | * css_set_populated() should be the same as !!cset->nr_tasks at steady | ||
| 578 | * state. However, css_set_populated() can be called while a task is being | ||
| 579 | * added to or removed from the linked list before the nr_tasks is | ||
| 580 | * properly updated. Hence, we can't just look at ->nr_tasks here. | ||
| 576 | */ | 581 | */ |
| 577 | static bool css_set_populated(struct css_set *cset) | 582 | static bool css_set_populated(struct css_set *cset) |
| 578 | { | 583 | { |
| @@ -1542,10 +1547,56 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, | |||
| 1542 | return len; | 1547 | return len; |
| 1543 | } | 1548 | } |
| 1544 | 1549 | ||
| 1550 | static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) | ||
| 1551 | { | ||
| 1552 | char *token; | ||
| 1553 | |||
| 1554 | *root_flags = 0; | ||
| 1555 | |||
| 1556 | if (!data) | ||
| 1557 | return 0; | ||
| 1558 | |||
| 1559 | while ((token = strsep(&data, ",")) != NULL) { | ||
| 1560 | if (!strcmp(token, "nsdelegate")) { | ||
| 1561 | *root_flags |= CGRP_ROOT_NS_DELEGATE; | ||
| 1562 | continue; | ||
| 1563 | } | ||
| 1564 | |||
| 1565 | pr_err("cgroup2: unknown option \"%s\"\n", token); | ||
| 1566 | return -EINVAL; | ||
| 1567 | } | ||
| 1568 | |||
| 1569 | return 0; | ||
| 1570 | } | ||
| 1571 | |||
| 1572 | static void apply_cgroup_root_flags(unsigned int root_flags) | ||
| 1573 | { | ||
| 1574 | if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { | ||
| 1575 | if (root_flags & CGRP_ROOT_NS_DELEGATE) | ||
| 1576 | cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; | ||
| 1577 | else | ||
| 1578 | cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; | ||
| 1579 | } | ||
| 1580 | } | ||
| 1581 | |||
| 1582 | static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) | ||
| 1583 | { | ||
| 1584 | if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) | ||
| 1585 | seq_puts(seq, ",nsdelegate"); | ||
| 1586 | return 0; | ||
| 1587 | } | ||
| 1588 | |||
| 1545 | static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) | 1589 | static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) |
| 1546 | { | 1590 | { |
| 1547 | pr_err("remount is not allowed\n"); | 1591 | unsigned int root_flags; |
| 1548 | return -EINVAL; | 1592 | int ret; |
| 1593 | |||
| 1594 | ret = parse_cgroup_root_flags(data, &root_flags); | ||
| 1595 | if (ret) | ||
| 1596 | return ret; | ||
| 1597 | |||
| 1598 | apply_cgroup_root_flags(root_flags); | ||
| 1599 | return 0; | ||
| 1549 | } | 1600 | } |
| 1550 | 1601 | ||
| 1551 | /* | 1602 | /* |
| @@ -1598,6 +1649,7 @@ static void cgroup_enable_task_cg_lists(void) | |||
| 1598 | css_set_update_populated(cset, true); | 1649 | css_set_update_populated(cset, true); |
| 1599 | list_add_tail(&p->cg_list, &cset->tasks); | 1650 | list_add_tail(&p->cg_list, &cset->tasks); |
| 1600 | get_css_set(cset); | 1651 | get_css_set(cset); |
| 1652 | cset->nr_tasks++; | ||
| 1601 | } | 1653 | } |
| 1602 | spin_unlock(&p->sighand->siglock); | 1654 | spin_unlock(&p->sighand->siglock); |
| 1603 | } while_each_thread(g, p); | 1655 | } while_each_thread(g, p); |
| @@ -1784,6 +1836,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1784 | { | 1836 | { |
| 1785 | struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; | 1837 | struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; |
| 1786 | struct dentry *dentry; | 1838 | struct dentry *dentry; |
| 1839 | int ret; | ||
| 1787 | 1840 | ||
| 1788 | get_cgroup_ns(ns); | 1841 | get_cgroup_ns(ns); |
| 1789 | 1842 | ||
| @@ -1801,16 +1854,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1801 | cgroup_enable_task_cg_lists(); | 1854 | cgroup_enable_task_cg_lists(); |
| 1802 | 1855 | ||
| 1803 | if (fs_type == &cgroup2_fs_type) { | 1856 | if (fs_type == &cgroup2_fs_type) { |
| 1804 | if (data) { | 1857 | unsigned int root_flags; |
| 1805 | pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); | 1858 | |
| 1859 | ret = parse_cgroup_root_flags(data, &root_flags); | ||
| 1860 | if (ret) { | ||
| 1806 | put_cgroup_ns(ns); | 1861 | put_cgroup_ns(ns); |
| 1807 | return ERR_PTR(-EINVAL); | 1862 | return ERR_PTR(ret); |
| 1808 | } | 1863 | } |
| 1864 | |||
| 1809 | cgrp_dfl_visible = true; | 1865 | cgrp_dfl_visible = true; |
| 1810 | cgroup_get_live(&cgrp_dfl_root.cgrp); | 1866 | cgroup_get_live(&cgrp_dfl_root.cgrp); |
| 1811 | 1867 | ||
| 1812 | dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, | 1868 | dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, |
| 1813 | CGROUP2_SUPER_MAGIC, ns); | 1869 | CGROUP2_SUPER_MAGIC, ns); |
| 1870 | if (!IS_ERR(dentry)) | ||
| 1871 | apply_cgroup_root_flags(root_flags); | ||
| 1814 | } else { | 1872 | } else { |
| 1815 | dentry = cgroup1_mount(&cgroup_fs_type, flags, data, | 1873 | dentry = cgroup1_mount(&cgroup_fs_type, flags, data, |
| 1816 | CGROUP_SUPER_MAGIC, ns); | 1874 | CGROUP_SUPER_MAGIC, ns); |
| @@ -2064,8 +2122,10 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) | |||
| 2064 | struct css_set *to_cset = cset->mg_dst_cset; | 2122 | struct css_set *to_cset = cset->mg_dst_cset; |
| 2065 | 2123 | ||
| 2066 | get_css_set(to_cset); | 2124 | get_css_set(to_cset); |
| 2125 | to_cset->nr_tasks++; | ||
| 2067 | css_set_move_task(task, from_cset, to_cset, true); | 2126 | css_set_move_task(task, from_cset, to_cset, true); |
| 2068 | put_css_set_locked(from_cset); | 2127 | put_css_set_locked(from_cset); |
| 2128 | from_cset->nr_tasks--; | ||
| 2069 | } | 2129 | } |
| 2070 | } | 2130 | } |
| 2071 | spin_unlock_irq(&css_set_lock); | 2131 | spin_unlock_irq(&css_set_lock); |
| @@ -2355,27 +2415,14 @@ static int cgroup_procs_write_permission(struct task_struct *task, | |||
| 2355 | struct cgroup *dst_cgrp, | 2415 | struct cgroup *dst_cgrp, |
| 2356 | struct kernfs_open_file *of) | 2416 | struct kernfs_open_file *of) |
| 2357 | { | 2417 | { |
| 2358 | int ret = 0; | 2418 | struct super_block *sb = of->file->f_path.dentry->d_sb; |
| 2359 | 2419 | struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; | |
| 2360 | if (cgroup_on_dfl(dst_cgrp)) { | 2420 | struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp; |
| 2361 | struct super_block *sb = of->file->f_path.dentry->d_sb; | 2421 | struct cgroup *src_cgrp, *com_cgrp; |
| 2362 | struct cgroup *cgrp; | 2422 | struct inode *inode; |
| 2363 | struct inode *inode; | 2423 | int ret; |
| 2364 | |||
| 2365 | spin_lock_irq(&css_set_lock); | ||
| 2366 | cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); | ||
| 2367 | spin_unlock_irq(&css_set_lock); | ||
| 2368 | |||
| 2369 | while (!cgroup_is_descendant(dst_cgrp, cgrp)) | ||
| 2370 | cgrp = cgroup_parent(cgrp); | ||
| 2371 | 2424 | ||
| 2372 | ret = -ENOMEM; | 2425 | if (!cgroup_on_dfl(dst_cgrp)) { |
| 2373 | inode = kernfs_get_inode(sb, cgrp->procs_file.kn); | ||
| 2374 | if (inode) { | ||
| 2375 | ret = inode_permission(inode, MAY_WRITE); | ||
| 2376 | iput(inode); | ||
| 2377 | } | ||
| 2378 | } else { | ||
| 2379 | const struct cred *cred = current_cred(); | 2426 | const struct cred *cred = current_cred(); |
| 2380 | const struct cred *tcred = get_task_cred(task); | 2427 | const struct cred *tcred = get_task_cred(task); |
| 2381 | 2428 | ||
| @@ -2383,14 +2430,47 @@ static int cgroup_procs_write_permission(struct task_struct *task, | |||
| 2383 | * even if we're attaching all tasks in the thread group, | 2430 | * even if we're attaching all tasks in the thread group, |
| 2384 | * we only need to check permissions on one of them. | 2431 | * we only need to check permissions on one of them. |
| 2385 | */ | 2432 | */ |
| 2386 | if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && | 2433 | if (uid_eq(cred->euid, GLOBAL_ROOT_UID) || |
| 2387 | !uid_eq(cred->euid, tcred->uid) && | 2434 | uid_eq(cred->euid, tcred->uid) || |
| 2388 | !uid_eq(cred->euid, tcred->suid)) | 2435 | uid_eq(cred->euid, tcred->suid)) |
| 2436 | ret = 0; | ||
| 2437 | else | ||
| 2389 | ret = -EACCES; | 2438 | ret = -EACCES; |
| 2439 | |||
| 2390 | put_cred(tcred); | 2440 | put_cred(tcred); |
| 2441 | return ret; | ||
| 2391 | } | 2442 | } |
| 2392 | 2443 | ||
| 2393 | return ret; | 2444 | /* find the source cgroup */ |
| 2445 | spin_lock_irq(&css_set_lock); | ||
| 2446 | src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); | ||
| 2447 | spin_unlock_irq(&css_set_lock); | ||
| 2448 | |||
| 2449 | /* and the common ancestor */ | ||
| 2450 | com_cgrp = src_cgrp; | ||
| 2451 | while (!cgroup_is_descendant(dst_cgrp, com_cgrp)) | ||
| 2452 | com_cgrp = cgroup_parent(com_cgrp); | ||
| 2453 | |||
| 2454 | /* %current should be authorized to migrate to the common ancestor */ | ||
| 2455 | inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); | ||
| 2456 | if (!inode) | ||
| 2457 | return -ENOMEM; | ||
| 2458 | |||
| 2459 | ret = inode_permission(inode, MAY_WRITE); | ||
| 2460 | iput(inode); | ||
| 2461 | if (ret) | ||
| 2462 | return ret; | ||
| 2463 | |||
| 2464 | /* | ||
| 2465 | * If namespaces are delegation boundaries, %current must be able | ||
| 2466 | * to see both source and destination cgroups from its namespace. | ||
| 2467 | */ | ||
| 2468 | if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) && | ||
| 2469 | (!cgroup_is_descendant(src_cgrp, root_cgrp) || | ||
| 2470 | !cgroup_is_descendant(dst_cgrp, root_cgrp))) | ||
| 2471 | return -ENOENT; | ||
| 2472 | |||
| 2473 | return 0; | ||
| 2394 | } | 2474 | } |
| 2395 | 2475 | ||
| 2396 | /* | 2476 | /* |
| @@ -2954,11 +3034,23 @@ static void cgroup_file_release(struct kernfs_open_file *of) | |||
| 2954 | static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, | 3034 | static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, |
| 2955 | size_t nbytes, loff_t off) | 3035 | size_t nbytes, loff_t off) |
| 2956 | { | 3036 | { |
| 3037 | struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; | ||
| 2957 | struct cgroup *cgrp = of->kn->parent->priv; | 3038 | struct cgroup *cgrp = of->kn->parent->priv; |
| 2958 | struct cftype *cft = of->kn->priv; | 3039 | struct cftype *cft = of->kn->priv; |
| 2959 | struct cgroup_subsys_state *css; | 3040 | struct cgroup_subsys_state *css; |
| 2960 | int ret; | 3041 | int ret; |
| 2961 | 3042 | ||
| 3043 | /* | ||
| 3044 | * If namespaces are delegation boundaries, disallow writes to | ||
| 3045 | * files in an non-init namespace root from inside the namespace | ||
| 3046 | * except for the files explicitly marked delegatable - | ||
| 3047 | * cgroup.procs and cgroup.subtree_control. | ||
| 3048 | */ | ||
| 3049 | if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && | ||
| 3050 | !(cft->flags & CFTYPE_NS_DELEGATABLE) && | ||
| 3051 | ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) | ||
| 3052 | return -EPERM; | ||
| 3053 | |||
| 2962 | if (cft->write) | 3054 | if (cft->write) |
| 2963 | return cft->write(of, buf, nbytes, off); | 3055 | return cft->write(of, buf, nbytes, off); |
| 2964 | 3056 | ||
| @@ -3792,6 +3884,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v) | |||
| 3792 | static struct cftype cgroup_base_files[] = { | 3884 | static struct cftype cgroup_base_files[] = { |
| 3793 | { | 3885 | { |
| 3794 | .name = "cgroup.procs", | 3886 | .name = "cgroup.procs", |
| 3887 | .flags = CFTYPE_NS_DELEGATABLE, | ||
| 3795 | .file_offset = offsetof(struct cgroup, procs_file), | 3888 | .file_offset = offsetof(struct cgroup, procs_file), |
| 3796 | .release = cgroup_procs_release, | 3889 | .release = cgroup_procs_release, |
| 3797 | .seq_start = cgroup_procs_start, | 3890 | .seq_start = cgroup_procs_start, |
| @@ -3805,6 +3898,7 @@ static struct cftype cgroup_base_files[] = { | |||
| 3805 | }, | 3898 | }, |
| 3806 | { | 3899 | { |
| 3807 | .name = "cgroup.subtree_control", | 3900 | .name = "cgroup.subtree_control", |
| 3901 | .flags = CFTYPE_NS_DELEGATABLE, | ||
| 3808 | .seq_show = cgroup_subtree_control_show, | 3902 | .seq_show = cgroup_subtree_control_show, |
| 3809 | .write = cgroup_subtree_control_write, | 3903 | .write = cgroup_subtree_control_write, |
| 3810 | }, | 3904 | }, |
| @@ -4393,6 +4487,7 @@ int cgroup_rmdir(struct kernfs_node *kn) | |||
| 4393 | } | 4487 | } |
| 4394 | 4488 | ||
| 4395 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { | 4489 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { |
| 4490 | .show_options = cgroup_show_options, | ||
| 4396 | .remount_fs = cgroup_remount, | 4491 | .remount_fs = cgroup_remount, |
| 4397 | .mkdir = cgroup_mkdir, | 4492 | .mkdir = cgroup_mkdir, |
| 4398 | .rmdir = cgroup_rmdir, | 4493 | .rmdir = cgroup_rmdir, |
| @@ -4789,6 +4884,7 @@ void cgroup_post_fork(struct task_struct *child) | |||
| 4789 | cset = task_css_set(current); | 4884 | cset = task_css_set(current); |
| 4790 | if (list_empty(&child->cg_list)) { | 4885 | if (list_empty(&child->cg_list)) { |
| 4791 | get_css_set(cset); | 4886 | get_css_set(cset); |
| 4887 | cset->nr_tasks++; | ||
| 4792 | css_set_move_task(child, NULL, cset, false); | 4888 | css_set_move_task(child, NULL, cset, false); |
| 4793 | } | 4889 | } |
| 4794 | spin_unlock_irq(&css_set_lock); | 4890 | spin_unlock_irq(&css_set_lock); |
| @@ -4838,6 +4934,7 @@ void cgroup_exit(struct task_struct *tsk) | |||
| 4838 | if (!list_empty(&tsk->cg_list)) { | 4934 | if (!list_empty(&tsk->cg_list)) { |
| 4839 | spin_lock_irq(&css_set_lock); | 4935 | spin_lock_irq(&css_set_lock); |
| 4840 | css_set_move_task(tsk, cset, NULL, false); | 4936 | css_set_move_task(tsk, cset, NULL, false); |
| 4937 | cset->nr_tasks--; | ||
| 4841 | spin_unlock_irq(&css_set_lock); | 4938 | spin_unlock_irq(&css_set_lock); |
| 4842 | } else { | 4939 | } else { |
| 4843 | get_css_set(cset); | 4940 | get_css_set(cset); |
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ae643412948a..ca8376e5008c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c | |||
| @@ -1038,40 +1038,25 @@ static void cpuset_post_attach(void) | |||
| 1038 | * @tsk: the task to change | 1038 | * @tsk: the task to change |
| 1039 | * @newmems: new nodes that the task will be set | 1039 | * @newmems: new nodes that the task will be set |
| 1040 | * | 1040 | * |
| 1041 | * In order to avoid seeing no nodes if the old and new nodes are disjoint, | 1041 | * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed |
| 1042 | * we structure updates as setting all new allowed nodes, then clearing newly | 1042 | * and rebind an eventual tasks' mempolicy. If the task is allocating in |
| 1043 | * disallowed ones. | 1043 | * parallel, it might temporarily see an empty intersection, which results in |
| 1044 | * a seqlock check and retry before OOM or allocation failure. | ||
| 1044 | */ | 1045 | */ |
| 1045 | static void cpuset_change_task_nodemask(struct task_struct *tsk, | 1046 | static void cpuset_change_task_nodemask(struct task_struct *tsk, |
| 1046 | nodemask_t *newmems) | 1047 | nodemask_t *newmems) |
| 1047 | { | 1048 | { |
| 1048 | bool need_loop; | ||
| 1049 | |||
| 1050 | task_lock(tsk); | 1049 | task_lock(tsk); |
| 1051 | /* | ||
| 1052 | * Determine if a loop is necessary if another thread is doing | ||
| 1053 | * read_mems_allowed_begin(). If at least one node remains unchanged and | ||
| 1054 | * tsk does not have a mempolicy, then an empty nodemask will not be | ||
| 1055 | * possible when mems_allowed is larger than a word. | ||
| 1056 | */ | ||
| 1057 | need_loop = task_has_mempolicy(tsk) || | ||
| 1058 | !nodes_intersects(*newmems, tsk->mems_allowed); | ||
| 1059 | 1050 | ||
| 1060 | if (need_loop) { | 1051 | local_irq_disable(); |
| 1061 | local_irq_disable(); | 1052 | write_seqcount_begin(&tsk->mems_allowed_seq); |
| 1062 | write_seqcount_begin(&tsk->mems_allowed_seq); | ||
| 1063 | } | ||
| 1064 | 1053 | ||
| 1065 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); | 1054 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); |
| 1066 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); | 1055 | mpol_rebind_task(tsk, newmems); |
| 1067 | |||
| 1068 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); | ||
| 1069 | tsk->mems_allowed = *newmems; | 1056 | tsk->mems_allowed = *newmems; |
| 1070 | 1057 | ||
| 1071 | if (need_loop) { | 1058 | write_seqcount_end(&tsk->mems_allowed_seq); |
| 1072 | write_seqcount_end(&tsk->mems_allowed_seq); | 1059 | local_irq_enable(); |
| 1073 | local_irq_enable(); | ||
| 1074 | } | ||
| 1075 | 1060 | ||
| 1076 | task_unlock(tsk); | 1061 | task_unlock(tsk); |
| 1077 | } | 1062 | } |
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c new file mode 100644 index 000000000000..dac46af22782 --- /dev/null +++ b/kernel/cgroup/debug.c | |||
| @@ -0,0 +1,357 @@ | |||
| 1 | /* | ||
| 2 | * Debug controller | ||
| 3 | * | ||
| 4 | * WARNING: This controller is for cgroup core debugging only. | ||
| 5 | * Its interfaces are unstable and subject to changes at any time. | ||
| 6 | */ | ||
| 7 | #include <linux/ctype.h> | ||
| 8 | #include <linux/mm.h> | ||
| 9 | #include <linux/slab.h> | ||
| 10 | |||
| 11 | #include "cgroup-internal.h" | ||
| 12 | |||
| 13 | static struct cgroup_subsys_state * | ||
| 14 | debug_css_alloc(struct cgroup_subsys_state *parent_css) | ||
| 15 | { | ||
| 16 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | ||
| 17 | |||
| 18 | if (!css) | ||
| 19 | return ERR_PTR(-ENOMEM); | ||
| 20 | |||
| 21 | return css; | ||
| 22 | } | ||
| 23 | |||
| 24 | static void debug_css_free(struct cgroup_subsys_state *css) | ||
| 25 | { | ||
| 26 | kfree(css); | ||
| 27 | } | ||
| 28 | |||
| 29 | /* | ||
| 30 | * debug_taskcount_read - return the number of tasks in a cgroup. | ||
| 31 | * @cgrp: the cgroup in question | ||
| 32 | */ | ||
| 33 | static u64 debug_taskcount_read(struct cgroup_subsys_state *css, | ||
| 34 | struct cftype *cft) | ||
| 35 | { | ||
| 36 | return cgroup_task_count(css->cgroup); | ||
| 37 | } | ||
| 38 | |||
| 39 | static int current_css_set_read(struct seq_file *seq, void *v) | ||
| 40 | { | ||
| 41 | struct kernfs_open_file *of = seq->private; | ||
| 42 | struct css_set *cset; | ||
| 43 | struct cgroup_subsys *ss; | ||
| 44 | struct cgroup_subsys_state *css; | ||
| 45 | int i, refcnt; | ||
| 46 | |||
| 47 | if (!cgroup_kn_lock_live(of->kn, false)) | ||
| 48 | return -ENODEV; | ||
| 49 | |||
| 50 | spin_lock_irq(&css_set_lock); | ||
| 51 | rcu_read_lock(); | ||
| 52 | cset = rcu_dereference(current->cgroups); | ||
| 53 | refcnt = refcount_read(&cset->refcount); | ||
| 54 | seq_printf(seq, "css_set %pK %d", cset, refcnt); | ||
| 55 | if (refcnt > cset->nr_tasks) | ||
| 56 | seq_printf(seq, " +%d", refcnt - cset->nr_tasks); | ||
| 57 | seq_puts(seq, "\n"); | ||
| 58 | |||
| 59 | /* | ||
| 60 | * Print the css'es stored in the current css_set. | ||
| 61 | */ | ||
| 62 | for_each_subsys(ss, i) { | ||
| 63 | css = cset->subsys[ss->id]; | ||
| 64 | if (!css) | ||
| 65 | continue; | ||
| 66 | seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name, | ||
| 67 | (unsigned long)css, css->id); | ||
| 68 | } | ||
| 69 | rcu_read_unlock(); | ||
| 70 | spin_unlock_irq(&css_set_lock); | ||
| 71 | cgroup_kn_unlock(of->kn); | ||
| 72 | return 0; | ||
| 73 | } | ||
| 74 | |||
| 75 | static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, | ||
| 76 | struct cftype *cft) | ||
| 77 | { | ||
| 78 | u64 count; | ||
| 79 | |||
| 80 | rcu_read_lock(); | ||
| 81 | count = refcount_read(&task_css_set(current)->refcount); | ||
| 82 | rcu_read_unlock(); | ||
| 83 | return count; | ||
| 84 | } | ||
| 85 | |||
| 86 | static int current_css_set_cg_links_read(struct seq_file *seq, void *v) | ||
| 87 | { | ||
| 88 | struct cgrp_cset_link *link; | ||
| 89 | struct css_set *cset; | ||
| 90 | char *name_buf; | ||
| 91 | |||
| 92 | name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); | ||
| 93 | if (!name_buf) | ||
| 94 | return -ENOMEM; | ||
| 95 | |||
| 96 | spin_lock_irq(&css_set_lock); | ||
| 97 | rcu_read_lock(); | ||
| 98 | cset = rcu_dereference(current->cgroups); | ||
| 99 | list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { | ||
| 100 | struct cgroup *c = link->cgrp; | ||
| 101 | |||
| 102 | cgroup_name(c, name_buf, NAME_MAX + 1); | ||
| 103 | seq_printf(seq, "Root %d group %s\n", | ||
| 104 | c->root->hierarchy_id, name_buf); | ||
| 105 | } | ||
| 106 | rcu_read_unlock(); | ||
| 107 | spin_unlock_irq(&css_set_lock); | ||
| 108 | kfree(name_buf); | ||
| 109 | return 0; | ||
| 110 | } | ||
| 111 | |||
| 112 | #define MAX_TASKS_SHOWN_PER_CSS 25 | ||
| 113 | static int cgroup_css_links_read(struct seq_file *seq, void *v) | ||
| 114 | { | ||
| 115 | struct cgroup_subsys_state *css = seq_css(seq); | ||
| 116 | struct cgrp_cset_link *link; | ||
| 117 | int dead_cnt = 0, extra_refs = 0; | ||
| 118 | |||
| 119 | spin_lock_irq(&css_set_lock); | ||
| 120 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { | ||
| 121 | struct css_set *cset = link->cset; | ||
| 122 | struct task_struct *task; | ||
| 123 | int count = 0; | ||
| 124 | int refcnt = refcount_read(&cset->refcount); | ||
| 125 | |||
| 126 | seq_printf(seq, " %d", refcnt); | ||
| 127 | if (refcnt - cset->nr_tasks > 0) { | ||
| 128 | int extra = refcnt - cset->nr_tasks; | ||
| 129 | |||
| 130 | seq_printf(seq, " +%d", extra); | ||
| 131 | /* | ||
| 132 | * Take out the one additional reference in | ||
| 133 | * init_css_set. | ||
| 134 | */ | ||
| 135 | if (cset == &init_css_set) | ||
| 136 | extra--; | ||
| 137 | extra_refs += extra; | ||
| 138 | } | ||
| 139 | seq_puts(seq, "\n"); | ||
| 140 | |||
| 141 | list_for_each_entry(task, &cset->tasks, cg_list) { | ||
| 142 | if (count++ <= MAX_TASKS_SHOWN_PER_CSS) | ||
| 143 | seq_printf(seq, " task %d\n", | ||
| 144 | task_pid_vnr(task)); | ||
| 145 | } | ||
| 146 | |||
| 147 | list_for_each_entry(task, &cset->mg_tasks, cg_list) { | ||
| 148 | if (count++ <= MAX_TASKS_SHOWN_PER_CSS) | ||
| 149 | seq_printf(seq, " task %d\n", | ||
| 150 | task_pid_vnr(task)); | ||
| 151 | } | ||
| 152 | /* show # of overflowed tasks */ | ||
| 153 | if (count > MAX_TASKS_SHOWN_PER_CSS) | ||
| 154 | seq_printf(seq, " ... (%d)\n", | ||
| 155 | count - MAX_TASKS_SHOWN_PER_CSS); | ||
| 156 | |||
| 157 | if (cset->dead) { | ||
| 158 | seq_puts(seq, " [dead]\n"); | ||
| 159 | dead_cnt++; | ||
| 160 | } | ||
| 161 | |||
| 162 | WARN_ON(count != cset->nr_tasks); | ||
| 163 | } | ||
| 164 | spin_unlock_irq(&css_set_lock); | ||
| 165 | |||
| 166 | if (!dead_cnt && !extra_refs) | ||
| 167 | return 0; | ||
| 168 | |||
| 169 | seq_puts(seq, "\n"); | ||
| 170 | if (extra_refs) | ||
| 171 | seq_printf(seq, "extra references = %d\n", extra_refs); | ||
| 172 | if (dead_cnt) | ||
| 173 | seq_printf(seq, "dead css_sets = %d\n", dead_cnt); | ||
| 174 | |||
| 175 | return 0; | ||
| 176 | } | ||
| 177 | |||
| 178 | static int cgroup_subsys_states_read(struct seq_file *seq, void *v) | ||
| 179 | { | ||
| 180 | struct kernfs_open_file *of = seq->private; | ||
| 181 | struct cgroup *cgrp; | ||
| 182 | struct cgroup_subsys *ss; | ||
| 183 | struct cgroup_subsys_state *css; | ||
| 184 | char pbuf[16]; | ||
| 185 | int i; | ||
| 186 | |||
| 187 | cgrp = cgroup_kn_lock_live(of->kn, false); | ||
| 188 | if (!cgrp) | ||
| 189 | return -ENODEV; | ||
| 190 | |||
| 191 | for_each_subsys(ss, i) { | ||
| 192 | css = rcu_dereference_check(cgrp->subsys[ss->id], true); | ||
| 193 | if (!css) | ||
| 194 | continue; | ||
| 195 | |||
| 196 | pbuf[0] = '\0'; | ||
| 197 | |||
| 198 | /* Show the parent CSS if applicable*/ | ||
| 199 | if (css->parent) | ||
| 200 | snprintf(pbuf, sizeof(pbuf) - 1, " P=%d", | ||
| 201 | css->parent->id); | ||
| 202 | seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name, | ||
| 203 | (unsigned long)css, css->id, | ||
| 204 | atomic_read(&css->online_cnt), pbuf); | ||
| 205 | } | ||
| 206 | |||
| 207 | cgroup_kn_unlock(of->kn); | ||
| 208 | return 0; | ||
| 209 | } | ||
| 210 | |||
| 211 | static void cgroup_masks_read_one(struct seq_file *seq, const char *name, | ||
| 212 | u16 mask) | ||
| 213 | { | ||
| 214 | struct cgroup_subsys *ss; | ||
| 215 | int ssid; | ||
| 216 | bool first = true; | ||
| 217 | |||
| 218 | seq_printf(seq, "%-17s: ", name); | ||
| 219 | for_each_subsys(ss, ssid) { | ||
| 220 | if (!(mask & (1 << ssid))) | ||
| 221 | continue; | ||
| 222 | if (!first) | ||
| 223 | seq_puts(seq, ", "); | ||
| 224 | seq_puts(seq, ss->name); | ||
| 225 | first = false; | ||
| 226 | } | ||
| 227 | seq_putc(seq, '\n'); | ||
| 228 | } | ||
| 229 | |||
| 230 | static int cgroup_masks_read(struct seq_file *seq, void *v) | ||
| 231 | { | ||
| 232 | struct kernfs_open_file *of = seq->private; | ||
| 233 | struct cgroup *cgrp; | ||
| 234 | |||
| 235 | cgrp = cgroup_kn_lock_live(of->kn, false); | ||
| 236 | if (!cgrp) | ||
| 237 | return -ENODEV; | ||
| 238 | |||
| 239 | cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control); | ||
| 240 | cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask); | ||
| 241 | |||
| 242 | cgroup_kn_unlock(of->kn); | ||
| 243 | return 0; | ||
| 244 | } | ||
| 245 | |||
| 246 | static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) | ||
| 247 | { | ||
| 248 | return (!cgroup_is_populated(css->cgroup) && | ||
| 249 | !css_has_online_children(&css->cgroup->self)); | ||
| 250 | } | ||
| 251 | |||
| 252 | static struct cftype debug_legacy_files[] = { | ||
| 253 | { | ||
| 254 | .name = "taskcount", | ||
| 255 | .read_u64 = debug_taskcount_read, | ||
| 256 | }, | ||
| 257 | |||
| 258 | { | ||
| 259 | .name = "current_css_set", | ||
| 260 | .seq_show = current_css_set_read, | ||
| 261 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
| 262 | }, | ||
| 263 | |||
| 264 | { | ||
| 265 | .name = "current_css_set_refcount", | ||
| 266 | .read_u64 = current_css_set_refcount_read, | ||
| 267 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
| 268 | }, | ||
| 269 | |||
| 270 | { | ||
| 271 | .name = "current_css_set_cg_links", | ||
| 272 | .seq_show = current_css_set_cg_links_read, | ||
| 273 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
| 274 | }, | ||
| 275 | |||
| 276 | { | ||
| 277 | .name = "cgroup_css_links", | ||
| 278 | .seq_show = cgroup_css_links_read, | ||
| 279 | }, | ||
| 280 | |||
| 281 | { | ||
| 282 | .name = "cgroup_subsys_states", | ||
| 283 | .seq_show = cgroup_subsys_states_read, | ||
| 284 | }, | ||
| 285 | |||
| 286 | { | ||
| 287 | .name = "cgroup_masks", | ||
| 288 | .seq_show = cgroup_masks_read, | ||
| 289 | }, | ||
| 290 | |||
| 291 | { | ||
| 292 | .name = "releasable", | ||
| 293 | .read_u64 = releasable_read, | ||
| 294 | }, | ||
| 295 | |||
| 296 | { } /* terminate */ | ||
| 297 | }; | ||
| 298 | |||
| 299 | static struct cftype debug_files[] = { | ||
| 300 | { | ||
| 301 | .name = "taskcount", | ||
| 302 | .read_u64 = debug_taskcount_read, | ||
| 303 | }, | ||
| 304 | |||
| 305 | { | ||
| 306 | .name = "current_css_set", | ||
| 307 | .seq_show = current_css_set_read, | ||
| 308 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
| 309 | }, | ||
| 310 | |||
| 311 | { | ||
| 312 | .name = "current_css_set_refcount", | ||
| 313 | .read_u64 = current_css_set_refcount_read, | ||
| 314 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
| 315 | }, | ||
| 316 | |||
| 317 | { | ||
| 318 | .name = "current_css_set_cg_links", | ||
| 319 | .seq_show = current_css_set_cg_links_read, | ||
| 320 | .flags = CFTYPE_ONLY_ON_ROOT, | ||
| 321 | }, | ||
| 322 | |||
| 323 | { | ||
| 324 | .name = "css_links", | ||
| 325 | .seq_show = cgroup_css_links_read, | ||
| 326 | }, | ||
| 327 | |||
| 328 | { | ||
| 329 | .name = "csses", | ||
| 330 | .seq_show = cgroup_subsys_states_read, | ||
| 331 | }, | ||
| 332 | |||
| 333 | { | ||
| 334 | .name = "masks", | ||
| 335 | .seq_show = cgroup_masks_read, | ||
| 336 | }, | ||
| 337 | |||
| 338 | { } /* terminate */ | ||
| 339 | }; | ||
| 340 | |||
| 341 | struct cgroup_subsys debug_cgrp_subsys = { | ||
| 342 | .css_alloc = debug_css_alloc, | ||
| 343 | .css_free = debug_css_free, | ||
| 344 | .legacy_cftypes = debug_legacy_files, | ||
| 345 | }; | ||
| 346 | |||
| 347 | /* | ||
| 348 | * On v2, debug is an implicit controller enabled by "cgroup_debug" boot | ||
| 349 | * parameter. | ||
| 350 | */ | ||
| 351 | static int __init enable_cgroup_debug(char *str) | ||
| 352 | { | ||
| 353 | debug_cgrp_subsys.dfl_cftypes = debug_files; | ||
| 354 | debug_cgrp_subsys.implicit_on_dfl = true; | ||
| 355 | return 1; | ||
| 356 | } | ||
| 357 | __setup("cgroup_debug", enable_cgroup_debug); | ||
diff --git a/kernel/compat.c b/kernel/compat.c index ebd8bdc3fd68..6f0a0e723a06 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
| @@ -120,6 +120,50 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp | |||
| 120 | __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; | 120 | __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; |
| 121 | } | 121 | } |
| 122 | 122 | ||
| 123 | static int __compat_get_timespec64(struct timespec64 *ts64, | ||
| 124 | const struct compat_timespec __user *cts) | ||
| 125 | { | ||
| 126 | struct compat_timespec ts; | ||
| 127 | int ret; | ||
| 128 | |||
| 129 | ret = copy_from_user(&ts, cts, sizeof(ts)); | ||
| 130 | if (ret) | ||
| 131 | return -EFAULT; | ||
| 132 | |||
| 133 | ts64->tv_sec = ts.tv_sec; | ||
| 134 | ts64->tv_nsec = ts.tv_nsec; | ||
| 135 | |||
| 136 | return 0; | ||
| 137 | } | ||
| 138 | |||
| 139 | static int __compat_put_timespec64(const struct timespec64 *ts64, | ||
| 140 | struct compat_timespec __user *cts) | ||
| 141 | { | ||
| 142 | struct compat_timespec ts = { | ||
| 143 | .tv_sec = ts64->tv_sec, | ||
| 144 | .tv_nsec = ts64->tv_nsec | ||
| 145 | }; | ||
| 146 | return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0; | ||
| 147 | } | ||
| 148 | |||
| 149 | int compat_get_timespec64(struct timespec64 *ts, const void __user *uts) | ||
| 150 | { | ||
| 151 | if (COMPAT_USE_64BIT_TIME) | ||
| 152 | return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; | ||
| 153 | else | ||
| 154 | return __compat_get_timespec64(ts, uts); | ||
| 155 | } | ||
| 156 | EXPORT_SYMBOL_GPL(compat_get_timespec64); | ||
| 157 | |||
| 158 | int compat_put_timespec64(const struct timespec64 *ts, void __user *uts) | ||
| 159 | { | ||
| 160 | if (COMPAT_USE_64BIT_TIME) | ||
| 161 | return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; | ||
| 162 | else | ||
| 163 | return __compat_put_timespec64(ts, uts); | ||
| 164 | } | ||
| 165 | EXPORT_SYMBOL_GPL(compat_put_timespec64); | ||
| 166 | |||
| 123 | int compat_get_timeval(struct timeval *tv, const void __user *utv) | 167 | int compat_get_timeval(struct timeval *tv, const void __user *utv) |
| 124 | { | 168 | { |
| 125 | if (COMPAT_USE_64BIT_TIME) | 169 | if (COMPAT_USE_64BIT_TIME) |
| @@ -203,53 +247,6 @@ int put_compat_itimerval(struct compat_itimerval __user *o, const struct itimerv | |||
| 203 | return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? -EFAULT : 0; | 247 | return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? -EFAULT : 0; |
| 204 | } | 248 | } |
| 205 | 249 | ||
| 206 | static compat_clock_t clock_t_to_compat_clock_t(clock_t x) | ||
| 207 | { | ||
| 208 | return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); | ||
| 209 | } | ||
| 210 | |||
| 211 | COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) | ||
| 212 | { | ||
| 213 | if (tbuf) { | ||
| 214 | struct tms tms; | ||
| 215 | struct compat_tms tmp; | ||
| 216 | |||
| 217 | do_sys_times(&tms); | ||
| 218 | /* Convert our struct tms to the compat version. */ | ||
| 219 | tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); | ||
| 220 | tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); | ||
| 221 | tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); | ||
| 222 | tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); | ||
| 223 | if (copy_to_user(tbuf, &tmp, sizeof(tmp))) | ||
| 224 | return -EFAULT; | ||
| 225 | } | ||
| 226 | force_successful_syscall_return(); | ||
| 227 | return compat_jiffies_to_clock_t(jiffies); | ||
| 228 | } | ||
| 229 | |||
| 230 | #ifdef __ARCH_WANT_SYS_SIGPENDING | ||
| 231 | |||
| 232 | /* | ||
| 233 | * Assumption: old_sigset_t and compat_old_sigset_t are both | ||
| 234 | * types that can be passed to put_user()/get_user(). | ||
| 235 | */ | ||
| 236 | |||
| 237 | COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set) | ||
| 238 | { | ||
| 239 | old_sigset_t s; | ||
| 240 | long ret; | ||
| 241 | mm_segment_t old_fs = get_fs(); | ||
| 242 | |||
| 243 | set_fs(KERNEL_DS); | ||
| 244 | ret = sys_sigpending((old_sigset_t __user *) &s); | ||
| 245 | set_fs(old_fs); | ||
| 246 | if (ret == 0) | ||
| 247 | ret = put_user(s, set); | ||
| 248 | return ret; | ||
| 249 | } | ||
| 250 | |||
| 251 | #endif | ||
| 252 | |||
| 253 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK | 250 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK |
| 254 | 251 | ||
| 255 | /* | 252 | /* |
| @@ -304,164 +301,33 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, | |||
| 304 | 301 | ||
| 305 | #endif | 302 | #endif |
| 306 | 303 | ||
| 307 | COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, | ||
| 308 | struct compat_rlimit __user *, rlim) | ||
| 309 | { | ||
| 310 | struct rlimit r; | ||
| 311 | |||
| 312 | if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || | ||
| 313 | __get_user(r.rlim_cur, &rlim->rlim_cur) || | ||
| 314 | __get_user(r.rlim_max, &rlim->rlim_max)) | ||
| 315 | return -EFAULT; | ||
| 316 | |||
| 317 | if (r.rlim_cur == COMPAT_RLIM_INFINITY) | ||
| 318 | r.rlim_cur = RLIM_INFINITY; | ||
| 319 | if (r.rlim_max == COMPAT_RLIM_INFINITY) | ||
| 320 | r.rlim_max = RLIM_INFINITY; | ||
| 321 | return do_prlimit(current, resource, &r, NULL); | ||
| 322 | } | ||
| 323 | |||
| 324 | #ifdef COMPAT_RLIM_OLD_INFINITY | ||
| 325 | |||
| 326 | COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, | ||
| 327 | struct compat_rlimit __user *, rlim) | ||
| 328 | { | ||
| 329 | struct rlimit r; | ||
| 330 | int ret; | ||
| 331 | mm_segment_t old_fs = get_fs(); | ||
| 332 | |||
| 333 | set_fs(KERNEL_DS); | ||
| 334 | ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r); | ||
| 335 | set_fs(old_fs); | ||
| 336 | |||
| 337 | if (!ret) { | ||
| 338 | if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY) | ||
| 339 | r.rlim_cur = COMPAT_RLIM_INFINITY; | ||
| 340 | if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY) | ||
| 341 | r.rlim_max = COMPAT_RLIM_INFINITY; | ||
| 342 | |||
| 343 | if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || | ||
| 344 | __put_user(r.rlim_cur, &rlim->rlim_cur) || | ||
| 345 | __put_user(r.rlim_max, &rlim->rlim_max)) | ||
| 346 | return -EFAULT; | ||
| 347 | } | ||
| 348 | return ret; | ||
| 349 | } | ||
| 350 | |||
| 351 | #endif | ||
| 352 | |||
| 353 | COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, | ||
| 354 | struct compat_rlimit __user *, rlim) | ||
| 355 | { | ||
| 356 | struct rlimit r; | ||
| 357 | int ret; | ||
| 358 | |||
| 359 | ret = do_prlimit(current, resource, NULL, &r); | ||
| 360 | if (!ret) { | ||
| 361 | if (r.rlim_cur > COMPAT_RLIM_INFINITY) | ||
| 362 | r.rlim_cur = COMPAT_RLIM_INFINITY; | ||
| 363 | if (r.rlim_max > COMPAT_RLIM_INFINITY) | ||
| 364 | r.rlim_max = COMPAT_RLIM_INFINITY; | ||
| 365 | |||
| 366 | if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || | ||
| 367 | __put_user(r.rlim_cur, &rlim->rlim_cur) || | ||
| 368 | __put_user(r.rlim_max, &rlim->rlim_max)) | ||
| 369 | return -EFAULT; | ||
| 370 | } | ||
| 371 | return ret; | ||
| 372 | } | ||
| 373 | |||
| 374 | int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) | 304 | int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) |
| 375 | { | 305 | { |
| 376 | if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) || | 306 | struct compat_rusage r32; |
| 377 | __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) || | 307 | memset(&r32, 0, sizeof(r32)); |
| 378 | __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) || | 308 | r32.ru_utime.tv_sec = r->ru_utime.tv_sec; |
| 379 | __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) || | 309 | r32.ru_utime.tv_usec = r->ru_utime.tv_usec; |
| 380 | __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) || | 310 | r32.ru_stime.tv_sec = r->ru_stime.tv_sec; |
| 381 | __put_user(r->ru_maxrss, &ru->ru_maxrss) || | 311 | r32.ru_stime.tv_usec = r->ru_stime.tv_usec; |
| 382 | __put_user(r->ru_ixrss, &ru->ru_ixrss) || | 312 | r32.ru_maxrss = r->ru_maxrss; |
| 383 | __put_user(r->ru_idrss, &ru->ru_idrss) || | 313 | r32.ru_ixrss = r->ru_ixrss; |
| 384 | __put_user(r->ru_isrss, &ru->ru_isrss) || | 314 | r32.ru_idrss = r->ru_idrss; |
| 385 | __put_user(r->ru_minflt, &ru->ru_minflt) || | 315 | r32.ru_isrss = r->ru_isrss; |
| 386 | __put_user(r->ru_majflt, &ru->ru_majflt) || | 316 | r32.ru_minflt = r->ru_minflt; |
| 387 | __put_user(r->ru_nswap, &ru->ru_nswap) || | 317 | r32.ru_majflt = r->ru_majflt; |
| 388 | __put_user(r->ru_inblock, &ru->ru_inblock) || | 318 | r32.ru_nswap = r->ru_nswap; |
| 389 | __put_user(r->ru_oublock, &ru->ru_oublock) || | 319 | r32.ru_inblock = r->ru_inblock; |
| 390 | __put_user(r->ru_msgsnd, &ru->ru_msgsnd) || | 320 | r32.ru_oublock = r->ru_oublock; |
| 391 | __put_user(r->ru_msgrcv, &ru->ru_msgrcv) || | 321 | r32.ru_msgsnd = r->ru_msgsnd; |
| 392 | __put_user(r->ru_nsignals, &ru->ru_nsignals) || | 322 | r32.ru_msgrcv = r->ru_msgrcv; |
| 393 | __put_user(r->ru_nvcsw, &ru->ru_nvcsw) || | 323 | r32.ru_nsignals = r->ru_nsignals; |
| 394 | __put_user(r->ru_nivcsw, &ru->ru_nivcsw)) | 324 | r32.ru_nvcsw = r->ru_nvcsw; |
| 325 | r32.ru_nivcsw = r->ru_nivcsw; | ||
| 326 | if (copy_to_user(ru, &r32, sizeof(r32))) | ||
| 395 | return -EFAULT; | 327 | return -EFAULT; |
| 396 | return 0; | 328 | return 0; |
| 397 | } | 329 | } |
| 398 | 330 | ||
| 399 | COMPAT_SYSCALL_DEFINE4(wait4, | ||
| 400 | compat_pid_t, pid, | ||
| 401 | compat_uint_t __user *, stat_addr, | ||
| 402 | int, options, | ||
| 403 | struct compat_rusage __user *, ru) | ||
| 404 | { | ||
| 405 | if (!ru) { | ||
| 406 | return sys_wait4(pid, stat_addr, options, NULL); | ||
| 407 | } else { | ||
| 408 | struct rusage r; | ||
| 409 | int ret; | ||
| 410 | unsigned int status; | ||
| 411 | mm_segment_t old_fs = get_fs(); | ||
| 412 | |||
| 413 | set_fs (KERNEL_DS); | ||
| 414 | ret = sys_wait4(pid, | ||
| 415 | (stat_addr ? | ||
| 416 | (unsigned int __user *) &status : NULL), | ||
| 417 | options, (struct rusage __user *) &r); | ||
| 418 | set_fs (old_fs); | ||
| 419 | |||
| 420 | if (ret > 0) { | ||
| 421 | if (put_compat_rusage(&r, ru)) | ||
| 422 | return -EFAULT; | ||
| 423 | if (stat_addr && put_user(status, stat_addr)) | ||
| 424 | return -EFAULT; | ||
| 425 | } | ||
| 426 | return ret; | ||
| 427 | } | ||
| 428 | } | ||
| 429 | |||
| 430 | COMPAT_SYSCALL_DEFINE5(waitid, | ||
| 431 | int, which, compat_pid_t, pid, | ||
| 432 | struct compat_siginfo __user *, uinfo, int, options, | ||
| 433 | struct compat_rusage __user *, uru) | ||
| 434 | { | ||
| 435 | siginfo_t info; | ||
| 436 | struct rusage ru; | ||
| 437 | long ret; | ||
| 438 | mm_segment_t old_fs = get_fs(); | ||
| 439 | |||
| 440 | memset(&info, 0, sizeof(info)); | ||
| 441 | |||
| 442 | set_fs(KERNEL_DS); | ||
| 443 | ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options, | ||
| 444 | uru ? (struct rusage __user *)&ru : NULL); | ||
| 445 | set_fs(old_fs); | ||
| 446 | |||
| 447 | if ((ret < 0) || (info.si_signo == 0)) | ||
| 448 | return ret; | ||
| 449 | |||
| 450 | if (uru) { | ||
| 451 | /* sys_waitid() overwrites everything in ru */ | ||
| 452 | if (COMPAT_USE_64BIT_TIME) | ||
| 453 | ret = copy_to_user(uru, &ru, sizeof(ru)); | ||
| 454 | else | ||
| 455 | ret = put_compat_rusage(&ru, uru); | ||
| 456 | if (ret) | ||
| 457 | return -EFAULT; | ||
| 458 | } | ||
| 459 | |||
| 460 | BUG_ON(info.si_code & __SI_MASK); | ||
| 461 | info.si_code |= __SI_CHLD; | ||
| 462 | return copy_siginfo_to_user32(uinfo, &info); | ||
| 463 | } | ||
| 464 | |||
| 465 | static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, | 331 | static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, |
| 466 | unsigned len, struct cpumask *new_mask) | 332 | unsigned len, struct cpumask *new_mask) |
| 467 | { | 333 | { |
| @@ -542,6 +408,27 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst, | |||
| 542 | return 0; | 408 | return 0; |
| 543 | } | 409 | } |
| 544 | 410 | ||
| 411 | int get_compat_itimerspec64(struct itimerspec64 *its, | ||
| 412 | const struct compat_itimerspec __user *uits) | ||
| 413 | { | ||
| 414 | |||
| 415 | if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) || | ||
| 416 | __compat_get_timespec64(&its->it_value, &uits->it_value)) | ||
| 417 | return -EFAULT; | ||
| 418 | return 0; | ||
| 419 | } | ||
| 420 | EXPORT_SYMBOL_GPL(get_compat_itimerspec64); | ||
| 421 | |||
| 422 | int put_compat_itimerspec64(const struct itimerspec64 *its, | ||
| 423 | struct compat_itimerspec __user *uits) | ||
| 424 | { | ||
| 425 | if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) || | ||
| 426 | __compat_put_timespec64(&its->it_value, &uits->it_value)) | ||
| 427 | return -EFAULT; | ||
| 428 | return 0; | ||
| 429 | } | ||
| 430 | EXPORT_SYMBOL_GPL(put_compat_itimerspec64); | ||
| 431 | |||
| 545 | /* | 432 | /* |
| 546 | * We currently only need the following fields from the sigevent | 433 | * We currently only need the following fields from the sigevent |
| 547 | * structure: sigev_value, sigev_signo, sig_notify and (sometimes | 434 | * structure: sigev_value, sigev_signo, sig_notify and (sometimes |
| @@ -566,84 +453,59 @@ int get_compat_sigevent(struct sigevent *event, | |||
| 566 | long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, | 453 | long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, |
| 567 | unsigned long bitmap_size) | 454 | unsigned long bitmap_size) |
| 568 | { | 455 | { |
| 569 | int i, j; | ||
| 570 | unsigned long m; | ||
| 571 | compat_ulong_t um; | ||
| 572 | unsigned long nr_compat_longs; | 456 | unsigned long nr_compat_longs; |
| 573 | 457 | ||
| 574 | /* align bitmap up to nearest compat_long_t boundary */ | 458 | /* align bitmap up to nearest compat_long_t boundary */ |
| 575 | bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); | 459 | bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); |
| 460 | nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); | ||
| 576 | 461 | ||
| 577 | if (!access_ok(VERIFY_READ, umask, bitmap_size / 8)) | 462 | if (!access_ok(VERIFY_READ, umask, bitmap_size / 8)) |
| 578 | return -EFAULT; | 463 | return -EFAULT; |
| 579 | 464 | ||
| 580 | nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); | 465 | user_access_begin(); |
| 581 | 466 | while (nr_compat_longs > 1) { | |
| 582 | for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { | 467 | compat_ulong_t l1, l2; |
| 583 | m = 0; | 468 | unsafe_get_user(l1, umask++, Efault); |
| 584 | 469 | unsafe_get_user(l2, umask++, Efault); | |
| 585 | for (j = 0; j < sizeof(m)/sizeof(um); j++) { | 470 | *mask++ = ((unsigned long)l2 << BITS_PER_COMPAT_LONG) | l1; |
| 586 | /* | 471 | nr_compat_longs -= 2; |
| 587 | * We dont want to read past the end of the userspace | ||
| 588 | * bitmap. We must however ensure the end of the | ||
| 589 | * kernel bitmap is zeroed. | ||
| 590 | */ | ||
| 591 | if (nr_compat_longs) { | ||
| 592 | nr_compat_longs--; | ||
| 593 | if (__get_user(um, umask)) | ||
| 594 | return -EFAULT; | ||
| 595 | } else { | ||
| 596 | um = 0; | ||
| 597 | } | ||
| 598 | |||
| 599 | umask++; | ||
| 600 | m |= (long)um << (j * BITS_PER_COMPAT_LONG); | ||
| 601 | } | ||
| 602 | *mask++ = m; | ||
| 603 | } | 472 | } |
| 604 | 473 | if (nr_compat_longs) | |
| 474 | unsafe_get_user(*mask, umask++, Efault); | ||
| 475 | user_access_end(); | ||
| 605 | return 0; | 476 | return 0; |
| 477 | |||
| 478 | Efault: | ||
| 479 | user_access_end(); | ||
| 480 | return -EFAULT; | ||
| 606 | } | 481 | } |
| 607 | 482 | ||
| 608 | long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, | 483 | long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, |
| 609 | unsigned long bitmap_size) | 484 | unsigned long bitmap_size) |
| 610 | { | 485 | { |
| 611 | int i, j; | ||
| 612 | unsigned long m; | ||
| 613 | compat_ulong_t um; | ||
| 614 | unsigned long nr_compat_longs; | 486 | unsigned long nr_compat_longs; |
| 615 | 487 | ||
| 616 | /* align bitmap up to nearest compat_long_t boundary */ | 488 | /* align bitmap up to nearest compat_long_t boundary */ |
| 617 | bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); | 489 | bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); |
| 490 | nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); | ||
| 618 | 491 | ||
| 619 | if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8)) | 492 | if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8)) |
| 620 | return -EFAULT; | 493 | return -EFAULT; |
| 621 | 494 | ||
| 622 | nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); | 495 | user_access_begin(); |
| 623 | 496 | while (nr_compat_longs > 1) { | |
| 624 | for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) { | 497 | unsigned long m = *mask++; |
| 625 | m = *mask++; | 498 | unsafe_put_user((compat_ulong_t)m, umask++, Efault); |
| 626 | 499 | unsafe_put_user(m >> BITS_PER_COMPAT_LONG, umask++, Efault); | |
| 627 | for (j = 0; j < sizeof(m)/sizeof(um); j++) { | 500 | nr_compat_longs -= 2; |
| 628 | um = m; | ||
| 629 | |||
| 630 | /* | ||
| 631 | * We dont want to write past the end of the userspace | ||
| 632 | * bitmap. | ||
| 633 | */ | ||
| 634 | if (nr_compat_longs) { | ||
| 635 | nr_compat_longs--; | ||
| 636 | if (__put_user(um, umask)) | ||
| 637 | return -EFAULT; | ||
| 638 | } | ||
| 639 | |||
| 640 | umask++; | ||
| 641 | m >>= 4*sizeof(um); | ||
| 642 | m >>= 4*sizeof(um); | ||
| 643 | } | ||
| 644 | } | 501 | } |
| 645 | 502 | if (nr_compat_longs) | |
| 503 | unsafe_put_user((compat_ulong_t)*mask, umask++, Efault); | ||
| 504 | user_access_end(); | ||
| 646 | return 0; | 505 | return 0; |
| 506 | Efault: | ||
| 507 | user_access_end(); | ||
| 508 | return -EFAULT; | ||
| 647 | } | 509 | } |
| 648 | 510 | ||
| 649 | void | 511 | void |
| @@ -669,38 +531,6 @@ sigset_to_compat(compat_sigset_t *compat, const sigset_t *set) | |||
| 669 | } | 531 | } |
| 670 | } | 532 | } |
| 671 | 533 | ||
| 672 | COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, | ||
| 673 | struct compat_siginfo __user *, uinfo, | ||
| 674 | struct compat_timespec __user *, uts, compat_size_t, sigsetsize) | ||
| 675 | { | ||
| 676 | compat_sigset_t s32; | ||
| 677 | sigset_t s; | ||
| 678 | struct timespec t; | ||
| 679 | siginfo_t info; | ||
| 680 | long ret; | ||
| 681 | |||
| 682 | if (sigsetsize != sizeof(sigset_t)) | ||
| 683 | return -EINVAL; | ||
| 684 | |||
| 685 | if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) | ||
| 686 | return -EFAULT; | ||
| 687 | sigset_from_compat(&s, &s32); | ||
| 688 | |||
| 689 | if (uts) { | ||
| 690 | if (compat_get_timespec(&t, uts)) | ||
| 691 | return -EFAULT; | ||
| 692 | } | ||
| 693 | |||
| 694 | ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); | ||
| 695 | |||
| 696 | if (ret > 0 && uinfo) { | ||
| 697 | if (copy_siginfo_to_user32(uinfo, &info)) | ||
| 698 | ret = -EFAULT; | ||
| 699 | } | ||
| 700 | |||
| 701 | return ret; | ||
| 702 | } | ||
| 703 | |||
| 704 | #ifdef CONFIG_NUMA | 534 | #ifdef CONFIG_NUMA |
| 705 | COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, | 535 | COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, |
| 706 | compat_uptr_t __user *, pages32, | 536 | compat_uptr_t __user *, pages32, |
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index 26a06e09a5bd..d70829033bb7 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config | |||
| @@ -1,10 +1,13 @@ | |||
| 1 | # KEEP ALPHABETICALLY SORTED | 1 | # KEEP ALPHABETICALLY SORTED |
| 2 | # CONFIG_DEVKMEM is not set | 2 | # CONFIG_DEVKMEM is not set |
| 3 | # CONFIG_DEVMEM is not set | 3 | # CONFIG_DEVMEM is not set |
| 4 | # CONFIG_FHANDLE is not set | ||
| 4 | # CONFIG_INET_LRO is not set | 5 | # CONFIG_INET_LRO is not set |
| 5 | # CONFIG_MODULES is not set | 6 | # CONFIG_NFSD is not set |
| 7 | # CONFIG_NFS_FS is not set | ||
| 6 | # CONFIG_OABI_COMPAT is not set | 8 | # CONFIG_OABI_COMPAT is not set |
| 7 | # CONFIG_SYSVIPC is not set | 9 | # CONFIG_SYSVIPC is not set |
| 10 | # CONFIG_USELIB is not set | ||
| 8 | CONFIG_ANDROID=y | 11 | CONFIG_ANDROID=y |
| 9 | CONFIG_ANDROID_BINDER_IPC=y | 12 | CONFIG_ANDROID_BINDER_IPC=y |
| 10 | CONFIG_ANDROID_LOW_MEMORY_KILLER=y | 13 | CONFIG_ANDROID_LOW_MEMORY_KILLER=y |
| @@ -13,6 +16,7 @@ CONFIG_ASHMEM=y | |||
| 13 | CONFIG_AUDIT=y | 16 | CONFIG_AUDIT=y |
| 14 | CONFIG_BLK_DEV_INITRD=y | 17 | CONFIG_BLK_DEV_INITRD=y |
| 15 | CONFIG_CGROUPS=y | 18 | CONFIG_CGROUPS=y |
| 19 | CONFIG_CGROUP_BPF=y | ||
| 16 | CONFIG_CGROUP_CPUACCT=y | 20 | CONFIG_CGROUP_CPUACCT=y |
| 17 | CONFIG_CGROUP_DEBUG=y | 21 | CONFIG_CGROUP_DEBUG=y |
| 18 | CONFIG_CGROUP_FREEZER=y | 22 | CONFIG_CGROUP_FREEZER=y |
| @@ -23,6 +27,8 @@ CONFIG_EMBEDDED=y | |||
| 23 | CONFIG_FB=y | 27 | CONFIG_FB=y |
| 24 | CONFIG_HARDENED_USERCOPY=y | 28 | CONFIG_HARDENED_USERCOPY=y |
| 25 | CONFIG_HIGH_RES_TIMERS=y | 29 | CONFIG_HIGH_RES_TIMERS=y |
| 30 | CONFIG_IKCONFIG=y | ||
| 31 | CONFIG_IKCONFIG_PROC=y | ||
| 26 | CONFIG_INET6_AH=y | 32 | CONFIG_INET6_AH=y |
| 27 | CONFIG_INET6_ESP=y | 33 | CONFIG_INET6_ESP=y |
| 28 | CONFIG_INET6_IPCOMP=y | 34 | CONFIG_INET6_IPCOMP=y |
| @@ -60,6 +66,9 @@ CONFIG_IP_NF_TARGET_MASQUERADE=y | |||
| 60 | CONFIG_IP_NF_TARGET_NETMAP=y | 66 | CONFIG_IP_NF_TARGET_NETMAP=y |
| 61 | CONFIG_IP_NF_TARGET_REDIRECT=y | 67 | CONFIG_IP_NF_TARGET_REDIRECT=y |
| 62 | CONFIG_IP_NF_TARGET_REJECT=y | 68 | CONFIG_IP_NF_TARGET_REJECT=y |
| 69 | CONFIG_MODULES=y | ||
| 70 | CONFIG_MODULE_UNLOAD=y | ||
| 71 | CONFIG_MODVERSIONS=y | ||
| 63 | CONFIG_NET=y | 72 | CONFIG_NET=y |
| 64 | CONFIG_NETDEVICES=y | 73 | CONFIG_NETDEVICES=y |
| 65 | CONFIG_NETFILTER=y | 74 | CONFIG_NETFILTER=y |
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index 28ee064b6744..946fb92418f7 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config | |||
| @@ -6,13 +6,15 @@ | |||
| 6 | # CONFIG_NF_CONNTRACK_SIP is not set | 6 | # CONFIG_NF_CONNTRACK_SIP is not set |
| 7 | # CONFIG_PM_WAKELOCKS_GC is not set | 7 | # CONFIG_PM_WAKELOCKS_GC is not set |
| 8 | # CONFIG_VT is not set | 8 | # CONFIG_VT is not set |
| 9 | CONFIG_ARM64_SW_TTBR0_PAN=y | ||
| 9 | CONFIG_BACKLIGHT_LCD_SUPPORT=y | 10 | CONFIG_BACKLIGHT_LCD_SUPPORT=y |
| 10 | CONFIG_BLK_DEV_DM=y | 11 | CONFIG_BLK_DEV_DM=y |
| 11 | CONFIG_BLK_DEV_LOOP=y | 12 | CONFIG_BLK_DEV_LOOP=y |
| 12 | CONFIG_BLK_DEV_RAM=y | 13 | CONFIG_BLK_DEV_RAM=y |
| 13 | CONFIG_BLK_DEV_RAM_SIZE=8192 | 14 | CONFIG_BLK_DEV_RAM_SIZE=8192 |
| 15 | CONFIG_CC_STACKPROTECTOR_STRONG=y | ||
| 14 | CONFIG_COMPACTION=y | 16 | CONFIG_COMPACTION=y |
| 15 | CONFIG_STRICT_KERNEL_RWX=y | 17 | CONFIG_CPU_SW_DOMAIN_PAN=y |
| 16 | CONFIG_DM_CRYPT=y | 18 | CONFIG_DM_CRYPT=y |
| 17 | CONFIG_DM_UEVENT=y | 19 | CONFIG_DM_UEVENT=y |
| 18 | CONFIG_DM_VERITY=y | 20 | CONFIG_DM_VERITY=y |
| @@ -105,6 +107,7 @@ CONFIG_SCHEDSTATS=y | |||
| 105 | CONFIG_SMARTJOYPLUS_FF=y | 107 | CONFIG_SMARTJOYPLUS_FF=y |
| 106 | CONFIG_SND=y | 108 | CONFIG_SND=y |
| 107 | CONFIG_SOUND=y | 109 | CONFIG_SOUND=y |
| 110 | CONFIG_STRICT_KERNEL_RWX=y | ||
| 108 | CONFIG_SUSPEND_TIME=y | 111 | CONFIG_SUSPEND_TIME=y |
| 109 | CONFIG_TABLET_USB_ACECAD=y | 112 | CONFIG_TABLET_USB_ACECAD=y |
| 110 | CONFIG_TABLET_USB_AIPTEK=y | 113 | CONFIG_TABLET_USB_AIPTEK=y |
diff --git a/kernel/cpu.c b/kernel/cpu.c index b03a32595cfe..eee033134262 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -271,11 +271,26 @@ void cpu_hotplug_enable(void) | |||
| 271 | EXPORT_SYMBOL_GPL(cpu_hotplug_enable); | 271 | EXPORT_SYMBOL_GPL(cpu_hotplug_enable); |
| 272 | #endif /* CONFIG_HOTPLUG_CPU */ | 272 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 273 | 273 | ||
| 274 | static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st); | ||
| 275 | |||
| 274 | static int bringup_wait_for_ap(unsigned int cpu) | 276 | static int bringup_wait_for_ap(unsigned int cpu) |
| 275 | { | 277 | { |
| 276 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | 278 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); |
| 277 | 279 | ||
| 280 | /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */ | ||
| 278 | wait_for_completion(&st->done); | 281 | wait_for_completion(&st->done); |
| 282 | if (WARN_ON_ONCE((!cpu_online(cpu)))) | ||
| 283 | return -ECANCELED; | ||
| 284 | |||
| 285 | /* Unpark the stopper thread and the hotplug thread of the target cpu */ | ||
| 286 | stop_machine_unpark(cpu); | ||
| 287 | kthread_unpark(st->thread); | ||
| 288 | |||
| 289 | /* Should we go further up ? */ | ||
| 290 | if (st->target > CPUHP_AP_ONLINE_IDLE) { | ||
| 291 | __cpuhp_kick_ap_work(st); | ||
| 292 | wait_for_completion(&st->done); | ||
| 293 | } | ||
| 279 | return st->result; | 294 | return st->result; |
| 280 | } | 295 | } |
| 281 | 296 | ||
| @@ -296,9 +311,7 @@ static int bringup_cpu(unsigned int cpu) | |||
| 296 | irq_unlock_sparse(); | 311 | irq_unlock_sparse(); |
| 297 | if (ret) | 312 | if (ret) |
| 298 | return ret; | 313 | return ret; |
| 299 | ret = bringup_wait_for_ap(cpu); | 314 | return bringup_wait_for_ap(cpu); |
| 300 | BUG_ON(!cpu_online(cpu)); | ||
| 301 | return ret; | ||
| 302 | } | 315 | } |
| 303 | 316 | ||
| 304 | /* | 317 | /* |
| @@ -767,31 +780,20 @@ void notify_cpu_starting(unsigned int cpu) | |||
| 767 | } | 780 | } |
| 768 | 781 | ||
| 769 | /* | 782 | /* |
| 770 | * Called from the idle task. We need to set active here, so we can kick off | 783 | * Called from the idle task. Wake up the controlling task which brings the |
| 771 | * the stopper thread and unpark the smpboot threads. If the target state is | 784 | * stopper and the hotplug thread of the upcoming CPU up and then delegates |
| 772 | * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the | 785 | * the rest of the online bringup to the hotplug thread. |
| 773 | * cpu further. | ||
| 774 | */ | 786 | */ |
| 775 | void cpuhp_online_idle(enum cpuhp_state state) | 787 | void cpuhp_online_idle(enum cpuhp_state state) |
| 776 | { | 788 | { |
| 777 | struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); | 789 | struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); |
| 778 | unsigned int cpu = smp_processor_id(); | ||
| 779 | 790 | ||
| 780 | /* Happens for the boot cpu */ | 791 | /* Happens for the boot cpu */ |
| 781 | if (state != CPUHP_AP_ONLINE_IDLE) | 792 | if (state != CPUHP_AP_ONLINE_IDLE) |
| 782 | return; | 793 | return; |
| 783 | 794 | ||
| 784 | st->state = CPUHP_AP_ONLINE_IDLE; | 795 | st->state = CPUHP_AP_ONLINE_IDLE; |
| 785 | 796 | complete(&st->done); | |
| 786 | /* Unpark the stopper thread and the hotplug thread of this cpu */ | ||
| 787 | stop_machine_unpark(cpu); | ||
| 788 | kthread_unpark(st->thread); | ||
| 789 | |||
| 790 | /* Should we go further up ? */ | ||
| 791 | if (st->target > CPUHP_AP_ONLINE_IDLE) | ||
| 792 | __cpuhp_kick_ap_work(st); | ||
| 793 | else | ||
| 794 | complete(&st->done); | ||
| 795 | } | 797 | } |
| 796 | 798 | ||
| 797 | /* Requires cpu_add_remove_lock to be held */ | 799 | /* Requires cpu_add_remove_lock to be held */ |
diff --git a/kernel/crash_core.c b/kernel/crash_core.c index fcbd568f1e95..6db80fc0810b 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c | |||
| @@ -14,10 +14,12 @@ | |||
| 14 | #include <asm/sections.h> | 14 | #include <asm/sections.h> |
| 15 | 15 | ||
| 16 | /* vmcoreinfo stuff */ | 16 | /* vmcoreinfo stuff */ |
| 17 | static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; | 17 | static unsigned char *vmcoreinfo_data; |
| 18 | u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; | 18 | static size_t vmcoreinfo_size; |
| 19 | size_t vmcoreinfo_size; | 19 | u32 *vmcoreinfo_note; |
| 20 | size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); | 20 | |
| 21 | /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ | ||
| 22 | static unsigned char *vmcoreinfo_data_safecopy; | ||
| 21 | 23 | ||
| 22 | /* | 24 | /* |
| 23 | * parsing the "crashkernel" commandline | 25 | * parsing the "crashkernel" commandline |
| @@ -324,8 +326,23 @@ static void update_vmcoreinfo_note(void) | |||
| 324 | final_note(buf); | 326 | final_note(buf); |
| 325 | } | 327 | } |
| 326 | 328 | ||
| 329 | void crash_update_vmcoreinfo_safecopy(void *ptr) | ||
| 330 | { | ||
| 331 | if (ptr) | ||
| 332 | memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size); | ||
| 333 | |||
| 334 | vmcoreinfo_data_safecopy = ptr; | ||
| 335 | } | ||
| 336 | |||
| 327 | void crash_save_vmcoreinfo(void) | 337 | void crash_save_vmcoreinfo(void) |
| 328 | { | 338 | { |
| 339 | if (!vmcoreinfo_note) | ||
| 340 | return; | ||
| 341 | |||
| 342 | /* Use the safe copy to generate vmcoreinfo note if have */ | ||
| 343 | if (vmcoreinfo_data_safecopy) | ||
| 344 | vmcoreinfo_data = vmcoreinfo_data_safecopy; | ||
| 345 | |||
| 329 | vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); | 346 | vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); |
| 330 | update_vmcoreinfo_note(); | 347 | update_vmcoreinfo_note(); |
| 331 | } | 348 | } |
| @@ -340,7 +357,7 @@ void vmcoreinfo_append_str(const char *fmt, ...) | |||
| 340 | r = vscnprintf(buf, sizeof(buf), fmt, args); | 357 | r = vscnprintf(buf, sizeof(buf), fmt, args); |
| 341 | va_end(args); | 358 | va_end(args); |
| 342 | 359 | ||
| 343 | r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); | 360 | r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size); |
| 344 | 361 | ||
| 345 | memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); | 362 | memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); |
| 346 | 363 | ||
| @@ -356,11 +373,26 @@ void __weak arch_crash_save_vmcoreinfo(void) | |||
| 356 | 373 | ||
| 357 | phys_addr_t __weak paddr_vmcoreinfo_note(void) | 374 | phys_addr_t __weak paddr_vmcoreinfo_note(void) |
| 358 | { | 375 | { |
| 359 | return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); | 376 | return __pa(vmcoreinfo_note); |
| 360 | } | 377 | } |
| 361 | 378 | ||
| 362 | static int __init crash_save_vmcoreinfo_init(void) | 379 | static int __init crash_save_vmcoreinfo_init(void) |
| 363 | { | 380 | { |
| 381 | vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); | ||
| 382 | if (!vmcoreinfo_data) { | ||
| 383 | pr_warn("Memory allocation for vmcoreinfo_data failed\n"); | ||
| 384 | return -ENOMEM; | ||
| 385 | } | ||
| 386 | |||
| 387 | vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, | ||
| 388 | GFP_KERNEL | __GFP_ZERO); | ||
| 389 | if (!vmcoreinfo_note) { | ||
| 390 | free_page((unsigned long)vmcoreinfo_data); | ||
| 391 | vmcoreinfo_data = NULL; | ||
| 392 | pr_warn("Memory allocation for vmcoreinfo_note failed\n"); | ||
| 393 | return -ENOMEM; | ||
| 394 | } | ||
| 395 | |||
| 364 | VMCOREINFO_OSRELEASE(init_uts_ns.name.release); | 396 | VMCOREINFO_OSRELEASE(init_uts_ns.name.release); |
| 365 | VMCOREINFO_PAGESIZE(PAGE_SIZE); | 397 | VMCOREINFO_PAGESIZE(PAGE_SIZE); |
| 366 | 398 | ||
diff --git a/kernel/cred.c b/kernel/cred.c index 2bc66075740f..ecf03657e71c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
| @@ -1,4 +1,4 @@ | |||
| 1 | /* Task credentials management - see Documentation/security/credentials.txt | 1 | /* Task credentials management - see Documentation/security/credentials.rst |
| 2 | * | 2 | * |
| 3 | * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. | 3 | * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. |
| 4 | * Written by David Howells (dhowells@redhat.com) | 4 | * Written by David Howells (dhowells@redhat.com) |
diff --git a/kernel/events/core.c b/kernel/events/core.c index c9cdbd396770..426c2ffba16d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -3639,10 +3639,10 @@ static inline u64 perf_event_count(struct perf_event *event) | |||
| 3639 | * will not be local and we cannot read them atomically | 3639 | * will not be local and we cannot read them atomically |
| 3640 | * - must not have a pmu::count method | 3640 | * - must not have a pmu::count method |
| 3641 | */ | 3641 | */ |
| 3642 | u64 perf_event_read_local(struct perf_event *event) | 3642 | int perf_event_read_local(struct perf_event *event, u64 *value) |
| 3643 | { | 3643 | { |
| 3644 | unsigned long flags; | 3644 | unsigned long flags; |
| 3645 | u64 val; | 3645 | int ret = 0; |
| 3646 | 3646 | ||
| 3647 | /* | 3647 | /* |
| 3648 | * Disabling interrupts avoids all counter scheduling (context | 3648 | * Disabling interrupts avoids all counter scheduling (context |
| @@ -3650,25 +3650,37 @@ u64 perf_event_read_local(struct perf_event *event) | |||
| 3650 | */ | 3650 | */ |
| 3651 | local_irq_save(flags); | 3651 | local_irq_save(flags); |
| 3652 | 3652 | ||
| 3653 | /* If this is a per-task event, it must be for current */ | ||
| 3654 | WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) && | ||
| 3655 | event->hw.target != current); | ||
| 3656 | |||
| 3657 | /* If this is a per-CPU event, it must be for this CPU */ | ||
| 3658 | WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) && | ||
| 3659 | event->cpu != smp_processor_id()); | ||
| 3660 | |||
| 3661 | /* | 3653 | /* |
| 3662 | * It must not be an event with inherit set, we cannot read | 3654 | * It must not be an event with inherit set, we cannot read |
| 3663 | * all child counters from atomic context. | 3655 | * all child counters from atomic context. |
| 3664 | */ | 3656 | */ |
| 3665 | WARN_ON_ONCE(event->attr.inherit); | 3657 | if (event->attr.inherit) { |
| 3658 | ret = -EOPNOTSUPP; | ||
| 3659 | goto out; | ||
| 3660 | } | ||
| 3666 | 3661 | ||
| 3667 | /* | 3662 | /* |
| 3668 | * It must not have a pmu::count method, those are not | 3663 | * It must not have a pmu::count method, those are not |
| 3669 | * NMI safe. | 3664 | * NMI safe. |
| 3670 | */ | 3665 | */ |
| 3671 | WARN_ON_ONCE(event->pmu->count); | 3666 | if (event->pmu->count) { |
| 3667 | ret = -EOPNOTSUPP; | ||
| 3668 | goto out; | ||
| 3669 | } | ||
| 3670 | |||
| 3671 | /* If this is a per-task event, it must be for current */ | ||
| 3672 | if ((event->attach_state & PERF_ATTACH_TASK) && | ||
| 3673 | event->hw.target != current) { | ||
| 3674 | ret = -EINVAL; | ||
| 3675 | goto out; | ||
| 3676 | } | ||
| 3677 | |||
| 3678 | /* If this is a per-CPU event, it must be for this CPU */ | ||
| 3679 | if (!(event->attach_state & PERF_ATTACH_TASK) && | ||
| 3680 | event->cpu != smp_processor_id()) { | ||
| 3681 | ret = -EINVAL; | ||
| 3682 | goto out; | ||
| 3683 | } | ||
| 3672 | 3684 | ||
| 3673 | /* | 3685 | /* |
| 3674 | * If the event is currently on this CPU, its either a per-task event, | 3686 | * If the event is currently on this CPU, its either a per-task event, |
| @@ -3678,10 +3690,11 @@ u64 perf_event_read_local(struct perf_event *event) | |||
| 3678 | if (event->oncpu == smp_processor_id()) | 3690 | if (event->oncpu == smp_processor_id()) |
| 3679 | event->pmu->read(event); | 3691 | event->pmu->read(event); |
| 3680 | 3692 | ||
| 3681 | val = local64_read(&event->count); | 3693 | *value = local64_read(&event->count); |
| 3694 | out: | ||
| 3682 | local_irq_restore(flags); | 3695 | local_irq_restore(flags); |
| 3683 | 3696 | ||
| 3684 | return val; | 3697 | return ret; |
| 3685 | } | 3698 | } |
| 3686 | 3699 | ||
| 3687 | static int perf_event_read(struct perf_event *event, bool group) | 3700 | static int perf_event_read(struct perf_event *event, bool group) |
| @@ -4372,7 +4385,9 @@ EXPORT_SYMBOL_GPL(perf_event_read_value); | |||
| 4372 | static int __perf_read_group_add(struct perf_event *leader, | 4385 | static int __perf_read_group_add(struct perf_event *leader, |
| 4373 | u64 read_format, u64 *values) | 4386 | u64 read_format, u64 *values) |
| 4374 | { | 4387 | { |
| 4388 | struct perf_event_context *ctx = leader->ctx; | ||
| 4375 | struct perf_event *sub; | 4389 | struct perf_event *sub; |
| 4390 | unsigned long flags; | ||
| 4376 | int n = 1; /* skip @nr */ | 4391 | int n = 1; /* skip @nr */ |
| 4377 | int ret; | 4392 | int ret; |
| 4378 | 4393 | ||
| @@ -4402,12 +4417,15 @@ static int __perf_read_group_add(struct perf_event *leader, | |||
| 4402 | if (read_format & PERF_FORMAT_ID) | 4417 | if (read_format & PERF_FORMAT_ID) |
| 4403 | values[n++] = primary_event_id(leader); | 4418 | values[n++] = primary_event_id(leader); |
| 4404 | 4419 | ||
| 4420 | raw_spin_lock_irqsave(&ctx->lock, flags); | ||
| 4421 | |||
| 4405 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { | 4422 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { |
| 4406 | values[n++] += perf_event_count(sub); | 4423 | values[n++] += perf_event_count(sub); |
| 4407 | if (read_format & PERF_FORMAT_ID) | 4424 | if (read_format & PERF_FORMAT_ID) |
| 4408 | values[n++] = primary_event_id(sub); | 4425 | values[n++] = primary_event_id(sub); |
| 4409 | } | 4426 | } |
| 4410 | 4427 | ||
| 4428 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | ||
| 4411 | return 0; | 4429 | return 0; |
| 4412 | } | 4430 | } |
| 4413 | 4431 | ||
| @@ -8035,12 +8053,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) | |||
| 8035 | bool is_kprobe, is_tracepoint; | 8053 | bool is_kprobe, is_tracepoint; |
| 8036 | struct bpf_prog *prog; | 8054 | struct bpf_prog *prog; |
| 8037 | 8055 | ||
| 8038 | if (event->attr.type == PERF_TYPE_HARDWARE || | ||
| 8039 | event->attr.type == PERF_TYPE_SOFTWARE) | ||
| 8040 | return perf_event_set_bpf_handler(event, prog_fd); | ||
| 8041 | |||
| 8042 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | 8056 | if (event->attr.type != PERF_TYPE_TRACEPOINT) |
| 8043 | return -EINVAL; | 8057 | return perf_event_set_bpf_handler(event, prog_fd); |
| 8044 | 8058 | ||
| 8045 | if (event->tp_event->prog) | 8059 | if (event->tp_event->prog) |
| 8046 | return -EEXIST; | 8060 | return -EEXIST; |
diff --git a/kernel/exit.c b/kernel/exit.c index c63226283aef..c5548faa9f37 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -51,7 +51,6 @@ | |||
| 51 | #include <linux/task_io_accounting_ops.h> | 51 | #include <linux/task_io_accounting_ops.h> |
| 52 | #include <linux/tracehook.h> | 52 | #include <linux/tracehook.h> |
| 53 | #include <linux/fs_struct.h> | 53 | #include <linux/fs_struct.h> |
| 54 | #include <linux/userfaultfd_k.h> | ||
| 55 | #include <linux/init_task.h> | 54 | #include <linux/init_task.h> |
| 56 | #include <linux/perf_event.h> | 55 | #include <linux/perf_event.h> |
| 57 | #include <trace/events/sched.h> | 56 | #include <trace/events/sched.h> |
| @@ -62,6 +61,7 @@ | |||
| 62 | #include <linux/kcov.h> | 61 | #include <linux/kcov.h> |
| 63 | #include <linux/random.h> | 62 | #include <linux/random.h> |
| 64 | #include <linux/rcuwait.h> | 63 | #include <linux/rcuwait.h> |
| 64 | #include <linux/compat.h> | ||
| 65 | 65 | ||
| 66 | #include <linux/uaccess.h> | 66 | #include <linux/uaccess.h> |
| 67 | #include <asm/unistd.h> | 67 | #include <asm/unistd.h> |
| @@ -982,14 +982,21 @@ SYSCALL_DEFINE1(exit_group, int, error_code) | |||
| 982 | return 0; | 982 | return 0; |
| 983 | } | 983 | } |
| 984 | 984 | ||
| 985 | struct waitid_info { | ||
| 986 | pid_t pid; | ||
| 987 | uid_t uid; | ||
| 988 | int status; | ||
| 989 | int cause; | ||
| 990 | }; | ||
| 991 | |||
| 985 | struct wait_opts { | 992 | struct wait_opts { |
| 986 | enum pid_type wo_type; | 993 | enum pid_type wo_type; |
| 987 | int wo_flags; | 994 | int wo_flags; |
| 988 | struct pid *wo_pid; | 995 | struct pid *wo_pid; |
| 989 | 996 | ||
| 990 | struct siginfo __user *wo_info; | 997 | struct waitid_info *wo_info; |
| 991 | int __user *wo_stat; | 998 | int wo_stat; |
| 992 | struct rusage __user *wo_rusage; | 999 | struct rusage *wo_rusage; |
| 993 | 1000 | ||
| 994 | wait_queue_entry_t child_wait; | 1001 | wait_queue_entry_t child_wait; |
| 995 | int notask_error; | 1002 | int notask_error; |
| @@ -1036,34 +1043,6 @@ eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) | |||
| 1036 | return 1; | 1043 | return 1; |
| 1037 | } | 1044 | } |
| 1038 | 1045 | ||
| 1039 | static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, | ||
| 1040 | pid_t pid, uid_t uid, int why, int status) | ||
| 1041 | { | ||
| 1042 | struct siginfo __user *infop; | ||
| 1043 | int retval = wo->wo_rusage | ||
| 1044 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | ||
| 1045 | |||
| 1046 | put_task_struct(p); | ||
| 1047 | infop = wo->wo_info; | ||
| 1048 | if (infop) { | ||
| 1049 | if (!retval) | ||
| 1050 | retval = put_user(SIGCHLD, &infop->si_signo); | ||
| 1051 | if (!retval) | ||
| 1052 | retval = put_user(0, &infop->si_errno); | ||
| 1053 | if (!retval) | ||
| 1054 | retval = put_user((short)why, &infop->si_code); | ||
| 1055 | if (!retval) | ||
| 1056 | retval = put_user(pid, &infop->si_pid); | ||
| 1057 | if (!retval) | ||
| 1058 | retval = put_user(uid, &infop->si_uid); | ||
| 1059 | if (!retval) | ||
| 1060 | retval = put_user(status, &infop->si_status); | ||
| 1061 | } | ||
| 1062 | if (!retval) | ||
| 1063 | retval = pid; | ||
| 1064 | return retval; | ||
| 1065 | } | ||
| 1066 | |||
| 1067 | /* | 1046 | /* |
| 1068 | * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold | 1047 | * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold |
| 1069 | * read_lock(&tasklist_lock) on entry. If we return zero, we still hold | 1048 | * read_lock(&tasklist_lock) on entry. If we return zero, we still hold |
| @@ -1072,30 +1051,23 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, | |||
| 1072 | */ | 1051 | */ |
| 1073 | static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | 1052 | static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) |
| 1074 | { | 1053 | { |
| 1075 | int state, retval, status; | 1054 | int state, status; |
| 1076 | pid_t pid = task_pid_vnr(p); | 1055 | pid_t pid = task_pid_vnr(p); |
| 1077 | uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); | 1056 | uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); |
| 1078 | struct siginfo __user *infop; | 1057 | struct waitid_info *infop; |
| 1079 | 1058 | ||
| 1080 | if (!likely(wo->wo_flags & WEXITED)) | 1059 | if (!likely(wo->wo_flags & WEXITED)) |
| 1081 | return 0; | 1060 | return 0; |
| 1082 | 1061 | ||
| 1083 | if (unlikely(wo->wo_flags & WNOWAIT)) { | 1062 | if (unlikely(wo->wo_flags & WNOWAIT)) { |
| 1084 | int exit_code = p->exit_code; | 1063 | status = p->exit_code; |
| 1085 | int why; | ||
| 1086 | |||
| 1087 | get_task_struct(p); | 1064 | get_task_struct(p); |
| 1088 | read_unlock(&tasklist_lock); | 1065 | read_unlock(&tasklist_lock); |
| 1089 | sched_annotate_sleep(); | 1066 | sched_annotate_sleep(); |
| 1090 | 1067 | if (wo->wo_rusage) | |
| 1091 | if ((exit_code & 0x7f) == 0) { | 1068 | getrusage(p, RUSAGE_BOTH, wo->wo_rusage); |
| 1092 | why = CLD_EXITED; | 1069 | put_task_struct(p); |
| 1093 | status = exit_code >> 8; | 1070 | goto out_info; |
| 1094 | } else { | ||
| 1095 | why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; | ||
| 1096 | status = exit_code & 0x7f; | ||
| 1097 | } | ||
| 1098 | return wait_noreap_copyout(wo, p, pid, uid, why, status); | ||
| 1099 | } | 1071 | } |
| 1100 | /* | 1072 | /* |
| 1101 | * Move the task's state to DEAD/TRACE, only one thread can do this. | 1073 | * Move the task's state to DEAD/TRACE, only one thread can do this. |
| @@ -1168,38 +1140,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1168 | spin_unlock_irq(¤t->sighand->siglock); | 1140 | spin_unlock_irq(¤t->sighand->siglock); |
| 1169 | } | 1141 | } |
| 1170 | 1142 | ||
| 1171 | retval = wo->wo_rusage | 1143 | if (wo->wo_rusage) |
| 1172 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | 1144 | getrusage(p, RUSAGE_BOTH, wo->wo_rusage); |
| 1173 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) | 1145 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) |
| 1174 | ? p->signal->group_exit_code : p->exit_code; | 1146 | ? p->signal->group_exit_code : p->exit_code; |
| 1175 | if (!retval && wo->wo_stat) | 1147 | wo->wo_stat = status; |
| 1176 | retval = put_user(status, wo->wo_stat); | ||
| 1177 | |||
| 1178 | infop = wo->wo_info; | ||
| 1179 | if (!retval && infop) | ||
| 1180 | retval = put_user(SIGCHLD, &infop->si_signo); | ||
| 1181 | if (!retval && infop) | ||
| 1182 | retval = put_user(0, &infop->si_errno); | ||
| 1183 | if (!retval && infop) { | ||
| 1184 | int why; | ||
| 1185 | |||
| 1186 | if ((status & 0x7f) == 0) { | ||
| 1187 | why = CLD_EXITED; | ||
| 1188 | status >>= 8; | ||
| 1189 | } else { | ||
| 1190 | why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; | ||
| 1191 | status &= 0x7f; | ||
| 1192 | } | ||
| 1193 | retval = put_user((short)why, &infop->si_code); | ||
| 1194 | if (!retval) | ||
| 1195 | retval = put_user(status, &infop->si_status); | ||
| 1196 | } | ||
| 1197 | if (!retval && infop) | ||
| 1198 | retval = put_user(pid, &infop->si_pid); | ||
| 1199 | if (!retval && infop) | ||
| 1200 | retval = put_user(uid, &infop->si_uid); | ||
| 1201 | if (!retval) | ||
| 1202 | retval = pid; | ||
| 1203 | 1148 | ||
| 1204 | if (state == EXIT_TRACE) { | 1149 | if (state == EXIT_TRACE) { |
| 1205 | write_lock_irq(&tasklist_lock); | 1150 | write_lock_irq(&tasklist_lock); |
| @@ -1216,7 +1161,21 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1216 | if (state == EXIT_DEAD) | 1161 | if (state == EXIT_DEAD) |
| 1217 | release_task(p); | 1162 | release_task(p); |
| 1218 | 1163 | ||
| 1219 | return retval; | 1164 | out_info: |
| 1165 | infop = wo->wo_info; | ||
| 1166 | if (infop) { | ||
| 1167 | if ((status & 0x7f) == 0) { | ||
| 1168 | infop->cause = CLD_EXITED; | ||
| 1169 | infop->status = status >> 8; | ||
| 1170 | } else { | ||
| 1171 | infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; | ||
| 1172 | infop->status = status & 0x7f; | ||
| 1173 | } | ||
| 1174 | infop->pid = pid; | ||
| 1175 | infop->uid = uid; | ||
| 1176 | } | ||
| 1177 | |||
| 1178 | return pid; | ||
| 1220 | } | 1179 | } |
| 1221 | 1180 | ||
| 1222 | static int *task_stopped_code(struct task_struct *p, bool ptrace) | 1181 | static int *task_stopped_code(struct task_struct *p, bool ptrace) |
| @@ -1252,8 +1211,8 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace) | |||
| 1252 | static int wait_task_stopped(struct wait_opts *wo, | 1211 | static int wait_task_stopped(struct wait_opts *wo, |
| 1253 | int ptrace, struct task_struct *p) | 1212 | int ptrace, struct task_struct *p) |
| 1254 | { | 1213 | { |
| 1255 | struct siginfo __user *infop; | 1214 | struct waitid_info *infop; |
| 1256 | int retval, exit_code, *p_code, why; | 1215 | int exit_code, *p_code, why; |
| 1257 | uid_t uid = 0; /* unneeded, required by compiler */ | 1216 | uid_t uid = 0; /* unneeded, required by compiler */ |
| 1258 | pid_t pid; | 1217 | pid_t pid; |
| 1259 | 1218 | ||
| @@ -1298,34 +1257,21 @@ unlock_sig: | |||
| 1298 | why = ptrace ? CLD_TRAPPED : CLD_STOPPED; | 1257 | why = ptrace ? CLD_TRAPPED : CLD_STOPPED; |
| 1299 | read_unlock(&tasklist_lock); | 1258 | read_unlock(&tasklist_lock); |
| 1300 | sched_annotate_sleep(); | 1259 | sched_annotate_sleep(); |
| 1260 | if (wo->wo_rusage) | ||
| 1261 | getrusage(p, RUSAGE_BOTH, wo->wo_rusage); | ||
| 1262 | put_task_struct(p); | ||
| 1301 | 1263 | ||
| 1302 | if (unlikely(wo->wo_flags & WNOWAIT)) | 1264 | if (likely(!(wo->wo_flags & WNOWAIT))) |
| 1303 | return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); | 1265 | wo->wo_stat = (exit_code << 8) | 0x7f; |
| 1304 | |||
| 1305 | retval = wo->wo_rusage | ||
| 1306 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | ||
| 1307 | if (!retval && wo->wo_stat) | ||
| 1308 | retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat); | ||
| 1309 | 1266 | ||
| 1310 | infop = wo->wo_info; | 1267 | infop = wo->wo_info; |
| 1311 | if (!retval && infop) | 1268 | if (infop) { |
| 1312 | retval = put_user(SIGCHLD, &infop->si_signo); | 1269 | infop->cause = why; |
| 1313 | if (!retval && infop) | 1270 | infop->status = exit_code; |
| 1314 | retval = put_user(0, &infop->si_errno); | 1271 | infop->pid = pid; |
| 1315 | if (!retval && infop) | 1272 | infop->uid = uid; |
| 1316 | retval = put_user((short)why, &infop->si_code); | 1273 | } |
| 1317 | if (!retval && infop) | 1274 | return pid; |
| 1318 | retval = put_user(exit_code, &infop->si_status); | ||
| 1319 | if (!retval && infop) | ||
| 1320 | retval = put_user(pid, &infop->si_pid); | ||
| 1321 | if (!retval && infop) | ||
| 1322 | retval = put_user(uid, &infop->si_uid); | ||
| 1323 | if (!retval) | ||
| 1324 | retval = pid; | ||
| 1325 | put_task_struct(p); | ||
| 1326 | |||
| 1327 | BUG_ON(!retval); | ||
| 1328 | return retval; | ||
| 1329 | } | 1275 | } |
| 1330 | 1276 | ||
| 1331 | /* | 1277 | /* |
| @@ -1336,7 +1282,7 @@ unlock_sig: | |||
| 1336 | */ | 1282 | */ |
| 1337 | static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) | 1283 | static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) |
| 1338 | { | 1284 | { |
| 1339 | int retval; | 1285 | struct waitid_info *infop; |
| 1340 | pid_t pid; | 1286 | pid_t pid; |
| 1341 | uid_t uid; | 1287 | uid_t uid; |
| 1342 | 1288 | ||
| @@ -1361,22 +1307,20 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) | |||
| 1361 | get_task_struct(p); | 1307 | get_task_struct(p); |
| 1362 | read_unlock(&tasklist_lock); | 1308 | read_unlock(&tasklist_lock); |
| 1363 | sched_annotate_sleep(); | 1309 | sched_annotate_sleep(); |
| 1310 | if (wo->wo_rusage) | ||
| 1311 | getrusage(p, RUSAGE_BOTH, wo->wo_rusage); | ||
| 1312 | put_task_struct(p); | ||
| 1364 | 1313 | ||
| 1365 | if (!wo->wo_info) { | 1314 | infop = wo->wo_info; |
| 1366 | retval = wo->wo_rusage | 1315 | if (!infop) { |
| 1367 | ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; | 1316 | wo->wo_stat = 0xffff; |
| 1368 | put_task_struct(p); | ||
| 1369 | if (!retval && wo->wo_stat) | ||
| 1370 | retval = put_user(0xffff, wo->wo_stat); | ||
| 1371 | if (!retval) | ||
| 1372 | retval = pid; | ||
| 1373 | } else { | 1317 | } else { |
| 1374 | retval = wait_noreap_copyout(wo, p, pid, uid, | 1318 | infop->cause = CLD_CONTINUED; |
| 1375 | CLD_CONTINUED, SIGCONT); | 1319 | infop->pid = pid; |
| 1376 | BUG_ON(retval == 0); | 1320 | infop->uid = uid; |
| 1321 | infop->status = SIGCONT; | ||
| 1377 | } | 1322 | } |
| 1378 | 1323 | return pid; | |
| 1379 | return retval; | ||
| 1380 | } | 1324 | } |
| 1381 | 1325 | ||
| 1382 | /* | 1326 | /* |
| @@ -1604,8 +1548,8 @@ end: | |||
| 1604 | return retval; | 1548 | return retval; |
| 1605 | } | 1549 | } |
| 1606 | 1550 | ||
| 1607 | SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, | 1551 | static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, |
| 1608 | infop, int, options, struct rusage __user *, ru) | 1552 | int options, struct rusage *ru) |
| 1609 | { | 1553 | { |
| 1610 | struct wait_opts wo; | 1554 | struct wait_opts wo; |
| 1611 | struct pid *pid = NULL; | 1555 | struct pid *pid = NULL; |
| @@ -1643,38 +1587,48 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, | |||
| 1643 | wo.wo_pid = pid; | 1587 | wo.wo_pid = pid; |
| 1644 | wo.wo_flags = options; | 1588 | wo.wo_flags = options; |
| 1645 | wo.wo_info = infop; | 1589 | wo.wo_info = infop; |
| 1646 | wo.wo_stat = NULL; | ||
| 1647 | wo.wo_rusage = ru; | 1590 | wo.wo_rusage = ru; |
| 1648 | ret = do_wait(&wo); | 1591 | ret = do_wait(&wo); |
| 1649 | 1592 | ||
| 1650 | if (ret > 0) { | ||
| 1651 | ret = 0; | ||
| 1652 | } else if (infop) { | ||
| 1653 | /* | ||
| 1654 | * For a WNOHANG return, clear out all the fields | ||
| 1655 | * we would set so the user can easily tell the | ||
| 1656 | * difference. | ||
| 1657 | */ | ||
| 1658 | if (!ret) | ||
| 1659 | ret = put_user(0, &infop->si_signo); | ||
| 1660 | if (!ret) | ||
| 1661 | ret = put_user(0, &infop->si_errno); | ||
| 1662 | if (!ret) | ||
| 1663 | ret = put_user(0, &infop->si_code); | ||
| 1664 | if (!ret) | ||
| 1665 | ret = put_user(0, &infop->si_pid); | ||
| 1666 | if (!ret) | ||
| 1667 | ret = put_user(0, &infop->si_uid); | ||
| 1668 | if (!ret) | ||
| 1669 | ret = put_user(0, &infop->si_status); | ||
| 1670 | } | ||
| 1671 | |||
| 1672 | put_pid(pid); | 1593 | put_pid(pid); |
| 1673 | return ret; | 1594 | return ret; |
| 1674 | } | 1595 | } |
| 1675 | 1596 | ||
| 1676 | SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, | 1597 | SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, |
| 1677 | int, options, struct rusage __user *, ru) | 1598 | infop, int, options, struct rusage __user *, ru) |
| 1599 | { | ||
| 1600 | struct rusage r; | ||
| 1601 | struct waitid_info info = {.status = 0}; | ||
| 1602 | long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); | ||
| 1603 | int signo = 0; | ||
| 1604 | if (err > 0) { | ||
| 1605 | signo = SIGCHLD; | ||
| 1606 | err = 0; | ||
| 1607 | } | ||
| 1608 | |||
| 1609 | if (!err) { | ||
| 1610 | if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) | ||
| 1611 | return -EFAULT; | ||
| 1612 | } | ||
| 1613 | if (!infop) | ||
| 1614 | return err; | ||
| 1615 | |||
| 1616 | user_access_begin(); | ||
| 1617 | unsafe_put_user(signo, &infop->si_signo, Efault); | ||
| 1618 | unsafe_put_user(0, &infop->si_errno, Efault); | ||
| 1619 | unsafe_put_user((short)info.cause, &infop->si_code, Efault); | ||
| 1620 | unsafe_put_user(info.pid, &infop->si_pid, Efault); | ||
| 1621 | unsafe_put_user(info.uid, &infop->si_uid, Efault); | ||
| 1622 | unsafe_put_user(info.status, &infop->si_status, Efault); | ||
| 1623 | user_access_end(); | ||
| 1624 | return err; | ||
| 1625 | Efault: | ||
| 1626 | user_access_end(); | ||
| 1627 | return -EFAULT; | ||
| 1628 | } | ||
| 1629 | |||
| 1630 | long kernel_wait4(pid_t upid, int __user *stat_addr, int options, | ||
| 1631 | struct rusage *ru) | ||
| 1678 | { | 1632 | { |
| 1679 | struct wait_opts wo; | 1633 | struct wait_opts wo; |
| 1680 | struct pid *pid = NULL; | 1634 | struct pid *pid = NULL; |
| @@ -1685,6 +1639,10 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, | |||
| 1685 | __WNOTHREAD|__WCLONE|__WALL)) | 1639 | __WNOTHREAD|__WCLONE|__WALL)) |
| 1686 | return -EINVAL; | 1640 | return -EINVAL; |
| 1687 | 1641 | ||
| 1642 | /* -INT_MIN is not defined */ | ||
| 1643 | if (upid == INT_MIN) | ||
| 1644 | return -ESRCH; | ||
| 1645 | |||
| 1688 | if (upid == -1) | 1646 | if (upid == -1) |
| 1689 | type = PIDTYPE_MAX; | 1647 | type = PIDTYPE_MAX; |
| 1690 | else if (upid < 0) { | 1648 | else if (upid < 0) { |
| @@ -1702,14 +1660,29 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, | |||
| 1702 | wo.wo_pid = pid; | 1660 | wo.wo_pid = pid; |
| 1703 | wo.wo_flags = options | WEXITED; | 1661 | wo.wo_flags = options | WEXITED; |
| 1704 | wo.wo_info = NULL; | 1662 | wo.wo_info = NULL; |
| 1705 | wo.wo_stat = stat_addr; | 1663 | wo.wo_stat = 0; |
| 1706 | wo.wo_rusage = ru; | 1664 | wo.wo_rusage = ru; |
| 1707 | ret = do_wait(&wo); | 1665 | ret = do_wait(&wo); |
| 1708 | put_pid(pid); | 1666 | put_pid(pid); |
| 1667 | if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr)) | ||
| 1668 | ret = -EFAULT; | ||
| 1709 | 1669 | ||
| 1710 | return ret; | 1670 | return ret; |
| 1711 | } | 1671 | } |
| 1712 | 1672 | ||
| 1673 | SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, | ||
| 1674 | int, options, struct rusage __user *, ru) | ||
| 1675 | { | ||
| 1676 | struct rusage r; | ||
| 1677 | long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL); | ||
| 1678 | |||
| 1679 | if (err > 0) { | ||
| 1680 | if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) | ||
| 1681 | return -EFAULT; | ||
| 1682 | } | ||
| 1683 | return err; | ||
| 1684 | } | ||
| 1685 | |||
| 1713 | #ifdef __ARCH_WANT_SYS_WAITPID | 1686 | #ifdef __ARCH_WANT_SYS_WAITPID |
| 1714 | 1687 | ||
| 1715 | /* | 1688 | /* |
| @@ -1722,3 +1695,61 @@ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) | |||
| 1722 | } | 1695 | } |
| 1723 | 1696 | ||
| 1724 | #endif | 1697 | #endif |
| 1698 | |||
| 1699 | #ifdef CONFIG_COMPAT | ||
| 1700 | COMPAT_SYSCALL_DEFINE4(wait4, | ||
| 1701 | compat_pid_t, pid, | ||
| 1702 | compat_uint_t __user *, stat_addr, | ||
| 1703 | int, options, | ||
| 1704 | struct compat_rusage __user *, ru) | ||
| 1705 | { | ||
| 1706 | struct rusage r; | ||
| 1707 | long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL); | ||
| 1708 | if (err > 0) { | ||
| 1709 | if (ru && put_compat_rusage(&r, ru)) | ||
| 1710 | return -EFAULT; | ||
| 1711 | } | ||
| 1712 | return err; | ||
| 1713 | } | ||
| 1714 | |||
| 1715 | COMPAT_SYSCALL_DEFINE5(waitid, | ||
| 1716 | int, which, compat_pid_t, pid, | ||
| 1717 | struct compat_siginfo __user *, infop, int, options, | ||
| 1718 | struct compat_rusage __user *, uru) | ||
| 1719 | { | ||
| 1720 | struct rusage ru; | ||
| 1721 | struct waitid_info info = {.status = 0}; | ||
| 1722 | long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); | ||
| 1723 | int signo = 0; | ||
| 1724 | if (err > 0) { | ||
| 1725 | signo = SIGCHLD; | ||
| 1726 | err = 0; | ||
| 1727 | } | ||
| 1728 | |||
| 1729 | if (!err && uru) { | ||
| 1730 | /* kernel_waitid() overwrites everything in ru */ | ||
| 1731 | if (COMPAT_USE_64BIT_TIME) | ||
| 1732 | err = copy_to_user(uru, &ru, sizeof(ru)); | ||
| 1733 | else | ||
| 1734 | err = put_compat_rusage(&ru, uru); | ||
| 1735 | if (err) | ||
| 1736 | return -EFAULT; | ||
| 1737 | } | ||
| 1738 | |||
| 1739 | if (!infop) | ||
| 1740 | return err; | ||
| 1741 | |||
| 1742 | user_access_begin(); | ||
| 1743 | unsafe_put_user(signo, &infop->si_signo, Efault); | ||
| 1744 | unsafe_put_user(0, &infop->si_errno, Efault); | ||
| 1745 | unsafe_put_user((short)info.cause, &infop->si_code, Efault); | ||
| 1746 | unsafe_put_user(info.pid, &infop->si_pid, Efault); | ||
| 1747 | unsafe_put_user(info.uid, &infop->si_uid, Efault); | ||
| 1748 | unsafe_put_user(info.status, &infop->si_status, Efault); | ||
| 1749 | user_access_end(); | ||
| 1750 | return err; | ||
| 1751 | Efault: | ||
| 1752 | user_access_end(); | ||
| 1753 | return -EFAULT; | ||
| 1754 | } | ||
| 1755 | #endif | ||
diff --git a/kernel/extable.c b/kernel/extable.c index 0fbdd8582f08..38c2412401a1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
| @@ -55,7 +55,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) | |||
| 55 | { | 55 | { |
| 56 | const struct exception_table_entry *e; | 56 | const struct exception_table_entry *e; |
| 57 | 57 | ||
| 58 | e = search_extable(__start___ex_table, __stop___ex_table-1, addr); | 58 | e = search_extable(__start___ex_table, |
| 59 | __stop___ex_table - __start___ex_table, addr); | ||
| 59 | if (!e) | 60 | if (!e) |
| 60 | e = search_module_extables(addr); | 61 | e = search_module_extables(addr); |
| 61 | return e; | 62 | return e; |
| @@ -69,7 +70,7 @@ static inline int init_kernel_text(unsigned long addr) | |||
| 69 | return 0; | 70 | return 0; |
| 70 | } | 71 | } |
| 71 | 72 | ||
| 72 | int core_kernel_text(unsigned long addr) | 73 | int notrace core_kernel_text(unsigned long addr) |
| 73 | { | 74 | { |
| 74 | if (addr >= (unsigned long)_stext && | 75 | if (addr >= (unsigned long)_stext && |
| 75 | addr < (unsigned long)_etext) | 76 | addr < (unsigned long)_etext) |
diff --git a/kernel/fork.c b/kernel/fork.c index e53770d2bf95..17921b0390b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -205,19 +205,17 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) | |||
| 205 | void *stack; | 205 | void *stack; |
| 206 | int i; | 206 | int i; |
| 207 | 207 | ||
| 208 | local_irq_disable(); | ||
| 209 | for (i = 0; i < NR_CACHED_STACKS; i++) { | 208 | for (i = 0; i < NR_CACHED_STACKS; i++) { |
| 210 | struct vm_struct *s = this_cpu_read(cached_stacks[i]); | 209 | struct vm_struct *s; |
| 210 | |||
| 211 | s = this_cpu_xchg(cached_stacks[i], NULL); | ||
| 211 | 212 | ||
| 212 | if (!s) | 213 | if (!s) |
| 213 | continue; | 214 | continue; |
| 214 | this_cpu_write(cached_stacks[i], NULL); | ||
| 215 | 215 | ||
| 216 | tsk->stack_vm_area = s; | 216 | tsk->stack_vm_area = s; |
| 217 | local_irq_enable(); | ||
| 218 | return s->addr; | 217 | return s->addr; |
| 219 | } | 218 | } |
| 220 | local_irq_enable(); | ||
| 221 | 219 | ||
| 222 | stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, | 220 | stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, |
| 223 | VMALLOC_START, VMALLOC_END, | 221 | VMALLOC_START, VMALLOC_END, |
| @@ -245,19 +243,15 @@ static inline void free_thread_stack(struct task_struct *tsk) | |||
| 245 | { | 243 | { |
| 246 | #ifdef CONFIG_VMAP_STACK | 244 | #ifdef CONFIG_VMAP_STACK |
| 247 | if (task_stack_vm_area(tsk)) { | 245 | if (task_stack_vm_area(tsk)) { |
| 248 | unsigned long flags; | ||
| 249 | int i; | 246 | int i; |
| 250 | 247 | ||
| 251 | local_irq_save(flags); | ||
| 252 | for (i = 0; i < NR_CACHED_STACKS; i++) { | 248 | for (i = 0; i < NR_CACHED_STACKS; i++) { |
| 253 | if (this_cpu_read(cached_stacks[i])) | 249 | if (this_cpu_cmpxchg(cached_stacks[i], |
| 250 | NULL, tsk->stack_vm_area) != NULL) | ||
| 254 | continue; | 251 | continue; |
| 255 | 252 | ||
| 256 | this_cpu_write(cached_stacks[i], tsk->stack_vm_area); | ||
| 257 | local_irq_restore(flags); | ||
| 258 | return; | 253 | return; |
| 259 | } | 254 | } |
| 260 | local_irq_restore(flags); | ||
| 261 | 255 | ||
| 262 | vfree_atomic(tsk->stack); | 256 | vfree_atomic(tsk->stack); |
| 263 | return; | 257 | return; |
| @@ -326,8 +320,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) | |||
| 326 | } | 320 | } |
| 327 | 321 | ||
| 328 | /* All stack pages belong to the same memcg. */ | 322 | /* All stack pages belong to the same memcg. */ |
| 329 | memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, | 323 | mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, |
| 330 | account * (THREAD_SIZE / 1024)); | 324 | account * (THREAD_SIZE / 1024)); |
| 331 | } else { | 325 | } else { |
| 332 | /* | 326 | /* |
| 333 | * All stack pages are in the same zone and belong to the | 327 | * All stack pages are in the same zone and belong to the |
| @@ -338,8 +332,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) | |||
| 338 | mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, | 332 | mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, |
| 339 | THREAD_SIZE / 1024 * account); | 333 | THREAD_SIZE / 1024 * account); |
| 340 | 334 | ||
| 341 | memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, | 335 | mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, |
| 342 | account * (THREAD_SIZE / 1024)); | 336 | account * (THREAD_SIZE / 1024)); |
| 343 | } | 337 | } |
| 344 | } | 338 | } |
| 345 | 339 | ||
| @@ -560,7 +554,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
| 560 | set_task_stack_end_magic(tsk); | 554 | set_task_stack_end_magic(tsk); |
| 561 | 555 | ||
| 562 | #ifdef CONFIG_CC_STACKPROTECTOR | 556 | #ifdef CONFIG_CC_STACKPROTECTOR |
| 563 | tsk->stack_canary = get_random_long(); | 557 | tsk->stack_canary = get_random_canary(); |
| 564 | #endif | 558 | #endif |
| 565 | 559 | ||
| 566 | /* | 560 | /* |
| @@ -579,6 +573,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
| 579 | 573 | ||
| 580 | kcov_task_init(tsk); | 574 | kcov_task_init(tsk); |
| 581 | 575 | ||
| 576 | #ifdef CONFIG_FAULT_INJECTION | ||
| 577 | tsk->fail_nth = 0; | ||
| 578 | #endif | ||
| 579 | |||
| 582 | return tsk; | 580 | return tsk; |
| 583 | 581 | ||
| 584 | free_stack: | 582 | free_stack: |
| @@ -1637,9 +1635,9 @@ static __latent_entropy struct task_struct *copy_process( | |||
| 1637 | prev_cputime_init(&p->prev_cputime); | 1635 | prev_cputime_init(&p->prev_cputime); |
| 1638 | 1636 | ||
| 1639 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 1637 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
| 1640 | seqcount_init(&p->vtime_seqcount); | 1638 | seqcount_init(&p->vtime.seqcount); |
| 1641 | p->vtime_snap = 0; | 1639 | p->vtime.starttime = 0; |
| 1642 | p->vtime_snap_whence = VTIME_INACTIVE; | 1640 | p->vtime.state = VTIME_INACTIVE; |
| 1643 | #endif | 1641 | #endif |
| 1644 | 1642 | ||
| 1645 | #if defined(SPLIT_RSS_COUNTING) | 1643 | #if defined(SPLIT_RSS_COUNTING) |
diff --git a/kernel/futex.c b/kernel/futex.c index d6cf71d08f21..16dbe4c93895 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -212,7 +212,7 @@ struct futex_pi_state { | |||
| 212 | atomic_t refcount; | 212 | atomic_t refcount; |
| 213 | 213 | ||
| 214 | union futex_key key; | 214 | union futex_key key; |
| 215 | }; | 215 | } __randomize_layout; |
| 216 | 216 | ||
| 217 | /** | 217 | /** |
| 218 | * struct futex_q - The hashed futex queue entry, one per waiting task | 218 | * struct futex_q - The hashed futex queue entry, one per waiting task |
| @@ -246,7 +246,7 @@ struct futex_q { | |||
| 246 | struct rt_mutex_waiter *rt_waiter; | 246 | struct rt_mutex_waiter *rt_waiter; |
| 247 | union futex_key *requeue_pi_key; | 247 | union futex_key *requeue_pi_key; |
| 248 | u32 bitset; | 248 | u32 bitset; |
| 249 | }; | 249 | } __randomize_layout; |
| 250 | 250 | ||
| 251 | static const struct futex_q futex_q_init = { | 251 | static const struct futex_q futex_q_init = { |
| 252 | /* list gets initialized in queue_me()*/ | 252 | /* list gets initialized in queue_me()*/ |
| @@ -488,7 +488,7 @@ static void drop_futex_key_refs(union futex_key *key) | |||
| 488 | * | 488 | * |
| 489 | * Return: a negative error code or 0 | 489 | * Return: a negative error code or 0 |
| 490 | * | 490 | * |
| 491 | * The key words are stored in *key on success. | 491 | * The key words are stored in @key on success. |
| 492 | * | 492 | * |
| 493 | * For shared mappings, it's (page->index, file_inode(vma->vm_file), | 493 | * For shared mappings, it's (page->index, file_inode(vma->vm_file), |
| 494 | * offset_within_page). For private mappings, it's (uaddr, current->mm). | 494 | * offset_within_page). For private mappings, it's (uaddr, current->mm). |
| @@ -1259,9 +1259,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) | |||
| 1259 | * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) | 1259 | * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) |
| 1260 | * | 1260 | * |
| 1261 | * Return: | 1261 | * Return: |
| 1262 | * 0 - ready to wait; | 1262 | * - 0 - ready to wait; |
| 1263 | * 1 - acquired the lock; | 1263 | * - 1 - acquired the lock; |
| 1264 | * <0 - error | 1264 | * - <0 - error |
| 1265 | * | 1265 | * |
| 1266 | * The hb->lock and futex_key refs shall be held by the caller. | 1266 | * The hb->lock and futex_key refs shall be held by the caller. |
| 1267 | */ | 1267 | */ |
| @@ -1717,9 +1717,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, | |||
| 1717 | * hb1 and hb2 must be held by the caller. | 1717 | * hb1 and hb2 must be held by the caller. |
| 1718 | * | 1718 | * |
| 1719 | * Return: | 1719 | * Return: |
| 1720 | * 0 - failed to acquire the lock atomically; | 1720 | * - 0 - failed to acquire the lock atomically; |
| 1721 | * >0 - acquired the lock, return value is vpid of the top_waiter | 1721 | * - >0 - acquired the lock, return value is vpid of the top_waiter |
| 1722 | * <0 - error | 1722 | * - <0 - error |
| 1723 | */ | 1723 | */ |
| 1724 | static int futex_proxy_trylock_atomic(u32 __user *pifutex, | 1724 | static int futex_proxy_trylock_atomic(u32 __user *pifutex, |
| 1725 | struct futex_hash_bucket *hb1, | 1725 | struct futex_hash_bucket *hb1, |
| @@ -1785,8 +1785,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
| 1785 | * uaddr2 atomically on behalf of the top waiter. | 1785 | * uaddr2 atomically on behalf of the top waiter. |
| 1786 | * | 1786 | * |
| 1787 | * Return: | 1787 | * Return: |
| 1788 | * >=0 - on success, the number of tasks requeued or woken; | 1788 | * - >=0 - on success, the number of tasks requeued or woken; |
| 1789 | * <0 - on error | 1789 | * - <0 - on error |
| 1790 | */ | 1790 | */ |
| 1791 | static int futex_requeue(u32 __user *uaddr1, unsigned int flags, | 1791 | static int futex_requeue(u32 __user *uaddr1, unsigned int flags, |
| 1792 | u32 __user *uaddr2, int nr_wake, int nr_requeue, | 1792 | u32 __user *uaddr2, int nr_wake, int nr_requeue, |
| @@ -2142,8 +2142,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) | |||
| 2142 | * be paired with exactly one earlier call to queue_me(). | 2142 | * be paired with exactly one earlier call to queue_me(). |
| 2143 | * | 2143 | * |
| 2144 | * Return: | 2144 | * Return: |
| 2145 | * 1 - if the futex_q was still queued (and we removed unqueued it); | 2145 | * - 1 - if the futex_q was still queued (and we removed unqueued it); |
| 2146 | * 0 - if the futex_q was already removed by the waking thread | 2146 | * - 0 - if the futex_q was already removed by the waking thread |
| 2147 | */ | 2147 | */ |
| 2148 | static int unqueue_me(struct futex_q *q) | 2148 | static int unqueue_me(struct futex_q *q) |
| 2149 | { | 2149 | { |
| @@ -2333,9 +2333,9 @@ static long futex_wait_restart(struct restart_block *restart); | |||
| 2333 | * acquire the lock. Must be called with the hb lock held. | 2333 | * acquire the lock. Must be called with the hb lock held. |
| 2334 | * | 2334 | * |
| 2335 | * Return: | 2335 | * Return: |
| 2336 | * 1 - success, lock taken; | 2336 | * - 1 - success, lock taken; |
| 2337 | * 0 - success, lock not taken; | 2337 | * - 0 - success, lock not taken; |
| 2338 | * <0 - on error (-EFAULT) | 2338 | * - <0 - on error (-EFAULT) |
| 2339 | */ | 2339 | */ |
| 2340 | static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) | 2340 | static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) |
| 2341 | { | 2341 | { |
| @@ -2422,8 +2422,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, | |||
| 2422 | * with no q.key reference on failure. | 2422 | * with no q.key reference on failure. |
| 2423 | * | 2423 | * |
| 2424 | * Return: | 2424 | * Return: |
| 2425 | * 0 - uaddr contains val and hb has been locked; | 2425 | * - 0 - uaddr contains val and hb has been locked; |
| 2426 | * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked | 2426 | * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked |
| 2427 | */ | 2427 | */ |
| 2428 | static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, | 2428 | static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, |
| 2429 | struct futex_q *q, struct futex_hash_bucket **hb) | 2429 | struct futex_q *q, struct futex_hash_bucket **hb) |
| @@ -2895,8 +2895,8 @@ pi_faulted: | |||
| 2895 | * called with the hb lock held. | 2895 | * called with the hb lock held. |
| 2896 | * | 2896 | * |
| 2897 | * Return: | 2897 | * Return: |
| 2898 | * 0 = no early wakeup detected; | 2898 | * - 0 = no early wakeup detected; |
| 2899 | * <0 = -ETIMEDOUT or -ERESTARTNOINTR | 2899 | * - <0 = -ETIMEDOUT or -ERESTARTNOINTR |
| 2900 | */ | 2900 | */ |
| 2901 | static inline | 2901 | static inline |
| 2902 | int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | 2902 | int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, |
| @@ -2968,8 +2968,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
| 2968 | * If 4 or 7, we cleanup and return with -ETIMEDOUT. | 2968 | * If 4 or 7, we cleanup and return with -ETIMEDOUT. |
| 2969 | * | 2969 | * |
| 2970 | * Return: | 2970 | * Return: |
| 2971 | * 0 - On success; | 2971 | * - 0 - On success; |
| 2972 | * <0 - On error | 2972 | * - <0 - On error |
| 2973 | */ | 2973 | */ |
| 2974 | static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | 2974 | static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, |
| 2975 | u32 val, ktime_t *abs_time, u32 bitset, | 2975 | u32 val, ktime_t *abs_time, u32 bitset, |
diff --git a/kernel/groups.c b/kernel/groups.c index d09727692a2a..434f6665f187 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
| @@ -5,6 +5,7 @@ | |||
| 5 | #include <linux/export.h> | 5 | #include <linux/export.h> |
| 6 | #include <linux/slab.h> | 6 | #include <linux/slab.h> |
| 7 | #include <linux/security.h> | 7 | #include <linux/security.h> |
| 8 | #include <linux/sort.h> | ||
| 8 | #include <linux/syscalls.h> | 9 | #include <linux/syscalls.h> |
| 9 | #include <linux/user_namespace.h> | 10 | #include <linux/user_namespace.h> |
| 10 | #include <linux/vmalloc.h> | 11 | #include <linux/vmalloc.h> |
| @@ -76,32 +77,18 @@ static int groups_from_user(struct group_info *group_info, | |||
| 76 | return 0; | 77 | return 0; |
| 77 | } | 78 | } |
| 78 | 79 | ||
| 79 | /* a simple Shell sort */ | 80 | static int gid_cmp(const void *_a, const void *_b) |
| 81 | { | ||
| 82 | kgid_t a = *(kgid_t *)_a; | ||
| 83 | kgid_t b = *(kgid_t *)_b; | ||
| 84 | |||
| 85 | return gid_gt(a, b) - gid_lt(a, b); | ||
| 86 | } | ||
| 87 | |||
| 80 | static void groups_sort(struct group_info *group_info) | 88 | static void groups_sort(struct group_info *group_info) |
| 81 | { | 89 | { |
| 82 | int base, max, stride; | 90 | sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), |
| 83 | int gidsetsize = group_info->ngroups; | 91 | gid_cmp, NULL); |
| 84 | |||
| 85 | for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) | ||
| 86 | ; /* nothing */ | ||
| 87 | stride /= 3; | ||
| 88 | |||
| 89 | while (stride) { | ||
| 90 | max = gidsetsize - stride; | ||
| 91 | for (base = 0; base < max; base++) { | ||
| 92 | int left = base; | ||
| 93 | int right = left + stride; | ||
| 94 | kgid_t tmp = group_info->gid[right]; | ||
| 95 | |||
| 96 | while (left >= 0 && gid_gt(group_info->gid[left], tmp)) { | ||
| 97 | group_info->gid[right] = group_info->gid[left]; | ||
| 98 | right = left; | ||
| 99 | left -= stride; | ||
| 100 | } | ||
| 101 | group_info->gid[right] = tmp; | ||
| 102 | } | ||
| 103 | stride /= 3; | ||
| 104 | } | ||
| 105 | } | 92 | } |
| 106 | 93 | ||
| 107 | /* a simple bsearch */ | 94 | /* a simple bsearch */ |
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index d2747f9c5707..d69bd77252a7 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c | |||
| @@ -110,6 +110,13 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) | |||
| 110 | struct cpumask *masks; | 110 | struct cpumask *masks; |
| 111 | cpumask_var_t nmsk, *node_to_present_cpumask; | 111 | cpumask_var_t nmsk, *node_to_present_cpumask; |
| 112 | 112 | ||
| 113 | /* | ||
| 114 | * If there aren't any vectors left after applying the pre/post | ||
| 115 | * vectors don't bother with assigning affinity. | ||
| 116 | */ | ||
| 117 | if (!affv) | ||
| 118 | return NULL; | ||
| 119 | |||
| 113 | if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) | 120 | if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) |
| 114 | return NULL; | 121 | return NULL; |
| 115 | 122 | ||
| @@ -192,15 +199,19 @@ out: | |||
| 192 | 199 | ||
| 193 | /** | 200 | /** |
| 194 | * irq_calc_affinity_vectors - Calculate the optimal number of vectors | 201 | * irq_calc_affinity_vectors - Calculate the optimal number of vectors |
| 202 | * @minvec: The minimum number of vectors available | ||
| 195 | * @maxvec: The maximum number of vectors available | 203 | * @maxvec: The maximum number of vectors available |
| 196 | * @affd: Description of the affinity requirements | 204 | * @affd: Description of the affinity requirements |
| 197 | */ | 205 | */ |
| 198 | int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd) | 206 | int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd) |
| 199 | { | 207 | { |
| 200 | int resv = affd->pre_vectors + affd->post_vectors; | 208 | int resv = affd->pre_vectors + affd->post_vectors; |
| 201 | int vecs = maxvec - resv; | 209 | int vecs = maxvec - resv; |
| 202 | int ret; | 210 | int ret; |
| 203 | 211 | ||
| 212 | if (resv > minvec) | ||
| 213 | return 0; | ||
| 214 | |||
| 204 | get_online_cpus(); | 215 | get_online_cpus(); |
| 205 | ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv; | 216 | ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv; |
| 206 | put_online_cpus(); | 217 | put_online_cpus(); |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2e30d925a40d..a3cc37c0c85e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -7,7 +7,7 @@ | |||
| 7 | * This file contains the core interrupt handling code, for irq-chip | 7 | * This file contains the core interrupt handling code, for irq-chip |
| 8 | * based architectures. | 8 | * based architectures. |
| 9 | * | 9 | * |
| 10 | * Detailed information is available in Documentation/DocBook/genericirq | 10 | * Detailed information is available in Documentation/core-api/genericirq.rst |
| 11 | */ | 11 | */ |
| 12 | 12 | ||
| 13 | #include <linux/irq.h> | 13 | #include <linux/irq.h> |
| @@ -170,21 +170,11 @@ static void irq_state_clr_disabled(struct irq_desc *desc) | |||
| 170 | irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); | 170 | irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); |
| 171 | } | 171 | } |
| 172 | 172 | ||
| 173 | static void irq_state_set_disabled(struct irq_desc *desc) | ||
| 174 | { | ||
| 175 | irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); | ||
| 176 | } | ||
| 177 | |||
| 178 | static void irq_state_clr_masked(struct irq_desc *desc) | 173 | static void irq_state_clr_masked(struct irq_desc *desc) |
| 179 | { | 174 | { |
| 180 | irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); | 175 | irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); |
| 181 | } | 176 | } |
| 182 | 177 | ||
| 183 | static void irq_state_set_masked(struct irq_desc *desc) | ||
| 184 | { | ||
| 185 | irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); | ||
| 186 | } | ||
| 187 | |||
| 188 | static void irq_state_clr_started(struct irq_desc *desc) | 178 | static void irq_state_clr_started(struct irq_desc *desc) |
| 189 | { | 179 | { |
| 190 | irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); | 180 | irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); |
| @@ -234,7 +224,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) | |||
| 234 | return IRQ_STARTUP_MANAGED; | 224 | return IRQ_STARTUP_MANAGED; |
| 235 | } | 225 | } |
| 236 | #else | 226 | #else |
| 237 | static int | 227 | static __always_inline int |
| 238 | __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) | 228 | __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) |
| 239 | { | 229 | { |
| 240 | return IRQ_STARTUP_NORMAL; | 230 | return IRQ_STARTUP_NORMAL; |
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index eb4d3e8945b8..79f987b942b8 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
| @@ -6,7 +6,7 @@ | |||
| 6 | * | 6 | * |
| 7 | * This file contains the core interrupt handling code. | 7 | * This file contains the core interrupt handling code. |
| 8 | * | 8 | * |
| 9 | * Detailed information is available in Documentation/DocBook/genericirq | 9 | * Detailed information is available in Documentation/core-api/genericirq.rst |
| 10 | * | 10 | * |
| 11 | */ | 11 | */ |
| 12 | 12 | ||
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 9da14d125df4..a2c48058354c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
| @@ -227,6 +227,16 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) | |||
| 227 | return __irqd_to_state(d) & mask; | 227 | return __irqd_to_state(d) & mask; |
| 228 | } | 228 | } |
| 229 | 229 | ||
| 230 | static inline void irq_state_set_disabled(struct irq_desc *desc) | ||
| 231 | { | ||
| 232 | irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); | ||
| 233 | } | ||
| 234 | |||
| 235 | static inline void irq_state_set_masked(struct irq_desc *desc) | ||
| 236 | { | ||
| 237 | irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); | ||
| 238 | } | ||
| 239 | |||
| 230 | #undef __irqd_to_state | 240 | #undef __irqd_to_state |
| 231 | 241 | ||
| 232 | static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) | 242 | static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) |
| @@ -437,7 +447,9 @@ static inline void irq_remove_debugfs_entry(struct irq_desc *desc) | |||
| 437 | # ifdef CONFIG_IRQ_DOMAIN | 447 | # ifdef CONFIG_IRQ_DOMAIN |
| 438 | void irq_domain_debugfs_init(struct dentry *root); | 448 | void irq_domain_debugfs_init(struct dentry *root); |
| 439 | # else | 449 | # else |
| 440 | static inline void irq_domain_debugfs_init(struct dentry *root); | 450 | static inline void irq_domain_debugfs_init(struct dentry *root) |
| 451 | { | ||
| 452 | } | ||
| 441 | # endif | 453 | # endif |
| 442 | #else /* CONFIG_GENERIC_IRQ_DEBUGFS */ | 454 | #else /* CONFIG_GENERIC_IRQ_DEBUGFS */ |
| 443 | static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) | 455 | static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 948b50e78549..73be2b3909bd 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
| @@ -4,7 +4,7 @@ | |||
| 4 | * | 4 | * |
| 5 | * This file contains the interrupt descriptor management code | 5 | * This file contains the interrupt descriptor management code |
| 6 | * | 6 | * |
| 7 | * Detailed information is available in Documentation/DocBook/genericirq | 7 | * Detailed information is available in Documentation/core-api/genericirq.rst |
| 8 | * | 8 | * |
| 9 | */ | 9 | */ |
| 10 | #include <linux/irq.h> | 10 | #include <linux/irq.h> |
| @@ -373,6 +373,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, | |||
| 373 | 373 | ||
| 374 | raw_spin_lock_init(&desc->lock); | 374 | raw_spin_lock_init(&desc->lock); |
| 375 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); | 375 | lockdep_set_class(&desc->lock, &irq_desc_lock_class); |
| 376 | mutex_init(&desc->request_mutex); | ||
| 376 | init_rcu_head(&desc->rcu); | 377 | init_rcu_head(&desc->rcu); |
| 377 | 378 | ||
| 378 | desc_set_defaults(irq, desc, node, affinity, owner); | 379 | desc_set_defaults(irq, desc, node, affinity, owner); |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 14fe862aa2e3..f1f251479aa6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
| @@ -1,5 +1,6 @@ | |||
| 1 | #define pr_fmt(fmt) "irq: " fmt | 1 | #define pr_fmt(fmt) "irq: " fmt |
| 2 | 2 | ||
| 3 | #include <linux/acpi.h> | ||
| 3 | #include <linux/debugfs.h> | 4 | #include <linux/debugfs.h> |
| 4 | #include <linux/hardirq.h> | 5 | #include <linux/hardirq.h> |
| 5 | #include <linux/interrupt.h> | 6 | #include <linux/interrupt.h> |
| @@ -155,6 +156,21 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, | |||
| 155 | domain->name = fwid->name; | 156 | domain->name = fwid->name; |
| 156 | break; | 157 | break; |
| 157 | } | 158 | } |
| 159 | #ifdef CONFIG_ACPI | ||
| 160 | } else if (is_acpi_device_node(fwnode)) { | ||
| 161 | struct acpi_buffer buf = { | ||
| 162 | .length = ACPI_ALLOCATE_BUFFER, | ||
| 163 | }; | ||
| 164 | acpi_handle handle; | ||
| 165 | |||
| 166 | handle = acpi_device_handle(to_acpi_device_node(fwnode)); | ||
| 167 | if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) { | ||
| 168 | domain->name = buf.pointer; | ||
| 169 | domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; | ||
| 170 | } | ||
| 171 | |||
| 172 | domain->fwnode = fwnode; | ||
| 173 | #endif | ||
| 158 | } else if (of_node) { | 174 | } else if (of_node) { |
| 159 | char *name; | 175 | char *name; |
| 160 | 176 | ||
| @@ -1667,8 +1683,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d) | |||
| 1667 | 1683 | ||
| 1668 | static void debugfs_remove_domain_dir(struct irq_domain *d) | 1684 | static void debugfs_remove_domain_dir(struct irq_domain *d) |
| 1669 | { | 1685 | { |
| 1670 | if (d->debugfs_file) | 1686 | debugfs_remove(d->debugfs_file); |
| 1671 | debugfs_remove(d->debugfs_file); | ||
| 1672 | } | 1687 | } |
| 1673 | 1688 | ||
| 1674 | void __init irq_domain_debugfs_init(struct dentry *root) | 1689 | void __init irq_domain_debugfs_init(struct dentry *root) |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5c11c1730ba5..1d1a5b945ab4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -1090,6 +1090,16 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) | |||
| 1090 | /* | 1090 | /* |
| 1091 | * Internal function to register an irqaction - typically used to | 1091 | * Internal function to register an irqaction - typically used to |
| 1092 | * allocate special interrupts that are part of the architecture. | 1092 | * allocate special interrupts that are part of the architecture. |
| 1093 | * | ||
| 1094 | * Locking rules: | ||
| 1095 | * | ||
| 1096 | * desc->request_mutex Provides serialization against a concurrent free_irq() | ||
| 1097 | * chip_bus_lock Provides serialization for slow bus operations | ||
| 1098 | * desc->lock Provides serialization against hard interrupts | ||
| 1099 | * | ||
| 1100 | * chip_bus_lock and desc->lock are sufficient for all other management and | ||
| 1101 | * interrupt related functions. desc->request_mutex solely serializes | ||
| 1102 | * request/free_irq(). | ||
| 1093 | */ | 1103 | */ |
| 1094 | static int | 1104 | static int |
| 1095 | __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | 1105 | __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) |
| @@ -1168,7 +1178,34 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 1168 | new->flags &= ~IRQF_ONESHOT; | 1178 | new->flags &= ~IRQF_ONESHOT; |
| 1169 | 1179 | ||
| 1170 | /* | 1180 | /* |
| 1181 | * Protects against a concurrent __free_irq() call which might wait | ||
| 1182 | * for synchronize_irq() to complete without holding the optional | ||
| 1183 | * chip bus lock and desc->lock. | ||
| 1184 | */ | ||
| 1185 | mutex_lock(&desc->request_mutex); | ||
| 1186 | |||
| 1187 | /* | ||
| 1188 | * Acquire bus lock as the irq_request_resources() callback below | ||
| 1189 | * might rely on the serialization or the magic power management | ||
| 1190 | * functions which are abusing the irq_bus_lock() callback, | ||
| 1191 | */ | ||
| 1192 | chip_bus_lock(desc); | ||
| 1193 | |||
| 1194 | /* First installed action requests resources. */ | ||
| 1195 | if (!desc->action) { | ||
| 1196 | ret = irq_request_resources(desc); | ||
| 1197 | if (ret) { | ||
| 1198 | pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", | ||
| 1199 | new->name, irq, desc->irq_data.chip->name); | ||
| 1200 | goto out_bus_unlock; | ||
| 1201 | } | ||
| 1202 | } | ||
| 1203 | |||
| 1204 | /* | ||
| 1171 | * The following block of code has to be executed atomically | 1205 | * The following block of code has to be executed atomically |
| 1206 | * protected against a concurrent interrupt and any of the other | ||
| 1207 | * management calls which are not serialized via | ||
| 1208 | * desc->request_mutex or the optional bus lock. | ||
| 1172 | */ | 1209 | */ |
| 1173 | raw_spin_lock_irqsave(&desc->lock, flags); | 1210 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 1174 | old_ptr = &desc->action; | 1211 | old_ptr = &desc->action; |
| @@ -1267,13 +1304,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 1267 | } | 1304 | } |
| 1268 | 1305 | ||
| 1269 | if (!shared) { | 1306 | if (!shared) { |
| 1270 | ret = irq_request_resources(desc); | ||
| 1271 | if (ret) { | ||
| 1272 | pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", | ||
| 1273 | new->name, irq, desc->irq_data.chip->name); | ||
| 1274 | goto out_unlock; | ||
| 1275 | } | ||
| 1276 | |||
| 1277 | init_waitqueue_head(&desc->wait_for_threads); | 1307 | init_waitqueue_head(&desc->wait_for_threads); |
| 1278 | 1308 | ||
| 1279 | /* Setup the type (level, edge polarity) if configured: */ | 1309 | /* Setup the type (level, edge polarity) if configured: */ |
| @@ -1281,10 +1311,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 1281 | ret = __irq_set_trigger(desc, | 1311 | ret = __irq_set_trigger(desc, |
| 1282 | new->flags & IRQF_TRIGGER_MASK); | 1312 | new->flags & IRQF_TRIGGER_MASK); |
| 1283 | 1313 | ||
| 1284 | if (ret) { | 1314 | if (ret) |
| 1285 | irq_release_resources(desc); | ||
| 1286 | goto out_unlock; | 1315 | goto out_unlock; |
| 1287 | } | ||
| 1288 | } | 1316 | } |
| 1289 | 1317 | ||
| 1290 | desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ | 1318 | desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ |
| @@ -1347,6 +1375,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 1347 | } | 1375 | } |
| 1348 | 1376 | ||
| 1349 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 1377 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 1378 | chip_bus_sync_unlock(desc); | ||
| 1379 | mutex_unlock(&desc->request_mutex); | ||
| 1350 | 1380 | ||
| 1351 | irq_setup_timings(desc, new); | 1381 | irq_setup_timings(desc, new); |
| 1352 | 1382 | ||
| @@ -1378,6 +1408,12 @@ mismatch: | |||
| 1378 | out_unlock: | 1408 | out_unlock: |
| 1379 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 1409 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 1380 | 1410 | ||
| 1411 | if (!desc->action) | ||
| 1412 | irq_release_resources(desc); | ||
| 1413 | out_bus_unlock: | ||
| 1414 | chip_bus_sync_unlock(desc); | ||
| 1415 | mutex_unlock(&desc->request_mutex); | ||
| 1416 | |||
| 1381 | out_thread: | 1417 | out_thread: |
| 1382 | if (new->thread) { | 1418 | if (new->thread) { |
| 1383 | struct task_struct *t = new->thread; | 1419 | struct task_struct *t = new->thread; |
| @@ -1417,9 +1453,7 @@ int setup_irq(unsigned int irq, struct irqaction *act) | |||
| 1417 | if (retval < 0) | 1453 | if (retval < 0) |
| 1418 | return retval; | 1454 | return retval; |
| 1419 | 1455 | ||
| 1420 | chip_bus_lock(desc); | ||
| 1421 | retval = __setup_irq(irq, desc, act); | 1456 | retval = __setup_irq(irq, desc, act); |
| 1422 | chip_bus_sync_unlock(desc); | ||
| 1423 | 1457 | ||
| 1424 | if (retval) | 1458 | if (retval) |
| 1425 | irq_chip_pm_put(&desc->irq_data); | 1459 | irq_chip_pm_put(&desc->irq_data); |
| @@ -1443,6 +1477,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
| 1443 | if (!desc) | 1477 | if (!desc) |
| 1444 | return NULL; | 1478 | return NULL; |
| 1445 | 1479 | ||
| 1480 | mutex_lock(&desc->request_mutex); | ||
| 1446 | chip_bus_lock(desc); | 1481 | chip_bus_lock(desc); |
| 1447 | raw_spin_lock_irqsave(&desc->lock, flags); | 1482 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 1448 | 1483 | ||
| @@ -1458,6 +1493,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
| 1458 | WARN(1, "Trying to free already-free IRQ %d\n", irq); | 1493 | WARN(1, "Trying to free already-free IRQ %d\n", irq); |
| 1459 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 1494 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 1460 | chip_bus_sync_unlock(desc); | 1495 | chip_bus_sync_unlock(desc); |
| 1496 | mutex_unlock(&desc->request_mutex); | ||
| 1461 | return NULL; | 1497 | return NULL; |
| 1462 | } | 1498 | } |
| 1463 | 1499 | ||
| @@ -1475,8 +1511,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
| 1475 | if (!desc->action) { | 1511 | if (!desc->action) { |
| 1476 | irq_settings_clr_disable_unlazy(desc); | 1512 | irq_settings_clr_disable_unlazy(desc); |
| 1477 | irq_shutdown(desc); | 1513 | irq_shutdown(desc); |
| 1478 | irq_release_resources(desc); | ||
| 1479 | irq_remove_timings(desc); | ||
| 1480 | } | 1514 | } |
| 1481 | 1515 | ||
| 1482 | #ifdef CONFIG_SMP | 1516 | #ifdef CONFIG_SMP |
| @@ -1486,6 +1520,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
| 1486 | #endif | 1520 | #endif |
| 1487 | 1521 | ||
| 1488 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 1522 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 1523 | /* | ||
| 1524 | * Drop bus_lock here so the changes which were done in the chip | ||
| 1525 | * callbacks above are synced out to the irq chips which hang | ||
| 1526 | * behind a slow bus (I2C, SPI) before calling synchronize_irq(). | ||
| 1527 | * | ||
| 1528 | * Aside of that the bus_lock can also be taken from the threaded | ||
| 1529 | * handler in irq_finalize_oneshot() which results in a deadlock | ||
| 1530 | * because synchronize_irq() would wait forever for the thread to | ||
| 1531 | * complete, which is blocked on the bus lock. | ||
| 1532 | * | ||
| 1533 | * The still held desc->request_mutex protects against a | ||
| 1534 | * concurrent request_irq() of this irq so the release of resources | ||
| 1535 | * and timing data is properly serialized. | ||
| 1536 | */ | ||
| 1489 | chip_bus_sync_unlock(desc); | 1537 | chip_bus_sync_unlock(desc); |
| 1490 | 1538 | ||
| 1491 | unregister_handler_proc(irq, action); | 1539 | unregister_handler_proc(irq, action); |
| @@ -1518,6 +1566,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
| 1518 | } | 1566 | } |
| 1519 | } | 1567 | } |
| 1520 | 1568 | ||
| 1569 | /* Last action releases resources */ | ||
| 1570 | if (!desc->action) { | ||
| 1571 | /* | ||
| 1572 | * Reacquire bus lock as irq_release_resources() might | ||
| 1573 | * require it to deallocate resources over the slow bus. | ||
| 1574 | */ | ||
| 1575 | chip_bus_lock(desc); | ||
| 1576 | irq_release_resources(desc); | ||
| 1577 | chip_bus_sync_unlock(desc); | ||
| 1578 | irq_remove_timings(desc); | ||
| 1579 | } | ||
| 1580 | |||
| 1581 | mutex_unlock(&desc->request_mutex); | ||
| 1582 | |||
| 1521 | irq_chip_pm_put(&desc->irq_data); | 1583 | irq_chip_pm_put(&desc->irq_data); |
| 1522 | module_put(desc->owner); | 1584 | module_put(desc->owner); |
| 1523 | kfree(action->secondary); | 1585 | kfree(action->secondary); |
| @@ -1674,9 +1736,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, | |||
| 1674 | return retval; | 1736 | return retval; |
| 1675 | } | 1737 | } |
| 1676 | 1738 | ||
| 1677 | chip_bus_lock(desc); | ||
| 1678 | retval = __setup_irq(irq, desc, action); | 1739 | retval = __setup_irq(irq, desc, action); |
| 1679 | chip_bus_sync_unlock(desc); | ||
| 1680 | 1740 | ||
| 1681 | if (retval) { | 1741 | if (retval) { |
| 1682 | irq_chip_pm_put(&desc->irq_data); | 1742 | irq_chip_pm_put(&desc->irq_data); |
| @@ -1924,9 +1984,7 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) | |||
| 1924 | if (retval < 0) | 1984 | if (retval < 0) |
| 1925 | return retval; | 1985 | return retval; |
| 1926 | 1986 | ||
| 1927 | chip_bus_lock(desc); | ||
| 1928 | retval = __setup_irq(irq, desc, act); | 1987 | retval = __setup_irq(irq, desc, act); |
| 1929 | chip_bus_sync_unlock(desc); | ||
| 1930 | 1988 | ||
| 1931 | if (retval) | 1989 | if (retval) |
| 1932 | irq_chip_pm_put(&desc->irq_data); | 1990 | irq_chip_pm_put(&desc->irq_data); |
| @@ -1935,9 +1993,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) | |||
| 1935 | } | 1993 | } |
| 1936 | 1994 | ||
| 1937 | /** | 1995 | /** |
| 1938 | * request_percpu_irq - allocate a percpu interrupt line | 1996 | * __request_percpu_irq - allocate a percpu interrupt line |
| 1939 | * @irq: Interrupt line to allocate | 1997 | * @irq: Interrupt line to allocate |
| 1940 | * @handler: Function to be called when the IRQ occurs. | 1998 | * @handler: Function to be called when the IRQ occurs. |
| 1999 | * @flags: Interrupt type flags (IRQF_TIMER only) | ||
| 1941 | * @devname: An ascii name for the claiming device | 2000 | * @devname: An ascii name for the claiming device |
| 1942 | * @dev_id: A percpu cookie passed back to the handler function | 2001 | * @dev_id: A percpu cookie passed back to the handler function |
| 1943 | * | 2002 | * |
| @@ -1950,8 +2009,9 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) | |||
| 1950 | * the handler gets called with the interrupted CPU's instance of | 2009 | * the handler gets called with the interrupted CPU's instance of |
| 1951 | * that variable. | 2010 | * that variable. |
| 1952 | */ | 2011 | */ |
| 1953 | int request_percpu_irq(unsigned int irq, irq_handler_t handler, | 2012 | int __request_percpu_irq(unsigned int irq, irq_handler_t handler, |
| 1954 | const char *devname, void __percpu *dev_id) | 2013 | unsigned long flags, const char *devname, |
| 2014 | void __percpu *dev_id) | ||
| 1955 | { | 2015 | { |
| 1956 | struct irqaction *action; | 2016 | struct irqaction *action; |
| 1957 | struct irq_desc *desc; | 2017 | struct irq_desc *desc; |
| @@ -1965,12 +2025,15 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, | |||
| 1965 | !irq_settings_is_per_cpu_devid(desc)) | 2025 | !irq_settings_is_per_cpu_devid(desc)) |
| 1966 | return -EINVAL; | 2026 | return -EINVAL; |
| 1967 | 2027 | ||
| 2028 | if (flags && flags != IRQF_TIMER) | ||
| 2029 | return -EINVAL; | ||
| 2030 | |||
| 1968 | action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); | 2031 | action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); |
| 1969 | if (!action) | 2032 | if (!action) |
| 1970 | return -ENOMEM; | 2033 | return -ENOMEM; |
| 1971 | 2034 | ||
| 1972 | action->handler = handler; | 2035 | action->handler = handler; |
| 1973 | action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND; | 2036 | action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND; |
| 1974 | action->name = devname; | 2037 | action->name = devname; |
| 1975 | action->percpu_dev_id = dev_id; | 2038 | action->percpu_dev_id = dev_id; |
| 1976 | 2039 | ||
| @@ -1980,9 +2043,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, | |||
| 1980 | return retval; | 2043 | return retval; |
| 1981 | } | 2044 | } |
| 1982 | 2045 | ||
| 1983 | chip_bus_lock(desc); | ||
| 1984 | retval = __setup_irq(irq, desc, action); | 2046 | retval = __setup_irq(irq, desc, action); |
| 1985 | chip_bus_sync_unlock(desc); | ||
| 1986 | 2047 | ||
| 1987 | if (retval) { | 2048 | if (retval) { |
| 1988 | irq_chip_pm_put(&desc->irq_data); | 2049 | irq_chip_pm_put(&desc->irq_data); |
| @@ -1991,7 +2052,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, | |||
| 1991 | 2052 | ||
| 1992 | return retval; | 2053 | return retval; |
| 1993 | } | 2054 | } |
| 1994 | EXPORT_SYMBOL_GPL(request_percpu_irq); | 2055 | EXPORT_SYMBOL_GPL(__request_percpu_irq); |
| 1995 | 2056 | ||
| 1996 | /** | 2057 | /** |
| 1997 | * irq_get_irqchip_state - returns the irqchip state of an interrupt. | 2058 | * irq_get_irqchip_state - returns the irqchip state of an interrupt. |
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index cea1de0161f1..6bd9b58429cc 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
| @@ -149,6 +149,8 @@ static void resume_irq(struct irq_desc *desc) | |||
| 149 | 149 | ||
| 150 | /* Pretend that it got disabled ! */ | 150 | /* Pretend that it got disabled ! */ |
| 151 | desc->depth++; | 151 | desc->depth++; |
| 152 | irq_state_set_disabled(desc); | ||
| 153 | irq_state_set_masked(desc); | ||
| 152 | resume: | 154 | resume: |
| 153 | desc->istate &= ~IRQS_SUSPENDED; | 155 | desc->istate &= ~IRQS_SUSPENDED; |
| 154 | __enable_irq(desc); | 156 | __enable_irq(desc); |
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6a3b249a2ae1..127e7cfafa55 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
| @@ -28,12 +28,6 @@ | |||
| 28 | 28 | ||
| 29 | #include <asm/sections.h> | 29 | #include <asm/sections.h> |
| 30 | 30 | ||
| 31 | #ifdef CONFIG_KALLSYMS_ALL | ||
| 32 | #define all_var 1 | ||
| 33 | #else | ||
| 34 | #define all_var 0 | ||
| 35 | #endif | ||
| 36 | |||
| 37 | /* | 31 | /* |
| 38 | * These will be re-linked against their real values | 32 | * These will be re-linked against their real values |
| 39 | * during the second link stage. | 33 | * during the second link stage. |
| @@ -82,7 +76,7 @@ static inline int is_kernel(unsigned long addr) | |||
| 82 | 76 | ||
| 83 | static int is_ksym_addr(unsigned long addr) | 77 | static int is_ksym_addr(unsigned long addr) |
| 84 | { | 78 | { |
| 85 | if (all_var) | 79 | if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) |
| 86 | return is_kernel(addr); | 80 | return is_kernel(addr); |
| 87 | 81 | ||
| 88 | return is_kernel_text(addr) || is_kernel_inittext(addr); | 82 | return is_kernel_text(addr) || is_kernel_inittext(addr); |
| @@ -280,7 +274,7 @@ static unsigned long get_symbol_pos(unsigned long addr, | |||
| 280 | if (!symbol_end) { | 274 | if (!symbol_end) { |
| 281 | if (is_kernel_inittext(addr)) | 275 | if (is_kernel_inittext(addr)) |
| 282 | symbol_end = (unsigned long)_einittext; | 276 | symbol_end = (unsigned long)_einittext; |
| 283 | else if (all_var) | 277 | else if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) |
| 284 | symbol_end = (unsigned long)_end; | 278 | symbol_end = (unsigned long)_end; |
| 285 | else | 279 | else |
| 286 | symbol_end = (unsigned long)_etext; | 280 | symbol_end = (unsigned long)_etext; |
diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 3a47fa998fe0..ea34ed8bb952 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c | |||
| @@ -11,6 +11,10 @@ | |||
| 11 | #include <linux/bug.h> | 11 | #include <linux/bug.h> |
| 12 | #include <linux/err.h> | 12 | #include <linux/err.h> |
| 13 | #include <linux/kcmp.h> | 13 | #include <linux/kcmp.h> |
| 14 | #include <linux/capability.h> | ||
| 15 | #include <linux/list.h> | ||
| 16 | #include <linux/eventpoll.h> | ||
| 17 | #include <linux/file.h> | ||
| 14 | 18 | ||
| 15 | #include <asm/unistd.h> | 19 | #include <asm/unistd.h> |
| 16 | 20 | ||
| @@ -94,6 +98,56 @@ static int kcmp_lock(struct mutex *m1, struct mutex *m2) | |||
| 94 | return err; | 98 | return err; |
| 95 | } | 99 | } |
| 96 | 100 | ||
| 101 | #ifdef CONFIG_EPOLL | ||
| 102 | static int kcmp_epoll_target(struct task_struct *task1, | ||
| 103 | struct task_struct *task2, | ||
| 104 | unsigned long idx1, | ||
| 105 | struct kcmp_epoll_slot __user *uslot) | ||
| 106 | { | ||
| 107 | struct file *filp, *filp_epoll, *filp_tgt; | ||
| 108 | struct kcmp_epoll_slot slot; | ||
| 109 | struct files_struct *files; | ||
| 110 | |||
| 111 | if (copy_from_user(&slot, uslot, sizeof(slot))) | ||
| 112 | return -EFAULT; | ||
| 113 | |||
| 114 | filp = get_file_raw_ptr(task1, idx1); | ||
| 115 | if (!filp) | ||
| 116 | return -EBADF; | ||
| 117 | |||
| 118 | files = get_files_struct(task2); | ||
| 119 | if (!files) | ||
| 120 | return -EBADF; | ||
| 121 | |||
| 122 | spin_lock(&files->file_lock); | ||
| 123 | filp_epoll = fcheck_files(files, slot.efd); | ||
| 124 | if (filp_epoll) | ||
| 125 | get_file(filp_epoll); | ||
| 126 | else | ||
| 127 | filp_tgt = ERR_PTR(-EBADF); | ||
| 128 | spin_unlock(&files->file_lock); | ||
| 129 | put_files_struct(files); | ||
| 130 | |||
| 131 | if (filp_epoll) { | ||
| 132 | filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); | ||
| 133 | fput(filp_epoll); | ||
| 134 | } else | ||
| 135 | |||
| 136 | if (IS_ERR(filp_tgt)) | ||
| 137 | return PTR_ERR(filp_tgt); | ||
| 138 | |||
| 139 | return kcmp_ptr(filp, filp_tgt, KCMP_FILE); | ||
| 140 | } | ||
| 141 | #else | ||
| 142 | static int kcmp_epoll_target(struct task_struct *task1, | ||
| 143 | struct task_struct *task2, | ||
| 144 | unsigned long idx1, | ||
| 145 | struct kcmp_epoll_slot __user *uslot) | ||
| 146 | { | ||
| 147 | return -EOPNOTSUPP; | ||
| 148 | } | ||
| 149 | #endif | ||
| 150 | |||
| 97 | SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, | 151 | SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, |
| 98 | unsigned long, idx1, unsigned long, idx2) | 152 | unsigned long, idx1, unsigned long, idx2) |
| 99 | { | 153 | { |
| @@ -165,6 +219,9 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, | |||
| 165 | ret = -EOPNOTSUPP; | 219 | ret = -EOPNOTSUPP; |
| 166 | #endif | 220 | #endif |
| 167 | break; | 221 | break; |
| 222 | case KCMP_EPOLL_TFD: | ||
| 223 | ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2); | ||
| 224 | break; | ||
| 168 | default: | 225 | default: |
| 169 | ret = -EINVAL; | 226 | ret = -EINVAL; |
| 170 | break; | 227 | break; |
diff --git a/kernel/kexec.c b/kernel/kexec.c index 980936a90ee6..e62ec4dc6620 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -144,6 +144,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, | |||
| 144 | if (ret) | 144 | if (ret) |
| 145 | goto out; | 145 | goto out; |
| 146 | 146 | ||
| 147 | /* | ||
| 148 | * Some architecture(like S390) may touch the crash memory before | ||
| 149 | * machine_kexec_prepare(), we must copy vmcoreinfo data after it. | ||
| 150 | */ | ||
| 151 | ret = kimage_crash_copy_vmcoreinfo(image); | ||
| 152 | if (ret) | ||
| 153 | goto out; | ||
| 154 | |||
| 147 | for (i = 0; i < nr_segments; i++) { | 155 | for (i = 0; i < nr_segments; i++) { |
| 148 | ret = kimage_load_segment(image, &image->segment[i]); | 156 | ret = kimage_load_segment(image, &image->segment[i]); |
| 149 | if (ret) | 157 | if (ret) |
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 154ffb489b93..1ae7c41c33c1 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c | |||
| @@ -482,6 +482,40 @@ struct page *kimage_alloc_control_pages(struct kimage *image, | |||
| 482 | return pages; | 482 | return pages; |
| 483 | } | 483 | } |
| 484 | 484 | ||
| 485 | int kimage_crash_copy_vmcoreinfo(struct kimage *image) | ||
| 486 | { | ||
| 487 | struct page *vmcoreinfo_page; | ||
| 488 | void *safecopy; | ||
| 489 | |||
| 490 | if (image->type != KEXEC_TYPE_CRASH) | ||
| 491 | return 0; | ||
| 492 | |||
| 493 | /* | ||
| 494 | * For kdump, allocate one vmcoreinfo safe copy from the | ||
| 495 | * crash memory. As we have arch_kexec_protect_crashkres() | ||
| 496 | * after kexec syscall, we naturally protect it from write | ||
| 497 | * (even read) access under kernel direct mapping. But on | ||
| 498 | * the other hand, we still need to operate it when crash | ||
| 499 | * happens to generate vmcoreinfo note, hereby we rely on | ||
| 500 | * vmap for this purpose. | ||
| 501 | */ | ||
| 502 | vmcoreinfo_page = kimage_alloc_control_pages(image, 0); | ||
| 503 | if (!vmcoreinfo_page) { | ||
| 504 | pr_warn("Could not allocate vmcoreinfo buffer\n"); | ||
| 505 | return -ENOMEM; | ||
| 506 | } | ||
| 507 | safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); | ||
| 508 | if (!safecopy) { | ||
| 509 | pr_warn("Could not vmap vmcoreinfo buffer\n"); | ||
| 510 | return -ENOMEM; | ||
| 511 | } | ||
| 512 | |||
| 513 | image->vmcoreinfo_data_copy = safecopy; | ||
| 514 | crash_update_vmcoreinfo_safecopy(safecopy); | ||
| 515 | |||
| 516 | return 0; | ||
| 517 | } | ||
| 518 | |||
| 485 | static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) | 519 | static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) |
| 486 | { | 520 | { |
| 487 | if (*image->entry != 0) | 521 | if (*image->entry != 0) |
| @@ -569,6 +603,11 @@ void kimage_free(struct kimage *image) | |||
| 569 | if (!image) | 603 | if (!image) |
| 570 | return; | 604 | return; |
| 571 | 605 | ||
| 606 | if (image->vmcoreinfo_data_copy) { | ||
| 607 | crash_update_vmcoreinfo_safecopy(NULL); | ||
| 608 | vunmap(image->vmcoreinfo_data_copy); | ||
| 609 | } | ||
| 610 | |||
| 572 | kimage_free_extra_pages(image); | 611 | kimage_free_extra_pages(image); |
| 573 | for_each_kimage_entry(image, ptr, entry) { | 612 | for_each_kimage_entry(image, ptr, entry) { |
| 574 | if (entry & IND_INDIRECTION) { | 613 | if (entry & IND_INDIRECTION) { |
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b118735fea9d..9f48f4412297 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c | |||
| @@ -26,13 +26,6 @@ | |||
| 26 | #include <linux/vmalloc.h> | 26 | #include <linux/vmalloc.h> |
| 27 | #include "kexec_internal.h" | 27 | #include "kexec_internal.h" |
| 28 | 28 | ||
| 29 | /* | ||
| 30 | * Declare these symbols weak so that if architecture provides a purgatory, | ||
| 31 | * these will be overridden. | ||
| 32 | */ | ||
| 33 | char __weak kexec_purgatory[0]; | ||
| 34 | size_t __weak kexec_purgatory_size = 0; | ||
| 35 | |||
| 36 | static int kexec_calculate_store_digests(struct kimage *image); | 29 | static int kexec_calculate_store_digests(struct kimage *image); |
| 37 | 30 | ||
| 38 | /* Architectures can provide this probe function */ | 31 | /* Architectures can provide this probe function */ |
| @@ -162,16 +155,10 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, | |||
| 162 | } | 155 | } |
| 163 | 156 | ||
| 164 | if (cmdline_len) { | 157 | if (cmdline_len) { |
| 165 | image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); | 158 | image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); |
| 166 | if (!image->cmdline_buf) { | 159 | if (IS_ERR(image->cmdline_buf)) { |
| 167 | ret = -ENOMEM; | 160 | ret = PTR_ERR(image->cmdline_buf); |
| 168 | goto out; | 161 | image->cmdline_buf = NULL; |
| 169 | } | ||
| 170 | |||
| 171 | ret = copy_from_user(image->cmdline_buf, cmdline_ptr, | ||
| 172 | cmdline_len); | ||
| 173 | if (ret) { | ||
| 174 | ret = -EFAULT; | ||
| 175 | goto out; | 162 | goto out; |
| 176 | } | 163 | } |
| 177 | 164 | ||
| @@ -304,6 +291,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, | |||
| 304 | if (ret) | 291 | if (ret) |
| 305 | goto out; | 292 | goto out; |
| 306 | 293 | ||
| 294 | /* | ||
| 295 | * Some architecture(like S390) may touch the crash memory before | ||
| 296 | * machine_kexec_prepare(), we must copy vmcoreinfo data after it. | ||
| 297 | */ | ||
| 298 | ret = kimage_crash_copy_vmcoreinfo(image); | ||
| 299 | if (ret) | ||
| 300 | goto out; | ||
| 301 | |||
| 307 | ret = kexec_calculate_store_digests(image); | 302 | ret = kexec_calculate_store_digests(image); |
| 308 | if (ret) | 303 | if (ret) |
| 309 | goto out; | 304 | goto out; |
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 799a8a452187..50dfcb039a41 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h | |||
| @@ -17,6 +17,8 @@ extern struct mutex kexec_mutex; | |||
| 17 | #ifdef CONFIG_KEXEC_FILE | 17 | #ifdef CONFIG_KEXEC_FILE |
| 18 | #include <linux/purgatory.h> | 18 | #include <linux/purgatory.h> |
| 19 | void kimage_file_post_load_cleanup(struct kimage *image); | 19 | void kimage_file_post_load_cleanup(struct kimage *image); |
| 20 | extern char kexec_purgatory[]; | ||
| 21 | extern size_t kexec_purgatory_size; | ||
| 20 | #else /* CONFIG_KEXEC_FILE */ | 22 | #else /* CONFIG_KEXEC_FILE */ |
| 21 | static inline void kimage_file_post_load_cleanup(struct kimage *image) { } | 23 | static inline void kimage_file_post_load_cleanup(struct kimage *image) { } |
| 22 | #endif /* CONFIG_KEXEC_FILE */ | 24 | #endif /* CONFIG_KEXEC_FILE */ |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 563f97e2be36..6d016c5d97c8 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
| @@ -45,8 +45,6 @@ | |||
| 45 | 45 | ||
| 46 | #include <trace/events/module.h> | 46 | #include <trace/events/module.h> |
| 47 | 47 | ||
| 48 | extern int max_threads; | ||
| 49 | |||
| 50 | #define CAP_BSET (void *)1 | 48 | #define CAP_BSET (void *)1 |
| 51 | #define CAP_PI (void *)2 | 49 | #define CAP_PI (void *)2 |
| 52 | 50 | ||
| @@ -56,6 +54,21 @@ static DEFINE_SPINLOCK(umh_sysctl_lock); | |||
| 56 | static DECLARE_RWSEM(umhelper_sem); | 54 | static DECLARE_RWSEM(umhelper_sem); |
| 57 | 55 | ||
| 58 | #ifdef CONFIG_MODULES | 56 | #ifdef CONFIG_MODULES |
| 57 | /* | ||
| 58 | * Assuming: | ||
| 59 | * | ||
| 60 | * threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, | ||
| 61 | * (u64) THREAD_SIZE * 8UL); | ||
| 62 | * | ||
| 63 | * If you need less than 50 threads would mean we're dealing with systems | ||
| 64 | * smaller than 3200 pages. This assumes you are capable of having ~13M memory, | ||
| 65 | * and this would only be an upper limit, after which the OOM killer | ||
| 66 | * would take effect. Systems like these are very unlikely if modules are | ||
| 67 | * enabled. | ||
| 68 | */ | ||
| 69 | #define MAX_KMOD_CONCURRENT 50 | ||
| 70 | static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); | ||
| 71 | static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); | ||
| 59 | 72 | ||
| 60 | /* | 73 | /* |
| 61 | modprobe_path is set via /proc/sys. | 74 | modprobe_path is set via /proc/sys. |
| @@ -127,11 +140,7 @@ int __request_module(bool wait, const char *fmt, ...) | |||
| 127 | { | 140 | { |
| 128 | va_list args; | 141 | va_list args; |
| 129 | char module_name[MODULE_NAME_LEN]; | 142 | char module_name[MODULE_NAME_LEN]; |
| 130 | unsigned int max_modprobes; | ||
| 131 | int ret; | 143 | int ret; |
| 132 | static atomic_t kmod_concurrent = ATOMIC_INIT(0); | ||
| 133 | #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ | ||
| 134 | static int kmod_loop_msg; | ||
| 135 | 144 | ||
| 136 | /* | 145 | /* |
| 137 | * We don't allow synchronous module loading from async. Module | 146 | * We don't allow synchronous module loading from async. Module |
| @@ -154,40 +163,25 @@ int __request_module(bool wait, const char *fmt, ...) | |||
| 154 | if (ret) | 163 | if (ret) |
| 155 | return ret; | 164 | return ret; |
| 156 | 165 | ||
| 157 | /* If modprobe needs a service that is in a module, we get a recursive | 166 | if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) { |
| 158 | * loop. Limit the number of running kmod threads to max_threads/2 or | 167 | pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...", |
| 159 | * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method | 168 | atomic_read(&kmod_concurrent_max), |
| 160 | * would be to run the parents of this process, counting how many times | 169 | MAX_KMOD_CONCURRENT, module_name); |
| 161 | * kmod was invoked. That would mean accessing the internals of the | 170 | wait_event_interruptible(kmod_wq, |
| 162 | * process tables to get the command line, proc_pid_cmdline is static | 171 | atomic_dec_if_positive(&kmod_concurrent_max) >= 0); |
| 163 | * and it is not worth changing the proc code just to handle this case. | ||
| 164 | * KAO. | ||
| 165 | * | ||
| 166 | * "trace the ppid" is simple, but will fail if someone's | ||
| 167 | * parent exits. I think this is as good as it gets. --RR | ||
| 168 | */ | ||
| 169 | max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT); | ||
| 170 | atomic_inc(&kmod_concurrent); | ||
| 171 | if (atomic_read(&kmod_concurrent) > max_modprobes) { | ||
| 172 | /* We may be blaming an innocent here, but unlikely */ | ||
| 173 | if (kmod_loop_msg < 5) { | ||
| 174 | printk(KERN_ERR | ||
| 175 | "request_module: runaway loop modprobe %s\n", | ||
| 176 | module_name); | ||
| 177 | kmod_loop_msg++; | ||
| 178 | } | ||
| 179 | atomic_dec(&kmod_concurrent); | ||
| 180 | return -ENOMEM; | ||
| 181 | } | 172 | } |
| 182 | 173 | ||
| 183 | trace_module_request(module_name, wait, _RET_IP_); | 174 | trace_module_request(module_name, wait, _RET_IP_); |
| 184 | 175 | ||
| 185 | ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); | 176 | ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); |
| 186 | 177 | ||
| 187 | atomic_dec(&kmod_concurrent); | 178 | atomic_inc(&kmod_concurrent_max); |
| 179 | wake_up(&kmod_wq); | ||
| 180 | |||
| 188 | return ret; | 181 | return ret; |
| 189 | } | 182 | } |
| 190 | EXPORT_SYMBOL(__request_module); | 183 | EXPORT_SYMBOL(__request_module); |
| 184 | |||
| 191 | #endif /* CONFIG_MODULES */ | 185 | #endif /* CONFIG_MODULES */ |
| 192 | 186 | ||
| 193 | static void call_usermodehelper_freeinfo(struct subprocess_info *info) | 187 | static void call_usermodehelper_freeinfo(struct subprocess_info *info) |
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 23cd70651238..46ba853656f6 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
| @@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, | |||
| 134 | { | 134 | { |
| 135 | phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); | 135 | phys_addr_t vmcore_base = paddr_vmcoreinfo_note(); |
| 136 | return sprintf(buf, "%pa %x\n", &vmcore_base, | 136 | return sprintf(buf, "%pa %x\n", &vmcore_base, |
| 137 | (unsigned int)sizeof(vmcoreinfo_note)); | 137 | (unsigned int)VMCOREINFO_NOTE_SIZE); |
| 138 | } | 138 | } |
| 139 | KERNEL_ATTR_RO(vmcoreinfo); | 139 | KERNEL_ATTR_RO(vmcoreinfo); |
| 140 | 140 | ||
| @@ -234,7 +234,7 @@ static struct attribute * kernel_attrs[] = { | |||
| 234 | NULL | 234 | NULL |
| 235 | }; | 235 | }; |
| 236 | 236 | ||
| 237 | static struct attribute_group kernel_attr_group = { | 237 | static const struct attribute_group kernel_attr_group = { |
| 238 | .attrs = kernel_attrs, | 238 | .attrs = kernel_attrs, |
| 239 | }; | 239 | }; |
| 240 | 240 | ||
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 198527a62149..858a07590e39 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c | |||
| @@ -227,9 +227,9 @@ static void __sched __mutex_lock_slowpath(struct mutex *lock); | |||
| 227 | * (or statically defined) before it can be locked. memset()-ing | 227 | * (or statically defined) before it can be locked. memset()-ing |
| 228 | * the mutex to 0 is not allowed. | 228 | * the mutex to 0 is not allowed. |
| 229 | * | 229 | * |
| 230 | * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging | 230 | * (The CONFIG_DEBUG_MUTEXES .config option turns on debugging |
| 231 | * checks that will enforce the restrictions and will also do | 231 | * checks that will enforce the restrictions and will also do |
| 232 | * deadlock debugging. ) | 232 | * deadlock debugging) |
| 233 | * | 233 | * |
| 234 | * This function is similar to (but not equivalent to) down(). | 234 | * This function is similar to (but not equivalent to) down(). |
| 235 | */ | 235 | */ |
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index cc3ed0ccdfa2..2655f26ec882 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/cpumask.h> | 20 | #include <linux/cpumask.h> |
| 21 | #include <linux/percpu.h> | 21 | #include <linux/percpu.h> |
| 22 | #include <linux/hardirq.h> | 22 | #include <linux/hardirq.h> |
| 23 | #include <linux/spinlock.h> | ||
| 23 | #include <asm/qrwlock.h> | 24 | #include <asm/qrwlock.h> |
| 24 | 25 | ||
| 25 | /* | 26 | /* |
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index b2caec7315af..fd24153e8a48 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c | |||
| @@ -28,6 +28,7 @@ | |||
| 28 | #include <linux/percpu.h> | 28 | #include <linux/percpu.h> |
| 29 | #include <linux/hardirq.h> | 29 | #include <linux/hardirq.h> |
| 30 | #include <linux/mutex.h> | 30 | #include <linux/mutex.h> |
| 31 | #include <linux/prefetch.h> | ||
| 31 | #include <asm/byteorder.h> | 32 | #include <asm/byteorder.h> |
| 32 | #include <asm/qspinlock.h> | 33 | #include <asm/qspinlock.h> |
| 33 | 34 | ||
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index e6b2f7ad3e51..4ccfcaae5b89 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h | |||
| @@ -193,7 +193,8 @@ void __init __pv_init_lock_hash(void) | |||
| 193 | */ | 193 | */ |
| 194 | pv_lock_hash = alloc_large_system_hash("PV qspinlock", | 194 | pv_lock_hash = alloc_large_system_hash("PV qspinlock", |
| 195 | sizeof(struct pv_hash_entry), | 195 | sizeof(struct pv_hash_entry), |
| 196 | pv_hash_size, 0, HASH_EARLY, | 196 | pv_hash_size, 0, |
| 197 | HASH_EARLY | HASH_ZERO, | ||
| 197 | &pv_lock_hash_bits, NULL, | 198 | &pv_lock_hash_bits, NULL, |
| 198 | pv_hash_size, pv_hash_size); | 199 | pv_hash_size, pv_hash_size); |
| 199 | } | 200 | } |
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 78069895032a..649dc9d3951a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
| @@ -963,7 +963,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
| 963 | return -EDEADLK; | 963 | return -EDEADLK; |
| 964 | 964 | ||
| 965 | raw_spin_lock(&task->pi_lock); | 965 | raw_spin_lock(&task->pi_lock); |
| 966 | rt_mutex_adjust_prio(task); | ||
| 967 | waiter->task = task; | 966 | waiter->task = task; |
| 968 | waiter->lock = lock; | 967 | waiter->lock = lock; |
| 969 | waiter->prio = task->prio; | 968 | waiter->prio = task->prio; |
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index c65f7989f850..20819df98125 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c | |||
| @@ -231,8 +231,8 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state) | |||
| 231 | 231 | ||
| 232 | out_nolock: | 232 | out_nolock: |
| 233 | list_del(&waiter.list); | 233 | list_del(&waiter.list); |
| 234 | if (!list_empty(&sem->wait_list)) | 234 | if (!list_empty(&sem->wait_list) && sem->count >= 0) |
| 235 | __rwsem_do_wake(sem, 1); | 235 | __rwsem_do_wake(sem, 0); |
| 236 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | 236 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
| 237 | 237 | ||
| 238 | return -EINTR; | 238 | return -EINTR; |
diff --git a/kernel/memremap.c b/kernel/memremap.c index 23a6483c3666..124bed776532 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c | |||
| @@ -358,7 +358,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, | |||
| 358 | goto err_pfn_remap; | 358 | goto err_pfn_remap; |
| 359 | 359 | ||
| 360 | mem_hotplug_begin(); | 360 | mem_hotplug_begin(); |
| 361 | error = arch_add_memory(nid, align_start, align_size, true); | 361 | error = arch_add_memory(nid, align_start, align_size, false); |
| 362 | if (!error) | ||
| 363 | move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], | ||
| 364 | align_start >> PAGE_SHIFT, | ||
| 365 | align_size >> PAGE_SHIFT); | ||
| 362 | mem_hotplug_done(); | 366 | mem_hotplug_done(); |
| 363 | if (error) | 367 | if (error) |
| 364 | goto err_add_memory; | 368 | goto err_add_memory; |
diff --git a/kernel/module.c b/kernel/module.c index 4a3665f8f837..40f983cbea81 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -49,9 +49,7 @@ | |||
| 49 | #include <linux/rculist.h> | 49 | #include <linux/rculist.h> |
| 50 | #include <linux/uaccess.h> | 50 | #include <linux/uaccess.h> |
| 51 | #include <asm/cacheflush.h> | 51 | #include <asm/cacheflush.h> |
| 52 | #ifdef CONFIG_STRICT_MODULE_RWX | 52 | #include <linux/set_memory.h> |
| 53 | #include <asm/set_memory.h> | ||
| 54 | #endif | ||
| 55 | #include <asm/mmu_context.h> | 53 | #include <asm/mmu_context.h> |
| 56 | #include <linux/license.h> | 54 | #include <linux/license.h> |
| 57 | #include <asm/sections.h> | 55 | #include <asm/sections.h> |
| @@ -302,6 +300,7 @@ int unregister_module_notifier(struct notifier_block *nb) | |||
| 302 | EXPORT_SYMBOL(unregister_module_notifier); | 300 | EXPORT_SYMBOL(unregister_module_notifier); |
| 303 | 301 | ||
| 304 | struct load_info { | 302 | struct load_info { |
| 303 | const char *name; | ||
| 305 | Elf_Ehdr *hdr; | 304 | Elf_Ehdr *hdr; |
| 306 | unsigned long len; | 305 | unsigned long len; |
| 307 | Elf_Shdr *sechdrs; | 306 | Elf_Shdr *sechdrs; |
| @@ -602,7 +601,7 @@ static struct module *find_module_all(const char *name, size_t len, | |||
| 602 | 601 | ||
| 603 | module_assert_mutex_or_preempt(); | 602 | module_assert_mutex_or_preempt(); |
| 604 | 603 | ||
| 605 | list_for_each_entry(mod, &modules, list) { | 604 | list_for_each_entry_rcu(mod, &modules, list) { |
| 606 | if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) | 605 | if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) |
| 607 | continue; | 606 | continue; |
| 608 | if (strlen(mod->name) == len && !memcmp(mod->name, name, len)) | 607 | if (strlen(mod->name) == len && !memcmp(mod->name, name, len)) |
| @@ -1202,10 +1201,7 @@ static ssize_t store_uevent(struct module_attribute *mattr, | |||
| 1202 | struct module_kobject *mk, | 1201 | struct module_kobject *mk, |
| 1203 | const char *buffer, size_t count) | 1202 | const char *buffer, size_t count) |
| 1204 | { | 1203 | { |
| 1205 | enum kobject_action action; | 1204 | kobject_synth_uevent(&mk->kobj, buffer, count); |
| 1206 | |||
| 1207 | if (kobject_action_type(buffer, count, &action) == 0) | ||
| 1208 | kobject_uevent(&mk->kobj, action); | ||
| 1209 | return count; | 1205 | return count; |
| 1210 | } | 1206 | } |
| 1211 | 1207 | ||
| @@ -1278,12 +1274,13 @@ static u32 resolve_rel_crc(const s32 *crc) | |||
| 1278 | return *(u32 *)((void *)crc + *crc); | 1274 | return *(u32 *)((void *)crc + *crc); |
| 1279 | } | 1275 | } |
| 1280 | 1276 | ||
| 1281 | static int check_version(Elf_Shdr *sechdrs, | 1277 | static int check_version(const struct load_info *info, |
| 1282 | unsigned int versindex, | ||
| 1283 | const char *symname, | 1278 | const char *symname, |
| 1284 | struct module *mod, | 1279 | struct module *mod, |
| 1285 | const s32 *crc) | 1280 | const s32 *crc) |
| 1286 | { | 1281 | { |
| 1282 | Elf_Shdr *sechdrs = info->sechdrs; | ||
| 1283 | unsigned int versindex = info->index.vers; | ||
| 1287 | unsigned int i, num_versions; | 1284 | unsigned int i, num_versions; |
| 1288 | struct modversion_info *versions; | 1285 | struct modversion_info *versions; |
| 1289 | 1286 | ||
| @@ -1317,17 +1314,16 @@ static int check_version(Elf_Shdr *sechdrs, | |||
| 1317 | } | 1314 | } |
| 1318 | 1315 | ||
| 1319 | /* Broken toolchain. Warn once, then let it go.. */ | 1316 | /* Broken toolchain. Warn once, then let it go.. */ |
| 1320 | pr_warn_once("%s: no symbol version for %s\n", mod->name, symname); | 1317 | pr_warn_once("%s: no symbol version for %s\n", info->name, symname); |
| 1321 | return 1; | 1318 | return 1; |
| 1322 | 1319 | ||
| 1323 | bad_version: | 1320 | bad_version: |
| 1324 | pr_warn("%s: disagrees about version of symbol %s\n", | 1321 | pr_warn("%s: disagrees about version of symbol %s\n", |
| 1325 | mod->name, symname); | 1322 | info->name, symname); |
| 1326 | return 0; | 1323 | return 0; |
| 1327 | } | 1324 | } |
| 1328 | 1325 | ||
| 1329 | static inline int check_modstruct_version(Elf_Shdr *sechdrs, | 1326 | static inline int check_modstruct_version(const struct load_info *info, |
| 1330 | unsigned int versindex, | ||
| 1331 | struct module *mod) | 1327 | struct module *mod) |
| 1332 | { | 1328 | { |
| 1333 | const s32 *crc; | 1329 | const s32 *crc; |
| @@ -1343,8 +1339,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, | |||
| 1343 | BUG(); | 1339 | BUG(); |
| 1344 | } | 1340 | } |
| 1345 | preempt_enable(); | 1341 | preempt_enable(); |
| 1346 | return check_version(sechdrs, versindex, | 1342 | return check_version(info, VMLINUX_SYMBOL_STR(module_layout), |
| 1347 | VMLINUX_SYMBOL_STR(module_layout), mod, crc); | 1343 | mod, crc); |
| 1348 | } | 1344 | } |
| 1349 | 1345 | ||
| 1350 | /* First part is kernel version, which we ignore if module has crcs. */ | 1346 | /* First part is kernel version, which we ignore if module has crcs. */ |
| @@ -1358,8 +1354,7 @@ static inline int same_magic(const char *amagic, const char *bmagic, | |||
| 1358 | return strcmp(amagic, bmagic) == 0; | 1354 | return strcmp(amagic, bmagic) == 0; |
| 1359 | } | 1355 | } |
| 1360 | #else | 1356 | #else |
| 1361 | static inline int check_version(Elf_Shdr *sechdrs, | 1357 | static inline int check_version(const struct load_info *info, |
| 1362 | unsigned int versindex, | ||
| 1363 | const char *symname, | 1358 | const char *symname, |
| 1364 | struct module *mod, | 1359 | struct module *mod, |
| 1365 | const s32 *crc) | 1360 | const s32 *crc) |
| @@ -1367,8 +1362,7 @@ static inline int check_version(Elf_Shdr *sechdrs, | |||
| 1367 | return 1; | 1362 | return 1; |
| 1368 | } | 1363 | } |
| 1369 | 1364 | ||
| 1370 | static inline int check_modstruct_version(Elf_Shdr *sechdrs, | 1365 | static inline int check_modstruct_version(const struct load_info *info, |
| 1371 | unsigned int versindex, | ||
| 1372 | struct module *mod) | 1366 | struct module *mod) |
| 1373 | { | 1367 | { |
| 1374 | return 1; | 1368 | return 1; |
| @@ -1404,7 +1398,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, | |||
| 1404 | if (!sym) | 1398 | if (!sym) |
| 1405 | goto unlock; | 1399 | goto unlock; |
| 1406 | 1400 | ||
| 1407 | if (!check_version(info->sechdrs, info->index.vers, name, mod, crc)) { | 1401 | if (!check_version(info, name, mod, crc)) { |
| 1408 | sym = ERR_PTR(-EINVAL); | 1402 | sym = ERR_PTR(-EINVAL); |
| 1409 | goto getname; | 1403 | goto getname; |
| 1410 | } | 1404 | } |
| @@ -1667,31 +1661,36 @@ static inline void remove_notes_attrs(struct module *mod) | |||
| 1667 | } | 1661 | } |
| 1668 | #endif /* CONFIG_KALLSYMS */ | 1662 | #endif /* CONFIG_KALLSYMS */ |
| 1669 | 1663 | ||
| 1670 | static void add_usage_links(struct module *mod) | 1664 | static void del_usage_links(struct module *mod) |
| 1671 | { | 1665 | { |
| 1672 | #ifdef CONFIG_MODULE_UNLOAD | 1666 | #ifdef CONFIG_MODULE_UNLOAD |
| 1673 | struct module_use *use; | 1667 | struct module_use *use; |
| 1674 | int nowarn; | ||
| 1675 | 1668 | ||
| 1676 | mutex_lock(&module_mutex); | 1669 | mutex_lock(&module_mutex); |
| 1677 | list_for_each_entry(use, &mod->target_list, target_list) { | 1670 | list_for_each_entry(use, &mod->target_list, target_list) |
| 1678 | nowarn = sysfs_create_link(use->target->holders_dir, | 1671 | sysfs_remove_link(use->target->holders_dir, mod->name); |
| 1679 | &mod->mkobj.kobj, mod->name); | ||
| 1680 | } | ||
| 1681 | mutex_unlock(&module_mutex); | 1672 | mutex_unlock(&module_mutex); |
| 1682 | #endif | 1673 | #endif |
| 1683 | } | 1674 | } |
| 1684 | 1675 | ||
| 1685 | static void del_usage_links(struct module *mod) | 1676 | static int add_usage_links(struct module *mod) |
| 1686 | { | 1677 | { |
| 1678 | int ret = 0; | ||
| 1687 | #ifdef CONFIG_MODULE_UNLOAD | 1679 | #ifdef CONFIG_MODULE_UNLOAD |
| 1688 | struct module_use *use; | 1680 | struct module_use *use; |
| 1689 | 1681 | ||
| 1690 | mutex_lock(&module_mutex); | 1682 | mutex_lock(&module_mutex); |
| 1691 | list_for_each_entry(use, &mod->target_list, target_list) | 1683 | list_for_each_entry(use, &mod->target_list, target_list) { |
| 1692 | sysfs_remove_link(use->target->holders_dir, mod->name); | 1684 | ret = sysfs_create_link(use->target->holders_dir, |
| 1685 | &mod->mkobj.kobj, mod->name); | ||
| 1686 | if (ret) | ||
| 1687 | break; | ||
| 1688 | } | ||
| 1693 | mutex_unlock(&module_mutex); | 1689 | mutex_unlock(&module_mutex); |
| 1690 | if (ret) | ||
| 1691 | del_usage_links(mod); | ||
| 1694 | #endif | 1692 | #endif |
| 1693 | return ret; | ||
| 1695 | } | 1694 | } |
| 1696 | 1695 | ||
| 1697 | static int module_add_modinfo_attrs(struct module *mod) | 1696 | static int module_add_modinfo_attrs(struct module *mod) |
| @@ -1802,13 +1801,18 @@ static int mod_sysfs_setup(struct module *mod, | |||
| 1802 | if (err) | 1801 | if (err) |
| 1803 | goto out_unreg_param; | 1802 | goto out_unreg_param; |
| 1804 | 1803 | ||
| 1805 | add_usage_links(mod); | 1804 | err = add_usage_links(mod); |
| 1805 | if (err) | ||
| 1806 | goto out_unreg_modinfo_attrs; | ||
| 1807 | |||
| 1806 | add_sect_attrs(mod, info); | 1808 | add_sect_attrs(mod, info); |
| 1807 | add_notes_attrs(mod, info); | 1809 | add_notes_attrs(mod, info); |
| 1808 | 1810 | ||
| 1809 | kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); | 1811 | kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); |
| 1810 | return 0; | 1812 | return 0; |
| 1811 | 1813 | ||
| 1814 | out_unreg_modinfo_attrs: | ||
| 1815 | module_remove_modinfo_attrs(mod); | ||
| 1812 | out_unreg_param: | 1816 | out_unreg_param: |
| 1813 | module_param_sysfs_remove(mod); | 1817 | module_param_sysfs_remove(mod); |
| 1814 | out_unreg_holders: | 1818 | out_unreg_holders: |
| @@ -2915,9 +2919,15 @@ static int rewrite_section_headers(struct load_info *info, int flags) | |||
| 2915 | info->index.vers = 0; /* Pretend no __versions section! */ | 2919 | info->index.vers = 0; /* Pretend no __versions section! */ |
| 2916 | else | 2920 | else |
| 2917 | info->index.vers = find_sec(info, "__versions"); | 2921 | info->index.vers = find_sec(info, "__versions"); |
| 2922 | info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; | ||
| 2923 | |||
| 2918 | info->index.info = find_sec(info, ".modinfo"); | 2924 | info->index.info = find_sec(info, ".modinfo"); |
| 2925 | if (!info->index.info) | ||
| 2926 | info->name = "(missing .modinfo section)"; | ||
| 2927 | else | ||
| 2928 | info->name = get_modinfo(info, "name"); | ||
| 2919 | info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; | 2929 | info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; |
| 2920 | info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; | 2930 | |
| 2921 | return 0; | 2931 | return 0; |
| 2922 | } | 2932 | } |
| 2923 | 2933 | ||
| @@ -2957,21 +2967,29 @@ static struct module *setup_load_info(struct load_info *info, int flags) | |||
| 2957 | 2967 | ||
| 2958 | info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); | 2968 | info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); |
| 2959 | if (!info->index.mod) { | 2969 | if (!info->index.mod) { |
| 2960 | pr_warn("No module found in object\n"); | 2970 | pr_warn("%s: No module found in object\n", |
| 2971 | info->name ?: "(missing .modinfo name field)"); | ||
| 2961 | return ERR_PTR(-ENOEXEC); | 2972 | return ERR_PTR(-ENOEXEC); |
| 2962 | } | 2973 | } |
| 2963 | /* This is temporary: point mod into copy of data. */ | 2974 | /* This is temporary: point mod into copy of data. */ |
| 2964 | mod = (void *)info->sechdrs[info->index.mod].sh_addr; | 2975 | mod = (void *)info->sechdrs[info->index.mod].sh_addr; |
| 2965 | 2976 | ||
| 2977 | /* | ||
| 2978 | * If we didn't load the .modinfo 'name' field, fall back to | ||
| 2979 | * on-disk struct mod 'name' field. | ||
| 2980 | */ | ||
| 2981 | if (!info->name) | ||
| 2982 | info->name = mod->name; | ||
| 2983 | |||
| 2966 | if (info->index.sym == 0) { | 2984 | if (info->index.sym == 0) { |
| 2967 | pr_warn("%s: module has no symbols (stripped?)\n", mod->name); | 2985 | pr_warn("%s: module has no symbols (stripped?)\n", info->name); |
| 2968 | return ERR_PTR(-ENOEXEC); | 2986 | return ERR_PTR(-ENOEXEC); |
| 2969 | } | 2987 | } |
| 2970 | 2988 | ||
| 2971 | info->index.pcpu = find_pcpusec(info); | 2989 | info->index.pcpu = find_pcpusec(info); |
| 2972 | 2990 | ||
| 2973 | /* Check module struct version now, before we try to use module. */ | 2991 | /* Check module struct version now, before we try to use module. */ |
| 2974 | if (!check_modstruct_version(info->sechdrs, info->index.vers, mod)) | 2992 | if (!check_modstruct_version(info, mod)) |
| 2975 | return ERR_PTR(-ENOEXEC); | 2993 | return ERR_PTR(-ENOEXEC); |
| 2976 | 2994 | ||
| 2977 | return mod; | 2995 | return mod; |
| @@ -2992,7 +3010,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) | |||
| 2992 | return err; | 3010 | return err; |
| 2993 | } else if (!same_magic(modmagic, vermagic, info->index.vers)) { | 3011 | } else if (!same_magic(modmagic, vermagic, info->index.vers)) { |
| 2994 | pr_err("%s: version magic '%s' should be '%s'\n", | 3012 | pr_err("%s: version magic '%s' should be '%s'\n", |
| 2995 | mod->name, modmagic, vermagic); | 3013 | info->name, modmagic, vermagic); |
| 2996 | return -ENOEXEC; | 3014 | return -ENOEXEC; |
| 2997 | } | 3015 | } |
| 2998 | 3016 | ||
| @@ -3077,9 +3095,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) | |||
| 3077 | mod->trace_events = section_objs(info, "_ftrace_events", | 3095 | mod->trace_events = section_objs(info, "_ftrace_events", |
| 3078 | sizeof(*mod->trace_events), | 3096 | sizeof(*mod->trace_events), |
| 3079 | &mod->num_trace_events); | 3097 | &mod->num_trace_events); |
| 3080 | mod->trace_enums = section_objs(info, "_ftrace_enum_map", | 3098 | mod->trace_evals = section_objs(info, "_ftrace_eval_map", |
| 3081 | sizeof(*mod->trace_enums), | 3099 | sizeof(*mod->trace_evals), |
| 3082 | &mod->num_trace_enums); | 3100 | &mod->num_trace_evals); |
| 3083 | #endif | 3101 | #endif |
| 3084 | #ifdef CONFIG_TRACING | 3102 | #ifdef CONFIG_TRACING |
| 3085 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", | 3103 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", |
| @@ -3242,7 +3260,7 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr, | |||
| 3242 | 3260 | ||
| 3243 | /* module_blacklist is a comma-separated list of module names */ | 3261 | /* module_blacklist is a comma-separated list of module names */ |
| 3244 | static char *module_blacklist; | 3262 | static char *module_blacklist; |
| 3245 | static bool blacklisted(char *module_name) | 3263 | static bool blacklisted(const char *module_name) |
| 3246 | { | 3264 | { |
| 3247 | const char *p; | 3265 | const char *p; |
| 3248 | size_t len; | 3266 | size_t len; |
| @@ -3272,7 +3290,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) | |||
| 3272 | if (IS_ERR(mod)) | 3290 | if (IS_ERR(mod)) |
| 3273 | return mod; | 3291 | return mod; |
| 3274 | 3292 | ||
| 3275 | if (blacklisted(mod->name)) | 3293 | if (blacklisted(info->name)) |
| 3276 | return ERR_PTR(-EPERM); | 3294 | return ERR_PTR(-EPERM); |
| 3277 | 3295 | ||
| 3278 | err = check_modinfo(mod, info, flags); | 3296 | err = check_modinfo(mod, info, flags); |
| @@ -4201,7 +4219,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) | |||
| 4201 | goto out; | 4219 | goto out; |
| 4202 | 4220 | ||
| 4203 | e = search_extable(mod->extable, | 4221 | e = search_extable(mod->extable, |
| 4204 | mod->extable + mod->num_exentries - 1, | 4222 | mod->num_exentries, |
| 4205 | addr); | 4223 | addr); |
| 4206 | out: | 4224 | out: |
| 4207 | preempt_enable(); | 4225 | preempt_enable(); |
diff --git a/kernel/pid.c b/kernel/pid.c index fd1cde1e4576..731c4e528f4e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -575,16 +575,13 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) | |||
| 575 | */ | 575 | */ |
| 576 | void __init pidhash_init(void) | 576 | void __init pidhash_init(void) |
| 577 | { | 577 | { |
| 578 | unsigned int i, pidhash_size; | 578 | unsigned int pidhash_size; |
| 579 | 579 | ||
| 580 | pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, | 580 | pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, |
| 581 | HASH_EARLY | HASH_SMALL, | 581 | HASH_EARLY | HASH_SMALL | HASH_ZERO, |
| 582 | &pidhash_shift, NULL, | 582 | &pidhash_shift, NULL, |
| 583 | 0, 4096); | 583 | 0, 4096); |
| 584 | pidhash_size = 1U << pidhash_shift; | 584 | pidhash_size = 1U << pidhash_shift; |
| 585 | |||
| 586 | for (i = 0; i < pidhash_size; i++) | ||
| 587 | INIT_HLIST_HEAD(&pid_hash[i]); | ||
| 588 | } | 585 | } |
| 589 | 586 | ||
| 590 | void __init pidmap_init(void) | 587 | void __init pidmap_init(void) |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a8b978c35a6a..e1914c7b85b1 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
| @@ -1108,7 +1108,7 @@ static struct attribute * g[] = { | |||
| 1108 | }; | 1108 | }; |
| 1109 | 1109 | ||
| 1110 | 1110 | ||
| 1111 | static struct attribute_group attr_group = { | 1111 | static const struct attribute_group attr_group = { |
| 1112 | .attrs = g, | 1112 | .attrs = g, |
| 1113 | }; | 1113 | }; |
| 1114 | 1114 | ||
diff --git a/kernel/power/main.c b/kernel/power/main.c index d401c21136d1..42bd800a6755 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -705,7 +705,7 @@ static struct attribute * g[] = { | |||
| 705 | NULL, | 705 | NULL, |
| 706 | }; | 706 | }; |
| 707 | 707 | ||
| 708 | static struct attribute_group attr_group = { | 708 | static const struct attribute_group attr_group = { |
| 709 | .attrs = g, | 709 | .attrs = g, |
| 710 | }; | 710 | }; |
| 711 | 711 | ||
diff --git a/kernel/power/process.c b/kernel/power/process.c index c7209f060eeb..78672d324a6e 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
| @@ -132,7 +132,7 @@ int freeze_processes(void) | |||
| 132 | if (!pm_freezing) | 132 | if (!pm_freezing) |
| 133 | atomic_inc(&system_freezing_cnt); | 133 | atomic_inc(&system_freezing_cnt); |
| 134 | 134 | ||
| 135 | pm_wakeup_clear(); | 135 | pm_wakeup_clear(true); |
| 136 | pr_info("Freezing user space processes ... "); | 136 | pr_info("Freezing user space processes ... "); |
| 137 | pm_freezing = true; | 137 | pm_freezing = true; |
| 138 | error = try_to_freeze_tasks(true); | 138 | error = try_to_freeze_tasks(true); |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index fa46606f3356..222317721c5a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -30,19 +30,17 @@ | |||
| 30 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
| 31 | #include <linux/compiler.h> | 31 | #include <linux/compiler.h> |
| 32 | #include <linux/ktime.h> | 32 | #include <linux/ktime.h> |
| 33 | #include <linux/set_memory.h> | ||
| 33 | 34 | ||
| 34 | #include <linux/uaccess.h> | 35 | #include <linux/uaccess.h> |
| 35 | #include <asm/mmu_context.h> | 36 | #include <asm/mmu_context.h> |
| 36 | #include <asm/pgtable.h> | 37 | #include <asm/pgtable.h> |
| 37 | #include <asm/tlbflush.h> | 38 | #include <asm/tlbflush.h> |
| 38 | #include <asm/io.h> | 39 | #include <asm/io.h> |
| 39 | #ifdef CONFIG_STRICT_KERNEL_RWX | ||
| 40 | #include <asm/set_memory.h> | ||
| 41 | #endif | ||
| 42 | 40 | ||
| 43 | #include "power.h" | 41 | #include "power.h" |
| 44 | 42 | ||
| 45 | #ifdef CONFIG_STRICT_KERNEL_RWX | 43 | #if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_ARCH_HAS_SET_MEMORY) |
| 46 | static bool hibernate_restore_protection; | 44 | static bool hibernate_restore_protection; |
| 47 | static bool hibernate_restore_protection_active; | 45 | static bool hibernate_restore_protection_active; |
| 48 | 46 | ||
| @@ -77,7 +75,7 @@ static inline void hibernate_restore_protection_begin(void) {} | |||
| 77 | static inline void hibernate_restore_protection_end(void) {} | 75 | static inline void hibernate_restore_protection_end(void) {} |
| 78 | static inline void hibernate_restore_protect_page(void *page_address) {} | 76 | static inline void hibernate_restore_protect_page(void *page_address) {} |
| 79 | static inline void hibernate_restore_unprotect_page(void *page_address) {} | 77 | static inline void hibernate_restore_unprotect_page(void *page_address) {} |
| 80 | #endif /* CONFIG_STRICT_KERNEL_RWX */ | 78 | #endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */ |
| 81 | 79 | ||
| 82 | static int swsusp_page_is_free(struct page *); | 80 | static int swsusp_page_is_free(struct page *); |
| 83 | static void swsusp_set_page_forbidden(struct page *); | 81 | static void swsusp_set_page_forbidden(struct page *); |
| @@ -1929,8 +1927,7 @@ static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm, | |||
| 1929 | * also be located in the high memory, because of the way in which | 1927 | * also be located in the high memory, because of the way in which |
| 1930 | * copy_data_pages() works. | 1928 | * copy_data_pages() works. |
| 1931 | */ | 1929 | */ |
| 1932 | static int swsusp_alloc(struct memory_bitmap *orig_bm, | 1930 | static int swsusp_alloc(struct memory_bitmap *copy_bm, |
| 1933 | struct memory_bitmap *copy_bm, | ||
| 1934 | unsigned int nr_pages, unsigned int nr_highmem) | 1931 | unsigned int nr_pages, unsigned int nr_highmem) |
| 1935 | { | 1932 | { |
| 1936 | if (nr_highmem > 0) { | 1933 | if (nr_highmem > 0) { |
| @@ -1976,7 +1973,7 @@ asmlinkage __visible int swsusp_save(void) | |||
| 1976 | return -ENOMEM; | 1973 | return -ENOMEM; |
| 1977 | } | 1974 | } |
| 1978 | 1975 | ||
| 1979 | if (swsusp_alloc(&orig_bm, ©_bm, nr_pages, nr_highmem)) { | 1976 | if (swsusp_alloc(©_bm, nr_pages, nr_highmem)) { |
| 1980 | printk(KERN_ERR "PM: Memory allocation failed\n"); | 1977 | printk(KERN_ERR "PM: Memory allocation failed\n"); |
| 1981 | return -ENOMEM; | 1978 | return -ENOMEM; |
| 1982 | } | 1979 | } |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 15e6baef5c73..3ecf275d7e44 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -72,6 +72,8 @@ static void freeze_begin(void) | |||
| 72 | 72 | ||
| 73 | static void freeze_enter(void) | 73 | static void freeze_enter(void) |
| 74 | { | 74 | { |
| 75 | trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true); | ||
| 76 | |||
| 75 | spin_lock_irq(&suspend_freeze_lock); | 77 | spin_lock_irq(&suspend_freeze_lock); |
| 76 | if (pm_wakeup_pending()) | 78 | if (pm_wakeup_pending()) |
| 77 | goto out; | 79 | goto out; |
| @@ -84,11 +86,9 @@ static void freeze_enter(void) | |||
| 84 | 86 | ||
| 85 | /* Push all the CPUs into the idle loop. */ | 87 | /* Push all the CPUs into the idle loop. */ |
| 86 | wake_up_all_idle_cpus(); | 88 | wake_up_all_idle_cpus(); |
| 87 | pr_debug("PM: suspend-to-idle\n"); | ||
| 88 | /* Make the current CPU wait so it can enter the idle loop too. */ | 89 | /* Make the current CPU wait so it can enter the idle loop too. */ |
| 89 | wait_event(suspend_freeze_wait_head, | 90 | wait_event(suspend_freeze_wait_head, |
| 90 | suspend_freeze_state == FREEZE_STATE_WAKE); | 91 | suspend_freeze_state == FREEZE_STATE_WAKE); |
| 91 | pr_debug("PM: resume from suspend-to-idle\n"); | ||
| 92 | 92 | ||
| 93 | cpuidle_pause(); | 93 | cpuidle_pause(); |
| 94 | put_online_cpus(); | 94 | put_online_cpus(); |
| @@ -98,6 +98,31 @@ static void freeze_enter(void) | |||
| 98 | out: | 98 | out: |
| 99 | suspend_freeze_state = FREEZE_STATE_NONE; | 99 | suspend_freeze_state = FREEZE_STATE_NONE; |
| 100 | spin_unlock_irq(&suspend_freeze_lock); | 100 | spin_unlock_irq(&suspend_freeze_lock); |
| 101 | |||
| 102 | trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false); | ||
| 103 | } | ||
| 104 | |||
| 105 | static void s2idle_loop(void) | ||
| 106 | { | ||
| 107 | pr_debug("PM: suspend-to-idle\n"); | ||
| 108 | |||
| 109 | do { | ||
| 110 | freeze_enter(); | ||
| 111 | |||
| 112 | if (freeze_ops && freeze_ops->wake) | ||
| 113 | freeze_ops->wake(); | ||
| 114 | |||
| 115 | dpm_resume_noirq(PMSG_RESUME); | ||
| 116 | if (freeze_ops && freeze_ops->sync) | ||
| 117 | freeze_ops->sync(); | ||
| 118 | |||
| 119 | if (pm_wakeup_pending()) | ||
| 120 | break; | ||
| 121 | |||
| 122 | pm_wakeup_clear(false); | ||
| 123 | } while (!dpm_suspend_noirq(PMSG_SUSPEND)); | ||
| 124 | |||
| 125 | pr_debug("PM: resume from suspend-to-idle\n"); | ||
| 101 | } | 126 | } |
| 102 | 127 | ||
| 103 | void freeze_wake(void) | 128 | void freeze_wake(void) |
| @@ -371,10 +396,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
| 371 | * all the devices are suspended. | 396 | * all the devices are suspended. |
| 372 | */ | 397 | */ |
| 373 | if (state == PM_SUSPEND_FREEZE) { | 398 | if (state == PM_SUSPEND_FREEZE) { |
| 374 | trace_suspend_resume(TPS("machine_suspend"), state, true); | 399 | s2idle_loop(); |
| 375 | freeze_enter(); | 400 | goto Platform_early_resume; |
| 376 | trace_suspend_resume(TPS("machine_suspend"), state, false); | ||
| 377 | goto Platform_wake; | ||
| 378 | } | 401 | } |
| 379 | 402 | ||
| 380 | error = disable_nonboot_cpus(); | 403 | error = disable_nonboot_cpus(); |
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 1db044f808b7..2a7d04049af4 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h | |||
| @@ -18,12 +18,14 @@ | |||
| 18 | 18 | ||
| 19 | #ifdef CONFIG_PRINTK | 19 | #ifdef CONFIG_PRINTK |
| 20 | 20 | ||
| 21 | #define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff | 21 | #define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff |
| 22 | #define PRINTK_NMI_CONTEXT_MASK 0x80000000 | 22 | #define PRINTK_NMI_DEFERRED_CONTEXT_MASK 0x40000000 |
| 23 | #define PRINTK_NMI_CONTEXT_MASK 0x80000000 | ||
| 23 | 24 | ||
| 24 | extern raw_spinlock_t logbuf_lock; | 25 | extern raw_spinlock_t logbuf_lock; |
| 25 | 26 | ||
| 26 | __printf(1, 0) int vprintk_default(const char *fmt, va_list args); | 27 | __printf(1, 0) int vprintk_default(const char *fmt, va_list args); |
| 28 | __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); | ||
| 27 | __printf(1, 0) int vprintk_func(const char *fmt, va_list args); | 29 | __printf(1, 0) int vprintk_func(const char *fmt, va_list args); |
| 28 | void __printk_safe_enter(void); | 30 | void __printk_safe_enter(void); |
| 29 | void __printk_safe_exit(void); | 31 | void __printk_safe_exit(void); |
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index bd53ea579dc8..fc47863f629c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
| @@ -2720,16 +2720,13 @@ void wake_up_klogd(void) | |||
| 2720 | preempt_enable(); | 2720 | preempt_enable(); |
| 2721 | } | 2721 | } |
| 2722 | 2722 | ||
| 2723 | int printk_deferred(const char *fmt, ...) | 2723 | int vprintk_deferred(const char *fmt, va_list args) |
| 2724 | { | 2724 | { |
| 2725 | va_list args; | ||
| 2726 | int r; | 2725 | int r; |
| 2727 | 2726 | ||
| 2728 | preempt_disable(); | ||
| 2729 | va_start(args, fmt); | ||
| 2730 | r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); | 2727 | r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); |
| 2731 | va_end(args); | ||
| 2732 | 2728 | ||
| 2729 | preempt_disable(); | ||
| 2733 | __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); | 2730 | __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); |
| 2734 | irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); | 2731 | irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); |
| 2735 | preempt_enable(); | 2732 | preempt_enable(); |
| @@ -2737,6 +2734,18 @@ int printk_deferred(const char *fmt, ...) | |||
| 2737 | return r; | 2734 | return r; |
| 2738 | } | 2735 | } |
| 2739 | 2736 | ||
| 2737 | int printk_deferred(const char *fmt, ...) | ||
| 2738 | { | ||
| 2739 | va_list args; | ||
| 2740 | int r; | ||
| 2741 | |||
| 2742 | va_start(args, fmt); | ||
| 2743 | r = vprintk_deferred(fmt, args); | ||
| 2744 | va_end(args); | ||
| 2745 | |||
| 2746 | return r; | ||
| 2747 | } | ||
| 2748 | |||
| 2740 | /* | 2749 | /* |
| 2741 | * printk rate limiting, lifted from the networking subsystem. | 2750 | * printk rate limiting, lifted from the networking subsystem. |
| 2742 | * | 2751 | * |
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 033e50a7d706..3cdaeaef9ce1 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c | |||
| @@ -80,8 +80,8 @@ static void queue_flush_work(struct printk_safe_seq_buf *s) | |||
| 80 | * happen, printk_safe_log_store() will notice the buffer->len mismatch | 80 | * happen, printk_safe_log_store() will notice the buffer->len mismatch |
| 81 | * and repeat the write. | 81 | * and repeat the write. |
| 82 | */ | 82 | */ |
| 83 | static int printk_safe_log_store(struct printk_safe_seq_buf *s, | 83 | static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s, |
| 84 | const char *fmt, va_list args) | 84 | const char *fmt, va_list args) |
| 85 | { | 85 | { |
| 86 | int add; | 86 | int add; |
| 87 | size_t len; | 87 | size_t len; |
| @@ -299,7 +299,7 @@ void printk_safe_flush_on_panic(void) | |||
| 299 | * one writer running. But the buffer might get flushed from another | 299 | * one writer running. But the buffer might get flushed from another |
| 300 | * CPU, so we need to be careful. | 300 | * CPU, so we need to be careful. |
| 301 | */ | 301 | */ |
| 302 | static int vprintk_nmi(const char *fmt, va_list args) | 302 | static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) |
| 303 | { | 303 | { |
| 304 | struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); | 304 | struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); |
| 305 | 305 | ||
| @@ -308,17 +308,29 @@ static int vprintk_nmi(const char *fmt, va_list args) | |||
| 308 | 308 | ||
| 309 | void printk_nmi_enter(void) | 309 | void printk_nmi_enter(void) |
| 310 | { | 310 | { |
| 311 | this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); | 311 | /* |
| 312 | * The size of the extra per-CPU buffer is limited. Use it only when | ||
| 313 | * the main one is locked. If this CPU is not in the safe context, | ||
| 314 | * the lock must be taken on another CPU and we could wait for it. | ||
| 315 | */ | ||
| 316 | if ((this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) && | ||
| 317 | raw_spin_is_locked(&logbuf_lock)) { | ||
| 318 | this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); | ||
| 319 | } else { | ||
| 320 | this_cpu_or(printk_context, PRINTK_NMI_DEFERRED_CONTEXT_MASK); | ||
| 321 | } | ||
| 312 | } | 322 | } |
| 313 | 323 | ||
| 314 | void printk_nmi_exit(void) | 324 | void printk_nmi_exit(void) |
| 315 | { | 325 | { |
| 316 | this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); | 326 | this_cpu_and(printk_context, |
| 327 | ~(PRINTK_NMI_CONTEXT_MASK | | ||
| 328 | PRINTK_NMI_DEFERRED_CONTEXT_MASK)); | ||
| 317 | } | 329 | } |
| 318 | 330 | ||
| 319 | #else | 331 | #else |
| 320 | 332 | ||
| 321 | static int vprintk_nmi(const char *fmt, va_list args) | 333 | static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) |
| 322 | { | 334 | { |
| 323 | return 0; | 335 | return 0; |
| 324 | } | 336 | } |
| @@ -330,7 +342,7 @@ static int vprintk_nmi(const char *fmt, va_list args) | |||
| 330 | * into itself. It uses a per-CPU buffer to store the message, just like | 342 | * into itself. It uses a per-CPU buffer to store the message, just like |
| 331 | * NMI. | 343 | * NMI. |
| 332 | */ | 344 | */ |
| 333 | static int vprintk_safe(const char *fmt, va_list args) | 345 | static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args) |
| 334 | { | 346 | { |
| 335 | struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); | 347 | struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); |
| 336 | 348 | ||
| @@ -351,12 +363,22 @@ void __printk_safe_exit(void) | |||
| 351 | 363 | ||
| 352 | __printf(1, 0) int vprintk_func(const char *fmt, va_list args) | 364 | __printf(1, 0) int vprintk_func(const char *fmt, va_list args) |
| 353 | { | 365 | { |
| 366 | /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */ | ||
| 354 | if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) | 367 | if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) |
| 355 | return vprintk_nmi(fmt, args); | 368 | return vprintk_nmi(fmt, args); |
| 356 | 369 | ||
| 370 | /* Use extra buffer to prevent a recursion deadlock in safe mode. */ | ||
| 357 | if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) | 371 | if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) |
| 358 | return vprintk_safe(fmt, args); | 372 | return vprintk_safe(fmt, args); |
| 359 | 373 | ||
| 374 | /* | ||
| 375 | * Use the main logbuf when logbuf_lock is available in NMI. | ||
| 376 | * But avoid calling console drivers that might have their own locks. | ||
| 377 | */ | ||
| 378 | if (this_cpu_read(printk_context) & PRINTK_NMI_DEFERRED_CONTEXT_MASK) | ||
| 379 | return vprintk_deferred(fmt, args); | ||
| 380 | |||
| 381 | /* No obstacles. */ | ||
| 360 | return vprintk_default(fmt, args); | 382 | return vprintk_default(fmt, args); |
| 361 | } | 383 | } |
| 362 | 384 | ||
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 076a2e31951c..29a397067ffa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c | |||
| @@ -610,6 +610,11 @@ static int sugov_start(struct cpufreq_policy *policy) | |||
| 610 | sg_cpu->sg_policy = sg_policy; | 610 | sg_cpu->sg_policy = sg_policy; |
| 611 | sg_cpu->flags = SCHED_CPUFREQ_RT; | 611 | sg_cpu->flags = SCHED_CPUFREQ_RT; |
| 612 | sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; | 612 | sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; |
| 613 | } | ||
| 614 | |||
| 615 | for_each_cpu(cpu, policy->cpus) { | ||
| 616 | struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); | ||
| 617 | |||
| 613 | cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, | 618 | cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, |
| 614 | policy_is_shared(policy) ? | 619 | policy_is_shared(policy) ? |
| 615 | sugov_update_shared : | 620 | sugov_update_shared : |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 67c70e287647..14d2dbf97c53 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
| @@ -611,17 +611,23 @@ static void cputime_adjust(struct task_cputime *curr, | |||
| 611 | utime = curr->utime; | 611 | utime = curr->utime; |
| 612 | 612 | ||
| 613 | /* | 613 | /* |
| 614 | * If either stime or both stime and utime are 0, assume all runtime is | 614 | * If either stime or utime are 0, assume all runtime is userspace. |
| 615 | * userspace. Once a task gets some ticks, the monotonicy code at | 615 | * Once a task gets some ticks, the monotonicy code at 'update:' |
| 616 | * 'update' will ensure things converge to the observed ratio. | 616 | * will ensure things converge to the observed ratio. |
| 617 | */ | 617 | */ |
| 618 | if (stime != 0) { | 618 | if (stime == 0) { |
| 619 | if (utime == 0) | 619 | utime = rtime; |
| 620 | stime = rtime; | 620 | goto update; |
| 621 | else | ||
| 622 | stime = scale_stime(stime, rtime, stime + utime); | ||
| 623 | } | 621 | } |
| 624 | 622 | ||
| 623 | if (utime == 0) { | ||
| 624 | stime = rtime; | ||
| 625 | goto update; | ||
| 626 | } | ||
| 627 | |||
| 628 | stime = scale_stime(stime, rtime, stime + utime); | ||
| 629 | |||
| 630 | update: | ||
| 625 | /* | 631 | /* |
| 626 | * Make sure stime doesn't go backwards; this preserves monotonicity | 632 | * Make sure stime doesn't go backwards; this preserves monotonicity |
| 627 | * for utime because rtime is monotonic. | 633 | * for utime because rtime is monotonic. |
| @@ -673,20 +679,21 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) | |||
| 673 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | 679 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ |
| 674 | 680 | ||
| 675 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 681 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
| 676 | static u64 vtime_delta(struct task_struct *tsk) | 682 | static u64 vtime_delta(struct vtime *vtime) |
| 677 | { | 683 | { |
| 678 | unsigned long now = READ_ONCE(jiffies); | 684 | unsigned long long clock; |
| 679 | 685 | ||
| 680 | if (time_before(now, (unsigned long)tsk->vtime_snap)) | 686 | clock = sched_clock(); |
| 687 | if (clock < vtime->starttime) | ||
| 681 | return 0; | 688 | return 0; |
| 682 | 689 | ||
| 683 | return jiffies_to_nsecs(now - tsk->vtime_snap); | 690 | return clock - vtime->starttime; |
| 684 | } | 691 | } |
| 685 | 692 | ||
| 686 | static u64 get_vtime_delta(struct task_struct *tsk) | 693 | static u64 get_vtime_delta(struct vtime *vtime) |
| 687 | { | 694 | { |
| 688 | unsigned long now = READ_ONCE(jiffies); | 695 | u64 delta = vtime_delta(vtime); |
| 689 | u64 delta, other; | 696 | u64 other; |
| 690 | 697 | ||
| 691 | /* | 698 | /* |
| 692 | * Unlike tick based timing, vtime based timing never has lost | 699 | * Unlike tick based timing, vtime based timing never has lost |
| @@ -695,104 +702,138 @@ static u64 get_vtime_delta(struct task_struct *tsk) | |||
| 695 | * elapsed time. Limit account_other_time to prevent rounding | 702 | * elapsed time. Limit account_other_time to prevent rounding |
| 696 | * errors from causing elapsed vtime to go negative. | 703 | * errors from causing elapsed vtime to go negative. |
| 697 | */ | 704 | */ |
| 698 | delta = jiffies_to_nsecs(now - tsk->vtime_snap); | ||
| 699 | other = account_other_time(delta); | 705 | other = account_other_time(delta); |
| 700 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); | 706 | WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); |
| 701 | tsk->vtime_snap = now; | 707 | vtime->starttime += delta; |
| 702 | 708 | ||
| 703 | return delta - other; | 709 | return delta - other; |
| 704 | } | 710 | } |
| 705 | 711 | ||
| 706 | static void __vtime_account_system(struct task_struct *tsk) | 712 | static void __vtime_account_system(struct task_struct *tsk, |
| 713 | struct vtime *vtime) | ||
| 714 | { | ||
| 715 | vtime->stime += get_vtime_delta(vtime); | ||
| 716 | if (vtime->stime >= TICK_NSEC) { | ||
| 717 | account_system_time(tsk, irq_count(), vtime->stime); | ||
| 718 | vtime->stime = 0; | ||
| 719 | } | ||
| 720 | } | ||
| 721 | |||
| 722 | static void vtime_account_guest(struct task_struct *tsk, | ||
| 723 | struct vtime *vtime) | ||
| 707 | { | 724 | { |
| 708 | account_system_time(tsk, irq_count(), get_vtime_delta(tsk)); | 725 | vtime->gtime += get_vtime_delta(vtime); |
| 726 | if (vtime->gtime >= TICK_NSEC) { | ||
| 727 | account_guest_time(tsk, vtime->gtime); | ||
| 728 | vtime->gtime = 0; | ||
| 729 | } | ||
| 709 | } | 730 | } |
| 710 | 731 | ||
| 711 | void vtime_account_system(struct task_struct *tsk) | 732 | void vtime_account_system(struct task_struct *tsk) |
| 712 | { | 733 | { |
| 713 | if (!vtime_delta(tsk)) | 734 | struct vtime *vtime = &tsk->vtime; |
| 735 | |||
| 736 | if (!vtime_delta(vtime)) | ||
| 714 | return; | 737 | return; |
| 715 | 738 | ||
| 716 | write_seqcount_begin(&tsk->vtime_seqcount); | 739 | write_seqcount_begin(&vtime->seqcount); |
| 717 | __vtime_account_system(tsk); | 740 | /* We might have scheduled out from guest path */ |
| 718 | write_seqcount_end(&tsk->vtime_seqcount); | 741 | if (current->flags & PF_VCPU) |
| 742 | vtime_account_guest(tsk, vtime); | ||
| 743 | else | ||
| 744 | __vtime_account_system(tsk, vtime); | ||
| 745 | write_seqcount_end(&vtime->seqcount); | ||
| 719 | } | 746 | } |
| 720 | 747 | ||
| 721 | void vtime_account_user(struct task_struct *tsk) | 748 | void vtime_user_enter(struct task_struct *tsk) |
| 722 | { | 749 | { |
| 723 | write_seqcount_begin(&tsk->vtime_seqcount); | 750 | struct vtime *vtime = &tsk->vtime; |
| 724 | tsk->vtime_snap_whence = VTIME_SYS; | 751 | |
| 725 | if (vtime_delta(tsk)) | 752 | write_seqcount_begin(&vtime->seqcount); |
| 726 | account_user_time(tsk, get_vtime_delta(tsk)); | 753 | __vtime_account_system(tsk, vtime); |
| 727 | write_seqcount_end(&tsk->vtime_seqcount); | 754 | vtime->state = VTIME_USER; |
| 755 | write_seqcount_end(&vtime->seqcount); | ||
| 728 | } | 756 | } |
| 729 | 757 | ||
| 730 | void vtime_user_enter(struct task_struct *tsk) | 758 | void vtime_user_exit(struct task_struct *tsk) |
| 731 | { | 759 | { |
| 732 | write_seqcount_begin(&tsk->vtime_seqcount); | 760 | struct vtime *vtime = &tsk->vtime; |
| 733 | if (vtime_delta(tsk)) | 761 | |
| 734 | __vtime_account_system(tsk); | 762 | write_seqcount_begin(&vtime->seqcount); |
| 735 | tsk->vtime_snap_whence = VTIME_USER; | 763 | vtime->utime += get_vtime_delta(vtime); |
| 736 | write_seqcount_end(&tsk->vtime_seqcount); | 764 | if (vtime->utime >= TICK_NSEC) { |
| 765 | account_user_time(tsk, vtime->utime); | ||
| 766 | vtime->utime = 0; | ||
| 767 | } | ||
| 768 | vtime->state = VTIME_SYS; | ||
| 769 | write_seqcount_end(&vtime->seqcount); | ||
| 737 | } | 770 | } |
| 738 | 771 | ||
| 739 | void vtime_guest_enter(struct task_struct *tsk) | 772 | void vtime_guest_enter(struct task_struct *tsk) |
| 740 | { | 773 | { |
| 774 | struct vtime *vtime = &tsk->vtime; | ||
| 741 | /* | 775 | /* |
| 742 | * The flags must be updated under the lock with | 776 | * The flags must be updated under the lock with |
| 743 | * the vtime_snap flush and update. | 777 | * the vtime_starttime flush and update. |
| 744 | * That enforces a right ordering and update sequence | 778 | * That enforces a right ordering and update sequence |
| 745 | * synchronization against the reader (task_gtime()) | 779 | * synchronization against the reader (task_gtime()) |
| 746 | * that can thus safely catch up with a tickless delta. | 780 | * that can thus safely catch up with a tickless delta. |
| 747 | */ | 781 | */ |
| 748 | write_seqcount_begin(&tsk->vtime_seqcount); | 782 | write_seqcount_begin(&vtime->seqcount); |
| 749 | if (vtime_delta(tsk)) | 783 | __vtime_account_system(tsk, vtime); |
| 750 | __vtime_account_system(tsk); | ||
| 751 | current->flags |= PF_VCPU; | 784 | current->flags |= PF_VCPU; |
| 752 | write_seqcount_end(&tsk->vtime_seqcount); | 785 | write_seqcount_end(&vtime->seqcount); |
| 753 | } | 786 | } |
| 754 | EXPORT_SYMBOL_GPL(vtime_guest_enter); | 787 | EXPORT_SYMBOL_GPL(vtime_guest_enter); |
| 755 | 788 | ||
| 756 | void vtime_guest_exit(struct task_struct *tsk) | 789 | void vtime_guest_exit(struct task_struct *tsk) |
| 757 | { | 790 | { |
| 758 | write_seqcount_begin(&tsk->vtime_seqcount); | 791 | struct vtime *vtime = &tsk->vtime; |
| 759 | __vtime_account_system(tsk); | 792 | |
| 793 | write_seqcount_begin(&vtime->seqcount); | ||
| 794 | vtime_account_guest(tsk, vtime); | ||
| 760 | current->flags &= ~PF_VCPU; | 795 | current->flags &= ~PF_VCPU; |
| 761 | write_seqcount_end(&tsk->vtime_seqcount); | 796 | write_seqcount_end(&vtime->seqcount); |
| 762 | } | 797 | } |
| 763 | EXPORT_SYMBOL_GPL(vtime_guest_exit); | 798 | EXPORT_SYMBOL_GPL(vtime_guest_exit); |
| 764 | 799 | ||
| 765 | void vtime_account_idle(struct task_struct *tsk) | 800 | void vtime_account_idle(struct task_struct *tsk) |
| 766 | { | 801 | { |
| 767 | account_idle_time(get_vtime_delta(tsk)); | 802 | account_idle_time(get_vtime_delta(&tsk->vtime)); |
| 768 | } | 803 | } |
| 769 | 804 | ||
| 770 | void arch_vtime_task_switch(struct task_struct *prev) | 805 | void arch_vtime_task_switch(struct task_struct *prev) |
| 771 | { | 806 | { |
| 772 | write_seqcount_begin(&prev->vtime_seqcount); | 807 | struct vtime *vtime = &prev->vtime; |
| 773 | prev->vtime_snap_whence = VTIME_INACTIVE; | 808 | |
| 774 | write_seqcount_end(&prev->vtime_seqcount); | 809 | write_seqcount_begin(&vtime->seqcount); |
| 810 | vtime->state = VTIME_INACTIVE; | ||
| 811 | write_seqcount_end(&vtime->seqcount); | ||
| 812 | |||
| 813 | vtime = ¤t->vtime; | ||
| 775 | 814 | ||
| 776 | write_seqcount_begin(¤t->vtime_seqcount); | 815 | write_seqcount_begin(&vtime->seqcount); |
| 777 | current->vtime_snap_whence = VTIME_SYS; | 816 | vtime->state = VTIME_SYS; |
| 778 | current->vtime_snap = jiffies; | 817 | vtime->starttime = sched_clock(); |
| 779 | write_seqcount_end(¤t->vtime_seqcount); | 818 | write_seqcount_end(&vtime->seqcount); |
| 780 | } | 819 | } |
| 781 | 820 | ||
| 782 | void vtime_init_idle(struct task_struct *t, int cpu) | 821 | void vtime_init_idle(struct task_struct *t, int cpu) |
| 783 | { | 822 | { |
| 823 | struct vtime *vtime = &t->vtime; | ||
| 784 | unsigned long flags; | 824 | unsigned long flags; |
| 785 | 825 | ||
| 786 | local_irq_save(flags); | 826 | local_irq_save(flags); |
| 787 | write_seqcount_begin(&t->vtime_seqcount); | 827 | write_seqcount_begin(&vtime->seqcount); |
| 788 | t->vtime_snap_whence = VTIME_SYS; | 828 | vtime->state = VTIME_SYS; |
| 789 | t->vtime_snap = jiffies; | 829 | vtime->starttime = sched_clock(); |
| 790 | write_seqcount_end(&t->vtime_seqcount); | 830 | write_seqcount_end(&vtime->seqcount); |
| 791 | local_irq_restore(flags); | 831 | local_irq_restore(flags); |
| 792 | } | 832 | } |
| 793 | 833 | ||
| 794 | u64 task_gtime(struct task_struct *t) | 834 | u64 task_gtime(struct task_struct *t) |
| 795 | { | 835 | { |
| 836 | struct vtime *vtime = &t->vtime; | ||
| 796 | unsigned int seq; | 837 | unsigned int seq; |
| 797 | u64 gtime; | 838 | u64 gtime; |
| 798 | 839 | ||
| @@ -800,13 +841,13 @@ u64 task_gtime(struct task_struct *t) | |||
| 800 | return t->gtime; | 841 | return t->gtime; |
| 801 | 842 | ||
| 802 | do { | 843 | do { |
| 803 | seq = read_seqcount_begin(&t->vtime_seqcount); | 844 | seq = read_seqcount_begin(&vtime->seqcount); |
| 804 | 845 | ||
| 805 | gtime = t->gtime; | 846 | gtime = t->gtime; |
| 806 | if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) | 847 | if (vtime->state == VTIME_SYS && t->flags & PF_VCPU) |
| 807 | gtime += vtime_delta(t); | 848 | gtime += vtime->gtime + vtime_delta(vtime); |
| 808 | 849 | ||
| 809 | } while (read_seqcount_retry(&t->vtime_seqcount, seq)); | 850 | } while (read_seqcount_retry(&vtime->seqcount, seq)); |
| 810 | 851 | ||
| 811 | return gtime; | 852 | return gtime; |
| 812 | } | 853 | } |
| @@ -818,8 +859,9 @@ u64 task_gtime(struct task_struct *t) | |||
| 818 | */ | 859 | */ |
| 819 | void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) | 860 | void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) |
| 820 | { | 861 | { |
| 821 | u64 delta; | 862 | struct vtime *vtime = &t->vtime; |
| 822 | unsigned int seq; | 863 | unsigned int seq; |
| 864 | u64 delta; | ||
| 823 | 865 | ||
| 824 | if (!vtime_accounting_enabled()) { | 866 | if (!vtime_accounting_enabled()) { |
| 825 | *utime = t->utime; | 867 | *utime = t->utime; |
| @@ -828,25 +870,25 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) | |||
| 828 | } | 870 | } |
| 829 | 871 | ||
| 830 | do { | 872 | do { |
| 831 | seq = read_seqcount_begin(&t->vtime_seqcount); | 873 | seq = read_seqcount_begin(&vtime->seqcount); |
| 832 | 874 | ||
| 833 | *utime = t->utime; | 875 | *utime = t->utime; |
| 834 | *stime = t->stime; | 876 | *stime = t->stime; |
| 835 | 877 | ||
| 836 | /* Task is sleeping, nothing to add */ | 878 | /* Task is sleeping, nothing to add */ |
| 837 | if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) | 879 | if (vtime->state == VTIME_INACTIVE || is_idle_task(t)) |
| 838 | continue; | 880 | continue; |
| 839 | 881 | ||
| 840 | delta = vtime_delta(t); | 882 | delta = vtime_delta(vtime); |
| 841 | 883 | ||
| 842 | /* | 884 | /* |
| 843 | * Task runs either in user or kernel space, add pending nohz time to | 885 | * Task runs either in user or kernel space, add pending nohz time to |
| 844 | * the right place. | 886 | * the right place. |
| 845 | */ | 887 | */ |
| 846 | if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) | 888 | if (vtime->state == VTIME_USER || t->flags & PF_VCPU) |
| 847 | *utime += delta; | 889 | *utime += vtime->utime + delta; |
| 848 | else if (t->vtime_snap_whence == VTIME_SYS) | 890 | else if (vtime->state == VTIME_SYS) |
| 849 | *stime += delta; | 891 | *stime += vtime->stime + delta; |
| 850 | } while (read_seqcount_retry(&t->vtime_seqcount, seq)); | 892 | } while (read_seqcount_retry(&vtime->seqcount, seq)); |
| 851 | } | 893 | } |
| 852 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ | 894 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a84299f44b5d..755bd3f1a1a9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
| @@ -1392,17 +1392,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) | |||
| 1392 | struct sched_dl_entity *pi_se = &p->dl; | 1392 | struct sched_dl_entity *pi_se = &p->dl; |
| 1393 | 1393 | ||
| 1394 | /* | 1394 | /* |
| 1395 | * Use the scheduling parameters of the top pi-waiter | 1395 | * Use the scheduling parameters of the top pi-waiter task if: |
| 1396 | * task if we have one and its (absolute) deadline is | 1396 | * - we have a top pi-waiter which is a SCHED_DEADLINE task AND |
| 1397 | * smaller than our one... OTW we keep our runtime and | 1397 | * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is |
| 1398 | * deadline. | 1398 | * smaller than our deadline OR we are a !SCHED_DEADLINE task getting |
| 1399 | * boosted due to a SCHED_DEADLINE pi-waiter). | ||
| 1400 | * Otherwise we keep our runtime and deadline. | ||
| 1399 | */ | 1401 | */ |
| 1400 | if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { | 1402 | if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { |
| 1401 | pi_se = &pi_task->dl; | 1403 | pi_se = &pi_task->dl; |
| 1402 | } else if (!dl_prio(p->normal_prio)) { | 1404 | } else if (!dl_prio(p->normal_prio)) { |
| 1403 | /* | 1405 | /* |
| 1404 | * Special case in which we have a !SCHED_DEADLINE task | 1406 | * Special case in which we have a !SCHED_DEADLINE task |
| 1405 | * that is going to be deboosted, but exceedes its | 1407 | * that is going to be deboosted, but exceeds its |
| 1406 | * runtime while doing so. No point in replenishing | 1408 | * runtime while doing so. No point in replenishing |
| 1407 | * it, as it's going to return back to its original | 1409 | * it, as it's going to return back to its original |
| 1408 | * scheduling class after this. | 1410 | * scheduling class after this. |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 008c514dc241..c95880e216f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -6646,10 +6646,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 6646 | * our sched_group. We may want to revisit it if we couldn't | 6646 | * our sched_group. We may want to revisit it if we couldn't |
| 6647 | * meet load balance goals by pulling other tasks on src_cpu. | 6647 | * meet load balance goals by pulling other tasks on src_cpu. |
| 6648 | * | 6648 | * |
| 6649 | * Also avoid computing new_dst_cpu if we have already computed | 6649 | * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have |
| 6650 | * one in current iteration. | 6650 | * already computed one in current iteration. |
| 6651 | */ | 6651 | */ |
| 6652 | if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) | 6652 | if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) |
| 6653 | return 0; | 6653 | return 0; |
| 6654 | 6654 | ||
| 6655 | /* Prevent to re-select dst_cpu via env's cpus */ | 6655 | /* Prevent to re-select dst_cpu via env's cpus */ |
| @@ -8022,14 +8022,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 8022 | .tasks = LIST_HEAD_INIT(env.tasks), | 8022 | .tasks = LIST_HEAD_INIT(env.tasks), |
| 8023 | }; | 8023 | }; |
| 8024 | 8024 | ||
| 8025 | /* | 8025 | cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); |
| 8026 | * For NEWLY_IDLE load_balancing, we don't need to consider | ||
| 8027 | * other cpus in our group | ||
| 8028 | */ | ||
| 8029 | if (idle == CPU_NEWLY_IDLE) | ||
| 8030 | env.dst_grpmask = NULL; | ||
| 8031 | |||
| 8032 | cpumask_copy(cpus, cpu_active_mask); | ||
| 8033 | 8026 | ||
| 8034 | schedstat_inc(sd->lb_count[idle]); | 8027 | schedstat_inc(sd->lb_count[idle]); |
| 8035 | 8028 | ||
| @@ -8151,7 +8144,15 @@ more_balance: | |||
| 8151 | /* All tasks on this runqueue were pinned by CPU affinity */ | 8144 | /* All tasks on this runqueue were pinned by CPU affinity */ |
| 8152 | if (unlikely(env.flags & LBF_ALL_PINNED)) { | 8145 | if (unlikely(env.flags & LBF_ALL_PINNED)) { |
| 8153 | cpumask_clear_cpu(cpu_of(busiest), cpus); | 8146 | cpumask_clear_cpu(cpu_of(busiest), cpus); |
| 8154 | if (!cpumask_empty(cpus)) { | 8147 | /* |
| 8148 | * Attempting to continue load balancing at the current | ||
| 8149 | * sched_domain level only makes sense if there are | ||
| 8150 | * active CPUs remaining as possible busiest CPUs to | ||
| 8151 | * pull load from which are not contained within the | ||
| 8152 | * destination group that is receiving any migrated | ||
| 8153 | * load. | ||
| 8154 | */ | ||
| 8155 | if (!cpumask_subset(cpus, env.dst_grpmask)) { | ||
| 8155 | env.loop = 0; | 8156 | env.loop = 0; |
| 8156 | env.loop_break = sched_nr_migrate_break; | 8157 | env.loop_break = sched_nr_migrate_break; |
| 8157 | goto redo; | 8158 | goto redo; |
| @@ -8447,6 +8448,13 @@ static int active_load_balance_cpu_stop(void *data) | |||
| 8447 | .src_cpu = busiest_rq->cpu, | 8448 | .src_cpu = busiest_rq->cpu, |
| 8448 | .src_rq = busiest_rq, | 8449 | .src_rq = busiest_rq, |
| 8449 | .idle = CPU_IDLE, | 8450 | .idle = CPU_IDLE, |
| 8451 | /* | ||
| 8452 | * can_migrate_task() doesn't need to compute new_dst_cpu | ||
| 8453 | * for active balancing. Since we have CPU_IDLE, but no | ||
| 8454 | * @dst_grpmask we need to make that test go away with lying | ||
| 8455 | * about DST_PINNED. | ||
| 8456 | */ | ||
| 8457 | .flags = LBF_DST_PINNED, | ||
| 8450 | }; | 8458 | }; |
| 8451 | 8459 | ||
| 8452 | schedstat_inc(sd->alb_count); | 8460 | schedstat_inc(sd->alb_count); |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 65f61077ad50..98b59b5db90b 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
| @@ -13,7 +13,7 @@ | |||
| 13 | * of Berkeley Packet Filters/Linux Socket Filters. | 13 | * of Berkeley Packet Filters/Linux Socket Filters. |
| 14 | */ | 14 | */ |
| 15 | 15 | ||
| 16 | #include <linux/atomic.h> | 16 | #include <linux/refcount.h> |
| 17 | #include <linux/audit.h> | 17 | #include <linux/audit.h> |
| 18 | #include <linux/compat.h> | 18 | #include <linux/compat.h> |
| 19 | #include <linux/coredump.h> | 19 | #include <linux/coredump.h> |
| @@ -56,7 +56,7 @@ | |||
| 56 | * to a task_struct (other than @usage). | 56 | * to a task_struct (other than @usage). |
| 57 | */ | 57 | */ |
| 58 | struct seccomp_filter { | 58 | struct seccomp_filter { |
| 59 | atomic_t usage; | 59 | refcount_t usage; |
| 60 | struct seccomp_filter *prev; | 60 | struct seccomp_filter *prev; |
| 61 | struct bpf_prog *prog; | 61 | struct bpf_prog *prog; |
| 62 | }; | 62 | }; |
| @@ -378,7 +378,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) | |||
| 378 | return ERR_PTR(ret); | 378 | return ERR_PTR(ret); |
| 379 | } | 379 | } |
| 380 | 380 | ||
| 381 | atomic_set(&sfilter->usage, 1); | 381 | refcount_set(&sfilter->usage, 1); |
| 382 | 382 | ||
| 383 | return sfilter; | 383 | return sfilter; |
| 384 | } | 384 | } |
| @@ -465,7 +465,7 @@ void get_seccomp_filter(struct task_struct *tsk) | |||
| 465 | if (!orig) | 465 | if (!orig) |
| 466 | return; | 466 | return; |
| 467 | /* Reference count is bounded by the number of total processes. */ | 467 | /* Reference count is bounded by the number of total processes. */ |
| 468 | atomic_inc(&orig->usage); | 468 | refcount_inc(&orig->usage); |
| 469 | } | 469 | } |
| 470 | 470 | ||
| 471 | static inline void seccomp_filter_free(struct seccomp_filter *filter) | 471 | static inline void seccomp_filter_free(struct seccomp_filter *filter) |
| @@ -481,7 +481,7 @@ void put_seccomp_filter(struct task_struct *tsk) | |||
| 481 | { | 481 | { |
| 482 | struct seccomp_filter *orig = tsk->seccomp.filter; | 482 | struct seccomp_filter *orig = tsk->seccomp.filter; |
| 483 | /* Clean up single-reference branches iteratively. */ | 483 | /* Clean up single-reference branches iteratively. */ |
| 484 | while (orig && atomic_dec_and_test(&orig->usage)) { | 484 | while (orig && refcount_dec_and_test(&orig->usage)) { |
| 485 | struct seccomp_filter *freeme = orig; | 485 | struct seccomp_filter *freeme = orig; |
| 486 | orig = orig->prev; | 486 | orig = orig->prev; |
| 487 | seccomp_filter_free(freeme); | 487 | seccomp_filter_free(freeme); |
| @@ -641,11 +641,12 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, | |||
| 641 | return 0; | 641 | return 0; |
| 642 | 642 | ||
| 643 | case SECCOMP_RET_KILL: | 643 | case SECCOMP_RET_KILL: |
| 644 | default: { | 644 | default: |
| 645 | siginfo_t info; | ||
| 646 | audit_seccomp(this_syscall, SIGSYS, action); | 645 | audit_seccomp(this_syscall, SIGSYS, action); |
| 647 | /* Dump core only if this is the last remaining thread. */ | 646 | /* Dump core only if this is the last remaining thread. */ |
| 648 | if (get_nr_threads(current) == 1) { | 647 | if (get_nr_threads(current) == 1) { |
| 648 | siginfo_t info; | ||
| 649 | |||
| 649 | /* Show the original registers in the dump. */ | 650 | /* Show the original registers in the dump. */ |
| 650 | syscall_rollback(current, task_pt_regs(current)); | 651 | syscall_rollback(current, task_pt_regs(current)); |
| 651 | /* Trigger a manual coredump since do_exit skips it. */ | 652 | /* Trigger a manual coredump since do_exit skips it. */ |
| @@ -654,7 +655,6 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, | |||
| 654 | } | 655 | } |
| 655 | do_exit(SIGSYS); | 656 | do_exit(SIGSYS); |
| 656 | } | 657 | } |
| 657 | } | ||
| 658 | 658 | ||
| 659 | unreachable(); | 659 | unreachable(); |
| 660 | 660 | ||
diff --git a/kernel/signal.c b/kernel/signal.c index 35a570f71f07..caed9133ae52 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -1402,6 +1402,10 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) | |||
| 1402 | return ret; | 1402 | return ret; |
| 1403 | } | 1403 | } |
| 1404 | 1404 | ||
| 1405 | /* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */ | ||
| 1406 | if (pid == INT_MIN) | ||
| 1407 | return -ESRCH; | ||
| 1408 | |||
| 1405 | read_lock(&tasklist_lock); | 1409 | read_lock(&tasklist_lock); |
| 1406 | if (pid != -1) { | 1410 | if (pid != -1) { |
| 1407 | ret = __kill_pgrp_info(sig, info, | 1411 | ret = __kill_pgrp_info(sig, info, |
| @@ -2776,7 +2780,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) | |||
| 2776 | * @info: if non-null, the signal's siginfo is returned here | 2780 | * @info: if non-null, the signal's siginfo is returned here |
| 2777 | * @ts: upper bound on process time suspension | 2781 | * @ts: upper bound on process time suspension |
| 2778 | */ | 2782 | */ |
| 2779 | int do_sigtimedwait(const sigset_t *which, siginfo_t *info, | 2783 | static int do_sigtimedwait(const sigset_t *which, siginfo_t *info, |
| 2780 | const struct timespec *ts) | 2784 | const struct timespec *ts) |
| 2781 | { | 2785 | { |
| 2782 | ktime_t *to = NULL, timeout = KTIME_MAX; | 2786 | ktime_t *to = NULL, timeout = KTIME_MAX; |
| @@ -2865,6 +2869,40 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, | |||
| 2865 | return ret; | 2869 | return ret; |
| 2866 | } | 2870 | } |
| 2867 | 2871 | ||
| 2872 | #ifdef CONFIG_COMPAT | ||
| 2873 | COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, | ||
| 2874 | struct compat_siginfo __user *, uinfo, | ||
| 2875 | struct compat_timespec __user *, uts, compat_size_t, sigsetsize) | ||
| 2876 | { | ||
| 2877 | compat_sigset_t s32; | ||
| 2878 | sigset_t s; | ||
| 2879 | struct timespec t; | ||
| 2880 | siginfo_t info; | ||
| 2881 | long ret; | ||
| 2882 | |||
| 2883 | if (sigsetsize != sizeof(sigset_t)) | ||
| 2884 | return -EINVAL; | ||
| 2885 | |||
| 2886 | if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) | ||
| 2887 | return -EFAULT; | ||
| 2888 | sigset_from_compat(&s, &s32); | ||
| 2889 | |||
| 2890 | if (uts) { | ||
| 2891 | if (compat_get_timespec(&t, uts)) | ||
| 2892 | return -EFAULT; | ||
| 2893 | } | ||
| 2894 | |||
| 2895 | ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); | ||
| 2896 | |||
| 2897 | if (ret > 0 && uinfo) { | ||
| 2898 | if (copy_siginfo_to_user32(uinfo, &info)) | ||
| 2899 | ret = -EFAULT; | ||
| 2900 | } | ||
| 2901 | |||
| 2902 | return ret; | ||
| 2903 | } | ||
| 2904 | #endif | ||
| 2905 | |||
| 2868 | /** | 2906 | /** |
| 2869 | * sys_kill - send a signal to a process | 2907 | * sys_kill - send a signal to a process |
| 2870 | * @pid: the PID of the process | 2908 | * @pid: the PID of the process |
| @@ -3121,78 +3159,68 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | |||
| 3121 | } | 3159 | } |
| 3122 | 3160 | ||
| 3123 | static int | 3161 | static int |
| 3124 | do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) | 3162 | do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp) |
| 3125 | { | 3163 | { |
| 3126 | stack_t oss; | 3164 | struct task_struct *t = current; |
| 3127 | int error; | ||
| 3128 | 3165 | ||
| 3129 | oss.ss_sp = (void __user *) current->sas_ss_sp; | 3166 | if (oss) { |
| 3130 | oss.ss_size = current->sas_ss_size; | 3167 | memset(oss, 0, sizeof(stack_t)); |
| 3131 | oss.ss_flags = sas_ss_flags(sp) | | 3168 | oss->ss_sp = (void __user *) t->sas_ss_sp; |
| 3132 | (current->sas_ss_flags & SS_FLAG_BITS); | 3169 | oss->ss_size = t->sas_ss_size; |
| 3170 | oss->ss_flags = sas_ss_flags(sp) | | ||
| 3171 | (current->sas_ss_flags & SS_FLAG_BITS); | ||
| 3172 | } | ||
| 3133 | 3173 | ||
| 3134 | if (uss) { | 3174 | if (ss) { |
| 3135 | void __user *ss_sp; | 3175 | void __user *ss_sp = ss->ss_sp; |
| 3136 | size_t ss_size; | 3176 | size_t ss_size = ss->ss_size; |
| 3137 | unsigned ss_flags; | 3177 | unsigned ss_flags = ss->ss_flags; |
| 3138 | int ss_mode; | 3178 | int ss_mode; |
| 3139 | 3179 | ||
| 3140 | error = -EFAULT; | 3180 | if (unlikely(on_sig_stack(sp))) |
| 3141 | if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) | 3181 | return -EPERM; |
| 3142 | goto out; | ||
| 3143 | error = __get_user(ss_sp, &uss->ss_sp) | | ||
| 3144 | __get_user(ss_flags, &uss->ss_flags) | | ||
| 3145 | __get_user(ss_size, &uss->ss_size); | ||
| 3146 | if (error) | ||
| 3147 | goto out; | ||
| 3148 | |||
| 3149 | error = -EPERM; | ||
| 3150 | if (on_sig_stack(sp)) | ||
| 3151 | goto out; | ||
| 3152 | 3182 | ||
| 3153 | ss_mode = ss_flags & ~SS_FLAG_BITS; | 3183 | ss_mode = ss_flags & ~SS_FLAG_BITS; |
| 3154 | error = -EINVAL; | 3184 | if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && |
| 3155 | if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && | 3185 | ss_mode != 0)) |
| 3156 | ss_mode != 0) | 3186 | return -EINVAL; |
| 3157 | goto out; | ||
| 3158 | 3187 | ||
| 3159 | if (ss_mode == SS_DISABLE) { | 3188 | if (ss_mode == SS_DISABLE) { |
| 3160 | ss_size = 0; | 3189 | ss_size = 0; |
| 3161 | ss_sp = NULL; | 3190 | ss_sp = NULL; |
| 3162 | } else { | 3191 | } else { |
| 3163 | error = -ENOMEM; | 3192 | if (unlikely(ss_size < MINSIGSTKSZ)) |
| 3164 | if (ss_size < MINSIGSTKSZ) | 3193 | return -ENOMEM; |
| 3165 | goto out; | ||
| 3166 | } | 3194 | } |
| 3167 | 3195 | ||
| 3168 | current->sas_ss_sp = (unsigned long) ss_sp; | 3196 | t->sas_ss_sp = (unsigned long) ss_sp; |
| 3169 | current->sas_ss_size = ss_size; | 3197 | t->sas_ss_size = ss_size; |
| 3170 | current->sas_ss_flags = ss_flags; | 3198 | t->sas_ss_flags = ss_flags; |
| 3171 | } | 3199 | } |
| 3172 | 3200 | return 0; | |
| 3173 | error = 0; | ||
| 3174 | if (uoss) { | ||
| 3175 | error = -EFAULT; | ||
| 3176 | if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss))) | ||
| 3177 | goto out; | ||
| 3178 | error = __put_user(oss.ss_sp, &uoss->ss_sp) | | ||
| 3179 | __put_user(oss.ss_size, &uoss->ss_size) | | ||
| 3180 | __put_user(oss.ss_flags, &uoss->ss_flags); | ||
| 3181 | } | ||
| 3182 | |||
| 3183 | out: | ||
| 3184 | return error; | ||
| 3185 | } | 3201 | } |
| 3202 | |||
| 3186 | SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) | 3203 | SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) |
| 3187 | { | 3204 | { |
| 3188 | return do_sigaltstack(uss, uoss, current_user_stack_pointer()); | 3205 | stack_t new, old; |
| 3206 | int err; | ||
| 3207 | if (uss && copy_from_user(&new, uss, sizeof(stack_t))) | ||
| 3208 | return -EFAULT; | ||
| 3209 | err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL, | ||
| 3210 | current_user_stack_pointer()); | ||
| 3211 | if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t))) | ||
| 3212 | err = -EFAULT; | ||
| 3213 | return err; | ||
| 3189 | } | 3214 | } |
| 3190 | 3215 | ||
| 3191 | int restore_altstack(const stack_t __user *uss) | 3216 | int restore_altstack(const stack_t __user *uss) |
| 3192 | { | 3217 | { |
| 3193 | int err = do_sigaltstack(uss, NULL, current_user_stack_pointer()); | 3218 | stack_t new; |
| 3219 | if (copy_from_user(&new, uss, sizeof(stack_t))) | ||
| 3220 | return -EFAULT; | ||
| 3221 | (void)do_sigaltstack(&new, NULL, current_user_stack_pointer()); | ||
| 3194 | /* squash all but EFAULT for now */ | 3222 | /* squash all but EFAULT for now */ |
| 3195 | return err == -EFAULT ? err : 0; | 3223 | return 0; |
| 3196 | } | 3224 | } |
| 3197 | 3225 | ||
| 3198 | int __save_altstack(stack_t __user *uss, unsigned long sp) | 3226 | int __save_altstack(stack_t __user *uss, unsigned long sp) |
| @@ -3215,29 +3243,24 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack, | |||
| 3215 | { | 3243 | { |
| 3216 | stack_t uss, uoss; | 3244 | stack_t uss, uoss; |
| 3217 | int ret; | 3245 | int ret; |
| 3218 | mm_segment_t seg; | ||
| 3219 | 3246 | ||
| 3220 | if (uss_ptr) { | 3247 | if (uss_ptr) { |
| 3221 | compat_stack_t uss32; | 3248 | compat_stack_t uss32; |
| 3222 | |||
| 3223 | memset(&uss, 0, sizeof(stack_t)); | ||
| 3224 | if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t))) | 3249 | if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t))) |
| 3225 | return -EFAULT; | 3250 | return -EFAULT; |
| 3226 | uss.ss_sp = compat_ptr(uss32.ss_sp); | 3251 | uss.ss_sp = compat_ptr(uss32.ss_sp); |
| 3227 | uss.ss_flags = uss32.ss_flags; | 3252 | uss.ss_flags = uss32.ss_flags; |
| 3228 | uss.ss_size = uss32.ss_size; | 3253 | uss.ss_size = uss32.ss_size; |
| 3229 | } | 3254 | } |
| 3230 | seg = get_fs(); | 3255 | ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, |
| 3231 | set_fs(KERNEL_DS); | ||
| 3232 | ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL), | ||
| 3233 | (stack_t __force __user *) &uoss, | ||
| 3234 | compat_user_stack_pointer()); | 3256 | compat_user_stack_pointer()); |
| 3235 | set_fs(seg); | ||
| 3236 | if (ret >= 0 && uoss_ptr) { | 3257 | if (ret >= 0 && uoss_ptr) { |
| 3237 | if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) || | 3258 | compat_stack_t old; |
| 3238 | __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || | 3259 | memset(&old, 0, sizeof(old)); |
| 3239 | __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || | 3260 | old.ss_sp = ptr_to_compat(uoss.ss_sp); |
| 3240 | __put_user(uoss.ss_size, &uoss_ptr->ss_size)) | 3261 | old.ss_flags = uoss.ss_flags; |
| 3262 | old.ss_size = uoss.ss_size; | ||
| 3263 | if (copy_to_user(uoss_ptr, &old, sizeof(compat_stack_t))) | ||
| 3241 | ret = -EFAULT; | 3264 | ret = -EFAULT; |
| 3242 | } | 3265 | } |
| 3243 | return ret; | 3266 | return ret; |
| @@ -3277,6 +3300,18 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) | |||
| 3277 | return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); | 3300 | return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t)); |
| 3278 | } | 3301 | } |
| 3279 | 3302 | ||
| 3303 | #ifdef CONFIG_COMPAT | ||
| 3304 | COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) | ||
| 3305 | { | ||
| 3306 | sigset_t set; | ||
| 3307 | int err = do_sigpending(&set, sizeof(old_sigset_t)); | ||
| 3308 | if (err == 0) | ||
| 3309 | if (copy_to_user(set32, &set, sizeof(old_sigset_t))) | ||
| 3310 | err = -EFAULT; | ||
| 3311 | return err; | ||
| 3312 | } | ||
| 3313 | #endif | ||
| 3314 | |||
| 3280 | #endif | 3315 | #endif |
| 3281 | 3316 | ||
| 3282 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK | 3317 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK |
diff --git a/kernel/sys.c b/kernel/sys.c index 8a94b4eabcaa..2855ee73acd0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -886,7 +886,7 @@ SYSCALL_DEFINE0(getegid) | |||
| 886 | return from_kgid_munged(current_user_ns(), current_egid()); | 886 | return from_kgid_munged(current_user_ns(), current_egid()); |
| 887 | } | 887 | } |
| 888 | 888 | ||
| 889 | void do_sys_times(struct tms *tms) | 889 | static void do_sys_times(struct tms *tms) |
| 890 | { | 890 | { |
| 891 | u64 tgutime, tgstime, cutime, cstime; | 891 | u64 tgutime, tgstime, cutime, cstime; |
| 892 | 892 | ||
| @@ -912,6 +912,32 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf) | |||
| 912 | return (long) jiffies_64_to_clock_t(get_jiffies_64()); | 912 | return (long) jiffies_64_to_clock_t(get_jiffies_64()); |
| 913 | } | 913 | } |
| 914 | 914 | ||
| 915 | #ifdef CONFIG_COMPAT | ||
| 916 | static compat_clock_t clock_t_to_compat_clock_t(clock_t x) | ||
| 917 | { | ||
| 918 | return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); | ||
| 919 | } | ||
| 920 | |||
| 921 | COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) | ||
| 922 | { | ||
| 923 | if (tbuf) { | ||
| 924 | struct tms tms; | ||
| 925 | struct compat_tms tmp; | ||
| 926 | |||
| 927 | do_sys_times(&tms); | ||
| 928 | /* Convert our struct tms to the compat version. */ | ||
| 929 | tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); | ||
| 930 | tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); | ||
| 931 | tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); | ||
| 932 | tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); | ||
| 933 | if (copy_to_user(tbuf, &tmp, sizeof(tmp))) | ||
| 934 | return -EFAULT; | ||
| 935 | } | ||
| 936 | force_successful_syscall_return(); | ||
| 937 | return compat_jiffies_to_clock_t(jiffies); | ||
| 938 | } | ||
| 939 | #endif | ||
| 940 | |||
| 915 | /* | 941 | /* |
| 916 | * This needs some heavy checking ... | 942 | * This needs some heavy checking ... |
| 917 | * I just haven't the stomach for it. I also don't fully | 943 | * I just haven't the stomach for it. I also don't fully |
| @@ -1306,6 +1332,54 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) | |||
| 1306 | return ret; | 1332 | return ret; |
| 1307 | } | 1333 | } |
| 1308 | 1334 | ||
| 1335 | #ifdef CONFIG_COMPAT | ||
| 1336 | |||
| 1337 | COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, | ||
| 1338 | struct compat_rlimit __user *, rlim) | ||
| 1339 | { | ||
| 1340 | struct rlimit r; | ||
| 1341 | struct compat_rlimit r32; | ||
| 1342 | |||
| 1343 | if (copy_from_user(&r32, rlim, sizeof(struct compat_rlimit))) | ||
| 1344 | return -EFAULT; | ||
| 1345 | |||
| 1346 | if (r32.rlim_cur == COMPAT_RLIM_INFINITY) | ||
| 1347 | r.rlim_cur = RLIM_INFINITY; | ||
| 1348 | else | ||
| 1349 | r.rlim_cur = r32.rlim_cur; | ||
| 1350 | if (r32.rlim_max == COMPAT_RLIM_INFINITY) | ||
| 1351 | r.rlim_max = RLIM_INFINITY; | ||
| 1352 | else | ||
| 1353 | r.rlim_max = r32.rlim_max; | ||
| 1354 | return do_prlimit(current, resource, &r, NULL); | ||
| 1355 | } | ||
| 1356 | |||
| 1357 | COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, | ||
| 1358 | struct compat_rlimit __user *, rlim) | ||
| 1359 | { | ||
| 1360 | struct rlimit r; | ||
| 1361 | int ret; | ||
| 1362 | |||
| 1363 | ret = do_prlimit(current, resource, NULL, &r); | ||
| 1364 | if (!ret) { | ||
| 1365 | struct compat_rlimit r32; | ||
| 1366 | if (r.rlim_cur > COMPAT_RLIM_INFINITY) | ||
| 1367 | r32.rlim_cur = COMPAT_RLIM_INFINITY; | ||
| 1368 | else | ||
| 1369 | r32.rlim_cur = r.rlim_cur; | ||
| 1370 | if (r.rlim_max > COMPAT_RLIM_INFINITY) | ||
| 1371 | r32.rlim_max = COMPAT_RLIM_INFINITY; | ||
| 1372 | else | ||
| 1373 | r32.rlim_max = r.rlim_max; | ||
| 1374 | |||
| 1375 | if (copy_to_user(rlim, &r32, sizeof(struct compat_rlimit))) | ||
| 1376 | return -EFAULT; | ||
| 1377 | } | ||
| 1378 | return ret; | ||
| 1379 | } | ||
| 1380 | |||
| 1381 | #endif | ||
| 1382 | |||
| 1309 | #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT | 1383 | #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT |
| 1310 | 1384 | ||
| 1311 | /* | 1385 | /* |
| @@ -1328,6 +1402,30 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, | |||
| 1328 | return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; | 1402 | return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; |
| 1329 | } | 1403 | } |
| 1330 | 1404 | ||
| 1405 | #ifdef CONFIG_COMPAT | ||
| 1406 | COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, | ||
| 1407 | struct compat_rlimit __user *, rlim) | ||
| 1408 | { | ||
| 1409 | struct rlimit r; | ||
| 1410 | |||
| 1411 | if (resource >= RLIM_NLIMITS) | ||
| 1412 | return -EINVAL; | ||
| 1413 | |||
| 1414 | task_lock(current->group_leader); | ||
| 1415 | r = current->signal->rlim[resource]; | ||
| 1416 | task_unlock(current->group_leader); | ||
| 1417 | if (r.rlim_cur > 0x7FFFFFFF) | ||
| 1418 | r.rlim_cur = 0x7FFFFFFF; | ||
| 1419 | if (r.rlim_max > 0x7FFFFFFF) | ||
| 1420 | r.rlim_max = 0x7FFFFFFF; | ||
| 1421 | |||
| 1422 | if (put_user(r.rlim_cur, &rlim->rlim_cur) || | ||
| 1423 | put_user(r.rlim_max, &rlim->rlim_max)) | ||
| 1424 | return -EFAULT; | ||
| 1425 | return 0; | ||
| 1426 | } | ||
| 1427 | #endif | ||
| 1428 | |||
| 1331 | #endif | 1429 | #endif |
| 1332 | 1430 | ||
| 1333 | static inline bool rlim64_is_infinity(__u64 rlim64) | 1431 | static inline bool rlim64_is_infinity(__u64 rlim64) |
| @@ -1552,7 +1650,7 @@ static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) | |||
| 1552 | r->ru_oublock += task_io_get_oublock(t); | 1650 | r->ru_oublock += task_io_get_oublock(t); |
| 1553 | } | 1651 | } |
| 1554 | 1652 | ||
| 1555 | static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | 1653 | void getrusage(struct task_struct *p, int who, struct rusage *r) |
| 1556 | { | 1654 | { |
| 1557 | struct task_struct *t; | 1655 | struct task_struct *t; |
| 1558 | unsigned long flags; | 1656 | unsigned long flags; |
| @@ -1626,20 +1724,16 @@ out: | |||
| 1626 | r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ | 1724 | r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ |
| 1627 | } | 1725 | } |
| 1628 | 1726 | ||
| 1629 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) | 1727 | SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) |
| 1630 | { | 1728 | { |
| 1631 | struct rusage r; | 1729 | struct rusage r; |
| 1632 | 1730 | ||
| 1633 | k_getrusage(p, who, &r); | ||
| 1634 | return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; | ||
| 1635 | } | ||
| 1636 | |||
| 1637 | SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) | ||
| 1638 | { | ||
| 1639 | if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && | 1731 | if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && |
| 1640 | who != RUSAGE_THREAD) | 1732 | who != RUSAGE_THREAD) |
| 1641 | return -EINVAL; | 1733 | return -EINVAL; |
| 1642 | return getrusage(current, who, ru); | 1734 | |
| 1735 | getrusage(current, who, &r); | ||
| 1736 | return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; | ||
| 1643 | } | 1737 | } |
| 1644 | 1738 | ||
| 1645 | #ifdef CONFIG_COMPAT | 1739 | #ifdef CONFIG_COMPAT |
| @@ -1651,7 +1745,7 @@ COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru) | |||
| 1651 | who != RUSAGE_THREAD) | 1745 | who != RUSAGE_THREAD) |
| 1652 | return -EINVAL; | 1746 | return -EINVAL; |
| 1653 | 1747 | ||
| 1654 | k_getrusage(current, who, &r); | 1748 | getrusage(current, who, &r); |
| 1655 | return put_compat_rusage(&r, ru); | 1749 | return put_compat_rusage(&r, ru); |
| 1656 | } | 1750 | } |
| 1657 | #endif | 1751 | #endif |
| @@ -2266,7 +2360,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
| 2266 | case PR_GET_THP_DISABLE: | 2360 | case PR_GET_THP_DISABLE: |
| 2267 | if (arg2 || arg3 || arg4 || arg5) | 2361 | if (arg2 || arg3 || arg4 || arg5) |
| 2268 | return -EINVAL; | 2362 | return -EINVAL; |
| 2269 | error = !!(me->mm->def_flags & VM_NOHUGEPAGE); | 2363 | error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags); |
| 2270 | break; | 2364 | break; |
| 2271 | case PR_SET_THP_DISABLE: | 2365 | case PR_SET_THP_DISABLE: |
| 2272 | if (arg3 || arg4 || arg5) | 2366 | if (arg3 || arg4 || arg5) |
| @@ -2274,9 +2368,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
| 2274 | if (down_write_killable(&me->mm->mmap_sem)) | 2368 | if (down_write_killable(&me->mm->mmap_sem)) |
| 2275 | return -EINTR; | 2369 | return -EINTR; |
| 2276 | if (arg2) | 2370 | if (arg2) |
| 2277 | me->mm->def_flags |= VM_NOHUGEPAGE; | 2371 | set_bit(MMF_DISABLE_THP, &me->mm->flags); |
| 2278 | else | 2372 | else |
| 2279 | me->mm->def_flags &= ~VM_NOHUGEPAGE; | 2373 | clear_bit(MMF_DISABLE_THP, &me->mm->flags); |
| 2280 | up_write(&me->mm->mmap_sem); | 2374 | up_write(&me->mm->mmap_sem); |
| 2281 | break; | 2375 | break; |
| 2282 | case PR_MPX_ENABLE_MANAGEMENT: | 2376 | case PR_MPX_ENABLE_MANAGEMENT: |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4dfba1a76cc3..6648fbbb8157 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -174,11 +174,32 @@ extern int no_unaligned_warning; | |||
| 174 | 174 | ||
| 175 | #ifdef CONFIG_PROC_SYSCTL | 175 | #ifdef CONFIG_PROC_SYSCTL |
| 176 | 176 | ||
| 177 | #define SYSCTL_WRITES_LEGACY -1 | 177 | /** |
| 178 | #define SYSCTL_WRITES_WARN 0 | 178 | * enum sysctl_writes_mode - supported sysctl write modes |
| 179 | #define SYSCTL_WRITES_STRICT 1 | 179 | * |
| 180 | * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value | ||
| 181 | * to be written, and multiple writes on the same sysctl file descriptor | ||
| 182 | * will rewrite the sysctl value, regardless of file position. No warning | ||
| 183 | * is issued when the initial position is not 0. | ||
| 184 | * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is | ||
| 185 | * not 0. | ||
| 186 | * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at | ||
| 187 | * file position 0 and the value must be fully contained in the buffer | ||
| 188 | * sent to the write syscall. If dealing with strings respect the file | ||
| 189 | * position, but restrict this to the max length of the buffer, anything | ||
| 190 | * past the max length will be ignored. Multiple writes will append | ||
| 191 | * to the buffer. | ||
| 192 | * | ||
| 193 | * These write modes control how current file position affects the behavior of | ||
| 194 | * updating sysctl values through the proc interface on each write. | ||
| 195 | */ | ||
| 196 | enum sysctl_writes_mode { | ||
| 197 | SYSCTL_WRITES_LEGACY = -1, | ||
| 198 | SYSCTL_WRITES_WARN = 0, | ||
| 199 | SYSCTL_WRITES_STRICT = 1, | ||
| 200 | }; | ||
| 180 | 201 | ||
| 181 | static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; | 202 | static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; |
| 182 | 203 | ||
| 183 | static int proc_do_cad_pid(struct ctl_table *table, int write, | 204 | static int proc_do_cad_pid(struct ctl_table *table, int write, |
| 184 | void __user *buffer, size_t *lenp, loff_t *ppos); | 205 | void __user *buffer, size_t *lenp, loff_t *ppos); |
| @@ -880,6 +901,14 @@ static struct ctl_table kern_table[] = { | |||
| 880 | #endif | 901 | #endif |
| 881 | }, | 902 | }, |
| 882 | { | 903 | { |
| 904 | .procname = "watchdog_cpumask", | ||
| 905 | .data = &watchdog_cpumask_bits, | ||
| 906 | .maxlen = NR_CPUS, | ||
| 907 | .mode = 0644, | ||
| 908 | .proc_handler = proc_watchdog_cpumask, | ||
| 909 | }, | ||
| 910 | #ifdef CONFIG_SOFTLOCKUP_DETECTOR | ||
| 911 | { | ||
| 883 | .procname = "soft_watchdog", | 912 | .procname = "soft_watchdog", |
| 884 | .data = &soft_watchdog_enabled, | 913 | .data = &soft_watchdog_enabled, |
| 885 | .maxlen = sizeof (int), | 914 | .maxlen = sizeof (int), |
| @@ -889,13 +918,6 @@ static struct ctl_table kern_table[] = { | |||
| 889 | .extra2 = &one, | 918 | .extra2 = &one, |
| 890 | }, | 919 | }, |
| 891 | { | 920 | { |
| 892 | .procname = "watchdog_cpumask", | ||
| 893 | .data = &watchdog_cpumask_bits, | ||
| 894 | .maxlen = NR_CPUS, | ||
| 895 | .mode = 0644, | ||
| 896 | .proc_handler = proc_watchdog_cpumask, | ||
| 897 | }, | ||
| 898 | { | ||
| 899 | .procname = "softlockup_panic", | 921 | .procname = "softlockup_panic", |
| 900 | .data = &softlockup_panic, | 922 | .data = &softlockup_panic, |
| 901 | .maxlen = sizeof(int), | 923 | .maxlen = sizeof(int), |
| @@ -904,27 +926,29 @@ static struct ctl_table kern_table[] = { | |||
| 904 | .extra1 = &zero, | 926 | .extra1 = &zero, |
| 905 | .extra2 = &one, | 927 | .extra2 = &one, |
| 906 | }, | 928 | }, |
| 907 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 929 | #ifdef CONFIG_SMP |
| 908 | { | 930 | { |
| 909 | .procname = "hardlockup_panic", | 931 | .procname = "softlockup_all_cpu_backtrace", |
| 910 | .data = &hardlockup_panic, | 932 | .data = &sysctl_softlockup_all_cpu_backtrace, |
| 911 | .maxlen = sizeof(int), | 933 | .maxlen = sizeof(int), |
| 912 | .mode = 0644, | 934 | .mode = 0644, |
| 913 | .proc_handler = proc_dointvec_minmax, | 935 | .proc_handler = proc_dointvec_minmax, |
| 914 | .extra1 = &zero, | 936 | .extra1 = &zero, |
| 915 | .extra2 = &one, | 937 | .extra2 = &one, |
| 916 | }, | 938 | }, |
| 939 | #endif /* CONFIG_SMP */ | ||
| 917 | #endif | 940 | #endif |
| 918 | #ifdef CONFIG_SMP | 941 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
| 919 | { | 942 | { |
| 920 | .procname = "softlockup_all_cpu_backtrace", | 943 | .procname = "hardlockup_panic", |
| 921 | .data = &sysctl_softlockup_all_cpu_backtrace, | 944 | .data = &hardlockup_panic, |
| 922 | .maxlen = sizeof(int), | 945 | .maxlen = sizeof(int), |
| 923 | .mode = 0644, | 946 | .mode = 0644, |
| 924 | .proc_handler = proc_dointvec_minmax, | 947 | .proc_handler = proc_dointvec_minmax, |
| 925 | .extra1 = &zero, | 948 | .extra1 = &zero, |
| 926 | .extra2 = &one, | 949 | .extra2 = &one, |
| 927 | }, | 950 | }, |
| 951 | #ifdef CONFIG_SMP | ||
| 928 | { | 952 | { |
| 929 | .procname = "hardlockup_all_cpu_backtrace", | 953 | .procname = "hardlockup_all_cpu_backtrace", |
| 930 | .data = &sysctl_hardlockup_all_cpu_backtrace, | 954 | .data = &sysctl_hardlockup_all_cpu_backtrace, |
| @@ -936,6 +960,8 @@ static struct ctl_table kern_table[] = { | |||
| 936 | }, | 960 | }, |
| 937 | #endif /* CONFIG_SMP */ | 961 | #endif /* CONFIG_SMP */ |
| 938 | #endif | 962 | #endif |
| 963 | #endif | ||
| 964 | |||
| 939 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 965 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
| 940 | { | 966 | { |
| 941 | .procname = "unknown_nmi_panic", | 967 | .procname = "unknown_nmi_panic", |
| @@ -1950,6 +1976,32 @@ static void warn_sysctl_write(struct ctl_table *table) | |||
| 1950 | } | 1976 | } |
| 1951 | 1977 | ||
| 1952 | /** | 1978 | /** |
| 1979 | * proc_first_pos_non_zero_ignore - check if first position is allowed | ||
| 1980 | * @ppos: file position | ||
| 1981 | * @table: the sysctl table | ||
| 1982 | * | ||
| 1983 | * Returns true if the first position is non-zero and the sysctl_writes_strict | ||
| 1984 | * mode indicates this is not allowed for numeric input types. String proc | ||
| 1985 | * handlers can ignore the return value. | ||
| 1986 | */ | ||
| 1987 | static bool proc_first_pos_non_zero_ignore(loff_t *ppos, | ||
| 1988 | struct ctl_table *table) | ||
| 1989 | { | ||
| 1990 | if (!*ppos) | ||
| 1991 | return false; | ||
| 1992 | |||
| 1993 | switch (sysctl_writes_strict) { | ||
| 1994 | case SYSCTL_WRITES_STRICT: | ||
| 1995 | return true; | ||
| 1996 | case SYSCTL_WRITES_WARN: | ||
| 1997 | warn_sysctl_write(table); | ||
| 1998 | return false; | ||
| 1999 | default: | ||
| 2000 | return false; | ||
| 2001 | } | ||
| 2002 | } | ||
| 2003 | |||
| 2004 | /** | ||
| 1953 | * proc_dostring - read a string sysctl | 2005 | * proc_dostring - read a string sysctl |
| 1954 | * @table: the sysctl table | 2006 | * @table: the sysctl table |
| 1955 | * @write: %TRUE if this is a write to the sysctl file | 2007 | * @write: %TRUE if this is a write to the sysctl file |
| @@ -1969,8 +2021,8 @@ static void warn_sysctl_write(struct ctl_table *table) | |||
| 1969 | int proc_dostring(struct ctl_table *table, int write, | 2021 | int proc_dostring(struct ctl_table *table, int write, |
| 1970 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2022 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 1971 | { | 2023 | { |
| 1972 | if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) | 2024 | if (write) |
| 1973 | warn_sysctl_write(table); | 2025 | proc_first_pos_non_zero_ignore(ppos, table); |
| 1974 | 2026 | ||
| 1975 | return _proc_do_string((char *)(table->data), table->maxlen, write, | 2027 | return _proc_do_string((char *)(table->data), table->maxlen, write, |
| 1976 | (char __user *)buffer, lenp, ppos); | 2028 | (char __user *)buffer, lenp, ppos); |
| @@ -2128,19 +2180,18 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, | |||
| 2128 | return 0; | 2180 | return 0; |
| 2129 | } | 2181 | } |
| 2130 | 2182 | ||
| 2131 | static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, | 2183 | static int do_proc_douintvec_conv(unsigned long *lvalp, |
| 2132 | int *valp, | 2184 | unsigned int *valp, |
| 2133 | int write, void *data) | 2185 | int write, void *data) |
| 2134 | { | 2186 | { |
| 2135 | if (write) { | 2187 | if (write) { |
| 2136 | if (*negp) | 2188 | if (*lvalp > UINT_MAX) |
| 2137 | return -EINVAL; | 2189 | return -EINVAL; |
| 2138 | if (*lvalp > UINT_MAX) | 2190 | if (*lvalp > UINT_MAX) |
| 2139 | return -EINVAL; | 2191 | return -EINVAL; |
| 2140 | *valp = *lvalp; | 2192 | *valp = *lvalp; |
| 2141 | } else { | 2193 | } else { |
| 2142 | unsigned int val = *valp; | 2194 | unsigned int val = *valp; |
| 2143 | *negp = false; | ||
| 2144 | *lvalp = (unsigned long)val; | 2195 | *lvalp = (unsigned long)val; |
| 2145 | } | 2196 | } |
| 2146 | return 0; | 2197 | return 0; |
| @@ -2172,17 +2223,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, | |||
| 2172 | conv = do_proc_dointvec_conv; | 2223 | conv = do_proc_dointvec_conv; |
| 2173 | 2224 | ||
| 2174 | if (write) { | 2225 | if (write) { |
| 2175 | if (*ppos) { | 2226 | if (proc_first_pos_non_zero_ignore(ppos, table)) |
| 2176 | switch (sysctl_writes_strict) { | 2227 | goto out; |
| 2177 | case SYSCTL_WRITES_STRICT: | ||
| 2178 | goto out; | ||
| 2179 | case SYSCTL_WRITES_WARN: | ||
| 2180 | warn_sysctl_write(table); | ||
| 2181 | break; | ||
| 2182 | default: | ||
| 2183 | break; | ||
| 2184 | } | ||
| 2185 | } | ||
| 2186 | 2228 | ||
| 2187 | if (left > PAGE_SIZE - 1) | 2229 | if (left > PAGE_SIZE - 1) |
| 2188 | left = PAGE_SIZE - 1; | 2230 | left = PAGE_SIZE - 1; |
| @@ -2249,6 +2291,146 @@ static int do_proc_dointvec(struct ctl_table *table, int write, | |||
| 2249 | buffer, lenp, ppos, conv, data); | 2291 | buffer, lenp, ppos, conv, data); |
| 2250 | } | 2292 | } |
| 2251 | 2293 | ||
| 2294 | static int do_proc_douintvec_w(unsigned int *tbl_data, | ||
| 2295 | struct ctl_table *table, | ||
| 2296 | void __user *buffer, | ||
| 2297 | size_t *lenp, loff_t *ppos, | ||
| 2298 | int (*conv)(unsigned long *lvalp, | ||
| 2299 | unsigned int *valp, | ||
| 2300 | int write, void *data), | ||
| 2301 | void *data) | ||
| 2302 | { | ||
| 2303 | unsigned long lval; | ||
| 2304 | int err = 0; | ||
| 2305 | size_t left; | ||
| 2306 | bool neg; | ||
| 2307 | char *kbuf = NULL, *p; | ||
| 2308 | |||
| 2309 | left = *lenp; | ||
| 2310 | |||
| 2311 | if (proc_first_pos_non_zero_ignore(ppos, table)) | ||
| 2312 | goto bail_early; | ||
| 2313 | |||
| 2314 | if (left > PAGE_SIZE - 1) | ||
| 2315 | left = PAGE_SIZE - 1; | ||
| 2316 | |||
| 2317 | p = kbuf = memdup_user_nul(buffer, left); | ||
| 2318 | if (IS_ERR(kbuf)) | ||
| 2319 | return -EINVAL; | ||
| 2320 | |||
| 2321 | left -= proc_skip_spaces(&p); | ||
| 2322 | if (!left) { | ||
| 2323 | err = -EINVAL; | ||
| 2324 | goto out_free; | ||
| 2325 | } | ||
| 2326 | |||
| 2327 | err = proc_get_long(&p, &left, &lval, &neg, | ||
| 2328 | proc_wspace_sep, | ||
| 2329 | sizeof(proc_wspace_sep), NULL); | ||
| 2330 | if (err || neg) { | ||
| 2331 | err = -EINVAL; | ||
| 2332 | goto out_free; | ||
| 2333 | } | ||
| 2334 | |||
| 2335 | if (conv(&lval, tbl_data, 1, data)) { | ||
| 2336 | err = -EINVAL; | ||
| 2337 | goto out_free; | ||
| 2338 | } | ||
| 2339 | |||
| 2340 | if (!err && left) | ||
| 2341 | left -= proc_skip_spaces(&p); | ||
| 2342 | |||
| 2343 | out_free: | ||
| 2344 | kfree(kbuf); | ||
| 2345 | if (err) | ||
| 2346 | return -EINVAL; | ||
| 2347 | |||
| 2348 | return 0; | ||
| 2349 | |||
| 2350 | /* This is in keeping with old __do_proc_dointvec() */ | ||
| 2351 | bail_early: | ||
| 2352 | *ppos += *lenp; | ||
| 2353 | return err; | ||
| 2354 | } | ||
| 2355 | |||
| 2356 | static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, | ||
| 2357 | size_t *lenp, loff_t *ppos, | ||
| 2358 | int (*conv)(unsigned long *lvalp, | ||
| 2359 | unsigned int *valp, | ||
| 2360 | int write, void *data), | ||
| 2361 | void *data) | ||
| 2362 | { | ||
| 2363 | unsigned long lval; | ||
| 2364 | int err = 0; | ||
| 2365 | size_t left; | ||
| 2366 | |||
| 2367 | left = *lenp; | ||
| 2368 | |||
| 2369 | if (conv(&lval, tbl_data, 0, data)) { | ||
| 2370 | err = -EINVAL; | ||
| 2371 | goto out; | ||
| 2372 | } | ||
| 2373 | |||
| 2374 | err = proc_put_long(&buffer, &left, lval, false); | ||
| 2375 | if (err || !left) | ||
| 2376 | goto out; | ||
| 2377 | |||
| 2378 | err = proc_put_char(&buffer, &left, '\n'); | ||
| 2379 | |||
| 2380 | out: | ||
| 2381 | *lenp -= left; | ||
| 2382 | *ppos += *lenp; | ||
| 2383 | |||
| 2384 | return err; | ||
| 2385 | } | ||
| 2386 | |||
| 2387 | static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, | ||
| 2388 | int write, void __user *buffer, | ||
| 2389 | size_t *lenp, loff_t *ppos, | ||
| 2390 | int (*conv)(unsigned long *lvalp, | ||
| 2391 | unsigned int *valp, | ||
| 2392 | int write, void *data), | ||
| 2393 | void *data) | ||
| 2394 | { | ||
| 2395 | unsigned int *i, vleft; | ||
| 2396 | |||
| 2397 | if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { | ||
| 2398 | *lenp = 0; | ||
| 2399 | return 0; | ||
| 2400 | } | ||
| 2401 | |||
| 2402 | i = (unsigned int *) tbl_data; | ||
| 2403 | vleft = table->maxlen / sizeof(*i); | ||
| 2404 | |||
| 2405 | /* | ||
| 2406 | * Arrays are not supported, keep this simple. *Do not* add | ||
| 2407 | * support for them. | ||
| 2408 | */ | ||
| 2409 | if (vleft != 1) { | ||
| 2410 | *lenp = 0; | ||
| 2411 | return -EINVAL; | ||
| 2412 | } | ||
| 2413 | |||
| 2414 | if (!conv) | ||
| 2415 | conv = do_proc_douintvec_conv; | ||
| 2416 | |||
| 2417 | if (write) | ||
| 2418 | return do_proc_douintvec_w(i, table, buffer, lenp, ppos, | ||
| 2419 | conv, data); | ||
| 2420 | return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); | ||
| 2421 | } | ||
| 2422 | |||
| 2423 | static int do_proc_douintvec(struct ctl_table *table, int write, | ||
| 2424 | void __user *buffer, size_t *lenp, loff_t *ppos, | ||
| 2425 | int (*conv)(unsigned long *lvalp, | ||
| 2426 | unsigned int *valp, | ||
| 2427 | int write, void *data), | ||
| 2428 | void *data) | ||
| 2429 | { | ||
| 2430 | return __do_proc_douintvec(table->data, table, write, | ||
| 2431 | buffer, lenp, ppos, conv, data); | ||
| 2432 | } | ||
| 2433 | |||
| 2252 | /** | 2434 | /** |
| 2253 | * proc_dointvec - read a vector of integers | 2435 | * proc_dointvec - read a vector of integers |
| 2254 | * @table: the sysctl table | 2436 | * @table: the sysctl table |
| @@ -2284,8 +2466,8 @@ int proc_dointvec(struct ctl_table *table, int write, | |||
| 2284 | int proc_douintvec(struct ctl_table *table, int write, | 2466 | int proc_douintvec(struct ctl_table *table, int write, |
| 2285 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2467 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 2286 | { | 2468 | { |
| 2287 | return do_proc_dointvec(table, write, buffer, lenp, ppos, | 2469 | return do_proc_douintvec(table, write, buffer, lenp, ppos, |
| 2288 | do_proc_douintvec_conv, NULL); | 2470 | do_proc_douintvec_conv, NULL); |
| 2289 | } | 2471 | } |
| 2290 | 2472 | ||
| 2291 | /* | 2473 | /* |
| @@ -2390,6 +2572,65 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, | |||
| 2390 | do_proc_dointvec_minmax_conv, &param); | 2572 | do_proc_dointvec_minmax_conv, &param); |
| 2391 | } | 2573 | } |
| 2392 | 2574 | ||
| 2575 | struct do_proc_douintvec_minmax_conv_param { | ||
| 2576 | unsigned int *min; | ||
| 2577 | unsigned int *max; | ||
| 2578 | }; | ||
| 2579 | |||
| 2580 | static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, | ||
| 2581 | unsigned int *valp, | ||
| 2582 | int write, void *data) | ||
| 2583 | { | ||
| 2584 | struct do_proc_douintvec_minmax_conv_param *param = data; | ||
| 2585 | |||
| 2586 | if (write) { | ||
| 2587 | unsigned int val = *lvalp; | ||
| 2588 | |||
| 2589 | if ((param->min && *param->min > val) || | ||
| 2590 | (param->max && *param->max < val)) | ||
| 2591 | return -ERANGE; | ||
| 2592 | |||
| 2593 | if (*lvalp > UINT_MAX) | ||
| 2594 | return -EINVAL; | ||
| 2595 | *valp = val; | ||
| 2596 | } else { | ||
| 2597 | unsigned int val = *valp; | ||
| 2598 | *lvalp = (unsigned long) val; | ||
| 2599 | } | ||
| 2600 | |||
| 2601 | return 0; | ||
| 2602 | } | ||
| 2603 | |||
| 2604 | /** | ||
| 2605 | * proc_douintvec_minmax - read a vector of unsigned ints with min/max values | ||
| 2606 | * @table: the sysctl table | ||
| 2607 | * @write: %TRUE if this is a write to the sysctl file | ||
| 2608 | * @buffer: the user buffer | ||
| 2609 | * @lenp: the size of the user buffer | ||
| 2610 | * @ppos: file position | ||
| 2611 | * | ||
| 2612 | * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer | ||
| 2613 | * values from/to the user buffer, treated as an ASCII string. Negative | ||
| 2614 | * strings are not allowed. | ||
| 2615 | * | ||
| 2616 | * This routine will ensure the values are within the range specified by | ||
| 2617 | * table->extra1 (min) and table->extra2 (max). There is a final sanity | ||
| 2618 | * check for UINT_MAX to avoid having to support wrap around uses from | ||
| 2619 | * userspace. | ||
| 2620 | * | ||
| 2621 | * Returns 0 on success. | ||
| 2622 | */ | ||
| 2623 | int proc_douintvec_minmax(struct ctl_table *table, int write, | ||
| 2624 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2625 | { | ||
| 2626 | struct do_proc_douintvec_minmax_conv_param param = { | ||
| 2627 | .min = (unsigned int *) table->extra1, | ||
| 2628 | .max = (unsigned int *) table->extra2, | ||
| 2629 | }; | ||
| 2630 | return do_proc_douintvec(table, write, buffer, lenp, ppos, | ||
| 2631 | do_proc_douintvec_minmax_conv, &param); | ||
| 2632 | } | ||
| 2633 | |||
| 2393 | static void validate_coredump_safety(void) | 2634 | static void validate_coredump_safety(void) |
| 2394 | { | 2635 | { |
| 2395 | #ifdef CONFIG_COREDUMP | 2636 | #ifdef CONFIG_COREDUMP |
| @@ -2447,17 +2688,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int | |||
| 2447 | left = *lenp; | 2688 | left = *lenp; |
| 2448 | 2689 | ||
| 2449 | if (write) { | 2690 | if (write) { |
| 2450 | if (*ppos) { | 2691 | if (proc_first_pos_non_zero_ignore(ppos, table)) |
| 2451 | switch (sysctl_writes_strict) { | 2692 | goto out; |
| 2452 | case SYSCTL_WRITES_STRICT: | ||
| 2453 | goto out; | ||
| 2454 | case SYSCTL_WRITES_WARN: | ||
| 2455 | warn_sysctl_write(table); | ||
| 2456 | break; | ||
| 2457 | default: | ||
| 2458 | break; | ||
| 2459 | } | ||
| 2460 | } | ||
| 2461 | 2693 | ||
| 2462 | if (left > PAGE_SIZE - 1) | 2694 | if (left > PAGE_SIZE - 1) |
| 2463 | left = PAGE_SIZE - 1; | 2695 | left = PAGE_SIZE - 1; |
| @@ -2898,6 +3130,12 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, | |||
| 2898 | return -ENOSYS; | 3130 | return -ENOSYS; |
| 2899 | } | 3131 | } |
| 2900 | 3132 | ||
| 3133 | int proc_douintvec_minmax(struct ctl_table *table, int write, | ||
| 3134 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 3135 | { | ||
| 3136 | return -ENOSYS; | ||
| 3137 | } | ||
| 3138 | |||
| 2901 | int proc_dointvec_jiffies(struct ctl_table *table, int write, | 3139 | int proc_dointvec_jiffies(struct ctl_table *table, int write, |
| 2902 | void __user *buffer, size_t *lenp, loff_t *ppos) | 3140 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 2903 | { | 3141 | { |
| @@ -2940,6 +3178,7 @@ EXPORT_SYMBOL(proc_dointvec); | |||
| 2940 | EXPORT_SYMBOL(proc_douintvec); | 3178 | EXPORT_SYMBOL(proc_douintvec); |
| 2941 | EXPORT_SYMBOL(proc_dointvec_jiffies); | 3179 | EXPORT_SYMBOL(proc_dointvec_jiffies); |
| 2942 | EXPORT_SYMBOL(proc_dointvec_minmax); | 3180 | EXPORT_SYMBOL(proc_dointvec_minmax); |
| 3181 | EXPORT_SYMBOL_GPL(proc_douintvec_minmax); | ||
| 2943 | EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); | 3182 | EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); |
| 2944 | EXPORT_SYMBOL(proc_dointvec_ms_jiffies); | 3183 | EXPORT_SYMBOL(proc_dointvec_ms_jiffies); |
| 2945 | EXPORT_SYMBOL(proc_dostring); | 3184 | EXPORT_SYMBOL(proc_dostring); |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 939a158eab11..02e1859f2ca8 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
| @@ -1346,7 +1346,7 @@ static void deprecated_sysctl_warning(const int *name, int nlen) | |||
| 1346 | * CTL_KERN/KERN_VERSION is used by older glibc and cannot | 1346 | * CTL_KERN/KERN_VERSION is used by older glibc and cannot |
| 1347 | * ever go away. | 1347 | * ever go away. |
| 1348 | */ | 1348 | */ |
| 1349 | if (name[0] == CTL_KERN && name[1] == KERN_VERSION) | 1349 | if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION) |
| 1350 | return; | 1350 | return; |
| 1351 | 1351 | ||
| 1352 | if (printk_ratelimit()) { | 1352 | if (printk_ratelimit()) { |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c991cf212c6d..0b8ff7d257ea 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
| @@ -712,14 +712,14 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, | |||
| 712 | alarmtimer_freezerset(absexp, type); | 712 | alarmtimer_freezerset(absexp, type); |
| 713 | restart = &current->restart_block; | 713 | restart = &current->restart_block; |
| 714 | if (restart->nanosleep.type != TT_NONE) { | 714 | if (restart->nanosleep.type != TT_NONE) { |
| 715 | struct timespec rmt; | 715 | struct timespec64 rmt; |
| 716 | ktime_t rem; | 716 | ktime_t rem; |
| 717 | 717 | ||
| 718 | rem = ktime_sub(absexp, alarm_bases[type].gettime()); | 718 | rem = ktime_sub(absexp, alarm_bases[type].gettime()); |
| 719 | 719 | ||
| 720 | if (rem <= 0) | 720 | if (rem <= 0) |
| 721 | return 0; | 721 | return 0; |
| 722 | rmt = ktime_to_timespec(rem); | 722 | rmt = ktime_to_timespec64(rem); |
| 723 | 723 | ||
| 724 | return nanosleep_copyout(restart, &rmt); | 724 | return nanosleep_copyout(restart, &rmt); |
| 725 | } | 725 | } |
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 81da124f1115..88f75f92ef36 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
| @@ -1440,17 +1440,17 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) | |||
| 1440 | } | 1440 | } |
| 1441 | EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); | 1441 | EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); |
| 1442 | 1442 | ||
| 1443 | int nanosleep_copyout(struct restart_block *restart, struct timespec *ts) | 1443 | int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) |
| 1444 | { | 1444 | { |
| 1445 | switch(restart->nanosleep.type) { | 1445 | switch(restart->nanosleep.type) { |
| 1446 | #ifdef CONFIG_COMPAT | 1446 | #ifdef CONFIG_COMPAT |
| 1447 | case TT_COMPAT: | 1447 | case TT_COMPAT: |
| 1448 | if (compat_put_timespec(ts, restart->nanosleep.compat_rmtp)) | 1448 | if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp)) |
| 1449 | return -EFAULT; | 1449 | return -EFAULT; |
| 1450 | break; | 1450 | break; |
| 1451 | #endif | 1451 | #endif |
| 1452 | case TT_NATIVE: | 1452 | case TT_NATIVE: |
| 1453 | if (copy_to_user(restart->nanosleep.rmtp, ts, sizeof(struct timespec))) | 1453 | if (put_timespec64(ts, restart->nanosleep.rmtp)) |
| 1454 | return -EFAULT; | 1454 | return -EFAULT; |
| 1455 | break; | 1455 | break; |
| 1456 | default: | 1456 | default: |
| @@ -1485,11 +1485,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod | |||
| 1485 | restart = &current->restart_block; | 1485 | restart = &current->restart_block; |
| 1486 | if (restart->nanosleep.type != TT_NONE) { | 1486 | if (restart->nanosleep.type != TT_NONE) { |
| 1487 | ktime_t rem = hrtimer_expires_remaining(&t->timer); | 1487 | ktime_t rem = hrtimer_expires_remaining(&t->timer); |
| 1488 | struct timespec rmt; | 1488 | struct timespec64 rmt; |
| 1489 | 1489 | ||
| 1490 | if (rem <= 0) | 1490 | if (rem <= 0) |
| 1491 | return 0; | 1491 | return 0; |
| 1492 | rmt = ktime_to_timespec(rem); | 1492 | rmt = ktime_to_timespec64(rem); |
| 1493 | 1493 | ||
| 1494 | return nanosleep_copyout(restart, &rmt); | 1494 | return nanosleep_copyout(restart, &rmt); |
| 1495 | } | 1495 | } |
| @@ -1546,19 +1546,17 @@ out: | |||
| 1546 | SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, | 1546 | SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, |
| 1547 | struct timespec __user *, rmtp) | 1547 | struct timespec __user *, rmtp) |
| 1548 | { | 1548 | { |
| 1549 | struct timespec64 tu64; | 1549 | struct timespec64 tu; |
| 1550 | struct timespec tu; | ||
| 1551 | 1550 | ||
| 1552 | if (copy_from_user(&tu, rqtp, sizeof(tu))) | 1551 | if (get_timespec64(&tu, rqtp)) |
| 1553 | return -EFAULT; | 1552 | return -EFAULT; |
| 1554 | 1553 | ||
| 1555 | tu64 = timespec_to_timespec64(tu); | 1554 | if (!timespec64_valid(&tu)) |
| 1556 | if (!timespec64_valid(&tu64)) | ||
| 1557 | return -EINVAL; | 1555 | return -EINVAL; |
| 1558 | 1556 | ||
| 1559 | current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; | 1557 | current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; |
| 1560 | current->restart_block.nanosleep.rmtp = rmtp; | 1558 | current->restart_block.nanosleep.rmtp = rmtp; |
| 1561 | return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); | 1559 | return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); |
| 1562 | } | 1560 | } |
| 1563 | 1561 | ||
| 1564 | #ifdef CONFIG_COMPAT | 1562 | #ifdef CONFIG_COMPAT |
| @@ -1566,19 +1564,17 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, | |||
| 1566 | COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, | 1564 | COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, |
| 1567 | struct compat_timespec __user *, rmtp) | 1565 | struct compat_timespec __user *, rmtp) |
| 1568 | { | 1566 | { |
| 1569 | struct timespec64 tu64; | 1567 | struct timespec64 tu; |
| 1570 | struct timespec tu; | ||
| 1571 | 1568 | ||
| 1572 | if (compat_get_timespec(&tu, rqtp)) | 1569 | if (compat_get_timespec64(&tu, rqtp)) |
| 1573 | return -EFAULT; | 1570 | return -EFAULT; |
| 1574 | 1571 | ||
| 1575 | tu64 = timespec_to_timespec64(tu); | 1572 | if (!timespec64_valid(&tu)) |
| 1576 | if (!timespec64_valid(&tu64)) | ||
| 1577 | return -EINVAL; | 1573 | return -EINVAL; |
| 1578 | 1574 | ||
| 1579 | current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; | 1575 | current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; |
| 1580 | current->restart_block.nanosleep.compat_rmtp = rmtp; | 1576 | current->restart_block.nanosleep.compat_rmtp = rmtp; |
| 1581 | return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC); | 1577 | return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); |
| 1582 | } | 1578 | } |
| 1583 | #endif | 1579 | #endif |
| 1584 | 1580 | ||
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 60cb24ac9ebc..a3bd5dbe0dc4 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c | |||
| @@ -1318,12 +1318,8 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
| 1318 | */ | 1318 | */ |
| 1319 | restart = &current->restart_block; | 1319 | restart = &current->restart_block; |
| 1320 | restart->nanosleep.expires = expires; | 1320 | restart->nanosleep.expires = expires; |
| 1321 | if (restart->nanosleep.type != TT_NONE) { | 1321 | if (restart->nanosleep.type != TT_NONE) |
| 1322 | struct timespec ts; | 1322 | error = nanosleep_copyout(restart, &it.it_value); |
| 1323 | |||
| 1324 | ts = timespec64_to_timespec(it.it_value); | ||
| 1325 | error = nanosleep_copyout(restart, &ts); | ||
| 1326 | } | ||
| 1327 | } | 1323 | } |
| 1328 | 1324 | ||
| 1329 | return error; | 1325 | return error; |
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 38f3b20efa29..06f34feb635e 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c | |||
| @@ -41,12 +41,6 @@ SYS_NI(setitimer); | |||
| 41 | #ifdef __ARCH_WANT_SYS_ALARM | 41 | #ifdef __ARCH_WANT_SYS_ALARM |
| 42 | SYS_NI(alarm); | 42 | SYS_NI(alarm); |
| 43 | #endif | 43 | #endif |
| 44 | COMPAT_SYS_NI(timer_create); | ||
| 45 | COMPAT_SYS_NI(clock_adjtime); | ||
| 46 | COMPAT_SYS_NI(timer_settime); | ||
| 47 | COMPAT_SYS_NI(timer_gettime); | ||
| 48 | COMPAT_SYS_NI(getitimer); | ||
| 49 | COMPAT_SYS_NI(setitimer); | ||
| 50 | 44 | ||
| 51 | /* | 45 | /* |
| 52 | * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC | 46 | * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC |
| @@ -57,40 +51,52 @@ COMPAT_SYS_NI(setitimer); | |||
| 57 | SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, | 51 | SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, |
| 58 | const struct timespec __user *, tp) | 52 | const struct timespec __user *, tp) |
| 59 | { | 53 | { |
| 60 | struct timespec64 new_tp64; | 54 | struct timespec64 new_tp; |
| 61 | struct timespec new_tp; | ||
| 62 | 55 | ||
| 63 | if (which_clock != CLOCK_REALTIME) | 56 | if (which_clock != CLOCK_REALTIME) |
| 64 | return -EINVAL; | 57 | return -EINVAL; |
| 65 | if (copy_from_user(&new_tp, tp, sizeof (*tp))) | 58 | if (get_timespec64(&new_tp, tp)) |
| 66 | return -EFAULT; | 59 | return -EFAULT; |
| 67 | 60 | ||
| 68 | new_tp64 = timespec_to_timespec64(new_tp); | 61 | return do_sys_settimeofday64(&new_tp, NULL); |
| 69 | return do_sys_settimeofday64(&new_tp64, NULL); | ||
| 70 | } | 62 | } |
| 71 | 63 | ||
| 72 | SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, | 64 | int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) |
| 73 | struct timespec __user *,tp) | ||
| 74 | { | 65 | { |
| 75 | struct timespec64 kernel_tp64; | ||
| 76 | struct timespec kernel_tp; | ||
| 77 | |||
| 78 | switch (which_clock) { | 66 | switch (which_clock) { |
| 79 | case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; | 67 | case CLOCK_REALTIME: |
| 80 | case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; | 68 | ktime_get_real_ts64(tp); |
| 81 | case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; | 69 | break; |
| 82 | default: return -EINVAL; | 70 | case CLOCK_MONOTONIC: |
| 71 | ktime_get_ts64(tp); | ||
| 72 | break; | ||
| 73 | case CLOCK_BOOTTIME: | ||
| 74 | get_monotonic_boottime64(tp); | ||
| 75 | break; | ||
| 76 | default: | ||
| 77 | return -EINVAL; | ||
| 83 | } | 78 | } |
| 84 | 79 | ||
| 85 | kernel_tp = timespec64_to_timespec(kernel_tp64); | 80 | return 0; |
| 86 | if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) | 81 | } |
| 82 | SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, | ||
| 83 | struct timespec __user *, tp) | ||
| 84 | { | ||
| 85 | int ret; | ||
| 86 | struct timespec64 kernel_tp; | ||
| 87 | |||
| 88 | ret = do_clock_gettime(which_clock, &kernel_tp); | ||
| 89 | if (ret) | ||
| 90 | return ret; | ||
| 91 | |||
| 92 | if (put_timespec64(&kernel_tp, tp)) | ||
| 87 | return -EFAULT; | 93 | return -EFAULT; |
| 88 | return 0; | 94 | return 0; |
| 89 | } | 95 | } |
| 90 | 96 | ||
| 91 | SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) | 97 | SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) |
| 92 | { | 98 | { |
| 93 | struct timespec rtn_tp = { | 99 | struct timespec64 rtn_tp = { |
| 94 | .tv_sec = 0, | 100 | .tv_sec = 0, |
| 95 | .tv_nsec = hrtimer_resolution, | 101 | .tv_nsec = hrtimer_resolution, |
| 96 | }; | 102 | }; |
| @@ -99,7 +105,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __us | |||
| 99 | case CLOCK_REALTIME: | 105 | case CLOCK_REALTIME: |
| 100 | case CLOCK_MONOTONIC: | 106 | case CLOCK_MONOTONIC: |
| 101 | case CLOCK_BOOTTIME: | 107 | case CLOCK_BOOTTIME: |
| 102 | if (copy_to_user(tp, &rtn_tp, sizeof(rtn_tp))) | 108 | if (put_timespec64(&rtn_tp, tp)) |
| 103 | return -EFAULT; | 109 | return -EFAULT; |
| 104 | return 0; | 110 | return 0; |
| 105 | default: | 111 | default: |
| @@ -138,44 +144,45 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, | |||
| 138 | } | 144 | } |
| 139 | 145 | ||
| 140 | #ifdef CONFIG_COMPAT | 146 | #ifdef CONFIG_COMPAT |
| 147 | COMPAT_SYS_NI(timer_create); | ||
| 148 | COMPAT_SYS_NI(clock_adjtime); | ||
| 149 | COMPAT_SYS_NI(timer_settime); | ||
| 150 | COMPAT_SYS_NI(timer_gettime); | ||
| 151 | COMPAT_SYS_NI(getitimer); | ||
| 152 | COMPAT_SYS_NI(setitimer); | ||
| 153 | |||
| 141 | COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, | 154 | COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, |
| 142 | struct compat_timespec __user *, tp) | 155 | struct compat_timespec __user *, tp) |
| 143 | { | 156 | { |
| 144 | struct timespec64 new_tp64; | 157 | struct timespec64 new_tp; |
| 145 | struct timespec new_tp; | ||
| 146 | 158 | ||
| 147 | if (which_clock != CLOCK_REALTIME) | 159 | if (which_clock != CLOCK_REALTIME) |
| 148 | return -EINVAL; | 160 | return -EINVAL; |
| 149 | if (compat_get_timespec(&new_tp, tp)) | 161 | if (compat_get_timespec64(&new_tp, tp)) |
| 150 | return -EFAULT; | 162 | return -EFAULT; |
| 151 | 163 | ||
| 152 | new_tp64 = timespec_to_timespec64(new_tp); | 164 | return do_sys_settimeofday64(&new_tp, NULL); |
| 153 | return do_sys_settimeofday64(&new_tp64, NULL); | ||
| 154 | } | 165 | } |
| 155 | 166 | ||
| 156 | COMPAT_SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, | 167 | COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, |
| 157 | struct compat_timespec __user *,tp) | 168 | struct compat_timespec __user *, tp) |
| 158 | { | 169 | { |
| 159 | struct timespec64 kernel_tp64; | 170 | int ret; |
| 160 | struct timespec kernel_tp; | 171 | struct timespec64 kernel_tp; |
| 161 | 172 | ||
| 162 | switch (which_clock) { | 173 | ret = do_clock_gettime(which_clock, &kernel_tp); |
| 163 | case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; | 174 | if (ret) |
| 164 | case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; | 175 | return ret; |
| 165 | case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; | ||
| 166 | default: return -EINVAL; | ||
| 167 | } | ||
| 168 | 176 | ||
| 169 | kernel_tp = timespec64_to_timespec(kernel_tp64); | 177 | if (compat_put_timespec64(&kernel_tp, tp)) |
| 170 | if (compat_put_timespec(&kernel_tp, tp)) | ||
| 171 | return -EFAULT; | 178 | return -EFAULT; |
| 172 | return 0; | 179 | return 0; |
| 173 | } | 180 | } |
| 174 | 181 | ||
| 175 | COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, | 182 | COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, |
| 176 | struct compat_timespec __user *, tp) | 183 | struct compat_timespec __user *, tp) |
| 177 | { | 184 | { |
| 178 | struct timespec rtn_tp = { | 185 | struct timespec64 rtn_tp = { |
| 179 | .tv_sec = 0, | 186 | .tv_sec = 0, |
| 180 | .tv_nsec = hrtimer_resolution, | 187 | .tv_nsec = hrtimer_resolution, |
| 181 | }; | 188 | }; |
| @@ -184,13 +191,14 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, | |||
| 184 | case CLOCK_REALTIME: | 191 | case CLOCK_REALTIME: |
| 185 | case CLOCK_MONOTONIC: | 192 | case CLOCK_MONOTONIC: |
| 186 | case CLOCK_BOOTTIME: | 193 | case CLOCK_BOOTTIME: |
| 187 | if (compat_put_timespec(&rtn_tp, tp)) | 194 | if (compat_put_timespec64(&rtn_tp, tp)) |
| 188 | return -EFAULT; | 195 | return -EFAULT; |
| 189 | return 0; | 196 | return 0; |
| 190 | default: | 197 | default: |
| 191 | return -EINVAL; | 198 | return -EINVAL; |
| 192 | } | 199 | } |
| 193 | } | 200 | } |
| 201 | |||
| 194 | COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, | 202 | COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, |
| 195 | struct compat_timespec __user *, rqtp, | 203 | struct compat_timespec __user *, rqtp, |
| 196 | struct compat_timespec __user *, rmtp) | 204 | struct compat_timespec __user *, rmtp) |
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 82d67be7d9d1..13d6881f908b 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c | |||
| @@ -739,13 +739,11 @@ static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) | |||
| 739 | SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, | 739 | SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, |
| 740 | struct itimerspec __user *, setting) | 740 | struct itimerspec __user *, setting) |
| 741 | { | 741 | { |
| 742 | struct itimerspec64 cur_setting64; | 742 | struct itimerspec64 cur_setting; |
| 743 | 743 | ||
| 744 | int ret = do_timer_gettime(timer_id, &cur_setting64); | 744 | int ret = do_timer_gettime(timer_id, &cur_setting); |
| 745 | if (!ret) { | 745 | if (!ret) { |
| 746 | struct itimerspec cur_setting; | 746 | if (put_itimerspec64(&cur_setting, setting)) |
| 747 | cur_setting = itimerspec64_to_itimerspec(&cur_setting64); | ||
| 748 | if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) | ||
| 749 | ret = -EFAULT; | 747 | ret = -EFAULT; |
| 750 | } | 748 | } |
| 751 | return ret; | 749 | return ret; |
| @@ -755,13 +753,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, | |||
| 755 | COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, | 753 | COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, |
| 756 | struct compat_itimerspec __user *, setting) | 754 | struct compat_itimerspec __user *, setting) |
| 757 | { | 755 | { |
| 758 | struct itimerspec64 cur_setting64; | 756 | struct itimerspec64 cur_setting; |
| 759 | 757 | ||
| 760 | int ret = do_timer_gettime(timer_id, &cur_setting64); | 758 | int ret = do_timer_gettime(timer_id, &cur_setting); |
| 761 | if (!ret) { | 759 | if (!ret) { |
| 762 | struct itimerspec cur_setting; | 760 | if (put_compat_itimerspec64(&cur_setting, setting)) |
| 763 | cur_setting = itimerspec64_to_itimerspec(&cur_setting64); | ||
| 764 | if (put_compat_itimerspec(setting, &cur_setting)) | ||
| 765 | ret = -EFAULT; | 761 | ret = -EFAULT; |
| 766 | } | 762 | } |
| 767 | return ret; | 763 | return ret; |
| @@ -907,23 +903,19 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, | |||
| 907 | const struct itimerspec __user *, new_setting, | 903 | const struct itimerspec __user *, new_setting, |
| 908 | struct itimerspec __user *, old_setting) | 904 | struct itimerspec __user *, old_setting) |
| 909 | { | 905 | { |
| 910 | struct itimerspec64 new_spec64, old_spec64; | 906 | struct itimerspec64 new_spec, old_spec; |
| 911 | struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL; | 907 | struct itimerspec64 *rtn = old_setting ? &old_spec : NULL; |
| 912 | struct itimerspec new_spec; | ||
| 913 | int error = 0; | 908 | int error = 0; |
| 914 | 909 | ||
| 915 | if (!new_setting) | 910 | if (!new_setting) |
| 916 | return -EINVAL; | 911 | return -EINVAL; |
| 917 | 912 | ||
| 918 | if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) | 913 | if (get_itimerspec64(&new_spec, new_setting)) |
| 919 | return -EFAULT; | 914 | return -EFAULT; |
| 920 | new_spec64 = itimerspec_to_itimerspec64(&new_spec); | ||
| 921 | 915 | ||
| 922 | error = do_timer_settime(timer_id, flags, &new_spec64, rtn); | 916 | error = do_timer_settime(timer_id, flags, &new_spec, rtn); |
| 923 | if (!error && old_setting) { | 917 | if (!error && old_setting) { |
| 924 | struct itimerspec old_spec; | 918 | if (put_itimerspec64(&old_spec, old_setting)) |
| 925 | old_spec = itimerspec64_to_itimerspec(&old_spec64); | ||
| 926 | if (copy_to_user(old_setting, &old_spec, sizeof (old_spec))) | ||
| 927 | error = -EFAULT; | 919 | error = -EFAULT; |
| 928 | } | 920 | } |
| 929 | return error; | 921 | return error; |
| @@ -934,22 +926,18 @@ COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, | |||
| 934 | struct compat_itimerspec __user *, new, | 926 | struct compat_itimerspec __user *, new, |
| 935 | struct compat_itimerspec __user *, old) | 927 | struct compat_itimerspec __user *, old) |
| 936 | { | 928 | { |
| 937 | struct itimerspec64 new_spec64, old_spec64; | 929 | struct itimerspec64 new_spec, old_spec; |
| 938 | struct itimerspec64 *rtn = old ? &old_spec64 : NULL; | 930 | struct itimerspec64 *rtn = old ? &old_spec : NULL; |
| 939 | struct itimerspec new_spec; | ||
| 940 | int error = 0; | 931 | int error = 0; |
| 941 | 932 | ||
| 942 | if (!new) | 933 | if (!new) |
| 943 | return -EINVAL; | 934 | return -EINVAL; |
| 944 | if (get_compat_itimerspec(&new_spec, new)) | 935 | if (get_compat_itimerspec64(&new_spec, new)) |
| 945 | return -EFAULT; | 936 | return -EFAULT; |
| 946 | 937 | ||
| 947 | new_spec64 = itimerspec_to_itimerspec64(&new_spec); | 938 | error = do_timer_settime(timer_id, flags, &new_spec, rtn); |
| 948 | error = do_timer_settime(timer_id, flags, &new_spec64, rtn); | ||
| 949 | if (!error && old) { | 939 | if (!error && old) { |
| 950 | struct itimerspec old_spec; | 940 | if (put_compat_itimerspec64(&old_spec, old)) |
| 951 | old_spec = itimerspec64_to_itimerspec(&old_spec64); | ||
| 952 | if (put_compat_itimerspec(old, &old_spec)) | ||
| 953 | error = -EFAULT; | 941 | error = -EFAULT; |
| 954 | } | 942 | } |
| 955 | return error; | 943 | return error; |
| @@ -1049,34 +1037,30 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, | |||
| 1049 | const struct timespec __user *, tp) | 1037 | const struct timespec __user *, tp) |
| 1050 | { | 1038 | { |
| 1051 | const struct k_clock *kc = clockid_to_kclock(which_clock); | 1039 | const struct k_clock *kc = clockid_to_kclock(which_clock); |
| 1052 | struct timespec64 new_tp64; | 1040 | struct timespec64 new_tp; |
| 1053 | struct timespec new_tp; | ||
| 1054 | 1041 | ||
| 1055 | if (!kc || !kc->clock_set) | 1042 | if (!kc || !kc->clock_set) |
| 1056 | return -EINVAL; | 1043 | return -EINVAL; |
| 1057 | 1044 | ||
| 1058 | if (copy_from_user(&new_tp, tp, sizeof (*tp))) | 1045 | if (get_timespec64(&new_tp, tp)) |
| 1059 | return -EFAULT; | 1046 | return -EFAULT; |
| 1060 | new_tp64 = timespec_to_timespec64(new_tp); | ||
| 1061 | 1047 | ||
| 1062 | return kc->clock_set(which_clock, &new_tp64); | 1048 | return kc->clock_set(which_clock, &new_tp); |
| 1063 | } | 1049 | } |
| 1064 | 1050 | ||
| 1065 | SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, | 1051 | SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, |
| 1066 | struct timespec __user *,tp) | 1052 | struct timespec __user *,tp) |
| 1067 | { | 1053 | { |
| 1068 | const struct k_clock *kc = clockid_to_kclock(which_clock); | 1054 | const struct k_clock *kc = clockid_to_kclock(which_clock); |
| 1069 | struct timespec64 kernel_tp64; | 1055 | struct timespec64 kernel_tp; |
| 1070 | struct timespec kernel_tp; | ||
| 1071 | int error; | 1056 | int error; |
| 1072 | 1057 | ||
| 1073 | if (!kc) | 1058 | if (!kc) |
| 1074 | return -EINVAL; | 1059 | return -EINVAL; |
| 1075 | 1060 | ||
| 1076 | error = kc->clock_get(which_clock, &kernel_tp64); | 1061 | error = kc->clock_get(which_clock, &kernel_tp); |
| 1077 | kernel_tp = timespec64_to_timespec(kernel_tp64); | ||
| 1078 | 1062 | ||
| 1079 | if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) | 1063 | if (!error && put_timespec64(&kernel_tp, tp)) |
| 1080 | error = -EFAULT; | 1064 | error = -EFAULT; |
| 1081 | 1065 | ||
| 1082 | return error; | 1066 | return error; |
| @@ -1109,17 +1093,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, | |||
| 1109 | struct timespec __user *, tp) | 1093 | struct timespec __user *, tp) |
| 1110 | { | 1094 | { |
| 1111 | const struct k_clock *kc = clockid_to_kclock(which_clock); | 1095 | const struct k_clock *kc = clockid_to_kclock(which_clock); |
| 1112 | struct timespec64 rtn_tp64; | 1096 | struct timespec64 rtn_tp; |
| 1113 | struct timespec rtn_tp; | ||
| 1114 | int error; | 1097 | int error; |
| 1115 | 1098 | ||
| 1116 | if (!kc) | 1099 | if (!kc) |
| 1117 | return -EINVAL; | 1100 | return -EINVAL; |
| 1118 | 1101 | ||
| 1119 | error = kc->clock_getres(which_clock, &rtn_tp64); | 1102 | error = kc->clock_getres(which_clock, &rtn_tp); |
| 1120 | rtn_tp = timespec64_to_timespec(rtn_tp64); | ||
| 1121 | 1103 | ||
| 1122 | if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) | 1104 | if (!error && tp && put_timespec64(&rtn_tp, tp)) |
| 1123 | error = -EFAULT; | 1105 | error = -EFAULT; |
| 1124 | 1106 | ||
| 1125 | return error; | 1107 | return error; |
| @@ -1131,38 +1113,33 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, | |||
| 1131 | struct compat_timespec __user *, tp) | 1113 | struct compat_timespec __user *, tp) |
| 1132 | { | 1114 | { |
| 1133 | const struct k_clock *kc = clockid_to_kclock(which_clock); | 1115 | const struct k_clock *kc = clockid_to_kclock(which_clock); |
| 1134 | struct timespec64 new_tp64; | 1116 | struct timespec64 ts; |
| 1135 | struct timespec new_tp; | ||
| 1136 | 1117 | ||
| 1137 | if (!kc || !kc->clock_set) | 1118 | if (!kc || !kc->clock_set) |
| 1138 | return -EINVAL; | 1119 | return -EINVAL; |
| 1139 | 1120 | ||
| 1140 | if (compat_get_timespec(&new_tp, tp)) | 1121 | if (compat_get_timespec64(&ts, tp)) |
| 1141 | return -EFAULT; | 1122 | return -EFAULT; |
| 1142 | 1123 | ||
| 1143 | new_tp64 = timespec_to_timespec64(new_tp); | 1124 | return kc->clock_set(which_clock, &ts); |
| 1144 | |||
| 1145 | return kc->clock_set(which_clock, &new_tp64); | ||
| 1146 | } | 1125 | } |
| 1147 | 1126 | ||
| 1148 | COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, | 1127 | COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, |
| 1149 | struct compat_timespec __user *, tp) | 1128 | struct compat_timespec __user *, tp) |
| 1150 | { | 1129 | { |
| 1151 | const struct k_clock *kc = clockid_to_kclock(which_clock); | 1130 | const struct k_clock *kc = clockid_to_kclock(which_clock); |
| 1152 | struct timespec64 kernel_tp64; | 1131 | struct timespec64 ts; |
| 1153 | struct timespec kernel_tp; | 1132 | int err; |
| 1154 | int error; | ||
| 1155 | 1133 | ||
| 1156 | if (!kc) | 1134 | if (!kc) |
| 1157 | return -EINVAL; | 1135 | return -EINVAL; |
| 1158 | 1136 | ||
| 1159 | error = kc->clock_get(which_clock, &kernel_tp64); | 1137 | err = kc->clock_get(which_clock, &ts); |
| 1160 | kernel_tp = timespec64_to_timespec(kernel_tp64); | ||
| 1161 | 1138 | ||
| 1162 | if (!error && compat_put_timespec(&kernel_tp, tp)) | 1139 | if (!err && compat_put_timespec64(&ts, tp)) |
| 1163 | error = -EFAULT; | 1140 | err = -EFAULT; |
| 1164 | 1141 | ||
| 1165 | return error; | 1142 | return err; |
| 1166 | } | 1143 | } |
| 1167 | 1144 | ||
| 1168 | COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, | 1145 | COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, |
| @@ -1193,21 +1170,19 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, | |||
| 1193 | struct compat_timespec __user *, tp) | 1170 | struct compat_timespec __user *, tp) |
| 1194 | { | 1171 | { |
| 1195 | const struct k_clock *kc = clockid_to_kclock(which_clock); | 1172 | const struct k_clock *kc = clockid_to_kclock(which_clock); |
| 1196 | struct timespec64 rtn_tp64; | 1173 | struct timespec64 ts; |
| 1197 | struct timespec rtn_tp; | 1174 | int err; |
| 1198 | int error; | ||
| 1199 | 1175 | ||
| 1200 | if (!kc) | 1176 | if (!kc) |
| 1201 | return -EINVAL; | 1177 | return -EINVAL; |
| 1202 | 1178 | ||
| 1203 | error = kc->clock_getres(which_clock, &rtn_tp64); | 1179 | err = kc->clock_getres(which_clock, &ts); |
| 1204 | rtn_tp = timespec64_to_timespec(rtn_tp64); | 1180 | if (!err && tp && compat_put_timespec64(&ts, tp)) |
| 1205 | 1181 | return -EFAULT; | |
| 1206 | if (!error && tp && compat_put_timespec(&rtn_tp, tp)) | ||
| 1207 | error = -EFAULT; | ||
| 1208 | 1182 | ||
| 1209 | return error; | 1183 | return err; |
| 1210 | } | 1184 | } |
| 1185 | |||
| 1211 | #endif | 1186 | #endif |
| 1212 | 1187 | ||
| 1213 | /* | 1188 | /* |
| @@ -1226,26 +1201,24 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, | |||
| 1226 | struct timespec __user *, rmtp) | 1201 | struct timespec __user *, rmtp) |
| 1227 | { | 1202 | { |
| 1228 | const struct k_clock *kc = clockid_to_kclock(which_clock); | 1203 | const struct k_clock *kc = clockid_to_kclock(which_clock); |
| 1229 | struct timespec64 t64; | 1204 | struct timespec64 t; |
| 1230 | struct timespec t; | ||
| 1231 | 1205 | ||
| 1232 | if (!kc) | 1206 | if (!kc) |
| 1233 | return -EINVAL; | 1207 | return -EINVAL; |
| 1234 | if (!kc->nsleep) | 1208 | if (!kc->nsleep) |
| 1235 | return -ENANOSLEEP_NOTSUP; | 1209 | return -ENANOSLEEP_NOTSUP; |
| 1236 | 1210 | ||
| 1237 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) | 1211 | if (get_timespec64(&t, rqtp)) |
| 1238 | return -EFAULT; | 1212 | return -EFAULT; |
| 1239 | 1213 | ||
| 1240 | t64 = timespec_to_timespec64(t); | 1214 | if (!timespec64_valid(&t)) |
| 1241 | if (!timespec64_valid(&t64)) | ||
| 1242 | return -EINVAL; | 1215 | return -EINVAL; |
| 1243 | if (flags & TIMER_ABSTIME) | 1216 | if (flags & TIMER_ABSTIME) |
| 1244 | rmtp = NULL; | 1217 | rmtp = NULL; |
| 1245 | current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; | 1218 | current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; |
| 1246 | current->restart_block.nanosleep.rmtp = rmtp; | 1219 | current->restart_block.nanosleep.rmtp = rmtp; |
| 1247 | 1220 | ||
| 1248 | return kc->nsleep(which_clock, flags, &t64); | 1221 | return kc->nsleep(which_clock, flags, &t); |
| 1249 | } | 1222 | } |
| 1250 | 1223 | ||
| 1251 | #ifdef CONFIG_COMPAT | 1224 | #ifdef CONFIG_COMPAT |
| @@ -1254,26 +1227,24 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, | |||
| 1254 | struct compat_timespec __user *, rmtp) | 1227 | struct compat_timespec __user *, rmtp) |
| 1255 | { | 1228 | { |
| 1256 | const struct k_clock *kc = clockid_to_kclock(which_clock); | 1229 | const struct k_clock *kc = clockid_to_kclock(which_clock); |
| 1257 | struct timespec64 t64; | 1230 | struct timespec64 t; |
| 1258 | struct timespec t; | ||
| 1259 | 1231 | ||
| 1260 | if (!kc) | 1232 | if (!kc) |
| 1261 | return -EINVAL; | 1233 | return -EINVAL; |
| 1262 | if (!kc->nsleep) | 1234 | if (!kc->nsleep) |
| 1263 | return -ENANOSLEEP_NOTSUP; | 1235 | return -ENANOSLEEP_NOTSUP; |
| 1264 | 1236 | ||
| 1265 | if (compat_get_timespec(&t, rqtp)) | 1237 | if (compat_get_timespec64(&t, rqtp)) |
| 1266 | return -EFAULT; | 1238 | return -EFAULT; |
| 1267 | 1239 | ||
| 1268 | t64 = timespec_to_timespec64(t); | 1240 | if (!timespec64_valid(&t)) |
| 1269 | if (!timespec64_valid(&t64)) | ||
| 1270 | return -EINVAL; | 1241 | return -EINVAL; |
| 1271 | if (flags & TIMER_ABSTIME) | 1242 | if (flags & TIMER_ABSTIME) |
| 1272 | rmtp = NULL; | 1243 | rmtp = NULL; |
| 1273 | current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; | 1244 | current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; |
| 1274 | current->restart_block.nanosleep.compat_rmtp = rmtp; | 1245 | current->restart_block.nanosleep.compat_rmtp = rmtp; |
| 1275 | 1246 | ||
| 1276 | return kc->nsleep(which_clock, flags, &t64); | 1247 | return kc->nsleep(which_clock, flags, &t); |
| 1277 | } | 1248 | } |
| 1278 | #endif | 1249 | #endif |
| 1279 | 1250 | ||
diff --git a/kernel/time/time.c b/kernel/time/time.c index 7c89e437c4d7..44a8c1402133 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c | |||
| @@ -890,3 +890,61 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, | |||
| 890 | 890 | ||
| 891 | return res; | 891 | return res; |
| 892 | } | 892 | } |
| 893 | |||
| 894 | int get_timespec64(struct timespec64 *ts, | ||
| 895 | const struct timespec __user *uts) | ||
| 896 | { | ||
| 897 | struct timespec kts; | ||
| 898 | int ret; | ||
| 899 | |||
| 900 | ret = copy_from_user(&kts, uts, sizeof(kts)); | ||
| 901 | if (ret) | ||
| 902 | return -EFAULT; | ||
| 903 | |||
| 904 | ts->tv_sec = kts.tv_sec; | ||
| 905 | ts->tv_nsec = kts.tv_nsec; | ||
| 906 | |||
| 907 | return 0; | ||
| 908 | } | ||
| 909 | EXPORT_SYMBOL_GPL(get_timespec64); | ||
| 910 | |||
| 911 | int put_timespec64(const struct timespec64 *ts, | ||
| 912 | struct timespec __user *uts) | ||
| 913 | { | ||
| 914 | struct timespec kts = { | ||
| 915 | .tv_sec = ts->tv_sec, | ||
| 916 | .tv_nsec = ts->tv_nsec | ||
| 917 | }; | ||
| 918 | return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0; | ||
| 919 | } | ||
| 920 | EXPORT_SYMBOL_GPL(put_timespec64); | ||
| 921 | |||
| 922 | int get_itimerspec64(struct itimerspec64 *it, | ||
| 923 | const struct itimerspec __user *uit) | ||
| 924 | { | ||
| 925 | int ret; | ||
| 926 | |||
| 927 | ret = get_timespec64(&it->it_interval, &uit->it_interval); | ||
| 928 | if (ret) | ||
| 929 | return ret; | ||
| 930 | |||
| 931 | ret = get_timespec64(&it->it_value, &uit->it_value); | ||
| 932 | |||
| 933 | return ret; | ||
| 934 | } | ||
| 935 | EXPORT_SYMBOL_GPL(get_itimerspec64); | ||
| 936 | |||
| 937 | int put_itimerspec64(const struct itimerspec64 *it, | ||
| 938 | struct itimerspec __user *uit) | ||
| 939 | { | ||
| 940 | int ret; | ||
| 941 | |||
| 942 | ret = put_timespec64(&it->it_interval, &uit->it_interval); | ||
| 943 | if (ret) | ||
| 944 | return ret; | ||
| 945 | |||
| 946 | ret = put_timespec64(&it->it_value, &uit->it_value); | ||
| 947 | |||
| 948 | return ret; | ||
| 949 | } | ||
| 950 | EXPORT_SYMBOL_GPL(put_itimerspec64); | ||
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 7e06f04e98fe..434c840e2d82 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
| @@ -667,30 +667,30 @@ config RING_BUFFER_STARTUP_TEST | |||
| 667 | 667 | ||
| 668 | If unsure, say N | 668 | If unsure, say N |
| 669 | 669 | ||
| 670 | config TRACE_ENUM_MAP_FILE | 670 | config TRACE_EVAL_MAP_FILE |
| 671 | bool "Show enum mappings for trace events" | 671 | bool "Show eval mappings for trace events" |
| 672 | depends on TRACING | 672 | depends on TRACING |
| 673 | help | 673 | help |
| 674 | The "print fmt" of the trace events will show the enum names instead | 674 | The "print fmt" of the trace events will show the enum/sizeof names |
| 675 | of their values. This can cause problems for user space tools that | 675 | instead of their values. This can cause problems for user space tools |
| 676 | use this string to parse the raw data as user space does not know | 676 | that use this string to parse the raw data as user space does not know |
| 677 | how to convert the string to its value. | 677 | how to convert the string to its value. |
| 678 | 678 | ||
| 679 | To fix this, there's a special macro in the kernel that can be used | 679 | To fix this, there's a special macro in the kernel that can be used |
| 680 | to convert the enum into its value. If this macro is used, then the | 680 | to convert an enum/sizeof into its value. If this macro is used, then |
| 681 | print fmt strings will have the enums converted to their values. | 681 | the print fmt strings will be converted to their values. |
| 682 | 682 | ||
| 683 | If something does not get converted properly, this option can be | 683 | If something does not get converted properly, this option can be |
| 684 | used to show what enums the kernel tried to convert. | 684 | used to show what enums/sizeof the kernel tried to convert. |
| 685 | 685 | ||
| 686 | This option is for debugging the enum conversions. A file is created | 686 | This option is for debugging the conversions. A file is created |
| 687 | in the tracing directory called "enum_map" that will show the enum | 687 | in the tracing directory called "eval_map" that will show the |
| 688 | names matched with their values and what trace event system they | 688 | names matched with their values and what trace event system they |
| 689 | belong too. | 689 | belong too. |
| 690 | 690 | ||
| 691 | Normally, the mapping of the strings to values will be freed after | 691 | Normally, the mapping of the strings to values will be freed after |
| 692 | boot up or module load. With this option, they will not be freed, as | 692 | boot up or module load. With this option, they will not be freed, as |
| 693 | they are needed for the "enum_map" file. Enabling this option will | 693 | they are needed for the "eval_map" file. Enabling this option will |
| 694 | increase the memory footprint of the running kernel. | 694 | increase the memory footprint of the running kernel. |
| 695 | 695 | ||
| 696 | If unsure, say N | 696 | If unsure, say N |
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 460a031c77e5..37385193a608 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
| @@ -122,8 +122,8 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) | |||
| 122 | } | 122 | } |
| 123 | 123 | ||
| 124 | /* | 124 | /* |
| 125 | * limited trace_printk() | 125 | * Only limited trace_printk() conversion specifiers allowed: |
| 126 | * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed | 126 | * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s |
| 127 | */ | 127 | */ |
| 128 | BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, | 128 | BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, |
| 129 | u64, arg2, u64, arg3) | 129 | u64, arg2, u64, arg3) |
| @@ -198,7 +198,8 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, | |||
| 198 | i++; | 198 | i++; |
| 199 | } | 199 | } |
| 200 | 200 | ||
| 201 | if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') | 201 | if (fmt[i] != 'i' && fmt[i] != 'd' && |
| 202 | fmt[i] != 'u' && fmt[i] != 'x') | ||
| 202 | return -EINVAL; | 203 | return -EINVAL; |
| 203 | fmt_cnt++; | 204 | fmt_cnt++; |
| 204 | } | 205 | } |
| @@ -234,7 +235,8 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) | |||
| 234 | unsigned int cpu = smp_processor_id(); | 235 | unsigned int cpu = smp_processor_id(); |
| 235 | u64 index = flags & BPF_F_INDEX_MASK; | 236 | u64 index = flags & BPF_F_INDEX_MASK; |
| 236 | struct bpf_event_entry *ee; | 237 | struct bpf_event_entry *ee; |
| 237 | struct perf_event *event; | 238 | u64 value = 0; |
| 239 | int err; | ||
| 238 | 240 | ||
| 239 | if (unlikely(flags & ~(BPF_F_INDEX_MASK))) | 241 | if (unlikely(flags & ~(BPF_F_INDEX_MASK))) |
| 240 | return -EINVAL; | 242 | return -EINVAL; |
| @@ -247,21 +249,14 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) | |||
| 247 | if (!ee) | 249 | if (!ee) |
| 248 | return -ENOENT; | 250 | return -ENOENT; |
| 249 | 251 | ||
| 250 | event = ee->event; | 252 | err = perf_event_read_local(ee->event, &value); |
| 251 | if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && | ||
| 252 | event->attr.type != PERF_TYPE_RAW)) | ||
| 253 | return -EINVAL; | ||
| 254 | |||
| 255 | /* make sure event is local and doesn't have pmu::count */ | ||
| 256 | if (unlikely(event->oncpu != cpu || event->pmu->count)) | ||
| 257 | return -EINVAL; | ||
| 258 | |||
| 259 | /* | 253 | /* |
| 260 | * we don't know if the function is run successfully by the | 254 | * this api is ugly since we miss [-22..-2] range of valid |
| 261 | * return value. It can be judged in other places, such as | 255 | * counter values, but that's uapi |
| 262 | * eBPF programs. | ||
| 263 | */ | 256 | */ |
| 264 | return perf_event_read_local(event); | 257 | if (err) |
| 258 | return err; | ||
| 259 | return value; | ||
| 265 | } | 260 | } |
| 266 | 261 | ||
| 267 | static const struct bpf_func_proto bpf_perf_event_read_proto = { | 262 | static const struct bpf_func_proto bpf_perf_event_read_proto = { |
| @@ -272,14 +267,16 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { | |||
| 272 | .arg2_type = ARG_ANYTHING, | 267 | .arg2_type = ARG_ANYTHING, |
| 273 | }; | 268 | }; |
| 274 | 269 | ||
| 270 | static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); | ||
| 271 | |||
| 275 | static __always_inline u64 | 272 | static __always_inline u64 |
| 276 | __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, | 273 | __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, |
| 277 | u64 flags, struct perf_raw_record *raw) | 274 | u64 flags, struct perf_raw_record *raw) |
| 278 | { | 275 | { |
| 279 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 276 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
| 277 | struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd); | ||
| 280 | unsigned int cpu = smp_processor_id(); | 278 | unsigned int cpu = smp_processor_id(); |
| 281 | u64 index = flags & BPF_F_INDEX_MASK; | 279 | u64 index = flags & BPF_F_INDEX_MASK; |
| 282 | struct perf_sample_data sample_data; | ||
| 283 | struct bpf_event_entry *ee; | 280 | struct bpf_event_entry *ee; |
| 284 | struct perf_event *event; | 281 | struct perf_event *event; |
| 285 | 282 | ||
| @@ -300,9 +297,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, | |||
| 300 | if (unlikely(event->oncpu != cpu)) | 297 | if (unlikely(event->oncpu != cpu)) |
| 301 | return -EOPNOTSUPP; | 298 | return -EOPNOTSUPP; |
| 302 | 299 | ||
| 303 | perf_sample_data_init(&sample_data, 0, 0); | 300 | perf_sample_data_init(sd, 0, 0); |
| 304 | sample_data.raw = raw; | 301 | sd->raw = raw; |
| 305 | perf_event_output(event, &sample_data, regs); | 302 | perf_event_output(event, sd, regs); |
| 306 | return 0; | 303 | return 0; |
| 307 | } | 304 | } |
| 308 | 305 | ||
| @@ -483,7 +480,7 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func | |||
| 483 | 480 | ||
| 484 | /* bpf+kprobe programs can access fields of 'struct pt_regs' */ | 481 | /* bpf+kprobe programs can access fields of 'struct pt_regs' */ |
| 485 | static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, | 482 | static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, |
| 486 | enum bpf_reg_type *reg_type) | 483 | struct bpf_insn_access_aux *info) |
| 487 | { | 484 | { |
| 488 | if (off < 0 || off >= sizeof(struct pt_regs)) | 485 | if (off < 0 || off >= sizeof(struct pt_regs)) |
| 489 | return false; | 486 | return false; |
| @@ -566,7 +563,7 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) | |||
| 566 | } | 563 | } |
| 567 | 564 | ||
| 568 | static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, | 565 | static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, |
| 569 | enum bpf_reg_type *reg_type) | 566 | struct bpf_insn_access_aux *info) |
| 570 | { | 567 | { |
| 571 | if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) | 568 | if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) |
| 572 | return false; | 569 | return false; |
| @@ -585,40 +582,47 @@ const struct bpf_verifier_ops tracepoint_prog_ops = { | |||
| 585 | }; | 582 | }; |
| 586 | 583 | ||
| 587 | static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, | 584 | static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, |
| 588 | enum bpf_reg_type *reg_type) | 585 | struct bpf_insn_access_aux *info) |
| 589 | { | 586 | { |
| 587 | const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data, | ||
| 588 | sample_period); | ||
| 589 | |||
| 590 | if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) | 590 | if (off < 0 || off >= sizeof(struct bpf_perf_event_data)) |
| 591 | return false; | 591 | return false; |
| 592 | if (type != BPF_READ) | 592 | if (type != BPF_READ) |
| 593 | return false; | 593 | return false; |
| 594 | if (off % size != 0) | 594 | if (off % size != 0) |
| 595 | return false; | 595 | return false; |
| 596 | if (off == offsetof(struct bpf_perf_event_data, sample_period)) { | 596 | |
| 597 | if (size != sizeof(u64)) | 597 | switch (off) { |
| 598 | case bpf_ctx_range(struct bpf_perf_event_data, sample_period): | ||
| 599 | bpf_ctx_record_field_size(info, size_sp); | ||
| 600 | if (!bpf_ctx_narrow_access_ok(off, size, size_sp)) | ||
| 598 | return false; | 601 | return false; |
| 599 | } else { | 602 | break; |
| 603 | default: | ||
| 600 | if (size != sizeof(long)) | 604 | if (size != sizeof(long)) |
| 601 | return false; | 605 | return false; |
| 602 | } | 606 | } |
| 607 | |||
| 603 | return true; | 608 | return true; |
| 604 | } | 609 | } |
| 605 | 610 | ||
| 606 | static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, | 611 | static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, |
| 607 | const struct bpf_insn *si, | 612 | const struct bpf_insn *si, |
| 608 | struct bpf_insn *insn_buf, | 613 | struct bpf_insn *insn_buf, |
| 609 | struct bpf_prog *prog) | 614 | struct bpf_prog *prog, u32 *target_size) |
| 610 | { | 615 | { |
| 611 | struct bpf_insn *insn = insn_buf; | 616 | struct bpf_insn *insn = insn_buf; |
| 612 | 617 | ||
| 613 | switch (si->off) { | 618 | switch (si->off) { |
| 614 | case offsetof(struct bpf_perf_event_data, sample_period): | 619 | case offsetof(struct bpf_perf_event_data, sample_period): |
| 615 | BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); | ||
| 616 | |||
| 617 | *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, | 620 | *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, |
| 618 | data), si->dst_reg, si->src_reg, | 621 | data), si->dst_reg, si->src_reg, |
| 619 | offsetof(struct bpf_perf_event_data_kern, data)); | 622 | offsetof(struct bpf_perf_event_data_kern, data)); |
| 620 | *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, | 623 | *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, |
| 621 | offsetof(struct perf_sample_data, period)); | 624 | bpf_target_off(struct perf_sample_data, period, 8, |
| 625 | target_size)); | ||
| 622 | break; | 626 | break; |
| 623 | default: | 627 | default: |
| 624 | *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, | 628 | *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b308be30dfb9..02004ae91860 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -113,7 +113,7 @@ static int ftrace_disabled __read_mostly; | |||
| 113 | 113 | ||
| 114 | static DEFINE_MUTEX(ftrace_lock); | 114 | static DEFINE_MUTEX(ftrace_lock); |
| 115 | 115 | ||
| 116 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; | 116 | static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; |
| 117 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; | 117 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; |
| 118 | static struct ftrace_ops global_ops; | 118 | static struct ftrace_ops global_ops; |
| 119 | 119 | ||
| @@ -169,8 +169,11 @@ int ftrace_nr_registered_ops(void) | |||
| 169 | 169 | ||
| 170 | mutex_lock(&ftrace_lock); | 170 | mutex_lock(&ftrace_lock); |
| 171 | 171 | ||
| 172 | for (ops = ftrace_ops_list; | 172 | for (ops = rcu_dereference_protected(ftrace_ops_list, |
| 173 | ops != &ftrace_list_end; ops = ops->next) | 173 | lockdep_is_held(&ftrace_lock)); |
| 174 | ops != &ftrace_list_end; | ||
| 175 | ops = rcu_dereference_protected(ops->next, | ||
| 176 | lockdep_is_held(&ftrace_lock))) | ||
| 174 | cnt++; | 177 | cnt++; |
| 175 | 178 | ||
| 176 | mutex_unlock(&ftrace_lock); | 179 | mutex_unlock(&ftrace_lock); |
| @@ -275,10 +278,11 @@ static void update_ftrace_function(void) | |||
| 275 | * If there's only one ftrace_ops registered, the ftrace_ops_list | 278 | * If there's only one ftrace_ops registered, the ftrace_ops_list |
| 276 | * will point to the ops we want. | 279 | * will point to the ops we want. |
| 277 | */ | 280 | */ |
| 278 | set_function_trace_op = ftrace_ops_list; | 281 | set_function_trace_op = rcu_dereference_protected(ftrace_ops_list, |
| 282 | lockdep_is_held(&ftrace_lock)); | ||
| 279 | 283 | ||
| 280 | /* If there's no ftrace_ops registered, just call the stub function */ | 284 | /* If there's no ftrace_ops registered, just call the stub function */ |
| 281 | if (ftrace_ops_list == &ftrace_list_end) { | 285 | if (set_function_trace_op == &ftrace_list_end) { |
| 282 | func = ftrace_stub; | 286 | func = ftrace_stub; |
| 283 | 287 | ||
| 284 | /* | 288 | /* |
| @@ -286,7 +290,8 @@ static void update_ftrace_function(void) | |||
| 286 | * recursion safe and not dynamic and the arch supports passing ops, | 290 | * recursion safe and not dynamic and the arch supports passing ops, |
| 287 | * then have the mcount trampoline call the function directly. | 291 | * then have the mcount trampoline call the function directly. |
| 288 | */ | 292 | */ |
| 289 | } else if (ftrace_ops_list->next == &ftrace_list_end) { | 293 | } else if (rcu_dereference_protected(ftrace_ops_list->next, |
| 294 | lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { | ||
| 290 | func = ftrace_ops_get_list_func(ftrace_ops_list); | 295 | func = ftrace_ops_get_list_func(ftrace_ops_list); |
| 291 | 296 | ||
| 292 | } else { | 297 | } else { |
| @@ -348,9 +353,11 @@ int using_ftrace_ops_list_func(void) | |||
| 348 | return ftrace_trace_function == ftrace_ops_list_func; | 353 | return ftrace_trace_function == ftrace_ops_list_func; |
| 349 | } | 354 | } |
| 350 | 355 | ||
| 351 | static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) | 356 | static void add_ftrace_ops(struct ftrace_ops __rcu **list, |
| 357 | struct ftrace_ops *ops) | ||
| 352 | { | 358 | { |
| 353 | ops->next = *list; | 359 | rcu_assign_pointer(ops->next, *list); |
| 360 | |||
| 354 | /* | 361 | /* |
| 355 | * We are entering ops into the list but another | 362 | * We are entering ops into the list but another |
| 356 | * CPU might be walking that list. We need to make sure | 363 | * CPU might be walking that list. We need to make sure |
| @@ -360,7 +367,8 @@ static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) | |||
| 360 | rcu_assign_pointer(*list, ops); | 367 | rcu_assign_pointer(*list, ops); |
| 361 | } | 368 | } |
| 362 | 369 | ||
| 363 | static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) | 370 | static int remove_ftrace_ops(struct ftrace_ops __rcu **list, |
| 371 | struct ftrace_ops *ops) | ||
| 364 | { | 372 | { |
| 365 | struct ftrace_ops **p; | 373 | struct ftrace_ops **p; |
| 366 | 374 | ||
| @@ -368,7 +376,10 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) | |||
| 368 | * If we are removing the last function, then simply point | 376 | * If we are removing the last function, then simply point |
| 369 | * to the ftrace_stub. | 377 | * to the ftrace_stub. |
| 370 | */ | 378 | */ |
| 371 | if (*list == ops && ops->next == &ftrace_list_end) { | 379 | if (rcu_dereference_protected(*list, |
| 380 | lockdep_is_held(&ftrace_lock)) == ops && | ||
| 381 | rcu_dereference_protected(ops->next, | ||
| 382 | lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { | ||
| 372 | *list = &ftrace_list_end; | 383 | *list = &ftrace_list_end; |
| 373 | return 0; | 384 | return 0; |
| 374 | } | 385 | } |
| @@ -1293,6 +1304,28 @@ static void ftrace_hash_clear(struct ftrace_hash *hash) | |||
| 1293 | FTRACE_WARN_ON(hash->count); | 1304 | FTRACE_WARN_ON(hash->count); |
| 1294 | } | 1305 | } |
| 1295 | 1306 | ||
| 1307 | static void free_ftrace_mod(struct ftrace_mod_load *ftrace_mod) | ||
| 1308 | { | ||
| 1309 | list_del(&ftrace_mod->list); | ||
| 1310 | kfree(ftrace_mod->module); | ||
| 1311 | kfree(ftrace_mod->func); | ||
| 1312 | kfree(ftrace_mod); | ||
| 1313 | } | ||
| 1314 | |||
| 1315 | static void clear_ftrace_mod_list(struct list_head *head) | ||
| 1316 | { | ||
| 1317 | struct ftrace_mod_load *p, *n; | ||
| 1318 | |||
| 1319 | /* stack tracer isn't supported yet */ | ||
| 1320 | if (!head) | ||
| 1321 | return; | ||
| 1322 | |||
| 1323 | mutex_lock(&ftrace_lock); | ||
| 1324 | list_for_each_entry_safe(p, n, head, list) | ||
| 1325 | free_ftrace_mod(p); | ||
| 1326 | mutex_unlock(&ftrace_lock); | ||
| 1327 | } | ||
| 1328 | |||
| 1296 | static void free_ftrace_hash(struct ftrace_hash *hash) | 1329 | static void free_ftrace_hash(struct ftrace_hash *hash) |
| 1297 | { | 1330 | { |
| 1298 | if (!hash || hash == EMPTY_HASH) | 1331 | if (!hash || hash == EMPTY_HASH) |
| @@ -1346,6 +1379,35 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits) | |||
| 1346 | return hash; | 1379 | return hash; |
| 1347 | } | 1380 | } |
| 1348 | 1381 | ||
| 1382 | |||
| 1383 | static int ftrace_add_mod(struct trace_array *tr, | ||
| 1384 | const char *func, const char *module, | ||
| 1385 | int enable) | ||
| 1386 | { | ||
| 1387 | struct ftrace_mod_load *ftrace_mod; | ||
| 1388 | struct list_head *mod_head = enable ? &tr->mod_trace : &tr->mod_notrace; | ||
| 1389 | |||
| 1390 | ftrace_mod = kzalloc(sizeof(*ftrace_mod), GFP_KERNEL); | ||
| 1391 | if (!ftrace_mod) | ||
| 1392 | return -ENOMEM; | ||
| 1393 | |||
| 1394 | ftrace_mod->func = kstrdup(func, GFP_KERNEL); | ||
| 1395 | ftrace_mod->module = kstrdup(module, GFP_KERNEL); | ||
| 1396 | ftrace_mod->enable = enable; | ||
| 1397 | |||
| 1398 | if (!ftrace_mod->func || !ftrace_mod->module) | ||
| 1399 | goto out_free; | ||
| 1400 | |||
| 1401 | list_add(&ftrace_mod->list, mod_head); | ||
| 1402 | |||
| 1403 | return 0; | ||
| 1404 | |||
| 1405 | out_free: | ||
| 1406 | free_ftrace_mod(ftrace_mod); | ||
| 1407 | |||
| 1408 | return -ENOMEM; | ||
| 1409 | } | ||
| 1410 | |||
| 1349 | static struct ftrace_hash * | 1411 | static struct ftrace_hash * |
| 1350 | alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) | 1412 | alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) |
| 1351 | { | 1413 | { |
| @@ -1359,6 +1421,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) | |||
| 1359 | if (!new_hash) | 1421 | if (!new_hash) |
| 1360 | return NULL; | 1422 | return NULL; |
| 1361 | 1423 | ||
| 1424 | if (hash) | ||
| 1425 | new_hash->flags = hash->flags; | ||
| 1426 | |||
| 1362 | /* Empty hash? */ | 1427 | /* Empty hash? */ |
| 1363 | if (ftrace_hash_empty(hash)) | 1428 | if (ftrace_hash_empty(hash)) |
| 1364 | return new_hash; | 1429 | return new_hash; |
| @@ -1403,7 +1468,7 @@ __ftrace_hash_move(struct ftrace_hash *src) | |||
| 1403 | /* | 1468 | /* |
| 1404 | * If the new source is empty, just return the empty_hash. | 1469 | * If the new source is empty, just return the empty_hash. |
| 1405 | */ | 1470 | */ |
| 1406 | if (!src->count) | 1471 | if (ftrace_hash_empty(src)) |
| 1407 | return EMPTY_HASH; | 1472 | return EMPTY_HASH; |
| 1408 | 1473 | ||
| 1409 | /* | 1474 | /* |
| @@ -1420,6 +1485,8 @@ __ftrace_hash_move(struct ftrace_hash *src) | |||
| 1420 | if (!new_hash) | 1485 | if (!new_hash) |
| 1421 | return NULL; | 1486 | return NULL; |
| 1422 | 1487 | ||
| 1488 | new_hash->flags = src->flags; | ||
| 1489 | |||
| 1423 | size = 1 << src->size_bits; | 1490 | size = 1 << src->size_bits; |
| 1424 | for (i = 0; i < size; i++) { | 1491 | for (i = 0; i < size; i++) { |
| 1425 | hhd = &src->buckets[i]; | 1492 | hhd = &src->buckets[i]; |
| @@ -1513,8 +1580,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) | |||
| 1513 | return 0; | 1580 | return 0; |
| 1514 | #endif | 1581 | #endif |
| 1515 | 1582 | ||
| 1516 | hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); | 1583 | rcu_assign_pointer(hash.filter_hash, ops->func_hash->filter_hash); |
| 1517 | hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); | 1584 | rcu_assign_pointer(hash.notrace_hash, ops->func_hash->notrace_hash); |
| 1518 | 1585 | ||
| 1519 | if (hash_contains_ip(ip, &hash)) | 1586 | if (hash_contains_ip(ip, &hash)) |
| 1520 | ret = 1; | 1587 | ret = 1; |
| @@ -1650,7 +1717,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
| 1650 | struct dyn_ftrace *rec; | 1717 | struct dyn_ftrace *rec; |
| 1651 | bool update = false; | 1718 | bool update = false; |
| 1652 | int count = 0; | 1719 | int count = 0; |
| 1653 | int all = 0; | 1720 | int all = false; |
| 1654 | 1721 | ||
| 1655 | /* Only update if the ops has been registered */ | 1722 | /* Only update if the ops has been registered */ |
| 1656 | if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) | 1723 | if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) |
| @@ -1671,7 +1738,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
| 1671 | hash = ops->func_hash->filter_hash; | 1738 | hash = ops->func_hash->filter_hash; |
| 1672 | other_hash = ops->func_hash->notrace_hash; | 1739 | other_hash = ops->func_hash->notrace_hash; |
| 1673 | if (ftrace_hash_empty(hash)) | 1740 | if (ftrace_hash_empty(hash)) |
| 1674 | all = 1; | 1741 | all = true; |
| 1675 | } else { | 1742 | } else { |
| 1676 | inc = !inc; | 1743 | inc = !inc; |
| 1677 | hash = ops->func_hash->notrace_hash; | 1744 | hash = ops->func_hash->notrace_hash; |
| @@ -2784,7 +2851,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
| 2784 | * If there's no more ops registered with ftrace, run a | 2851 | * If there's no more ops registered with ftrace, run a |
| 2785 | * sanity check to make sure all rec flags are cleared. | 2852 | * sanity check to make sure all rec flags are cleared. |
| 2786 | */ | 2853 | */ |
| 2787 | if (ftrace_ops_list == &ftrace_list_end) { | 2854 | if (rcu_dereference_protected(ftrace_ops_list, |
| 2855 | lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) { | ||
| 2788 | struct ftrace_page *pg; | 2856 | struct ftrace_page *pg; |
| 2789 | struct dyn_ftrace *rec; | 2857 | struct dyn_ftrace *rec; |
| 2790 | 2858 | ||
| @@ -3061,6 +3129,7 @@ ftrace_allocate_pages(unsigned long num_to_init) | |||
| 3061 | struct ftrace_iterator { | 3129 | struct ftrace_iterator { |
| 3062 | loff_t pos; | 3130 | loff_t pos; |
| 3063 | loff_t func_pos; | 3131 | loff_t func_pos; |
| 3132 | loff_t mod_pos; | ||
| 3064 | struct ftrace_page *pg; | 3133 | struct ftrace_page *pg; |
| 3065 | struct dyn_ftrace *func; | 3134 | struct dyn_ftrace *func; |
| 3066 | struct ftrace_func_probe *probe; | 3135 | struct ftrace_func_probe *probe; |
| @@ -3068,6 +3137,8 @@ struct ftrace_iterator { | |||
| 3068 | struct trace_parser parser; | 3137 | struct trace_parser parser; |
| 3069 | struct ftrace_hash *hash; | 3138 | struct ftrace_hash *hash; |
| 3070 | struct ftrace_ops *ops; | 3139 | struct ftrace_ops *ops; |
| 3140 | struct trace_array *tr; | ||
| 3141 | struct list_head *mod_list; | ||
| 3071 | int pidx; | 3142 | int pidx; |
| 3072 | int idx; | 3143 | int idx; |
| 3073 | unsigned flags; | 3144 | unsigned flags; |
| @@ -3152,13 +3223,13 @@ static void *t_probe_start(struct seq_file *m, loff_t *pos) | |||
| 3152 | if (!(iter->flags & FTRACE_ITER_DO_PROBES)) | 3223 | if (!(iter->flags & FTRACE_ITER_DO_PROBES)) |
| 3153 | return NULL; | 3224 | return NULL; |
| 3154 | 3225 | ||
| 3155 | if (iter->func_pos > *pos) | 3226 | if (iter->mod_pos > *pos) |
| 3156 | return NULL; | 3227 | return NULL; |
| 3157 | 3228 | ||
| 3158 | iter->probe = NULL; | 3229 | iter->probe = NULL; |
| 3159 | iter->probe_entry = NULL; | 3230 | iter->probe_entry = NULL; |
| 3160 | iter->pidx = 0; | 3231 | iter->pidx = 0; |
| 3161 | for (l = 0; l <= (*pos - iter->func_pos); ) { | 3232 | for (l = 0; l <= (*pos - iter->mod_pos); ) { |
| 3162 | p = t_probe_next(m, &l); | 3233 | p = t_probe_next(m, &l); |
| 3163 | if (!p) | 3234 | if (!p) |
| 3164 | break; | 3235 | break; |
| @@ -3197,6 +3268,82 @@ t_probe_show(struct seq_file *m, struct ftrace_iterator *iter) | |||
| 3197 | } | 3268 | } |
| 3198 | 3269 | ||
| 3199 | static void * | 3270 | static void * |
| 3271 | t_mod_next(struct seq_file *m, loff_t *pos) | ||
| 3272 | { | ||
| 3273 | struct ftrace_iterator *iter = m->private; | ||
| 3274 | struct trace_array *tr = iter->tr; | ||
| 3275 | |||
| 3276 | (*pos)++; | ||
| 3277 | iter->pos = *pos; | ||
| 3278 | |||
| 3279 | iter->mod_list = iter->mod_list->next; | ||
| 3280 | |||
| 3281 | if (iter->mod_list == &tr->mod_trace || | ||
| 3282 | iter->mod_list == &tr->mod_notrace) { | ||
| 3283 | iter->flags &= ~FTRACE_ITER_MOD; | ||
| 3284 | return NULL; | ||
| 3285 | } | ||
| 3286 | |||
| 3287 | iter->mod_pos = *pos; | ||
| 3288 | |||
| 3289 | return iter; | ||
| 3290 | } | ||
| 3291 | |||
| 3292 | static void *t_mod_start(struct seq_file *m, loff_t *pos) | ||
| 3293 | { | ||
| 3294 | struct ftrace_iterator *iter = m->private; | ||
| 3295 | void *p = NULL; | ||
| 3296 | loff_t l; | ||
| 3297 | |||
| 3298 | if (iter->func_pos > *pos) | ||
| 3299 | return NULL; | ||
| 3300 | |||
| 3301 | iter->mod_pos = iter->func_pos; | ||
| 3302 | |||
| 3303 | /* probes are only available if tr is set */ | ||
| 3304 | if (!iter->tr) | ||
| 3305 | return NULL; | ||
| 3306 | |||
| 3307 | for (l = 0; l <= (*pos - iter->func_pos); ) { | ||
| 3308 | p = t_mod_next(m, &l); | ||
| 3309 | if (!p) | ||
| 3310 | break; | ||
| 3311 | } | ||
| 3312 | if (!p) { | ||
| 3313 | iter->flags &= ~FTRACE_ITER_MOD; | ||
| 3314 | return t_probe_start(m, pos); | ||
| 3315 | } | ||
| 3316 | |||
| 3317 | /* Only set this if we have an item */ | ||
| 3318 | iter->flags |= FTRACE_ITER_MOD; | ||
| 3319 | |||
| 3320 | return iter; | ||
| 3321 | } | ||
| 3322 | |||
| 3323 | static int | ||
| 3324 | t_mod_show(struct seq_file *m, struct ftrace_iterator *iter) | ||
| 3325 | { | ||
| 3326 | struct ftrace_mod_load *ftrace_mod; | ||
| 3327 | struct trace_array *tr = iter->tr; | ||
| 3328 | |||
| 3329 | if (WARN_ON_ONCE(!iter->mod_list) || | ||
| 3330 | iter->mod_list == &tr->mod_trace || | ||
| 3331 | iter->mod_list == &tr->mod_notrace) | ||
| 3332 | return -EIO; | ||
| 3333 | |||
| 3334 | ftrace_mod = list_entry(iter->mod_list, struct ftrace_mod_load, list); | ||
| 3335 | |||
| 3336 | if (ftrace_mod->func) | ||
| 3337 | seq_printf(m, "%s", ftrace_mod->func); | ||
| 3338 | else | ||
| 3339 | seq_putc(m, '*'); | ||
| 3340 | |||
| 3341 | seq_printf(m, ":mod:%s\n", ftrace_mod->module); | ||
| 3342 | |||
| 3343 | return 0; | ||
| 3344 | } | ||
| 3345 | |||
| 3346 | static void * | ||
| 3200 | t_func_next(struct seq_file *m, loff_t *pos) | 3347 | t_func_next(struct seq_file *m, loff_t *pos) |
| 3201 | { | 3348 | { |
| 3202 | struct ftrace_iterator *iter = m->private; | 3349 | struct ftrace_iterator *iter = m->private; |
| @@ -3237,7 +3384,7 @@ static void * | |||
| 3237 | t_next(struct seq_file *m, void *v, loff_t *pos) | 3384 | t_next(struct seq_file *m, void *v, loff_t *pos) |
| 3238 | { | 3385 | { |
| 3239 | struct ftrace_iterator *iter = m->private; | 3386 | struct ftrace_iterator *iter = m->private; |
| 3240 | loff_t l = *pos; /* t_hash_start() must use original pos */ | 3387 | loff_t l = *pos; /* t_probe_start() must use original pos */ |
| 3241 | void *ret; | 3388 | void *ret; |
| 3242 | 3389 | ||
| 3243 | if (unlikely(ftrace_disabled)) | 3390 | if (unlikely(ftrace_disabled)) |
| @@ -3246,16 +3393,19 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
| 3246 | if (iter->flags & FTRACE_ITER_PROBE) | 3393 | if (iter->flags & FTRACE_ITER_PROBE) |
| 3247 | return t_probe_next(m, pos); | 3394 | return t_probe_next(m, pos); |
| 3248 | 3395 | ||
| 3396 | if (iter->flags & FTRACE_ITER_MOD) | ||
| 3397 | return t_mod_next(m, pos); | ||
| 3398 | |||
| 3249 | if (iter->flags & FTRACE_ITER_PRINTALL) { | 3399 | if (iter->flags & FTRACE_ITER_PRINTALL) { |
| 3250 | /* next must increment pos, and t_probe_start does not */ | 3400 | /* next must increment pos, and t_probe_start does not */ |
| 3251 | (*pos)++; | 3401 | (*pos)++; |
| 3252 | return t_probe_start(m, &l); | 3402 | return t_mod_start(m, &l); |
| 3253 | } | 3403 | } |
| 3254 | 3404 | ||
| 3255 | ret = t_func_next(m, pos); | 3405 | ret = t_func_next(m, pos); |
| 3256 | 3406 | ||
| 3257 | if (!ret) | 3407 | if (!ret) |
| 3258 | return t_probe_start(m, &l); | 3408 | return t_mod_start(m, &l); |
| 3259 | 3409 | ||
| 3260 | return ret; | 3410 | return ret; |
| 3261 | } | 3411 | } |
| @@ -3264,7 +3414,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) | |||
| 3264 | { | 3414 | { |
| 3265 | iter->pos = 0; | 3415 | iter->pos = 0; |
| 3266 | iter->func_pos = 0; | 3416 | iter->func_pos = 0; |
| 3267 | iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE); | 3417 | iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE | FTRACE_ITER_MOD); |
| 3268 | } | 3418 | } |
| 3269 | 3419 | ||
| 3270 | static void *t_start(struct seq_file *m, loff_t *pos) | 3420 | static void *t_start(struct seq_file *m, loff_t *pos) |
| @@ -3293,15 +3443,15 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
| 3293 | ftrace_hash_empty(iter->hash)) { | 3443 | ftrace_hash_empty(iter->hash)) { |
| 3294 | iter->func_pos = 1; /* Account for the message */ | 3444 | iter->func_pos = 1; /* Account for the message */ |
| 3295 | if (*pos > 0) | 3445 | if (*pos > 0) |
| 3296 | return t_probe_start(m, pos); | 3446 | return t_mod_start(m, pos); |
| 3297 | iter->flags |= FTRACE_ITER_PRINTALL; | 3447 | iter->flags |= FTRACE_ITER_PRINTALL; |
| 3298 | /* reset in case of seek/pread */ | 3448 | /* reset in case of seek/pread */ |
| 3299 | iter->flags &= ~FTRACE_ITER_PROBE; | 3449 | iter->flags &= ~FTRACE_ITER_PROBE; |
| 3300 | return iter; | 3450 | return iter; |
| 3301 | } | 3451 | } |
| 3302 | 3452 | ||
| 3303 | if (iter->flags & FTRACE_ITER_PROBE) | 3453 | if (iter->flags & FTRACE_ITER_MOD) |
| 3304 | return t_probe_start(m, pos); | 3454 | return t_mod_start(m, pos); |
| 3305 | 3455 | ||
| 3306 | /* | 3456 | /* |
| 3307 | * Unfortunately, we need to restart at ftrace_pages_start | 3457 | * Unfortunately, we need to restart at ftrace_pages_start |
| @@ -3317,7 +3467,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
| 3317 | } | 3467 | } |
| 3318 | 3468 | ||
| 3319 | if (!p) | 3469 | if (!p) |
| 3320 | return t_probe_start(m, pos); | 3470 | return t_mod_start(m, pos); |
| 3321 | 3471 | ||
| 3322 | return iter; | 3472 | return iter; |
| 3323 | } | 3473 | } |
| @@ -3351,6 +3501,9 @@ static int t_show(struct seq_file *m, void *v) | |||
| 3351 | if (iter->flags & FTRACE_ITER_PROBE) | 3501 | if (iter->flags & FTRACE_ITER_PROBE) |
| 3352 | return t_probe_show(m, iter); | 3502 | return t_probe_show(m, iter); |
| 3353 | 3503 | ||
| 3504 | if (iter->flags & FTRACE_ITER_MOD) | ||
| 3505 | return t_mod_show(m, iter); | ||
| 3506 | |||
| 3354 | if (iter->flags & FTRACE_ITER_PRINTALL) { | 3507 | if (iter->flags & FTRACE_ITER_PRINTALL) { |
| 3355 | if (iter->flags & FTRACE_ITER_NOTRACE) | 3508 | if (iter->flags & FTRACE_ITER_NOTRACE) |
| 3356 | seq_puts(m, "#### no functions disabled ####\n"); | 3509 | seq_puts(m, "#### no functions disabled ####\n"); |
| @@ -3457,6 +3610,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, | |||
| 3457 | { | 3610 | { |
| 3458 | struct ftrace_iterator *iter; | 3611 | struct ftrace_iterator *iter; |
| 3459 | struct ftrace_hash *hash; | 3612 | struct ftrace_hash *hash; |
| 3613 | struct list_head *mod_head; | ||
| 3614 | struct trace_array *tr = ops->private; | ||
| 3460 | int ret = 0; | 3615 | int ret = 0; |
| 3461 | 3616 | ||
| 3462 | ftrace_ops_init(ops); | 3617 | ftrace_ops_init(ops); |
| @@ -3475,21 +3630,29 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, | |||
| 3475 | 3630 | ||
| 3476 | iter->ops = ops; | 3631 | iter->ops = ops; |
| 3477 | iter->flags = flag; | 3632 | iter->flags = flag; |
| 3633 | iter->tr = tr; | ||
| 3478 | 3634 | ||
| 3479 | mutex_lock(&ops->func_hash->regex_lock); | 3635 | mutex_lock(&ops->func_hash->regex_lock); |
| 3480 | 3636 | ||
| 3481 | if (flag & FTRACE_ITER_NOTRACE) | 3637 | if (flag & FTRACE_ITER_NOTRACE) { |
| 3482 | hash = ops->func_hash->notrace_hash; | 3638 | hash = ops->func_hash->notrace_hash; |
| 3483 | else | 3639 | mod_head = tr ? &tr->mod_notrace : NULL; |
| 3640 | } else { | ||
| 3484 | hash = ops->func_hash->filter_hash; | 3641 | hash = ops->func_hash->filter_hash; |
| 3642 | mod_head = tr ? &tr->mod_trace : NULL; | ||
| 3643 | } | ||
| 3644 | |||
| 3645 | iter->mod_list = mod_head; | ||
| 3485 | 3646 | ||
| 3486 | if (file->f_mode & FMODE_WRITE) { | 3647 | if (file->f_mode & FMODE_WRITE) { |
| 3487 | const int size_bits = FTRACE_HASH_DEFAULT_BITS; | 3648 | const int size_bits = FTRACE_HASH_DEFAULT_BITS; |
| 3488 | 3649 | ||
| 3489 | if (file->f_flags & O_TRUNC) | 3650 | if (file->f_flags & O_TRUNC) { |
| 3490 | iter->hash = alloc_ftrace_hash(size_bits); | 3651 | iter->hash = alloc_ftrace_hash(size_bits); |
| 3491 | else | 3652 | clear_ftrace_mod_list(mod_head); |
| 3653 | } else { | ||
| 3492 | iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); | 3654 | iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); |
| 3655 | } | ||
| 3493 | 3656 | ||
| 3494 | if (!iter->hash) { | 3657 | if (!iter->hash) { |
| 3495 | trace_parser_put(&iter->parser); | 3658 | trace_parser_put(&iter->parser); |
| @@ -3665,7 +3828,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) | |||
| 3665 | int exclude_mod = 0; | 3828 | int exclude_mod = 0; |
| 3666 | int found = 0; | 3829 | int found = 0; |
| 3667 | int ret; | 3830 | int ret; |
| 3668 | int clear_filter; | 3831 | int clear_filter = 0; |
| 3669 | 3832 | ||
| 3670 | if (func) { | 3833 | if (func) { |
| 3671 | func_g.type = filter_parse_regex(func, len, &func_g.search, | 3834 | func_g.type = filter_parse_regex(func, len, &func_g.search, |
| @@ -3761,6 +3924,165 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, | |||
| 3761 | return ret; | 3924 | return ret; |
| 3762 | } | 3925 | } |
| 3763 | 3926 | ||
| 3927 | static bool module_exists(const char *module) | ||
| 3928 | { | ||
| 3929 | /* All modules have the symbol __this_module */ | ||
| 3930 | const char this_mod[] = "__this_module"; | ||
| 3931 | const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1; | ||
| 3932 | char modname[modname_size + 1]; | ||
| 3933 | unsigned long val; | ||
| 3934 | int n; | ||
| 3935 | |||
| 3936 | n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod); | ||
| 3937 | |||
| 3938 | if (n > modname_size) | ||
| 3939 | return false; | ||
| 3940 | |||
| 3941 | val = module_kallsyms_lookup_name(modname); | ||
| 3942 | return val != 0; | ||
| 3943 | } | ||
| 3944 | |||
| 3945 | static int cache_mod(struct trace_array *tr, | ||
| 3946 | const char *func, char *module, int enable) | ||
| 3947 | { | ||
| 3948 | struct ftrace_mod_load *ftrace_mod, *n; | ||
| 3949 | struct list_head *head = enable ? &tr->mod_trace : &tr->mod_notrace; | ||
| 3950 | int ret; | ||
| 3951 | |||
| 3952 | mutex_lock(&ftrace_lock); | ||
| 3953 | |||
| 3954 | /* We do not cache inverse filters */ | ||
| 3955 | if (func[0] == '!') { | ||
| 3956 | func++; | ||
| 3957 | ret = -EINVAL; | ||
| 3958 | |||
| 3959 | /* Look to remove this hash */ | ||
| 3960 | list_for_each_entry_safe(ftrace_mod, n, head, list) { | ||
| 3961 | if (strcmp(ftrace_mod->module, module) != 0) | ||
| 3962 | continue; | ||
| 3963 | |||
| 3964 | /* no func matches all */ | ||
| 3965 | if (strcmp(func, "*") == 0 || | ||
| 3966 | (ftrace_mod->func && | ||
| 3967 | strcmp(ftrace_mod->func, func) == 0)) { | ||
| 3968 | ret = 0; | ||
| 3969 | free_ftrace_mod(ftrace_mod); | ||
| 3970 | continue; | ||
| 3971 | } | ||
| 3972 | } | ||
| 3973 | goto out; | ||
| 3974 | } | ||
| 3975 | |||
| 3976 | ret = -EINVAL; | ||
| 3977 | /* We only care about modules that have not been loaded yet */ | ||
| 3978 | if (module_exists(module)) | ||
| 3979 | goto out; | ||
| 3980 | |||
| 3981 | /* Save this string off, and execute it when the module is loaded */ | ||
| 3982 | ret = ftrace_add_mod(tr, func, module, enable); | ||
| 3983 | out: | ||
| 3984 | mutex_unlock(&ftrace_lock); | ||
| 3985 | |||
| 3986 | return ret; | ||
| 3987 | } | ||
| 3988 | |||
| 3989 | static int | ||
| 3990 | ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, | ||
| 3991 | int reset, int enable); | ||
| 3992 | |||
| 3993 | #ifdef CONFIG_MODULES | ||
| 3994 | static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, | ||
| 3995 | char *mod, bool enable) | ||
| 3996 | { | ||
| 3997 | struct ftrace_mod_load *ftrace_mod, *n; | ||
| 3998 | struct ftrace_hash **orig_hash, *new_hash; | ||
| 3999 | LIST_HEAD(process_mods); | ||
| 4000 | char *func; | ||
| 4001 | int ret; | ||
| 4002 | |||
| 4003 | mutex_lock(&ops->func_hash->regex_lock); | ||
| 4004 | |||
| 4005 | if (enable) | ||
| 4006 | orig_hash = &ops->func_hash->filter_hash; | ||
| 4007 | else | ||
| 4008 | orig_hash = &ops->func_hash->notrace_hash; | ||
| 4009 | |||
| 4010 | new_hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, | ||
| 4011 | *orig_hash); | ||
| 4012 | if (!new_hash) | ||
| 4013 | goto out; /* warn? */ | ||
| 4014 | |||
| 4015 | mutex_lock(&ftrace_lock); | ||
| 4016 | |||
| 4017 | list_for_each_entry_safe(ftrace_mod, n, head, list) { | ||
| 4018 | |||
| 4019 | if (strcmp(ftrace_mod->module, mod) != 0) | ||
| 4020 | continue; | ||
| 4021 | |||
| 4022 | if (ftrace_mod->func) | ||
| 4023 | func = kstrdup(ftrace_mod->func, GFP_KERNEL); | ||
| 4024 | else | ||
| 4025 | func = kstrdup("*", GFP_KERNEL); | ||
| 4026 | |||
| 4027 | if (!func) /* warn? */ | ||
| 4028 | continue; | ||
| 4029 | |||
| 4030 | list_del(&ftrace_mod->list); | ||
| 4031 | list_add(&ftrace_mod->list, &process_mods); | ||
| 4032 | |||
| 4033 | /* Use the newly allocated func, as it may be "*" */ | ||
| 4034 | kfree(ftrace_mod->func); | ||
| 4035 | ftrace_mod->func = func; | ||
| 4036 | } | ||
| 4037 | |||
| 4038 | mutex_unlock(&ftrace_lock); | ||
| 4039 | |||
| 4040 | list_for_each_entry_safe(ftrace_mod, n, &process_mods, list) { | ||
| 4041 | |||
| 4042 | func = ftrace_mod->func; | ||
| 4043 | |||
| 4044 | /* Grabs ftrace_lock, which is why we have this extra step */ | ||
| 4045 | match_records(new_hash, func, strlen(func), mod); | ||
| 4046 | free_ftrace_mod(ftrace_mod); | ||
| 4047 | } | ||
| 4048 | |||
| 4049 | if (enable && list_empty(head)) | ||
| 4050 | new_hash->flags &= ~FTRACE_HASH_FL_MOD; | ||
| 4051 | |||
| 4052 | mutex_lock(&ftrace_lock); | ||
| 4053 | |||
| 4054 | ret = ftrace_hash_move_and_update_ops(ops, orig_hash, | ||
| 4055 | new_hash, enable); | ||
| 4056 | mutex_unlock(&ftrace_lock); | ||
| 4057 | |||
| 4058 | out: | ||
| 4059 | mutex_unlock(&ops->func_hash->regex_lock); | ||
| 4060 | |||
| 4061 | free_ftrace_hash(new_hash); | ||
| 4062 | } | ||
| 4063 | |||
| 4064 | static void process_cached_mods(const char *mod_name) | ||
| 4065 | { | ||
| 4066 | struct trace_array *tr; | ||
| 4067 | char *mod; | ||
| 4068 | |||
| 4069 | mod = kstrdup(mod_name, GFP_KERNEL); | ||
| 4070 | if (!mod) | ||
| 4071 | return; | ||
| 4072 | |||
| 4073 | mutex_lock(&trace_types_lock); | ||
| 4074 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { | ||
| 4075 | if (!list_empty(&tr->mod_trace)) | ||
| 4076 | process_mod_list(&tr->mod_trace, tr->ops, mod, true); | ||
| 4077 | if (!list_empty(&tr->mod_notrace)) | ||
| 4078 | process_mod_list(&tr->mod_notrace, tr->ops, mod, false); | ||
| 4079 | } | ||
| 4080 | mutex_unlock(&trace_types_lock); | ||
| 4081 | |||
| 4082 | kfree(mod); | ||
| 4083 | } | ||
| 4084 | #endif | ||
| 4085 | |||
| 3764 | /* | 4086 | /* |
| 3765 | * We register the module command as a template to show others how | 4087 | * We register the module command as a template to show others how |
| 3766 | * to register the a command as well. | 4088 | * to register the a command as well. |
| @@ -3768,10 +4090,16 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, | |||
| 3768 | 4090 | ||
| 3769 | static int | 4091 | static int |
| 3770 | ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, | 4092 | ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, |
| 3771 | char *func, char *cmd, char *module, int enable) | 4093 | char *func_orig, char *cmd, char *module, int enable) |
| 3772 | { | 4094 | { |
| 4095 | char *func; | ||
| 3773 | int ret; | 4096 | int ret; |
| 3774 | 4097 | ||
| 4098 | /* match_records() modifies func, and we need the original */ | ||
| 4099 | func = kstrdup(func_orig, GFP_KERNEL); | ||
| 4100 | if (!func) | ||
| 4101 | return -ENOMEM; | ||
| 4102 | |||
| 3775 | /* | 4103 | /* |
| 3776 | * cmd == 'mod' because we only registered this func | 4104 | * cmd == 'mod' because we only registered this func |
| 3777 | * for the 'mod' ftrace_func_command. | 4105 | * for the 'mod' ftrace_func_command. |
| @@ -3780,8 +4108,10 @@ ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, | |||
| 3780 | * parameter. | 4108 | * parameter. |
| 3781 | */ | 4109 | */ |
| 3782 | ret = match_records(hash, func, strlen(func), module); | 4110 | ret = match_records(hash, func, strlen(func), module); |
| 4111 | kfree(func); | ||
| 4112 | |||
| 3783 | if (!ret) | 4113 | if (!ret) |
| 3784 | return -EINVAL; | 4114 | return cache_mod(tr, func_orig, module, enable); |
| 3785 | if (ret < 0) | 4115 | if (ret < 0) |
| 3786 | return ret; | 4116 | return ret; |
| 3787 | return 0; | 4117 | return 0; |
| @@ -4725,9 +5055,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file) | |||
| 4725 | if (file->f_mode & FMODE_WRITE) { | 5055 | if (file->f_mode & FMODE_WRITE) { |
| 4726 | filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); | 5056 | filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); |
| 4727 | 5057 | ||
| 4728 | if (filter_hash) | 5058 | if (filter_hash) { |
| 4729 | orig_hash = &iter->ops->func_hash->filter_hash; | 5059 | orig_hash = &iter->ops->func_hash->filter_hash; |
| 4730 | else | 5060 | if (iter->tr && !list_empty(&iter->tr->mod_trace)) |
| 5061 | iter->hash->flags |= FTRACE_HASH_FL_MOD; | ||
| 5062 | } else | ||
| 4731 | orig_hash = &iter->ops->func_hash->notrace_hash; | 5063 | orig_hash = &iter->ops->func_hash->notrace_hash; |
| 4732 | 5064 | ||
| 4733 | mutex_lock(&ftrace_lock); | 5065 | mutex_lock(&ftrace_lock); |
| @@ -5385,6 +5717,7 @@ void ftrace_release_mod(struct module *mod) | |||
| 5385 | if (pg == ftrace_pages) | 5717 | if (pg == ftrace_pages) |
| 5386 | ftrace_pages = next_to_ftrace_page(last_pg); | 5718 | ftrace_pages = next_to_ftrace_page(last_pg); |
| 5387 | 5719 | ||
| 5720 | ftrace_update_tot_cnt -= pg->index; | ||
| 5388 | *last_pg = pg->next; | 5721 | *last_pg = pg->next; |
| 5389 | order = get_count_order(pg->size / ENTRIES_PER_PAGE); | 5722 | order = get_count_order(pg->size / ENTRIES_PER_PAGE); |
| 5390 | free_pages((unsigned long)pg->records, order); | 5723 | free_pages((unsigned long)pg->records, order); |
| @@ -5463,6 +5796,8 @@ void ftrace_module_enable(struct module *mod) | |||
| 5463 | 5796 | ||
| 5464 | out_unlock: | 5797 | out_unlock: |
| 5465 | mutex_unlock(&ftrace_lock); | 5798 | mutex_unlock(&ftrace_lock); |
| 5799 | |||
| 5800 | process_cached_mods(mod->name); | ||
| 5466 | } | 5801 | } |
| 5467 | 5802 | ||
| 5468 | void ftrace_module_init(struct module *mod) | 5803 | void ftrace_module_init(struct module *mod) |
| @@ -5501,6 +5836,7 @@ void __init ftrace_free_init_mem(void) | |||
| 5501 | if (!rec) | 5836 | if (!rec) |
| 5502 | continue; | 5837 | continue; |
| 5503 | pg->index--; | 5838 | pg->index--; |
| 5839 | ftrace_update_tot_cnt--; | ||
| 5504 | if (!pg->index) { | 5840 | if (!pg->index) { |
| 5505 | *last_pg = pg->next; | 5841 | *last_pg = pg->next; |
| 5506 | order = get_count_order(pg->size / ENTRIES_PER_PAGE); | 5842 | order = get_count_order(pg->size / ENTRIES_PER_PAGE); |
| @@ -5567,6 +5903,8 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) | |||
| 5567 | void ftrace_init_trace_array(struct trace_array *tr) | 5903 | void ftrace_init_trace_array(struct trace_array *tr) |
| 5568 | { | 5904 | { |
| 5569 | INIT_LIST_HEAD(&tr->func_probes); | 5905 | INIT_LIST_HEAD(&tr->func_probes); |
| 5906 | INIT_LIST_HEAD(&tr->mod_trace); | ||
| 5907 | INIT_LIST_HEAD(&tr->mod_notrace); | ||
| 5570 | } | 5908 | } |
| 5571 | #else | 5909 | #else |
| 5572 | 5910 | ||
| @@ -6127,7 +6465,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, | |||
| 6127 | if (ftrace_enabled) { | 6465 | if (ftrace_enabled) { |
| 6128 | 6466 | ||
| 6129 | /* we are starting ftrace again */ | 6467 | /* we are starting ftrace again */ |
| 6130 | if (ftrace_ops_list != &ftrace_list_end) | 6468 | if (rcu_dereference_protected(ftrace_ops_list, |
| 6469 | lockdep_is_held(&ftrace_lock)) != &ftrace_list_end) | ||
| 6131 | update_ftrace_function(); | 6470 | update_ftrace_function(); |
| 6132 | 6471 | ||
| 6133 | ftrace_startup_sysctl(); | 6472 | ftrace_startup_sysctl(); |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 4ae268e687fe..529cc50d7243 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -1136,12 +1136,12 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) | |||
| 1136 | for (i = 0; i < nr_pages; i++) { | 1136 | for (i = 0; i < nr_pages; i++) { |
| 1137 | struct page *page; | 1137 | struct page *page; |
| 1138 | /* | 1138 | /* |
| 1139 | * __GFP_NORETRY flag makes sure that the allocation fails | 1139 | * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails |
| 1140 | * gracefully without invoking oom-killer and the system is | 1140 | * gracefully without invoking oom-killer and the system is not |
| 1141 | * not destabilized. | 1141 | * destabilized. |
| 1142 | */ | 1142 | */ |
| 1143 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 1143 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
| 1144 | GFP_KERNEL | __GFP_NORETRY, | 1144 | GFP_KERNEL | __GFP_RETRY_MAYFAIL, |
| 1145 | cpu_to_node(cpu)); | 1145 | cpu_to_node(cpu)); |
| 1146 | if (!bpage) | 1146 | if (!bpage) |
| 1147 | goto free_pages; | 1147 | goto free_pages; |
| @@ -1149,7 +1149,7 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) | |||
| 1149 | list_add(&bpage->list, pages); | 1149 | list_add(&bpage->list, pages); |
| 1150 | 1150 | ||
| 1151 | page = alloc_pages_node(cpu_to_node(cpu), | 1151 | page = alloc_pages_node(cpu_to_node(cpu), |
| 1152 | GFP_KERNEL | __GFP_NORETRY, 0); | 1152 | GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0); |
| 1153 | if (!page) | 1153 | if (!page) |
| 1154 | goto free_pages; | 1154 | goto free_pages; |
| 1155 | bpage->page = page_address(page); | 1155 | bpage->page = page_address(page); |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 091e801145c9..42b9355033d4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -87,7 +87,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) | |||
| 87 | * tracing is active, only save the comm when a trace event | 87 | * tracing is active, only save the comm when a trace event |
| 88 | * occurred. | 88 | * occurred. |
| 89 | */ | 89 | */ |
| 90 | static DEFINE_PER_CPU(bool, trace_cmdline_save); | 90 | static DEFINE_PER_CPU(bool, trace_taskinfo_save); |
| 91 | 91 | ||
| 92 | /* | 92 | /* |
| 93 | * Kill all tracing for good (never come back). | 93 | * Kill all tracing for good (never come back). |
| @@ -120,41 +120,41 @@ enum ftrace_dump_mode ftrace_dump_on_oops; | |||
| 120 | /* When set, tracing will stop when a WARN*() is hit */ | 120 | /* When set, tracing will stop when a WARN*() is hit */ |
| 121 | int __disable_trace_on_warning; | 121 | int __disable_trace_on_warning; |
| 122 | 122 | ||
| 123 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE | 123 | #ifdef CONFIG_TRACE_EVAL_MAP_FILE |
| 124 | /* Map of enums to their values, for "enum_map" file */ | 124 | /* Map of enums to their values, for "eval_map" file */ |
| 125 | struct trace_enum_map_head { | 125 | struct trace_eval_map_head { |
| 126 | struct module *mod; | 126 | struct module *mod; |
| 127 | unsigned long length; | 127 | unsigned long length; |
| 128 | }; | 128 | }; |
| 129 | 129 | ||
| 130 | union trace_enum_map_item; | 130 | union trace_eval_map_item; |
| 131 | 131 | ||
| 132 | struct trace_enum_map_tail { | 132 | struct trace_eval_map_tail { |
| 133 | /* | 133 | /* |
| 134 | * "end" is first and points to NULL as it must be different | 134 | * "end" is first and points to NULL as it must be different |
| 135 | * than "mod" or "enum_string" | 135 | * than "mod" or "eval_string" |
| 136 | */ | 136 | */ |
| 137 | union trace_enum_map_item *next; | 137 | union trace_eval_map_item *next; |
| 138 | const char *end; /* points to NULL */ | 138 | const char *end; /* points to NULL */ |
| 139 | }; | 139 | }; |
| 140 | 140 | ||
| 141 | static DEFINE_MUTEX(trace_enum_mutex); | 141 | static DEFINE_MUTEX(trace_eval_mutex); |
| 142 | 142 | ||
| 143 | /* | 143 | /* |
| 144 | * The trace_enum_maps are saved in an array with two extra elements, | 144 | * The trace_eval_maps are saved in an array with two extra elements, |
| 145 | * one at the beginning, and one at the end. The beginning item contains | 145 | * one at the beginning, and one at the end. The beginning item contains |
| 146 | * the count of the saved maps (head.length), and the module they | 146 | * the count of the saved maps (head.length), and the module they |
| 147 | * belong to if not built in (head.mod). The ending item contains a | 147 | * belong to if not built in (head.mod). The ending item contains a |
| 148 | * pointer to the next array of saved enum_map items. | 148 | * pointer to the next array of saved eval_map items. |
| 149 | */ | 149 | */ |
| 150 | union trace_enum_map_item { | 150 | union trace_eval_map_item { |
| 151 | struct trace_enum_map map; | 151 | struct trace_eval_map map; |
| 152 | struct trace_enum_map_head head; | 152 | struct trace_eval_map_head head; |
| 153 | struct trace_enum_map_tail tail; | 153 | struct trace_eval_map_tail tail; |
| 154 | }; | 154 | }; |
| 155 | 155 | ||
| 156 | static union trace_enum_map_item *trace_enum_maps; | 156 | static union trace_eval_map_item *trace_eval_maps; |
| 157 | #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ | 157 | #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ |
| 158 | 158 | ||
| 159 | static int tracing_set_tracer(struct trace_array *tr, const char *buf); | 159 | static int tracing_set_tracer(struct trace_array *tr, const char *buf); |
| 160 | 160 | ||
| @@ -790,7 +790,7 @@ EXPORT_SYMBOL_GPL(tracing_on); | |||
| 790 | static __always_inline void | 790 | static __always_inline void |
| 791 | __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) | 791 | __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) |
| 792 | { | 792 | { |
| 793 | __this_cpu_write(trace_cmdline_save, true); | 793 | __this_cpu_write(trace_taskinfo_save, true); |
| 794 | 794 | ||
| 795 | /* If this is the temp buffer, we need to commit fully */ | 795 | /* If this is the temp buffer, we need to commit fully */ |
| 796 | if (this_cpu_read(trace_buffered_event) == event) { | 796 | if (this_cpu_read(trace_buffered_event) == event) { |
| @@ -1141,9 +1141,9 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) | |||
| 1141 | 1141 | ||
| 1142 | /* | 1142 | /* |
| 1143 | * TRACE_FLAGS is defined as a tuple matching bit masks with strings. | 1143 | * TRACE_FLAGS is defined as a tuple matching bit masks with strings. |
| 1144 | * It uses C(a, b) where 'a' is the enum name and 'b' is the string that | 1144 | * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that |
| 1145 | * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list | 1145 | * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list |
| 1146 | * of strings in the order that the enums were defined. | 1146 | * of strings in the order that the evals (enum) were defined. |
| 1147 | */ | 1147 | */ |
| 1148 | #undef C | 1148 | #undef C |
| 1149 | #define C(a, b) b | 1149 | #define C(a, b) b |
| @@ -1709,6 +1709,8 @@ void tracing_reset_all_online_cpus(void) | |||
| 1709 | } | 1709 | } |
| 1710 | } | 1710 | } |
| 1711 | 1711 | ||
| 1712 | static int *tgid_map; | ||
| 1713 | |||
| 1712 | #define SAVED_CMDLINES_DEFAULT 128 | 1714 | #define SAVED_CMDLINES_DEFAULT 128 |
| 1713 | #define NO_CMDLINE_MAP UINT_MAX | 1715 | #define NO_CMDLINE_MAP UINT_MAX |
| 1714 | static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; | 1716 | static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; |
| @@ -1722,7 +1724,7 @@ struct saved_cmdlines_buffer { | |||
| 1722 | static struct saved_cmdlines_buffer *savedcmd; | 1724 | static struct saved_cmdlines_buffer *savedcmd; |
| 1723 | 1725 | ||
| 1724 | /* temporary disable recording */ | 1726 | /* temporary disable recording */ |
| 1725 | static atomic_t trace_record_cmdline_disabled __read_mostly; | 1727 | static atomic_t trace_record_taskinfo_disabled __read_mostly; |
| 1726 | 1728 | ||
| 1727 | static inline char *get_saved_cmdlines(int idx) | 1729 | static inline char *get_saved_cmdlines(int idx) |
| 1728 | { | 1730 | { |
| @@ -1910,13 +1912,15 @@ static void tracing_stop_tr(struct trace_array *tr) | |||
| 1910 | raw_spin_unlock_irqrestore(&tr->start_lock, flags); | 1912 | raw_spin_unlock_irqrestore(&tr->start_lock, flags); |
| 1911 | } | 1913 | } |
| 1912 | 1914 | ||
| 1913 | void trace_stop_cmdline_recording(void); | ||
| 1914 | |||
| 1915 | static int trace_save_cmdline(struct task_struct *tsk) | 1915 | static int trace_save_cmdline(struct task_struct *tsk) |
| 1916 | { | 1916 | { |
| 1917 | unsigned pid, idx; | 1917 | unsigned pid, idx; |
| 1918 | 1918 | ||
| 1919 | if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) | 1919 | /* treat recording of idle task as a success */ |
| 1920 | if (!tsk->pid) | ||
| 1921 | return 1; | ||
| 1922 | |||
| 1923 | if (unlikely(tsk->pid > PID_MAX_DEFAULT)) | ||
| 1920 | return 0; | 1924 | return 0; |
| 1921 | 1925 | ||
| 1922 | /* | 1926 | /* |
| @@ -1992,16 +1996,107 @@ void trace_find_cmdline(int pid, char comm[]) | |||
| 1992 | preempt_enable(); | 1996 | preempt_enable(); |
| 1993 | } | 1997 | } |
| 1994 | 1998 | ||
| 1995 | void tracing_record_cmdline(struct task_struct *tsk) | 1999 | int trace_find_tgid(int pid) |
| 1996 | { | 2000 | { |
| 1997 | if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) | 2001 | if (unlikely(!tgid_map || !pid || pid > PID_MAX_DEFAULT)) |
| 2002 | return 0; | ||
| 2003 | |||
| 2004 | return tgid_map[pid]; | ||
| 2005 | } | ||
| 2006 | |||
| 2007 | static int trace_save_tgid(struct task_struct *tsk) | ||
| 2008 | { | ||
| 2009 | /* treat recording of idle task as a success */ | ||
| 2010 | if (!tsk->pid) | ||
| 2011 | return 1; | ||
| 2012 | |||
| 2013 | if (unlikely(!tgid_map || tsk->pid > PID_MAX_DEFAULT)) | ||
| 2014 | return 0; | ||
| 2015 | |||
| 2016 | tgid_map[tsk->pid] = tsk->tgid; | ||
| 2017 | return 1; | ||
| 2018 | } | ||
| 2019 | |||
| 2020 | static bool tracing_record_taskinfo_skip(int flags) | ||
| 2021 | { | ||
| 2022 | if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) | ||
| 2023 | return true; | ||
| 2024 | if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on()) | ||
| 2025 | return true; | ||
| 2026 | if (!__this_cpu_read(trace_taskinfo_save)) | ||
| 2027 | return true; | ||
| 2028 | return false; | ||
| 2029 | } | ||
| 2030 | |||
| 2031 | /** | ||
| 2032 | * tracing_record_taskinfo - record the task info of a task | ||
| 2033 | * | ||
| 2034 | * @task - task to record | ||
| 2035 | * @flags - TRACE_RECORD_CMDLINE for recording comm | ||
| 2036 | * - TRACE_RECORD_TGID for recording tgid | ||
| 2037 | */ | ||
| 2038 | void tracing_record_taskinfo(struct task_struct *task, int flags) | ||
| 2039 | { | ||
| 2040 | bool done; | ||
| 2041 | |||
| 2042 | if (tracing_record_taskinfo_skip(flags)) | ||
| 1998 | return; | 2043 | return; |
| 1999 | 2044 | ||
| 2000 | if (!__this_cpu_read(trace_cmdline_save)) | 2045 | /* |
| 2046 | * Record as much task information as possible. If some fail, continue | ||
| 2047 | * to try to record the others. | ||
| 2048 | */ | ||
| 2049 | done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); | ||
| 2050 | done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); | ||
| 2051 | |||
| 2052 | /* If recording any information failed, retry again soon. */ | ||
| 2053 | if (!done) | ||
| 2001 | return; | 2054 | return; |
| 2002 | 2055 | ||
| 2003 | if (trace_save_cmdline(tsk)) | 2056 | __this_cpu_write(trace_taskinfo_save, false); |
| 2004 | __this_cpu_write(trace_cmdline_save, false); | 2057 | } |
| 2058 | |||
| 2059 | /** | ||
| 2060 | * tracing_record_taskinfo_sched_switch - record task info for sched_switch | ||
| 2061 | * | ||
| 2062 | * @prev - previous task during sched_switch | ||
| 2063 | * @next - next task during sched_switch | ||
| 2064 | * @flags - TRACE_RECORD_CMDLINE for recording comm | ||
| 2065 | * TRACE_RECORD_TGID for recording tgid | ||
| 2066 | */ | ||
| 2067 | void tracing_record_taskinfo_sched_switch(struct task_struct *prev, | ||
| 2068 | struct task_struct *next, int flags) | ||
| 2069 | { | ||
| 2070 | bool done; | ||
| 2071 | |||
| 2072 | if (tracing_record_taskinfo_skip(flags)) | ||
| 2073 | return; | ||
| 2074 | |||
| 2075 | /* | ||
| 2076 | * Record as much task information as possible. If some fail, continue | ||
| 2077 | * to try to record the others. | ||
| 2078 | */ | ||
| 2079 | done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); | ||
| 2080 | done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); | ||
| 2081 | done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); | ||
| 2082 | done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); | ||
| 2083 | |||
| 2084 | /* If recording any information failed, retry again soon. */ | ||
| 2085 | if (!done) | ||
| 2086 | return; | ||
| 2087 | |||
| 2088 | __this_cpu_write(trace_taskinfo_save, false); | ||
| 2089 | } | ||
| 2090 | |||
| 2091 | /* Helpers to record a specific task information */ | ||
| 2092 | void tracing_record_cmdline(struct task_struct *task) | ||
| 2093 | { | ||
| 2094 | tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); | ||
| 2095 | } | ||
| 2096 | |||
| 2097 | void tracing_record_tgid(struct task_struct *task) | ||
| 2098 | { | ||
| 2099 | tracing_record_taskinfo(task, TRACE_RECORD_TGID); | ||
| 2005 | } | 2100 | } |
| 2006 | 2101 | ||
| 2007 | /* | 2102 | /* |
| @@ -3146,7 +3241,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
| 3146 | #endif | 3241 | #endif |
| 3147 | 3242 | ||
| 3148 | if (!iter->snapshot) | 3243 | if (!iter->snapshot) |
| 3149 | atomic_inc(&trace_record_cmdline_disabled); | 3244 | atomic_inc(&trace_record_taskinfo_disabled); |
| 3150 | 3245 | ||
| 3151 | if (*pos != iter->pos) { | 3246 | if (*pos != iter->pos) { |
| 3152 | iter->ent = NULL; | 3247 | iter->ent = NULL; |
| @@ -3191,7 +3286,7 @@ static void s_stop(struct seq_file *m, void *p) | |||
| 3191 | #endif | 3286 | #endif |
| 3192 | 3287 | ||
| 3193 | if (!iter->snapshot) | 3288 | if (!iter->snapshot) |
| 3194 | atomic_dec(&trace_record_cmdline_disabled); | 3289 | atomic_dec(&trace_record_taskinfo_disabled); |
| 3195 | 3290 | ||
| 3196 | trace_access_unlock(iter->cpu_file); | 3291 | trace_access_unlock(iter->cpu_file); |
| 3197 | trace_event_read_unlock(); | 3292 | trace_event_read_unlock(); |
| @@ -3248,23 +3343,38 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m) | |||
| 3248 | seq_puts(m, "#\n"); | 3343 | seq_puts(m, "#\n"); |
| 3249 | } | 3344 | } |
| 3250 | 3345 | ||
| 3251 | static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) | 3346 | static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m, |
| 3347 | unsigned int flags) | ||
| 3252 | { | 3348 | { |
| 3349 | bool tgid = flags & TRACE_ITER_RECORD_TGID; | ||
| 3350 | |||
| 3253 | print_event_info(buf, m); | 3351 | print_event_info(buf, m); |
| 3254 | seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n" | 3352 | |
| 3255 | "# | | | | |\n"); | 3353 | seq_printf(m, "# TASK-PID CPU# %s TIMESTAMP FUNCTION\n", tgid ? "TGID " : ""); |
| 3354 | seq_printf(m, "# | | | %s | |\n", tgid ? " | " : ""); | ||
| 3256 | } | 3355 | } |
| 3257 | 3356 | ||
| 3258 | static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) | 3357 | static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m, |
| 3358 | unsigned int flags) | ||
| 3259 | { | 3359 | { |
| 3260 | print_event_info(buf, m); | 3360 | bool tgid = flags & TRACE_ITER_RECORD_TGID; |
| 3261 | seq_puts(m, "# _-----=> irqs-off\n" | 3361 | const char tgid_space[] = " "; |
| 3262 | "# / _----=> need-resched\n" | 3362 | const char space[] = " "; |
| 3263 | "# | / _---=> hardirq/softirq\n" | 3363 | |
| 3264 | "# || / _--=> preempt-depth\n" | 3364 | seq_printf(m, "# %s _-----=> irqs-off\n", |
| 3265 | "# ||| / delay\n" | 3365 | tgid ? tgid_space : space); |
| 3266 | "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n" | 3366 | seq_printf(m, "# %s / _----=> need-resched\n", |
| 3267 | "# | | | |||| | |\n"); | 3367 | tgid ? tgid_space : space); |
| 3368 | seq_printf(m, "# %s| / _---=> hardirq/softirq\n", | ||
| 3369 | tgid ? tgid_space : space); | ||
| 3370 | seq_printf(m, "# %s|| / _--=> preempt-depth\n", | ||
| 3371 | tgid ? tgid_space : space); | ||
| 3372 | seq_printf(m, "# %s||| / delay\n", | ||
| 3373 | tgid ? tgid_space : space); | ||
| 3374 | seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n", | ||
| 3375 | tgid ? " TGID " : space); | ||
| 3376 | seq_printf(m, "# | | | %s|||| | |\n", | ||
| 3377 | tgid ? " | " : space); | ||
| 3268 | } | 3378 | } |
| 3269 | 3379 | ||
| 3270 | void | 3380 | void |
| @@ -3580,9 +3690,11 @@ void trace_default_header(struct seq_file *m) | |||
| 3580 | } else { | 3690 | } else { |
| 3581 | if (!(trace_flags & TRACE_ITER_VERBOSE)) { | 3691 | if (!(trace_flags & TRACE_ITER_VERBOSE)) { |
| 3582 | if (trace_flags & TRACE_ITER_IRQ_INFO) | 3692 | if (trace_flags & TRACE_ITER_IRQ_INFO) |
| 3583 | print_func_help_header_irq(iter->trace_buffer, m); | 3693 | print_func_help_header_irq(iter->trace_buffer, |
| 3694 | m, trace_flags); | ||
| 3584 | else | 3695 | else |
| 3585 | print_func_help_header(iter->trace_buffer, m); | 3696 | print_func_help_header(iter->trace_buffer, m, |
| 3697 | trace_flags); | ||
| 3586 | } | 3698 | } |
| 3587 | } | 3699 | } |
| 3588 | } | 3700 | } |
| @@ -4238,6 +4350,18 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) | |||
| 4238 | if (mask == TRACE_ITER_RECORD_CMD) | 4350 | if (mask == TRACE_ITER_RECORD_CMD) |
| 4239 | trace_event_enable_cmd_record(enabled); | 4351 | trace_event_enable_cmd_record(enabled); |
| 4240 | 4352 | ||
| 4353 | if (mask == TRACE_ITER_RECORD_TGID) { | ||
| 4354 | if (!tgid_map) | ||
| 4355 | tgid_map = kzalloc((PID_MAX_DEFAULT + 1) * sizeof(*tgid_map), | ||
| 4356 | GFP_KERNEL); | ||
| 4357 | if (!tgid_map) { | ||
| 4358 | tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; | ||
| 4359 | return -ENOMEM; | ||
| 4360 | } | ||
| 4361 | |||
| 4362 | trace_event_enable_tgid_record(enabled); | ||
| 4363 | } | ||
| 4364 | |||
| 4241 | if (mask == TRACE_ITER_EVENT_FORK) | 4365 | if (mask == TRACE_ITER_EVENT_FORK) |
| 4242 | trace_event_follow_fork(tr, enabled); | 4366 | trace_event_follow_fork(tr, enabled); |
| 4243 | 4367 | ||
| @@ -4473,7 +4597,8 @@ static const char readme_msg[] = | |||
| 4473 | #endif | 4597 | #endif |
| 4474 | #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) | 4598 | #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) |
| 4475 | "\t accepts: event-definitions (one definition per line)\n" | 4599 | "\t accepts: event-definitions (one definition per line)\n" |
| 4476 | "\t Format: p|r[:[<group>/]<event>] <place> [<args>]\n" | 4600 | "\t Format: p[:[<group>/]<event>] <place> [<args>]\n" |
| 4601 | "\t r[maxactive][:[<group>/]<event>] <place> [<args>]\n" | ||
| 4477 | "\t -:[<group>/]<event>\n" | 4602 | "\t -:[<group>/]<event>\n" |
| 4478 | #ifdef CONFIG_KPROBE_EVENTS | 4603 | #ifdef CONFIG_KPROBE_EVENTS |
| 4479 | "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" | 4604 | "\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" |
| @@ -4597,6 +4722,76 @@ static const struct file_operations tracing_readme_fops = { | |||
| 4597 | .llseek = generic_file_llseek, | 4722 | .llseek = generic_file_llseek, |
| 4598 | }; | 4723 | }; |
| 4599 | 4724 | ||
| 4725 | static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) | ||
| 4726 | { | ||
| 4727 | int *ptr = v; | ||
| 4728 | |||
| 4729 | if (*pos || m->count) | ||
| 4730 | ptr++; | ||
| 4731 | |||
| 4732 | (*pos)++; | ||
| 4733 | |||
| 4734 | for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) { | ||
| 4735 | if (trace_find_tgid(*ptr)) | ||
| 4736 | return ptr; | ||
| 4737 | } | ||
| 4738 | |||
| 4739 | return NULL; | ||
| 4740 | } | ||
| 4741 | |||
| 4742 | static void *saved_tgids_start(struct seq_file *m, loff_t *pos) | ||
| 4743 | { | ||
| 4744 | void *v; | ||
| 4745 | loff_t l = 0; | ||
| 4746 | |||
| 4747 | if (!tgid_map) | ||
| 4748 | return NULL; | ||
| 4749 | |||
| 4750 | v = &tgid_map[0]; | ||
| 4751 | while (l <= *pos) { | ||
| 4752 | v = saved_tgids_next(m, v, &l); | ||
| 4753 | if (!v) | ||
| 4754 | return NULL; | ||
| 4755 | } | ||
| 4756 | |||
| 4757 | return v; | ||
| 4758 | } | ||
| 4759 | |||
| 4760 | static void saved_tgids_stop(struct seq_file *m, void *v) | ||
| 4761 | { | ||
| 4762 | } | ||
| 4763 | |||
| 4764 | static int saved_tgids_show(struct seq_file *m, void *v) | ||
| 4765 | { | ||
| 4766 | int pid = (int *)v - tgid_map; | ||
| 4767 | |||
| 4768 | seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid)); | ||
| 4769 | return 0; | ||
| 4770 | } | ||
| 4771 | |||
| 4772 | static const struct seq_operations tracing_saved_tgids_seq_ops = { | ||
| 4773 | .start = saved_tgids_start, | ||
| 4774 | .stop = saved_tgids_stop, | ||
| 4775 | .next = saved_tgids_next, | ||
| 4776 | .show = saved_tgids_show, | ||
| 4777 | }; | ||
| 4778 | |||
| 4779 | static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) | ||
| 4780 | { | ||
| 4781 | if (tracing_disabled) | ||
| 4782 | return -ENODEV; | ||
| 4783 | |||
| 4784 | return seq_open(filp, &tracing_saved_tgids_seq_ops); | ||
| 4785 | } | ||
| 4786 | |||
| 4787 | |||
| 4788 | static const struct file_operations tracing_saved_tgids_fops = { | ||
| 4789 | .open = tracing_saved_tgids_open, | ||
| 4790 | .read = seq_read, | ||
| 4791 | .llseek = seq_lseek, | ||
| 4792 | .release = seq_release, | ||
| 4793 | }; | ||
| 4794 | |||
| 4600 | static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) | 4795 | static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) |
| 4601 | { | 4796 | { |
| 4602 | unsigned int *ptr = v; | 4797 | unsigned int *ptr = v; |
| @@ -4746,11 +4941,11 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { | |||
| 4746 | .write = tracing_saved_cmdlines_size_write, | 4941 | .write = tracing_saved_cmdlines_size_write, |
| 4747 | }; | 4942 | }; |
| 4748 | 4943 | ||
| 4749 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE | 4944 | #ifdef CONFIG_TRACE_EVAL_MAP_FILE |
| 4750 | static union trace_enum_map_item * | 4945 | static union trace_eval_map_item * |
| 4751 | update_enum_map(union trace_enum_map_item *ptr) | 4946 | update_eval_map(union trace_eval_map_item *ptr) |
| 4752 | { | 4947 | { |
| 4753 | if (!ptr->map.enum_string) { | 4948 | if (!ptr->map.eval_string) { |
| 4754 | if (ptr->tail.next) { | 4949 | if (ptr->tail.next) { |
| 4755 | ptr = ptr->tail.next; | 4950 | ptr = ptr->tail.next; |
| 4756 | /* Set ptr to the next real item (skip head) */ | 4951 | /* Set ptr to the next real item (skip head) */ |
| @@ -4761,15 +4956,15 @@ update_enum_map(union trace_enum_map_item *ptr) | |||
| 4761 | return ptr; | 4956 | return ptr; |
| 4762 | } | 4957 | } |
| 4763 | 4958 | ||
| 4764 | static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) | 4959 | static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos) |
| 4765 | { | 4960 | { |
| 4766 | union trace_enum_map_item *ptr = v; | 4961 | union trace_eval_map_item *ptr = v; |
| 4767 | 4962 | ||
| 4768 | /* | 4963 | /* |
| 4769 | * Paranoid! If ptr points to end, we don't want to increment past it. | 4964 | * Paranoid! If ptr points to end, we don't want to increment past it. |
| 4770 | * This really should never happen. | 4965 | * This really should never happen. |
| 4771 | */ | 4966 | */ |
| 4772 | ptr = update_enum_map(ptr); | 4967 | ptr = update_eval_map(ptr); |
| 4773 | if (WARN_ON_ONCE(!ptr)) | 4968 | if (WARN_ON_ONCE(!ptr)) |
| 4774 | return NULL; | 4969 | return NULL; |
| 4775 | 4970 | ||
| @@ -4777,104 +4972,104 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) | |||
| 4777 | 4972 | ||
| 4778 | (*pos)++; | 4973 | (*pos)++; |
| 4779 | 4974 | ||
| 4780 | ptr = update_enum_map(ptr); | 4975 | ptr = update_eval_map(ptr); |
| 4781 | 4976 | ||
| 4782 | return ptr; | 4977 | return ptr; |
| 4783 | } | 4978 | } |
| 4784 | 4979 | ||
| 4785 | static void *enum_map_start(struct seq_file *m, loff_t *pos) | 4980 | static void *eval_map_start(struct seq_file *m, loff_t *pos) |
| 4786 | { | 4981 | { |
| 4787 | union trace_enum_map_item *v; | 4982 | union trace_eval_map_item *v; |
| 4788 | loff_t l = 0; | 4983 | loff_t l = 0; |
| 4789 | 4984 | ||
| 4790 | mutex_lock(&trace_enum_mutex); | 4985 | mutex_lock(&trace_eval_mutex); |
| 4791 | 4986 | ||
| 4792 | v = trace_enum_maps; | 4987 | v = trace_eval_maps; |
| 4793 | if (v) | 4988 | if (v) |
| 4794 | v++; | 4989 | v++; |
| 4795 | 4990 | ||
| 4796 | while (v && l < *pos) { | 4991 | while (v && l < *pos) { |
| 4797 | v = enum_map_next(m, v, &l); | 4992 | v = eval_map_next(m, v, &l); |
| 4798 | } | 4993 | } |
| 4799 | 4994 | ||
| 4800 | return v; | 4995 | return v; |
| 4801 | } | 4996 | } |
| 4802 | 4997 | ||
| 4803 | static void enum_map_stop(struct seq_file *m, void *v) | 4998 | static void eval_map_stop(struct seq_file *m, void *v) |
| 4804 | { | 4999 | { |
| 4805 | mutex_unlock(&trace_enum_mutex); | 5000 | mutex_unlock(&trace_eval_mutex); |
| 4806 | } | 5001 | } |
| 4807 | 5002 | ||
| 4808 | static int enum_map_show(struct seq_file *m, void *v) | 5003 | static int eval_map_show(struct seq_file *m, void *v) |
| 4809 | { | 5004 | { |
| 4810 | union trace_enum_map_item *ptr = v; | 5005 | union trace_eval_map_item *ptr = v; |
| 4811 | 5006 | ||
| 4812 | seq_printf(m, "%s %ld (%s)\n", | 5007 | seq_printf(m, "%s %ld (%s)\n", |
| 4813 | ptr->map.enum_string, ptr->map.enum_value, | 5008 | ptr->map.eval_string, ptr->map.eval_value, |
| 4814 | ptr->map.system); | 5009 | ptr->map.system); |
| 4815 | 5010 | ||
| 4816 | return 0; | 5011 | return 0; |
| 4817 | } | 5012 | } |
| 4818 | 5013 | ||
| 4819 | static const struct seq_operations tracing_enum_map_seq_ops = { | 5014 | static const struct seq_operations tracing_eval_map_seq_ops = { |
| 4820 | .start = enum_map_start, | 5015 | .start = eval_map_start, |
| 4821 | .next = enum_map_next, | 5016 | .next = eval_map_next, |
| 4822 | .stop = enum_map_stop, | 5017 | .stop = eval_map_stop, |
| 4823 | .show = enum_map_show, | 5018 | .show = eval_map_show, |
| 4824 | }; | 5019 | }; |
| 4825 | 5020 | ||
| 4826 | static int tracing_enum_map_open(struct inode *inode, struct file *filp) | 5021 | static int tracing_eval_map_open(struct inode *inode, struct file *filp) |
| 4827 | { | 5022 | { |
| 4828 | if (tracing_disabled) | 5023 | if (tracing_disabled) |
| 4829 | return -ENODEV; | 5024 | return -ENODEV; |
| 4830 | 5025 | ||
| 4831 | return seq_open(filp, &tracing_enum_map_seq_ops); | 5026 | return seq_open(filp, &tracing_eval_map_seq_ops); |
| 4832 | } | 5027 | } |
| 4833 | 5028 | ||
| 4834 | static const struct file_operations tracing_enum_map_fops = { | 5029 | static const struct file_operations tracing_eval_map_fops = { |
| 4835 | .open = tracing_enum_map_open, | 5030 | .open = tracing_eval_map_open, |
| 4836 | .read = seq_read, | 5031 | .read = seq_read, |
| 4837 | .llseek = seq_lseek, | 5032 | .llseek = seq_lseek, |
| 4838 | .release = seq_release, | 5033 | .release = seq_release, |
| 4839 | }; | 5034 | }; |
| 4840 | 5035 | ||
| 4841 | static inline union trace_enum_map_item * | 5036 | static inline union trace_eval_map_item * |
| 4842 | trace_enum_jmp_to_tail(union trace_enum_map_item *ptr) | 5037 | trace_eval_jmp_to_tail(union trace_eval_map_item *ptr) |
| 4843 | { | 5038 | { |
| 4844 | /* Return tail of array given the head */ | 5039 | /* Return tail of array given the head */ |
| 4845 | return ptr + ptr->head.length + 1; | 5040 | return ptr + ptr->head.length + 1; |
| 4846 | } | 5041 | } |
| 4847 | 5042 | ||
| 4848 | static void | 5043 | static void |
| 4849 | trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, | 5044 | trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, |
| 4850 | int len) | 5045 | int len) |
| 4851 | { | 5046 | { |
| 4852 | struct trace_enum_map **stop; | 5047 | struct trace_eval_map **stop; |
| 4853 | struct trace_enum_map **map; | 5048 | struct trace_eval_map **map; |
| 4854 | union trace_enum_map_item *map_array; | 5049 | union trace_eval_map_item *map_array; |
| 4855 | union trace_enum_map_item *ptr; | 5050 | union trace_eval_map_item *ptr; |
| 4856 | 5051 | ||
| 4857 | stop = start + len; | 5052 | stop = start + len; |
| 4858 | 5053 | ||
| 4859 | /* | 5054 | /* |
| 4860 | * The trace_enum_maps contains the map plus a head and tail item, | 5055 | * The trace_eval_maps contains the map plus a head and tail item, |
| 4861 | * where the head holds the module and length of array, and the | 5056 | * where the head holds the module and length of array, and the |
| 4862 | * tail holds a pointer to the next list. | 5057 | * tail holds a pointer to the next list. |
| 4863 | */ | 5058 | */ |
| 4864 | map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); | 5059 | map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); |
| 4865 | if (!map_array) { | 5060 | if (!map_array) { |
| 4866 | pr_warn("Unable to allocate trace enum mapping\n"); | 5061 | pr_warn("Unable to allocate trace eval mapping\n"); |
| 4867 | return; | 5062 | return; |
| 4868 | } | 5063 | } |
| 4869 | 5064 | ||
| 4870 | mutex_lock(&trace_enum_mutex); | 5065 | mutex_lock(&trace_eval_mutex); |
| 4871 | 5066 | ||
| 4872 | if (!trace_enum_maps) | 5067 | if (!trace_eval_maps) |
| 4873 | trace_enum_maps = map_array; | 5068 | trace_eval_maps = map_array; |
| 4874 | else { | 5069 | else { |
| 4875 | ptr = trace_enum_maps; | 5070 | ptr = trace_eval_maps; |
| 4876 | for (;;) { | 5071 | for (;;) { |
| 4877 | ptr = trace_enum_jmp_to_tail(ptr); | 5072 | ptr = trace_eval_jmp_to_tail(ptr); |
| 4878 | if (!ptr->tail.next) | 5073 | if (!ptr->tail.next) |
| 4879 | break; | 5074 | break; |
| 4880 | ptr = ptr->tail.next; | 5075 | ptr = ptr->tail.next; |
| @@ -4892,34 +5087,34 @@ trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, | |||
| 4892 | } | 5087 | } |
| 4893 | memset(map_array, 0, sizeof(*map_array)); | 5088 | memset(map_array, 0, sizeof(*map_array)); |
| 4894 | 5089 | ||
| 4895 | mutex_unlock(&trace_enum_mutex); | 5090 | mutex_unlock(&trace_eval_mutex); |
| 4896 | } | 5091 | } |
| 4897 | 5092 | ||
| 4898 | static void trace_create_enum_file(struct dentry *d_tracer) | 5093 | static void trace_create_eval_file(struct dentry *d_tracer) |
| 4899 | { | 5094 | { |
| 4900 | trace_create_file("enum_map", 0444, d_tracer, | 5095 | trace_create_file("eval_map", 0444, d_tracer, |
| 4901 | NULL, &tracing_enum_map_fops); | 5096 | NULL, &tracing_eval_map_fops); |
| 4902 | } | 5097 | } |
| 4903 | 5098 | ||
| 4904 | #else /* CONFIG_TRACE_ENUM_MAP_FILE */ | 5099 | #else /* CONFIG_TRACE_EVAL_MAP_FILE */ |
| 4905 | static inline void trace_create_enum_file(struct dentry *d_tracer) { } | 5100 | static inline void trace_create_eval_file(struct dentry *d_tracer) { } |
| 4906 | static inline void trace_insert_enum_map_file(struct module *mod, | 5101 | static inline void trace_insert_eval_map_file(struct module *mod, |
| 4907 | struct trace_enum_map **start, int len) { } | 5102 | struct trace_eval_map **start, int len) { } |
| 4908 | #endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ | 5103 | #endif /* !CONFIG_TRACE_EVAL_MAP_FILE */ |
| 4909 | 5104 | ||
| 4910 | static void trace_insert_enum_map(struct module *mod, | 5105 | static void trace_insert_eval_map(struct module *mod, |
| 4911 | struct trace_enum_map **start, int len) | 5106 | struct trace_eval_map **start, int len) |
| 4912 | { | 5107 | { |
| 4913 | struct trace_enum_map **map; | 5108 | struct trace_eval_map **map; |
| 4914 | 5109 | ||
| 4915 | if (len <= 0) | 5110 | if (len <= 0) |
| 4916 | return; | 5111 | return; |
| 4917 | 5112 | ||
| 4918 | map = start; | 5113 | map = start; |
| 4919 | 5114 | ||
| 4920 | trace_event_enum_update(map, len); | 5115 | trace_event_eval_update(map, len); |
| 4921 | 5116 | ||
| 4922 | trace_insert_enum_map_file(mod, start, len); | 5117 | trace_insert_eval_map_file(mod, start, len); |
| 4923 | } | 5118 | } |
| 4924 | 5119 | ||
| 4925 | static ssize_t | 5120 | static ssize_t |
| @@ -6739,33 +6934,18 @@ static const struct file_operations tracing_stats_fops = { | |||
| 6739 | 6934 | ||
| 6740 | #ifdef CONFIG_DYNAMIC_FTRACE | 6935 | #ifdef CONFIG_DYNAMIC_FTRACE |
| 6741 | 6936 | ||
| 6742 | int __weak ftrace_arch_read_dyn_info(char *buf, int size) | ||
| 6743 | { | ||
| 6744 | return 0; | ||
| 6745 | } | ||
| 6746 | |||
| 6747 | static ssize_t | 6937 | static ssize_t |
| 6748 | tracing_read_dyn_info(struct file *filp, char __user *ubuf, | 6938 | tracing_read_dyn_info(struct file *filp, char __user *ubuf, |
| 6749 | size_t cnt, loff_t *ppos) | 6939 | size_t cnt, loff_t *ppos) |
| 6750 | { | 6940 | { |
| 6751 | static char ftrace_dyn_info_buffer[1024]; | ||
| 6752 | static DEFINE_MUTEX(dyn_info_mutex); | ||
| 6753 | unsigned long *p = filp->private_data; | 6941 | unsigned long *p = filp->private_data; |
| 6754 | char *buf = ftrace_dyn_info_buffer; | 6942 | char buf[64]; /* Not too big for a shallow stack */ |
| 6755 | int size = ARRAY_SIZE(ftrace_dyn_info_buffer); | ||
| 6756 | int r; | 6943 | int r; |
| 6757 | 6944 | ||
| 6758 | mutex_lock(&dyn_info_mutex); | 6945 | r = scnprintf(buf, 63, "%ld", *p); |
| 6759 | r = sprintf(buf, "%ld ", *p); | ||
| 6760 | |||
| 6761 | r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r); | ||
| 6762 | buf[r++] = '\n'; | 6946 | buf[r++] = '\n'; |
| 6763 | 6947 | ||
| 6764 | r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | 6948 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); |
| 6765 | |||
| 6766 | mutex_unlock(&dyn_info_mutex); | ||
| 6767 | |||
| 6768 | return r; | ||
| 6769 | } | 6949 | } |
| 6770 | 6950 | ||
| 6771 | static const struct file_operations tracing_dyn_info_fops = { | 6951 | static const struct file_operations tracing_dyn_info_fops = { |
| @@ -7594,6 +7774,7 @@ static int instance_rmdir(const char *name) | |||
| 7594 | } | 7774 | } |
| 7595 | kfree(tr->topts); | 7775 | kfree(tr->topts); |
| 7596 | 7776 | ||
| 7777 | free_cpumask_var(tr->tracing_cpumask); | ||
| 7597 | kfree(tr->name); | 7778 | kfree(tr->name); |
| 7598 | kfree(tr); | 7779 | kfree(tr); |
| 7599 | 7780 | ||
| @@ -7737,21 +7918,21 @@ struct dentry *tracing_init_dentry(void) | |||
| 7737 | return NULL; | 7918 | return NULL; |
| 7738 | } | 7919 | } |
| 7739 | 7920 | ||
| 7740 | extern struct trace_enum_map *__start_ftrace_enum_maps[]; | 7921 | extern struct trace_eval_map *__start_ftrace_eval_maps[]; |
| 7741 | extern struct trace_enum_map *__stop_ftrace_enum_maps[]; | 7922 | extern struct trace_eval_map *__stop_ftrace_eval_maps[]; |
| 7742 | 7923 | ||
| 7743 | static void __init trace_enum_init(void) | 7924 | static void __init trace_eval_init(void) |
| 7744 | { | 7925 | { |
| 7745 | int len; | 7926 | int len; |
| 7746 | 7927 | ||
| 7747 | len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps; | 7928 | len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; |
| 7748 | trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len); | 7929 | trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len); |
| 7749 | } | 7930 | } |
| 7750 | 7931 | ||
| 7751 | #ifdef CONFIG_MODULES | 7932 | #ifdef CONFIG_MODULES |
| 7752 | static void trace_module_add_enums(struct module *mod) | 7933 | static void trace_module_add_evals(struct module *mod) |
| 7753 | { | 7934 | { |
| 7754 | if (!mod->num_trace_enums) | 7935 | if (!mod->num_trace_evals) |
| 7755 | return; | 7936 | return; |
| 7756 | 7937 | ||
| 7757 | /* | 7938 | /* |
| @@ -7761,40 +7942,40 @@ static void trace_module_add_enums(struct module *mod) | |||
| 7761 | if (trace_module_has_bad_taint(mod)) | 7942 | if (trace_module_has_bad_taint(mod)) |
| 7762 | return; | 7943 | return; |
| 7763 | 7944 | ||
| 7764 | trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums); | 7945 | trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals); |
| 7765 | } | 7946 | } |
| 7766 | 7947 | ||
| 7767 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE | 7948 | #ifdef CONFIG_TRACE_EVAL_MAP_FILE |
| 7768 | static void trace_module_remove_enums(struct module *mod) | 7949 | static void trace_module_remove_evals(struct module *mod) |
| 7769 | { | 7950 | { |
| 7770 | union trace_enum_map_item *map; | 7951 | union trace_eval_map_item *map; |
| 7771 | union trace_enum_map_item **last = &trace_enum_maps; | 7952 | union trace_eval_map_item **last = &trace_eval_maps; |
| 7772 | 7953 | ||
| 7773 | if (!mod->num_trace_enums) | 7954 | if (!mod->num_trace_evals) |
| 7774 | return; | 7955 | return; |
| 7775 | 7956 | ||
| 7776 | mutex_lock(&trace_enum_mutex); | 7957 | mutex_lock(&trace_eval_mutex); |
| 7777 | 7958 | ||
| 7778 | map = trace_enum_maps; | 7959 | map = trace_eval_maps; |
| 7779 | 7960 | ||
| 7780 | while (map) { | 7961 | while (map) { |
| 7781 | if (map->head.mod == mod) | 7962 | if (map->head.mod == mod) |
| 7782 | break; | 7963 | break; |
| 7783 | map = trace_enum_jmp_to_tail(map); | 7964 | map = trace_eval_jmp_to_tail(map); |
| 7784 | last = &map->tail.next; | 7965 | last = &map->tail.next; |
| 7785 | map = map->tail.next; | 7966 | map = map->tail.next; |
| 7786 | } | 7967 | } |
| 7787 | if (!map) | 7968 | if (!map) |
| 7788 | goto out; | 7969 | goto out; |
| 7789 | 7970 | ||
| 7790 | *last = trace_enum_jmp_to_tail(map)->tail.next; | 7971 | *last = trace_eval_jmp_to_tail(map)->tail.next; |
| 7791 | kfree(map); | 7972 | kfree(map); |
| 7792 | out: | 7973 | out: |
| 7793 | mutex_unlock(&trace_enum_mutex); | 7974 | mutex_unlock(&trace_eval_mutex); |
| 7794 | } | 7975 | } |
| 7795 | #else | 7976 | #else |
| 7796 | static inline void trace_module_remove_enums(struct module *mod) { } | 7977 | static inline void trace_module_remove_evals(struct module *mod) { } |
| 7797 | #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ | 7978 | #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ |
| 7798 | 7979 | ||
| 7799 | static int trace_module_notify(struct notifier_block *self, | 7980 | static int trace_module_notify(struct notifier_block *self, |
| 7800 | unsigned long val, void *data) | 7981 | unsigned long val, void *data) |
| @@ -7803,10 +7984,10 @@ static int trace_module_notify(struct notifier_block *self, | |||
| 7803 | 7984 | ||
| 7804 | switch (val) { | 7985 | switch (val) { |
| 7805 | case MODULE_STATE_COMING: | 7986 | case MODULE_STATE_COMING: |
| 7806 | trace_module_add_enums(mod); | 7987 | trace_module_add_evals(mod); |
| 7807 | break; | 7988 | break; |
| 7808 | case MODULE_STATE_GOING: | 7989 | case MODULE_STATE_GOING: |
| 7809 | trace_module_remove_enums(mod); | 7990 | trace_module_remove_evals(mod); |
| 7810 | break; | 7991 | break; |
| 7811 | } | 7992 | } |
| 7812 | 7993 | ||
| @@ -7844,9 +8025,12 @@ static __init int tracer_init_tracefs(void) | |||
| 7844 | trace_create_file("saved_cmdlines_size", 0644, d_tracer, | 8025 | trace_create_file("saved_cmdlines_size", 0644, d_tracer, |
| 7845 | NULL, &tracing_saved_cmdlines_size_fops); | 8026 | NULL, &tracing_saved_cmdlines_size_fops); |
| 7846 | 8027 | ||
| 7847 | trace_enum_init(); | 8028 | trace_create_file("saved_tgids", 0444, d_tracer, |
| 8029 | NULL, &tracing_saved_tgids_fops); | ||
| 8030 | |||
| 8031 | trace_eval_init(); | ||
| 7848 | 8032 | ||
| 7849 | trace_create_enum_file(d_tracer); | 8033 | trace_create_eval_file(d_tracer); |
| 7850 | 8034 | ||
| 7851 | #ifdef CONFIG_MODULES | 8035 | #ifdef CONFIG_MODULES |
| 7852 | register_module_notifier(&trace_module_nb); | 8036 | register_module_notifier(&trace_module_nb); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 39fd77330aab..490ba229931d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -263,7 +263,10 @@ struct trace_array { | |||
| 263 | struct ftrace_ops *ops; | 263 | struct ftrace_ops *ops; |
| 264 | struct trace_pid_list __rcu *function_pids; | 264 | struct trace_pid_list __rcu *function_pids; |
| 265 | #ifdef CONFIG_DYNAMIC_FTRACE | 265 | #ifdef CONFIG_DYNAMIC_FTRACE |
| 266 | /* All of these are protected by the ftrace_lock */ | ||
| 266 | struct list_head func_probes; | 267 | struct list_head func_probes; |
| 268 | struct list_head mod_trace; | ||
| 269 | struct list_head mod_notrace; | ||
| 267 | #endif | 270 | #endif |
| 268 | /* function tracing enabled */ | 271 | /* function tracing enabled */ |
| 269 | int function_enabled; | 272 | int function_enabled; |
| @@ -637,6 +640,9 @@ void set_graph_array(struct trace_array *tr); | |||
| 637 | 640 | ||
| 638 | void tracing_start_cmdline_record(void); | 641 | void tracing_start_cmdline_record(void); |
| 639 | void tracing_stop_cmdline_record(void); | 642 | void tracing_stop_cmdline_record(void); |
| 643 | void tracing_start_tgid_record(void); | ||
| 644 | void tracing_stop_tgid_record(void); | ||
| 645 | |||
| 640 | int register_tracer(struct tracer *type); | 646 | int register_tracer(struct tracer *type); |
| 641 | int is_tracing_stopped(void); | 647 | int is_tracing_stopped(void); |
| 642 | 648 | ||
| @@ -697,6 +703,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags, | |||
| 697 | extern u64 ftrace_now(int cpu); | 703 | extern u64 ftrace_now(int cpu); |
| 698 | 704 | ||
| 699 | extern void trace_find_cmdline(int pid, char comm[]); | 705 | extern void trace_find_cmdline(int pid, char comm[]); |
| 706 | extern int trace_find_tgid(int pid); | ||
| 700 | extern void trace_event_follow_fork(struct trace_array *tr, bool enable); | 707 | extern void trace_event_follow_fork(struct trace_array *tr, bool enable); |
| 701 | 708 | ||
| 702 | #ifdef CONFIG_DYNAMIC_FTRACE | 709 | #ifdef CONFIG_DYNAMIC_FTRACE |
| @@ -761,10 +768,24 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); | |||
| 761 | 768 | ||
| 762 | extern char trace_find_mark(unsigned long long duration); | 769 | extern char trace_find_mark(unsigned long long duration); |
| 763 | 770 | ||
| 771 | struct ftrace_hash; | ||
| 772 | |||
| 773 | struct ftrace_mod_load { | ||
| 774 | struct list_head list; | ||
| 775 | char *func; | ||
| 776 | char *module; | ||
| 777 | int enable; | ||
| 778 | }; | ||
| 779 | |||
| 780 | enum { | ||
| 781 | FTRACE_HASH_FL_MOD = (1 << 0), | ||
| 782 | }; | ||
| 783 | |||
| 764 | struct ftrace_hash { | 784 | struct ftrace_hash { |
| 765 | unsigned long size_bits; | 785 | unsigned long size_bits; |
| 766 | struct hlist_head *buckets; | 786 | struct hlist_head *buckets; |
| 767 | unsigned long count; | 787 | unsigned long count; |
| 788 | unsigned long flags; | ||
| 768 | struct rcu_head rcu; | 789 | struct rcu_head rcu; |
| 769 | }; | 790 | }; |
| 770 | 791 | ||
| @@ -773,7 +794,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip); | |||
| 773 | 794 | ||
| 774 | static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) | 795 | static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) |
| 775 | { | 796 | { |
| 776 | return !hash || !hash->count; | 797 | return !hash || !(hash->count || (hash->flags & FTRACE_HASH_FL_MOD)); |
| 777 | } | 798 | } |
| 778 | 799 | ||
| 779 | /* Standard output formatting function used for function return traces */ | 800 | /* Standard output formatting function used for function return traces */ |
| @@ -1107,6 +1128,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, | |||
| 1107 | C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \ | 1128 | C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \ |
| 1108 | C(LATENCY_FMT, "latency-format"), \ | 1129 | C(LATENCY_FMT, "latency-format"), \ |
| 1109 | C(RECORD_CMD, "record-cmd"), \ | 1130 | C(RECORD_CMD, "record-cmd"), \ |
| 1131 | C(RECORD_TGID, "record-tgid"), \ | ||
| 1110 | C(OVERWRITE, "overwrite"), \ | 1132 | C(OVERWRITE, "overwrite"), \ |
| 1111 | C(STOP_ON_FREE, "disable_on_free"), \ | 1133 | C(STOP_ON_FREE, "disable_on_free"), \ |
| 1112 | C(IRQ_INFO, "irq-info"), \ | 1134 | C(IRQ_INFO, "irq-info"), \ |
| @@ -1188,9 +1210,9 @@ struct ftrace_event_field { | |||
| 1188 | struct event_filter { | 1210 | struct event_filter { |
| 1189 | int n_preds; /* Number assigned */ | 1211 | int n_preds; /* Number assigned */ |
| 1190 | int a_preds; /* allocated */ | 1212 | int a_preds; /* allocated */ |
| 1191 | struct filter_pred *preds; | 1213 | struct filter_pred __rcu *preds; |
| 1192 | struct filter_pred *root; | 1214 | struct filter_pred __rcu *root; |
| 1193 | char *filter_string; | 1215 | char *filter_string; |
| 1194 | }; | 1216 | }; |
| 1195 | 1217 | ||
| 1196 | struct event_subsystem { | 1218 | struct event_subsystem { |
| @@ -1423,6 +1445,8 @@ struct ftrace_event_field * | |||
| 1423 | trace_find_event_field(struct trace_event_call *call, char *name); | 1445 | trace_find_event_field(struct trace_event_call *call, char *name); |
| 1424 | 1446 | ||
| 1425 | extern void trace_event_enable_cmd_record(bool enable); | 1447 | extern void trace_event_enable_cmd_record(bool enable); |
| 1448 | extern void trace_event_enable_tgid_record(bool enable); | ||
| 1449 | |||
| 1426 | extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); | 1450 | extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); |
| 1427 | extern int event_trace_del_tracer(struct trace_array *tr); | 1451 | extern int event_trace_del_tracer(struct trace_array *tr); |
| 1428 | 1452 | ||
| @@ -1773,10 +1797,10 @@ static inline const char *get_syscall_name(int syscall) | |||
| 1773 | 1797 | ||
| 1774 | #ifdef CONFIG_EVENT_TRACING | 1798 | #ifdef CONFIG_EVENT_TRACING |
| 1775 | void trace_event_init(void); | 1799 | void trace_event_init(void); |
| 1776 | void trace_event_enum_update(struct trace_enum_map **map, int len); | 1800 | void trace_event_eval_update(struct trace_eval_map **map, int len); |
| 1777 | #else | 1801 | #else |
| 1778 | static inline void __init trace_event_init(void) { } | 1802 | static inline void __init trace_event_init(void) { } |
| 1779 | static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { } | 1803 | static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { } |
| 1780 | #endif | 1804 | #endif |
| 1781 | 1805 | ||
| 1782 | extern struct trace_iterator *tracepoint_print_iter; | 1806 | extern struct trace_iterator *tracepoint_print_iter; |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e7973e10398c..36132f9280e6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -343,6 +343,28 @@ void trace_event_enable_cmd_record(bool enable) | |||
| 343 | mutex_unlock(&event_mutex); | 343 | mutex_unlock(&event_mutex); |
| 344 | } | 344 | } |
| 345 | 345 | ||
| 346 | void trace_event_enable_tgid_record(bool enable) | ||
| 347 | { | ||
| 348 | struct trace_event_file *file; | ||
| 349 | struct trace_array *tr; | ||
| 350 | |||
| 351 | mutex_lock(&event_mutex); | ||
| 352 | do_for_each_event_file(tr, file) { | ||
| 353 | if (!(file->flags & EVENT_FILE_FL_ENABLED)) | ||
| 354 | continue; | ||
| 355 | |||
| 356 | if (enable) { | ||
| 357 | tracing_start_tgid_record(); | ||
| 358 | set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); | ||
| 359 | } else { | ||
| 360 | tracing_stop_tgid_record(); | ||
| 361 | clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, | ||
| 362 | &file->flags); | ||
| 363 | } | ||
| 364 | } while_for_each_event_file(); | ||
| 365 | mutex_unlock(&event_mutex); | ||
| 366 | } | ||
| 367 | |||
| 346 | static int __ftrace_event_enable_disable(struct trace_event_file *file, | 368 | static int __ftrace_event_enable_disable(struct trace_event_file *file, |
| 347 | int enable, int soft_disable) | 369 | int enable, int soft_disable) |
| 348 | { | 370 | { |
| @@ -381,6 +403,12 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, | |||
| 381 | tracing_stop_cmdline_record(); | 403 | tracing_stop_cmdline_record(); |
| 382 | clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); | 404 | clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); |
| 383 | } | 405 | } |
| 406 | |||
| 407 | if (file->flags & EVENT_FILE_FL_RECORDED_TGID) { | ||
| 408 | tracing_stop_tgid_record(); | ||
| 409 | clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); | ||
| 410 | } | ||
| 411 | |||
| 384 | call->class->reg(call, TRACE_REG_UNREGISTER, file); | 412 | call->class->reg(call, TRACE_REG_UNREGISTER, file); |
| 385 | } | 413 | } |
| 386 | /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ | 414 | /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ |
| @@ -407,18 +435,30 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, | |||
| 407 | } | 435 | } |
| 408 | 436 | ||
| 409 | if (!(file->flags & EVENT_FILE_FL_ENABLED)) { | 437 | if (!(file->flags & EVENT_FILE_FL_ENABLED)) { |
| 438 | bool cmd = false, tgid = false; | ||
| 410 | 439 | ||
| 411 | /* Keep the event disabled, when going to SOFT_MODE. */ | 440 | /* Keep the event disabled, when going to SOFT_MODE. */ |
| 412 | if (soft_disable) | 441 | if (soft_disable) |
| 413 | set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); | 442 | set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); |
| 414 | 443 | ||
| 415 | if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { | 444 | if (tr->trace_flags & TRACE_ITER_RECORD_CMD) { |
| 445 | cmd = true; | ||
| 416 | tracing_start_cmdline_record(); | 446 | tracing_start_cmdline_record(); |
| 417 | set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); | 447 | set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags); |
| 418 | } | 448 | } |
| 449 | |||
| 450 | if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { | ||
| 451 | tgid = true; | ||
| 452 | tracing_start_tgid_record(); | ||
| 453 | set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); | ||
| 454 | } | ||
| 455 | |||
| 419 | ret = call->class->reg(call, TRACE_REG_REGISTER, file); | 456 | ret = call->class->reg(call, TRACE_REG_REGISTER, file); |
| 420 | if (ret) { | 457 | if (ret) { |
| 421 | tracing_stop_cmdline_record(); | 458 | if (cmd) |
| 459 | tracing_stop_cmdline_record(); | ||
| 460 | if (tgid) | ||
| 461 | tracing_stop_tgid_record(); | ||
| 422 | pr_info("event trace: Could not enable event " | 462 | pr_info("event trace: Could not enable event " |
| 423 | "%s\n", trace_event_name(call)); | 463 | "%s\n", trace_event_name(call)); |
| 424 | break; | 464 | break; |
| @@ -2067,18 +2107,18 @@ __register_event(struct trace_event_call *call, struct module *mod) | |||
| 2067 | return 0; | 2107 | return 0; |
| 2068 | } | 2108 | } |
| 2069 | 2109 | ||
| 2070 | static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) | 2110 | static char *eval_replace(char *ptr, struct trace_eval_map *map, int len) |
| 2071 | { | 2111 | { |
| 2072 | int rlen; | 2112 | int rlen; |
| 2073 | int elen; | 2113 | int elen; |
| 2074 | 2114 | ||
| 2075 | /* Find the length of the enum value as a string */ | 2115 | /* Find the length of the eval value as a string */ |
| 2076 | elen = snprintf(ptr, 0, "%ld", map->enum_value); | 2116 | elen = snprintf(ptr, 0, "%ld", map->eval_value); |
| 2077 | /* Make sure there's enough room to replace the string with the value */ | 2117 | /* Make sure there's enough room to replace the string with the value */ |
| 2078 | if (len < elen) | 2118 | if (len < elen) |
| 2079 | return NULL; | 2119 | return NULL; |
| 2080 | 2120 | ||
| 2081 | snprintf(ptr, elen + 1, "%ld", map->enum_value); | 2121 | snprintf(ptr, elen + 1, "%ld", map->eval_value); |
| 2082 | 2122 | ||
| 2083 | /* Get the rest of the string of ptr */ | 2123 | /* Get the rest of the string of ptr */ |
| 2084 | rlen = strlen(ptr + len); | 2124 | rlen = strlen(ptr + len); |
| @@ -2090,11 +2130,11 @@ static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) | |||
| 2090 | } | 2130 | } |
| 2091 | 2131 | ||
| 2092 | static void update_event_printk(struct trace_event_call *call, | 2132 | static void update_event_printk(struct trace_event_call *call, |
| 2093 | struct trace_enum_map *map) | 2133 | struct trace_eval_map *map) |
| 2094 | { | 2134 | { |
| 2095 | char *ptr; | 2135 | char *ptr; |
| 2096 | int quote = 0; | 2136 | int quote = 0; |
| 2097 | int len = strlen(map->enum_string); | 2137 | int len = strlen(map->eval_string); |
| 2098 | 2138 | ||
| 2099 | for (ptr = call->print_fmt; *ptr; ptr++) { | 2139 | for (ptr = call->print_fmt; *ptr; ptr++) { |
| 2100 | if (*ptr == '\\') { | 2140 | if (*ptr == '\\') { |
| @@ -2125,16 +2165,16 @@ static void update_event_printk(struct trace_event_call *call, | |||
| 2125 | continue; | 2165 | continue; |
| 2126 | } | 2166 | } |
| 2127 | if (isalpha(*ptr) || *ptr == '_') { | 2167 | if (isalpha(*ptr) || *ptr == '_') { |
| 2128 | if (strncmp(map->enum_string, ptr, len) == 0 && | 2168 | if (strncmp(map->eval_string, ptr, len) == 0 && |
| 2129 | !isalnum(ptr[len]) && ptr[len] != '_') { | 2169 | !isalnum(ptr[len]) && ptr[len] != '_') { |
| 2130 | ptr = enum_replace(ptr, map, len); | 2170 | ptr = eval_replace(ptr, map, len); |
| 2131 | /* Hmm, enum string smaller than value */ | 2171 | /* enum/sizeof string smaller than value */ |
| 2132 | if (WARN_ON_ONCE(!ptr)) | 2172 | if (WARN_ON_ONCE(!ptr)) |
| 2133 | return; | 2173 | return; |
| 2134 | /* | 2174 | /* |
| 2135 | * No need to decrement here, as enum_replace() | 2175 | * No need to decrement here, as eval_replace() |
| 2136 | * returns the pointer to the character passed | 2176 | * returns the pointer to the character passed |
| 2137 | * the enum, and two enums can not be placed | 2177 | * the eval, and two evals can not be placed |
| 2138 | * back to back without something in between. | 2178 | * back to back without something in between. |
| 2139 | * We can skip that something in between. | 2179 | * We can skip that something in between. |
| 2140 | */ | 2180 | */ |
| @@ -2165,7 +2205,7 @@ static void update_event_printk(struct trace_event_call *call, | |||
| 2165 | } | 2205 | } |
| 2166 | } | 2206 | } |
| 2167 | 2207 | ||
| 2168 | void trace_event_enum_update(struct trace_enum_map **map, int len) | 2208 | void trace_event_eval_update(struct trace_eval_map **map, int len) |
| 2169 | { | 2209 | { |
| 2170 | struct trace_event_call *call, *p; | 2210 | struct trace_event_call *call, *p; |
| 2171 | const char *last_system = NULL; | 2211 | const char *last_system = NULL; |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2c5221819be5..c9b5aa10fbf9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -598,6 +598,14 @@ static struct notifier_block trace_kprobe_module_nb = { | |||
| 598 | .priority = 1 /* Invoked after kprobe module callback */ | 598 | .priority = 1 /* Invoked after kprobe module callback */ |
| 599 | }; | 599 | }; |
| 600 | 600 | ||
| 601 | /* Convert certain expected symbols into '_' when generating event names */ | ||
| 602 | static inline void sanitize_event_name(char *name) | ||
| 603 | { | ||
| 604 | while (*name++ != '\0') | ||
| 605 | if (*name == ':' || *name == '.') | ||
| 606 | *name = '_'; | ||
| 607 | } | ||
| 608 | |||
| 601 | static int create_trace_kprobe(int argc, char **argv) | 609 | static int create_trace_kprobe(int argc, char **argv) |
| 602 | { | 610 | { |
| 603 | /* | 611 | /* |
| @@ -736,6 +744,7 @@ static int create_trace_kprobe(int argc, char **argv) | |||
| 736 | else | 744 | else |
| 737 | snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", | 745 | snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", |
| 738 | is_return ? 'r' : 'p', addr); | 746 | is_return ? 'r' : 'p', addr); |
| 747 | sanitize_event_name(buf); | ||
| 739 | event = buf; | 748 | event = buf; |
| 740 | } | 749 | } |
| 741 | tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, | 750 | tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 08f9bab8089e..bac629af2285 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
| @@ -340,31 +340,41 @@ static inline const char *kretprobed(const char *name) | |||
| 340 | static void | 340 | static void |
| 341 | seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) | 341 | seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) |
| 342 | { | 342 | { |
| 343 | #ifdef CONFIG_KALLSYMS | ||
| 344 | char str[KSYM_SYMBOL_LEN]; | 343 | char str[KSYM_SYMBOL_LEN]; |
| 344 | #ifdef CONFIG_KALLSYMS | ||
| 345 | const char *name; | 345 | const char *name; |
| 346 | 346 | ||
| 347 | kallsyms_lookup(address, NULL, NULL, NULL, str); | 347 | kallsyms_lookup(address, NULL, NULL, NULL, str); |
| 348 | 348 | ||
| 349 | name = kretprobed(str); | 349 | name = kretprobed(str); |
| 350 | 350 | ||
| 351 | trace_seq_printf(s, fmt, name); | 351 | if (name && strlen(name)) { |
| 352 | trace_seq_printf(s, fmt, name); | ||
| 353 | return; | ||
| 354 | } | ||
| 352 | #endif | 355 | #endif |
| 356 | snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address); | ||
| 357 | trace_seq_printf(s, fmt, str); | ||
| 353 | } | 358 | } |
| 354 | 359 | ||
| 355 | static void | 360 | static void |
| 356 | seq_print_sym_offset(struct trace_seq *s, const char *fmt, | 361 | seq_print_sym_offset(struct trace_seq *s, const char *fmt, |
| 357 | unsigned long address) | 362 | unsigned long address) |
| 358 | { | 363 | { |
| 359 | #ifdef CONFIG_KALLSYMS | ||
| 360 | char str[KSYM_SYMBOL_LEN]; | 364 | char str[KSYM_SYMBOL_LEN]; |
| 365 | #ifdef CONFIG_KALLSYMS | ||
| 361 | const char *name; | 366 | const char *name; |
| 362 | 367 | ||
| 363 | sprint_symbol(str, address); | 368 | sprint_symbol(str, address); |
| 364 | name = kretprobed(str); | 369 | name = kretprobed(str); |
| 365 | 370 | ||
| 366 | trace_seq_printf(s, fmt, name); | 371 | if (name && strlen(name)) { |
| 372 | trace_seq_printf(s, fmt, name); | ||
| 373 | return; | ||
| 374 | } | ||
| 367 | #endif | 375 | #endif |
| 376 | snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address); | ||
| 377 | trace_seq_printf(s, fmt, str); | ||
| 368 | } | 378 | } |
| 369 | 379 | ||
| 370 | #ifndef CONFIG_64BIT | 380 | #ifndef CONFIG_64BIT |
| @@ -587,6 +597,15 @@ int trace_print_context(struct trace_iterator *iter) | |||
| 587 | trace_seq_printf(s, "%16s-%-5d [%03d] ", | 597 | trace_seq_printf(s, "%16s-%-5d [%03d] ", |
| 588 | comm, entry->pid, iter->cpu); | 598 | comm, entry->pid, iter->cpu); |
| 589 | 599 | ||
| 600 | if (tr->trace_flags & TRACE_ITER_RECORD_TGID) { | ||
| 601 | unsigned int tgid = trace_find_tgid(entry->pid); | ||
| 602 | |||
| 603 | if (!tgid) | ||
| 604 | trace_seq_printf(s, "(-----) "); | ||
| 605 | else | ||
| 606 | trace_seq_printf(s, "(%5d) ", tgid); | ||
| 607 | } | ||
| 608 | |||
| 590 | if (tr->trace_flags & TRACE_ITER_IRQ_INFO) | 609 | if (tr->trace_flags & TRACE_ITER_IRQ_INFO) |
| 591 | trace_print_lat_fmt(s, entry); | 610 | trace_print_lat_fmt(s, entry); |
| 592 | 611 | ||
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 4c896a0101bd..b341c02730be 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c | |||
| @@ -12,27 +12,38 @@ | |||
| 12 | 12 | ||
| 13 | #include "trace.h" | 13 | #include "trace.h" |
| 14 | 14 | ||
| 15 | static int sched_ref; | 15 | #define RECORD_CMDLINE 1 |
| 16 | #define RECORD_TGID 2 | ||
| 17 | |||
| 18 | static int sched_cmdline_ref; | ||
| 19 | static int sched_tgid_ref; | ||
| 16 | static DEFINE_MUTEX(sched_register_mutex); | 20 | static DEFINE_MUTEX(sched_register_mutex); |
| 17 | 21 | ||
| 18 | static void | 22 | static void |
| 19 | probe_sched_switch(void *ignore, bool preempt, | 23 | probe_sched_switch(void *ignore, bool preempt, |
| 20 | struct task_struct *prev, struct task_struct *next) | 24 | struct task_struct *prev, struct task_struct *next) |
| 21 | { | 25 | { |
| 22 | if (unlikely(!sched_ref)) | 26 | int flags; |
| 23 | return; | 27 | |
| 28 | flags = (RECORD_TGID * !!sched_tgid_ref) + | ||
| 29 | (RECORD_CMDLINE * !!sched_cmdline_ref); | ||
| 24 | 30 | ||
| 25 | tracing_record_cmdline(prev); | 31 | if (!flags) |
| 26 | tracing_record_cmdline(next); | 32 | return; |
| 33 | tracing_record_taskinfo_sched_switch(prev, next, flags); | ||
| 27 | } | 34 | } |
| 28 | 35 | ||
| 29 | static void | 36 | static void |
| 30 | probe_sched_wakeup(void *ignore, struct task_struct *wakee) | 37 | probe_sched_wakeup(void *ignore, struct task_struct *wakee) |
| 31 | { | 38 | { |
| 32 | if (unlikely(!sched_ref)) | 39 | int flags; |
| 33 | return; | 40 | |
| 41 | flags = (RECORD_TGID * !!sched_tgid_ref) + | ||
| 42 | (RECORD_CMDLINE * !!sched_cmdline_ref); | ||
| 34 | 43 | ||
| 35 | tracing_record_cmdline(current); | 44 | if (!flags) |
| 45 | return; | ||
| 46 | tracing_record_taskinfo(current, flags); | ||
| 36 | } | 47 | } |
| 37 | 48 | ||
| 38 | static int tracing_sched_register(void) | 49 | static int tracing_sched_register(void) |
| @@ -75,28 +86,61 @@ static void tracing_sched_unregister(void) | |||
| 75 | unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); | 86 | unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); |
| 76 | } | 87 | } |
| 77 | 88 | ||
| 78 | static void tracing_start_sched_switch(void) | 89 | static void tracing_start_sched_switch(int ops) |
| 79 | { | 90 | { |
| 91 | bool sched_register = (!sched_cmdline_ref && !sched_tgid_ref); | ||
| 80 | mutex_lock(&sched_register_mutex); | 92 | mutex_lock(&sched_register_mutex); |
| 81 | if (!(sched_ref++)) | 93 | |
| 94 | switch (ops) { | ||
| 95 | case RECORD_CMDLINE: | ||
| 96 | sched_cmdline_ref++; | ||
| 97 | break; | ||
| 98 | |||
| 99 | case RECORD_TGID: | ||
| 100 | sched_tgid_ref++; | ||
| 101 | break; | ||
| 102 | } | ||
| 103 | |||
| 104 | if (sched_register && (sched_cmdline_ref || sched_tgid_ref)) | ||
| 82 | tracing_sched_register(); | 105 | tracing_sched_register(); |
| 83 | mutex_unlock(&sched_register_mutex); | 106 | mutex_unlock(&sched_register_mutex); |
| 84 | } | 107 | } |
| 85 | 108 | ||
| 86 | static void tracing_stop_sched_switch(void) | 109 | static void tracing_stop_sched_switch(int ops) |
| 87 | { | 110 | { |
| 88 | mutex_lock(&sched_register_mutex); | 111 | mutex_lock(&sched_register_mutex); |
| 89 | if (!(--sched_ref)) | 112 | |
| 113 | switch (ops) { | ||
| 114 | case RECORD_CMDLINE: | ||
| 115 | sched_cmdline_ref--; | ||
| 116 | break; | ||
| 117 | |||
| 118 | case RECORD_TGID: | ||
| 119 | sched_tgid_ref--; | ||
| 120 | break; | ||
| 121 | } | ||
| 122 | |||
| 123 | if (!sched_cmdline_ref && !sched_tgid_ref) | ||
| 90 | tracing_sched_unregister(); | 124 | tracing_sched_unregister(); |
| 91 | mutex_unlock(&sched_register_mutex); | 125 | mutex_unlock(&sched_register_mutex); |
| 92 | } | 126 | } |
| 93 | 127 | ||
| 94 | void tracing_start_cmdline_record(void) | 128 | void tracing_start_cmdline_record(void) |
| 95 | { | 129 | { |
| 96 | tracing_start_sched_switch(); | 130 | tracing_start_sched_switch(RECORD_CMDLINE); |
| 97 | } | 131 | } |
| 98 | 132 | ||
| 99 | void tracing_stop_cmdline_record(void) | 133 | void tracing_stop_cmdline_record(void) |
| 100 | { | 134 | { |
| 101 | tracing_stop_sched_switch(); | 135 | tracing_stop_sched_switch(RECORD_CMDLINE); |
| 136 | } | ||
| 137 | |||
| 138 | void tracing_start_tgid_record(void) | ||
| 139 | { | ||
| 140 | tracing_start_sched_switch(RECORD_TGID); | ||
| 141 | } | ||
| 142 | |||
| 143 | void tracing_stop_tgid_record(void) | ||
| 144 | { | ||
| 145 | tracing_stop_sched_switch(RECORD_TGID); | ||
| 102 | } | 146 | } |
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b4a751e8f9d6..a4df67cbc711 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
| @@ -406,6 +406,8 @@ static const struct file_operations stack_trace_fops = { | |||
| 406 | .release = seq_release, | 406 | .release = seq_release, |
| 407 | }; | 407 | }; |
| 408 | 408 | ||
| 409 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
| 410 | |||
| 409 | static int | 411 | static int |
| 410 | stack_trace_filter_open(struct inode *inode, struct file *file) | 412 | stack_trace_filter_open(struct inode *inode, struct file *file) |
| 411 | { | 413 | { |
| @@ -423,6 +425,8 @@ static const struct file_operations stack_trace_filter_fops = { | |||
| 423 | .release = ftrace_regex_release, | 425 | .release = ftrace_regex_release, |
| 424 | }; | 426 | }; |
| 425 | 427 | ||
| 428 | #endif /* CONFIG_DYNAMIC_FTRACE */ | ||
| 429 | |||
| 426 | int | 430 | int |
| 427 | stack_trace_sysctl(struct ctl_table *table, int write, | 431 | stack_trace_sysctl(struct ctl_table *table, int write, |
| 428 | void __user *buffer, size_t *lenp, | 432 | void __user *buffer, size_t *lenp, |
| @@ -477,8 +481,10 @@ static __init int stack_trace_init(void) | |||
| 477 | trace_create_file("stack_trace", 0444, d_tracer, | 481 | trace_create_file("stack_trace", 0444, d_tracer, |
| 478 | NULL, &stack_trace_fops); | 482 | NULL, &stack_trace_fops); |
| 479 | 483 | ||
| 484 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
| 480 | trace_create_file("stack_trace_filter", 0444, d_tracer, | 485 | trace_create_file("stack_trace_filter", 0444, d_tracer, |
| 481 | &trace_ops, &stack_trace_filter_fops); | 486 | &trace_ops, &stack_trace_filter_fops); |
| 487 | #endif | ||
| 482 | 488 | ||
| 483 | if (stack_trace_filter_buf[0]) | 489 | if (stack_trace_filter_buf[0]) |
| 484 | ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); | 490 | ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1); |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 03e0b69bb5bf..06d3389bca0d 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
| @@ -9,7 +9,7 @@ | |||
| 9 | * to those contributors as well. | 9 | * to those contributors as well. |
| 10 | */ | 10 | */ |
| 11 | 11 | ||
| 12 | #define pr_fmt(fmt) "NMI watchdog: " fmt | 12 | #define pr_fmt(fmt) "watchdog: " fmt |
| 13 | 13 | ||
| 14 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
| 15 | #include <linux/cpu.h> | 15 | #include <linux/cpu.h> |
| @@ -29,15 +29,58 @@ | |||
| 29 | #include <linux/kvm_para.h> | 29 | #include <linux/kvm_para.h> |
| 30 | #include <linux/kthread.h> | 30 | #include <linux/kthread.h> |
| 31 | 31 | ||
| 32 | /* Watchdog configuration */ | ||
| 32 | static DEFINE_MUTEX(watchdog_proc_mutex); | 33 | static DEFINE_MUTEX(watchdog_proc_mutex); |
| 33 | 34 | ||
| 34 | #if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) | 35 | int __read_mostly nmi_watchdog_enabled; |
| 35 | unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; | 36 | |
| 37 | #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) | ||
| 38 | unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | | ||
| 39 | NMI_WATCHDOG_ENABLED; | ||
| 36 | #else | 40 | #else |
| 37 | unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; | 41 | unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; |
| 38 | #endif | 42 | #endif |
| 39 | int __read_mostly nmi_watchdog_enabled; | 43 | |
| 44 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 45 | /* boot commands */ | ||
| 46 | /* | ||
| 47 | * Should we panic when a soft-lockup or hard-lockup occurs: | ||
| 48 | */ | ||
| 49 | unsigned int __read_mostly hardlockup_panic = | ||
| 50 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | ||
| 51 | /* | ||
| 52 | * We may not want to enable hard lockup detection by default in all cases, | ||
| 53 | * for example when running the kernel as a guest on a hypervisor. In these | ||
| 54 | * cases this function can be called to disable hard lockup detection. This | ||
| 55 | * function should only be executed once by the boot processor before the | ||
| 56 | * kernel command line parameters are parsed, because otherwise it is not | ||
| 57 | * possible to override this in hardlockup_panic_setup(). | ||
| 58 | */ | ||
| 59 | void hardlockup_detector_disable(void) | ||
| 60 | { | ||
| 61 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | ||
| 62 | } | ||
| 63 | |||
| 64 | static int __init hardlockup_panic_setup(char *str) | ||
| 65 | { | ||
| 66 | if (!strncmp(str, "panic", 5)) | ||
| 67 | hardlockup_panic = 1; | ||
| 68 | else if (!strncmp(str, "nopanic", 7)) | ||
| 69 | hardlockup_panic = 0; | ||
| 70 | else if (!strncmp(str, "0", 1)) | ||
| 71 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | ||
| 72 | else if (!strncmp(str, "1", 1)) | ||
| 73 | watchdog_enabled |= NMI_WATCHDOG_ENABLED; | ||
| 74 | return 1; | ||
| 75 | } | ||
| 76 | __setup("nmi_watchdog=", hardlockup_panic_setup); | ||
| 77 | |||
| 78 | #endif | ||
| 79 | |||
| 80 | #ifdef CONFIG_SOFTLOCKUP_DETECTOR | ||
| 40 | int __read_mostly soft_watchdog_enabled; | 81 | int __read_mostly soft_watchdog_enabled; |
| 82 | #endif | ||
| 83 | |||
| 41 | int __read_mostly watchdog_user_enabled; | 84 | int __read_mostly watchdog_user_enabled; |
| 42 | int __read_mostly watchdog_thresh = 10; | 85 | int __read_mostly watchdog_thresh = 10; |
| 43 | 86 | ||
| @@ -45,15 +88,9 @@ int __read_mostly watchdog_thresh = 10; | |||
| 45 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; | 88 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; |
| 46 | int __read_mostly sysctl_hardlockup_all_cpu_backtrace; | 89 | int __read_mostly sysctl_hardlockup_all_cpu_backtrace; |
| 47 | #endif | 90 | #endif |
| 48 | static struct cpumask watchdog_cpumask __read_mostly; | 91 | struct cpumask watchdog_cpumask __read_mostly; |
| 49 | unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); | 92 | unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); |
| 50 | 93 | ||
| 51 | /* Helper for online, unparked cpus. */ | ||
| 52 | #define for_each_watchdog_cpu(cpu) \ | ||
| 53 | for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) | ||
| 54 | |||
| 55 | atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); | ||
| 56 | |||
| 57 | /* | 94 | /* |
| 58 | * The 'watchdog_running' variable is set to 1 when the watchdog threads | 95 | * The 'watchdog_running' variable is set to 1 when the watchdog threads |
| 59 | * are registered/started and is set to 0 when the watchdog threads are | 96 | * are registered/started and is set to 0 when the watchdog threads are |
| @@ -72,7 +109,47 @@ static int __read_mostly watchdog_running; | |||
| 72 | * of 'watchdog_running' cannot change while the watchdog is deactivated | 109 | * of 'watchdog_running' cannot change while the watchdog is deactivated |
| 73 | * temporarily (see related code in 'proc' handlers). | 110 | * temporarily (see related code in 'proc' handlers). |
| 74 | */ | 111 | */ |
| 75 | static int __read_mostly watchdog_suspended; | 112 | int __read_mostly watchdog_suspended; |
| 113 | |||
| 114 | /* | ||
| 115 | * These functions can be overridden if an architecture implements its | ||
| 116 | * own hardlockup detector. | ||
| 117 | * | ||
| 118 | * watchdog_nmi_enable/disable can be implemented to start and stop when | ||
| 119 | * softlockup watchdog threads start and stop. The arch must select the | ||
| 120 | * SOFTLOCKUP_DETECTOR Kconfig. | ||
| 121 | */ | ||
| 122 | int __weak watchdog_nmi_enable(unsigned int cpu) | ||
| 123 | { | ||
| 124 | return 0; | ||
| 125 | } | ||
| 126 | void __weak watchdog_nmi_disable(unsigned int cpu) | ||
| 127 | { | ||
| 128 | } | ||
| 129 | |||
| 130 | /* | ||
| 131 | * watchdog_nmi_reconfigure can be implemented to be notified after any | ||
| 132 | * watchdog configuration change. The arch hardlockup watchdog should | ||
| 133 | * respond to the following variables: | ||
| 134 | * - nmi_watchdog_enabled | ||
| 135 | * - watchdog_thresh | ||
| 136 | * - watchdog_cpumask | ||
| 137 | * - sysctl_hardlockup_all_cpu_backtrace | ||
| 138 | * - hardlockup_panic | ||
| 139 | * - watchdog_suspended | ||
| 140 | */ | ||
| 141 | void __weak watchdog_nmi_reconfigure(void) | ||
| 142 | { | ||
| 143 | } | ||
| 144 | |||
| 145 | |||
| 146 | #ifdef CONFIG_SOFTLOCKUP_DETECTOR | ||
| 147 | |||
| 148 | /* Helper for online, unparked cpus. */ | ||
| 149 | #define for_each_watchdog_cpu(cpu) \ | ||
| 150 | for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) | ||
| 151 | |||
| 152 | atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); | ||
| 76 | 153 | ||
| 77 | static u64 __read_mostly sample_period; | 154 | static u64 __read_mostly sample_period; |
| 78 | 155 | ||
| @@ -120,6 +197,7 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str) | |||
| 120 | return 1; | 197 | return 1; |
| 121 | } | 198 | } |
| 122 | __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); | 199 | __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); |
| 200 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 123 | static int __init hardlockup_all_cpu_backtrace_setup(char *str) | 201 | static int __init hardlockup_all_cpu_backtrace_setup(char *str) |
| 124 | { | 202 | { |
| 125 | sysctl_hardlockup_all_cpu_backtrace = | 203 | sysctl_hardlockup_all_cpu_backtrace = |
| @@ -128,6 +206,7 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str) | |||
| 128 | } | 206 | } |
| 129 | __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); | 207 | __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); |
| 130 | #endif | 208 | #endif |
| 209 | #endif | ||
| 131 | 210 | ||
| 132 | /* | 211 | /* |
| 133 | * Hard-lockup warnings should be triggered after just a few seconds. Soft- | 212 | * Hard-lockup warnings should be triggered after just a few seconds. Soft- |
| @@ -213,18 +292,6 @@ void touch_softlockup_watchdog_sync(void) | |||
| 213 | __this_cpu_write(watchdog_touch_ts, 0); | 292 | __this_cpu_write(watchdog_touch_ts, 0); |
| 214 | } | 293 | } |
| 215 | 294 | ||
| 216 | /* watchdog detector functions */ | ||
| 217 | bool is_hardlockup(void) | ||
| 218 | { | ||
| 219 | unsigned long hrint = __this_cpu_read(hrtimer_interrupts); | ||
| 220 | |||
| 221 | if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) | ||
| 222 | return true; | ||
| 223 | |||
| 224 | __this_cpu_write(hrtimer_interrupts_saved, hrint); | ||
| 225 | return false; | ||
| 226 | } | ||
| 227 | |||
| 228 | static int is_softlockup(unsigned long touch_ts) | 295 | static int is_softlockup(unsigned long touch_ts) |
| 229 | { | 296 | { |
| 230 | unsigned long now = get_timestamp(); | 297 | unsigned long now = get_timestamp(); |
| @@ -237,21 +304,21 @@ static int is_softlockup(unsigned long touch_ts) | |||
| 237 | return 0; | 304 | return 0; |
| 238 | } | 305 | } |
| 239 | 306 | ||
| 240 | static void watchdog_interrupt_count(void) | 307 | /* watchdog detector functions */ |
| 308 | bool is_hardlockup(void) | ||
| 241 | { | 309 | { |
| 242 | __this_cpu_inc(hrtimer_interrupts); | 310 | unsigned long hrint = __this_cpu_read(hrtimer_interrupts); |
| 243 | } | ||
| 244 | 311 | ||
| 245 | /* | 312 | if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) |
| 246 | * These two functions are mostly architecture specific | 313 | return true; |
| 247 | * defining them as weak here. | 314 | |
| 248 | */ | 315 | __this_cpu_write(hrtimer_interrupts_saved, hrint); |
| 249 | int __weak watchdog_nmi_enable(unsigned int cpu) | 316 | return false; |
| 250 | { | ||
| 251 | return 0; | ||
| 252 | } | 317 | } |
| 253 | void __weak watchdog_nmi_disable(unsigned int cpu) | 318 | |
| 319 | static void watchdog_interrupt_count(void) | ||
| 254 | { | 320 | { |
| 321 | __this_cpu_inc(hrtimer_interrupts); | ||
| 255 | } | 322 | } |
| 256 | 323 | ||
| 257 | static int watchdog_enable_all_cpus(void); | 324 | static int watchdog_enable_all_cpus(void); |
| @@ -502,57 +569,6 @@ static void watchdog_unpark_threads(void) | |||
| 502 | kthread_unpark(per_cpu(softlockup_watchdog, cpu)); | 569 | kthread_unpark(per_cpu(softlockup_watchdog, cpu)); |
| 503 | } | 570 | } |
| 504 | 571 | ||
| 505 | /* | ||
| 506 | * Suspend the hard and soft lockup detector by parking the watchdog threads. | ||
| 507 | */ | ||
| 508 | int lockup_detector_suspend(void) | ||
| 509 | { | ||
| 510 | int ret = 0; | ||
| 511 | |||
| 512 | get_online_cpus(); | ||
| 513 | mutex_lock(&watchdog_proc_mutex); | ||
| 514 | /* | ||
| 515 | * Multiple suspend requests can be active in parallel (counted by | ||
| 516 | * the 'watchdog_suspended' variable). If the watchdog threads are | ||
| 517 | * running, the first caller takes care that they will be parked. | ||
| 518 | * The state of 'watchdog_running' cannot change while a suspend | ||
| 519 | * request is active (see related code in 'proc' handlers). | ||
| 520 | */ | ||
| 521 | if (watchdog_running && !watchdog_suspended) | ||
| 522 | ret = watchdog_park_threads(); | ||
| 523 | |||
| 524 | if (ret == 0) | ||
| 525 | watchdog_suspended++; | ||
| 526 | else { | ||
| 527 | watchdog_disable_all_cpus(); | ||
| 528 | pr_err("Failed to suspend lockup detectors, disabled\n"); | ||
| 529 | watchdog_enabled = 0; | ||
| 530 | } | ||
| 531 | |||
| 532 | mutex_unlock(&watchdog_proc_mutex); | ||
| 533 | |||
| 534 | return ret; | ||
| 535 | } | ||
| 536 | |||
| 537 | /* | ||
| 538 | * Resume the hard and soft lockup detector by unparking the watchdog threads. | ||
| 539 | */ | ||
| 540 | void lockup_detector_resume(void) | ||
| 541 | { | ||
| 542 | mutex_lock(&watchdog_proc_mutex); | ||
| 543 | |||
| 544 | watchdog_suspended--; | ||
| 545 | /* | ||
| 546 | * The watchdog threads are unparked if they were previously running | ||
| 547 | * and if there is no more active suspend request. | ||
| 548 | */ | ||
| 549 | if (watchdog_running && !watchdog_suspended) | ||
| 550 | watchdog_unpark_threads(); | ||
| 551 | |||
| 552 | mutex_unlock(&watchdog_proc_mutex); | ||
| 553 | put_online_cpus(); | ||
| 554 | } | ||
| 555 | |||
| 556 | static int update_watchdog_all_cpus(void) | 572 | static int update_watchdog_all_cpus(void) |
| 557 | { | 573 | { |
| 558 | int ret; | 574 | int ret; |
| @@ -605,6 +621,100 @@ static void watchdog_disable_all_cpus(void) | |||
| 605 | } | 621 | } |
| 606 | 622 | ||
| 607 | #ifdef CONFIG_SYSCTL | 623 | #ifdef CONFIG_SYSCTL |
| 624 | static int watchdog_update_cpus(void) | ||
| 625 | { | ||
| 626 | return smpboot_update_cpumask_percpu_thread( | ||
| 627 | &watchdog_threads, &watchdog_cpumask); | ||
| 628 | } | ||
| 629 | #endif | ||
| 630 | |||
| 631 | #else /* SOFTLOCKUP */ | ||
| 632 | static int watchdog_park_threads(void) | ||
| 633 | { | ||
| 634 | return 0; | ||
| 635 | } | ||
| 636 | |||
| 637 | static void watchdog_unpark_threads(void) | ||
| 638 | { | ||
| 639 | } | ||
| 640 | |||
| 641 | static int watchdog_enable_all_cpus(void) | ||
| 642 | { | ||
| 643 | return 0; | ||
| 644 | } | ||
| 645 | |||
| 646 | static void watchdog_disable_all_cpus(void) | ||
| 647 | { | ||
| 648 | } | ||
| 649 | |||
| 650 | #ifdef CONFIG_SYSCTL | ||
| 651 | static int watchdog_update_cpus(void) | ||
| 652 | { | ||
| 653 | return 0; | ||
| 654 | } | ||
| 655 | #endif | ||
| 656 | |||
| 657 | static void set_sample_period(void) | ||
| 658 | { | ||
| 659 | } | ||
| 660 | #endif /* SOFTLOCKUP */ | ||
| 661 | |||
| 662 | /* | ||
| 663 | * Suspend the hard and soft lockup detector by parking the watchdog threads. | ||
| 664 | */ | ||
| 665 | int lockup_detector_suspend(void) | ||
| 666 | { | ||
| 667 | int ret = 0; | ||
| 668 | |||
| 669 | get_online_cpus(); | ||
| 670 | mutex_lock(&watchdog_proc_mutex); | ||
| 671 | /* | ||
| 672 | * Multiple suspend requests can be active in parallel (counted by | ||
| 673 | * the 'watchdog_suspended' variable). If the watchdog threads are | ||
| 674 | * running, the first caller takes care that they will be parked. | ||
| 675 | * The state of 'watchdog_running' cannot change while a suspend | ||
| 676 | * request is active (see related code in 'proc' handlers). | ||
| 677 | */ | ||
| 678 | if (watchdog_running && !watchdog_suspended) | ||
| 679 | ret = watchdog_park_threads(); | ||
| 680 | |||
| 681 | if (ret == 0) | ||
| 682 | watchdog_suspended++; | ||
| 683 | else { | ||
| 684 | watchdog_disable_all_cpus(); | ||
| 685 | pr_err("Failed to suspend lockup detectors, disabled\n"); | ||
| 686 | watchdog_enabled = 0; | ||
| 687 | } | ||
| 688 | |||
| 689 | watchdog_nmi_reconfigure(); | ||
| 690 | |||
| 691 | mutex_unlock(&watchdog_proc_mutex); | ||
| 692 | |||
| 693 | return ret; | ||
| 694 | } | ||
| 695 | |||
| 696 | /* | ||
| 697 | * Resume the hard and soft lockup detector by unparking the watchdog threads. | ||
| 698 | */ | ||
| 699 | void lockup_detector_resume(void) | ||
| 700 | { | ||
| 701 | mutex_lock(&watchdog_proc_mutex); | ||
| 702 | |||
| 703 | watchdog_suspended--; | ||
| 704 | /* | ||
| 705 | * The watchdog threads are unparked if they were previously running | ||
| 706 | * and if there is no more active suspend request. | ||
| 707 | */ | ||
| 708 | if (watchdog_running && !watchdog_suspended) | ||
| 709 | watchdog_unpark_threads(); | ||
| 710 | |||
| 711 | watchdog_nmi_reconfigure(); | ||
| 712 | |||
| 713 | mutex_unlock(&watchdog_proc_mutex); | ||
| 714 | put_online_cpus(); | ||
| 715 | } | ||
| 716 | |||
| 717 | #ifdef CONFIG_SYSCTL | ||
| 608 | 718 | ||
| 609 | /* | 719 | /* |
| 610 | * Update the run state of the lockup detectors. | 720 | * Update the run state of the lockup detectors. |
| @@ -625,6 +735,8 @@ static int proc_watchdog_update(void) | |||
| 625 | else | 735 | else |
| 626 | watchdog_disable_all_cpus(); | 736 | watchdog_disable_all_cpus(); |
| 627 | 737 | ||
| 738 | watchdog_nmi_reconfigure(); | ||
| 739 | |||
| 628 | return err; | 740 | return err; |
| 629 | 741 | ||
| 630 | } | 742 | } |
| @@ -810,10 +922,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, | |||
| 810 | * a temporary cpumask, so we are likely not in a | 922 | * a temporary cpumask, so we are likely not in a |
| 811 | * position to do much else to make things better. | 923 | * position to do much else to make things better. |
| 812 | */ | 924 | */ |
| 813 | if (smpboot_update_cpumask_percpu_thread( | 925 | if (watchdog_update_cpus() != 0) |
| 814 | &watchdog_threads, &watchdog_cpumask) != 0) | ||
| 815 | pr_err("cpumask update failed\n"); | 926 | pr_err("cpumask update failed\n"); |
| 816 | } | 927 | } |
| 928 | |||
| 929 | watchdog_nmi_reconfigure(); | ||
| 817 | } | 930 | } |
| 818 | out: | 931 | out: |
| 819 | mutex_unlock(&watchdog_proc_mutex); | 932 | mutex_unlock(&watchdog_proc_mutex); |
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 54a427d1f344..295a0d84934c 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c | |||
| @@ -22,41 +22,9 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); | |||
| 22 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); | 22 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); |
| 23 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | 23 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); |
| 24 | 24 | ||
| 25 | /* boot commands */ | ||
| 26 | /* | ||
| 27 | * Should we panic when a soft-lockup or hard-lockup occurs: | ||
| 28 | */ | ||
| 29 | unsigned int __read_mostly hardlockup_panic = | ||
| 30 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | ||
| 31 | static unsigned long hardlockup_allcpu_dumped; | 25 | static unsigned long hardlockup_allcpu_dumped; |
| 32 | /* | ||
| 33 | * We may not want to enable hard lockup detection by default in all cases, | ||
| 34 | * for example when running the kernel as a guest on a hypervisor. In these | ||
| 35 | * cases this function can be called to disable hard lockup detection. This | ||
| 36 | * function should only be executed once by the boot processor before the | ||
| 37 | * kernel command line parameters are parsed, because otherwise it is not | ||
| 38 | * possible to override this in hardlockup_panic_setup(). | ||
| 39 | */ | ||
| 40 | void hardlockup_detector_disable(void) | ||
| 41 | { | ||
| 42 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | ||
| 43 | } | ||
| 44 | |||
| 45 | static int __init hardlockup_panic_setup(char *str) | ||
| 46 | { | ||
| 47 | if (!strncmp(str, "panic", 5)) | ||
| 48 | hardlockup_panic = 1; | ||
| 49 | else if (!strncmp(str, "nopanic", 7)) | ||
| 50 | hardlockup_panic = 0; | ||
| 51 | else if (!strncmp(str, "0", 1)) | ||
| 52 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | ||
| 53 | else if (!strncmp(str, "1", 1)) | ||
| 54 | watchdog_enabled |= NMI_WATCHDOG_ENABLED; | ||
| 55 | return 1; | ||
| 56 | } | ||
| 57 | __setup("nmi_watchdog=", hardlockup_panic_setup); | ||
| 58 | 26 | ||
| 59 | void touch_nmi_watchdog(void) | 27 | void arch_touch_nmi_watchdog(void) |
| 60 | { | 28 | { |
| 61 | /* | 29 | /* |
| 62 | * Using __raw here because some code paths have | 30 | * Using __raw here because some code paths have |
| @@ -66,9 +34,8 @@ void touch_nmi_watchdog(void) | |||
| 66 | * going off. | 34 | * going off. |
| 67 | */ | 35 | */ |
| 68 | raw_cpu_write(watchdog_nmi_touch, true); | 36 | raw_cpu_write(watchdog_nmi_touch, true); |
| 69 | touch_softlockup_watchdog(); | ||
| 70 | } | 37 | } |
| 71 | EXPORT_SYMBOL(touch_nmi_watchdog); | 38 | EXPORT_SYMBOL(arch_touch_nmi_watchdog); |
| 72 | 39 | ||
| 73 | static struct perf_event_attr wd_hw_attr = { | 40 | static struct perf_event_attr wd_hw_attr = { |
| 74 | .type = PERF_TYPE_HARDWARE, | 41 | .type = PERF_TYPE_HARDWARE, |
