Diffstat (limited to 'kernel')
126 files changed, 6383 insertions, 3755 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index d2b32ac27a39..35536d9c0964 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -223,3 +223,10 @@ endif | |||
223 | config MUTEX_SPIN_ON_OWNER | 223 | config MUTEX_SPIN_ON_OWNER |
224 | def_bool y | 224 | def_bool y |
225 | depends on SMP && !DEBUG_MUTEXES | 225 | depends on SMP && !DEBUG_MUTEXES |
226 | |||
227 | config ARCH_USE_QUEUE_RWLOCK | ||
228 | bool | ||
229 | |||
230 | config QUEUE_RWLOCK | ||
231 | def_bool y if ARCH_USE_QUEUE_RWLOCK | ||
232 | depends on SMP | ||
diff --git a/kernel/acct.c b/kernel/acct.c
index 8d6e145138bb..808a86ff229d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -55,7 +55,7 @@ | |||
55 | #include <linux/times.h> | 55 | #include <linux/times.h> |
56 | #include <linux/syscalls.h> | 56 | #include <linux/syscalls.h> |
57 | #include <linux/mount.h> | 57 | #include <linux/mount.h> |
58 | #include <asm/uaccess.h> | 58 | #include <linux/uaccess.h> |
59 | #include <asm/div64.h> | 59 | #include <asm/div64.h> |
60 | #include <linux/blkdev.h> /* sector_div */ | 60 | #include <linux/blkdev.h> /* sector_div */ |
61 | #include <linux/pid_namespace.h> | 61 | #include <linux/pid_namespace.h> |
@@ -134,7 +134,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) | |||
134 | spin_lock(&acct_lock); | 134 | spin_lock(&acct_lock); |
135 | if (file != acct->file) { | 135 | if (file != acct->file) { |
136 | if (act) | 136 | if (act) |
137 | res = act>0; | 137 | res = act > 0; |
138 | goto out; | 138 | goto out; |
139 | } | 139 | } |
140 | 140 | ||
@@ -262,7 +262,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) | |||
262 | if (name) { | 262 | if (name) { |
263 | struct filename *tmp = getname(name); | 263 | struct filename *tmp = getname(name); |
264 | if (IS_ERR(tmp)) | 264 | if (IS_ERR(tmp)) |
265 | return (PTR_ERR(tmp)); | 265 | return PTR_ERR(tmp); |
266 | error = acct_on(tmp); | 266 | error = acct_on(tmp); |
267 | putname(tmp); | 267 | putname(tmp); |
268 | } else { | 268 | } else { |
diff --git a/kernel/audit.c b/kernel/audit.c
index 7c2893602d06..3ef2e0e797e8 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -44,7 +44,7 @@ | |||
44 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 44 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
45 | 45 | ||
46 | #include <linux/init.h> | 46 | #include <linux/init.h> |
47 | #include <asm/types.h> | 47 | #include <linux/types.h> |
48 | #include <linux/atomic.h> | 48 | #include <linux/atomic.h> |
49 | #include <linux/mm.h> | 49 | #include <linux/mm.h> |
50 | #include <linux/export.h> | 50 | #include <linux/export.h> |
@@ -424,6 +424,38 @@ static void kauditd_send_skb(struct sk_buff *skb) | |||
424 | } | 424 | } |
425 | 425 | ||
426 | /* | 426 | /* |
427 | * kauditd_send_multicast_skb - send the skb to multicast userspace listeners | ||
428 | * | ||
429 | * This function doesn't consume an skb as might be expected since it has to | ||
430 | * copy it anyways. | ||
431 | */ | ||
432 | static void kauditd_send_multicast_skb(struct sk_buff *skb) | ||
433 | { | ||
434 | struct sk_buff *copy; | ||
435 | struct audit_net *aunet = net_generic(&init_net, audit_net_id); | ||
436 | struct sock *sock = aunet->nlsk; | ||
437 | |||
438 | if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG)) | ||
439 | return; | ||
440 | |||
441 | /* | ||
442 | * The seemingly wasteful skb_copy() rather than bumping the refcount | ||
443 | * using skb_get() is necessary because non-standard mods are made to | ||
444 | * the skb by the original kaudit unicast socket send routine. The | ||
445 | * existing auditd daemon assumes this breakage. Fixing this would | ||
446 | * require co-ordinating a change in the established protocol between | ||
447 | * the kaudit kernel subsystem and the auditd userspace code. There is | ||
448 | * no reason for new multicast clients to continue with this | ||
449 | * non-compliance. | ||
450 | */ | ||
451 | copy = skb_copy(skb, GFP_KERNEL); | ||
452 | if (!copy) | ||
453 | return; | ||
454 | |||
455 | nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); | ||
456 | } | ||
457 | |||
458 | /* | ||
427 | * flush_hold_queue - empty the hold queue if auditd appears | 459 | * flush_hold_queue - empty the hold queue if auditd appears |
428 | * | 460 | * |
429 | * If auditd just started, drain the queue of messages already | 461 | * If auditd just started, drain the queue of messages already |
@@ -643,13 +675,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) | |||
643 | if ((task_active_pid_ns(current) != &init_pid_ns)) | 675 | if ((task_active_pid_ns(current) != &init_pid_ns)) |
644 | return -EPERM; | 676 | return -EPERM; |
645 | 677 | ||
646 | if (!capable(CAP_AUDIT_CONTROL)) | 678 | if (!netlink_capable(skb, CAP_AUDIT_CONTROL)) |
647 | err = -EPERM; | 679 | err = -EPERM; |
648 | break; | 680 | break; |
649 | case AUDIT_USER: | 681 | case AUDIT_USER: |
650 | case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: | 682 | case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: |
651 | case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: | 683 | case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: |
652 | if (!capable(CAP_AUDIT_WRITE)) | 684 | if (!netlink_capable(skb, CAP_AUDIT_WRITE)) |
653 | err = -EPERM; | 685 | err = -EPERM; |
654 | break; | 686 | break; |
655 | default: /* bad msg */ | 687 | default: /* bad msg */ |
@@ -1076,10 +1108,22 @@ static void audit_receive(struct sk_buff *skb) | |||
1076 | mutex_unlock(&audit_cmd_mutex); | 1108 | mutex_unlock(&audit_cmd_mutex); |
1077 | } | 1109 | } |
1078 | 1110 | ||
1111 | /* Run custom bind function on netlink socket group connect or bind requests. */ | ||
1112 | static int audit_bind(int group) | ||
1113 | { | ||
1114 | if (!capable(CAP_AUDIT_READ)) | ||
1115 | return -EPERM; | ||
1116 | |||
1117 | return 0; | ||
1118 | } | ||
1119 | |||
1079 | static int __net_init audit_net_init(struct net *net) | 1120 | static int __net_init audit_net_init(struct net *net) |
1080 | { | 1121 | { |
1081 | struct netlink_kernel_cfg cfg = { | 1122 | struct netlink_kernel_cfg cfg = { |
1082 | .input = audit_receive, | 1123 | .input = audit_receive, |
1124 | .bind = audit_bind, | ||
1125 | .flags = NL_CFG_F_NONROOT_RECV, | ||
1126 | .groups = AUDIT_NLGRP_MAX, | ||
1083 | }; | 1127 | }; |
1084 | 1128 | ||
1085 | struct audit_net *aunet = net_generic(net, audit_net_id); | 1129 | struct audit_net *aunet = net_generic(net, audit_net_id); |
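Together with the multicast send path added earlier in this file, this is the whole kernel side of the new read-log group: NL_CFG_F_NONROOT_RECV lifts the blanket root-only restriction on receiving netlink multicasts, and the per-group gate is audit_bind(), which requires CAP_AUDIT_READ. A minimal userspace reader might look like the sketch below; the program is hypothetical, and AUDIT_NLGRP_READLOG comes from the uapi audit header once this series is applied.

    #include <sys/socket.h>
    #include <linux/netlink.h>
    #include <linux/audit.h>

    #ifndef SOL_NETLINK
    #define SOL_NETLINK 270                 /* not exposed by every libc */
    #endif

    int main(void)
    {
            int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
            unsigned int grp = AUDIT_NLGRP_READLOG;
            char buf[8192];

            if (fd < 0)
                    return 1;
            /* joining the group is what invokes the kernel's audit_bind() */
            if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
                           &grp, sizeof(grp)) < 0)
                    return 1;               /* EPERM without CAP_AUDIT_READ */
            while (recv(fd, buf, sizeof(buf), 0) > 0)
                    ;                       /* process the audit record(s) here */
            return 0;
    }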
@@ -1901,10 +1945,10 @@ out: | |||
1901 | * audit_log_end - end one audit record | 1945 | * audit_log_end - end one audit record |
1902 | * @ab: the audit_buffer | 1946 | * @ab: the audit_buffer |
1903 | * | 1947 | * |
1904 | * The netlink_* functions cannot be called inside an irq context, so | 1948 | * netlink_unicast() cannot be called inside an irq context because it blocks |
1905 | * the audit buffer is placed on a queue and a tasklet is scheduled to | 1949 | * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed |
1906 | * remove them from the queue outside the irq context. May be called in | 1950 | * on a queue and a tasklet is scheduled to remove them from the queue outside |
1907 | * any context. | 1951 | * the irq context. May be called in any context. |
1908 | */ | 1952 | */ |
1909 | void audit_log_end(struct audit_buffer *ab) | 1953 | void audit_log_end(struct audit_buffer *ab) |
1910 | { | 1954 | { |
@@ -1914,6 +1958,18 @@ void audit_log_end(struct audit_buffer *ab) | |||
1914 | audit_log_lost("rate limit exceeded"); | 1958 | audit_log_lost("rate limit exceeded"); |
1915 | } else { | 1959 | } else { |
1916 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); | 1960 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); |
1961 | |||
1962 | kauditd_send_multicast_skb(ab->skb); | ||
1963 | |||
1964 | /* | ||
1965 | * The original kaudit unicast socket sends up messages with | ||
1966 | * nlmsg_len set to the payload length rather than the entire | ||
1967 | * message length. This breaks the standard set by netlink. | ||
1968 | * The existing auditd daemon assumes this breakage. Fixing | ||
1969 | * this would require co-ordinating a change in the established | ||
1970 | * protocol between the kaudit kernel subsystem and the auditd | ||
1971 | * userspace code. | ||
1972 | */ | ||
1917 | nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; | 1973 | nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; |
1918 | 1974 | ||
1919 | if (audit_pid) { | 1975 | if (audit_pid) { |
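The non-conformance described above is easiest to see next to the standard helper: nlmsg_end() records the full message size (header plus payload), while the legacy kaudit unicast path stores only the payload size. A side-by-side sketch, not part of the patch:

    /* standard netlink finalization, what nlmsg_end(skb, nlh) does */
    nlh->nlmsg_len = skb_tail_pointer(skb) - (unsigned char *)nlh;

    /* legacy kaudit unicast behaviour kept above for compatibility:
     * the header is subtracted, so only the payload length is reported */
    nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;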
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f251a5e8d17a..21eae3c05ec0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -728,6 +728,22 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) | |||
728 | return AUDIT_BUILD_CONTEXT; | 728 | return AUDIT_BUILD_CONTEXT; |
729 | } | 729 | } |
730 | 730 | ||
731 | static int audit_in_mask(const struct audit_krule *rule, unsigned long val) | ||
732 | { | ||
733 | int word, bit; | ||
734 | |||
735 | if (val > 0xffffffff) | ||
736 | return false; | ||
737 | |||
738 | word = AUDIT_WORD(val); | ||
739 | if (word >= AUDIT_BITMASK_SIZE) | ||
740 | return false; | ||
741 | |||
742 | bit = AUDIT_BIT(val); | ||
743 | |||
744 | return rule->mask[word] & bit; | ||
745 | } | ||
746 | |||
731 | /* At syscall entry and exit time, this filter is called if the | 747 | /* At syscall entry and exit time, this filter is called if the |
732 | * audit_state is not low enough that auditing cannot take place, but is | 748 | * audit_state is not low enough that auditing cannot take place, but is |
733 | * also not high enough that we already know we have to write an audit | 749 | * also not high enough that we already know we have to write an audit |
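audit_in_mask() centralizes the word/bit arithmetic that the two filter loops below used to open-code, and adds bounds checks on top. The mask macros come from include/uapi/linux/audit.h; a worked example (syscall number chosen only for illustration):

    #define AUDIT_WORD(nr)  ((__u32)((nr)/32))
    #define AUDIT_BIT(nr)   (1 << ((nr) - AUDIT_WORD(nr)*32))

    /* ctx->major == 59 (execve on x86_64):
     *   word = 59 / 32        = 1
     *   bit  = 1 << (59 - 32) = 1 << 27
     * so the rule matches iff rule->mask[1] has bit 27 set; the helper
     * additionally rejects values that don't fit in the mask array
     * (word >= AUDIT_BITMASK_SIZE) or in 32 bits at all.
     */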
@@ -745,11 +761,8 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, | |||
745 | 761 | ||
746 | rcu_read_lock(); | 762 | rcu_read_lock(); |
747 | if (!list_empty(list)) { | 763 | if (!list_empty(list)) { |
748 | int word = AUDIT_WORD(ctx->major); | ||
749 | int bit = AUDIT_BIT(ctx->major); | ||
750 | |||
751 | list_for_each_entry_rcu(e, list, list) { | 764 | list_for_each_entry_rcu(e, list, list) { |
752 | if ((e->rule.mask[word] & bit) == bit && | 765 | if (audit_in_mask(&e->rule, ctx->major) && |
753 | audit_filter_rules(tsk, &e->rule, ctx, NULL, | 766 | audit_filter_rules(tsk, &e->rule, ctx, NULL, |
754 | &state, false)) { | 767 | &state, false)) { |
755 | rcu_read_unlock(); | 768 | rcu_read_unlock(); |
@@ -769,20 +782,16 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, | |||
769 | static int audit_filter_inode_name(struct task_struct *tsk, | 782 | static int audit_filter_inode_name(struct task_struct *tsk, |
770 | struct audit_names *n, | 783 | struct audit_names *n, |
771 | struct audit_context *ctx) { | 784 | struct audit_context *ctx) { |
772 | int word, bit; | ||
773 | int h = audit_hash_ino((u32)n->ino); | 785 | int h = audit_hash_ino((u32)n->ino); |
774 | struct list_head *list = &audit_inode_hash[h]; | 786 | struct list_head *list = &audit_inode_hash[h]; |
775 | struct audit_entry *e; | 787 | struct audit_entry *e; |
776 | enum audit_state state; | 788 | enum audit_state state; |
777 | 789 | ||
778 | word = AUDIT_WORD(ctx->major); | ||
779 | bit = AUDIT_BIT(ctx->major); | ||
780 | |||
781 | if (list_empty(list)) | 790 | if (list_empty(list)) |
782 | return 0; | 791 | return 0; |
783 | 792 | ||
784 | list_for_each_entry_rcu(e, list, list) { | 793 | list_for_each_entry_rcu(e, list, list) { |
785 | if ((e->rule.mask[word] & bit) == bit && | 794 | if (audit_in_mask(&e->rule, ctx->major) && |
786 | audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { | 795 | audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { |
787 | ctx->current_state = state; | 796 | ctx->current_state = state; |
788 | return 1; | 797 | return 1; |
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a5e026bc45c4..1323360d90e3 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -19,8 +19,8 @@ | |||
19 | 19 | ||
20 | static void backtrace_test_normal(void) | 20 | static void backtrace_test_normal(void) |
21 | { | 21 | { |
22 | printk("Testing a backtrace from process context.\n"); | 22 | pr_info("Testing a backtrace from process context.\n"); |
23 | printk("The following trace is a kernel self test and not a bug!\n"); | 23 | pr_info("The following trace is a kernel self test and not a bug!\n"); |
24 | 24 | ||
25 | dump_stack(); | 25 | dump_stack(); |
26 | } | 26 | } |
@@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0); | |||
37 | 37 | ||
38 | static void backtrace_test_irq(void) | 38 | static void backtrace_test_irq(void) |
39 | { | 39 | { |
40 | printk("Testing a backtrace from irq context.\n"); | 40 | pr_info("Testing a backtrace from irq context.\n"); |
41 | printk("The following trace is a kernel self test and not a bug!\n"); | 41 | pr_info("The following trace is a kernel self test and not a bug!\n"); |
42 | 42 | ||
43 | init_completion(&backtrace_work); | 43 | init_completion(&backtrace_work); |
44 | tasklet_schedule(&backtrace_tasklet); | 44 | tasklet_schedule(&backtrace_tasklet); |
@@ -51,8 +51,8 @@ static void backtrace_test_saved(void) | |||
51 | struct stack_trace trace; | 51 | struct stack_trace trace; |
52 | unsigned long entries[8]; | 52 | unsigned long entries[8]; |
53 | 53 | ||
54 | printk("Testing a saved backtrace.\n"); | 54 | pr_info("Testing a saved backtrace.\n"); |
55 | printk("The following trace is a kernel self test and not a bug!\n"); | 55 | pr_info("The following trace is a kernel self test and not a bug!\n"); |
56 | 56 | ||
57 | trace.nr_entries = 0; | 57 | trace.nr_entries = 0; |
58 | trace.max_entries = ARRAY_SIZE(entries); | 58 | trace.max_entries = ARRAY_SIZE(entries); |
@@ -65,19 +65,19 @@ static void backtrace_test_saved(void) | |||
65 | #else | 65 | #else |
66 | static void backtrace_test_saved(void) | 66 | static void backtrace_test_saved(void) |
67 | { | 67 | { |
68 | printk("Saved backtrace test skipped.\n"); | 68 | pr_info("Saved backtrace test skipped.\n"); |
69 | } | 69 | } |
70 | #endif | 70 | #endif |
71 | 71 | ||
72 | static int backtrace_regression_test(void) | 72 | static int backtrace_regression_test(void) |
73 | { | 73 | { |
74 | printk("====[ backtrace testing ]===========\n"); | 74 | pr_info("====[ backtrace testing ]===========\n"); |
75 | 75 | ||
76 | backtrace_test_normal(); | 76 | backtrace_test_normal(); |
77 | backtrace_test_irq(); | 77 | backtrace_test_irq(); |
78 | backtrace_test_saved(); | 78 | backtrace_test_saved(); |
79 | 79 | ||
80 | printk("====[ end of backtrace testing ]====\n"); | 80 | pr_info("====[ end of backtrace testing ]====\n"); |
81 | return 0; | 81 | return 0; |
82 | } | 82 | } |
83 | 83 | ||
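The printk()-to-pr_info() conversion is mechanical: pr_info(fmt, ...) expands to printk(KERN_INFO pr_fmt(fmt), ...), so every message gets an explicit KERN_INFO level plus whatever prefix the file's pr_fmt() supplies. A small illustration; the prefix shown is hypothetical, not necessarily what this file defines:

    /* must be defined before the first include that pulls in printk.h */
    #define pr_fmt(fmt) "backtrace-test: " fmt

    #include <linux/printk.h>

    static void example(void)
    {
            /* logs: "backtrace-test: Testing a backtrace from process context." */
            pr_info("Testing a backtrace from process context.\n");
    }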
diff --git a/kernel/capability.c b/kernel/capability.c
index a8d63df0c322..a5cf13c018ce 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -24,7 +24,6 @@ | |||
24 | */ | 24 | */ |
25 | 25 | ||
26 | const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; | 26 | const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; |
27 | |||
28 | EXPORT_SYMBOL(__cap_empty_set); | 27 | EXPORT_SYMBOL(__cap_empty_set); |
29 | 28 | ||
30 | int file_caps_enabled = 1; | 29 | int file_caps_enabled = 1; |
@@ -189,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) | |||
189 | * | 188 | * |
190 | * An alternative would be to return an error here | 189 | * An alternative would be to return an error here |
191 | * (-ERANGE), but that causes legacy applications to | 190 | * (-ERANGE), but that causes legacy applications to |
192 | * unexpectidly fail; the capget/modify/capset aborts | 191 | * unexpectedly fail; the capget/modify/capset aborts |
193 | * before modification is attempted and the application | 192 | * before modification is attempted and the application |
194 | * fails. | 193 | * fails. |
195 | */ | 194 | */ |
@@ -395,7 +394,8 @@ EXPORT_SYMBOL(ns_capable); | |||
395 | * This does not set PF_SUPERPRIV because the caller may not | 394 | * This does not set PF_SUPERPRIV because the caller may not |
396 | * actually be privileged. | 395 | * actually be privileged. |
397 | */ | 396 | */ |
398 | bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) | 397 | bool file_ns_capable(const struct file *file, struct user_namespace *ns, |
398 | int cap) | ||
399 | { | 399 | { |
400 | if (WARN_ON_ONCE(!cap_valid(cap))) | 400 | if (WARN_ON_ONCE(!cap_valid(cap))) |
401 | return false; | 401 | return false; |
@@ -424,23 +424,19 @@ bool capable(int cap) | |||
424 | EXPORT_SYMBOL(capable); | 424 | EXPORT_SYMBOL(capable); |
425 | 425 | ||
426 | /** | 426 | /** |
427 | * inode_capable - Check superior capability over inode | 427 | * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped |
428 | * @inode: The inode in question | 428 | * @inode: The inode in question |
429 | * @cap: The capability in question | 429 | * @cap: The capability in question |
430 | * | 430 | * |
431 | * Return true if the current task has the given superior capability | 431 | * Return true if the current task has the given capability targeted at |
432 | * targeted at it's own user namespace and that the given inode is owned | 432 | * its own user namespace and that the given inode's uid and gid are |
433 | * by the current user namespace or a child namespace. | 433 | * mapped into the current user namespace. |
434 | * | ||
435 | * Currently we check to see if an inode is owned by the current | ||
436 | * user namespace by seeing if the inode's owner maps into the | ||
437 | * current user namespace. | ||
438 | * | ||
439 | */ | 434 | */ |
440 | bool inode_capable(const struct inode *inode, int cap) | 435 | bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) |
441 | { | 436 | { |
442 | struct user_namespace *ns = current_user_ns(); | 437 | struct user_namespace *ns = current_user_ns(); |
443 | 438 | ||
444 | return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); | 439 | return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) && |
440 | kgid_has_mapping(ns, inode->i_gid); | ||
445 | } | 441 | } |
446 | EXPORT_SYMBOL(inode_capable); | 442 | EXPORT_SYMBOL(capable_wrt_inode_uidgid); |
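The new name spells out the calling convention: the capability check is only honoured when both the uid and the gid of the inode map into the caller's user namespace, closing the hole where an inode with an unmapped gid could still be acted upon. A hedged sketch of the kind of ownership test the helper is meant for, not a quote of any particular filesystem caller:

    /* may the caller act as the owner of @inode? */
    static bool may_act_as_owner(const struct inode *inode)
    {
            if (uid_eq(current_fsuid(), inode->i_uid))
                    return true;
            /* needs CAP_FOWNER in the caller's namespace *and* both
             * inode->i_uid and inode->i_gid mapped into that namespace */
            return capable_wrt_inode_uidgid(inode, CAP_FOWNER);
    }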
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9fcdaa705b6c..7868fc3c0bc5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -26,6 +26,8 @@ | |||
26 | * distribution for more details. | 26 | * distribution for more details. |
27 | */ | 27 | */ |
28 | 28 | ||
29 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
30 | |||
29 | #include <linux/cgroup.h> | 31 | #include <linux/cgroup.h> |
30 | #include <linux/cred.h> | 32 | #include <linux/cred.h> |
31 | #include <linux/ctype.h> | 33 | #include <linux/ctype.h> |
@@ -33,6 +35,7 @@ | |||
33 | #include <linux/init_task.h> | 35 | #include <linux/init_task.h> |
34 | #include <linux/kernel.h> | 36 | #include <linux/kernel.h> |
35 | #include <linux/list.h> | 37 | #include <linux/list.h> |
38 | #include <linux/magic.h> | ||
36 | #include <linux/mm.h> | 39 | #include <linux/mm.h> |
37 | #include <linux/mutex.h> | 40 | #include <linux/mutex.h> |
38 | #include <linux/mount.h> | 41 | #include <linux/mount.h> |
@@ -69,15 +72,6 @@ | |||
69 | MAX_CFTYPE_NAME + 2) | 72 | MAX_CFTYPE_NAME + 2) |
70 | 73 | ||
71 | /* | 74 | /* |
72 | * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file | ||
73 | * creation/removal and hierarchy changing operations including cgroup | ||
74 | * creation, removal, css association and controller rebinding. This outer | ||
75 | * lock is needed mainly to resolve the circular dependency between kernfs | ||
76 | * active ref and cgroup_mutex. cgroup_tree_mutex nests above both. | ||
77 | */ | ||
78 | static DEFINE_MUTEX(cgroup_tree_mutex); | ||
79 | |||
80 | /* | ||
81 | * cgroup_mutex is the master lock. Any modification to cgroup or its | 75 | * cgroup_mutex is the master lock. Any modification to cgroup or its |
82 | * hierarchy must be performed while holding it. | 76 | * hierarchy must be performed while holding it. |
83 | * | 77 | * |
@@ -98,16 +92,21 @@ static DECLARE_RWSEM(css_set_rwsem); | |||
98 | #endif | 92 | #endif |
99 | 93 | ||
100 | /* | 94 | /* |
95 | * Protects cgroup_idr and css_idr so that IDs can be released without | ||
96 | * grabbing cgroup_mutex. | ||
97 | */ | ||
98 | static DEFINE_SPINLOCK(cgroup_idr_lock); | ||
99 | |||
100 | /* | ||
101 | * Protects cgroup_subsys->release_agent_path. Modifying it also requires | 101 | * Protects cgroup_subsys->release_agent_path. Modifying it also requires |
102 | * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. | 102 | * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. |
103 | */ | 103 | */ |
104 | static DEFINE_SPINLOCK(release_agent_path_lock); | 104 | static DEFINE_SPINLOCK(release_agent_path_lock); |
105 | 105 | ||
106 | #define cgroup_assert_mutexes_or_rcu_locked() \ | 106 | #define cgroup_assert_mutex_or_rcu_locked() \ |
107 | rcu_lockdep_assert(rcu_read_lock_held() || \ | 107 | rcu_lockdep_assert(rcu_read_lock_held() || \ |
108 | lockdep_is_held(&cgroup_tree_mutex) || \ | ||
109 | lockdep_is_held(&cgroup_mutex), \ | 108 | lockdep_is_held(&cgroup_mutex), \ |
110 | "cgroup_[tree_]mutex or RCU read lock required"); | 109 | "cgroup_mutex or RCU read lock required"); |
111 | 110 | ||
112 | /* | 111 | /* |
113 | * cgroup destruction makes heavy use of work items and there can be a lot | 112 | * cgroup destruction makes heavy use of work items and there can be a lot |
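With cgroup_tree_mutex gone, the assertion shrinks to the two protections that remain legal. Its users are the css iterators, which may run either under cgroup_mutex or inside an RCU read-side critical section; a condensed sketch of that usage, with the body elided:

    struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
                                               struct cgroup_subsys_state *parent)
    {
            /* triggers a lockdep splat if neither protection is held */
            cgroup_assert_mutex_or_rcu_locked();
            /* ... walk the sibling list and return the next live css ... */
    }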
@@ -150,6 +149,13 @@ struct cgroup_root cgrp_dfl_root; | |||
150 | */ | 149 | */ |
151 | static bool cgrp_dfl_root_visible; | 150 | static bool cgrp_dfl_root_visible; |
152 | 151 | ||
152 | /* some controllers are not supported in the default hierarchy */ | ||
153 | static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 | ||
154 | #ifdef CONFIG_CGROUP_DEBUG | ||
155 | | (1 << debug_cgrp_id) | ||
156 | #endif | ||
157 | ; | ||
158 | |||
153 | /* The list of hierarchy roots */ | 159 | /* The list of hierarchy roots */ |
154 | 160 | ||
155 | static LIST_HEAD(cgroup_roots); | 161 | static LIST_HEAD(cgroup_roots); |
@@ -159,14 +165,13 @@ static int cgroup_root_count; | |||
159 | static DEFINE_IDR(cgroup_hierarchy_idr); | 165 | static DEFINE_IDR(cgroup_hierarchy_idr); |
160 | 166 | ||
161 | /* | 167 | /* |
162 | * Assign a monotonically increasing serial number to cgroups. It | 168 | * Assign a monotonically increasing serial number to csses. It guarantees |
163 | * guarantees cgroups with bigger numbers are newer than those with smaller | 169 | * cgroups with bigger numbers are newer than those with smaller numbers. |
164 | * numbers. Also, as cgroups are always appended to the parent's | 170 | * Also, as csses are always appended to the parent's ->children list, it |
165 | * ->children list, it guarantees that sibling cgroups are always sorted in | 171 | * guarantees that sibling csses are always sorted in the ascending serial |
166 | * the ascending serial number order on the list. Protected by | 172 | * number order on the list. Protected by cgroup_mutex. |
167 | * cgroup_mutex. | ||
168 | */ | 173 | */ |
169 | static u64 cgroup_serial_nr_next = 1; | 174 | static u64 css_serial_nr_next = 1; |
170 | 175 | ||
171 | /* This flag indicates whether tasks in the fork and exit paths should | 176 | /* This flag indicates whether tasks in the fork and exit paths should |
172 | * check for fork/exit handlers to call. This avoids us having to do | 177 | * check for fork/exit handlers to call. This avoids us having to do |
@@ -179,17 +184,59 @@ static struct cftype cgroup_base_files[]; | |||
179 | 184 | ||
180 | static void cgroup_put(struct cgroup *cgrp); | 185 | static void cgroup_put(struct cgroup *cgrp); |
181 | static int rebind_subsystems(struct cgroup_root *dst_root, | 186 | static int rebind_subsystems(struct cgroup_root *dst_root, |
182 | unsigned long ss_mask); | 187 | unsigned int ss_mask); |
183 | static void cgroup_destroy_css_killed(struct cgroup *cgrp); | ||
184 | static int cgroup_destroy_locked(struct cgroup *cgrp); | 188 | static int cgroup_destroy_locked(struct cgroup *cgrp); |
189 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); | ||
190 | static void css_release(struct percpu_ref *ref); | ||
191 | static void kill_css(struct cgroup_subsys_state *css); | ||
185 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | 192 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], |
186 | bool is_add); | 193 | bool is_add); |
187 | static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); | 194 | static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); |
188 | 195 | ||
196 | /* IDR wrappers which synchronize using cgroup_idr_lock */ | ||
197 | static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, | ||
198 | gfp_t gfp_mask) | ||
199 | { | ||
200 | int ret; | ||
201 | |||
202 | idr_preload(gfp_mask); | ||
203 | spin_lock_bh(&cgroup_idr_lock); | ||
204 | ret = idr_alloc(idr, ptr, start, end, gfp_mask); | ||
205 | spin_unlock_bh(&cgroup_idr_lock); | ||
206 | idr_preload_end(); | ||
207 | return ret; | ||
208 | } | ||
209 | |||
210 | static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) | ||
211 | { | ||
212 | void *ret; | ||
213 | |||
214 | spin_lock_bh(&cgroup_idr_lock); | ||
215 | ret = idr_replace(idr, ptr, id); | ||
216 | spin_unlock_bh(&cgroup_idr_lock); | ||
217 | return ret; | ||
218 | } | ||
219 | |||
220 | static void cgroup_idr_remove(struct idr *idr, int id) | ||
221 | { | ||
222 | spin_lock_bh(&cgroup_idr_lock); | ||
223 | idr_remove(idr, id); | ||
224 | spin_unlock_bh(&cgroup_idr_lock); | ||
225 | } | ||
226 | |||
227 | static struct cgroup *cgroup_parent(struct cgroup *cgrp) | ||
228 | { | ||
229 | struct cgroup_subsys_state *parent_css = cgrp->self.parent; | ||
230 | |||
231 | if (parent_css) | ||
232 | return container_of(parent_css, struct cgroup, self); | ||
233 | return NULL; | ||
234 | } | ||
235 | |||
189 | /** | 236 | /** |
190 | * cgroup_css - obtain a cgroup's css for the specified subsystem | 237 | * cgroup_css - obtain a cgroup's css for the specified subsystem |
191 | * @cgrp: the cgroup of interest | 238 | * @cgrp: the cgroup of interest |
192 | * @ss: the subsystem of interest (%NULL returns the dummy_css) | 239 | * @ss: the subsystem of interest (%NULL returns @cgrp->self) |
193 | * | 240 | * |
194 | * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This | 241 | * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This |
195 | * function must be called either under cgroup_mutex or rcu_read_lock() and | 242 | * function must be called either under cgroup_mutex or rcu_read_lock() and |
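The point of the wrappers is the release side: an ID can now be dropped from contexts that must not take cgroup_mutex (for example a refcount-release work item), because only the small, bottom-half-safe cgroup_idr_lock is needed, while the idr_preload()/idr_preload_end() pair covers the allocation under that spinlock. A rough usage sketch; idr and id are placeholders, and GFP_NOWAIT keeps idr_alloc() from sleeping while the lock is held:

    /* allocation side, called with cgroup_mutex held in process context */
    id = cgroup_idr_alloc(idr, NULL, 2, 0, GFP_NOWAIT);
    if (id < 0)
            return id;

    /* release side, e.g. a css release path running from a workqueue,
     * where grabbing cgroup_mutex is not an option */
    cgroup_idr_remove(idr, id);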
@@ -202,23 +249,49 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, | |||
202 | { | 249 | { |
203 | if (ss) | 250 | if (ss) |
204 | return rcu_dereference_check(cgrp->subsys[ss->id], | 251 | return rcu_dereference_check(cgrp->subsys[ss->id], |
205 | lockdep_is_held(&cgroup_tree_mutex) || | ||
206 | lockdep_is_held(&cgroup_mutex)); | 252 | lockdep_is_held(&cgroup_mutex)); |
207 | else | 253 | else |
208 | return &cgrp->dummy_css; | 254 | return &cgrp->self; |
255 | } | ||
256 | |||
257 | /** | ||
258 | * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem | ||
259 | * @cgrp: the cgroup of interest | ||
260 | * @ss: the subsystem of interest (%NULL returns @cgrp->self) | ||
261 | * | ||
262 | * Similar to cgroup_css() but returns the effective css, which is defined | ||
263 | * as the matching css of the nearest ancestor including self which has @ss | ||
264 | * enabled. If @ss is associated with the hierarchy @cgrp is on, this | ||
265 | * function is guaranteed to return non-NULL css. | ||
266 | */ | ||
267 | static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, | ||
268 | struct cgroup_subsys *ss) | ||
269 | { | ||
270 | lockdep_assert_held(&cgroup_mutex); | ||
271 | |||
272 | if (!ss) | ||
273 | return &cgrp->self; | ||
274 | |||
275 | if (!(cgrp->root->subsys_mask & (1 << ss->id))) | ||
276 | return NULL; | ||
277 | |||
278 | while (cgroup_parent(cgrp) && | ||
279 | !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) | ||
280 | cgrp = cgroup_parent(cgrp); | ||
281 | |||
282 | return cgroup_css(cgrp, ss); | ||
209 | } | 283 | } |
210 | 284 | ||
211 | /* convenient tests for these bits */ | 285 | /* convenient tests for these bits */ |
212 | static inline bool cgroup_is_dead(const struct cgroup *cgrp) | 286 | static inline bool cgroup_is_dead(const struct cgroup *cgrp) |
213 | { | 287 | { |
214 | return test_bit(CGRP_DEAD, &cgrp->flags); | 288 | return !(cgrp->self.flags & CSS_ONLINE); |
215 | } | 289 | } |
216 | 290 | ||
217 | struct cgroup_subsys_state *seq_css(struct seq_file *seq) | 291 | struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) |
218 | { | 292 | { |
219 | struct kernfs_open_file *of = seq->private; | ||
220 | struct cgroup *cgrp = of->kn->parent->priv; | 293 | struct cgroup *cgrp = of->kn->parent->priv; |
221 | struct cftype *cft = seq_cft(seq); | 294 | struct cftype *cft = of_cft(of); |
222 | 295 | ||
223 | /* | 296 | /* |
224 | * This is open and unprotected implementation of cgroup_css(). | 297 | * This is open and unprotected implementation of cgroup_css(). |
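The difference between cgroup_css() and cgroup_e_css() is easiest to see on a concrete hierarchy. Suppose cgroup A enables a controller for its children (so A/B gets a css) but A/B does not enable it further down (so A/B/C has none); the names below are purely illustrative:

    /*
     *   A           enables @ss for its children (child_subsys_mask set)
     *   `-- B       has a css of its own, does not enable @ss further
     *       `-- C   has no css for @ss
     *
     * cgroup_css(C, ss)   -> NULL
     * cgroup_e_css(C, ss) -> cgroup_css(B, ss), i.e. C is governed by B's css
     *
     * The loop above walks up from C while the parent's child_subsys_mask
     * lacks @ss, and stops at B because A's mask does contain it.
     */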
@@ -231,9 +304,9 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq) | |||
231 | if (cft->ss) | 304 | if (cft->ss) |
232 | return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); | 305 | return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); |
233 | else | 306 | else |
234 | return &cgrp->dummy_css; | 307 | return &cgrp->self; |
235 | } | 308 | } |
236 | EXPORT_SYMBOL_GPL(seq_css); | 309 | EXPORT_SYMBOL_GPL(of_css); |
237 | 310 | ||
238 | /** | 311 | /** |
239 | * cgroup_is_descendant - test ancestry | 312 | * cgroup_is_descendant - test ancestry |
@@ -249,7 +322,7 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) | |||
249 | while (cgrp) { | 322 | while (cgrp) { |
250 | if (cgrp == ancestor) | 323 | if (cgrp == ancestor) |
251 | return true; | 324 | return true; |
252 | cgrp = cgrp->parent; | 325 | cgrp = cgroup_parent(cgrp); |
253 | } | 326 | } |
254 | return false; | 327 | return false; |
255 | } | 328 | } |
@@ -273,17 +346,30 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
273 | * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end | 346 | * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end |
274 | * @cgrp: the target cgroup to iterate css's of | 347 | * @cgrp: the target cgroup to iterate css's of |
275 | * | 348 | * |
276 | * Should be called under cgroup_mutex. | 349 | * Should be called under cgroup_[tree_]mutex. |
277 | */ | 350 | */ |
278 | #define for_each_css(css, ssid, cgrp) \ | 351 | #define for_each_css(css, ssid, cgrp) \ |
279 | for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ | 352 | for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ |
280 | if (!((css) = rcu_dereference_check( \ | 353 | if (!((css) = rcu_dereference_check( \ |
281 | (cgrp)->subsys[(ssid)], \ | 354 | (cgrp)->subsys[(ssid)], \ |
282 | lockdep_is_held(&cgroup_tree_mutex) || \ | ||
283 | lockdep_is_held(&cgroup_mutex)))) { } \ | 355 | lockdep_is_held(&cgroup_mutex)))) { } \ |
284 | else | 356 | else |
285 | 357 | ||
286 | /** | 358 | /** |
359 | * for_each_e_css - iterate all effective css's of a cgroup | ||
360 | * @css: the iteration cursor | ||
361 | * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end | ||
362 | * @cgrp: the target cgroup to iterate css's of | ||
363 | * | ||
364 | * Should be called under cgroup_[tree_]mutex. | ||
365 | */ | ||
366 | #define for_each_e_css(css, ssid, cgrp) \ | ||
367 | for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ | ||
368 | if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ | ||
369 | ; \ | ||
370 | else | ||
371 | |||
372 | /** | ||
287 | * for_each_subsys - iterate all enabled cgroup subsystems | 373 | * for_each_subsys - iterate all enabled cgroup subsystems |
288 | * @ss: the iteration cursor | 374 | * @ss: the iteration cursor |
289 | * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end | 375 | * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end |
@@ -296,22 +382,13 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
296 | #define for_each_root(root) \ | 382 | #define for_each_root(root) \ |
297 | list_for_each_entry((root), &cgroup_roots, root_list) | 383 | list_for_each_entry((root), &cgroup_roots, root_list) |
298 | 384 | ||
299 | /** | 385 | /* iterate over child cgrps, lock should be held throughout iteration */ |
300 | * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. | 386 | #define cgroup_for_each_live_child(child, cgrp) \ |
301 | * @cgrp: the cgroup to be checked for liveness | 387 | list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ |
302 | * | 388 | if (({ lockdep_assert_held(&cgroup_mutex); \ |
303 | * On success, returns true; the mutex should be later unlocked. On | 389 | cgroup_is_dead(child); })) \ |
304 | * failure returns false with no lock held. | 390 | ; \ |
305 | */ | 391 | else |
306 | static bool cgroup_lock_live_group(struct cgroup *cgrp) | ||
307 | { | ||
308 | mutex_lock(&cgroup_mutex); | ||
309 | if (cgroup_is_dead(cgrp)) { | ||
310 | mutex_unlock(&cgroup_mutex); | ||
311 | return false; | ||
312 | } | ||
313 | return true; | ||
314 | } | ||
315 | 392 | ||
316 | /* the list of cgroups eligible for automatic release. Protected by | 393 | /* the list of cgroups eligible for automatic release. Protected by |
317 | * release_list_lock */ | 394 | * release_list_lock */ |
@@ -348,7 +425,7 @@ struct cgrp_cset_link { | |||
348 | * reference-counted, to improve performance when child cgroups | 425 | * reference-counted, to improve performance when child cgroups |
349 | * haven't been created. | 426 | * haven't been created. |
350 | */ | 427 | */ |
351 | static struct css_set init_css_set = { | 428 | struct css_set init_css_set = { |
352 | .refcount = ATOMIC_INIT(1), | 429 | .refcount = ATOMIC_INIT(1), |
353 | .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), | 430 | .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), |
354 | .tasks = LIST_HEAD_INIT(init_css_set.tasks), | 431 | .tasks = LIST_HEAD_INIT(init_css_set.tasks), |
@@ -359,6 +436,43 @@ static struct css_set init_css_set = { | |||
359 | 436 | ||
360 | static int css_set_count = 1; /* 1 for init_css_set */ | 437 | static int css_set_count = 1; /* 1 for init_css_set */ |
361 | 438 | ||
439 | /** | ||
440 | * cgroup_update_populated - updated populated count of a cgroup | ||
441 | * @cgrp: the target cgroup | ||
442 | * @populated: inc or dec populated count | ||
443 | * | ||
444 | * @cgrp is either getting the first task (css_set) or losing the last. | ||
445 | * Update @cgrp->populated_cnt accordingly. The count is propagated | ||
446 | * towards root so that a given cgroup's populated_cnt is zero iff the | ||
447 | * cgroup and all its descendants are empty. | ||
448 | * | ||
449 | * @cgrp's interface file "cgroup.populated" is zero if | ||
450 | * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt | ||
451 | * changes from or to zero, userland is notified that the content of the | ||
452 | * interface file has changed. This can be used to detect when @cgrp and | ||
453 | * its descendants become populated or empty. | ||
454 | */ | ||
455 | static void cgroup_update_populated(struct cgroup *cgrp, bool populated) | ||
456 | { | ||
457 | lockdep_assert_held(&css_set_rwsem); | ||
458 | |||
459 | do { | ||
460 | bool trigger; | ||
461 | |||
462 | if (populated) | ||
463 | trigger = !cgrp->populated_cnt++; | ||
464 | else | ||
465 | trigger = !--cgrp->populated_cnt; | ||
466 | |||
467 | if (!trigger) | ||
468 | break; | ||
469 | |||
470 | if (cgrp->populated_kn) | ||
471 | kernfs_notify(cgrp->populated_kn); | ||
472 | cgrp = cgroup_parent(cgrp); | ||
473 | } while (cgrp); | ||
474 | } | ||
475 | |||
362 | /* | 476 | /* |
363 | * hash table for cgroup groups. This improves the performance to find | 477 | * hash table for cgroup groups. This improves the performance to find |
364 | * an existing css_set. This hash doesn't (currently) take into | 478 | * an existing css_set. This hash doesn't (currently) take into |
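The kernfs_notify() call is what makes the populated state observable: userspace can open the cgroup's populated interface file and wait in poll() for an exceptional-condition wakeup whenever the value flips between 0 and 1, instead of relying on the release-agent mechanism. A hedged userspace sketch; the mount point and file name are assumptions of this example, and the value has to be re-read after every wakeup:

    #include <fcntl.h>
    #include <poll.h>
    #include <unistd.h>

    static void watch_populated(const char *path)  /* ".../cgroup.populated" */
    {
            int fd = open(path, O_RDONLY);
            struct pollfd pfd = { .fd = fd, .events = POLLPRI };
            char val[4];

            while (fd >= 0 && poll(&pfd, 1, -1) > 0) {
                    /* re-read from offset 0: "0\n" = empty subtree, "1\n" = populated */
                    pread(fd, val, sizeof(val), 0);
            }
    }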
@@ -383,6 +497,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) | |||
383 | static void put_css_set_locked(struct css_set *cset, bool taskexit) | 497 | static void put_css_set_locked(struct css_set *cset, bool taskexit) |
384 | { | 498 | { |
385 | struct cgrp_cset_link *link, *tmp_link; | 499 | struct cgrp_cset_link *link, *tmp_link; |
500 | struct cgroup_subsys *ss; | ||
501 | int ssid; | ||
386 | 502 | ||
387 | lockdep_assert_held(&css_set_rwsem); | 503 | lockdep_assert_held(&css_set_rwsem); |
388 | 504 | ||
@@ -390,6 +506,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) | |||
390 | return; | 506 | return; |
391 | 507 | ||
392 | /* This css_set is dead. unlink it and release cgroup refcounts */ | 508 | /* This css_set is dead. unlink it and release cgroup refcounts */ |
509 | for_each_subsys(ss, ssid) | ||
510 | list_del(&cset->e_cset_node[ssid]); | ||
393 | hash_del(&cset->hlist); | 511 | hash_del(&cset->hlist); |
394 | css_set_count--; | 512 | css_set_count--; |
395 | 513 | ||
@@ -400,10 +518,13 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) | |||
400 | list_del(&link->cgrp_link); | 518 | list_del(&link->cgrp_link); |
401 | 519 | ||
402 | /* @cgrp can't go away while we're holding css_set_rwsem */ | 520 | /* @cgrp can't go away while we're holding css_set_rwsem */ |
403 | if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { | 521 | if (list_empty(&cgrp->cset_links)) { |
404 | if (taskexit) | 522 | cgroup_update_populated(cgrp, false); |
405 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 523 | if (notify_on_release(cgrp)) { |
406 | check_for_release(cgrp); | 524 | if (taskexit) |
525 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | ||
526 | check_for_release(cgrp); | ||
527 | } | ||
407 | } | 528 | } |
408 | 529 | ||
409 | kfree(link); | 530 | kfree(link); |
@@ -452,20 +573,20 @@ static bool compare_css_sets(struct css_set *cset, | |||
452 | { | 573 | { |
453 | struct list_head *l1, *l2; | 574 | struct list_head *l1, *l2; |
454 | 575 | ||
455 | if (memcmp(template, cset->subsys, sizeof(cset->subsys))) { | 576 | /* |
456 | /* Not all subsystems matched */ | 577 | * On the default hierarchy, there can be csets which are |
578 | * associated with the same set of cgroups but different csses. | ||
579 | * Let's first ensure that csses match. | ||
580 | */ | ||
581 | if (memcmp(template, cset->subsys, sizeof(cset->subsys))) | ||
457 | return false; | 582 | return false; |
458 | } | ||
459 | 583 | ||
460 | /* | 584 | /* |
461 | * Compare cgroup pointers in order to distinguish between | 585 | * Compare cgroup pointers in order to distinguish between |
462 | * different cgroups in heirarchies with no subsystems. We | 586 | * different cgroups in hierarchies. As different cgroups may |
463 | * could get by with just this check alone (and skip the | 587 | * share the same effective css, this comparison is always |
464 | * memcmp above) but on most setups the memcmp check will | 588 | * necessary. |
465 | * avoid the need for this more expensive check on almost all | ||
466 | * candidates. | ||
467 | */ | 589 | */ |
468 | |||
469 | l1 = &cset->cgrp_links; | 590 | l1 = &cset->cgrp_links; |
470 | l2 = &old_cset->cgrp_links; | 591 | l2 = &old_cset->cgrp_links; |
471 | while (1) { | 592 | while (1) { |
@@ -529,14 +650,17 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, | |||
529 | * won't change, so no need for locking. | 650 | * won't change, so no need for locking. |
530 | */ | 651 | */ |
531 | for_each_subsys(ss, i) { | 652 | for_each_subsys(ss, i) { |
532 | if (root->cgrp.subsys_mask & (1UL << i)) { | 653 | if (root->subsys_mask & (1UL << i)) { |
533 | /* Subsystem is in this hierarchy. So we want | 654 | /* |
534 | * the subsystem state from the new | 655 | * @ss is in this hierarchy, so we want the |
535 | * cgroup */ | 656 | * effective css from @cgrp. |
536 | template[i] = cgroup_css(cgrp, ss); | 657 | */ |
658 | template[i] = cgroup_e_css(cgrp, ss); | ||
537 | } else { | 659 | } else { |
538 | /* Subsystem is not in this hierarchy, so we | 660 | /* |
539 | * don't want to change the subsystem state */ | 661 | * @ss is not in this hierarchy, so we don't want |
662 | * to change the css. | ||
663 | */ | ||
540 | template[i] = old_cset->subsys[i]; | 664 | template[i] = old_cset->subsys[i]; |
541 | } | 665 | } |
542 | } | 666 | } |
@@ -602,10 +726,18 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, | |||
602 | struct cgrp_cset_link *link; | 726 | struct cgrp_cset_link *link; |
603 | 727 | ||
604 | BUG_ON(list_empty(tmp_links)); | 728 | BUG_ON(list_empty(tmp_links)); |
729 | |||
730 | if (cgroup_on_dfl(cgrp)) | ||
731 | cset->dfl_cgrp = cgrp; | ||
732 | |||
605 | link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); | 733 | link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); |
606 | link->cset = cset; | 734 | link->cset = cset; |
607 | link->cgrp = cgrp; | 735 | link->cgrp = cgrp; |
736 | |||
737 | if (list_empty(&cgrp->cset_links)) | ||
738 | cgroup_update_populated(cgrp, true); | ||
608 | list_move(&link->cset_link, &cgrp->cset_links); | 739 | list_move(&link->cset_link, &cgrp->cset_links); |
740 | |||
609 | /* | 741 | /* |
610 | * Always add links to the tail of the list so that the list | 742 | * Always add links to the tail of the list so that the list |
611 | * is sorted by order of hierarchy creation | 743 | * is sorted by order of hierarchy creation |
@@ -628,7 +760,9 @@ static struct css_set *find_css_set(struct css_set *old_cset, | |||
628 | struct css_set *cset; | 760 | struct css_set *cset; |
629 | struct list_head tmp_links; | 761 | struct list_head tmp_links; |
630 | struct cgrp_cset_link *link; | 762 | struct cgrp_cset_link *link; |
763 | struct cgroup_subsys *ss; | ||
631 | unsigned long key; | 764 | unsigned long key; |
765 | int ssid; | ||
632 | 766 | ||
633 | lockdep_assert_held(&cgroup_mutex); | 767 | lockdep_assert_held(&cgroup_mutex); |
634 | 768 | ||
@@ -679,10 +813,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, | |||
679 | 813 | ||
680 | css_set_count++; | 814 | css_set_count++; |
681 | 815 | ||
682 | /* Add this cgroup group to the hash table */ | 816 | /* Add @cset to the hash table */ |
683 | key = css_set_hash(cset->subsys); | 817 | key = css_set_hash(cset->subsys); |
684 | hash_add(css_set_table, &cset->hlist, key); | 818 | hash_add(css_set_table, &cset->hlist, key); |
685 | 819 | ||
820 | for_each_subsys(ss, ssid) | ||
821 | list_add_tail(&cset->e_cset_node[ssid], | ||
822 | &cset->subsys[ssid]->cgroup->e_csets[ssid]); | ||
823 | |||
686 | up_write(&css_set_rwsem); | 824 | up_write(&css_set_rwsem); |
687 | 825 | ||
688 | return cset; | 826 | return cset; |
@@ -735,14 +873,13 @@ static void cgroup_destroy_root(struct cgroup_root *root) | |||
735 | struct cgroup *cgrp = &root->cgrp; | 873 | struct cgroup *cgrp = &root->cgrp; |
736 | struct cgrp_cset_link *link, *tmp_link; | 874 | struct cgrp_cset_link *link, *tmp_link; |
737 | 875 | ||
738 | mutex_lock(&cgroup_tree_mutex); | ||
739 | mutex_lock(&cgroup_mutex); | 876 | mutex_lock(&cgroup_mutex); |
740 | 877 | ||
741 | BUG_ON(atomic_read(&root->nr_cgrps)); | 878 | BUG_ON(atomic_read(&root->nr_cgrps)); |
742 | BUG_ON(!list_empty(&cgrp->children)); | 879 | BUG_ON(!list_empty(&cgrp->self.children)); |
743 | 880 | ||
744 | /* Rebind all subsystems back to the default hierarchy */ | 881 | /* Rebind all subsystems back to the default hierarchy */ |
745 | rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); | 882 | rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); |
746 | 883 | ||
747 | /* | 884 | /* |
748 | * Release all the links from cset_links to this hierarchy's | 885 | * Release all the links from cset_links to this hierarchy's |
@@ -765,7 +902,6 @@ static void cgroup_destroy_root(struct cgroup_root *root) | |||
765 | cgroup_exit_root_id(root); | 902 | cgroup_exit_root_id(root); |
766 | 903 | ||
767 | mutex_unlock(&cgroup_mutex); | 904 | mutex_unlock(&cgroup_mutex); |
768 | mutex_unlock(&cgroup_tree_mutex); | ||
769 | 905 | ||
770 | kernfs_destroy_root(root->kf_root); | 906 | kernfs_destroy_root(root->kf_root); |
771 | cgroup_free_root(root); | 907 | cgroup_free_root(root); |
@@ -848,7 +984,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, | |||
848 | * update of a tasks cgroup pointer by cgroup_attach_task() | 984 | * update of a tasks cgroup pointer by cgroup_attach_task() |
849 | */ | 985 | */ |
850 | 986 | ||
851 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); | 987 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask); |
852 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops; | 988 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops; |
853 | static const struct file_operations proc_cgroupstats_operations; | 989 | static const struct file_operations proc_cgroupstats_operations; |
854 | 990 | ||
@@ -883,79 +1019,95 @@ static umode_t cgroup_file_mode(const struct cftype *cft) | |||
883 | if (cft->read_u64 || cft->read_s64 || cft->seq_show) | 1019 | if (cft->read_u64 || cft->read_s64 || cft->seq_show) |
884 | mode |= S_IRUGO; | 1020 | mode |= S_IRUGO; |
885 | 1021 | ||
886 | if (cft->write_u64 || cft->write_s64 || cft->write_string || | 1022 | if (cft->write_u64 || cft->write_s64 || cft->write) |
887 | cft->trigger) | ||
888 | mode |= S_IWUSR; | 1023 | mode |= S_IWUSR; |
889 | 1024 | ||
890 | return mode; | 1025 | return mode; |
891 | } | 1026 | } |
892 | 1027 | ||
893 | static void cgroup_free_fn(struct work_struct *work) | 1028 | static void cgroup_get(struct cgroup *cgrp) |
894 | { | 1029 | { |
895 | struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); | 1030 | WARN_ON_ONCE(cgroup_is_dead(cgrp)); |
896 | 1031 | css_get(&cgrp->self); | |
897 | atomic_dec(&cgrp->root->nr_cgrps); | ||
898 | cgroup_pidlist_destroy_all(cgrp); | ||
899 | |||
900 | if (cgrp->parent) { | ||
901 | /* | ||
902 | * We get a ref to the parent, and put the ref when this | ||
903 | * cgroup is being freed, so it's guaranteed that the | ||
904 | * parent won't be destroyed before its children. | ||
905 | */ | ||
906 | cgroup_put(cgrp->parent); | ||
907 | kernfs_put(cgrp->kn); | ||
908 | kfree(cgrp); | ||
909 | } else { | ||
910 | /* | ||
911 | * This is root cgroup's refcnt reaching zero, which | ||
912 | * indicates that the root should be released. | ||
913 | */ | ||
914 | cgroup_destroy_root(cgrp->root); | ||
915 | } | ||
916 | } | 1032 | } |
917 | 1033 | ||
918 | static void cgroup_free_rcu(struct rcu_head *head) | 1034 | static void cgroup_put(struct cgroup *cgrp) |
919 | { | 1035 | { |
920 | struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); | 1036 | css_put(&cgrp->self); |
921 | |||
922 | INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); | ||
923 | queue_work(cgroup_destroy_wq, &cgrp->destroy_work); | ||
924 | } | 1037 | } |
925 | 1038 | ||
926 | static void cgroup_get(struct cgroup *cgrp) | 1039 | /** |
1040 | * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods | ||
1041 | * @kn: the kernfs_node being serviced | ||
1042 | * | ||
1043 | * This helper undoes cgroup_kn_lock_live() and should be invoked before | ||
1044 | * the method finishes if locking succeeded. Note that once this function | ||
1045 | * returns the cgroup returned by cgroup_kn_lock_live() may become | ||
1046 | * inaccessible any time. If the caller intends to continue to access the | ||
1047 | * cgroup, it should pin it before invoking this function. | ||
1048 | */ | ||
1049 | static void cgroup_kn_unlock(struct kernfs_node *kn) | ||
927 | { | 1050 | { |
928 | WARN_ON_ONCE(cgroup_is_dead(cgrp)); | 1051 | struct cgroup *cgrp; |
929 | WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); | 1052 | |
930 | atomic_inc(&cgrp->refcnt); | 1053 | if (kernfs_type(kn) == KERNFS_DIR) |
1054 | cgrp = kn->priv; | ||
1055 | else | ||
1056 | cgrp = kn->parent->priv; | ||
1057 | |||
1058 | mutex_unlock(&cgroup_mutex); | ||
1059 | |||
1060 | kernfs_unbreak_active_protection(kn); | ||
1061 | cgroup_put(cgrp); | ||
931 | } | 1062 | } |
932 | 1063 | ||
933 | static void cgroup_put(struct cgroup *cgrp) | 1064 | /** |
1065 | * cgroup_kn_lock_live - locking helper for cgroup kernfs methods | ||
1066 | * @kn: the kernfs_node being serviced | ||
1067 | * | ||
1068 | * This helper is to be used by a cgroup kernfs method currently servicing | ||
1069 | * @kn. It breaks the active protection, performs cgroup locking and | ||
1070 | * verifies that the associated cgroup is alive. Returns the cgroup if | ||
1071 | * alive; otherwise, %NULL. A successful return should be undone by a | ||
1072 | * matching cgroup_kn_unlock() invocation. | ||
1073 | * | ||
1074 | * Any cgroup kernfs method implementation which requires locking the | ||
1075 | * associated cgroup should use this helper. It avoids nesting cgroup | ||
1076 | * locking under kernfs active protection and allows all kernfs operations | ||
1077 | * including self-removal. | ||
1078 | */ | ||
1079 | static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) | ||
934 | { | 1080 | { |
935 | if (!atomic_dec_and_test(&cgrp->refcnt)) | 1081 | struct cgroup *cgrp; |
936 | return; | 1082 | |
937 | if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) | 1083 | if (kernfs_type(kn) == KERNFS_DIR) |
938 | return; | 1084 | cgrp = kn->priv; |
1085 | else | ||
1086 | cgrp = kn->parent->priv; | ||
939 | 1087 | ||
940 | /* | 1088 | /* |
941 | * XXX: cgrp->id is only used to look up css's. As cgroup and | 1089 | * We're gonna grab cgroup_mutex which nests outside kernfs |
942 | * css's lifetimes will be decoupled, it should be made | 1090 | * active_ref. cgroup liveliness check alone provides enough |
943 | * per-subsystem and moved to css->id so that lookups are | 1091 | * protection against removal. Ensure @cgrp stays accessible and |
944 | * successful until the target css is released. | 1092 | * break the active_ref protection. |
945 | */ | 1093 | */ |
1094 | cgroup_get(cgrp); | ||
1095 | kernfs_break_active_protection(kn); | ||
1096 | |||
946 | mutex_lock(&cgroup_mutex); | 1097 | mutex_lock(&cgroup_mutex); |
947 | idr_remove(&cgrp->root->cgroup_idr, cgrp->id); | ||
948 | mutex_unlock(&cgroup_mutex); | ||
949 | cgrp->id = -1; | ||
950 | 1098 | ||
951 | call_rcu(&cgrp->rcu_head, cgroup_free_rcu); | 1099 | if (!cgroup_is_dead(cgrp)) |
1100 | return cgrp; | ||
1101 | |||
1102 | cgroup_kn_unlock(kn); | ||
1103 | return NULL; | ||
952 | } | 1104 | } |
953 | 1105 | ||
954 | static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | 1106 | static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) |
955 | { | 1107 | { |
956 | char name[CGROUP_FILE_NAME_MAX]; | 1108 | char name[CGROUP_FILE_NAME_MAX]; |
957 | 1109 | ||
958 | lockdep_assert_held(&cgroup_tree_mutex); | 1110 | lockdep_assert_held(&cgroup_mutex); |
959 | kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); | 1111 | kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); |
960 | } | 1112 | } |
961 | 1113 | ||
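cgroup_kn_lock_live() and cgroup_kn_unlock() encode the pattern every cgroup kernfs method is expected to follow: pin the cgroup, break the kernfs active protection, take cgroup_mutex, and bail out if the cgroup died in the meantime. A condensed sketch of a write handler built on them; the handler itself is hypothetical, but real handlers in this file have the same shape:

    static ssize_t example_write(struct kernfs_open_file *of, char *buf,
                                 size_t nbytes, loff_t off)
    {
            struct cgroup *cgrp;
            int ret = 0;

            cgrp = cgroup_kn_lock_live(of->kn);
            if (!cgrp)
                    return -ENODEV;         /* raced with removal */

            /* ... operate on @cgrp under cgroup_mutex; since the active
             * protection is broken, this may even remove @cgrp itself ... */

            cgroup_kn_unlock(of->kn);
            return ret ?: nbytes;
    }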
@@ -964,7 +1116,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | |||
964 | * @cgrp: target cgroup | 1116 | * @cgrp: target cgroup |
965 | * @subsys_mask: mask of the subsystem ids whose files should be removed | 1117 | * @subsys_mask: mask of the subsystem ids whose files should be removed |
966 | */ | 1118 | */ |
967 | static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) | 1119 | static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask) |
968 | { | 1120 | { |
969 | struct cgroup_subsys *ss; | 1121 | struct cgroup_subsys *ss; |
970 | int i; | 1122 | int i; |
@@ -972,40 +1124,40 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) | |||
972 | for_each_subsys(ss, i) { | 1124 | for_each_subsys(ss, i) { |
973 | struct cftype *cfts; | 1125 | struct cftype *cfts; |
974 | 1126 | ||
975 | if (!test_bit(i, &subsys_mask)) | 1127 | if (!(subsys_mask & (1 << i))) |
976 | continue; | 1128 | continue; |
977 | list_for_each_entry(cfts, &ss->cfts, node) | 1129 | list_for_each_entry(cfts, &ss->cfts, node) |
978 | cgroup_addrm_files(cgrp, cfts, false); | 1130 | cgroup_addrm_files(cgrp, cfts, false); |
979 | } | 1131 | } |
980 | } | 1132 | } |
981 | 1133 | ||
982 | static int rebind_subsystems(struct cgroup_root *dst_root, | 1134 | static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) |
983 | unsigned long ss_mask) | ||
984 | { | 1135 | { |
985 | struct cgroup_subsys *ss; | 1136 | struct cgroup_subsys *ss; |
986 | int ssid, ret; | 1137 | unsigned int tmp_ss_mask; |
1138 | int ssid, i, ret; | ||
987 | 1139 | ||
988 | lockdep_assert_held(&cgroup_tree_mutex); | ||
989 | lockdep_assert_held(&cgroup_mutex); | 1140 | lockdep_assert_held(&cgroup_mutex); |
990 | 1141 | ||
991 | for_each_subsys(ss, ssid) { | 1142 | for_each_subsys(ss, ssid) { |
992 | if (!(ss_mask & (1 << ssid))) | 1143 | if (!(ss_mask & (1 << ssid))) |
993 | continue; | 1144 | continue; |
994 | 1145 | ||
995 | /* if @ss is on the dummy_root, we can always move it */ | 1146 | /* if @ss has non-root csses attached to it, can't move */ |
996 | if (ss->root == &cgrp_dfl_root) | 1147 | if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) |
997 | continue; | ||
998 | |||
999 | /* if @ss has non-root cgroups attached to it, can't move */ | ||
1000 | if (!list_empty(&ss->root->cgrp.children)) | ||
1001 | return -EBUSY; | 1148 | return -EBUSY; |
1002 | 1149 | ||
1003 | /* can't move between two non-dummy roots either */ | 1150 | /* can't move between two non-dummy roots either */ |
1004 | if (dst_root != &cgrp_dfl_root) | 1151 | if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) |
1005 | return -EBUSY; | 1152 | return -EBUSY; |
1006 | } | 1153 | } |
1007 | 1154 | ||
1008 | ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); | 1155 | /* skip creating root files on dfl_root for inhibited subsystems */ |
1156 | tmp_ss_mask = ss_mask; | ||
1157 | if (dst_root == &cgrp_dfl_root) | ||
1158 | tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; | ||
1159 | |||
1160 | ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask); | ||
1009 | if (ret) { | 1161 | if (ret) { |
1010 | if (dst_root != &cgrp_dfl_root) | 1162 | if (dst_root != &cgrp_dfl_root) |
1011 | return ret; | 1163 | return ret; |
@@ -1017,9 +1169,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root, | |||
1017 | * Just warn about it and continue. | 1169 | * Just warn about it and continue. |
1018 | */ | 1170 | */ |
1019 | if (cgrp_dfl_root_visible) { | 1171 | if (cgrp_dfl_root_visible) { |
1020 | pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", | 1172 | pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", |
1021 | ret, ss_mask); | 1173 | ret, ss_mask); |
1022 | pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); | 1174 | pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); |
1023 | } | 1175 | } |
1024 | } | 1176 | } |
1025 | 1177 | ||
@@ -1027,15 +1179,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root, | |||
1027 | * Nothing can fail from this point on. Remove files for the | 1179 | * Nothing can fail from this point on. Remove files for the |
1028 | * removed subsystems and rebind each subsystem. | 1180 | * removed subsystems and rebind each subsystem. |
1029 | */ | 1181 | */ |
1030 | mutex_unlock(&cgroup_mutex); | ||
1031 | for_each_subsys(ss, ssid) | 1182 | for_each_subsys(ss, ssid) |
1032 | if (ss_mask & (1 << ssid)) | 1183 | if (ss_mask & (1 << ssid)) |
1033 | cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); | 1184 | cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); |
1034 | mutex_lock(&cgroup_mutex); | ||
1035 | 1185 | ||
1036 | for_each_subsys(ss, ssid) { | 1186 | for_each_subsys(ss, ssid) { |
1037 | struct cgroup_root *src_root; | 1187 | struct cgroup_root *src_root; |
1038 | struct cgroup_subsys_state *css; | 1188 | struct cgroup_subsys_state *css; |
1189 | struct css_set *cset; | ||
1039 | 1190 | ||
1040 | if (!(ss_mask & (1 << ssid))) | 1191 | if (!(ss_mask & (1 << ssid))) |
1041 | continue; | 1192 | continue; |
@@ -1050,8 +1201,19 @@ static int rebind_subsystems(struct cgroup_root *dst_root, | |||
1050 | ss->root = dst_root; | 1201 | ss->root = dst_root; |
1051 | css->cgroup = &dst_root->cgrp; | 1202 | css->cgroup = &dst_root->cgrp; |
1052 | 1203 | ||
1053 | src_root->cgrp.subsys_mask &= ~(1 << ssid); | 1204 | down_write(&css_set_rwsem); |
1054 | dst_root->cgrp.subsys_mask |= 1 << ssid; | 1205 | hash_for_each(css_set_table, i, cset, hlist) |
1206 | list_move_tail(&cset->e_cset_node[ss->id], | ||
1207 | &dst_root->cgrp.e_csets[ss->id]); | ||
1208 | up_write(&css_set_rwsem); | ||
1209 | |||
1210 | src_root->subsys_mask &= ~(1 << ssid); | ||
1211 | src_root->cgrp.child_subsys_mask &= ~(1 << ssid); | ||
1212 | |||
1213 | /* default hierarchy doesn't enable controllers by default */ | ||
1214 | dst_root->subsys_mask |= 1 << ssid; | ||
1215 | if (dst_root != &cgrp_dfl_root) | ||
1216 | dst_root->cgrp.child_subsys_mask |= 1 << ssid; | ||
1055 | 1217 | ||
1056 | if (ss->bind) | 1218 | if (ss->bind) |
1057 | ss->bind(css); | 1219 | ss->bind(css); |
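The hunk above moves each rebound controller's bit out of the source root's masks and into the destination's, and only sets child_subsys_mask on non-default roots. A minimal standalone sketch of that bit bookkeeping (the struct and field names are illustrative, not the kernel types):

    #include <stdio.h>

    struct root { unsigned int subsys_mask; unsigned int child_subsys_mask; int is_default; };

    /* move controller bit @ssid from @src to @dst, mirroring the hunk's bookkeeping */
    static void rebind_one(struct root *src, struct root *dst, int ssid)
    {
        src->subsys_mask &= ~(1u << ssid);
        src->child_subsys_mask &= ~(1u << ssid);

        dst->subsys_mask |= 1u << ssid;
        /* the default hierarchy does not enable controllers by default */
        if (!dst->is_default)
            dst->child_subsys_mask |= 1u << ssid;
    }

    int main(void)
    {
        struct root legacy = { .subsys_mask = 0x5, .child_subsys_mask = 0x5 };
        struct root dfl = { .is_default = 1 };

        rebind_one(&legacy, &dfl, 2);   /* move controller #2 */
        printf("legacy=0x%x/0x%x dfl=0x%x/0x%x\n",
               legacy.subsys_mask, legacy.child_subsys_mask,
               dfl.subsys_mask, dfl.child_subsys_mask);
        return 0;
    }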
@@ -1069,7 +1231,7 @@ static int cgroup_show_options(struct seq_file *seq, | |||
1069 | int ssid; | 1231 | int ssid; |
1070 | 1232 | ||
1071 | for_each_subsys(ss, ssid) | 1233 | for_each_subsys(ss, ssid) |
1072 | if (root->cgrp.subsys_mask & (1 << ssid)) | 1234 | if (root->subsys_mask & (1 << ssid)) |
1073 | seq_printf(seq, ",%s", ss->name); | 1235 | seq_printf(seq, ",%s", ss->name); |
1074 | if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) | 1236 | if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) |
1075 | seq_puts(seq, ",sane_behavior"); | 1237 | seq_puts(seq, ",sane_behavior"); |
@@ -1091,8 +1253,8 @@ static int cgroup_show_options(struct seq_file *seq, | |||
1091 | } | 1253 | } |
1092 | 1254 | ||
1093 | struct cgroup_sb_opts { | 1255 | struct cgroup_sb_opts { |
1094 | unsigned long subsys_mask; | 1256 | unsigned int subsys_mask; |
1095 | unsigned long flags; | 1257 | unsigned int flags; |
1096 | char *release_agent; | 1258 | char *release_agent; |
1097 | bool cpuset_clone_children; | 1259 | bool cpuset_clone_children; |
1098 | char *name; | 1260 | char *name; |
@@ -1100,24 +1262,16 @@ struct cgroup_sb_opts { | |||
1100 | bool none; | 1262 | bool none; |
1101 | }; | 1263 | }; |
1102 | 1264 | ||
1103 | /* | ||
1104 | * Convert a hierarchy specifier into a bitmask of subsystems and | ||
1105 | * flags. Call with cgroup_mutex held to protect the cgroup_subsys[] | ||
1106 | * array. This function takes refcounts on subsystems to be used, unless it | ||
1107 | * returns error, in which case no refcounts are taken. | ||
1108 | */ | ||
1109 | static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | 1265 | static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) |
1110 | { | 1266 | { |
1111 | char *token, *o = data; | 1267 | char *token, *o = data; |
1112 | bool all_ss = false, one_ss = false; | 1268 | bool all_ss = false, one_ss = false; |
1113 | unsigned long mask = (unsigned long)-1; | 1269 | unsigned int mask = -1U; |
1114 | struct cgroup_subsys *ss; | 1270 | struct cgroup_subsys *ss; |
1115 | int i; | 1271 | int i; |
1116 | 1272 | ||
1117 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); | ||
1118 | |||
1119 | #ifdef CONFIG_CPUSETS | 1273 | #ifdef CONFIG_CPUSETS |
1120 | mask = ~(1UL << cpuset_cgrp_id); | 1274 | mask = ~(1U << cpuset_cgrp_id); |
1121 | #endif | 1275 | #endif |
1122 | 1276 | ||
1123 | memset(opts, 0, sizeof(*opts)); | 1277 | memset(opts, 0, sizeof(*opts)); |
@@ -1198,7 +1352,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1198 | /* Mutually exclusive option 'all' + subsystem name */ | 1352 | /* Mutually exclusive option 'all' + subsystem name */ |
1199 | if (all_ss) | 1353 | if (all_ss) |
1200 | return -EINVAL; | 1354 | return -EINVAL; |
1201 | set_bit(i, &opts->subsys_mask); | 1355 | opts->subsys_mask |= (1 << i); |
1202 | one_ss = true; | 1356 | one_ss = true; |
1203 | 1357 | ||
1204 | break; | 1358 | break; |
@@ -1210,12 +1364,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1210 | /* Consistency checks */ | 1364 | /* Consistency checks */ |
1211 | 1365 | ||
1212 | if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { | 1366 | if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { |
1213 | pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); | 1367 | pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); |
1214 | 1368 | ||
1215 | if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || | 1369 | if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || |
1216 | opts->cpuset_clone_children || opts->release_agent || | 1370 | opts->cpuset_clone_children || opts->release_agent || |
1217 | opts->name) { | 1371 | opts->name) { |
1218 | pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); | 1372 | pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); |
1219 | return -EINVAL; | 1373 | return -EINVAL; |
1220 | } | 1374 | } |
1221 | } else { | 1375 | } else { |
@@ -1227,7 +1381,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1227 | if (all_ss || (!one_ss && !opts->none && !opts->name)) | 1381 | if (all_ss || (!one_ss && !opts->none && !opts->name)) |
1228 | for_each_subsys(ss, i) | 1382 | for_each_subsys(ss, i) |
1229 | if (!ss->disabled) | 1383 | if (!ss->disabled) |
1230 | set_bit(i, &opts->subsys_mask); | 1384 | opts->subsys_mask |= (1 << i); |
1231 | 1385 | ||
1232 | /* | 1386 | /* |
1233 | * We either have to specify by name or by subsystems. (So | 1387 | * We either have to specify by name or by subsystems. (So |
@@ -1258,14 +1412,13 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) | |||
1258 | int ret = 0; | 1412 | int ret = 0; |
1259 | struct cgroup_root *root = cgroup_root_from_kf(kf_root); | 1413 | struct cgroup_root *root = cgroup_root_from_kf(kf_root); |
1260 | struct cgroup_sb_opts opts; | 1414 | struct cgroup_sb_opts opts; |
1261 | unsigned long added_mask, removed_mask; | 1415 | unsigned int added_mask, removed_mask; |
1262 | 1416 | ||
1263 | if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { | 1417 | if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { |
1264 | pr_err("cgroup: sane_behavior: remount is not allowed\n"); | 1418 | pr_err("sane_behavior: remount is not allowed\n"); |
1265 | return -EINVAL; | 1419 | return -EINVAL; |
1266 | } | 1420 | } |
1267 | 1421 | ||
1268 | mutex_lock(&cgroup_tree_mutex); | ||
1269 | mutex_lock(&cgroup_mutex); | 1422 | mutex_lock(&cgroup_mutex); |
1270 | 1423 | ||
1271 | /* See what subsystems are wanted */ | 1424 | /* See what subsystems are wanted */ |
@@ -1273,17 +1426,17 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) | |||
1273 | if (ret) | 1426 | if (ret) |
1274 | goto out_unlock; | 1427 | goto out_unlock; |
1275 | 1428 | ||
1276 | if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) | 1429 | if (opts.subsys_mask != root->subsys_mask || opts.release_agent) |
1277 | pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", | 1430 | pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", |
1278 | task_tgid_nr(current), current->comm); | 1431 | task_tgid_nr(current), current->comm); |
1279 | 1432 | ||
1280 | added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; | 1433 | added_mask = opts.subsys_mask & ~root->subsys_mask; |
1281 | removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; | 1434 | removed_mask = root->subsys_mask & ~opts.subsys_mask; |
1282 | 1435 | ||
1283 | /* Don't allow flags or name to change at remount */ | 1436 | /* Don't allow flags or name to change at remount */ |
1284 | if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || | 1437 | if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || |
1285 | (opts.name && strcmp(opts.name, root->name))) { | 1438 | (opts.name && strcmp(opts.name, root->name))) { |
1286 | pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", | 1439 | pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", |
1287 | opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", | 1440 | opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", |
1288 | root->flags & CGRP_ROOT_OPTION_MASK, root->name); | 1441 | root->flags & CGRP_ROOT_OPTION_MASK, root->name); |
1289 | ret = -EINVAL; | 1442 | ret = -EINVAL; |
@@ -1291,7 +1444,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) | |||
1291 | } | 1444 | } |
1292 | 1445 | ||
1293 | /* remounting is not allowed for populated hierarchies */ | 1446 | /* remounting is not allowed for populated hierarchies */ |
1294 | if (!list_empty(&root->cgrp.children)) { | 1447 | if (!list_empty(&root->cgrp.self.children)) { |
1295 | ret = -EBUSY; | 1448 | ret = -EBUSY; |
1296 | goto out_unlock; | 1449 | goto out_unlock; |
1297 | } | 1450 | } |
@@ -1311,7 +1464,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) | |||
1311 | kfree(opts.release_agent); | 1464 | kfree(opts.release_agent); |
1312 | kfree(opts.name); | 1465 | kfree(opts.name); |
1313 | mutex_unlock(&cgroup_mutex); | 1466 | mutex_unlock(&cgroup_mutex); |
1314 | mutex_unlock(&cgroup_tree_mutex); | ||
1315 | return ret; | 1467 | return ret; |
1316 | } | 1468 | } |
1317 | 1469 | ||
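The added_mask/removed_mask computed at remount are plain set differences on the now fixed-width masks; the arithmetic in isolation (the example values are made up):

    #include <stdio.h>

    int main(void)
    {
        unsigned int old_mask = 0x6;    /* say controllers 1 and 2 are bound now */
        unsigned int new_mask = 0xc;    /* the remount asks for controllers 2 and 3 */

        unsigned int added   = new_mask & ~old_mask;    /* bits only in the new set */
        unsigned int removed = old_mask & ~new_mask;    /* bits only in the old set */

        printf("added=0x%x removed=0x%x\n", added, removed);   /* 0x8 and 0x2 */
        return 0;
    }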
@@ -1369,14 +1521,22 @@ out_unlock: | |||
1369 | 1521 | ||
1370 | static void init_cgroup_housekeeping(struct cgroup *cgrp) | 1522 | static void init_cgroup_housekeeping(struct cgroup *cgrp) |
1371 | { | 1523 | { |
1372 | atomic_set(&cgrp->refcnt, 1); | 1524 | struct cgroup_subsys *ss; |
1373 | INIT_LIST_HEAD(&cgrp->sibling); | 1525 | int ssid; |
1374 | INIT_LIST_HEAD(&cgrp->children); | 1526 | |
1527 | INIT_LIST_HEAD(&cgrp->self.sibling); | ||
1528 | INIT_LIST_HEAD(&cgrp->self.children); | ||
1375 | INIT_LIST_HEAD(&cgrp->cset_links); | 1529 | INIT_LIST_HEAD(&cgrp->cset_links); |
1376 | INIT_LIST_HEAD(&cgrp->release_list); | 1530 | INIT_LIST_HEAD(&cgrp->release_list); |
1377 | INIT_LIST_HEAD(&cgrp->pidlists); | 1531 | INIT_LIST_HEAD(&cgrp->pidlists); |
1378 | mutex_init(&cgrp->pidlist_mutex); | 1532 | mutex_init(&cgrp->pidlist_mutex); |
1379 | cgrp->dummy_css.cgroup = cgrp; | 1533 | cgrp->self.cgroup = cgrp; |
1534 | cgrp->self.flags |= CSS_ONLINE; | ||
1535 | |||
1536 | for_each_subsys(ss, ssid) | ||
1537 | INIT_LIST_HEAD(&cgrp->e_csets[ssid]); | ||
1538 | |||
1539 | init_waitqueue_head(&cgrp->offline_waitq); | ||
1380 | } | 1540 | } |
1381 | 1541 | ||
1382 | static void init_cgroup_root(struct cgroup_root *root, | 1542 | static void init_cgroup_root(struct cgroup_root *root, |
@@ -1399,21 +1559,24 @@ static void init_cgroup_root(struct cgroup_root *root, | |||
1399 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); | 1559 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); |
1400 | } | 1560 | } |
1401 | 1561 | ||
1402 | static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) | 1562 | static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) |
1403 | { | 1563 | { |
1404 | LIST_HEAD(tmp_links); | 1564 | LIST_HEAD(tmp_links); |
1405 | struct cgroup *root_cgrp = &root->cgrp; | 1565 | struct cgroup *root_cgrp = &root->cgrp; |
1406 | struct css_set *cset; | 1566 | struct css_set *cset; |
1407 | int i, ret; | 1567 | int i, ret; |
1408 | 1568 | ||
1409 | lockdep_assert_held(&cgroup_tree_mutex); | ||
1410 | lockdep_assert_held(&cgroup_mutex); | 1569 | lockdep_assert_held(&cgroup_mutex); |
1411 | 1570 | ||
1412 | ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); | 1571 | ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); |
1413 | if (ret < 0) | 1572 | if (ret < 0) |
1414 | goto out; | 1573 | goto out; |
1415 | root_cgrp->id = ret; | 1574 | root_cgrp->id = ret; |
1416 | 1575 | ||
1576 | ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); | ||
1577 | if (ret) | ||
1578 | goto out; | ||
1579 | |||
1417 | /* | 1580 | /* |
1418 | * We're accessing css_set_count without locking css_set_rwsem here, | 1581 | * We're accessing css_set_count without locking css_set_rwsem here, |
1419 | * but that's OK - it can only be increased by someone holding | 1582 | * but that's OK - it can only be increased by someone holding |
@@ -1422,11 +1585,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) | |||
1422 | */ | 1585 | */ |
1423 | ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); | 1586 | ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); |
1424 | if (ret) | 1587 | if (ret) |
1425 | goto out; | 1588 | goto cancel_ref; |
1426 | 1589 | ||
1427 | ret = cgroup_init_root_id(root); | 1590 | ret = cgroup_init_root_id(root); |
1428 | if (ret) | 1591 | if (ret) |
1429 | goto out; | 1592 | goto cancel_ref; |
1430 | 1593 | ||
1431 | root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, | 1594 | root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, |
1432 | KERNFS_ROOT_CREATE_DEACTIVATED, | 1595 | KERNFS_ROOT_CREATE_DEACTIVATED, |
@@ -1462,7 +1625,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) | |||
1462 | link_css_set(&tmp_links, cset, root_cgrp); | 1625 | link_css_set(&tmp_links, cset, root_cgrp); |
1463 | up_write(&css_set_rwsem); | 1626 | up_write(&css_set_rwsem); |
1464 | 1627 | ||
1465 | BUG_ON(!list_empty(&root_cgrp->children)); | 1628 | BUG_ON(!list_empty(&root_cgrp->self.children)); |
1466 | BUG_ON(atomic_read(&root->nr_cgrps) != 1); | 1629 | BUG_ON(atomic_read(&root->nr_cgrps) != 1); |
1467 | 1630 | ||
1468 | kernfs_activate(root_cgrp->kn); | 1631 | kernfs_activate(root_cgrp->kn); |
@@ -1474,6 +1637,8 @@ destroy_root: | |||
1474 | root->kf_root = NULL; | 1637 | root->kf_root = NULL; |
1475 | exit_root_id: | 1638 | exit_root_id: |
1476 | cgroup_exit_root_id(root); | 1639 | cgroup_exit_root_id(root); |
1640 | cancel_ref: | ||
1641 | percpu_ref_cancel_init(&root_cgrp->self.refcnt); | ||
1477 | out: | 1642 | out: |
1478 | free_cgrp_cset_links(&tmp_links); | 1643 | free_cgrp_cset_links(&tmp_links); |
1479 | return ret; | 1644 | return ret; |
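cgroup_setup_root gains a cancel_ref label so that a reference initialised early is undone when a later step fails. The usual shape of that layered goto unwinding, sketched with stand-in resources (nothing here is the kernel API):

    #include <stdio.h>
    #include <stdlib.h>

    /* stand-ins for the real init steps */
    static int init_ref(void **p)   { *p = malloc(1); return *p ? 0 : -1; }
    static int init_links(void **p) { *p = malloc(1); return *p ? 0 : -1; }

    static int setup(void)
    {
        void *ref = NULL, *links = NULL;
        int ret;

        ret = init_ref(&ref);
        if (ret)
            goto out;

        ret = init_links(&links);
        if (ret)
            goto cancel_ref;    /* undo only what already succeeded */

        printf("setup ok\n");
        free(links);            /* teardown on success elided in the real code */
        free(ref);
        return 0;

    cancel_ref:
        free(ref);
    out:
        return ret;
    }

    int main(void) { return setup(); }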
@@ -1495,8 +1660,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1495 | */ | 1660 | */ |
1496 | if (!use_task_css_set_links) | 1661 | if (!use_task_css_set_links) |
1497 | cgroup_enable_task_cg_lists(); | 1662 | cgroup_enable_task_cg_lists(); |
1498 | retry: | 1663 | |
1499 | mutex_lock(&cgroup_tree_mutex); | ||
1500 | mutex_lock(&cgroup_mutex); | 1664 | mutex_lock(&cgroup_mutex); |
1501 | 1665 | ||
1502 | /* First find the desired set of subsystems */ | 1666 | /* First find the desired set of subsystems */ |
@@ -1535,7 +1699,7 @@ retry: | |||
1535 | * subsystems) then they must match. | 1699 | * subsystems) then they must match. |
1536 | */ | 1700 | */ |
1537 | if ((opts.subsys_mask || opts.none) && | 1701 | if ((opts.subsys_mask || opts.none) && |
1538 | (opts.subsys_mask != root->cgrp.subsys_mask)) { | 1702 | (opts.subsys_mask != root->subsys_mask)) { |
1539 | if (!name_match) | 1703 | if (!name_match) |
1540 | continue; | 1704 | continue; |
1541 | ret = -EBUSY; | 1705 | ret = -EBUSY; |
@@ -1544,28 +1708,27 @@ retry: | |||
1544 | 1708 | ||
1545 | if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { | 1709 | if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { |
1546 | if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { | 1710 | if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { |
1547 | pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); | 1711 | pr_err("sane_behavior: new mount options should match the existing superblock\n"); |
1548 | ret = -EINVAL; | 1712 | ret = -EINVAL; |
1549 | goto out_unlock; | 1713 | goto out_unlock; |
1550 | } else { | 1714 | } else { |
1551 | pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); | 1715 | pr_warn("new mount options do not match the existing superblock, will be ignored\n"); |
1552 | } | 1716 | } |
1553 | } | 1717 | } |
1554 | 1718 | ||
1555 | /* | 1719 | /* |
1556 | * A root's lifetime is governed by its root cgroup. Zero | 1720 | * A root's lifetime is governed by its root cgroup. |
1557 | * ref indicate that the root is being destroyed. Wait for | 1721 | * tryget_live failure indicate that the root is being |
1558 | * destruction to complete so that the subsystems are free. | 1722 | * destroyed. Wait for destruction to complete so that the |
1559 | * We can use wait_queue for the wait but this path is | 1723 | * subsystems are free. We can use wait_queue for the wait |
1560 | * super cold. Let's just sleep for a bit and retry. | 1724 | * but this path is super cold. Let's just sleep for a bit |
1725 | * and retry. | ||
1561 | */ | 1726 | */ |
1562 | if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { | 1727 | if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { |
1563 | mutex_unlock(&cgroup_mutex); | 1728 | mutex_unlock(&cgroup_mutex); |
1564 | mutex_unlock(&cgroup_tree_mutex); | ||
1565 | kfree(opts.release_agent); | ||
1566 | kfree(opts.name); | ||
1567 | msleep(10); | 1729 | msleep(10); |
1568 | goto retry; | 1730 | ret = restart_syscall(); |
1731 | goto out_free; | ||
1569 | } | 1732 | } |
1570 | 1733 | ||
1571 | ret = 0; | 1734 | ret = 0; |
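The mount path now drops its locks and returns restart_syscall() rather than looping with a goto when the root it found is already dying. A rough userspace sketch of that "try to pin, otherwise back off and restart from the top" pattern using a plain counter (none of this is the percpu_ref API):

    #include <stdio.h>
    #include <stdbool.h>

    #define RESTART 1   /* caller is expected to redo the whole operation */

    struct obj { int refcnt; bool dying; };

    /* take a reference only while the object is still live */
    static bool tryget_live(struct obj *o)
    {
        if (o->dying)
            return false;
        o->refcnt++;
        return true;
    }

    static int mount_like(struct obj *root)
    {
        if (!tryget_live(root)) {
            /* drop all locks here, sleep briefly, then ask for a restart
             * instead of spinning on state that is going away */
            return RESTART;
        }
        /* ... proceed with the pinned root, then drop the reference ... */
        root->refcnt--;
        return 0;
    }

    int main(void)
    {
        struct obj dying = { .dying = true }, live = { 0 };

        printf("%d %d\n", mount_like(&dying), mount_like(&live));  /* 1 0 */
        return 0;
    }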
@@ -1596,15 +1759,15 @@ retry: | |||
1596 | 1759 | ||
1597 | out_unlock: | 1760 | out_unlock: |
1598 | mutex_unlock(&cgroup_mutex); | 1761 | mutex_unlock(&cgroup_mutex); |
1599 | mutex_unlock(&cgroup_tree_mutex); | 1762 | out_free: |
1600 | |||
1601 | kfree(opts.release_agent); | 1763 | kfree(opts.release_agent); |
1602 | kfree(opts.name); | 1764 | kfree(opts.name); |
1603 | 1765 | ||
1604 | if (ret) | 1766 | if (ret) |
1605 | return ERR_PTR(ret); | 1767 | return ERR_PTR(ret); |
1606 | 1768 | ||
1607 | dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); | 1769 | dentry = kernfs_mount(fs_type, flags, root->kf_root, |
1770 | CGROUP_SUPER_MAGIC, &new_sb); | ||
1608 | if (IS_ERR(dentry) || !new_sb) | 1771 | if (IS_ERR(dentry) || !new_sb) |
1609 | cgroup_put(&root->cgrp); | 1772 | cgroup_put(&root->cgrp); |
1610 | return dentry; | 1773 | return dentry; |
@@ -1615,7 +1778,19 @@ static void cgroup_kill_sb(struct super_block *sb) | |||
1615 | struct kernfs_root *kf_root = kernfs_root_from_sb(sb); | 1778 | struct kernfs_root *kf_root = kernfs_root_from_sb(sb); |
1616 | struct cgroup_root *root = cgroup_root_from_kf(kf_root); | 1779 | struct cgroup_root *root = cgroup_root_from_kf(kf_root); |
1617 | 1780 | ||
1618 | cgroup_put(&root->cgrp); | 1781 | /* |
1782 | * If @root doesn't have any mounts or children, start killing it. | ||
1783 | * This prevents new mounts by disabling percpu_ref_tryget_live(). | ||
1784 | * cgroup_mount() may wait for @root's release. | ||
1785 | * | ||
1786 | * And don't kill the default root. | ||
1787 | */ | ||
1788 | if (css_has_online_children(&root->cgrp.self) || | ||
1789 | root == &cgrp_dfl_root) | ||
1790 | cgroup_put(&root->cgrp); | ||
1791 | else | ||
1792 | percpu_ref_kill(&root->cgrp.self.refcnt); | ||
1793 | |||
1619 | kernfs_kill_sb(sb); | 1794 | kernfs_kill_sb(sb); |
1620 | } | 1795 | } |
1621 | 1796 | ||
@@ -1737,7 +1912,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) | |||
1737 | 1912 | ||
1738 | /** | 1913 | /** |
1739 | * cgroup_task_migrate - move a task from one cgroup to another. | 1914 | * cgroup_task_migrate - move a task from one cgroup to another. |
1740 | * @old_cgrp; the cgroup @tsk is being migrated from | 1915 | * @old_cgrp: the cgroup @tsk is being migrated from |
1741 | * @tsk: the task being migrated | 1916 | * @tsk: the task being migrated |
1742 | * @new_cset: the new css_set @tsk is being attached to | 1917 | * @new_cset: the new css_set @tsk is being attached to |
1743 | * | 1918 | * |
@@ -1829,10 +2004,6 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, | |||
1829 | 2004 | ||
1830 | src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); | 2005 | src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); |
1831 | 2006 | ||
1832 | /* nothing to do if this cset already belongs to the cgroup */ | ||
1833 | if (src_cgrp == dst_cgrp) | ||
1834 | return; | ||
1835 | |||
1836 | if (!list_empty(&src_cset->mg_preload_node)) | 2007 | if (!list_empty(&src_cset->mg_preload_node)) |
1837 | return; | 2008 | return; |
1838 | 2009 | ||
@@ -1847,13 +2018,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, | |||
1847 | 2018 | ||
1848 | /** | 2019 | /** |
1849 | * cgroup_migrate_prepare_dst - prepare destination css_sets for migration | 2020 | * cgroup_migrate_prepare_dst - prepare destination css_sets for migration |
1850 | * @dst_cgrp: the destination cgroup | 2021 | * @dst_cgrp: the destination cgroup (may be %NULL) |
1851 | * @preloaded_csets: list of preloaded source css_sets | 2022 | * @preloaded_csets: list of preloaded source css_sets |
1852 | * | 2023 | * |
1853 | * Tasks are about to be moved to @dst_cgrp and all the source css_sets | 2024 | * Tasks are about to be moved to @dst_cgrp and all the source css_sets |
1854 | * have been preloaded to @preloaded_csets. This function looks up and | 2025 | * have been preloaded to @preloaded_csets. This function looks up and |
1855 | * pins all destination css_sets, links each to its source, and put them on | 2026 | * pins all destination css_sets, links each to its source, and append them |
1856 | * @preloaded_csets. | 2027 | * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each |
2028 | * source css_set is assumed to be its cgroup on the default hierarchy. | ||
1857 | * | 2029 | * |
1858 | * This function must be called after cgroup_migrate_add_src() has been | 2030 | * This function must be called after cgroup_migrate_add_src() has been |
1859 | * called on each migration source css_set. After migration is performed | 2031 | * called on each migration source css_set. After migration is performed |
@@ -1864,19 +2036,42 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, | |||
1864 | struct list_head *preloaded_csets) | 2036 | struct list_head *preloaded_csets) |
1865 | { | 2037 | { |
1866 | LIST_HEAD(csets); | 2038 | LIST_HEAD(csets); |
1867 | struct css_set *src_cset; | 2039 | struct css_set *src_cset, *tmp_cset; |
1868 | 2040 | ||
1869 | lockdep_assert_held(&cgroup_mutex); | 2041 | lockdep_assert_held(&cgroup_mutex); |
1870 | 2042 | ||
2043 | /* | ||
2044 | * Except for the root, child_subsys_mask must be zero for a cgroup | ||
2045 | * with tasks so that child cgroups don't compete against tasks. | ||
2046 | */ | ||
2047 | if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && | ||
2048 | dst_cgrp->child_subsys_mask) | ||
2049 | return -EBUSY; | ||
2050 | |||
1871 | /* look up the dst cset for each src cset and link it to src */ | 2051 | /* look up the dst cset for each src cset and link it to src */ |
1872 | list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { | 2052 | list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { |
1873 | struct css_set *dst_cset; | 2053 | struct css_set *dst_cset; |
1874 | 2054 | ||
1875 | dst_cset = find_css_set(src_cset, dst_cgrp); | 2055 | dst_cset = find_css_set(src_cset, |
2056 | dst_cgrp ?: src_cset->dfl_cgrp); | ||
1876 | if (!dst_cset) | 2057 | if (!dst_cset) |
1877 | goto err; | 2058 | goto err; |
1878 | 2059 | ||
1879 | WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); | 2060 | WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); |
2061 | |||
2062 | /* | ||
2063 | * If src cset equals dst, it's noop. Drop the src. | ||
2064 | * cgroup_migrate() will skip the cset too. Note that we | ||
2065 | * can't handle src == dst as some nodes are used by both. | ||
2066 | */ | ||
2067 | if (src_cset == dst_cset) { | ||
2068 | src_cset->mg_src_cgrp = NULL; | ||
2069 | list_del_init(&src_cset->mg_preload_node); | ||
2070 | put_css_set(src_cset, false); | ||
2071 | put_css_set(dst_cset, false); | ||
2072 | continue; | ||
2073 | } | ||
2074 | |||
1880 | src_cset->mg_dst_cset = dst_cset; | 2075 | src_cset->mg_dst_cset = dst_cset; |
1881 | 2076 | ||
1882 | if (list_empty(&dst_cset->mg_preload_node)) | 2077 | if (list_empty(&dst_cset->mg_preload_node)) |
@@ -1885,7 +2080,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, | |||
1885 | put_css_set(dst_cset, false); | 2080 | put_css_set(dst_cset, false); |
1886 | } | 2081 | } |
1887 | 2082 | ||
1888 | list_splice(&csets, preloaded_csets); | 2083 | list_splice_tail(&csets, preloaded_csets); |
1889 | return 0; | 2084 | return 0; |
1890 | err: | 2085 | err: |
1891 | cgroup_migrate_finish(&csets); | 2086 | cgroup_migrate_finish(&csets); |
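prepare_dst now walks the preload list with the _safe iterator so it can drop src==dst noop entries while iterating. The same "unlink the current node without losing your place" idea over a simple singly linked list (types are illustrative):

    #include <stdio.h>
    #include <stdlib.h>

    struct cset { int id; int is_noop; struct cset *next; };

    /* drop "noop" entries while walking the list; the next element is taken
     * before the current one may be freed, which is why the hunk switches to
     * the _safe list iterator */
    static void prune(struct cset **head)
    {
        struct cset **pp = head;

        while (*pp) {
            struct cset *cur = *pp;

            if (cur->is_noop) {
                *pp = cur->next;    /* unlink before freeing */
                free(cur);
            } else {
                pp = &cur->next;
            }
        }
    }

    static struct cset *push(struct cset *next, int id, int noop)
    {
        struct cset *c = malloc(sizeof(*c));
        c->id = id; c->is_noop = noop; c->next = next;
        return c;
    }

    int main(void)
    {
        struct cset *head = push(push(push(NULL, 3, 0), 2, 1), 1, 0);

        prune(&head);
        for (struct cset *c = head; c; c = c->next)
            printf("%d ", c->id);   /* 1 3 */
        printf("\n");
        return 0;
    }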
@@ -1966,7 +2161,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, | |||
1966 | return 0; | 2161 | return 0; |
1967 | 2162 | ||
1968 | /* check that we can legitimately attach to the cgroup */ | 2163 | /* check that we can legitimately attach to the cgroup */ |
1969 | for_each_css(css, i, cgrp) { | 2164 | for_each_e_css(css, i, cgrp) { |
1970 | if (css->ss->can_attach) { | 2165 | if (css->ss->can_attach) { |
1971 | ret = css->ss->can_attach(css, &tset); | 2166 | ret = css->ss->can_attach(css, &tset); |
1972 | if (ret) { | 2167 | if (ret) { |
@@ -1996,7 +2191,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, | |||
1996 | */ | 2191 | */ |
1997 | tset.csets = &tset.dst_csets; | 2192 | tset.csets = &tset.dst_csets; |
1998 | 2193 | ||
1999 | for_each_css(css, i, cgrp) | 2194 | for_each_e_css(css, i, cgrp) |
2000 | if (css->ss->attach) | 2195 | if (css->ss->attach) |
2001 | css->ss->attach(css, &tset); | 2196 | css->ss->attach(css, &tset); |
2002 | 2197 | ||
@@ -2004,7 +2199,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, | |||
2004 | goto out_release_tset; | 2199 | goto out_release_tset; |
2005 | 2200 | ||
2006 | out_cancel_attach: | 2201 | out_cancel_attach: |
2007 | for_each_css(css, i, cgrp) { | 2202 | for_each_e_css(css, i, cgrp) { |
2008 | if (css == failed_css) | 2203 | if (css == failed_css) |
2009 | break; | 2204 | break; |
2010 | if (css->ss->cancel_attach) | 2205 | if (css->ss->cancel_attach) |
@@ -2063,13 +2258,20 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, | |||
2063 | * function to attach either it or all tasks in its threadgroup. Will lock | 2258 | * function to attach either it or all tasks in its threadgroup. Will lock |
2064 | * cgroup_mutex and threadgroup. | 2259 | * cgroup_mutex and threadgroup. |
2065 | */ | 2260 | */ |
2066 | static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) | 2261 | static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, |
2262 | size_t nbytes, loff_t off, bool threadgroup) | ||
2067 | { | 2263 | { |
2068 | struct task_struct *tsk; | 2264 | struct task_struct *tsk; |
2069 | const struct cred *cred = current_cred(), *tcred; | 2265 | const struct cred *cred = current_cred(), *tcred; |
2266 | struct cgroup *cgrp; | ||
2267 | pid_t pid; | ||
2070 | int ret; | 2268 | int ret; |
2071 | 2269 | ||
2072 | if (!cgroup_lock_live_group(cgrp)) | 2270 | if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) |
2271 | return -EINVAL; | ||
2272 | |||
2273 | cgrp = cgroup_kn_lock_live(of->kn); | ||
2274 | if (!cgrp) | ||
2073 | return -ENODEV; | 2275 | return -ENODEV; |
2074 | 2276 | ||
2075 | retry_find_task: | 2277 | retry_find_task: |
@@ -2135,8 +2337,8 @@ retry_find_task: | |||
2135 | 2337 | ||
2136 | put_task_struct(tsk); | 2338 | put_task_struct(tsk); |
2137 | out_unlock_cgroup: | 2339 | out_unlock_cgroup: |
2138 | mutex_unlock(&cgroup_mutex); | 2340 | cgroup_kn_unlock(of->kn); |
2139 | return ret; | 2341 | return ret ?: nbytes; |
2140 | } | 2342 | } |
2141 | 2343 | ||
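The procs/tasks write handlers now take the raw buffer themselves: strip whitespace, parse a non-negative pid, and report the whole write as consumed on success. Roughly this, as plain userspace C with strtol standing in for kstrtoint/strstrip:

    #include <ctype.h>
    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* parse a non-negative pid out of @buf; return bytes consumed or -EINVAL */
    static long procs_write(char *buf, size_t nbytes)
    {
        size_t len = nbytes;
        char *end;
        long pid;

        /* strip trailing whitespace, as strstrip() would */
        while (len && isspace((unsigned char)buf[len - 1]))
            buf[--len] = '\0';

        errno = 0;
        pid = strtol(buf, &end, 0);
        if (errno || end == buf || *end != '\0' || pid < 0)
            return -EINVAL;

        /* ... look up and attach @pid here ... */
        return (long)nbytes;    /* like "ret ?: nbytes": success consumes the write */
    }

    int main(void)
    {
        char a[] = "1234\n", b[] = "-7\n";

        printf("%ld %ld\n", procs_write(a, strlen(a)), procs_write(b, strlen(b)));
        return 0;
    }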
2142 | /** | 2344 | /** |
@@ -2170,43 +2372,44 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | |||
2170 | } | 2372 | } |
2171 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); | 2373 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); |
2172 | 2374 | ||
2173 | static int cgroup_tasks_write(struct cgroup_subsys_state *css, | 2375 | static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, |
2174 | struct cftype *cft, u64 pid) | 2376 | char *buf, size_t nbytes, loff_t off) |
2175 | { | 2377 | { |
2176 | return attach_task_by_pid(css->cgroup, pid, false); | 2378 | return __cgroup_procs_write(of, buf, nbytes, off, false); |
2177 | } | 2379 | } |
2178 | 2380 | ||
2179 | static int cgroup_procs_write(struct cgroup_subsys_state *css, | 2381 | static ssize_t cgroup_procs_write(struct kernfs_open_file *of, |
2180 | struct cftype *cft, u64 tgid) | 2382 | char *buf, size_t nbytes, loff_t off) |
2181 | { | 2383 | { |
2182 | return attach_task_by_pid(css->cgroup, tgid, true); | 2384 | return __cgroup_procs_write(of, buf, nbytes, off, true); |
2183 | } | 2385 | } |
2184 | 2386 | ||
2185 | static int cgroup_release_agent_write(struct cgroup_subsys_state *css, | 2387 | static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, |
2186 | struct cftype *cft, char *buffer) | 2388 | char *buf, size_t nbytes, loff_t off) |
2187 | { | 2389 | { |
2188 | struct cgroup_root *root = css->cgroup->root; | 2390 | struct cgroup *cgrp; |
2189 | 2391 | ||
2190 | BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); | 2392 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); |
2191 | if (!cgroup_lock_live_group(css->cgroup)) | 2393 | |
2394 | cgrp = cgroup_kn_lock_live(of->kn); | ||
2395 | if (!cgrp) | ||
2192 | return -ENODEV; | 2396 | return -ENODEV; |
2193 | spin_lock(&release_agent_path_lock); | 2397 | spin_lock(&release_agent_path_lock); |
2194 | strlcpy(root->release_agent_path, buffer, | 2398 | strlcpy(cgrp->root->release_agent_path, strstrip(buf), |
2195 | sizeof(root->release_agent_path)); | 2399 | sizeof(cgrp->root->release_agent_path)); |
2196 | spin_unlock(&release_agent_path_lock); | 2400 | spin_unlock(&release_agent_path_lock); |
2197 | mutex_unlock(&cgroup_mutex); | 2401 | cgroup_kn_unlock(of->kn); |
2198 | return 0; | 2402 | return nbytes; |
2199 | } | 2403 | } |
2200 | 2404 | ||
2201 | static int cgroup_release_agent_show(struct seq_file *seq, void *v) | 2405 | static int cgroup_release_agent_show(struct seq_file *seq, void *v) |
2202 | { | 2406 | { |
2203 | struct cgroup *cgrp = seq_css(seq)->cgroup; | 2407 | struct cgroup *cgrp = seq_css(seq)->cgroup; |
2204 | 2408 | ||
2205 | if (!cgroup_lock_live_group(cgrp)) | 2409 | spin_lock(&release_agent_path_lock); |
2206 | return -ENODEV; | ||
2207 | seq_puts(seq, cgrp->root->release_agent_path); | 2410 | seq_puts(seq, cgrp->root->release_agent_path); |
2411 | spin_unlock(&release_agent_path_lock); | ||
2208 | seq_putc(seq, '\n'); | 2412 | seq_putc(seq, '\n'); |
2209 | mutex_unlock(&cgroup_mutex); | ||
2210 | return 0; | 2413 | return 0; |
2211 | } | 2414 | } |
2212 | 2415 | ||
@@ -2218,6 +2421,320 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) | |||
2218 | return 0; | 2421 | return 0; |
2219 | } | 2422 | } |
2220 | 2423 | ||
2424 | static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask) | ||
2425 | { | ||
2426 | struct cgroup_subsys *ss; | ||
2427 | bool printed = false; | ||
2428 | int ssid; | ||
2429 | |||
2430 | for_each_subsys(ss, ssid) { | ||
2431 | if (ss_mask & (1 << ssid)) { | ||
2432 | if (printed) | ||
2433 | seq_putc(seq, ' '); | ||
2434 | seq_printf(seq, "%s", ss->name); | ||
2435 | printed = true; | ||
2436 | } | ||
2437 | } | ||
2438 | if (printed) | ||
2439 | seq_putc(seq, '\n'); | ||
2440 | } | ||
2441 | |||
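cgroup_print_ss_mask above just walks the registered subsystems and emits a space-separated list of names for the set bits. A standalone version against a made-up name table:

    #include <stdio.h>

    static const char * const names[] = { "cpu", "memory", "io", "pids" };

    /* print the names whose bits are set in @mask, space separated, newline terminated */
    static void print_mask(unsigned int mask)
    {
        int printed = 0;

        for (unsigned int i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
            if (!(mask & (1u << i)))
                continue;
            printf("%s%s", printed ? " " : "", names[i]);
            printed = 1;
        }
        if (printed)
            printf("\n");
    }

    int main(void)
    {
        print_mask(0x5);    /* prints "cpu io" */
        return 0;
    }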
2442 | /* show controllers which are currently attached to the default hierarchy */ | ||
2443 | static int cgroup_root_controllers_show(struct seq_file *seq, void *v) | ||
2444 | { | ||
2445 | struct cgroup *cgrp = seq_css(seq)->cgroup; | ||
2446 | |||
2447 | cgroup_print_ss_mask(seq, cgrp->root->subsys_mask & | ||
2448 | ~cgrp_dfl_root_inhibit_ss_mask); | ||
2449 | return 0; | ||
2450 | } | ||
2451 | |||
2452 | /* show controllers which are enabled from the parent */ | ||
2453 | static int cgroup_controllers_show(struct seq_file *seq, void *v) | ||
2454 | { | ||
2455 | struct cgroup *cgrp = seq_css(seq)->cgroup; | ||
2456 | |||
2457 | cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); | ||
2458 | return 0; | ||
2459 | } | ||
2460 | |||
2461 | /* show controllers which are enabled for a given cgroup's children */ | ||
2462 | static int cgroup_subtree_control_show(struct seq_file *seq, void *v) | ||
2463 | { | ||
2464 | struct cgroup *cgrp = seq_css(seq)->cgroup; | ||
2465 | |||
2466 | cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); | ||
2467 | return 0; | ||
2468 | } | ||
2469 | |||
2470 | /** | ||
2471 | * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy | ||
2472 | * @cgrp: root of the subtree to update csses for | ||
2473 | * | ||
2474 | * @cgrp's child_subsys_mask has changed and its subtree's (self excluded) | ||
2475 | * css associations need to be updated accordingly. This function looks up | ||
2476 | * all css_sets which are attached to the subtree, creates the matching | ||
2477 | * updated css_sets and migrates the tasks to the new ones. | ||
2478 | */ | ||
2479 | static int cgroup_update_dfl_csses(struct cgroup *cgrp) | ||
2480 | { | ||
2481 | LIST_HEAD(preloaded_csets); | ||
2482 | struct cgroup_subsys_state *css; | ||
2483 | struct css_set *src_cset; | ||
2484 | int ret; | ||
2485 | |||
2486 | lockdep_assert_held(&cgroup_mutex); | ||
2487 | |||
2488 | /* look up all csses currently attached to @cgrp's subtree */ | ||
2489 | down_read(&css_set_rwsem); | ||
2490 | css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { | ||
2491 | struct cgrp_cset_link *link; | ||
2492 | |||
2493 | /* self is not affected by child_subsys_mask change */ | ||
2494 | if (css->cgroup == cgrp) | ||
2495 | continue; | ||
2496 | |||
2497 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) | ||
2498 | cgroup_migrate_add_src(link->cset, cgrp, | ||
2499 | &preloaded_csets); | ||
2500 | } | ||
2501 | up_read(&css_set_rwsem); | ||
2502 | |||
2503 | /* NULL dst indicates self on default hierarchy */ | ||
2504 | ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); | ||
2505 | if (ret) | ||
2506 | goto out_finish; | ||
2507 | |||
2508 | list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { | ||
2509 | struct task_struct *last_task = NULL, *task; | ||
2510 | |||
2511 | /* src_csets precede dst_csets, break on the first dst_cset */ | ||
2512 | if (!src_cset->mg_src_cgrp) | ||
2513 | break; | ||
2514 | |||
2515 | /* | ||
2516 | * All tasks in src_cset need to be migrated to the | ||
2517 | * matching dst_cset. Empty it process by process. We | ||
2518 | * walk tasks but migrate processes. The leader might even | ||
2519 | * belong to a different cset but such src_cset would also | ||
2520 | * be among the target src_csets because the default | ||
2521 | * hierarchy enforces per-process membership. | ||
2522 | */ | ||
2523 | while (true) { | ||
2524 | down_read(&css_set_rwsem); | ||
2525 | task = list_first_entry_or_null(&src_cset->tasks, | ||
2526 | struct task_struct, cg_list); | ||
2527 | if (task) { | ||
2528 | task = task->group_leader; | ||
2529 | WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp); | ||
2530 | get_task_struct(task); | ||
2531 | } | ||
2532 | up_read(&css_set_rwsem); | ||
2533 | |||
2534 | if (!task) | ||
2535 | break; | ||
2536 | |||
2537 | /* guard against possible infinite loop */ | ||
2538 | if (WARN(last_task == task, | ||
2539 | "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n")) | ||
2540 | goto out_finish; | ||
2541 | last_task = task; | ||
2542 | |||
2543 | threadgroup_lock(task); | ||
2544 | /* raced against de_thread() from another thread? */ | ||
2545 | if (!thread_group_leader(task)) { | ||
2546 | threadgroup_unlock(task); | ||
2547 | put_task_struct(task); | ||
2548 | continue; | ||
2549 | } | ||
2550 | |||
2551 | ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); | ||
2552 | |||
2553 | threadgroup_unlock(task); | ||
2554 | put_task_struct(task); | ||
2555 | |||
2556 | if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) | ||
2557 | goto out_finish; | ||
2558 | } | ||
2559 | } | ||
2560 | |||
2561 | out_finish: | ||
2562 | cgroup_migrate_finish(&preloaded_csets); | ||
2563 | return ret; | ||
2564 | } | ||
2565 | |||
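cgroup_update_dfl_csses drains each source css_set one group leader at a time and aborts if it stops making progress (last_task == task). The control flow reduced to a set of items with a stuck-detection guard; everything here is illustrative, not the migration code itself:

    #include <stdio.h>

    struct item { int id; int migrated; };

    /* pretend migration that refuses to move one item, to exercise the guard */
    static int migrate(struct item *it)
    {
        if (it->id == 3)
            return -1;
        it->migrated = 1;
        return 0;
    }

    static struct item *first_unmigrated(struct item *v, int n)
    {
        for (int i = 0; i < n; i++)
            if (!v[i].migrated)
                return &v[i];
        return NULL;
    }

    int main(void)
    {
        struct item items[] = { {1}, {2}, {3} };
        struct item *last = NULL, *cur;

        while ((cur = first_unmigrated(items, 3))) {
            /* guard against spinning forever on the same item */
            if (cur == last) {
                fprintf(stderr, "no progress, aborting\n");
                return 1;
            }
            last = cur;

            if (migrate(cur))
                continue;   /* will trip the guard next round */
        }
        printf("all migrated\n");
        return 0;
    }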
2566 | /* change the enabled child controllers for a cgroup in the default hierarchy */ | ||
2567 | static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | ||
2568 | char *buf, size_t nbytes, | ||
2569 | loff_t off) | ||
2570 | { | ||
2571 | unsigned int enable = 0, disable = 0; | ||
2572 | struct cgroup *cgrp, *child; | ||
2573 | struct cgroup_subsys *ss; | ||
2574 | char *tok; | ||
2575 | int ssid, ret; | ||
2576 | |||
2577 | /* | ||
2578 | * Parse input - space separated list of subsystem names prefixed | ||
2579 | * with either + or -. | ||
2580 | */ | ||
2581 | buf = strstrip(buf); | ||
2582 | while ((tok = strsep(&buf, " "))) { | ||
2583 | if (tok[0] == '\0') | ||
2584 | continue; | ||
2585 | for_each_subsys(ss, ssid) { | ||
2586 | if (ss->disabled || strcmp(tok + 1, ss->name) || | ||
2587 | ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask)) | ||
2588 | continue; | ||
2589 | |||
2590 | if (*tok == '+') { | ||
2591 | enable |= 1 << ssid; | ||
2592 | disable &= ~(1 << ssid); | ||
2593 | } else if (*tok == '-') { | ||
2594 | disable |= 1 << ssid; | ||
2595 | enable &= ~(1 << ssid); | ||
2596 | } else { | ||
2597 | return -EINVAL; | ||
2598 | } | ||
2599 | break; | ||
2600 | } | ||
2601 | if (ssid == CGROUP_SUBSYS_COUNT) | ||
2602 | return -EINVAL; | ||
2603 | } | ||
2604 | |||
2605 | cgrp = cgroup_kn_lock_live(of->kn); | ||
2606 | if (!cgrp) | ||
2607 | return -ENODEV; | ||
2608 | |||
2609 | for_each_subsys(ss, ssid) { | ||
2610 | if (enable & (1 << ssid)) { | ||
2611 | if (cgrp->child_subsys_mask & (1 << ssid)) { | ||
2612 | enable &= ~(1 << ssid); | ||
2613 | continue; | ||
2614 | } | ||
2615 | |||
2616 | /* | ||
2617 | * Because css offlining is asynchronous, userland | ||
2618 | * might try to re-enable the same controller while | ||
2619 | * the previous instance is still around. In such | ||
2620 | * cases, wait till it's gone using offline_waitq. | ||
2621 | */ | ||
2622 | cgroup_for_each_live_child(child, cgrp) { | ||
2623 | DEFINE_WAIT(wait); | ||
2624 | |||
2625 | if (!cgroup_css(child, ss)) | ||
2626 | continue; | ||
2627 | |||
2628 | cgroup_get(child); | ||
2629 | prepare_to_wait(&child->offline_waitq, &wait, | ||
2630 | TASK_UNINTERRUPTIBLE); | ||
2631 | cgroup_kn_unlock(of->kn); | ||
2632 | schedule(); | ||
2633 | finish_wait(&child->offline_waitq, &wait); | ||
2634 | cgroup_put(child); | ||
2635 | |||
2636 | return restart_syscall(); | ||
2637 | } | ||
2638 | |||
2639 | /* unavailable or not enabled on the parent? */ | ||
2640 | if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || | ||
2641 | (cgroup_parent(cgrp) && | ||
2642 | !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { | ||
2643 | ret = -ENOENT; | ||
2644 | goto out_unlock; | ||
2645 | } | ||
2646 | } else if (disable & (1 << ssid)) { | ||
2647 | if (!(cgrp->child_subsys_mask & (1 << ssid))) { | ||
2648 | disable &= ~(1 << ssid); | ||
2649 | continue; | ||
2650 | } | ||
2651 | |||
2652 | /* a child has it enabled? */ | ||
2653 | cgroup_for_each_live_child(child, cgrp) { | ||
2654 | if (child->child_subsys_mask & (1 << ssid)) { | ||
2655 | ret = -EBUSY; | ||
2656 | goto out_unlock; | ||
2657 | } | ||
2658 | } | ||
2659 | } | ||
2660 | } | ||
2661 | |||
2662 | if (!enable && !disable) { | ||
2663 | ret = 0; | ||
2664 | goto out_unlock; | ||
2665 | } | ||
2666 | |||
2667 | /* | ||
2668 | * Except for the root, child_subsys_mask must be zero for a cgroup | ||
2669 | * with tasks so that child cgroups don't compete against tasks. | ||
2670 | */ | ||
2671 | if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { | ||
2672 | ret = -EBUSY; | ||
2673 | goto out_unlock; | ||
2674 | } | ||
2675 | |||
2676 | /* | ||
2677 | * Create csses for enables and update child_subsys_mask. This | ||
2678 | * changes cgroup_e_css() results which in turn makes the | ||
2679 | * subsequent cgroup_update_dfl_csses() associate all tasks in the | ||
2680 | * subtree to the updated csses. | ||
2681 | */ | ||
2682 | for_each_subsys(ss, ssid) { | ||
2683 | if (!(enable & (1 << ssid))) | ||
2684 | continue; | ||
2685 | |||
2686 | cgroup_for_each_live_child(child, cgrp) { | ||
2687 | ret = create_css(child, ss); | ||
2688 | if (ret) | ||
2689 | goto err_undo_css; | ||
2690 | } | ||
2691 | } | ||
2692 | |||
2693 | cgrp->child_subsys_mask |= enable; | ||
2694 | cgrp->child_subsys_mask &= ~disable; | ||
2695 | |||
2696 | ret = cgroup_update_dfl_csses(cgrp); | ||
2697 | if (ret) | ||
2698 | goto err_undo_css; | ||
2699 | |||
2700 | /* all tasks are now migrated away from the old csses, kill them */ | ||
2701 | for_each_subsys(ss, ssid) { | ||
2702 | if (!(disable & (1 << ssid))) | ||
2703 | continue; | ||
2704 | |||
2705 | cgroup_for_each_live_child(child, cgrp) | ||
2706 | kill_css(cgroup_css(child, ss)); | ||
2707 | } | ||
2708 | |||
2709 | kernfs_activate(cgrp->kn); | ||
2710 | ret = 0; | ||
2711 | out_unlock: | ||
2712 | cgroup_kn_unlock(of->kn); | ||
2713 | return ret ?: nbytes; | ||
2714 | |||
2715 | err_undo_css: | ||
2716 | cgrp->child_subsys_mask &= ~enable; | ||
2717 | cgrp->child_subsys_mask |= disable; | ||
2718 | |||
2719 | for_each_subsys(ss, ssid) { | ||
2720 | if (!(enable & (1 << ssid))) | ||
2721 | continue; | ||
2722 | |||
2723 | cgroup_for_each_live_child(child, cgrp) { | ||
2724 | struct cgroup_subsys_state *css = cgroup_css(child, ss); | ||
2725 | if (css) | ||
2726 | kill_css(css); | ||
2727 | } | ||
2728 | } | ||
2729 | goto out_unlock; | ||
2730 | } | ||
2731 | |||
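cgroup_subtree_control_write starts by turning "+cpu -memory"-style input into enable/disable bitmasks. The tokenising half on its own, against a made-up controller table (not the kernel's subsystem list):

    #define _DEFAULT_SOURCE     /* for strsep() */
    #include <stdio.h>
    #include <string.h>

    static const char * const ss_names[] = { "cpu", "memory", "io", "pids" };
    #define SS_COUNT (sizeof(ss_names) / sizeof(ss_names[0]))

    /* parse space separated "+name"/"-name" tokens into @enable/@disable masks */
    static int parse_control(char *buf, unsigned int *enable, unsigned int *disable)
    {
        char *tok;

        *enable = *disable = 0;
        while ((tok = strsep(&buf, " "))) {
            unsigned int ssid;

            if (tok[0] == '\0')
                continue;
            for (ssid = 0; ssid < SS_COUNT; ssid++) {
                if (strcmp(tok + 1, ss_names[ssid]))
                    continue;
                if (tok[0] == '+') {
                    *enable |= 1u << ssid;
                    *disable &= ~(1u << ssid);
                } else if (tok[0] == '-') {
                    *disable |= 1u << ssid;
                    *enable &= ~(1u << ssid);
                } else {
                    return -1;
                }
                break;
            }
            if (ssid == SS_COUNT)
                return -1;  /* unknown or malformed token */
        }
        return 0;
    }

    int main(void)
    {
        char buf[] = "+cpu -memory  +io";
        unsigned int en, dis;

        if (!parse_control(buf, &en, &dis))
            printf("enable=0x%x disable=0x%x\n", en, dis);  /* 0x5 and 0x2 */
        return 0;
    }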
2732 | static int cgroup_populated_show(struct seq_file *seq, void *v) | ||
2733 | { | ||
2734 | seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); | ||
2735 | return 0; | ||
2736 | } | ||
2737 | |||
2221 | static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, | 2738 | static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, |
2222 | size_t nbytes, loff_t off) | 2739 | size_t nbytes, loff_t off) |
2223 | { | 2740 | { |
@@ -2226,6 +2743,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, | |||
2226 | struct cgroup_subsys_state *css; | 2743 | struct cgroup_subsys_state *css; |
2227 | int ret; | 2744 | int ret; |
2228 | 2745 | ||
2746 | if (cft->write) | ||
2747 | return cft->write(of, buf, nbytes, off); | ||
2748 | |||
2229 | /* | 2749 | /* |
2230 | * kernfs guarantees that a file isn't deleted with operations in | 2750 | * kernfs guarantees that a file isn't deleted with operations in |
2231 | * flight, which means that the matching css is and stays alive and | 2751 | * flight, which means that the matching css is and stays alive and |
@@ -2236,9 +2756,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, | |||
2236 | css = cgroup_css(cgrp, cft->ss); | 2756 | css = cgroup_css(cgrp, cft->ss); |
2237 | rcu_read_unlock(); | 2757 | rcu_read_unlock(); |
2238 | 2758 | ||
2239 | if (cft->write_string) { | 2759 | if (cft->write_u64) { |
2240 | ret = cft->write_string(css, cft, strstrip(buf)); | ||
2241 | } else if (cft->write_u64) { | ||
2242 | unsigned long long v; | 2760 | unsigned long long v; |
2243 | ret = kstrtoull(buf, 0, &v); | 2761 | ret = kstrtoull(buf, 0, &v); |
2244 | if (!ret) | 2762 | if (!ret) |
@@ -2248,8 +2766,6 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, | |||
2248 | ret = kstrtoll(buf, 0, &v); | 2766 | ret = kstrtoll(buf, 0, &v); |
2249 | if (!ret) | 2767 | if (!ret) |
2250 | ret = cft->write_s64(css, cft, v); | 2768 | ret = cft->write_s64(css, cft, v); |
2251 | } else if (cft->trigger) { | ||
2252 | ret = cft->trigger(css, (unsigned int)cft->private); | ||
2253 | } else { | 2769 | } else { |
2254 | ret = -EINVAL; | 2770 | ret = -EINVAL; |
2255 | } | 2771 | } |
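With ->write taking over first, the generic path above is left with only the numeric conversions. The same "raw handler wins, otherwise parse a number" dispatch in miniature, with invented handler names:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct ftype {
        long (*write)(const char *buf);             /* raw handler wins if present */
        int (*write_u64)(unsigned long long v);     /* otherwise parse a number */
    };

    static long do_write(const struct ftype *ft, const char *buf)
    {
        if (ft->write)
            return ft->write(buf);

        if (ft->write_u64) {
            unsigned long long v;
            char *end;

            errno = 0;
            v = strtoull(buf, &end, 0);
            if (errno || end == buf)
                return -EINVAL;
            return ft->write_u64(v);
        }
        return -EINVAL;
    }

    static int store_u64(unsigned long long v) { printf("u64=%llu\n", v); return 0; }

    int main(void)
    {
        struct ftype ft = { .write_u64 = store_u64 };

        printf("%ld\n", do_write(&ft, "42"));   /* prints u64=42 then 0 */
        return 0;
    }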
@@ -2326,20 +2842,18 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, | |||
2326 | return -EPERM; | 2842 | return -EPERM; |
2327 | 2843 | ||
2328 | /* | 2844 | /* |
2329 | * We're gonna grab cgroup_tree_mutex which nests outside kernfs | 2845 | * We're gonna grab cgroup_mutex which nests outside kernfs |
2330 | * active_ref. kernfs_rename() doesn't require active_ref | 2846 | * active_ref. kernfs_rename() doesn't require active_ref |
2331 | * protection. Break them before grabbing cgroup_tree_mutex. | 2847 | * protection. Break them before grabbing cgroup_mutex. |
2332 | */ | 2848 | */ |
2333 | kernfs_break_active_protection(new_parent); | 2849 | kernfs_break_active_protection(new_parent); |
2334 | kernfs_break_active_protection(kn); | 2850 | kernfs_break_active_protection(kn); |
2335 | 2851 | ||
2336 | mutex_lock(&cgroup_tree_mutex); | ||
2337 | mutex_lock(&cgroup_mutex); | 2852 | mutex_lock(&cgroup_mutex); |
2338 | 2853 | ||
2339 | ret = kernfs_rename(kn, new_parent, new_name_str); | 2854 | ret = kernfs_rename(kn, new_parent, new_name_str); |
2340 | 2855 | ||
2341 | mutex_unlock(&cgroup_mutex); | 2856 | mutex_unlock(&cgroup_mutex); |
2342 | mutex_unlock(&cgroup_tree_mutex); | ||
2343 | 2857 | ||
2344 | kernfs_unbreak_active_protection(kn); | 2858 | kernfs_unbreak_active_protection(kn); |
2345 | kernfs_unbreak_active_protection(new_parent); | 2859 | kernfs_unbreak_active_protection(new_parent); |
@@ -2377,9 +2891,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) | |||
2377 | return PTR_ERR(kn); | 2891 | return PTR_ERR(kn); |
2378 | 2892 | ||
2379 | ret = cgroup_kn_set_ugid(kn); | 2893 | ret = cgroup_kn_set_ugid(kn); |
2380 | if (ret) | 2894 | if (ret) { |
2381 | kernfs_remove(kn); | 2895 | kernfs_remove(kn); |
2382 | return ret; | 2896 | return ret; |
2897 | } | ||
2898 | |||
2899 | if (cft->seq_show == cgroup_populated_show) | ||
2900 | cgrp->populated_kn = kn; | ||
2901 | return 0; | ||
2383 | } | 2902 | } |
2384 | 2903 | ||
2385 | /** | 2904 | /** |
@@ -2399,7 +2918,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | |||
2399 | struct cftype *cft; | 2918 | struct cftype *cft; |
2400 | int ret; | 2919 | int ret; |
2401 | 2920 | ||
2402 | lockdep_assert_held(&cgroup_tree_mutex); | 2921 | lockdep_assert_held(&cgroup_mutex); |
2403 | 2922 | ||
2404 | for (cft = cfts; cft->name[0] != '\0'; cft++) { | 2923 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
2405 | /* does cft->flags tell us to skip this file on @cgrp? */ | 2924 | /* does cft->flags tell us to skip this file on @cgrp? */ |
@@ -2407,16 +2926,16 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | |||
2407 | continue; | 2926 | continue; |
2408 | if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) | 2927 | if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) |
2409 | continue; | 2928 | continue; |
2410 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) | 2929 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) |
2411 | continue; | 2930 | continue; |
2412 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | 2931 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) |
2413 | continue; | 2932 | continue; |
2414 | 2933 | ||
2415 | if (is_add) { | 2934 | if (is_add) { |
2416 | ret = cgroup_add_file(cgrp, cft); | 2935 | ret = cgroup_add_file(cgrp, cft); |
2417 | if (ret) { | 2936 | if (ret) { |
2418 | pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", | 2937 | pr_warn("%s: failed to add %s, err=%d\n", |
2419 | cft->name, ret); | 2938 | __func__, cft->name, ret); |
2420 | return ret; | 2939 | return ret; |
2421 | } | 2940 | } |
2422 | } else { | 2941 | } else { |
@@ -2434,11 +2953,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) | |||
2434 | struct cgroup_subsys_state *css; | 2953 | struct cgroup_subsys_state *css; |
2435 | int ret = 0; | 2954 | int ret = 0; |
2436 | 2955 | ||
2437 | lockdep_assert_held(&cgroup_tree_mutex); | 2956 | lockdep_assert_held(&cgroup_mutex); |
2438 | |||
2439 | /* don't bother if @ss isn't attached */ | ||
2440 | if (ss->root == &cgrp_dfl_root) | ||
2441 | return 0; | ||
2442 | 2957 | ||
2443 | /* add/rm files for all cgroups created before */ | 2958 | /* add/rm files for all cgroups created before */ |
2444 | css_for_each_descendant_pre(css, cgroup_css(root, ss)) { | 2959 | css_for_each_descendant_pre(css, cgroup_css(root, ss)) { |
@@ -2506,7 +3021,7 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
2506 | 3021 | ||
2507 | static int cgroup_rm_cftypes_locked(struct cftype *cfts) | 3022 | static int cgroup_rm_cftypes_locked(struct cftype *cfts) |
2508 | { | 3023 | { |
2509 | lockdep_assert_held(&cgroup_tree_mutex); | 3024 | lockdep_assert_held(&cgroup_mutex); |
2510 | 3025 | ||
2511 | if (!cfts || !cfts[0].ss) | 3026 | if (!cfts || !cfts[0].ss) |
2512 | return -ENOENT; | 3027 | return -ENOENT; |
@@ -2532,9 +3047,9 @@ int cgroup_rm_cftypes(struct cftype *cfts) | |||
2532 | { | 3047 | { |
2533 | int ret; | 3048 | int ret; |
2534 | 3049 | ||
2535 | mutex_lock(&cgroup_tree_mutex); | 3050 | mutex_lock(&cgroup_mutex); |
2536 | ret = cgroup_rm_cftypes_locked(cfts); | 3051 | ret = cgroup_rm_cftypes_locked(cfts); |
2537 | mutex_unlock(&cgroup_tree_mutex); | 3052 | mutex_unlock(&cgroup_mutex); |
2538 | return ret; | 3053 | return ret; |
2539 | } | 3054 | } |
2540 | 3055 | ||
@@ -2556,6 +3071,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
2556 | { | 3071 | { |
2557 | int ret; | 3072 | int ret; |
2558 | 3073 | ||
3074 | if (ss->disabled) | ||
3075 | return 0; | ||
3076 | |||
2559 | if (!cfts || cfts[0].name[0] == '\0') | 3077 | if (!cfts || cfts[0].name[0] == '\0') |
2560 | return 0; | 3078 | return 0; |
2561 | 3079 | ||
@@ -2563,14 +3081,14 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
2563 | if (ret) | 3081 | if (ret) |
2564 | return ret; | 3082 | return ret; |
2565 | 3083 | ||
2566 | mutex_lock(&cgroup_tree_mutex); | 3084 | mutex_lock(&cgroup_mutex); |
2567 | 3085 | ||
2568 | list_add_tail(&cfts->node, &ss->cfts); | 3086 | list_add_tail(&cfts->node, &ss->cfts); |
2569 | ret = cgroup_apply_cftypes(cfts, true); | 3087 | ret = cgroup_apply_cftypes(cfts, true); |
2570 | if (ret) | 3088 | if (ret) |
2571 | cgroup_rm_cftypes_locked(cfts); | 3089 | cgroup_rm_cftypes_locked(cfts); |
2572 | 3090 | ||
2573 | mutex_unlock(&cgroup_tree_mutex); | 3091 | mutex_unlock(&cgroup_mutex); |
2574 | return ret; | 3092 | return ret; |
2575 | } | 3093 | } |
2576 | 3094 | ||
@@ -2594,57 +3112,65 @@ static int cgroup_task_count(const struct cgroup *cgrp) | |||
2594 | 3112 | ||
2595 | /** | 3113 | /** |
2596 | * css_next_child - find the next child of a given css | 3114 | * css_next_child - find the next child of a given css |
2597 | * @pos_css: the current position (%NULL to initiate traversal) | 3115 | * @pos: the current position (%NULL to initiate traversal) |
2598 | * @parent_css: css whose children to walk | 3116 | * @parent: css whose children to walk |
2599 | * | 3117 | * |
2600 | * This function returns the next child of @parent_css and should be called | 3118 | * This function returns the next child of @parent and should be called |
2601 | * under either cgroup_mutex or RCU read lock. The only requirement is | 3119 | * under either cgroup_mutex or RCU read lock. The only requirement is |
2602 | * that @parent_css and @pos_css are accessible. The next sibling is | 3120 | * that @parent and @pos are accessible. The next sibling is guaranteed to |
2603 | * guaranteed to be returned regardless of their states. | 3121 | * be returned regardless of their states. |
3122 | * | ||
3123 | * If a subsystem synchronizes ->css_online() and the start of iteration, a | ||
3124 | * css which finished ->css_online() is guaranteed to be visible in the | ||
3125 | * future iterations and will stay visible until the last reference is put. | ||
3126 | * A css which hasn't finished ->css_online() or already finished | ||
3127 | * ->css_offline() may show up during traversal. It's each subsystem's | ||
3128 | * responsibility to synchronize against on/offlining. | ||
2604 | */ | 3129 | */ |
2605 | struct cgroup_subsys_state * | 3130 | struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, |
2606 | css_next_child(struct cgroup_subsys_state *pos_css, | 3131 | struct cgroup_subsys_state *parent) |
2607 | struct cgroup_subsys_state *parent_css) | ||
2608 | { | 3132 | { |
2609 | struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; | 3133 | struct cgroup_subsys_state *next; |
2610 | struct cgroup *cgrp = parent_css->cgroup; | ||
2611 | struct cgroup *next; | ||
2612 | 3134 | ||
2613 | cgroup_assert_mutexes_or_rcu_locked(); | 3135 | cgroup_assert_mutex_or_rcu_locked(); |
2614 | 3136 | ||
2615 | /* | 3137 | /* |
2616 | * @pos could already have been removed. Once a cgroup is removed, | 3138 | * @pos could already have been unlinked from the sibling list. |
2617 | * its ->sibling.next is no longer updated when its next sibling | 3139 | * Once a cgroup is removed, its ->sibling.next is no longer |
2618 | * changes. As CGRP_DEAD assertion is serialized and happens | 3140 | * updated when its next sibling changes. CSS_RELEASED is set when |
2619 | * before the cgroup is taken off the ->sibling list, if we see it | 3141 | * @pos is taken off list, at which time its next pointer is valid, |
2620 | * unasserted, it's guaranteed that the next sibling hasn't | 3142 | * and, as releases are serialized, the one pointed to by the next |
2621 | * finished its grace period even if it's already removed, and thus | 3143 | * pointer is guaranteed to not have started release yet. This |
2622 | * safe to dereference from this RCU critical section. If | 3144 | * implies that if we observe !CSS_RELEASED on @pos in this RCU |
2623 | * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed | 3145 | * critical section, the one pointed to by its next pointer is |
2624 | * to be visible as %true here. | 3146 | * guaranteed to not have finished its RCU grace period even if we |
3147 | * have dropped rcu_read_lock() inbetween iterations. | ||
2625 | * | 3148 | * |
2626 | * If @pos is dead, its next pointer can't be dereferenced; | 3149 | * If @pos has CSS_RELEASED set, its next pointer can't be |
2627 | * however, as each cgroup is given a monotonically increasing | 3150 | * dereferenced; however, as each css is given a monotonically |
2628 | * unique serial number and always appended to the sibling list, | 3151 | * increasing unique serial number and always appended to the |
2629 | * the next one can be found by walking the parent's children until | 3152 | * sibling list, the next one can be found by walking the parent's |
2630 | * we see a cgroup with higher serial number than @pos's. While | 3153 | * children until the first css with higher serial number than |
2631 | * this path can be slower, it's taken only when either the current | 3154 | * @pos's. While this path can be slower, it happens iff iteration |
2632 | * cgroup is removed or iteration and removal race. | 3155 | * races against release and the race window is very small. |
2633 | */ | 3156 | */ |
2634 | if (!pos) { | 3157 | if (!pos) { |
2635 | next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); | 3158 | next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); |
2636 | } else if (likely(!cgroup_is_dead(pos))) { | 3159 | } else if (likely(!(pos->flags & CSS_RELEASED))) { |
2637 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); | 3160 | next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); |
2638 | } else { | 3161 | } else { |
2639 | list_for_each_entry_rcu(next, &cgrp->children, sibling) | 3162 | list_for_each_entry_rcu(next, &parent->children, sibling) |
2640 | if (next->serial_nr > pos->serial_nr) | 3163 | if (next->serial_nr > pos->serial_nr) |
2641 | break; | 3164 | break; |
2642 | } | 3165 | } |
2643 | 3166 | ||
2644 | if (&next->sibling == &cgrp->children) | 3167 | /* |
2645 | return NULL; | 3168 | * @next, if not pointing to the head, can be dereferenced and is |
2646 | 3169 | * the next sibling. | |
2647 | return cgroup_css(next, parent_css->ss); | 3170 | */ |
3171 | if (&next->sibling != &parent->children) | ||
3172 | return next; | ||
3173 | return NULL; | ||
2648 | } | 3174 | } |
2649 | 3175 | ||
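A minimal sketch, not part of this patch, of how a controller normally consumes css_next_child() through the css_for_each_child() wrapper under RCU; the helper name and the counting are illustrative only:

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

static int count_children(struct cgroup_subsys_state *parent)
{
        struct cgroup_subsys_state *child;
        int n = 0;

        rcu_read_lock();
        /* iteration keeps working even if siblings are released concurrently,
         * per the serial-number rule described in the comment above */
        css_for_each_child(child, parent)
                n++;
        rcu_read_unlock();

        return n;
}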
2650 | /** | 3176 | /** |
@@ -2660,6 +3186,13 @@ css_next_child(struct cgroup_subsys_state *pos_css, | |||
2660 | * doesn't require the whole traversal to be contained in a single critical | 3186 | * doesn't require the whole traversal to be contained in a single critical |
2661 | * section. This function will return the correct next descendant as long | 3187 | * section. This function will return the correct next descendant as long |
2662 | * as both @pos and @root are accessible and @pos is a descendant of @root. | 3188 | * as both @pos and @root are accessible and @pos is a descendant of @root. |
3189 | * | ||
3190 | * If a subsystem synchronizes ->css_online() and the start of iteration, a | ||
3191 | * css which finished ->css_online() is guaranteed to be visible in the | ||
3192 | * future iterations and will stay visible until the last reference is put. | ||
3193 | * A css which hasn't finished ->css_online() or already finished | ||
3194 | * ->css_offline() may show up during traversal. It's each subsystem's | ||
3195 | * responsibility to synchronize against on/offlining. | ||
2663 | */ | 3196 | */ |
2664 | struct cgroup_subsys_state * | 3197 | struct cgroup_subsys_state * |
2665 | css_next_descendant_pre(struct cgroup_subsys_state *pos, | 3198 | css_next_descendant_pre(struct cgroup_subsys_state *pos, |
@@ -2667,7 +3200,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, | |||
2667 | { | 3200 | { |
2668 | struct cgroup_subsys_state *next; | 3201 | struct cgroup_subsys_state *next; |
2669 | 3202 | ||
2670 | cgroup_assert_mutexes_or_rcu_locked(); | 3203 | cgroup_assert_mutex_or_rcu_locked(); |
2671 | 3204 | ||
2672 | /* if first iteration, visit @root */ | 3205 | /* if first iteration, visit @root */ |
2673 | if (!pos) | 3206 | if (!pos) |
@@ -2680,10 +3213,10 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, | |||
2680 | 3213 | ||
2681 | /* no child, visit my or the closest ancestor's next sibling */ | 3214 | /* no child, visit my or the closest ancestor's next sibling */ |
2682 | while (pos != root) { | 3215 | while (pos != root) { |
2683 | next = css_next_child(pos, css_parent(pos)); | 3216 | next = css_next_child(pos, pos->parent); |
2684 | if (next) | 3217 | if (next) |
2685 | return next; | 3218 | return next; |
2686 | pos = css_parent(pos); | 3219 | pos = pos->parent; |
2687 | } | 3220 | } |
2688 | 3221 | ||
2689 | return NULL; | 3222 | return NULL; |
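The visibility guarantee documented above is what subtree walks rely on; a hedged sketch of a pre-order walk that skips csses which are not currently online (the caller still owns any on/offline synchronization, as the comment notes):

static void walk_subtree(struct cgroup_subsys_state *root)
{
        struct cgroup_subsys_state *pos;

        rcu_read_lock();
        css_for_each_descendant_pre(pos, root) {
                /* half-onlined or already-offlined csses may show up */
                if (!(pos->flags & CSS_ONLINE))
                        continue;
                /* ... per-css work that is safe under rcu_read_lock() ... */
        }
        rcu_read_unlock();
}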
@@ -2707,7 +3240,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos) | |||
2707 | { | 3240 | { |
2708 | struct cgroup_subsys_state *last, *tmp; | 3241 | struct cgroup_subsys_state *last, *tmp; |
2709 | 3242 | ||
2710 | cgroup_assert_mutexes_or_rcu_locked(); | 3243 | cgroup_assert_mutex_or_rcu_locked(); |
2711 | 3244 | ||
2712 | do { | 3245 | do { |
2713 | last = pos; | 3246 | last = pos; |
@@ -2747,6 +3280,13 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) | |||
2747 | * section. This function will return the correct next descendant as long | 3280 | * section. This function will return the correct next descendant as long |
2748 | * as both @pos and @cgroup are accessible and @pos is a descendant of | 3281 | * as both @pos and @cgroup are accessible and @pos is a descendant of |
2749 | * @cgroup. | 3282 | * @cgroup. |
3283 | * | ||
3284 | * If a subsystem synchronizes ->css_online() and the start of iteration, a | ||
3285 | * css which finished ->css_online() is guaranteed to be visible in the | ||
3286 | * future iterations and will stay visible until the last reference is put. | ||
3287 | * A css which hasn't finished ->css_online() or already finished | ||
3288 | * ->css_offline() may show up during traversal. It's each subsystem's | ||
3289 | * responsibility to synchronize against on/offlining. | ||
2750 | */ | 3290 | */ |
2751 | struct cgroup_subsys_state * | 3291 | struct cgroup_subsys_state * |
2752 | css_next_descendant_post(struct cgroup_subsys_state *pos, | 3292 | css_next_descendant_post(struct cgroup_subsys_state *pos, |
@@ -2754,7 +3294,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, | |||
2754 | { | 3294 | { |
2755 | struct cgroup_subsys_state *next; | 3295 | struct cgroup_subsys_state *next; |
2756 | 3296 | ||
2757 | cgroup_assert_mutexes_or_rcu_locked(); | 3297 | cgroup_assert_mutex_or_rcu_locked(); |
2758 | 3298 | ||
2759 | /* if first iteration, visit leftmost descendant which may be @root */ | 3299 | /* if first iteration, visit leftmost descendant which may be @root */ |
2760 | if (!pos) | 3300 | if (!pos) |
@@ -2765,12 +3305,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, | |||
2765 | return NULL; | 3305 | return NULL; |
2766 | 3306 | ||
2767 | /* if there's an unvisited sibling, visit its leftmost descendant */ | 3307 | /* if there's an unvisited sibling, visit its leftmost descendant */ |
2768 | next = css_next_child(pos, css_parent(pos)); | 3308 | next = css_next_child(pos, pos->parent); |
2769 | if (next) | 3309 | if (next) |
2770 | return css_leftmost_descendant(next); | 3310 | return css_leftmost_descendant(next); |
2771 | 3311 | ||
2772 | /* no sibling left, visit parent */ | 3312 | /* no sibling left, visit parent */ |
2773 | return css_parent(pos); | 3313 | return pos->parent; |
3314 | } | ||
3315 | |||
3316 | /** | ||
3317 | * css_has_online_children - does a css have online children | ||
3318 | * @css: the target css | ||
3319 | * | ||
3320 | * Returns %true if @css has any online children; otherwise, %false. This | ||
3321 | * function can be called from any context but the caller is responsible | ||
3322 | * for synchronizing against on/offlining as necessary. | ||
3323 | */ | ||
3324 | bool css_has_online_children(struct cgroup_subsys_state *css) | ||
3325 | { | ||
3326 | struct cgroup_subsys_state *child; | ||
3327 | bool ret = false; | ||
3328 | |||
3329 | rcu_read_lock(); | ||
3330 | css_for_each_child(child, css) { | ||
3331 | if (child->flags & CSS_ONLINE) { | ||
3332 | ret = true; | ||
3333 | break; | ||
3334 | } | ||
3335 | } | ||
3336 | rcu_read_unlock(); | ||
3337 | return ret; | ||
2774 | } | 3338 | } |
2775 | 3339 | ||
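A short usage sketch for the new helper, assuming a hypothetical controller operation; check_for_release() and cgroup_destroy_locked() below apply the same test:

static int my_subsys_op(struct cgroup_subsys_state *css)
{
        /* refuse while any child is still online */
        if (css_has_online_children(css))
                return -EBUSY;

        /* ... no child was online at the time of the check ... */
        return 0;
}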
2776 | /** | 3340 | /** |
@@ -2781,27 +3345,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, | |||
2781 | */ | 3345 | */ |
2782 | static void css_advance_task_iter(struct css_task_iter *it) | 3346 | static void css_advance_task_iter(struct css_task_iter *it) |
2783 | { | 3347 | { |
2784 | struct list_head *l = it->cset_link; | 3348 | struct list_head *l = it->cset_pos; |
2785 | struct cgrp_cset_link *link; | 3349 | struct cgrp_cset_link *link; |
2786 | struct css_set *cset; | 3350 | struct css_set *cset; |
2787 | 3351 | ||
2788 | /* Advance to the next non-empty css_set */ | 3352 | /* Advance to the next non-empty css_set */ |
2789 | do { | 3353 | do { |
2790 | l = l->next; | 3354 | l = l->next; |
2791 | if (l == &it->origin_css->cgroup->cset_links) { | 3355 | if (l == it->cset_head) { |
2792 | it->cset_link = NULL; | 3356 | it->cset_pos = NULL; |
2793 | return; | 3357 | return; |
2794 | } | 3358 | } |
2795 | link = list_entry(l, struct cgrp_cset_link, cset_link); | 3359 | |
2796 | cset = link->cset; | 3360 | if (it->ss) { |
3361 | cset = container_of(l, struct css_set, | ||
3362 | e_cset_node[it->ss->id]); | ||
3363 | } else { | ||
3364 | link = list_entry(l, struct cgrp_cset_link, cset_link); | ||
3365 | cset = link->cset; | ||
3366 | } | ||
2797 | } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); | 3367 | } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); |
2798 | 3368 | ||
2799 | it->cset_link = l; | 3369 | it->cset_pos = l; |
2800 | 3370 | ||
2801 | if (!list_empty(&cset->tasks)) | 3371 | if (!list_empty(&cset->tasks)) |
2802 | it->task = cset->tasks.next; | 3372 | it->task_pos = cset->tasks.next; |
2803 | else | 3373 | else |
2804 | it->task = cset->mg_tasks.next; | 3374 | it->task_pos = cset->mg_tasks.next; |
3375 | |||
3376 | it->tasks_head = &cset->tasks; | ||
3377 | it->mg_tasks_head = &cset->mg_tasks; | ||
2805 | } | 3378 | } |
2806 | 3379 | ||
2807 | /** | 3380 | /** |
@@ -2827,8 +3400,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css, | |||
2827 | 3400 | ||
2828 | down_read(&css_set_rwsem); | 3401 | down_read(&css_set_rwsem); |
2829 | 3402 | ||
2830 | it->origin_css = css; | 3403 | it->ss = css->ss; |
2831 | it->cset_link = &css->cgroup->cset_links; | 3404 | |
3405 | if (it->ss) | ||
3406 | it->cset_pos = &css->cgroup->e_csets[css->ss->id]; | ||
3407 | else | ||
3408 | it->cset_pos = &css->cgroup->cset_links; | ||
3409 | |||
3410 | it->cset_head = it->cset_pos; | ||
2832 | 3411 | ||
2833 | css_advance_task_iter(it); | 3412 | css_advance_task_iter(it); |
2834 | } | 3413 | } |
@@ -2844,12 +3423,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css, | |||
2844 | struct task_struct *css_task_iter_next(struct css_task_iter *it) | 3423 | struct task_struct *css_task_iter_next(struct css_task_iter *it) |
2845 | { | 3424 | { |
2846 | struct task_struct *res; | 3425 | struct task_struct *res; |
2847 | struct list_head *l = it->task; | 3426 | struct list_head *l = it->task_pos; |
2848 | struct cgrp_cset_link *link = list_entry(it->cset_link, | ||
2849 | struct cgrp_cset_link, cset_link); | ||
2850 | 3427 | ||
2851 | /* If the iterator cg is NULL, we have no tasks */ | 3428 | /* If the iterator cg is NULL, we have no tasks */ |
2852 | if (!it->cset_link) | 3429 | if (!it->cset_pos) |
2853 | return NULL; | 3430 | return NULL; |
2854 | res = list_entry(l, struct task_struct, cg_list); | 3431 | res = list_entry(l, struct task_struct, cg_list); |
2855 | 3432 | ||
@@ -2860,13 +3437,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) | |||
2860 | */ | 3437 | */ |
2861 | l = l->next; | 3438 | l = l->next; |
2862 | 3439 | ||
2863 | if (l == &link->cset->tasks) | 3440 | if (l == it->tasks_head) |
2864 | l = link->cset->mg_tasks.next; | 3441 | l = it->mg_tasks_head->next; |
2865 | 3442 | ||
2866 | if (l == &link->cset->mg_tasks) | 3443 | if (l == it->mg_tasks_head) |
2867 | css_advance_task_iter(it); | 3444 | css_advance_task_iter(it); |
2868 | else | 3445 | else |
2869 | it->task = l; | 3446 | it->task_pos = l; |
2870 | 3447 | ||
2871 | return res; | 3448 | return res; |
2872 | } | 3449 | } |
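Roughly how a controller walks every task attached to a css with this iterator (signatures as shown in this patch; other kernel versions differ); the counting helper is hypothetical:

static unsigned long count_css_tasks(struct cgroup_subsys_state *css)
{
        struct css_task_iter it;
        struct task_struct *task;
        unsigned long n = 0;

        css_task_iter_start(css, &it);          /* takes css_set_rwsem for read */
        while ((task = css_task_iter_next(&it)))
                n++;                            /* task valid until the next call */
        css_task_iter_end(&it);                 /* drops css_set_rwsem */

        return n;
}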
@@ -2919,7 +3496,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | |||
2919 | * ->can_attach() fails. | 3496 | * ->can_attach() fails. |
2920 | */ | 3497 | */ |
2921 | do { | 3498 | do { |
2922 | css_task_iter_start(&from->dummy_css, &it); | 3499 | css_task_iter_start(&from->self, &it); |
2923 | task = css_task_iter_next(&it); | 3500 | task = css_task_iter_next(&it); |
2924 | if (task) | 3501 | if (task) |
2925 | get_task_struct(task); | 3502 | get_task_struct(task); |
@@ -3184,7 +3761,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
3184 | if (!array) | 3761 | if (!array) |
3185 | return -ENOMEM; | 3762 | return -ENOMEM; |
3186 | /* now, populate the array */ | 3763 | /* now, populate the array */ |
3187 | css_task_iter_start(&cgrp->dummy_css, &it); | 3764 | css_task_iter_start(&cgrp->self, &it); |
3188 | while ((tsk = css_task_iter_next(&it))) { | 3765 | while ((tsk = css_task_iter_next(&it))) { |
3189 | if (unlikely(n == length)) | 3766 | if (unlikely(n == length)) |
3190 | break; | 3767 | break; |
@@ -3246,7 +3823,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
3246 | 3823 | ||
3247 | /* | 3824 | /* |
3248 | * We aren't being called from kernfs and there's no guarantee on | 3825 | * We aren't being called from kernfs and there's no guarantee on |
3249 | * @kn->priv's validity. For this and css_tryget_from_dir(), | 3826 | * @kn->priv's validity. For this and css_tryget_online_from_dir(), |
3250 | * @kn->priv is RCU safe. Let's do the RCU dancing. | 3827 | * @kn->priv is RCU safe. Let's do the RCU dancing. |
3251 | */ | 3828 | */ |
3252 | rcu_read_lock(); | 3829 | rcu_read_lock(); |
@@ -3258,7 +3835,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
3258 | } | 3835 | } |
3259 | rcu_read_unlock(); | 3836 | rcu_read_unlock(); |
3260 | 3837 | ||
3261 | css_task_iter_start(&cgrp->dummy_css, &it); | 3838 | css_task_iter_start(&cgrp->self, &it); |
3262 | while ((tsk = css_task_iter_next(&it))) { | 3839 | while ((tsk = css_task_iter_next(&it))) { |
3263 | switch (tsk->state) { | 3840 | switch (tsk->state) { |
3264 | case TASK_RUNNING: | 3841 | case TASK_RUNNING: |
@@ -3388,17 +3965,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v) | |||
3388 | return seq_printf(s, "%d\n", *(int *)v); | 3965 | return seq_printf(s, "%d\n", *(int *)v); |
3389 | } | 3966 | } |
3390 | 3967 | ||
3391 | /* | ||
3392 | * seq_operations functions for iterating on pidlists through seq_file - | ||
3393 | * independent of whether it's tasks or procs | ||
3394 | */ | ||
3395 | static const struct seq_operations cgroup_pidlist_seq_operations = { | ||
3396 | .start = cgroup_pidlist_start, | ||
3397 | .stop = cgroup_pidlist_stop, | ||
3398 | .next = cgroup_pidlist_next, | ||
3399 | .show = cgroup_pidlist_show, | ||
3400 | }; | ||
3401 | |||
3402 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, | 3968 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, |
3403 | struct cftype *cft) | 3969 | struct cftype *cft) |
3404 | { | 3970 | { |
@@ -3440,7 +4006,7 @@ static struct cftype cgroup_base_files[] = { | |||
3440 | .seq_stop = cgroup_pidlist_stop, | 4006 | .seq_stop = cgroup_pidlist_stop, |
3441 | .seq_show = cgroup_pidlist_show, | 4007 | .seq_show = cgroup_pidlist_show, |
3442 | .private = CGROUP_FILE_PROCS, | 4008 | .private = CGROUP_FILE_PROCS, |
3443 | .write_u64 = cgroup_procs_write, | 4009 | .write = cgroup_procs_write, |
3444 | .mode = S_IRUGO | S_IWUSR, | 4010 | .mode = S_IRUGO | S_IWUSR, |
3445 | }, | 4011 | }, |
3446 | { | 4012 | { |
@@ -3454,6 +4020,27 @@ static struct cftype cgroup_base_files[] = { | |||
3454 | .flags = CFTYPE_ONLY_ON_ROOT, | 4020 | .flags = CFTYPE_ONLY_ON_ROOT, |
3455 | .seq_show = cgroup_sane_behavior_show, | 4021 | .seq_show = cgroup_sane_behavior_show, |
3456 | }, | 4022 | }, |
4023 | { | ||
4024 | .name = "cgroup.controllers", | ||
4025 | .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, | ||
4026 | .seq_show = cgroup_root_controllers_show, | ||
4027 | }, | ||
4028 | { | ||
4029 | .name = "cgroup.controllers", | ||
4030 | .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, | ||
4031 | .seq_show = cgroup_controllers_show, | ||
4032 | }, | ||
4033 | { | ||
4034 | .name = "cgroup.subtree_control", | ||
4035 | .flags = CFTYPE_ONLY_ON_DFL, | ||
4036 | .seq_show = cgroup_subtree_control_show, | ||
4037 | .write = cgroup_subtree_control_write, | ||
4038 | }, | ||
4039 | { | ||
4040 | .name = "cgroup.populated", | ||
4041 | .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, | ||
4042 | .seq_show = cgroup_populated_show, | ||
4043 | }, | ||
3457 | 4044 | ||
3458 | /* | 4045 | /* |
3459 | * Historical crazy stuff. These don't have "cgroup." prefix and | 4046 | * Historical crazy stuff. These don't have "cgroup." prefix and |
@@ -3468,7 +4055,7 @@ static struct cftype cgroup_base_files[] = { | |||
3468 | .seq_stop = cgroup_pidlist_stop, | 4055 | .seq_stop = cgroup_pidlist_stop, |
3469 | .seq_show = cgroup_pidlist_show, | 4056 | .seq_show = cgroup_pidlist_show, |
3470 | .private = CGROUP_FILE_TASKS, | 4057 | .private = CGROUP_FILE_TASKS, |
3471 | .write_u64 = cgroup_tasks_write, | 4058 | .write = cgroup_tasks_write, |
3472 | .mode = S_IRUGO | S_IWUSR, | 4059 | .mode = S_IRUGO | S_IWUSR, |
3473 | }, | 4060 | }, |
3474 | { | 4061 | { |
@@ -3481,7 +4068,7 @@ static struct cftype cgroup_base_files[] = { | |||
3481 | .name = "release_agent", | 4068 | .name = "release_agent", |
3482 | .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, | 4069 | .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, |
3483 | .seq_show = cgroup_release_agent_show, | 4070 | .seq_show = cgroup_release_agent_show, |
3484 | .write_string = cgroup_release_agent_write, | 4071 | .write = cgroup_release_agent_write, |
3485 | .max_write_len = PATH_MAX - 1, | 4072 | .max_write_len = PATH_MAX - 1, |
3486 | }, | 4073 | }, |
3487 | { } /* terminate */ | 4074 | { } /* terminate */ |
@@ -3494,7 +4081,7 @@ static struct cftype cgroup_base_files[] = { | |||
3494 | * | 4081 | * |
3495 | * On failure, no file is added. | 4082 | * On failure, no file is added. |
3496 | */ | 4083 | */ |
3497 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) | 4084 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask) |
3498 | { | 4085 | { |
3499 | struct cgroup_subsys *ss; | 4086 | struct cgroup_subsys *ss; |
3500 | int i, ret = 0; | 4087 | int i, ret = 0; |
@@ -3503,7 +4090,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) | |||
3503 | for_each_subsys(ss, i) { | 4090 | for_each_subsys(ss, i) { |
3504 | struct cftype *cfts; | 4091 | struct cftype *cfts; |
3505 | 4092 | ||
3506 | if (!test_bit(i, &subsys_mask)) | 4093 | if (!(subsys_mask & (1 << i))) |
3507 | continue; | 4094 | continue; |
3508 | 4095 | ||
3509 | list_for_each_entry(cfts, &ss->cfts, node) { | 4096 | list_for_each_entry(cfts, &ss->cfts, node) { |
@@ -3525,9 +4112,9 @@ err: | |||
3525 | * Implemented in kill_css(). | 4112 | * Implemented in kill_css(). |
3526 | * | 4113 | * |
3527 | * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs | 4114 | * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs |
3528 | * and thus css_tryget() is guaranteed to fail, the css can be offlined | 4115 | * and thus css_tryget_online() is guaranteed to fail, the css can be |
3529 | * by invoking offline_css(). After offlining, the base ref is put. | 4116 | * offlined by invoking offline_css(). After offlining, the base ref is |
3530 | * Implemented in css_killed_work_fn(). | 4117 | * put. Implemented in css_killed_work_fn(). |
3531 | * | 4118 | * |
3532 | * 3. When the percpu_ref reaches zero, the only possible remaining | 4119 | * 3. When the percpu_ref reaches zero, the only possible remaining |
3533 | * accessors are inside RCU read sections. css_release() schedules the | 4120 | * accessors are inside RCU read sections. css_release() schedules the |
@@ -3546,11 +4133,37 @@ static void css_free_work_fn(struct work_struct *work) | |||
3546 | container_of(work, struct cgroup_subsys_state, destroy_work); | 4133 | container_of(work, struct cgroup_subsys_state, destroy_work); |
3547 | struct cgroup *cgrp = css->cgroup; | 4134 | struct cgroup *cgrp = css->cgroup; |
3548 | 4135 | ||
3549 | if (css->parent) | 4136 | if (css->ss) { |
3550 | css_put(css->parent); | 4137 | /* css free path */ |
4138 | if (css->parent) | ||
4139 | css_put(css->parent); | ||
3551 | 4140 | ||
3552 | css->ss->css_free(css); | 4141 | css->ss->css_free(css); |
3553 | cgroup_put(cgrp); | 4142 | cgroup_put(cgrp); |
4143 | } else { | ||
4144 | /* cgroup free path */ | ||
4145 | atomic_dec(&cgrp->root->nr_cgrps); | ||
4146 | cgroup_pidlist_destroy_all(cgrp); | ||
4147 | |||
4148 | if (cgroup_parent(cgrp)) { | ||
4149 | /* | ||
4150 | * We get a ref to the parent, and put the ref when | ||
4151 | * this cgroup is being freed, so it's guaranteed | ||
4152 | * that the parent won't be destroyed before its | ||
4153 | * children. | ||
4154 | */ | ||
4155 | cgroup_put(cgroup_parent(cgrp)); | ||
4156 | kernfs_put(cgrp->kn); | ||
4157 | kfree(cgrp); | ||
4158 | } else { | ||
4159 | /* | ||
4160 | * This is root cgroup's refcnt reaching zero, | ||
4161 | * which indicates that the root should be | ||
4162 | * released. | ||
4163 | */ | ||
4164 | cgroup_destroy_root(cgrp->root); | ||
4165 | } | ||
4166 | } | ||
3554 | } | 4167 | } |
3555 | 4168 | ||
3556 | static void css_free_rcu_fn(struct rcu_head *rcu_head) | 4169 | static void css_free_rcu_fn(struct rcu_head *rcu_head) |
@@ -3562,26 +4175,59 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) | |||
3562 | queue_work(cgroup_destroy_wq, &css->destroy_work); | 4175 | queue_work(cgroup_destroy_wq, &css->destroy_work); |
3563 | } | 4176 | } |
3564 | 4177 | ||
4178 | static void css_release_work_fn(struct work_struct *work) | ||
4179 | { | ||
4180 | struct cgroup_subsys_state *css = | ||
4181 | container_of(work, struct cgroup_subsys_state, destroy_work); | ||
4182 | struct cgroup_subsys *ss = css->ss; | ||
4183 | struct cgroup *cgrp = css->cgroup; | ||
4184 | |||
4185 | mutex_lock(&cgroup_mutex); | ||
4186 | |||
4187 | css->flags |= CSS_RELEASED; | ||
4188 | list_del_rcu(&css->sibling); | ||
4189 | |||
4190 | if (ss) { | ||
4191 | /* css release path */ | ||
4192 | cgroup_idr_remove(&ss->css_idr, css->id); | ||
4193 | } else { | ||
4194 | /* cgroup release path */ | ||
4195 | cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); | ||
4196 | cgrp->id = -1; | ||
4197 | } | ||
4198 | |||
4199 | mutex_unlock(&cgroup_mutex); | ||
4200 | |||
4201 | call_rcu(&css->rcu_head, css_free_rcu_fn); | ||
4202 | } | ||
4203 | |||
3565 | static void css_release(struct percpu_ref *ref) | 4204 | static void css_release(struct percpu_ref *ref) |
3566 | { | 4205 | { |
3567 | struct cgroup_subsys_state *css = | 4206 | struct cgroup_subsys_state *css = |
3568 | container_of(ref, struct cgroup_subsys_state, refcnt); | 4207 | container_of(ref, struct cgroup_subsys_state, refcnt); |
3569 | 4208 | ||
3570 | RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); | 4209 | INIT_WORK(&css->destroy_work, css_release_work_fn); |
3571 | call_rcu(&css->rcu_head, css_free_rcu_fn); | 4210 | queue_work(cgroup_destroy_wq, &css->destroy_work); |
3572 | } | 4211 | } |
3573 | 4212 | ||
3574 | static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, | 4213 | static void init_and_link_css(struct cgroup_subsys_state *css, |
3575 | struct cgroup *cgrp) | 4214 | struct cgroup_subsys *ss, struct cgroup *cgrp) |
3576 | { | 4215 | { |
4216 | lockdep_assert_held(&cgroup_mutex); | ||
4217 | |||
4218 | cgroup_get(cgrp); | ||
4219 | |||
4220 | memset(css, 0, sizeof(*css)); | ||
3577 | css->cgroup = cgrp; | 4221 | css->cgroup = cgrp; |
3578 | css->ss = ss; | 4222 | css->ss = ss; |
3579 | css->flags = 0; | 4223 | INIT_LIST_HEAD(&css->sibling); |
4224 | INIT_LIST_HEAD(&css->children); | ||
4225 | css->serial_nr = css_serial_nr_next++; | ||
3580 | 4226 | ||
3581 | if (cgrp->parent) | 4227 | if (cgroup_parent(cgrp)) { |
3582 | css->parent = cgroup_css(cgrp->parent, ss); | 4228 | css->parent = cgroup_css(cgroup_parent(cgrp), ss); |
3583 | else | 4229 | css_get(css->parent); |
3584 | css->flags |= CSS_ROOT; | 4230 | } |
3585 | 4231 | ||
3586 | BUG_ON(cgroup_css(cgrp, ss)); | 4232 | BUG_ON(cgroup_css(cgrp, ss)); |
3587 | } | 4233 | } |
@@ -3592,14 +4238,12 @@ static int online_css(struct cgroup_subsys_state *css) | |||
3592 | struct cgroup_subsys *ss = css->ss; | 4238 | struct cgroup_subsys *ss = css->ss; |
3593 | int ret = 0; | 4239 | int ret = 0; |
3594 | 4240 | ||
3595 | lockdep_assert_held(&cgroup_tree_mutex); | ||
3596 | lockdep_assert_held(&cgroup_mutex); | 4241 | lockdep_assert_held(&cgroup_mutex); |
3597 | 4242 | ||
3598 | if (ss->css_online) | 4243 | if (ss->css_online) |
3599 | ret = ss->css_online(css); | 4244 | ret = ss->css_online(css); |
3600 | if (!ret) { | 4245 | if (!ret) { |
3601 | css->flags |= CSS_ONLINE; | 4246 | css->flags |= CSS_ONLINE; |
3602 | css->cgroup->nr_css++; | ||
3603 | rcu_assign_pointer(css->cgroup->subsys[ss->id], css); | 4247 | rcu_assign_pointer(css->cgroup->subsys[ss->id], css); |
3604 | } | 4248 | } |
3605 | return ret; | 4249 | return ret; |
@@ -3610,7 +4254,6 @@ static void offline_css(struct cgroup_subsys_state *css) | |||
3610 | { | 4254 | { |
3611 | struct cgroup_subsys *ss = css->ss; | 4255 | struct cgroup_subsys *ss = css->ss; |
3612 | 4256 | ||
3613 | lockdep_assert_held(&cgroup_tree_mutex); | ||
3614 | lockdep_assert_held(&cgroup_mutex); | 4257 | lockdep_assert_held(&cgroup_mutex); |
3615 | 4258 | ||
3616 | if (!(css->flags & CSS_ONLINE)) | 4259 | if (!(css->flags & CSS_ONLINE)) |
@@ -3620,8 +4263,9 @@ static void offline_css(struct cgroup_subsys_state *css) | |||
3620 | ss->css_offline(css); | 4263 | ss->css_offline(css); |
3621 | 4264 | ||
3622 | css->flags &= ~CSS_ONLINE; | 4265 | css->flags &= ~CSS_ONLINE; |
3623 | css->cgroup->nr_css--; | 4266 | RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); |
3624 | RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); | 4267 | |
4268 | wake_up_all(&css->cgroup->offline_waitq); | ||
3625 | } | 4269 | } |
3626 | 4270 | ||
3627 | /** | 4271 | /** |
@@ -3635,111 +4279,102 @@ static void offline_css(struct cgroup_subsys_state *css) | |||
3635 | */ | 4279 | */ |
3636 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) | 4280 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) |
3637 | { | 4281 | { |
3638 | struct cgroup *parent = cgrp->parent; | 4282 | struct cgroup *parent = cgroup_parent(cgrp); |
4283 | struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); | ||
3639 | struct cgroup_subsys_state *css; | 4284 | struct cgroup_subsys_state *css; |
3640 | int err; | 4285 | int err; |
3641 | 4286 | ||
3642 | lockdep_assert_held(&cgroup_mutex); | 4287 | lockdep_assert_held(&cgroup_mutex); |
3643 | 4288 | ||
3644 | css = ss->css_alloc(cgroup_css(parent, ss)); | 4289 | css = ss->css_alloc(parent_css); |
3645 | if (IS_ERR(css)) | 4290 | if (IS_ERR(css)) |
3646 | return PTR_ERR(css); | 4291 | return PTR_ERR(css); |
3647 | 4292 | ||
4293 | init_and_link_css(css, ss, cgrp); | ||
4294 | |||
3648 | err = percpu_ref_init(&css->refcnt, css_release); | 4295 | err = percpu_ref_init(&css->refcnt, css_release); |
3649 | if (err) | 4296 | if (err) |
3650 | goto err_free_css; | 4297 | goto err_free_css; |
3651 | 4298 | ||
3652 | init_css(css, ss, cgrp); | 4299 | err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); |
4300 | if (err < 0) | ||
4301 | goto err_free_percpu_ref; | ||
4302 | css->id = err; | ||
3653 | 4303 | ||
3654 | err = cgroup_populate_dir(cgrp, 1 << ss->id); | 4304 | err = cgroup_populate_dir(cgrp, 1 << ss->id); |
3655 | if (err) | 4305 | if (err) |
3656 | goto err_free_percpu_ref; | 4306 | goto err_free_id; |
4307 | |||
4308 | /* @css is ready to be brought online now, make it visible */ | ||
4309 | list_add_tail_rcu(&css->sibling, &parent_css->children); | ||
4310 | cgroup_idr_replace(&ss->css_idr, css, css->id); | ||
3657 | 4311 | ||
3658 | err = online_css(css); | 4312 | err = online_css(css); |
3659 | if (err) | 4313 | if (err) |
3660 | goto err_clear_dir; | 4314 | goto err_list_del; |
3661 | |||
3662 | cgroup_get(cgrp); | ||
3663 | css_get(css->parent); | ||
3664 | |||
3665 | cgrp->subsys_mask |= 1 << ss->id; | ||
3666 | 4315 | ||
3667 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && | 4316 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && |
3668 | parent->parent) { | 4317 | cgroup_parent(parent)) { |
3669 | pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", | 4318 | pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", |
3670 | current->comm, current->pid, ss->name); | 4319 | current->comm, current->pid, ss->name); |
3671 | if (!strcmp(ss->name, "memory")) | 4320 | if (!strcmp(ss->name, "memory")) |
3672 | pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); | 4321 | pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n"); |
3673 | ss->warned_broken_hierarchy = true; | 4322 | ss->warned_broken_hierarchy = true; |
3674 | } | 4323 | } |
3675 | 4324 | ||
3676 | return 0; | 4325 | return 0; |
3677 | 4326 | ||
3678 | err_clear_dir: | 4327 | err_list_del: |
4328 | list_del_rcu(&css->sibling); | ||
3679 | cgroup_clear_dir(css->cgroup, 1 << css->ss->id); | 4329 | cgroup_clear_dir(css->cgroup, 1 << css->ss->id); |
4330 | err_free_id: | ||
4331 | cgroup_idr_remove(&ss->css_idr, css->id); | ||
3680 | err_free_percpu_ref: | 4332 | err_free_percpu_ref: |
3681 | percpu_ref_cancel_init(&css->refcnt); | 4333 | percpu_ref_cancel_init(&css->refcnt); |
3682 | err_free_css: | 4334 | err_free_css: |
3683 | ss->css_free(css); | 4335 | call_rcu(&css->rcu_head, css_free_rcu_fn); |
3684 | return err; | 4336 | return err; |
3685 | } | 4337 | } |
3686 | 4338 | ||
3687 | /** | 4339 | static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, |
3688 | * cgroup_create - create a cgroup | 4340 | umode_t mode) |
3689 | * @parent: cgroup that will be parent of the new cgroup | ||
3690 | * @name: name of the new cgroup | ||
3691 | * @mode: mode to set on new cgroup | ||
3692 | */ | ||
3693 | static long cgroup_create(struct cgroup *parent, const char *name, | ||
3694 | umode_t mode) | ||
3695 | { | 4341 | { |
3696 | struct cgroup *cgrp; | 4342 | struct cgroup *parent, *cgrp; |
3697 | struct cgroup_root *root = parent->root; | 4343 | struct cgroup_root *root; |
3698 | int ssid, err; | ||
3699 | struct cgroup_subsys *ss; | 4344 | struct cgroup_subsys *ss; |
3700 | struct kernfs_node *kn; | 4345 | struct kernfs_node *kn; |
4346 | int ssid, ret; | ||
3701 | 4347 | ||
3702 | /* | 4348 | parent = cgroup_kn_lock_live(parent_kn); |
3703 | * XXX: The default hierarchy isn't fully implemented yet. Block | 4349 | if (!parent) |
3704 | * !root cgroup creation on it for now. | 4350 | return -ENODEV; |
3705 | */ | 4351 | root = parent->root; |
3706 | if (root == &cgrp_dfl_root) | ||
3707 | return -EINVAL; | ||
3708 | 4352 | ||
3709 | /* allocate the cgroup and its ID, 0 is reserved for the root */ | 4353 | /* allocate the cgroup and its ID, 0 is reserved for the root */ |
3710 | cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); | 4354 | cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); |
3711 | if (!cgrp) | 4355 | if (!cgrp) { |
3712 | return -ENOMEM; | 4356 | ret = -ENOMEM; |
3713 | 4357 | goto out_unlock; | |
3714 | mutex_lock(&cgroup_tree_mutex); | ||
3715 | |||
3716 | /* | ||
3717 | * Only live parents can have children. Note that the liveliness | ||
3718 | * check isn't strictly necessary because cgroup_mkdir() and | ||
3719 | * cgroup_rmdir() are fully synchronized by i_mutex; however, do it | ||
3720 | * anyway so that locking is contained inside cgroup proper and we | ||
3721 | * don't get nasty surprises if we ever grow another caller. | ||
3722 | */ | ||
3723 | if (!cgroup_lock_live_group(parent)) { | ||
3724 | err = -ENODEV; | ||
3725 | goto err_unlock_tree; | ||
3726 | } | 4358 | } |
3727 | 4359 | ||
4360 | ret = percpu_ref_init(&cgrp->self.refcnt, css_release); | ||
4361 | if (ret) | ||
4362 | goto out_free_cgrp; | ||
4363 | |||
3728 | /* | 4364 | /* |
3729 | * Temporarily set the pointer to NULL, so idr_find() won't return | 4365 | * Temporarily set the pointer to NULL, so idr_find() won't return |
3730 | * a half-baked cgroup. | 4366 | * a half-baked cgroup. |
3731 | */ | 4367 | */ |
3732 | cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); | 4368 | cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); |
3733 | if (cgrp->id < 0) { | 4369 | if (cgrp->id < 0) { |
3734 | err = -ENOMEM; | 4370 | ret = -ENOMEM; |
3735 | goto err_unlock; | 4371 | goto out_cancel_ref; |
3736 | } | 4372 | } |
3737 | 4373 | ||
3738 | init_cgroup_housekeeping(cgrp); | 4374 | init_cgroup_housekeeping(cgrp); |
3739 | 4375 | ||
3740 | cgrp->parent = parent; | 4376 | cgrp->self.parent = &parent->self; |
3741 | cgrp->dummy_css.parent = &parent->dummy_css; | 4377 | cgrp->root = root; |
3742 | cgrp->root = parent->root; | ||
3743 | 4378 | ||
3744 | if (notify_on_release(parent)) | 4379 | if (notify_on_release(parent)) |
3745 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 4380 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
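The "half-baked cgroup" comment above describes a reserve-then-publish idr pattern: allocate the ID with a NULL pointer so lookups fail, and install the real pointer only once the object is fully constructed. A minimal sketch with the stock idr API (cgroup_idr_alloc()/cgroup_idr_replace() appear to be locked wrappers around these calls; my_idr and publish_obj() are hypothetical):

#include <linux/idr.h>

static DEFINE_IDR(my_idr);

static int publish_obj(void *obj)
{
        int id;

        /* reserve an ID but keep idr_find() returning NULL for now */
        id = idr_alloc(&my_idr, NULL, 2, 0, GFP_KERNEL);
        if (id < 0)
                return id;

        /* ... finish constructing @obj: directory, files, css's ... */

        /* only now make the fully initialized object visible to lookups */
        idr_replace(&my_idr, obj, id);
        return id;
}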
@@ -3750,8 +4385,8 @@ static long cgroup_create(struct cgroup *parent, const char *name, | |||
3750 | /* create the directory */ | 4385 | /* create the directory */ |
3751 | kn = kernfs_create_dir(parent->kn, name, mode, cgrp); | 4386 | kn = kernfs_create_dir(parent->kn, name, mode, cgrp); |
3752 | if (IS_ERR(kn)) { | 4387 | if (IS_ERR(kn)) { |
3753 | err = PTR_ERR(kn); | 4388 | ret = PTR_ERR(kn); |
3754 | goto err_free_id; | 4389 | goto out_free_id; |
3755 | } | 4390 | } |
3756 | cgrp->kn = kn; | 4391 | cgrp->kn = kn; |
3757 | 4392 | ||
@@ -3761,10 +4396,10 @@ static long cgroup_create(struct cgroup *parent, const char *name, | |||
3761 | */ | 4396 | */ |
3762 | kernfs_get(kn); | 4397 | kernfs_get(kn); |
3763 | 4398 | ||
3764 | cgrp->serial_nr = cgroup_serial_nr_next++; | 4399 | cgrp->self.serial_nr = css_serial_nr_next++; |
3765 | 4400 | ||
3766 | /* allocation complete, commit to creation */ | 4401 | /* allocation complete, commit to creation */ |
3767 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); | 4402 | list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); |
3768 | atomic_inc(&root->nr_cgrps); | 4403 | atomic_inc(&root->nr_cgrps); |
3769 | cgroup_get(parent); | 4404 | cgroup_get(parent); |
3770 | 4405 | ||
@@ -3772,107 +4407,66 @@ static long cgroup_create(struct cgroup *parent, const char *name, | |||
3772 | * @cgrp is now fully operational. If something fails after this | 4407 | * @cgrp is now fully operational. If something fails after this |
3773 | * point, it'll be released via the normal destruction path. | 4408 | * point, it'll be released via the normal destruction path. |
3774 | */ | 4409 | */ |
3775 | idr_replace(&root->cgroup_idr, cgrp, cgrp->id); | 4410 | cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); |
3776 | 4411 | ||
3777 | err = cgroup_kn_set_ugid(kn); | 4412 | ret = cgroup_kn_set_ugid(kn); |
3778 | if (err) | 4413 | if (ret) |
3779 | goto err_destroy; | 4414 | goto out_destroy; |
3780 | 4415 | ||
3781 | err = cgroup_addrm_files(cgrp, cgroup_base_files, true); | 4416 | ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); |
3782 | if (err) | 4417 | if (ret) |
3783 | goto err_destroy; | 4418 | goto out_destroy; |
3784 | 4419 | ||
3785 | /* let's create and online css's */ | 4420 | /* let's create and online css's */ |
3786 | for_each_subsys(ss, ssid) { | 4421 | for_each_subsys(ss, ssid) { |
3787 | if (root->cgrp.subsys_mask & (1 << ssid)) { | 4422 | if (parent->child_subsys_mask & (1 << ssid)) { |
3788 | err = create_css(cgrp, ss); | 4423 | ret = create_css(cgrp, ss); |
3789 | if (err) | 4424 | if (ret) |
3790 | goto err_destroy; | 4425 | goto out_destroy; |
3791 | } | 4426 | } |
3792 | } | 4427 | } |
3793 | 4428 | ||
3794 | kernfs_activate(kn); | 4429 | /* |
4430 | * On the default hierarchy, a child doesn't automatically inherit | ||
4431 | * child_subsys_mask from the parent. Each is configured manually. | ||
4432 | */ | ||
4433 | if (!cgroup_on_dfl(cgrp)) | ||
4434 | cgrp->child_subsys_mask = parent->child_subsys_mask; | ||
3795 | 4435 | ||
3796 | mutex_unlock(&cgroup_mutex); | 4436 | kernfs_activate(kn); |
3797 | mutex_unlock(&cgroup_tree_mutex); | ||
3798 | 4437 | ||
3799 | return 0; | 4438 | ret = 0; |
4439 | goto out_unlock; | ||
3800 | 4440 | ||
3801 | err_free_id: | 4441 | out_free_id: |
3802 | idr_remove(&root->cgroup_idr, cgrp->id); | 4442 | cgroup_idr_remove(&root->cgroup_idr, cgrp->id); |
3803 | err_unlock: | 4443 | out_cancel_ref: |
3804 | mutex_unlock(&cgroup_mutex); | 4444 | percpu_ref_cancel_init(&cgrp->self.refcnt); |
3805 | err_unlock_tree: | 4445 | out_free_cgrp: |
3806 | mutex_unlock(&cgroup_tree_mutex); | ||
3807 | kfree(cgrp); | 4446 | kfree(cgrp); |
3808 | return err; | 4447 | out_unlock: |
4448 | cgroup_kn_unlock(parent_kn); | ||
4449 | return ret; | ||
3809 | 4450 | ||
3810 | err_destroy: | 4451 | out_destroy: |
3811 | cgroup_destroy_locked(cgrp); | 4452 | cgroup_destroy_locked(cgrp); |
3812 | mutex_unlock(&cgroup_mutex); | 4453 | goto out_unlock; |
3813 | mutex_unlock(&cgroup_tree_mutex); | ||
3814 | return err; | ||
3815 | } | ||
3816 | |||
3817 | static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | ||
3818 | umode_t mode) | ||
3819 | { | ||
3820 | struct cgroup *parent = parent_kn->priv; | ||
3821 | int ret; | ||
3822 | |||
3823 | /* | ||
3824 | * cgroup_create() grabs cgroup_tree_mutex which nests outside | ||
3825 | * kernfs active_ref and cgroup_create() already synchronizes | ||
3826 | * properly against removal through cgroup_lock_live_group(). | ||
3827 | * Break it before calling cgroup_create(). | ||
3828 | */ | ||
3829 | cgroup_get(parent); | ||
3830 | kernfs_break_active_protection(parent_kn); | ||
3831 | |||
3832 | ret = cgroup_create(parent, name, mode); | ||
3833 | |||
3834 | kernfs_unbreak_active_protection(parent_kn); | ||
3835 | cgroup_put(parent); | ||
3836 | return ret; | ||
3837 | } | 4454 | } |
3838 | 4455 | ||
3839 | /* | 4456 | /* |
3840 | * This is called when the refcnt of a css is confirmed to be killed. | 4457 | * This is called when the refcnt of a css is confirmed to be killed. |
3841 | * css_tryget() is now guaranteed to fail. | 4458 | * css_tryget_online() is now guaranteed to fail. Tell the subsystem to |
4459 | * initiate destruction and put the css ref from kill_css(). | ||
3842 | */ | 4460 | */ |
3843 | static void css_killed_work_fn(struct work_struct *work) | 4461 | static void css_killed_work_fn(struct work_struct *work) |
3844 | { | 4462 | { |
3845 | struct cgroup_subsys_state *css = | 4463 | struct cgroup_subsys_state *css = |
3846 | container_of(work, struct cgroup_subsys_state, destroy_work); | 4464 | container_of(work, struct cgroup_subsys_state, destroy_work); |
3847 | struct cgroup *cgrp = css->cgroup; | ||
3848 | 4465 | ||
3849 | mutex_lock(&cgroup_tree_mutex); | ||
3850 | mutex_lock(&cgroup_mutex); | 4466 | mutex_lock(&cgroup_mutex); |
3851 | |||
3852 | /* | ||
3853 | * css_tryget() is guaranteed to fail now. Tell subsystems to | ||
3854 | * initate destruction. | ||
3855 | */ | ||
3856 | offline_css(css); | 4467 | offline_css(css); |
3857 | |||
3858 | /* | ||
3859 | * If @cgrp is marked dead, it's waiting for refs of all css's to | ||
3860 | * be disabled before proceeding to the second phase of cgroup | ||
3861 | * destruction. If we are the last one, kick it off. | ||
3862 | */ | ||
3863 | if (!cgrp->nr_css && cgroup_is_dead(cgrp)) | ||
3864 | cgroup_destroy_css_killed(cgrp); | ||
3865 | |||
3866 | mutex_unlock(&cgroup_mutex); | 4468 | mutex_unlock(&cgroup_mutex); |
3867 | mutex_unlock(&cgroup_tree_mutex); | ||
3868 | 4469 | ||
3869 | /* | ||
3870 | * Put the css refs from kill_css(). Each css holds an extra | ||
3871 | * reference to the cgroup's dentry and cgroup removal proceeds | ||
3872 | * regardless of css refs. On the last put of each css, whenever | ||
3873 | * that may be, the extra dentry ref is put so that dentry | ||
3874 | * destruction happens only after all css's are released. | ||
3875 | */ | ||
3876 | css_put(css); | 4470 | css_put(css); |
3877 | } | 4471 | } |
3878 | 4472 | ||
@@ -3886,9 +4480,18 @@ static void css_killed_ref_fn(struct percpu_ref *ref) | |||
3886 | queue_work(cgroup_destroy_wq, &css->destroy_work); | 4480 | queue_work(cgroup_destroy_wq, &css->destroy_work); |
3887 | } | 4481 | } |
3888 | 4482 | ||
3889 | static void __kill_css(struct cgroup_subsys_state *css) | 4483 | /** |
4484 | * kill_css - destroy a css | ||
4485 | * @css: css to destroy | ||
4486 | * | ||
4487 | * This function initiates destruction of @css by removing cgroup interface | ||
4488 | * files and putting its base reference. ->css_offline() will be invoked | ||
4489 | * asynchronously once css_tryget_online() is guaranteed to fail and when | ||
4490 | * the reference count reaches zero, @css will be released. | ||
4491 | */ | ||
4492 | static void kill_css(struct cgroup_subsys_state *css) | ||
3890 | { | 4493 | { |
3891 | lockdep_assert_held(&cgroup_tree_mutex); | 4494 | lockdep_assert_held(&cgroup_mutex); |
3892 | 4495 | ||
3893 | /* | 4496 | /* |
3894 | * This must happen before css is disassociated with its cgroup. | 4497 | * This must happen before css is disassociated with its cgroup. |
@@ -3905,7 +4508,7 @@ static void __kill_css(struct cgroup_subsys_state *css) | |||
3905 | /* | 4508 | /* |
3906 | * cgroup core guarantees that, by the time ->css_offline() is | 4509 | * cgroup core guarantees that, by the time ->css_offline() is |
3907 | * invoked, no new css reference will be given out via | 4510 | * invoked, no new css reference will be given out via |
3908 | * css_tryget(). We can't simply call percpu_ref_kill() and | 4511 | * css_tryget_online(). We can't simply call percpu_ref_kill() and |
3909 | * proceed to offlining css's because percpu_ref_kill() doesn't | 4512 | * proceed to offlining css's because percpu_ref_kill() doesn't |
3910 | * guarantee that the ref is seen as killed on all CPUs on return. | 4513 | * guarantee that the ref is seen as killed on all CPUs on return. |
3911 | * | 4514 | * |
@@ -3916,36 +4519,14 @@ static void __kill_css(struct cgroup_subsys_state *css) | |||
3916 | } | 4519 | } |
3917 | 4520 | ||
3918 | /** | 4521 | /** |
3919 | * kill_css - destroy a css | ||
3920 | * @css: css to destroy | ||
3921 | * | ||
3922 | * This function initiates destruction of @css by removing cgroup interface | ||
3923 | * files and putting its base reference. ->css_offline() will be invoked | ||
3924 | * asynchronously once css_tryget() is guaranteed to fail and when the | ||
3925 | * reference count reaches zero, @css will be released. | ||
3926 | */ | ||
3927 | static void kill_css(struct cgroup_subsys_state *css) | ||
3928 | { | ||
3929 | struct cgroup *cgrp = css->cgroup; | ||
3930 | |||
3931 | lockdep_assert_held(&cgroup_tree_mutex); | ||
3932 | |||
3933 | /* if already killed, noop */ | ||
3934 | if (cgrp->subsys_mask & (1 << css->ss->id)) { | ||
3935 | cgrp->subsys_mask &= ~(1 << css->ss->id); | ||
3936 | __kill_css(css); | ||
3937 | } | ||
3938 | } | ||
3939 | |||
3940 | /** | ||
3941 | * cgroup_destroy_locked - the first stage of cgroup destruction | 4522 | * cgroup_destroy_locked - the first stage of cgroup destruction |
3942 | * @cgrp: cgroup to be destroyed | 4523 | * @cgrp: cgroup to be destroyed |
3943 | * | 4524 | * |
3944 | * css's make use of percpu refcnts whose killing latency shouldn't be | 4525 | * css's make use of percpu refcnts whose killing latency shouldn't be |
3945 | * exposed to userland and are RCU protected. Also, cgroup core needs to | 4526 | * exposed to userland and are RCU protected. Also, cgroup core needs to |
3946 | * guarantee that css_tryget() won't succeed by the time ->css_offline() is | 4527 | * guarantee that css_tryget_online() won't succeed by the time |
3947 | * invoked. To satisfy all the requirements, destruction is implemented in | 4528 | * ->css_offline() is invoked. To satisfy all the requirements, |
3948 | * the following two steps. | 4529 | * destruction is implemented in the following two steps. |
3949 | * | 4530 | * |
3950 | * s1. Verify @cgrp can be destroyed and mark it dying. Remove all | 4531 | * s1. Verify @cgrp can be destroyed and mark it dying. Remove all |
3951 | * userland visible parts and start killing the percpu refcnts of | 4532 | * userland visible parts and start killing the percpu refcnts of |
@@ -3964,12 +4545,10 @@ static void kill_css(struct cgroup_subsys_state *css) | |||
3964 | static int cgroup_destroy_locked(struct cgroup *cgrp) | 4545 | static int cgroup_destroy_locked(struct cgroup *cgrp) |
3965 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | 4546 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) |
3966 | { | 4547 | { |
3967 | struct cgroup *child; | ||
3968 | struct cgroup_subsys_state *css; | 4548 | struct cgroup_subsys_state *css; |
3969 | bool empty; | 4549 | bool empty; |
3970 | int ssid; | 4550 | int ssid; |
3971 | 4551 | ||
3972 | lockdep_assert_held(&cgroup_tree_mutex); | ||
3973 | lockdep_assert_held(&cgroup_mutex); | 4552 | lockdep_assert_held(&cgroup_mutex); |
3974 | 4553 | ||
3975 | /* | 4554 | /* |
@@ -3983,127 +4562,68 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
3983 | return -EBUSY; | 4562 | return -EBUSY; |
3984 | 4563 | ||
3985 | /* | 4564 | /* |
3986 | * Make sure there are no live children. We can't test ->children | 4565 | * Make sure there are no live children. We can't test emptiness of |
3987 | * emptiness as dead children linger on it while being destroyed; | 4566 | * ->self.children as dead children linger on it while being |
3988 | * otherwise, "rmdir parent/child parent" may fail with -EBUSY. | 4567 | * drained; otherwise, "rmdir parent/child parent" may fail. |
3989 | */ | 4568 | */ |
3990 | empty = true; | 4569 | if (css_has_online_children(&cgrp->self)) |
3991 | rcu_read_lock(); | ||
3992 | list_for_each_entry_rcu(child, &cgrp->children, sibling) { | ||
3993 | empty = cgroup_is_dead(child); | ||
3994 | if (!empty) | ||
3995 | break; | ||
3996 | } | ||
3997 | rcu_read_unlock(); | ||
3998 | if (!empty) | ||
3999 | return -EBUSY; | 4570 | return -EBUSY; |
4000 | 4571 | ||
4001 | /* | 4572 | /* |
4002 | * Mark @cgrp dead. This prevents further task migration and child | 4573 | * Mark @cgrp dead. This prevents further task migration and child |
4003 | * creation by disabling cgroup_lock_live_group(). Note that | 4574 | * creation by disabling cgroup_lock_live_group(). |
4004 | * CGRP_DEAD assertion is depended upon by css_next_child() to | ||
4005 | * resume iteration after dropping RCU read lock. See | ||
4006 | * css_next_child() for details. | ||
4007 | */ | 4575 | */ |
4008 | set_bit(CGRP_DEAD, &cgrp->flags); | 4576 | cgrp->self.flags &= ~CSS_ONLINE; |
4009 | 4577 | ||
4010 | /* | 4578 | /* initiate massacre of all css's */ |
4011 | * Initiate massacre of all css's. cgroup_destroy_css_killed() | ||
4012 | * will be invoked to perform the rest of destruction once the | ||
4013 | * percpu refs of all css's are confirmed to be killed. This | ||
4014 | * involves removing the subsystem's files, drop cgroup_mutex. | ||
4015 | */ | ||
4016 | mutex_unlock(&cgroup_mutex); | ||
4017 | for_each_css(css, ssid, cgrp) | 4579 | for_each_css(css, ssid, cgrp) |
4018 | kill_css(css); | 4580 | kill_css(css); |
4019 | mutex_lock(&cgroup_mutex); | ||
4020 | 4581 | ||
4021 | /* CGRP_DEAD is set, remove from ->release_list for the last time */ | 4582 | /* CSS_ONLINE is clear, remove from ->release_list for the last time */ |
4022 | raw_spin_lock(&release_list_lock); | 4583 | raw_spin_lock(&release_list_lock); |
4023 | if (!list_empty(&cgrp->release_list)) | 4584 | if (!list_empty(&cgrp->release_list)) |
4024 | list_del_init(&cgrp->release_list); | 4585 | list_del_init(&cgrp->release_list); |
4025 | raw_spin_unlock(&release_list_lock); | 4586 | raw_spin_unlock(&release_list_lock); |
4026 | 4587 | ||
4027 | /* | 4588 | /* |
4028 | * If @cgrp has css's attached, the second stage of cgroup | 4589 | * Remove @cgrp directory along with the base files. @cgrp has an |
4029 | * destruction is kicked off from css_killed_work_fn() after the | 4590 | * extra ref on its kn. |
4030 | * refs of all attached css's are killed. If @cgrp doesn't have | ||
4031 | * any css, we kick it off here. | ||
4032 | */ | 4591 | */ |
4033 | if (!cgrp->nr_css) | 4592 | kernfs_remove(cgrp->kn); |
4034 | cgroup_destroy_css_killed(cgrp); | ||
4035 | |||
4036 | /* remove @cgrp directory along with the base files */ | ||
4037 | mutex_unlock(&cgroup_mutex); | ||
4038 | 4593 | ||
4039 | /* | 4594 | set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags); |
4040 | * There are two control paths which try to determine cgroup from | 4595 | check_for_release(cgroup_parent(cgrp)); |
4041 | * dentry without going through kernfs - cgroupstats_build() and | ||
4042 | * css_tryget_from_dir(). Those are supported by RCU protecting | ||
4043 | * clearing of cgrp->kn->priv backpointer, which should happen | ||
4044 | * after all files under it have been removed. | ||
4045 | */ | ||
4046 | kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ | ||
4047 | RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); | ||
4048 | 4596 | ||
4049 | mutex_lock(&cgroup_mutex); | 4597 | /* put the base reference */ |
4598 | percpu_ref_kill(&cgrp->self.refcnt); | ||
4050 | 4599 | ||
4051 | return 0; | 4600 | return 0; |
4052 | }; | 4601 | }; |
4053 | 4602 | ||
4054 | /** | ||
4055 | * cgroup_destroy_css_killed - the second step of cgroup destruction | ||
4056 | * @work: cgroup->destroy_free_work | ||
4057 | * | ||
4058 | * This function is invoked from a work item for a cgroup which is being | ||
4059 | * destroyed after all css's are offlined and performs the rest of | ||
4060 | * destruction. This is the second step of destruction described in the | ||
4061 | * comment above cgroup_destroy_locked(). | ||
4062 | */ | ||
4063 | static void cgroup_destroy_css_killed(struct cgroup *cgrp) | ||
4064 | { | ||
4065 | struct cgroup *parent = cgrp->parent; | ||
4066 | |||
4067 | lockdep_assert_held(&cgroup_tree_mutex); | ||
4068 | lockdep_assert_held(&cgroup_mutex); | ||
4069 | |||
4070 | /* delete this cgroup from parent->children */ | ||
4071 | list_del_rcu(&cgrp->sibling); | ||
4072 | |||
4073 | cgroup_put(cgrp); | ||
4074 | |||
4075 | set_bit(CGRP_RELEASABLE, &parent->flags); | ||
4076 | check_for_release(parent); | ||
4077 | } | ||
4078 | |||
4079 | static int cgroup_rmdir(struct kernfs_node *kn) | 4603 | static int cgroup_rmdir(struct kernfs_node *kn) |
4080 | { | 4604 | { |
4081 | struct cgroup *cgrp = kn->priv; | 4605 | struct cgroup *cgrp; |
4082 | int ret = 0; | 4606 | int ret = 0; |
4083 | 4607 | ||
4084 | /* | 4608 | cgrp = cgroup_kn_lock_live(kn); |
4085 | * This is self-destruction but @kn can't be removed while this | 4609 | if (!cgrp) |
4086 | * callback is in progress. Let's break active protection. Once | 4610 | return 0; |
4087 | * the protection is broken, @cgrp can be destroyed at any point. | 4611 | cgroup_get(cgrp); /* for @kn->priv clearing */ |
4088 | * Pin it so that it stays accessible. | ||
4089 | */ | ||
4090 | cgroup_get(cgrp); | ||
4091 | kernfs_break_active_protection(kn); | ||
4092 | 4612 | ||
4093 | mutex_lock(&cgroup_tree_mutex); | 4613 | ret = cgroup_destroy_locked(cgrp); |
4094 | mutex_lock(&cgroup_mutex); | 4614 | |
4615 | cgroup_kn_unlock(kn); | ||
4095 | 4616 | ||
4096 | /* | 4617 | /* |
4097 | * @cgrp might already have been destroyed while we're trying to | 4618 | * There are two control paths which try to determine cgroup from |
4098 | * grab the mutexes. | 4619 | * dentry without going through kernfs - cgroupstats_build() and |
4620 | * css_tryget_online_from_dir(). Those are supported by RCU | ||
4621 | * protecting clearing of cgrp->kn->priv backpointer, which should | ||
4622 | * happen after all files under it have been removed. | ||
4099 | */ | 4623 | */ |
4100 | if (!cgroup_is_dead(cgrp)) | 4624 | if (!ret) |
4101 | ret = cgroup_destroy_locked(cgrp); | 4625 | RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL); |
4102 | |||
4103 | mutex_unlock(&cgroup_mutex); | ||
4104 | mutex_unlock(&cgroup_tree_mutex); | ||
4105 | 4626 | ||
4106 | kernfs_unbreak_active_protection(kn); | ||
4107 | cgroup_put(cgrp); | 4627 | cgroup_put(cgrp); |
4108 | return ret; | 4628 | return ret; |
4109 | } | 4629 | } |
@@ -4116,15 +4636,15 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { | |||
4116 | .rename = cgroup_rename, | 4636 | .rename = cgroup_rename, |
4117 | }; | 4637 | }; |
4118 | 4638 | ||
4119 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | 4639 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) |
4120 | { | 4640 | { |
4121 | struct cgroup_subsys_state *css; | 4641 | struct cgroup_subsys_state *css; |
4122 | 4642 | ||
4123 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); | 4643 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); |
4124 | 4644 | ||
4125 | mutex_lock(&cgroup_tree_mutex); | ||
4126 | mutex_lock(&cgroup_mutex); | 4645 | mutex_lock(&cgroup_mutex); |
4127 | 4646 | ||
4647 | idr_init(&ss->css_idr); | ||
4128 | INIT_LIST_HEAD(&ss->cfts); | 4648 | INIT_LIST_HEAD(&ss->cfts); |
4129 | 4649 | ||
4130 | /* Create the root cgroup state for this subsystem */ | 4650 | /* Create the root cgroup state for this subsystem */ |
@@ -4132,7 +4652,21 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4132 | css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); | 4652 | css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); |
4133 | /* We don't handle early failures gracefully */ | 4653 | /* We don't handle early failures gracefully */ |
4134 | BUG_ON(IS_ERR(css)); | 4654 | BUG_ON(IS_ERR(css)); |
4135 | init_css(css, ss, &cgrp_dfl_root.cgrp); | 4655 | init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); |
4656 | |||
4657 | /* | ||
4658 | * Root csses are never destroyed and we can't initialize | ||
4659 | * percpu_ref during early init. Disable refcnting. | ||
4660 | */ | ||
4661 | css->flags |= CSS_NO_REF; | ||
4662 | |||
4663 | if (early) { | ||
4664 | /* allocation can't be done safely during early init */ | ||
4665 | css->id = 1; | ||
4666 | } else { | ||
4667 | css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); | ||
4668 | BUG_ON(css->id < 0); | ||
4669 | } | ||
4136 | 4670 | ||
4137 | /* Update the init_css_set to contain a subsys | 4671 | /* Update the init_css_set to contain a subsys |
4138 | * pointer to this state - since the subsystem is | 4672 | * pointer to this state - since the subsystem is |
@@ -4149,10 +4683,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4149 | 4683 | ||
4150 | BUG_ON(online_css(css)); | 4684 | BUG_ON(online_css(css)); |
4151 | 4685 | ||
4152 | cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id; | ||
4153 | |||
4154 | mutex_unlock(&cgroup_mutex); | 4686 | mutex_unlock(&cgroup_mutex); |
4155 | mutex_unlock(&cgroup_tree_mutex); | ||
4156 | } | 4687 | } |
4157 | 4688 | ||
4158 | /** | 4689 | /** |
@@ -4169,6 +4700,8 @@ int __init cgroup_init_early(void) | |||
4169 | int i; | 4700 | int i; |
4170 | 4701 | ||
4171 | init_cgroup_root(&cgrp_dfl_root, &opts); | 4702 | init_cgroup_root(&cgrp_dfl_root, &opts); |
4703 | cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; | ||
4704 | |||
4172 | RCU_INIT_POINTER(init_task.cgroups, &init_css_set); | 4705 | RCU_INIT_POINTER(init_task.cgroups, &init_css_set); |
4173 | 4706 | ||
4174 | for_each_subsys(ss, i) { | 4707 | for_each_subsys(ss, i) { |
@@ -4183,7 +4716,7 @@ int __init cgroup_init_early(void) | |||
4183 | ss->name = cgroup_subsys_name[i]; | 4716 | ss->name = cgroup_subsys_name[i]; |
4184 | 4717 | ||
4185 | if (ss->early_init) | 4718 | if (ss->early_init) |
4186 | cgroup_init_subsys(ss); | 4719 | cgroup_init_subsys(ss, true); |
4187 | } | 4720 | } |
4188 | return 0; | 4721 | return 0; |
4189 | } | 4722 | } |
@@ -4202,7 +4735,6 @@ int __init cgroup_init(void) | |||
4202 | 4735 | ||
4203 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); | 4736 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); |
4204 | 4737 | ||
4205 | mutex_lock(&cgroup_tree_mutex); | ||
4206 | mutex_lock(&cgroup_mutex); | 4738 | mutex_lock(&cgroup_mutex); |
4207 | 4739 | ||
4208 | /* Add init_css_set to the hash table */ | 4740 | /* Add init_css_set to the hash table */ |
@@ -4212,18 +4744,31 @@ int __init cgroup_init(void) | |||
4212 | BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); | 4744 | BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); |
4213 | 4745 | ||
4214 | mutex_unlock(&cgroup_mutex); | 4746 | mutex_unlock(&cgroup_mutex); |
4215 | mutex_unlock(&cgroup_tree_mutex); | ||
4216 | 4747 | ||
4217 | for_each_subsys(ss, ssid) { | 4748 | for_each_subsys(ss, ssid) { |
4218 | if (!ss->early_init) | 4749 | if (ss->early_init) { |
4219 | cgroup_init_subsys(ss); | 4750 | struct cgroup_subsys_state *css = |
4751 | init_css_set.subsys[ss->id]; | ||
4752 | |||
4753 | css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, | ||
4754 | GFP_KERNEL); | ||
4755 | BUG_ON(css->id < 0); | ||
4756 | } else { | ||
4757 | cgroup_init_subsys(ss, false); | ||
4758 | } | ||
4759 | |||
4760 | list_add_tail(&init_css_set.e_cset_node[ssid], | ||
4761 | &cgrp_dfl_root.cgrp.e_csets[ssid]); | ||
4220 | 4762 | ||
4221 | /* | 4763 | /* |
4222 | * cftype registration needs kmalloc and can't be done | 4764 | * Setting dfl_root subsys_mask needs to consider the |
4223 | * during early_init. Register base cftypes separately. | 4765 | * disabled flag and cftype registration needs kmalloc, |
4766 | * both of which aren't available during early_init. | ||
4224 | */ | 4767 | */ |
4225 | if (ss->base_cftypes) | 4768 | if (!ss->disabled) { |
4769 | cgrp_dfl_root.subsys_mask |= 1 << ss->id; | ||
4226 | WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); | 4770 | WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); |
4771 | } | ||
4227 | } | 4772 | } |
4228 | 4773 | ||
4229 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); | 4774 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); |
@@ -4306,7 +4851,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) | |||
4306 | 4851 | ||
4307 | seq_printf(m, "%d:", root->hierarchy_id); | 4852 | seq_printf(m, "%d:", root->hierarchy_id); |
4308 | for_each_subsys(ss, ssid) | 4853 | for_each_subsys(ss, ssid) |
4309 | if (root->cgrp.subsys_mask & (1 << ssid)) | 4854 | if (root->subsys_mask & (1 << ssid)) |
4310 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); | 4855 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); |
4311 | if (strlen(root->name)) | 4856 | if (strlen(root->name)) |
4312 | seq_printf(m, "%sname=%s", count ? "," : "", | 4857 | seq_printf(m, "%sname=%s", count ? "," : "", |
@@ -4501,8 +5046,8 @@ void cgroup_exit(struct task_struct *tsk) | |||
4501 | 5046 | ||
4502 | static void check_for_release(struct cgroup *cgrp) | 5047 | static void check_for_release(struct cgroup *cgrp) |
4503 | { | 5048 | { |
4504 | if (cgroup_is_releasable(cgrp) && | 5049 | if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) && |
4505 | list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { | 5050 | !css_has_online_children(&cgrp->self)) { |
4506 | /* | 5051 | /* |
4507 | * Control Group is currently removable. If it's not | 5052 | * Control Group is currently removable. If it's not |
4508 | * already queued for a userspace notification, queue | 5053 | * already queued for a userspace notification, queue |
@@ -4619,7 +5164,7 @@ static int __init cgroup_disable(char *str) | |||
4619 | __setup("cgroup_disable=", cgroup_disable); | 5164 | __setup("cgroup_disable=", cgroup_disable); |
4620 | 5165 | ||
4621 | /** | 5166 | /** |
4622 | * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir | 5167 | * css_tryget_online_from_dir - get corresponding css from a cgroup dentry |
4623 | * @dentry: directory dentry of interest | 5168 | * @dentry: directory dentry of interest |
4624 | * @ss: subsystem of interest | 5169 | * @ss: subsystem of interest |
4625 | * | 5170 | * |
@@ -4627,8 +5172,8 @@ __setup("cgroup_disable=", cgroup_disable); | |||
4627 | * to get the corresponding css and return it. If such css doesn't exist | 5172 | * to get the corresponding css and return it. If such css doesn't exist |
4628 | * or can't be pinned, an ERR_PTR value is returned. | 5173 | * or can't be pinned, an ERR_PTR value is returned. |
4629 | */ | 5174 | */ |
4630 | struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, | 5175 | struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, |
4631 | struct cgroup_subsys *ss) | 5176 | struct cgroup_subsys *ss) |
4632 | { | 5177 | { |
4633 | struct kernfs_node *kn = kernfs_node_from_dentry(dentry); | 5178 | struct kernfs_node *kn = kernfs_node_from_dentry(dentry); |
4634 | struct cgroup_subsys_state *css = NULL; | 5179 | struct cgroup_subsys_state *css = NULL; |
@@ -4644,13 +5189,13 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, | |||
4644 | /* | 5189 | /* |
4645 | * This path doesn't originate from kernfs and @kn could already | 5190 | * This path doesn't originate from kernfs and @kn could already |
4646 | * have been or be removed at any point. @kn->priv is RCU | 5191 | * have been or be removed at any point. @kn->priv is RCU |
4647 | * protected for this access. See destroy_locked() for details. | 5192 | * protected for this access. See cgroup_rmdir() for details. |
4648 | */ | 5193 | */ |
4649 | cgrp = rcu_dereference(kn->priv); | 5194 | cgrp = rcu_dereference(kn->priv); |
4650 | if (cgrp) | 5195 | if (cgrp) |
4651 | css = cgroup_css(cgrp, ss); | 5196 | css = cgroup_css(cgrp, ss); |
4652 | 5197 | ||
4653 | if (!css || !css_tryget(css)) | 5198 | if (!css || !css_tryget_online(css)) |
4654 | css = ERR_PTR(-ENOENT); | 5199 | css = ERR_PTR(-ENOENT); |
4655 | 5200 | ||
4656 | rcu_read_unlock(); | 5201 | rcu_read_unlock(); |
@@ -4667,14 +5212,8 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, | |||
4667 | */ | 5212 | */ |
4668 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) | 5213 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) |
4669 | { | 5214 | { |
4670 | struct cgroup *cgrp; | 5215 | WARN_ON_ONCE(!rcu_read_lock_held()); |
4671 | 5216 | return idr_find(&ss->css_idr, id); | |
4672 | cgroup_assert_mutexes_or_rcu_locked(); | ||
4673 | |||
4674 | cgrp = idr_find(&ss->root->cgroup_idr, id); | ||
4675 | if (cgrp) | ||
4676 | return cgroup_css(cgrp, ss); | ||
4677 | return NULL; | ||
4678 | } | 5217 | } |
4679 | 5218 | ||
4680 | #ifdef CONFIG_CGROUP_DEBUG | 5219 | #ifdef CONFIG_CGROUP_DEBUG |
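With the rework above, css_from_id() is reduced to an idr_find() on the per-subsystem css_idr, and callers are expected to hold the RCU read lock themselves. A minimal caller-side sketch under that assumption (lookup_css_example() and its pin-then-return policy are illustrative, not part of the patch):

        struct cgroup_subsys_state *lookup_css_example(struct cgroup_subsys *ss, int id)
        {
                struct cgroup_subsys_state *css;

                rcu_read_lock();
                css = css_from_id(id, ss);              /* now a plain idr_find() */
                if (css && !css_tryget_online(css))     /* pin it only while online */
                        css = NULL;
                rcu_read_unlock();

                return css;                             /* caller drops with css_put() */
        }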
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2bc4a2256444..a79e40f9d700 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/uaccess.h> | 21 | #include <linux/uaccess.h> |
22 | #include <linux/freezer.h> | 22 | #include <linux/freezer.h> |
23 | #include <linux/seq_file.h> | 23 | #include <linux/seq_file.h> |
24 | #include <linux/mutex.h> | ||
24 | 25 | ||
25 | /* | 26 | /* |
26 | * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is | 27 | * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is |
@@ -42,9 +43,10 @@ enum freezer_state_flags { | |||
42 | struct freezer { | 43 | struct freezer { |
43 | struct cgroup_subsys_state css; | 44 | struct cgroup_subsys_state css; |
44 | unsigned int state; | 45 | unsigned int state; |
45 | spinlock_t lock; | ||
46 | }; | 46 | }; |
47 | 47 | ||
48 | static DEFINE_MUTEX(freezer_mutex); | ||
49 | |||
48 | static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) | 50 | static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) |
49 | { | 51 | { |
50 | return css ? container_of(css, struct freezer, css) : NULL; | 52 | return css ? container_of(css, struct freezer, css) : NULL; |
@@ -57,7 +59,7 @@ static inline struct freezer *task_freezer(struct task_struct *task) | |||
57 | 59 | ||
58 | static struct freezer *parent_freezer(struct freezer *freezer) | 60 | static struct freezer *parent_freezer(struct freezer *freezer) |
59 | { | 61 | { |
60 | return css_freezer(css_parent(&freezer->css)); | 62 | return css_freezer(freezer->css.parent); |
61 | } | 63 | } |
62 | 64 | ||
63 | bool cgroup_freezing(struct task_struct *task) | 65 | bool cgroup_freezing(struct task_struct *task) |
@@ -71,10 +73,6 @@ bool cgroup_freezing(struct task_struct *task) | |||
71 | return ret; | 73 | return ret; |
72 | } | 74 | } |
73 | 75 | ||
74 | /* | ||
75 | * cgroups_write_string() limits the size of freezer state strings to | ||
76 | * CGROUP_LOCAL_BUFFER_SIZE | ||
77 | */ | ||
78 | static const char *freezer_state_strs(unsigned int state) | 76 | static const char *freezer_state_strs(unsigned int state) |
79 | { | 77 | { |
80 | if (state & CGROUP_FROZEN) | 78 | if (state & CGROUP_FROZEN) |
@@ -93,7 +91,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css) | |||
93 | if (!freezer) | 91 | if (!freezer) |
94 | return ERR_PTR(-ENOMEM); | 92 | return ERR_PTR(-ENOMEM); |
95 | 93 | ||
96 | spin_lock_init(&freezer->lock); | ||
97 | return &freezer->css; | 94 | return &freezer->css; |
98 | } | 95 | } |
99 | 96 | ||
@@ -110,14 +107,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) | |||
110 | struct freezer *freezer = css_freezer(css); | 107 | struct freezer *freezer = css_freezer(css); |
111 | struct freezer *parent = parent_freezer(freezer); | 108 | struct freezer *parent = parent_freezer(freezer); |
112 | 109 | ||
113 | /* | 110 | mutex_lock(&freezer_mutex); |
114 | * The following double locking and freezing state inheritance | ||
115 | * guarantee that @cgroup can never escape ancestors' freezing | ||
116 | * states. See css_for_each_descendant_pre() for details. | ||
117 | */ | ||
118 | if (parent) | ||
119 | spin_lock_irq(&parent->lock); | ||
120 | spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING); | ||
121 | 111 | ||
122 | freezer->state |= CGROUP_FREEZER_ONLINE; | 112 | freezer->state |= CGROUP_FREEZER_ONLINE; |
123 | 113 | ||
@@ -126,10 +116,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) | |||
126 | atomic_inc(&system_freezing_cnt); | 116 | atomic_inc(&system_freezing_cnt); |
127 | } | 117 | } |
128 | 118 | ||
129 | spin_unlock(&freezer->lock); | 119 | mutex_unlock(&freezer_mutex); |
130 | if (parent) | ||
131 | spin_unlock_irq(&parent->lock); | ||
132 | |||
133 | return 0; | 120 | return 0; |
134 | } | 121 | } |
135 | 122 | ||
@@ -144,14 +131,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css) | |||
144 | { | 131 | { |
145 | struct freezer *freezer = css_freezer(css); | 132 | struct freezer *freezer = css_freezer(css); |
146 | 133 | ||
147 | spin_lock_irq(&freezer->lock); | 134 | mutex_lock(&freezer_mutex); |
148 | 135 | ||
149 | if (freezer->state & CGROUP_FREEZING) | 136 | if (freezer->state & CGROUP_FREEZING) |
150 | atomic_dec(&system_freezing_cnt); | 137 | atomic_dec(&system_freezing_cnt); |
151 | 138 | ||
152 | freezer->state = 0; | 139 | freezer->state = 0; |
153 | 140 | ||
154 | spin_unlock_irq(&freezer->lock); | 141 | mutex_unlock(&freezer_mutex); |
155 | } | 142 | } |
156 | 143 | ||
157 | static void freezer_css_free(struct cgroup_subsys_state *css) | 144 | static void freezer_css_free(struct cgroup_subsys_state *css) |
@@ -175,7 +162,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, | |||
175 | struct task_struct *task; | 162 | struct task_struct *task; |
176 | bool clear_frozen = false; | 163 | bool clear_frozen = false; |
177 | 164 | ||
178 | spin_lock_irq(&freezer->lock); | 165 | mutex_lock(&freezer_mutex); |
179 | 166 | ||
180 | /* | 167 | /* |
181 | * Make the new tasks conform to the current state of @new_css. | 168 | * Make the new tasks conform to the current state of @new_css. |
@@ -197,21 +184,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, | |||
197 | } | 184 | } |
198 | } | 185 | } |
199 | 186 | ||
200 | spin_unlock_irq(&freezer->lock); | 187 | /* propagate FROZEN clearing upwards */ |
201 | |||
202 | /* | ||
203 | * Propagate FROZEN clearing upwards. We may race with | ||
204 | * update_if_frozen(), but as long as both work bottom-up, either | ||
205 | * update_if_frozen() sees child's FROZEN cleared or we clear the | ||
206 | * parent's FROZEN later. No parent w/ !FROZEN children can be | ||
207 | * left FROZEN. | ||
208 | */ | ||
209 | while (clear_frozen && (freezer = parent_freezer(freezer))) { | 188 | while (clear_frozen && (freezer = parent_freezer(freezer))) { |
210 | spin_lock_irq(&freezer->lock); | ||
211 | freezer->state &= ~CGROUP_FROZEN; | 189 | freezer->state &= ~CGROUP_FROZEN; |
212 | clear_frozen = freezer->state & CGROUP_FREEZING; | 190 | clear_frozen = freezer->state & CGROUP_FREEZING; |
213 | spin_unlock_irq(&freezer->lock); | ||
214 | } | 191 | } |
192 | |||
193 | mutex_unlock(&freezer_mutex); | ||
215 | } | 194 | } |
216 | 195 | ||
217 | /** | 196 | /** |
@@ -228,9 +207,6 @@ static void freezer_fork(struct task_struct *task) | |||
228 | { | 207 | { |
229 | struct freezer *freezer; | 208 | struct freezer *freezer; |
230 | 209 | ||
231 | rcu_read_lock(); | ||
232 | freezer = task_freezer(task); | ||
233 | |||
234 | /* | 210 | /* |
235 | * The root cgroup is non-freezable, so we can skip locking the | 211 | * The root cgroup is non-freezable, so we can skip locking the |
236 | * freezer. This is safe regardless of race with task migration. | 212 | * freezer. This is safe regardless of race with task migration. |
@@ -238,24 +214,18 @@ static void freezer_fork(struct task_struct *task) | |||
238 | * to do. If we lost and root is the new cgroup, noop is still the | 214 | * to do. If we lost and root is the new cgroup, noop is still the |
239 | * right thing to do. | 215 | * right thing to do. |
240 | */ | 216 | */ |
241 | if (!parent_freezer(freezer)) | 217 | if (task_css_is_root(task, freezer_cgrp_id)) |
242 | goto out; | 218 | return; |
243 | 219 | ||
244 | /* | 220 | mutex_lock(&freezer_mutex); |
245 | * Grab @freezer->lock and freeze @task after verifying @task still | 221 | rcu_read_lock(); |
246 | * belongs to @freezer and it's freezing. The former is for the | 222 | |
247 | * case where we have raced against task migration and lost and | 223 | freezer = task_freezer(task); |
248 | * @task is already in a different cgroup which may not be frozen. | 224 | if (freezer->state & CGROUP_FREEZING) |
249 | * This isn't strictly necessary as freeze_task() is allowed to be | ||
250 | * called spuriously but let's do it anyway for, if nothing else, | ||
251 | * documentation. | ||
252 | */ | ||
253 | spin_lock_irq(&freezer->lock); | ||
254 | if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING)) | ||
255 | freeze_task(task); | 225 | freeze_task(task); |
256 | spin_unlock_irq(&freezer->lock); | 226 | |
257 | out: | ||
258 | rcu_read_unlock(); | 227 | rcu_read_unlock(); |
228 | mutex_unlock(&freezer_mutex); | ||
259 | } | 229 | } |
260 | 230 | ||
261 | /** | 231 | /** |
@@ -281,22 +251,24 @@ static void update_if_frozen(struct cgroup_subsys_state *css) | |||
281 | struct css_task_iter it; | 251 | struct css_task_iter it; |
282 | struct task_struct *task; | 252 | struct task_struct *task; |
283 | 253 | ||
284 | WARN_ON_ONCE(!rcu_read_lock_held()); | 254 | lockdep_assert_held(&freezer_mutex); |
285 | |||
286 | spin_lock_irq(&freezer->lock); | ||
287 | 255 | ||
288 | if (!(freezer->state & CGROUP_FREEZING) || | 256 | if (!(freezer->state & CGROUP_FREEZING) || |
289 | (freezer->state & CGROUP_FROZEN)) | 257 | (freezer->state & CGROUP_FROZEN)) |
290 | goto out_unlock; | 258 | return; |
291 | 259 | ||
292 | /* are all (live) children frozen? */ | 260 | /* are all (live) children frozen? */ |
261 | rcu_read_lock(); | ||
293 | css_for_each_child(pos, css) { | 262 | css_for_each_child(pos, css) { |
294 | struct freezer *child = css_freezer(pos); | 263 | struct freezer *child = css_freezer(pos); |
295 | 264 | ||
296 | if ((child->state & CGROUP_FREEZER_ONLINE) && | 265 | if ((child->state & CGROUP_FREEZER_ONLINE) && |
297 | !(child->state & CGROUP_FROZEN)) | 266 | !(child->state & CGROUP_FROZEN)) { |
298 | goto out_unlock; | 267 | rcu_read_unlock(); |
268 | return; | ||
269 | } | ||
299 | } | 270 | } |
271 | rcu_read_unlock(); | ||
300 | 272 | ||
301 | /* are all tasks frozen? */ | 273 | /* are all tasks frozen? */ |
302 | css_task_iter_start(css, &it); | 274 | css_task_iter_start(css, &it); |
@@ -317,21 +289,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css) | |||
317 | freezer->state |= CGROUP_FROZEN; | 289 | freezer->state |= CGROUP_FROZEN; |
318 | out_iter_end: | 290 | out_iter_end: |
319 | css_task_iter_end(&it); | 291 | css_task_iter_end(&it); |
320 | out_unlock: | ||
321 | spin_unlock_irq(&freezer->lock); | ||
322 | } | 292 | } |
323 | 293 | ||
324 | static int freezer_read(struct seq_file *m, void *v) | 294 | static int freezer_read(struct seq_file *m, void *v) |
325 | { | 295 | { |
326 | struct cgroup_subsys_state *css = seq_css(m), *pos; | 296 | struct cgroup_subsys_state *css = seq_css(m), *pos; |
327 | 297 | ||
298 | mutex_lock(&freezer_mutex); | ||
328 | rcu_read_lock(); | 299 | rcu_read_lock(); |
329 | 300 | ||
330 | /* update states bottom-up */ | 301 | /* update states bottom-up */ |
331 | css_for_each_descendant_post(pos, css) | 302 | css_for_each_descendant_post(pos, css) { |
303 | if (!css_tryget_online(pos)) | ||
304 | continue; | ||
305 | rcu_read_unlock(); | ||
306 | |||
332 | update_if_frozen(pos); | 307 | update_if_frozen(pos); |
333 | 308 | ||
309 | rcu_read_lock(); | ||
310 | css_put(pos); | ||
311 | } | ||
312 | |||
334 | rcu_read_unlock(); | 313 | rcu_read_unlock(); |
314 | mutex_unlock(&freezer_mutex); | ||
335 | 315 | ||
336 | seq_puts(m, freezer_state_strs(css_freezer(css)->state)); | 316 | seq_puts(m, freezer_state_strs(css_freezer(css)->state)); |
337 | seq_putc(m, '\n'); | 317 | seq_putc(m, '\n'); |
@@ -373,7 +353,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, | |||
373 | unsigned int state) | 353 | unsigned int state) |
374 | { | 354 | { |
375 | /* also synchronizes against task migration, see freezer_attach() */ | 355 | /* also synchronizes against task migration, see freezer_attach() */ |
376 | lockdep_assert_held(&freezer->lock); | 356 | lockdep_assert_held(&freezer_mutex); |
377 | 357 | ||
378 | if (!(freezer->state & CGROUP_FREEZER_ONLINE)) | 358 | if (!(freezer->state & CGROUP_FREEZER_ONLINE)) |
379 | return; | 359 | return; |
@@ -414,47 +394,47 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) | |||
414 | * descendant will try to inherit its parent's FREEZING state as | 394 | * descendant will try to inherit its parent's FREEZING state as |
415 | * CGROUP_FREEZING_PARENT. | 395 | * CGROUP_FREEZING_PARENT. |
416 | */ | 396 | */ |
397 | mutex_lock(&freezer_mutex); | ||
417 | rcu_read_lock(); | 398 | rcu_read_lock(); |
418 | css_for_each_descendant_pre(pos, &freezer->css) { | 399 | css_for_each_descendant_pre(pos, &freezer->css) { |
419 | struct freezer *pos_f = css_freezer(pos); | 400 | struct freezer *pos_f = css_freezer(pos); |
420 | struct freezer *parent = parent_freezer(pos_f); | 401 | struct freezer *parent = parent_freezer(pos_f); |
421 | 402 | ||
422 | spin_lock_irq(&pos_f->lock); | 403 | if (!css_tryget_online(pos)) |
404 | continue; | ||
405 | rcu_read_unlock(); | ||
423 | 406 | ||
424 | if (pos_f == freezer) { | 407 | if (pos_f == freezer) |
425 | freezer_apply_state(pos_f, freeze, | 408 | freezer_apply_state(pos_f, freeze, |
426 | CGROUP_FREEZING_SELF); | 409 | CGROUP_FREEZING_SELF); |
427 | } else { | 410 | else |
428 | /* | ||
429 | * Our update to @parent->state is already visible | ||
430 | * which is all we need. No need to lock @parent. | ||
431 | * For more info on synchronization, see | ||
432 | * freezer_post_create(). | ||
433 | */ | ||
434 | freezer_apply_state(pos_f, | 411 | freezer_apply_state(pos_f, |
435 | parent->state & CGROUP_FREEZING, | 412 | parent->state & CGROUP_FREEZING, |
436 | CGROUP_FREEZING_PARENT); | 413 | CGROUP_FREEZING_PARENT); |
437 | } | ||
438 | 414 | ||
439 | spin_unlock_irq(&pos_f->lock); | 415 | rcu_read_lock(); |
416 | css_put(pos); | ||
440 | } | 417 | } |
441 | rcu_read_unlock(); | 418 | rcu_read_unlock(); |
419 | mutex_unlock(&freezer_mutex); | ||
442 | } | 420 | } |
443 | 421 | ||
444 | static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, | 422 | static ssize_t freezer_write(struct kernfs_open_file *of, |
445 | char *buffer) | 423 | char *buf, size_t nbytes, loff_t off) |
446 | { | 424 | { |
447 | bool freeze; | 425 | bool freeze; |
448 | 426 | ||
449 | if (strcmp(buffer, freezer_state_strs(0)) == 0) | 427 | buf = strstrip(buf); |
428 | |||
429 | if (strcmp(buf, freezer_state_strs(0)) == 0) | ||
450 | freeze = false; | 430 | freeze = false; |
451 | else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) | 431 | else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) |
452 | freeze = true; | 432 | freeze = true; |
453 | else | 433 | else |
454 | return -EINVAL; | 434 | return -EINVAL; |
455 | 435 | ||
456 | freezer_change_state(css_freezer(css), freeze); | 436 | freezer_change_state(css_freezer(of_css(of)), freeze); |
457 | return 0; | 437 | return nbytes; |
458 | } | 438 | } |
459 | 439 | ||
460 | static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, | 440 | static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, |
@@ -478,7 +458,7 @@ static struct cftype files[] = { | |||
478 | .name = "state", | 458 | .name = "state", |
479 | .flags = CFTYPE_NOT_ON_ROOT, | 459 | .flags = CFTYPE_NOT_ON_ROOT, |
480 | .seq_show = freezer_read, | 460 | .seq_show = freezer_read, |
481 | .write_string = freezer_write, | 461 | .write = freezer_write, |
482 | }, | 462 | }, |
483 | { | 463 | { |
484 | .name = "self_freezing", | 464 | .name = "self_freezing", |
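The switch from ->write_string() to the kernfs-backed ->write() callback seen in freezer_write() recurs throughout this series: the handler receives the raw buffer, strips it itself, and returns the number of bytes consumed on success. A hedged sketch of that contract, with example_write() and its do_update() helper purely hypothetical:

        static ssize_t example_write(struct kernfs_open_file *of,
                                     char *buf, size_t nbytes, loff_t off)
        {
                struct cgroup_subsys_state *css = of_css(of);
                int ret;

                buf = strstrip(buf);            /* handler trims whitespace itself */
                ret = do_update(css, buf);      /* hypothetical: apply the new value */
                return ret ?: nbytes;           /* bytes consumed, or the error */
        }

The matching cftype entry then sets .write = example_write rather than .write_string.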
diff --git a/kernel/compat.c b/kernel/compat.c index e40b0430b562..633394f442f8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -157,7 +157,7 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp | |||
157 | int compat_get_timeval(struct timeval *tv, const void __user *utv) | 157 | int compat_get_timeval(struct timeval *tv, const void __user *utv) |
158 | { | 158 | { |
159 | if (COMPAT_USE_64BIT_TIME) | 159 | if (COMPAT_USE_64BIT_TIME) |
160 | return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; | 160 | return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0; |
161 | else | 161 | else |
162 | return __compat_get_timeval(tv, utv); | 162 | return __compat_get_timeval(tv, utv); |
163 | } | 163 | } |
@@ -166,7 +166,7 @@ EXPORT_SYMBOL_GPL(compat_get_timeval); | |||
166 | int compat_put_timeval(const struct timeval *tv, void __user *utv) | 166 | int compat_put_timeval(const struct timeval *tv, void __user *utv) |
167 | { | 167 | { |
168 | if (COMPAT_USE_64BIT_TIME) | 168 | if (COMPAT_USE_64BIT_TIME) |
169 | return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; | 169 | return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0; |
170 | else | 170 | else |
171 | return __compat_put_timeval(tv, utv); | 171 | return __compat_put_timeval(tv, utv); |
172 | } | 172 | } |
@@ -175,7 +175,7 @@ EXPORT_SYMBOL_GPL(compat_put_timeval); | |||
175 | int compat_get_timespec(struct timespec *ts, const void __user *uts) | 175 | int compat_get_timespec(struct timespec *ts, const void __user *uts) |
176 | { | 176 | { |
177 | if (COMPAT_USE_64BIT_TIME) | 177 | if (COMPAT_USE_64BIT_TIME) |
178 | return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; | 178 | return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; |
179 | else | 179 | else |
180 | return __compat_get_timespec(ts, uts); | 180 | return __compat_get_timespec(ts, uts); |
181 | } | 181 | } |
@@ -184,7 +184,7 @@ EXPORT_SYMBOL_GPL(compat_get_timespec); | |||
184 | int compat_put_timespec(const struct timespec *ts, void __user *uts) | 184 | int compat_put_timespec(const struct timespec *ts, void __user *uts) |
185 | { | 185 | { |
186 | if (COMPAT_USE_64BIT_TIME) | 186 | if (COMPAT_USE_64BIT_TIME) |
187 | return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; | 187 | return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; |
188 | else | 188 | else |
189 | return __compat_put_timespec(ts, uts); | 189 | return __compat_put_timespec(ts, uts); |
190 | } | 190 | } |
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 6cb20d2e7ee0..019d45008448 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c | |||
@@ -120,7 +120,7 @@ void context_tracking_user_enter(void) | |||
120 | * instead of preempt_schedule() to exit user context if needed before | 120 | * instead of preempt_schedule() to exit user context if needed before |
121 | * calling the scheduler. | 121 | * calling the scheduler. |
122 | */ | 122 | */ |
123 | asmlinkage void __sched notrace preempt_schedule_context(void) | 123 | asmlinkage __visible void __sched notrace preempt_schedule_context(void) |
124 | { | 124 | { |
125 | enum ctx_state prev_ctx; | 125 | enum ctx_state prev_ctx; |
126 | 126 | ||
diff --git a/kernel/cpu.c b/kernel/cpu.c index a9e710eef0e2..a343bde710b1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/gfp.h> | 20 | #include <linux/gfp.h> |
21 | #include <linux/suspend.h> | 21 | #include <linux/suspend.h> |
22 | #include <linux/lockdep.h> | 22 | #include <linux/lockdep.h> |
23 | #include <trace/events/power.h> | ||
23 | 24 | ||
24 | #include "smpboot.h" | 25 | #include "smpboot.h" |
25 | 26 | ||
@@ -283,8 +284,7 @@ static inline void check_for_tasks(int cpu) | |||
283 | task_cputime(p, &utime, &stime); | 284 | task_cputime(p, &utime, &stime); |
284 | if (task_cpu(p) == cpu && p->state == TASK_RUNNING && | 285 | if (task_cpu(p) == cpu && p->state == TASK_RUNNING && |
285 | (utime || stime)) | 286 | (utime || stime)) |
286 | printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " | 287 | pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n", |
287 | "(state = %ld, flags = %x)\n", | ||
288 | p->comm, task_pid_nr(p), cpu, | 288 | p->comm, task_pid_nr(p), cpu, |
289 | p->state, p->flags); | 289 | p->state, p->flags); |
290 | } | 290 | } |
@@ -336,8 +336,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
336 | if (err) { | 336 | if (err) { |
337 | nr_calls--; | 337 | nr_calls--; |
338 | __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); | 338 | __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); |
339 | printk("%s: attempt to take down CPU %u failed\n", | 339 | pr_warn("%s: attempt to take down CPU %u failed\n", |
340 | __func__, cpu); | 340 | __func__, cpu); |
341 | goto out_release; | 341 | goto out_release; |
342 | } | 342 | } |
343 | 343 | ||
@@ -444,8 +444,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) | |||
444 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); | 444 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); |
445 | if (ret) { | 445 | if (ret) { |
446 | nr_calls--; | 446 | nr_calls--; |
447 | printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", | 447 | pr_warn("%s: attempt to bring up CPU %u failed\n", |
448 | __func__, cpu); | 448 | __func__, cpu); |
449 | goto out_notify; | 449 | goto out_notify; |
450 | } | 450 | } |
451 | 451 | ||
@@ -475,11 +475,10 @@ int cpu_up(unsigned int cpu) | |||
475 | int err = 0; | 475 | int err = 0; |
476 | 476 | ||
477 | if (!cpu_possible(cpu)) { | 477 | if (!cpu_possible(cpu)) { |
478 | printk(KERN_ERR "can't online cpu %d because it is not " | 478 | pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n", |
479 | "configured as may-hotadd at boot time\n", cpu); | 479 | cpu); |
480 | #if defined(CONFIG_IA64) | 480 | #if defined(CONFIG_IA64) |
481 | printk(KERN_ERR "please check additional_cpus= boot " | 481 | pr_err("please check additional_cpus= boot parameter\n"); |
482 | "parameter\n"); | ||
483 | #endif | 482 | #endif |
484 | return -EINVAL; | 483 | return -EINVAL; |
485 | } | 484 | } |
@@ -518,16 +517,17 @@ int disable_nonboot_cpus(void) | |||
518 | */ | 517 | */ |
519 | cpumask_clear(frozen_cpus); | 518 | cpumask_clear(frozen_cpus); |
520 | 519 | ||
521 | printk("Disabling non-boot CPUs ...\n"); | 520 | pr_info("Disabling non-boot CPUs ...\n"); |
522 | for_each_online_cpu(cpu) { | 521 | for_each_online_cpu(cpu) { |
523 | if (cpu == first_cpu) | 522 | if (cpu == first_cpu) |
524 | continue; | 523 | continue; |
524 | trace_suspend_resume(TPS("CPU_OFF"), cpu, true); | ||
525 | error = _cpu_down(cpu, 1); | 525 | error = _cpu_down(cpu, 1); |
526 | trace_suspend_resume(TPS("CPU_OFF"), cpu, false); | ||
526 | if (!error) | 527 | if (!error) |
527 | cpumask_set_cpu(cpu, frozen_cpus); | 528 | cpumask_set_cpu(cpu, frozen_cpus); |
528 | else { | 529 | else { |
529 | printk(KERN_ERR "Error taking CPU%d down: %d\n", | 530 | pr_err("Error taking CPU%d down: %d\n", cpu, error); |
530 | cpu, error); | ||
531 | break; | 531 | break; |
532 | } | 532 | } |
533 | } | 533 | } |
@@ -537,7 +537,7 @@ int disable_nonboot_cpus(void) | |||
537 | /* Make sure the CPUs won't be enabled by someone else */ | 537 | /* Make sure the CPUs won't be enabled by someone else */ |
538 | cpu_hotplug_disabled = 1; | 538 | cpu_hotplug_disabled = 1; |
539 | } else { | 539 | } else { |
540 | printk(KERN_ERR "Non-boot CPUs are not disabled\n"); | 540 | pr_err("Non-boot CPUs are not disabled\n"); |
541 | } | 541 | } |
542 | cpu_maps_update_done(); | 542 | cpu_maps_update_done(); |
543 | return error; | 543 | return error; |
@@ -561,17 +561,19 @@ void __ref enable_nonboot_cpus(void) | |||
561 | if (cpumask_empty(frozen_cpus)) | 561 | if (cpumask_empty(frozen_cpus)) |
562 | goto out; | 562 | goto out; |
563 | 563 | ||
564 | printk(KERN_INFO "Enabling non-boot CPUs ...\n"); | 564 | pr_info("Enabling non-boot CPUs ...\n"); |
565 | 565 | ||
566 | arch_enable_nonboot_cpus_begin(); | 566 | arch_enable_nonboot_cpus_begin(); |
567 | 567 | ||
568 | for_each_cpu(cpu, frozen_cpus) { | 568 | for_each_cpu(cpu, frozen_cpus) { |
569 | trace_suspend_resume(TPS("CPU_ON"), cpu, true); | ||
569 | error = _cpu_up(cpu, 1); | 570 | error = _cpu_up(cpu, 1); |
571 | trace_suspend_resume(TPS("CPU_ON"), cpu, false); | ||
570 | if (!error) { | 572 | if (!error) { |
571 | printk(KERN_INFO "CPU%d is up\n", cpu); | 573 | pr_info("CPU%d is up\n", cpu); |
572 | continue; | 574 | continue; |
573 | } | 575 | } |
574 | printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); | 576 | pr_warn("Error taking CPU%d up: %d\n", cpu, error); |
575 | } | 577 | } |
576 | 578 | ||
577 | arch_enable_nonboot_cpus_end(); | 579 | arch_enable_nonboot_cpus_end(); |
@@ -726,10 +728,12 @@ void set_cpu_present(unsigned int cpu, bool present) | |||
726 | 728 | ||
727 | void set_cpu_online(unsigned int cpu, bool online) | 729 | void set_cpu_online(unsigned int cpu, bool online) |
728 | { | 730 | { |
729 | if (online) | 731 | if (online) { |
730 | cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); | 732 | cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); |
731 | else | 733 | cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); |
734 | } else { | ||
732 | cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); | 735 | cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); |
736 | } | ||
733 | } | 737 | } |
734 | 738 | ||
735 | void set_cpu_active(unsigned int cpu, bool active) | 739 | void set_cpu_active(unsigned int cpu, bool active) |
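Most of the churn in this file is the printk() to pr_*() conversion, which also lets previously wrapped format strings live on a single greppable line. A minimal sketch of the pattern, assuming a "cpu: " prefix purely for illustration:

        #define pr_fmt(fmt) "cpu: " fmt         /* must be defined before printk.h is pulled in */

        #include <linux/printk.h>

        static void report_offline_failure(unsigned int cpu, int err)
        {
                pr_warn("attempt to take down CPU %u failed: %d\n", cpu, err);
        }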
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3d54c418bd06..f6b33c696224 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -61,12 +61,7 @@ | |||
61 | #include <linux/cgroup.h> | 61 | #include <linux/cgroup.h> |
62 | #include <linux/wait.h> | 62 | #include <linux/wait.h> |
63 | 63 | ||
64 | /* | 64 | struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; |
65 | * Tracks how many cpusets are currently defined in system. | ||
66 | * When there is only one cpuset (the root cpuset) we can | ||
67 | * short circuit some hooks. | ||
68 | */ | ||
69 | int number_of_cpusets __read_mostly; | ||
70 | 65 | ||
71 | /* See "Frequency meter" comments, below. */ | 66 | /* See "Frequency meter" comments, below. */ |
72 | 67 | ||
@@ -124,7 +119,7 @@ static inline struct cpuset *task_cs(struct task_struct *task) | |||
124 | 119 | ||
125 | static inline struct cpuset *parent_cs(struct cpuset *cs) | 120 | static inline struct cpuset *parent_cs(struct cpuset *cs) |
126 | { | 121 | { |
127 | return css_cs(css_parent(&cs->css)); | 122 | return css_cs(cs->css.parent); |
128 | } | 123 | } |
129 | 124 | ||
130 | #ifdef CONFIG_NUMA | 125 | #ifdef CONFIG_NUMA |
@@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
611 | goto done; | 606 | goto done; |
612 | } | 607 | } |
613 | 608 | ||
614 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); | 609 | csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL); |
615 | if (!csa) | 610 | if (!csa) |
616 | goto done; | 611 | goto done; |
617 | csn = 0; | 612 | csn = 0; |
@@ -696,11 +691,8 @@ restart: | |||
696 | if (nslot == ndoms) { | 691 | if (nslot == ndoms) { |
697 | static int warnings = 10; | 692 | static int warnings = 10; |
698 | if (warnings) { | 693 | if (warnings) { |
699 | printk(KERN_WARNING | 694 | pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", |
700 | "rebuild_sched_domains confused:" | 695 | nslot, ndoms, csn, i, apn); |
701 | " nslot %d, ndoms %d, csn %d, i %d," | ||
702 | " apn %d\n", | ||
703 | nslot, ndoms, csn, i, apn); | ||
704 | warnings--; | 696 | warnings--; |
705 | } | 697 | } |
706 | continue; | 698 | continue; |
@@ -875,7 +867,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) | |||
875 | continue; | 867 | continue; |
876 | } | 868 | } |
877 | } | 869 | } |
878 | if (!css_tryget(&cp->css)) | 870 | if (!css_tryget_online(&cp->css)) |
879 | continue; | 871 | continue; |
880 | rcu_read_unlock(); | 872 | rcu_read_unlock(); |
881 | 873 | ||
@@ -890,6 +882,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) | |||
890 | /** | 882 | /** |
891 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it | 883 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it |
892 | * @cs: the cpuset to consider | 884 | * @cs: the cpuset to consider |
885 | * @trialcs: trial cpuset | ||
893 | * @buf: buffer of cpu numbers written to this cpuset | 886 | * @buf: buffer of cpu numbers written to this cpuset |
894 | */ | 887 | */ |
895 | static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | 888 | static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, |
@@ -1110,7 +1103,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) | |||
1110 | continue; | 1103 | continue; |
1111 | } | 1104 | } |
1112 | } | 1105 | } |
1113 | if (!css_tryget(&cp->css)) | 1106 | if (!css_tryget_online(&cp->css)) |
1114 | continue; | 1107 | continue; |
1115 | rcu_read_unlock(); | 1108 | rcu_read_unlock(); |
1116 | 1109 | ||
@@ -1605,13 +1598,15 @@ out_unlock: | |||
1605 | /* | 1598 | /* |
1606 | * Common handling for a write to a "cpus" or "mems" file. | 1599 | * Common handling for a write to a "cpus" or "mems" file. |
1607 | */ | 1600 | */ |
1608 | static int cpuset_write_resmask(struct cgroup_subsys_state *css, | 1601 | static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, |
1609 | struct cftype *cft, char *buf) | 1602 | char *buf, size_t nbytes, loff_t off) |
1610 | { | 1603 | { |
1611 | struct cpuset *cs = css_cs(css); | 1604 | struct cpuset *cs = css_cs(of_css(of)); |
1612 | struct cpuset *trialcs; | 1605 | struct cpuset *trialcs; |
1613 | int retval = -ENODEV; | 1606 | int retval = -ENODEV; |
1614 | 1607 | ||
1608 | buf = strstrip(buf); | ||
1609 | |||
1615 | /* | 1610 | /* |
1616 | * CPU or memory hotunplug may leave @cs w/o any execution | 1611 | * CPU or memory hotunplug may leave @cs w/o any execution |
1617 | * resources, in which case the hotplug code asynchronously updates | 1612 | * resources, in which case the hotplug code asynchronously updates |
@@ -1635,7 +1630,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css, | |||
1635 | goto out_unlock; | 1630 | goto out_unlock; |
1636 | } | 1631 | } |
1637 | 1632 | ||
1638 | switch (cft->private) { | 1633 | switch (of_cft(of)->private) { |
1639 | case FILE_CPULIST: | 1634 | case FILE_CPULIST: |
1640 | retval = update_cpumask(cs, trialcs, buf); | 1635 | retval = update_cpumask(cs, trialcs, buf); |
1641 | break; | 1636 | break; |
@@ -1650,7 +1645,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css, | |||
1650 | free_trial_cpuset(trialcs); | 1645 | free_trial_cpuset(trialcs); |
1651 | out_unlock: | 1646 | out_unlock: |
1652 | mutex_unlock(&cpuset_mutex); | 1647 | mutex_unlock(&cpuset_mutex); |
1653 | return retval; | 1648 | return retval ?: nbytes; |
1654 | } | 1649 | } |
1655 | 1650 | ||
1656 | /* | 1651 | /* |
@@ -1752,7 +1747,7 @@ static struct cftype files[] = { | |||
1752 | { | 1747 | { |
1753 | .name = "cpus", | 1748 | .name = "cpus", |
1754 | .seq_show = cpuset_common_seq_show, | 1749 | .seq_show = cpuset_common_seq_show, |
1755 | .write_string = cpuset_write_resmask, | 1750 | .write = cpuset_write_resmask, |
1756 | .max_write_len = (100U + 6 * NR_CPUS), | 1751 | .max_write_len = (100U + 6 * NR_CPUS), |
1757 | .private = FILE_CPULIST, | 1752 | .private = FILE_CPULIST, |
1758 | }, | 1753 | }, |
@@ -1760,7 +1755,7 @@ static struct cftype files[] = { | |||
1760 | { | 1755 | { |
1761 | .name = "mems", | 1756 | .name = "mems", |
1762 | .seq_show = cpuset_common_seq_show, | 1757 | .seq_show = cpuset_common_seq_show, |
1763 | .write_string = cpuset_write_resmask, | 1758 | .write = cpuset_write_resmask, |
1764 | .max_write_len = (100U + 6 * MAX_NUMNODES), | 1759 | .max_write_len = (100U + 6 * MAX_NUMNODES), |
1765 | .private = FILE_MEMLIST, | 1760 | .private = FILE_MEMLIST, |
1766 | }, | 1761 | }, |
@@ -1888,7 +1883,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) | |||
1888 | if (is_spread_slab(parent)) | 1883 | if (is_spread_slab(parent)) |
1889 | set_bit(CS_SPREAD_SLAB, &cs->flags); | 1884 | set_bit(CS_SPREAD_SLAB, &cs->flags); |
1890 | 1885 | ||
1891 | number_of_cpusets++; | 1886 | cpuset_inc(); |
1892 | 1887 | ||
1893 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) | 1888 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) |
1894 | goto out_unlock; | 1889 | goto out_unlock; |
@@ -1939,7 +1934,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) | |||
1939 | if (is_sched_load_balance(cs)) | 1934 | if (is_sched_load_balance(cs)) |
1940 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); | 1935 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); |
1941 | 1936 | ||
1942 | number_of_cpusets--; | 1937 | cpuset_dec(); |
1943 | clear_bit(CS_ONLINE, &cs->flags); | 1938 | clear_bit(CS_ONLINE, &cs->flags); |
1944 | 1939 | ||
1945 | mutex_unlock(&cpuset_mutex); | 1940 | mutex_unlock(&cpuset_mutex); |
@@ -1992,7 +1987,6 @@ int __init cpuset_init(void) | |||
1992 | if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) | 1987 | if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) |
1993 | BUG(); | 1988 | BUG(); |
1994 | 1989 | ||
1995 | number_of_cpusets = 1; | ||
1996 | return 0; | 1990 | return 0; |
1997 | } | 1991 | } |
1998 | 1992 | ||
@@ -2017,7 +2011,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | |||
2017 | parent = parent_cs(parent); | 2011 | parent = parent_cs(parent); |
2018 | 2012 | ||
2019 | if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { | 2013 | if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { |
2020 | printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset "); | 2014 | pr_err("cpuset: failed to transfer tasks out of empty cpuset "); |
2021 | pr_cont_cgroup_name(cs->css.cgroup); | 2015 | pr_cont_cgroup_name(cs->css.cgroup); |
2022 | pr_cont("\n"); | 2016 | pr_cont("\n"); |
2023 | } | 2017 | } |
@@ -2155,7 +2149,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
2155 | 2149 | ||
2156 | rcu_read_lock(); | 2150 | rcu_read_lock(); |
2157 | cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { | 2151 | cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { |
2158 | if (cs == &top_cpuset || !css_tryget(&cs->css)) | 2152 | if (cs == &top_cpuset || !css_tryget_online(&cs->css)) |
2159 | continue; | 2153 | continue; |
2160 | rcu_read_unlock(); | 2154 | rcu_read_unlock(); |
2161 | 2155 | ||
@@ -2536,7 +2530,7 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, | |||
2536 | 2530 | ||
2537 | /** | 2531 | /** |
2538 | * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed | 2532 | * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed |
2539 | * @task: pointer to task_struct of some task. | 2533 | * @tsk: pointer to task_struct of some task. |
2540 | * | 2534 | * |
2541 | * Description: Prints @task's name, cpuset name, and cached copy of its | 2535 | * Description: Prints @task's name, cpuset name, and cached copy of its |
2542 | * mems_allowed to the kernel log. | 2536 | * mems_allowed to the kernel log. |
@@ -2554,7 +2548,7 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk) | |||
2554 | cgrp = task_cs(tsk)->css.cgroup; | 2548 | cgrp = task_cs(tsk)->css.cgroup; |
2555 | nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, | 2549 | nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, |
2556 | tsk->mems_allowed); | 2550 | tsk->mems_allowed); |
2557 | printk(KERN_INFO "%s cpuset=", tsk->comm); | 2551 | pr_info("%s cpuset=", tsk->comm); |
2558 | pr_cont_cgroup_name(cgrp); | 2552 | pr_cont_cgroup_name(cgrp); |
2559 | pr_cont(" mems_allowed=%s\n", cpuset_nodelist); | 2553 | pr_cont(" mems_allowed=%s\n", cpuset_nodelist); |
2560 | 2554 | ||
@@ -2646,10 +2640,10 @@ out: | |||
2646 | /* Display task mems_allowed in /proc/<pid>/status file. */ | 2640 | /* Display task mems_allowed in /proc/<pid>/status file. */ |
2647 | void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) | 2641 | void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) |
2648 | { | 2642 | { |
2649 | seq_printf(m, "Mems_allowed:\t"); | 2643 | seq_puts(m, "Mems_allowed:\t"); |
2650 | seq_nodemask(m, &task->mems_allowed); | 2644 | seq_nodemask(m, &task->mems_allowed); |
2651 | seq_printf(m, "\n"); | 2645 | seq_puts(m, "\n"); |
2652 | seq_printf(m, "Mems_allowed_list:\t"); | 2646 | seq_puts(m, "Mems_allowed_list:\t"); |
2653 | seq_nodemask_list(m, &task->mems_allowed); | 2647 | seq_nodemask_list(m, &task->mems_allowed); |
2654 | seq_printf(m, "\n"); | 2648 | seq_puts(m, "\n"); |
2655 | } | 2649 | } |
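Replacing the number_of_cpusets counter with cpusets_enabled_key means the common "no cpusets beyond the root" case costs only a patched jump on hot paths. A sketch of the header-side helpers this change assumes, mirroring what cpuset_inc()/cpuset_dec() above are expected to expand to (illustrative, not the exact header):

        extern struct static_key cpusets_enabled_key;

        static inline bool cpusets_enabled(void)
        {
                /* fast path: a jump label, not a shared-counter read */
                return static_key_false(&cpusets_enabled_key);
        }

        static inline void cpuset_inc(void)
        {
                static_key_slow_inc(&cpusets_enabled_key);
        }

        static inline void cpuset_dec(void)
        {
                static_key_slow_dec(&cpusets_enabled_key);
        }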
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 2956c8da1605..1adf62b39b96 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
@@ -534,7 +534,7 @@ return_normal: | |||
534 | kgdb_info[cpu].exception_state &= | 534 | kgdb_info[cpu].exception_state &= |
535 | ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); | 535 | ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); |
536 | kgdb_info[cpu].enter_kgdb--; | 536 | kgdb_info[cpu].enter_kgdb--; |
537 | smp_mb__before_atomic_dec(); | 537 | smp_mb__before_atomic(); |
538 | atomic_dec(&slaves_in_kgdb); | 538 | atomic_dec(&slaves_in_kgdb); |
539 | dbg_touch_watchdogs(); | 539 | dbg_touch_watchdogs(); |
540 | local_irq_restore(flags); | 540 | local_irq_restore(flags); |
@@ -662,7 +662,7 @@ kgdb_restore: | |||
662 | kgdb_info[cpu].exception_state &= | 662 | kgdb_info[cpu].exception_state &= |
663 | ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); | 663 | ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); |
664 | kgdb_info[cpu].enter_kgdb--; | 664 | kgdb_info[cpu].enter_kgdb--; |
665 | smp_mb__before_atomic_dec(); | 665 | smp_mb__before_atomic(); |
666 | atomic_dec(&masters_in_kgdb); | 666 | atomic_dec(&masters_in_kgdb); |
667 | /* Free kgdb_active */ | 667 | /* Free kgdb_active */ |
668 | atomic_set(&kgdb_active, -1); | 668 | atomic_set(&kgdb_active, -1); |
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index b03e0e814e43..fe15fff5df53 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c | |||
@@ -21,7 +21,7 @@ | |||
21 | static void kdb_show_stack(struct task_struct *p, void *addr) | 21 | static void kdb_show_stack(struct task_struct *p, void *addr) |
22 | { | 22 | { |
23 | int old_lvl = console_loglevel; | 23 | int old_lvl = console_loglevel; |
24 | console_loglevel = 15; | 24 | console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; |
25 | kdb_trap_printk++; | 25 | kdb_trap_printk++; |
26 | kdb_set_current_task(p); | 26 | kdb_set_current_task(p); |
27 | if (addr) { | 27 | if (addr) { |
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 14ff4849262c..7c70812caea5 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c | |||
@@ -710,7 +710,7 @@ kdb_printit: | |||
710 | } | 710 | } |
711 | if (logging) { | 711 | if (logging) { |
712 | saved_loglevel = console_loglevel; | 712 | saved_loglevel = console_loglevel; |
713 | console_loglevel = 0; | 713 | console_loglevel = CONSOLE_LOGLEVEL_SILENT; |
714 | printk(KERN_INFO "%s", kdb_buffer); | 714 | printk(KERN_INFO "%s", kdb_buffer); |
715 | } | 715 | } |
716 | 716 | ||
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 0b097c8a1e50..2f7c760305ca 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
@@ -1091,7 +1091,7 @@ static int kdb_reboot(int argc, const char **argv) | |||
1091 | static void kdb_dumpregs(struct pt_regs *regs) | 1091 | static void kdb_dumpregs(struct pt_regs *regs) |
1092 | { | 1092 | { |
1093 | int old_lvl = console_loglevel; | 1093 | int old_lvl = console_loglevel; |
1094 | console_loglevel = 15; | 1094 | console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; |
1095 | kdb_trap_printk++; | 1095 | kdb_trap_printk++; |
1096 | show_regs(regs); | 1096 | show_regs(regs); |
1097 | kdb_trap_printk--; | 1097 | kdb_trap_printk--; |
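The kdb changes drop the magic console_loglevel values 15 and 0 in favour of named constants; the definitions assumed here, roughly as added to the printk header in this series, are:

        #define CONSOLE_LOGLEVEL_SILENT         0       /* nothing is printed on the console */
        #define CONSOLE_LOGLEVEL_MOTORMOUTH     15      /* print absolutely everything */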
diff --git a/kernel/events/core.c b/kernel/events/core.c index f83a71a3e46d..5fa58e4cffac 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/hw_breakpoint.h> | 39 | #include <linux/hw_breakpoint.h> |
40 | #include <linux/mm_types.h> | 40 | #include <linux/mm_types.h> |
41 | #include <linux/cgroup.h> | 41 | #include <linux/cgroup.h> |
42 | #include <linux/module.h> | ||
42 | 43 | ||
43 | #include "internal.h" | 44 | #include "internal.h" |
44 | 45 | ||
@@ -607,7 +608,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
607 | if (!f.file) | 608 | if (!f.file) |
608 | return -EBADF; | 609 | return -EBADF; |
609 | 610 | ||
610 | css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); | 611 | css = css_tryget_online_from_dir(f.file->f_dentry, |
612 | &perf_event_cgrp_subsys); | ||
611 | if (IS_ERR(css)) { | 613 | if (IS_ERR(css)) { |
612 | ret = PTR_ERR(css); | 614 | ret = PTR_ERR(css); |
613 | goto out; | 615 | goto out; |
@@ -1443,6 +1445,11 @@ group_sched_out(struct perf_event *group_event, | |||
1443 | cpuctx->exclusive = 0; | 1445 | cpuctx->exclusive = 0; |
1444 | } | 1446 | } |
1445 | 1447 | ||
1448 | struct remove_event { | ||
1449 | struct perf_event *event; | ||
1450 | bool detach_group; | ||
1451 | }; | ||
1452 | |||
1446 | /* | 1453 | /* |
1447 | * Cross CPU call to remove a performance event | 1454 | * Cross CPU call to remove a performance event |
1448 | * | 1455 | * |
@@ -1451,12 +1458,15 @@ group_sched_out(struct perf_event *group_event, | |||
1451 | */ | 1458 | */ |
1452 | static int __perf_remove_from_context(void *info) | 1459 | static int __perf_remove_from_context(void *info) |
1453 | { | 1460 | { |
1454 | struct perf_event *event = info; | 1461 | struct remove_event *re = info; |
1462 | struct perf_event *event = re->event; | ||
1455 | struct perf_event_context *ctx = event->ctx; | 1463 | struct perf_event_context *ctx = event->ctx; |
1456 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | 1464 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
1457 | 1465 | ||
1458 | raw_spin_lock(&ctx->lock); | 1466 | raw_spin_lock(&ctx->lock); |
1459 | event_sched_out(event, cpuctx, ctx); | 1467 | event_sched_out(event, cpuctx, ctx); |
1468 | if (re->detach_group) | ||
1469 | perf_group_detach(event); | ||
1460 | list_del_event(event, ctx); | 1470 | list_del_event(event, ctx); |
1461 | if (!ctx->nr_events && cpuctx->task_ctx == ctx) { | 1471 | if (!ctx->nr_events && cpuctx->task_ctx == ctx) { |
1462 | ctx->is_active = 0; | 1472 | ctx->is_active = 0; |
@@ -1481,10 +1491,14 @@ static int __perf_remove_from_context(void *info) | |||
1481 | * When called from perf_event_exit_task, it's OK because the | 1491 | * When called from perf_event_exit_task, it's OK because the |
1482 | * context has been detached from its task. | 1492 | * context has been detached from its task. |
1483 | */ | 1493 | */ |
1484 | static void perf_remove_from_context(struct perf_event *event) | 1494 | static void perf_remove_from_context(struct perf_event *event, bool detach_group) |
1485 | { | 1495 | { |
1486 | struct perf_event_context *ctx = event->ctx; | 1496 | struct perf_event_context *ctx = event->ctx; |
1487 | struct task_struct *task = ctx->task; | 1497 | struct task_struct *task = ctx->task; |
1498 | struct remove_event re = { | ||
1499 | .event = event, | ||
1500 | .detach_group = detach_group, | ||
1501 | }; | ||
1488 | 1502 | ||
1489 | lockdep_assert_held(&ctx->mutex); | 1503 | lockdep_assert_held(&ctx->mutex); |
1490 | 1504 | ||
@@ -1493,12 +1507,12 @@ static void perf_remove_from_context(struct perf_event *event) | |||
1493 | * Per cpu events are removed via an smp call and | 1507 | * Per cpu events are removed via an smp call and |
1494 | * the removal is always successful. | 1508 | * the removal is always successful. |
1495 | */ | 1509 | */ |
1496 | cpu_function_call(event->cpu, __perf_remove_from_context, event); | 1510 | cpu_function_call(event->cpu, __perf_remove_from_context, &re); |
1497 | return; | 1511 | return; |
1498 | } | 1512 | } |
1499 | 1513 | ||
1500 | retry: | 1514 | retry: |
1501 | if (!task_function_call(task, __perf_remove_from_context, event)) | 1515 | if (!task_function_call(task, __perf_remove_from_context, &re)) |
1502 | return; | 1516 | return; |
1503 | 1517 | ||
1504 | raw_spin_lock_irq(&ctx->lock); | 1518 | raw_spin_lock_irq(&ctx->lock); |
@@ -1515,6 +1529,8 @@ retry: | |||
1515 | * Since the task isn't running, it's safe to remove the event; | 1529 | * Since the task isn't running, it's safe to remove the event; |
1516 | * holding the ctx->lock ensures the task won't get scheduled in. | 1530 | * holding the ctx->lock ensures the task won't get scheduled in. |
1517 | */ | 1531 | */ |
1532 | if (detach_group) | ||
1533 | perf_group_detach(event); | ||
1518 | list_del_event(event, ctx); | 1534 | list_del_event(event, ctx); |
1519 | raw_spin_unlock_irq(&ctx->lock); | 1535 | raw_spin_unlock_irq(&ctx->lock); |
1520 | } | 1536 | } |
@@ -1663,6 +1679,8 @@ event_sched_in(struct perf_event *event, | |||
1663 | u64 tstamp = perf_event_time(event); | 1679 | u64 tstamp = perf_event_time(event); |
1664 | int ret = 0; | 1680 | int ret = 0; |
1665 | 1681 | ||
1682 | lockdep_assert_held(&ctx->lock); | ||
1683 | |||
1666 | if (event->state <= PERF_EVENT_STATE_OFF) | 1684 | if (event->state <= PERF_EVENT_STATE_OFF) |
1667 | return 0; | 1685 | return 0; |
1668 | 1686 | ||
@@ -2956,6 +2974,22 @@ out: | |||
2956 | local_irq_restore(flags); | 2974 | local_irq_restore(flags); |
2957 | } | 2975 | } |
2958 | 2976 | ||
2977 | void perf_event_exec(void) | ||
2978 | { | ||
2979 | struct perf_event_context *ctx; | ||
2980 | int ctxn; | ||
2981 | |||
2982 | rcu_read_lock(); | ||
2983 | for_each_task_context_nr(ctxn) { | ||
2984 | ctx = current->perf_event_ctxp[ctxn]; | ||
2985 | if (!ctx) | ||
2986 | continue; | ||
2987 | |||
2988 | perf_event_enable_on_exec(ctx); | ||
2989 | } | ||
2990 | rcu_read_unlock(); | ||
2991 | } | ||
2992 | |||
2959 | /* | 2993 | /* |
2960 | * Cross CPU call to read the hardware event | 2994 | * Cross CPU call to read the hardware event |
2961 | */ | 2995 | */ |
@@ -3178,7 +3212,8 @@ static void free_event_rcu(struct rcu_head *head) | |||
3178 | } | 3212 | } |
3179 | 3213 | ||
3180 | static void ring_buffer_put(struct ring_buffer *rb); | 3214 | static void ring_buffer_put(struct ring_buffer *rb); |
3181 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); | 3215 | static void ring_buffer_attach(struct perf_event *event, |
3216 | struct ring_buffer *rb); | ||
3182 | 3217 | ||
3183 | static void unaccount_event_cpu(struct perf_event *event, int cpu) | 3218 | static void unaccount_event_cpu(struct perf_event *event, int cpu) |
3184 | { | 3219 | { |
@@ -3229,17 +3264,19 @@ static void __free_event(struct perf_event *event) | |||
3229 | if (event->ctx) | 3264 | if (event->ctx) |
3230 | put_ctx(event->ctx); | 3265 | put_ctx(event->ctx); |
3231 | 3266 | ||
3267 | if (event->pmu) | ||
3268 | module_put(event->pmu->module); | ||
3269 | |||
3232 | call_rcu(&event->rcu_head, free_event_rcu); | 3270 | call_rcu(&event->rcu_head, free_event_rcu); |
3233 | } | 3271 | } |
3234 | static void free_event(struct perf_event *event) | 3272 | |
3273 | static void _free_event(struct perf_event *event) | ||
3235 | { | 3274 | { |
3236 | irq_work_sync(&event->pending); | 3275 | irq_work_sync(&event->pending); |
3237 | 3276 | ||
3238 | unaccount_event(event); | 3277 | unaccount_event(event); |
3239 | 3278 | ||
3240 | if (event->rb) { | 3279 | if (event->rb) { |
3241 | struct ring_buffer *rb; | ||
3242 | |||
3243 | /* | 3280 | /* |
3244 | * Can happen when we close an event with re-directed output. | 3281 | * Can happen when we close an event with re-directed output. |
3245 | * | 3282 | * |
@@ -3247,57 +3284,38 @@ static void free_event(struct perf_event *event) | |||
3247 | * over us; possibly making our ring_buffer_put() the last. | 3284 | * over us; possibly making our ring_buffer_put() the last. |
3248 | */ | 3285 | */ |
3249 | mutex_lock(&event->mmap_mutex); | 3286 | mutex_lock(&event->mmap_mutex); |
3250 | rb = event->rb; | 3287 | ring_buffer_attach(event, NULL); |
3251 | if (rb) { | ||
3252 | rcu_assign_pointer(event->rb, NULL); | ||
3253 | ring_buffer_detach(event, rb); | ||
3254 | ring_buffer_put(rb); /* could be last */ | ||
3255 | } | ||
3256 | mutex_unlock(&event->mmap_mutex); | 3288 | mutex_unlock(&event->mmap_mutex); |
3257 | } | 3289 | } |
3258 | 3290 | ||
3259 | if (is_cgroup_event(event)) | 3291 | if (is_cgroup_event(event)) |
3260 | perf_detach_cgroup(event); | 3292 | perf_detach_cgroup(event); |
3261 | 3293 | ||
3262 | |||
3263 | __free_event(event); | 3294 | __free_event(event); |
3264 | } | 3295 | } |
3265 | 3296 | ||
3266 | int perf_event_release_kernel(struct perf_event *event) | 3297 | /* |
3298 | * Used to free events which have a known refcount of 1, such as in error paths | ||
3299 | * where the event isn't exposed yet and inherited events. | ||
3300 | */ | ||
3301 | static void free_event(struct perf_event *event) | ||
3267 | { | 3302 | { |
3268 | struct perf_event_context *ctx = event->ctx; | 3303 | if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, |
3269 | 3304 | "unexpected event refcount: %ld; ptr=%p\n", | |
3270 | WARN_ON_ONCE(ctx->parent_ctx); | 3305 | atomic_long_read(&event->refcount), event)) { |
3271 | /* | 3306 | /* leak to avoid use-after-free */ |
3272 | * There are two ways this annotation is useful: | 3307 | return; |
3273 | * | 3308 | } |
3274 | * 1) there is a lock recursion from perf_event_exit_task | ||
3275 | * see the comment there. | ||
3276 | * | ||
3277 | * 2) there is a lock-inversion with mmap_sem through | ||
3278 | * perf_event_read_group(), which takes faults while | ||
3279 | * holding ctx->mutex, however this is called after | ||
3280 | * the last filedesc died, so there is no possibility | ||
3281 | * to trigger the AB-BA case. | ||
3282 | */ | ||
3283 | mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); | ||
3284 | raw_spin_lock_irq(&ctx->lock); | ||
3285 | perf_group_detach(event); | ||
3286 | raw_spin_unlock_irq(&ctx->lock); | ||
3287 | perf_remove_from_context(event); | ||
3288 | mutex_unlock(&ctx->mutex); | ||
3289 | |||
3290 | free_event(event); | ||
3291 | 3309 | ||
3292 | return 0; | 3310 | _free_event(event); |
3293 | } | 3311 | } |
3294 | EXPORT_SYMBOL_GPL(perf_event_release_kernel); | ||
3295 | 3312 | ||
3296 | /* | 3313 | /* |
3297 | * Called when the last reference to the file is gone. | 3314 | * Called when the last reference to the file is gone. |
3298 | */ | 3315 | */ |
3299 | static void put_event(struct perf_event *event) | 3316 | static void put_event(struct perf_event *event) |
3300 | { | 3317 | { |
3318 | struct perf_event_context *ctx = event->ctx; | ||
3301 | struct task_struct *owner; | 3319 | struct task_struct *owner; |
3302 | 3320 | ||
3303 | if (!atomic_long_dec_and_test(&event->refcount)) | 3321 | if (!atomic_long_dec_and_test(&event->refcount)) |
@@ -3336,9 +3354,33 @@ static void put_event(struct perf_event *event) | |||
3336 | put_task_struct(owner); | 3354 | put_task_struct(owner); |
3337 | } | 3355 | } |
3338 | 3356 | ||
3339 | perf_event_release_kernel(event); | 3357 | WARN_ON_ONCE(ctx->parent_ctx); |
3358 | /* | ||
3359 | * There are two ways this annotation is useful: | ||
3360 | * | ||
3361 | * 1) there is a lock recursion from perf_event_exit_task | ||
3362 | * see the comment there. | ||
3363 | * | ||
3364 | * 2) there is a lock-inversion with mmap_sem through | ||
3365 | * perf_event_read_group(), which takes faults while | ||
3366 | * holding ctx->mutex, however this is called after | ||
3367 | * the last filedesc died, so there is no possibility | ||
3368 | * to trigger the AB-BA case. | ||
3369 | */ | ||
3370 | mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); | ||
3371 | perf_remove_from_context(event, true); | ||
3372 | mutex_unlock(&ctx->mutex); | ||
3373 | |||
3374 | _free_event(event); | ||
3340 | } | 3375 | } |
3341 | 3376 | ||
3377 | int perf_event_release_kernel(struct perf_event *event) | ||
3378 | { | ||
3379 | put_event(event); | ||
3380 | return 0; | ||
3381 | } | ||
3382 | EXPORT_SYMBOL_GPL(perf_event_release_kernel); | ||
3383 | |||
3342 | static int perf_release(struct inode *inode, struct file *file) | 3384 | static int perf_release(struct inode *inode, struct file *file) |
3343 | { | 3385 | { |
3344 | put_event(file->private_data); | 3386 | put_event(file->private_data); |
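
The hunk above splits event teardown into two disciplines: free_event() only accepts an object whose refcount is known to be exactly 1 and trades it for 0 with a compare-and-swap (warning and deliberately leaking on mismatch), while put_event() is the ordinary decrement-and-test path. A minimal userspace sketch of that pattern, using C11 atomics; struct obj and the function names are invented for illustration, this is not the kernel code itself:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_long refcount;
	/* payload ... */
};

/* Free an object whose refcount is known to be 1 (error paths, not-yet-exposed objects). */
static void obj_free_known_single_ref(struct obj *o)
{
	long expected = 1;

	if (!atomic_compare_exchange_strong(&o->refcount, &expected, 0)) {
		/* Unexpected refcount: warn and leak rather than risk a use-after-free. */
		fprintf(stderr, "unexpected refcount %ld for %p\n", expected, (void *)o);
		return;
	}
	free(o);
}

/* Normal release path: the last reference frees the object. */
static void obj_put(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcount, 1) == 1)
		free(o);
}

int main(void)
{
	struct obj *o = calloc(1, sizeof(*o));

	atomic_store(&o->refcount, 1);
	obj_free_known_single_ref(o);	/* frees: the refcount really was 1 */
	(void)obj_put;
	return 0;
}

Leaking on an unexpected count is the conservative choice: a bogus refcount means some other path still believes it owns a reference, so freeing would be worse than leaking.
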
@@ -3839,28 +3881,47 @@ unlock: | |||
3839 | static void ring_buffer_attach(struct perf_event *event, | 3881 | static void ring_buffer_attach(struct perf_event *event, |
3840 | struct ring_buffer *rb) | 3882 | struct ring_buffer *rb) |
3841 | { | 3883 | { |
3884 | struct ring_buffer *old_rb = NULL; | ||
3842 | unsigned long flags; | 3885 | unsigned long flags; |
3843 | 3886 | ||
3844 | if (!list_empty(&event->rb_entry)) | 3887 | if (event->rb) { |
3845 | return; | 3888 | /* |
3889 | * Should be impossible, we set this when removing | ||
3890 | * event->rb_entry and wait/clear when adding event->rb_entry. | ||
3891 | */ | ||
3892 | WARN_ON_ONCE(event->rcu_pending); | ||
3846 | 3893 | ||
3847 | spin_lock_irqsave(&rb->event_lock, flags); | 3894 | old_rb = event->rb; |
3848 | if (list_empty(&event->rb_entry)) | 3895 | event->rcu_batches = get_state_synchronize_rcu(); |
3849 | list_add(&event->rb_entry, &rb->event_list); | 3896 | event->rcu_pending = 1; |
3850 | spin_unlock_irqrestore(&rb->event_lock, flags); | ||
3851 | } | ||
3852 | 3897 | ||
3853 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) | 3898 | spin_lock_irqsave(&old_rb->event_lock, flags); |
3854 | { | 3899 | list_del_rcu(&event->rb_entry); |
3855 | unsigned long flags; | 3900 | spin_unlock_irqrestore(&old_rb->event_lock, flags); |
3901 | } | ||
3856 | 3902 | ||
3857 | if (list_empty(&event->rb_entry)) | 3903 | if (event->rcu_pending && rb) { |
3858 | return; | 3904 | cond_synchronize_rcu(event->rcu_batches); |
3905 | event->rcu_pending = 0; | ||
3906 | } | ||
3859 | 3907 | ||
3860 | spin_lock_irqsave(&rb->event_lock, flags); | 3908 | if (rb) { |
3861 | list_del_init(&event->rb_entry); | 3909 | spin_lock_irqsave(&rb->event_lock, flags); |
3862 | wake_up_all(&event->waitq); | 3910 | list_add_rcu(&event->rb_entry, &rb->event_list); |
3863 | spin_unlock_irqrestore(&rb->event_lock, flags); | 3911 | spin_unlock_irqrestore(&rb->event_lock, flags); |
3912 | } | ||
3913 | |||
3914 | rcu_assign_pointer(event->rb, rb); | ||
3915 | |||
3916 | if (old_rb) { | ||
3917 | ring_buffer_put(old_rb); | ||
3918 | /* | ||
3919 | * Since we detach the old rb before attaching the new one, we | ||
3920 | * could have missed a wakeup in between. | ||
3921 | * Provide it now. | ||
3922 | */ | ||
3923 | wake_up_all(&event->waitq); | ||
3924 | } | ||
3864 | } | 3925 | } |
3865 | 3926 | ||
3866 | static void ring_buffer_wakeup(struct perf_event *event) | 3927 | static void ring_buffer_wakeup(struct perf_event *event) |
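
The rewritten ring_buffer_attach() folds detach and attach into one helper: unhook from the old buffer, publish the new pointer, and only then deliver a wake-up in case one was lost while no buffer was attached. A loose userspace analogue of that last ordering concern, using a mutex/condvar pair in place of the kernel's machinery (the rcu_batches/cond_synchronize_rcu bookkeeping has no counterpart here; all names are illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct ring { int dummy; };	/* stand-in for the kernel's struct ring_buffer */

static _Atomic(struct ring *) current_rb;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;

/* Swap in a new buffer (or NULL to detach), then wake everyone: a waiter that
 * sampled the old pointer just before the swap must not be left sleeping. */
static void rb_attach(struct ring *new_rb)
{
	atomic_store(&current_rb, new_rb);

	pthread_mutex_lock(&lock);
	pthread_cond_broadcast(&waitq);	/* the "missed wakeup" is provided here */
	pthread_mutex_unlock(&lock);
}

/* Waiter: sleeps until a buffer is attached, re-checking under the lock. */
static struct ring *rb_wait(void)
{
	struct ring *rb;

	pthread_mutex_lock(&lock);
	while ((rb = atomic_load(&current_rb)) == NULL)
		pthread_cond_wait(&waitq, &lock);
	pthread_mutex_unlock(&lock);
	return rb;
}
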
@@ -3929,7 +3990,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
3929 | { | 3990 | { |
3930 | struct perf_event *event = vma->vm_file->private_data; | 3991 | struct perf_event *event = vma->vm_file->private_data; |
3931 | 3992 | ||
3932 | struct ring_buffer *rb = event->rb; | 3993 | struct ring_buffer *rb = ring_buffer_get(event); |
3933 | struct user_struct *mmap_user = rb->mmap_user; | 3994 | struct user_struct *mmap_user = rb->mmap_user; |
3934 | int mmap_locked = rb->mmap_locked; | 3995 | int mmap_locked = rb->mmap_locked; |
3935 | unsigned long size = perf_data_size(rb); | 3996 | unsigned long size = perf_data_size(rb); |
@@ -3937,18 +3998,14 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
3937 | atomic_dec(&rb->mmap_count); | 3998 | atomic_dec(&rb->mmap_count); |
3938 | 3999 | ||
3939 | if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) | 4000 | if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) |
3940 | return; | 4001 | goto out_put; |
3941 | 4002 | ||
3942 | /* Detach current event from the buffer. */ | 4003 | ring_buffer_attach(event, NULL); |
3943 | rcu_assign_pointer(event->rb, NULL); | ||
3944 | ring_buffer_detach(event, rb); | ||
3945 | mutex_unlock(&event->mmap_mutex); | 4004 | mutex_unlock(&event->mmap_mutex); |
3946 | 4005 | ||
3947 | /* If there's still other mmap()s of this buffer, we're done. */ | 4006 | /* If there's still other mmap()s of this buffer, we're done. */ |
3948 | if (atomic_read(&rb->mmap_count)) { | 4007 | if (atomic_read(&rb->mmap_count)) |
3949 | ring_buffer_put(rb); /* can't be last */ | 4008 | goto out_put; |
3950 | return; | ||
3951 | } | ||
3952 | 4009 | ||
3953 | /* | 4010 | /* |
3954 | * No other mmap()s, detach from all other events that might redirect | 4011 | * No other mmap()s, detach from all other events that might redirect |
@@ -3978,11 +4035,9 @@ again: | |||
3978 | * still restart the iteration to make sure we're not now | 4035 | * still restart the iteration to make sure we're not now |
3979 | * iterating the wrong list. | 4036 | * iterating the wrong list. |
3980 | */ | 4037 | */ |
3981 | if (event->rb == rb) { | 4038 | if (event->rb == rb) |
3982 | rcu_assign_pointer(event->rb, NULL); | 4039 | ring_buffer_attach(event, NULL); |
3983 | ring_buffer_detach(event, rb); | 4040 | |
3984 | ring_buffer_put(rb); /* can't be last, we still have one */ | ||
3985 | } | ||
3986 | mutex_unlock(&event->mmap_mutex); | 4041 | mutex_unlock(&event->mmap_mutex); |
3987 | put_event(event); | 4042 | put_event(event); |
3988 | 4043 | ||
@@ -4007,6 +4062,7 @@ again: | |||
4007 | vma->vm_mm->pinned_vm -= mmap_locked; | 4062 | vma->vm_mm->pinned_vm -= mmap_locked; |
4008 | free_uid(mmap_user); | 4063 | free_uid(mmap_user); |
4009 | 4064 | ||
4065 | out_put: | ||
4010 | ring_buffer_put(rb); /* could be last */ | 4066 | ring_buffer_put(rb); /* could be last */ |
4011 | } | 4067 | } |
4012 | 4068 | ||
@@ -4124,7 +4180,6 @@ again: | |||
4124 | vma->vm_mm->pinned_vm += extra; | 4180 | vma->vm_mm->pinned_vm += extra; |
4125 | 4181 | ||
4126 | ring_buffer_attach(event, rb); | 4182 | ring_buffer_attach(event, rb); |
4127 | rcu_assign_pointer(event->rb, rb); | ||
4128 | 4183 | ||
4129 | perf_event_init_userpage(event); | 4184 | perf_event_init_userpage(event); |
4130 | perf_event_update_userpage(event); | 4185 | perf_event_update_userpage(event); |
@@ -5036,21 +5091,9 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
5036 | NULL); | 5091 | NULL); |
5037 | } | 5092 | } |
5038 | 5093 | ||
5039 | void perf_event_comm(struct task_struct *task) | 5094 | void perf_event_comm(struct task_struct *task, bool exec) |
5040 | { | 5095 | { |
5041 | struct perf_comm_event comm_event; | 5096 | struct perf_comm_event comm_event; |
5042 | struct perf_event_context *ctx; | ||
5043 | int ctxn; | ||
5044 | |||
5045 | rcu_read_lock(); | ||
5046 | for_each_task_context_nr(ctxn) { | ||
5047 | ctx = task->perf_event_ctxp[ctxn]; | ||
5048 | if (!ctx) | ||
5049 | continue; | ||
5050 | |||
5051 | perf_event_enable_on_exec(ctx); | ||
5052 | } | ||
5053 | rcu_read_unlock(); | ||
5054 | 5097 | ||
5055 | if (!atomic_read(&nr_comm_events)) | 5098 | if (!atomic_read(&nr_comm_events)) |
5056 | return; | 5099 | return; |
@@ -5062,7 +5105,7 @@ void perf_event_comm(struct task_struct *task) | |||
5062 | .event_id = { | 5105 | .event_id = { |
5063 | .header = { | 5106 | .header = { |
5064 | .type = PERF_RECORD_COMM, | 5107 | .type = PERF_RECORD_COMM, |
5065 | .misc = 0, | 5108 | .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0, |
5066 | /* .size */ | 5109 | /* .size */ |
5067 | }, | 5110 | }, |
5068 | /* .pid */ | 5111 | /* .pid */ |
@@ -5408,6 +5451,9 @@ struct swevent_htable { | |||
5408 | 5451 | ||
5409 | /* Recursion avoidance in each contexts */ | 5452 | /* Recursion avoidance in each contexts */ |
5410 | int recursion[PERF_NR_CONTEXTS]; | 5453 | int recursion[PERF_NR_CONTEXTS]; |
5454 | |||
5455 | /* Keeps track of cpu being initialized/exited */ | ||
5456 | bool online; | ||
5411 | }; | 5457 | }; |
5412 | 5458 | ||
5413 | static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); | 5459 | static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); |
@@ -5654,8 +5700,14 @@ static int perf_swevent_add(struct perf_event *event, int flags) | |||
5654 | hwc->state = !(flags & PERF_EF_START); | 5700 | hwc->state = !(flags & PERF_EF_START); |
5655 | 5701 | ||
5656 | head = find_swevent_head(swhash, event); | 5702 | head = find_swevent_head(swhash, event); |
5657 | if (WARN_ON_ONCE(!head)) | 5703 | if (!head) { |
5704 | /* | ||
5705 | * We can race with cpu hotplug code. Do not | ||
5706 | * WARN if the cpu just got unplugged. | ||
5707 | */ | ||
5708 | WARN_ON_ONCE(swhash->online); | ||
5658 | return -EINVAL; | 5709 | return -EINVAL; |
5710 | } | ||
5659 | 5711 | ||
5660 | hlist_add_head_rcu(&event->hlist_entry, head); | 5712 | hlist_add_head_rcu(&event->hlist_entry, head); |
5661 | 5713 | ||
@@ -6551,6 +6603,7 @@ free_pdc: | |||
6551 | free_percpu(pmu->pmu_disable_count); | 6603 | free_percpu(pmu->pmu_disable_count); |
6552 | goto unlock; | 6604 | goto unlock; |
6553 | } | 6605 | } |
6606 | EXPORT_SYMBOL_GPL(perf_pmu_register); | ||
6554 | 6607 | ||
6555 | void perf_pmu_unregister(struct pmu *pmu) | 6608 | void perf_pmu_unregister(struct pmu *pmu) |
6556 | { | 6609 | { |
@@ -6572,6 +6625,7 @@ void perf_pmu_unregister(struct pmu *pmu) | |||
6572 | put_device(pmu->dev); | 6625 | put_device(pmu->dev); |
6573 | free_pmu_context(pmu); | 6626 | free_pmu_context(pmu); |
6574 | } | 6627 | } |
6628 | EXPORT_SYMBOL_GPL(perf_pmu_unregister); | ||
6575 | 6629 | ||
6576 | struct pmu *perf_init_event(struct perf_event *event) | 6630 | struct pmu *perf_init_event(struct perf_event *event) |
6577 | { | 6631 | { |
@@ -6585,6 +6639,10 @@ struct pmu *perf_init_event(struct perf_event *event) | |||
6585 | pmu = idr_find(&pmu_idr, event->attr.type); | 6639 | pmu = idr_find(&pmu_idr, event->attr.type); |
6586 | rcu_read_unlock(); | 6640 | rcu_read_unlock(); |
6587 | if (pmu) { | 6641 | if (pmu) { |
6642 | if (!try_module_get(pmu->module)) { | ||
6643 | pmu = ERR_PTR(-ENODEV); | ||
6644 | goto unlock; | ||
6645 | } | ||
6588 | event->pmu = pmu; | 6646 | event->pmu = pmu; |
6589 | ret = pmu->event_init(event); | 6647 | ret = pmu->event_init(event); |
6590 | if (ret) | 6648 | if (ret) |
@@ -6593,6 +6651,10 @@ struct pmu *perf_init_event(struct perf_event *event) | |||
6593 | } | 6651 | } |
6594 | 6652 | ||
6595 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 6653 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
6654 | if (!try_module_get(pmu->module)) { | ||
6655 | pmu = ERR_PTR(-ENODEV); | ||
6656 | goto unlock; | ||
6657 | } | ||
6596 | event->pmu = pmu; | 6658 | event->pmu = pmu; |
6597 | ret = pmu->event_init(event); | 6659 | ret = pmu->event_init(event); |
6598 | if (!ret) | 6660 | if (!ret) |
@@ -6771,6 +6833,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
6771 | err_pmu: | 6833 | err_pmu: |
6772 | if (event->destroy) | 6834 | if (event->destroy) |
6773 | event->destroy(event); | 6835 | event->destroy(event); |
6836 | module_put(pmu->module); | ||
6774 | err_ns: | 6837 | err_ns: |
6775 | if (event->ns) | 6838 | if (event->ns) |
6776 | put_pid_ns(event->ns); | 6839 | put_pid_ns(event->ns); |
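
perf_init_event() now pins the providing module with try_module_get() before calling pmu->event_init(), and the error path in perf_event_alloc() drops that pin with module_put(), so a PMU module cannot disappear underneath an event that is still being set up. A hedged userspace sketch of the same "refuse new users once teardown has started" idea; struct provider and the function names are made up, and the real try_module_get() is considerably more involved:

#include <stdatomic.h>
#include <stdbool.h>

struct provider {
	atomic_bool unloading;	/* set when the provider starts tearing down */
	atomic_long users;	/* outstanding pins */
};

/* Fails once teardown has begun, so callers never enter code that may vanish. */
static bool provider_tryget(struct provider *p)
{
	atomic_fetch_add(&p->users, 1);
	if (atomic_load(&p->unloading)) {
		atomic_fetch_sub(&p->users, 1);
		return false;
	}
	return true;
}

static void provider_put(struct provider *p)
{
	atomic_fetch_sub(&p->users, 1);
}

/* Teardown: turn away new users, then wait for existing pins to drain. */
static void provider_unload(struct provider *p)
{
	atomic_store(&p->unloading, true);
	while (atomic_load(&p->users) != 0)
		;	/* spin; a real implementation would sleep */
}
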
@@ -6914,7 +6977,7 @@ err_size: | |||
6914 | static int | 6977 | static int |
6915 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) | 6978 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) |
6916 | { | 6979 | { |
6917 | struct ring_buffer *rb = NULL, *old_rb = NULL; | 6980 | struct ring_buffer *rb = NULL; |
6918 | int ret = -EINVAL; | 6981 | int ret = -EINVAL; |
6919 | 6982 | ||
6920 | if (!output_event) | 6983 | if (!output_event) |
@@ -6942,8 +7005,6 @@ set: | |||
6942 | if (atomic_read(&event->mmap_count)) | 7005 | if (atomic_read(&event->mmap_count)) |
6943 | goto unlock; | 7006 | goto unlock; |
6944 | 7007 | ||
6945 | old_rb = event->rb; | ||
6946 | |||
6947 | if (output_event) { | 7008 | if (output_event) { |
6948 | /* get the rb we want to redirect to */ | 7009 | /* get the rb we want to redirect to */ |
6949 | rb = ring_buffer_get(output_event); | 7010 | rb = ring_buffer_get(output_event); |
@@ -6951,23 +7012,7 @@ set: | |||
6951 | goto unlock; | 7012 | goto unlock; |
6952 | } | 7013 | } |
6953 | 7014 | ||
6954 | if (old_rb) | 7015 | ring_buffer_attach(event, rb); |
6955 | ring_buffer_detach(event, old_rb); | ||
6956 | |||
6957 | if (rb) | ||
6958 | ring_buffer_attach(event, rb); | ||
6959 | |||
6960 | rcu_assign_pointer(event->rb, rb); | ||
6961 | |||
6962 | if (old_rb) { | ||
6963 | ring_buffer_put(old_rb); | ||
6964 | /* | ||
6965 | * Since we detached before setting the new rb, so that we | ||
6966 | * could attach the new rb, we could have missed a wakeup. | ||
6967 | * Provide it now. | ||
6968 | */ | ||
6969 | wake_up_all(&event->waitq); | ||
6970 | } | ||
6971 | 7016 | ||
6972 | ret = 0; | 7017 | ret = 0; |
6973 | unlock: | 7018 | unlock: |
@@ -7018,6 +7063,9 @@ SYSCALL_DEFINE5(perf_event_open, | |||
7018 | if (attr.freq) { | 7063 | if (attr.freq) { |
7019 | if (attr.sample_freq > sysctl_perf_event_sample_rate) | 7064 | if (attr.sample_freq > sysctl_perf_event_sample_rate) |
7020 | return -EINVAL; | 7065 | return -EINVAL; |
7066 | } else { | ||
7067 | if (attr.sample_period & (1ULL << 63)) | ||
7068 | return -EINVAL; | ||
7021 | } | 7069 | } |
7022 | 7070 | ||
7023 | /* | 7071 | /* |
@@ -7055,20 +7103,33 @@ SYSCALL_DEFINE5(perf_event_open, | |||
7055 | } | 7103 | } |
7056 | } | 7104 | } |
7057 | 7105 | ||
7106 | if (task && group_leader && | ||
7107 | group_leader->attr.inherit != attr.inherit) { | ||
7108 | err = -EINVAL; | ||
7109 | goto err_task; | ||
7110 | } | ||
7111 | |||
7058 | get_online_cpus(); | 7112 | get_online_cpus(); |
7059 | 7113 | ||
7060 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, | 7114 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, |
7061 | NULL, NULL); | 7115 | NULL, NULL); |
7062 | if (IS_ERR(event)) { | 7116 | if (IS_ERR(event)) { |
7063 | err = PTR_ERR(event); | 7117 | err = PTR_ERR(event); |
7064 | goto err_task; | 7118 | goto err_cpus; |
7065 | } | 7119 | } |
7066 | 7120 | ||
7067 | if (flags & PERF_FLAG_PID_CGROUP) { | 7121 | if (flags & PERF_FLAG_PID_CGROUP) { |
7068 | err = perf_cgroup_connect(pid, event, &attr, group_leader); | 7122 | err = perf_cgroup_connect(pid, event, &attr, group_leader); |
7069 | if (err) { | 7123 | if (err) { |
7070 | __free_event(event); | 7124 | __free_event(event); |
7071 | goto err_task; | 7125 | goto err_cpus; |
7126 | } | ||
7127 | } | ||
7128 | |||
7129 | if (is_sampling_event(event)) { | ||
7130 | if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { | ||
7131 | err = -ENOTSUPP; | ||
7132 | goto err_alloc; | ||
7072 | } | 7133 | } |
7073 | } | 7134 | } |
7074 | 7135 | ||
@@ -7165,7 +7226,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
7165 | struct perf_event_context *gctx = group_leader->ctx; | 7226 | struct perf_event_context *gctx = group_leader->ctx; |
7166 | 7227 | ||
7167 | mutex_lock(&gctx->mutex); | 7228 | mutex_lock(&gctx->mutex); |
7168 | perf_remove_from_context(group_leader); | 7229 | perf_remove_from_context(group_leader, false); |
7169 | 7230 | ||
7170 | /* | 7231 | /* |
7171 | * Removing from the context ends up with disabled | 7232 | * Removing from the context ends up with disabled |
@@ -7175,7 +7236,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
7175 | perf_event__state_init(group_leader); | 7236 | perf_event__state_init(group_leader); |
7176 | list_for_each_entry(sibling, &group_leader->sibling_list, | 7237 | list_for_each_entry(sibling, &group_leader->sibling_list, |
7177 | group_entry) { | 7238 | group_entry) { |
7178 | perf_remove_from_context(sibling); | 7239 | perf_remove_from_context(sibling, false); |
7179 | perf_event__state_init(sibling); | 7240 | perf_event__state_init(sibling); |
7180 | put_ctx(gctx); | 7241 | put_ctx(gctx); |
7181 | } | 7242 | } |
@@ -7230,8 +7291,9 @@ err_context: | |||
7230 | put_ctx(ctx); | 7291 | put_ctx(ctx); |
7231 | err_alloc: | 7292 | err_alloc: |
7232 | free_event(event); | 7293 | free_event(event); |
7233 | err_task: | 7294 | err_cpus: |
7234 | put_online_cpus(); | 7295 | put_online_cpus(); |
7296 | err_task: | ||
7235 | if (task) | 7297 | if (task) |
7236 | put_task_struct(task); | 7298 | put_task_struct(task); |
7237 | err_group_fd: | 7299 | err_group_fd: |
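
The label shuffle above (err_cpus inserted before err_task) keeps the unwind sequence the exact reverse of the setup sequence: put_online_cpus() pairs with the earlier get_online_cpus(), and only afterwards is the task reference dropped. The general shape of this goto-based cleanup idiom, with placeholder step names standing in for the real acquisitions:

#include <stdio.h>

static int step_a(void) { return 0; }	/* e.g. take a task reference */
static int step_b(void) { return 0; }	/* e.g. get_online_cpus()     */
static int step_c(void) { return -1; }	/* e.g. allocate the event    */

static void undo_a(void) { puts("undo a"); }
static void undo_b(void) { puts("undo b"); }

int do_setup(void)
{
	int err;

	err = step_a();
	if (err)
		goto err_none;
	err = step_b();
	if (err)
		goto err_a;
	err = step_c();
	if (err)
		goto err_b;
	return 0;

err_b:			/* reverse order: undo b before a */
	undo_b();
err_a:
	undo_a();
err_none:
	return err;
}

Falling through the labels in reverse order is what makes adding a new setup step cheap: only one new label and one new goto target are needed.
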
@@ -7305,7 +7367,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) | |||
7305 | mutex_lock(&src_ctx->mutex); | 7367 | mutex_lock(&src_ctx->mutex); |
7306 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, | 7368 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, |
7307 | event_entry) { | 7369 | event_entry) { |
7308 | perf_remove_from_context(event); | 7370 | perf_remove_from_context(event, false); |
7309 | unaccount_event_cpu(event, src_cpu); | 7371 | unaccount_event_cpu(event, src_cpu); |
7310 | put_ctx(src_ctx); | 7372 | put_ctx(src_ctx); |
7311 | list_add(&event->migrate_entry, &events); | 7373 | list_add(&event->migrate_entry, &events); |
@@ -7367,13 +7429,7 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
7367 | struct perf_event_context *child_ctx, | 7429 | struct perf_event_context *child_ctx, |
7368 | struct task_struct *child) | 7430 | struct task_struct *child) |
7369 | { | 7431 | { |
7370 | if (child_event->parent) { | 7432 | perf_remove_from_context(child_event, true); |
7371 | raw_spin_lock_irq(&child_ctx->lock); | ||
7372 | perf_group_detach(child_event); | ||
7373 | raw_spin_unlock_irq(&child_ctx->lock); | ||
7374 | } | ||
7375 | |||
7376 | perf_remove_from_context(child_event); | ||
7377 | 7433 | ||
7378 | /* | 7434 | /* |
7379 | * It can happen that the parent exits first, and has events | 7435 | * It can happen that the parent exits first, and has events |
@@ -7388,7 +7444,7 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
7388 | 7444 | ||
7389 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | 7445 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) |
7390 | { | 7446 | { |
7391 | struct perf_event *child_event, *tmp; | 7447 | struct perf_event *child_event, *next; |
7392 | struct perf_event_context *child_ctx; | 7448 | struct perf_event_context *child_ctx; |
7393 | unsigned long flags; | 7449 | unsigned long flags; |
7394 | 7450 | ||
@@ -7442,24 +7498,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | |||
7442 | */ | 7498 | */ |
7443 | mutex_lock(&child_ctx->mutex); | 7499 | mutex_lock(&child_ctx->mutex); |
7444 | 7500 | ||
7445 | again: | 7501 | list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) |
7446 | list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, | ||
7447 | group_entry) | ||
7448 | __perf_event_exit_task(child_event, child_ctx, child); | 7502 | __perf_event_exit_task(child_event, child_ctx, child); |
7449 | 7503 | ||
7450 | list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, | ||
7451 | group_entry) | ||
7452 | __perf_event_exit_task(child_event, child_ctx, child); | ||
7453 | |||
7454 | /* | ||
7455 | * If the last event was a group event, it will have appended all | ||
7456 | * its siblings to the list, but we obtained 'tmp' before that which | ||
7457 | * will still point to the list head terminating the iteration. | ||
7458 | */ | ||
7459 | if (!list_empty(&child_ctx->pinned_groups) || | ||
7460 | !list_empty(&child_ctx->flexible_groups)) | ||
7461 | goto again; | ||
7462 | |||
7463 | mutex_unlock(&child_ctx->mutex); | 7504 | mutex_unlock(&child_ctx->mutex); |
7464 | 7505 | ||
7465 | put_ctx(child_ctx); | 7506 | put_ctx(child_ctx); |
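
The exit path now makes a single pass over ctx->event_list with list_for_each_entry_safe() instead of walking the two group lists and restarting with goto again. The "safe" variant matters because __perf_event_exit_task() can unlink the entry currently being visited, so the iterator caches the successor before the body runs. A self-contained version of the same pattern on a plain singly linked list:

#include <stdlib.h>

struct node {
	struct node *next;
	int value;
};

/* Remove and free every node; 'next' is saved before the current node is freed,
 * which is exactly why the _safe list iterators exist. */
static void destroy_all(struct node **head)
{
	struct node *cur, *next;

	for (cur = *head; cur != NULL; cur = next) {
		next = cur->next;	/* grab the successor first */
		free(cur);		/* now the current node may go away */
	}
	*head = NULL;
}
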
@@ -7724,6 +7765,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
7724 | * swapped under us. | 7765 | * swapped under us. |
7725 | */ | 7766 | */ |
7726 | parent_ctx = perf_pin_task_context(parent, ctxn); | 7767 | parent_ctx = perf_pin_task_context(parent, ctxn); |
7768 | if (!parent_ctx) | ||
7769 | return 0; | ||
7727 | 7770 | ||
7728 | /* | 7771 | /* |
7729 | * No need to check if parent_ctx != NULL here; since we saw | 7772 | * No need to check if parent_ctx != NULL here; since we saw |
@@ -7835,6 +7878,7 @@ static void perf_event_init_cpu(int cpu) | |||
7835 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); | 7878 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
7836 | 7879 | ||
7837 | mutex_lock(&swhash->hlist_mutex); | 7880 | mutex_lock(&swhash->hlist_mutex); |
7881 | swhash->online = true; | ||
7838 | if (swhash->hlist_refcount > 0) { | 7882 | if (swhash->hlist_refcount > 0) { |
7839 | struct swevent_hlist *hlist; | 7883 | struct swevent_hlist *hlist; |
7840 | 7884 | ||
@@ -7857,14 +7901,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu) | |||
7857 | 7901 | ||
7858 | static void __perf_event_exit_context(void *__info) | 7902 | static void __perf_event_exit_context(void *__info) |
7859 | { | 7903 | { |
7904 | struct remove_event re = { .detach_group = false }; | ||
7860 | struct perf_event_context *ctx = __info; | 7905 | struct perf_event_context *ctx = __info; |
7861 | struct perf_event *event; | ||
7862 | 7906 | ||
7863 | perf_pmu_rotate_stop(ctx->pmu); | 7907 | perf_pmu_rotate_stop(ctx->pmu); |
7864 | 7908 | ||
7865 | rcu_read_lock(); | 7909 | rcu_read_lock(); |
7866 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) | 7910 | list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) |
7867 | __perf_remove_from_context(event); | 7911 | __perf_remove_from_context(&re); |
7868 | rcu_read_unlock(); | 7912 | rcu_read_unlock(); |
7869 | } | 7913 | } |
7870 | 7914 | ||
@@ -7892,6 +7936,7 @@ static void perf_event_exit_cpu(int cpu) | |||
7892 | perf_event_exit_cpu_context(cpu); | 7936 | perf_event_exit_cpu_context(cpu); |
7893 | 7937 | ||
7894 | mutex_lock(&swhash->hlist_mutex); | 7938 | mutex_lock(&swhash->hlist_mutex); |
7939 | swhash->online = false; | ||
7895 | swevent_hlist_release(swhash); | 7940 | swevent_hlist_release(swhash); |
7896 | mutex_unlock(&swhash->hlist_mutex); | 7941 | mutex_unlock(&swhash->hlist_mutex); |
7897 | } | 7942 | } |
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 04709b66369d..c445e392e93f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include "../../mm/internal.h" /* munlock_vma_page */ | 36 | #include "../../mm/internal.h" /* munlock_vma_page */ |
37 | #include <linux/percpu-rwsem.h> | 37 | #include <linux/percpu-rwsem.h> |
38 | #include <linux/task_work.h> | 38 | #include <linux/task_work.h> |
39 | #include <linux/shmem_fs.h> | ||
39 | 40 | ||
40 | #include <linux/uprobes.h> | 41 | #include <linux/uprobes.h> |
41 | 42 | ||
@@ -60,8 +61,6 @@ static struct percpu_rw_semaphore dup_mmap_sem; | |||
60 | 61 | ||
61 | /* Have a copy of original instruction */ | 62 | /* Have a copy of original instruction */ |
62 | #define UPROBE_COPY_INSN 0 | 63 | #define UPROBE_COPY_INSN 0 |
63 | /* Can skip singlestep */ | ||
64 | #define UPROBE_SKIP_SSTEP 1 | ||
65 | 64 | ||
66 | struct uprobe { | 65 | struct uprobe { |
67 | struct rb_node rb_node; /* node in the rb tree */ | 66 | struct rb_node rb_node; /* node in the rb tree */ |
@@ -129,7 +128,7 @@ struct xol_area { | |||
129 | */ | 128 | */ |
130 | static bool valid_vma(struct vm_area_struct *vma, bool is_register) | 129 | static bool valid_vma(struct vm_area_struct *vma, bool is_register) |
131 | { | 130 | { |
132 | vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED; | 131 | vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE; |
133 | 132 | ||
134 | if (is_register) | 133 | if (is_register) |
135 | flags |= VM_WRITE; | 134 | flags |= VM_WRITE; |
@@ -281,18 +280,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t | |||
281 | * supported by that architecture then we need to modify is_trap_at_addr and | 280 | * supported by that architecture then we need to modify is_trap_at_addr and |
282 | * uprobe_write_opcode accordingly. This would never be a problem for archs | 281 | * uprobe_write_opcode accordingly. This would never be a problem for archs |
283 | * that have fixed length instructions. | 282 | * that have fixed length instructions. |
284 | */ | 283 | * |
285 | |||
286 | /* | ||
287 | * uprobe_write_opcode - write the opcode at a given virtual address. | 284 | * uprobe_write_opcode - write the opcode at a given virtual address. |
288 | * @mm: the probed process address space. | 285 | * @mm: the probed process address space. |
289 | * @vaddr: the virtual address to store the opcode. | 286 | * @vaddr: the virtual address to store the opcode. |
290 | * @opcode: opcode to be written at @vaddr. | 287 | * @opcode: opcode to be written at @vaddr. |
291 | * | 288 | * |
292 | * Called with mm->mmap_sem held (for read and with a reference to | 289 | * Called with mm->mmap_sem held for write. |
293 | * mm). | ||
294 | * | ||
295 | * For mm @mm, write the opcode at @vaddr. | ||
296 | * Return 0 (success) or a negative errno. | 290 | * Return 0 (success) or a negative errno. |
297 | */ | 291 | */ |
298 | int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, | 292 | int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, |
@@ -312,21 +306,25 @@ retry: | |||
312 | if (ret <= 0) | 306 | if (ret <= 0) |
313 | goto put_old; | 307 | goto put_old; |
314 | 308 | ||
309 | ret = anon_vma_prepare(vma); | ||
310 | if (ret) | ||
311 | goto put_old; | ||
312 | |||
315 | ret = -ENOMEM; | 313 | ret = -ENOMEM; |
316 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); | 314 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); |
317 | if (!new_page) | 315 | if (!new_page) |
318 | goto put_old; | 316 | goto put_old; |
319 | 317 | ||
320 | __SetPageUptodate(new_page); | 318 | if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) |
319 | goto put_new; | ||
321 | 320 | ||
321 | __SetPageUptodate(new_page); | ||
322 | copy_highpage(new_page, old_page); | 322 | copy_highpage(new_page, old_page); |
323 | copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); | 323 | copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); |
324 | 324 | ||
325 | ret = anon_vma_prepare(vma); | ||
326 | if (ret) | ||
327 | goto put_new; | ||
328 | |||
329 | ret = __replace_page(vma, vaddr, old_page, new_page); | 325 | ret = __replace_page(vma, vaddr, old_page, new_page); |
326 | if (ret) | ||
327 | mem_cgroup_uncharge_page(new_page); | ||
330 | 328 | ||
331 | put_new: | 329 | put_new: |
332 | page_cache_release(new_page); | 330 | page_cache_release(new_page); |
@@ -491,12 +489,9 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) | |||
491 | uprobe->offset = offset; | 489 | uprobe->offset = offset; |
492 | init_rwsem(&uprobe->register_rwsem); | 490 | init_rwsem(&uprobe->register_rwsem); |
493 | init_rwsem(&uprobe->consumer_rwsem); | 491 | init_rwsem(&uprobe->consumer_rwsem); |
494 | /* For now assume that the instruction need not be single-stepped */ | ||
495 | __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); | ||
496 | 492 | ||
497 | /* add to uprobes_tree, sorted on inode:offset */ | 493 | /* add to uprobes_tree, sorted on inode:offset */ |
498 | cur_uprobe = insert_uprobe(uprobe); | 494 | cur_uprobe = insert_uprobe(uprobe); |
499 | |||
500 | /* a uprobe exists for this inode:offset combination */ | 495 | /* a uprobe exists for this inode:offset combination */ |
501 | if (cur_uprobe) { | 496 | if (cur_uprobe) { |
502 | kfree(uprobe); | 497 | kfree(uprobe); |
@@ -542,14 +537,15 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, | |||
542 | void *insn, int nbytes, loff_t offset) | 537 | void *insn, int nbytes, loff_t offset) |
543 | { | 538 | { |
544 | struct page *page; | 539 | struct page *page; |
545 | |||
546 | if (!mapping->a_ops->readpage) | ||
547 | return -EIO; | ||
548 | /* | 540 | /* |
549 | * Ensure that the page that has the original instruction is | 541 | * Ensure that the page that has the original instruction is populated |
550 | * populated and in page-cache. | 542 | * and in page-cache. If ->readpage == NULL it must be shmem_mapping(), |
543 | * see uprobe_register(). | ||
551 | */ | 544 | */ |
552 | page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); | 545 | if (mapping->a_ops->readpage) |
546 | page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); | ||
547 | else | ||
548 | page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT); | ||
553 | if (IS_ERR(page)) | 549 | if (IS_ERR(page)) |
554 | return PTR_ERR(page); | 550 | return PTR_ERR(page); |
555 | 551 | ||
@@ -885,6 +881,9 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * | |||
885 | if (!uc->handler && !uc->ret_handler) | 881 | if (!uc->handler && !uc->ret_handler) |
886 | return -EINVAL; | 882 | return -EINVAL; |
887 | 883 | ||
884 | /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ | ||
885 | if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping)) | ||
886 | return -EIO; | ||
888 | /* Racy, just to catch the obvious mistakes */ | 887 | /* Racy, just to catch the obvious mistakes */ |
889 | if (offset > i_size_read(inode)) | 888 | if (offset > i_size_read(inode)) |
890 | return -EINVAL; | 889 | return -EINVAL; |
@@ -1296,14 +1295,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) | |||
1296 | if (unlikely(!xol_vaddr)) | 1295 | if (unlikely(!xol_vaddr)) |
1297 | return 0; | 1296 | return 0; |
1298 | 1297 | ||
1299 | /* Initialize the slot */ | 1298 | arch_uprobe_copy_ixol(area->page, xol_vaddr, |
1300 | copy_to_page(area->page, xol_vaddr, | 1299 | &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); |
1301 | &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); | ||
1302 | /* | ||
1303 | * We probably need flush_icache_user_range() but it needs vma. | ||
1304 | * This should work on supported architectures too. | ||
1305 | */ | ||
1306 | flush_dcache_page(area->page); | ||
1307 | 1300 | ||
1308 | return xol_vaddr; | 1301 | return xol_vaddr; |
1309 | } | 1302 | } |
@@ -1346,6 +1339,21 @@ static void xol_free_insn_slot(struct task_struct *tsk) | |||
1346 | } | 1339 | } |
1347 | } | 1340 | } |
1348 | 1341 | ||
1342 | void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, | ||
1343 | void *src, unsigned long len) | ||
1344 | { | ||
1345 | /* Initialize the slot */ | ||
1346 | copy_to_page(page, vaddr, src, len); | ||
1347 | |||
1348 | /* | ||
1349 | * We probably need flush_icache_user_range() but it needs vma. | ||
1350 | * This should work on most of architectures by default. If | ||
1351 | * architecture needs to do something different it can define | ||
1352 | * its own version of the function. | ||
1353 | */ | ||
1354 | flush_dcache_page(page); | ||
1355 | } | ||
1356 | |||
1349 | /** | 1357 | /** |
1350 | * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs | 1358 | * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs |
1351 | * @regs: Reflects the saved state of the task after it has hit a breakpoint | 1359 | * @regs: Reflects the saved state of the task after it has hit a breakpoint |
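
arch_uprobe_copy_ixol() is declared __weak, so the generic copy_to_page()-plus-flush_dcache_page() body above is only a default: an architecture that needs different cache or icache maintenance supplies its own strong definition and the linker prefers it. A minimal GCC/Clang illustration of that mechanism (the symbol name arch_hook is invented):

#include <stdio.h>

/* Default implementation; any non-weak definition elsewhere replaces it at link time. */
__attribute__((weak)) void arch_hook(void)
{
	puts("generic default hook");
}

int main(void)
{
	arch_hook();	/* prints the default unless another object file overrides it */
	return 0;
}
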
@@ -1357,6 +1365,16 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) | |||
1357 | return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; | 1365 | return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; |
1358 | } | 1366 | } |
1359 | 1367 | ||
1368 | unsigned long uprobe_get_trap_addr(struct pt_regs *regs) | ||
1369 | { | ||
1370 | struct uprobe_task *utask = current->utask; | ||
1371 | |||
1372 | if (unlikely(utask && utask->active_uprobe)) | ||
1373 | return utask->vaddr; | ||
1374 | |||
1375 | return instruction_pointer(regs); | ||
1376 | } | ||
1377 | |||
1360 | /* | 1378 | /* |
1361 | * Called with no locks held. | 1379 | * Called with no locks held. |
1362 | * Called in context of an exiting or an exec-ing thread. | 1380 |
@@ -1628,20 +1646,6 @@ bool uprobe_deny_signal(void) | |||
1628 | return true; | 1646 | return true; |
1629 | } | 1647 | } |
1630 | 1648 | ||
1631 | /* | ||
1632 | * Avoid singlestepping the original instruction if the original instruction | ||
1633 | * is a NOP or can be emulated. | ||
1634 | */ | ||
1635 | static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) | ||
1636 | { | ||
1637 | if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) { | ||
1638 | if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) | ||
1639 | return true; | ||
1640 | clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); | ||
1641 | } | ||
1642 | return false; | ||
1643 | } | ||
1644 | |||
1645 | static void mmf_recalc_uprobes(struct mm_struct *mm) | 1649 | static void mmf_recalc_uprobes(struct mm_struct *mm) |
1646 | { | 1650 | { |
1647 | struct vm_area_struct *vma; | 1651 | struct vm_area_struct *vma; |
@@ -1868,13 +1872,13 @@ static void handle_swbp(struct pt_regs *regs) | |||
1868 | 1872 | ||
1869 | handler_chain(uprobe, regs); | 1873 | handler_chain(uprobe, regs); |
1870 | 1874 | ||
1871 | if (can_skip_sstep(uprobe, regs)) | 1875 | if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) |
1872 | goto out; | 1876 | goto out; |
1873 | 1877 | ||
1874 | if (!pre_ssout(uprobe, regs, bp_vaddr)) | 1878 | if (!pre_ssout(uprobe, regs, bp_vaddr)) |
1875 | return; | 1879 | return; |
1876 | 1880 | ||
1877 | /* can_skip_sstep() succeeded, or restart if can't singlestep */ | 1881 | /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ |
1878 | out: | 1882 | out: |
1879 | put_uprobe(uprobe); | 1883 | put_uprobe(uprobe); |
1880 | } | 1884 | } |
@@ -1886,10 +1890,11 @@ out: | |||
1886 | static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) | 1890 | static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) |
1887 | { | 1891 | { |
1888 | struct uprobe *uprobe; | 1892 | struct uprobe *uprobe; |
1893 | int err = 0; | ||
1889 | 1894 | ||
1890 | uprobe = utask->active_uprobe; | 1895 | uprobe = utask->active_uprobe; |
1891 | if (utask->state == UTASK_SSTEP_ACK) | 1896 | if (utask->state == UTASK_SSTEP_ACK) |
1892 | arch_uprobe_post_xol(&uprobe->arch, regs); | 1897 | err = arch_uprobe_post_xol(&uprobe->arch, regs); |
1893 | else if (utask->state == UTASK_SSTEP_TRAPPED) | 1898 | else if (utask->state == UTASK_SSTEP_TRAPPED) |
1894 | arch_uprobe_abort_xol(&uprobe->arch, regs); | 1899 | arch_uprobe_abort_xol(&uprobe->arch, regs); |
1895 | else | 1900 | else |
@@ -1903,6 +1908,11 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) | |||
1903 | spin_lock_irq(¤t->sighand->siglock); | 1908 | spin_lock_irq(¤t->sighand->siglock); |
1904 | recalc_sigpending(); /* see uprobe_deny_signal() */ | 1909 | recalc_sigpending(); /* see uprobe_deny_signal() */ |
1905 | spin_unlock_irq(¤t->sighand->siglock); | 1910 | spin_unlock_irq(¤t->sighand->siglock); |
1911 | |||
1912 | if (unlikely(err)) { | ||
1913 | uprobe_warn(current, "execute the probed insn, sending SIGILL."); | ||
1914 | force_sig_info(SIGILL, SEND_SIG_FORCED, current); | ||
1915 | } | ||
1906 | } | 1916 | } |
1907 | 1917 | ||
1908 | /* | 1918 | /* |
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 0dbeae374225..83d4382f5699 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c | |||
@@ -37,7 +37,7 @@ static unsigned long ident_map[32] = { | |||
37 | struct exec_domain default_exec_domain = { | 37 | struct exec_domain default_exec_domain = { |
38 | .name = "Linux", /* name */ | 38 | .name = "Linux", /* name */ |
39 | .handler = default_handler, /* lcall7 causes a seg fault. */ | 39 | .handler = default_handler, /* lcall7 causes a seg fault. */ |
40 | .pers_low = 0, /* PER_LINUX personality. */ | 40 | .pers_low = 0, /* PER_LINUX personality. */ |
41 | .pers_high = 0, /* PER_LINUX personality. */ | 41 | .pers_high = 0, /* PER_LINUX personality. */ |
42 | .signal_map = ident_map, /* Identity map signals. */ | 42 | .signal_map = ident_map, /* Identity map signals. */ |
43 | .signal_invmap = ident_map, /* - both ways. */ | 43 | .signal_invmap = ident_map, /* - both ways. */ |
@@ -83,7 +83,7 @@ lookup_exec_domain(unsigned int personality) | |||
83 | ep = &default_exec_domain; | 83 | ep = &default_exec_domain; |
84 | out: | 84 | out: |
85 | read_unlock(&exec_domains_lock); | 85 | read_unlock(&exec_domains_lock); |
86 | return (ep); | 86 | return ep; |
87 | } | 87 | } |
88 | 88 | ||
89 | int | 89 | int |
@@ -110,8 +110,9 @@ register_exec_domain(struct exec_domain *ep) | |||
110 | 110 | ||
111 | out: | 111 | out: |
112 | write_unlock(&exec_domains_lock); | 112 | write_unlock(&exec_domains_lock); |
113 | return (err); | 113 | return err; |
114 | } | 114 | } |
115 | EXPORT_SYMBOL(register_exec_domain); | ||
115 | 116 | ||
116 | int | 117 | int |
117 | unregister_exec_domain(struct exec_domain *ep) | 118 | unregister_exec_domain(struct exec_domain *ep) |
@@ -133,6 +134,7 @@ unregister: | |||
133 | write_unlock(&exec_domains_lock); | 134 | write_unlock(&exec_domains_lock); |
134 | return 0; | 135 | return 0; |
135 | } | 136 | } |
137 | EXPORT_SYMBOL(unregister_exec_domain); | ||
136 | 138 | ||
137 | int __set_personality(unsigned int personality) | 139 | int __set_personality(unsigned int personality) |
138 | { | 140 | { |
@@ -144,6 +146,7 @@ int __set_personality(unsigned int personality) | |||
144 | 146 | ||
145 | return 0; | 147 | return 0; |
146 | } | 148 | } |
149 | EXPORT_SYMBOL(__set_personality); | ||
147 | 150 | ||
148 | #ifdef CONFIG_PROC_FS | 151 | #ifdef CONFIG_PROC_FS |
149 | static int execdomains_proc_show(struct seq_file *m, void *v) | 152 | static int execdomains_proc_show(struct seq_file *m, void *v) |
@@ -188,8 +191,3 @@ SYSCALL_DEFINE1(personality, unsigned int, personality) | |||
188 | 191 | ||
189 | return old; | 192 | return old; |
190 | } | 193 | } |
191 | |||
192 | |||
193 | EXPORT_SYMBOL(register_exec_domain); | ||
194 | EXPORT_SYMBOL(unregister_exec_domain); | ||
195 | EXPORT_SYMBOL(__set_personality); | ||
diff --git a/kernel/exit.c b/kernel/exit.c index 6ed6a1d552b5..e5c4668f1799 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -313,46 +313,7 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) | |||
313 | } | 313 | } |
314 | } | 314 | } |
315 | 315 | ||
316 | /* | 316 | #ifdef CONFIG_MEMCG |
317 | * Let kernel threads use this to say that they allow a certain signal. | ||
318 | * Must not be used if kthread was cloned with CLONE_SIGHAND. | ||
319 | */ | ||
320 | int allow_signal(int sig) | ||
321 | { | ||
322 | if (!valid_signal(sig) || sig < 1) | ||
323 | return -EINVAL; | ||
324 | |||
325 | spin_lock_irq(¤t->sighand->siglock); | ||
326 | /* This is only needed for daemonize()'ed kthreads */ | ||
327 | sigdelset(¤t->blocked, sig); | ||
328 | /* | ||
329 | * Kernel threads handle their own signals. Let the signal code | ||
330 | * know it'll be handled, so that they don't get converted to | ||
331 | * SIGKILL or just silently dropped. | ||
332 | */ | ||
333 | current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; | ||
334 | recalc_sigpending(); | ||
335 | spin_unlock_irq(¤t->sighand->siglock); | ||
336 | return 0; | ||
337 | } | ||
338 | |||
339 | EXPORT_SYMBOL(allow_signal); | ||
340 | |||
341 | int disallow_signal(int sig) | ||
342 | { | ||
343 | if (!valid_signal(sig) || sig < 1) | ||
344 | return -EINVAL; | ||
345 | |||
346 | spin_lock_irq(¤t->sighand->siglock); | ||
347 | current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; | ||
348 | recalc_sigpending(); | ||
349 | spin_unlock_irq(¤t->sighand->siglock); | ||
350 | return 0; | ||
351 | } | ||
352 | |||
353 | EXPORT_SYMBOL(disallow_signal); | ||
354 | |||
355 | #ifdef CONFIG_MM_OWNER | ||
356 | /* | 317 | /* |
357 | * A task is exiting. If it owned this mm, find a new owner for the mm. | 318 | * A task is exiting. If it owned this mm, find a new owner for the mm. |
358 | */ | 319 | */ |
@@ -395,14 +356,18 @@ retry: | |||
395 | } | 356 | } |
396 | 357 | ||
397 | /* | 358 | /* |
398 | * Search through everything else. We should not get | 359 | * Search through everything else; we should not get here often. |
399 | * here often | ||
400 | */ | 360 | */ |
401 | do_each_thread(g, c) { | 361 | for_each_process(g) { |
402 | if (c->mm == mm) | 362 | if (g->flags & PF_KTHREAD) |
403 | goto assign_new_owner; | 363 | continue; |
404 | } while_each_thread(g, c); | 364 | for_each_thread(g, c) { |
405 | 365 | if (c->mm == mm) | |
366 | goto assign_new_owner; | ||
367 | if (c->mm) | ||
368 | break; | ||
369 | } | ||
370 | } | ||
406 | read_unlock(&tasklist_lock); | 371 | read_unlock(&tasklist_lock); |
407 | /* | 372 | /* |
408 | * We found no owner yet mm_users > 1: this implies that we are | 373 | * We found no owner yet mm_users > 1: this implies that we are |
@@ -434,7 +399,7 @@ assign_new_owner: | |||
434 | task_unlock(c); | 399 | task_unlock(c); |
435 | put_task_struct(c); | 400 | put_task_struct(c); |
436 | } | 401 | } |
437 | #endif /* CONFIG_MM_OWNER */ | 402 | #endif /* CONFIG_MEMCG */ |
438 | 403 | ||
439 | /* | 404 | /* |
440 | * Turn us into a lazy TLB process if we | 405 | * Turn us into a lazy TLB process if we |
diff --git a/kernel/fork.c b/kernel/fork.c index 54a8d26f612f..d2799d1fc952 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -150,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti) | |||
150 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | 150 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, |
151 | int node) | 151 | int node) |
152 | { | 152 | { |
153 | struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, | 153 | struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, |
154 | THREAD_SIZE_ORDER); | 154 | THREAD_SIZE_ORDER); |
155 | 155 | ||
156 | return page ? page_address(page) : NULL; | 156 | return page ? page_address(page) : NULL; |
157 | } | 157 | } |
158 | 158 | ||
159 | static inline void free_thread_info(struct thread_info *ti) | 159 | static inline void free_thread_info(struct thread_info *ti) |
160 | { | 160 | { |
161 | free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); | 161 | free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); |
162 | } | 162 | } |
163 | # else | 163 | # else |
164 | static struct kmem_cache *thread_info_cache; | 164 | static struct kmem_cache *thread_info_cache; |
@@ -1099,12 +1099,12 @@ static void rt_mutex_init_task(struct task_struct *p) | |||
1099 | #endif | 1099 | #endif |
1100 | } | 1100 | } |
1101 | 1101 | ||
1102 | #ifdef CONFIG_MM_OWNER | 1102 | #ifdef CONFIG_MEMCG |
1103 | void mm_init_owner(struct mm_struct *mm, struct task_struct *p) | 1103 | void mm_init_owner(struct mm_struct *mm, struct task_struct *p) |
1104 | { | 1104 | { |
1105 | mm->owner = p; | 1105 | mm->owner = p; |
1106 | } | 1106 | } |
1107 | #endif /* CONFIG_MM_OWNER */ | 1107 | #endif /* CONFIG_MEMCG */ |
1108 | 1108 | ||
1109 | /* | 1109 | /* |
1110 | * Initialize POSIX timer handling for a single task. | 1110 | * Initialize POSIX timer handling for a single task. |
@@ -1606,10 +1606,12 @@ long do_fork(unsigned long clone_flags, | |||
1606 | */ | 1606 | */ |
1607 | if (!IS_ERR(p)) { | 1607 | if (!IS_ERR(p)) { |
1608 | struct completion vfork; | 1608 | struct completion vfork; |
1609 | struct pid *pid; | ||
1609 | 1610 | ||
1610 | trace_sched_process_fork(current, p); | 1611 | trace_sched_process_fork(current, p); |
1611 | 1612 | ||
1612 | nr = task_pid_vnr(p); | 1613 | pid = get_task_pid(p, PIDTYPE_PID); |
1614 | nr = pid_vnr(pid); | ||
1613 | 1615 | ||
1614 | if (clone_flags & CLONE_PARENT_SETTID) | 1616 | if (clone_flags & CLONE_PARENT_SETTID) |
1615 | put_user(nr, parent_tidptr); | 1617 | put_user(nr, parent_tidptr); |
@@ -1624,12 +1626,14 @@ long do_fork(unsigned long clone_flags, | |||
1624 | 1626 | ||
1625 | /* forking complete and child started to run, tell ptracer */ | 1627 | /* forking complete and child started to run, tell ptracer */ |
1626 | if (unlikely(trace)) | 1628 | if (unlikely(trace)) |
1627 | ptrace_event(trace, nr); | 1629 | ptrace_event_pid(trace, pid); |
1628 | 1630 | ||
1629 | if (clone_flags & CLONE_VFORK) { | 1631 | if (clone_flags & CLONE_VFORK) { |
1630 | if (!wait_for_vfork_done(p, &vfork)) | 1632 | if (!wait_for_vfork_done(p, &vfork)) |
1631 | ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); | 1633 | ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); |
1632 | } | 1634 | } |
1635 | |||
1636 | put_pid(pid); | ||
1633 | } else { | 1637 | } else { |
1634 | nr = PTR_ERR(p); | 1638 | nr = PTR_ERR(p); |
1635 | } | 1639 | } |
diff --git a/kernel/futex.c b/kernel/futex.c index 5f589279e462..b632b5f3f094 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -267,7 +267,7 @@ static inline void futex_get_mm(union futex_key *key) | |||
267 | * get_futex_key() implies a full barrier. This is relied upon | 267 | * get_futex_key() implies a full barrier. This is relied upon |
268 | * as full barrier (B), see the ordering comment above. | 268 | * as full barrier (B), see the ordering comment above. |
269 | */ | 269 | */ |
270 | smp_mb__after_atomic_inc(); | 270 | smp_mb__after_atomic(); |
271 | } | 271 | } |
272 | 272 | ||
273 | /* | 273 | /* |
@@ -280,7 +280,7 @@ static inline void hb_waiters_inc(struct futex_hash_bucket *hb) | |||
280 | /* | 280 | /* |
281 | * Full barrier (A), see the ordering comment above. | 281 | * Full barrier (A), see the ordering comment above. |
282 | */ | 282 | */ |
283 | smp_mb__after_atomic_inc(); | 283 | smp_mb__after_atomic(); |
284 | #endif | 284 | #endif |
285 | } | 285 | } |
286 | 286 | ||
@@ -743,6 +743,55 @@ void exit_pi_state_list(struct task_struct *curr) | |||
743 | raw_spin_unlock_irq(&curr->pi_lock); | 743 | raw_spin_unlock_irq(&curr->pi_lock); |
744 | } | 744 | } |
745 | 745 | ||
746 | /* | ||
747 | * We need to check the following states: | ||
748 | * | ||
749 | * Waiter | pi_state | pi->owner | uTID | uODIED | ? | ||
750 | * | ||
751 | * [1] NULL | --- | --- | 0 | 0/1 | Valid | ||
752 | * [2] NULL | --- | --- | >0 | 0/1 | Valid | ||
753 | * | ||
754 | * [3] Found | NULL | -- | Any | 0/1 | Invalid | ||
755 | * | ||
756 | * [4] Found | Found | NULL | 0 | 1 | Valid | ||
757 | * [5] Found | Found | NULL | >0 | 1 | Invalid | ||
758 | * | ||
759 | * [6] Found | Found | task | 0 | 1 | Valid | ||
760 | * | ||
761 | * [7] Found | Found | NULL | Any | 0 | Invalid | ||
762 | * | ||
763 | * [8] Found | Found | task | ==taskTID | 0/1 | Valid | ||
764 | * [9] Found | Found | task | 0 | 0 | Invalid | ||
765 | * [10] Found | Found | task | !=taskTID | 0/1 | Invalid | ||
766 | * | ||
767 | * [1] Indicates that the kernel can acquire the futex atomically. We | ||
768 | * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. | ||
769 | * | ||
770 | * [2] Valid, if TID does not belong to a kernel thread. If no matching | ||
771 | * thread is found then it indicates that the owner TID has died. | ||
772 | * | ||
773 | * [3] Invalid. The waiter is queued on a non PI futex | ||
774 | * | ||
775 | * [4] Valid state after exit_robust_list(), which sets the user space | ||
776 | * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. | ||
777 | * | ||
778 | * [5] The user space value got manipulated between exit_robust_list() | ||
779 | * and exit_pi_state_list() | ||
780 | * | ||
781 | * [6] Valid state after exit_pi_state_list() which sets the new owner in | ||
782 | * the pi_state but cannot access the user space value. | ||
783 | * | ||
784 | * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. | ||
785 | * | ||
786 | * [8] Owner and user space value match | ||
787 | * | ||
788 | * [9] There is no transient state which sets the user space TID to 0 | ||
789 | * except exit_robust_list(), but this is indicated by the | ||
790 | * FUTEX_OWNER_DIED bit. See [4] | ||
791 | * | ||
792 | * [10] There is no transient state which leaves owner and user space | ||
793 | * TID out of sync. | ||
794 | */ | ||
746 | static int | 795 | static int |
747 | lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | 796 | lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, |
748 | union futex_key *key, struct futex_pi_state **ps) | 797 | union futex_key *key, struct futex_pi_state **ps) |
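
The table added above enumerates which combinations of waiter / pi_state / pi->owner / user-space TID / OWNER_DIED are consistent. Purely as an illustration (not kernel code), the ten rows collapse into a small predicate; pid stands for the TID read from the user-space futex word and the helper name is invented:

#include <stdbool.h>

/*
 * Encode rows [1]-[10] of the table above.
 * waiter_found: a futex_q matching the key exists
 * have_pi_state / have_owner: this->pi_state and pi_state->owner are non-NULL
 * pid: TID from the user-space value; owner_tid: task_pid_vnr(pi_state->owner)
 * owner_died: FUTEX_OWNER_DIED was set in the user-space value
 */
static bool pi_state_valid(bool waiter_found, bool have_pi_state, bool have_owner,
			   unsigned int pid, unsigned int owner_tid, bool owner_died)
{
	if (!waiter_found)
		return true;			/* [1] and [2] */
	if (!have_pi_state)
		return false;			/* [3] */
	if (owner_died) {
		if (!have_owner)
			return pid == 0;	/* [4] valid, [5] invalid */
		if (pid == 0)
			return true;		/* [6] */
	} else {
		if (!have_owner)
			return false;		/* [7] */
		if (pid == 0)
			return false;		/* [9] */
	}
	return pid == owner_tid;		/* [8] valid, [10] invalid */
}
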
@@ -755,12 +804,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
755 | plist_for_each_entry_safe(this, next, &hb->chain, list) { | 804 | plist_for_each_entry_safe(this, next, &hb->chain, list) { |
756 | if (match_futex(&this->key, key)) { | 805 | if (match_futex(&this->key, key)) { |
757 | /* | 806 | /* |
758 | * Another waiter already exists - bump up | 807 | * Sanity check the waiter before increasing |
759 | * the refcount and return its pi_state: | 808 | * the refcount and attaching to it. |
760 | */ | 809 | */ |
761 | pi_state = this->pi_state; | 810 | pi_state = this->pi_state; |
762 | /* | 811 | /* |
763 | * Userspace might have messed up non-PI and PI futexes | 812 | * Userspace might have messed up non-PI and |
813 | * PI futexes [3] | ||
764 | */ | 814 | */ |
765 | if (unlikely(!pi_state)) | 815 | if (unlikely(!pi_state)) |
766 | return -EINVAL; | 816 | return -EINVAL; |
@@ -768,34 +818,70 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
768 | WARN_ON(!atomic_read(&pi_state->refcount)); | 818 | WARN_ON(!atomic_read(&pi_state->refcount)); |
769 | 819 | ||
770 | /* | 820 | /* |
771 | * When pi_state->owner is NULL then the owner died | 821 | * Handle the owner died case: |
772 | * and another waiter is on the fly. pi_state->owner | ||
773 | * is fixed up by the task which acquires | ||
774 | * pi_state->rt_mutex. | ||
775 | * | ||
776 | * We do not check for pid == 0 which can happen when | ||
777 | * the owner died and robust_list_exit() cleared the | ||
778 | * TID. | ||
779 | */ | 822 | */ |
780 | if (pid && pi_state->owner) { | 823 | if (uval & FUTEX_OWNER_DIED) { |
824 | /* | ||
825 | * exit_pi_state_list sets owner to NULL and | ||
826 | * wakes the topmost waiter. The task which | ||
827 | * acquires the pi_state->rt_mutex will fixup | ||
828 | * owner. | ||
829 | */ | ||
830 | if (!pi_state->owner) { | ||
831 | /* | ||
832 | * No pi state owner, but the user | ||
833 | * space TID is not 0. Inconsistent | ||
834 | * state. [5] | ||
835 | */ | ||
836 | if (pid) | ||
837 | return -EINVAL; | ||
838 | /* | ||
839 | * Take a ref on the state and | ||
840 | * return. [4] | ||
841 | */ | ||
842 | goto out_state; | ||
843 | } | ||
844 | |||
781 | /* | 845 | /* |
782 | * Bail out if user space manipulated the | 846 | * If TID is 0, then either the dying owner |
783 | * futex value. | 847 | * has not yet executed exit_pi_state_list() |
848 | * or some waiter acquired the rtmutex in the | ||
849 | * pi state, but did not yet fixup the TID in | ||
850 | * user space. | ||
851 | * | ||
852 | * Take a ref on the state and return. [6] | ||
784 | */ | 853 | */ |
785 | if (pid != task_pid_vnr(pi_state->owner)) | 854 | if (!pid) |
855 | goto out_state; | ||
856 | } else { | ||
857 | /* | ||
858 | * If the owner died bit is not set, | ||
859 | * then the pi_state must have an | ||
860 | * owner. [7] | ||
861 | */ | ||
862 | if (!pi_state->owner) | ||
786 | return -EINVAL; | 863 | return -EINVAL; |
787 | } | 864 | } |
788 | 865 | ||
866 | /* | ||
867 | * Bail out if user space manipulated the | ||
868 | * futex value. If pi state exists then the | ||
869 | * owner TID must be the same as the user | ||
870 | * space TID. [9/10] | ||
871 | */ | ||
872 | if (pid != task_pid_vnr(pi_state->owner)) | ||
873 | return -EINVAL; | ||
874 | |||
875 | out_state: | ||
789 | atomic_inc(&pi_state->refcount); | 876 | atomic_inc(&pi_state->refcount); |
790 | *ps = pi_state; | 877 | *ps = pi_state; |
791 | |||
792 | return 0; | 878 | return 0; |
793 | } | 879 | } |
794 | } | 880 | } |
795 | 881 | ||
796 | /* | 882 | /* |
797 | * We are the first waiter - try to look up the real owner and attach | 883 | * We are the first waiter - try to look up the real owner and attach |
798 | * the new pi_state to it, but bail out when TID = 0 | 884 | * the new pi_state to it, but bail out when TID = 0 [1] |
799 | */ | 885 | */ |
800 | if (!pid) | 886 | if (!pid) |
801 | return -ESRCH; | 887 | return -ESRCH; |
@@ -803,6 +889,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
803 | if (!p) | 889 | if (!p) |
804 | return -ESRCH; | 890 | return -ESRCH; |
805 | 891 | ||
892 | if (!p->mm) { | ||
893 | put_task_struct(p); | ||
894 | return -EPERM; | ||
895 | } | ||
896 | |||
806 | /* | 897 | /* |
807 | * We need to look at the task state flags to figure out, | 898 | * We need to look at the task state flags to figure out, |
808 | * whether the task is exiting. To protect against the do_exit | 899 | * whether the task is exiting. To protect against the do_exit |
@@ -823,6 +914,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
823 | return ret; | 914 | return ret; |
824 | } | 915 | } |
825 | 916 | ||
917 | /* | ||
918 | * No existing pi state. First waiter. [2] | ||
919 | */ | ||
826 | pi_state = alloc_pi_state(); | 920 | pi_state = alloc_pi_state(); |
827 | 921 | ||
828 | /* | 922 | /* |
@@ -894,10 +988,18 @@ retry: | |||
894 | return -EDEADLK; | 988 | return -EDEADLK; |
895 | 989 | ||
896 | /* | 990 | /* |
897 | * Surprise - we got the lock. Just return to userspace: | 991 | * Surprise - we got the lock, but we do not trust user space at all. |
898 | */ | 992 | */ |
899 | if (unlikely(!curval)) | 993 | if (unlikely(!curval)) { |
900 | return 1; | 994 | /* |
995 | * We verify whether there is kernel state for this | ||
996 | * futex. If not, we can safely assume that the 0 -> | ||
997 | * TID transition is correct. If state exists, we do | ||
998 | * not bother to fixup the user space state as it was | ||
999 | * corrupted already. | ||
1000 | */ | ||
1001 | return futex_top_waiter(hb, key) ? -EINVAL : 1; | ||
1002 | } | ||
901 | 1003 | ||
902 | uval = curval; | 1004 | uval = curval; |
903 | 1005 | ||
@@ -1028,6 +1130,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
1028 | struct task_struct *new_owner; | 1130 | struct task_struct *new_owner; |
1029 | struct futex_pi_state *pi_state = this->pi_state; | 1131 | struct futex_pi_state *pi_state = this->pi_state; |
1030 | u32 uninitialized_var(curval), newval; | 1132 | u32 uninitialized_var(curval), newval; |
1133 | int ret = 0; | ||
1031 | 1134 | ||
1032 | if (!pi_state) | 1135 | if (!pi_state) |
1033 | return -EINVAL; | 1136 | return -EINVAL; |
@@ -1051,23 +1154,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
1051 | new_owner = this->task; | 1154 | new_owner = this->task; |
1052 | 1155 | ||
1053 | /* | 1156 | /* |
1054 | * We pass it to the next owner. (The WAITERS bit is always | 1157 | * We pass it to the next owner. The WAITERS bit is always |
1055 | * kept enabled while there is PI state around. We must also | 1158 | * kept enabled while there is PI state around. We cleanup the |
1056 | * preserve the owner died bit.) | 1159 | * owner died bit, because we are the owner. |
1057 | */ | 1160 | */ |
1058 | if (!(uval & FUTEX_OWNER_DIED)) { | 1161 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); |
1059 | int ret = 0; | ||
1060 | 1162 | ||
1061 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); | 1163 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) |
1062 | 1164 | ret = -EFAULT; | |
1063 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) | 1165 | else if (curval != uval) |
1064 | ret = -EFAULT; | 1166 | ret = -EINVAL; |
1065 | else if (curval != uval) | 1167 | if (ret) { |
1066 | ret = -EINVAL; | 1168 | raw_spin_unlock(&pi_state->pi_mutex.wait_lock); |
1067 | if (ret) { | 1169 | return ret; |
1068 | raw_spin_unlock(&pi_state->pi_mutex.wait_lock); | ||
1069 | return ret; | ||
1070 | } | ||
1071 | } | 1170 | } |
1072 | 1171 | ||
1073 | raw_spin_lock_irq(&pi_state->owner->pi_lock); | 1172 | raw_spin_lock_irq(&pi_state->owner->pi_lock); |
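
wake_futex_pi() now unconditionally writes the new owner's TID (keeping FUTEX_WAITERS and clearing the OWNER_DIED bit, since the kernel is now the authority) and treats any mismatch between the expected and observed user-space value as corruption. The core of that handoff is a single compare-and-swap; a simplified userspace rendering with C11 atomics, where handoff() is an invented name and error handling is collapsed to one case:

#include <stdatomic.h>
#include <stdint.h>
#include <errno.h>

#define WAITERS_BIT	0x80000000u	/* stands in for FUTEX_WAITERS */

/* Hand the lock word over to new_tid; fail if user space changed it meanwhile. */
static int handoff(_Atomic uint32_t *uaddr, uint32_t expected, uint32_t new_tid)
{
	uint32_t newval = WAITERS_BIT | new_tid;

	if (!atomic_compare_exchange_strong(uaddr, &expected, newval))
		return -EINVAL;	/* the word no longer matches kernel state */
	return 0;
}
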
@@ -1347,7 +1446,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, | |||
1347 | * | 1446 | * |
1348 | * Return: | 1447 | * Return: |
1349 | * 0 - failed to acquire the lock atomically; | 1448 | * 0 - failed to acquire the lock atomically; |
1350 | * 1 - acquired the lock; | 1449 | * >0 - acquired the lock, return value is vpid of the top_waiter |
1351 | * <0 - error | 1450 | * <0 - error |
1352 | */ | 1451 | */ |
1353 | static int futex_proxy_trylock_atomic(u32 __user *pifutex, | 1452 | static int futex_proxy_trylock_atomic(u32 __user *pifutex, |
@@ -1358,7 +1457,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
1358 | { | 1457 | { |
1359 | struct futex_q *top_waiter = NULL; | 1458 | struct futex_q *top_waiter = NULL; |
1360 | u32 curval; | 1459 | u32 curval; |
1361 | int ret; | 1460 | int ret, vpid; |
1362 | 1461 | ||
1363 | if (get_futex_value_locked(&curval, pifutex)) | 1462 | if (get_futex_value_locked(&curval, pifutex)) |
1364 | return -EFAULT; | 1463 | return -EFAULT; |
@@ -1386,11 +1485,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
1386 | * the contended case or if set_waiters is 1. The pi_state is returned | 1485 | * the contended case or if set_waiters is 1. The pi_state is returned |
1387 | * in ps in contended cases. | 1486 | * in ps in contended cases. |
1388 | */ | 1487 | */ |
1488 | vpid = task_pid_vnr(top_waiter->task); | ||
1389 | ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, | 1489 | ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, |
1390 | set_waiters); | 1490 | set_waiters); |
1391 | if (ret == 1) | 1491 | if (ret == 1) { |
1392 | requeue_pi_wake_futex(top_waiter, key2, hb2); | 1492 | requeue_pi_wake_futex(top_waiter, key2, hb2); |
1393 | 1493 | return vpid; | |
1494 | } | ||
1394 | return ret; | 1495 | return ret; |
1395 | } | 1496 | } |
1396 | 1497 | ||
@@ -1421,10 +1522,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, | |||
1421 | struct futex_pi_state *pi_state = NULL; | 1522 | struct futex_pi_state *pi_state = NULL; |
1422 | struct futex_hash_bucket *hb1, *hb2; | 1523 | struct futex_hash_bucket *hb1, *hb2; |
1423 | struct futex_q *this, *next; | 1524 | struct futex_q *this, *next; |
1424 | u32 curval2; | ||
1425 | 1525 | ||
1426 | if (requeue_pi) { | 1526 | if (requeue_pi) { |
1427 | /* | 1527 | /* |
1528 | * Requeue PI only works on two distinct uaddrs. This | ||
1529 | * check is only valid for private futexes. See below. | ||
1530 | */ | ||
1531 | if (uaddr1 == uaddr2) | ||
1532 | return -EINVAL; | ||
1533 | |||
1534 | /* | ||
1428 | * requeue_pi requires a pi_state, try to allocate it now | 1535 | * requeue_pi requires a pi_state, try to allocate it now |
1429 | * without any locks in case it fails. | 1536 | * without any locks in case it fails. |
1430 | */ | 1537 | */ |
@@ -1462,6 +1569,15 @@ retry: | |||
1462 | if (unlikely(ret != 0)) | 1569 | if (unlikely(ret != 0)) |
1463 | goto out_put_key1; | 1570 | goto out_put_key1; |
1464 | 1571 | ||
1572 | /* | ||
1573 | * The check above which compares uaddrs is not sufficient for | ||
1574 | * shared futexes. We need to compare the keys: | ||
1575 | */ | ||
1576 | if (requeue_pi && match_futex(&key1, &key2)) { | ||
1577 | ret = -EINVAL; | ||
1578 | goto out_put_keys; | ||
1579 | } | ||
1580 | |||
1465 | hb1 = hash_futex(&key1); | 1581 | hb1 = hash_futex(&key1); |
1466 | hb2 = hash_futex(&key2); | 1582 | hb2 = hash_futex(&key2); |
1467 | 1583 | ||
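Why a pointer comparison is not enough for shared futexes: two MAP_SHARED mappings of the same file page give distinct user addresses that resolve to the same futex key, which is exactly what the match_futex() check added above (and repeated for futex_wait_requeue_pi further down) catches. A small illustration, with a hypothetical temp file and error handling trimmed:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	int fd = open("/tmp/futex-page", O_RDWR | O_CREAT, 0600);
	ftruncate(fd, 4096);

	void *a = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	void *b = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	/* a != b, yet a futex word at offset 0 of either mapping is the
	 * same shared futex as far as the kernel's futex key goes. */
	printf("uaddr1=%p uaddr2=%p\n", a, b);
	return 0;
}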
@@ -1509,16 +1625,25 @@ retry_private: | |||
1509 | * At this point the top_waiter has either taken uaddr2 or is | 1625 | * At this point the top_waiter has either taken uaddr2 or is |
1510 | * waiting on it. If the former, then the pi_state will not | 1626 | * waiting on it. If the former, then the pi_state will not |
1511 | * exist yet, look it up one more time to ensure we have a | 1627 | * exist yet, look it up one more time to ensure we have a |
1512 | * reference to it. | 1628 | * reference to it. If the lock was taken, ret contains the |
1629 | * vpid of the top waiter task. | ||
1513 | */ | 1630 | */ |
1514 | if (ret == 1) { | 1631 | if (ret > 0) { |
1515 | WARN_ON(pi_state); | 1632 | WARN_ON(pi_state); |
1516 | drop_count++; | 1633 | drop_count++; |
1517 | task_count++; | 1634 | task_count++; |
1518 | ret = get_futex_value_locked(&curval2, uaddr2); | 1635 | /* |
1519 | if (!ret) | 1636 | * If we acquired the lock, then the user |
1520 | ret = lookup_pi_state(curval2, hb2, &key2, | 1637 | * space value of uaddr2 should be vpid. It |
1521 | &pi_state); | 1638 | * cannot be changed by the top waiter as it |
1639 | * is blocked on hb2 lock if it tries to do | ||
1640 | * so. If something fiddled with it behind our | ||
1641 | * back the pi state lookup might unearth | ||
1642 | * it. So we rather use the known value than | ||
1643 | * rereading and handing potential crap to | ||
1644 | * lookup_pi_state. | ||
1645 | */ | ||
1646 | ret = lookup_pi_state(ret, hb2, &key2, &pi_state); | ||
1522 | } | 1647 | } |
1523 | 1648 | ||
1524 | switch (ret) { | 1649 | switch (ret) { |
@@ -2301,9 +2426,10 @@ retry: | |||
2301 | /* | 2426 | /* |
2302 | * To avoid races, try to do the TID -> 0 atomic transition | 2427 | * To avoid races, try to do the TID -> 0 atomic transition |
2303 | * again. If it succeeds then we can return without waking | 2428 | * again. If it succeeds then we can return without waking |
2304 | * anyone else up: | 2429 | * anyone else up. We only try this if neither the waiters nor |
2430 | * the owner died bit are set. | ||
2305 | */ | 2431 | */ |
2306 | if (!(uval & FUTEX_OWNER_DIED) && | 2432 | if (!(uval & ~FUTEX_TID_MASK) && |
2307 | cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) | 2433 | cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) |
2308 | goto pi_faulted; | 2434 | goto pi_faulted; |
2309 | /* | 2435 | /* |
@@ -2333,11 +2459,9 @@ retry: | |||
2333 | /* | 2459 | /* |
2334 | * No waiters - kernel unlocks the futex: | 2460 | * No waiters - kernel unlocks the futex: |
2335 | */ | 2461 | */ |
2336 | if (!(uval & FUTEX_OWNER_DIED)) { | 2462 | ret = unlock_futex_pi(uaddr, uval); |
2337 | ret = unlock_futex_pi(uaddr, uval); | 2463 | if (ret == -EFAULT) |
2338 | if (ret == -EFAULT) | 2464 | goto pi_faulted; |
2339 | goto pi_faulted; | ||
2340 | } | ||
2341 | 2465 | ||
2342 | out_unlock: | 2466 | out_unlock: |
2343 | spin_unlock(&hb->lock); | 2467 | spin_unlock(&hb->lock); |
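The new !(uval & ~FUTEX_TID_MASK) test above means the TID -> 0 shortcut is only attempted when the futex word carries nothing but the owner TID, i.e. neither FUTEX_WAITERS nor FUTEX_OWNER_DIED is set. A hedged sketch of that predicate using the UAPI constants from <linux/futex.h>:

#include <stdint.h>
#include <stdbool.h>
#include <linux/futex.h>	/* FUTEX_TID_MASK, FUTEX_WAITERS, FUTEX_OWNER_DIED */

static bool can_unlock_directly(uint32_t uval, uint32_t my_tid)
{
	/* Any bit outside FUTEX_TID_MASK (WAITERS or OWNER_DIED) means the
	 * unlock must go through the kernel's slow path. */
	return (uval & ~FUTEX_TID_MASK) == 0 &&
	       (uval & FUTEX_TID_MASK) == my_tid;
}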
@@ -2499,6 +2623,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
2499 | if (ret) | 2623 | if (ret) |
2500 | goto out_key2; | 2624 | goto out_key2; |
2501 | 2625 | ||
2626 | /* | ||
2627 | * The check above which compares uaddrs is not sufficient for | ||
2628 | * shared futexes. We need to compare the keys: | ||
2629 | */ | ||
2630 | if (match_futex(&q.key, &key2)) { | ||
2631 | ret = -EINVAL; | ||
2632 | goto out_put_keys; | ||
2633 | } | ||
2634 | |||
2502 | /* Queue the futex_q, drop the hb lock, wait for wakeup. */ | 2635 | /* Queue the futex_q, drop the hb lock, wait for wakeup. */ |
2503 | futex_wait_queue_me(hb, &q, to); | 2636 | futex_wait_queue_me(hb, &q, to); |
2504 | 2637 | ||
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index f45b75b713c0..b358a802fd18 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c | |||
@@ -85,6 +85,12 @@ void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) | |||
85 | } | 85 | } |
86 | EXPORT_SYMBOL(__gcov_merge_ior); | 86 | EXPORT_SYMBOL(__gcov_merge_ior); |
87 | 87 | ||
88 | void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) | ||
89 | { | ||
90 | /* Unused. */ | ||
91 | } | ||
92 | EXPORT_SYMBOL(__gcov_merge_time_profile); | ||
93 | |||
88 | /** | 94 | /** |
89 | * gcov_enable_events - enable event reporting through gcov_event() | 95 | * gcov_enable_events - enable event reporting through gcov_event() |
90 | * | 96 | * |
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 2c6e4631c814..826ba9fb5e32 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c | |||
@@ -18,7 +18,12 @@ | |||
18 | #include <linux/vmalloc.h> | 18 | #include <linux/vmalloc.h> |
19 | #include "gcov.h" | 19 | #include "gcov.h" |
20 | 20 | ||
21 | #if __GNUC__ == 4 && __GNUC_MINOR__ >= 9 | ||
22 | #define GCOV_COUNTERS 9 | ||
23 | #else | ||
21 | #define GCOV_COUNTERS 8 | 24 | #define GCOV_COUNTERS 8 |
25 | #endif | ||
26 | |||
22 | #define GCOV_TAG_FUNCTION_LENGTH 3 | 27 | #define GCOV_TAG_FUNCTION_LENGTH 3 |
23 | 28 | ||
24 | static struct gcov_info *gcov_info_head; | 29 | static struct gcov_info *gcov_info_head; |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 6b715c0af1b1..3ab28993f6e0 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -990,11 +990,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, | |||
990 | /* Remove an active timer from the queue: */ | 990 | /* Remove an active timer from the queue: */ |
991 | ret = remove_hrtimer(timer, base); | 991 | ret = remove_hrtimer(timer, base); |
992 | 992 | ||
993 | /* Switch the timer base, if necessary: */ | ||
994 | new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); | ||
995 | |||
996 | if (mode & HRTIMER_MODE_REL) { | 993 | if (mode & HRTIMER_MODE_REL) { |
997 | tim = ktime_add_safe(tim, new_base->get_time()); | 994 | tim = ktime_add_safe(tim, base->get_time()); |
998 | /* | 995 | /* |
999 | * CONFIG_TIME_LOW_RES is a temporary way for architectures | 996 | * CONFIG_TIME_LOW_RES is a temporary way for architectures |
1000 | * to signal that they simply return xtime in | 997 | * to signal that they simply return xtime in |
@@ -1009,6 +1006,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, | |||
1009 | 1006 | ||
1010 | hrtimer_set_expires_range_ns(timer, tim, delta_ns); | 1007 | hrtimer_set_expires_range_ns(timer, tim, delta_ns); |
1011 | 1008 | ||
1009 | /* Switch the timer base, if necessary: */ | ||
1010 | new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); | ||
1011 | |||
1012 | timer_stats_hrtimer_set_start_info(timer); | 1012 | timer_stats_hrtimer_set_start_info(timer); |
1013 | 1013 | ||
1014 | leftmost = enqueue_hrtimer(timer, new_base); | 1014 | leftmost = enqueue_hrtimer(timer, new_base); |
@@ -1039,6 +1039,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, | |||
1039 | 1039 | ||
1040 | return ret; | 1040 | return ret; |
1041 | } | 1041 | } |
1042 | EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns); | ||
1042 | 1043 | ||
1043 | /** | 1044 | /** |
1044 | * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU | 1045 | * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU |
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 06bb1417b063..06db12434d72 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -52,8 +52,10 @@ unsigned int __read_mostly sysctl_hung_task_panic = | |||
52 | 52 | ||
53 | static int __init hung_task_panic_setup(char *str) | 53 | static int __init hung_task_panic_setup(char *str) |
54 | { | 54 | { |
55 | sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); | 55 | int rc = kstrtouint(str, 0, &sysctl_hung_task_panic); |
56 | 56 | ||
57 | if (rc) | ||
58 | return rc; | ||
57 | return 1; | 59 | return 1; |
58 | } | 60 | } |
59 | __setup("hung_task_panic=", hung_task_panic_setup); | 61 | __setup("hung_task_panic=", hung_task_panic_setup); |
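The switch from simple_strtoul() to kstrtouint() makes hung_task_panic= reject malformed values instead of silently accepting a numeric prefix. A user-space analog of the stricter behaviour, assuming nothing beyond standard C:

#include <errno.h>
#include <limits.h>
#include <stdlib.h>

static int parse_uint_strict(const char *s, unsigned int base, unsigned int *res)
{
	char *end;
	unsigned long val;

	errno = 0;
	val = strtoul(s, &end, base);
	/* Reject overflow, empty input, trailing junk and out-of-range
	 * values -- the failure modes kstrtouint() reports as an error. */
	if (errno || end == s || *end != '\0' || val > UINT_MAX)
		return -EINVAL;
	*res = (unsigned int)val;
	return 0;
}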
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 07cbdfea9ae2..d269cecdfbf0 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
@@ -5,6 +5,10 @@ menu "IRQ subsystem" | |||
5 | config MAY_HAVE_SPARSE_IRQ | 5 | config MAY_HAVE_SPARSE_IRQ |
6 | bool | 6 | bool |
7 | 7 | ||
8 | # Legacy support, required for itanic | ||
9 | config GENERIC_IRQ_LEGACY | ||
10 | bool | ||
11 | |||
8 | # Enable the generic irq autoprobe mechanism | 12 | # Enable the generic irq autoprobe mechanism |
9 | config GENERIC_IRQ_PROBE | 13 | config GENERIC_IRQ_PROBE |
10 | bool | 14 | bool |
@@ -17,6 +21,11 @@ config GENERIC_IRQ_SHOW | |||
17 | config GENERIC_IRQ_SHOW_LEVEL | 21 | config GENERIC_IRQ_SHOW_LEVEL |
18 | bool | 22 | bool |
19 | 23 | ||
24 | # Facility to allocate a hardware interrupt. This is legacy support | ||
25 | # and should not be used in new code. Use irq domains instead. | ||
26 | config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ | ||
27 | bool | ||
28 | |||
20 | # Support for delayed migration from interrupt context | 29 | # Support for delayed migration from interrupt context |
21 | config GENERIC_PENDING_IRQ | 30 | config GENERIC_PENDING_IRQ |
22 | bool | 31 | bool |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6397df2d6945..a2b28a2fd7b1 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -40,10 +40,9 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) | |||
40 | irq_put_desc_unlock(desc, flags); | 40 | irq_put_desc_unlock(desc, flags); |
41 | /* | 41 | /* |
42 | * For !CONFIG_SPARSE_IRQ make the irq show up in | 42 | * For !CONFIG_SPARSE_IRQ make the irq show up in |
43 | * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is | 43 | * allocated_irqs. |
44 | * already marked, and this call is harmless. | ||
45 | */ | 44 | */ |
46 | irq_reserve_irq(irq); | 45 | irq_mark_irq(irq); |
47 | return 0; | 46 | return 0; |
48 | } | 47 | } |
49 | EXPORT_SYMBOL(irq_set_chip); | 48 | EXPORT_SYMBOL(irq_set_chip); |
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ddf1ffeb79f1..099ea2e0eb88 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -33,7 +33,7 @@ enum { | |||
33 | }; | 33 | }; |
34 | 34 | ||
35 | /* | 35 | /* |
36 | * Bit masks for desc->state | 36 | * Bit masks for desc->core_internal_state__do_not_mess_with_it |
37 | * | 37 | * |
38 | * IRQS_AUTODETECT - autodetection in progress | 38 | * IRQS_AUTODETECT - autodetection in progress |
39 | * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt | 39 | * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt |
@@ -76,6 +76,12 @@ extern void mask_irq(struct irq_desc *desc); | |||
76 | extern void unmask_irq(struct irq_desc *desc); | 76 | extern void unmask_irq(struct irq_desc *desc); |
77 | extern void unmask_threaded_irq(struct irq_desc *desc); | 77 | extern void unmask_threaded_irq(struct irq_desc *desc); |
78 | 78 | ||
79 | #ifdef CONFIG_SPARSE_IRQ | ||
80 | static inline void irq_mark_irq(unsigned int irq) { } | ||
81 | #else | ||
82 | extern void irq_mark_irq(unsigned int irq); | ||
83 | #endif | ||
84 | |||
79 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); | 85 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); |
80 | 86 | ||
81 | irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); | 87 | irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index bb07f2928f4b..7339e42a85ab 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
@@ -278,7 +278,12 @@ EXPORT_SYMBOL(irq_to_desc); | |||
278 | 278 | ||
279 | static void free_desc(unsigned int irq) | 279 | static void free_desc(unsigned int irq) |
280 | { | 280 | { |
281 | dynamic_irq_cleanup(irq); | 281 | struct irq_desc *desc = irq_to_desc(irq); |
282 | unsigned long flags; | ||
283 | |||
284 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
285 | desc_set_defaults(irq, desc, desc_node(desc), NULL); | ||
286 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
282 | } | 287 | } |
283 | 288 | ||
284 | static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, | 289 | static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, |
@@ -299,6 +304,20 @@ static int irq_expand_nr_irqs(unsigned int nr) | |||
299 | return -ENOMEM; | 304 | return -ENOMEM; |
300 | } | 305 | } |
301 | 306 | ||
307 | void irq_mark_irq(unsigned int irq) | ||
308 | { | ||
309 | mutex_lock(&sparse_irq_lock); | ||
310 | bitmap_set(allocated_irqs, irq, 1); | ||
311 | mutex_unlock(&sparse_irq_lock); | ||
312 | } | ||
313 | |||
314 | #ifdef CONFIG_GENERIC_IRQ_LEGACY | ||
315 | void irq_init_desc(unsigned int irq) | ||
316 | { | ||
317 | free_desc(irq); | ||
318 | } | ||
319 | #endif | ||
320 | |||
302 | #endif /* !CONFIG_SPARSE_IRQ */ | 321 | #endif /* !CONFIG_SPARSE_IRQ */ |
303 | 322 | ||
304 | /** | 323 | /** |
@@ -396,30 +415,56 @@ err: | |||
396 | } | 415 | } |
397 | EXPORT_SYMBOL_GPL(__irq_alloc_descs); | 416 | EXPORT_SYMBOL_GPL(__irq_alloc_descs); |
398 | 417 | ||
418 | #ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ | ||
399 | /** | 419 | /** |
400 | * irq_reserve_irqs - mark irqs allocated | 420 | * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware |
401 | * @from: mark from irq number | 421 | * @cnt: number of interrupts to allocate |
402 | * @cnt: number of irqs to mark | 422 | * @node: node on which to allocate |
403 | * | 423 | * |
404 | * Returns 0 on success or an appropriate error code | 424 | * Returns an interrupt number > 0 or 0, if the allocation fails. |
405 | */ | 425 | */ |
406 | int irq_reserve_irqs(unsigned int from, unsigned int cnt) | 426 | unsigned int irq_alloc_hwirqs(int cnt, int node) |
407 | { | 427 | { |
408 | unsigned int start; | 428 | int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL); |
409 | int ret = 0; | ||
410 | 429 | ||
411 | if (!cnt || (from + cnt) > nr_irqs) | 430 | if (irq < 0) |
412 | return -EINVAL; | 431 | return 0; |
413 | 432 | ||
414 | mutex_lock(&sparse_irq_lock); | 433 | for (i = irq; cnt > 0; i++, cnt--) { |
415 | start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); | 434 | if (arch_setup_hwirq(i, node)) |
416 | if (start == from) | 435 | goto err; |
417 | bitmap_set(allocated_irqs, start, cnt); | 436 | irq_clear_status_flags(i, _IRQ_NOREQUEST); |
418 | else | 437 | } |
419 | ret = -EEXIST; | 438 | return irq; |
420 | mutex_unlock(&sparse_irq_lock); | 439 | |
421 | return ret; | 440 | err: |
441 | for (i--; i >= irq; i--) { | ||
442 | irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); | ||
443 | arch_teardown_hwirq(i); | ||
444 | } | ||
445 | irq_free_descs(irq, cnt); | ||
446 | return 0; | ||
447 | } | ||
448 | EXPORT_SYMBOL_GPL(irq_alloc_hwirqs); | ||
449 | |||
450 | /** | ||
451 | * irq_free_hwirqs - Free irq descriptor and cleanup the hardware | ||
452 | * @from: Free from irq number | ||
453 | * @cnt: number of interrupts to free | ||
454 | * | ||
455 | */ | ||
456 | void irq_free_hwirqs(unsigned int from, int cnt) | ||
457 | { | ||
458 | int i; | ||
459 | |||
460 | for (i = from; cnt > 0; i++, cnt--) { | ||
461 | irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); | ||
462 | arch_teardown_hwirq(i); | ||
463 | } | ||
464 | irq_free_descs(from, cnt); | ||
422 | } | 465 | } |
466 | EXPORT_SYMBOL_GPL(irq_free_hwirqs); | ||
467 | #endif | ||
423 | 468 | ||
424 | /** | 469 | /** |
425 | * irq_get_next_irq - get next allocated irq number | 470 | * irq_get_next_irq - get next allocated irq number |
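A hedged sketch of how a legacy arch path would consume the two helpers introduced above; the function names and the block size of four are made up for illustration, and arch_setup_hwirq()/arch_teardown_hwirq() remain the architecture's responsibility:

/* Illustrative only -- requires CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ. */
static unsigned int example_grab_irq_block(int node)
{
	unsigned int irq = irq_alloc_hwirqs(4, node);	/* returns 0 on failure */

	if (!irq)
		return 0;

	/* ... program the hardware to raise vectors irq .. irq + 3 ... */
	return irq;
}

static void example_release_irq_block(unsigned int irq)
{
	irq_free_hwirqs(irq, 4);
}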
@@ -482,20 +527,6 @@ int irq_set_percpu_devid(unsigned int irq) | |||
482 | return 0; | 527 | return 0; |
483 | } | 528 | } |
484 | 529 | ||
485 | /** | ||
486 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | ||
487 | * @irq: irq number to initialize | ||
488 | */ | ||
489 | void dynamic_irq_cleanup(unsigned int irq) | ||
490 | { | ||
491 | struct irq_desc *desc = irq_to_desc(irq); | ||
492 | unsigned long flags; | ||
493 | |||
494 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
495 | desc_set_defaults(irq, desc, desc_node(desc), NULL); | ||
496 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
497 | } | ||
498 | |||
499 | void kstat_incr_irq_this_cpu(unsigned int irq) | 530 | void kstat_incr_irq_this_cpu(unsigned int irq) |
500 | { | 531 | { |
501 | kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); | 532 | kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f14033700c25..eb5e10e32e05 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -27,14 +27,14 @@ static struct irq_domain *irq_default_domain; | |||
27 | * __irq_domain_add() - Allocate a new irq_domain data structure | 27 | * __irq_domain_add() - Allocate a new irq_domain data structure |
28 | * @of_node: optional device-tree node of the interrupt controller | 28 | * @of_node: optional device-tree node of the interrupt controller |
29 | * @size: Size of linear map; 0 for radix mapping only | 29 | * @size: Size of linear map; 0 for radix mapping only |
30 | * @hwirq_max: Maximum number of interrupts supported by controller | ||
30 | * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no | 31 | * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no |
31 | * direct mapping | 32 | * direct mapping |
32 | * @ops: map/unmap domain callbacks | 33 | * @ops: map/unmap domain callbacks |
33 | * @host_data: Controller private data pointer | 34 | * @host_data: Controller private data pointer |
34 | * | 35 | * |
35 | * Allocates and initializes an irq_domain structure. Caller is expected to | 36 | * Allocates and initializes an irq_domain structure. |
36 | * register allocated irq_domain with irq_domain_register(). Returns pointer | 37 | * Returns pointer to IRQ domain, or NULL on failure. |
37 | * to IRQ domain, or NULL on failure. | ||
38 | */ | 38 | */ |
39 | struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, | 39 | struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, |
40 | irq_hw_number_t hwirq_max, int direct_max, | 40 | irq_hw_number_t hwirq_max, int direct_max, |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d34131ca372b..3dc6a61bf06a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -886,8 +886,8 @@ static int irq_thread(void *data) | |||
886 | irq_thread_check_affinity(desc, action); | 886 | irq_thread_check_affinity(desc, action); |
887 | 887 | ||
888 | action_ret = handler_fn(desc, action); | 888 | action_ret = handler_fn(desc, action); |
889 | if (!noirqdebug) | 889 | if (action_ret == IRQ_HANDLED) |
890 | note_interrupt(action->irq, desc, action_ret); | 890 | atomic_inc(&desc->threads_handled); |
891 | 891 | ||
892 | wake_threads_waitq(desc); | 892 | wake_threads_waitq(desc); |
893 | } | 893 | } |
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index a1d8cc63b56e..e2514b0e439e 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -270,6 +270,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, | |||
270 | return action && (action->flags & IRQF_IRQPOLL); | 270 | return action && (action->flags & IRQF_IRQPOLL); |
271 | } | 271 | } |
272 | 272 | ||
273 | #define SPURIOUS_DEFERRED 0x80000000 | ||
274 | |||
273 | void note_interrupt(unsigned int irq, struct irq_desc *desc, | 275 | void note_interrupt(unsigned int irq, struct irq_desc *desc, |
274 | irqreturn_t action_ret) | 276 | irqreturn_t action_ret) |
275 | { | 277 | { |
@@ -277,15 +279,111 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
277 | irq_settings_is_polled(desc)) | 279 | irq_settings_is_polled(desc)) |
278 | return; | 280 | return; |
279 | 281 | ||
280 | /* we get here again via the threaded handler */ | ||
281 | if (action_ret == IRQ_WAKE_THREAD) | ||
282 | return; | ||
283 | |||
284 | if (bad_action_ret(action_ret)) { | 282 | if (bad_action_ret(action_ret)) { |
285 | report_bad_irq(irq, desc, action_ret); | 283 | report_bad_irq(irq, desc, action_ret); |
286 | return; | 284 | return; |
287 | } | 285 | } |
288 | 286 | ||
287 | /* | ||
288 | * We cannot call note_interrupt from the threaded handler | ||
289 | * because we need to look at the compound of all handlers | ||
290 | * (primary and threaded). Aside from that, in the threaded | ||
291 | * shared case we have no serialization against an incoming | ||
292 | * hardware interrupt while we are dealing with a threaded | ||
293 | * result. | ||
294 | * | ||
295 | * So in case a thread is woken, we just note the fact and | ||
296 | * defer the analysis to the next hardware interrupt. | ||
297 | * | ||
298 | * The threaded handlers store whether they successfully | ||
299 | * handled an interrupt and we check whether that number | ||
300 | * changed versus the last invocation. | ||
301 | * | ||
302 | * We could handle all interrupts with the delayed by one | ||
303 | * mechanism, but for the non-forced threaded case we'd just | ||
304 | * add pointless overhead to the straight hardirq interrupts | ||
305 | * for the sake of a few lines less code. | ||
306 | */ | ||
307 | if (action_ret & IRQ_WAKE_THREAD) { | ||
308 | /* | ||
309 | * There is a thread woken. Check whether one of the | ||
310 | * shared primary handlers returned IRQ_HANDLED. If | ||
311 | * not we defer the spurious detection to the next | ||
312 | * interrupt. | ||
313 | */ | ||
314 | if (action_ret == IRQ_WAKE_THREAD) { | ||
315 | int handled; | ||
316 | /* | ||
317 | * We use bit 31 of thread_handled_last to | ||
318 | * denote the deferred spurious detection | ||
319 | * active. No locking necessary as | ||
320 | * thread_handled_last is only accessed here | ||
321 | * and we have the guarantee that hard | ||
322 | * interrupts are not reentrant. | ||
323 | */ | ||
324 | if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) { | ||
325 | desc->threads_handled_last |= SPURIOUS_DEFERRED; | ||
326 | return; | ||
327 | } | ||
328 | /* | ||
329 | * Check whether one of the threaded handlers | ||
330 | * returned IRQ_HANDLED since the last | ||
331 | * interrupt happened. | ||
332 | * | ||
333 | * For simplicity we just set bit 31, as it is | ||
334 | * set in threads_handled_last as well. So we | ||
335 | * avoid extra masking. And we really do not | ||
336 | * care about the high bits of the handled | ||
337 | * count. We just care about the count being | ||
338 | * different than the one we saw before. | ||
339 | */ | ||
340 | handled = atomic_read(&desc->threads_handled); | ||
341 | handled |= SPURIOUS_DEFERRED; | ||
342 | if (handled != desc->threads_handled_last) { | ||
343 | action_ret = IRQ_HANDLED; | ||
344 | /* | ||
345 | * Note: We keep the SPURIOUS_DEFERRED | ||
346 | * bit set. We are handling the | ||
347 | * previous invocation right now. | ||
348 | * Keep it for the current one, so the | ||
349 | * next hardware interrupt will | ||
350 | * account for it. | ||
351 | */ | ||
352 | desc->threads_handled_last = handled; | ||
353 | } else { | ||
354 | /* | ||
355 | * None of the threaded handlers felt | ||
356 | * responsible for the last interrupt | ||
357 | * | ||
358 | * We keep the SPURIOUS_DEFERRED bit | ||
359 | * set in threads_handled_last as we | ||
360 | * need to account for the current | ||
361 | * interrupt as well. | ||
362 | */ | ||
363 | action_ret = IRQ_NONE; | ||
364 | } | ||
365 | } else { | ||
366 | /* | ||
367 | * One of the primary handlers returned | ||
368 | * IRQ_HANDLED. So we don't care about the | ||
369 | * threaded handlers on the same line. Clear | ||
370 | * the deferred detection bit. | ||
371 | * | ||
372 | * In theory we could/should check whether the | ||
373 | * deferred bit is set and take the result of | ||
374 | * the previous run into account here as | ||
375 | * well. But it's really not worth the | ||
376 | * trouble. If every other interrupt is | ||
377 | * handled we never trigger the spurious | ||
378 | * detector. And if this is just the one out | ||
379 | * of 100k unhandled ones which is handled | ||
380 | * then we merily delay the spurious detection | ||
381 | * by one hard interrupt. Not a real problem. | ||
382 | */ | ||
383 | desc->threads_handled_last &= ~SPURIOUS_DEFERRED; | ||
384 | } | ||
385 | } | ||
386 | |||
289 | if (unlikely(action_ret == IRQ_NONE)) { | 387 | if (unlikely(action_ret == IRQ_NONE)) { |
290 | /* | 388 | /* |
291 | * If we are seeing only the odd spurious IRQ caused by | 389 | * If we are seeing only the odd spurious IRQ caused by |
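The deferral scheme above boils down to two pieces of state: an atomic count bumped by the threaded handlers (see the manage.c hunk) and a hardirq-only snapshot whose bit 31 doubles as the "deferred" marker. A simplified, self-contained C11 model of that bookkeeping -- not kernel code:

#include <stdatomic.h>
#include <stdbool.h>

#define SPURIOUS_DEFERRED 0x80000000u

static atomic_uint threads_handled;		/* bumped by the irq thread */
static unsigned int threads_handled_last;	/* hardirq context only */

/* Modeled threaded handler: record that it handled the line. */
static void thread_handled(void)
{
	atomic_fetch_add(&threads_handled, 1);
}

/* Modeled next hard interrupt: returns false only when the spurious
 * detector should be fed an unhandled (IRQ_NONE) event. */
static bool previous_irq_was_handled(void)
{
	unsigned int handled;

	if (!(threads_handled_last & SPURIOUS_DEFERRED)) {
		/* First deferral: just mark and wait for the next hardirq. */
		threads_handled_last |= SPURIOUS_DEFERRED;
		return true;
	}

	handled = atomic_load(&threads_handled) | SPURIOUS_DEFERRED;
	if (handled != threads_handled_last) {
		/* A thread handled something since last time; keep the
		 * deferred bit set so the current interrupt is accounted
		 * for on the next round. */
		threads_handled_last = handled;
		return true;
	}
	return false;
}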
diff --git a/kernel/kexec.c b/kernel/kexec.c index c8380ad203bc..6748688813d0 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -125,8 +125,8 @@ static struct page *kimage_alloc_page(struct kimage *image, | |||
125 | unsigned long dest); | 125 | unsigned long dest); |
126 | 126 | ||
127 | static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | 127 | static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, |
128 | unsigned long nr_segments, | 128 | unsigned long nr_segments, |
129 | struct kexec_segment __user *segments) | 129 | struct kexec_segment __user *segments) |
130 | { | 130 | { |
131 | size_t segment_bytes; | 131 | size_t segment_bytes; |
132 | struct kimage *image; | 132 | struct kimage *image; |
@@ -257,13 +257,13 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, | |||
257 | image->control_code_page = kimage_alloc_control_pages(image, | 257 | image->control_code_page = kimage_alloc_control_pages(image, |
258 | get_order(KEXEC_CONTROL_PAGE_SIZE)); | 258 | get_order(KEXEC_CONTROL_PAGE_SIZE)); |
259 | if (!image->control_code_page) { | 259 | if (!image->control_code_page) { |
260 | printk(KERN_ERR "Could not allocate control_code_buffer\n"); | 260 | pr_err("Could not allocate control_code_buffer\n"); |
261 | goto out_free; | 261 | goto out_free; |
262 | } | 262 | } |
263 | 263 | ||
264 | image->swap_page = kimage_alloc_control_pages(image, 0); | 264 | image->swap_page = kimage_alloc_control_pages(image, 0); |
265 | if (!image->swap_page) { | 265 | if (!image->swap_page) { |
266 | printk(KERN_ERR "Could not allocate swap buffer\n"); | 266 | pr_err("Could not allocate swap buffer\n"); |
267 | goto out_free; | 267 | goto out_free; |
268 | } | 268 | } |
269 | 269 | ||
@@ -332,7 +332,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, | |||
332 | image->control_code_page = kimage_alloc_control_pages(image, | 332 | image->control_code_page = kimage_alloc_control_pages(image, |
333 | get_order(KEXEC_CONTROL_PAGE_SIZE)); | 333 | get_order(KEXEC_CONTROL_PAGE_SIZE)); |
334 | if (!image->control_code_page) { | 334 | if (!image->control_code_page) { |
335 | printk(KERN_ERR "Could not allocate control_code_buffer\n"); | 335 | pr_err("Could not allocate control_code_buffer\n"); |
336 | goto out_free; | 336 | goto out_free; |
337 | } | 337 | } |
338 | 338 | ||
@@ -621,8 +621,8 @@ static void kimage_terminate(struct kimage *image) | |||
621 | 621 | ||
622 | #define for_each_kimage_entry(image, ptr, entry) \ | 622 | #define for_each_kimage_entry(image, ptr, entry) \ |
623 | for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ | 623 | for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ |
624 | ptr = (entry & IND_INDIRECTION)? \ | 624 | ptr = (entry & IND_INDIRECTION) ? \ |
625 | phys_to_virt((entry & PAGE_MASK)): ptr +1) | 625 | phys_to_virt((entry & PAGE_MASK)) : ptr + 1) |
626 | 626 | ||
627 | static void kimage_free_entry(kimage_entry_t entry) | 627 | static void kimage_free_entry(kimage_entry_t entry) |
628 | { | 628 | { |
@@ -650,8 +650,7 @@ static void kimage_free(struct kimage *image) | |||
650 | * done with it. | 650 | * done with it. |
651 | */ | 651 | */ |
652 | ind = entry; | 652 | ind = entry; |
653 | } | 653 | } else if (entry & IND_SOURCE) |
654 | else if (entry & IND_SOURCE) | ||
655 | kimage_free_entry(entry); | 654 | kimage_free_entry(entry); |
656 | } | 655 | } |
657 | /* Free the final indirection page */ | 656 | /* Free the final indirection page */ |
@@ -774,8 +773,7 @@ static struct page *kimage_alloc_page(struct kimage *image, | |||
774 | addr = old_addr; | 773 | addr = old_addr; |
775 | page = old_page; | 774 | page = old_page; |
776 | break; | 775 | break; |
777 | } | 776 | } else { |
778 | else { | ||
779 | /* Place the page on the destination list I | 777 | /* Place the page on the destination list I |
780 | * will use it later. | 778 | * will use it later. |
781 | */ | 779 | */ |
@@ -1059,7 +1057,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, | |||
1059 | return -EINVAL; | 1057 | return -EINVAL; |
1060 | 1058 | ||
1061 | ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); | 1059 | ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); |
1062 | for (i=0; i < nr_segments; i++) { | 1060 | for (i = 0; i < nr_segments; i++) { |
1063 | result = copy_from_user(&in, &segments[i], sizeof(in)); | 1061 | result = copy_from_user(&in, &segments[i], sizeof(in)); |
1064 | if (result) | 1062 | if (result) |
1065 | return -EFAULT; | 1063 | return -EFAULT; |
@@ -1214,14 +1212,14 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) | |||
1214 | * squirrelled away. ELF notes happen to provide | 1212 | * squirrelled away. ELF notes happen to provide |
1215 | * all of that, so there is no need to invent something new. | 1213 | * all of that, so there is no need to invent something new. |
1216 | */ | 1214 | */ |
1217 | buf = (u32*)per_cpu_ptr(crash_notes, cpu); | 1215 | buf = (u32 *)per_cpu_ptr(crash_notes, cpu); |
1218 | if (!buf) | 1216 | if (!buf) |
1219 | return; | 1217 | return; |
1220 | memset(&prstatus, 0, sizeof(prstatus)); | 1218 | memset(&prstatus, 0, sizeof(prstatus)); |
1221 | prstatus.pr_pid = current->pid; | 1219 | prstatus.pr_pid = current->pid; |
1222 | elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); | 1220 | elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); |
1223 | buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, | 1221 | buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, |
1224 | &prstatus, sizeof(prstatus)); | 1222 | &prstatus, sizeof(prstatus)); |
1225 | final_note(buf); | 1223 | final_note(buf); |
1226 | } | 1224 | } |
1227 | 1225 | ||
@@ -1230,8 +1228,7 @@ static int __init crash_notes_memory_init(void) | |||
1230 | /* Allocate memory for saving cpu registers. */ | 1228 | /* Allocate memory for saving cpu registers. */ |
1231 | crash_notes = alloc_percpu(note_buf_t); | 1229 | crash_notes = alloc_percpu(note_buf_t); |
1232 | if (!crash_notes) { | 1230 | if (!crash_notes) { |
1233 | printk("Kexec: Memory allocation for saving cpu register" | 1231 | pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); |
1234 | " states failed\n"); | ||
1235 | return -ENOMEM; | 1232 | return -ENOMEM; |
1236 | } | 1233 | } |
1237 | return 0; | 1234 | return 0; |
@@ -1253,10 +1250,10 @@ subsys_initcall(crash_notes_memory_init); | |||
1253 | * | 1250 | * |
1254 | * The function returns 0 on success and -EINVAL on failure. | 1251 | * The function returns 0 on success and -EINVAL on failure. |
1255 | */ | 1252 | */ |
1256 | static int __init parse_crashkernel_mem(char *cmdline, | 1253 | static int __init parse_crashkernel_mem(char *cmdline, |
1257 | unsigned long long system_ram, | 1254 | unsigned long long system_ram, |
1258 | unsigned long long *crash_size, | 1255 | unsigned long long *crash_size, |
1259 | unsigned long long *crash_base) | 1256 | unsigned long long *crash_base) |
1260 | { | 1257 | { |
1261 | char *cur = cmdline, *tmp; | 1258 | char *cur = cmdline, *tmp; |
1262 | 1259 | ||
@@ -1267,12 +1264,12 @@ static int __init parse_crashkernel_mem(char *cmdline, | |||
1267 | /* get the start of the range */ | 1264 | /* get the start of the range */ |
1268 | start = memparse(cur, &tmp); | 1265 | start = memparse(cur, &tmp); |
1269 | if (cur == tmp) { | 1266 | if (cur == tmp) { |
1270 | pr_warning("crashkernel: Memory value expected\n"); | 1267 | pr_warn("crashkernel: Memory value expected\n"); |
1271 | return -EINVAL; | 1268 | return -EINVAL; |
1272 | } | 1269 | } |
1273 | cur = tmp; | 1270 | cur = tmp; |
1274 | if (*cur != '-') { | 1271 | if (*cur != '-') { |
1275 | pr_warning("crashkernel: '-' expected\n"); | 1272 | pr_warn("crashkernel: '-' expected\n"); |
1276 | return -EINVAL; | 1273 | return -EINVAL; |
1277 | } | 1274 | } |
1278 | cur++; | 1275 | cur++; |
@@ -1281,31 +1278,30 @@ static int __init parse_crashkernel_mem(char *cmdline, | |||
1281 | if (*cur != ':') { | 1278 | if (*cur != ':') { |
1282 | end = memparse(cur, &tmp); | 1279 | end = memparse(cur, &tmp); |
1283 | if (cur == tmp) { | 1280 | if (cur == tmp) { |
1284 | pr_warning("crashkernel: Memory " | 1281 | pr_warn("crashkernel: Memory value expected\n"); |
1285 | "value expected\n"); | ||
1286 | return -EINVAL; | 1282 | return -EINVAL; |
1287 | } | 1283 | } |
1288 | cur = tmp; | 1284 | cur = tmp; |
1289 | if (end <= start) { | 1285 | if (end <= start) { |
1290 | pr_warning("crashkernel: end <= start\n"); | 1286 | pr_warn("crashkernel: end <= start\n"); |
1291 | return -EINVAL; | 1287 | return -EINVAL; |
1292 | } | 1288 | } |
1293 | } | 1289 | } |
1294 | 1290 | ||
1295 | if (*cur != ':') { | 1291 | if (*cur != ':') { |
1296 | pr_warning("crashkernel: ':' expected\n"); | 1292 | pr_warn("crashkernel: ':' expected\n"); |
1297 | return -EINVAL; | 1293 | return -EINVAL; |
1298 | } | 1294 | } |
1299 | cur++; | 1295 | cur++; |
1300 | 1296 | ||
1301 | size = memparse(cur, &tmp); | 1297 | size = memparse(cur, &tmp); |
1302 | if (cur == tmp) { | 1298 | if (cur == tmp) { |
1303 | pr_warning("Memory value expected\n"); | 1299 | pr_warn("Memory value expected\n"); |
1304 | return -EINVAL; | 1300 | return -EINVAL; |
1305 | } | 1301 | } |
1306 | cur = tmp; | 1302 | cur = tmp; |
1307 | if (size >= system_ram) { | 1303 | if (size >= system_ram) { |
1308 | pr_warning("crashkernel: invalid size\n"); | 1304 | pr_warn("crashkernel: invalid size\n"); |
1309 | return -EINVAL; | 1305 | return -EINVAL; |
1310 | } | 1306 | } |
1311 | 1307 | ||
@@ -1323,8 +1319,7 @@ static int __init parse_crashkernel_mem(char *cmdline, | |||
1323 | cur++; | 1319 | cur++; |
1324 | *crash_base = memparse(cur, &tmp); | 1320 | *crash_base = memparse(cur, &tmp); |
1325 | if (cur == tmp) { | 1321 | if (cur == tmp) { |
1326 | pr_warning("Memory value expected " | 1322 | pr_warn("Memory value expected after '@'\n"); |
1327 | "after '@'\n"); | ||
1328 | return -EINVAL; | 1323 | return -EINVAL; |
1329 | } | 1324 | } |
1330 | } | 1325 | } |
@@ -1336,26 +1331,26 @@ static int __init parse_crashkernel_mem(char *cmdline, | |||
1336 | /* | 1331 | /* |
1337 | * That function parses "simple" (old) crashkernel command lines like | 1332 | * That function parses "simple" (old) crashkernel command lines like |
1338 | * | 1333 | * |
1339 | * crashkernel=size[@offset] | 1334 | * crashkernel=size[@offset] |
1340 | * | 1335 | * |
1341 | * It returns 0 on success and -EINVAL on failure. | 1336 | * It returns 0 on success and -EINVAL on failure. |
1342 | */ | 1337 | */ |
1343 | static int __init parse_crashkernel_simple(char *cmdline, | 1338 | static int __init parse_crashkernel_simple(char *cmdline, |
1344 | unsigned long long *crash_size, | 1339 | unsigned long long *crash_size, |
1345 | unsigned long long *crash_base) | 1340 | unsigned long long *crash_base) |
1346 | { | 1341 | { |
1347 | char *cur = cmdline; | 1342 | char *cur = cmdline; |
1348 | 1343 | ||
1349 | *crash_size = memparse(cmdline, &cur); | 1344 | *crash_size = memparse(cmdline, &cur); |
1350 | if (cmdline == cur) { | 1345 | if (cmdline == cur) { |
1351 | pr_warning("crashkernel: memory value expected\n"); | 1346 | pr_warn("crashkernel: memory value expected\n"); |
1352 | return -EINVAL; | 1347 | return -EINVAL; |
1353 | } | 1348 | } |
1354 | 1349 | ||
1355 | if (*cur == '@') | 1350 | if (*cur == '@') |
1356 | *crash_base = memparse(cur+1, &cur); | 1351 | *crash_base = memparse(cur+1, &cur); |
1357 | else if (*cur != ' ' && *cur != '\0') { | 1352 | else if (*cur != ' ' && *cur != '\0') { |
1358 | pr_warning("crashkernel: unrecognized char\n"); | 1353 | pr_warn("crashkernel: unrecognized char\n"); |
1359 | return -EINVAL; | 1354 | return -EINVAL; |
1360 | } | 1355 | } |
1361 | 1356 | ||
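For reference, the two parsers cleaned up above accept the simple and the range-based crashkernel= forms; the values below are purely illustrative:

	crashkernel=128M                  reserve 128M, base chosen by the kernel
	crashkernel=128M@16M              reserve 128M at physical address 16M
	crashkernel=512M-2G:64M,2G-:128M  reserve 64M when RAM is between 512M and 2G,
	                                  128M when RAM is 2G or more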
@@ -1683,7 +1678,15 @@ int kernel_kexec(void) | |||
1683 | kexec_in_progress = true; | 1678 | kexec_in_progress = true; |
1684 | kernel_restart_prepare(NULL); | 1679 | kernel_restart_prepare(NULL); |
1685 | migrate_to_reboot_cpu(); | 1680 | migrate_to_reboot_cpu(); |
1686 | printk(KERN_EMERG "Starting new kernel\n"); | 1681 | |
1682 | /* | ||
1683 | * migrate_to_reboot_cpu() disables CPU hotplug assuming that | ||
1684 | * no further code needs to use CPU hotplug (which is true in | ||
1685 | * the reboot case). However, the kexec path depends on using | ||
1686 | * CPU hotplug again; so re-enable it here. | ||
1687 | */ | ||
1688 | cpu_hotplug_enable(); | ||
1689 | pr_emerg("Starting new kernel\n"); | ||
1687 | machine_shutdown(); | 1690 | machine_shutdown(); |
1688 | } | 1691 | } |
1689 | 1692 | ||
diff --git a/kernel/kmod.c b/kernel/kmod.c index 6b375af4958d..8637e041a247 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -285,10 +285,7 @@ static int wait_for_helper(void *data) | |||
285 | pid_t pid; | 285 | pid_t pid; |
286 | 286 | ||
287 | /* If SIGCLD is ignored sys_wait4 won't populate the status. */ | 287 | /* If SIGCLD is ignored sys_wait4 won't populate the status. */ |
288 | spin_lock_irq(¤t->sighand->siglock); | 288 | kernel_sigaction(SIGCHLD, SIG_DFL); |
289 | current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL; | ||
290 | spin_unlock_irq(¤t->sighand->siglock); | ||
291 | |||
292 | pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); | 289 | pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); |
293 | if (pid < 0) { | 290 | if (pid < 0) { |
294 | sub_info->retval = pid; | 291 | sub_info->retval = pid; |
@@ -498,7 +495,7 @@ int __usermodehelper_disable(enum umh_disable_depth depth) | |||
498 | static void helper_lock(void) | 495 | static void helper_lock(void) |
499 | { | 496 | { |
500 | atomic_inc(&running_helpers); | 497 | atomic_inc(&running_helpers); |
501 | smp_mb__after_atomic_inc(); | 498 | smp_mb__after_atomic(); |
502 | } | 499 | } |
503 | 500 | ||
504 | static void helper_unlock(void) | 501 | static void helper_unlock(void) |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ceeadfcabb76..3214289df5a7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -86,21 +86,8 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) | |||
86 | return &(kretprobe_table_locks[hash].lock); | 86 | return &(kretprobe_table_locks[hash].lock); |
87 | } | 87 | } |
88 | 88 | ||
89 | /* | 89 | /* Blacklist -- list of struct kprobe_blacklist_entry */ |
90 | * Normally, functions that we'd want to prohibit kprobes in, are marked | 90 | static LIST_HEAD(kprobe_blacklist); |
91 | * __kprobes. But, there are cases where such functions already belong to | ||
92 | * a different section (__sched for preempt_schedule) | ||
93 | * | ||
94 | * For such cases, we now have a blacklist | ||
95 | */ | ||
96 | static struct kprobe_blackpoint kprobe_blacklist[] = { | ||
97 | {"preempt_schedule",}, | ||
98 | {"native_get_debugreg",}, | ||
99 | {"irq_entries_start",}, | ||
100 | {"common_interrupt",}, | ||
101 | {"mcount",}, /* mcount can be called from everywhere */ | ||
102 | {NULL} /* Terminator */ | ||
103 | }; | ||
104 | 91 | ||
105 | #ifdef __ARCH_WANT_KPROBES_INSN_SLOT | 92 | #ifdef __ARCH_WANT_KPROBES_INSN_SLOT |
106 | /* | 93 | /* |
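The hard-coded name blacklist removed above is superseded by explicit NOKPROBE_SYMBOL() annotations plus a generated blacklist section (the diff adds such an annotation for get_kprobe() further down). A sketch with a hypothetical function showing the annotation style that replaces the old __kprobes attribute:

#include <linux/kprobes.h>

static int hypothetical_trap_path(struct pt_regs *regs)
{
	/* ... code that must never itself be hit by a kprobe ... */
	return 0;
}
NOKPROBE_SYMBOL(hypothetical_trap_path);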
@@ -151,13 +138,13 @@ struct kprobe_insn_cache kprobe_insn_slots = { | |||
151 | .insn_size = MAX_INSN_SIZE, | 138 | .insn_size = MAX_INSN_SIZE, |
152 | .nr_garbage = 0, | 139 | .nr_garbage = 0, |
153 | }; | 140 | }; |
154 | static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); | 141 | static int collect_garbage_slots(struct kprobe_insn_cache *c); |
155 | 142 | ||
156 | /** | 143 | /** |
157 | * __get_insn_slot() - Find a slot on an executable page for an instruction. | 144 | * __get_insn_slot() - Find a slot on an executable page for an instruction. |
158 | * We allocate an executable page if there's no room on existing ones. | 145 | * We allocate an executable page if there's no room on existing ones. |
159 | */ | 146 | */ |
160 | kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) | 147 | kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) |
161 | { | 148 | { |
162 | struct kprobe_insn_page *kip; | 149 | struct kprobe_insn_page *kip; |
163 | kprobe_opcode_t *slot = NULL; | 150 | kprobe_opcode_t *slot = NULL; |
@@ -214,7 +201,7 @@ out: | |||
214 | } | 201 | } |
215 | 202 | ||
216 | /* Return 1 if all garbages are collected, otherwise 0. */ | 203 | /* Return 1 if all garbages are collected, otherwise 0. */ |
217 | static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) | 204 | static int collect_one_slot(struct kprobe_insn_page *kip, int idx) |
218 | { | 205 | { |
219 | kip->slot_used[idx] = SLOT_CLEAN; | 206 | kip->slot_used[idx] = SLOT_CLEAN; |
220 | kip->nused--; | 207 | kip->nused--; |
@@ -235,7 +222,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) | |||
235 | return 0; | 222 | return 0; |
236 | } | 223 | } |
237 | 224 | ||
238 | static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) | 225 | static int collect_garbage_slots(struct kprobe_insn_cache *c) |
239 | { | 226 | { |
240 | struct kprobe_insn_page *kip, *next; | 227 | struct kprobe_insn_page *kip, *next; |
241 | 228 | ||
@@ -257,8 +244,8 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) | |||
257 | return 0; | 244 | return 0; |
258 | } | 245 | } |
259 | 246 | ||
260 | void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, | 247 | void __free_insn_slot(struct kprobe_insn_cache *c, |
261 | kprobe_opcode_t *slot, int dirty) | 248 | kprobe_opcode_t *slot, int dirty) |
262 | { | 249 | { |
263 | struct kprobe_insn_page *kip; | 250 | struct kprobe_insn_page *kip; |
264 | 251 | ||
@@ -314,7 +301,7 @@ static inline void reset_kprobe_instance(void) | |||
314 | * OR | 301 | * OR |
315 | * - with preemption disabled - from arch/xxx/kernel/kprobes.c | 302 | * - with preemption disabled - from arch/xxx/kernel/kprobes.c |
316 | */ | 303 | */ |
317 | struct kprobe __kprobes *get_kprobe(void *addr) | 304 | struct kprobe *get_kprobe(void *addr) |
318 | { | 305 | { |
319 | struct hlist_head *head; | 306 | struct hlist_head *head; |
320 | struct kprobe *p; | 307 | struct kprobe *p; |
@@ -327,8 +314,9 @@ struct kprobe __kprobes *get_kprobe(void *addr) | |||
327 | 314 | ||
328 | return NULL; | 315 | return NULL; |
329 | } | 316 | } |
317 | NOKPROBE_SYMBOL(get_kprobe); | ||
330 | 318 | ||
331 | static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); | 319 | static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); |
332 | 320 | ||
333 | /* Return true if the kprobe is an aggregator */ | 321 | /* Return true if the kprobe is an aggregator */ |
334 | static inline int kprobe_aggrprobe(struct kprobe *p) | 322 | static inline int kprobe_aggrprobe(struct kprobe *p) |
@@ -360,7 +348,7 @@ static bool kprobes_allow_optimization; | |||
360 | * Call all pre_handler on the list, but ignores its return value. | 348 | * Call all pre_handler on the list, but ignores its return value. |
361 | * This must be called from arch-dep optimized caller. | 349 | * This must be called from arch-dep optimized caller. |
362 | */ | 350 | */ |
363 | void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) | 351 | void opt_pre_handler(struct kprobe *p, struct pt_regs *regs) |
364 | { | 352 | { |
365 | struct kprobe *kp; | 353 | struct kprobe *kp; |
366 | 354 | ||
@@ -372,9 +360,10 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) | |||
372 | reset_kprobe_instance(); | 360 | reset_kprobe_instance(); |
373 | } | 361 | } |
374 | } | 362 | } |
363 | NOKPROBE_SYMBOL(opt_pre_handler); | ||
375 | 364 | ||
376 | /* Free optimized instructions and optimized_kprobe */ | 365 | /* Free optimized instructions and optimized_kprobe */ |
377 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | 366 | static void free_aggr_kprobe(struct kprobe *p) |
378 | { | 367 | { |
379 | struct optimized_kprobe *op; | 368 | struct optimized_kprobe *op; |
380 | 369 | ||
@@ -412,7 +401,7 @@ static inline int kprobe_disarmed(struct kprobe *p) | |||
412 | } | 401 | } |
413 | 402 | ||
414 | /* Return true(!0) if the probe is queued on (un)optimizing lists */ | 403 | /* Return true(!0) if the probe is queued on (un)optimizing lists */ |
415 | static int __kprobes kprobe_queued(struct kprobe *p) | 404 | static int kprobe_queued(struct kprobe *p) |
416 | { | 405 | { |
417 | struct optimized_kprobe *op; | 406 | struct optimized_kprobe *op; |
418 | 407 | ||
@@ -428,7 +417,7 @@ static int __kprobes kprobe_queued(struct kprobe *p) | |||
428 | * Return an optimized kprobe whose optimizing code replaces | 417 | * Return an optimized kprobe whose optimizing code replaces |
429 | * instructions including addr (exclude breakpoint). | 418 | * instructions including addr (exclude breakpoint). |
430 | */ | 419 | */ |
431 | static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) | 420 | static struct kprobe *get_optimized_kprobe(unsigned long addr) |
432 | { | 421 | { |
433 | int i; | 422 | int i; |
434 | struct kprobe *p = NULL; | 423 | struct kprobe *p = NULL; |
@@ -460,7 +449,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); | |||
460 | * Optimize (replace a breakpoint with a jump) kprobes listed on | 449 | * Optimize (replace a breakpoint with a jump) kprobes listed on |
461 | * optimizing_list. | 450 | * optimizing_list. |
462 | */ | 451 | */ |
463 | static __kprobes void do_optimize_kprobes(void) | 452 | static void do_optimize_kprobes(void) |
464 | { | 453 | { |
465 | /* Optimization never be done when disarmed */ | 454 | /* Optimization never be done when disarmed */ |
466 | if (kprobes_all_disarmed || !kprobes_allow_optimization || | 455 | if (kprobes_all_disarmed || !kprobes_allow_optimization || |
@@ -488,7 +477,7 @@ static __kprobes void do_optimize_kprobes(void) | |||
488 | * Unoptimize (replace a jump with a breakpoint and remove the breakpoint | 477 | * Unoptimize (replace a jump with a breakpoint and remove the breakpoint |
489 | * if need) kprobes listed on unoptimizing_list. | 478 | * if need) kprobes listed on unoptimizing_list. |
490 | */ | 479 | */ |
491 | static __kprobes void do_unoptimize_kprobes(void) | 480 | static void do_unoptimize_kprobes(void) |
492 | { | 481 | { |
493 | struct optimized_kprobe *op, *tmp; | 482 | struct optimized_kprobe *op, *tmp; |
494 | 483 | ||
@@ -520,7 +509,7 @@ static __kprobes void do_unoptimize_kprobes(void) | |||
520 | } | 509 | } |
521 | 510 | ||
522 | /* Reclaim all kprobes on the free_list */ | 511 | /* Reclaim all kprobes on the free_list */ |
523 | static __kprobes void do_free_cleaned_kprobes(void) | 512 | static void do_free_cleaned_kprobes(void) |
524 | { | 513 | { |
525 | struct optimized_kprobe *op, *tmp; | 514 | struct optimized_kprobe *op, *tmp; |
526 | 515 | ||
@@ -532,13 +521,13 @@ static __kprobes void do_free_cleaned_kprobes(void) | |||
532 | } | 521 | } |
533 | 522 | ||
534 | /* Start optimizer after OPTIMIZE_DELAY passed */ | 523 | /* Start optimizer after OPTIMIZE_DELAY passed */ |
535 | static __kprobes void kick_kprobe_optimizer(void) | 524 | static void kick_kprobe_optimizer(void) |
536 | { | 525 | { |
537 | schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); | 526 | schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); |
538 | } | 527 | } |
539 | 528 | ||
540 | /* Kprobe jump optimizer */ | 529 | /* Kprobe jump optimizer */ |
541 | static __kprobes void kprobe_optimizer(struct work_struct *work) | 530 | static void kprobe_optimizer(struct work_struct *work) |
542 | { | 531 | { |
543 | mutex_lock(&kprobe_mutex); | 532 | mutex_lock(&kprobe_mutex); |
544 | /* Lock modules while optimizing kprobes */ | 533 | /* Lock modules while optimizing kprobes */ |
@@ -574,7 +563,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) | |||
574 | } | 563 | } |
575 | 564 | ||
576 | /* Wait for completing optimization and unoptimization */ | 565 | /* Wait for completing optimization and unoptimization */ |
577 | static __kprobes void wait_for_kprobe_optimizer(void) | 566 | static void wait_for_kprobe_optimizer(void) |
578 | { | 567 | { |
579 | mutex_lock(&kprobe_mutex); | 568 | mutex_lock(&kprobe_mutex); |
580 | 569 | ||
@@ -593,7 +582,7 @@ static __kprobes void wait_for_kprobe_optimizer(void) | |||
593 | } | 582 | } |
594 | 583 | ||
595 | /* Optimize kprobe if p is ready to be optimized */ | 584 | /* Optimize kprobe if p is ready to be optimized */ |
596 | static __kprobes void optimize_kprobe(struct kprobe *p) | 585 | static void optimize_kprobe(struct kprobe *p) |
597 | { | 586 | { |
598 | struct optimized_kprobe *op; | 587 | struct optimized_kprobe *op; |
599 | 588 | ||
@@ -627,7 +616,7 @@ static __kprobes void optimize_kprobe(struct kprobe *p) | |||
627 | } | 616 | } |
628 | 617 | ||
629 | /* Short cut to direct unoptimizing */ | 618 | /* Short cut to direct unoptimizing */ |
630 | static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) | 619 | static void force_unoptimize_kprobe(struct optimized_kprobe *op) |
631 | { | 620 | { |
632 | get_online_cpus(); | 621 | get_online_cpus(); |
633 | arch_unoptimize_kprobe(op); | 622 | arch_unoptimize_kprobe(op); |
@@ -637,7 +626,7 @@ static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) | |||
637 | } | 626 | } |
638 | 627 | ||
639 | /* Unoptimize a kprobe if p is optimized */ | 628 | /* Unoptimize a kprobe if p is optimized */ |
640 | static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force) | 629 | static void unoptimize_kprobe(struct kprobe *p, bool force) |
641 | { | 630 | { |
642 | struct optimized_kprobe *op; | 631 | struct optimized_kprobe *op; |
643 | 632 | ||
@@ -697,7 +686,7 @@ static void reuse_unused_kprobe(struct kprobe *ap) | |||
697 | } | 686 | } |
698 | 687 | ||
699 | /* Remove optimized instructions */ | 688 | /* Remove optimized instructions */ |
700 | static void __kprobes kill_optimized_kprobe(struct kprobe *p) | 689 | static void kill_optimized_kprobe(struct kprobe *p) |
701 | { | 690 | { |
702 | struct optimized_kprobe *op; | 691 | struct optimized_kprobe *op; |
703 | 692 | ||
@@ -723,7 +712,7 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p) | |||
723 | } | 712 | } |
724 | 713 | ||
725 | /* Try to prepare optimized instructions */ | 714 | /* Try to prepare optimized instructions */ |
726 | static __kprobes void prepare_optimized_kprobe(struct kprobe *p) | 715 | static void prepare_optimized_kprobe(struct kprobe *p) |
727 | { | 716 | { |
728 | struct optimized_kprobe *op; | 717 | struct optimized_kprobe *op; |
729 | 718 | ||
@@ -732,7 +721,7 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p) | |||
732 | } | 721 | } |
733 | 722 | ||
734 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ | 723 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ |
735 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | 724 | static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) |
736 | { | 725 | { |
737 | struct optimized_kprobe *op; | 726 | struct optimized_kprobe *op; |
738 | 727 | ||
@@ -747,13 +736,13 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | |||
747 | return &op->kp; | 736 | return &op->kp; |
748 | } | 737 | } |
749 | 738 | ||
750 | static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); | 739 | static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); |
751 | 740 | ||
752 | /* | 741 | /* |
753 | * Prepare an optimized_kprobe and optimize it | 742 | * Prepare an optimized_kprobe and optimize it |
754 | * NOTE: p must be a normal registered kprobe | 743 | * NOTE: p must be a normal registered kprobe |
755 | */ | 744 | */ |
756 | static __kprobes void try_to_optimize_kprobe(struct kprobe *p) | 745 | static void try_to_optimize_kprobe(struct kprobe *p) |
757 | { | 746 | { |
758 | struct kprobe *ap; | 747 | struct kprobe *ap; |
759 | struct optimized_kprobe *op; | 748 | struct optimized_kprobe *op; |
@@ -787,7 +776,7 @@ out: | |||
787 | } | 776 | } |
788 | 777 | ||
789 | #ifdef CONFIG_SYSCTL | 778 | #ifdef CONFIG_SYSCTL |
790 | static void __kprobes optimize_all_kprobes(void) | 779 | static void optimize_all_kprobes(void) |
791 | { | 780 | { |
792 | struct hlist_head *head; | 781 | struct hlist_head *head; |
793 | struct kprobe *p; | 782 | struct kprobe *p; |
@@ -810,7 +799,7 @@ out: | |||
810 | mutex_unlock(&kprobe_mutex); | 799 | mutex_unlock(&kprobe_mutex); |
811 | } | 800 | } |
812 | 801 | ||
813 | static void __kprobes unoptimize_all_kprobes(void) | 802 | static void unoptimize_all_kprobes(void) |
814 | { | 803 | { |
815 | struct hlist_head *head; | 804 | struct hlist_head *head; |
816 | struct kprobe *p; | 805 | struct kprobe *p; |
@@ -861,7 +850,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, | |||
861 | #endif /* CONFIG_SYSCTL */ | 850 | #endif /* CONFIG_SYSCTL */ |
862 | 851 | ||
863 | /* Put a breakpoint for a probe. Must be called with text_mutex locked */ | 852 | /* Put a breakpoint for a probe. Must be called with text_mutex locked */ |
864 | static void __kprobes __arm_kprobe(struct kprobe *p) | 853 | static void __arm_kprobe(struct kprobe *p) |
865 | { | 854 | { |
866 | struct kprobe *_p; | 855 | struct kprobe *_p; |
867 | 856 | ||
@@ -876,7 +865,7 @@ static void __kprobes __arm_kprobe(struct kprobe *p) | |||
876 | } | 865 | } |
877 | 866 | ||
878 | /* Remove the breakpoint of a probe. Must be called with text_mutex locked */ | 867 | /* Remove the breakpoint of a probe. Must be called with text_mutex locked */ |
879 | static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt) | 868 | static void __disarm_kprobe(struct kprobe *p, bool reopt) |
880 | { | 869 | { |
881 | struct kprobe *_p; | 870 | struct kprobe *_p; |
882 | 871 | ||
@@ -911,13 +900,13 @@ static void reuse_unused_kprobe(struct kprobe *ap) | |||
911 | BUG_ON(kprobe_unused(ap)); | 900 | BUG_ON(kprobe_unused(ap)); |
912 | } | 901 | } |
913 | 902 | ||
914 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | 903 | static void free_aggr_kprobe(struct kprobe *p) |
915 | { | 904 | { |
916 | arch_remove_kprobe(p); | 905 | arch_remove_kprobe(p); |
917 | kfree(p); | 906 | kfree(p); |
918 | } | 907 | } |
919 | 908 | ||
920 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | 909 | static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) |
921 | { | 910 | { |
922 | return kzalloc(sizeof(struct kprobe), GFP_KERNEL); | 911 | return kzalloc(sizeof(struct kprobe), GFP_KERNEL); |
923 | } | 912 | } |
@@ -931,7 +920,7 @@ static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { | |||
931 | static int kprobe_ftrace_enabled; | 920 | static int kprobe_ftrace_enabled; |
932 | 921 | ||
933 | /* Must ensure p->addr is really on ftrace */ | 922 | /* Must ensure p->addr is really on ftrace */ |
934 | static int __kprobes prepare_kprobe(struct kprobe *p) | 923 | static int prepare_kprobe(struct kprobe *p) |
935 | { | 924 | { |
936 | if (!kprobe_ftrace(p)) | 925 | if (!kprobe_ftrace(p)) |
937 | return arch_prepare_kprobe(p); | 926 | return arch_prepare_kprobe(p); |
@@ -940,7 +929,7 @@ static int __kprobes prepare_kprobe(struct kprobe *p) | |||
940 | } | 929 | } |
941 | 930 | ||
942 | /* Caller must lock kprobe_mutex */ | 931 | /* Caller must lock kprobe_mutex */ |
943 | static void __kprobes arm_kprobe_ftrace(struct kprobe *p) | 932 | static void arm_kprobe_ftrace(struct kprobe *p) |
944 | { | 933 | { |
945 | int ret; | 934 | int ret; |
946 | 935 | ||
@@ -955,7 +944,7 @@ static void __kprobes arm_kprobe_ftrace(struct kprobe *p) | |||
955 | } | 944 | } |
956 | 945 | ||
957 | /* Caller must lock kprobe_mutex */ | 946 | /* Caller must lock kprobe_mutex */ |
958 | static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) | 947 | static void disarm_kprobe_ftrace(struct kprobe *p) |
959 | { | 948 | { |
960 | int ret; | 949 | int ret; |
961 | 950 | ||
@@ -975,7 +964,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) | |||
975 | #endif | 964 | #endif |
976 | 965 | ||
977 | /* Arm a kprobe with text_mutex */ | 966 | /* Arm a kprobe with text_mutex */ |
978 | static void __kprobes arm_kprobe(struct kprobe *kp) | 967 | static void arm_kprobe(struct kprobe *kp) |
979 | { | 968 | { |
980 | if (unlikely(kprobe_ftrace(kp))) { | 969 | if (unlikely(kprobe_ftrace(kp))) { |
981 | arm_kprobe_ftrace(kp); | 970 | arm_kprobe_ftrace(kp); |
@@ -992,7 +981,7 @@ static void __kprobes arm_kprobe(struct kprobe *kp) | |||
992 | } | 981 | } |
993 | 982 | ||
994 | /* Disarm a kprobe with text_mutex */ | 983 | /* Disarm a kprobe with text_mutex */ |
995 | static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt) | 984 | static void disarm_kprobe(struct kprobe *kp, bool reopt) |
996 | { | 985 | { |
997 | if (unlikely(kprobe_ftrace(kp))) { | 986 | if (unlikely(kprobe_ftrace(kp))) { |
998 | disarm_kprobe_ftrace(kp); | 987 | disarm_kprobe_ftrace(kp); |
@@ -1008,7 +997,7 @@ static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt) | |||
1008 | * Aggregate handlers for multiple kprobes support - these handlers | 997 | * Aggregate handlers for multiple kprobes support - these handlers |
1009 | * take care of invoking the individual kprobe handlers on p->list | 998 | * take care of invoking the individual kprobe handlers on p->list |
1010 | */ | 999 | */ |
1011 | static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) | 1000 | static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) |
1012 | { | 1001 | { |
1013 | struct kprobe *kp; | 1002 | struct kprobe *kp; |
1014 | 1003 | ||
@@ -1022,9 +1011,10 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) | |||
1022 | } | 1011 | } |
1023 | return 0; | 1012 | return 0; |
1024 | } | 1013 | } |
1014 | NOKPROBE_SYMBOL(aggr_pre_handler); | ||
1025 | 1015 | ||
1026 | static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | 1016 | static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, |
1027 | unsigned long flags) | 1017 | unsigned long flags) |
1028 | { | 1018 | { |
1029 | struct kprobe *kp; | 1019 | struct kprobe *kp; |
1030 | 1020 | ||
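Note: the NOKPROBE_SYMBOL() lines introduced in this and the following hunks replace the removed __kprobes annotation. Instead of moving whole functions into .kprobes.text, the macro records each symbol's address in the _kprobe_blacklist section, which populate_kprobe_blacklist() (further down in this file) turns into the runtime blacklist. Roughly, assuming the macro shape added to include/linux/kprobes.h by this series (a sketch, not the verbatim definition):

/*
 * Approximate shape of NOKPROBE_SYMBOL(): emit the annotated function's
 * address into the _kprobe_blacklist section, which the linker collects
 * between __start_kprobe_blacklist and __stop_kprobe_blacklist.
 */
#define NOKPROBE_SYMBOL(fname)                                  \
        static unsigned long __used                             \
        __attribute__((section("_kprobe_blacklist")))           \
        _kbl_addr_##fname = (unsigned long)fname
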
@@ -1036,9 +1026,10 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | |||
1036 | } | 1026 | } |
1037 | } | 1027 | } |
1038 | } | 1028 | } |
1029 | NOKPROBE_SYMBOL(aggr_post_handler); | ||
1039 | 1030 | ||
1040 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | 1031 | static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, |
1041 | int trapnr) | 1032 | int trapnr) |
1042 | { | 1033 | { |
1043 | struct kprobe *cur = __this_cpu_read(kprobe_instance); | 1034 | struct kprobe *cur = __this_cpu_read(kprobe_instance); |
1044 | 1035 | ||
@@ -1052,8 +1043,9 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | |||
1052 | } | 1043 | } |
1053 | return 0; | 1044 | return 0; |
1054 | } | 1045 | } |
1046 | NOKPROBE_SYMBOL(aggr_fault_handler); | ||
1055 | 1047 | ||
1056 | static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | 1048 | static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) |
1057 | { | 1049 | { |
1058 | struct kprobe *cur = __this_cpu_read(kprobe_instance); | 1050 | struct kprobe *cur = __this_cpu_read(kprobe_instance); |
1059 | int ret = 0; | 1051 | int ret = 0; |
@@ -1065,9 +1057,10 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | |||
1065 | reset_kprobe_instance(); | 1057 | reset_kprobe_instance(); |
1066 | return ret; | 1058 | return ret; |
1067 | } | 1059 | } |
1060 | NOKPROBE_SYMBOL(aggr_break_handler); | ||
1068 | 1061 | ||
1069 | /* Walks the list and increments nmissed count for multiprobe case */ | 1062 | /* Walks the list and increments nmissed count for multiprobe case */ |
1070 | void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) | 1063 | void kprobes_inc_nmissed_count(struct kprobe *p) |
1071 | { | 1064 | { |
1072 | struct kprobe *kp; | 1065 | struct kprobe *kp; |
1073 | if (!kprobe_aggrprobe(p)) { | 1066 | if (!kprobe_aggrprobe(p)) { |
@@ -1078,9 +1071,10 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) | |||
1078 | } | 1071 | } |
1079 | return; | 1072 | return; |
1080 | } | 1073 | } |
1074 | NOKPROBE_SYMBOL(kprobes_inc_nmissed_count); | ||
1081 | 1075 | ||
1082 | void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, | 1076 | void recycle_rp_inst(struct kretprobe_instance *ri, |
1083 | struct hlist_head *head) | 1077 | struct hlist_head *head) |
1084 | { | 1078 | { |
1085 | struct kretprobe *rp = ri->rp; | 1079 | struct kretprobe *rp = ri->rp; |
1086 | 1080 | ||
@@ -1095,8 +1089,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, | |||
1095 | /* Unregistering */ | 1089 | /* Unregistering */ |
1096 | hlist_add_head(&ri->hlist, head); | 1090 | hlist_add_head(&ri->hlist, head); |
1097 | } | 1091 | } |
1092 | NOKPROBE_SYMBOL(recycle_rp_inst); | ||
1098 | 1093 | ||
1099 | void __kprobes kretprobe_hash_lock(struct task_struct *tsk, | 1094 | void kretprobe_hash_lock(struct task_struct *tsk, |
1100 | struct hlist_head **head, unsigned long *flags) | 1095 | struct hlist_head **head, unsigned long *flags) |
1101 | __acquires(hlist_lock) | 1096 | __acquires(hlist_lock) |
1102 | { | 1097 | { |
@@ -1107,17 +1102,19 @@ __acquires(hlist_lock) | |||
1107 | hlist_lock = kretprobe_table_lock_ptr(hash); | 1102 | hlist_lock = kretprobe_table_lock_ptr(hash); |
1108 | raw_spin_lock_irqsave(hlist_lock, *flags); | 1103 | raw_spin_lock_irqsave(hlist_lock, *flags); |
1109 | } | 1104 | } |
1105 | NOKPROBE_SYMBOL(kretprobe_hash_lock); | ||
1110 | 1106 | ||
1111 | static void __kprobes kretprobe_table_lock(unsigned long hash, | 1107 | static void kretprobe_table_lock(unsigned long hash, |
1112 | unsigned long *flags) | 1108 | unsigned long *flags) |
1113 | __acquires(hlist_lock) | 1109 | __acquires(hlist_lock) |
1114 | { | 1110 | { |
1115 | raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); | 1111 | raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); |
1116 | raw_spin_lock_irqsave(hlist_lock, *flags); | 1112 | raw_spin_lock_irqsave(hlist_lock, *flags); |
1117 | } | 1113 | } |
1114 | NOKPROBE_SYMBOL(kretprobe_table_lock); | ||
1118 | 1115 | ||
1119 | void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, | 1116 | void kretprobe_hash_unlock(struct task_struct *tsk, |
1120 | unsigned long *flags) | 1117 | unsigned long *flags) |
1121 | __releases(hlist_lock) | 1118 | __releases(hlist_lock) |
1122 | { | 1119 | { |
1123 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); | 1120 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); |
@@ -1126,14 +1123,16 @@ __releases(hlist_lock) | |||
1126 | hlist_lock = kretprobe_table_lock_ptr(hash); | 1123 | hlist_lock = kretprobe_table_lock_ptr(hash); |
1127 | raw_spin_unlock_irqrestore(hlist_lock, *flags); | 1124 | raw_spin_unlock_irqrestore(hlist_lock, *flags); |
1128 | } | 1125 | } |
1126 | NOKPROBE_SYMBOL(kretprobe_hash_unlock); | ||
1129 | 1127 | ||
1130 | static void __kprobes kretprobe_table_unlock(unsigned long hash, | 1128 | static void kretprobe_table_unlock(unsigned long hash, |
1131 | unsigned long *flags) | 1129 | unsigned long *flags) |
1132 | __releases(hlist_lock) | 1130 | __releases(hlist_lock) |
1133 | { | 1131 | { |
1134 | raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); | 1132 | raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); |
1135 | raw_spin_unlock_irqrestore(hlist_lock, *flags); | 1133 | raw_spin_unlock_irqrestore(hlist_lock, *flags); |
1136 | } | 1134 | } |
1135 | NOKPROBE_SYMBOL(kretprobe_table_unlock); | ||
1137 | 1136 | ||
1138 | /* | 1137 | /* |
1139 | * This function is called from finish_task_switch when task tk becomes dead, | 1138 | * This function is called from finish_task_switch when task tk becomes dead, |
@@ -1141,7 +1140,7 @@ __releases(hlist_lock) | |||
1141 | * with this task. These left over instances represent probed functions | 1140 | * with this task. These left over instances represent probed functions |
1142 | * that have been called but will never return. | 1141 | * that have been called but will never return. |
1143 | */ | 1142 | */ |
1144 | void __kprobes kprobe_flush_task(struct task_struct *tk) | 1143 | void kprobe_flush_task(struct task_struct *tk) |
1145 | { | 1144 | { |
1146 | struct kretprobe_instance *ri; | 1145 | struct kretprobe_instance *ri; |
1147 | struct hlist_head *head, empty_rp; | 1146 | struct hlist_head *head, empty_rp; |
@@ -1166,6 +1165,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) | |||
1166 | kfree(ri); | 1165 | kfree(ri); |
1167 | } | 1166 | } |
1168 | } | 1167 | } |
1168 | NOKPROBE_SYMBOL(kprobe_flush_task); | ||
1169 | 1169 | ||
1170 | static inline void free_rp_inst(struct kretprobe *rp) | 1170 | static inline void free_rp_inst(struct kretprobe *rp) |
1171 | { | 1171 | { |
@@ -1178,7 +1178,7 @@ static inline void free_rp_inst(struct kretprobe *rp) | |||
1178 | } | 1178 | } |
1179 | } | 1179 | } |
1180 | 1180 | ||
1181 | static void __kprobes cleanup_rp_inst(struct kretprobe *rp) | 1181 | static void cleanup_rp_inst(struct kretprobe *rp) |
1182 | { | 1182 | { |
1183 | unsigned long flags, hash; | 1183 | unsigned long flags, hash; |
1184 | struct kretprobe_instance *ri; | 1184 | struct kretprobe_instance *ri; |
@@ -1197,12 +1197,13 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) | |||
1197 | } | 1197 | } |
1198 | free_rp_inst(rp); | 1198 | free_rp_inst(rp); |
1199 | } | 1199 | } |
1200 | NOKPROBE_SYMBOL(cleanup_rp_inst); | ||
1200 | 1201 | ||
1201 | /* | 1202 | /* |
1202 | * Add the new probe to ap->list. Fail if this is the | 1203 | * Add the new probe to ap->list. Fail if this is the |
1203 | * second jprobe at the address - two jprobes can't coexist | 1204 | * second jprobe at the address - two jprobes can't coexist |
1204 | */ | 1205 | */ |
1205 | static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | 1206 | static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) |
1206 | { | 1207 | { |
1207 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); | 1208 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); |
1208 | 1209 | ||
@@ -1226,7 +1227,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | |||
1226 | * Fill in the required fields of the "manager kprobe". Replace the | 1227 | * Fill in the required fields of the "manager kprobe". Replace the |
1227 | * earlier kprobe in the hlist with the manager kprobe | 1228 | * earlier kprobe in the hlist with the manager kprobe |
1228 | */ | 1229 | */ |
1229 | static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | 1230 | static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) |
1230 | { | 1231 | { |
1231 | /* Copy p's insn slot to ap */ | 1232 | /* Copy p's insn slot to ap */ |
1232 | copy_kprobe(p, ap); | 1233 | copy_kprobe(p, ap); |
@@ -1252,8 +1253,7 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
1252 | * This is the second or subsequent kprobe at the address - handle | 1253 | * This is the second or subsequent kprobe at the address - handle |
1253 | * the intricacies | 1254 | * the intricacies |
1254 | */ | 1255 | */ |
1255 | static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, | 1256 | static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) |
1256 | struct kprobe *p) | ||
1257 | { | 1257 | { |
1258 | int ret = 0; | 1258 | int ret = 0; |
1259 | struct kprobe *ap = orig_p; | 1259 | struct kprobe *ap = orig_p; |
@@ -1324,25 +1324,29 @@ out: | |||
1324 | return ret; | 1324 | return ret; |
1325 | } | 1325 | } |
1326 | 1326 | ||
1327 | static int __kprobes in_kprobes_functions(unsigned long addr) | 1327 | bool __weak arch_within_kprobe_blacklist(unsigned long addr) |
1328 | { | 1328 | { |
1329 | struct kprobe_blackpoint *kb; | 1329 | /* The __kprobes marked functions and entry code must not be probed */ |
1330 | return addr >= (unsigned long)__kprobes_text_start && | ||
1331 | addr < (unsigned long)__kprobes_text_end; | ||
1332 | } | ||
1330 | 1333 | ||
1331 | if (addr >= (unsigned long)__kprobes_text_start && | 1334 | static bool within_kprobe_blacklist(unsigned long addr) |
1332 | addr < (unsigned long)__kprobes_text_end) | 1335 | { |
1333 | return -EINVAL; | 1336 | struct kprobe_blacklist_entry *ent; |
1337 | |||
1338 | if (arch_within_kprobe_blacklist(addr)) | ||
1339 | return true; | ||
1334 | /* | 1340 | /* |
1335 | * If there exists a kprobe_blacklist, verify and | 1341 | * If there exists a kprobe_blacklist, verify and |
1336 | * fail any probe registration in the prohibited area | 1342 | * fail any probe registration in the prohibited area |
1337 | */ | 1343 | */ |
1338 | for (kb = kprobe_blacklist; kb->name != NULL; kb++) { | 1344 | list_for_each_entry(ent, &kprobe_blacklist, list) { |
1339 | if (kb->start_addr) { | 1345 | if (addr >= ent->start_addr && addr < ent->end_addr) |
1340 | if (addr >= kb->start_addr && | 1346 | return true; |
1341 | addr < (kb->start_addr + kb->range)) | ||
1342 | return -EINVAL; | ||
1343 | } | ||
1344 | } | 1347 | } |
1345 | return 0; | 1348 | |
1349 | return false; | ||
1346 | } | 1350 | } |
1347 | 1351 | ||
1348 | /* | 1352 | /* |
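Note: within_kprobe_blacklist() now combines two sources: the __weak arch_within_kprobe_blacklist() hook, whose default is the .kprobes.text range check above, and the kprobe_blacklist list filled from the _kprobe_blacklist section. An architecture can widen the static part by overriding the weak hook; a hedged sketch follows (the entry-text symbols are illustrative, not taken from this patch):

#include <linux/kprobes.h>
#include <asm/sections.h>       /* __kprobes_text_*, __entry_text_* (assumed here) */

/*
 * Hypothetical architecture override of the new __weak hook: besides the
 * generic .kprobes.text window, also refuse probes anywhere in the
 * low-level entry code.
 */
bool arch_within_kprobe_blacklist(unsigned long addr)
{
        return (addr >= (unsigned long)__kprobes_text_start &&
                addr <  (unsigned long)__kprobes_text_end) ||
               (addr >= (unsigned long)__entry_text_start &&
                addr <  (unsigned long)__entry_text_end);
}
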
@@ -1351,7 +1355,7 @@ static int __kprobes in_kprobes_functions(unsigned long addr) | |||
1351 | * This returns encoded errors if it fails to look up symbol or invalid | 1355 | * This returns encoded errors if it fails to look up symbol or invalid |
1352 | * combination of parameters. | 1356 | * combination of parameters. |
1353 | */ | 1357 | */ |
1354 | static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) | 1358 | static kprobe_opcode_t *kprobe_addr(struct kprobe *p) |
1355 | { | 1359 | { |
1356 | kprobe_opcode_t *addr = p->addr; | 1360 | kprobe_opcode_t *addr = p->addr; |
1357 | 1361 | ||
@@ -1374,7 +1378,7 @@ invalid: | |||
1374 | } | 1378 | } |
1375 | 1379 | ||
1376 | /* Check passed kprobe is valid and return kprobe in kprobe_table. */ | 1380 | /* Check passed kprobe is valid and return kprobe in kprobe_table. */ |
1377 | static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) | 1381 | static struct kprobe *__get_valid_kprobe(struct kprobe *p) |
1378 | { | 1382 | { |
1379 | struct kprobe *ap, *list_p; | 1383 | struct kprobe *ap, *list_p; |
1380 | 1384 | ||
@@ -1406,8 +1410,8 @@ static inline int check_kprobe_rereg(struct kprobe *p) | |||
1406 | return ret; | 1410 | return ret; |
1407 | } | 1411 | } |
1408 | 1412 | ||
1409 | static __kprobes int check_kprobe_address_safe(struct kprobe *p, | 1413 | static int check_kprobe_address_safe(struct kprobe *p, |
1410 | struct module **probed_mod) | 1414 | struct module **probed_mod) |
1411 | { | 1415 | { |
1412 | int ret = 0; | 1416 | int ret = 0; |
1413 | unsigned long ftrace_addr; | 1417 | unsigned long ftrace_addr; |
@@ -1433,7 +1437,7 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p, | |||
1433 | 1437 | ||
1434 | /* Ensure it is not in reserved area nor out of text */ | 1438 | /* Ensure it is not in reserved area nor out of text */ |
1435 | if (!kernel_text_address((unsigned long) p->addr) || | 1439 | if (!kernel_text_address((unsigned long) p->addr) || |
1436 | in_kprobes_functions((unsigned long) p->addr) || | 1440 | within_kprobe_blacklist((unsigned long) p->addr) || |
1437 | jump_label_text_reserved(p->addr, p->addr)) { | 1441 | jump_label_text_reserved(p->addr, p->addr)) { |
1438 | ret = -EINVAL; | 1442 | ret = -EINVAL; |
1439 | goto out; | 1443 | goto out; |
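Note: the hunk above plugs the range-based blacklist into check_kprobe_address_safe(), so registering a probe on a blacklisted address now fails with -EINVAL up front. A minimal, hypothetical module sketch; the probed symbol is just an example of a function that gets NOKPROBE_SYMBOL() elsewhere in this patch:

#include <linux/module.h>
#include <linux/kprobes.h>

/* Probe target chosen from the new blacklist; illustrative only. */
static struct kprobe kp = {
        .symbol_name = "kprobe_flush_task",
};

static int __init blacklisted_probe_init(void)
{
        int ret = register_kprobe(&kp);

        /* Expected to fail with -EINVAL via check_kprobe_address_safe() */
        pr_info("register_kprobe(%s) = %d\n", kp.symbol_name, ret);
        return ret;
}
module_init(blacklisted_probe_init);
MODULE_LICENSE("GPL");
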
@@ -1469,7 +1473,7 @@ out: | |||
1469 | return ret; | 1473 | return ret; |
1470 | } | 1474 | } |
1471 | 1475 | ||
1472 | int __kprobes register_kprobe(struct kprobe *p) | 1476 | int register_kprobe(struct kprobe *p) |
1473 | { | 1477 | { |
1474 | int ret; | 1478 | int ret; |
1475 | struct kprobe *old_p; | 1479 | struct kprobe *old_p; |
@@ -1531,7 +1535,7 @@ out: | |||
1531 | EXPORT_SYMBOL_GPL(register_kprobe); | 1535 | EXPORT_SYMBOL_GPL(register_kprobe); |
1532 | 1536 | ||
1533 | /* Check if all probes on the aggrprobe are disabled */ | 1537 | /* Check if all probes on the aggrprobe are disabled */ |
1534 | static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) | 1538 | static int aggr_kprobe_disabled(struct kprobe *ap) |
1535 | { | 1539 | { |
1536 | struct kprobe *kp; | 1540 | struct kprobe *kp; |
1537 | 1541 | ||
@@ -1547,7 +1551,7 @@ static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) | |||
1547 | } | 1551 | } |
1548 | 1552 | ||
1549 | /* Disable one kprobe: Make sure called under kprobe_mutex is locked */ | 1553 | /* Disable one kprobe: Make sure called under kprobe_mutex is locked */ |
1550 | static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) | 1554 | static struct kprobe *__disable_kprobe(struct kprobe *p) |
1551 | { | 1555 | { |
1552 | struct kprobe *orig_p; | 1556 | struct kprobe *orig_p; |
1553 | 1557 | ||
@@ -1574,7 +1578,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) | |||
1574 | /* | 1578 | /* |
1575 | * Unregister a kprobe without a scheduler synchronization. | 1579 | * Unregister a kprobe without a scheduler synchronization. |
1576 | */ | 1580 | */ |
1577 | static int __kprobes __unregister_kprobe_top(struct kprobe *p) | 1581 | static int __unregister_kprobe_top(struct kprobe *p) |
1578 | { | 1582 | { |
1579 | struct kprobe *ap, *list_p; | 1583 | struct kprobe *ap, *list_p; |
1580 | 1584 | ||
@@ -1631,7 +1635,7 @@ disarmed: | |||
1631 | return 0; | 1635 | return 0; |
1632 | } | 1636 | } |
1633 | 1637 | ||
1634 | static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) | 1638 | static void __unregister_kprobe_bottom(struct kprobe *p) |
1635 | { | 1639 | { |
1636 | struct kprobe *ap; | 1640 | struct kprobe *ap; |
1637 | 1641 | ||
@@ -1647,7 +1651,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) | |||
1647 | /* Otherwise, do nothing. */ | 1651 | /* Otherwise, do nothing. */ |
1648 | } | 1652 | } |
1649 | 1653 | ||
1650 | int __kprobes register_kprobes(struct kprobe **kps, int num) | 1654 | int register_kprobes(struct kprobe **kps, int num) |
1651 | { | 1655 | { |
1652 | int i, ret = 0; | 1656 | int i, ret = 0; |
1653 | 1657 | ||
@@ -1665,13 +1669,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num) | |||
1665 | } | 1669 | } |
1666 | EXPORT_SYMBOL_GPL(register_kprobes); | 1670 | EXPORT_SYMBOL_GPL(register_kprobes); |
1667 | 1671 | ||
1668 | void __kprobes unregister_kprobe(struct kprobe *p) | 1672 | void unregister_kprobe(struct kprobe *p) |
1669 | { | 1673 | { |
1670 | unregister_kprobes(&p, 1); | 1674 | unregister_kprobes(&p, 1); |
1671 | } | 1675 | } |
1672 | EXPORT_SYMBOL_GPL(unregister_kprobe); | 1676 | EXPORT_SYMBOL_GPL(unregister_kprobe); |
1673 | 1677 | ||
1674 | void __kprobes unregister_kprobes(struct kprobe **kps, int num) | 1678 | void unregister_kprobes(struct kprobe **kps, int num) |
1675 | { | 1679 | { |
1676 | int i; | 1680 | int i; |
1677 | 1681 | ||
@@ -1700,7 +1704,7 @@ unsigned long __weak arch_deref_entry_point(void *entry) | |||
1700 | return (unsigned long)entry; | 1704 | return (unsigned long)entry; |
1701 | } | 1705 | } |
1702 | 1706 | ||
1703 | int __kprobes register_jprobes(struct jprobe **jps, int num) | 1707 | int register_jprobes(struct jprobe **jps, int num) |
1704 | { | 1708 | { |
1705 | struct jprobe *jp; | 1709 | struct jprobe *jp; |
1706 | int ret = 0, i; | 1710 | int ret = 0, i; |
@@ -1731,19 +1735,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num) | |||
1731 | } | 1735 | } |
1732 | EXPORT_SYMBOL_GPL(register_jprobes); | 1736 | EXPORT_SYMBOL_GPL(register_jprobes); |
1733 | 1737 | ||
1734 | int __kprobes register_jprobe(struct jprobe *jp) | 1738 | int register_jprobe(struct jprobe *jp) |
1735 | { | 1739 | { |
1736 | return register_jprobes(&jp, 1); | 1740 | return register_jprobes(&jp, 1); |
1737 | } | 1741 | } |
1738 | EXPORT_SYMBOL_GPL(register_jprobe); | 1742 | EXPORT_SYMBOL_GPL(register_jprobe); |
1739 | 1743 | ||
1740 | void __kprobes unregister_jprobe(struct jprobe *jp) | 1744 | void unregister_jprobe(struct jprobe *jp) |
1741 | { | 1745 | { |
1742 | unregister_jprobes(&jp, 1); | 1746 | unregister_jprobes(&jp, 1); |
1743 | } | 1747 | } |
1744 | EXPORT_SYMBOL_GPL(unregister_jprobe); | 1748 | EXPORT_SYMBOL_GPL(unregister_jprobe); |
1745 | 1749 | ||
1746 | void __kprobes unregister_jprobes(struct jprobe **jps, int num) | 1750 | void unregister_jprobes(struct jprobe **jps, int num) |
1747 | { | 1751 | { |
1748 | int i; | 1752 | int i; |
1749 | 1753 | ||
@@ -1768,8 +1772,7 @@ EXPORT_SYMBOL_GPL(unregister_jprobes); | |||
1768 | * This kprobe pre_handler is registered with every kretprobe. When probe | 1772 | * This kprobe pre_handler is registered with every kretprobe. When probe |
1769 | * hits it will set up the return probe. | 1773 | * hits it will set up the return probe. |
1770 | */ | 1774 | */ |
1771 | static int __kprobes pre_handler_kretprobe(struct kprobe *p, | 1775 | static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) |
1772 | struct pt_regs *regs) | ||
1773 | { | 1776 | { |
1774 | struct kretprobe *rp = container_of(p, struct kretprobe, kp); | 1777 | struct kretprobe *rp = container_of(p, struct kretprobe, kp); |
1775 | unsigned long hash, flags = 0; | 1778 | unsigned long hash, flags = 0; |
@@ -1807,8 +1810,9 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, | |||
1807 | } | 1810 | } |
1808 | return 0; | 1811 | return 0; |
1809 | } | 1812 | } |
1813 | NOKPROBE_SYMBOL(pre_handler_kretprobe); | ||
1810 | 1814 | ||
1811 | int __kprobes register_kretprobe(struct kretprobe *rp) | 1815 | int register_kretprobe(struct kretprobe *rp) |
1812 | { | 1816 | { |
1813 | int ret = 0; | 1817 | int ret = 0; |
1814 | struct kretprobe_instance *inst; | 1818 | struct kretprobe_instance *inst; |
@@ -1861,7 +1865,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) | |||
1861 | } | 1865 | } |
1862 | EXPORT_SYMBOL_GPL(register_kretprobe); | 1866 | EXPORT_SYMBOL_GPL(register_kretprobe); |
1863 | 1867 | ||
1864 | int __kprobes register_kretprobes(struct kretprobe **rps, int num) | 1868 | int register_kretprobes(struct kretprobe **rps, int num) |
1865 | { | 1869 | { |
1866 | int ret = 0, i; | 1870 | int ret = 0, i; |
1867 | 1871 | ||
@@ -1879,13 +1883,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num) | |||
1879 | } | 1883 | } |
1880 | EXPORT_SYMBOL_GPL(register_kretprobes); | 1884 | EXPORT_SYMBOL_GPL(register_kretprobes); |
1881 | 1885 | ||
1882 | void __kprobes unregister_kretprobe(struct kretprobe *rp) | 1886 | void unregister_kretprobe(struct kretprobe *rp) |
1883 | { | 1887 | { |
1884 | unregister_kretprobes(&rp, 1); | 1888 | unregister_kretprobes(&rp, 1); |
1885 | } | 1889 | } |
1886 | EXPORT_SYMBOL_GPL(unregister_kretprobe); | 1890 | EXPORT_SYMBOL_GPL(unregister_kretprobe); |
1887 | 1891 | ||
1888 | void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) | 1892 | void unregister_kretprobes(struct kretprobe **rps, int num) |
1889 | { | 1893 | { |
1890 | int i; | 1894 | int i; |
1891 | 1895 | ||
@@ -1908,38 +1912,38 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) | |||
1908 | EXPORT_SYMBOL_GPL(unregister_kretprobes); | 1912 | EXPORT_SYMBOL_GPL(unregister_kretprobes); |
1909 | 1913 | ||
1910 | #else /* CONFIG_KRETPROBES */ | 1914 | #else /* CONFIG_KRETPROBES */ |
1911 | int __kprobes register_kretprobe(struct kretprobe *rp) | 1915 | int register_kretprobe(struct kretprobe *rp) |
1912 | { | 1916 | { |
1913 | return -ENOSYS; | 1917 | return -ENOSYS; |
1914 | } | 1918 | } |
1915 | EXPORT_SYMBOL_GPL(register_kretprobe); | 1919 | EXPORT_SYMBOL_GPL(register_kretprobe); |
1916 | 1920 | ||
1917 | int __kprobes register_kretprobes(struct kretprobe **rps, int num) | 1921 | int register_kretprobes(struct kretprobe **rps, int num) |
1918 | { | 1922 | { |
1919 | return -ENOSYS; | 1923 | return -ENOSYS; |
1920 | } | 1924 | } |
1921 | EXPORT_SYMBOL_GPL(register_kretprobes); | 1925 | EXPORT_SYMBOL_GPL(register_kretprobes); |
1922 | 1926 | ||
1923 | void __kprobes unregister_kretprobe(struct kretprobe *rp) | 1927 | void unregister_kretprobe(struct kretprobe *rp) |
1924 | { | 1928 | { |
1925 | } | 1929 | } |
1926 | EXPORT_SYMBOL_GPL(unregister_kretprobe); | 1930 | EXPORT_SYMBOL_GPL(unregister_kretprobe); |
1927 | 1931 | ||
1928 | void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) | 1932 | void unregister_kretprobes(struct kretprobe **rps, int num) |
1929 | { | 1933 | { |
1930 | } | 1934 | } |
1931 | EXPORT_SYMBOL_GPL(unregister_kretprobes); | 1935 | EXPORT_SYMBOL_GPL(unregister_kretprobes); |
1932 | 1936 | ||
1933 | static int __kprobes pre_handler_kretprobe(struct kprobe *p, | 1937 | static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) |
1934 | struct pt_regs *regs) | ||
1935 | { | 1938 | { |
1936 | return 0; | 1939 | return 0; |
1937 | } | 1940 | } |
1941 | NOKPROBE_SYMBOL(pre_handler_kretprobe); | ||
1938 | 1942 | ||
1939 | #endif /* CONFIG_KRETPROBES */ | 1943 | #endif /* CONFIG_KRETPROBES */ |
1940 | 1944 | ||
1941 | /* Set the kprobe gone and remove its instruction buffer. */ | 1945 | /* Set the kprobe gone and remove its instruction buffer. */ |
1942 | static void __kprobes kill_kprobe(struct kprobe *p) | 1946 | static void kill_kprobe(struct kprobe *p) |
1943 | { | 1947 | { |
1944 | struct kprobe *kp; | 1948 | struct kprobe *kp; |
1945 | 1949 | ||
@@ -1963,7 +1967,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) | |||
1963 | } | 1967 | } |
1964 | 1968 | ||
1965 | /* Disable one kprobe */ | 1969 | /* Disable one kprobe */ |
1966 | int __kprobes disable_kprobe(struct kprobe *kp) | 1970 | int disable_kprobe(struct kprobe *kp) |
1967 | { | 1971 | { |
1968 | int ret = 0; | 1972 | int ret = 0; |
1969 | 1973 | ||
@@ -1979,7 +1983,7 @@ int __kprobes disable_kprobe(struct kprobe *kp) | |||
1979 | EXPORT_SYMBOL_GPL(disable_kprobe); | 1983 | EXPORT_SYMBOL_GPL(disable_kprobe); |
1980 | 1984 | ||
1981 | /* Enable one kprobe */ | 1985 | /* Enable one kprobe */ |
1982 | int __kprobes enable_kprobe(struct kprobe *kp) | 1986 | int enable_kprobe(struct kprobe *kp) |
1983 | { | 1987 | { |
1984 | int ret = 0; | 1988 | int ret = 0; |
1985 | struct kprobe *p; | 1989 | struct kprobe *p; |
@@ -2012,16 +2016,49 @@ out: | |||
2012 | } | 2016 | } |
2013 | EXPORT_SYMBOL_GPL(enable_kprobe); | 2017 | EXPORT_SYMBOL_GPL(enable_kprobe); |
2014 | 2018 | ||
2015 | void __kprobes dump_kprobe(struct kprobe *kp) | 2019 | void dump_kprobe(struct kprobe *kp) |
2016 | { | 2020 | { |
2017 | printk(KERN_WARNING "Dumping kprobe:\n"); | 2021 | printk(KERN_WARNING "Dumping kprobe:\n"); |
2018 | printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", | 2022 | printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", |
2019 | kp->symbol_name, kp->addr, kp->offset); | 2023 | kp->symbol_name, kp->addr, kp->offset); |
2020 | } | 2024 | } |
2025 | NOKPROBE_SYMBOL(dump_kprobe); | ||
2026 | |||
2027 | /* | ||
2028 | * Lookup and populate the kprobe_blacklist. | ||
2029 | * | ||
2030 | * Unlike the kretprobe blacklist, we'll need to determine | ||
2031 | * the range of addresses that belong to the said functions, | ||
2032 | * since a kprobe need not necessarily be at the beginning | ||
2033 | * of a function. | ||
2034 | */ | ||
2035 | static int __init populate_kprobe_blacklist(unsigned long *start, | ||
2036 | unsigned long *end) | ||
2037 | { | ||
2038 | unsigned long *iter; | ||
2039 | struct kprobe_blacklist_entry *ent; | ||
2040 | unsigned long offset = 0, size = 0; | ||
2041 | |||
2042 | for (iter = start; iter < end; iter++) { | ||
2043 | if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) { | ||
2044 | pr_err("Failed to find blacklist %p\n", (void *)*iter); | ||
2045 | continue; | ||
2046 | } | ||
2047 | |||
2048 | ent = kmalloc(sizeof(*ent), GFP_KERNEL); | ||
2049 | if (!ent) | ||
2050 | return -ENOMEM; | ||
2051 | ent->start_addr = *iter; | ||
2052 | ent->end_addr = *iter + size; | ||
2053 | INIT_LIST_HEAD(&ent->list); | ||
2054 | list_add_tail(&ent->list, &kprobe_blacklist); | ||
2055 | } | ||
2056 | return 0; | ||
2057 | } | ||
2021 | 2058 | ||
2022 | /* Module notifier call back, checking kprobes on the module */ | 2059 | /* Module notifier call back, checking kprobes on the module */ |
2023 | static int __kprobes kprobes_module_callback(struct notifier_block *nb, | 2060 | static int kprobes_module_callback(struct notifier_block *nb, |
2024 | unsigned long val, void *data) | 2061 | unsigned long val, void *data) |
2025 | { | 2062 | { |
2026 | struct module *mod = data; | 2063 | struct module *mod = data; |
2027 | struct hlist_head *head; | 2064 | struct hlist_head *head; |
@@ -2062,14 +2099,13 @@ static struct notifier_block kprobe_module_nb = { | |||
2062 | .priority = 0 | 2099 | .priority = 0 |
2063 | }; | 2100 | }; |
2064 | 2101 | ||
2102 | /* Markers of _kprobe_blacklist section */ | ||
2103 | extern unsigned long __start_kprobe_blacklist[]; | ||
2104 | extern unsigned long __stop_kprobe_blacklist[]; | ||
2105 | |||
2065 | static int __init init_kprobes(void) | 2106 | static int __init init_kprobes(void) |
2066 | { | 2107 | { |
2067 | int i, err = 0; | 2108 | int i, err = 0; |
2068 | unsigned long offset = 0, size = 0; | ||
2069 | char *modname, namebuf[KSYM_NAME_LEN]; | ||
2070 | const char *symbol_name; | ||
2071 | void *addr; | ||
2072 | struct kprobe_blackpoint *kb; | ||
2073 | 2109 | ||
2074 | /* FIXME allocate the probe table, currently defined statically */ | 2110 | /* FIXME allocate the probe table, currently defined statically */ |
2075 | /* initialize all list heads */ | 2111 | /* initialize all list heads */ |
@@ -2079,26 +2115,11 @@ static int __init init_kprobes(void) | |||
2079 | raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); | 2115 | raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); |
2080 | } | 2116 | } |
2081 | 2117 | ||
2082 | /* | 2118 | err = populate_kprobe_blacklist(__start_kprobe_blacklist, |
2083 | * Lookup and populate the kprobe_blacklist. | 2119 | __stop_kprobe_blacklist); |
2084 | * | 2120 | if (err) { |
2085 | * Unlike the kretprobe blacklist, we'll need to determine | 2121 | pr_err("kprobes: failed to populate blacklist: %d\n", err); |
2086 | * the range of addresses that belong to the said functions, | 2122 | pr_err("Please take care of using kprobes.\n"); |
2087 | * since a kprobe need not necessarily be at the beginning | ||
2088 | * of a function. | ||
2089 | */ | ||
2090 | for (kb = kprobe_blacklist; kb->name != NULL; kb++) { | ||
2091 | kprobe_lookup_name(kb->name, addr); | ||
2092 | if (!addr) | ||
2093 | continue; | ||
2094 | |||
2095 | kb->start_addr = (unsigned long)addr; | ||
2096 | symbol_name = kallsyms_lookup(kb->start_addr, | ||
2097 | &size, &offset, &modname, namebuf); | ||
2098 | if (!symbol_name) | ||
2099 | kb->range = 0; | ||
2100 | else | ||
2101 | kb->range = size; | ||
2102 | } | 2123 | } |
2103 | 2124 | ||
2104 | if (kretprobe_blacklist_size) { | 2125 | if (kretprobe_blacklist_size) { |
@@ -2138,7 +2159,7 @@ static int __init init_kprobes(void) | |||
2138 | } | 2159 | } |
2139 | 2160 | ||
2140 | #ifdef CONFIG_DEBUG_FS | 2161 | #ifdef CONFIG_DEBUG_FS |
2141 | static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, | 2162 | static void report_probe(struct seq_file *pi, struct kprobe *p, |
2142 | const char *sym, int offset, char *modname, struct kprobe *pp) | 2163 | const char *sym, int offset, char *modname, struct kprobe *pp) |
2143 | { | 2164 | { |
2144 | char *kprobe_type; | 2165 | char *kprobe_type; |
@@ -2167,12 +2188,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, | |||
2167 | (kprobe_ftrace(pp) ? "[FTRACE]" : "")); | 2188 | (kprobe_ftrace(pp) ? "[FTRACE]" : "")); |
2168 | } | 2189 | } |
2169 | 2190 | ||
2170 | static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) | 2191 | static void *kprobe_seq_start(struct seq_file *f, loff_t *pos) |
2171 | { | 2192 | { |
2172 | return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL; | 2193 | return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL; |
2173 | } | 2194 | } |
2174 | 2195 | ||
2175 | static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) | 2196 | static void *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) |
2176 | { | 2197 | { |
2177 | (*pos)++; | 2198 | (*pos)++; |
2178 | if (*pos >= KPROBE_TABLE_SIZE) | 2199 | if (*pos >= KPROBE_TABLE_SIZE) |
@@ -2180,12 +2201,12 @@ static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) | |||
2180 | return pos; | 2201 | return pos; |
2181 | } | 2202 | } |
2182 | 2203 | ||
2183 | static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v) | 2204 | static void kprobe_seq_stop(struct seq_file *f, void *v) |
2184 | { | 2205 | { |
2185 | /* Nothing to do */ | 2206 | /* Nothing to do */ |
2186 | } | 2207 | } |
2187 | 2208 | ||
2188 | static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) | 2209 | static int show_kprobe_addr(struct seq_file *pi, void *v) |
2189 | { | 2210 | { |
2190 | struct hlist_head *head; | 2211 | struct hlist_head *head; |
2191 | struct kprobe *p, *kp; | 2212 | struct kprobe *p, *kp; |
@@ -2216,7 +2237,7 @@ static const struct seq_operations kprobes_seq_ops = { | |||
2216 | .show = show_kprobe_addr | 2237 | .show = show_kprobe_addr |
2217 | }; | 2238 | }; |
2218 | 2239 | ||
2219 | static int __kprobes kprobes_open(struct inode *inode, struct file *filp) | 2240 | static int kprobes_open(struct inode *inode, struct file *filp) |
2220 | { | 2241 | { |
2221 | return seq_open(filp, &kprobes_seq_ops); | 2242 | return seq_open(filp, &kprobes_seq_ops); |
2222 | } | 2243 | } |
@@ -2228,7 +2249,47 @@ static const struct file_operations debugfs_kprobes_operations = { | |||
2228 | .release = seq_release, | 2249 | .release = seq_release, |
2229 | }; | 2250 | }; |
2230 | 2251 | ||
2231 | static void __kprobes arm_all_kprobes(void) | 2252 | /* kprobes/blacklist -- shows which functions can not be probed */ |
2253 | static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos) | ||
2254 | { | ||
2255 | return seq_list_start(&kprobe_blacklist, *pos); | ||
2256 | } | ||
2257 | |||
2258 | static void *kprobe_blacklist_seq_next(struct seq_file *m, void *v, loff_t *pos) | ||
2259 | { | ||
2260 | return seq_list_next(v, &kprobe_blacklist, pos); | ||
2261 | } | ||
2262 | |||
2263 | static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) | ||
2264 | { | ||
2265 | struct kprobe_blacklist_entry *ent = | ||
2266 | list_entry(v, struct kprobe_blacklist_entry, list); | ||
2267 | |||
2268 | seq_printf(m, "0x%p-0x%p\t%ps\n", (void *)ent->start_addr, | ||
2269 | (void *)ent->end_addr, (void *)ent->start_addr); | ||
2270 | return 0; | ||
2271 | } | ||
2272 | |||
2273 | static const struct seq_operations kprobe_blacklist_seq_ops = { | ||
2274 | .start = kprobe_blacklist_seq_start, | ||
2275 | .next = kprobe_blacklist_seq_next, | ||
2276 | .stop = kprobe_seq_stop, /* Reuse void function */ | ||
2277 | .show = kprobe_blacklist_seq_show, | ||
2278 | }; | ||
2279 | |||
2280 | static int kprobe_blacklist_open(struct inode *inode, struct file *filp) | ||
2281 | { | ||
2282 | return seq_open(filp, &kprobe_blacklist_seq_ops); | ||
2283 | } | ||
2284 | |||
2285 | static const struct file_operations debugfs_kprobe_blacklist_ops = { | ||
2286 | .open = kprobe_blacklist_open, | ||
2287 | .read = seq_read, | ||
2288 | .llseek = seq_lseek, | ||
2289 | .release = seq_release, | ||
2290 | }; | ||
2291 | |||
2292 | static void arm_all_kprobes(void) | ||
2232 | { | 2293 | { |
2233 | struct hlist_head *head; | 2294 | struct hlist_head *head; |
2234 | struct kprobe *p; | 2295 | struct kprobe *p; |
@@ -2256,7 +2317,7 @@ already_enabled: | |||
2256 | return; | 2317 | return; |
2257 | } | 2318 | } |
2258 | 2319 | ||
2259 | static void __kprobes disarm_all_kprobes(void) | 2320 | static void disarm_all_kprobes(void) |
2260 | { | 2321 | { |
2261 | struct hlist_head *head; | 2322 | struct hlist_head *head; |
2262 | struct kprobe *p; | 2323 | struct kprobe *p; |
@@ -2340,7 +2401,7 @@ static const struct file_operations fops_kp = { | |||
2340 | .llseek = default_llseek, | 2401 | .llseek = default_llseek, |
2341 | }; | 2402 | }; |
2342 | 2403 | ||
2343 | static int __kprobes debugfs_kprobe_init(void) | 2404 | static int __init debugfs_kprobe_init(void) |
2344 | { | 2405 | { |
2345 | struct dentry *dir, *file; | 2406 | struct dentry *dir, *file; |
2346 | unsigned int value = 1; | 2407 | unsigned int value = 1; |
@@ -2351,19 +2412,24 @@ static int __kprobes debugfs_kprobe_init(void) | |||
2351 | 2412 | ||
2352 | file = debugfs_create_file("list", 0444, dir, NULL, | 2413 | file = debugfs_create_file("list", 0444, dir, NULL, |
2353 | &debugfs_kprobes_operations); | 2414 | &debugfs_kprobes_operations); |
2354 | if (!file) { | 2415 | if (!file) |
2355 | debugfs_remove(dir); | 2416 | goto error; |
2356 | return -ENOMEM; | ||
2357 | } | ||
2358 | 2417 | ||
2359 | file = debugfs_create_file("enabled", 0600, dir, | 2418 | file = debugfs_create_file("enabled", 0600, dir, |
2360 | &value, &fops_kp); | 2419 | &value, &fops_kp); |
2361 | if (!file) { | 2420 | if (!file) |
2362 | debugfs_remove(dir); | 2421 | goto error; |
2363 | return -ENOMEM; | 2422 | |
2364 | } | 2423 | file = debugfs_create_file("blacklist", 0444, dir, NULL, |
2424 | &debugfs_kprobe_blacklist_ops); | ||
2425 | if (!file) | ||
2426 | goto error; | ||
2365 | 2427 | ||
2366 | return 0; | 2428 | return 0; |
2429 | |||
2430 | error: | ||
2431 | debugfs_remove(dir); | ||
2432 | return -ENOMEM; | ||
2367 | } | 2433 | } |
2368 | 2434 | ||
2369 | late_initcall(debugfs_kprobe_init); | 2435 | late_initcall(debugfs_kprobe_init); |
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 2495a9b14ac8..6683ccef9fff 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -37,6 +37,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj, | |||
37 | } | 37 | } |
38 | KERNEL_ATTR_RO(uevent_seqnum); | 38 | KERNEL_ATTR_RO(uevent_seqnum); |
39 | 39 | ||
40 | #ifdef CONFIG_UEVENT_HELPER | ||
40 | /* uevent helper program, used during early boot */ | 41 | /* uevent helper program, used during early boot */ |
41 | static ssize_t uevent_helper_show(struct kobject *kobj, | 42 | static ssize_t uevent_helper_show(struct kobject *kobj, |
42 | struct kobj_attribute *attr, char *buf) | 43 | struct kobj_attribute *attr, char *buf) |
@@ -56,7 +57,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj, | |||
56 | return count; | 57 | return count; |
57 | } | 58 | } |
58 | KERNEL_ATTR_RW(uevent_helper); | 59 | KERNEL_ATTR_RW(uevent_helper); |
59 | 60 | #endif | |
60 | 61 | ||
61 | #ifdef CONFIG_PROFILING | 62 | #ifdef CONFIG_PROFILING |
62 | static ssize_t profiling_show(struct kobject *kobj, | 63 | static ssize_t profiling_show(struct kobject *kobj, |
@@ -189,7 +190,9 @@ EXPORT_SYMBOL_GPL(kernel_kobj); | |||
189 | static struct attribute * kernel_attrs[] = { | 190 | static struct attribute * kernel_attrs[] = { |
190 | &fscaps_attr.attr, | 191 | &fscaps_attr.attr, |
191 | &uevent_seqnum_attr.attr, | 192 | &uevent_seqnum_attr.attr, |
193 | #ifdef CONFIG_UEVENT_HELPER | ||
192 | &uevent_helper_attr.attr, | 194 | &uevent_helper_attr.attr, |
195 | #endif | ||
193 | #ifdef CONFIG_PROFILING | 196 | #ifdef CONFIG_PROFILING |
194 | &profiling_attr.attr, | 197 | &profiling_attr.attr, |
195 | #endif | 198 | #endif |
diff --git a/kernel/kthread.c b/kernel/kthread.c index 9a130ec06f7a..c2390f41307b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -262,7 +262,7 @@ static void create_kthread(struct kthread_create_info *create) | |||
262 | * kthread_stop() has been called). The return value should be zero | 262 | * kthread_stop() has been called). The return value should be zero |
263 | * or a negative error number; it will be passed to kthread_stop(). | 263 | * or a negative error number; it will be passed to kthread_stop(). |
264 | * | 264 | * |
265 | * Returns a task_struct or ERR_PTR(-ENOMEM). | 265 | * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). |
266 | */ | 266 | */ |
267 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | 267 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), |
268 | void *data, int node, | 268 | void *data, int node, |
@@ -298,7 +298,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | |||
298 | * that thread. | 298 | * that thread. |
299 | */ | 299 | */ |
300 | if (xchg(&create->done, NULL)) | 300 | if (xchg(&create->done, NULL)) |
301 | return ERR_PTR(-ENOMEM); | 301 | return ERR_PTR(-EINTR); |
302 | /* | 302 | /* |
303 | * kthreadd (or new kernel thread) will call complete() | 303 | * kthreadd (or new kernel thread) will call complete() |
304 | * shortly. | 304 | * shortly. |
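Note: with the kthread change above, an interrupted creation returns ERR_PTR(-EINTR) instead of a misleading ERR_PTR(-ENOMEM); either way callers should bail out on IS_ERR(). A short usage sketch (thread function and name are illustrative):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>
#include <linux/numa.h>

static int my_thread_fn(void *data)
{
        while (!kthread_should_stop())
                schedule_timeout_interruptible(HZ);
        return 0;
}

static int start_worker(void)
{
        struct task_struct *tsk;

        tsk = kthread_create_on_node(my_thread_fn, NULL, NUMA_NO_NODE,
                                     "my-worker");
        if (IS_ERR(tsk))                /* -ENOMEM, or now -EINTR */
                return PTR_ERR(tsk);

        wake_up_process(tsk);
        return 0;
}
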
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index a462b317f9a0..a02812743a7e 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c | |||
@@ -88,7 +88,8 @@ static void clear_global_latency_tracing(void) | |||
88 | } | 88 | } |
89 | 89 | ||
90 | static void __sched | 90 | static void __sched |
91 | account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) | 91 | account_global_scheduler_latency(struct task_struct *tsk, |
92 | struct latency_record *lat) | ||
92 | { | 93 | { |
93 | int firstnonnull = MAXLR + 1; | 94 | int firstnonnull = MAXLR + 1; |
94 | int i; | 95 | int i; |
@@ -255,7 +256,7 @@ static int lstats_show(struct seq_file *m, void *v) | |||
255 | break; | 256 | break; |
256 | seq_printf(m, " %ps", (void *)bt); | 257 | seq_printf(m, " %ps", (void *)bt); |
257 | } | 258 | } |
258 | seq_printf(m, "\n"); | 259 | seq_puts(m, "\n"); |
259 | } | 260 | } |
260 | } | 261 | } |
261 | return 0; | 262 | return 0; |
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index b8bdcd4785b7..8541bfdfd232 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile | |||
@@ -24,4 +24,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o | |||
24 | obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o | 24 | obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o |
25 | obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o | 25 | obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o |
26 | obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o | 26 | obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o |
27 | obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o | ||
27 | obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o | 28 | obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o |
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b0e9467922e1..d24e4339b46d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
@@ -4188,7 +4188,7 @@ void debug_show_held_locks(struct task_struct *task) | |||
4188 | } | 4188 | } |
4189 | EXPORT_SYMBOL_GPL(debug_show_held_locks); | 4189 | EXPORT_SYMBOL_GPL(debug_show_held_locks); |
4190 | 4190 | ||
4191 | asmlinkage void lockdep_sys_exit(void) | 4191 | asmlinkage __visible void lockdep_sys_exit(void) |
4192 | { | 4192 | { |
4193 | struct task_struct *curr = current; | 4193 | struct task_struct *curr = current; |
4194 | 4194 | ||
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 4f560cfedc8f..51c4b24b6328 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h | |||
@@ -54,9 +54,9 @@ enum { | |||
54 | * table (if it's not there yet), and we check it for lock order | 54 | * table (if it's not there yet), and we check it for lock order |
55 | * conflicts and deadlocks. | 55 | * conflicts and deadlocks. |
56 | */ | 56 | */ |
57 | #define MAX_LOCKDEP_ENTRIES 16384UL | 57 | #define MAX_LOCKDEP_ENTRIES 32768UL |
58 | 58 | ||
59 | #define MAX_LOCKDEP_CHAINS_BITS 15 | 59 | #define MAX_LOCKDEP_CHAINS_BITS 16 |
60 | #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) | 60 | #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) |
61 | 61 | ||
62 | #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) | 62 | #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) |
@@ -65,7 +65,7 @@ enum { | |||
65 | * Stack-trace: tightly packed array of stack backtrace | 65 | * Stack-trace: tightly packed array of stack backtrace |
66 | * addresses. Protected by the hash_lock. | 66 | * addresses. Protected by the hash_lock. |
67 | */ | 67 | */ |
68 | #define MAX_STACK_TRACE_ENTRIES 262144UL | 68 | #define MAX_STACK_TRACE_ENTRIES 524288UL |
69 | 69 | ||
70 | extern struct list_head all_lock_classes; | 70 | extern struct list_head all_lock_classes; |
71 | extern struct lock_chain lock_chains[]; | 71 | extern struct lock_chain lock_chains[]; |
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f26b1a18e34e..0955b885d0dc 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c | |||
@@ -82,14 +82,14 @@ struct lock_writer_stress_stats { | |||
82 | }; | 82 | }; |
83 | static struct lock_writer_stress_stats *lwsa; | 83 | static struct lock_writer_stress_stats *lwsa; |
84 | 84 | ||
85 | #if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE) | 85 | #if defined(MODULE) |
86 | #define LOCKTORTURE_RUNNABLE_INIT 1 | 86 | #define LOCKTORTURE_RUNNABLE_INIT 1 |
87 | #else | 87 | #else |
88 | #define LOCKTORTURE_RUNNABLE_INIT 0 | 88 | #define LOCKTORTURE_RUNNABLE_INIT 0 |
89 | #endif | 89 | #endif |
90 | int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; | 90 | int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; |
91 | module_param(locktorture_runnable, int, 0444); | 91 | module_param(locktorture_runnable, int, 0444); |
92 | MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot"); | 92 | MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init"); |
93 | 93 | ||
94 | /* Forward reference. */ | 94 | /* Forward reference. */ |
95 | static void lock_torture_cleanup(void); | 95 | static void lock_torture_cleanup(void); |
@@ -216,10 +216,11 @@ static int lock_torture_writer(void *arg) | |||
216 | static DEFINE_TORTURE_RANDOM(rand); | 216 | static DEFINE_TORTURE_RANDOM(rand); |
217 | 217 | ||
218 | VERBOSE_TOROUT_STRING("lock_torture_writer task started"); | 218 | VERBOSE_TOROUT_STRING("lock_torture_writer task started"); |
219 | set_user_nice(current, 19); | 219 | set_user_nice(current, MAX_NICE); |
220 | 220 | ||
221 | do { | 221 | do { |
222 | schedule_timeout_uninterruptible(1); | 222 | if ((torture_random(&rand) & 0xfffff) == 0) |
223 | schedule_timeout_uninterruptible(1); | ||
223 | cur_ops->writelock(); | 224 | cur_ops->writelock(); |
224 | if (WARN_ON_ONCE(lock_is_write_held)) | 225 | if (WARN_ON_ONCE(lock_is_write_held)) |
225 | lwsp->n_write_lock_fail++; | 226 | lwsp->n_write_lock_fail++; |
@@ -354,7 +355,8 @@ static int __init lock_torture_init(void) | |||
354 | &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, | 355 | &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, |
355 | }; | 356 | }; |
356 | 357 | ||
357 | torture_init_begin(torture_type, verbose, &locktorture_runnable); | 358 | if (!torture_init_begin(torture_type, verbose, &locktorture_runnable)) |
359 | return -EBUSY; | ||
358 | 360 | ||
359 | /* Process args and tell the world that the torturer is on the job. */ | 361 | /* Process args and tell the world that the torturer is on the job. */ |
360 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { | 362 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { |
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c new file mode 100644 index 000000000000..fb5b8ac411a5 --- /dev/null +++ b/kernel/locking/qrwlock.c | |||
@@ -0,0 +1,133 @@ | |||
1 | /* | ||
2 | * Queue read/write lock | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. | ||
15 | * | ||
16 | * Authors: Waiman Long <waiman.long@hp.com> | ||
17 | */ | ||
18 | #include <linux/smp.h> | ||
19 | #include <linux/bug.h> | ||
20 | #include <linux/cpumask.h> | ||
21 | #include <linux/percpu.h> | ||
22 | #include <linux/hardirq.h> | ||
23 | #include <linux/mutex.h> | ||
24 | #include <asm/qrwlock.h> | ||
25 | |||
26 | /** | ||
27 | * rspin_until_writer_unlock - inc reader count & spin until writer is gone | ||
28 | * @lock : Pointer to queue rwlock structure | ||
29 | * @writer: Current queue rwlock writer status byte | ||
30 | * | ||
31 | * In interrupt context or at the head of the queue, the reader will just | ||
32 | * increment the reader count & wait until the writer releases the lock. | ||
33 | */ | ||
34 | static __always_inline void | ||
35 | rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) | ||
36 | { | ||
37 | while ((cnts & _QW_WMASK) == _QW_LOCKED) { | ||
38 | arch_mutex_cpu_relax(); | ||
39 | cnts = smp_load_acquire((u32 *)&lock->cnts); | ||
40 | } | ||
41 | } | ||
42 | |||
43 | /** | ||
44 | * queue_read_lock_slowpath - acquire read lock of a queue rwlock | ||
45 | * @lock: Pointer to queue rwlock structure | ||
46 | */ | ||
47 | void queue_read_lock_slowpath(struct qrwlock *lock) | ||
48 | { | ||
49 | u32 cnts; | ||
50 | |||
51 | /* | ||
52 | * Readers come here when they cannot get the lock without waiting | ||
53 | */ | ||
54 | if (unlikely(in_interrupt())) { | ||
55 | /* | ||
56 | * Readers in interrupt context will spin until the lock is | ||
57 | * available without waiting in the queue. | ||
58 | */ | ||
59 | cnts = smp_load_acquire((u32 *)&lock->cnts); | ||
60 | rspin_until_writer_unlock(lock, cnts); | ||
61 | return; | ||
62 | } | ||
63 | atomic_sub(_QR_BIAS, &lock->cnts); | ||
64 | |||
65 | /* | ||
66 | * Put the reader into the wait queue | ||
67 | */ | ||
68 | arch_spin_lock(&lock->lock); | ||
69 | |||
70 | /* | ||
71 | * At the head of the wait queue now, wait until the writer state | ||
72 | * goes to 0 and then try to increment the reader count and get | ||
73 | * the lock. It is possible that an incoming writer may steal the | ||
74 | * lock in the interim, so it is necessary to check the writer byte | ||
75 | * to make sure that the write lock isn't taken. | ||
76 | */ | ||
77 | while (atomic_read(&lock->cnts) & _QW_WMASK) | ||
78 | arch_mutex_cpu_relax(); | ||
79 | |||
80 | cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; | ||
81 | rspin_until_writer_unlock(lock, cnts); | ||
82 | |||
83 | /* | ||
84 | * Signal the next one in queue to become queue head | ||
85 | */ | ||
86 | arch_spin_unlock(&lock->lock); | ||
87 | } | ||
88 | EXPORT_SYMBOL(queue_read_lock_slowpath); | ||
89 | |||
90 | /** | ||
91 | * queue_write_lock_slowpath - acquire write lock of a queue rwlock | ||
92 | * @lock : Pointer to queue rwlock structure | ||
93 | */ | ||
94 | void queue_write_lock_slowpath(struct qrwlock *lock) | ||
95 | { | ||
96 | u32 cnts; | ||
97 | |||
98 | /* Put the writer into the wait queue */ | ||
99 | arch_spin_lock(&lock->lock); | ||
100 | |||
101 | /* Try to acquire the lock directly if no reader is present */ | ||
102 | if (!atomic_read(&lock->cnts) && | ||
103 | (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)) | ||
104 | goto unlock; | ||
105 | |||
106 | /* | ||
107 | * Set the waiting flag to notify readers that a writer is pending, | ||
108 | * or wait for a previous writer to go away. | ||
109 | */ | ||
110 | for (;;) { | ||
111 | cnts = atomic_read(&lock->cnts); | ||
112 | if (!(cnts & _QW_WMASK) && | ||
113 | (atomic_cmpxchg(&lock->cnts, cnts, | ||
114 | cnts | _QW_WAITING) == cnts)) | ||
115 | break; | ||
116 | |||
117 | arch_mutex_cpu_relax(); | ||
118 | } | ||
119 | |||
120 | /* When no more readers, set the locked flag */ | ||
121 | for (;;) { | ||
122 | cnts = atomic_read(&lock->cnts); | ||
123 | if ((cnts == _QW_WAITING) && | ||
124 | (atomic_cmpxchg(&lock->cnts, _QW_WAITING, | ||
125 | _QW_LOCKED) == _QW_WAITING)) | ||
126 | break; | ||
127 | |||
128 | arch_mutex_cpu_relax(); | ||
129 | } | ||
130 | unlock: | ||
131 | arch_spin_unlock(&lock->lock); | ||
132 | } | ||
133 | EXPORT_SYMBOL(queue_write_lock_slowpath); | ||
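Note: built only when CONFIG_QUEUE_RWLOCK is selected (see the locking Makefile hunk above), the queue rwlock is intended to sit behind the regular rwlock_t API, so callers keep using the usual read_lock()/write_lock() helpers and only contended acquisitions reach the two slowpaths in this file. A minimal usage sketch with the ordinary API (nothing below is qrwlock-specific):

#include <linux/spinlock.h>

static DEFINE_RWLOCK(my_lock);
static int shared_value;

static int read_value(void)
{
        int v;

        read_lock(&my_lock);    /* fast path unless a writer holds or waits */
        v = shared_value;
        read_unlock(&my_lock);
        return v;
}

static void write_value(int v)
{
        write_lock(&my_lock);   /* contended case ends up in queue_write_lock_slowpath() */
        shared_value = v;
        write_unlock(&my_lock);
}
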
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index aa4dff04b594..a620d4d08ca6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
@@ -343,9 +343,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
343 | * top_waiter can be NULL, when we are in the deboosting | 343 | * top_waiter can be NULL, when we are in the deboosting |
344 | * mode! | 344 | * mode! |
345 | */ | 345 | */ |
346 | if (top_waiter && (!task_has_pi_waiters(task) || | 346 | if (top_waiter) { |
347 | top_waiter != task_top_pi_waiter(task))) | 347 | if (!task_has_pi_waiters(task)) |
348 | goto out_unlock_pi; | 348 | goto out_unlock_pi; |
349 | /* | ||
350 | * If deadlock detection is off, we stop here if we | ||
351 | * are not the top pi waiter of the task. | ||
352 | */ | ||
353 | if (!detect_deadlock && top_waiter != task_top_pi_waiter(task)) | ||
354 | goto out_unlock_pi; | ||
355 | } | ||
349 | 356 | ||
350 | /* | 357 | /* |
351 | * When deadlock detection is off then we check, if further | 358 | * When deadlock detection is off then we check, if further |
@@ -361,7 +368,12 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
361 | goto retry; | 368 | goto retry; |
362 | } | 369 | } |
363 | 370 | ||
364 | /* Deadlock detection */ | 371 | /* |
372 | * Deadlock detection. If the lock is the same as the original | ||
373 | * lock which caused us to walk the lock chain or if the | ||
374 | * current lock is owned by the task which initiated the chain | ||
375 | * walk, we detected a deadlock. | ||
376 | */ | ||
365 | if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { | 377 | if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { |
366 | debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); | 378 | debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); |
367 | raw_spin_unlock(&lock->wait_lock); | 379 | raw_spin_unlock(&lock->wait_lock); |
@@ -527,6 +539,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
527 | unsigned long flags; | 539 | unsigned long flags; |
528 | int chain_walk = 0, res; | 540 | int chain_walk = 0, res; |
529 | 541 | ||
542 | /* | ||
543 | * Early deadlock detection. We really don't want the task to | ||
544 | * enqueue on itself just to untangle the mess later. It's not | ||
545 | * only an optimization. We drop the locks, so another waiter | ||
546 | * can come in before the chain walk detects the deadlock. So | ||
547 | * the other will detect the deadlock and return -EDEADLOCK, | ||
548 | * which is wrong, as the other waiter is not in a deadlock | ||
549 | * situation. | ||
550 | */ | ||
551 | if (detect_deadlock && owner == task) | ||
552 | return -EDEADLK; | ||
553 | |||
530 | raw_spin_lock_irqsave(&task->pi_lock, flags); | 554 | raw_spin_lock_irqsave(&task->pi_lock, flags); |
531 | __rt_mutex_adjust_prio(task); | 555 | __rt_mutex_adjust_prio(task); |
532 | waiter->task = task; | 556 | waiter->task = task; |
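The early check added to task_blocks_on_rt_mutex() refuses up front when a task tries to block on an rt_mutex it already owns, so the waiter is never enqueued and a concurrent, innocent waiter can no longer be blamed with -EDEADLK by the later chain walk. A stripped-down sketch of just that self-deadlock guard (the lock and task types here are stand-ins, not the kernel's rt_mutex structures):

    #include <errno.h>
    #include <stddef.h>

    struct task;                              /* opaque task handle for the sketch */

    struct fake_rt_mutex {
        struct task *owner;                   /* NULL while the lock is free */
    };

    /* Analogue of the new check at the top of task_blocks_on_rt_mutex(). */
    static int block_on_lock(struct fake_rt_mutex *lock, struct task *task,
                             int detect_deadlock)
    {
        /* Blocking on a lock we already own can only deadlock on ourselves. */
        if (detect_deadlock && lock->owner == task)
            return -EDEADLK;

        /* ... enqueue the waiter and walk the PI chain here ... */
        return 0;
    }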
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 1d66e08e897d..dacc32142fcc 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
@@ -5,11 +5,66 @@ | |||
5 | * | 5 | * |
6 | * Writer lock-stealing by Alex Shi <alex.shi@intel.com> | 6 | * Writer lock-stealing by Alex Shi <alex.shi@intel.com> |
7 | * and Michel Lespinasse <walken@google.com> | 7 | * and Michel Lespinasse <walken@google.com> |
8 | * | ||
9 | * Optimistic spinning by Tim Chen <tim.c.chen@intel.com> | ||
10 | * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes. | ||
8 | */ | 11 | */ |
9 | #include <linux/rwsem.h> | 12 | #include <linux/rwsem.h> |
10 | #include <linux/sched.h> | 13 | #include <linux/sched.h> |
11 | #include <linux/init.h> | 14 | #include <linux/init.h> |
12 | #include <linux/export.h> | 15 | #include <linux/export.h> |
16 | #include <linux/sched/rt.h> | ||
17 | |||
18 | #include "mcs_spinlock.h" | ||
19 | |||
20 | /* | ||
21 | * Guide to the rw_semaphore's count field for common values. | ||
22 | * (32-bit case illustrated, similar for 64-bit) | ||
23 | * | ||
24 | * 0x0000000X (1) X readers active or attempting lock, no writer waiting | ||
25 | * X = #active_readers + #readers attempting to lock | ||
26 | * (X*ACTIVE_BIAS) | ||
27 | * | ||
28 | * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or | ||
29 | * attempting to read lock or write lock. | ||
30 | * | ||
31 | * 0xffff000X (1) X readers active or attempting lock, with waiters for lock | ||
32 | * X = #active readers + # readers attempting lock | ||
33 | * (X*ACTIVE_BIAS + WAITING_BIAS) | ||
34 | * (2) 1 writer attempting lock, no waiters for lock | ||
35 | * X-1 = #active readers + #readers attempting lock | ||
36 | * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) | ||
37 | * (3) 1 writer active, no waiters for lock | ||
38 | * X-1 = #active readers + #readers attempting lock | ||
39 | * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) | ||
40 | * | ||
41 | * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock | ||
42 | * (WAITING_BIAS + ACTIVE_BIAS) | ||
43 | * (2) 1 writer active or attempting lock, no waiters for lock | ||
44 | * (ACTIVE_WRITE_BIAS) | ||
45 | * | ||
46 | * 0xffff0000 (1) There are writers or readers queued but none active | ||
47 | * or in the process of attempting lock. | ||
48 | * (WAITING_BIAS) | ||
49 | * Note: writer can attempt to steal lock for this count by adding | ||
50 | * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count | ||
51 | * | ||
52 | * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue. | ||
53 | * (ACTIVE_WRITE_BIAS + WAITING_BIAS) | ||
54 | * | ||
55 | * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking | ||
56 | * the count becomes more than 0 for successful lock acquisition, | ||
57 | * i.e. the case where there are only readers or nobody has lock. | ||
58 | * (1st and 2nd case above). | ||
59 | * | ||
60 | * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and | ||
61 | * checking the count becomes ACTIVE_WRITE_BIAS for successful lock | ||
62 | * acquisition (i.e. nobody else has lock or attempts lock). If | ||
63 | * unsuccessful, in rwsem_down_write_failed, we'll check to see if there | ||
64 | * are only waiters but none active (5th case above), and attempt to | ||
65 | * steal the lock. | ||
66 | * | ||
67 | */ | ||
13 | 68 | ||
14 | /* | 69 | /* |
15 | * Initialize an rwsem: | 70 | * Initialize an rwsem: |
@@ -27,6 +82,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, | |||
27 | sem->count = RWSEM_UNLOCKED_VALUE; | 82 | sem->count = RWSEM_UNLOCKED_VALUE; |
28 | raw_spin_lock_init(&sem->wait_lock); | 83 | raw_spin_lock_init(&sem->wait_lock); |
29 | INIT_LIST_HEAD(&sem->wait_list); | 84 | INIT_LIST_HEAD(&sem->wait_list); |
85 | #ifdef CONFIG_SMP | ||
86 | sem->owner = NULL; | ||
87 | sem->osq = NULL; | ||
88 | #endif | ||
30 | } | 89 | } |
31 | 90 | ||
32 | EXPORT_SYMBOL(__init_rwsem); | 91 | EXPORT_SYMBOL(__init_rwsem); |
@@ -141,7 +200,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) | |||
141 | } | 200 | } |
142 | 201 | ||
143 | /* | 202 | /* |
144 | * wait for the read lock to be granted | 203 | * Wait for the read lock to be granted |
145 | */ | 204 | */ |
146 | __visible | 205 | __visible |
147 | struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | 206 | struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) |
@@ -188,64 +247,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) | |||
188 | return sem; | 247 | return sem; |
189 | } | 248 | } |
190 | 249 | ||
250 | static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | ||
251 | { | ||
252 | if (!(count & RWSEM_ACTIVE_MASK)) { | ||
253 | /* try acquiring the write lock */ | ||
254 | if (sem->count == RWSEM_WAITING_BIAS && | ||
255 | cmpxchg(&sem->count, RWSEM_WAITING_BIAS, | ||
256 | RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { | ||
257 | if (!list_is_singular(&sem->wait_list)) | ||
258 | rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); | ||
259 | return true; | ||
260 | } | ||
261 | } | ||
262 | return false; | ||
263 | } | ||
264 | |||
265 | #ifdef CONFIG_SMP | ||
191 | /* | 266 | /* |
192 | * wait until we successfully acquire the write lock | 267 | * Try to acquire write lock before the writer has been put on wait queue. |
268 | */ | ||
269 | static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | ||
270 | { | ||
271 | long old, count = ACCESS_ONCE(sem->count); | ||
272 | |||
273 | while (true) { | ||
274 | if (!(count == 0 || count == RWSEM_WAITING_BIAS)) | ||
275 | return false; | ||
276 | |||
277 | old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); | ||
278 | if (old == count) | ||
279 | return true; | ||
280 | |||
281 | count = old; | ||
282 | } | ||
283 | } | ||
284 | |||
285 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) | ||
286 | { | ||
287 | struct task_struct *owner; | ||
288 | bool on_cpu = true; | ||
289 | |||
290 | if (need_resched()) | ||
291 | return 0; | ||
292 | |||
293 | rcu_read_lock(); | ||
294 | owner = ACCESS_ONCE(sem->owner); | ||
295 | if (owner) | ||
296 | on_cpu = owner->on_cpu; | ||
297 | rcu_read_unlock(); | ||
298 | |||
299 | /* | ||
300 | * If sem->owner is not set, the rwsem owner may have | ||
301 | * just acquired it and not set the owner yet or the rwsem | ||
302 | * has been released. | ||
303 | */ | ||
304 | return on_cpu; | ||
305 | } | ||
306 | |||
307 | static inline bool owner_running(struct rw_semaphore *sem, | ||
308 | struct task_struct *owner) | ||
309 | { | ||
310 | if (sem->owner != owner) | ||
311 | return false; | ||
312 | |||
313 | /* | ||
314 | * Ensure we emit the owner->on_cpu dereference _after_ checking that | ||
315 | * sem->owner still matches owner. If that fails, owner might point | ||
316 | * to free()d memory; if it still matches, the rcu_read_lock() | ||
317 | * ensures the memory stays valid. | ||
318 | */ | ||
319 | barrier(); | ||
320 | |||
321 | return owner->on_cpu; | ||
322 | } | ||
323 | |||
324 | static noinline | ||
325 | bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) | ||
326 | { | ||
327 | rcu_read_lock(); | ||
328 | while (owner_running(sem, owner)) { | ||
329 | if (need_resched()) | ||
330 | break; | ||
331 | |||
332 | arch_mutex_cpu_relax(); | ||
333 | } | ||
334 | rcu_read_unlock(); | ||
335 | |||
336 | /* | ||
337 | * We break out the loop above on need_resched() or when the | ||
338 | * owner changed, which is a sign for heavy contention. Return | ||
339 | * success only when sem->owner is NULL. | ||
340 | */ | ||
341 | return sem->owner == NULL; | ||
342 | } | ||
343 | |||
344 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | ||
345 | { | ||
346 | struct task_struct *owner; | ||
347 | bool taken = false; | ||
348 | |||
349 | preempt_disable(); | ||
350 | |||
351 | /* sem->wait_lock should not be held when doing optimistic spinning */ | ||
352 | if (!rwsem_can_spin_on_owner(sem)) | ||
353 | goto done; | ||
354 | |||
355 | if (!osq_lock(&sem->osq)) | ||
356 | goto done; | ||
357 | |||
358 | while (true) { | ||
359 | owner = ACCESS_ONCE(sem->owner); | ||
360 | if (owner && !rwsem_spin_on_owner(sem, owner)) | ||
361 | break; | ||
362 | |||
363 | /* wait_lock will be acquired if write_lock is obtained */ | ||
364 | if (rwsem_try_write_lock_unqueued(sem)) { | ||
365 | taken = true; | ||
366 | break; | ||
367 | } | ||
368 | |||
369 | /* | ||
370 | * When there's no owner, we might have preempted between the | ||
371 | * owner acquiring the lock and setting the owner field. If | ||
372 | * we're an RT task, we can live-lock because we won't let | ||
373 | * the owner complete. | ||
374 | */ | ||
375 | if (!owner && (need_resched() || rt_task(current))) | ||
376 | break; | ||
377 | |||
378 | /* | ||
379 | * The cpu_relax() call is a compiler barrier which forces | ||
380 | * everything in this loop to be re-loaded. We don't need | ||
381 | * memory barriers as we'll eventually observe the right | ||
382 | * values at the cost of a few extra spins. | ||
383 | */ | ||
384 | arch_mutex_cpu_relax(); | ||
385 | } | ||
386 | osq_unlock(&sem->osq); | ||
387 | done: | ||
388 | preempt_enable(); | ||
389 | return taken; | ||
390 | } | ||
391 | |||
392 | #else | ||
393 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | ||
394 | { | ||
395 | return false; | ||
396 | } | ||
397 | #endif | ||
398 | |||
399 | /* | ||
400 | * Wait until we successfully acquire the write lock | ||
193 | */ | 401 | */ |
194 | __visible | 402 | __visible |
195 | struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) | 403 | struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) |
196 | { | 404 | { |
197 | long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; | 405 | long count; |
406 | bool waiting = true; /* any queued threads before us */ | ||
198 | struct rwsem_waiter waiter; | 407 | struct rwsem_waiter waiter; |
199 | struct task_struct *tsk = current; | ||
200 | 408 | ||
201 | /* set up my own style of waitqueue */ | 409 | /* undo write bias from down_write operation, stop active locking */ |
202 | waiter.task = tsk; | 410 | count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); |
411 | |||
412 | /* do optimistic spinning and steal lock if possible */ | ||
413 | if (rwsem_optimistic_spin(sem)) | ||
414 | return sem; | ||
415 | |||
416 | /* | ||
417 | * Optimistic spinning failed, proceed to the slowpath | ||
418 | * and block until we can acquire the sem. | ||
419 | */ | ||
420 | waiter.task = current; | ||
203 | waiter.type = RWSEM_WAITING_FOR_WRITE; | 421 | waiter.type = RWSEM_WAITING_FOR_WRITE; |
204 | 422 | ||
205 | raw_spin_lock_irq(&sem->wait_lock); | 423 | raw_spin_lock_irq(&sem->wait_lock); |
424 | |||
425 | /* account for this before adding a new element to the list */ | ||
206 | if (list_empty(&sem->wait_list)) | 426 | if (list_empty(&sem->wait_list)) |
207 | adjustment += RWSEM_WAITING_BIAS; | 427 | waiting = false; |
428 | |||
208 | list_add_tail(&waiter.list, &sem->wait_list); | 429 | list_add_tail(&waiter.list, &sem->wait_list); |
209 | 430 | ||
210 | /* we're now waiting on the lock, but no longer actively locking */ | 431 | /* we're now waiting on the lock, but no longer actively locking */ |
211 | count = rwsem_atomic_update(adjustment, sem); | 432 | if (waiting) { |
433 | count = ACCESS_ONCE(sem->count); | ||
434 | |||
435 | /* | ||
436 | * If there were already threads queued before us and there are | ||
437 | * no active writers, the lock must be read owned; so we try to | ||
438 | * wake any read locks that were queued ahead of us. | ||
439 | */ | ||
440 | if (count > RWSEM_WAITING_BIAS) | ||
441 | sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); | ||
212 | 442 | ||
213 | /* If there were already threads queued before us and there are no | 443 | } else |
214 | * active writers, the lock must be read owned; so we try to wake | 444 | count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); |
215 | * any read locks that were queued ahead of us. */ | ||
216 | if (count > RWSEM_WAITING_BIAS && | ||
217 | adjustment == -RWSEM_ACTIVE_WRITE_BIAS) | ||
218 | sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); | ||
219 | 445 | ||
220 | /* wait until we successfully acquire the lock */ | 446 | /* wait until we successfully acquire the lock */ |
221 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 447 | set_current_state(TASK_UNINTERRUPTIBLE); |
222 | while (true) { | 448 | while (true) { |
223 | if (!(count & RWSEM_ACTIVE_MASK)) { | 449 | if (rwsem_try_write_lock(count, sem)) |
224 | /* Try acquiring the write lock. */ | 450 | break; |
225 | count = RWSEM_ACTIVE_WRITE_BIAS; | ||
226 | if (!list_is_singular(&sem->wait_list)) | ||
227 | count += RWSEM_WAITING_BIAS; | ||
228 | |||
229 | if (sem->count == RWSEM_WAITING_BIAS && | ||
230 | cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == | ||
231 | RWSEM_WAITING_BIAS) | ||
232 | break; | ||
233 | } | ||
234 | |||
235 | raw_spin_unlock_irq(&sem->wait_lock); | 451 | raw_spin_unlock_irq(&sem->wait_lock); |
236 | 452 | ||
237 | /* Block until there are no active lockers. */ | 453 | /* Block until there are no active lockers. */ |
238 | do { | 454 | do { |
239 | schedule(); | 455 | schedule(); |
240 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 456 | set_current_state(TASK_UNINTERRUPTIBLE); |
241 | } while ((count = sem->count) & RWSEM_ACTIVE_MASK); | 457 | } while ((count = sem->count) & RWSEM_ACTIVE_MASK); |
242 | 458 | ||
243 | raw_spin_lock_irq(&sem->wait_lock); | 459 | raw_spin_lock_irq(&sem->wait_lock); |
244 | } | 460 | } |
461 | __set_current_state(TASK_RUNNING); | ||
245 | 462 | ||
246 | list_del(&waiter.list); | 463 | list_del(&waiter.list); |
247 | raw_spin_unlock_irq(&sem->wait_lock); | 464 | raw_spin_unlock_irq(&sem->wait_lock); |
248 | tsk->state = TASK_RUNNING; | ||
249 | 465 | ||
250 | return sem; | 466 | return sem; |
251 | } | 467 | } |
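Two pieces do the heavy lifting above: rwsem_optimistic_spin(), which keeps a would-be writer spinning while the current owner is running on another CPU, and rwsem_try_write_lock_unqueued(), which steals the lock with a single cmpxchg whenever the count says either "idle" (0) or "waiters queued but none active" (WAITING_BIAS). A compact C11-atomics model of the stealing step, with bias constants that follow the 64-bit layout described in the comment block at the top of the file (treat them as illustrative):

    #include <stdatomic.h>
    #include <stdbool.h>

    #define ACTIVE_BIAS        1L
    #define ACTIVE_MASK        0xffffffffL
    #define WAITING_BIAS       (-ACTIVE_MASK - 1)        /* i.e. -(1L << 32) */
    #define ACTIVE_WRITE_BIAS  (WAITING_BIAS + ACTIVE_BIAS)

    struct rwsem_model {
        _Atomic long count;
    };

    /*
     * Analogue of rwsem_try_write_lock_unqueued(): the lock can only be stolen
     * while nobody is active (count == 0) or while all lockers are merely
     * queued (count == WAITING_BIAS); adding ACTIVE_WRITE_BIAS then records
     * one active writer in either case.
     */
    static bool try_write_lock_unqueued(struct rwsem_model *sem)
    {
        long count = atomic_load(&sem->count);

        while (count == 0 || count == WAITING_BIAS) {
            if (atomic_compare_exchange_weak(&sem->count, &count,
                                             count + ACTIVE_WRITE_BIAS))
                return true;
            /* count now holds the value that beat us; the loop re-checks it. */
        }
        return false;
    }

The surrounding spin loop gives up as soon as need_resched() fires or the owner stops running, at which point the writer falls back to the ordinary wait-queue path.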
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index cfff1435bdfb..42f806de49d4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c | |||
@@ -12,6 +12,27 @@ | |||
12 | 12 | ||
13 | #include <linux/atomic.h> | 13 | #include <linux/atomic.h> |
14 | 14 | ||
15 | #if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM) | ||
16 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
17 | { | ||
18 | sem->owner = current; | ||
19 | } | ||
20 | |||
21 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
22 | { | ||
23 | sem->owner = NULL; | ||
24 | } | ||
25 | |||
26 | #else | ||
27 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
28 | { | ||
29 | } | ||
30 | |||
31 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
32 | { | ||
33 | } | ||
34 | #endif | ||
35 | |||
15 | /* | 36 | /* |
16 | * lock for reading | 37 | * lock for reading |
17 | */ | 38 | */ |
@@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem) | |||
48 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); | 69 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); |
49 | 70 | ||
50 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); | 71 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
72 | rwsem_set_owner(sem); | ||
51 | } | 73 | } |
52 | 74 | ||
53 | EXPORT_SYMBOL(down_write); | 75 | EXPORT_SYMBOL(down_write); |
@@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem) | |||
59 | { | 81 | { |
60 | int ret = __down_write_trylock(sem); | 82 | int ret = __down_write_trylock(sem); |
61 | 83 | ||
62 | if (ret == 1) | 84 | if (ret == 1) { |
63 | rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); | 85 | rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); |
86 | rwsem_set_owner(sem); | ||
87 | } | ||
88 | |||
64 | return ret; | 89 | return ret; |
65 | } | 90 | } |
66 | 91 | ||
@@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem) | |||
85 | { | 110 | { |
86 | rwsem_release(&sem->dep_map, 1, _RET_IP_); | 111 | rwsem_release(&sem->dep_map, 1, _RET_IP_); |
87 | 112 | ||
113 | rwsem_clear_owner(sem); | ||
88 | __up_write(sem); | 114 | __up_write(sem); |
89 | } | 115 | } |
90 | 116 | ||
@@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem) | |||
99 | * lockdep: a downgraded write will live on as a write | 125 | * lockdep: a downgraded write will live on as a write |
100 | * dependency. | 126 | * dependency. |
101 | */ | 127 | */ |
128 | rwsem_clear_owner(sem); | ||
102 | __downgrade_write(sem); | 129 | __downgrade_write(sem); |
103 | } | 130 | } |
104 | 131 | ||
@@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) | |||
122 | rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); | 149 | rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); |
123 | 150 | ||
124 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); | 151 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
152 | rwsem_set_owner(sem); | ||
125 | } | 153 | } |
126 | 154 | ||
127 | EXPORT_SYMBOL(_down_write_nest_lock); | 155 | EXPORT_SYMBOL(_down_write_nest_lock); |
@@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) | |||
141 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); | 169 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); |
142 | 170 | ||
143 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); | 171 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
172 | rwsem_set_owner(sem); | ||
144 | } | 173 | } |
145 | 174 | ||
146 | EXPORT_SYMBOL(down_write_nested); | 175 | EXPORT_SYMBOL(down_write_nested); |
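These wrappers record the writing task in sem->owner on every successful down_write()/down_write_trylock() and clear it in up_write()/downgrade_write(); that field is exactly what the spinners in rwsem-xadd.c poll to decide whether the owner is still on a CPU. A small pthread-based sketch of the same set/clear discipline (the owner bookkeeping is the point; the pthread rwlock just stands in for the semaphore):

    #include <pthread.h>

    struct rwsem_owner_model {
        pthread_rwlock_t rwlock;
        pthread_t owner;                      /* valid only while owner_set != 0 */
        int owner_set;
    };

    static void down_write_model(struct rwsem_owner_model *sem)
    {
        pthread_rwlock_wrlock(&sem->rwlock);
        /* rwsem_set_owner(): record who holds the write side, after acquiring. */
        sem->owner = pthread_self();
        sem->owner_set = 1;
    }

    static void up_write_model(struct rwsem_owner_model *sem)
    {
        /* rwsem_clear_owner(): clear before the lock is actually released. */
        sem->owner_set = 0;
        pthread_rwlock_unlock(&sem->rwlock);
    }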
diff --git a/kernel/module.c b/kernel/module.c index 079c4615607d..81e727cf6df9 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -3020,21 +3020,6 @@ static int do_init_module(struct module *mod) | |||
3020 | */ | 3020 | */ |
3021 | current->flags &= ~PF_USED_ASYNC; | 3021 | current->flags &= ~PF_USED_ASYNC; |
3022 | 3022 | ||
3023 | blocking_notifier_call_chain(&module_notify_list, | ||
3024 | MODULE_STATE_COMING, mod); | ||
3025 | |||
3026 | /* Set RO and NX regions for core */ | ||
3027 | set_section_ro_nx(mod->module_core, | ||
3028 | mod->core_text_size, | ||
3029 | mod->core_ro_size, | ||
3030 | mod->core_size); | ||
3031 | |||
3032 | /* Set RO and NX regions for init */ | ||
3033 | set_section_ro_nx(mod->module_init, | ||
3034 | mod->init_text_size, | ||
3035 | mod->init_ro_size, | ||
3036 | mod->init_size); | ||
3037 | |||
3038 | do_mod_ctors(mod); | 3023 | do_mod_ctors(mod); |
3039 | /* Start the module */ | 3024 | /* Start the module */ |
3040 | if (mod->init != NULL) | 3025 | if (mod->init != NULL) |
@@ -3165,9 +3150,26 @@ static int complete_formation(struct module *mod, struct load_info *info) | |||
3165 | /* This relies on module_mutex for list integrity. */ | 3150 | /* This relies on module_mutex for list integrity. */ |
3166 | module_bug_finalize(info->hdr, info->sechdrs, mod); | 3151 | module_bug_finalize(info->hdr, info->sechdrs, mod); |
3167 | 3152 | ||
3153 | /* Set RO and NX regions for core */ | ||
3154 | set_section_ro_nx(mod->module_core, | ||
3155 | mod->core_text_size, | ||
3156 | mod->core_ro_size, | ||
3157 | mod->core_size); | ||
3158 | |||
3159 | /* Set RO and NX regions for init */ | ||
3160 | set_section_ro_nx(mod->module_init, | ||
3161 | mod->init_text_size, | ||
3162 | mod->init_ro_size, | ||
3163 | mod->init_size); | ||
3164 | |||
3168 | /* Mark state as coming so strong_try_module_get() ignores us, | 3165 | /* Mark state as coming so strong_try_module_get() ignores us, |
3169 | * but kallsyms etc. can see us. */ | 3166 | * but kallsyms etc. can see us. */ |
3170 | mod->state = MODULE_STATE_COMING; | 3167 | mod->state = MODULE_STATE_COMING; |
3168 | mutex_unlock(&module_mutex); | ||
3169 | |||
3170 | blocking_notifier_call_chain(&module_notify_list, | ||
3171 | MODULE_STATE_COMING, mod); | ||
3172 | return 0; | ||
3171 | 3173 | ||
3172 | out: | 3174 | out: |
3173 | mutex_unlock(&module_mutex); | 3175 | mutex_unlock(&module_mutex); |
@@ -3190,6 +3192,7 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
3190 | { | 3192 | { |
3191 | struct module *mod; | 3193 | struct module *mod; |
3192 | long err; | 3194 | long err; |
3195 | char *after_dashes; | ||
3193 | 3196 | ||
3194 | err = module_sig_check(info); | 3197 | err = module_sig_check(info); |
3195 | if (err) | 3198 | if (err) |
@@ -3277,10 +3280,15 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
3277 | goto ddebug_cleanup; | 3280 | goto ddebug_cleanup; |
3278 | 3281 | ||
3279 | /* Module is ready to execute: parsing args may do that. */ | 3282 | /* Module is ready to execute: parsing args may do that. */ |
3280 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, | 3283 | after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, |
3281 | -32768, 32767, unknown_module_param_cb); | 3284 | -32768, 32767, unknown_module_param_cb); |
3282 | if (err < 0) | 3285 | if (IS_ERR(after_dashes)) { |
3286 | err = PTR_ERR(after_dashes); | ||
3283 | goto bug_cleanup; | 3287 | goto bug_cleanup; |
3288 | } else if (after_dashes) { | ||
3289 | pr_warn("%s: parameters '%s' after `--' ignored\n", | ||
3290 | mod->name, after_dashes); | ||
3291 | } | ||
3284 | 3292 | ||
3285 | /* Link in to sysfs. */ | 3293 | /* Link in to sysfs. */ |
3286 | err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); | 3294 | err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); |
diff --git a/kernel/notifier.c b/kernel/notifier.c index db4c8b08a50c..4803da6eab62 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c | |||
@@ -71,9 +71,9 @@ static int notifier_chain_unregister(struct notifier_block **nl, | |||
71 | * @returns: notifier_call_chain returns the value returned by the | 71 | * @returns: notifier_call_chain returns the value returned by the |
72 | * last notifier function called. | 72 | * last notifier function called. |
73 | */ | 73 | */ |
74 | static int __kprobes notifier_call_chain(struct notifier_block **nl, | 74 | static int notifier_call_chain(struct notifier_block **nl, |
75 | unsigned long val, void *v, | 75 | unsigned long val, void *v, |
76 | int nr_to_call, int *nr_calls) | 76 | int nr_to_call, int *nr_calls) |
77 | { | 77 | { |
78 | int ret = NOTIFY_DONE; | 78 | int ret = NOTIFY_DONE; |
79 | struct notifier_block *nb, *next_nb; | 79 | struct notifier_block *nb, *next_nb; |
@@ -102,6 +102,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, | |||
102 | } | 102 | } |
103 | return ret; | 103 | return ret; |
104 | } | 104 | } |
105 | NOKPROBE_SYMBOL(notifier_call_chain); | ||
105 | 106 | ||
106 | /* | 107 | /* |
107 | * Atomic notifier chain routines. Registration and unregistration | 108 | * Atomic notifier chain routines. Registration and unregistration |
@@ -172,9 +173,9 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); | |||
172 | * Otherwise the return value is the return value | 173 | * Otherwise the return value is the return value |
173 | * of the last notifier function called. | 174 | * of the last notifier function called. |
174 | */ | 175 | */ |
175 | int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, | 176 | int __atomic_notifier_call_chain(struct atomic_notifier_head *nh, |
176 | unsigned long val, void *v, | 177 | unsigned long val, void *v, |
177 | int nr_to_call, int *nr_calls) | 178 | int nr_to_call, int *nr_calls) |
178 | { | 179 | { |
179 | int ret; | 180 | int ret; |
180 | 181 | ||
@@ -184,13 +185,15 @@ int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, | |||
184 | return ret; | 185 | return ret; |
185 | } | 186 | } |
186 | EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); | 187 | EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); |
188 | NOKPROBE_SYMBOL(__atomic_notifier_call_chain); | ||
187 | 189 | ||
188 | int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, | 190 | int atomic_notifier_call_chain(struct atomic_notifier_head *nh, |
189 | unsigned long val, void *v) | 191 | unsigned long val, void *v) |
190 | { | 192 | { |
191 | return __atomic_notifier_call_chain(nh, val, v, -1, NULL); | 193 | return __atomic_notifier_call_chain(nh, val, v, -1, NULL); |
192 | } | 194 | } |
193 | EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); | 195 | EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); |
196 | NOKPROBE_SYMBOL(atomic_notifier_call_chain); | ||
194 | 197 | ||
195 | /* | 198 | /* |
196 | * Blocking notifier chain routines. All access to the chain is | 199 | * Blocking notifier chain routines. All access to the chain is |
@@ -527,7 +530,7 @@ EXPORT_SYMBOL_GPL(srcu_init_notifier_head); | |||
527 | 530 | ||
528 | static ATOMIC_NOTIFIER_HEAD(die_chain); | 531 | static ATOMIC_NOTIFIER_HEAD(die_chain); |
529 | 532 | ||
530 | int notrace __kprobes notify_die(enum die_val val, const char *str, | 533 | int notrace notify_die(enum die_val val, const char *str, |
531 | struct pt_regs *regs, long err, int trap, int sig) | 534 | struct pt_regs *regs, long err, int trap, int sig) |
532 | { | 535 | { |
533 | struct die_args args = { | 536 | struct die_args args = { |
@@ -540,6 +543,7 @@ int notrace __kprobes notify_die(enum die_val val, const char *str, | |||
540 | }; | 543 | }; |
541 | return atomic_notifier_call_chain(&die_chain, val, &args); | 544 | return atomic_notifier_call_chain(&die_chain, val, &args); |
542 | } | 545 | } |
546 | NOKPROBE_SYMBOL(notify_die); | ||
543 | 547 | ||
544 | int register_die_notifier(struct notifier_block *nb) | 548 | int register_die_notifier(struct notifier_block *nb) |
545 | { | 549 | { |
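The notifier changes swap the __kprobes function attribute (which moved whole functions into a protected text section) for NOKPROBE_SYMBOL() annotations that record each symbol's address in a blacklist section consumed by the kprobes core. The same "collect addresses in a dedicated section" trick can be sketched in userspace with GCC section attributes; everything below (section name, macro, types) is invented for the illustration:

    #include <stdio.h>

    /* Record a function's address in a custom ELF section, NOKPROBE_SYMBOL-style. */
    #define BLACKLIST_SYMBOL(fn)                                        \
        static void *__blacklist_##fn                                   \
        __attribute__((used, section("my_blacklist"))) = (void *)fn

    static int traced_helper(int x) { return x * 2; }
    BLACKLIST_SYMBOL(traced_helper);

    /* GNU ld provides these bounds for any section named like a C identifier. */
    extern void *__start_my_blacklist[];
    extern void *__stop_my_blacklist[];

    int main(void)
    {
        for (void **p = __start_my_blacklist; p < __stop_my_blacklist; p++)
            printf("blacklisted symbol at %p\n", *p);
        return 0;
    }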
diff --git a/kernel/panic.c b/kernel/panic.c index d02fa9fef46a..62e16cef9cc2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -32,6 +32,7 @@ static unsigned long tainted_mask; | |||
32 | static int pause_on_oops; | 32 | static int pause_on_oops; |
33 | static int pause_on_oops_flag; | 33 | static int pause_on_oops_flag; |
34 | static DEFINE_SPINLOCK(pause_on_oops_lock); | 34 | static DEFINE_SPINLOCK(pause_on_oops_lock); |
35 | static bool crash_kexec_post_notifiers; | ||
35 | 36 | ||
36 | int panic_timeout = CONFIG_PANIC_TIMEOUT; | 37 | int panic_timeout = CONFIG_PANIC_TIMEOUT; |
37 | EXPORT_SYMBOL_GPL(panic_timeout); | 38 | EXPORT_SYMBOL_GPL(panic_timeout); |
@@ -112,9 +113,11 @@ void panic(const char *fmt, ...) | |||
112 | /* | 113 | /* |
113 | * If we have crashed and we have a crash kernel loaded let it handle | 114 | * If we have crashed and we have a crash kernel loaded let it handle |
114 | * everything else. | 115 | * everything else. |
115 | * Do we want to call this before we try to display a message? | 116 | * If we want to run this after calling panic_notifiers, pass |
117 | * the "crash_kexec_post_notifiers" option to the kernel. | ||
116 | */ | 118 | */ |
117 | crash_kexec(NULL); | 119 | if (!crash_kexec_post_notifiers) |
120 | crash_kexec(NULL); | ||
118 | 121 | ||
119 | /* | 122 | /* |
120 | * Note smp_send_stop is the usual smp shutdown function, which | 123 | * Note smp_send_stop is the usual smp shutdown function, which |
@@ -131,6 +134,15 @@ void panic(const char *fmt, ...) | |||
131 | 134 | ||
132 | kmsg_dump(KMSG_DUMP_PANIC); | 135 | kmsg_dump(KMSG_DUMP_PANIC); |
133 | 136 | ||
137 | /* | ||
138 | * If you doubt that kdump always works fine in any situation, | ||
139 | * "crash_kexec_post_notifiers" offers you a chance to run the | ||
140 | * panic notifiers and dump kmsg before kdump. | ||
141 | * Note: since some panic notifiers can make the crashed kernel | ||
142 | * more unstable, this can also increase the risk of kdump failing. | ||
143 | */ | ||
144 | crash_kexec(NULL); | ||
145 | |||
134 | bust_spinlocks(0); | 146 | bust_spinlocks(0); |
135 | 147 | ||
136 | if (!panic_blink) | 148 | if (!panic_blink) |
@@ -472,6 +484,13 @@ EXPORT_SYMBOL(__stack_chk_fail); | |||
472 | core_param(panic, panic_timeout, int, 0644); | 484 | core_param(panic, panic_timeout, int, 0644); |
473 | core_param(pause_on_oops, pause_on_oops, int, 0644); | 485 | core_param(pause_on_oops, pause_on_oops, int, 0644); |
474 | 486 | ||
487 | static int __init setup_crash_kexec_post_notifiers(char *s) | ||
488 | { | ||
489 | crash_kexec_post_notifiers = true; | ||
490 | return 0; | ||
491 | } | ||
492 | early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers); | ||
493 | |||
475 | static int __init oops_setup(char *s) | 494 | static int __init oops_setup(char *s) |
476 | { | 495 | { |
477 | if (!s) | 496 | if (!s) |
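With the new early_param, panic() can be told to run the panic notifier chain and the kmsg dump before jumping into the crash kernel instead of after; when a crash kernel actually takes over, crash_kexec() never returns, and when none is loaded it returns immediately, so having the call in both places is harmless. A rough model of the resulting control flow (all side effects stubbed out; the function names below are placeholders):

    #include <stdbool.h>
    #include <stdio.h>

    static bool crash_kexec_post_notifiers;   /* set from the kernel command line */

    static void crash_kexec_stub(void) { puts("kexec into crash kernel, if one is loaded"); }
    static void notifiers_stub(void)   { puts("run panic notifier chain"); }
    static void kmsg_dump_stub(void)   { puts("dump kernel log"); }

    static void panic_flow(void)
    {
        if (!crash_kexec_post_notifiers)
            crash_kexec_stub();               /* default: kdump first, most robust */

        notifiers_stub();
        kmsg_dump_stub();

        crash_kexec_stub();                   /* opt-in path: kdump after notifiers */
    }

    int main(void)
    {
        crash_kexec_post_notifiers = true;    /* as if booted with the new option */
        panic_flow();
        return 0;
    }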
diff --git a/kernel/params.c b/kernel/params.c index b00142e7f3ba..1e52ca233fd9 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -177,13 +177,13 @@ static char *next_arg(char *args, char **param, char **val) | |||
177 | } | 177 | } |
178 | 178 | ||
179 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ | 179 | /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ |
180 | int parse_args(const char *doing, | 180 | char *parse_args(const char *doing, |
181 | char *args, | 181 | char *args, |
182 | const struct kernel_param *params, | 182 | const struct kernel_param *params, |
183 | unsigned num, | 183 | unsigned num, |
184 | s16 min_level, | 184 | s16 min_level, |
185 | s16 max_level, | 185 | s16 max_level, |
186 | int (*unknown)(char *param, char *val, const char *doing)) | 186 | int (*unknown)(char *param, char *val, const char *doing)) |
187 | { | 187 | { |
188 | char *param, *val; | 188 | char *param, *val; |
189 | 189 | ||
@@ -198,6 +198,9 @@ int parse_args(const char *doing, | |||
198 | int irq_was_disabled; | 198 | int irq_was_disabled; |
199 | 199 | ||
200 | args = next_arg(args, ¶m, &val); | 200 | args = next_arg(args, ¶m, &val); |
201 | /* Stop at -- */ | ||
202 | if (!val && strcmp(param, "--") == 0) | ||
203 | return args; | ||
201 | irq_was_disabled = irqs_disabled(); | 204 | irq_was_disabled = irqs_disabled(); |
202 | ret = parse_one(param, val, doing, params, num, | 205 | ret = parse_one(param, val, doing, params, num, |
203 | min_level, max_level, unknown); | 206 | min_level, max_level, unknown); |
@@ -208,22 +211,22 @@ int parse_args(const char *doing, | |||
208 | switch (ret) { | 211 | switch (ret) { |
209 | case -ENOENT: | 212 | case -ENOENT: |
210 | pr_err("%s: Unknown parameter `%s'\n", doing, param); | 213 | pr_err("%s: Unknown parameter `%s'\n", doing, param); |
211 | return ret; | 214 | return ERR_PTR(ret); |
212 | case -ENOSPC: | 215 | case -ENOSPC: |
213 | pr_err("%s: `%s' too large for parameter `%s'\n", | 216 | pr_err("%s: `%s' too large for parameter `%s'\n", |
214 | doing, val ?: "", param); | 217 | doing, val ?: "", param); |
215 | return ret; | 218 | return ERR_PTR(ret); |
216 | case 0: | 219 | case 0: |
217 | break; | 220 | break; |
218 | default: | 221 | default: |
219 | pr_err("%s: `%s' invalid for parameter `%s'\n", | 222 | pr_err("%s: `%s' invalid for parameter `%s'\n", |
220 | doing, val ?: "", param); | 223 | doing, val ?: "", param); |
221 | return ret; | 224 | return ERR_PTR(ret); |
222 | } | 225 | } |
223 | } | 226 | } |
224 | 227 | ||
225 | /* All parsed OK. */ | 228 | /* All parsed OK. */ |
226 | return 0; | 229 | return NULL; |
227 | } | 230 | } |
228 | 231 | ||
229 | /* Lazy bastard, eh? */ | 232 | /* Lazy bastard, eh? */ |
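After this change parse_args() folds three outcomes into one char * return: NULL when every argument was consumed, an ERR_PTR()-encoded errno when a parameter failed to parse, and a pointer to the leftover string when a bare "--" was hit (the module loader warns about that tail and ignores it). A userspace sketch of the same calling convention, with a toy parser and minimal stand-ins for the kernel's err.h helpers:

    #include <stdio.h>
    #include <string.h>
    #include <errno.h>

    /* Minimal stand-ins for the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() helpers. */
    #define MAX_ERRNO 4095
    static inline void *ERR_PTR(long err)      { return (void *)err; }
    static inline int   IS_ERR(const void *p)  { return (unsigned long)p >= (unsigned long)-MAX_ERRNO; }
    static inline long  PTR_ERR(const void *p) { return (long)p; }

    /* Toy parser: consumes "name=value" words, stops at "--", rejects bare words. */
    static char *parse_args_model(char *args)
    {
        char *param;

        while ((param = strsep(&args, " ")) != NULL) {
            if (*param == '\0')
                continue;                       /* skip empty fields */
            if (strcmp(param, "--") == 0)
                return args ? args : param + 2; /* rest of the line (may be empty) */
            if (strchr(param, '=') == NULL)
                return ERR_PTR(-EINVAL);        /* toy rule: require name=value */
        }
        return NULL;                            /* all parsed OK */
    }

    int main(void)
    {
        char cmdline[] = "foo=1 bar=2 -- init=/bin/sh";
        char *after_dashes = parse_args_model(cmdline);

        if (IS_ERR(after_dashes))
            printf("parse error: %ld\n", PTR_ERR(after_dashes));
        else if (after_dashes)
            printf("arguments after '--' ignored: %s\n", after_dashes);
        return 0;
    }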
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 2fac9cc79b3d..9a83d780facd 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -257,8 +257,7 @@ config ARCH_HAS_OPP | |||
257 | bool | 257 | bool |
258 | 258 | ||
259 | config PM_OPP | 259 | config PM_OPP |
260 | bool "Operating Performance Point (OPP) Layer library" | 260 | bool |
261 | depends on ARCH_HAS_OPP | ||
262 | ---help--- | 261 | ---help--- |
263 | SOCs have a standard set of tuples consisting of frequency and | 262 | SOCs have a standard set of tuples consisting of frequency and |
264 | voltage pairs that the device will support per voltage domain. This | 263 | voltage pairs that the device will support per voltage domain. This |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index f4f2073711d3..49e0a20fd010 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/syscore_ops.h> | 28 | #include <linux/syscore_ops.h> |
29 | #include <linux/ctype.h> | 29 | #include <linux/ctype.h> |
30 | #include <linux/genhd.h> | 30 | #include <linux/genhd.h> |
31 | #include <trace/events/power.h> | ||
31 | 32 | ||
32 | #include "power.h" | 33 | #include "power.h" |
33 | 34 | ||
@@ -35,7 +36,7 @@ | |||
35 | static int nocompress; | 36 | static int nocompress; |
36 | static int noresume; | 37 | static int noresume; |
37 | static int resume_wait; | 38 | static int resume_wait; |
38 | static int resume_delay; | 39 | static unsigned int resume_delay; |
39 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; | 40 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; |
40 | dev_t swsusp_resume_device; | 41 | dev_t swsusp_resume_device; |
41 | sector_t swsusp_resume_block; | 42 | sector_t swsusp_resume_block; |
@@ -228,19 +229,23 @@ static void platform_recover(int platform_mode) | |||
228 | void swsusp_show_speed(struct timeval *start, struct timeval *stop, | 229 | void swsusp_show_speed(struct timeval *start, struct timeval *stop, |
229 | unsigned nr_pages, char *msg) | 230 | unsigned nr_pages, char *msg) |
230 | { | 231 | { |
231 | s64 elapsed_centisecs64; | 232 | u64 elapsed_centisecs64; |
232 | int centisecs; | 233 | unsigned int centisecs; |
233 | int k; | 234 | unsigned int k; |
234 | int kps; | 235 | unsigned int kps; |
235 | 236 | ||
236 | elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); | 237 | elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); |
238 | /* | ||
239 | * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time, | ||
240 | * it is obvious enough for what went wrong. | ||
241 | */ | ||
237 | do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); | 242 | do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); |
238 | centisecs = elapsed_centisecs64; | 243 | centisecs = elapsed_centisecs64; |
239 | if (centisecs == 0) | 244 | if (centisecs == 0) |
240 | centisecs = 1; /* avoid div-by-zero */ | 245 | centisecs = 1; /* avoid div-by-zero */ |
241 | k = nr_pages * (PAGE_SIZE / 1024); | 246 | k = nr_pages * (PAGE_SIZE / 1024); |
242 | kps = (k * 100) / centisecs; | 247 | kps = (k * 100) / centisecs; |
243 | printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", | 248 | printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n", |
244 | msg, k, | 249 | msg, k, |
245 | centisecs / 100, centisecs % 100, | 250 | centisecs / 100, centisecs % 100, |
246 | kps / 1000, (kps % 1000) / 10); | 251 | kps / 1000, (kps % 1000) / 10); |
@@ -288,7 +293,9 @@ static int create_image(int platform_mode) | |||
288 | 293 | ||
289 | in_suspend = 1; | 294 | in_suspend = 1; |
290 | save_processor_state(); | 295 | save_processor_state(); |
296 | trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true); | ||
291 | error = swsusp_arch_suspend(); | 297 | error = swsusp_arch_suspend(); |
298 | trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); | ||
292 | if (error) | 299 | if (error) |
293 | printk(KERN_ERR "PM: Error %d creating hibernation image\n", | 300 | printk(KERN_ERR "PM: Error %d creating hibernation image\n", |
294 | error); | 301 | error); |
@@ -595,7 +602,8 @@ static void power_down(void) | |||
595 | case HIBERNATION_PLATFORM: | 602 | case HIBERNATION_PLATFORM: |
596 | hibernation_platform_enter(); | 603 | hibernation_platform_enter(); |
597 | case HIBERNATION_SHUTDOWN: | 604 | case HIBERNATION_SHUTDOWN: |
598 | kernel_power_off(); | 605 | if (pm_power_off) |
606 | kernel_power_off(); | ||
599 | break; | 607 | break; |
600 | #ifdef CONFIG_SUSPEND | 608 | #ifdef CONFIG_SUSPEND |
601 | case HIBERNATION_SUSPEND: | 609 | case HIBERNATION_SUSPEND: |
@@ -623,7 +631,8 @@ static void power_down(void) | |||
623 | * corruption after resume. | 631 | * corruption after resume. |
624 | */ | 632 | */ |
625 | printk(KERN_CRIT "PM: Please power down manually\n"); | 633 | printk(KERN_CRIT "PM: Please power down manually\n"); |
626 | while(1); | 634 | while (1) |
635 | cpu_relax(); | ||
627 | } | 636 | } |
628 | 637 | ||
629 | /** | 638 | /** |
@@ -1109,7 +1118,10 @@ static int __init resumewait_setup(char *str) | |||
1109 | 1118 | ||
1110 | static int __init resumedelay_setup(char *str) | 1119 | static int __init resumedelay_setup(char *str) |
1111 | { | 1120 | { |
1112 | resume_delay = simple_strtoul(str, NULL, 0); | 1121 | int rc = kstrtouint(str, 0, &resume_delay); |
1122 | |||
1123 | if (rc) | ||
1124 | return rc; | ||
1113 | return 1; | 1125 | return 1; |
1114 | } | 1126 | } |
1115 | 1127 | ||
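swsusp_show_speed() now keeps the whole elapsed-time and throughput calculation in unsigned arithmetic. The arithmetic itself is easy to check in isolation; a standalone version of it (plain C, with the page size and timestamps supplied by the caller instead of coming from the kernel):

    #include <stdio.h>
    #include <stdint.h>

    #define MODEL_PAGE_SIZE 4096u

    /* Same arithmetic as swsusp_show_speed(): pages + elapsed ns -> KB and MB/s. */
    static void show_speed(uint64_t elapsed_ns, unsigned int nr_pages, const char *msg)
    {
        unsigned int centisecs = elapsed_ns / (1000000000u / 100);   /* ns -> 1/100 s */
        unsigned int k, kps;

        if (centisecs == 0)
            centisecs = 1;                      /* avoid div-by-zero */
        k = nr_pages * (MODEL_PAGE_SIZE / 1024);/* kilobytes written or read */
        kps = (k * 100) / centisecs;            /* kilobytes per second */

        printf("PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
               msg, k, centisecs / 100, centisecs % 100,
               kps / 1000, (kps % 1000) / 10);
    }

    int main(void)
    {
        /* 262144 pages (1 GiB) in 8.5 s -> "1048576 kbytes in 8.50 seconds (123.36 MB/s)" */
        show_speed(8500000000ull, 262144, "Wrote");
        return 0;
    }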
diff --git a/kernel/power/main.c b/kernel/power/main.c index 6271bc4073ef..573410d6647e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -279,26 +279,26 @@ static inline void pm_print_times_init(void) {} | |||
279 | struct kobject *power_kobj; | 279 | struct kobject *power_kobj; |
280 | 280 | ||
281 | /** | 281 | /** |
282 | * state - control system power state. | 282 | * state - control system sleep states. |
283 | * | 283 | * |
284 | * show() returns what states are supported, which is hard-coded to | 284 | * show() returns available sleep state labels, which may be "mem", "standby", |
285 | * 'freeze' (Low-Power Idle), 'standby' (Power-On Suspend), | 285 | * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a |
286 | * 'mem' (Suspend-to-RAM), and 'disk' (Suspend-to-Disk). | 286 | * description of what they mean. |
287 | * | 287 | * |
288 | * store() accepts one of those strings, translates it into the | 288 | * store() accepts one of those strings, translates it into the proper |
289 | * proper enumerated value, and initiates a suspend transition. | 289 | * enumerated value, and initiates a suspend transition. |
290 | */ | 290 | */ |
291 | static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, | 291 | static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, |
292 | char *buf) | 292 | char *buf) |
293 | { | 293 | { |
294 | char *s = buf; | 294 | char *s = buf; |
295 | #ifdef CONFIG_SUSPEND | 295 | #ifdef CONFIG_SUSPEND |
296 | int i; | 296 | suspend_state_t i; |
297 | |||
298 | for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) | ||
299 | if (pm_states[i].state) | ||
300 | s += sprintf(s,"%s ", pm_states[i].label); | ||
297 | 301 | ||
298 | for (i = 0; i < PM_SUSPEND_MAX; i++) { | ||
299 | if (pm_states[i] && valid_state(i)) | ||
300 | s += sprintf(s,"%s ", pm_states[i]); | ||
301 | } | ||
302 | #endif | 302 | #endif |
303 | #ifdef CONFIG_HIBERNATION | 303 | #ifdef CONFIG_HIBERNATION |
304 | s += sprintf(s, "%s\n", "disk"); | 304 | s += sprintf(s, "%s\n", "disk"); |
@@ -314,7 +314,7 @@ static suspend_state_t decode_state(const char *buf, size_t n) | |||
314 | { | 314 | { |
315 | #ifdef CONFIG_SUSPEND | 315 | #ifdef CONFIG_SUSPEND |
316 | suspend_state_t state = PM_SUSPEND_MIN; | 316 | suspend_state_t state = PM_SUSPEND_MIN; |
317 | const char * const *s; | 317 | struct pm_sleep_state *s; |
318 | #endif | 318 | #endif |
319 | char *p; | 319 | char *p; |
320 | int len; | 320 | int len; |
@@ -328,8 +328,9 @@ static suspend_state_t decode_state(const char *buf, size_t n) | |||
328 | 328 | ||
329 | #ifdef CONFIG_SUSPEND | 329 | #ifdef CONFIG_SUSPEND |
330 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) | 330 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) |
331 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) | 331 | if (s->state && len == strlen(s->label) |
332 | return state; | 332 | && !strncmp(buf, s->label, len)) |
333 | return s->state; | ||
333 | #endif | 334 | #endif |
334 | 335 | ||
335 | return PM_SUSPEND_ON; | 336 | return PM_SUSPEND_ON; |
@@ -447,8 +448,8 @@ static ssize_t autosleep_show(struct kobject *kobj, | |||
447 | 448 | ||
448 | #ifdef CONFIG_SUSPEND | 449 | #ifdef CONFIG_SUSPEND |
449 | if (state < PM_SUSPEND_MAX) | 450 | if (state < PM_SUSPEND_MAX) |
450 | return sprintf(buf, "%s\n", valid_state(state) ? | 451 | return sprintf(buf, "%s\n", pm_states[state].state ? |
451 | pm_states[state] : "error"); | 452 | pm_states[state].label : "error"); |
452 | #endif | 453 | #endif |
453 | #ifdef CONFIG_HIBERNATION | 454 | #ifdef CONFIG_HIBERNATION |
454 | return sprintf(buf, "disk\n"); | 455 | return sprintf(buf, "disk\n"); |
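state_show() and decode_state() now walk a table of { label, state } pairs, so a label is only advertised or accepted when suspend_set_ops() has filled in its .state slot. A userspace sketch of the same lookup logic (the enum values and table contents are placeholders, not the kernel's pm_states[]):

    #include <stdio.h>
    #include <string.h>

    typedef enum { SLEEP_NONE = 0, SLEEP_FREEZE, SLEEP_STANDBY, SLEEP_MEM, SLEEP_MAX } sleep_state_t;

    struct sleep_state_entry {
        const char   *label;
        sleep_state_t state;                  /* SLEEP_NONE means "not supported here" */
    };

    static struct sleep_state_entry sleep_states[SLEEP_MAX] = {
        [SLEEP_FREEZE]  = { "freeze",  SLEEP_FREEZE },
        [SLEEP_STANDBY] = { "standby", SLEEP_NONE },  /* e.g. platform lacks standby */
        [SLEEP_MEM]     = { "mem",     SLEEP_MEM },
    };

    /* show: print every label whose state slot is populated. */
    static void show_states(void)
    {
        for (int i = SLEEP_FREEZE; i < SLEEP_MAX; i++)
            if (sleep_states[i].state)
                printf("%s ", sleep_states[i].label);
        printf("\n");
    }

    /* store: map a written label back to the state it currently stands for. */
    static sleep_state_t decode_state(const char *buf)
    {
        for (int i = SLEEP_FREEZE; i < SLEEP_MAX; i++)
            if (sleep_states[i].state && !strcmp(buf, sleep_states[i].label))
                return sleep_states[i].state;
        return SLEEP_NONE;
    }

    int main(void)
    {
        show_states();                        /* prints: freeze mem */
        printf("mem -> %d\n", decode_state("mem"));
        return 0;
    }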
diff --git a/kernel/power/power.h b/kernel/power/power.h index 15f37ea08719..c60f13b5270a 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -178,17 +178,20 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, | |||
178 | unsigned int, char *); | 178 | unsigned int, char *); |
179 | 179 | ||
180 | #ifdef CONFIG_SUSPEND | 180 | #ifdef CONFIG_SUSPEND |
181 | struct pm_sleep_state { | ||
182 | const char *label; | ||
183 | suspend_state_t state; | ||
184 | }; | ||
185 | |||
181 | /* kernel/power/suspend.c */ | 186 | /* kernel/power/suspend.c */ |
182 | extern const char *const pm_states[]; | 187 | extern struct pm_sleep_state pm_states[]; |
183 | 188 | ||
184 | extern bool valid_state(suspend_state_t state); | ||
185 | extern int suspend_devices_and_enter(suspend_state_t state); | 189 | extern int suspend_devices_and_enter(suspend_state_t state); |
186 | #else /* !CONFIG_SUSPEND */ | 190 | #else /* !CONFIG_SUSPEND */ |
187 | static inline int suspend_devices_and_enter(suspend_state_t state) | 191 | static inline int suspend_devices_and_enter(suspend_state_t state) |
188 | { | 192 | { |
189 | return -ENOSYS; | 193 | return -ENOSYS; |
190 | } | 194 | } |
191 | static inline bool valid_state(suspend_state_t state) { return false; } | ||
192 | #endif /* !CONFIG_SUSPEND */ | 195 | #endif /* !CONFIG_SUSPEND */ |
193 | 196 | ||
194 | #ifdef CONFIG_PM_TEST_SUSPEND | 197 | #ifdef CONFIG_PM_TEST_SUSPEND |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 06ec8869dbf1..0ca8d83e2369 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
18 | #include <linux/workqueue.h> | 18 | #include <linux/workqueue.h> |
19 | #include <linux/kmod.h> | 19 | #include <linux/kmod.h> |
20 | #include <trace/events/power.h> | ||
20 | 21 | ||
21 | /* | 22 | /* |
22 | * Timeout for stopping processes | 23 | * Timeout for stopping processes |
@@ -175,6 +176,7 @@ void thaw_processes(void) | |||
175 | struct task_struct *g, *p; | 176 | struct task_struct *g, *p; |
176 | struct task_struct *curr = current; | 177 | struct task_struct *curr = current; |
177 | 178 | ||
179 | trace_suspend_resume(TPS("thaw_processes"), 0, true); | ||
178 | if (pm_freezing) | 180 | if (pm_freezing) |
179 | atomic_dec(&system_freezing_cnt); | 181 | atomic_dec(&system_freezing_cnt); |
180 | pm_freezing = false; | 182 | pm_freezing = false; |
@@ -201,6 +203,7 @@ void thaw_processes(void) | |||
201 | 203 | ||
202 | schedule(); | 204 | schedule(); |
203 | printk("done.\n"); | 205 | printk("done.\n"); |
206 | trace_suspend_resume(TPS("thaw_processes"), 0, false); | ||
204 | } | 207 | } |
205 | 208 | ||
206 | void thaw_kernel_threads(void) | 209 | void thaw_kernel_threads(void) |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 18fb7a2fb14b..1ea328aafdc9 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -1586,7 +1586,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, | |||
1586 | return -ENOMEM; | 1586 | return -ENOMEM; |
1587 | } | 1587 | } |
1588 | 1588 | ||
1589 | asmlinkage int swsusp_save(void) | 1589 | asmlinkage __visible int swsusp_save(void) |
1590 | { | 1590 | { |
1591 | unsigned int nr_pages, nr_highmem; | 1591 | unsigned int nr_pages, nr_highmem; |
1592 | 1592 | ||
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8233cd4047d7..4dd8822f732a 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -31,13 +31,14 @@ | |||
31 | 31 | ||
32 | #include "power.h" | 32 | #include "power.h" |
33 | 33 | ||
34 | const char *const pm_states[PM_SUSPEND_MAX] = { | 34 | struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { |
35 | [PM_SUSPEND_FREEZE] = "freeze", | 35 | [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE }, |
36 | [PM_SUSPEND_STANDBY] = "standby", | 36 | [PM_SUSPEND_STANDBY] = { .label = "standby", }, |
37 | [PM_SUSPEND_MEM] = "mem", | 37 | [PM_SUSPEND_MEM] = { .label = "mem", }, |
38 | }; | 38 | }; |
39 | 39 | ||
40 | static const struct platform_suspend_ops *suspend_ops; | 40 | static const struct platform_suspend_ops *suspend_ops; |
41 | static const struct platform_freeze_ops *freeze_ops; | ||
41 | 42 | ||
42 | static bool need_suspend_ops(suspend_state_t state) | 43 | static bool need_suspend_ops(suspend_state_t state) |
43 | { | 44 | { |
@@ -47,6 +48,13 @@ static bool need_suspend_ops(suspend_state_t state) | |||
47 | static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); | 48 | static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); |
48 | static bool suspend_freeze_wake; | 49 | static bool suspend_freeze_wake; |
49 | 50 | ||
51 | void freeze_set_ops(const struct platform_freeze_ops *ops) | ||
52 | { | ||
53 | lock_system_sleep(); | ||
54 | freeze_ops = ops; | ||
55 | unlock_system_sleep(); | ||
56 | } | ||
57 | |||
50 | static void freeze_begin(void) | 58 | static void freeze_begin(void) |
51 | { | 59 | { |
52 | suspend_freeze_wake = false; | 60 | suspend_freeze_wake = false; |
@@ -54,9 +62,11 @@ static void freeze_begin(void) | |||
54 | 62 | ||
55 | static void freeze_enter(void) | 63 | static void freeze_enter(void) |
56 | { | 64 | { |
65 | cpuidle_use_deepest_state(true); | ||
57 | cpuidle_resume(); | 66 | cpuidle_resume(); |
58 | wait_event(suspend_freeze_wait_head, suspend_freeze_wake); | 67 | wait_event(suspend_freeze_wait_head, suspend_freeze_wake); |
59 | cpuidle_pause(); | 68 | cpuidle_pause(); |
69 | cpuidle_use_deepest_state(false); | ||
60 | } | 70 | } |
61 | 71 | ||
62 | void freeze_wake(void) | 72 | void freeze_wake(void) |
@@ -66,42 +76,62 @@ void freeze_wake(void) | |||
66 | } | 76 | } |
67 | EXPORT_SYMBOL_GPL(freeze_wake); | 77 | EXPORT_SYMBOL_GPL(freeze_wake); |
68 | 78 | ||
79 | static bool valid_state(suspend_state_t state) | ||
80 | { | ||
81 | /* | ||
82 | * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level | ||
83 | * support and need to be valid to the low level | ||
84 | * implementation, no valid callback implies that none are valid. | ||
85 | */ | ||
86 | return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); | ||
87 | } | ||
88 | |||
89 | /* | ||
90 | * If this is set, the "mem" label always corresponds to the deepest sleep state | ||
91 | * available, the "standby" label corresponds to the second deepest sleep state | ||
92 | * available (if any), and the "freeze" label corresponds to the remaining | ||
93 | * available sleep state (if there is one). | ||
94 | */ | ||
95 | static bool relative_states; | ||
96 | |||
97 | static int __init sleep_states_setup(char *str) | ||
98 | { | ||
99 | relative_states = !strncmp(str, "1", 1); | ||
100 | if (relative_states) { | ||
101 | pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE; | ||
102 | pm_states[PM_SUSPEND_FREEZE].state = 0; | ||
103 | } | ||
104 | return 1; | ||
105 | } | ||
106 | |||
107 | __setup("relative_sleep_states=", sleep_states_setup); | ||
108 | |||
69 | /** | 109 | /** |
70 | * suspend_set_ops - Set the global suspend method table. | 110 | * suspend_set_ops - Set the global suspend method table. |
71 | * @ops: Suspend operations to use. | 111 | * @ops: Suspend operations to use. |
72 | */ | 112 | */ |
73 | void suspend_set_ops(const struct platform_suspend_ops *ops) | 113 | void suspend_set_ops(const struct platform_suspend_ops *ops) |
74 | { | 114 | { |
115 | suspend_state_t i; | ||
116 | int j = PM_SUSPEND_MAX - 1; | ||
117 | |||
75 | lock_system_sleep(); | 118 | lock_system_sleep(); |
119 | |||
76 | suspend_ops = ops; | 120 | suspend_ops = ops; |
121 | for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) | ||
122 | if (valid_state(i)) | ||
123 | pm_states[j--].state = i; | ||
124 | else if (!relative_states) | ||
125 | pm_states[j--].state = 0; | ||
126 | |||
127 | pm_states[j--].state = PM_SUSPEND_FREEZE; | ||
128 | while (j >= PM_SUSPEND_MIN) | ||
129 | pm_states[j--].state = 0; | ||
130 | |||
77 | unlock_system_sleep(); | 131 | unlock_system_sleep(); |
78 | } | 132 | } |
79 | EXPORT_SYMBOL_GPL(suspend_set_ops); | 133 | EXPORT_SYMBOL_GPL(suspend_set_ops); |
80 | 134 | ||
81 | bool valid_state(suspend_state_t state) | ||
82 | { | ||
83 | if (state == PM_SUSPEND_FREEZE) { | ||
84 | #ifdef CONFIG_PM_DEBUG | ||
85 | if (pm_test_level != TEST_NONE && | ||
86 | pm_test_level != TEST_FREEZER && | ||
87 | pm_test_level != TEST_DEVICES && | ||
88 | pm_test_level != TEST_PLATFORM) { | ||
89 | printk(KERN_WARNING "Unsupported pm_test mode for " | ||
90 | "freeze state, please choose " | ||
91 | "none/freezer/devices/platform.\n"); | ||
92 | return false; | ||
93 | } | ||
94 | #endif | ||
95 | return true; | ||
96 | } | ||
97 | /* | ||
98 | * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel | ||
99 | * support and need to be valid to the lowlevel | ||
100 | * implementation, no valid callback implies that none are valid. | ||
101 | */ | ||
102 | return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); | ||
103 | } | ||
104 | |||
105 | /** | 135 | /** |
106 | * suspend_valid_only_mem - Generic memory-only valid callback. | 136 | * suspend_valid_only_mem - Generic memory-only valid callback. |
107 | * | 137 | * |
@@ -147,7 +177,9 @@ static int suspend_prepare(suspend_state_t state) | |||
147 | if (error) | 177 | if (error) |
148 | goto Finish; | 178 | goto Finish; |
149 | 179 | ||
180 | trace_suspend_resume(TPS("freeze_processes"), 0, true); | ||
150 | error = suspend_freeze_processes(); | 181 | error = suspend_freeze_processes(); |
182 | trace_suspend_resume(TPS("freeze_processes"), 0, false); | ||
151 | if (!error) | 183 | if (!error) |
152 | return 0; | 184 | return 0; |
153 | 185 | ||
@@ -210,7 +242,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
210 | * all the devices are suspended. | 242 | * all the devices are suspended. |
211 | */ | 243 | */ |
212 | if (state == PM_SUSPEND_FREEZE) { | 244 | if (state == PM_SUSPEND_FREEZE) { |
245 | trace_suspend_resume(TPS("machine_suspend"), state, true); | ||
213 | freeze_enter(); | 246 | freeze_enter(); |
247 | trace_suspend_resume(TPS("machine_suspend"), state, false); | ||
214 | goto Platform_wake; | 248 | goto Platform_wake; |
215 | } | 249 | } |
216 | 250 | ||
@@ -226,7 +260,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
226 | if (!error) { | 260 | if (!error) { |
227 | *wakeup = pm_wakeup_pending(); | 261 | *wakeup = pm_wakeup_pending(); |
228 | if (!(suspend_test(TEST_CORE) || *wakeup)) { | 262 | if (!(suspend_test(TEST_CORE) || *wakeup)) { |
263 | trace_suspend_resume(TPS("machine_suspend"), | ||
264 | state, true); | ||
229 | error = suspend_ops->enter(state); | 265 | error = suspend_ops->enter(state); |
266 | trace_suspend_resume(TPS("machine_suspend"), | ||
267 | state, false); | ||
230 | events_check_enabled = false; | 268 | events_check_enabled = false; |
231 | } | 269 | } |
232 | syscore_resume(); | 270 | syscore_resume(); |
@@ -264,11 +302,14 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
264 | if (need_suspend_ops(state) && !suspend_ops) | 302 | if (need_suspend_ops(state) && !suspend_ops) |
265 | return -ENOSYS; | 303 | return -ENOSYS; |
266 | 304 | ||
267 | trace_machine_suspend(state); | ||
268 | if (need_suspend_ops(state) && suspend_ops->begin) { | 305 | if (need_suspend_ops(state) && suspend_ops->begin) { |
269 | error = suspend_ops->begin(state); | 306 | error = suspend_ops->begin(state); |
270 | if (error) | 307 | if (error) |
271 | goto Close; | 308 | goto Close; |
309 | } else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) { | ||
310 | error = freeze_ops->begin(); | ||
311 | if (error) | ||
312 | goto Close; | ||
272 | } | 313 | } |
273 | suspend_console(); | 314 | suspend_console(); |
274 | suspend_test_start(); | 315 | suspend_test_start(); |
@@ -294,7 +335,9 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
294 | Close: | 335 | Close: |
295 | if (need_suspend_ops(state) && suspend_ops->end) | 336 | if (need_suspend_ops(state) && suspend_ops->end) |
296 | suspend_ops->end(); | 337 | suspend_ops->end(); |
297 | trace_machine_suspend(PWR_EVENT_EXIT); | 338 | else if (state == PM_SUSPEND_FREEZE && freeze_ops->end) |
339 | freeze_ops->end(); | ||
340 | |||
298 | return error; | 341 | return error; |
299 | 342 | ||
300 | Recover_platform: | 343 | Recover_platform: |
@@ -328,20 +371,31 @@ static int enter_state(suspend_state_t state) | |||
328 | { | 371 | { |
329 | int error; | 372 | int error; |
330 | 373 | ||
331 | if (!valid_state(state)) | 374 | trace_suspend_resume(TPS("suspend_enter"), state, true); |
332 | return -ENODEV; | 375 | if (state == PM_SUSPEND_FREEZE) { |
333 | 376 | #ifdef CONFIG_PM_DEBUG | |
377 | if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { | ||
378 | pr_warning("PM: Unsupported test mode for freeze state, " | ||
379 | "please choose none/freezer/devices/platform.\n"); | ||
380 | return -EAGAIN; | ||
381 | } | ||
382 | #endif | ||
383 | } else if (!valid_state(state)) { | ||
384 | return -EINVAL; | ||
385 | } | ||
334 | if (!mutex_trylock(&pm_mutex)) | 386 | if (!mutex_trylock(&pm_mutex)) |
335 | return -EBUSY; | 387 | return -EBUSY; |
336 | 388 | ||
337 | if (state == PM_SUSPEND_FREEZE) | 389 | if (state == PM_SUSPEND_FREEZE) |
338 | freeze_begin(); | 390 | freeze_begin(); |
339 | 391 | ||
392 | trace_suspend_resume(TPS("sync_filesystems"), 0, true); | ||
340 | printk(KERN_INFO "PM: Syncing filesystems ... "); | 393 | printk(KERN_INFO "PM: Syncing filesystems ... "); |
341 | sys_sync(); | 394 | sys_sync(); |
342 | printk("done.\n"); | 395 | printk("done.\n"); |
396 | trace_suspend_resume(TPS("sync_filesystems"), 0, false); | ||
343 | 397 | ||
344 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); | 398 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); |
345 | error = suspend_prepare(state); | 399 | error = suspend_prepare(state); |
346 | if (error) | 400 | if (error) |
347 | goto Unlock; | 401 | goto Unlock; |
@@ -349,7 +403,8 @@ static int enter_state(suspend_state_t state) | |||
349 | if (suspend_test(TEST_FREEZER)) | 403 | if (suspend_test(TEST_FREEZER)) |
350 | goto Finish; | 404 | goto Finish; |
351 | 405 | ||
352 | pr_debug("PM: Entering %s sleep\n", pm_states[state]); | 406 | trace_suspend_resume(TPS("suspend_enter"), state, false); |
407 | pr_debug("PM: Entering %s sleep\n", pm_states[state].label); | ||
353 | pm_restrict_gfp_mask(); | 408 | pm_restrict_gfp_mask(); |
354 | error = suspend_devices_and_enter(state); | 409 | error = suspend_devices_and_enter(state); |
355 | pm_restore_gfp_mask(); | 410 | pm_restore_gfp_mask(); |
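The most intricate part of the suspend.c change is the slotting loop in suspend_set_ops(): it walks the platform states from mem downwards, assigns each valid one to a label starting from the deepest, leaves gaps in place for the default absolute labels, and packs the labels together when relative_sleep_states=1 so that "mem" always names the deepest state the platform offers. A toy version of that loop (standalone C; the valid() callback and the state set are invented for the example):

    #include <stdbool.h>
    #include <stdio.h>

    enum { S_NONE = 0, S_FREEZE = 1, S_STANDBY, S_MEM, S_MAX };

    struct label_slot {
        const char *label;
        int state;                            /* 0 = label not usable */
    };

    static struct label_slot slots[S_MAX] = {
        [S_FREEZE]  = { "freeze",  0 },
        [S_STANDBY] = { "standby", 0 },
        [S_MEM]     = { "mem",     0 },
    };

    static bool relative_states;              /* as if "relative_sleep_states=1" */

    /* Platform callback stand-in: say the hardware only supports "mem". */
    static bool platform_valid(int state) { return state == S_MEM; }

    /* Mirror of the suspend_set_ops() slotting loop. */
    static void assign_states(void)
    {
        int j = S_MAX - 1;

        for (int i = S_MEM; i >= S_STANDBY; i--) {
            if (platform_valid(i))
                slots[j--].state = i;
            else if (!relative_states)
                slots[j--].state = 0;         /* keep labels in their fixed places */
        }
        slots[j--].state = S_FREEZE;          /* freeze always works */
        while (j >= S_FREEZE)
            slots[j--].state = 0;
    }

    int main(void)
    {
        relative_states = true;
        assign_states();
        for (int i = S_FREEZE; i < S_MAX; i++)
            printf("%-8s -> state %d\n", slots[i].label, slots[i].state);
        /* with only "mem" valid: mem -> S_MEM, standby -> S_FREEZE, freeze -> 0 */
        return 0;
    }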
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 9b2a1d58558d..269b097e78ea 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c | |||
@@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) | |||
92 | } | 92 | } |
93 | 93 | ||
94 | if (state == PM_SUSPEND_MEM) { | 94 | if (state == PM_SUSPEND_MEM) { |
95 | printk(info_test, pm_states[state]); | 95 | printk(info_test, pm_states[state].label); |
96 | status = pm_suspend(state); | 96 | status = pm_suspend(state); |
97 | if (status == -ENODEV) | 97 | if (status == -ENODEV) |
98 | state = PM_SUSPEND_STANDBY; | 98 | state = PM_SUSPEND_STANDBY; |
99 | } | 99 | } |
100 | if (state == PM_SUSPEND_STANDBY) { | 100 | if (state == PM_SUSPEND_STANDBY) { |
101 | printk(info_test, pm_states[state]); | 101 | printk(info_test, pm_states[state].label); |
102 | status = pm_suspend(state); | 102 | status = pm_suspend(state); |
103 | } | 103 | } |
104 | if (status < 0) | 104 | if (status < 0) |
@@ -136,18 +136,16 @@ static char warn_bad_state[] __initdata = | |||
136 | 136 | ||
137 | static int __init setup_test_suspend(char *value) | 137 | static int __init setup_test_suspend(char *value) |
138 | { | 138 | { |
139 | unsigned i; | 139 | suspend_state_t i; |
140 | 140 | ||
141 | /* "=mem" ==> "mem" */ | 141 | /* "=mem" ==> "mem" */ |
142 | value++; | 142 | value++; |
143 | for (i = 0; i < PM_SUSPEND_MAX; i++) { | 143 | for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) |
144 | if (!pm_states[i]) | 144 | if (!strcmp(pm_states[i].label, value)) { |
145 | continue; | 145 | test_state = pm_states[i].state; |
146 | if (strcmp(pm_states[i], value) != 0) | 146 | return 0; |
147 | continue; | 147 | } |
148 | test_state = (__force suspend_state_t) i; | 148 | |
149 | return 0; | ||
150 | } | ||
151 | printk(warn_bad_state, value); | 149 | printk(warn_bad_state, value); |
152 | return 0; | 150 | return 0; |
153 | } | 151 | } |
@@ -164,8 +162,8 @@ static int __init test_suspend(void) | |||
164 | /* PM is initialized by now; is that state testable? */ | 162 | /* PM is initialized by now; is that state testable? */ |
165 | if (test_state == PM_SUSPEND_ON) | 163 | if (test_state == PM_SUSPEND_ON) |
166 | goto done; | 164 | goto done; |
167 | if (!valid_state(test_state)) { | 165 | if (!pm_states[test_state].state) { |
168 | printk(warn_bad_state, pm_states[test_state]); | 166 | printk(warn_bad_state, pm_states[test_state].label); |
169 | goto done; | 167 | goto done; |
170 | } | 168 | } |
171 | 169 | ||
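The suspend_test.c hunks above move from matching bare strings in pm_states[] to a label/state pair and from valid_state() to checking pm_states[test_state].state. A standalone sketch of that lookup pattern; the table layout is illustrative, only the .label/.state fields used by the patch are assumed:

#include <stdio.h>
#include <string.h>

typedef int suspend_state_t;
enum { PM_SUSPEND_ON = 0, PM_SUSPEND_MIN = 1, PM_SUSPEND_FREEZE = 1,
       PM_SUSPEND_STANDBY = 2, PM_SUSPEND_MEM = 3, PM_SUSPEND_MAX = 4 };

/* illustrative equivalent of the label/state table the patch introduces */
struct pm_sleep_state { const char *label; suspend_state_t state; };

static const struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = {
	[PM_SUSPEND_FREEZE]  = { "freeze",  PM_SUSPEND_FREEZE  },
	[PM_SUSPEND_STANDBY] = { "standby", PM_SUSPEND_STANDBY },
	[PM_SUSPEND_MEM]     = { "mem",     PM_SUSPEND_MEM     },
};

/* same loop shape as the reworked setup_test_suspend() */
static suspend_state_t parse_test_state(const char *value)
{
	suspend_state_t i;

	for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
		if (pm_states[i].label && !strcmp(pm_states[i].label, value))
			return pm_states[i].state;	/* found a testable state */
	return PM_SUSPEND_ON;				/* unknown label: no test */
}

int main(void)
{
	printf("%d\n", parse_test_state("mem"));	/* 3 */
	printf("%d\n", parse_test_state("bogus"));	/* 0 */
	return 0;
}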
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8c9a4819f798..aaa3261dea5d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -567,7 +567,7 @@ static int lzo_compress_threadfn(void *data) | |||
567 | 567 | ||
568 | /** | 568 | /** |
569 | * save_image_lzo - Save the suspend image data compressed with LZO. | 569 | * save_image_lzo - Save the suspend image data compressed with LZO. |
570 | * @handle: Swap mam handle to use for saving the image. | 570 | * @handle: Swap map handle to use for saving the image. |
571 | * @snapshot: Image to read data from. | 571 | * @snapshot: Image to read data from. |
572 | * @nr_to_write: Number of pages to save. | 572 | * @nr_to_write: Number of pages to save. |
573 | */ | 573 | */ |
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a45b50962295..ea2d5f6962ed 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -54,20 +54,16 @@ | |||
54 | #include "console_cmdline.h" | 54 | #include "console_cmdline.h" |
55 | #include "braille.h" | 55 | #include "braille.h" |
56 | 56 | ||
57 | /* printk's without a loglevel use this.. */ | ||
58 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL | ||
59 | |||
60 | /* We show everything that is MORE important than this.. */ | ||
61 | #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ | ||
62 | #define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ | ||
63 | |||
64 | int console_printk[4] = { | 57 | int console_printk[4] = { |
65 | DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ | 58 | CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ |
66 | DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ | 59 | DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ |
67 | MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ | 60 | CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ |
68 | DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ | 61 | CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ |
69 | }; | 62 | }; |
70 | 63 | ||
64 | /* Deferred messages from sched code are marked by this special level */ | ||
65 | #define SCHED_MESSAGE_LOGLEVEL -2 | ||
66 | |||
71 | /* | 67 | /* |
72 | * Low level drivers may need that to know if they can schedule in | 68 | * Low level drivers may need that to know if they can schedule in |
73 | * their unblank() callback or not. So let's export it. | 69 | * their unblank() callback or not. So let's export it. |
@@ -91,6 +87,29 @@ static struct lockdep_map console_lock_dep_map = { | |||
91 | #endif | 87 | #endif |
92 | 88 | ||
93 | /* | 89 | /* |
90 | * Helper macros to handle lockdep when locking/unlocking console_sem. We use | ||
91 | * macros instead of functions so that _RET_IP_ contains useful information. | ||
92 | */ | ||
93 | #define down_console_sem() do { \ | ||
94 | down(&console_sem);\ | ||
95 | mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\ | ||
96 | } while (0) | ||
97 | |||
98 | static int __down_trylock_console_sem(unsigned long ip) | ||
99 | { | ||
100 | if (down_trylock(&console_sem)) | ||
101 | return 1; | ||
102 | mutex_acquire(&console_lock_dep_map, 0, 1, ip); | ||
103 | return 0; | ||
104 | } | ||
105 | #define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) | ||
106 | |||
107 | #define up_console_sem() do { \ | ||
108 | mutex_release(&console_lock_dep_map, 1, _RET_IP_);\ | ||
109 | up(&console_sem);\ | ||
110 | } while (0) | ||
111 | |||
112 | /* | ||
94 | * This is used for debugging the mess that is the VT code by | 113 | * This is used for debugging the mess that is the VT code by |
95 | * keeping track if we have the console semaphore held. It's | 114 | * keeping track if we have the console semaphore held. It's |
96 | * definitely not the perfect debug tool (we don't know if _WE_ | 115 | * definitely not the perfect debug tool (we don't know if _WE_ |
@@ -206,8 +225,9 @@ struct printk_log { | |||
206 | }; | 225 | }; |
207 | 226 | ||
208 | /* | 227 | /* |
209 | * The logbuf_lock protects kmsg buffer, indices, counters. It is also | 228 | * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken |
210 | * used in interesting ways to provide interlocking in console_unlock(); | 229 | * within the scheduler's rq lock. It must be released before calling |
230 | * console_unlock() or anything else that might wake up a process. | ||
211 | */ | 231 | */ |
212 | static DEFINE_RAW_SPINLOCK(logbuf_lock); | 232 | static DEFINE_RAW_SPINLOCK(logbuf_lock); |
213 | 233 | ||
@@ -250,9 +270,6 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); | |||
250 | static char *log_buf = __log_buf; | 270 | static char *log_buf = __log_buf; |
251 | static u32 log_buf_len = __LOG_BUF_LEN; | 271 | static u32 log_buf_len = __LOG_BUF_LEN; |
252 | 272 | ||
253 | /* cpu currently holding logbuf_lock */ | ||
254 | static volatile unsigned int logbuf_cpu = UINT_MAX; | ||
255 | |||
256 | /* human readable text of the record */ | 273 | /* human readable text of the record */ |
257 | static char *log_text(const struct printk_log *msg) | 274 | static char *log_text(const struct printk_log *msg) |
258 | { | 275 | { |
@@ -297,34 +314,106 @@ static u32 log_next(u32 idx) | |||
297 | return idx + msg->len; | 314 | return idx + msg->len; |
298 | } | 315 | } |
299 | 316 | ||
300 | /* insert record into the buffer, discard old ones, update heads */ | 317 | /* |
301 | static void log_store(int facility, int level, | 318 | * Check whether there is enough free space for the given message. |
302 | enum log_flags flags, u64 ts_nsec, | 319 | * |
303 | const char *dict, u16 dict_len, | 320 | * The same values of first_idx and next_idx mean that the buffer |
304 | const char *text, u16 text_len) | 321 | * is either empty or full. |
322 | * | ||
323 | * If the buffer is empty, we must respect the position of the indexes. | ||
324 | * They cannot be reset to the beginning of the buffer. | ||
325 | */ | ||
326 | static int logbuf_has_space(u32 msg_size, bool empty) | ||
305 | { | 327 | { |
306 | struct printk_log *msg; | 328 | u32 free; |
307 | u32 size, pad_len; | ||
308 | 329 | ||
309 | /* number of '\0' padding bytes to next message */ | 330 | if (log_next_idx > log_first_idx || empty) |
310 | size = sizeof(struct printk_log) + text_len + dict_len; | 331 | free = max(log_buf_len - log_next_idx, log_first_idx); |
311 | pad_len = (-size) & (LOG_ALIGN - 1); | 332 | else |
312 | size += pad_len; | 333 | free = log_first_idx - log_next_idx; |
334 | |||
335 | /* | ||
336 | * We also need space for an empty header that signals wrapping | ||
337 | * of the buffer. | ||
338 | */ | ||
339 | return free >= msg_size + sizeof(struct printk_log); | ||
340 | } | ||
313 | 341 | ||
342 | static int log_make_free_space(u32 msg_size) | ||
343 | { | ||
314 | while (log_first_seq < log_next_seq) { | 344 | while (log_first_seq < log_next_seq) { |
315 | u32 free; | 345 | if (logbuf_has_space(msg_size, false)) |
346 | return 0; | ||
347 | /* drop old messages until we have enough continuous space */ | ||
348 | log_first_idx = log_next(log_first_idx); | ||
349 | log_first_seq++; | ||
350 | } | ||
316 | 351 | ||
317 | if (log_next_idx > log_first_idx) | 352 | /* sequence numbers are equal, so the log buffer is empty */ |
318 | free = max(log_buf_len - log_next_idx, log_first_idx); | 353 | if (logbuf_has_space(msg_size, true)) |
319 | else | 354 | return 0; |
320 | free = log_first_idx - log_next_idx; | ||
321 | 355 | ||
322 | if (free >= size + sizeof(struct printk_log)) | 356 | return -ENOMEM; |
323 | break; | 357 | } |
324 | 358 | ||
325 | /* drop old messages until we have enough contiuous space */ | 359 | /* compute the message size including the padding bytes */ |
326 | log_first_idx = log_next(log_first_idx); | 360 | static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) |
327 | log_first_seq++; | 361 | { |
362 | u32 size; | ||
363 | |||
364 | size = sizeof(struct printk_log) + text_len + dict_len; | ||
365 | *pad_len = (-size) & (LOG_ALIGN - 1); | ||
366 | size += *pad_len; | ||
367 | |||
368 | return size; | ||
369 | } | ||
370 | |||
371 | /* | ||
372 | * Define how much of the log buffer we could take at maximum. The value | ||
373 | * must be greater than two. Note that only half of the buffer is available | ||
374 | * when the index points to the middle. | ||
375 | */ | ||
376 | #define MAX_LOG_TAKE_PART 4 | ||
377 | static const char trunc_msg[] = "<truncated>"; | ||
378 | |||
379 | static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, | ||
380 | u16 *dict_len, u32 *pad_len) | ||
381 | { | ||
382 | /* | ||
383 | * The message should not take the whole buffer. Otherwise, it might | ||
384 | * get removed too soon. | ||
385 | */ | ||
386 | u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; | ||
387 | if (*text_len > max_text_len) | ||
388 | *text_len = max_text_len; | ||
389 | /* enable the warning message */ | ||
390 | *trunc_msg_len = strlen(trunc_msg); | ||
391 | /* disable the "dict" completely */ | ||
392 | *dict_len = 0; | ||
393 | /* compute the size again, count also the warning message */ | ||
394 | return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len); | ||
395 | } | ||
396 | |||
397 | /* insert record into the buffer, discard old ones, update heads */ | ||
398 | static int log_store(int facility, int level, | ||
399 | enum log_flags flags, u64 ts_nsec, | ||
400 | const char *dict, u16 dict_len, | ||
401 | const char *text, u16 text_len) | ||
402 | { | ||
403 | struct printk_log *msg; | ||
404 | u32 size, pad_len; | ||
405 | u16 trunc_msg_len = 0; | ||
406 | |||
407 | /* number of '\0' padding bytes to next message */ | ||
408 | size = msg_used_size(text_len, dict_len, &pad_len); | ||
409 | |||
410 | if (log_make_free_space(size)) { | ||
411 | /* truncate the message if it is too long for empty buffer */ | ||
412 | size = truncate_msg(&text_len, &trunc_msg_len, | ||
413 | &dict_len, &pad_len); | ||
414 | /* survive when the log buffer is too small for trunc_msg */ | ||
415 | if (log_make_free_space(size)) | ||
416 | return 0; | ||
328 | } | 417 | } |
329 | 418 | ||
330 | if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { | 419 | if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { |
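The new helpers above reduce record storage to two pieces of arithmetic: msg_used_size() pads each record so the next header stays LOG_ALIGN-aligned, and logbuf_has_space() compares that size against the contiguous free region of the wrap-around buffer, reserving room for the empty header that marks a wrap. A standalone sketch of the same arithmetic; the buffer length, header size and LOG_ALIGN value here are made up for illustration:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define LOG_ALIGN   8u		/* illustrative; the kernel derives this per-arch */
#define LOG_BUF_LEN 256u
#define HDR_SIZE    16u		/* stand-in for sizeof(struct printk_log) */

/* pad the record so the next header starts LOG_ALIGN-aligned */
static uint32_t msg_used_size(uint32_t text_len, uint32_t dict_len, uint32_t *pad_len)
{
	uint32_t size = HDR_SIZE + text_len + dict_len;

	*pad_len = (-size) & (LOG_ALIGN - 1);
	return size + *pad_len;
}

/*
 * first_idx == next_idx means either empty or full, so the caller passes
 * "empty" explicitly, just like logbuf_has_space() in the patch.
 */
static bool logbuf_has_space(uint32_t first_idx, uint32_t next_idx,
			     uint32_t msg_size, bool empty)
{
	uint32_t free;

	if (next_idx > first_idx || empty)
		free = (LOG_BUF_LEN - next_idx > first_idx) ?
		       LOG_BUF_LEN - next_idx : first_idx;
	else
		free = first_idx - next_idx;

	/* keep room for the empty header that marks a wrap */
	return free >= msg_size + HDR_SIZE;
}

int main(void)
{
	uint32_t pad, size = msg_used_size(30, 0, &pad);

	printf("size=%u pad=%u\n", size, pad);			/* size=48 pad=2 */
	printf("%d\n", logbuf_has_space(0, 0, size, true));	/* 1: empty buffer */
	printf("%d\n", logbuf_has_space(64, 200, size, false));	/* 1: max(56,64) >= 64 */
	return 0;
}

log_make_free_space() then simply drops the oldest records until this predicate holds, and truncate_msg() caps a single record at a quarter of the buffer (MAX_LOG_TAKE_PART == 4) so one oversized line cannot evict everything else.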
@@ -341,6 +430,10 @@ static void log_store(int facility, int level, | |||
341 | msg = (struct printk_log *)(log_buf + log_next_idx); | 430 | msg = (struct printk_log *)(log_buf + log_next_idx); |
342 | memcpy(log_text(msg), text, text_len); | 431 | memcpy(log_text(msg), text, text_len); |
343 | msg->text_len = text_len; | 432 | msg->text_len = text_len; |
433 | if (trunc_msg_len) { | ||
434 | memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len); | ||
435 | msg->text_len += trunc_msg_len; | ||
436 | } | ||
344 | memcpy(log_dict(msg), dict, dict_len); | 437 | memcpy(log_dict(msg), dict, dict_len); |
345 | msg->dict_len = dict_len; | 438 | msg->dict_len = dict_len; |
346 | msg->facility = facility; | 439 | msg->facility = facility; |
@@ -356,6 +449,8 @@ static void log_store(int facility, int level, | |||
356 | /* insert message */ | 449 | /* insert message */ |
357 | log_next_idx += msg->len; | 450 | log_next_idx += msg->len; |
358 | log_next_seq++; | 451 | log_next_seq++; |
452 | |||
453 | return msg->text_len; | ||
359 | } | 454 | } |
360 | 455 | ||
361 | #ifdef CONFIG_SECURITY_DMESG_RESTRICT | 456 | #ifdef CONFIG_SECURITY_DMESG_RESTRICT |
@@ -1303,7 +1398,10 @@ static void zap_locks(void) | |||
1303 | sema_init(&console_sem, 1); | 1398 | sema_init(&console_sem, 1); |
1304 | } | 1399 | } |
1305 | 1400 | ||
1306 | /* Check if we have any console registered that can be called early in boot. */ | 1401 | /* |
1402 | * Check if we have any console that is capable of printing while cpu is | ||
1403 | * booting or shutting down. Requires console_sem. | ||
1404 | */ | ||
1307 | static int have_callable_console(void) | 1405 | static int have_callable_console(void) |
1308 | { | 1406 | { |
1309 | struct console *con; | 1407 | struct console *con; |
@@ -1318,10 +1416,9 @@ static int have_callable_console(void) | |||
1318 | /* | 1416 | /* |
1319 | * Can we actually use the console at this time on this cpu? | 1417 | * Can we actually use the console at this time on this cpu? |
1320 | * | 1418 | * |
1321 | * Console drivers may assume that per-cpu resources have | 1419 | * Console drivers may assume that per-cpu resources have been allocated. So |
1322 | * been allocated. So unless they're explicitly marked as | 1420 | * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't |
1323 | * being able to cope (CON_ANYTIME) don't call them until | 1421 | * call them until this CPU is officially up. |
1324 | * this CPU is officially up. | ||
1325 | */ | 1422 | */ |
1326 | static inline int can_use_console(unsigned int cpu) | 1423 | static inline int can_use_console(unsigned int cpu) |
1327 | { | 1424 | { |
@@ -1333,36 +1430,24 @@ static inline int can_use_console(unsigned int cpu) | |||
1333 | * messages from a 'printk'. Return true (and with the | 1430 | * messages from a 'printk'. Return true (and with the |
1334 | * console_lock held, and 'console_locked' set) if it | 1431 | * console_lock held, and 'console_locked' set) if it |
1335 | * is successful, false otherwise. | 1432 | * is successful, false otherwise. |
1336 | * | ||
1337 | * This gets called with the 'logbuf_lock' spinlock held and | ||
1338 | * interrupts disabled. It should return with 'lockbuf_lock' | ||
1339 | * released but interrupts still disabled. | ||
1340 | */ | 1433 | */ |
1341 | static int console_trylock_for_printk(unsigned int cpu) | 1434 | static int console_trylock_for_printk(void) |
1342 | __releases(&logbuf_lock) | ||
1343 | { | 1435 | { |
1344 | int retval = 0, wake = 0; | 1436 | unsigned int cpu = smp_processor_id(); |
1345 | 1437 | ||
1346 | if (console_trylock()) { | 1438 | if (!console_trylock()) |
1347 | retval = 1; | 1439 | return 0; |
1348 | 1440 | /* | |
1349 | /* | 1441 | * If we can't use the console, we need to release the console |
1350 | * If we can't use the console, we need to release | 1442 | * semaphore by hand to avoid flushing the buffer. We need to hold the |
1351 | * the console semaphore by hand to avoid flushing | 1443 | * console semaphore in order to do this test safely. |
1352 | * the buffer. We need to hold the console semaphore | 1444 | */ |
1353 | * in order to do this test safely. | 1445 | if (!can_use_console(cpu)) { |
1354 | */ | 1446 | console_locked = 0; |
1355 | if (!can_use_console(cpu)) { | 1447 | up_console_sem(); |
1356 | console_locked = 0; | 1448 | return 0; |
1357 | wake = 1; | ||
1358 | retval = 0; | ||
1359 | } | ||
1360 | } | 1449 | } |
1361 | logbuf_cpu = UINT_MAX; | 1450 | return 1; |
1362 | raw_spin_unlock(&logbuf_lock); | ||
1363 | if (wake) | ||
1364 | up(&console_sem); | ||
1365 | return retval; | ||
1366 | } | 1451 | } |
1367 | 1452 | ||
1368 | int printk_delay_msec __read_mostly; | 1453 | int printk_delay_msec __read_mostly; |
@@ -1490,11 +1575,19 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1490 | static int recursion_bug; | 1575 | static int recursion_bug; |
1491 | static char textbuf[LOG_LINE_MAX]; | 1576 | static char textbuf[LOG_LINE_MAX]; |
1492 | char *text = textbuf; | 1577 | char *text = textbuf; |
1493 | size_t text_len; | 1578 | size_t text_len = 0; |
1494 | enum log_flags lflags = 0; | 1579 | enum log_flags lflags = 0; |
1495 | unsigned long flags; | 1580 | unsigned long flags; |
1496 | int this_cpu; | 1581 | int this_cpu; |
1497 | int printed_len = 0; | 1582 | int printed_len = 0; |
1583 | bool in_sched = false; | ||
1584 | /* cpu currently holding logbuf_lock in this function */ | ||
1585 | static volatile unsigned int logbuf_cpu = UINT_MAX; | ||
1586 | |||
1587 | if (level == SCHED_MESSAGE_LOGLEVEL) { | ||
1588 | level = -1; | ||
1589 | in_sched = true; | ||
1590 | } | ||
1498 | 1591 | ||
1499 | boot_delay_msec(level); | 1592 | boot_delay_msec(level); |
1500 | printk_delay(); | 1593 | printk_delay(); |
@@ -1516,7 +1609,8 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1516 | */ | 1609 | */ |
1517 | if (!oops_in_progress && !lockdep_recursing(current)) { | 1610 | if (!oops_in_progress && !lockdep_recursing(current)) { |
1518 | recursion_bug = 1; | 1611 | recursion_bug = 1; |
1519 | goto out_restore_irqs; | 1612 | local_irq_restore(flags); |
1613 | return 0; | ||
1520 | } | 1614 | } |
1521 | zap_locks(); | 1615 | zap_locks(); |
1522 | } | 1616 | } |
@@ -1530,17 +1624,22 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1530 | "BUG: recent printk recursion!"; | 1624 | "BUG: recent printk recursion!"; |
1531 | 1625 | ||
1532 | recursion_bug = 0; | 1626 | recursion_bug = 0; |
1533 | printed_len += strlen(recursion_msg); | 1627 | text_len = strlen(recursion_msg); |
1534 | /* emit KERN_CRIT message */ | 1628 | /* emit KERN_CRIT message */ |
1535 | log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, | 1629 | printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, |
1536 | NULL, 0, recursion_msg, printed_len); | 1630 | NULL, 0, recursion_msg, text_len); |
1537 | } | 1631 | } |
1538 | 1632 | ||
1539 | /* | 1633 | /* |
1540 | * The printf needs to come first; we need the syslog | 1634 | * The printf needs to come first; we need the syslog |
1541 | * prefix which might be passed-in as a parameter. | 1635 | * prefix which might be passed-in as a parameter. |
1542 | */ | 1636 | */ |
1543 | text_len = vscnprintf(text, sizeof(textbuf), fmt, args); | 1637 | if (in_sched) |
1638 | text_len = scnprintf(text, sizeof(textbuf), | ||
1639 | KERN_WARNING "[sched_delayed] "); | ||
1640 | |||
1641 | text_len += vscnprintf(text + text_len, | ||
1642 | sizeof(textbuf) - text_len, fmt, args); | ||
1544 | 1643 | ||
1545 | /* mark and strip a trailing newline */ | 1644 | /* mark and strip a trailing newline */ |
1546 | if (text_len && text[text_len-1] == '\n') { | 1645 | if (text_len && text[text_len-1] == '\n') { |
@@ -1586,9 +1685,12 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1586 | cont_flush(LOG_NEWLINE); | 1685 | cont_flush(LOG_NEWLINE); |
1587 | 1686 | ||
1588 | /* buffer line if possible, otherwise store it right away */ | 1687 | /* buffer line if possible, otherwise store it right away */ |
1589 | if (!cont_add(facility, level, text, text_len)) | 1688 | if (cont_add(facility, level, text, text_len)) |
1590 | log_store(facility, level, lflags | LOG_CONT, 0, | 1689 | printed_len += text_len; |
1591 | dict, dictlen, text, text_len); | 1690 | else |
1691 | printed_len += log_store(facility, level, | ||
1692 | lflags | LOG_CONT, 0, | ||
1693 | dict, dictlen, text, text_len); | ||
1592 | } else { | 1694 | } else { |
1593 | bool stored = false; | 1695 | bool stored = false; |
1594 | 1696 | ||
@@ -1607,26 +1709,35 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1607 | cont_flush(LOG_NEWLINE); | 1709 | cont_flush(LOG_NEWLINE); |
1608 | } | 1710 | } |
1609 | 1711 | ||
1610 | if (!stored) | 1712 | if (stored) |
1611 | log_store(facility, level, lflags, 0, | 1713 | printed_len += text_len; |
1612 | dict, dictlen, text, text_len); | 1714 | else |
1715 | printed_len += log_store(facility, level, lflags, 0, | ||
1716 | dict, dictlen, text, text_len); | ||
1613 | } | 1717 | } |
1614 | printed_len += text_len; | 1718 | |
1719 | logbuf_cpu = UINT_MAX; | ||
1720 | raw_spin_unlock(&logbuf_lock); | ||
1721 | lockdep_on(); | ||
1722 | local_irq_restore(flags); | ||
1723 | |||
1724 | /* If called from the scheduler, we can not call up(). */ | ||
1725 | if (in_sched) | ||
1726 | return printed_len; | ||
1615 | 1727 | ||
1616 | /* | 1728 | /* |
1729 | * Disable preemption to avoid being preempted while holding | ||
1730 | * console_sem which would prevent anyone from printing to console | ||
1731 | */ | ||
1732 | preempt_disable(); | ||
1733 | /* | ||
1617 | * Try to acquire and then immediately release the console semaphore. | 1734 | * Try to acquire and then immediately release the console semaphore. |
1618 | * The release will print out buffers and wake up /dev/kmsg and syslog() | 1735 | * The release will print out buffers and wake up /dev/kmsg and syslog() |
1619 | * users. | 1736 | * users. |
1620 | * | ||
1621 | * The console_trylock_for_printk() function will release 'logbuf_lock' | ||
1622 | * regardless of whether it actually gets the console semaphore or not. | ||
1623 | */ | 1737 | */ |
1624 | if (console_trylock_for_printk(this_cpu)) | 1738 | if (console_trylock_for_printk()) |
1625 | console_unlock(); | 1739 | console_unlock(); |
1626 | 1740 | preempt_enable(); | |
1627 | lockdep_on(); | ||
1628 | out_restore_irqs: | ||
1629 | local_irq_restore(flags); | ||
1630 | 1741 | ||
1631 | return printed_len; | 1742 | return printed_len; |
1632 | } | 1743 | } |
@@ -1674,7 +1785,7 @@ EXPORT_SYMBOL(printk_emit); | |||
1674 | * | 1785 | * |
1675 | * See the vsnprintf() documentation for format string extensions over C99. | 1786 | * See the vsnprintf() documentation for format string extensions over C99. |
1676 | */ | 1787 | */ |
1677 | asmlinkage int printk(const char *fmt, ...) | 1788 | asmlinkage __visible int printk(const char *fmt, ...) |
1678 | { | 1789 | { |
1679 | va_list args; | 1790 | va_list args; |
1680 | int r; | 1791 | int r; |
@@ -1737,7 +1848,7 @@ void early_vprintk(const char *fmt, va_list ap) | |||
1737 | } | 1848 | } |
1738 | } | 1849 | } |
1739 | 1850 | ||
1740 | asmlinkage void early_printk(const char *fmt, ...) | 1851 | asmlinkage __visible void early_printk(const char *fmt, ...) |
1741 | { | 1852 | { |
1742 | va_list ap; | 1853 | va_list ap; |
1743 | 1854 | ||
@@ -1882,16 +1993,14 @@ void suspend_console(void) | |||
1882 | printk("Suspending console(s) (use no_console_suspend to debug)\n"); | 1993 | printk("Suspending console(s) (use no_console_suspend to debug)\n"); |
1883 | console_lock(); | 1994 | console_lock(); |
1884 | console_suspended = 1; | 1995 | console_suspended = 1; |
1885 | up(&console_sem); | 1996 | up_console_sem(); |
1886 | mutex_release(&console_lock_dep_map, 1, _RET_IP_); | ||
1887 | } | 1997 | } |
1888 | 1998 | ||
1889 | void resume_console(void) | 1999 | void resume_console(void) |
1890 | { | 2000 | { |
1891 | if (!console_suspend_enabled) | 2001 | if (!console_suspend_enabled) |
1892 | return; | 2002 | return; |
1893 | down(&console_sem); | 2003 | down_console_sem(); |
1894 | mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); | ||
1895 | console_suspended = 0; | 2004 | console_suspended = 0; |
1896 | console_unlock(); | 2005 | console_unlock(); |
1897 | } | 2006 | } |
@@ -1933,12 +2042,11 @@ void console_lock(void) | |||
1933 | { | 2042 | { |
1934 | might_sleep(); | 2043 | might_sleep(); |
1935 | 2044 | ||
1936 | down(&console_sem); | 2045 | down_console_sem(); |
1937 | if (console_suspended) | 2046 | if (console_suspended) |
1938 | return; | 2047 | return; |
1939 | console_locked = 1; | 2048 | console_locked = 1; |
1940 | console_may_schedule = 1; | 2049 | console_may_schedule = 1; |
1941 | mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); | ||
1942 | } | 2050 | } |
1943 | EXPORT_SYMBOL(console_lock); | 2051 | EXPORT_SYMBOL(console_lock); |
1944 | 2052 | ||
@@ -1952,15 +2060,14 @@ EXPORT_SYMBOL(console_lock); | |||
1952 | */ | 2060 | */ |
1953 | int console_trylock(void) | 2061 | int console_trylock(void) |
1954 | { | 2062 | { |
1955 | if (down_trylock(&console_sem)) | 2063 | if (down_trylock_console_sem()) |
1956 | return 0; | 2064 | return 0; |
1957 | if (console_suspended) { | 2065 | if (console_suspended) { |
1958 | up(&console_sem); | 2066 | up_console_sem(); |
1959 | return 0; | 2067 | return 0; |
1960 | } | 2068 | } |
1961 | console_locked = 1; | 2069 | console_locked = 1; |
1962 | console_may_schedule = 0; | 2070 | console_may_schedule = 0; |
1963 | mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); | ||
1964 | return 1; | 2071 | return 1; |
1965 | } | 2072 | } |
1966 | EXPORT_SYMBOL(console_trylock); | 2073 | EXPORT_SYMBOL(console_trylock); |
@@ -2022,7 +2129,7 @@ void console_unlock(void) | |||
2022 | bool retry; | 2129 | bool retry; |
2023 | 2130 | ||
2024 | if (console_suspended) { | 2131 | if (console_suspended) { |
2025 | up(&console_sem); | 2132 | up_console_sem(); |
2026 | return; | 2133 | return; |
2027 | } | 2134 | } |
2028 | 2135 | ||
@@ -2043,10 +2150,15 @@ again: | |||
2043 | } | 2150 | } |
2044 | 2151 | ||
2045 | if (console_seq < log_first_seq) { | 2152 | if (console_seq < log_first_seq) { |
2153 | len = sprintf(text, "** %u printk messages dropped ** ", | ||
2154 | (unsigned)(log_first_seq - console_seq)); | ||
2155 | |||
2046 | /* messages are gone, move to first one */ | 2156 | /* messages are gone, move to first one */ |
2047 | console_seq = log_first_seq; | 2157 | console_seq = log_first_seq; |
2048 | console_idx = log_first_idx; | 2158 | console_idx = log_first_idx; |
2049 | console_prev = 0; | 2159 | console_prev = 0; |
2160 | } else { | ||
2161 | len = 0; | ||
2050 | } | 2162 | } |
2051 | skip: | 2163 | skip: |
2052 | if (console_seq == log_next_seq) | 2164 | if (console_seq == log_next_seq) |
@@ -2071,8 +2183,8 @@ skip: | |||
2071 | } | 2183 | } |
2072 | 2184 | ||
2073 | level = msg->level; | 2185 | level = msg->level; |
2074 | len = msg_print_text(msg, console_prev, false, | 2186 | len += msg_print_text(msg, console_prev, false, |
2075 | text, sizeof(text)); | 2187 | text + len, sizeof(text) - len); |
2076 | console_idx = log_next(console_idx); | 2188 | console_idx = log_next(console_idx); |
2077 | console_seq++; | 2189 | console_seq++; |
2078 | console_prev = msg->flags; | 2190 | console_prev = msg->flags; |
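In the console_unlock() hunks above, the "messages dropped" notice is now written into the front of the same text[] buffer and the next record is appended behind it, so both reach the console drivers in a single call. The buffer-building pattern, reduced to standalone C:

#include <stdio.h>

int main(void)
{
	char text[128];
	unsigned int dropped = 42;	/* log_first_seq - console_seq in the patch */
	size_t len;

	/* prefix first, then append the record text behind it */
	len = sprintf(text, "** %u printk messages dropped ** ", dropped);
	len += snprintf(text + len, sizeof(text) - len,
			"example record text\n");	/* stands in for msg_print_text() */

	fputs(text, stdout);
	printf("total length: %zu\n", len);
	return 0;
}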
@@ -2084,7 +2196,6 @@ skip: | |||
2084 | local_irq_restore(flags); | 2196 | local_irq_restore(flags); |
2085 | } | 2197 | } |
2086 | console_locked = 0; | 2198 | console_locked = 0; |
2087 | mutex_release(&console_lock_dep_map, 1, _RET_IP_); | ||
2088 | 2199 | ||
2089 | /* Release the exclusive_console once it is used */ | 2200 | /* Release the exclusive_console once it is used */ |
2090 | if (unlikely(exclusive_console)) | 2201 | if (unlikely(exclusive_console)) |
@@ -2092,7 +2203,7 @@ skip: | |||
2092 | 2203 | ||
2093 | raw_spin_unlock(&logbuf_lock); | 2204 | raw_spin_unlock(&logbuf_lock); |
2094 | 2205 | ||
2095 | up(&console_sem); | 2206 | up_console_sem(); |
2096 | 2207 | ||
2097 | /* | 2208 | /* |
2098 | * Someone could have filled up the buffer again, so re-check if there's | 2209 | * Someone could have filled up the buffer again, so re-check if there's |
@@ -2137,7 +2248,7 @@ void console_unblank(void) | |||
2137 | * oops_in_progress is set to 1.. | 2248 | * oops_in_progress is set to 1.. |
2138 | */ | 2249 | */ |
2139 | if (oops_in_progress) { | 2250 | if (oops_in_progress) { |
2140 | if (down_trylock(&console_sem) != 0) | 2251 | if (down_trylock_console_sem() != 0) |
2141 | return; | 2252 | return; |
2142 | } else | 2253 | } else |
2143 | console_lock(); | 2254 | console_lock(); |
@@ -2413,6 +2524,7 @@ int unregister_console(struct console *console) | |||
2413 | if (console_drivers != NULL && console->flags & CON_CONSDEV) | 2524 | if (console_drivers != NULL && console->flags & CON_CONSDEV) |
2414 | console_drivers->flags |= CON_CONSDEV; | 2525 | console_drivers->flags |= CON_CONSDEV; |
2415 | 2526 | ||
2527 | console->flags &= ~CON_ENABLED; | ||
2416 | console_unlock(); | 2528 | console_unlock(); |
2417 | console_sysfs_notify(); | 2529 | console_sysfs_notify(); |
2418 | return res; | 2530 | return res; |
@@ -2437,21 +2549,19 @@ late_initcall(printk_late_init); | |||
2437 | /* | 2549 | /* |
2438 | * Delayed printk version, for scheduler-internal messages: | 2550 | * Delayed printk version, for scheduler-internal messages: |
2439 | */ | 2551 | */ |
2440 | #define PRINTK_BUF_SIZE 512 | ||
2441 | |||
2442 | #define PRINTK_PENDING_WAKEUP 0x01 | 2552 | #define PRINTK_PENDING_WAKEUP 0x01 |
2443 | #define PRINTK_PENDING_SCHED 0x02 | 2553 | #define PRINTK_PENDING_OUTPUT 0x02 |
2444 | 2554 | ||
2445 | static DEFINE_PER_CPU(int, printk_pending); | 2555 | static DEFINE_PER_CPU(int, printk_pending); |
2446 | static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); | ||
2447 | 2556 | ||
2448 | static void wake_up_klogd_work_func(struct irq_work *irq_work) | 2557 | static void wake_up_klogd_work_func(struct irq_work *irq_work) |
2449 | { | 2558 | { |
2450 | int pending = __this_cpu_xchg(printk_pending, 0); | 2559 | int pending = __this_cpu_xchg(printk_pending, 0); |
2451 | 2560 | ||
2452 | if (pending & PRINTK_PENDING_SCHED) { | 2561 | if (pending & PRINTK_PENDING_OUTPUT) { |
2453 | char *buf = __get_cpu_var(printk_sched_buf); | 2562 | /* If trylock fails, someone else is doing the printing */ |
2454 | pr_warn("[sched_delayed] %s", buf); | 2563 | if (console_trylock()) |
2564 | console_unlock(); | ||
2455 | } | 2565 | } |
2456 | 2566 | ||
2457 | if (pending & PRINTK_PENDING_WAKEUP) | 2567 | if (pending & PRINTK_PENDING_WAKEUP) |
@@ -2473,23 +2583,19 @@ void wake_up_klogd(void) | |||
2473 | preempt_enable(); | 2583 | preempt_enable(); |
2474 | } | 2584 | } |
2475 | 2585 | ||
2476 | int printk_sched(const char *fmt, ...) | 2586 | int printk_deferred(const char *fmt, ...) |
2477 | { | 2587 | { |
2478 | unsigned long flags; | ||
2479 | va_list args; | 2588 | va_list args; |
2480 | char *buf; | ||
2481 | int r; | 2589 | int r; |
2482 | 2590 | ||
2483 | local_irq_save(flags); | 2591 | preempt_disable(); |
2484 | buf = __get_cpu_var(printk_sched_buf); | ||
2485 | |||
2486 | va_start(args, fmt); | 2592 | va_start(args, fmt); |
2487 | r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); | 2593 | r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); |
2488 | va_end(args); | 2594 | va_end(args); |
2489 | 2595 | ||
2490 | __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); | 2596 | __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); |
2491 | irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); | 2597 | irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); |
2492 | local_irq_restore(flags); | 2598 | preempt_enable(); |
2493 | 2599 | ||
2494 | return r; | 2600 | return r; |
2495 | } | 2601 | } |
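printk_deferred() above only stores the message (vprintk_emit() at SCHED_MESSAGE_LOGLEVEL) and queues an irq_work; the console is flushed later from the PRINTK_PENDING_OUTPUT handler, so it is safe where taking console_sem could deadlock. A hedged usage sketch; the caller below is hypothetical, only printk_deferred() itself is the interface added by this patch:

#include <linux/printk.h>

/* hypothetical caller: runs with the scheduler's runqueue lock held */
static void report_rt_throttling(int cpu)
{
	/*
	 * A plain printk() here could deadlock, because releasing console_sem
	 * wakes up waiters and re-enters the scheduler; printk_deferred() only
	 * logs the message and lets the irq_work print it afterwards.
	 */
	printk_deferred(KERN_WARNING "sched: RT throttling activated on CPU %d\n", cpu);
}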
diff --git a/kernel/profile.c b/kernel/profile.c index cb980f0c731b..54bf5ba26420 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -52,9 +52,9 @@ static DEFINE_MUTEX(profile_flip_mutex); | |||
52 | 52 | ||
53 | int profile_setup(char *str) | 53 | int profile_setup(char *str) |
54 | { | 54 | { |
55 | static char schedstr[] = "schedule"; | 55 | static const char schedstr[] = "schedule"; |
56 | static char sleepstr[] = "sleep"; | 56 | static const char sleepstr[] = "sleep"; |
57 | static char kvmstr[] = "kvm"; | 57 | static const char kvmstr[] = "kvm"; |
58 | int par; | 58 | int par; |
59 | 59 | ||
60 | if (!strncmp(str, sleepstr, strlen(sleepstr))) { | 60 | if (!strncmp(str, sleepstr, strlen(sleepstr))) { |
@@ -64,12 +64,10 @@ int profile_setup(char *str) | |||
64 | str += strlen(sleepstr) + 1; | 64 | str += strlen(sleepstr) + 1; |
65 | if (get_option(&str, &par)) | 65 | if (get_option(&str, &par)) |
66 | prof_shift = par; | 66 | prof_shift = par; |
67 | printk(KERN_INFO | 67 | pr_info("kernel sleep profiling enabled (shift: %ld)\n", |
68 | "kernel sleep profiling enabled (shift: %ld)\n", | ||
69 | prof_shift); | 68 | prof_shift); |
70 | #else | 69 | #else |
71 | printk(KERN_WARNING | 70 | pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); |
72 | "kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); | ||
73 | #endif /* CONFIG_SCHEDSTATS */ | 71 | #endif /* CONFIG_SCHEDSTATS */ |
74 | } else if (!strncmp(str, schedstr, strlen(schedstr))) { | 72 | } else if (!strncmp(str, schedstr, strlen(schedstr))) { |
75 | prof_on = SCHED_PROFILING; | 73 | prof_on = SCHED_PROFILING; |
@@ -77,8 +75,7 @@ int profile_setup(char *str) | |||
77 | str += strlen(schedstr) + 1; | 75 | str += strlen(schedstr) + 1; |
78 | if (get_option(&str, &par)) | 76 | if (get_option(&str, &par)) |
79 | prof_shift = par; | 77 | prof_shift = par; |
80 | printk(KERN_INFO | 78 | pr_info("kernel schedule profiling enabled (shift: %ld)\n", |
81 | "kernel schedule profiling enabled (shift: %ld)\n", | ||
82 | prof_shift); | 79 | prof_shift); |
83 | } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { | 80 | } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { |
84 | prof_on = KVM_PROFILING; | 81 | prof_on = KVM_PROFILING; |
@@ -86,13 +83,12 @@ int profile_setup(char *str) | |||
86 | str += strlen(kvmstr) + 1; | 83 | str += strlen(kvmstr) + 1; |
87 | if (get_option(&str, &par)) | 84 | if (get_option(&str, &par)) |
88 | prof_shift = par; | 85 | prof_shift = par; |
89 | printk(KERN_INFO | 86 | pr_info("kernel KVM profiling enabled (shift: %ld)\n", |
90 | "kernel KVM profiling enabled (shift: %ld)\n", | ||
91 | prof_shift); | 87 | prof_shift); |
92 | } else if (get_option(&str, &par)) { | 88 | } else if (get_option(&str, &par)) { |
93 | prof_shift = par; | 89 | prof_shift = par; |
94 | prof_on = CPU_PROFILING; | 90 | prof_on = CPU_PROFILING; |
95 | printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", | 91 | pr_info("kernel profiling enabled (shift: %ld)\n", |
96 | prof_shift); | 92 | prof_shift); |
97 | } | 93 | } |
98 | return 1; | 94 | return 1; |
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bd30bc61bc05..7fa34f86e5ba 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
@@ -58,9 +58,11 @@ torture_param(int, fqs_duration, 0, | |||
58 | "Duration of fqs bursts (us), 0 to disable"); | 58 | "Duration of fqs bursts (us), 0 to disable"); |
59 | torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); | 59 | torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); |
60 | torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); | 60 | torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); |
61 | torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); | ||
61 | torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); | 62 | torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); |
62 | torture_param(bool, gp_normal, false, | 63 | torture_param(bool, gp_normal, false, |
63 | "Use normal (non-expedited) GP wait primitives"); | 64 | "Use normal (non-expedited) GP wait primitives"); |
65 | torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); | ||
64 | torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); | 66 | torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); |
65 | torture_param(int, n_barrier_cbs, 0, | 67 | torture_param(int, n_barrier_cbs, 0, |
66 | "# of callbacks/kthreads for barrier testing"); | 68 | "# of callbacks/kthreads for barrier testing"); |
@@ -138,6 +140,18 @@ static long n_barrier_attempts; | |||
138 | static long n_barrier_successes; | 140 | static long n_barrier_successes; |
139 | static struct list_head rcu_torture_removed; | 141 | static struct list_head rcu_torture_removed; |
140 | 142 | ||
143 | static int rcu_torture_writer_state; | ||
144 | #define RTWS_FIXED_DELAY 0 | ||
145 | #define RTWS_DELAY 1 | ||
146 | #define RTWS_REPLACE 2 | ||
147 | #define RTWS_DEF_FREE 3 | ||
148 | #define RTWS_EXP_SYNC 4 | ||
149 | #define RTWS_COND_GET 5 | ||
150 | #define RTWS_COND_SYNC 6 | ||
151 | #define RTWS_SYNC 7 | ||
152 | #define RTWS_STUTTER 8 | ||
153 | #define RTWS_STOPPING 9 | ||
154 | |||
141 | #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) | 155 | #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) |
142 | #define RCUTORTURE_RUNNABLE_INIT 1 | 156 | #define RCUTORTURE_RUNNABLE_INIT 1 |
143 | #else | 157 | #else |
@@ -214,6 +228,7 @@ rcu_torture_free(struct rcu_torture *p) | |||
214 | */ | 228 | */ |
215 | 229 | ||
216 | struct rcu_torture_ops { | 230 | struct rcu_torture_ops { |
231 | int ttype; | ||
217 | void (*init)(void); | 232 | void (*init)(void); |
218 | int (*readlock)(void); | 233 | int (*readlock)(void); |
219 | void (*read_delay)(struct torture_random_state *rrsp); | 234 | void (*read_delay)(struct torture_random_state *rrsp); |
@@ -222,6 +237,8 @@ struct rcu_torture_ops { | |||
222 | void (*deferred_free)(struct rcu_torture *p); | 237 | void (*deferred_free)(struct rcu_torture *p); |
223 | void (*sync)(void); | 238 | void (*sync)(void); |
224 | void (*exp_sync)(void); | 239 | void (*exp_sync)(void); |
240 | unsigned long (*get_state)(void); | ||
241 | void (*cond_sync)(unsigned long oldstate); | ||
225 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 242 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
226 | void (*cb_barrier)(void); | 243 | void (*cb_barrier)(void); |
227 | void (*fqs)(void); | 244 | void (*fqs)(void); |
@@ -273,10 +290,48 @@ static int rcu_torture_completed(void) | |||
273 | return rcu_batches_completed(); | 290 | return rcu_batches_completed(); |
274 | } | 291 | } |
275 | 292 | ||
293 | /* | ||
294 | * Update callback in the pipe. This should be invoked after a grace period. | ||
295 | */ | ||
296 | static bool | ||
297 | rcu_torture_pipe_update_one(struct rcu_torture *rp) | ||
298 | { | ||
299 | int i; | ||
300 | |||
301 | i = rp->rtort_pipe_count; | ||
302 | if (i > RCU_TORTURE_PIPE_LEN) | ||
303 | i = RCU_TORTURE_PIPE_LEN; | ||
304 | atomic_inc(&rcu_torture_wcount[i]); | ||
305 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
306 | rp->rtort_mbtest = 0; | ||
307 | return true; | ||
308 | } | ||
309 | return false; | ||
310 | } | ||
311 | |||
312 | /* | ||
313 | * Update all callbacks in the pipe. Suitable for synchronous grace-period | ||
314 | * primitives. | ||
315 | */ | ||
316 | static void | ||
317 | rcu_torture_pipe_update(struct rcu_torture *old_rp) | ||
318 | { | ||
319 | struct rcu_torture *rp; | ||
320 | struct rcu_torture *rp1; | ||
321 | |||
322 | if (old_rp) | ||
323 | list_add(&old_rp->rtort_free, &rcu_torture_removed); | ||
324 | list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { | ||
325 | if (rcu_torture_pipe_update_one(rp)) { | ||
326 | list_del(&rp->rtort_free); | ||
327 | rcu_torture_free(rp); | ||
328 | } | ||
329 | } | ||
330 | } | ||
331 | |||
276 | static void | 332 | static void |
277 | rcu_torture_cb(struct rcu_head *p) | 333 | rcu_torture_cb(struct rcu_head *p) |
278 | { | 334 | { |
279 | int i; | ||
280 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | 335 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); |
281 | 336 | ||
282 | if (torture_must_stop_irq()) { | 337 | if (torture_must_stop_irq()) { |
@@ -284,16 +339,10 @@ rcu_torture_cb(struct rcu_head *p) | |||
284 | /* The next initialization will pick up the pieces. */ | 339 | /* The next initialization will pick up the pieces. */ |
285 | return; | 340 | return; |
286 | } | 341 | } |
287 | i = rp->rtort_pipe_count; | 342 | if (rcu_torture_pipe_update_one(rp)) |
288 | if (i > RCU_TORTURE_PIPE_LEN) | ||
289 | i = RCU_TORTURE_PIPE_LEN; | ||
290 | atomic_inc(&rcu_torture_wcount[i]); | ||
291 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
292 | rp->rtort_mbtest = 0; | ||
293 | rcu_torture_free(rp); | 343 | rcu_torture_free(rp); |
294 | } else { | 344 | else |
295 | cur_ops->deferred_free(rp); | 345 | cur_ops->deferred_free(rp); |
296 | } | ||
297 | } | 346 | } |
298 | 347 | ||
299 | static int rcu_no_completed(void) | 348 | static int rcu_no_completed(void) |
@@ -312,6 +361,7 @@ static void rcu_sync_torture_init(void) | |||
312 | } | 361 | } |
313 | 362 | ||
314 | static struct rcu_torture_ops rcu_ops = { | 363 | static struct rcu_torture_ops rcu_ops = { |
364 | .ttype = RCU_FLAVOR, | ||
315 | .init = rcu_sync_torture_init, | 365 | .init = rcu_sync_torture_init, |
316 | .readlock = rcu_torture_read_lock, | 366 | .readlock = rcu_torture_read_lock, |
317 | .read_delay = rcu_read_delay, | 367 | .read_delay = rcu_read_delay, |
@@ -320,6 +370,8 @@ static struct rcu_torture_ops rcu_ops = { | |||
320 | .deferred_free = rcu_torture_deferred_free, | 370 | .deferred_free = rcu_torture_deferred_free, |
321 | .sync = synchronize_rcu, | 371 | .sync = synchronize_rcu, |
322 | .exp_sync = synchronize_rcu_expedited, | 372 | .exp_sync = synchronize_rcu_expedited, |
373 | .get_state = get_state_synchronize_rcu, | ||
374 | .cond_sync = cond_synchronize_rcu, | ||
323 | .call = call_rcu, | 375 | .call = call_rcu, |
324 | .cb_barrier = rcu_barrier, | 376 | .cb_barrier = rcu_barrier, |
325 | .fqs = rcu_force_quiescent_state, | 377 | .fqs = rcu_force_quiescent_state, |
@@ -355,6 +407,7 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | |||
355 | } | 407 | } |
356 | 408 | ||
357 | static struct rcu_torture_ops rcu_bh_ops = { | 409 | static struct rcu_torture_ops rcu_bh_ops = { |
410 | .ttype = RCU_BH_FLAVOR, | ||
358 | .init = rcu_sync_torture_init, | 411 | .init = rcu_sync_torture_init, |
359 | .readlock = rcu_bh_torture_read_lock, | 412 | .readlock = rcu_bh_torture_read_lock, |
360 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 413 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
@@ -397,6 +450,7 @@ call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
397 | } | 450 | } |
398 | 451 | ||
399 | static struct rcu_torture_ops rcu_busted_ops = { | 452 | static struct rcu_torture_ops rcu_busted_ops = { |
453 | .ttype = INVALID_RCU_FLAVOR, | ||
400 | .init = rcu_sync_torture_init, | 454 | .init = rcu_sync_torture_init, |
401 | .readlock = rcu_torture_read_lock, | 455 | .readlock = rcu_torture_read_lock, |
402 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 456 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
@@ -479,9 +533,11 @@ static void srcu_torture_stats(char *page) | |||
479 | page += sprintf(page, "%s%s per-CPU(idx=%d):", | 533 | page += sprintf(page, "%s%s per-CPU(idx=%d):", |
480 | torture_type, TORTURE_FLAG, idx); | 534 | torture_type, TORTURE_FLAG, idx); |
481 | for_each_possible_cpu(cpu) { | 535 | for_each_possible_cpu(cpu) { |
482 | page += sprintf(page, " %d(%lu,%lu)", cpu, | 536 | long c0, c1; |
483 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], | 537 | |
484 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); | 538 | c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; |
539 | c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; | ||
540 | page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1); | ||
485 | } | 541 | } |
486 | sprintf(page, "\n"); | 542 | sprintf(page, "\n"); |
487 | } | 543 | } |
@@ -492,6 +548,7 @@ static void srcu_torture_synchronize_expedited(void) | |||
492 | } | 548 | } |
493 | 549 | ||
494 | static struct rcu_torture_ops srcu_ops = { | 550 | static struct rcu_torture_ops srcu_ops = { |
551 | .ttype = SRCU_FLAVOR, | ||
495 | .init = rcu_sync_torture_init, | 552 | .init = rcu_sync_torture_init, |
496 | .readlock = srcu_torture_read_lock, | 553 | .readlock = srcu_torture_read_lock, |
497 | .read_delay = srcu_read_delay, | 554 | .read_delay = srcu_read_delay, |
@@ -527,6 +584,7 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p) | |||
527 | } | 584 | } |
528 | 585 | ||
529 | static struct rcu_torture_ops sched_ops = { | 586 | static struct rcu_torture_ops sched_ops = { |
587 | .ttype = RCU_SCHED_FLAVOR, | ||
530 | .init = rcu_sync_torture_init, | 588 | .init = rcu_sync_torture_init, |
531 | .readlock = sched_torture_read_lock, | 589 | .readlock = sched_torture_read_lock, |
532 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 590 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
@@ -688,23 +746,59 @@ rcu_torture_fqs(void *arg) | |||
688 | static int | 746 | static int |
689 | rcu_torture_writer(void *arg) | 747 | rcu_torture_writer(void *arg) |
690 | { | 748 | { |
691 | bool exp; | 749 | unsigned long gp_snap; |
750 | bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; | ||
751 | bool gp_sync1 = gp_sync; | ||
692 | int i; | 752 | int i; |
693 | struct rcu_torture *rp; | 753 | struct rcu_torture *rp; |
694 | struct rcu_torture *rp1; | ||
695 | struct rcu_torture *old_rp; | 754 | struct rcu_torture *old_rp; |
696 | static DEFINE_TORTURE_RANDOM(rand); | 755 | static DEFINE_TORTURE_RANDOM(rand); |
756 | int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, | ||
757 | RTWS_COND_GET, RTWS_SYNC }; | ||
758 | int nsynctypes = 0; | ||
697 | 759 | ||
698 | VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); | 760 | VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); |
699 | set_user_nice(current, MAX_NICE); | 761 | |
762 | /* Initialize synctype[] array. If none set, take default. */ | ||
763 | if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) | ||
764 | gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; | ||
765 | if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) | ||
766 | synctype[nsynctypes++] = RTWS_COND_GET; | ||
767 | else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) | ||
768 | pr_alert("rcu_torture_writer: gp_cond without primitives.\n"); | ||
769 | if (gp_exp1 && cur_ops->exp_sync) | ||
770 | synctype[nsynctypes++] = RTWS_EXP_SYNC; | ||
771 | else if (gp_exp && !cur_ops->exp_sync) | ||
772 | pr_alert("rcu_torture_writer: gp_exp without primitives.\n"); | ||
773 | if (gp_normal1 && cur_ops->deferred_free) | ||
774 | synctype[nsynctypes++] = RTWS_DEF_FREE; | ||
775 | else if (gp_normal && !cur_ops->deferred_free) | ||
776 | pr_alert("rcu_torture_writer: gp_normal without primitives.\n"); | ||
777 | if (gp_sync1 && cur_ops->sync) | ||
778 | synctype[nsynctypes++] = RTWS_SYNC; | ||
779 | else if (gp_sync && !cur_ops->sync) | ||
780 | pr_alert("rcu_torture_writer: gp_sync without primitives.\n"); | ||
781 | if (WARN_ONCE(nsynctypes == 0, | ||
782 | "rcu_torture_writer: No update-side primitives.\n")) { | ||
783 | /* | ||
784 | * No update-side primitives, so don't try updating. | ||

785 | * The resulting test won't be testing much, hence the | ||
786 | * above WARN_ONCE(). | ||
787 | */ | ||
788 | rcu_torture_writer_state = RTWS_STOPPING; | ||
789 | torture_kthread_stopping("rcu_torture_writer"); | ||
790 | } | ||
700 | 791 | ||
701 | do { | 792 | do { |
793 | rcu_torture_writer_state = RTWS_FIXED_DELAY; | ||
702 | schedule_timeout_uninterruptible(1); | 794 | schedule_timeout_uninterruptible(1); |
703 | rp = rcu_torture_alloc(); | 795 | rp = rcu_torture_alloc(); |
704 | if (rp == NULL) | 796 | if (rp == NULL) |
705 | continue; | 797 | continue; |
706 | rp->rtort_pipe_count = 0; | 798 | rp->rtort_pipe_count = 0; |
799 | rcu_torture_writer_state = RTWS_DELAY; | ||
707 | udelay(torture_random(&rand) & 0x3ff); | 800 | udelay(torture_random(&rand) & 0x3ff); |
801 | rcu_torture_writer_state = RTWS_REPLACE; | ||
708 | old_rp = rcu_dereference_check(rcu_torture_current, | 802 | old_rp = rcu_dereference_check(rcu_torture_current, |
709 | current == writer_task); | 803 | current == writer_task); |
710 | rp->rtort_mbtest = 1; | 804 | rp->rtort_mbtest = 1; |
@@ -716,35 +810,42 @@ rcu_torture_writer(void *arg) | |||
716 | i = RCU_TORTURE_PIPE_LEN; | 810 | i = RCU_TORTURE_PIPE_LEN; |
717 | atomic_inc(&rcu_torture_wcount[i]); | 811 | atomic_inc(&rcu_torture_wcount[i]); |
718 | old_rp->rtort_pipe_count++; | 812 | old_rp->rtort_pipe_count++; |
719 | if (gp_normal == gp_exp) | 813 | switch (synctype[torture_random(&rand) % nsynctypes]) { |
720 | exp = !!(torture_random(&rand) & 0x80); | 814 | case RTWS_DEF_FREE: |
721 | else | 815 | rcu_torture_writer_state = RTWS_DEF_FREE; |
722 | exp = gp_exp; | ||
723 | if (!exp) { | ||
724 | cur_ops->deferred_free(old_rp); | 816 | cur_ops->deferred_free(old_rp); |
725 | } else { | 817 | break; |
818 | case RTWS_EXP_SYNC: | ||
819 | rcu_torture_writer_state = RTWS_EXP_SYNC; | ||
726 | cur_ops->exp_sync(); | 820 | cur_ops->exp_sync(); |
727 | list_add(&old_rp->rtort_free, | 821 | rcu_torture_pipe_update(old_rp); |
728 | &rcu_torture_removed); | 822 | break; |
729 | list_for_each_entry_safe(rp, rp1, | 823 | case RTWS_COND_GET: |
730 | &rcu_torture_removed, | 824 | rcu_torture_writer_state = RTWS_COND_GET; |
731 | rtort_free) { | 825 | gp_snap = cur_ops->get_state(); |
732 | i = rp->rtort_pipe_count; | 826 | i = torture_random(&rand) % 16; |
733 | if (i > RCU_TORTURE_PIPE_LEN) | 827 | if (i != 0) |
734 | i = RCU_TORTURE_PIPE_LEN; | 828 | schedule_timeout_interruptible(i); |
735 | atomic_inc(&rcu_torture_wcount[i]); | 829 | udelay(torture_random(&rand) % 1000); |
736 | if (++rp->rtort_pipe_count >= | 830 | rcu_torture_writer_state = RTWS_COND_SYNC; |
737 | RCU_TORTURE_PIPE_LEN) { | 831 | cur_ops->cond_sync(gp_snap); |
738 | rp->rtort_mbtest = 0; | 832 | rcu_torture_pipe_update(old_rp); |
739 | list_del(&rp->rtort_free); | 833 | break; |
740 | rcu_torture_free(rp); | 834 | case RTWS_SYNC: |
741 | } | 835 | rcu_torture_writer_state = RTWS_SYNC; |
742 | } | 836 | cur_ops->sync(); |
837 | rcu_torture_pipe_update(old_rp); | ||
838 | break; | ||
839 | default: | ||
840 | WARN_ON_ONCE(1); | ||
841 | break; | ||
743 | } | 842 | } |
744 | } | 843 | } |
745 | rcutorture_record_progress(++rcu_torture_current_version); | 844 | rcutorture_record_progress(++rcu_torture_current_version); |
845 | rcu_torture_writer_state = RTWS_STUTTER; | ||
746 | stutter_wait("rcu_torture_writer"); | 846 | stutter_wait("rcu_torture_writer"); |
747 | } while (!torture_must_stop()); | 847 | } while (!torture_must_stop()); |
848 | rcu_torture_writer_state = RTWS_STOPPING; | ||
748 | torture_kthread_stopping("rcu_torture_writer"); | 849 | torture_kthread_stopping("rcu_torture_writer"); |
749 | return 0; | 850 | return 0; |
750 | } | 851 | } |
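The rewritten writer above builds a synctype[] table from whichever grace-period methods are enabled and supported (deferred free, expedited, conditional, synchronous) and then draws one uniformly at random on every pass, instead of the old normal-vs-expedited coin flip. The selection scheme as a standalone sketch, with placeholder flags and a plain libc RNG:

#include <stdio.h>
#include <stdlib.h>

enum gp_method { GP_DEF_FREE, GP_EXP_SYNC, GP_COND, GP_SYNC };

int main(void)
{
	/* stand-ins for the gp_normal/gp_exp/gp_cond/gp_sync module parameters */
	int use_cond = 0, use_exp = 1, use_normal = 1, use_sync = 0;
	enum gp_method synctype[4];
	int nsynctypes = 0, i;

	/* nothing requested: enable every flavor, as the patch does */
	if (!use_cond && !use_exp && !use_normal && !use_sync)
		use_cond = use_exp = use_normal = use_sync = 1;

	/* in the kernel each entry is also gated on cur_ops providing the primitive */
	if (use_cond)
		synctype[nsynctypes++] = GP_COND;
	if (use_exp)
		synctype[nsynctypes++] = GP_EXP_SYNC;
	if (use_normal)
		synctype[nsynctypes++] = GP_DEF_FREE;
	if (use_sync)
		synctype[nsynctypes++] = GP_SYNC;

	srand(42);
	for (i = 0; i < 8; i++)		/* each writer pass draws one method */
		printf("pass %d -> method %d\n", i, synctype[rand() % nsynctypes]);
	return 0;
}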
@@ -784,7 +885,7 @@ rcu_torture_fakewriter(void *arg) | |||
784 | return 0; | 885 | return 0; |
785 | } | 886 | } |
786 | 887 | ||
787 | void rcutorture_trace_dump(void) | 888 | static void rcutorture_trace_dump(void) |
788 | { | 889 | { |
789 | static atomic_t beenhere = ATOMIC_INIT(0); | 890 | static atomic_t beenhere = ATOMIC_INIT(0); |
790 | 891 | ||
@@ -918,11 +1019,13 @@ rcu_torture_reader(void *arg) | |||
918 | __this_cpu_inc(rcu_torture_batch[completed]); | 1019 | __this_cpu_inc(rcu_torture_batch[completed]); |
919 | preempt_enable(); | 1020 | preempt_enable(); |
920 | cur_ops->readunlock(idx); | 1021 | cur_ops->readunlock(idx); |
921 | schedule(); | 1022 | cond_resched(); |
922 | stutter_wait("rcu_torture_reader"); | 1023 | stutter_wait("rcu_torture_reader"); |
923 | } while (!torture_must_stop()); | 1024 | } while (!torture_must_stop()); |
924 | if (irqreader && cur_ops->irq_capable) | 1025 | if (irqreader && cur_ops->irq_capable) { |
925 | del_timer_sync(&t); | 1026 | del_timer_sync(&t); |
1027 | destroy_timer_on_stack(&t); | ||
1028 | } | ||
926 | torture_kthread_stopping("rcu_torture_reader"); | 1029 | torture_kthread_stopping("rcu_torture_reader"); |
927 | return 0; | 1030 | return 0; |
928 | } | 1031 | } |
@@ -937,6 +1040,7 @@ rcu_torture_printk(char *page) | |||
937 | int i; | 1040 | int i; |
938 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 1041 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
939 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 1042 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
1043 | static unsigned long rtcv_snap = ULONG_MAX; | ||
940 | 1044 | ||
941 | for_each_possible_cpu(cpu) { | 1045 | for_each_possible_cpu(cpu) { |
942 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 1046 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
@@ -997,6 +1101,22 @@ rcu_torture_printk(char *page) | |||
997 | page += sprintf(page, "\n"); | 1101 | page += sprintf(page, "\n"); |
998 | if (cur_ops->stats) | 1102 | if (cur_ops->stats) |
999 | cur_ops->stats(page); | 1103 | cur_ops->stats(page); |
1104 | if (rtcv_snap == rcu_torture_current_version && | ||
1105 | rcu_torture_current != NULL) { | ||
1106 | int __maybe_unused flags; | ||
1107 | unsigned long __maybe_unused gpnum; | ||
1108 | unsigned long __maybe_unused completed; | ||
1109 | |||
1110 | rcutorture_get_gp_data(cur_ops->ttype, | ||
1111 | &flags, &gpnum, &completed); | ||
1112 | page += sprintf(page, | ||
1113 | "??? Writer stall state %d g%lu c%lu f%#x\n", | ||
1114 | rcu_torture_writer_state, | ||
1115 | gpnum, completed, flags); | ||
1116 | show_rcu_gp_kthreads(); | ||
1117 | rcutorture_trace_dump(); | ||
1118 | } | ||
1119 | rtcv_snap = rcu_torture_current_version; | ||
1000 | } | 1120 | } |
1001 | 1121 | ||
1002 | /* | 1122 | /* |
@@ -1146,7 +1266,7 @@ static int __init rcu_torture_stall_init(void) | |||
1146 | } | 1266 | } |
1147 | 1267 | ||
1148 | /* Callback function for RCU barrier testing. */ | 1268 | /* Callback function for RCU barrier testing. */ |
1149 | void rcu_torture_barrier_cbf(struct rcu_head *rcu) | 1269 | static void rcu_torture_barrier_cbf(struct rcu_head *rcu) |
1150 | { | 1270 | { |
1151 | atomic_inc(&barrier_cbs_invoked); | 1271 | atomic_inc(&barrier_cbs_invoked); |
1152 | } | 1272 | } |
@@ -1416,7 +1536,8 @@ rcu_torture_init(void) | |||
1416 | &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, | 1536 | &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, |
1417 | }; | 1537 | }; |
1418 | 1538 | ||
1419 | torture_init_begin(torture_type, verbose, &rcutorture_runnable); | 1539 | if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable)) |
1540 | return -EBUSY; | ||
1420 | 1541 | ||
1421 | /* Process args and tell the world that the torturer is on the job. */ | 1542 | /* Process args and tell the world that the torturer is on the job. */ |
1422 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { | 1543 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { |
@@ -1441,10 +1562,13 @@ rcu_torture_init(void) | |||
1441 | if (cur_ops->init) | 1562 | if (cur_ops->init) |
1442 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ | 1563 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ |
1443 | 1564 | ||
1444 | if (nreaders >= 0) | 1565 | if (nreaders >= 0) { |
1445 | nrealreaders = nreaders; | 1566 | nrealreaders = nreaders; |
1446 | else | 1567 | } else { |
1447 | nrealreaders = 2 * num_online_cpus(); | 1568 | nrealreaders = num_online_cpus() - 1; |
1569 | if (nrealreaders <= 0) | ||
1570 | nrealreaders = 1; | ||
1571 | } | ||
1448 | rcu_torture_print_module_parms(cur_ops, "Start of test"); | 1572 | rcu_torture_print_module_parms(cur_ops, "Start of test"); |
1449 | 1573 | ||
1450 | /* Set up the freelist. */ | 1574 | /* Set up the freelist. */ |
@@ -1533,7 +1657,8 @@ rcu_torture_init(void) | |||
1533 | fqs_duration = 0; | 1657 | fqs_duration = 0; |
1534 | if (fqs_duration) { | 1658 | if (fqs_duration) { |
1535 | /* Create the fqs thread */ | 1659 | /* Create the fqs thread */ |
1536 | torture_create_kthread(rcu_torture_fqs, NULL, fqs_task); | 1660 | firsterr = torture_create_kthread(rcu_torture_fqs, NULL, |
1661 | fqs_task); | ||
1537 | if (firsterr) | 1662 | if (firsterr) |
1538 | goto unwind; | 1663 | goto unwind; |
1539 | } | 1664 | } |
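The rcutorture hunks above also change the default reader count from 2 * num_online_cpus() to num_online_cpus() - 1, clamped to at least one, presumably to leave a CPU free for the writer and housekeeping kthreads. A minimal user-space sketch of that sizing rule (online_cpus() is a made-up stand-in for num_online_cpus()):

    #include <stdio.h>

    /* Hypothetical stand-in for num_online_cpus(). */
    static int online_cpus(void) { return 4; }

    /* Mirrors the new default: reserve one CPU, but never go below one reader. */
    static int default_readers(int nreaders_param)
    {
        int n;

        if (nreaders_param >= 0)
            return nreaders_param;   /* explicit module parameter wins */
        n = online_cpus() - 1;       /* leave a CPU for the writer kthread */
        return (n <= 0) ? 1 : n;     /* uniprocessor still gets one reader */
    }

    int main(void)
    {
        printf("nreaders=-1 -> %d readers\n", default_readers(-1));
        return 0;
    }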
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 431528520562..858c56569127 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h | |||
@@ -144,7 +144,7 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) | |||
144 | return; | 144 | return; |
145 | rcp->ticks_this_gp++; | 145 | rcp->ticks_this_gp++; |
146 | j = jiffies; | 146 | j = jiffies; |
147 | js = rcp->jiffies_stall; | 147 | js = ACCESS_ONCE(rcp->jiffies_stall); |
148 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) { | 148 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) { |
149 | pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", | 149 | pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", |
150 | rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, | 150 | rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, |
@@ -152,17 +152,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) | |||
152 | dump_stack(); | 152 | dump_stack(); |
153 | } | 153 | } |
154 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) | 154 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) |
155 | rcp->jiffies_stall = jiffies + | 155 | ACCESS_ONCE(rcp->jiffies_stall) = jiffies + |
156 | 3 * rcu_jiffies_till_stall_check() + 3; | 156 | 3 * rcu_jiffies_till_stall_check() + 3; |
157 | else if (ULONG_CMP_GE(j, js)) | 157 | else if (ULONG_CMP_GE(j, js)) |
158 | rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); | 158 | ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); |
159 | } | 159 | } |
160 | 160 | ||
161 | static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) | 161 | static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) |
162 | { | 162 | { |
163 | rcp->ticks_this_gp = 0; | 163 | rcp->ticks_this_gp = 0; |
164 | rcp->gp_start = jiffies; | 164 | rcp->gp_start = jiffies; |
165 | rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); | 165 | ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); |
166 | } | 166 | } |
167 | 167 | ||
168 | static void check_cpu_stalls(void) | 168 | static void check_cpu_stalls(void) |
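The tiny_plugin.h hunk wraps the unlocked reads and writes of ->jiffies_stall in ACCESS_ONCE() so the compiler cannot tear, refetch, or fuse them. At the time, ACCESS_ONCE() was roughly a volatile cast; a stand-alone sketch of the idiom (the definition below is an approximation, not a copy of include/linux/compiler.h):

    #include <stdio.h>

    /* Approximation of the kernel's ACCESS_ONCE(): force exactly one
     * volatile-qualified access to the object. */
    #define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

    static unsigned long jiffies_stall;

    int main(void)
    {
        /* Writer: a single store the compiler may not split or elide. */
        ACCESS_ONCE(jiffies_stall) = 12345;

        /* Reader: a single load, not re-fetched or merged with others. */
        unsigned long js = ACCESS_ONCE(jiffies_stall);

        printf("js = %lu\n", js);
        return 0;
    }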
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0c47e300210a..f1ba77363fbb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
@@ -101,7 +101,7 @@ DEFINE_PER_CPU(struct rcu_data, sname##_data) | |||
101 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); | 101 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); |
102 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); | 102 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); |
103 | 103 | ||
104 | static struct rcu_state *rcu_state; | 104 | static struct rcu_state *rcu_state_p; |
105 | LIST_HEAD(rcu_struct_flavors); | 105 | LIST_HEAD(rcu_struct_flavors); |
106 | 106 | ||
107 | /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ | 107 | /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ |
@@ -243,7 +243,7 @@ static ulong jiffies_till_next_fqs = ULONG_MAX; | |||
243 | module_param(jiffies_till_first_fqs, ulong, 0644); | 243 | module_param(jiffies_till_first_fqs, ulong, 0644); |
244 | module_param(jiffies_till_next_fqs, ulong, 0644); | 244 | module_param(jiffies_till_next_fqs, ulong, 0644); |
245 | 245 | ||
246 | static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | 246 | static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, |
247 | struct rcu_data *rdp); | 247 | struct rcu_data *rdp); |
248 | static void force_qs_rnp(struct rcu_state *rsp, | 248 | static void force_qs_rnp(struct rcu_state *rsp, |
249 | int (*f)(struct rcu_data *rsp, bool *isidle, | 249 | int (*f)(struct rcu_data *rsp, bool *isidle, |
@@ -271,6 +271,15 @@ long rcu_batches_completed_bh(void) | |||
271 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | 271 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); |
272 | 272 | ||
273 | /* | 273 | /* |
274 | * Force a quiescent state. | ||
275 | */ | ||
276 | void rcu_force_quiescent_state(void) | ||
277 | { | ||
278 | force_quiescent_state(rcu_state_p); | ||
279 | } | ||
280 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | ||
281 | |||
282 | /* | ||
274 | * Force a quiescent state for RCU BH. | 283 | * Force a quiescent state for RCU BH. |
275 | */ | 284 | */ |
276 | void rcu_bh_force_quiescent_state(void) | 285 | void rcu_bh_force_quiescent_state(void) |
@@ -280,6 +289,21 @@ void rcu_bh_force_quiescent_state(void) | |||
280 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); | 289 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); |
281 | 290 | ||
282 | /* | 291 | /* |
292 | * Show the state of the grace-period kthreads. | ||
293 | */ | ||
294 | void show_rcu_gp_kthreads(void) | ||
295 | { | ||
296 | struct rcu_state *rsp; | ||
297 | |||
298 | for_each_rcu_flavor(rsp) { | ||
299 | pr_info("%s: wait state: %d ->state: %#lx\n", | ||
300 | rsp->name, rsp->gp_state, rsp->gp_kthread->state); | ||
301 | /* sched_show_task(rsp->gp_kthread); */ | ||
302 | } | ||
303 | } | ||
304 | EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); | ||
305 | |||
306 | /* | ||
283 | * Record the number of times rcutorture tests have been initiated and | 307 | * Record the number of times rcutorture tests have been initiated and |
284 | * terminated. This information allows the debugfs tracing stats to be | 308 | * terminated. This information allows the debugfs tracing stats to be |
285 | * correlated to the rcutorture messages, even when the rcutorture module | 309 | * correlated to the rcutorture messages, even when the rcutorture module |
@@ -294,6 +318,39 @@ void rcutorture_record_test_transition(void) | |||
294 | EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); | 318 | EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); |
295 | 319 | ||
296 | /* | 320 | /* |
321 | * Send along grace-period-related data for rcutorture diagnostics. | ||
322 | */ | ||
323 | void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, | ||
324 | unsigned long *gpnum, unsigned long *completed) | ||
325 | { | ||
326 | struct rcu_state *rsp = NULL; | ||
327 | |||
328 | switch (test_type) { | ||
329 | case RCU_FLAVOR: | ||
330 | rsp = rcu_state_p; | ||
331 | break; | ||
332 | case RCU_BH_FLAVOR: | ||
333 | rsp = &rcu_bh_state; | ||
334 | break; | ||
335 | case RCU_SCHED_FLAVOR: | ||
336 | rsp = &rcu_sched_state; | ||
337 | break; | ||
338 | default: | ||
339 | break; | ||
340 | } | ||
341 | if (rsp != NULL) { | ||
342 | *flags = ACCESS_ONCE(rsp->gp_flags); | ||
343 | *gpnum = ACCESS_ONCE(rsp->gpnum); | ||
344 | *completed = ACCESS_ONCE(rsp->completed); | ||
345 | return; | ||
346 | } | ||
347 | *flags = 0; | ||
348 | *gpnum = 0; | ||
349 | *completed = 0; | ||
350 | } | ||
351 | EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); | ||
352 | |||
353 | /* | ||
297 | * Record the number of writer passes through the current rcutorture test. | 354 | * Record the number of writer passes through the current rcutorture test. |
298 | * This is also used to correlate debugfs tracing stats with the rcutorture | 355 | * This is also used to correlate debugfs tracing stats with the rcutorture |
299 | * messages. | 356 | * messages. |
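rcutorture_get_gp_data() always writes all three outputs, falling back to zeroes for an unrecognized flavor so callers need no special-casing. A user-space mock of that contract (flavor names and values below are illustrative only):

    #include <stdio.h>

    /* Illustrative flavors; only the first is treated as "known". */
    enum flavor { FLAVOR_RCU, FLAVOR_BH, FLAVOR_SCHED, FLAVOR_NONE };

    static void get_gp_data(enum flavor f, int *flags,
                            unsigned long *gpnum, unsigned long *completed)
    {
        if (f == FLAVOR_RCU) {      /* pretend this flavor has real state */
            *flags = 0x1;
            *gpnum = 42;
            *completed = 41;
            return;
        }
        *flags = 0;                 /* unknown flavor: report all zeroes */
        *gpnum = 0;
        *completed = 0;
    }

    int main(void)
    {
        int flags;
        unsigned long gpnum, completed;

        get_gp_data(FLAVOR_NONE, &flags, &gpnum, &completed);
        printf("g%lu c%lu f%#x\n", gpnum, completed, (unsigned)flags);
        return 0;
    }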
@@ -324,6 +381,28 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) | |||
324 | } | 381 | } |
325 | 382 | ||
326 | /* | 383 | /* |
384 | * Return the root node of the specified rcu_state structure. | ||
385 | */ | ||
386 | static struct rcu_node *rcu_get_root(struct rcu_state *rsp) | ||
387 | { | ||
388 | return &rsp->node[0]; | ||
389 | } | ||
390 | |||
391 | /* | ||
392 | * Is there any need for future grace periods? | ||
393 | * Interrupts must be disabled. If the caller does not hold the root | ||
394 | * rcu_node structure's ->lock, the results are advisory only. | ||
395 | */ | ||
396 | static int rcu_future_needs_gp(struct rcu_state *rsp) | ||
397 | { | ||
398 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
399 | int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1; | ||
400 | int *fp = &rnp->need_future_gp[idx]; | ||
401 | |||
402 | return ACCESS_ONCE(*fp); | ||
403 | } | ||
404 | |||
405 | /* | ||
327 | * Does the current CPU require a not-yet-started grace period? | 406 | * Does the current CPU require a not-yet-started grace period? |
328 | * The caller must have disabled interrupts to prevent races with | 407 | * The caller must have disabled interrupts to prevent races with |
329 | * normal callback registry. | 408 | * normal callback registry. |
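rcu_future_needs_gp() peeks at the root rcu_node's two-element need_future_gp[] array without holding the lock, which is why the comment calls the result advisory. The slot for "the grace period after the one numbered completed" is chosen by parity, so consecutive future grace periods alternate between the two counters. A stand-alone sketch of that indexing (values are illustrative):

    #include <stdio.h>

    /* Two-slot bookkeeping, as in rcu_node->need_future_gp[2]. */
    static int need_future_gp[2];

    static int future_gp_index(unsigned long completed)
    {
        return (completed + 1) & 0x1;   /* parity of the next grace period */
    }

    int main(void)
    {
        unsigned long completed;

        /* Consecutive future grace periods alternate between the slots. */
        for (completed = 40; completed < 44; completed++) {
            int idx = future_gp_index(completed);

            need_future_gp[idx]++;      /* request the GP after "completed" */
            printf("after GP %lu, slot %d now holds %d request(s)\n",
                   completed, idx, need_future_gp[idx]);
        }
        return 0;
    }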
@@ -335,7 +414,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | |||
335 | 414 | ||
336 | if (rcu_gp_in_progress(rsp)) | 415 | if (rcu_gp_in_progress(rsp)) |
337 | return 0; /* No, a grace period is already in progress. */ | 416 | return 0; /* No, a grace period is already in progress. */ |
338 | if (rcu_nocb_needs_gp(rsp)) | 417 | if (rcu_future_needs_gp(rsp)) |
339 | return 1; /* Yes, a no-CBs CPU needs one. */ | 418 | return 1; /* Yes, a no-CBs CPU needs one. */ |
340 | if (!rdp->nxttail[RCU_NEXT_TAIL]) | 419 | if (!rdp->nxttail[RCU_NEXT_TAIL]) |
341 | return 0; /* No, this is a no-CBs (or offline) CPU. */ | 420 | return 0; /* No, this is a no-CBs (or offline) CPU. */ |
@@ -350,14 +429,6 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | |||
350 | } | 429 | } |
351 | 430 | ||
352 | /* | 431 | /* |
353 | * Return the root node of the specified rcu_state structure. | ||
354 | */ | ||
355 | static struct rcu_node *rcu_get_root(struct rcu_state *rsp) | ||
356 | { | ||
357 | return &rsp->node[0]; | ||
358 | } | ||
359 | |||
360 | /* | ||
361 | * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state | 432 | * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state |
362 | * | 433 | * |
363 | * If the new value of the ->dynticks_nesting counter now is zero, | 434 | * If the new value of the ->dynticks_nesting counter now is zero, |
@@ -387,9 +458,9 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, | |||
387 | } | 458 | } |
388 | rcu_prepare_for_idle(smp_processor_id()); | 459 | rcu_prepare_for_idle(smp_processor_id()); |
389 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | 460 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ |
390 | smp_mb__before_atomic_inc(); /* See above. */ | 461 | smp_mb__before_atomic(); /* See above. */ |
391 | atomic_inc(&rdtp->dynticks); | 462 | atomic_inc(&rdtp->dynticks); |
392 | smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ | 463 | smp_mb__after_atomic(); /* Force ordering with next sojourn. */ |
393 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | 464 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); |
394 | 465 | ||
395 | /* | 466 | /* |
@@ -507,10 +578,10 @@ void rcu_irq_exit(void) | |||
507 | static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, | 578 | static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, |
508 | int user) | 579 | int user) |
509 | { | 580 | { |
510 | smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ | 581 | smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ |
511 | atomic_inc(&rdtp->dynticks); | 582 | atomic_inc(&rdtp->dynticks); |
512 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | 583 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ |
513 | smp_mb__after_atomic_inc(); /* See above. */ | 584 | smp_mb__after_atomic(); /* See above. */ |
514 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 585 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); |
515 | rcu_cleanup_after_idle(smp_processor_id()); | 586 | rcu_cleanup_after_idle(smp_processor_id()); |
516 | trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); | 587 | trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); |
@@ -635,10 +706,10 @@ void rcu_nmi_enter(void) | |||
635 | (atomic_read(&rdtp->dynticks) & 0x1)) | 706 | (atomic_read(&rdtp->dynticks) & 0x1)) |
636 | return; | 707 | return; |
637 | rdtp->dynticks_nmi_nesting++; | 708 | rdtp->dynticks_nmi_nesting++; |
638 | smp_mb__before_atomic_inc(); /* Force delay from prior write. */ | 709 | smp_mb__before_atomic(); /* Force delay from prior write. */ |
639 | atomic_inc(&rdtp->dynticks); | 710 | atomic_inc(&rdtp->dynticks); |
640 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | 711 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ |
641 | smp_mb__after_atomic_inc(); /* See above. */ | 712 | smp_mb__after_atomic(); /* See above. */ |
642 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 713 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); |
643 | } | 714 | } |
644 | 715 | ||
@@ -657,9 +728,9 @@ void rcu_nmi_exit(void) | |||
657 | --rdtp->dynticks_nmi_nesting != 0) | 728 | --rdtp->dynticks_nmi_nesting != 0) |
658 | return; | 729 | return; |
659 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | 730 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ |
660 | smp_mb__before_atomic_inc(); /* See above. */ | 731 | smp_mb__before_atomic(); /* See above. */ |
661 | atomic_inc(&rdtp->dynticks); | 732 | atomic_inc(&rdtp->dynticks); |
662 | smp_mb__after_atomic_inc(); /* Force delay to next write. */ | 733 | smp_mb__after_atomic(); /* Force delay to next write. */ |
663 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | 734 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); |
664 | } | 735 | } |
665 | 736 | ||
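The smp_mb__before_atomic_inc()/smp_mb__after_atomic_inc() calls above are converted to the newer, operation-agnostic smp_mb__before_atomic()/smp_mb__after_atomic() names; the pattern itself is unchanged: fully order the atomic increment of ->dynticks, whose low bit distinguishes idle (even) from non-idle (odd). A user-space analogue using C11 atomics (a sketch of the idea, not the kernel primitives):

    #include <stdatomic.h>
    #include <stdio.h>

    /* Even counter value means "idle", odd means "non-idle"; the increment
     * is fenced on both sides, playing the role of the renamed
     * smp_mb__before_atomic()/smp_mb__after_atomic() pair. */
    static atomic_long dynticks;

    static void eqs_exit(void)
    {
        atomic_thread_fence(memory_order_seq_cst);  /* "before atomic" */
        atomic_fetch_add(&dynticks, 1);             /* even -> odd */
        atomic_thread_fence(memory_order_seq_cst);  /* "after atomic" */
    }

    int main(void)
    {
        eqs_exit();
        printf("non-idle? %s\n",
               (atomic_load(&dynticks) & 0x1) ? "yes" : "no");
        return 0;
    }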
@@ -758,7 +829,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, | |||
758 | { | 829 | { |
759 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); | 830 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); |
760 | rcu_sysidle_check_cpu(rdp, isidle, maxj); | 831 | rcu_sysidle_check_cpu(rdp, isidle, maxj); |
761 | return (rdp->dynticks_snap & 0x1) == 0; | 832 | if ((rdp->dynticks_snap & 0x1) == 0) { |
833 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); | ||
834 | return 1; | ||
835 | } else { | ||
836 | return 0; | ||
837 | } | ||
762 | } | 838 | } |
763 | 839 | ||
764 | /* | 840 | /* |
@@ -834,7 +910,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
834 | * we will beat on the first one until it gets unstuck, then move | 910 | * we will beat on the first one until it gets unstuck, then move |
835 | * to the next. Only do this for the primary flavor of RCU. | 911 | * to the next. Only do this for the primary flavor of RCU. |
836 | */ | 912 | */ |
837 | if (rdp->rsp == rcu_state && | 913 | if (rdp->rsp == rcu_state_p && |
838 | ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { | 914 | ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { |
839 | rdp->rsp->jiffies_resched += 5; | 915 | rdp->rsp->jiffies_resched += 5; |
840 | resched_cpu(rdp->cpu); | 916 | resched_cpu(rdp->cpu); |
@@ -851,7 +927,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) | |||
851 | rsp->gp_start = j; | 927 | rsp->gp_start = j; |
852 | smp_wmb(); /* Record start time before stall time. */ | 928 | smp_wmb(); /* Record start time before stall time. */ |
853 | j1 = rcu_jiffies_till_stall_check(); | 929 | j1 = rcu_jiffies_till_stall_check(); |
854 | rsp->jiffies_stall = j + j1; | 930 | ACCESS_ONCE(rsp->jiffies_stall) = j + j1; |
855 | rsp->jiffies_resched = j + j1 / 2; | 931 | rsp->jiffies_resched = j + j1 / 2; |
856 | } | 932 | } |
857 | 933 | ||
@@ -890,12 +966,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
890 | /* Only let one CPU complain about others per time interval. */ | 966 | /* Only let one CPU complain about others per time interval. */ |
891 | 967 | ||
892 | raw_spin_lock_irqsave(&rnp->lock, flags); | 968 | raw_spin_lock_irqsave(&rnp->lock, flags); |
893 | delta = jiffies - rsp->jiffies_stall; | 969 | delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); |
894 | if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { | 970 | if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { |
895 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 971 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
896 | return; | 972 | return; |
897 | } | 973 | } |
898 | rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; | 974 | ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; |
899 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 975 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
900 | 976 | ||
901 | /* | 977 | /* |
@@ -932,9 +1008,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
932 | print_cpu_stall_info_end(); | 1008 | print_cpu_stall_info_end(); |
933 | for_each_possible_cpu(cpu) | 1009 | for_each_possible_cpu(cpu) |
934 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; | 1010 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; |
935 | pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", | 1011 | pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", |
936 | smp_processor_id(), (long)(jiffies - rsp->gp_start), | 1012 | smp_processor_id(), (long)(jiffies - rsp->gp_start), |
937 | rsp->gpnum, rsp->completed, totqlen); | 1013 | (long)rsp->gpnum, (long)rsp->completed, totqlen); |
938 | if (ndetected == 0) | 1014 | if (ndetected == 0) |
939 | pr_err("INFO: Stall ended before state dump start\n"); | 1015 | pr_err("INFO: Stall ended before state dump start\n"); |
940 | else if (!trigger_all_cpu_backtrace()) | 1016 | else if (!trigger_all_cpu_backtrace()) |
@@ -947,12 +1023,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
947 | force_quiescent_state(rsp); /* Kick them all. */ | 1023 | force_quiescent_state(rsp); /* Kick them all. */ |
948 | } | 1024 | } |
949 | 1025 | ||
950 | /* | ||
951 | * This function really isn't for public consumption, but RCU is special in | ||
952 | * that context switches can allow the state machine to make progress. | ||
953 | */ | ||
954 | extern void resched_cpu(int cpu); | ||
955 | |||
956 | static void print_cpu_stall(struct rcu_state *rsp) | 1026 | static void print_cpu_stall(struct rcu_state *rsp) |
957 | { | 1027 | { |
958 | int cpu; | 1028 | int cpu; |
@@ -971,14 +1041,15 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
971 | print_cpu_stall_info_end(); | 1041 | print_cpu_stall_info_end(); |
972 | for_each_possible_cpu(cpu) | 1042 | for_each_possible_cpu(cpu) |
973 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; | 1043 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; |
974 | pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", | 1044 | pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", |
975 | jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); | 1045 | jiffies - rsp->gp_start, |
1046 | (long)rsp->gpnum, (long)rsp->completed, totqlen); | ||
976 | if (!trigger_all_cpu_backtrace()) | 1047 | if (!trigger_all_cpu_backtrace()) |
977 | dump_stack(); | 1048 | dump_stack(); |
978 | 1049 | ||
979 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1050 | raw_spin_lock_irqsave(&rnp->lock, flags); |
980 | if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) | 1051 | if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) |
981 | rsp->jiffies_stall = jiffies + | 1052 | ACCESS_ONCE(rsp->jiffies_stall) = jiffies + |
982 | 3 * rcu_jiffies_till_stall_check() + 3; | 1053 | 3 * rcu_jiffies_till_stall_check() + 3; |
983 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1054 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
984 | 1055 | ||
@@ -1062,7 +1133,7 @@ void rcu_cpu_stall_reset(void) | |||
1062 | struct rcu_state *rsp; | 1133 | struct rcu_state *rsp; |
1063 | 1134 | ||
1064 | for_each_rcu_flavor(rsp) | 1135 | for_each_rcu_flavor(rsp) |
1065 | rsp->jiffies_stall = jiffies + ULONG_MAX / 2; | 1136 | ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2; |
1066 | } | 1137 | } |
1067 | 1138 | ||
1068 | /* | 1139 | /* |
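Stall timing throughout these hunks relies on wraparound-safe jiffies comparisons, and rcu_cpu_stall_reset() mutes the check by pushing ->jiffies_stall half the counter range into the future. A stand-alone sketch of both tricks (the ULONG_CMP_GE() definition below is an approximation of the kernel's):

    #include <limits.h>
    #include <stdio.h>

    /* Wraparound-safe "a >= b": valid while the two counters are less
     * than ULONG_MAX/2 apart. */
    #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))

    int main(void)
    {
        unsigned long j = 5;                 /* jiffies just after a wrap */
        unsigned long js = ULONG_MAX - 10;   /* deadline set before the wrap */

        printf("deadline passed? %s\n", ULONG_CMP_GE(j, js) ? "yes" : "no");

        /* Parking the deadline ULONG_MAX/2 ahead effectively disables the
         * check, which is what rcu_cpu_stall_reset() does. */
        js = j + ULONG_MAX / 2;
        printf("deadline passed? %s\n", ULONG_CMP_GE(j, js) ? "yes" : "no");
        return 0;
    }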
@@ -1123,15 +1194,18 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | |||
1123 | /* | 1194 | /* |
1124 | * Start some future grace period, as needed to handle newly arrived | 1195 | * Start some future grace period, as needed to handle newly arrived |
1125 | * callbacks. The required future grace periods are recorded in each | 1196 | * callbacks. The required future grace periods are recorded in each |
1126 | * rcu_node structure's ->need_future_gp field. | 1197 | * rcu_node structure's ->need_future_gp field. Returns true if there |
1198 | * is reason to awaken the grace-period kthread. | ||
1127 | * | 1199 | * |
1128 | * The caller must hold the specified rcu_node structure's ->lock. | 1200 | * The caller must hold the specified rcu_node structure's ->lock. |
1129 | */ | 1201 | */ |
1130 | static unsigned long __maybe_unused | 1202 | static bool __maybe_unused |
1131 | rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | 1203 | rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, |
1204 | unsigned long *c_out) | ||
1132 | { | 1205 | { |
1133 | unsigned long c; | 1206 | unsigned long c; |
1134 | int i; | 1207 | int i; |
1208 | bool ret = false; | ||
1135 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); | 1209 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); |
1136 | 1210 | ||
1137 | /* | 1211 | /* |
@@ -1142,7 +1216,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1142 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); | 1216 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); |
1143 | if (rnp->need_future_gp[c & 0x1]) { | 1217 | if (rnp->need_future_gp[c & 0x1]) { |
1144 | trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); | 1218 | trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); |
1145 | return c; | 1219 | goto out; |
1146 | } | 1220 | } |
1147 | 1221 | ||
1148 | /* | 1222 | /* |
@@ -1156,7 +1230,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1156 | ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { | 1230 | ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { |
1157 | rnp->need_future_gp[c & 0x1]++; | 1231 | rnp->need_future_gp[c & 0x1]++; |
1158 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); | 1232 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); |
1159 | return c; | 1233 | goto out; |
1160 | } | 1234 | } |
1161 | 1235 | ||
1162 | /* | 1236 | /* |
@@ -1197,12 +1271,15 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1197 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); | 1271 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); |
1198 | } else { | 1272 | } else { |
1199 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); | 1273 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); |
1200 | rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); | 1274 | ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); |
1201 | } | 1275 | } |
1202 | unlock_out: | 1276 | unlock_out: |
1203 | if (rnp != rnp_root) | 1277 | if (rnp != rnp_root) |
1204 | raw_spin_unlock(&rnp_root->lock); | 1278 | raw_spin_unlock(&rnp_root->lock); |
1205 | return c; | 1279 | out: |
1280 | if (c_out != NULL) | ||
1281 | *c_out = c; | ||
1282 | return ret; | ||
1206 | } | 1283 | } |
1207 | 1284 | ||
1208 | /* | 1285 | /* |
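rcu_start_future_gp() keeps its old result (the number of the requested grace period) but now hands it back through a NULL-able out-parameter, freeing the return value to mean "the caller should wake the grace-period kthread". A small sketch of that conversion pattern (names and values are illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    /* Primary result: does the caller need to do a wakeup?  Secondary
     * result: the requested grace-period number, via an optional pointer. */
    static bool start_future_gp(unsigned long completed, unsigned long *c_out)
    {
        unsigned long c = completed + 1;    /* number of the requested GP */
        bool needwake = true;               /* pretend we had to start it */

        if (c_out != NULL)
            *c_out = c;
        return needwake;
    }

    int main(void)
    {
        unsigned long c;

        if (start_future_gp(41, &c))
            printf("wake the GP kthread for grace period %lu\n", c);
        (void)start_future_gp(41, NULL);    /* callers may ignore the number */
        return 0;
    }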
@@ -1226,25 +1303,43 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | |||
1226 | } | 1303 | } |
1227 | 1304 | ||
1228 | /* | 1305 | /* |
1306 | * Awaken the grace-period kthread for the specified flavor of RCU. | ||
1307 | * Don't do a self-awaken, and don't bother awakening when there is | ||
1308 | * nothing for the grace-period kthread to do (as in several CPUs | ||
1309 | * raced to awaken, and we lost), and finally don't try to awaken | ||
1310 | * a kthread that has not yet been created. | ||
1311 | */ | ||
1312 | static void rcu_gp_kthread_wake(struct rcu_state *rsp) | ||
1313 | { | ||
1314 | if (current == rsp->gp_kthread || | ||
1315 | !ACCESS_ONCE(rsp->gp_flags) || | ||
1316 | !rsp->gp_kthread) | ||
1317 | return; | ||
1318 | wake_up(&rsp->gp_wq); | ||
1319 | } | ||
1320 | |||
1321 | /* | ||
1229 | * If there is room, assign a ->completed number to any callbacks on | 1322 | * If there is room, assign a ->completed number to any callbacks on |
1230 | * this CPU that have not already been assigned. Also accelerate any | 1323 | * this CPU that have not already been assigned. Also accelerate any |
1231 | * callbacks that were previously assigned a ->completed number that has | 1324 | * callbacks that were previously assigned a ->completed number that has |
1232 | * since proven to be too conservative, which can happen if callbacks get | 1325 | * since proven to be too conservative, which can happen if callbacks get |
1233 | * assigned a ->completed number while RCU is idle, but with reference to | 1326 | * assigned a ->completed number while RCU is idle, but with reference to |
1234 | * a non-root rcu_node structure. This function is idempotent, so it does | 1327 | * a non-root rcu_node structure. This function is idempotent, so it does |
1235 | * not hurt to call it repeatedly. | 1328 | * not hurt to call it repeatedly. Returns a flag saying that we should |
1329 | * awaken the RCU grace-period kthread. | ||
1236 | * | 1330 | * |
1237 | * The caller must hold rnp->lock with interrupts disabled. | 1331 | * The caller must hold rnp->lock with interrupts disabled. |
1238 | */ | 1332 | */ |
1239 | static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | 1333 | static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, |
1240 | struct rcu_data *rdp) | 1334 | struct rcu_data *rdp) |
1241 | { | 1335 | { |
1242 | unsigned long c; | 1336 | unsigned long c; |
1243 | int i; | 1337 | int i; |
1338 | bool ret; | ||
1244 | 1339 | ||
1245 | /* If the CPU has no callbacks, nothing to do. */ | 1340 | /* If the CPU has no callbacks, nothing to do. */ |
1246 | if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) | 1341 | if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) |
1247 | return; | 1342 | return false; |
1248 | 1343 | ||
1249 | /* | 1344 | /* |
1250 | * Starting from the sublist containing the callbacks most | 1345 | * Starting from the sublist containing the callbacks most |
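rcu_gp_kthread_wake() exists so that code holding rnp->lock can merely compute a needwake flag and defer the actual wake_up() until after the lock is dropped, avoiding deadlocks against rq->lock. A pthread-based sketch of that compute-under-lock, wake-after-unlock shape (all names below are illustrative):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  wq   = PTHREAD_COND_INITIALIZER;
    static int gp_flags;

    /* Called with "lock" held: record the request and report whether the
     * caller needs to wake the (hypothetical) grace-period thread. */
    static bool start_gp_locked(void)
    {
        if (gp_flags)
            return false;       /* someone else already asked; we lost */
        gp_flags = 1;
        return true;
    }

    int main(void)
    {
        bool needwake;

        pthread_mutex_lock(&lock);
        needwake = start_gp_locked();
        pthread_mutex_unlock(&lock);
        if (needwake)           /* wake only after the lock is dropped */
            pthread_cond_signal(&wq);

        printf("needwake=%d gp_flags=%d\n", needwake, gp_flags);
        return 0;
    }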
@@ -1273,7 +1368,7 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1273 | * be grouped into. | 1368 | * be grouped into. |
1274 | */ | 1369 | */ |
1275 | if (++i >= RCU_NEXT_TAIL) | 1370 | if (++i >= RCU_NEXT_TAIL) |
1276 | return; | 1371 | return false; |
1277 | 1372 | ||
1278 | /* | 1373 | /* |
1279 | * Assign all subsequent callbacks' ->completed number to the next | 1374 | * Assign all subsequent callbacks' ->completed number to the next |
@@ -1285,13 +1380,14 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1285 | rdp->nxtcompleted[i] = c; | 1380 | rdp->nxtcompleted[i] = c; |
1286 | } | 1381 | } |
1287 | /* Record any needed additional grace periods. */ | 1382 | /* Record any needed additional grace periods. */ |
1288 | rcu_start_future_gp(rnp, rdp); | 1383 | ret = rcu_start_future_gp(rnp, rdp, NULL); |
1289 | 1384 | ||
1290 | /* Trace depending on how much we were able to accelerate. */ | 1385 | /* Trace depending on how much we were able to accelerate. */ |
1291 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) | 1386 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) |
1292 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); | 1387 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); |
1293 | else | 1388 | else |
1294 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); | 1389 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); |
1390 | return ret; | ||
1295 | } | 1391 | } |
1296 | 1392 | ||
1297 | /* | 1393 | /* |
@@ -1300,17 +1396,18 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1300 | * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL | 1396 | * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL |
1301 | * sublist. This function is idempotent, so it does not hurt to | 1397 | * sublist. This function is idempotent, so it does not hurt to |
1302 | * invoke it repeatedly. As long as it is not invoked -too- often... | 1398 | * invoke it repeatedly. As long as it is not invoked -too- often... |
1399 | * Returns true if the RCU grace-period kthread needs to be awakened. | ||
1303 | * | 1400 | * |
1304 | * The caller must hold rnp->lock with interrupts disabled. | 1401 | * The caller must hold rnp->lock with interrupts disabled. |
1305 | */ | 1402 | */ |
1306 | static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | 1403 | static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, |
1307 | struct rcu_data *rdp) | 1404 | struct rcu_data *rdp) |
1308 | { | 1405 | { |
1309 | int i, j; | 1406 | int i, j; |
1310 | 1407 | ||
1311 | /* If the CPU has no callbacks, nothing to do. */ | 1408 | /* If the CPU has no callbacks, nothing to do. */ |
1312 | if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) | 1409 | if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) |
1313 | return; | 1410 | return false; |
1314 | 1411 | ||
1315 | /* | 1412 | /* |
1316 | * Find all callbacks whose ->completed numbers indicate that they | 1413 | * Find all callbacks whose ->completed numbers indicate that they |
@@ -1334,26 +1431,30 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1334 | } | 1431 | } |
1335 | 1432 | ||
1336 | /* Classify any remaining callbacks. */ | 1433 | /* Classify any remaining callbacks. */ |
1337 | rcu_accelerate_cbs(rsp, rnp, rdp); | 1434 | return rcu_accelerate_cbs(rsp, rnp, rdp); |
1338 | } | 1435 | } |
1339 | 1436 | ||
1340 | /* | 1437 | /* |
1341 | * Update CPU-local rcu_data state to record the beginnings and ends of | 1438 | * Update CPU-local rcu_data state to record the beginnings and ends of |
1342 | * grace periods. The caller must hold the ->lock of the leaf rcu_node | 1439 | * grace periods. The caller must hold the ->lock of the leaf rcu_node |
1343 | * structure corresponding to the current CPU, and must have irqs disabled. | 1440 | * structure corresponding to the current CPU, and must have irqs disabled. |
1441 | * Returns true if the grace-period kthread needs to be awakened. | ||
1344 | */ | 1442 | */ |
1345 | static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) | 1443 | static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, |
1444 | struct rcu_data *rdp) | ||
1346 | { | 1445 | { |
1446 | bool ret; | ||
1447 | |||
1347 | /* Handle the ends of any preceding grace periods first. */ | 1448 | /* Handle the ends of any preceding grace periods first. */ |
1348 | if (rdp->completed == rnp->completed) { | 1449 | if (rdp->completed == rnp->completed) { |
1349 | 1450 | ||
1350 | /* No grace period end, so just accelerate recent callbacks. */ | 1451 | /* No grace period end, so just accelerate recent callbacks. */ |
1351 | rcu_accelerate_cbs(rsp, rnp, rdp); | 1452 | ret = rcu_accelerate_cbs(rsp, rnp, rdp); |
1352 | 1453 | ||
1353 | } else { | 1454 | } else { |
1354 | 1455 | ||
1355 | /* Advance callbacks. */ | 1456 | /* Advance callbacks. */ |
1356 | rcu_advance_cbs(rsp, rnp, rdp); | 1457 | ret = rcu_advance_cbs(rsp, rnp, rdp); |
1357 | 1458 | ||
1358 | /* Remember that we saw this grace-period completion. */ | 1459 | /* Remember that we saw this grace-period completion. */ |
1359 | rdp->completed = rnp->completed; | 1460 | rdp->completed = rnp->completed; |
@@ -1372,11 +1473,13 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc | |||
1372 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); | 1473 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); |
1373 | zero_cpu_stall_ticks(rdp); | 1474 | zero_cpu_stall_ticks(rdp); |
1374 | } | 1475 | } |
1476 | return ret; | ||
1375 | } | 1477 | } |
1376 | 1478 | ||
1377 | static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) | 1479 | static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) |
1378 | { | 1480 | { |
1379 | unsigned long flags; | 1481 | unsigned long flags; |
1482 | bool needwake; | ||
1380 | struct rcu_node *rnp; | 1483 | struct rcu_node *rnp; |
1381 | 1484 | ||
1382 | local_irq_save(flags); | 1485 | local_irq_save(flags); |
@@ -1388,8 +1491,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1388 | return; | 1491 | return; |
1389 | } | 1492 | } |
1390 | smp_mb__after_unlock_lock(); | 1493 | smp_mb__after_unlock_lock(); |
1391 | __note_gp_changes(rsp, rnp, rdp); | 1494 | needwake = __note_gp_changes(rsp, rnp, rdp); |
1392 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1495 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1496 | if (needwake) | ||
1497 | rcu_gp_kthread_wake(rsp); | ||
1393 | } | 1498 | } |
1394 | 1499 | ||
1395 | /* | 1500 | /* |
@@ -1403,12 +1508,12 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1403 | rcu_bind_gp_kthread(); | 1508 | rcu_bind_gp_kthread(); |
1404 | raw_spin_lock_irq(&rnp->lock); | 1509 | raw_spin_lock_irq(&rnp->lock); |
1405 | smp_mb__after_unlock_lock(); | 1510 | smp_mb__after_unlock_lock(); |
1406 | if (rsp->gp_flags == 0) { | 1511 | if (!ACCESS_ONCE(rsp->gp_flags)) { |
1407 | /* Spurious wakeup, tell caller to go back to sleep. */ | 1512 | /* Spurious wakeup, tell caller to go back to sleep. */ |
1408 | raw_spin_unlock_irq(&rnp->lock); | 1513 | raw_spin_unlock_irq(&rnp->lock); |
1409 | return 0; | 1514 | return 0; |
1410 | } | 1515 | } |
1411 | rsp->gp_flags = 0; /* Clear all flags: New grace period. */ | 1516 | ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */ |
1412 | 1517 | ||
1413 | if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { | 1518 | if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { |
1414 | /* | 1519 | /* |
@@ -1453,7 +1558,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1453 | WARN_ON_ONCE(rnp->completed != rsp->completed); | 1558 | WARN_ON_ONCE(rnp->completed != rsp->completed); |
1454 | ACCESS_ONCE(rnp->completed) = rsp->completed; | 1559 | ACCESS_ONCE(rnp->completed) = rsp->completed; |
1455 | if (rnp == rdp->mynode) | 1560 | if (rnp == rdp->mynode) |
1456 | __note_gp_changes(rsp, rnp, rdp); | 1561 | (void)__note_gp_changes(rsp, rnp, rdp); |
1457 | rcu_preempt_boost_start_gp(rnp); | 1562 | rcu_preempt_boost_start_gp(rnp); |
1458 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, | 1563 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, |
1459 | rnp->level, rnp->grplo, | 1564 | rnp->level, rnp->grplo, |
@@ -1501,7 +1606,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
1501 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | 1606 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { |
1502 | raw_spin_lock_irq(&rnp->lock); | 1607 | raw_spin_lock_irq(&rnp->lock); |
1503 | smp_mb__after_unlock_lock(); | 1608 | smp_mb__after_unlock_lock(); |
1504 | rsp->gp_flags &= ~RCU_GP_FLAG_FQS; | 1609 | ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS; |
1505 | raw_spin_unlock_irq(&rnp->lock); | 1610 | raw_spin_unlock_irq(&rnp->lock); |
1506 | } | 1611 | } |
1507 | return fqs_state; | 1612 | return fqs_state; |
@@ -1513,6 +1618,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
1513 | static void rcu_gp_cleanup(struct rcu_state *rsp) | 1618 | static void rcu_gp_cleanup(struct rcu_state *rsp) |
1514 | { | 1619 | { |
1515 | unsigned long gp_duration; | 1620 | unsigned long gp_duration; |
1621 | bool needgp = false; | ||
1516 | int nocb = 0; | 1622 | int nocb = 0; |
1517 | struct rcu_data *rdp; | 1623 | struct rcu_data *rdp; |
1518 | struct rcu_node *rnp = rcu_get_root(rsp); | 1624 | struct rcu_node *rnp = rcu_get_root(rsp); |
@@ -1548,7 +1654,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
1548 | ACCESS_ONCE(rnp->completed) = rsp->gpnum; | 1654 | ACCESS_ONCE(rnp->completed) = rsp->gpnum; |
1549 | rdp = this_cpu_ptr(rsp->rda); | 1655 | rdp = this_cpu_ptr(rsp->rda); |
1550 | if (rnp == rdp->mynode) | 1656 | if (rnp == rdp->mynode) |
1551 | __note_gp_changes(rsp, rnp, rdp); | 1657 | needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; |
1552 | /* smp_mb() provided by prior unlock-lock pair. */ | 1658 | /* smp_mb() provided by prior unlock-lock pair. */ |
1553 | nocb += rcu_future_gp_cleanup(rsp, rnp); | 1659 | nocb += rcu_future_gp_cleanup(rsp, rnp); |
1554 | raw_spin_unlock_irq(&rnp->lock); | 1660 | raw_spin_unlock_irq(&rnp->lock); |
@@ -1564,9 +1670,10 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
1564 | trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); | 1670 | trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); |
1565 | rsp->fqs_state = RCU_GP_IDLE; | 1671 | rsp->fqs_state = RCU_GP_IDLE; |
1566 | rdp = this_cpu_ptr(rsp->rda); | 1672 | rdp = this_cpu_ptr(rsp->rda); |
1567 | rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ | 1673 | /* Advance CBs to reduce false positives below. */ |
1568 | if (cpu_needs_another_gp(rsp, rdp)) { | 1674 | needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; |
1569 | rsp->gp_flags = RCU_GP_FLAG_INIT; | 1675 | if (needgp || cpu_needs_another_gp(rsp, rdp)) { |
1676 | ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; | ||
1570 | trace_rcu_grace_period(rsp->name, | 1677 | trace_rcu_grace_period(rsp->name, |
1571 | ACCESS_ONCE(rsp->gpnum), | 1678 | ACCESS_ONCE(rsp->gpnum), |
1572 | TPS("newreq")); | 1679 | TPS("newreq")); |
@@ -1593,6 +1700,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
1593 | trace_rcu_grace_period(rsp->name, | 1700 | trace_rcu_grace_period(rsp->name, |
1594 | ACCESS_ONCE(rsp->gpnum), | 1701 | ACCESS_ONCE(rsp->gpnum), |
1595 | TPS("reqwait")); | 1702 | TPS("reqwait")); |
1703 | rsp->gp_state = RCU_GP_WAIT_GPS; | ||
1596 | wait_event_interruptible(rsp->gp_wq, | 1704 | wait_event_interruptible(rsp->gp_wq, |
1597 | ACCESS_ONCE(rsp->gp_flags) & | 1705 | ACCESS_ONCE(rsp->gp_flags) & |
1598 | RCU_GP_FLAG_INIT); | 1706 | RCU_GP_FLAG_INIT); |
@@ -1620,6 +1728,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
1620 | trace_rcu_grace_period(rsp->name, | 1728 | trace_rcu_grace_period(rsp->name, |
1621 | ACCESS_ONCE(rsp->gpnum), | 1729 | ACCESS_ONCE(rsp->gpnum), |
1622 | TPS("fqswait")); | 1730 | TPS("fqswait")); |
1731 | rsp->gp_state = RCU_GP_WAIT_FQS; | ||
1623 | ret = wait_event_interruptible_timeout(rsp->gp_wq, | 1732 | ret = wait_event_interruptible_timeout(rsp->gp_wq, |
1624 | ((gf = ACCESS_ONCE(rsp->gp_flags)) & | 1733 | ((gf = ACCESS_ONCE(rsp->gp_flags)) & |
1625 | RCU_GP_FLAG_FQS) || | 1734 | RCU_GP_FLAG_FQS) || |
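The two new rsp->gp_state assignments record which wait the grace-period kthread is about to enter, which is exactly what show_rcu_gp_kthreads() prints when rcutorture suspects a writer stall. A tiny sketch of that bookkeeping (the enum mirrors the RCU_GP_WAIT_* values; the rest is illustrative):

    #include <stdio.h>

    enum gp_wait { GP_WAIT_INIT = 0, GP_WAIT_GPS = 1, GP_WAIT_FQS = 2 };

    static enum gp_wait gp_state = GP_WAIT_INIT;

    static void show_gp_kthread(void)
    {
        static const char * const name[] = { "init", "wait-gp-start", "wait-fqs" };

        printf("GP kthread wait state: %d (%s)\n", gp_state, name[gp_state]);
    }

    int main(void)
    {
        gp_state = GP_WAIT_GPS;     /* set just before the "reqwait" sleep */
        show_gp_kthread();
        gp_state = GP_WAIT_FQS;     /* set just before the timed FQS sleep */
        show_gp_kthread();
        return 0;
    }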
@@ -1665,14 +1774,6 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
1665 | } | 1774 | } |
1666 | } | 1775 | } |
1667 | 1776 | ||
1668 | static void rsp_wakeup(struct irq_work *work) | ||
1669 | { | ||
1670 | struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work); | ||
1671 | |||
1672 | /* Wake up rcu_gp_kthread() to start the grace period. */ | ||
1673 | wake_up(&rsp->gp_wq); | ||
1674 | } | ||
1675 | |||
1676 | /* | 1777 | /* |
1677 | * Start a new RCU grace period if warranted, re-initializing the hierarchy | 1778 | * Start a new RCU grace period if warranted, re-initializing the hierarchy |
1678 | * in preparation for detecting the next grace period. The caller must hold | 1779 | * in preparation for detecting the next grace period. The caller must hold |
@@ -1681,8 +1782,10 @@ static void rsp_wakeup(struct irq_work *work) | |||
1681 | * Note that it is legal for a dying CPU (which is marked as offline) to | 1782 | * Note that it is legal for a dying CPU (which is marked as offline) to |
1682 | * invoke this function. This can happen when the dying CPU reports its | 1783 | * invoke this function. This can happen when the dying CPU reports its |
1683 | * quiescent state. | 1784 | * quiescent state. |
1785 | * | ||
1786 | * Returns true if the grace-period kthread must be awakened. | ||
1684 | */ | 1787 | */ |
1685 | static void | 1788 | static bool |
1686 | rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | 1789 | rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, |
1687 | struct rcu_data *rdp) | 1790 | struct rcu_data *rdp) |
1688 | { | 1791 | { |
@@ -1693,20 +1796,18 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1693 | * or a grace period is already in progress. | 1796 | * or a grace period is already in progress. |
1694 | * Either way, don't start a new grace period. | 1797 | * Either way, don't start a new grace period. |
1695 | */ | 1798 | */ |
1696 | return; | 1799 | return false; |
1697 | } | 1800 | } |
1698 | rsp->gp_flags = RCU_GP_FLAG_INIT; | 1801 | ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; |
1699 | trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), | 1802 | trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), |
1700 | TPS("newreq")); | 1803 | TPS("newreq")); |
1701 | 1804 | ||
1702 | /* | 1805 | /* |
1703 | * We can't do wakeups while holding the rnp->lock, as that | 1806 | * We can't do wakeups while holding the rnp->lock, as that |
1704 | * could cause possible deadlocks with the rq->lock. Defer | 1807 | * could cause possible deadlocks with the rq->lock. Defer |
1705 | * the wakeup to interrupt context. And don't bother waking | 1808 | * the wakeup to our caller. |
1706 | * up the running kthread. | ||
1707 | */ | 1809 | */ |
1708 | if (current != rsp->gp_kthread) | 1810 | return true; |
1709 | irq_work_queue(&rsp->wakeup_work); | ||
1710 | } | 1811 | } |
1711 | 1812 | ||
1712 | /* | 1813 | /* |
@@ -1715,12 +1816,14 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1715 | * is invoked indirectly from rcu_advance_cbs(), which would result in | 1816 | * is invoked indirectly from rcu_advance_cbs(), which would result in |
1716 | * endless recursion -- or would do so if it wasn't for the self-deadlock | 1817 | * endless recursion -- or would do so if it wasn't for the self-deadlock |
1717 | * that is encountered beforehand. | 1818 | * that is encountered beforehand. |
1819 | * | ||
1820 | * Returns true if the grace-period kthread needs to be awakened. | ||
1718 | */ | 1821 | */ |
1719 | static void | 1822 | static bool rcu_start_gp(struct rcu_state *rsp) |
1720 | rcu_start_gp(struct rcu_state *rsp) | ||
1721 | { | 1823 | { |
1722 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | 1824 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
1723 | struct rcu_node *rnp = rcu_get_root(rsp); | 1825 | struct rcu_node *rnp = rcu_get_root(rsp); |
1826 | bool ret = false; | ||
1724 | 1827 | ||
1725 | /* | 1828 | /* |
1726 | * If there is no grace period in progress right now, any | 1829 | * If there is no grace period in progress right now, any |
@@ -1730,8 +1833,9 @@ rcu_start_gp(struct rcu_state *rsp) | |||
1730 | * resulting in pointless grace periods. So, advance callbacks | 1833 | * resulting in pointless grace periods. So, advance callbacks |
1731 | * then start the grace period! | 1834 | * then start the grace period! |
1732 | */ | 1835 | */ |
1733 | rcu_advance_cbs(rsp, rnp, rdp); | 1836 | ret = rcu_advance_cbs(rsp, rnp, rdp) || ret; |
1734 | rcu_start_gp_advanced(rsp, rnp, rdp); | 1837 | ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret; |
1838 | return ret; | ||
1735 | } | 1839 | } |
1736 | 1840 | ||
1737 | /* | 1841 | /* |
@@ -1820,6 +1924,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
1820 | { | 1924 | { |
1821 | unsigned long flags; | 1925 | unsigned long flags; |
1822 | unsigned long mask; | 1926 | unsigned long mask; |
1927 | bool needwake; | ||
1823 | struct rcu_node *rnp; | 1928 | struct rcu_node *rnp; |
1824 | 1929 | ||
1825 | rnp = rdp->mynode; | 1930 | rnp = rdp->mynode; |
@@ -1848,9 +1953,11 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
1848 | * This GP can't end until cpu checks in, so all of our | 1953 | * This GP can't end until cpu checks in, so all of our |
1849 | * callbacks can be processed during the next GP. | 1954 | * callbacks can be processed during the next GP. |
1850 | */ | 1955 | */ |
1851 | rcu_accelerate_cbs(rsp, rnp, rdp); | 1956 | needwake = rcu_accelerate_cbs(rsp, rnp, rdp); |
1852 | 1957 | ||
1853 | rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ | 1958 | rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ |
1959 | if (needwake) | ||
1960 | rcu_gp_kthread_wake(rsp); | ||
1854 | } | 1961 | } |
1855 | } | 1962 | } |
1856 | 1963 | ||
@@ -1951,7 +2058,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
1951 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) | 2058 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) |
1952 | { | 2059 | { |
1953 | int i; | 2060 | int i; |
1954 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | 2061 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); |
1955 | 2062 | ||
1956 | /* No-CBs CPUs are handled specially. */ | 2063 | /* No-CBs CPUs are handled specially. */ |
1957 | if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) | 2064 | if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) |
@@ -2320,7 +2427,7 @@ static void force_quiescent_state(struct rcu_state *rsp) | |||
2320 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); | 2427 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); |
2321 | return; /* Someone beat us to it. */ | 2428 | return; /* Someone beat us to it. */ |
2322 | } | 2429 | } |
2323 | rsp->gp_flags |= RCU_GP_FLAG_FQS; | 2430 | ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS; |
2324 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); | 2431 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); |
2325 | wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ | 2432 | wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ |
2326 | } | 2433 | } |
@@ -2334,7 +2441,8 @@ static void | |||
2334 | __rcu_process_callbacks(struct rcu_state *rsp) | 2441 | __rcu_process_callbacks(struct rcu_state *rsp) |
2335 | { | 2442 | { |
2336 | unsigned long flags; | 2443 | unsigned long flags; |
2337 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | 2444 | bool needwake; |
2445 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); | ||
2338 | 2446 | ||
2339 | WARN_ON_ONCE(rdp->beenonline == 0); | 2447 | WARN_ON_ONCE(rdp->beenonline == 0); |
2340 | 2448 | ||
@@ -2345,8 +2453,10 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
2345 | local_irq_save(flags); | 2453 | local_irq_save(flags); |
2346 | if (cpu_needs_another_gp(rsp, rdp)) { | 2454 | if (cpu_needs_another_gp(rsp, rdp)) { |
2347 | raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ | 2455 | raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ |
2348 | rcu_start_gp(rsp); | 2456 | needwake = rcu_start_gp(rsp); |
2349 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); | 2457 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); |
2458 | if (needwake) | ||
2459 | rcu_gp_kthread_wake(rsp); | ||
2350 | } else { | 2460 | } else { |
2351 | local_irq_restore(flags); | 2461 | local_irq_restore(flags); |
2352 | } | 2462 | } |
@@ -2404,6 +2514,8 @@ static void invoke_rcu_core(void) | |||
2404 | static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | 2514 | static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, |
2405 | struct rcu_head *head, unsigned long flags) | 2515 | struct rcu_head *head, unsigned long flags) |
2406 | { | 2516 | { |
2517 | bool needwake; | ||
2518 | |||
2407 | /* | 2519 | /* |
2408 | * If called from an extended quiescent state, invoke the RCU | 2520 | * If called from an extended quiescent state, invoke the RCU |
2409 | * core in order to force a re-evaluation of RCU's idleness. | 2521 | * core in order to force a re-evaluation of RCU's idleness. |
@@ -2433,8 +2545,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
2433 | 2545 | ||
2434 | raw_spin_lock(&rnp_root->lock); | 2546 | raw_spin_lock(&rnp_root->lock); |
2435 | smp_mb__after_unlock_lock(); | 2547 | smp_mb__after_unlock_lock(); |
2436 | rcu_start_gp(rsp); | 2548 | needwake = rcu_start_gp(rsp); |
2437 | raw_spin_unlock(&rnp_root->lock); | 2549 | raw_spin_unlock(&rnp_root->lock); |
2550 | if (needwake) | ||
2551 | rcu_gp_kthread_wake(rsp); | ||
2438 | } else { | 2552 | } else { |
2439 | /* Give the grace period a kick. */ | 2553 | /* Give the grace period a kick. */ |
2440 | rdp->blimit = LONG_MAX; | 2554 | rdp->blimit = LONG_MAX; |
@@ -2537,6 +2651,20 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
2537 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 2651 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
2538 | 2652 | ||
2539 | /* | 2653 | /* |
2654 | * Queue an RCU callback for lazy invocation after a grace period. | ||
2655 | * This will likely be later named something like "call_rcu_lazy()", | ||
2656 | * but this change will require some way of tagging the lazy RCU | ||
2657 | * callbacks in the list of pending callbacks. Until then, this | ||
2658 | * function may only be called from __kfree_rcu(). | ||
2659 | */ | ||
2660 | void kfree_call_rcu(struct rcu_head *head, | ||
2661 | void (*func)(struct rcu_head *rcu)) | ||
2662 | { | ||
2663 | __call_rcu(head, func, rcu_state_p, -1, 1); | ||
2664 | } | ||
2665 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | ||
2666 | |||
2667 | /* | ||
2540 | * Because a context switch is a grace period for RCU-sched and RCU-bh, | 2668 | * Because a context switch is a grace period for RCU-sched and RCU-bh, |
2541 | * any blocking grace-period wait automatically implies a grace period | 2669 | * any blocking grace-period wait automatically implies a grace period |
2542 | * if there is only one CPU online at any point in time during execution | 2670 | * if there is only one CPU online at any point in time during execution |
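kfree_call_rcu() is the function that the kfree_rcu() convenience macro eventually reaches via __kfree_rcu(); call sites simply embed an rcu_head in the structure being freed. A kernel-style illustration of typical usage (the struct and field names are made up; this is a sketch, not code from the patch):

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct foo {
        int data;
        struct rcu_head rcu;        /* storage for the deferred kfree() */
    };

    static void release_foo(struct foo *fp)
    {
        /* Frees fp after a grace period; no callback function needed. */
        kfree_rcu(fp, rcu);
    }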
@@ -2659,7 +2787,7 @@ unsigned long get_state_synchronize_rcu(void) | |||
2659 | * time-consuming work between get_state_synchronize_rcu() | 2787 | * time-consuming work between get_state_synchronize_rcu() |
2660 | * and cond_synchronize_rcu(). | 2788 | * and cond_synchronize_rcu(). |
2661 | */ | 2789 | */ |
2662 | return smp_load_acquire(&rcu_state->gpnum); | 2790 | return smp_load_acquire(&rcu_state_p->gpnum); |
2663 | } | 2791 | } |
2664 | EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); | 2792 | EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); |
2665 | 2793 | ||
@@ -2685,7 +2813,7 @@ void cond_synchronize_rcu(unsigned long oldstate) | |||
2685 | * Ensure that this load happens before any RCU-destructive | 2813 | * Ensure that this load happens before any RCU-destructive |
2686 | * actions the caller might carry out after we return. | 2814 | * actions the caller might carry out after we return. |
2687 | */ | 2815 | */ |
2688 | newstate = smp_load_acquire(&rcu_state->completed); | 2816 | newstate = smp_load_acquire(&rcu_state_p->completed); |
2689 | if (ULONG_CMP_GE(oldstate, newstate)) | 2817 | if (ULONG_CMP_GE(oldstate, newstate)) |
2690 | synchronize_rcu(); | 2818 | synchronize_rcu(); |
2691 | } | 2819 | } |
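get_state_synchronize_rcu() and cond_synchronize_rcu() are meant to be used as a pair: snapshot the grace-period counter, do unrelated work, then block only if no full grace period has elapsed in the meantime. A kernel-style sketch of that pairing (everything other than the two APIs is illustrative):

    #include <linux/rcupdate.h>

    static void example_deferred_sync(void)
    {
        unsigned long gp_snap = get_state_synchronize_rcu();

        /* ... lengthy, sleepable work that needs no RCU protection ... */

        cond_synchronize_rcu(gp_snap);  /* synchronize_rcu() only if needed */
    }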
@@ -2790,7 +2918,7 @@ void synchronize_sched_expedited(void) | |||
2790 | s = atomic_long_read(&rsp->expedited_done); | 2918 | s = atomic_long_read(&rsp->expedited_done); |
2791 | if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { | 2919 | if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { |
2792 | /* ensure test happens before caller kfree */ | 2920 | /* ensure test happens before caller kfree */ |
2793 | smp_mb__before_atomic_inc(); /* ^^^ */ | 2921 | smp_mb__before_atomic(); /* ^^^ */ |
2794 | atomic_long_inc(&rsp->expedited_workdone1); | 2922 | atomic_long_inc(&rsp->expedited_workdone1); |
2795 | return; | 2923 | return; |
2796 | } | 2924 | } |
@@ -2808,7 +2936,7 @@ void synchronize_sched_expedited(void) | |||
2808 | s = atomic_long_read(&rsp->expedited_done); | 2936 | s = atomic_long_read(&rsp->expedited_done); |
2809 | if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { | 2937 | if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { |
2810 | /* ensure test happens before caller kfree */ | 2938 | /* ensure test happens before caller kfree */ |
2811 | smp_mb__before_atomic_inc(); /* ^^^ */ | 2939 | smp_mb__before_atomic(); /* ^^^ */ |
2812 | atomic_long_inc(&rsp->expedited_workdone2); | 2940 | atomic_long_inc(&rsp->expedited_workdone2); |
2813 | return; | 2941 | return; |
2814 | } | 2942 | } |
@@ -2837,7 +2965,7 @@ void synchronize_sched_expedited(void) | |||
2837 | s = atomic_long_read(&rsp->expedited_done); | 2965 | s = atomic_long_read(&rsp->expedited_done); |
2838 | if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { | 2966 | if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { |
2839 | /* ensure test happens before caller kfree */ | 2967 | /* ensure test happens before caller kfree */ |
2840 | smp_mb__before_atomic_inc(); /* ^^^ */ | 2968 | smp_mb__before_atomic(); /* ^^^ */ |
2841 | atomic_long_inc(&rsp->expedited_done_lost); | 2969 | atomic_long_inc(&rsp->expedited_done_lost); |
2842 | break; | 2970 | break; |
2843 | } | 2971 | } |
@@ -2988,7 +3116,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp) | |||
2988 | static void rcu_barrier_func(void *type) | 3116 | static void rcu_barrier_func(void *type) |
2989 | { | 3117 | { |
2990 | struct rcu_state *rsp = type; | 3118 | struct rcu_state *rsp = type; |
2991 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | 3119 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); |
2992 | 3120 | ||
2993 | _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); | 3121 | _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); |
2994 | atomic_inc(&rsp->barrier_cpu_count); | 3122 | atomic_inc(&rsp->barrier_cpu_count); |
@@ -3160,7 +3288,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
3160 | * that this CPU cannot possibly have any RCU callbacks in flight yet. | 3288 | * that this CPU cannot possibly have any RCU callbacks in flight yet. |
3161 | */ | 3289 | */ |
3162 | static void | 3290 | static void |
3163 | rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | 3291 | rcu_init_percpu_data(int cpu, struct rcu_state *rsp) |
3164 | { | 3292 | { |
3165 | unsigned long flags; | 3293 | unsigned long flags; |
3166 | unsigned long mask; | 3294 | unsigned long mask; |
@@ -3173,7 +3301,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
3173 | /* Set up local state, ensuring consistent view of global state. */ | 3301 | /* Set up local state, ensuring consistent view of global state. */ |
3174 | raw_spin_lock_irqsave(&rnp->lock, flags); | 3302 | raw_spin_lock_irqsave(&rnp->lock, flags); |
3175 | rdp->beenonline = 1; /* We have now been online. */ | 3303 | rdp->beenonline = 1; /* We have now been online. */ |
3176 | rdp->preemptible = preemptible; | ||
3177 | rdp->qlen_last_fqs_check = 0; | 3304 | rdp->qlen_last_fqs_check = 0; |
3178 | rdp->n_force_qs_snap = rsp->n_force_qs; | 3305 | rdp->n_force_qs_snap = rsp->n_force_qs; |
3179 | rdp->blimit = blimit; | 3306 | rdp->blimit = blimit; |
@@ -3217,8 +3344,7 @@ static void rcu_prepare_cpu(int cpu) | |||
3217 | struct rcu_state *rsp; | 3344 | struct rcu_state *rsp; |
3218 | 3345 | ||
3219 | for_each_rcu_flavor(rsp) | 3346 | for_each_rcu_flavor(rsp) |
3220 | rcu_init_percpu_data(cpu, rsp, | 3347 | rcu_init_percpu_data(cpu, rsp); |
3221 | strcmp(rsp->name, "rcu_preempt") == 0); | ||
3222 | } | 3348 | } |
3223 | 3349 | ||
3224 | /* | 3350 | /* |
@@ -3228,7 +3354,7 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
3228 | unsigned long action, void *hcpu) | 3354 | unsigned long action, void *hcpu) |
3229 | { | 3355 | { |
3230 | long cpu = (long)hcpu; | 3356 | long cpu = (long)hcpu; |
3231 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | 3357 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); |
3232 | struct rcu_node *rnp = rdp->mynode; | 3358 | struct rcu_node *rnp = rdp->mynode; |
3233 | struct rcu_state *rsp; | 3359 | struct rcu_state *rsp; |
3234 | 3360 | ||
@@ -3402,8 +3528,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
3402 | rnp->qsmaskinit = 0; | 3528 | rnp->qsmaskinit = 0; |
3403 | rnp->grplo = j * cpustride; | 3529 | rnp->grplo = j * cpustride; |
3404 | rnp->grphi = (j + 1) * cpustride - 1; | 3530 | rnp->grphi = (j + 1) * cpustride - 1; |
3405 | if (rnp->grphi >= NR_CPUS) | 3531 | if (rnp->grphi >= nr_cpu_ids) |
3406 | rnp->grphi = NR_CPUS - 1; | 3532 | rnp->grphi = nr_cpu_ids - 1; |
3407 | if (i == 0) { | 3533 | if (i == 0) { |
3408 | rnp->grpnum = 0; | 3534 | rnp->grpnum = 0; |
3409 | rnp->grpmask = 0; | 3535 | rnp->grpmask = 0; |
@@ -3422,7 +3548,6 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
3422 | 3548 | ||
3423 | rsp->rda = rda; | 3549 | rsp->rda = rda; |
3424 | init_waitqueue_head(&rsp->gp_wq); | 3550 | init_waitqueue_head(&rsp->gp_wq); |
3425 | init_irq_work(&rsp->wakeup_work, rsp_wakeup); | ||
3426 | rnp = rsp->level[rcu_num_lvls - 1]; | 3551 | rnp = rsp->level[rcu_num_lvls - 1]; |
3427 | for_each_possible_cpu(i) { | 3552 | for_each_possible_cpu(i) { |
3428 | while (i > rnp->grphi) | 3553 | while (i > rnp->grphi) |
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 75dc3c39a02a..bf2c1e669691 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
@@ -252,7 +252,6 @@ struct rcu_data { | |||
252 | bool passed_quiesce; /* User-mode/idle loop etc. */ | 252 | bool passed_quiesce; /* User-mode/idle loop etc. */ |
253 | bool qs_pending; /* Core waits for quiesc state. */ | 253 | bool qs_pending; /* Core waits for quiesc state. */ |
254 | bool beenonline; /* CPU online at least once. */ | 254 | bool beenonline; /* CPU online at least once. */ |
255 | bool preemptible; /* Preemptible RCU? */ | ||
256 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ | 255 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ |
257 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ | 256 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ |
258 | #ifdef CONFIG_RCU_CPU_STALL_INFO | 257 | #ifdef CONFIG_RCU_CPU_STALL_INFO |
@@ -406,7 +405,8 @@ struct rcu_state { | |||
406 | unsigned long completed; /* # of last completed gp. */ | 405 | unsigned long completed; /* # of last completed gp. */ |
407 | struct task_struct *gp_kthread; /* Task for grace periods. */ | 406 | struct task_struct *gp_kthread; /* Task for grace periods. */ |
408 | wait_queue_head_t gp_wq; /* Where GP task waits. */ | 407 | wait_queue_head_t gp_wq; /* Where GP task waits. */ |
409 | int gp_flags; /* Commands for GP task. */ | 408 | short gp_flags; /* Commands for GP task. */ |
409 | short gp_state; /* GP kthread sleep state. */ | ||
410 | 410 | ||
411 | /* End of fields guarded by root rcu_node's lock. */ | 411 | /* End of fields guarded by root rcu_node's lock. */ |
412 | 412 | ||
@@ -462,13 +462,17 @@ struct rcu_state { | |||
462 | const char *name; /* Name of structure. */ | 462 | const char *name; /* Name of structure. */ |
463 | char abbr; /* Abbreviated name. */ | 463 | char abbr; /* Abbreviated name. */ |
464 | struct list_head flavors; /* List of RCU flavors. */ | 464 | struct list_head flavors; /* List of RCU flavors. */ |
465 | struct irq_work wakeup_work; /* Postponed wakeups */ | ||
466 | }; | 465 | }; |
467 | 466 | ||
468 | /* Values for rcu_state structure's gp_flags field. */ | 467 | /* Values for rcu_state structure's gp_flags field. */ |
469 | #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ | 468 | #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ |
470 | #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ | 469 | #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ |
471 | 470 | ||
471 | /* Values for rcu_state structure's gp_state field. */ | ||
472 | #define RCU_GP_WAIT_INIT 0 /* Initial state. */ | ||
473 | #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ | ||
474 | #define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */ | ||
475 | |||
472 | extern struct list_head rcu_struct_flavors; | 476 | extern struct list_head rcu_struct_flavors; |
473 | 477 | ||
474 | /* Sequence through rcu_state structures for each RCU flavor. */ | 478 | /* Sequence through rcu_state structures for each RCU flavor. */ |
@@ -547,7 +551,6 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | |||
547 | static void print_cpu_stall_info_end(void); | 551 | static void print_cpu_stall_info_end(void); |
548 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); | 552 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); |
549 | static void increment_cpu_stall_ticks(void); | 553 | static void increment_cpu_stall_ticks(void); |
550 | static int rcu_nocb_needs_gp(struct rcu_state *rsp); | ||
551 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); | 554 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); |
552 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); | 555 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); |
553 | static void rcu_init_one_nocb(struct rcu_node *rnp); | 556 | static void rcu_init_one_nocb(struct rcu_node *rnp); |
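The new ->gp_state field and the RCU_GP_WAIT_* values above exist so that stall diagnostics can report which wait the grace-period kthread is currently parked in. A minimal sketch of how a kthread loop might record that state around its two waits; the surrounding rcu_gp_kthread() loop, its locking, and the timeout value j are assumed rather than shown.

	/* Sketch only, not the patch's code: annotate the GP kthread's waits. */
	static void example_gp_waits(struct rcu_state *rsp, unsigned long j)
	{
		rsp->gp_state = RCU_GP_WAIT_GPS;
		wait_event_interruptible(rsp->gp_wq,
					 ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT);

		/* ... grace-period initialization would run here ... */

		rsp->gp_state = RCU_GP_WAIT_FQS;
		wait_event_interruptible_timeout(rsp->gp_wq,
				ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS, j);
	}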
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 962d1d589929..cbc2c45265e2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
@@ -116,7 +116,7 @@ static void __init rcu_bootup_announce_oddness(void) | |||
116 | #ifdef CONFIG_TREE_PREEMPT_RCU | 116 | #ifdef CONFIG_TREE_PREEMPT_RCU |
117 | 117 | ||
118 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); | 118 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); |
119 | static struct rcu_state *rcu_state = &rcu_preempt_state; | 119 | static struct rcu_state *rcu_state_p = &rcu_preempt_state; |
120 | 120 | ||
121 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); | 121 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); |
122 | 122 | ||
@@ -149,15 +149,6 @@ long rcu_batches_completed(void) | |||
149 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | 149 | EXPORT_SYMBOL_GPL(rcu_batches_completed); |
150 | 150 | ||
151 | /* | 151 | /* |
152 | * Force a quiescent state for preemptible RCU. | ||
153 | */ | ||
154 | void rcu_force_quiescent_state(void) | ||
155 | { | ||
156 | force_quiescent_state(&rcu_preempt_state); | ||
157 | } | ||
158 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | ||
159 | |||
160 | /* | ||
161 | * Record a preemptible-RCU quiescent state for the specified CPU. Note | 152 | * Record a preemptible-RCU quiescent state for the specified CPU. Note |
162 | * that this just means that the task currently running on the CPU is | 153 | * that this just means that the task currently running on the CPU is |
163 | * not in a quiescent state. There might be any number of tasks blocked | 154 | * not in a quiescent state. There might be any number of tasks blocked |
@@ -688,20 +679,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
688 | } | 679 | } |
689 | EXPORT_SYMBOL_GPL(call_rcu); | 680 | EXPORT_SYMBOL_GPL(call_rcu); |
690 | 681 | ||
691 | /* | ||
692 | * Queue an RCU callback for lazy invocation after a grace period. | ||
693 | * This will likely be later named something like "call_rcu_lazy()", | ||
694 | * but this change will require some way of tagging the lazy RCU | ||
695 | * callbacks in the list of pending callbacks. Until then, this | ||
696 | * function may only be called from __kfree_rcu(). | ||
697 | */ | ||
698 | void kfree_call_rcu(struct rcu_head *head, | ||
699 | void (*func)(struct rcu_head *rcu)) | ||
700 | { | ||
701 | __call_rcu(head, func, &rcu_preempt_state, -1, 1); | ||
702 | } | ||
703 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | ||
704 | |||
705 | /** | 682 | /** |
706 | * synchronize_rcu - wait until a grace period has elapsed. | 683 | * synchronize_rcu - wait until a grace period has elapsed. |
707 | * | 684 | * |
@@ -970,7 +947,7 @@ void exit_rcu(void) | |||
970 | 947 | ||
971 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 948 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
972 | 949 | ||
973 | static struct rcu_state *rcu_state = &rcu_sched_state; | 950 | static struct rcu_state *rcu_state_p = &rcu_sched_state; |
974 | 951 | ||
975 | /* | 952 | /* |
976 | * Tell them what RCU they are running. | 953 | * Tell them what RCU they are running. |
@@ -991,16 +968,6 @@ long rcu_batches_completed(void) | |||
991 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | 968 | EXPORT_SYMBOL_GPL(rcu_batches_completed); |
992 | 969 | ||
993 | /* | 970 | /* |
994 | * Force a quiescent state for RCU, which, because there is no preemptible | ||
995 | * RCU, becomes the same as rcu-sched. | ||
996 | */ | ||
997 | void rcu_force_quiescent_state(void) | ||
998 | { | ||
999 | rcu_sched_force_quiescent_state(); | ||
1000 | } | ||
1001 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | ||
1002 | |||
1003 | /* | ||
1004 | * Because preemptible RCU does not exist, we never have to check for | 971 | * Because preemptible RCU does not exist, we never have to check for |
1005 | * CPUs being in quiescent states. | 972 | * CPUs being in quiescent states. |
1006 | */ | 973 | */ |
@@ -1080,22 +1047,6 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
1080 | } | 1047 | } |
1081 | 1048 | ||
1082 | /* | 1049 | /* |
1083 | * Queue an RCU callback for lazy invocation after a grace period. | ||
1084 | * This will likely be later named something like "call_rcu_lazy()", | ||
1085 | * but this change will require some way of tagging the lazy RCU | ||
1086 | * callbacks in the list of pending callbacks. Until then, this | ||
1087 | * function may only be called from __kfree_rcu(). | ||
1088 | * | ||
1089 | * Because there is no preemptible RCU, we use RCU-sched instead. | ||
1090 | */ | ||
1091 | void kfree_call_rcu(struct rcu_head *head, | ||
1092 | void (*func)(struct rcu_head *rcu)) | ||
1093 | { | ||
1094 | __call_rcu(head, func, &rcu_sched_state, -1, 1); | ||
1095 | } | ||
1096 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | ||
1097 | |||
1098 | /* | ||
1099 | * Wait for an rcu-preempt grace period, but make it happen quickly. | 1050 | * Wait for an rcu-preempt grace period, but make it happen quickly. |
1100 | * But because preemptible RCU does not exist, map to rcu-sched. | 1051 | * But because preemptible RCU does not exist, map to rcu-sched. |
1101 | */ | 1052 | */ |
@@ -1517,11 +1468,11 @@ static int __init rcu_spawn_kthreads(void) | |||
1517 | for_each_possible_cpu(cpu) | 1468 | for_each_possible_cpu(cpu) |
1518 | per_cpu(rcu_cpu_has_work, cpu) = 0; | 1469 | per_cpu(rcu_cpu_has_work, cpu) = 0; |
1519 | BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); | 1470 | BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); |
1520 | rnp = rcu_get_root(rcu_state); | 1471 | rnp = rcu_get_root(rcu_state_p); |
1521 | (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); | 1472 | (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); |
1522 | if (NUM_RCU_NODES > 1) { | 1473 | if (NUM_RCU_NODES > 1) { |
1523 | rcu_for_each_leaf_node(rcu_state, rnp) | 1474 | rcu_for_each_leaf_node(rcu_state_p, rnp) |
1524 | (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); | 1475 | (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); |
1525 | } | 1476 | } |
1526 | return 0; | 1477 | return 0; |
1527 | } | 1478 | } |
@@ -1529,12 +1480,12 @@ early_initcall(rcu_spawn_kthreads); | |||
1529 | 1480 | ||
1530 | static void rcu_prepare_kthreads(int cpu) | 1481 | static void rcu_prepare_kthreads(int cpu) |
1531 | { | 1482 | { |
1532 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | 1483 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); |
1533 | struct rcu_node *rnp = rdp->mynode; | 1484 | struct rcu_node *rnp = rdp->mynode; |
1534 | 1485 | ||
1535 | /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ | 1486 | /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ |
1536 | if (rcu_scheduler_fully_active) | 1487 | if (rcu_scheduler_fully_active) |
1537 | (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); | 1488 | (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); |
1538 | } | 1489 | } |
1539 | 1490 | ||
1540 | #else /* #ifdef CONFIG_RCU_BOOST */ | 1491 | #else /* #ifdef CONFIG_RCU_BOOST */ |
@@ -1744,6 +1695,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) | |||
1744 | static void rcu_prepare_for_idle(int cpu) | 1695 | static void rcu_prepare_for_idle(int cpu) |
1745 | { | 1696 | { |
1746 | #ifndef CONFIG_RCU_NOCB_CPU_ALL | 1697 | #ifndef CONFIG_RCU_NOCB_CPU_ALL |
1698 | bool needwake; | ||
1747 | struct rcu_data *rdp; | 1699 | struct rcu_data *rdp; |
1748 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | 1700 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
1749 | struct rcu_node *rnp; | 1701 | struct rcu_node *rnp; |
@@ -1792,8 +1744,10 @@ static void rcu_prepare_for_idle(int cpu) | |||
1792 | rnp = rdp->mynode; | 1744 | rnp = rdp->mynode; |
1793 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 1745 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
1794 | smp_mb__after_unlock_lock(); | 1746 | smp_mb__after_unlock_lock(); |
1795 | rcu_accelerate_cbs(rsp, rnp, rdp); | 1747 | needwake = rcu_accelerate_cbs(rsp, rnp, rdp); |
1796 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1748 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
1749 | if (needwake) | ||
1750 | rcu_gp_kthread_wake(rsp); | ||
1797 | } | 1751 | } |
1798 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ | 1752 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ |
1799 | } | 1753 | } |
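The needwake/rcu_gp_kthread_wake() pairing above (and the matching one in rcu_nocb_wait_gp() later in this file) is a deferred-wakeup pattern: the decision to wake is recorded while rnp->lock is held, but the wakeup itself happens only after the lock is dropped, so the scheduler is never entered with an rcu_node lock held. A generic sketch of the pattern; struct foo, update_state_locked() and f->kthread are made-up stand-ins.

	static void example_update_then_wake(struct foo *f)
	{
		unsigned long flags;
		bool needwake;

		spin_lock_irqsave(&f->lock, flags);
		needwake = update_state_locked(f);	/* decide under the lock */
		spin_unlock_irqrestore(&f->lock, flags);
		if (needwake)
			wake_up_process(f->kthread);	/* safe: lock already dropped */
	}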
@@ -1855,7 +1809,7 @@ static void rcu_oom_notify_cpu(void *unused) | |||
1855 | struct rcu_data *rdp; | 1809 | struct rcu_data *rdp; |
1856 | 1810 | ||
1857 | for_each_rcu_flavor(rsp) { | 1811 | for_each_rcu_flavor(rsp) { |
1858 | rdp = __this_cpu_ptr(rsp->rda); | 1812 | rdp = raw_cpu_ptr(rsp->rda); |
1859 | if (rdp->qlen_lazy != 0) { | 1813 | if (rdp->qlen_lazy != 0) { |
1860 | atomic_inc(&oom_callback_count); | 1814 | atomic_inc(&oom_callback_count); |
1861 | rsp->call(&rdp->oom_head, rcu_oom_callback); | 1815 | rsp->call(&rdp->oom_head, rcu_oom_callback); |
@@ -1997,7 +1951,7 @@ static void increment_cpu_stall_ticks(void) | |||
1997 | struct rcu_state *rsp; | 1951 | struct rcu_state *rsp; |
1998 | 1952 | ||
1999 | for_each_rcu_flavor(rsp) | 1953 | for_each_rcu_flavor(rsp) |
2000 | __this_cpu_ptr(rsp->rda)->ticks_this_gp++; | 1954 | raw_cpu_inc(rsp->rda->ticks_this_gp); |
2001 | } | 1955 | } |
2002 | 1956 | ||
2003 | #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | 1957 | #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ |
@@ -2068,19 +2022,6 @@ static int __init parse_rcu_nocb_poll(char *arg) | |||
2068 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); | 2022 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); |
2069 | 2023 | ||
2070 | /* | 2024 | /* |
2071 | * Do any no-CBs CPUs need another grace period? | ||
2072 | * | ||
2073 | * Interrupts must be disabled. If the caller does not hold the root | ||
2074 | * rnp_node structure's ->lock, the results are advisory only. | ||
2075 | */ | ||
2076 | static int rcu_nocb_needs_gp(struct rcu_state *rsp) | ||
2077 | { | ||
2078 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
2079 | |||
2080 | return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; | ||
2081 | } | ||
2082 | |||
2083 | /* | ||
2084 | * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended | 2025 | * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended |
2085 | * grace period. | 2026 | * grace period. |
2086 | */ | 2027 | */ |
@@ -2109,7 +2050,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) | |||
2109 | } | 2050 | } |
2110 | 2051 | ||
2111 | #ifndef CONFIG_RCU_NOCB_CPU_ALL | 2052 | #ifndef CONFIG_RCU_NOCB_CPU_ALL |
2112 | /* Is the specified CPU a no-CPUs CPU? */ | 2053 | /* Is the specified CPU a no-CBs CPU? */ |
2113 | bool rcu_is_nocb_cpu(int cpu) | 2054 | bool rcu_is_nocb_cpu(int cpu) |
2114 | { | 2055 | { |
2115 | if (have_rcu_nocb_mask) | 2056 | if (have_rcu_nocb_mask) |
@@ -2243,12 +2184,15 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
2243 | unsigned long c; | 2184 | unsigned long c; |
2244 | bool d; | 2185 | bool d; |
2245 | unsigned long flags; | 2186 | unsigned long flags; |
2187 | bool needwake; | ||
2246 | struct rcu_node *rnp = rdp->mynode; | 2188 | struct rcu_node *rnp = rdp->mynode; |
2247 | 2189 | ||
2248 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2190 | raw_spin_lock_irqsave(&rnp->lock, flags); |
2249 | smp_mb__after_unlock_lock(); | 2191 | smp_mb__after_unlock_lock(); |
2250 | c = rcu_start_future_gp(rnp, rdp); | 2192 | needwake = rcu_start_future_gp(rnp, rdp, &c); |
2251 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2193 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
2194 | if (needwake) | ||
2195 | rcu_gp_kthread_wake(rdp->rsp); | ||
2252 | 2196 | ||
2253 | /* | 2197 | /* |
2254 | * Wait for the grace period. Do so interruptibly to avoid messing | 2198 | * Wait for the grace period. Do so interruptibly to avoid messing |
@@ -2402,11 +2346,6 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) | |||
2402 | 2346 | ||
2403 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | 2347 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ |
2404 | 2348 | ||
2405 | static int rcu_nocb_needs_gp(struct rcu_state *rsp) | ||
2406 | { | ||
2407 | return 0; | ||
2408 | } | ||
2409 | |||
2410 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | 2349 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) |
2411 | { | 2350 | { |
2412 | } | 2351 | } |
@@ -2523,9 +2462,9 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | |||
2523 | /* Record start of fully idle period. */ | 2462 | /* Record start of fully idle period. */ |
2524 | j = jiffies; | 2463 | j = jiffies; |
2525 | ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; | 2464 | ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; |
2526 | smp_mb__before_atomic_inc(); | 2465 | smp_mb__before_atomic(); |
2527 | atomic_inc(&rdtp->dynticks_idle); | 2466 | atomic_inc(&rdtp->dynticks_idle); |
2528 | smp_mb__after_atomic_inc(); | 2467 | smp_mb__after_atomic(); |
2529 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); | 2468 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); |
2530 | } | 2469 | } |
2531 | 2470 | ||
@@ -2590,9 +2529,9 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | |||
2590 | } | 2529 | } |
2591 | 2530 | ||
2592 | /* Record end of idle period. */ | 2531 | /* Record end of idle period. */ |
2593 | smp_mb__before_atomic_inc(); | 2532 | smp_mb__before_atomic(); |
2594 | atomic_inc(&rdtp->dynticks_idle); | 2533 | atomic_inc(&rdtp->dynticks_idle); |
2595 | smp_mb__after_atomic_inc(); | 2534 | smp_mb__after_atomic(); |
2596 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); | 2535 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); |
2597 | 2536 | ||
2598 | /* | 2537 | /* |
@@ -2657,20 +2596,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp) | |||
2657 | } | 2596 | } |
2658 | 2597 | ||
2659 | /* | 2598 | /* |
2660 | * Bind the grace-period kthread for the sysidle flavor of RCU to the | ||
2661 | * timekeeping CPU. | ||
2662 | */ | ||
2663 | static void rcu_bind_gp_kthread(void) | ||
2664 | { | ||
2665 | int cpu = ACCESS_ONCE(tick_do_timer_cpu); | ||
2666 | |||
2667 | if (cpu < 0 || cpu >= nr_cpu_ids) | ||
2668 | return; | ||
2669 | if (raw_smp_processor_id() != cpu) | ||
2670 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
2671 | } | ||
2672 | |||
2673 | /* | ||
2674 | * Return a delay in jiffies based on the number of CPUs, rcu_node | 2599 | * Return a delay in jiffies based on the number of CPUs, rcu_node |
2675 | * leaf fanout, and jiffies tick rate. The idea is to allow larger | 2600 | * leaf fanout, and jiffies tick rate. The idea is to allow larger |
2676 | * systems more time to transition to full-idle state in order to | 2601 | * systems more time to transition to full-idle state in order to |
@@ -2734,7 +2659,8 @@ static void rcu_sysidle(unsigned long j) | |||
2734 | static void rcu_sysidle_cancel(void) | 2659 | static void rcu_sysidle_cancel(void) |
2735 | { | 2660 | { |
2736 | smp_mb(); | 2661 | smp_mb(); |
2737 | ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; | 2662 | if (full_sysidle_state > RCU_SYSIDLE_SHORT) |
2663 | ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; | ||
2738 | } | 2664 | } |
2739 | 2665 | ||
2740 | /* | 2666 | /* |
@@ -2880,10 +2806,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp) | |||
2880 | return false; | 2806 | return false; |
2881 | } | 2807 | } |
2882 | 2808 | ||
2883 | static void rcu_bind_gp_kthread(void) | ||
2884 | { | ||
2885 | } | ||
2886 | |||
2887 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | 2809 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, |
2888 | unsigned long maxj) | 2810 | unsigned long maxj) |
2889 | { | 2811 | { |
@@ -2914,3 +2836,19 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) | |||
2914 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ | 2836 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ |
2915 | return 0; | 2837 | return 0; |
2916 | } | 2838 | } |
2839 | |||
2840 | /* | ||
2841 | * Bind the grace-period kthread for the sysidle flavor of RCU to the | ||
2842 | * timekeeping CPU. | ||
2843 | */ | ||
2844 | static void rcu_bind_gp_kthread(void) | ||
2845 | { | ||
2846 | #ifdef CONFIG_NO_HZ_FULL | ||
2847 | int cpu = ACCESS_ONCE(tick_do_timer_cpu); | ||
2848 | |||
2849 | if (cpu < 0 || cpu >= nr_cpu_ids) | ||
2850 | return; | ||
2851 | if (raw_smp_processor_id() != cpu) | ||
2852 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
2853 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ | ||
2854 | } | ||
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4c0a9b0af469..a2aeb4df0f60 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
@@ -320,6 +320,18 @@ int rcu_jiffies_till_stall_check(void) | |||
320 | return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; | 320 | return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; |
321 | } | 321 | } |
322 | 322 | ||
323 | void rcu_sysrq_start(void) | ||
324 | { | ||
325 | if (!rcu_cpu_stall_suppress) | ||
326 | rcu_cpu_stall_suppress = 2; | ||
327 | } | ||
328 | |||
329 | void rcu_sysrq_end(void) | ||
330 | { | ||
331 | if (rcu_cpu_stall_suppress == 2) | ||
332 | rcu_cpu_stall_suppress = 0; | ||
333 | } | ||
334 | |||
323 | static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) | 335 | static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) |
324 | { | 336 | { |
325 | rcu_cpu_stall_suppress = 1; | 337 | rcu_cpu_stall_suppress = 1; |
@@ -338,3 +350,21 @@ static int __init check_cpu_stall_init(void) | |||
338 | early_initcall(check_cpu_stall_init); | 350 | early_initcall(check_cpu_stall_init); |
339 | 351 | ||
340 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ | 352 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ |
353 | |||
354 | /* | ||
355 | * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings. | ||
356 | */ | ||
357 | |||
358 | DEFINE_PER_CPU(int, rcu_cond_resched_count); | ||
359 | |||
360 | /* | ||
361 | * Report a set of RCU quiescent states, for use by cond_resched() | ||
362 | * and friends. Out of line due to being called infrequently. | ||
363 | */ | ||
364 | void rcu_resched(void) | ||
365 | { | ||
366 | preempt_disable(); | ||
367 | __this_cpu_write(rcu_cond_resched_count, 0); | ||
368 | rcu_note_context_switch(smp_processor_id()); | ||
369 | preempt_enable(); | ||
370 | } | ||
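rcu_resched() above is only the slow-path helper; the intended entry points remain cond_resched() and friends, which (as the sched/core.c hunks below show) now also give RCU a chance to note a quiescent state on long kernel code paths. A minimal sketch of the kind of loop this targets; struct item and process_one() are hypothetical.

	static void example_process_many(struct item *items, int n)
	{
		int i;

		for (i = 0; i < n; i++) {
			process_one(&items[i]);		/* hypothetical per-item work */
			cond_resched();			/* may now report an RCU QS too */
		}
	}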
diff --git a/kernel/reboot.c b/kernel/reboot.c index 662c83fc16b7..a3a9e240fcdb 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c | |||
@@ -388,15 +388,22 @@ static int __init reboot_setup(char *str) | |||
388 | break; | 388 | break; |
389 | 389 | ||
390 | case 's': | 390 | case 's': |
391 | if (isdigit(*(str+1))) | 391 | { |
392 | reboot_cpu = simple_strtoul(str+1, NULL, 0); | 392 | int rc; |
393 | else if (str[1] == 'm' && str[2] == 'p' && | 393 | |
394 | isdigit(*(str+3))) | 394 | if (isdigit(*(str+1))) { |
395 | reboot_cpu = simple_strtoul(str+3, NULL, 0); | 395 | rc = kstrtoint(str+1, 0, &reboot_cpu); |
396 | else | 396 | if (rc) |
397 | return rc; | ||
398 | } else if (str[1] == 'm' && str[2] == 'p' && | ||
399 | isdigit(*(str+3))) { | ||
400 | rc = kstrtoint(str+3, 0, &reboot_cpu); | ||
401 | if (rc) | ||
402 | return rc; | ||
403 | } else | ||
397 | reboot_mode = REBOOT_SOFT; | 404 | reboot_mode = REBOOT_SOFT; |
398 | break; | 405 | break; |
399 | |||
406 | } | ||
400 | case 'g': | 407 | case 'g': |
401 | reboot_mode = REBOOT_GPIO; | 408 | reboot_mode = REBOOT_GPIO; |
402 | break; | 409 | break; |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 51dbac6a3633..e791130f85a7 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
@@ -186,8 +186,11 @@ int res_counter_memparse_write_strategy(const char *buf, | |||
186 | 186 | ||
187 | /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ | 187 | /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ |
188 | if (*buf == '-') { | 188 | if (*buf == '-') { |
189 | res = simple_strtoull(buf + 1, &end, 10); | 189 | int rc = kstrtoull(buf + 1, 10, &res); |
190 | if (res != 1 || *end != '\0') | 190 | |
191 | if (rc) | ||
192 | return rc; | ||
193 | if (res != 1) | ||
191 | return -EINVAL; | 194 | return -EINVAL; |
192 | *resp = RES_COUNTER_MAX; | 195 | *resp = RES_COUNTER_MAX; |
193 | return 0; | 196 | return 0; |
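Both the reboot.c and res_counter.c hunks above switch from simple_strtoul()/simple_strtoull() to the checked kstrto*() helpers, which return 0 on success or a negative errno and refuse trailing junk instead of silently stopping at it. A minimal sketch of the calling convention; example_parse_cpu() and its extra range check are illustrative, not from the patch.

	static int example_parse_cpu(const char *s, int *cpu_out)
	{
		int rc = kstrtoint(s, 0, cpu_out);	/* base 0: accepts 0x/0 prefixes */

		if (rc)
			return rc;		/* -EINVAL or -ERANGE from kstrtoint() */
		if (*cpu_out < 0)
			return -EINVAL;		/* caller-specific sanity check */
		return 0;
	}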
diff --git a/kernel/resource.c b/kernel/resource.c index 8957d686e29b..3c2237ac32db 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -1288,13 +1288,10 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) | |||
1288 | if (p->flags & IORESOURCE_BUSY) | 1288 | if (p->flags & IORESOURCE_BUSY) |
1289 | continue; | 1289 | continue; |
1290 | 1290 | ||
1291 | printk(KERN_WARNING "resource map sanity check conflict: " | 1291 | printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n", |
1292 | "0x%llx 0x%llx 0x%llx 0x%llx %s\n", | ||
1293 | (unsigned long long)addr, | 1292 | (unsigned long long)addr, |
1294 | (unsigned long long)(addr + size - 1), | 1293 | (unsigned long long)(addr + size - 1), |
1295 | (unsigned long long)p->start, | 1294 | p->name, p); |
1296 | (unsigned long long)p->end, | ||
1297 | p->name); | ||
1298 | err = -1; | 1295 | err = -1; |
1299 | break; | 1296 | break; |
1300 | } | 1297 | } |
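The reworked warning above leans on the %pR printk extension, which formats a struct resource as its range plus decoded type, so the hand-rolled 0x%llx formatting can go away. A small stand-alone sketch of the specifier; the resource values are made up.

	static void example_print_resource(void)
	{
		struct resource r = {
			.start	= 0xfed00000,
			.end	= 0xfed003ff,
			.name	= "example",
			.flags	= IORESOURCE_MEM,
		};

		pr_info("claimed %pR\n", &r);	/* prints something like "[mem 0xfed00000-0xfed003ff]" */
	}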
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 268a45ea238c..3bdf01b494fe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -90,6 +90,22 @@ | |||
90 | #define CREATE_TRACE_POINTS | 90 | #define CREATE_TRACE_POINTS |
91 | #include <trace/events/sched.h> | 91 | #include <trace/events/sched.h> |
92 | 92 | ||
93 | #ifdef smp_mb__before_atomic | ||
94 | void __smp_mb__before_atomic(void) | ||
95 | { | ||
96 | smp_mb__before_atomic(); | ||
97 | } | ||
98 | EXPORT_SYMBOL(__smp_mb__before_atomic); | ||
99 | #endif | ||
100 | |||
101 | #ifdef smp_mb__after_atomic | ||
102 | void __smp_mb__after_atomic(void) | ||
103 | { | ||
104 | smp_mb__after_atomic(); | ||
105 | } | ||
106 | EXPORT_SYMBOL(__smp_mb__after_atomic); | ||
107 | #endif | ||
108 | |||
93 | void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) | 109 | void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) |
94 | { | 110 | { |
95 | unsigned long delta; | 111 | unsigned long delta; |
@@ -506,6 +522,71 @@ static inline void init_hrtick(void) | |||
506 | #endif /* CONFIG_SCHED_HRTICK */ | 522 | #endif /* CONFIG_SCHED_HRTICK */ |
507 | 523 | ||
508 | /* | 524 | /* |
525 | * cmpxchg based fetch_or, macro so it works for different integer types | ||
526 | */ | ||
527 | #define fetch_or(ptr, val) \ | ||
528 | ({ typeof(*(ptr)) __old, __val = *(ptr); \ | ||
529 | for (;;) { \ | ||
530 | __old = cmpxchg((ptr), __val, __val | (val)); \ | ||
531 | if (__old == __val) \ | ||
532 | break; \ | ||
533 | __val = __old; \ | ||
534 | } \ | ||
535 | __old; \ | ||
536 | }) | ||
537 | |||
538 | #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) | ||
539 | /* | ||
540 | * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, | ||
541 | * this avoids any races wrt polling state changes and thereby avoids | ||
542 | * spurious IPIs. | ||
543 | */ | ||
544 | static bool set_nr_and_not_polling(struct task_struct *p) | ||
545 | { | ||
546 | struct thread_info *ti = task_thread_info(p); | ||
547 | return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); | ||
548 | } | ||
549 | |||
550 | /* | ||
551 | * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. | ||
552 | * | ||
553 | * If this returns true, then the idle task promises to call | ||
554 | * sched_ttwu_pending() and reschedule soon. | ||
555 | */ | ||
556 | static bool set_nr_if_polling(struct task_struct *p) | ||
557 | { | ||
558 | struct thread_info *ti = task_thread_info(p); | ||
559 | typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); | ||
560 | |||
561 | for (;;) { | ||
562 | if (!(val & _TIF_POLLING_NRFLAG)) | ||
563 | return false; | ||
564 | if (val & _TIF_NEED_RESCHED) | ||
565 | return true; | ||
566 | old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); | ||
567 | if (old == val) | ||
568 | break; | ||
569 | val = old; | ||
570 | } | ||
571 | return true; | ||
572 | } | ||
573 | |||
574 | #else | ||
575 | static bool set_nr_and_not_polling(struct task_struct *p) | ||
576 | { | ||
577 | set_tsk_need_resched(p); | ||
578 | return true; | ||
579 | } | ||
580 | |||
581 | #ifdef CONFIG_SMP | ||
582 | static bool set_nr_if_polling(struct task_struct *p) | ||
583 | { | ||
584 | return false; | ||
585 | } | ||
586 | #endif | ||
587 | #endif | ||
588 | |||
589 | /* | ||
509 | * resched_task - mark a task 'to be rescheduled now'. | 590 | * resched_task - mark a task 'to be rescheduled now'. |
510 | * | 591 | * |
511 | * On UP this means the setting of the need_resched flag, on SMP it | 592 | * On UP this means the setting of the need_resched flag, on SMP it |
@@ -521,18 +602,18 @@ void resched_task(struct task_struct *p) | |||
521 | if (test_tsk_need_resched(p)) | 602 | if (test_tsk_need_resched(p)) |
522 | return; | 603 | return; |
523 | 604 | ||
524 | set_tsk_need_resched(p); | ||
525 | |||
526 | cpu = task_cpu(p); | 605 | cpu = task_cpu(p); |
606 | |||
527 | if (cpu == smp_processor_id()) { | 607 | if (cpu == smp_processor_id()) { |
608 | set_tsk_need_resched(p); | ||
528 | set_preempt_need_resched(); | 609 | set_preempt_need_resched(); |
529 | return; | 610 | return; |
530 | } | 611 | } |
531 | 612 | ||
532 | /* NEED_RESCHED must be visible before we test polling */ | 613 | if (set_nr_and_not_polling(p)) |
533 | smp_mb(); | ||
534 | if (!tsk_is_polling(p)) | ||
535 | smp_send_reschedule(cpu); | 614 | smp_send_reschedule(cpu); |
615 | else | ||
616 | trace_sched_wake_idle_without_ipi(cpu); | ||
536 | } | 617 | } |
537 | 618 | ||
538 | void resched_cpu(int cpu) | 619 | void resched_cpu(int cpu) |
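The point of fetch_or() and set_nr_and_not_polling() above is to fold "set TIF_NEED_RESCHED" and "was the target polling?" into a single atomic read-modify-write, which is why resched_task() can drop its separate smp_mb() plus tsk_is_polling() test. The same idea in portable C11 atomics, as a stand-alone sketch; NEED_RESCHED and POLLING are stand-ins for the _TIF_* bits.

	#include <stdatomic.h>
	#include <stdbool.h>

	#define NEED_RESCHED	(1u << 0)
	#define POLLING		(1u << 1)

	static bool set_need_resched_and_not_polling(atomic_uint *flags)
	{
		/* One RMW both sets the flag and reports the prior value. */
		unsigned int old = atomic_fetch_or(flags, NEED_RESCHED);

		return !(old & POLLING);	/* true: not polling, an IPI is needed */
	}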
@@ -595,27 +676,10 @@ static void wake_up_idle_cpu(int cpu) | |||
595 | if (cpu == smp_processor_id()) | 676 | if (cpu == smp_processor_id()) |
596 | return; | 677 | return; |
597 | 678 | ||
598 | /* | 679 | if (set_nr_and_not_polling(rq->idle)) |
599 | * This is safe, as this function is called with the timer | ||
600 | * wheel base lock of (cpu) held. When the CPU is on the way | ||
601 | * to idle and has not yet set rq->curr to idle then it will | ||
602 | * be serialized on the timer wheel base lock and take the new | ||
603 | * timer into account automatically. | ||
604 | */ | ||
605 | if (rq->curr != rq->idle) | ||
606 | return; | ||
607 | |||
608 | /* | ||
609 | * We can set TIF_RESCHED on the idle task of the other CPU | ||
610 | * lockless. The worst case is that the other CPU runs the | ||
611 | * idle task through an additional NOOP schedule() | ||
612 | */ | ||
613 | set_tsk_need_resched(rq->idle); | ||
614 | |||
615 | /* NEED_RESCHED must be visible before we test polling */ | ||
616 | smp_mb(); | ||
617 | if (!tsk_is_polling(rq->idle)) | ||
618 | smp_send_reschedule(cpu); | 680 | smp_send_reschedule(cpu); |
681 | else | ||
682 | trace_sched_wake_idle_without_ipi(cpu); | ||
619 | } | 683 | } |
620 | 684 | ||
621 | static bool wake_up_full_nohz_cpu(int cpu) | 685 | static bool wake_up_full_nohz_cpu(int cpu) |
@@ -841,7 +905,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
841 | rq->clock_task += delta; | 905 | rq->clock_task += delta; |
842 | 906 | ||
843 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | 907 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) |
844 | if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) | 908 | if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) |
845 | sched_rt_avg_update(rq, irq_delta + steal); | 909 | sched_rt_avg_update(rq, irq_delta + steal); |
846 | #endif | 910 | #endif |
847 | } | 911 | } |
@@ -1320,7 +1384,7 @@ out: | |||
1320 | * leave kernel. | 1384 | * leave kernel. |
1321 | */ | 1385 | */ |
1322 | if (p->mm && printk_ratelimit()) { | 1386 | if (p->mm && printk_ratelimit()) { |
1323 | printk_sched("process %d (%s) no longer affine to cpu%d\n", | 1387 | printk_deferred("process %d (%s) no longer affine to cpu%d\n", |
1324 | task_pid_nr(p), p->comm, cpu); | 1388 | task_pid_nr(p), p->comm, cpu); |
1325 | } | 1389 | } |
1326 | } | 1390 | } |
@@ -1474,13 +1538,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) | |||
1474 | } | 1538 | } |
1475 | 1539 | ||
1476 | #ifdef CONFIG_SMP | 1540 | #ifdef CONFIG_SMP |
1477 | static void sched_ttwu_pending(void) | 1541 | void sched_ttwu_pending(void) |
1478 | { | 1542 | { |
1479 | struct rq *rq = this_rq(); | 1543 | struct rq *rq = this_rq(); |
1480 | struct llist_node *llist = llist_del_all(&rq->wake_list); | 1544 | struct llist_node *llist = llist_del_all(&rq->wake_list); |
1481 | struct task_struct *p; | 1545 | struct task_struct *p; |
1546 | unsigned long flags; | ||
1482 | 1547 | ||
1483 | raw_spin_lock(&rq->lock); | 1548 | if (!llist) |
1549 | return; | ||
1550 | |||
1551 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
1484 | 1552 | ||
1485 | while (llist) { | 1553 | while (llist) { |
1486 | p = llist_entry(llist, struct task_struct, wake_entry); | 1554 | p = llist_entry(llist, struct task_struct, wake_entry); |
@@ -1488,7 +1556,7 @@ static void sched_ttwu_pending(void) | |||
1488 | ttwu_do_activate(rq, p, 0); | 1556 | ttwu_do_activate(rq, p, 0); |
1489 | } | 1557 | } |
1490 | 1558 | ||
1491 | raw_spin_unlock(&rq->lock); | 1559 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
1492 | } | 1560 | } |
1493 | 1561 | ||
1494 | void scheduler_ipi(void) | 1562 | void scheduler_ipi(void) |
@@ -1534,8 +1602,14 @@ void scheduler_ipi(void) | |||
1534 | 1602 | ||
1535 | static void ttwu_queue_remote(struct task_struct *p, int cpu) | 1603 | static void ttwu_queue_remote(struct task_struct *p, int cpu) |
1536 | { | 1604 | { |
1537 | if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) | 1605 | struct rq *rq = cpu_rq(cpu); |
1538 | smp_send_reschedule(cpu); | 1606 | |
1607 | if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { | ||
1608 | if (!set_nr_if_polling(rq->idle)) | ||
1609 | smp_send_reschedule(cpu); | ||
1610 | else | ||
1611 | trace_sched_wake_idle_without_ipi(cpu); | ||
1612 | } | ||
1539 | } | 1613 | } |
1540 | 1614 | ||
1541 | bool cpus_share_cache(int this_cpu, int that_cpu) | 1615 | bool cpus_share_cache(int this_cpu, int that_cpu) |
@@ -2192,7 +2266,7 @@ static inline void post_schedule(struct rq *rq) | |||
2192 | * schedule_tail - first thing a freshly forked thread must call. | 2266 | * schedule_tail - first thing a freshly forked thread must call. |
2193 | * @prev: the thread we just switched away from. | 2267 | * @prev: the thread we just switched away from. |
2194 | */ | 2268 | */ |
2195 | asmlinkage void schedule_tail(struct task_struct *prev) | 2269 | asmlinkage __visible void schedule_tail(struct task_struct *prev) |
2196 | __releases(rq->lock) | 2270 | __releases(rq->lock) |
2197 | { | 2271 | { |
2198 | struct rq *rq = this_rq(); | 2272 | struct rq *rq = this_rq(); |
@@ -2480,7 +2554,7 @@ notrace unsigned long get_parent_ip(unsigned long addr) | |||
2480 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ | 2554 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
2481 | defined(CONFIG_PREEMPT_TRACER)) | 2555 | defined(CONFIG_PREEMPT_TRACER)) |
2482 | 2556 | ||
2483 | void __kprobes preempt_count_add(int val) | 2557 | void preempt_count_add(int val) |
2484 | { | 2558 | { |
2485 | #ifdef CONFIG_DEBUG_PREEMPT | 2559 | #ifdef CONFIG_DEBUG_PREEMPT |
2486 | /* | 2560 | /* |
@@ -2506,8 +2580,9 @@ void __kprobes preempt_count_add(int val) | |||
2506 | } | 2580 | } |
2507 | } | 2581 | } |
2508 | EXPORT_SYMBOL(preempt_count_add); | 2582 | EXPORT_SYMBOL(preempt_count_add); |
2583 | NOKPROBE_SYMBOL(preempt_count_add); | ||
2509 | 2584 | ||
2510 | void __kprobes preempt_count_sub(int val) | 2585 | void preempt_count_sub(int val) |
2511 | { | 2586 | { |
2512 | #ifdef CONFIG_DEBUG_PREEMPT | 2587 | #ifdef CONFIG_DEBUG_PREEMPT |
2513 | /* | 2588 | /* |
@@ -2528,6 +2603,7 @@ void __kprobes preempt_count_sub(int val) | |||
2528 | __preempt_count_sub(val); | 2603 | __preempt_count_sub(val); |
2529 | } | 2604 | } |
2530 | EXPORT_SYMBOL(preempt_count_sub); | 2605 | EXPORT_SYMBOL(preempt_count_sub); |
2606 | NOKPROBE_SYMBOL(preempt_count_sub); | ||
2531 | 2607 | ||
2532 | #endif | 2608 | #endif |
2533 | 2609 | ||
@@ -2592,8 +2668,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev) | |||
2592 | if (likely(prev->sched_class == class && | 2668 | if (likely(prev->sched_class == class && |
2593 | rq->nr_running == rq->cfs.h_nr_running)) { | 2669 | rq->nr_running == rq->cfs.h_nr_running)) { |
2594 | p = fair_sched_class.pick_next_task(rq, prev); | 2670 | p = fair_sched_class.pick_next_task(rq, prev); |
2595 | if (likely(p && p != RETRY_TASK)) | 2671 | if (unlikely(p == RETRY_TASK)) |
2596 | return p; | 2672 | goto again; |
2673 | |||
2674 | /* assumes fair_sched_class->next == idle_sched_class */ | ||
2675 | if (unlikely(!p)) | ||
2676 | p = idle_sched_class.pick_next_task(rq, prev); | ||
2677 | |||
2678 | return p; | ||
2597 | } | 2679 | } |
2598 | 2680 | ||
2599 | again: | 2681 | again: |
@@ -2741,7 +2823,7 @@ static inline void sched_submit_work(struct task_struct *tsk) | |||
2741 | blk_schedule_flush_plug(tsk); | 2823 | blk_schedule_flush_plug(tsk); |
2742 | } | 2824 | } |
2743 | 2825 | ||
2744 | asmlinkage void __sched schedule(void) | 2826 | asmlinkage __visible void __sched schedule(void) |
2745 | { | 2827 | { |
2746 | struct task_struct *tsk = current; | 2828 | struct task_struct *tsk = current; |
2747 | 2829 | ||
@@ -2751,7 +2833,7 @@ asmlinkage void __sched schedule(void) | |||
2751 | EXPORT_SYMBOL(schedule); | 2833 | EXPORT_SYMBOL(schedule); |
2752 | 2834 | ||
2753 | #ifdef CONFIG_CONTEXT_TRACKING | 2835 | #ifdef CONFIG_CONTEXT_TRACKING |
2754 | asmlinkage void __sched schedule_user(void) | 2836 | asmlinkage __visible void __sched schedule_user(void) |
2755 | { | 2837 | { |
2756 | /* | 2838 | /* |
2757 | * If we come here after a random call to set_need_resched(), | 2839 | * If we come here after a random call to set_need_resched(), |
@@ -2783,7 +2865,7 @@ void __sched schedule_preempt_disabled(void) | |||
2783 | * off of preempt_enable. Kernel preemptions off return from interrupt | 2865 | * off of preempt_enable. Kernel preemptions off return from interrupt |
2784 | * occur there and call schedule directly. | 2866 | * occur there and call schedule directly. |
2785 | */ | 2867 | */ |
2786 | asmlinkage void __sched notrace preempt_schedule(void) | 2868 | asmlinkage __visible void __sched notrace preempt_schedule(void) |
2787 | { | 2869 | { |
2788 | /* | 2870 | /* |
2789 | * If there is a non-zero preempt_count or interrupts are disabled, | 2871 | * If there is a non-zero preempt_count or interrupts are disabled, |
@@ -2804,6 +2886,7 @@ asmlinkage void __sched notrace preempt_schedule(void) | |||
2804 | barrier(); | 2886 | barrier(); |
2805 | } while (need_resched()); | 2887 | } while (need_resched()); |
2806 | } | 2888 | } |
2889 | NOKPROBE_SYMBOL(preempt_schedule); | ||
2807 | EXPORT_SYMBOL(preempt_schedule); | 2890 | EXPORT_SYMBOL(preempt_schedule); |
2808 | #endif /* CONFIG_PREEMPT */ | 2891 | #endif /* CONFIG_PREEMPT */ |
2809 | 2892 | ||
@@ -2813,7 +2896,7 @@ EXPORT_SYMBOL(preempt_schedule); | |||
2813 | * Note, that this is called and return with irqs disabled. This will | 2896 | * Note, that this is called and return with irqs disabled. This will |
2814 | * protect us against recursive calling from irq. | 2897 | * protect us against recursive calling from irq. |
2815 | */ | 2898 | */ |
2816 | asmlinkage void __sched preempt_schedule_irq(void) | 2899 | asmlinkage __visible void __sched preempt_schedule_irq(void) |
2817 | { | 2900 | { |
2818 | enum ctx_state prev_state; | 2901 | enum ctx_state prev_state; |
2819 | 2902 | ||
@@ -2996,7 +3079,7 @@ EXPORT_SYMBOL(set_user_nice); | |||
2996 | int can_nice(const struct task_struct *p, const int nice) | 3079 | int can_nice(const struct task_struct *p, const int nice) |
2997 | { | 3080 | { |
2998 | /* convert nice value [19,-20] to rlimit style value [1,40] */ | 3081 | /* convert nice value [19,-20] to rlimit style value [1,40] */ |
2999 | int nice_rlim = 20 - nice; | 3082 | int nice_rlim = nice_to_rlimit(nice); |
3000 | 3083 | ||
3001 | return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || | 3084 | return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || |
3002 | capable(CAP_SYS_NICE)); | 3085 | capable(CAP_SYS_NICE)); |
@@ -3020,17 +3103,10 @@ SYSCALL_DEFINE1(nice, int, increment) | |||
3020 | * We don't have to worry. Conceptually one call occurs first | 3103 | * We don't have to worry. Conceptually one call occurs first |
3021 | * and we have a single winner. | 3104 | * and we have a single winner. |
3022 | */ | 3105 | */ |
3023 | if (increment < -40) | 3106 | increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); |
3024 | increment = -40; | ||
3025 | if (increment > 40) | ||
3026 | increment = 40; | ||
3027 | |||
3028 | nice = task_nice(current) + increment; | 3107 | nice = task_nice(current) + increment; |
3029 | if (nice < MIN_NICE) | ||
3030 | nice = MIN_NICE; | ||
3031 | if (nice > MAX_NICE) | ||
3032 | nice = MAX_NICE; | ||
3033 | 3108 | ||
3109 | nice = clamp_val(nice, MIN_NICE, MAX_NICE); | ||
3034 | if (increment < 0 && !can_nice(current, nice)) | 3110 | if (increment < 0 && !can_nice(current, nice)) |
3035 | return -EPERM; | 3111 | return -EPERM; |
3036 | 3112 | ||
@@ -3124,6 +3200,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) | |||
3124 | dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); | 3200 | dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); |
3125 | dl_se->dl_throttled = 0; | 3201 | dl_se->dl_throttled = 0; |
3126 | dl_se->dl_new = 1; | 3202 | dl_se->dl_new = 1; |
3203 | dl_se->dl_yielded = 0; | ||
3127 | } | 3204 | } |
3128 | 3205 | ||
3129 | static void __setscheduler_params(struct task_struct *p, | 3206 | static void __setscheduler_params(struct task_struct *p, |
@@ -3188,17 +3265,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr) | |||
3188 | * We ask for the deadline not being zero, and greater or equal | 3265 | * We ask for the deadline not being zero, and greater or equal |
3189 | * than the runtime, as well as the period of being zero or | 3266 | * than the runtime, as well as the period of being zero or |
3190 | * greater than deadline. Furthermore, we have to be sure that | 3267 | * greater than deadline. Furthermore, we have to be sure that |
3191 | * user parameters are above the internal resolution (1us); we | 3268 | * user parameters are above the internal resolution of 1us (we |
3192 | * check sched_runtime only since it is always the smaller one. | 3269 | * check sched_runtime only since it is always the smaller one) and |
3270 | * below 2^63 ns (we have to check both sched_deadline and | ||
3271 | * sched_period, as the latter can be zero). | ||
3193 | */ | 3272 | */ |
3194 | static bool | 3273 | static bool |
3195 | __checkparam_dl(const struct sched_attr *attr) | 3274 | __checkparam_dl(const struct sched_attr *attr) |
3196 | { | 3275 | { |
3197 | return attr && attr->sched_deadline != 0 && | 3276 | /* deadline != 0 */ |
3198 | (attr->sched_period == 0 || | 3277 | if (attr->sched_deadline == 0) |
3199 | (s64)(attr->sched_period - attr->sched_deadline) >= 0) && | 3278 | return false; |
3200 | (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && | 3279 | |
3201 | attr->sched_runtime >= (2 << (DL_SCALE - 1)); | 3280 | /* |
3281 | * Since we truncate DL_SCALE bits, make sure we're at least | ||
3282 | * that big. | ||
3283 | */ | ||
3284 | if (attr->sched_runtime < (1ULL << DL_SCALE)) | ||
3285 | return false; | ||
3286 | |||
3287 | /* | ||
3288 | * Since we use the MSB for wrap-around and sign issues, make | ||
3289 | * sure it's not set (mind that period can be equal to zero). | ||
3290 | */ | ||
3291 | if (attr->sched_deadline & (1ULL << 63) || | ||
3292 | attr->sched_period & (1ULL << 63)) | ||
3293 | return false; | ||
3294 | |||
3295 | /* runtime <= deadline <= period (if period != 0) */ | ||
3296 | if ((attr->sched_period != 0 && | ||
3297 | attr->sched_period < attr->sched_deadline) || | ||
3298 | attr->sched_deadline < attr->sched_runtime) | ||
3299 | return false; | ||
3300 | |||
3301 | return true; | ||
3202 | } | 3302 | } |
3203 | 3303 | ||
3204 | /* | 3304 | /* |
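For reference, here is a parameter set that passes every check in the rewritten __checkparam_dl() above: runtime <= deadline <= period, no value with bit 63 set, and a runtime well above the roughly 1us internal resolution. The userspace sketch defines its own copy of struct sched_attr and uses the raw syscall, since glibc of this era has no wrapper; SCHED_DEADLINE and __NR_sched_setattr are assumed to come from the installed kernel headers.

	#define _GNU_SOURCE
	#include <linux/sched.h>	/* SCHED_DEADLINE */
	#include <stdint.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	struct sched_attr_example {		/* mirrors the kernel's struct sched_attr */
		uint32_t size;
		uint32_t sched_policy;
		uint64_t sched_flags;
		int32_t  sched_nice;
		uint32_t sched_priority;
		uint64_t sched_runtime;		/* times below are in nanoseconds */
		uint64_t sched_deadline;
		uint64_t sched_period;
	};

	int main(void)
	{
		struct sched_attr_example attr = {
			.size		= sizeof(attr),
			.sched_policy	= SCHED_DEADLINE,
			.sched_runtime	= 10 * 1000 * 1000,	/*  10 ms */
			.sched_deadline	= 30 * 1000 * 1000,	/*  30 ms */
			.sched_period	= 100 * 1000 * 1000,	/* 100 ms */
		};

		return syscall(__NR_sched_setattr, 0 /* this task */, &attr, 0 /* flags */);
	}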
@@ -3596,13 +3696,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr, | |||
3596 | */ | 3696 | */ |
3597 | attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); | 3697 | attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); |
3598 | 3698 | ||
3599 | out: | 3699 | return 0; |
3600 | return ret; | ||
3601 | 3700 | ||
3602 | err_size: | 3701 | err_size: |
3603 | put_user(sizeof(*attr), &uattr->size); | 3702 | put_user(sizeof(*attr), &uattr->size); |
3604 | ret = -E2BIG; | 3703 | return -E2BIG; |
3605 | goto out; | ||
3606 | } | 3704 | } |
3607 | 3705 | ||
3608 | /** | 3706 | /** |
@@ -3639,6 +3737,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) | |||
3639 | * sys_sched_setattr - same as above, but with extended sched_attr | 3737 | * sys_sched_setattr - same as above, but with extended sched_attr |
3640 | * @pid: the pid in question. | 3738 | * @pid: the pid in question. |
3641 | * @uattr: structure containing the extended parameters. | 3739 | * @uattr: structure containing the extended parameters. |
3740 | * @flags: for future extension. | ||
3642 | */ | 3741 | */ |
3643 | SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, | 3742 | SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, |
3644 | unsigned int, flags) | 3743 | unsigned int, flags) |
@@ -3650,8 +3749,12 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, | |||
3650 | if (!uattr || pid < 0 || flags) | 3749 | if (!uattr || pid < 0 || flags) |
3651 | return -EINVAL; | 3750 | return -EINVAL; |
3652 | 3751 | ||
3653 | if (sched_copy_attr(uattr, &attr)) | 3752 | retval = sched_copy_attr(uattr, &attr); |
3654 | return -EFAULT; | 3753 | if (retval) |
3754 | return retval; | ||
3755 | |||
3756 | if ((int)attr.sched_policy < 0) | ||
3757 | return -EINVAL; | ||
3655 | 3758 | ||
3656 | rcu_read_lock(); | 3759 | rcu_read_lock(); |
3657 | retval = -ESRCH; | 3760 | retval = -ESRCH; |
@@ -3701,7 +3804,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | |||
3701 | */ | 3804 | */ |
3702 | SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) | 3805 | SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) |
3703 | { | 3806 | { |
3704 | struct sched_param lp; | 3807 | struct sched_param lp = { .sched_priority = 0 }; |
3705 | struct task_struct *p; | 3808 | struct task_struct *p; |
3706 | int retval; | 3809 | int retval; |
3707 | 3810 | ||
@@ -3718,11 +3821,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) | |||
3718 | if (retval) | 3821 | if (retval) |
3719 | goto out_unlock; | 3822 | goto out_unlock; |
3720 | 3823 | ||
3721 | if (task_has_dl_policy(p)) { | 3824 | if (task_has_rt_policy(p)) |
3722 | retval = -EINVAL; | 3825 | lp.sched_priority = p->rt_priority; |
3723 | goto out_unlock; | ||
3724 | } | ||
3725 | lp.sched_priority = p->rt_priority; | ||
3726 | rcu_read_unlock(); | 3826 | rcu_read_unlock(); |
3727 | 3827 | ||
3728 | /* | 3828 | /* |
@@ -3760,7 +3860,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, | |||
3760 | 3860 | ||
3761 | for (; addr < end; addr++) { | 3861 | for (; addr < end; addr++) { |
3762 | if (*addr) | 3862 | if (*addr) |
3763 | goto err_size; | 3863 | return -EFBIG; |
3764 | } | 3864 | } |
3765 | 3865 | ||
3766 | attr->size = usize; | 3866 | attr->size = usize; |
@@ -3770,12 +3870,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, | |||
3770 | if (ret) | 3870 | if (ret) |
3771 | return -EFAULT; | 3871 | return -EFAULT; |
3772 | 3872 | ||
3773 | out: | 3873 | return 0; |
3774 | return ret; | ||
3775 | |||
3776 | err_size: | ||
3777 | ret = -E2BIG; | ||
3778 | goto out; | ||
3779 | } | 3874 | } |
3780 | 3875 | ||
3781 | /** | 3876 | /** |
@@ -3783,6 +3878,7 @@ err_size: | |||
3783 | * @pid: the pid in question. | 3878 | * @pid: the pid in question. |
3784 | * @uattr: structure containing the extended parameters. | 3879 | * @uattr: structure containing the extended parameters. |
3785 | * @size: sizeof(attr) for fwd/bwd comp. | 3880 | * @size: sizeof(attr) for fwd/bwd comp. |
3881 | * @flags: for future extension. | ||
3786 | */ | 3882 | */ |
3787 | SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, | 3883 | SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, |
3788 | unsigned int, size, unsigned int, flags) | 3884 | unsigned int, size, unsigned int, flags) |
@@ -4051,6 +4147,7 @@ static void __cond_resched(void) | |||
4051 | 4147 | ||
4052 | int __sched _cond_resched(void) | 4148 | int __sched _cond_resched(void) |
4053 | { | 4149 | { |
4150 | rcu_cond_resched(); | ||
4054 | if (should_resched()) { | 4151 | if (should_resched()) { |
4055 | __cond_resched(); | 4152 | __cond_resched(); |
4056 | return 1; | 4153 | return 1; |
@@ -4069,15 +4166,18 @@ EXPORT_SYMBOL(_cond_resched); | |||
4069 | */ | 4166 | */ |
4070 | int __cond_resched_lock(spinlock_t *lock) | 4167 | int __cond_resched_lock(spinlock_t *lock) |
4071 | { | 4168 | { |
4169 | bool need_rcu_resched = rcu_should_resched(); | ||
4072 | int resched = should_resched(); | 4170 | int resched = should_resched(); |
4073 | int ret = 0; | 4171 | int ret = 0; |
4074 | 4172 | ||
4075 | lockdep_assert_held(lock); | 4173 | lockdep_assert_held(lock); |
4076 | 4174 | ||
4077 | if (spin_needbreak(lock) || resched) { | 4175 | if (spin_needbreak(lock) || resched || need_rcu_resched) { |
4078 | spin_unlock(lock); | 4176 | spin_unlock(lock); |
4079 | if (resched) | 4177 | if (resched) |
4080 | __cond_resched(); | 4178 | __cond_resched(); |
4179 | else if (unlikely(need_rcu_resched)) | ||
4180 | rcu_resched(); | ||
4081 | else | 4181 | else |
4082 | cpu_relax(); | 4182 | cpu_relax(); |
4083 | ret = 1; | 4183 | ret = 1; |
@@ -4091,6 +4191,7 @@ int __sched __cond_resched_softirq(void) | |||
4091 | { | 4191 | { |
4092 | BUG_ON(!in_softirq()); | 4192 | BUG_ON(!in_softirq()); |
4093 | 4193 | ||
4194 | rcu_cond_resched(); /* BH disabled OK, just recording QSes. */ | ||
4094 | if (should_resched()) { | 4195 | if (should_resched()) { |
4095 | local_bh_enable(); | 4196 | local_bh_enable(); |
4096 | __cond_resched(); | 4197 | __cond_resched(); |
@@ -4145,7 +4246,7 @@ EXPORT_SYMBOL(yield); | |||
4145 | * false (0) if we failed to boost the target. | 4246 | * false (0) if we failed to boost the target. |
4146 | * -ESRCH if there's no task to yield to. | 4247 | * -ESRCH if there's no task to yield to. |
4147 | */ | 4248 | */ |
4148 | bool __sched yield_to(struct task_struct *p, bool preempt) | 4249 | int __sched yield_to(struct task_struct *p, bool preempt) |
4149 | { | 4250 | { |
4150 | struct task_struct *curr = current; | 4251 | struct task_struct *curr = current; |
4151 | struct rq *rq, *p_rq; | 4252 | struct rq *rq, *p_rq; |
@@ -5039,11 +5140,20 @@ static struct notifier_block migration_notifier = { | |||
5039 | .priority = CPU_PRI_MIGRATION, | 5140 | .priority = CPU_PRI_MIGRATION, |
5040 | }; | 5141 | }; |
5041 | 5142 | ||
5143 | static void __cpuinit set_cpu_rq_start_time(void) | ||
5144 | { | ||
5145 | int cpu = smp_processor_id(); | ||
5146 | struct rq *rq = cpu_rq(cpu); | ||
5147 | rq->age_stamp = sched_clock_cpu(cpu); | ||
5148 | } | ||
5149 | |||
5042 | static int sched_cpu_active(struct notifier_block *nfb, | 5150 | static int sched_cpu_active(struct notifier_block *nfb, |
5043 | unsigned long action, void *hcpu) | 5151 | unsigned long action, void *hcpu) |
5044 | { | 5152 | { |
5045 | switch (action & ~CPU_TASKS_FROZEN) { | 5153 | switch (action & ~CPU_TASKS_FROZEN) { |
5046 | case CPU_STARTING: | 5154 | case CPU_STARTING: |
5155 | set_cpu_rq_start_time(); | ||
5156 | return NOTIFY_OK; | ||
5047 | case CPU_DOWN_FAILED: | 5157 | case CPU_DOWN_FAILED: |
5048 | set_cpu_active((long)hcpu, true); | 5158 | set_cpu_active((long)hcpu, true); |
5049 | return NOTIFY_OK; | 5159 | return NOTIFY_OK; |
@@ -5162,14 +5272,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5162 | } | 5272 | } |
5163 | 5273 | ||
5164 | /* | 5274 | /* |
5165 | * Even though we initialize ->power to something semi-sane, | 5275 | * Even though we initialize ->capacity to something semi-sane, |
5166 | * we leave power_orig unset. This allows us to detect if | 5276 | * we leave capacity_orig unset. This allows us to detect if |
5167 | * domain iteration is still funny without causing /0 traps. | 5277 | * domain iteration is still funny without causing /0 traps. |
5168 | */ | 5278 | */ |
5169 | if (!group->sgp->power_orig) { | 5279 | if (!group->sgc->capacity_orig) { |
5170 | printk(KERN_CONT "\n"); | 5280 | printk(KERN_CONT "\n"); |
5171 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 5281 | printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); |
5172 | "set\n"); | ||
5173 | break; | 5282 | break; |
5174 | } | 5283 | } |
5175 | 5284 | ||
@@ -5191,9 +5300,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5191 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); | 5300 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); |
5192 | 5301 | ||
5193 | printk(KERN_CONT " %s", str); | 5302 | printk(KERN_CONT " %s", str); |
5194 | if (group->sgp->power != SCHED_POWER_SCALE) { | 5303 | if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { |
5195 | printk(KERN_CONT " (cpu_power = %d)", | 5304 | printk(KERN_CONT " (cpu_capacity = %d)", |
5196 | group->sgp->power); | 5305 | group->sgc->capacity); |
5197 | } | 5306 | } |
5198 | 5307 | ||
5199 | group = group->next; | 5308 | group = group->next; |
@@ -5251,8 +5360,9 @@ static int sd_degenerate(struct sched_domain *sd) | |||
5251 | SD_BALANCE_NEWIDLE | | 5360 | SD_BALANCE_NEWIDLE | |
5252 | SD_BALANCE_FORK | | 5361 | SD_BALANCE_FORK | |
5253 | SD_BALANCE_EXEC | | 5362 | SD_BALANCE_EXEC | |
5254 | SD_SHARE_CPUPOWER | | 5363 | SD_SHARE_CPUCAPACITY | |
5255 | SD_SHARE_PKG_RESOURCES)) { | 5364 | SD_SHARE_PKG_RESOURCES | |
5365 | SD_SHARE_POWERDOMAIN)) { | ||
5256 | if (sd->groups != sd->groups->next) | 5366 | if (sd->groups != sd->groups->next) |
5257 | return 0; | 5367 | return 0; |
5258 | } | 5368 | } |
@@ -5281,9 +5391,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
5281 | SD_BALANCE_NEWIDLE | | 5391 | SD_BALANCE_NEWIDLE | |
5282 | SD_BALANCE_FORK | | 5392 | SD_BALANCE_FORK | |
5283 | SD_BALANCE_EXEC | | 5393 | SD_BALANCE_EXEC | |
5284 | SD_SHARE_CPUPOWER | | 5394 | SD_SHARE_CPUCAPACITY | |
5285 | SD_SHARE_PKG_RESOURCES | | 5395 | SD_SHARE_PKG_RESOURCES | |
5286 | SD_PREFER_SIBLING); | 5396 | SD_PREFER_SIBLING | |
5397 | SD_SHARE_POWERDOMAIN); | ||
5287 | if (nr_node_ids == 1) | 5398 | if (nr_node_ids == 1) |
5288 | pflags &= ~SD_SERIALIZE; | 5399 | pflags &= ~SD_SERIALIZE; |
5289 | } | 5400 | } |
@@ -5405,7 +5516,7 @@ static struct root_domain *alloc_rootdomain(void) | |||
5405 | return rd; | 5516 | return rd; |
5406 | } | 5517 | } |
5407 | 5518 | ||
5408 | static void free_sched_groups(struct sched_group *sg, int free_sgp) | 5519 | static void free_sched_groups(struct sched_group *sg, int free_sgc) |
5409 | { | 5520 | { |
5410 | struct sched_group *tmp, *first; | 5521 | struct sched_group *tmp, *first; |
5411 | 5522 | ||
@@ -5416,8 +5527,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp) | |||
5416 | do { | 5527 | do { |
5417 | tmp = sg->next; | 5528 | tmp = sg->next; |
5418 | 5529 | ||
5419 | if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) | 5530 | if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) |
5420 | kfree(sg->sgp); | 5531 | kfree(sg->sgc); |
5421 | 5532 | ||
5422 | kfree(sg); | 5533 | kfree(sg); |
5423 | sg = tmp; | 5534 | sg = tmp; |
@@ -5435,7 +5546,7 @@ static void free_sched_domain(struct rcu_head *rcu) | |||
5435 | if (sd->flags & SD_OVERLAP) { | 5546 | if (sd->flags & SD_OVERLAP) { |
5436 | free_sched_groups(sd->groups, 1); | 5547 | free_sched_groups(sd->groups, 1); |
5437 | } else if (atomic_dec_and_test(&sd->groups->ref)) { | 5548 | } else if (atomic_dec_and_test(&sd->groups->ref)) { |
5438 | kfree(sd->groups->sgp); | 5549 | kfree(sd->groups->sgc); |
5439 | kfree(sd->groups); | 5550 | kfree(sd->groups); |
5440 | } | 5551 | } |
5441 | kfree(sd); | 5552 | kfree(sd); |
@@ -5557,17 +5668,6 @@ static int __init isolated_cpu_setup(char *str) | |||
5557 | 5668 | ||
5558 | __setup("isolcpus=", isolated_cpu_setup); | 5669 | __setup("isolcpus=", isolated_cpu_setup); |
5559 | 5670 | ||
5560 | static const struct cpumask *cpu_cpu_mask(int cpu) | ||
5561 | { | ||
5562 | return cpumask_of_node(cpu_to_node(cpu)); | ||
5563 | } | ||
5564 | |||
5565 | struct sd_data { | ||
5566 | struct sched_domain **__percpu sd; | ||
5567 | struct sched_group **__percpu sg; | ||
5568 | struct sched_group_power **__percpu sgp; | ||
5569 | }; | ||
5570 | |||
5571 | struct s_data { | 5671 | struct s_data { |
5572 | struct sched_domain ** __percpu sd; | 5672 | struct sched_domain ** __percpu sd; |
5573 | struct root_domain *rd; | 5673 | struct root_domain *rd; |
@@ -5580,21 +5680,6 @@ enum s_alloc { | |||
5580 | sa_none, | 5680 | sa_none, |
5581 | }; | 5681 | }; |
5582 | 5682 | ||
5583 | struct sched_domain_topology_level; | ||
5584 | |||
5585 | typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); | ||
5586 | typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); | ||
5587 | |||
5588 | #define SDTL_OVERLAP 0x01 | ||
5589 | |||
5590 | struct sched_domain_topology_level { | ||
5591 | sched_domain_init_f init; | ||
5592 | sched_domain_mask_f mask; | ||
5593 | int flags; | ||
5594 | int numa_level; | ||
5595 | struct sd_data data; | ||
5596 | }; | ||
5597 | |||
5598 | /* | 5683 | /* |
5599 | * Build an iteration mask that can exclude certain CPUs from the upwards | 5684 | * Build an iteration mask that can exclude certain CPUs from the upwards |
5600 | * domain traversal. | 5685 | * domain traversal. |
@@ -5672,17 +5757,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
5672 | 5757 | ||
5673 | cpumask_or(covered, covered, sg_span); | 5758 | cpumask_or(covered, covered, sg_span); |
5674 | 5759 | ||
5675 | sg->sgp = *per_cpu_ptr(sdd->sgp, i); | 5760 | sg->sgc = *per_cpu_ptr(sdd->sgc, i); |
5676 | if (atomic_inc_return(&sg->sgp->ref) == 1) | 5761 | if (atomic_inc_return(&sg->sgc->ref) == 1) |
5677 | build_group_mask(sd, sg); | 5762 | build_group_mask(sd, sg); |
5678 | 5763 | ||
5679 | /* | 5764 | /* |
5680 | * Initialize sgp->power such that even if we mess up the | 5765 | * Initialize sgc->capacity such that even if we mess up the |
5681 | * domains and no possible iteration will get us here, we won't | 5766 | * domains and no possible iteration will get us here, we won't |
5682 | * die on a /0 trap. | 5767 | * die on a /0 trap. |
5683 | */ | 5768 | */ |
5684 | sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); | 5769 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); |
5685 | sg->sgp->power_orig = sg->sgp->power; | 5770 | sg->sgc->capacity_orig = sg->sgc->capacity; |
5686 | 5771 | ||
5687 | /* | 5772 | /* |
5688 | * Make sure the first group of this domain contains the | 5773 | * Make sure the first group of this domain contains the |
@@ -5720,8 +5805,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) | |||
5720 | 5805 | ||
5721 | if (sg) { | 5806 | if (sg) { |
5722 | *sg = *per_cpu_ptr(sdd->sg, cpu); | 5807 | *sg = *per_cpu_ptr(sdd->sg, cpu); |
5723 | (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); | 5808 | (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); |
5724 | atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ | 5809 | atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ |
5725 | } | 5810 | } |
5726 | 5811 | ||
5727 | return cpu; | 5812 | return cpu; |
@@ -5730,7 +5815,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) | |||
5730 | /* | 5815 | /* |
5731 | * build_sched_groups will build a circular linked list of the groups | 5816 | * build_sched_groups will build a circular linked list of the groups |
5732 | * covered by the given span, and will set each group's ->cpumask correctly, | 5817 | * covered by the given span, and will set each group's ->cpumask correctly, |
5733 | * and ->cpu_power to 0. | 5818 | * and ->cpu_capacity to 0. |
5734 | * | 5819 | * |
5735 | * Assumes the sched_domain tree is fully constructed | 5820 | * Assumes the sched_domain tree is fully constructed |
5736 | */ | 5821 | */ |
@@ -5762,8 +5847,6 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
5762 | continue; | 5847 | continue; |
5763 | 5848 | ||
5764 | group = get_group(i, sdd, &sg); | 5849 | group = get_group(i, sdd, &sg); |
5765 | cpumask_clear(sched_group_cpus(sg)); | ||
5766 | sg->sgp->power = 0; | ||
5767 | cpumask_setall(sched_group_mask(sg)); | 5850 | cpumask_setall(sched_group_mask(sg)); |
5768 | 5851 | ||
5769 | for_each_cpu(j, span) { | 5852 | for_each_cpu(j, span) { |
@@ -5786,16 +5869,16 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
5786 | } | 5869 | } |
5787 | 5870 | ||
5788 | /* | 5871 | /* |
5789 | * Initialize sched groups cpu_power. | 5872 | * Initialize sched groups cpu_capacity. |
5790 | * | 5873 | * |
5791 | * cpu_power indicates the capacity of sched group, which is used while | 5874 | * cpu_capacity indicates the capacity of a sched group, which is used while |
5792 | * distributing the load between different sched groups in a sched domain. | 5875 | * distributing the load between different sched groups in a sched domain. |
5793 | * Typically cpu_power for all the groups in a sched domain will be same unless | 5876 | * Typically cpu_capacity for all the groups in a sched domain will be the same |
5794 | * there are asymmetries in the topology. If there are asymmetries, group | 5877 | * unless there are asymmetries in the topology. If there are asymmetries, the |
5795 | * having more cpu_power will pickup more load compared to the group having | 5878 | * group with more cpu_capacity will pick up more load than the group |
5796 | * less cpu_power. | 5879 | * with less cpu_capacity. |
5797 | */ | 5880 | */ |
5798 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 5881 | static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) |
5799 | { | 5882 | { |
5800 | struct sched_group *sg = sd->groups; | 5883 | struct sched_group *sg = sd->groups; |
5801 | 5884 | ||
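A quick numeric illustration of the capacity comment above (the figures are hypothetical and SCHED_CAPACITY_SCALE is assumed to be 1024, so these numbers are not taken from the patch): with one group at capacity 2048 and another at 1024, the balancer aims to leave roughly two thirds of the runnable load on the larger group.

/* Hypothetical capacity-proportional split between two sched groups. */
#include <stdio.h>

int main(void)
{
        unsigned long cap_a = 2048, cap_b = 1024;       /* group capacities */
        unsigned long load = 3000;                      /* total runnable load */

        printf("group A share: %lu\n", load * cap_a / (cap_a + cap_b)); /* 2000 */
        printf("group B share: %lu\n", load * cap_b / (cap_a + cap_b)); /* 1000 */
        return 0;
}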
@@ -5809,13 +5892,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
5809 | if (cpu != group_balance_cpu(sg)) | 5892 | if (cpu != group_balance_cpu(sg)) |
5810 | return; | 5893 | return; |
5811 | 5894 | ||
5812 | update_group_power(sd, cpu); | 5895 | update_group_capacity(sd, cpu); |
5813 | atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); | 5896 | atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); |
5814 | } | ||
5815 | |||
5816 | int __weak arch_sd_sibling_asym_packing(void) | ||
5817 | { | ||
5818 | return 0*SD_ASYM_PACKING; | ||
5819 | } | 5897 | } |
5820 | 5898 | ||
5821 | /* | 5899 | /* |
@@ -5823,34 +5901,6 @@ int __weak arch_sd_sibling_asym_packing(void) | |||
5823 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() | 5901 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() |
5824 | */ | 5902 | */ |
5825 | 5903 | ||
5826 | #ifdef CONFIG_SCHED_DEBUG | ||
5827 | # define SD_INIT_NAME(sd, type) sd->name = #type | ||
5828 | #else | ||
5829 | # define SD_INIT_NAME(sd, type) do { } while (0) | ||
5830 | #endif | ||
5831 | |||
5832 | #define SD_INIT_FUNC(type) \ | ||
5833 | static noinline struct sched_domain * \ | ||
5834 | sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ | ||
5835 | { \ | ||
5836 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ | ||
5837 | *sd = SD_##type##_INIT; \ | ||
5838 | SD_INIT_NAME(sd, type); \ | ||
5839 | sd->private = &tl->data; \ | ||
5840 | return sd; \ | ||
5841 | } | ||
5842 | |||
5843 | SD_INIT_FUNC(CPU) | ||
5844 | #ifdef CONFIG_SCHED_SMT | ||
5845 | SD_INIT_FUNC(SIBLING) | ||
5846 | #endif | ||
5847 | #ifdef CONFIG_SCHED_MC | ||
5848 | SD_INIT_FUNC(MC) | ||
5849 | #endif | ||
5850 | #ifdef CONFIG_SCHED_BOOK | ||
5851 | SD_INIT_FUNC(BOOK) | ||
5852 | #endif | ||
5853 | |||
5854 | static int default_relax_domain_level = -1; | 5904 | static int default_relax_domain_level = -1; |
5855 | int sched_domain_level_max; | 5905 | int sched_domain_level_max; |
5856 | 5906 | ||
@@ -5934,101 +5984,158 @@ static void claim_allocations(int cpu, struct sched_domain *sd) | |||
5934 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) | 5984 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) |
5935 | *per_cpu_ptr(sdd->sg, cpu) = NULL; | 5985 | *per_cpu_ptr(sdd->sg, cpu) = NULL; |
5936 | 5986 | ||
5937 | if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) | 5987 | if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) |
5938 | *per_cpu_ptr(sdd->sgp, cpu) = NULL; | 5988 | *per_cpu_ptr(sdd->sgc, cpu) = NULL; |
5939 | } | 5989 | } |
5940 | 5990 | ||
5941 | #ifdef CONFIG_SCHED_SMT | ||
5942 | static const struct cpumask *cpu_smt_mask(int cpu) | ||
5943 | { | ||
5944 | return topology_thread_cpumask(cpu); | ||
5945 | } | ||
5946 | #endif | ||
5947 | |||
5948 | /* | ||
5949 | * Topology list, bottom-up. | ||
5950 | */ | ||
5951 | static struct sched_domain_topology_level default_topology[] = { | ||
5952 | #ifdef CONFIG_SCHED_SMT | ||
5953 | { sd_init_SIBLING, cpu_smt_mask, }, | ||
5954 | #endif | ||
5955 | #ifdef CONFIG_SCHED_MC | ||
5956 | { sd_init_MC, cpu_coregroup_mask, }, | ||
5957 | #endif | ||
5958 | #ifdef CONFIG_SCHED_BOOK | ||
5959 | { sd_init_BOOK, cpu_book_mask, }, | ||
5960 | #endif | ||
5961 | { sd_init_CPU, cpu_cpu_mask, }, | ||
5962 | { NULL, }, | ||
5963 | }; | ||
5964 | |||
5965 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | ||
5966 | |||
5967 | #define for_each_sd_topology(tl) \ | ||
5968 | for (tl = sched_domain_topology; tl->init; tl++) | ||
5969 | |||
5970 | #ifdef CONFIG_NUMA | 5991 | #ifdef CONFIG_NUMA |
5971 | |||
5972 | static int sched_domains_numa_levels; | 5992 | static int sched_domains_numa_levels; |
5973 | static int *sched_domains_numa_distance; | 5993 | static int *sched_domains_numa_distance; |
5974 | static struct cpumask ***sched_domains_numa_masks; | 5994 | static struct cpumask ***sched_domains_numa_masks; |
5975 | static int sched_domains_curr_level; | 5995 | static int sched_domains_curr_level; |
5996 | #endif | ||
5976 | 5997 | ||
5977 | static inline int sd_local_flags(int level) | 5998 | /* |
5978 | { | 5999 | * SD_flags allowed in topology descriptions. |
5979 | if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) | 6000 | * |
5980 | return 0; | 6001 | * SD_SHARE_CPUCAPACITY - describes SMT topologies |
5981 | 6002 | * SD_SHARE_PKG_RESOURCES - describes shared caches | |
5982 | return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; | 6003 | * SD_NUMA - describes NUMA topologies |
5983 | } | 6004 | * SD_SHARE_POWERDOMAIN - describes shared power domain |
6005 | * | ||
6006 | * Odd one out: | ||
6007 | * SD_ASYM_PACKING - describes SMT quirks | ||
6008 | */ | ||
6009 | #define TOPOLOGY_SD_FLAGS \ | ||
6010 | (SD_SHARE_CPUCAPACITY | \ | ||
6011 | SD_SHARE_PKG_RESOURCES | \ | ||
6012 | SD_NUMA | \ | ||
6013 | SD_ASYM_PACKING | \ | ||
6014 | SD_SHARE_POWERDOMAIN) | ||
5984 | 6015 | ||
5985 | static struct sched_domain * | 6016 | static struct sched_domain * |
5986 | sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | 6017 | sd_init(struct sched_domain_topology_level *tl, int cpu) |
5987 | { | 6018 | { |
5988 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); | 6019 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); |
5989 | int level = tl->numa_level; | 6020 | int sd_weight, sd_flags = 0; |
5990 | int sd_weight = cpumask_weight( | 6021 | |
5991 | sched_domains_numa_masks[level][cpu_to_node(cpu)]); | 6022 | #ifdef CONFIG_NUMA |
6023 | /* | ||
6024 | * Ugly hack to pass state to sd_numa_mask()... | ||
6025 | */ | ||
6026 | sched_domains_curr_level = tl->numa_level; | ||
6027 | #endif | ||
6028 | |||
6029 | sd_weight = cpumask_weight(tl->mask(cpu)); | ||
6030 | |||
6031 | if (tl->sd_flags) | ||
6032 | sd_flags = (*tl->sd_flags)(); | ||
6033 | if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, | ||
6034 | "wrong sd_flags in topology description\n")) | ||
6035 | sd_flags &= ~TOPOLOGY_SD_FLAGS; | ||
5992 | 6036 | ||
5993 | *sd = (struct sched_domain){ | 6037 | *sd = (struct sched_domain){ |
5994 | .min_interval = sd_weight, | 6038 | .min_interval = sd_weight, |
5995 | .max_interval = 2*sd_weight, | 6039 | .max_interval = 2*sd_weight, |
5996 | .busy_factor = 32, | 6040 | .busy_factor = 32, |
5997 | .imbalance_pct = 125, | 6041 | .imbalance_pct = 125, |
5998 | .cache_nice_tries = 2, | 6042 | |
5999 | .busy_idx = 3, | 6043 | .cache_nice_tries = 0, |
6000 | .idle_idx = 2, | 6044 | .busy_idx = 0, |
6045 | .idle_idx = 0, | ||
6001 | .newidle_idx = 0, | 6046 | .newidle_idx = 0, |
6002 | .wake_idx = 0, | 6047 | .wake_idx = 0, |
6003 | .forkexec_idx = 0, | 6048 | .forkexec_idx = 0, |
6004 | 6049 | ||
6005 | .flags = 1*SD_LOAD_BALANCE | 6050 | .flags = 1*SD_LOAD_BALANCE |
6006 | | 1*SD_BALANCE_NEWIDLE | 6051 | | 1*SD_BALANCE_NEWIDLE |
6007 | | 0*SD_BALANCE_EXEC | 6052 | | 1*SD_BALANCE_EXEC |
6008 | | 0*SD_BALANCE_FORK | 6053 | | 1*SD_BALANCE_FORK |
6009 | | 0*SD_BALANCE_WAKE | 6054 | | 0*SD_BALANCE_WAKE |
6010 | | 0*SD_WAKE_AFFINE | 6055 | | 1*SD_WAKE_AFFINE |
6011 | | 0*SD_SHARE_CPUPOWER | 6056 | | 0*SD_SHARE_CPUCAPACITY |
6012 | | 0*SD_SHARE_PKG_RESOURCES | 6057 | | 0*SD_SHARE_PKG_RESOURCES |
6013 | | 1*SD_SERIALIZE | 6058 | | 0*SD_SERIALIZE |
6014 | | 0*SD_PREFER_SIBLING | 6059 | | 0*SD_PREFER_SIBLING |
6015 | | 1*SD_NUMA | 6060 | | 0*SD_NUMA |
6016 | | sd_local_flags(level) | 6061 | | sd_flags |
6017 | , | 6062 | , |
6063 | |||
6018 | .last_balance = jiffies, | 6064 | .last_balance = jiffies, |
6019 | .balance_interval = sd_weight, | 6065 | .balance_interval = sd_weight, |
6066 | .smt_gain = 0, | ||
6067 | .max_newidle_lb_cost = 0, | ||
6068 | .next_decay_max_lb_cost = jiffies, | ||
6069 | #ifdef CONFIG_SCHED_DEBUG | ||
6070 | .name = tl->name, | ||
6071 | #endif | ||
6020 | }; | 6072 | }; |
6021 | SD_INIT_NAME(sd, NUMA); | ||
6022 | sd->private = &tl->data; | ||
6023 | 6073 | ||
6024 | /* | 6074 | /* |
6025 | * Ugly hack to pass state to sd_numa_mask()... | 6075 | * Convert topological properties into behaviour. |
6026 | */ | 6076 | */ |
6027 | sched_domains_curr_level = tl->numa_level; | 6077 | |
6078 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | ||
6079 | sd->imbalance_pct = 110; | ||
6080 | sd->smt_gain = 1178; /* ~15% */ | ||
6081 | |||
6082 | } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
6083 | sd->imbalance_pct = 117; | ||
6084 | sd->cache_nice_tries = 1; | ||
6085 | sd->busy_idx = 2; | ||
6086 | |||
6087 | #ifdef CONFIG_NUMA | ||
6088 | } else if (sd->flags & SD_NUMA) { | ||
6089 | sd->cache_nice_tries = 2; | ||
6090 | sd->busy_idx = 3; | ||
6091 | sd->idle_idx = 2; | ||
6092 | |||
6093 | sd->flags |= SD_SERIALIZE; | ||
6094 | if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { | ||
6095 | sd->flags &= ~(SD_BALANCE_EXEC | | ||
6096 | SD_BALANCE_FORK | | ||
6097 | SD_WAKE_AFFINE); | ||
6098 | } | ||
6099 | |||
6100 | #endif | ||
6101 | } else { | ||
6102 | sd->flags |= SD_PREFER_SIBLING; | ||
6103 | sd->cache_nice_tries = 1; | ||
6104 | sd->busy_idx = 2; | ||
6105 | sd->idle_idx = 1; | ||
6106 | } | ||
6107 | |||
6108 | sd->private = &tl->data; | ||
6028 | 6109 | ||
6029 | return sd; | 6110 | return sd; |
6030 | } | 6111 | } |
6031 | 6112 | ||
6113 | /* | ||
6114 | * Topology list, bottom-up. | ||
6115 | */ | ||
6116 | static struct sched_domain_topology_level default_topology[] = { | ||
6117 | #ifdef CONFIG_SCHED_SMT | ||
6118 | { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, | ||
6119 | #endif | ||
6120 | #ifdef CONFIG_SCHED_MC | ||
6121 | { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, | ||
6122 | #endif | ||
6123 | { cpu_cpu_mask, SD_INIT_NAME(DIE) }, | ||
6124 | { NULL, }, | ||
6125 | }; | ||
6126 | |||
6127 | struct sched_domain_topology_level *sched_domain_topology = default_topology; | ||
6128 | |||
6129 | #define for_each_sd_topology(tl) \ | ||
6130 | for (tl = sched_domain_topology; tl->mask; tl++) | ||
6131 | |||
6132 | void set_sched_topology(struct sched_domain_topology_level *tl) | ||
6133 | { | ||
6134 | sched_domain_topology = tl; | ||
6135 | } | ||
6136 | |||
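With the per-type SD_INIT_FUNC initializers removed, an architecture that needs non-default levels now supplies a table shaped like default_topology above and installs it with set_sched_topology(). A rough, non-building sketch of such arch code (my_smt_flags, my_topology and my_arch_init_topology are made-up names; it assumes CONFIG_SCHED_SMT and the mask/flag helpers declared alongside sched_domain_topology_level):

/* Hypothetical arch code: an SMT level plus a package-wide level. */
static int my_smt_flags(void)
{
        /* Both flags are in TOPOLOGY_SD_FLAGS, so sd_init() accepts them. */
        return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
}

static struct sched_domain_topology_level my_topology[] = {
        { cpu_smt_mask, my_smt_flags, SD_INIT_NAME(SMT) },
        { cpu_cpu_mask, SD_INIT_NAME(DIE) },
        { NULL, },      /* the table is terminated by a NULL ->mask */
};

static void __init my_arch_init_topology(void)
{
        set_sched_topology(my_topology);
}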
6137 | #ifdef CONFIG_NUMA | ||
6138 | |||
6032 | static const struct cpumask *sd_numa_mask(int cpu) | 6139 | static const struct cpumask *sd_numa_mask(int cpu) |
6033 | { | 6140 | { |
6034 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | 6141 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; |
@@ -6172,7 +6279,10 @@ static void sched_init_numa(void) | |||
6172 | } | 6279 | } |
6173 | } | 6280 | } |
6174 | 6281 | ||
6175 | tl = kzalloc((ARRAY_SIZE(default_topology) + level) * | 6282 | /* Compute default topology size */ |
6283 | for (i = 0; sched_domain_topology[i].mask; i++); | ||
6284 | |||
6285 | tl = kzalloc((i + level + 1) * | ||
6176 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); | 6286 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); |
6177 | if (!tl) | 6287 | if (!tl) |
6178 | return; | 6288 | return; |
@@ -6180,18 +6290,19 @@ static void sched_init_numa(void) | |||
6180 | /* | 6290 | /* |
6181 | * Copy the default topology bits.. | 6291 | * Copy the default topology bits.. |
6182 | */ | 6292 | */ |
6183 | for (i = 0; default_topology[i].init; i++) | 6293 | for (i = 0; sched_domain_topology[i].mask; i++) |
6184 | tl[i] = default_topology[i]; | 6294 | tl[i] = sched_domain_topology[i]; |
6185 | 6295 | ||
6186 | /* | 6296 | /* |
6187 | * .. and append 'j' levels of NUMA goodness. | 6297 | * .. and append 'j' levels of NUMA goodness. |
6188 | */ | 6298 | */ |
6189 | for (j = 0; j < level; i++, j++) { | 6299 | for (j = 0; j < level; i++, j++) { |
6190 | tl[i] = (struct sched_domain_topology_level){ | 6300 | tl[i] = (struct sched_domain_topology_level){ |
6191 | .init = sd_numa_init, | ||
6192 | .mask = sd_numa_mask, | 6301 | .mask = sd_numa_mask, |
6302 | .sd_flags = cpu_numa_flags, | ||
6193 | .flags = SDTL_OVERLAP, | 6303 | .flags = SDTL_OVERLAP, |
6194 | .numa_level = j, | 6304 | .numa_level = j, |
6305 | SD_INIT_NAME(NUMA) | ||
6195 | }; | 6306 | }; |
6196 | } | 6307 | } |
6197 | 6308 | ||
@@ -6276,14 +6387,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6276 | if (!sdd->sg) | 6387 | if (!sdd->sg) |
6277 | return -ENOMEM; | 6388 | return -ENOMEM; |
6278 | 6389 | ||
6279 | sdd->sgp = alloc_percpu(struct sched_group_power *); | 6390 | sdd->sgc = alloc_percpu(struct sched_group_capacity *); |
6280 | if (!sdd->sgp) | 6391 | if (!sdd->sgc) |
6281 | return -ENOMEM; | 6392 | return -ENOMEM; |
6282 | 6393 | ||
6283 | for_each_cpu(j, cpu_map) { | 6394 | for_each_cpu(j, cpu_map) { |
6284 | struct sched_domain *sd; | 6395 | struct sched_domain *sd; |
6285 | struct sched_group *sg; | 6396 | struct sched_group *sg; |
6286 | struct sched_group_power *sgp; | 6397 | struct sched_group_capacity *sgc; |
6287 | 6398 | ||
6288 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), | 6399 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), |
6289 | GFP_KERNEL, cpu_to_node(j)); | 6400 | GFP_KERNEL, cpu_to_node(j)); |
@@ -6301,12 +6412,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6301 | 6412 | ||
6302 | *per_cpu_ptr(sdd->sg, j) = sg; | 6413 | *per_cpu_ptr(sdd->sg, j) = sg; |
6303 | 6414 | ||
6304 | sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), | 6415 | sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), |
6305 | GFP_KERNEL, cpu_to_node(j)); | 6416 | GFP_KERNEL, cpu_to_node(j)); |
6306 | if (!sgp) | 6417 | if (!sgc) |
6307 | return -ENOMEM; | 6418 | return -ENOMEM; |
6308 | 6419 | ||
6309 | *per_cpu_ptr(sdd->sgp, j) = sgp; | 6420 | *per_cpu_ptr(sdd->sgc, j) = sgc; |
6310 | } | 6421 | } |
6311 | } | 6422 | } |
6312 | 6423 | ||
@@ -6333,15 +6444,15 @@ static void __sdt_free(const struct cpumask *cpu_map) | |||
6333 | 6444 | ||
6334 | if (sdd->sg) | 6445 | if (sdd->sg) |
6335 | kfree(*per_cpu_ptr(sdd->sg, j)); | 6446 | kfree(*per_cpu_ptr(sdd->sg, j)); |
6336 | if (sdd->sgp) | 6447 | if (sdd->sgc) |
6337 | kfree(*per_cpu_ptr(sdd->sgp, j)); | 6448 | kfree(*per_cpu_ptr(sdd->sgc, j)); |
6338 | } | 6449 | } |
6339 | free_percpu(sdd->sd); | 6450 | free_percpu(sdd->sd); |
6340 | sdd->sd = NULL; | 6451 | sdd->sd = NULL; |
6341 | free_percpu(sdd->sg); | 6452 | free_percpu(sdd->sg); |
6342 | sdd->sg = NULL; | 6453 | sdd->sg = NULL; |
6343 | free_percpu(sdd->sgp); | 6454 | free_percpu(sdd->sgc); |
6344 | sdd->sgp = NULL; | 6455 | sdd->sgc = NULL; |
6345 | } | 6456 | } |
6346 | } | 6457 | } |
6347 | 6458 | ||
@@ -6349,7 +6460,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6349 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 6460 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
6350 | struct sched_domain *child, int cpu) | 6461 | struct sched_domain *child, int cpu) |
6351 | { | 6462 | { |
6352 | struct sched_domain *sd = tl->init(tl, cpu); | 6463 | struct sched_domain *sd = sd_init(tl, cpu); |
6353 | if (!sd) | 6464 | if (!sd) |
6354 | return child; | 6465 | return child; |
6355 | 6466 | ||
@@ -6411,14 +6522,14 @@ static int build_sched_domains(const struct cpumask *cpu_map, | |||
6411 | } | 6522 | } |
6412 | } | 6523 | } |
6413 | 6524 | ||
6414 | /* Calculate CPU power for physical packages and nodes */ | 6525 | /* Calculate CPU capacity for physical packages and nodes */ |
6415 | for (i = nr_cpumask_bits-1; i >= 0; i--) { | 6526 | for (i = nr_cpumask_bits-1; i >= 0; i--) { |
6416 | if (!cpumask_test_cpu(i, cpu_map)) | 6527 | if (!cpumask_test_cpu(i, cpu_map)) |
6417 | continue; | 6528 | continue; |
6418 | 6529 | ||
6419 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | 6530 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
6420 | claim_allocations(i, sd); | 6531 | claim_allocations(i, sd); |
6421 | init_sched_groups_power(i, sd); | 6532 | init_sched_groups_capacity(i, sd); |
6422 | } | 6533 | } |
6423 | } | 6534 | } |
6424 | 6535 | ||
@@ -6861,7 +6972,7 @@ void __init sched_init(void) | |||
6861 | #ifdef CONFIG_SMP | 6972 | #ifdef CONFIG_SMP |
6862 | rq->sd = NULL; | 6973 | rq->sd = NULL; |
6863 | rq->rd = NULL; | 6974 | rq->rd = NULL; |
6864 | rq->cpu_power = SCHED_POWER_SCALE; | 6975 | rq->cpu_capacity = SCHED_CAPACITY_SCALE; |
6865 | rq->post_schedule = 0; | 6976 | rq->post_schedule = 0; |
6866 | rq->active_balance = 0; | 6977 | rq->active_balance = 0; |
6867 | rq->next_balance = jiffies; | 6978 | rq->next_balance = jiffies; |
@@ -6919,6 +7030,7 @@ void __init sched_init(void) | |||
6919 | if (cpu_isolated_map == NULL) | 7030 | if (cpu_isolated_map == NULL) |
6920 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 7031 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
6921 | idle_thread_set_boot_cpu(); | 7032 | idle_thread_set_boot_cpu(); |
7033 | set_cpu_rq_start_time(); | ||
6922 | #endif | 7034 | #endif |
6923 | init_sched_fair_class(); | 7035 | init_sched_fair_class(); |
6924 | 7036 | ||
@@ -7586,7 +7698,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
7586 | static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) | 7698 | static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) |
7587 | { | 7699 | { |
7588 | struct task_group *tg = css_tg(css); | 7700 | struct task_group *tg = css_tg(css); |
7589 | struct task_group *parent = css_tg(css_parent(css)); | 7701 | struct task_group *parent = css_tg(css->parent); |
7590 | 7702 | ||
7591 | if (parent) | 7703 | if (parent) |
7592 | sched_online_group(tg, parent); | 7704 | sched_online_group(tg, parent); |
@@ -7717,8 +7829,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
7717 | /* restart the period timer (if active) to handle new period expiry */ | 7829 | /* restart the period timer (if active) to handle new period expiry */ |
7718 | if (runtime_enabled && cfs_b->timer_active) { | 7830 | if (runtime_enabled && cfs_b->timer_active) { |
7719 | /* force a reprogram */ | 7831 | /* force a reprogram */ |
7720 | cfs_b->timer_active = 0; | 7832 | __start_cfs_bandwidth(cfs_b, true); |
7721 | __start_cfs_bandwidth(cfs_b); | ||
7722 | } | 7833 | } |
7723 | raw_spin_unlock_irq(&cfs_b->lock); | 7834 | raw_spin_unlock_irq(&cfs_b->lock); |
7724 | 7835 | ||
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index c143ee380e3a..9cf350c94ec4 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c | |||
@@ -46,7 +46,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) | |||
46 | 46 | ||
47 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) | 47 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) |
48 | { | 48 | { |
49 | return css_ca(css_parent(&ca->css)); | 49 | return css_ca(ca->css.parent); |
50 | } | 50 | } |
51 | 51 | ||
52 | static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); | 52 | static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); |
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5b9bb42b2d47..bd95963dae80 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c | |||
@@ -13,6 +13,7 @@ | |||
13 | 13 | ||
14 | #include <linux/gfp.h> | 14 | #include <linux/gfp.h> |
15 | #include <linux/kernel.h> | 15 | #include <linux/kernel.h> |
16 | #include <linux/slab.h> | ||
16 | #include "cpudeadline.h" | 17 | #include "cpudeadline.h" |
17 | 18 | ||
18 | static inline int parent(int i) | 19 | static inline int parent(int i) |
@@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b) | |||
39 | { | 40 | { |
40 | int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; | 41 | int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; |
41 | 42 | ||
42 | swap(cp->elements[a], cp->elements[b]); | 43 | swap(cp->elements[a].cpu, cp->elements[b].cpu); |
43 | swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); | 44 | swap(cp->elements[a].dl , cp->elements[b].dl ); |
45 | |||
46 | swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); | ||
44 | } | 47 | } |
45 | 48 | ||
46 | static void cpudl_heapify(struct cpudl *cp, int idx) | 49 | static void cpudl_heapify(struct cpudl *cp, int idx) |
@@ -140,7 +143,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) | |||
140 | WARN_ON(!cpu_present(cpu)); | 143 | WARN_ON(!cpu_present(cpu)); |
141 | 144 | ||
142 | raw_spin_lock_irqsave(&cp->lock, flags); | 145 | raw_spin_lock_irqsave(&cp->lock, flags); |
143 | old_idx = cp->cpu_to_idx[cpu]; | 146 | old_idx = cp->elements[cpu].idx; |
144 | if (!is_valid) { | 147 | if (!is_valid) { |
145 | /* remove item */ | 148 | /* remove item */ |
146 | if (old_idx == IDX_INVALID) { | 149 | if (old_idx == IDX_INVALID) { |
@@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) | |||
155 | cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; | 158 | cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; |
156 | cp->elements[old_idx].cpu = new_cpu; | 159 | cp->elements[old_idx].cpu = new_cpu; |
157 | cp->size--; | 160 | cp->size--; |
158 | cp->cpu_to_idx[new_cpu] = old_idx; | 161 | cp->elements[new_cpu].idx = old_idx; |
159 | cp->cpu_to_idx[cpu] = IDX_INVALID; | 162 | cp->elements[cpu].idx = IDX_INVALID; |
160 | while (old_idx > 0 && dl_time_before( | 163 | while (old_idx > 0 && dl_time_before( |
161 | cp->elements[parent(old_idx)].dl, | 164 | cp->elements[parent(old_idx)].dl, |
162 | cp->elements[old_idx].dl)) { | 165 | cp->elements[old_idx].dl)) { |
@@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) | |||
173 | cp->size++; | 176 | cp->size++; |
174 | cp->elements[cp->size - 1].dl = 0; | 177 | cp->elements[cp->size - 1].dl = 0; |
175 | cp->elements[cp->size - 1].cpu = cpu; | 178 | cp->elements[cp->size - 1].cpu = cpu; |
176 | cp->cpu_to_idx[cpu] = cp->size - 1; | 179 | cp->elements[cpu].idx = cp->size - 1; |
177 | cpudl_change_key(cp, cp->size - 1, dl); | 180 | cpudl_change_key(cp, cp->size - 1, dl); |
178 | cpumask_clear_cpu(cpu, cp->free_cpus); | 181 | cpumask_clear_cpu(cpu, cp->free_cpus); |
179 | } else { | 182 | } else { |
@@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp) | |||
195 | memset(cp, 0, sizeof(*cp)); | 198 | memset(cp, 0, sizeof(*cp)); |
196 | raw_spin_lock_init(&cp->lock); | 199 | raw_spin_lock_init(&cp->lock); |
197 | cp->size = 0; | 200 | cp->size = 0; |
198 | for (i = 0; i < NR_CPUS; i++) | 201 | |
199 | cp->cpu_to_idx[i] = IDX_INVALID; | 202 | cp->elements = kcalloc(nr_cpu_ids, |
200 | if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) | 203 | sizeof(struct cpudl_item), |
204 | GFP_KERNEL); | ||
205 | if (!cp->elements) | ||
206 | return -ENOMEM; | ||
207 | |||
208 | if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { | ||
209 | kfree(cp->elements); | ||
201 | return -ENOMEM; | 210 | return -ENOMEM; |
211 | } | ||
212 | |||
213 | for_each_possible_cpu(i) | ||
214 | cp->elements[i].idx = IDX_INVALID; | ||
215 | |||
202 | cpumask_setall(cp->free_cpus); | 216 | cpumask_setall(cp->free_cpus); |
203 | 217 | ||
204 | return 0; | 218 | return 0; |
@@ -210,7 +224,6 @@ int cpudl_init(struct cpudl *cp) | |||
210 | */ | 224 | */ |
211 | void cpudl_cleanup(struct cpudl *cp) | 225 | void cpudl_cleanup(struct cpudl *cp) |
212 | { | 226 | { |
213 | /* | 227 | free_cpumask_var(cp->free_cpus); |
214 | * nothing to do for the moment | 228 | kfree(cp->elements); |
215 | */ | ||
216 | } | 229 | } |
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index a202789a412c..538c9796ad4a 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h | |||
@@ -5,17 +5,17 @@ | |||
5 | 5 | ||
6 | #define IDX_INVALID -1 | 6 | #define IDX_INVALID -1 |
7 | 7 | ||
8 | struct array_item { | 8 | struct cpudl_item { |
9 | u64 dl; | 9 | u64 dl; |
10 | int cpu; | 10 | int cpu; |
11 | int idx; | ||
11 | }; | 12 | }; |
12 | 13 | ||
13 | struct cpudl { | 14 | struct cpudl { |
14 | raw_spinlock_t lock; | 15 | raw_spinlock_t lock; |
15 | int size; | 16 | int size; |
16 | int cpu_to_idx[NR_CPUS]; | ||
17 | struct array_item elements[NR_CPUS]; | ||
18 | cpumask_var_t free_cpus; | 17 | cpumask_var_t free_cpus; |
18 | struct cpudl_item *elements; | ||
19 | }; | 19 | }; |
20 | 20 | ||
21 | 21 | ||
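The cpudeadline change above folds the old cpu_to_idx[NR_CPUS] reverse map into a single, dynamically sized elements[] array: slots 0..size-1 form the max-heap ordered by deadline, while elements[cpu].idx records where that CPU currently sits in the heap (which is why cpudl_exchange() now swaps .cpu and .dl but indexes .idx by CPU number). A small user-space model of the initialisation step, with IDX_INVALID and nr_cpu_ids standing in for the kernel definitions:

#include <stdlib.h>

#define IDX_INVALID -1

struct cpudl_item {
        unsigned long long dl;
        int cpu;
        int idx;        /* heap position of this CPU, or IDX_INVALID */
};

/* Allocate one slot per possible CPU and mark every CPU as not queued. */
static struct cpudl_item *cpudl_items_alloc(int nr_cpu_ids)
{
        struct cpudl_item *elements = calloc(nr_cpu_ids, sizeof(*elements));
        int i;

        if (!elements)
                return NULL;
        for (i = 0; i < nr_cpu_ids; i++)
                elements[i].idx = IDX_INVALID;
        return elements;
}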
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 8b836b376d91..981fcd7dc394 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/gfp.h> | 30 | #include <linux/gfp.h> |
31 | #include <linux/sched.h> | 31 | #include <linux/sched.h> |
32 | #include <linux/sched/rt.h> | 32 | #include <linux/sched/rt.h> |
33 | #include <linux/slab.h> | ||
33 | #include "cpupri.h" | 34 | #include "cpupri.h" |
34 | 35 | ||
35 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ | 36 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ |
@@ -70,8 +71,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, | |||
70 | int idx = 0; | 71 | int idx = 0; |
71 | int task_pri = convert_prio(p->prio); | 72 | int task_pri = convert_prio(p->prio); |
72 | 73 | ||
73 | if (task_pri >= MAX_RT_PRIO) | 74 | BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); |
74 | return 0; | ||
75 | 75 | ||
76 | for (idx = 0; idx < task_pri; idx++) { | 76 | for (idx = 0; idx < task_pri; idx++) { |
77 | struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; | 77 | struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; |
@@ -165,7 +165,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
165 | * do a write memory barrier, and then update the count, to | 165 | * do a write memory barrier, and then update the count, to |
166 | * make sure the vector is visible when count is set. | 166 | * make sure the vector is visible when count is set. |
167 | */ | 167 | */ |
168 | smp_mb__before_atomic_inc(); | 168 | smp_mb__before_atomic(); |
169 | atomic_inc(&(vec)->count); | 169 | atomic_inc(&(vec)->count); |
170 | do_mb = 1; | 170 | do_mb = 1; |
171 | } | 171 | } |
@@ -185,14 +185,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
185 | * the new priority vec. | 185 | * the new priority vec. |
186 | */ | 186 | */ |
187 | if (do_mb) | 187 | if (do_mb) |
188 | smp_mb__after_atomic_inc(); | 188 | smp_mb__after_atomic(); |
189 | 189 | ||
190 | /* | 190 | /* |
191 | * When removing from the vector, we decrement the counter first | 191 | * When removing from the vector, we decrement the counter first |
192 | * do a memory barrier and then clear the mask. | 192 | * do a memory barrier and then clear the mask. |
193 | */ | 193 | */ |
194 | atomic_dec(&(vec)->count); | 194 | atomic_dec(&(vec)->count); |
195 | smp_mb__after_atomic_inc(); | 195 | smp_mb__after_atomic(); |
196 | cpumask_clear_cpu(cpu, vec->mask); | 196 | cpumask_clear_cpu(cpu, vec->mask); |
197 | } | 197 | } |
198 | 198 | ||
@@ -219,8 +219,13 @@ int cpupri_init(struct cpupri *cp) | |||
219 | goto cleanup; | 219 | goto cleanup; |
220 | } | 220 | } |
221 | 221 | ||
222 | cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL); | ||
223 | if (!cp->cpu_to_pri) | ||
224 | goto cleanup; | ||
225 | |||
222 | for_each_possible_cpu(i) | 226 | for_each_possible_cpu(i) |
223 | cp->cpu_to_pri[i] = CPUPRI_INVALID; | 227 | cp->cpu_to_pri[i] = CPUPRI_INVALID; |
228 | |||
224 | return 0; | 229 | return 0; |
225 | 230 | ||
226 | cleanup: | 231 | cleanup: |
@@ -237,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp) | |||
237 | { | 242 | { |
238 | int i; | 243 | int i; |
239 | 244 | ||
245 | kfree(cp->cpu_to_pri); | ||
240 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) | 246 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) |
241 | free_cpumask_var(cp->pri_to_cpu[i].mask); | 247 | free_cpumask_var(cp->pri_to_cpu[i].mask); |
242 | } | 248 | } |
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index f6d756173491..6b033347fdfd 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h | |||
@@ -17,7 +17,7 @@ struct cpupri_vec { | |||
17 | 17 | ||
18 | struct cpupri { | 18 | struct cpupri { |
19 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; | 19 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; |
20 | int cpu_to_pri[NR_CPUS]; | 20 | int *cpu_to_pri; |
21 | }; | 21 | }; |
22 | 22 | ||
23 | #ifdef CONFIG_SMP | 23 | #ifdef CONFIG_SMP |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a95097cb4591..72fdf06ef865 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -332,50 +332,50 @@ out: | |||
332 | * softirq as those do not count in task exec_runtime any more. | 332 | * softirq as those do not count in task exec_runtime any more. |
333 | */ | 333 | */ |
334 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | 334 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, |
335 | struct rq *rq) | 335 | struct rq *rq, int ticks) |
336 | { | 336 | { |
337 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 337 | cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); |
338 | u64 cputime = (__force u64) cputime_one_jiffy; | ||
338 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 339 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
339 | 340 | ||
340 | if (steal_account_process_tick()) | 341 | if (steal_account_process_tick()) |
341 | return; | 342 | return; |
342 | 343 | ||
344 | cputime *= ticks; | ||
345 | scaled *= ticks; | ||
346 | |||
343 | if (irqtime_account_hi_update()) { | 347 | if (irqtime_account_hi_update()) { |
344 | cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; | 348 | cpustat[CPUTIME_IRQ] += cputime; |
345 | } else if (irqtime_account_si_update()) { | 349 | } else if (irqtime_account_si_update()) { |
346 | cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; | 350 | cpustat[CPUTIME_SOFTIRQ] += cputime; |
347 | } else if (this_cpu_ksoftirqd() == p) { | 351 | } else if (this_cpu_ksoftirqd() == p) { |
348 | /* | 352 | /* |
349 | * ksoftirqd time do not get accounted in cpu_softirq_time. | 353 | * ksoftirqd time do not get accounted in cpu_softirq_time. |
350 | * So, we have to handle it separately here. | 354 | * So, we have to handle it separately here. |
351 | * Also, p->stime needs to be updated for ksoftirqd. | 355 | * Also, p->stime needs to be updated for ksoftirqd. |
352 | */ | 356 | */ |
353 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | 357 | __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ); |
354 | CPUTIME_SOFTIRQ); | ||
355 | } else if (user_tick) { | 358 | } else if (user_tick) { |
356 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 359 | account_user_time(p, cputime, scaled); |
357 | } else if (p == rq->idle) { | 360 | } else if (p == rq->idle) { |
358 | account_idle_time(cputime_one_jiffy); | 361 | account_idle_time(cputime); |
359 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | 362 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ |
360 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | 363 | account_guest_time(p, cputime, scaled); |
361 | } else { | 364 | } else { |
362 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | 365 | __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM); |
363 | CPUTIME_SYSTEM); | ||
364 | } | 366 | } |
365 | } | 367 | } |
366 | 368 | ||
367 | static void irqtime_account_idle_ticks(int ticks) | 369 | static void irqtime_account_idle_ticks(int ticks) |
368 | { | 370 | { |
369 | int i; | ||
370 | struct rq *rq = this_rq(); | 371 | struct rq *rq = this_rq(); |
371 | 372 | ||
372 | for (i = 0; i < ticks; i++) | 373 | irqtime_account_process_tick(current, 0, rq, ticks); |
373 | irqtime_account_process_tick(current, 0, rq); | ||
374 | } | 374 | } |
375 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 375 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
376 | static inline void irqtime_account_idle_ticks(int ticks) {} | 376 | static inline void irqtime_account_idle_ticks(int ticks) {} |
377 | static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, | 377 | static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, |
378 | struct rq *rq) {} | 378 | struct rq *rq, int nr_ticks) {} |
379 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | 379 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
380 | 380 | ||
381 | /* | 381 | /* |
@@ -464,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
464 | return; | 464 | return; |
465 | 465 | ||
466 | if (sched_clock_irqtime) { | 466 | if (sched_clock_irqtime) { |
467 | irqtime_account_process_tick(p, user_tick, rq); | 467 | irqtime_account_process_tick(p, user_tick, rq, 1); |
468 | return; | 468 | return; |
469 | } | 469 | } |
470 | 470 | ||
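The cputime change above lets the idle path hand its whole backlog of ticks to irqtime_account_process_tick() in one call, so the one-jiffy quantities are simply scaled by the tick count instead of being accounted in a loop. A trivial user-space model of that arithmetic (the 10 ms jiffy is an assumed example value):

#include <stdio.h>

int main(void)
{
        unsigned long long cputime_one_jiffy = 10000000ULL;    /* assume 10 ms in ns */
        int ticks = 7;                                         /* backlog of idle ticks */
        unsigned long long cputime = cputime_one_jiffy * ticks;

        /* One accounting call for the whole backlog, not seven. */
        printf("account %llu ns of idle time\n", cputime);
        return 0;
}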
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b08095786cb8..fc4f98b1258f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -57,8 +57,6 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) | |||
57 | dl_b->dl_runtime = runtime; | 57 | dl_b->dl_runtime = runtime; |
58 | } | 58 | } |
59 | 59 | ||
60 | extern unsigned long to_ratio(u64 period, u64 runtime); | ||
61 | |||
62 | void init_dl_bw(struct dl_bw *dl_b) | 60 | void init_dl_bw(struct dl_bw *dl_b) |
63 | { | 61 | { |
64 | raw_spin_lock_init(&dl_b->lock); | 62 | raw_spin_lock_init(&dl_b->lock); |
@@ -348,12 +346,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, | |||
348 | * entity. | 346 | * entity. |
349 | */ | 347 | */ |
350 | if (dl_time_before(dl_se->deadline, rq_clock(rq))) { | 348 | if (dl_time_before(dl_se->deadline, rq_clock(rq))) { |
351 | static bool lag_once = false; | 349 | printk_deferred_once("sched: DL replenish lagged too much\n"); |
352 | |||
353 | if (!lag_once) { | ||
354 | lag_once = true; | ||
355 | printk_sched("sched: DL replenish lagged to much\n"); | ||
356 | } | ||
357 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | 350 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; |
358 | dl_se->runtime = pi_se->dl_runtime; | 351 | dl_se->runtime = pi_se->dl_runtime; |
359 | } | 352 | } |
@@ -513,14 +506,22 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
513 | struct sched_dl_entity, | 506 | struct sched_dl_entity, |
514 | dl_timer); | 507 | dl_timer); |
515 | struct task_struct *p = dl_task_of(dl_se); | 508 | struct task_struct *p = dl_task_of(dl_se); |
516 | struct rq *rq = task_rq(p); | 509 | struct rq *rq; |
510 | again: | ||
511 | rq = task_rq(p); | ||
517 | raw_spin_lock(&rq->lock); | 512 | raw_spin_lock(&rq->lock); |
518 | 513 | ||
514 | if (rq != task_rq(p)) { | ||
515 | /* Task was moved, retrying. */ | ||
516 | raw_spin_unlock(&rq->lock); | ||
517 | goto again; | ||
518 | } | ||
519 | |||
519 | /* | 520 | /* |
520 | * We need to take care of possible races here. In fact, the | 521 | * We need to take care of possible races here. In fact, the |
521 | * task might have changed its scheduling policy to something | 522 | * task might have changed its scheduling policy to something |
522 | * different from SCHED_DEADLINE or changed its reservation | 523 | * different from SCHED_DEADLINE or changed its reservation |
523 | * parameters (through sched_setscheduler()). | 524 | * parameters (through sched_setattr()). |
524 | */ | 525 | */ |
525 | if (!dl_task(p) || dl_se->dl_new) | 526 | if (!dl_task(p) || dl_se->dl_new) |
526 | goto unlock; | 527 | goto unlock; |
@@ -528,6 +529,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
528 | sched_clock_tick(); | 529 | sched_clock_tick(); |
529 | update_rq_clock(rq); | 530 | update_rq_clock(rq); |
530 | dl_se->dl_throttled = 0; | 531 | dl_se->dl_throttled = 0; |
532 | dl_se->dl_yielded = 0; | ||
531 | if (p->on_rq) { | 533 | if (p->on_rq) { |
532 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | 534 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); |
533 | if (task_has_dl_policy(rq->curr)) | 535 | if (task_has_dl_policy(rq->curr)) |
@@ -740,7 +742,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | |||
740 | 742 | ||
741 | WARN_ON(!dl_prio(prio)); | 743 | WARN_ON(!dl_prio(prio)); |
742 | dl_rq->dl_nr_running++; | 744 | dl_rq->dl_nr_running++; |
743 | inc_nr_running(rq_of_dl_rq(dl_rq)); | 745 | add_nr_running(rq_of_dl_rq(dl_rq), 1); |
744 | 746 | ||
745 | inc_dl_deadline(dl_rq, deadline); | 747 | inc_dl_deadline(dl_rq, deadline); |
746 | inc_dl_migration(dl_se, dl_rq); | 748 | inc_dl_migration(dl_se, dl_rq); |
@@ -754,7 +756,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | |||
754 | WARN_ON(!dl_prio(prio)); | 756 | WARN_ON(!dl_prio(prio)); |
755 | WARN_ON(!dl_rq->dl_nr_running); | 757 | WARN_ON(!dl_rq->dl_nr_running); |
756 | dl_rq->dl_nr_running--; | 758 | dl_rq->dl_nr_running--; |
757 | dec_nr_running(rq_of_dl_rq(dl_rq)); | 759 | sub_nr_running(rq_of_dl_rq(dl_rq), 1); |
758 | 760 | ||
759 | dec_dl_deadline(dl_rq, dl_se->deadline); | 761 | dec_dl_deadline(dl_rq, dl_se->deadline); |
760 | dec_dl_migration(dl_se, dl_rq); | 762 | dec_dl_migration(dl_se, dl_rq); |
@@ -893,10 +895,10 @@ static void yield_task_dl(struct rq *rq) | |||
893 | * We make the task go to sleep until its current deadline by | 895 | * We make the task go to sleep until its current deadline by |
894 | * forcing its runtime to zero. This way, update_curr_dl() stops | 896 | * forcing its runtime to zero. This way, update_curr_dl() stops |
895 | * it and the bandwidth timer will wake it up and will give it | 897 | * it and the bandwidth timer will wake it up and will give it |
896 | * new scheduling parameters (thanks to dl_new=1). | 898 | * new scheduling parameters (thanks to dl_yielded=1). |
897 | */ | 899 | */ |
898 | if (p->dl.runtime > 0) { | 900 | if (p->dl.runtime > 0) { |
899 | rq->curr->dl.dl_new = 1; | 901 | rq->curr->dl.dl_yielded = 1; |
900 | p->dl.runtime = 0; | 902 | p->dl.runtime = 0; |
901 | } | 903 | } |
902 | update_curr_dl(rq); | 904 | update_curr_dl(rq); |
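The dl_task_timer() change above is an instance of the usual lock-the-task's-runqueue pattern: read task_rq(), take the lock, and retry if the task migrated in between. Isolated here as a sketch (this helper is illustrative only, not something added by the patch):

/* Sketch: return p's runqueue with rq->lock held, retrying across migrations. */
static struct rq *lock_task_rq(struct task_struct *p)
{
        struct rq *rq;

again:
        rq = task_rq(p);
        raw_spin_lock(&rq->lock);
        if (rq != task_rq(p)) {
                /* p moved to another runqueue while we were taking the lock. */
                raw_spin_unlock(&rq->lock);
                goto again;
        }
        return rq;
}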
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7570dd969c28..fea7d3335e1f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -1017,7 +1017,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, | |||
1017 | static unsigned long weighted_cpuload(const int cpu); | 1017 | static unsigned long weighted_cpuload(const int cpu); |
1018 | static unsigned long source_load(int cpu, int type); | 1018 | static unsigned long source_load(int cpu, int type); |
1019 | static unsigned long target_load(int cpu, int type); | 1019 | static unsigned long target_load(int cpu, int type); |
1020 | static unsigned long power_of(int cpu); | 1020 | static unsigned long capacity_of(int cpu); |
1021 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg); | 1021 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg); |
1022 | 1022 | ||
1023 | /* Cached statistics for all CPUs within a node */ | 1023 | /* Cached statistics for all CPUs within a node */ |
@@ -1026,11 +1026,11 @@ struct numa_stats { | |||
1026 | unsigned long load; | 1026 | unsigned long load; |
1027 | 1027 | ||
1028 | /* Total compute capacity of CPUs on a node */ | 1028 | /* Total compute capacity of CPUs on a node */ |
1029 | unsigned long power; | 1029 | unsigned long compute_capacity; |
1030 | 1030 | ||
1031 | /* Approximate capacity in terms of runnable tasks on a node */ | 1031 | /* Approximate capacity in terms of runnable tasks on a node */ |
1032 | unsigned long capacity; | 1032 | unsigned long task_capacity; |
1033 | int has_capacity; | 1033 | int has_free_capacity; |
1034 | }; | 1034 | }; |
1035 | 1035 | ||
1036 | /* | 1036 | /* |
@@ -1046,7 +1046,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) | |||
1046 | 1046 | ||
1047 | ns->nr_running += rq->nr_running; | 1047 | ns->nr_running += rq->nr_running; |
1048 | ns->load += weighted_cpuload(cpu); | 1048 | ns->load += weighted_cpuload(cpu); |
1049 | ns->power += power_of(cpu); | 1049 | ns->compute_capacity += capacity_of(cpu); |
1050 | 1050 | ||
1051 | cpus++; | 1051 | cpus++; |
1052 | } | 1052 | } |
@@ -1056,15 +1056,16 @@ static void update_numa_stats(struct numa_stats *ns, int nid) | |||
1056 | * the @ns structure is NULL'ed and task_numa_compare() will | 1056 | * the @ns structure is NULL'ed and task_numa_compare() will |
1057 | * not find this node attractive. | 1057 | * not find this node attractive. |
1058 | * | 1058 | * |
1059 | * We'll either bail at !has_capacity, or we'll detect a huge imbalance | 1059 | * We'll either bail at !has_free_capacity, or we'll detect a huge |
1060 | * and bail there. | 1060 | * imbalance and bail there. |
1061 | */ | 1061 | */ |
1062 | if (!cpus) | 1062 | if (!cpus) |
1063 | return; | 1063 | return; |
1064 | 1064 | ||
1065 | ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; | 1065 | ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity; |
1066 | ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); | 1066 | ns->task_capacity = |
1067 | ns->has_capacity = (ns->nr_running < ns->capacity); | 1067 | DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); |
1068 | ns->has_free_capacity = (ns->nr_running < ns->task_capacity); | ||
1068 | } | 1069 | } |
1069 | 1070 | ||
1070 | struct task_numa_env { | 1071 | struct task_numa_env { |
@@ -1095,6 +1096,34 @@ static void task_numa_assign(struct task_numa_env *env, | |||
1095 | env->best_cpu = env->dst_cpu; | 1096 | env->best_cpu = env->dst_cpu; |
1096 | } | 1097 | } |
1097 | 1098 | ||
1099 | static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, | ||
1100 | long src_load, long dst_load, | ||
1101 | struct task_numa_env *env) | ||
1102 | { | ||
1103 | long imb, old_imb; | ||
1104 | |||
1105 | /* We care about the slope of the imbalance, not the direction. */ | ||
1106 | if (dst_load < src_load) | ||
1107 | swap(dst_load, src_load); | ||
1108 | |||
1109 | /* Is the difference below the threshold? */ | ||
1110 | imb = dst_load * 100 - src_load * env->imbalance_pct; | ||
1111 | if (imb <= 0) | ||
1112 | return false; | ||
1113 | |||
1114 | /* | ||
1115 | * The imbalance is above the allowed threshold. | ||
1116 | * Compare it with the old imbalance. | ||
1117 | */ | ||
1118 | if (orig_dst_load < orig_src_load) | ||
1119 | swap(orig_dst_load, orig_src_load); | ||
1120 | |||
1121 | old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; | ||
1122 | |||
1123 | /* Would this change make things worse? */ | ||
1124 | return (imb > old_imb); | ||
1125 | } | ||
1126 | |||
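A worked example of load_too_imbalanced() above, with imbalance_pct = 125 and hypothetical node loads of 1000 (source) and 1500 (destination): moving a 200-unit task onto the already busier node gives imb = 1700*100 - 800*125 = 70000, worse than the old imbalance of 1500*100 - 1000*125 = 25000, so the move is rejected; the same task moving off the busier node ends up below the threshold and is allowed. The same check in standalone form:

#include <stdbool.h>
#include <stdio.h>

/* Standalone copy of the check, with imbalance_pct passed in directly. */
static bool too_imbalanced(long orig_src, long orig_dst,
                           long src, long dst, long imbalance_pct)
{
        long imb, old_imb, tmp;

        /* We care about the slope of the imbalance, not the direction. */
        if (dst < src) { tmp = dst; dst = src; src = tmp; }
        imb = dst * 100 - src * imbalance_pct;
        if (imb <= 0)
                return false;

        if (orig_dst < orig_src) { tmp = orig_dst; orig_dst = orig_src; orig_src = tmp; }
        old_imb = orig_dst * 100 - orig_src * imbalance_pct;

        /* Only reject the move if it makes the imbalance worse. */
        return imb > old_imb;
}

int main(void)
{
        /* Hypothetical loads: source 1000, destination 1500, task weight 200. */
        printf("onto busier node:  %d\n", too_imbalanced(1000, 1500, 800, 1700, 125));  /* 1: rejected */
        printf("off busier node:   %d\n", too_imbalanced(1500, 1000, 1300, 1200, 125)); /* 0: allowed */
        return 0;
}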
1098 | /* | 1127 | /* |
1099 | * This checks if the overall compute and NUMA accesses of the system would | 1128 | * This checks if the overall compute and NUMA accesses of the system would |
1100 | * be improved if the source task was migrated to the target dst_cpu taking | 1129 | * be improved if the source task was migrated to the target dst_cpu taking |
@@ -1107,7 +1136,8 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1107 | struct rq *src_rq = cpu_rq(env->src_cpu); | 1136 | struct rq *src_rq = cpu_rq(env->src_cpu); |
1108 | struct rq *dst_rq = cpu_rq(env->dst_cpu); | 1137 | struct rq *dst_rq = cpu_rq(env->dst_cpu); |
1109 | struct task_struct *cur; | 1138 | struct task_struct *cur; |
1110 | long dst_load, src_load; | 1139 | long orig_src_load, src_load; |
1140 | long orig_dst_load, dst_load; | ||
1111 | long load; | 1141 | long load; |
1112 | long imp = (groupimp > 0) ? groupimp : taskimp; | 1142 | long imp = (groupimp > 0) ? groupimp : taskimp; |
1113 | 1143 | ||
@@ -1166,8 +1196,8 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1166 | 1196 | ||
1167 | if (!cur) { | 1197 | if (!cur) { |
1168 | /* Is there capacity at our destination? */ | 1198 | /* Is there capacity at our destination? */ |
1169 | if (env->src_stats.has_capacity && | 1199 | if (env->src_stats.has_free_capacity && |
1170 | !env->dst_stats.has_capacity) | 1200 | !env->dst_stats.has_free_capacity) |
1171 | goto unlock; | 1201 | goto unlock; |
1172 | 1202 | ||
1173 | goto balance; | 1203 | goto balance; |
@@ -1181,13 +1211,13 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1181 | * In the overloaded case, try and keep the load balanced. | 1211 | * In the overloaded case, try and keep the load balanced. |
1182 | */ | 1212 | */ |
1183 | balance: | 1213 | balance: |
1184 | dst_load = env->dst_stats.load; | 1214 | orig_dst_load = env->dst_stats.load; |
1185 | src_load = env->src_stats.load; | 1215 | orig_src_load = env->src_stats.load; |
1186 | 1216 | ||
1187 | /* XXX missing power terms */ | 1217 | /* XXX missing capacity terms */ |
1188 | load = task_h_load(env->p); | 1218 | load = task_h_load(env->p); |
1189 | dst_load += load; | 1219 | dst_load = orig_dst_load + load; |
1190 | src_load -= load; | 1220 | src_load = orig_src_load - load; |
1191 | 1221 | ||
1192 | if (cur) { | 1222 | if (cur) { |
1193 | load = task_h_load(cur); | 1223 | load = task_h_load(cur); |
@@ -1195,11 +1225,8 @@ balance: | |||
1195 | src_load += load; | 1225 | src_load += load; |
1196 | } | 1226 | } |
1197 | 1227 | ||
1198 | /* make src_load the smaller */ | 1228 | if (load_too_imbalanced(orig_src_load, orig_dst_load, |
1199 | if (dst_load < src_load) | 1229 | src_load, dst_load, env)) |
1200 | swap(dst_load, src_load); | ||
1201 | |||
1202 | if (src_load * env->imbalance_pct < dst_load * 100) | ||
1203 | goto unlock; | 1230 | goto unlock; |
1204 | 1231 | ||
1205 | assign: | 1232 | assign: |
@@ -1275,8 +1302,8 @@ static int task_numa_migrate(struct task_struct *p) | |||
1275 | groupimp = group_weight(p, env.dst_nid) - groupweight; | 1302 | groupimp = group_weight(p, env.dst_nid) - groupweight; |
1276 | update_numa_stats(&env.dst_stats, env.dst_nid); | 1303 | update_numa_stats(&env.dst_stats, env.dst_nid); |
1277 | 1304 | ||
1278 | /* If the preferred nid has capacity, try to use it. */ | 1305 | /* If the preferred nid has free capacity, try to use it. */ |
1279 | if (env.dst_stats.has_capacity) | 1306 | if (env.dst_stats.has_free_capacity) |
1280 | task_numa_find_cpu(&env, taskimp, groupimp); | 1307 | task_numa_find_cpu(&env, taskimp, groupimp); |
1281 | 1308 | ||
1282 | /* No space available on the preferred nid. Look elsewhere. */ | 1309 | /* No space available on the preferred nid. Look elsewhere. */ |
@@ -1301,7 +1328,16 @@ static int task_numa_migrate(struct task_struct *p) | |||
1301 | if (env.best_cpu == -1) | 1328 | if (env.best_cpu == -1) |
1302 | return -EAGAIN; | 1329 | return -EAGAIN; |
1303 | 1330 | ||
1304 | sched_setnuma(p, env.dst_nid); | 1331 | /* |
1332 | * If the task is part of a workload that spans multiple NUMA nodes, | ||
1333 | * and is migrating into one of the workload's active nodes, remember | ||
1334 | * this node as the task's preferred numa node, so the workload can | ||
1335 | * settle down. | ||
1336 | * A task that migrated to a second choice node will be better off | ||
1337 | * trying for a better one later. Do not set the preferred node here. | ||
1338 | */ | ||
1339 | if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) | ||
1340 | sched_setnuma(p, env.dst_nid); | ||
1305 | 1341 | ||
1306 | /* | 1342 | /* |
1307 | * Reset the scan period if the task is being rescheduled on an | 1343 | * Reset the scan period if the task is being rescheduled on an |
@@ -1326,12 +1362,15 @@ static int task_numa_migrate(struct task_struct *p) | |||
1326 | /* Attempt to migrate a task to a CPU on the preferred node. */ | 1362 | /* Attempt to migrate a task to a CPU on the preferred node. */ |
1327 | static void numa_migrate_preferred(struct task_struct *p) | 1363 | static void numa_migrate_preferred(struct task_struct *p) |
1328 | { | 1364 | { |
1365 | unsigned long interval = HZ; | ||
1366 | |||
1329 | /* This task has no NUMA fault statistics yet */ | 1367 | /* This task has no NUMA fault statistics yet */ |
1330 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) | 1368 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) |
1331 | return; | 1369 | return; |
1332 | 1370 | ||
1333 | /* Periodically retry migrating the task to the preferred node */ | 1371 | /* Periodically retry migrating the task to the preferred node */ |
1334 | p->numa_migrate_retry = jiffies + HZ; | 1372 | interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); |
1373 | p->numa_migrate_retry = jiffies + interval; | ||
1335 | 1374 | ||
1336 | /* Success if task is already running on preferred CPU */ | 1375 | /* Success if task is already running on preferred CPU */ |
1337 | if (task_node(p) == p->numa_preferred_nid) | 1376 | if (task_node(p) == p->numa_preferred_nid) |
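For the retry-interval change above, a worked example (HZ = 250 is an assumed config value, not fixed by the patch): with numa_scan_period at 1000 ms, msecs_to_jiffies(1000) / 16 is 15 jiffies, roughly 60 ms, so a task that landed on a second-choice node retries far sooner than the old fixed one-second delay, while starting interval at HZ keeps very long scan periods from stretching the retry past one second.

#include <stdio.h>

#define HZ 250  /* assumed CONFIG_HZ */

/* Rough stand-in for msecs_to_jiffies() at this HZ. */
static unsigned long msecs_to_jiffies_approx(unsigned long ms)
{
        return ms * HZ / 1000;
}

int main(void)
{
        unsigned long scan_period_ms = 1000;    /* p->numa_scan_period */
        unsigned long interval = HZ;            /* upper bound: one second */
        unsigned long candidate = msecs_to_jiffies_approx(scan_period_ms) / 16;

        if (candidate < interval)
                interval = candidate;

        printf("retry after %lu jiffies (~%lu ms)\n", interval, interval * 1000 / HZ);
        return 0;
}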
@@ -1707,18 +1746,19 @@ no_join: | |||
1707 | void task_numa_free(struct task_struct *p) | 1746 | void task_numa_free(struct task_struct *p) |
1708 | { | 1747 | { |
1709 | struct numa_group *grp = p->numa_group; | 1748 | struct numa_group *grp = p->numa_group; |
1710 | int i; | ||
1711 | void *numa_faults = p->numa_faults_memory; | 1749 | void *numa_faults = p->numa_faults_memory; |
1750 | unsigned long flags; | ||
1751 | int i; | ||
1712 | 1752 | ||
1713 | if (grp) { | 1753 | if (grp) { |
1714 | spin_lock_irq(&grp->lock); | 1754 | spin_lock_irqsave(&grp->lock, flags); |
1715 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) | 1755 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) |
1716 | grp->faults[i] -= p->numa_faults_memory[i]; | 1756 | grp->faults[i] -= p->numa_faults_memory[i]; |
1717 | grp->total_faults -= p->total_numa_faults; | 1757 | grp->total_faults -= p->total_numa_faults; |
1718 | 1758 | ||
1719 | list_del(&p->numa_entry); | 1759 | list_del(&p->numa_entry); |
1720 | grp->nr_tasks--; | 1760 | grp->nr_tasks--; |
1721 | spin_unlock_irq(&grp->lock); | 1761 | spin_unlock_irqrestore(&grp->lock, flags); |
1722 | rcu_assign_pointer(p->numa_group, NULL); | 1762 | rcu_assign_pointer(p->numa_group, NULL); |
1723 | put_numa_group(grp); | 1763 | put_numa_group(grp); |
1724 | } | 1764 | } |
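
Switching to spin_lock_irqsave()/spin_unlock_irqrestore() matters because the plain _irq variants unconditionally re-enable interrupts on unlock, which is wrong if the caller of task_numa_free() already holds them disabled. A toy model of just the flag handling; the irq_enabled variable stands in for the CPU interrupt state and is not a kernel interface:

#include <stdio.h>
#include <stdbool.h>

static bool irq_enabled;

static void spin_unlock_irq_model(void)          { irq_enabled = true; }
static bool spin_lock_irqsave_model(void)        { bool f = irq_enabled; irq_enabled = false; return f; }
static void spin_unlock_irqrestore_model(bool f) { irq_enabled = f; }

int main(void)
{
        irq_enabled = false;            /* caller (e.g. an exit path) has IRQs off */

        bool flags = spin_lock_irqsave_model();
        spin_unlock_irqrestore_model(flags);
        printf("after irqsave/irqrestore: interrupts %s\n", irq_enabled ? "ON" : "off");

        spin_unlock_irq_model();
        printf("after plain unlock_irq:   interrupts %s\n", irq_enabled ? "ON" : "off");
        return 0;
}
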
@@ -1738,6 +1778,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
1738 | struct task_struct *p = current; | 1778 | struct task_struct *p = current; |
1739 | bool migrated = flags & TNF_MIGRATED; | 1779 | bool migrated = flags & TNF_MIGRATED; |
1740 | int cpu_node = task_node(current); | 1780 | int cpu_node = task_node(current); |
1781 | int local = !!(flags & TNF_FAULT_LOCAL); | ||
1741 | int priv; | 1782 | int priv; |
1742 | 1783 | ||
1743 | if (!numabalancing_enabled) | 1784 | if (!numabalancing_enabled) |
@@ -1786,6 +1827,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
1786 | task_numa_group(p, last_cpupid, flags, &priv); | 1827 | task_numa_group(p, last_cpupid, flags, &priv); |
1787 | } | 1828 | } |
1788 | 1829 | ||
1830 | /* | ||
1831 | * If a workload spans multiple NUMA nodes, a shared fault that | ||
1832 | * occurs wholly within the set of nodes that the workload is | ||
1833 | * actively using should be counted as local. This allows the | ||
1834 | * scan rate to slow down when a workload has settled down. | ||
1835 | */ | ||
1836 | if (!priv && !local && p->numa_group && | ||
1837 | node_isset(cpu_node, p->numa_group->active_nodes) && | ||
1838 | node_isset(mem_node, p->numa_group->active_nodes)) | ||
1839 | local = 1; | ||
1840 | |||
1789 | task_numa_placement(p); | 1841 | task_numa_placement(p); |
1790 | 1842 | ||
1791 | /* | 1843 | /* |
@@ -1800,7 +1852,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
1800 | 1852 | ||
1801 | p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; | 1853 | p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; |
1802 | p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; | 1854 | p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; |
1803 | p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; | 1855 | p->numa_faults_locality[local] += pages; |
1804 | } | 1856 | } |
1805 | 1857 | ||
1806 | static void reset_ptenuma_scan(struct task_struct *p) | 1858 | static void reset_ptenuma_scan(struct task_struct *p) |
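
The effect of the new local bookkeeping is easiest to see with concrete nodes: a shared fault whose CPU node and memory node both lie inside the group's active set now lands in numa_faults_locality[1] instead of [0]. A userspace sketch of that decision, with active_nodes again modelled as a bitmask and the node numbers invented:

#include <stdio.h>

static int fault_is_local(int tnf_local, int priv, unsigned long active_nodes,
                          int cpu_node, int mem_node)
{
        int local = tnf_local;

        if (!priv && !local &&
            (active_nodes & (1UL << cpu_node)) &&
            (active_nodes & (1UL << mem_node)))
                local = 1;
        return local;
}

int main(void)
{
        unsigned long active = (1UL << 0) | (1UL << 1);  /* workload on nodes 0 and 1 */

        /* shared fault: task running on node 0 touching memory on node 1 */
        printf("cross-node but inside the active set -> local=%d\n",
               fault_is_local(0, 0, active, 0, 1));
        /* same pattern, but node 3 is outside the workload's set */
        printf("memory outside the active set        -> local=%d\n",
               fault_is_local(0, 0, active, 0, 3));
        return 0;
}
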
@@ -3129,7 +3181,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
3129 | */ | 3181 | */ |
3130 | if (!cfs_b->timer_active) { | 3182 | if (!cfs_b->timer_active) { |
3131 | __refill_cfs_bandwidth_runtime(cfs_b); | 3183 | __refill_cfs_bandwidth_runtime(cfs_b); |
3132 | __start_cfs_bandwidth(cfs_b); | 3184 | __start_cfs_bandwidth(cfs_b, false); |
3133 | } | 3185 | } |
3134 | 3186 | ||
3135 | if (cfs_b->runtime > 0) { | 3187 | if (cfs_b->runtime > 0) { |
@@ -3174,10 +3226,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
3174 | * has not truly expired. | 3226 | * has not truly expired. |
3175 | * | 3227 | * |
3176 | * Fortunately we can check determine whether this the case by checking | 3228 | * Fortunately we can check determine whether this the case by checking |
3177 | * whether the global deadline has advanced. | 3229 | * whether the global deadline has advanced. It is valid to compare |
3230 | * cfs_b->runtime_expires without any locks since we only care about | ||
3231 | * exact equality, so a partial write will still work. | ||
3178 | */ | 3232 | */ |
3179 | 3233 | ||
3180 | if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { | 3234 | if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { |
3181 | /* extend local deadline, drift is bounded above by 2 ticks */ | 3235 | /* extend local deadline, drift is bounded above by 2 ticks */ |
3182 | cfs_rq->runtime_expires += TICK_NSEC; | 3236 | cfs_rq->runtime_expires += TICK_NSEC; |
3183 | } else { | 3237 | } else { |
@@ -3301,14 +3355,14 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
3301 | } | 3355 | } |
3302 | 3356 | ||
3303 | if (!se) | 3357 | if (!se) |
3304 | rq->nr_running -= task_delta; | 3358 | sub_nr_running(rq, task_delta); |
3305 | 3359 | ||
3306 | cfs_rq->throttled = 1; | 3360 | cfs_rq->throttled = 1; |
3307 | cfs_rq->throttled_clock = rq_clock(rq); | 3361 | cfs_rq->throttled_clock = rq_clock(rq); |
3308 | raw_spin_lock(&cfs_b->lock); | 3362 | raw_spin_lock(&cfs_b->lock); |
3309 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | 3363 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); |
3310 | if (!cfs_b->timer_active) | 3364 | if (!cfs_b->timer_active) |
3311 | __start_cfs_bandwidth(cfs_b); | 3365 | __start_cfs_bandwidth(cfs_b, false); |
3312 | raw_spin_unlock(&cfs_b->lock); | 3366 | raw_spin_unlock(&cfs_b->lock); |
3313 | } | 3367 | } |
3314 | 3368 | ||
@@ -3352,7 +3406,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | |||
3352 | } | 3406 | } |
3353 | 3407 | ||
3354 | if (!se) | 3408 | if (!se) |
3355 | rq->nr_running += task_delta; | 3409 | add_nr_running(rq, task_delta); |
3356 | 3410 | ||
3357 | /* determine whether we need to wake up potentially idle cpu */ | 3411 | /* determine whether we need to wake up potentially idle cpu */ |
3358 | if (rq->curr == rq->idle && rq->cfs.nr_running) | 3412 | if (rq->curr == rq->idle && rq->cfs.nr_running) |
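
The throttle/unthrottle paths now go through sub_nr_running()/add_nr_running() instead of touching rq->nr_running directly. A plausible shape for such wrappers, centralising the update so any "went from one to several runnable tasks" side effect hangs off a single place, is sketched below; the hook name and the threshold are assumptions, not the kernel's definition:

#include <stdio.h>

struct rq { unsigned int nr_running; };

static void went_overloaded(struct rq *rq)
{
        printf("kick: %u runnable\n", rq->nr_running);   /* e.g. poke idle balancing */
}

static void add_nr_running(struct rq *rq, unsigned int count)
{
        unsigned int prev = rq->nr_running;

        rq->nr_running = prev + count;
        if (prev < 2 && rq->nr_running >= 2)
                went_overloaded(rq);
}

static void sub_nr_running(struct rq *rq, unsigned int count)
{
        rq->nr_running -= count;
}

int main(void)
{
        struct rq rq = { .nr_running = 1 };

        add_nr_running(&rq, 2);         /* unthrottle puts two tasks back */
        sub_nr_running(&rq, 1);
        printf("now %u runnable\n", rq.nr_running);
        return 0;
}
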
@@ -3406,21 +3460,21 @@ next: | |||
3406 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | 3460 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) |
3407 | { | 3461 | { |
3408 | u64 runtime, runtime_expires; | 3462 | u64 runtime, runtime_expires; |
3409 | int idle = 1, throttled; | 3463 | int throttled; |
3410 | 3464 | ||
3411 | raw_spin_lock(&cfs_b->lock); | ||
3412 | /* no need to continue the timer with no bandwidth constraint */ | 3465 | /* no need to continue the timer with no bandwidth constraint */ |
3413 | if (cfs_b->quota == RUNTIME_INF) | 3466 | if (cfs_b->quota == RUNTIME_INF) |
3414 | goto out_unlock; | 3467 | goto out_deactivate; |
3415 | 3468 | ||
3416 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); | 3469 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); |
3417 | /* idle depends on !throttled (for the case of a large deficit) */ | ||
3418 | idle = cfs_b->idle && !throttled; | ||
3419 | cfs_b->nr_periods += overrun; | 3470 | cfs_b->nr_periods += overrun; |
3420 | 3471 | ||
3421 | /* if we're going inactive then everything else can be deferred */ | 3472 | /* |
3422 | if (idle) | 3473 | * idle depends on !throttled (for the case of a large deficit), and if |
3423 | goto out_unlock; | 3474 | * we're going inactive then everything else can be deferred |
3475 | */ | ||
3476 | if (cfs_b->idle && !throttled) | ||
3477 | goto out_deactivate; | ||
3424 | 3478 | ||
3425 | /* | 3479 | /* |
3426 | * if we have relooped after returning idle once, we need to update our | 3480 | * if we have relooped after returning idle once, we need to update our |
@@ -3434,7 +3488,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | |||
3434 | if (!throttled) { | 3488 | if (!throttled) { |
3435 | /* mark as potentially idle for the upcoming period */ | 3489 | /* mark as potentially idle for the upcoming period */ |
3436 | cfs_b->idle = 1; | 3490 | cfs_b->idle = 1; |
3437 | goto out_unlock; | 3491 | return 0; |
3438 | } | 3492 | } |
3439 | 3493 | ||
3440 | /* account preceding periods in which throttling occurred */ | 3494 | /* account preceding periods in which throttling occurred */ |
@@ -3474,12 +3528,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | |||
3474 | * timer to remain active while there are any throttled entities.) | 3528 | * timer to remain active while there are any throttled entities.) |
3475 | */ | 3529 | */ |
3476 | cfs_b->idle = 0; | 3530 | cfs_b->idle = 0; |
3477 | out_unlock: | ||
3478 | if (idle) | ||
3479 | cfs_b->timer_active = 0; | ||
3480 | raw_spin_unlock(&cfs_b->lock); | ||
3481 | 3531 | ||
3482 | return idle; | 3532 | return 0; |
3533 | |||
3534 | out_deactivate: | ||
3535 | cfs_b->timer_active = 0; | ||
3536 | return 1; | ||
3483 | } | 3537 | } |
3484 | 3538 | ||
3485 | /* a cfs_rq won't donate quota below this amount */ | 3539 | /* a cfs_rq won't donate quota below this amount */ |
@@ -3656,6 +3710,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | |||
3656 | int overrun; | 3710 | int overrun; |
3657 | int idle = 0; | 3711 | int idle = 0; |
3658 | 3712 | ||
3713 | raw_spin_lock(&cfs_b->lock); | ||
3659 | for (;;) { | 3714 | for (;;) { |
3660 | now = hrtimer_cb_get_time(timer); | 3715 | now = hrtimer_cb_get_time(timer); |
3661 | overrun = hrtimer_forward(timer, now, cfs_b->period); | 3716 | overrun = hrtimer_forward(timer, now, cfs_b->period); |
@@ -3665,6 +3720,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | |||
3665 | 3720 | ||
3666 | idle = do_sched_cfs_period_timer(cfs_b, overrun); | 3721 | idle = do_sched_cfs_period_timer(cfs_b, overrun); |
3667 | } | 3722 | } |
3723 | raw_spin_unlock(&cfs_b->lock); | ||
3668 | 3724 | ||
3669 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | 3725 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; |
3670 | } | 3726 | } |
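
Together with the earlier do_sched_cfs_period_timer() hunks, the locking now has a simpler shape: the hrtimer callback takes cfs_b->lock once around the whole overrun loop, and the worker just returns 0 or 1 instead of unlocking on several exit paths. A stripped-down model of that structure (build with -pthread); the mutex stands in for the raw spinlock and the "nothing throttled" rule replaces the real bandwidth accounting:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cfs_b_lock = PTHREAD_MUTEX_INITIALIZER;

/* stands in for do_sched_cfs_period_timer(); caller must hold cfs_b_lock */
static int do_period_work(int nr_throttled)
{
        if (nr_throttled == 0)
                return 1;               /* nothing to do: timer may go idle */
        return 0;                       /* keep the period timer running */
}

static int period_timer_cb(int nr_throttled)
{
        int idle = 0;
        int overrun = 1;                /* pretend hrtimer_forward() reported one period */

        pthread_mutex_lock(&cfs_b_lock);
        while (overrun--)
                idle = do_period_work(nr_throttled);
        pthread_mutex_unlock(&cfs_b_lock);

        return idle;                    /* idle -> HRTIMER_NORESTART */
}

int main(void)
{
        printf("no throttled groups -> restart timer: %s\n",
               period_timer_cb(0) ? "no" : "yes");
        printf("2 throttled groups  -> restart timer: %s\n",
               period_timer_cb(2) ? "no" : "yes");
        return 0;
}
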
@@ -3690,7 +3746,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
3690 | } | 3746 | } |
3691 | 3747 | ||
3692 | /* requires cfs_b->lock, may release to reprogram timer */ | 3748 | /* requires cfs_b->lock, may release to reprogram timer */ |
3693 | void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | 3749 | void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) |
3694 | { | 3750 | { |
3695 | /* | 3751 | /* |
3696 | * The timer may be active because we're trying to set a new bandwidth | 3752 | * The timer may be active because we're trying to set a new bandwidth |
@@ -3705,7 +3761,7 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | |||
3705 | cpu_relax(); | 3761 | cpu_relax(); |
3706 | raw_spin_lock(&cfs_b->lock); | 3762 | raw_spin_lock(&cfs_b->lock); |
3707 | /* if someone else restarted the timer then we're done */ | 3763 | /* if someone else restarted the timer then we're done */ |
3708 | if (cfs_b->timer_active) | 3764 | if (!force && cfs_b->timer_active) |
3709 | return; | 3765 | return; |
3710 | } | 3766 | } |
3711 | 3767 | ||
@@ -3724,8 +3780,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) | |||
3724 | struct cfs_rq *cfs_rq; | 3780 | struct cfs_rq *cfs_rq; |
3725 | 3781 | ||
3726 | for_each_leaf_cfs_rq(rq, cfs_rq) { | 3782 | for_each_leaf_cfs_rq(rq, cfs_rq) { |
3727 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
3728 | |||
3729 | if (!cfs_rq->runtime_enabled) | 3783 | if (!cfs_rq->runtime_enabled) |
3730 | continue; | 3784 | continue; |
3731 | 3785 | ||
@@ -3733,7 +3787,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) | |||
3733 | * clock_task is not advancing so we just need to make sure | 3787 | * clock_task is not advancing so we just need to make sure |
3734 | * there's some valid quota amount | 3788 | * there's some valid quota amount |
3735 | */ | 3789 | */ |
3736 | cfs_rq->runtime_remaining = cfs_b->quota; | 3790 | cfs_rq->runtime_remaining = 1; |
3737 | if (cfs_rq_throttled(cfs_rq)) | 3791 | if (cfs_rq_throttled(cfs_rq)) |
3738 | unthrottle_cfs_rq(cfs_rq); | 3792 | unthrottle_cfs_rq(cfs_rq); |
3739 | } | 3793 | } |
@@ -3884,7 +3938,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
3884 | 3938 | ||
3885 | if (!se) { | 3939 | if (!se) { |
3886 | update_rq_runnable_avg(rq, rq->nr_running); | 3940 | update_rq_runnable_avg(rq, rq->nr_running); |
3887 | inc_nr_running(rq); | 3941 | add_nr_running(rq, 1); |
3888 | } | 3942 | } |
3889 | hrtick_update(rq); | 3943 | hrtick_update(rq); |
3890 | } | 3944 | } |
@@ -3944,7 +3998,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
3944 | } | 3998 | } |
3945 | 3999 | ||
3946 | if (!se) { | 4000 | if (!se) { |
3947 | dec_nr_running(rq); | 4001 | sub_nr_running(rq, 1); |
3948 | update_rq_runnable_avg(rq, 1); | 4002 | update_rq_runnable_avg(rq, 1); |
3949 | } | 4003 | } |
3950 | hrtick_update(rq); | 4004 | hrtick_update(rq); |
@@ -3990,9 +4044,9 @@ static unsigned long target_load(int cpu, int type) | |||
3990 | return max(rq->cpu_load[type-1], total); | 4044 | return max(rq->cpu_load[type-1], total); |
3991 | } | 4045 | } |
3992 | 4046 | ||
3993 | static unsigned long power_of(int cpu) | 4047 | static unsigned long capacity_of(int cpu) |
3994 | { | 4048 | { |
3995 | return cpu_rq(cpu)->cpu_power; | 4049 | return cpu_rq(cpu)->cpu_capacity; |
3996 | } | 4050 | } |
3997 | 4051 | ||
3998 | static unsigned long cpu_avg_load_per_task(int cpu) | 4052 | static unsigned long cpu_avg_load_per_task(int cpu) |
@@ -4014,8 +4068,8 @@ static void record_wakee(struct task_struct *p) | |||
4014 | * about the boundary, really active task won't care | 4068 | * about the boundary, really active task won't care |
4015 | * about the loss. | 4069 | * about the loss. |
4016 | */ | 4070 | */ |
4017 | if (jiffies > current->wakee_flip_decay_ts + HZ) { | 4071 | if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { |
4018 | current->wakee_flips = 0; | 4072 | current->wakee_flips >>= 1; |
4019 | current->wakee_flip_decay_ts = jiffies; | 4073 | current->wakee_flip_decay_ts = jiffies; |
4020 | } | 4074 | } |
4021 | 4075 | ||
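
The wakee_flips change is a decay rather than a reset: a heavy waker keeps half of its history across each one-second window, and time_after() makes the comparison safe across jiffies wrap. A short worked comparison with made-up per-second flip counts:

#include <stdio.h>

int main(void)
{
        unsigned int old_style = 0, new_style = 0;
        unsigned int flips_per_second[] = { 40, 40, 0, 0, 0 };

        for (int sec = 0; sec < 5; sec++) {
                old_style = 0;                  /* old behaviour: hard reset each window */
                new_style >>= 1;                /* new behaviour: exponential decay */
                old_style += flips_per_second[sec];
                new_style += flips_per_second[sec];
                printf("second %d: old=%2u new=%2u\n", sec, old_style, new_style);
        }
        return 0;
}
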
@@ -4235,12 +4289,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
4235 | s64 this_eff_load, prev_eff_load; | 4289 | s64 this_eff_load, prev_eff_load; |
4236 | 4290 | ||
4237 | this_eff_load = 100; | 4291 | this_eff_load = 100; |
4238 | this_eff_load *= power_of(prev_cpu); | 4292 | this_eff_load *= capacity_of(prev_cpu); |
4239 | this_eff_load *= this_load + | 4293 | this_eff_load *= this_load + |
4240 | effective_load(tg, this_cpu, weight, weight); | 4294 | effective_load(tg, this_cpu, weight, weight); |
4241 | 4295 | ||
4242 | prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; | 4296 | prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; |
4243 | prev_eff_load *= power_of(this_cpu); | 4297 | prev_eff_load *= capacity_of(this_cpu); |
4244 | prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); | 4298 | prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); |
4245 | 4299 | ||
4246 | balanced = this_eff_load <= prev_eff_load; | 4300 | balanced = this_eff_load <= prev_eff_load; |
@@ -4316,8 +4370,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
4316 | avg_load += load; | 4370 | avg_load += load; |
4317 | } | 4371 | } |
4318 | 4372 | ||
4319 | /* Adjust by relative CPU power of the group */ | 4373 | /* Adjust by relative CPU capacity of the group */ |
4320 | avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; | 4374 | avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; |
4321 | 4375 | ||
4322 | if (local_group) { | 4376 | if (local_group) { |
4323 | this_load = avg_load; | 4377 | this_load = avg_load; |
@@ -4449,10 +4503,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
4449 | sd = tmp; | 4503 | sd = tmp; |
4450 | } | 4504 | } |
4451 | 4505 | ||
4452 | if (affine_sd) { | 4506 | if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) |
4453 | if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) | 4507 | prev_cpu = cpu; |
4454 | prev_cpu = cpu; | ||
4455 | 4508 | ||
4509 | if (sd_flag & SD_BALANCE_WAKE) { | ||
4456 | new_cpu = select_idle_sibling(p, prev_cpu); | 4510 | new_cpu = select_idle_sibling(p, prev_cpu); |
4457 | goto unlock; | 4511 | goto unlock; |
4458 | } | 4512 | } |
@@ -4520,6 +4574,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) | |||
4520 | atomic_long_add(se->avg.load_avg_contrib, | 4574 | atomic_long_add(se->avg.load_avg_contrib, |
4521 | &cfs_rq->removed_load); | 4575 | &cfs_rq->removed_load); |
4522 | } | 4576 | } |
4577 | |||
4578 | /* We have migrated, no longer consider this task hot */ | ||
4579 | se->exec_start = 0; | ||
4523 | } | 4580 | } |
4524 | #endif /* CONFIG_SMP */ | 4581 | #endif /* CONFIG_SMP */ |
4525 | 4582 | ||
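
Zeroing se->exec_start makes the freshly migrated task look arbitrarily cold to cache-hotness checks, so it is not needlessly held back on its new CPU. A simplified stand-in for such a check is shown below; the 0.5 ms threshold and the reduced test are assumptions, not the scheduler's exact task_hot():

#include <stdio.h>
#include <stdint.h>

static int task_hot(uint64_t now_ns, uint64_t exec_start_ns)
{
        const uint64_t migration_cost_ns = 500000;   /* made-up threshold */

        return (now_ns - exec_start_ns) < migration_cost_ns;
}

int main(void)
{
        uint64_t now = 10000000;        /* arbitrary clock sample, 10 ms */

        printf("ran 0.1 ms ago on the old cpu -> hot=%d\n",
               task_hot(now, now - 100000));
        printf("exec_start zeroed after move  -> hot=%d\n",
               task_hot(now, 0));
        return 0;
}
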
@@ -4894,14 +4951,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
4894 | * | 4951 | * |
4895 | * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) | 4952 | * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) |
4896 | * | 4953 | * |
4897 | * P_i is the cpu power (or compute capacity) of cpu i, typically it is the | 4954 | * C_i is the compute capacity of cpu i, typically it is the |
4898 | * fraction of 'recent' time available for SCHED_OTHER task execution. But it | 4955 | * fraction of 'recent' time available for SCHED_OTHER task execution. But it |
4899 | * can also include other factors [XXX]. | 4956 | * can also include other factors [XXX]. |
4900 | * | 4957 | * |
4901 | * To achieve this balance we define a measure of imbalance which follows | 4958 | * To achieve this balance we define a measure of imbalance which follows |
4902 | * directly from (1): | 4959 | * directly from (1): |
4903 | * | 4960 | * |
4904 | * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4) | 4961 | * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4) |
4905 | * | 4962 | * |
4906 | * We them move tasks around to minimize the imbalance. In the continuous | 4963 | * We them move tasks around to minimize the imbalance. In the continuous |
4907 | * function space it is obvious this converges, in the discrete case we get | 4964 | * function space it is obvious this converges, in the discrete case we get |
@@ -5070,6 +5127,7 @@ task_hot(struct task_struct *p, u64 now) | |||
5070 | /* Returns true if the destination node has incurred more faults */ | 5127 | /* Returns true if the destination node has incurred more faults */ |
5071 | static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) | 5128 | static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) |
5072 | { | 5129 | { |
5130 | struct numa_group *numa_group = rcu_dereference(p->numa_group); | ||
5073 | int src_nid, dst_nid; | 5131 | int src_nid, dst_nid; |
5074 | 5132 | ||
5075 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || | 5133 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || |
@@ -5083,21 +5141,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) | |||
5083 | if (src_nid == dst_nid) | 5141 | if (src_nid == dst_nid) |
5084 | return false; | 5142 | return false; |
5085 | 5143 | ||
5086 | /* Always encourage migration to the preferred node. */ | 5144 | if (numa_group) { |
5087 | if (dst_nid == p->numa_preferred_nid) | 5145 | /* Task is already in the group's interleave set. */ |
5088 | return true; | 5146 | if (node_isset(src_nid, numa_group->active_nodes)) |
5147 | return false; | ||
5148 | |||
5149 | /* Task is moving into the group's interleave set. */ | ||
5150 | if (node_isset(dst_nid, numa_group->active_nodes)) | ||
5151 | return true; | ||
5152 | |||
5153 | return group_faults(p, dst_nid) > group_faults(p, src_nid); | ||
5154 | } | ||
5089 | 5155 | ||
5090 | /* If both task and group weight improve, this move is a winner. */ | 5156 | /* Encourage migration to the preferred node. */ |
5091 | if (task_weight(p, dst_nid) > task_weight(p, src_nid) && | 5157 | if (dst_nid == p->numa_preferred_nid) |
5092 | group_weight(p, dst_nid) > group_weight(p, src_nid)) | ||
5093 | return true; | 5158 | return true; |
5094 | 5159 | ||
5095 | return false; | 5160 | return task_faults(p, dst_nid) > task_faults(p, src_nid); |
5096 | } | 5161 | } |
5097 | 5162 | ||
5098 | 5163 | ||
5099 | static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | 5164 | static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) |
5100 | { | 5165 | { |
5166 | struct numa_group *numa_group = rcu_dereference(p->numa_group); | ||
5101 | int src_nid, dst_nid; | 5167 | int src_nid, dst_nid; |
5102 | 5168 | ||
5103 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) | 5169 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) |
@@ -5112,16 +5178,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | |||
5112 | if (src_nid == dst_nid) | 5178 | if (src_nid == dst_nid) |
5113 | return false; | 5179 | return false; |
5114 | 5180 | ||
5181 | if (numa_group) { | ||
5182 | /* Task is moving within/into the group's interleave set. */ | ||
5183 | if (node_isset(dst_nid, numa_group->active_nodes)) | ||
5184 | return false; | ||
5185 | |||
5186 | /* Task is moving out of the group's interleave set. */ | ||
5187 | if (node_isset(src_nid, numa_group->active_nodes)) | ||
5188 | return true; | ||
5189 | |||
5190 | return group_faults(p, dst_nid) < group_faults(p, src_nid); | ||
5191 | } | ||
5192 | |||
5115 | /* Migrating away from the preferred node is always bad. */ | 5193 | /* Migrating away from the preferred node is always bad. */ |
5116 | if (src_nid == p->numa_preferred_nid) | 5194 | if (src_nid == p->numa_preferred_nid) |
5117 | return true; | 5195 | return true; |
5118 | 5196 | ||
5119 | /* If either task or group weight get worse, don't do it. */ | 5197 | return task_faults(p, dst_nid) < task_faults(p, src_nid); |
5120 | if (task_weight(p, dst_nid) < task_weight(p, src_nid) || | ||
5121 | group_weight(p, dst_nid) < group_weight(p, src_nid)) | ||
5122 | return true; | ||
5123 | |||
5124 | return false; | ||
5125 | } | 5198 | } |
5126 | 5199 | ||
5127 | #else | 5200 | #else |
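
Both locality helpers above now ask the same three questions for grouped tasks: is the source node in the group's active set, is the destination, and only then how do the group fault counts compare (migrate_degrades_locality() checks the mirror-image conditions). A userspace sketch of the "improves" variant with invented fault counts:

#include <stdio.h>

static int improves_locality(unsigned long active, int src, int dst,
                             const unsigned long faults[])
{
        if (active & (1UL << src))
                return 0;               /* already inside the interleave set: stay put */
        if (active & (1UL << dst))
                return 1;               /* moving into the interleave set */
        return faults[dst] > faults[src];
}

int main(void)
{
        unsigned long active = 1UL << 1;          /* group active on node 1 only */
        unsigned long faults[4] = { 10, 80, 30, 5 };

        printf("node 0 -> 1: %d\n", improves_locality(active, 0, 1, faults));
        printf("node 1 -> 2: %d\n", improves_locality(active, 1, 2, faults));
        printf("node 2 -> 3: %d\n", improves_locality(active, 2, 3, faults));
        return 0;
}
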
@@ -5460,13 +5533,13 @@ struct sg_lb_stats { | |||
5460 | unsigned long group_load; /* Total load over the CPUs of the group */ | 5533 | unsigned long group_load; /* Total load over the CPUs of the group */ |
5461 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 5534 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
5462 | unsigned long load_per_task; | 5535 | unsigned long load_per_task; |
5463 | unsigned long group_power; | 5536 | unsigned long group_capacity; |
5464 | unsigned int sum_nr_running; /* Nr tasks running in the group */ | 5537 | unsigned int sum_nr_running; /* Nr tasks running in the group */ |
5465 | unsigned int group_capacity; | 5538 | unsigned int group_capacity_factor; |
5466 | unsigned int idle_cpus; | 5539 | unsigned int idle_cpus; |
5467 | unsigned int group_weight; | 5540 | unsigned int group_weight; |
5468 | int group_imb; /* Is there an imbalance in the group ? */ | 5541 | int group_imb; /* Is there an imbalance in the group ? */ |
5469 | int group_has_capacity; /* Is there extra capacity in the group? */ | 5542 | int group_has_free_capacity; |
5470 | #ifdef CONFIG_NUMA_BALANCING | 5543 | #ifdef CONFIG_NUMA_BALANCING |
5471 | unsigned int nr_numa_running; | 5544 | unsigned int nr_numa_running; |
5472 | unsigned int nr_preferred_running; | 5545 | unsigned int nr_preferred_running; |
@@ -5481,7 +5554,7 @@ struct sd_lb_stats { | |||
5481 | struct sched_group *busiest; /* Busiest group in this sd */ | 5554 | struct sched_group *busiest; /* Busiest group in this sd */ |
5482 | struct sched_group *local; /* Local group in this sd */ | 5555 | struct sched_group *local; /* Local group in this sd */ |
5483 | unsigned long total_load; /* Total load of all groups in sd */ | 5556 | unsigned long total_load; /* Total load of all groups in sd */ |
5484 | unsigned long total_pwr; /* Total power of all groups in sd */ | 5557 | unsigned long total_capacity; /* Total capacity of all groups in sd */ |
5485 | unsigned long avg_load; /* Average load across all groups in sd */ | 5558 | unsigned long avg_load; /* Average load across all groups in sd */ |
5486 | 5559 | ||
5487 | struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ | 5560 | struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ |
@@ -5500,7 +5573,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) | |||
5500 | .busiest = NULL, | 5573 | .busiest = NULL, |
5501 | .local = NULL, | 5574 | .local = NULL, |
5502 | .total_load = 0UL, | 5575 | .total_load = 0UL, |
5503 | .total_pwr = 0UL, | 5576 | .total_capacity = 0UL, |
5504 | .busiest_stat = { | 5577 | .busiest_stat = { |
5505 | .avg_load = 0UL, | 5578 | .avg_load = 0UL, |
5506 | }, | 5579 | }, |
@@ -5535,17 +5608,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
5535 | return load_idx; | 5608 | return load_idx; |
5536 | } | 5609 | } |
5537 | 5610 | ||
5538 | static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | 5611 | static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) |
5539 | { | 5612 | { |
5540 | return SCHED_POWER_SCALE; | 5613 | return SCHED_CAPACITY_SCALE; |
5541 | } | 5614 | } |
5542 | 5615 | ||
5543 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | 5616 | unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) |
5544 | { | 5617 | { |
5545 | return default_scale_freq_power(sd, cpu); | 5618 | return default_scale_capacity(sd, cpu); |
5546 | } | 5619 | } |
5547 | 5620 | ||
5548 | static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | 5621 | static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) |
5549 | { | 5622 | { |
5550 | unsigned long weight = sd->span_weight; | 5623 | unsigned long weight = sd->span_weight; |
5551 | unsigned long smt_gain = sd->smt_gain; | 5624 | unsigned long smt_gain = sd->smt_gain; |
@@ -5555,15 +5628,16 @@ static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | |||
5555 | return smt_gain; | 5628 | return smt_gain; |
5556 | } | 5629 | } |
5557 | 5630 | ||
5558 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | 5631 | unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) |
5559 | { | 5632 | { |
5560 | return default_scale_smt_power(sd, cpu); | 5633 | return default_scale_smt_capacity(sd, cpu); |
5561 | } | 5634 | } |
5562 | 5635 | ||
5563 | static unsigned long scale_rt_power(int cpu) | 5636 | static unsigned long scale_rt_capacity(int cpu) |
5564 | { | 5637 | { |
5565 | struct rq *rq = cpu_rq(cpu); | 5638 | struct rq *rq = cpu_rq(cpu); |
5566 | u64 total, available, age_stamp, avg; | 5639 | u64 total, available, age_stamp, avg; |
5640 | s64 delta; | ||
5567 | 5641 | ||
5568 | /* | 5642 | /* |
5569 | * Since we're reading these variables without serialization make sure | 5643 | * Since we're reading these variables without serialization make sure |
@@ -5572,74 +5646,78 @@ static unsigned long scale_rt_power(int cpu) | |||
5572 | age_stamp = ACCESS_ONCE(rq->age_stamp); | 5646 | age_stamp = ACCESS_ONCE(rq->age_stamp); |
5573 | avg = ACCESS_ONCE(rq->rt_avg); | 5647 | avg = ACCESS_ONCE(rq->rt_avg); |
5574 | 5648 | ||
5575 | total = sched_avg_period() + (rq_clock(rq) - age_stamp); | 5649 | delta = rq_clock(rq) - age_stamp; |
5650 | if (unlikely(delta < 0)) | ||
5651 | delta = 0; | ||
5652 | |||
5653 | total = sched_avg_period() + delta; | ||
5576 | 5654 | ||
5577 | if (unlikely(total < avg)) { | 5655 | if (unlikely(total < avg)) { |
5578 | /* Ensures that power won't end up being negative */ | 5656 | /* Ensures that capacity won't end up being negative */ |
5579 | available = 0; | 5657 | available = 0; |
5580 | } else { | 5658 | } else { |
5581 | available = total - avg; | 5659 | available = total - avg; |
5582 | } | 5660 | } |
5583 | 5661 | ||
5584 | if (unlikely((s64)total < SCHED_POWER_SCALE)) | 5662 | if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) |
5585 | total = SCHED_POWER_SCALE; | 5663 | total = SCHED_CAPACITY_SCALE; |
5586 | 5664 | ||
5587 | total >>= SCHED_POWER_SHIFT; | 5665 | total >>= SCHED_CAPACITY_SHIFT; |
5588 | 5666 | ||
5589 | return div_u64(available, total); | 5667 | return div_u64(available, total); |
5590 | } | 5668 | } |
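
The new delta clamp guards against a per-CPU clock that momentarily reads behind age_stamp; the rest of the function still returns the fraction of the averaging period left over after RT/IRQ time, scaled to SCHED_CAPACITY_SCALE. A worked userspace version of that ratio with arbitrary nanosecond samples:

#include <stdio.h>
#include <stdint.h>

#define SCALE 1024ULL           /* stands in for SCHED_CAPACITY_SCALE */

static uint64_t scale_rt(uint64_t period, int64_t delta, uint64_t rt_avg)
{
        uint64_t total, available;

        if (delta < 0)
                delta = 0;              /* unsynchronised clocks: never go backwards */
        total = period + (uint64_t)delta;
        available = total < rt_avg ? 0 : total - rt_avg;
        if (total < SCALE)
                total = SCALE;
        /* the kernel does total >>= SCHED_CAPACITY_SHIFT before dividing */
        return available / (total >> 10);
}

int main(void)
{
        /* 1 ms period, clock 0.2 ms past age_stamp, 0.3 ms spent in RT/IRQ work */
        printf("fraction left for CFS: %llu/1024\n",
               (unsigned long long)scale_rt(1000000, 200000, 300000));
        /* same, but the per-CPU clock briefly reads behind age_stamp */
        printf("with negative delta:   %llu/1024\n",
               (unsigned long long)scale_rt(1000000, -50000, 300000));
        return 0;
}
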
5591 | 5669 | ||
5592 | static void update_cpu_power(struct sched_domain *sd, int cpu) | 5670 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) |
5593 | { | 5671 | { |
5594 | unsigned long weight = sd->span_weight; | 5672 | unsigned long weight = sd->span_weight; |
5595 | unsigned long power = SCHED_POWER_SCALE; | 5673 | unsigned long capacity = SCHED_CAPACITY_SCALE; |
5596 | struct sched_group *sdg = sd->groups; | 5674 | struct sched_group *sdg = sd->groups; |
5597 | 5675 | ||
5598 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | 5676 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { |
5599 | if (sched_feat(ARCH_POWER)) | 5677 | if (sched_feat(ARCH_CAPACITY)) |
5600 | power *= arch_scale_smt_power(sd, cpu); | 5678 | capacity *= arch_scale_smt_capacity(sd, cpu); |
5601 | else | 5679 | else |
5602 | power *= default_scale_smt_power(sd, cpu); | 5680 | capacity *= default_scale_smt_capacity(sd, cpu); |
5603 | 5681 | ||
5604 | power >>= SCHED_POWER_SHIFT; | 5682 | capacity >>= SCHED_CAPACITY_SHIFT; |
5605 | } | 5683 | } |
5606 | 5684 | ||
5607 | sdg->sgp->power_orig = power; | 5685 | sdg->sgc->capacity_orig = capacity; |
5608 | 5686 | ||
5609 | if (sched_feat(ARCH_POWER)) | 5687 | if (sched_feat(ARCH_CAPACITY)) |
5610 | power *= arch_scale_freq_power(sd, cpu); | 5688 | capacity *= arch_scale_freq_capacity(sd, cpu); |
5611 | else | 5689 | else |
5612 | power *= default_scale_freq_power(sd, cpu); | 5690 | capacity *= default_scale_capacity(sd, cpu); |
5613 | 5691 | ||
5614 | power >>= SCHED_POWER_SHIFT; | 5692 | capacity >>= SCHED_CAPACITY_SHIFT; |
5615 | 5693 | ||
5616 | power *= scale_rt_power(cpu); | 5694 | capacity *= scale_rt_capacity(cpu); |
5617 | power >>= SCHED_POWER_SHIFT; | 5695 | capacity >>= SCHED_CAPACITY_SHIFT; |
5618 | 5696 | ||
5619 | if (!power) | 5697 | if (!capacity) |
5620 | power = 1; | 5698 | capacity = 1; |
5621 | 5699 | ||
5622 | cpu_rq(cpu)->cpu_power = power; | 5700 | cpu_rq(cpu)->cpu_capacity = capacity; |
5623 | sdg->sgp->power = power; | 5701 | sdg->sgc->capacity = capacity; |
5624 | } | 5702 | } |
5625 | 5703 | ||
5626 | void update_group_power(struct sched_domain *sd, int cpu) | 5704 | void update_group_capacity(struct sched_domain *sd, int cpu) |
5627 | { | 5705 | { |
5628 | struct sched_domain *child = sd->child; | 5706 | struct sched_domain *child = sd->child; |
5629 | struct sched_group *group, *sdg = sd->groups; | 5707 | struct sched_group *group, *sdg = sd->groups; |
5630 | unsigned long power, power_orig; | 5708 | unsigned long capacity, capacity_orig; |
5631 | unsigned long interval; | 5709 | unsigned long interval; |
5632 | 5710 | ||
5633 | interval = msecs_to_jiffies(sd->balance_interval); | 5711 | interval = msecs_to_jiffies(sd->balance_interval); |
5634 | interval = clamp(interval, 1UL, max_load_balance_interval); | 5712 | interval = clamp(interval, 1UL, max_load_balance_interval); |
5635 | sdg->sgp->next_update = jiffies + interval; | 5713 | sdg->sgc->next_update = jiffies + interval; |
5636 | 5714 | ||
5637 | if (!child) { | 5715 | if (!child) { |
5638 | update_cpu_power(sd, cpu); | 5716 | update_cpu_capacity(sd, cpu); |
5639 | return; | 5717 | return; |
5640 | } | 5718 | } |
5641 | 5719 | ||
5642 | power_orig = power = 0; | 5720 | capacity_orig = capacity = 0; |
5643 | 5721 | ||
5644 | if (child->flags & SD_OVERLAP) { | 5722 | if (child->flags & SD_OVERLAP) { |
5645 | /* | 5723 | /* |
@@ -5648,31 +5726,31 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
5648 | */ | 5726 | */ |
5649 | 5727 | ||
5650 | for_each_cpu(cpu, sched_group_cpus(sdg)) { | 5728 | for_each_cpu(cpu, sched_group_cpus(sdg)) { |
5651 | struct sched_group_power *sgp; | 5729 | struct sched_group_capacity *sgc; |
5652 | struct rq *rq = cpu_rq(cpu); | 5730 | struct rq *rq = cpu_rq(cpu); |
5653 | 5731 | ||
5654 | /* | 5732 | /* |
5655 | * build_sched_domains() -> init_sched_groups_power() | 5733 | * build_sched_domains() -> init_sched_groups_capacity() |
5656 | * gets here before we've attached the domains to the | 5734 | * gets here before we've attached the domains to the |
5657 | * runqueues. | 5735 | * runqueues. |
5658 | * | 5736 | * |
5659 | * Use power_of(), which is set irrespective of domains | 5737 | * Use capacity_of(), which is set irrespective of domains |
5660 | * in update_cpu_power(). | 5738 | * in update_cpu_capacity(). |
5661 | * | 5739 | * |
5662 | * This avoids power/power_orig from being 0 and | 5740 | * This avoids capacity/capacity_orig from being 0 and |
5663 | * causing divide-by-zero issues on boot. | 5741 | * causing divide-by-zero issues on boot. |
5664 | * | 5742 | * |
5665 | * Runtime updates will correct power_orig. | 5743 | * Runtime updates will correct capacity_orig. |
5666 | */ | 5744 | */ |
5667 | if (unlikely(!rq->sd)) { | 5745 | if (unlikely(!rq->sd)) { |
5668 | power_orig += power_of(cpu); | 5746 | capacity_orig += capacity_of(cpu); |
5669 | power += power_of(cpu); | 5747 | capacity += capacity_of(cpu); |
5670 | continue; | 5748 | continue; |
5671 | } | 5749 | } |
5672 | 5750 | ||
5673 | sgp = rq->sd->groups->sgp; | 5751 | sgc = rq->sd->groups->sgc; |
5674 | power_orig += sgp->power_orig; | 5752 | capacity_orig += sgc->capacity_orig; |
5675 | power += sgp->power; | 5753 | capacity += sgc->capacity; |
5676 | } | 5754 | } |
5677 | } else { | 5755 | } else { |
5678 | /* | 5756 | /* |
@@ -5682,14 +5760,14 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
5682 | 5760 | ||
5683 | group = child->groups; | 5761 | group = child->groups; |
5684 | do { | 5762 | do { |
5685 | power_orig += group->sgp->power_orig; | 5763 | capacity_orig += group->sgc->capacity_orig; |
5686 | power += group->sgp->power; | 5764 | capacity += group->sgc->capacity; |
5687 | group = group->next; | 5765 | group = group->next; |
5688 | } while (group != child->groups); | 5766 | } while (group != child->groups); |
5689 | } | 5767 | } |
5690 | 5768 | ||
5691 | sdg->sgp->power_orig = power_orig; | 5769 | sdg->sgc->capacity_orig = capacity_orig; |
5692 | sdg->sgp->power = power; | 5770 | sdg->sgc->capacity = capacity; |
5693 | } | 5771 | } |
5694 | 5772 | ||
5695 | /* | 5773 | /* |
@@ -5703,15 +5781,15 @@ static inline int | |||
5703 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | 5781 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) |
5704 | { | 5782 | { |
5705 | /* | 5783 | /* |
5706 | * Only siblings can have significantly less than SCHED_POWER_SCALE | 5784 | * Only siblings can have significantly less than SCHED_CAPACITY_SCALE |
5707 | */ | 5785 | */ |
5708 | if (!(sd->flags & SD_SHARE_CPUPOWER)) | 5786 | if (!(sd->flags & SD_SHARE_CPUCAPACITY)) |
5709 | return 0; | 5787 | return 0; |
5710 | 5788 | ||
5711 | /* | 5789 | /* |
5712 | * If ~90% of the cpu_power is still there, we're good. | 5790 | * If ~90% of the cpu_capacity is still there, we're good. |
5713 | */ | 5791 | */ |
5714 | if (group->sgp->power * 32 > group->sgp->power_orig * 29) | 5792 | if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) |
5715 | return 1; | 5793 | return 1; |
5716 | 5794 | ||
5717 | return 0; | 5795 | return 0; |
@@ -5748,34 +5826,35 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
5748 | 5826 | ||
5749 | static inline int sg_imbalanced(struct sched_group *group) | 5827 | static inline int sg_imbalanced(struct sched_group *group) |
5750 | { | 5828 | { |
5751 | return group->sgp->imbalance; | 5829 | return group->sgc->imbalance; |
5752 | } | 5830 | } |
5753 | 5831 | ||
5754 | /* | 5832 | /* |
5755 | * Compute the group capacity. | 5833 | * Compute the group capacity factor. |
5756 | * | 5834 | * |
5757 | * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by | 5835 | * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by |
5758 | * first dividing out the smt factor and computing the actual number of cores | 5836 | * first dividing out the smt factor and computing the actual number of cores |
5759 | * and limit power unit capacity with that. | 5837 | * and limit unit capacity with that. |
5760 | */ | 5838 | */ |
5761 | static inline int sg_capacity(struct lb_env *env, struct sched_group *group) | 5839 | static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) |
5762 | { | 5840 | { |
5763 | unsigned int capacity, smt, cpus; | 5841 | unsigned int capacity_factor, smt, cpus; |
5764 | unsigned int power, power_orig; | 5842 | unsigned int capacity, capacity_orig; |
5765 | 5843 | ||
5766 | power = group->sgp->power; | 5844 | capacity = group->sgc->capacity; |
5767 | power_orig = group->sgp->power_orig; | 5845 | capacity_orig = group->sgc->capacity_orig; |
5768 | cpus = group->group_weight; | 5846 | cpus = group->group_weight; |
5769 | 5847 | ||
5770 | /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ | 5848 | /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ |
5771 | smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); | 5849 | smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); |
5772 | capacity = cpus / smt; /* cores */ | 5850 | capacity_factor = cpus / smt; /* cores */ |
5773 | 5851 | ||
5774 | capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); | 5852 | capacity_factor = min_t(unsigned, |
5775 | if (!capacity) | 5853 | capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); |
5776 | capacity = fix_small_capacity(env->sd, group); | 5854 | if (!capacity_factor) |
5855 | capacity_factor = fix_small_capacity(env->sd, group); | ||
5777 | 5856 | ||
5778 | return capacity; | 5857 | return capacity_factor; |
5779 | } | 5858 | } |
5780 | 5859 | ||
5781 | /** | 5860 | /** |
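
A worked example of the capacity-factor computation above: an SMT-2 sibling pair whose threads each report a bit over half a full CPU must count as one core, not two "phantom" cores. The figures are illustrative, not measured values:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024U
#define DIV_ROUND_UP(n, d)       (((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d)  (((n) + (d) / 2) / (d))

int main(void)
{
        unsigned int cpus = 2, capacity_orig = 1178, capacity = 1100;
        unsigned int smt, factor;

        smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
        factor = cpus / smt;                            /* real cores */
        if (factor > DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE))
                factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);

        printf("smt=%u -> capacity_factor=%u (not %u 'phantom' cores)\n",
               smt, factor, cpus);
        return 0;
}
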
@@ -5815,9 +5894,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
5815 | sgs->idle_cpus++; | 5894 | sgs->idle_cpus++; |
5816 | } | 5895 | } |
5817 | 5896 | ||
5818 | /* Adjust by relative CPU power of the group */ | 5897 | /* Adjust by relative CPU capacity of the group */ |
5819 | sgs->group_power = group->sgp->power; | 5898 | sgs->group_capacity = group->sgc->capacity; |
5820 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; | 5899 | sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; |
5821 | 5900 | ||
5822 | if (sgs->sum_nr_running) | 5901 | if (sgs->sum_nr_running) |
5823 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 5902 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
@@ -5825,10 +5904,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
5825 | sgs->group_weight = group->group_weight; | 5904 | sgs->group_weight = group->group_weight; |
5826 | 5905 | ||
5827 | sgs->group_imb = sg_imbalanced(group); | 5906 | sgs->group_imb = sg_imbalanced(group); |
5828 | sgs->group_capacity = sg_capacity(env, group); | 5907 | sgs->group_capacity_factor = sg_capacity_factor(env, group); |
5829 | 5908 | ||
5830 | if (sgs->group_capacity > sgs->sum_nr_running) | 5909 | if (sgs->group_capacity_factor > sgs->sum_nr_running) |
5831 | sgs->group_has_capacity = 1; | 5910 | sgs->group_has_free_capacity = 1; |
5832 | } | 5911 | } |
5833 | 5912 | ||
5834 | /** | 5913 | /** |
@@ -5852,7 +5931,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
5852 | if (sgs->avg_load <= sds->busiest_stat.avg_load) | 5931 | if (sgs->avg_load <= sds->busiest_stat.avg_load) |
5853 | return false; | 5932 | return false; |
5854 | 5933 | ||
5855 | if (sgs->sum_nr_running > sgs->group_capacity) | 5934 | if (sgs->sum_nr_running > sgs->group_capacity_factor) |
5856 | return true; | 5935 | return true; |
5857 | 5936 | ||
5858 | if (sgs->group_imb) | 5937 | if (sgs->group_imb) |
@@ -5932,8 +6011,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
5932 | sgs = &sds->local_stat; | 6011 | sgs = &sds->local_stat; |
5933 | 6012 | ||
5934 | if (env->idle != CPU_NEWLY_IDLE || | 6013 | if (env->idle != CPU_NEWLY_IDLE || |
5935 | time_after_eq(jiffies, sg->sgp->next_update)) | 6014 | time_after_eq(jiffies, sg->sgc->next_update)) |
5936 | update_group_power(env->sd, env->dst_cpu); | 6015 | update_group_capacity(env->sd, env->dst_cpu); |
5937 | } | 6016 | } |
5938 | 6017 | ||
5939 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); | 6018 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); |
@@ -5943,17 +6022,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
5943 | 6022 | ||
5944 | /* | 6023 | /* |
5945 | * In case the child domain prefers tasks go to siblings | 6024 | * In case the child domain prefers tasks go to siblings |
5946 | * first, lower the sg capacity to one so that we'll try | 6025 | * first, lower the sg capacity factor to one so that we'll try |
5947 | * and move all the excess tasks away. We lower the capacity | 6026 | * and move all the excess tasks away. We lower the capacity |
5948 | * of a group only if the local group has the capacity to fit | 6027 | * of a group only if the local group has the capacity to fit |
5949 | * these excess tasks, i.e. nr_running < group_capacity. The | 6028 | * these excess tasks, i.e. nr_running < group_capacity_factor. The |
5950 | * extra check prevents the case where you always pull from the | 6029 | * extra check prevents the case where you always pull from the |
5951 | * heaviest group when it is already under-utilized (possible | 6030 | * heaviest group when it is already under-utilized (possible |
5952 | * with a large weight task outweighs the tasks on the system). | 6031 | * with a large weight task outweighs the tasks on the system). |
5953 | */ | 6032 | */ |
5954 | if (prefer_sibling && sds->local && | 6033 | if (prefer_sibling && sds->local && |
5955 | sds->local_stat.group_has_capacity) | 6034 | sds->local_stat.group_has_free_capacity) |
5956 | sgs->group_capacity = min(sgs->group_capacity, 1U); | 6035 | sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); |
5957 | 6036 | ||
5958 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { | 6037 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
5959 | sds->busiest = sg; | 6038 | sds->busiest = sg; |
@@ -5963,7 +6042,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
5963 | next_group: | 6042 | next_group: |
5964 | /* Now, start updating sd_lb_stats */ | 6043 | /* Now, start updating sd_lb_stats */ |
5965 | sds->total_load += sgs->group_load; | 6044 | sds->total_load += sgs->group_load; |
5966 | sds->total_pwr += sgs->group_power; | 6045 | sds->total_capacity += sgs->group_capacity; |
5967 | 6046 | ||
5968 | sg = sg->next; | 6047 | sg = sg->next; |
5969 | } while (sg != env->sd->groups); | 6048 | } while (sg != env->sd->groups); |
@@ -6010,8 +6089,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | |||
6010 | return 0; | 6089 | return 0; |
6011 | 6090 | ||
6012 | env->imbalance = DIV_ROUND_CLOSEST( | 6091 | env->imbalance = DIV_ROUND_CLOSEST( |
6013 | sds->busiest_stat.avg_load * sds->busiest_stat.group_power, | 6092 | sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, |
6014 | SCHED_POWER_SCALE); | 6093 | SCHED_CAPACITY_SCALE); |
6015 | 6094 | ||
6016 | return 1; | 6095 | return 1; |
6017 | } | 6096 | } |
@@ -6026,7 +6105,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | |||
6026 | static inline | 6105 | static inline |
6027 | void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | 6106 | void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
6028 | { | 6107 | { |
6029 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | 6108 | unsigned long tmp, capa_now = 0, capa_move = 0; |
6030 | unsigned int imbn = 2; | 6109 | unsigned int imbn = 2; |
6031 | unsigned long scaled_busy_load_per_task; | 6110 | unsigned long scaled_busy_load_per_task; |
6032 | struct sg_lb_stats *local, *busiest; | 6111 | struct sg_lb_stats *local, *busiest; |
@@ -6040,8 +6119,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
6040 | imbn = 1; | 6119 | imbn = 1; |
6041 | 6120 | ||
6042 | scaled_busy_load_per_task = | 6121 | scaled_busy_load_per_task = |
6043 | (busiest->load_per_task * SCHED_POWER_SCALE) / | 6122 | (busiest->load_per_task * SCHED_CAPACITY_SCALE) / |
6044 | busiest->group_power; | 6123 | busiest->group_capacity; |
6045 | 6124 | ||
6046 | if (busiest->avg_load + scaled_busy_load_per_task >= | 6125 | if (busiest->avg_load + scaled_busy_load_per_task >= |
6047 | local->avg_load + (scaled_busy_load_per_task * imbn)) { | 6126 | local->avg_load + (scaled_busy_load_per_task * imbn)) { |
@@ -6051,38 +6130,38 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
6051 | 6130 | ||
6052 | /* | 6131 | /* |
6053 | * OK, we don't have enough imbalance to justify moving tasks, | 6132 | * OK, we don't have enough imbalance to justify moving tasks, |
6054 | * however we may be able to increase total CPU power used by | 6133 | * however we may be able to increase total CPU capacity used by |
6055 | * moving them. | 6134 | * moving them. |
6056 | */ | 6135 | */ |
6057 | 6136 | ||
6058 | pwr_now += busiest->group_power * | 6137 | capa_now += busiest->group_capacity * |
6059 | min(busiest->load_per_task, busiest->avg_load); | 6138 | min(busiest->load_per_task, busiest->avg_load); |
6060 | pwr_now += local->group_power * | 6139 | capa_now += local->group_capacity * |
6061 | min(local->load_per_task, local->avg_load); | 6140 | min(local->load_per_task, local->avg_load); |
6062 | pwr_now /= SCHED_POWER_SCALE; | 6141 | capa_now /= SCHED_CAPACITY_SCALE; |
6063 | 6142 | ||
6064 | /* Amount of load we'd subtract */ | 6143 | /* Amount of load we'd subtract */ |
6065 | if (busiest->avg_load > scaled_busy_load_per_task) { | 6144 | if (busiest->avg_load > scaled_busy_load_per_task) { |
6066 | pwr_move += busiest->group_power * | 6145 | capa_move += busiest->group_capacity * |
6067 | min(busiest->load_per_task, | 6146 | min(busiest->load_per_task, |
6068 | busiest->avg_load - scaled_busy_load_per_task); | 6147 | busiest->avg_load - scaled_busy_load_per_task); |
6069 | } | 6148 | } |
6070 | 6149 | ||
6071 | /* Amount of load we'd add */ | 6150 | /* Amount of load we'd add */ |
6072 | if (busiest->avg_load * busiest->group_power < | 6151 | if (busiest->avg_load * busiest->group_capacity < |
6073 | busiest->load_per_task * SCHED_POWER_SCALE) { | 6152 | busiest->load_per_task * SCHED_CAPACITY_SCALE) { |
6074 | tmp = (busiest->avg_load * busiest->group_power) / | 6153 | tmp = (busiest->avg_load * busiest->group_capacity) / |
6075 | local->group_power; | 6154 | local->group_capacity; |
6076 | } else { | 6155 | } else { |
6077 | tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / | 6156 | tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) / |
6078 | local->group_power; | 6157 | local->group_capacity; |
6079 | } | 6158 | } |
6080 | pwr_move += local->group_power * | 6159 | capa_move += local->group_capacity * |
6081 | min(local->load_per_task, local->avg_load + tmp); | 6160 | min(local->load_per_task, local->avg_load + tmp); |
6082 | pwr_move /= SCHED_POWER_SCALE; | 6161 | capa_move /= SCHED_CAPACITY_SCALE; |
6083 | 6162 | ||
6084 | /* Move if we gain throughput */ | 6163 | /* Move if we gain throughput */ |
6085 | if (pwr_move > pwr_now) | 6164 | if (capa_move > capa_now) |
6086 | env->imbalance = busiest->load_per_task; | 6165 | env->imbalance = busiest->load_per_task; |
6087 | } | 6166 | } |
6088 | 6167 | ||
@@ -6112,7 +6191,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
6112 | /* | 6191 | /* |
6113 | * In the presence of smp nice balancing, certain scenarios can have | 6192 | * In the presence of smp nice balancing, certain scenarios can have |
6114 | * max load less than avg load(as we skip the groups at or below | 6193 | * max load less than avg load(as we skip the groups at or below |
6115 | * its cpu_power, while calculating max_load..) | 6194 | * its cpu_capacity, while calculating max_load..) |
6116 | */ | 6195 | */ |
6117 | if (busiest->avg_load <= sds->avg_load || | 6196 | if (busiest->avg_load <= sds->avg_load || |
6118 | local->avg_load >= sds->avg_load) { | 6197 | local->avg_load >= sds->avg_load) { |
@@ -6127,10 +6206,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
6127 | * have to drop below capacity to reach cpu-load equilibrium. | 6206 | * have to drop below capacity to reach cpu-load equilibrium. |
6128 | */ | 6207 | */ |
6129 | load_above_capacity = | 6208 | load_above_capacity = |
6130 | (busiest->sum_nr_running - busiest->group_capacity); | 6209 | (busiest->sum_nr_running - busiest->group_capacity_factor); |
6131 | 6210 | ||
6132 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); | 6211 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); |
6133 | load_above_capacity /= busiest->group_power; | 6212 | load_above_capacity /= busiest->group_capacity; |
6134 | } | 6213 | } |
6135 | 6214 | ||
6136 | /* | 6215 | /* |
@@ -6145,9 +6224,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
6145 | 6224 | ||
6146 | /* How much load to actually move to equalise the imbalance */ | 6225 | /* How much load to actually move to equalise the imbalance */ |
6147 | env->imbalance = min( | 6226 | env->imbalance = min( |
6148 | max_pull * busiest->group_power, | 6227 | max_pull * busiest->group_capacity, |
6149 | (sds->avg_load - local->avg_load) * local->group_power | 6228 | (sds->avg_load - local->avg_load) * local->group_capacity |
6150 | ) / SCHED_POWER_SCALE; | 6229 | ) / SCHED_CAPACITY_SCALE; |
6151 | 6230 | ||
6152 | /* | 6231 | /* |
6153 | * if *imbalance is less than the average load per runnable task | 6232 | * if *imbalance is less than the average load per runnable task |
@@ -6201,7 +6280,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
6201 | if (!sds.busiest || busiest->sum_nr_running == 0) | 6280 | if (!sds.busiest || busiest->sum_nr_running == 0) |
6202 | goto out_balanced; | 6281 | goto out_balanced; |
6203 | 6282 | ||
6204 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; | 6283 | sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) |
6284 | / sds.total_capacity; | ||
6205 | 6285 | ||
6206 | /* | 6286 | /* |
6207 | * If the busiest group is imbalanced the below checks don't | 6287 | * If the busiest group is imbalanced the below checks don't |
@@ -6212,8 +6292,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
6212 | goto force_balance; | 6292 | goto force_balance; |
6213 | 6293 | ||
6214 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 6294 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
6215 | if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && | 6295 | if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && |
6216 | !busiest->group_has_capacity) | 6296 | !busiest->group_has_free_capacity) |
6217 | goto force_balance; | 6297 | goto force_balance; |
6218 | 6298 | ||
6219 | /* | 6299 | /* |
@@ -6267,11 +6347,11 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
6267 | struct sched_group *group) | 6347 | struct sched_group *group) |
6268 | { | 6348 | { |
6269 | struct rq *busiest = NULL, *rq; | 6349 | struct rq *busiest = NULL, *rq; |
6270 | unsigned long busiest_load = 0, busiest_power = 1; | 6350 | unsigned long busiest_load = 0, busiest_capacity = 1; |
6271 | int i; | 6351 | int i; |
6272 | 6352 | ||
6273 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 6353 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
6274 | unsigned long power, capacity, wl; | 6354 | unsigned long capacity, capacity_factor, wl; |
6275 | enum fbq_type rt; | 6355 | enum fbq_type rt; |
6276 | 6356 | ||
6277 | rq = cpu_rq(i); | 6357 | rq = cpu_rq(i); |
@@ -6299,34 +6379,34 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
6299 | if (rt > env->fbq_type) | 6379 | if (rt > env->fbq_type) |
6300 | continue; | 6380 | continue; |
6301 | 6381 | ||
6302 | power = power_of(i); | 6382 | capacity = capacity_of(i); |
6303 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); | 6383 | capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); |
6304 | if (!capacity) | 6384 | if (!capacity_factor) |
6305 | capacity = fix_small_capacity(env->sd, group); | 6385 | capacity_factor = fix_small_capacity(env->sd, group); |
6306 | 6386 | ||
6307 | wl = weighted_cpuload(i); | 6387 | wl = weighted_cpuload(i); |
6308 | 6388 | ||
6309 | /* | 6389 | /* |
6310 | * When comparing with imbalance, use weighted_cpuload() | 6390 | * When comparing with imbalance, use weighted_cpuload() |
6311 | * which is not scaled with the cpu power. | 6391 | * which is not scaled with the cpu capacity. |
6312 | */ | 6392 | */ |
6313 | if (capacity && rq->nr_running == 1 && wl > env->imbalance) | 6393 | if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) |
6314 | continue; | 6394 | continue; |
6315 | 6395 | ||
6316 | /* | 6396 | /* |
6317 | * For the load comparisons with the other cpu's, consider | 6397 | * For the load comparisons with the other cpu's, consider |
6318 | * the weighted_cpuload() scaled with the cpu power, so that | 6398 | * the weighted_cpuload() scaled with the cpu capacity, so |
6319 | * the load can be moved away from the cpu that is potentially | 6399 | * that the load can be moved away from the cpu that is |
6320 | * running at a lower capacity. | 6400 | * potentially running at a lower capacity. |
6321 | * | 6401 | * |
6322 | * Thus we're looking for max(wl_i / power_i), crosswise | 6402 | * Thus we're looking for max(wl_i / capacity_i), crosswise |
6323 | * multiplication to rid ourselves of the division works out | 6403 | * multiplication to rid ourselves of the division works out |
6324 | * to: wl_i * power_j > wl_j * power_i; where j is our | 6404 | * to: wl_i * capacity_j > wl_j * capacity_i; where j is |
6325 | * previous maximum. | 6405 | * our previous maximum. |
6326 | */ | 6406 | */ |
6327 | if (wl * busiest_power > busiest_load * power) { | 6407 | if (wl * busiest_capacity > busiest_load * capacity) { |
6328 | busiest_load = wl; | 6408 | busiest_load = wl; |
6329 | busiest_power = power; | 6409 | busiest_capacity = capacity; |
6330 | busiest = rq; | 6410 | busiest = rq; |
6331 | } | 6411 | } |
6332 | } | 6412 | } |
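
The comparison in the loop above keeps the old cross-multiplication trick: wl_i/capacity_i > wl_busiest/capacity_busiest is evaluated as wl_i * capacity_busiest > wl_busiest * capacity_i, avoiding an integer division (and its rounding) per CPU. A small standalone check with made-up loads and capacities:

#include <stdio.h>

int main(void)
{
        unsigned long busiest_load = 0, busiest_capacity = 1;
        unsigned long wl[]       = { 900, 1200, 800 };
        unsigned long capacity[] = { 1024, 1024, 512 };

        for (int i = 0; i < 3; i++) {
                if (wl[i] * busiest_capacity > busiest_load * capacity[i]) {
                        busiest_load = wl[i];
                        busiest_capacity = capacity[i];
                        printf("cpu%d becomes busiest (wl=%lu, cap=%lu)\n",
                               i, wl[i], capacity[i]);
                }
        }
        return 0;
}

Note how cpu2 wins despite the lowest raw load: 800/512 is a higher per-capacity load than 1200/1024.
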
@@ -6534,7 +6614,7 @@ more_balance: | |||
6534 | * We failed to reach balance because of affinity. | 6614 | * We failed to reach balance because of affinity. |
6535 | */ | 6615 | */ |
6536 | if (sd_parent) { | 6616 | if (sd_parent) { |
6537 | int *group_imbalance = &sd_parent->groups->sgp->imbalance; | 6617 | int *group_imbalance = &sd_parent->groups->sgc->imbalance; |
6538 | 6618 | ||
6539 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { | 6619 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { |
6540 | *group_imbalance = 1; | 6620 | *group_imbalance = 1; |
@@ -6640,27 +6720,62 @@ out: | |||
6640 | return ld_moved; | 6720 | return ld_moved; |
6641 | } | 6721 | } |
6642 | 6722 | ||
6723 | static inline unsigned long | ||
6724 | get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) | ||
6725 | { | ||
6726 | unsigned long interval = sd->balance_interval; | ||
6727 | |||
6728 | if (cpu_busy) | ||
6729 | interval *= sd->busy_factor; | ||
6730 | |||
6731 | /* scale ms to jiffies */ | ||
6732 | interval = msecs_to_jiffies(interval); | ||
6733 | interval = clamp(interval, 1UL, max_load_balance_interval); | ||
6734 | |||
6735 | return interval; | ||
6736 | } | ||
6737 | |||
6738 | static inline void | ||
6739 | update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) | ||
6740 | { | ||
6741 | unsigned long interval, next; | ||
6742 | |||
6743 | interval = get_sd_balance_interval(sd, cpu_busy); | ||
6744 | next = sd->last_balance + interval; | ||
6745 | |||
6746 | if (time_after(*next_balance, next)) | ||
6747 | *next_balance = next; | ||
6748 | } | ||
6749 | |||
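
The two helpers just added centralise the next_balance bookkeeping: a busy CPU stretches the domain's balance interval by busy_factor before the ms-to-jiffies conversion and the clamp. A worked userspace example; HZ, the 8 ms interval, the busy factor and the clamp limit are assumed sample values, not the kernel defaults:

#include <stdio.h>

#define HZ 250UL
#define MAX_LB_INTERVAL (HZ / 10)       /* stand-in for max_load_balance_interval */

static unsigned long msecs_to_jiffies(unsigned long ms) { return ms * HZ / 1000; }

static unsigned long balance_interval(unsigned long interval_ms, int busy_factor,
                                      int cpu_busy)
{
        unsigned long interval = interval_ms;

        if (cpu_busy)
                interval *= busy_factor;
        interval = msecs_to_jiffies(interval);
        if (interval < 1)
                interval = 1;
        if (interval > MAX_LB_INTERVAL)
                interval = MAX_LB_INTERVAL;
        return interval;
}

int main(void)
{
        printf("idle cpu: %lu jiffies\n", balance_interval(8, 32, 0));
        printf("busy cpu: %lu jiffies\n", balance_interval(8, 32, 1));
        return 0;
}
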
6643 | /* | 6750 | /* |
6644 | * idle_balance is called by schedule() if this_cpu is about to become | 6751 | * idle_balance is called by schedule() if this_cpu is about to become |
6645 | * idle. Attempts to pull tasks from other CPUs. | 6752 | * idle. Attempts to pull tasks from other CPUs. |
6646 | */ | 6753 | */ |
6647 | static int idle_balance(struct rq *this_rq) | 6754 | static int idle_balance(struct rq *this_rq) |
6648 | { | 6755 | { |
6756 | unsigned long next_balance = jiffies + HZ; | ||
6757 | int this_cpu = this_rq->cpu; | ||
6649 | struct sched_domain *sd; | 6758 | struct sched_domain *sd; |
6650 | int pulled_task = 0; | 6759 | int pulled_task = 0; |
6651 | unsigned long next_balance = jiffies + HZ; | ||
6652 | u64 curr_cost = 0; | 6760 | u64 curr_cost = 0; |
6653 | int this_cpu = this_rq->cpu; | ||
6654 | 6761 | ||
6655 | idle_enter_fair(this_rq); | 6762 | idle_enter_fair(this_rq); |
6763 | |||
6656 | /* | 6764 | /* |
6657 | * We must set idle_stamp _before_ calling idle_balance(), such that we | 6765 | * We must set idle_stamp _before_ calling idle_balance(), such that we |
6658 | * measure the duration of idle_balance() as idle time. | 6766 | * measure the duration of idle_balance() as idle time. |
6659 | */ | 6767 | */ |
6660 | this_rq->idle_stamp = rq_clock(this_rq); | 6768 | this_rq->idle_stamp = rq_clock(this_rq); |
6661 | 6769 | ||
6662 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | 6770 | if (this_rq->avg_idle < sysctl_sched_migration_cost) { |
6771 | rcu_read_lock(); | ||
6772 | sd = rcu_dereference_check_sched_domain(this_rq->sd); | ||
6773 | if (sd) | ||
6774 | update_next_balance(sd, 0, &next_balance); | ||
6775 | rcu_read_unlock(); | ||
6776 | |||
6663 | goto out; | 6777 | goto out; |
6778 | } | ||
6664 | 6779 | ||
6665 | /* | 6780 | /* |
6666 | * Drop the rq->lock, but keep IRQ/preempt disabled. | 6781 | * Drop the rq->lock, but keep IRQ/preempt disabled. |
@@ -6670,20 +6785,20 @@ static int idle_balance(struct rq *this_rq) | |||
6670 | update_blocked_averages(this_cpu); | 6785 | update_blocked_averages(this_cpu); |
6671 | rcu_read_lock(); | 6786 | rcu_read_lock(); |
6672 | for_each_domain(this_cpu, sd) { | 6787 | for_each_domain(this_cpu, sd) { |
6673 | unsigned long interval; | ||
6674 | int continue_balancing = 1; | 6788 | int continue_balancing = 1; |
6675 | u64 t0, domain_cost; | 6789 | u64 t0, domain_cost; |
6676 | 6790 | ||
6677 | if (!(sd->flags & SD_LOAD_BALANCE)) | 6791 | if (!(sd->flags & SD_LOAD_BALANCE)) |
6678 | continue; | 6792 | continue; |
6679 | 6793 | ||
6680 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) | 6794 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { |
6795 | update_next_balance(sd, 0, &next_balance); | ||
6681 | break; | 6796 | break; |
6797 | } | ||
6682 | 6798 | ||
6683 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 6799 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
6684 | t0 = sched_clock_cpu(this_cpu); | 6800 | t0 = sched_clock_cpu(this_cpu); |
6685 | 6801 | ||
6686 | /* If we've pulled tasks over stop searching: */ | ||
6687 | pulled_task = load_balance(this_cpu, this_rq, | 6802 | pulled_task = load_balance(this_cpu, this_rq, |
6688 | sd, CPU_NEWLY_IDLE, | 6803 | sd, CPU_NEWLY_IDLE, |
6689 | &continue_balancing); | 6804 | &continue_balancing); |
@@ -6695,42 +6810,37 @@ static int idle_balance(struct rq *this_rq) | |||
6695 | curr_cost += domain_cost; | 6810 | curr_cost += domain_cost; |
6696 | } | 6811 | } |
6697 | 6812 | ||
6698 | interval = msecs_to_jiffies(sd->balance_interval); | 6813 | update_next_balance(sd, 0, &next_balance); |
6699 | if (time_after(next_balance, sd->last_balance + interval)) | 6814 | |
6700 | next_balance = sd->last_balance + interval; | 6815 | /* |
6701 | if (pulled_task) | 6816 | * Stop searching for tasks to pull if there are |
6817 | * now runnable tasks on this rq. | ||
6818 | */ | ||
6819 | if (pulled_task || this_rq->nr_running > 0) | ||
6702 | break; | 6820 | break; |
6703 | } | 6821 | } |
6704 | rcu_read_unlock(); | 6822 | rcu_read_unlock(); |
6705 | 6823 | ||
6706 | raw_spin_lock(&this_rq->lock); | 6824 | raw_spin_lock(&this_rq->lock); |
6707 | 6825 | ||
6826 | if (curr_cost > this_rq->max_idle_balance_cost) | ||
6827 | this_rq->max_idle_balance_cost = curr_cost; | ||
6828 | |||
6708 | /* | 6829 | /* |
6709 | * While browsing the domains, we released the rq lock. | 6830 | * While browsing the domains, we released the rq lock, a task could |
6710 | * A task could have be enqueued in the meantime | 6831 | * have been enqueued in the meantime. Since we're not going idle, |
6832 | * pretend we pulled a task. | ||
6711 | */ | 6833 | */ |
6712 | if (this_rq->cfs.h_nr_running && !pulled_task) { | 6834 | if (this_rq->cfs.h_nr_running && !pulled_task) |
6713 | pulled_task = 1; | 6835 | pulled_task = 1; |
6714 | goto out; | ||
6715 | } | ||
6716 | 6836 | ||
6717 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { | 6837 | out: |
6718 | /* | 6838 | /* Move the next balance forward */ |
6719 | * We are going idle. next_balance may be set based on | 6839 | if (time_after(this_rq->next_balance, next_balance)) |
6720 | * a busy processor. So reset next_balance. | ||
6721 | */ | ||
6722 | this_rq->next_balance = next_balance; | 6840 | this_rq->next_balance = next_balance; |
6723 | } | ||
6724 | 6841 | ||
6725 | if (curr_cost > this_rq->max_idle_balance_cost) | ||
6726 | this_rq->max_idle_balance_cost = curr_cost; | ||
6727 | |||
6728 | out: | ||
6729 | /* Is there a task of a high priority class? */ | 6842 | /* Is there a task of a high priority class? */ |
6730 | if (this_rq->nr_running != this_rq->cfs.h_nr_running && | 6843 | if (this_rq->nr_running != this_rq->cfs.h_nr_running) |
6731 | ((this_rq->stop && this_rq->stop->on_rq) || | ||
6732 | this_rq->dl.dl_nr_running || | ||
6733 | (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) | ||
6734 | pulled_task = -1; | 6844 | pulled_task = -1; |
6735 | 6845 | ||
6736 | if (pulled_task) { | 6846 | if (pulled_task) { |
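The tail of idle_balance() shown here encodes a three-way result: 0 means nothing was pulled and the cpu can go idle, a positive value means fair tasks are available, and -1 signals that a task of a higher-priority class became runnable while the lock was dropped, so the caller should pick again. A hedged stand-alone sketch of that decision; the struct below is an illustrative stand-in, not the kernel's struct rq.

/* Illustrative runqueue snapshot; not the kernel's struct rq. */
struct rq_snapshot {
	unsigned int nr_running;       /* all runnable tasks        */
	unsigned int cfs_h_nr_running; /* runnable fair-class tasks */
};

/*
 * Mirrors the decision shown at the end of idle_balance():
 *  > 0  fair tasks were pulled (or appeared) - run CFS
 *  < 0  a higher-class task is runnable      - caller must re-pick
 *    0  nothing to do                        - go idle
 */
int idle_balance_result(const struct rq_snapshot *rq, int pulled_task)
{
	if (rq->cfs_h_nr_running && !pulled_task)
		pulled_task = 1;   /* a fair task was enqueued while unlocked */

	if (rq->nr_running != rq->cfs_h_nr_running)
		pulled_task = -1;  /* a non-fair task is waiting */

	return pulled_task;
}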
@@ -6891,7 +7001,7 @@ static inline void set_cpu_sd_state_busy(void) | |||
6891 | goto unlock; | 7001 | goto unlock; |
6892 | sd->nohz_idle = 0; | 7002 | sd->nohz_idle = 0; |
6893 | 7003 | ||
6894 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); | 7004 | atomic_inc(&sd->groups->sgc->nr_busy_cpus); |
6895 | unlock: | 7005 | unlock: |
6896 | rcu_read_unlock(); | 7006 | rcu_read_unlock(); |
6897 | } | 7007 | } |
@@ -6908,7 +7018,7 @@ void set_cpu_sd_state_idle(void) | |||
6908 | goto unlock; | 7018 | goto unlock; |
6909 | sd->nohz_idle = 1; | 7019 | sd->nohz_idle = 1; |
6910 | 7020 | ||
6911 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); | 7021 | atomic_dec(&sd->groups->sgc->nr_busy_cpus); |
6912 | unlock: | 7022 | unlock: |
6913 | rcu_read_unlock(); | 7023 | rcu_read_unlock(); |
6914 | } | 7024 | } |
@@ -7011,16 +7121,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) | |||
7011 | break; | 7121 | break; |
7012 | } | 7122 | } |
7013 | 7123 | ||
7014 | interval = sd->balance_interval; | 7124 | interval = get_sd_balance_interval(sd, idle != CPU_IDLE); |
7015 | if (idle != CPU_IDLE) | ||
7016 | interval *= sd->busy_factor; | ||
7017 | |||
7018 | /* scale ms to jiffies */ | ||
7019 | interval = msecs_to_jiffies(interval); | ||
7020 | interval = clamp(interval, 1UL, max_load_balance_interval); | ||
7021 | 7125 | ||
7022 | need_serialize = sd->flags & SD_SERIALIZE; | 7126 | need_serialize = sd->flags & SD_SERIALIZE; |
7023 | |||
7024 | if (need_serialize) { | 7127 | if (need_serialize) { |
7025 | if (!spin_trylock(&balancing)) | 7128 | if (!spin_trylock(&balancing)) |
7026 | goto out; | 7129 | goto out; |
@@ -7036,6 +7139,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) | |||
7036 | idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; | 7139 | idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; |
7037 | } | 7140 | } |
7038 | sd->last_balance = jiffies; | 7141 | sd->last_balance = jiffies; |
7142 | interval = get_sd_balance_interval(sd, idle != CPU_IDLE); | ||
7039 | } | 7143 | } |
7040 | if (need_serialize) | 7144 | if (need_serialize) |
7041 | spin_unlock(&balancing); | 7145 | spin_unlock(&balancing); |
@@ -7093,12 +7197,17 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | |||
7093 | 7197 | ||
7094 | rq = cpu_rq(balance_cpu); | 7198 | rq = cpu_rq(balance_cpu); |
7095 | 7199 | ||
7096 | raw_spin_lock_irq(&rq->lock); | 7200 | /* |
7097 | update_rq_clock(rq); | 7201 | * If time for next balance is due, |
7098 | update_idle_cpu_load(rq); | 7202 | * do the balance. |
7099 | raw_spin_unlock_irq(&rq->lock); | 7203 | */ |
7100 | 7204 | if (time_after_eq(jiffies, rq->next_balance)) { | |
7101 | rebalance_domains(rq, CPU_IDLE); | 7205 | raw_spin_lock_irq(&rq->lock); |
7206 | update_rq_clock(rq); | ||
7207 | update_idle_cpu_load(rq); | ||
7208 | raw_spin_unlock_irq(&rq->lock); | ||
7209 | rebalance_domains(rq, CPU_IDLE); | ||
7210 | } | ||
7102 | 7211 | ||
7103 | if (time_after(this_rq->next_balance, rq->next_balance)) | 7212 | if (time_after(this_rq->next_balance, rq->next_balance)) |
7104 | this_rq->next_balance = rq->next_balance; | 7213 | this_rq->next_balance = rq->next_balance; |
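The time_after_eq() test that gates the rebalance above has to survive the jiffies counter wrapping around. A small self-contained sketch of the signed-difference trick used by those comparisons, with an unsigned long standing in for jiffies:

#include <stdio.h>
#include <limits.h>

/* Wraparound-safe time comparisons: subtract and look at the sign. */
static int after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;    /* a is strictly later than b */
}

static int after_eq(unsigned long a, unsigned long b)
{
	return (long)(a - b) >= 0;   /* a is at or after b */
}

int main(void)
{
	unsigned long now = ULONG_MAX - 1;      /* just before wraparound   */
	unsigned long next_balance = now + 10;  /* wraps past zero          */

	/* Not due yet, even though next_balance < now numerically. */
	printf("due now?   %d\n", after_eq(now, next_balance));
	/* Due once enough ticks have passed. */
	printf("due later? %d\n", after_eq(now + 20, next_balance));
	printf("strictly after? %d\n", after(now + 20, next_balance));
	return 0;
}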
@@ -7113,7 +7222,7 @@ end: | |||
7113 | * of an idle cpu is the system. | 7222 | * of an idle cpu is the system. |
7114 | * - This rq has more than one task. | 7223 | * - This rq has more than one task. |
7115 | * - At any scheduler domain level, this cpu's scheduler group has multiple | 7224 | * - At any scheduler domain level, this cpu's scheduler group has multiple |
7116 | * busy cpu's exceeding the group's power. | 7225 | * busy cpu's exceeding the group's capacity. |
7117 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 7226 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler |
7118 | * domain span are idle. | 7227 | * domain span are idle. |
7119 | */ | 7228 | */ |
@@ -7121,7 +7230,7 @@ static inline int nohz_kick_needed(struct rq *rq) | |||
7121 | { | 7230 | { |
7122 | unsigned long now = jiffies; | 7231 | unsigned long now = jiffies; |
7123 | struct sched_domain *sd; | 7232 | struct sched_domain *sd; |
7124 | struct sched_group_power *sgp; | 7233 | struct sched_group_capacity *sgc; |
7125 | int nr_busy, cpu = rq->cpu; | 7234 | int nr_busy, cpu = rq->cpu; |
7126 | 7235 | ||
7127 | if (unlikely(rq->idle_balance)) | 7236 | if (unlikely(rq->idle_balance)) |
@@ -7151,8 +7260,8 @@ static inline int nohz_kick_needed(struct rq *rq) | |||
7151 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 7260 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
7152 | 7261 | ||
7153 | if (sd) { | 7262 | if (sd) { |
7154 | sgp = sd->groups->sgp; | 7263 | sgc = sd->groups->sgc; |
7155 | nr_busy = atomic_read(&sgp->nr_busy_cpus); | 7264 | nr_busy = atomic_read(&sgc->nr_busy_cpus); |
7156 | 7265 | ||
7157 | if (nr_busy > 1) | 7266 | if (nr_busy > 1) |
7158 | goto need_kick_unlock; | 7267 | goto need_kick_unlock; |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 5716929a2e3a..90284d117fe6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -37,18 +37,18 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) | |||
37 | SCHED_FEAT(WAKEUP_PREEMPTION, true) | 37 | SCHED_FEAT(WAKEUP_PREEMPTION, true) |
38 | 38 | ||
39 | /* | 39 | /* |
40 | * Use arch dependent cpu power functions | 40 | * Use arch dependent cpu capacity functions |
41 | */ | 41 | */ |
42 | SCHED_FEAT(ARCH_POWER, true) | 42 | SCHED_FEAT(ARCH_CAPACITY, true) |
43 | 43 | ||
44 | SCHED_FEAT(HRTICK, false) | 44 | SCHED_FEAT(HRTICK, false) |
45 | SCHED_FEAT(DOUBLE_TICK, false) | 45 | SCHED_FEAT(DOUBLE_TICK, false) |
46 | SCHED_FEAT(LB_BIAS, true) | 46 | SCHED_FEAT(LB_BIAS, true) |
47 | 47 | ||
48 | /* | 48 | /* |
49 | * Decrement CPU power based on time not spent running tasks | 49 | * Decrement CPU capacity based on time not spent running tasks |
50 | */ | 50 | */ |
51 | SCHED_FEAT(NONTASK_POWER, true) | 51 | SCHED_FEAT(NONTASK_CAPACITY, true) |
52 | 52 | ||
53 | /* | 53 | /* |
54 | * Queue remote wakeups on the target CPU and process them | 54 | * Queue remote wakeups on the target CPU and process them |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8f4390a079c7..cf009fb0bc25 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -12,6 +12,8 @@ | |||
12 | 12 | ||
13 | #include <trace/events/power.h> | 13 | #include <trace/events/power.h> |
14 | 14 | ||
15 | #include "sched.h" | ||
16 | |||
15 | static int __read_mostly cpu_idle_force_poll; | 17 | static int __read_mostly cpu_idle_force_poll; |
16 | 18 | ||
17 | void cpu_idle_poll_ctrl(bool enable) | 19 | void cpu_idle_poll_ctrl(bool enable) |
@@ -67,24 +69,25 @@ void __weak arch_cpu_idle(void) | |||
67 | * cpuidle_idle_call - the main idle function | 69 | * cpuidle_idle_call - the main idle function |
68 | * | 70 | * |
69 | * NOTE: no locks or semaphores should be used here | 71 | * NOTE: no locks or semaphores should be used here |
70 | * return non-zero on failure | 72 | * |
73 | * On archs that support TIF_POLLING_NRFLAG, this is called with polling | ||

74 | * set, and it returns with polling set. If it ever stops polling, it | ||
75 | * must clear the polling bit. | ||
71 | */ | 76 | */ |
72 | static int cpuidle_idle_call(void) | 77 | static void cpuidle_idle_call(void) |
73 | { | 78 | { |
74 | struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); | 79 | struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); |
75 | struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); | 80 | struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); |
76 | int next_state, entered_state, ret; | 81 | int next_state, entered_state; |
77 | bool broadcast; | 82 | bool broadcast; |
78 | 83 | ||
79 | /* | 84 | /* |
80 | * Check if the idle task must be rescheduled. If it is the | 85 | * Check if the idle task must be rescheduled. If it is the |
81 | * case, exit the function after re-enabling the local irq and | 86 | * case, exit the function after re-enabling the local irq. |
82 | * set again the polling flag | ||
83 | */ | 87 | */ |
84 | if (current_clr_polling_and_test()) { | 88 | if (need_resched()) { |
85 | local_irq_enable(); | 89 | local_irq_enable(); |
86 | __current_set_polling(); | 90 | return; |
87 | return 0; | ||
88 | } | 91 | } |
89 | 92 | ||
90 | /* | 93 | /* |
@@ -101,104 +104,99 @@ static int cpuidle_idle_call(void) | |||
101 | rcu_idle_enter(); | 104 | rcu_idle_enter(); |
102 | 105 | ||
103 | /* | 106 | /* |
104 | * Check if the cpuidle framework is ready, otherwise fallback | 107 | * Ask the cpuidle framework to choose a convenient idle state. |
105 | * to the default arch specific idle method | 108 | * Fall back to the default arch idle method on errors. |
106 | */ | 109 | */ |
107 | ret = cpuidle_enabled(drv, dev); | 110 | next_state = cpuidle_select(drv, dev); |
108 | 111 | if (next_state < 0) { | |
109 | if (!ret) { | 112 | use_default: |
110 | /* | 113 | /* |
111 | * Ask the governor to choose an idle state it thinks | 114 | * We can't use the cpuidle framework, let's use the default |
112 | * it is convenient to go to. There is *always* a | 115 | * idle routine. |
113 | * convenient idle state | ||
114 | */ | 116 | */ |
115 | next_state = cpuidle_select(drv, dev); | 117 | if (current_clr_polling_and_test()) |
116 | |||
117 | /* | ||
118 | * The idle task must be scheduled, it is pointless to | ||
119 | * go to idle, just update no idle residency and get | ||
120 | * out of this function | ||
121 | */ | ||
122 | if (current_clr_polling_and_test()) { | ||
123 | dev->last_residency = 0; | ||
124 | entered_state = next_state; | ||
125 | local_irq_enable(); | 118 | local_irq_enable(); |
126 | } else { | 119 | else |
127 | broadcast = !!(drv->states[next_state].flags & | 120 | arch_cpu_idle(); |
128 | CPUIDLE_FLAG_TIMER_STOP); | 121 | |
129 | 122 | goto exit_idle; | |
130 | if (broadcast) | ||
131 | /* | ||
132 | * Tell the time framework to switch | ||
133 | * to a broadcast timer because our | ||
134 | * local timer will be shutdown. If a | ||
135 | * local timer is used from another | ||
136 | * cpu as a broadcast timer, this call | ||
137 | * may fail if it is not available | ||
138 | */ | ||
139 | ret = clockevents_notify( | ||
140 | CLOCK_EVT_NOTIFY_BROADCAST_ENTER, | ||
141 | &dev->cpu); | ||
142 | |||
143 | if (!ret) { | ||
144 | trace_cpu_idle_rcuidle(next_state, dev->cpu); | ||
145 | |||
146 | /* | ||
147 | * Enter the idle state previously | ||
148 | * returned by the governor | ||
149 | * decision. This function will block | ||
150 | * until an interrupt occurs and will | ||
151 | * take care of re-enabling the local | ||
152 | * interrupts | ||
153 | */ | ||
154 | entered_state = cpuidle_enter(drv, dev, | ||
155 | next_state); | ||
156 | |||
157 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, | ||
158 | dev->cpu); | ||
159 | |||
160 | if (broadcast) | ||
161 | clockevents_notify( | ||
162 | CLOCK_EVT_NOTIFY_BROADCAST_EXIT, | ||
163 | &dev->cpu); | ||
164 | |||
165 | /* | ||
166 | * Give the governor an opportunity to reflect on the | ||
167 | * outcome | ||
168 | */ | ||
169 | cpuidle_reflect(dev, entered_state); | ||
170 | } | ||
171 | } | ||
172 | } | 123 | } |
173 | 124 | ||
125 | |||
126 | /* | ||
127 | * The idle task must be scheduled, it is pointless to | ||
128 | * go to idle, just update no idle residency and get | ||
129 | * out of this function | ||
130 | */ | ||
131 | if (current_clr_polling_and_test()) { | ||
132 | dev->last_residency = 0; | ||
133 | entered_state = next_state; | ||
134 | local_irq_enable(); | ||
135 | goto exit_idle; | ||
136 | } | ||
137 | |||
138 | broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); | ||
139 | |||
174 | /* | 140 | /* |
175 | * We can't use the cpuidle framework, let's use the default | 141 | * Tell the time framework to switch to a broadcast timer |
176 | * idle routine | 142 | * because our local timer will be shutdown. If a local timer |
143 | * is used from another cpu as a broadcast timer, this call may | ||
144 | * fail if it is not available | ||
177 | */ | 145 | */ |
178 | if (ret) | 146 | if (broadcast && |
179 | arch_cpu_idle(); | 147 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) |
148 | goto use_default; | ||
180 | 149 | ||
150 | trace_cpu_idle_rcuidle(next_state, dev->cpu); | ||
151 | |||
152 | /* | ||
153 | * Enter the idle state previously returned by the governor decision. | ||
154 | * This function will block until an interrupt occurs and will take | ||
155 | * care of re-enabling the local interrupts | ||
156 | */ | ||
157 | entered_state = cpuidle_enter(drv, dev, next_state); | ||
158 | |||
159 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); | ||
160 | |||
161 | if (broadcast) | ||
162 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); | ||
163 | |||
164 | /* | ||
165 | * Give the governor an opportunity to reflect on the outcome | ||
166 | */ | ||
167 | cpuidle_reflect(dev, entered_state); | ||
168 | |||
169 | exit_idle: | ||
181 | __current_set_polling(); | 170 | __current_set_polling(); |
182 | 171 | ||
183 | /* | 172 | /* |
184 | * It is up to the idle functions to enable back the local | 173 | * It is up to the idle functions to reenable local interrupts |
185 | * interrupt | ||
186 | */ | 174 | */ |
187 | if (WARN_ON_ONCE(irqs_disabled())) | 175 | if (WARN_ON_ONCE(irqs_disabled())) |
188 | local_irq_enable(); | 176 | local_irq_enable(); |
189 | 177 | ||
190 | rcu_idle_exit(); | 178 | rcu_idle_exit(); |
191 | start_critical_timings(); | 179 | start_critical_timings(); |
192 | |||
193 | return 0; | ||
194 | } | 180 | } |
195 | 181 | ||
196 | /* | 182 | /* |
197 | * Generic idle loop implementation | 183 | * Generic idle loop implementation |
184 | * | ||
185 | * Called with polling cleared. | ||
198 | */ | 186 | */ |
199 | static void cpu_idle_loop(void) | 187 | static void cpu_idle_loop(void) |
200 | { | 188 | { |
201 | while (1) { | 189 | while (1) { |
190 | /* | ||
191 | * If the arch has a polling bit, we maintain an invariant: | ||
192 | * | ||
193 | * Our polling bit is clear if we're not scheduled (i.e. if | ||
194 | * rq->curr != rq->idle). This means that, if rq->idle has | ||
195 | * the polling bit set, then setting need_resched is | ||
196 | * guaranteed to cause the cpu to reschedule. | ||
197 | */ | ||
198 | |||
199 | __current_set_polling(); | ||
202 | tick_nohz_idle_enter(); | 200 | tick_nohz_idle_enter(); |
203 | 201 | ||
204 | while (!need_resched()) { | 202 | while (!need_resched()) { |
@@ -238,6 +236,17 @@ static void cpu_idle_loop(void) | |||
238 | */ | 236 | */ |
239 | preempt_set_need_resched(); | 237 | preempt_set_need_resched(); |
240 | tick_nohz_idle_exit(); | 238 | tick_nohz_idle_exit(); |
239 | __current_clr_polling(); | ||
240 | |||
241 | /* | ||
242 | * We promise to call sched_ttwu_pending and reschedule | ||
243 | * if need_resched is set while polling is set. That | ||
244 | * means that clearing polling needs to be visible | ||
245 | * before doing these things. | ||
246 | */ | ||
247 | smp_mb__after_atomic(); | ||
248 | |||
249 | sched_ttwu_pending(); | ||
241 | schedule_preempt_disabled(); | 250 | schedule_preempt_disabled(); |
242 | } | 251 | } |
243 | } | 252 | } |
@@ -259,7 +268,6 @@ void cpu_startup_entry(enum cpuhp_state state) | |||
259 | */ | 268 | */ |
260 | boot_init_stack_canary(); | 269 | boot_init_stack_canary(); |
261 | #endif | 270 | #endif |
262 | __current_set_polling(); | ||
263 | arch_cpu_idle_prepare(); | 271 | arch_cpu_idle_prepare(); |
264 | cpu_idle_loop(); | 272 | cpu_idle_loop(); |
265 | } | 273 | } |
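The rewritten cpuidle_idle_call() reduces to a short control flow: pick a state, fall back to the architecture's default idle on any failure, switch to a broadcast timer when the chosen state stops the local one, enter the state, then let the governor reflect. A compressed sketch of that flow with stubbed helpers; the stubs only model success or failure, not the real cpuidle or clockevents interfaces.

#include <stdbool.h>

/* Stubs: stand-ins for the real framework calls, success/failure only. */
static int  cpuidle_select(void)           { return 1; }  /* < 0 on error  */
static bool state_stops_local_timer(int s) { return s > 0; }
static int  enable_broadcast_timer(void)   { return 0; }  /* != 0 on error */
static void disable_broadcast_timer(void)  { }
static int  cpuidle_enter(int s)           { return s; }
static void cpuidle_reflect(int entered)   { (void)entered; }
static bool need_resched(void)             { return false; }
static void arch_default_idle(void)        { }

void idle_call_sketch(void)
{
	int next_state = cpuidle_select();

	if (next_state < 0)
		goto use_default;        /* framework unusable */

	if (need_resched())
		return;                  /* a task became runnable, bail out */

	if (state_stops_local_timer(next_state) && enable_broadcast_timer())
		goto use_default;        /* no broadcast timer available */

	{
		int entered = cpuidle_enter(next_state);

		if (state_stops_local_timer(next_state))
			disable_broadcast_timer();

		cpuidle_reflect(entered);
	}
	return;

use_default:
	arch_default_idle();
}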
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index bd2267ad404f..a49083192c64 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
79 | rt_rq->overloaded = 0; | 79 | rt_rq->overloaded = 0; |
80 | plist_head_init(&rt_rq->pushable_tasks); | 80 | plist_head_init(&rt_rq->pushable_tasks); |
81 | #endif | 81 | #endif |
82 | /* We start in dequeued state, because no RT tasks are queued */ | ||
83 | rt_rq->rt_queued = 0; | ||
82 | 84 | ||
83 | rt_rq->rt_time = 0; | 85 | rt_rq->rt_time = 0; |
84 | rt_rq->rt_throttled = 0; | 86 | rt_rq->rt_throttled = 0; |
@@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | |||
112 | return rt_se->rt_rq; | 114 | return rt_se->rt_rq; |
113 | } | 115 | } |
114 | 116 | ||
117 | static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) | ||
118 | { | ||
119 | struct rt_rq *rt_rq = rt_se->rt_rq; | ||
120 | |||
121 | return rt_rq->rq; | ||
122 | } | ||
123 | |||
115 | void free_rt_sched_group(struct task_group *tg) | 124 | void free_rt_sched_group(struct task_group *tg) |
116 | { | 125 | { |
117 | int i; | 126 | int i; |
@@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) | |||
211 | return container_of(rt_rq, struct rq, rt); | 220 | return container_of(rt_rq, struct rq, rt); |
212 | } | 221 | } |
213 | 222 | ||
214 | static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | 223 | static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) |
215 | { | 224 | { |
216 | struct task_struct *p = rt_task_of(rt_se); | 225 | struct task_struct *p = rt_task_of(rt_se); |
217 | struct rq *rq = task_rq(p); | 226 | |
227 | return task_rq(p); | ||
228 | } | ||
229 | |||
230 | static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | ||
231 | { | ||
232 | struct rq *rq = rq_of_rt_se(rt_se); | ||
218 | 233 | ||
219 | return &rq->rt; | 234 | return &rq->rt; |
220 | } | 235 | } |
@@ -391,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq) | |||
391 | } | 406 | } |
392 | #endif /* CONFIG_SMP */ | 407 | #endif /* CONFIG_SMP */ |
393 | 408 | ||
409 | static void enqueue_top_rt_rq(struct rt_rq *rt_rq); | ||
410 | static void dequeue_top_rt_rq(struct rt_rq *rt_rq); | ||
411 | |||
394 | static inline int on_rt_rq(struct sched_rt_entity *rt_se) | 412 | static inline int on_rt_rq(struct sched_rt_entity *rt_se) |
395 | { | 413 | { |
396 | return !list_empty(&rt_se->run_list); | 414 | return !list_empty(&rt_se->run_list); |
@@ -452,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | |||
452 | rt_se = rt_rq->tg->rt_se[cpu]; | 470 | rt_se = rt_rq->tg->rt_se[cpu]; |
453 | 471 | ||
454 | if (rt_rq->rt_nr_running) { | 472 | if (rt_rq->rt_nr_running) { |
455 | if (rt_se && !on_rt_rq(rt_se)) | 473 | if (!rt_se) |
474 | enqueue_top_rt_rq(rt_rq); | ||
475 | else if (!on_rt_rq(rt_se)) | ||
456 | enqueue_rt_entity(rt_se, false); | 476 | enqueue_rt_entity(rt_se, false); |
477 | |||
457 | if (rt_rq->highest_prio.curr < curr->prio) | 478 | if (rt_rq->highest_prio.curr < curr->prio) |
458 | resched_task(curr); | 479 | resched_task(curr); |
459 | } | 480 | } |
@@ -466,10 +487,17 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) | |||
466 | 487 | ||
467 | rt_se = rt_rq->tg->rt_se[cpu]; | 488 | rt_se = rt_rq->tg->rt_se[cpu]; |
468 | 489 | ||
469 | if (rt_se && on_rt_rq(rt_se)) | 490 | if (!rt_se) |
491 | dequeue_top_rt_rq(rt_rq); | ||
492 | else if (on_rt_rq(rt_se)) | ||
470 | dequeue_rt_entity(rt_se); | 493 | dequeue_rt_entity(rt_se); |
471 | } | 494 | } |
472 | 495 | ||
496 | static inline int rt_rq_throttled(struct rt_rq *rt_rq) | ||
497 | { | ||
498 | return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; | ||
499 | } | ||
500 | |||
473 | static int rt_se_boosted(struct sched_rt_entity *rt_se) | 501 | static int rt_se_boosted(struct sched_rt_entity *rt_se) |
474 | { | 502 | { |
475 | struct rt_rq *rt_rq = group_rt_rq(rt_se); | 503 | struct rt_rq *rt_rq = group_rt_rq(rt_se); |
@@ -532,12 +560,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) | |||
532 | 560 | ||
533 | static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | 561 | static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) |
534 | { | 562 | { |
535 | if (rt_rq->rt_nr_running) | 563 | struct rq *rq = rq_of_rt_rq(rt_rq); |
536 | resched_task(rq_of_rt_rq(rt_rq)->curr); | 564 | |
565 | if (!rt_rq->rt_nr_running) | ||
566 | return; | ||
567 | |||
568 | enqueue_top_rt_rq(rt_rq); | ||
569 | resched_task(rq->curr); | ||
537 | } | 570 | } |
538 | 571 | ||
539 | static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) | 572 | static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) |
540 | { | 573 | { |
574 | dequeue_top_rt_rq(rt_rq); | ||
575 | } | ||
576 | |||
577 | static inline int rt_rq_throttled(struct rt_rq *rt_rq) | ||
578 | { | ||
579 | return rt_rq->rt_throttled; | ||
541 | } | 580 | } |
542 | 581 | ||
543 | static inline const struct cpumask *sched_rt_period_mask(void) | 582 | static inline const struct cpumask *sched_rt_period_mask(void) |
@@ -851,14 +890,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
851 | * but accrue some time due to boosting. | 890 | * but accrue some time due to boosting. |
852 | */ | 891 | */ |
853 | if (likely(rt_b->rt_runtime)) { | 892 | if (likely(rt_b->rt_runtime)) { |
854 | static bool once = false; | ||
855 | |||
856 | rt_rq->rt_throttled = 1; | 893 | rt_rq->rt_throttled = 1; |
857 | 894 | printk_deferred_once("sched: RT throttling activated\n"); | |
858 | if (!once) { | ||
859 | once = true; | ||
860 | printk_sched("sched: RT throttling activated\n"); | ||
861 | } | ||
862 | } else { | 895 | } else { |
863 | /* | 896 | /* |
864 | * In case we did anyway, make it go away, | 897 | * In case we did anyway, make it go away, |
@@ -885,7 +918,6 @@ static void update_curr_rt(struct rq *rq) | |||
885 | { | 918 | { |
886 | struct task_struct *curr = rq->curr; | 919 | struct task_struct *curr = rq->curr; |
887 | struct sched_rt_entity *rt_se = &curr->rt; | 920 | struct sched_rt_entity *rt_se = &curr->rt; |
888 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | ||
889 | u64 delta_exec; | 921 | u64 delta_exec; |
890 | 922 | ||
891 | if (curr->sched_class != &rt_sched_class) | 923 | if (curr->sched_class != &rt_sched_class) |
@@ -910,7 +942,7 @@ static void update_curr_rt(struct rq *rq) | |||
910 | return; | 942 | return; |
911 | 943 | ||
912 | for_each_sched_rt_entity(rt_se) { | 944 | for_each_sched_rt_entity(rt_se) { |
913 | rt_rq = rt_rq_of_se(rt_se); | 945 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); |
914 | 946 | ||
915 | if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { | 947 | if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { |
916 | raw_spin_lock(&rt_rq->rt_runtime_lock); | 948 | raw_spin_lock(&rt_rq->rt_runtime_lock); |
@@ -922,6 +954,38 @@ static void update_curr_rt(struct rq *rq) | |||
922 | } | 954 | } |
923 | } | 955 | } |
924 | 956 | ||
957 | static void | ||
958 | dequeue_top_rt_rq(struct rt_rq *rt_rq) | ||
959 | { | ||
960 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
961 | |||
962 | BUG_ON(&rq->rt != rt_rq); | ||
963 | |||
964 | if (!rt_rq->rt_queued) | ||
965 | return; | ||
966 | |||
967 | BUG_ON(!rq->nr_running); | ||
968 | |||
969 | sub_nr_running(rq, rt_rq->rt_nr_running); | ||
970 | rt_rq->rt_queued = 0; | ||
971 | } | ||
972 | |||
973 | static void | ||
974 | enqueue_top_rt_rq(struct rt_rq *rt_rq) | ||
975 | { | ||
976 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
977 | |||
978 | BUG_ON(&rq->rt != rt_rq); | ||
979 | |||
980 | if (rt_rq->rt_queued) | ||
981 | return; | ||
982 | if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) | ||
983 | return; | ||
984 | |||
985 | add_nr_running(rq, rt_rq->rt_nr_running); | ||
986 | rt_rq->rt_queued = 1; | ||
987 | } | ||
988 | |||
925 | #if defined CONFIG_SMP | 989 | #if defined CONFIG_SMP |
926 | 990 | ||
927 | static void | 991 | static void |
@@ -1045,12 +1109,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} | |||
1045 | #endif /* CONFIG_RT_GROUP_SCHED */ | 1109 | #endif /* CONFIG_RT_GROUP_SCHED */ |
1046 | 1110 | ||
1047 | static inline | 1111 | static inline |
1112 | unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) | ||
1113 | { | ||
1114 | struct rt_rq *group_rq = group_rt_rq(rt_se); | ||
1115 | |||
1116 | if (group_rq) | ||
1117 | return group_rq->rt_nr_running; | ||
1118 | else | ||
1119 | return 1; | ||
1120 | } | ||
1121 | |||
1122 | static inline | ||
1048 | void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | 1123 | void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) |
1049 | { | 1124 | { |
1050 | int prio = rt_se_prio(rt_se); | 1125 | int prio = rt_se_prio(rt_se); |
1051 | 1126 | ||
1052 | WARN_ON(!rt_prio(prio)); | 1127 | WARN_ON(!rt_prio(prio)); |
1053 | rt_rq->rt_nr_running++; | 1128 | rt_rq->rt_nr_running += rt_se_nr_running(rt_se); |
1054 | 1129 | ||
1055 | inc_rt_prio(rt_rq, prio); | 1130 | inc_rt_prio(rt_rq, prio); |
1056 | inc_rt_migration(rt_se, rt_rq); | 1131 | inc_rt_migration(rt_se, rt_rq); |
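The new rt_se_nr_running() captures the group case: a plain task entity counts as one runnable task, while a group entity contributes everything queued below it, so the top-level rt_nr_running handed to add_nr_running() stays accurate. A minimal sketch of that rule with simplified stand-in structures, not the kernel's own types:

/* Simplified stand-ins; only the fields the counting rule needs. */
struct rt_rq_lite {
	unsigned int rt_nr_running;  /* tasks queued below this group */
};

struct rt_entity_lite {
	struct rt_rq_lite *group_rq; /* NULL for a plain task entity  */
};

/* A task entity counts as 1; a group entity counts as its whole subtree. */
unsigned int rt_se_nr_running_lite(const struct rt_entity_lite *se)
{
	return se->group_rq ? se->group_rq->rt_nr_running : 1;
}

/*
 * Enqueueing an entity bumps the parent's count by that amount, mirroring
 * rt_rq->rt_nr_running += rt_se_nr_running(rt_se) in the hunk above.
 */
void inc_rt_tasks_lite(struct rt_rq_lite *parent, const struct rt_entity_lite *se)
{
	parent->rt_nr_running += rt_se_nr_running_lite(se);
}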
@@ -1062,7 +1137,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
1062 | { | 1137 | { |
1063 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); | 1138 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); |
1064 | WARN_ON(!rt_rq->rt_nr_running); | 1139 | WARN_ON(!rt_rq->rt_nr_running); |
1065 | rt_rq->rt_nr_running--; | 1140 | rt_rq->rt_nr_running -= rt_se_nr_running(rt_se); |
1066 | 1141 | ||
1067 | dec_rt_prio(rt_rq, rt_se_prio(rt_se)); | 1142 | dec_rt_prio(rt_rq, rt_se_prio(rt_se)); |
1068 | dec_rt_migration(rt_se, rt_rq); | 1143 | dec_rt_migration(rt_se, rt_rq); |
@@ -1119,6 +1194,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) | |||
1119 | back = rt_se; | 1194 | back = rt_se; |
1120 | } | 1195 | } |
1121 | 1196 | ||
1197 | dequeue_top_rt_rq(rt_rq_of_se(back)); | ||
1198 | |||
1122 | for (rt_se = back; rt_se; rt_se = rt_se->back) { | 1199 | for (rt_se = back; rt_se; rt_se = rt_se->back) { |
1123 | if (on_rt_rq(rt_se)) | 1200 | if (on_rt_rq(rt_se)) |
1124 | __dequeue_rt_entity(rt_se); | 1201 | __dequeue_rt_entity(rt_se); |
@@ -1127,13 +1204,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) | |||
1127 | 1204 | ||
1128 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) | 1205 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) |
1129 | { | 1206 | { |
1207 | struct rq *rq = rq_of_rt_se(rt_se); | ||
1208 | |||
1130 | dequeue_rt_stack(rt_se); | 1209 | dequeue_rt_stack(rt_se); |
1131 | for_each_sched_rt_entity(rt_se) | 1210 | for_each_sched_rt_entity(rt_se) |
1132 | __enqueue_rt_entity(rt_se, head); | 1211 | __enqueue_rt_entity(rt_se, head); |
1212 | enqueue_top_rt_rq(&rq->rt); | ||
1133 | } | 1213 | } |
1134 | 1214 | ||
1135 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | 1215 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se) |
1136 | { | 1216 | { |
1217 | struct rq *rq = rq_of_rt_se(rt_se); | ||
1218 | |||
1137 | dequeue_rt_stack(rt_se); | 1219 | dequeue_rt_stack(rt_se); |
1138 | 1220 | ||
1139 | for_each_sched_rt_entity(rt_se) { | 1221 | for_each_sched_rt_entity(rt_se) { |
@@ -1142,6 +1224,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
1142 | if (rt_rq && rt_rq->rt_nr_running) | 1224 | if (rt_rq && rt_rq->rt_nr_running) |
1143 | __enqueue_rt_entity(rt_se, false); | 1225 | __enqueue_rt_entity(rt_se, false); |
1144 | } | 1226 | } |
1227 | enqueue_top_rt_rq(&rq->rt); | ||
1145 | } | 1228 | } |
1146 | 1229 | ||
1147 | /* | 1230 | /* |
@@ -1159,8 +1242,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
1159 | 1242 | ||
1160 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) | 1243 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) |
1161 | enqueue_pushable_task(rq, p); | 1244 | enqueue_pushable_task(rq, p); |
1162 | |||
1163 | inc_nr_running(rq); | ||
1164 | } | 1245 | } |
1165 | 1246 | ||
1166 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | 1247 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) |
@@ -1171,8 +1252,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
1171 | dequeue_rt_entity(rt_se); | 1252 | dequeue_rt_entity(rt_se); |
1172 | 1253 | ||
1173 | dequeue_pushable_task(rq, p); | 1254 | dequeue_pushable_task(rq, p); |
1174 | |||
1175 | dec_nr_running(rq); | ||
1176 | } | 1255 | } |
1177 | 1256 | ||
1178 | /* | 1257 | /* |
@@ -1377,10 +1456,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) | |||
1377 | if (prev->sched_class == &rt_sched_class) | 1456 | if (prev->sched_class == &rt_sched_class) |
1378 | update_curr_rt(rq); | 1457 | update_curr_rt(rq); |
1379 | 1458 | ||
1380 | if (!rt_rq->rt_nr_running) | 1459 | if (!rt_rq->rt_queued) |
1381 | return NULL; | ||
1382 | |||
1383 | if (rt_rq_throttled(rt_rq)) | ||
1384 | return NULL; | 1460 | return NULL; |
1385 | 1461 | ||
1386 | put_prev_task(rq, prev); | 1462 | put_prev_task(rq, prev); |
@@ -1892,9 +1968,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
1892 | */ | 1968 | */ |
1893 | if (p->on_rq && rq->curr != p) { | 1969 | if (p->on_rq && rq->curr != p) { |
1894 | #ifdef CONFIG_SMP | 1970 | #ifdef CONFIG_SMP |
1895 | if (rq->rt.overloaded && push_rt_task(rq) && | 1971 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && |
1896 | /* Don't resched if we changed runqueues */ | 1972 | /* Don't resched if we changed runqueues */ |
1897 | rq != task_rq(p)) | 1973 | push_rt_task(rq) && rq != task_rq(p)) |
1898 | check_resched = 0; | 1974 | check_resched = 0; |
1899 | #endif /* CONFIG_SMP */ | 1975 | #endif /* CONFIG_SMP */ |
1900 | if (check_resched && p->prio < rq->curr->prio) | 1976 | if (check_resched && p->prio < rq->curr->prio) |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 456e492a3dca..31cc02ebc54e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -278,7 +278,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); | |||
278 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); | 278 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); |
279 | 279 | ||
280 | extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); | 280 | extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); |
281 | extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); | 281 | extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force); |
282 | extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); | 282 | extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); |
283 | 283 | ||
284 | extern void free_rt_sched_group(struct task_group *tg); | 284 | extern void free_rt_sched_group(struct task_group *tg); |
@@ -409,6 +409,8 @@ struct rt_rq { | |||
409 | int overloaded; | 409 | int overloaded; |
410 | struct plist_head pushable_tasks; | 410 | struct plist_head pushable_tasks; |
411 | #endif | 411 | #endif |
412 | int rt_queued; | ||
413 | |||
412 | int rt_throttled; | 414 | int rt_throttled; |
413 | u64 rt_time; | 415 | u64 rt_time; |
414 | u64 rt_runtime; | 416 | u64 rt_runtime; |
@@ -423,18 +425,6 @@ struct rt_rq { | |||
423 | #endif | 425 | #endif |
424 | }; | 426 | }; |
425 | 427 | ||
426 | #ifdef CONFIG_RT_GROUP_SCHED | ||
427 | static inline int rt_rq_throttled(struct rt_rq *rt_rq) | ||
428 | { | ||
429 | return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; | ||
430 | } | ||
431 | #else | ||
432 | static inline int rt_rq_throttled(struct rt_rq *rt_rq) | ||
433 | { | ||
434 | return rt_rq->rt_throttled; | ||
435 | } | ||
436 | #endif | ||
437 | |||
438 | /* Deadline class' related fields in a runqueue */ | 428 | /* Deadline class' related fields in a runqueue */ |
439 | struct dl_rq { | 429 | struct dl_rq { |
440 | /* runqueue is an rbtree, ordered by deadline */ | 430 | /* runqueue is an rbtree, ordered by deadline */ |
@@ -577,7 +567,7 @@ struct rq { | |||
577 | struct root_domain *rd; | 567 | struct root_domain *rd; |
578 | struct sched_domain *sd; | 568 | struct sched_domain *sd; |
579 | 569 | ||
580 | unsigned long cpu_power; | 570 | unsigned long cpu_capacity; |
581 | 571 | ||
582 | unsigned char idle_balance; | 572 | unsigned char idle_balance; |
583 | /* For active balancing */ | 573 | /* For active balancing */ |
@@ -680,6 +670,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *); | |||
680 | 670 | ||
681 | #ifdef CONFIG_SMP | 671 | #ifdef CONFIG_SMP |
682 | 672 | ||
673 | extern void sched_ttwu_pending(void); | ||
674 | |||
683 | #define rcu_dereference_check_sched_domain(p) \ | 675 | #define rcu_dereference_check_sched_domain(p) \ |
684 | rcu_dereference_check((p), \ | 676 | rcu_dereference_check((p), \ |
685 | lockdep_is_held(&sched_domains_mutex)) | 677 | lockdep_is_held(&sched_domains_mutex)) |
@@ -738,15 +730,15 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); | |||
738 | DECLARE_PER_CPU(struct sched_domain *, sd_busy); | 730 | DECLARE_PER_CPU(struct sched_domain *, sd_busy); |
739 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); | 731 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); |
740 | 732 | ||
741 | struct sched_group_power { | 733 | struct sched_group_capacity { |
742 | atomic_t ref; | 734 | atomic_t ref; |
743 | /* | 735 | /* |
744 | * CPU power of this group, SCHED_LOAD_SCALE being max power for a | 736 | * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity |
745 | * single CPU. | 737 | * for a single CPU. |
746 | */ | 738 | */ |
747 | unsigned int power, power_orig; | 739 | unsigned int capacity, capacity_orig; |
748 | unsigned long next_update; | 740 | unsigned long next_update; |
749 | int imbalance; /* XXX unrelated to power but shared group state */ | 741 | int imbalance; /* XXX unrelated to capacity but shared group state */ |
750 | /* | 742 | /* |
751 | * Number of busy cpus in this group. | 743 | * Number of busy cpus in this group. |
752 | */ | 744 | */ |
@@ -760,7 +752,7 @@ struct sched_group { | |||
760 | atomic_t ref; | 752 | atomic_t ref; |
761 | 753 | ||
762 | unsigned int group_weight; | 754 | unsigned int group_weight; |
763 | struct sched_group_power *sgp; | 755 | struct sched_group_capacity *sgc; |
764 | 756 | ||
765 | /* | 757 | /* |
766 | * The CPUs this group covers. | 758 | * The CPUs this group covers. |
@@ -783,7 +775,7 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) | |||
783 | */ | 775 | */ |
784 | static inline struct cpumask *sched_group_mask(struct sched_group *sg) | 776 | static inline struct cpumask *sched_group_mask(struct sched_group *sg) |
785 | { | 777 | { |
786 | return to_cpumask(sg->sgp->cpumask); | 778 | return to_cpumask(sg->sgc->cpumask); |
787 | } | 779 | } |
788 | 780 | ||
789 | /** | 781 | /** |
@@ -797,6 +789,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group) | |||
797 | 789 | ||
798 | extern int group_balance_cpu(struct sched_group *sg); | 790 | extern int group_balance_cpu(struct sched_group *sg); |
799 | 791 | ||
792 | #else | ||
793 | |||
794 | static inline void sched_ttwu_pending(void) { } | ||
795 | |||
800 | #endif /* CONFIG_SMP */ | 796 | #endif /* CONFIG_SMP */ |
801 | 797 | ||
802 | #include "stats.h" | 798 | #include "stats.h" |
@@ -1177,7 +1173,7 @@ extern const struct sched_class idle_sched_class; | |||
1177 | 1173 | ||
1178 | #ifdef CONFIG_SMP | 1174 | #ifdef CONFIG_SMP |
1179 | 1175 | ||
1180 | extern void update_group_power(struct sched_domain *sd, int cpu); | 1176 | extern void update_group_capacity(struct sched_domain *sd, int cpu); |
1181 | 1177 | ||
1182 | extern void trigger_load_balance(struct rq *rq); | 1178 | extern void trigger_load_balance(struct rq *rq); |
1183 | 1179 | ||
@@ -1216,12 +1212,14 @@ extern void update_idle_cpu_load(struct rq *this_rq); | |||
1216 | 1212 | ||
1217 | extern void init_task_runnable_average(struct task_struct *p); | 1213 | extern void init_task_runnable_average(struct task_struct *p); |
1218 | 1214 | ||
1219 | static inline void inc_nr_running(struct rq *rq) | 1215 | static inline void add_nr_running(struct rq *rq, unsigned count) |
1220 | { | 1216 | { |
1221 | rq->nr_running++; | 1217 | unsigned prev_nr = rq->nr_running; |
1218 | |||
1219 | rq->nr_running = prev_nr + count; | ||
1222 | 1220 | ||
1223 | #ifdef CONFIG_NO_HZ_FULL | 1221 | #ifdef CONFIG_NO_HZ_FULL |
1224 | if (rq->nr_running == 2) { | 1222 | if (prev_nr < 2 && rq->nr_running >= 2) { |
1225 | if (tick_nohz_full_cpu(rq->cpu)) { | 1223 | if (tick_nohz_full_cpu(rq->cpu)) { |
1226 | /* Order rq->nr_running write against the IPI */ | 1224 | /* Order rq->nr_running write against the IPI */ |
1227 | smp_wmb(); | 1225 | smp_wmb(); |
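Because RT enqueues can now add several tasks to rq->nr_running in one call, the NO_HZ_FULL check above looks for crossing the two-task threshold instead of landing exactly on it. A tiny runnable example of the difference:

#include <stdio.h>

/*
 * Returns 1 when the update moves the runqueue from "one or zero tasks"
 * to "two or more", which is when the tick must be kicked back on under
 * NO_HZ_FULL. Checking new_nr == 2 would miss a jump from 1 to 3.
 */
static int crosses_two(unsigned int prev_nr, unsigned int count)
{
	unsigned int new_nr = prev_nr + count;

	return prev_nr < 2 && new_nr >= 2;
}

int main(void)
{
	printf("%d\n", crosses_two(1, 1)); /* 1 -> 2: kick          */
	printf("%d\n", crosses_two(1, 3)); /* 1 -> 4: kick          */
	printf("%d\n", crosses_two(2, 1)); /* already ticking: no   */
	return 0;
}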
@@ -1231,9 +1229,9 @@ static inline void inc_nr_running(struct rq *rq) | |||
1231 | #endif | 1229 | #endif |
1232 | } | 1230 | } |
1233 | 1231 | ||
1234 | static inline void dec_nr_running(struct rq *rq) | 1232 | static inline void sub_nr_running(struct rq *rq, unsigned count) |
1235 | { | 1233 | { |
1236 | rq->nr_running--; | 1234 | rq->nr_running -= count; |
1237 | } | 1235 | } |
1238 | 1236 | ||
1239 | static inline void rq_last_tick_reset(struct rq *rq) | 1237 | static inline void rq_last_tick_reset(struct rq *rq) |
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index d6ce65dde541..bfe0edadbfbb 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
@@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev) | |||
41 | static void | 41 | static void |
42 | enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) | 42 | enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) |
43 | { | 43 | { |
44 | inc_nr_running(rq); | 44 | add_nr_running(rq, 1); |
45 | } | 45 | } |
46 | 46 | ||
47 | static void | 47 | static void |
48 | dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) | 48 | dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) |
49 | { | 49 | { |
50 | dec_nr_running(rq); | 50 | sub_nr_running(rq, 1); |
51 | } | 51 | } |
52 | 52 | ||
53 | static void yield_task_stop(struct rq *rq) | 53 | static void yield_task_stop(struct rq *rq) |
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 7d50f794e248..0ffa20ae657b 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
@@ -394,7 +394,7 @@ EXPORT_SYMBOL(__wake_up_bit); | |||
394 | * | 394 | * |
395 | * In order for this to function properly, as it uses waitqueue_active() | 395 | * In order for this to function properly, as it uses waitqueue_active() |
396 | * internally, some kind of memory barrier must be done prior to calling | 396 | * internally, some kind of memory barrier must be done prior to calling |
397 | * this. Typically, this will be smp_mb__after_clear_bit(), but in some | 397 | * this. Typically, this will be smp_mb__after_atomic(), but in some |
398 | * cases where bitflags are manipulated non-atomically under a lock, one | 398 | * cases where bitflags are manipulated non-atomically under a lock, one |
399 | * may need to use a less regular barrier, such fs/inode.c's smp_mb(), | 399 | * may need to use a less regular barrier, such fs/inode.c's smp_mb(), |
400 | * because spin_unlock() does not guarantee a memory barrier. | 400 | * because spin_unlock() does not guarantee a memory barrier. |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b35c21503a36..301bbc24739c 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -39,7 +39,7 @@ | |||
39 | * is only needed for handling filters shared across tasks. | 39 | * is only needed for handling filters shared across tasks. |
40 | * @prev: points to a previously installed, or inherited, filter | 40 | * @prev: points to a previously installed, or inherited, filter |
41 | * @len: the number of instructions in the program | 41 | * @len: the number of instructions in the program |
42 | * @insns: the BPF program instructions to evaluate | 42 | * @insnsi: the BPF program instructions to evaluate |
43 | * | 43 | * |
44 | * seccomp_filter objects are organized in a tree linked via the @prev | 44 | * seccomp_filter objects are organized in a tree linked via the @prev |
45 | * pointer. For any task, it appears to be a singly-linked list starting | 45 | * pointer. For any task, it appears to be a singly-linked list starting |
@@ -54,8 +54,7 @@ | |||
54 | struct seccomp_filter { | 54 | struct seccomp_filter { |
55 | atomic_t usage; | 55 | atomic_t usage; |
56 | struct seccomp_filter *prev; | 56 | struct seccomp_filter *prev; |
57 | unsigned short len; /* Instruction count */ | 57 | struct sk_filter *prog; |
58 | struct sock_filter_int insnsi[]; | ||
59 | }; | 58 | }; |
60 | 59 | ||
61 | /* Limit any path through the tree to 256KB worth of instructions. */ | 60 | /* Limit any path through the tree to 256KB worth of instructions. */ |
@@ -104,60 +103,59 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) | |||
104 | u32 k = ftest->k; | 103 | u32 k = ftest->k; |
105 | 104 | ||
106 | switch (code) { | 105 | switch (code) { |
107 | case BPF_S_LD_W_ABS: | 106 | case BPF_LD | BPF_W | BPF_ABS: |
108 | ftest->code = BPF_LDX | BPF_W | BPF_ABS; | 107 | ftest->code = BPF_LDX | BPF_W | BPF_ABS; |
109 | /* 32-bit aligned and not out of bounds. */ | 108 | /* 32-bit aligned and not out of bounds. */ |
110 | if (k >= sizeof(struct seccomp_data) || k & 3) | 109 | if (k >= sizeof(struct seccomp_data) || k & 3) |
111 | return -EINVAL; | 110 | return -EINVAL; |
112 | continue; | 111 | continue; |
113 | case BPF_S_LD_W_LEN: | 112 | case BPF_LD | BPF_W | BPF_LEN: |
114 | ftest->code = BPF_LD | BPF_IMM; | 113 | ftest->code = BPF_LD | BPF_IMM; |
115 | ftest->k = sizeof(struct seccomp_data); | 114 | ftest->k = sizeof(struct seccomp_data); |
116 | continue; | 115 | continue; |
117 | case BPF_S_LDX_W_LEN: | 116 | case BPF_LDX | BPF_W | BPF_LEN: |
118 | ftest->code = BPF_LDX | BPF_IMM; | 117 | ftest->code = BPF_LDX | BPF_IMM; |
119 | ftest->k = sizeof(struct seccomp_data); | 118 | ftest->k = sizeof(struct seccomp_data); |
120 | continue; | 119 | continue; |
121 | /* Explicitly include allowed calls. */ | 120 | /* Explicitly include allowed calls. */ |
122 | case BPF_S_RET_K: | 121 | case BPF_RET | BPF_K: |
123 | case BPF_S_RET_A: | 122 | case BPF_RET | BPF_A: |
124 | case BPF_S_ALU_ADD_K: | 123 | case BPF_ALU | BPF_ADD | BPF_K: |
125 | case BPF_S_ALU_ADD_X: | 124 | case BPF_ALU | BPF_ADD | BPF_X: |
126 | case BPF_S_ALU_SUB_K: | 125 | case BPF_ALU | BPF_SUB | BPF_K: |
127 | case BPF_S_ALU_SUB_X: | 126 | case BPF_ALU | BPF_SUB | BPF_X: |
128 | case BPF_S_ALU_MUL_K: | 127 | case BPF_ALU | BPF_MUL | BPF_K: |
129 | case BPF_S_ALU_MUL_X: | 128 | case BPF_ALU | BPF_MUL | BPF_X: |
130 | case BPF_S_ALU_DIV_X: | 129 | case BPF_ALU | BPF_DIV | BPF_K: |
131 | case BPF_S_ALU_AND_K: | 130 | case BPF_ALU | BPF_DIV | BPF_X: |
132 | case BPF_S_ALU_AND_X: | 131 | case BPF_ALU | BPF_AND | BPF_K: |
133 | case BPF_S_ALU_OR_K: | 132 | case BPF_ALU | BPF_AND | BPF_X: |
134 | case BPF_S_ALU_OR_X: | 133 | case BPF_ALU | BPF_OR | BPF_K: |
135 | case BPF_S_ALU_XOR_K: | 134 | case BPF_ALU | BPF_OR | BPF_X: |
136 | case BPF_S_ALU_XOR_X: | 135 | case BPF_ALU | BPF_XOR | BPF_K: |
137 | case BPF_S_ALU_LSH_K: | 136 | case BPF_ALU | BPF_XOR | BPF_X: |
138 | case BPF_S_ALU_LSH_X: | 137 | case BPF_ALU | BPF_LSH | BPF_K: |
139 | case BPF_S_ALU_RSH_K: | 138 | case BPF_ALU | BPF_LSH | BPF_X: |
140 | case BPF_S_ALU_RSH_X: | 139 | case BPF_ALU | BPF_RSH | BPF_K: |
141 | case BPF_S_ALU_NEG: | 140 | case BPF_ALU | BPF_RSH | BPF_X: |
142 | case BPF_S_LD_IMM: | 141 | case BPF_ALU | BPF_NEG: |
143 | case BPF_S_LDX_IMM: | 142 | case BPF_LD | BPF_IMM: |
144 | case BPF_S_MISC_TAX: | 143 | case BPF_LDX | BPF_IMM: |
145 | case BPF_S_MISC_TXA: | 144 | case BPF_MISC | BPF_TAX: |
146 | case BPF_S_ALU_DIV_K: | 145 | case BPF_MISC | BPF_TXA: |
147 | case BPF_S_LD_MEM: | 146 | case BPF_LD | BPF_MEM: |
148 | case BPF_S_LDX_MEM: | 147 | case BPF_LDX | BPF_MEM: |
149 | case BPF_S_ST: | 148 | case BPF_ST: |
150 | case BPF_S_STX: | 149 | case BPF_STX: |
151 | case BPF_S_JMP_JA: | 150 | case BPF_JMP | BPF_JA: |
152 | case BPF_S_JMP_JEQ_K: | 151 | case BPF_JMP | BPF_JEQ | BPF_K: |
153 | case BPF_S_JMP_JEQ_X: | 152 | case BPF_JMP | BPF_JEQ | BPF_X: |
154 | case BPF_S_JMP_JGE_K: | 153 | case BPF_JMP | BPF_JGE | BPF_K: |
155 | case BPF_S_JMP_JGE_X: | 154 | case BPF_JMP | BPF_JGE | BPF_X: |
156 | case BPF_S_JMP_JGT_K: | 155 | case BPF_JMP | BPF_JGT | BPF_K: |
157 | case BPF_S_JMP_JGT_X: | 156 | case BPF_JMP | BPF_JGT | BPF_X: |
158 | case BPF_S_JMP_JSET_K: | 157 | case BPF_JMP | BPF_JSET | BPF_K: |
159 | case BPF_S_JMP_JSET_X: | 158 | case BPF_JMP | BPF_JSET | BPF_X: |
160 | sk_decode_filter(ftest, ftest); | ||
161 | continue; | 159 | continue; |
162 | default: | 160 | default: |
163 | return -EINVAL; | 161 | return -EINVAL; |
@@ -189,7 +187,8 @@ static u32 seccomp_run_filters(int syscall) | |||
189 | * value always takes priority (ignoring the DATA). | 187 | * value always takes priority (ignoring the DATA). |
190 | */ | 188 | */ |
191 | for (f = current->seccomp.filter; f; f = f->prev) { | 189 | for (f = current->seccomp.filter; f; f = f->prev) { |
192 | u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi); | 190 | u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd); |
191 | |||
193 | if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) | 192 | if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) |
194 | ret = cur_ret; | 193 | ret = cur_ret; |
195 | } | 194 | } |
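seccomp_run_filters() keeps the lowest action value seen across the whole filter chain, since lower SECCOMP_RET_ACTION values are the more restrictive verdicts. A self-contained sketch of that reduction over a linked list; the action constants below are illustrative, not the real SECCOMP_RET_* encoding:

#include <stdio.h>
#include <stddef.h>

/* Illustrative action values: lower value == more restrictive verdict. */
enum { ACT_KILL = 0, ACT_ERRNO = 1, ACT_TRACE = 2, ACT_ALLOW = 3 };

struct filter {
	int verdict;         /* what this filter returned */
	struct filter *prev; /* older filter in the chain */
};

/* Walk the whole chain and keep the most restrictive (lowest) verdict. */
static int run_filters(const struct filter *newest)
{
	int ret = ACT_ALLOW;
	const struct filter *f;

	for (f = newest; f; f = f->prev)
		if (f->verdict < ret)
			ret = f->verdict;

	return ret;
}

int main(void)
{
	struct filter oldest = { ACT_ALLOW, NULL };
	struct filter middle = { ACT_ERRNO, &oldest };
	struct filter newest = { ACT_ALLOW, &middle };

	printf("verdict: %d\n", run_filters(&newest)); /* ACT_ERRNO wins */
	return 0;
}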
@@ -215,12 +214,12 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) | |||
215 | return -EINVAL; | 214 | return -EINVAL; |
216 | 215 | ||
217 | for (filter = current->seccomp.filter; filter; filter = filter->prev) | 216 | for (filter = current->seccomp.filter; filter; filter = filter->prev) |
218 | total_insns += filter->len + 4; /* include a 4 instr penalty */ | 217 | total_insns += filter->prog->len + 4; /* include a 4 instr penalty */ |
219 | if (total_insns > MAX_INSNS_PER_PATH) | 218 | if (total_insns > MAX_INSNS_PER_PATH) |
220 | return -ENOMEM; | 219 | return -ENOMEM; |
221 | 220 | ||
222 | /* | 221 | /* |
223 | * Installing a seccomp filter requires that the task have | 222 | * Installing a seccomp filter requires that the task has |
224 | * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. | 223 | * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. |
225 | * This avoids scenarios where unprivileged tasks can affect the | 224 | * This avoids scenarios where unprivileged tasks can affect the |
226 | * behavior of privileged children. | 225 | * behavior of privileged children. |
@@ -256,19 +255,25 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) | |||
256 | 255 | ||
257 | /* Allocate a new seccomp_filter */ | 256 | /* Allocate a new seccomp_filter */ |
258 | ret = -ENOMEM; | 257 | ret = -ENOMEM; |
259 | filter = kzalloc(sizeof(struct seccomp_filter) + | 258 | filter = kzalloc(sizeof(struct seccomp_filter), |
260 | sizeof(struct sock_filter_int) * new_len, | ||
261 | GFP_KERNEL|__GFP_NOWARN); | 259 | GFP_KERNEL|__GFP_NOWARN); |
262 | if (!filter) | 260 | if (!filter) |
263 | goto free_prog; | 261 | goto free_prog; |
264 | 262 | ||
265 | ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len); | 263 | filter->prog = kzalloc(sk_filter_size(new_len), |
266 | if (ret) | 264 | GFP_KERNEL|__GFP_NOWARN); |
265 | if (!filter->prog) | ||
267 | goto free_filter; | 266 | goto free_filter; |
267 | |||
268 | ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); | ||
269 | if (ret) | ||
270 | goto free_filter_prog; | ||
268 | kfree(fp); | 271 | kfree(fp); |
269 | 272 | ||
270 | atomic_set(&filter->usage, 1); | 273 | atomic_set(&filter->usage, 1); |
271 | filter->len = new_len; | 274 | filter->prog->len = new_len; |
275 | |||
276 | sk_filter_select_runtime(filter->prog); | ||
272 | 277 | ||
273 | /* | 278 | /* |
274 | * If there is an existing filter, make it the prev and don't drop its | 279 | * If there is an existing filter, make it the prev and don't drop its |
@@ -278,6 +283,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) | |||
278 | current->seccomp.filter = filter; | 283 | current->seccomp.filter = filter; |
279 | return 0; | 284 | return 0; |
280 | 285 | ||
286 | free_filter_prog: | ||
287 | kfree(filter->prog); | ||
281 | free_filter: | 288 | free_filter: |
282 | kfree(filter); | 289 | kfree(filter); |
283 | free_prog: | 290 | free_prog: |
@@ -330,6 +337,7 @@ void put_seccomp_filter(struct task_struct *tsk) | |||
330 | while (orig && atomic_dec_and_test(&orig->usage)) { | 337 | while (orig && atomic_dec_and_test(&orig->usage)) { |
331 | struct seccomp_filter *freeme = orig; | 338 | struct seccomp_filter *freeme = orig; |
332 | orig = orig->prev; | 339 | orig = orig->prev; |
340 | sk_filter_free(freeme->prog); | ||
333 | kfree(freeme); | 341 | kfree(freeme); |
334 | } | 342 | } |
335 | } | 343 | } |
diff --git a/kernel/signal.c b/kernel/signal.c index 6ea13c09ae56..a4077e90f19f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -277,6 +277,7 @@ void task_clear_jobctl_trapping(struct task_struct *task) | |||
277 | { | 277 | { |
278 | if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { | 278 | if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { |
279 | task->jobctl &= ~JOBCTL_TRAPPING; | 279 | task->jobctl &= ~JOBCTL_TRAPPING; |
280 | smp_mb(); /* advised by wake_up_bit() */ | ||
280 | wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); | 281 | wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); |
281 | } | 282 | } |
282 | } | 283 | } |
@@ -705,11 +706,8 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state) | |||
705 | * Returns 1 if any signals were found. | 706 | * Returns 1 if any signals were found. |
706 | * | 707 | * |
707 | * All callers must be holding the siglock. | 708 | * All callers must be holding the siglock. |
708 | * | ||
709 | * This version takes a sigset mask and looks at all signals, | ||
710 | * not just those in the first mask word. | ||
711 | */ | 709 | */ |
712 | static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) | 710 | static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) |
713 | { | 711 | { |
714 | struct sigqueue *q, *n; | 712 | struct sigqueue *q, *n; |
715 | sigset_t m; | 713 | sigset_t m; |
@@ -727,29 +725,6 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) | |||
727 | } | 725 | } |
728 | return 1; | 726 | return 1; |
729 | } | 727 | } |
730 | /* | ||
731 | * Remove signals in mask from the pending set and queue. | ||
732 | * Returns 1 if any signals were found. | ||
733 | * | ||
734 | * All callers must be holding the siglock. | ||
735 | */ | ||
736 | static int rm_from_queue(unsigned long mask, struct sigpending *s) | ||
737 | { | ||
738 | struct sigqueue *q, *n; | ||
739 | |||
740 | if (!sigtestsetmask(&s->signal, mask)) | ||
741 | return 0; | ||
742 | |||
743 | sigdelsetmask(&s->signal, mask); | ||
744 | list_for_each_entry_safe(q, n, &s->list, list) { | ||
745 | if (q->info.si_signo < SIGRTMIN && | ||
746 | (mask & sigmask(q->info.si_signo))) { | ||
747 | list_del_init(&q->list); | ||
748 | __sigqueue_free(q); | ||
749 | } | ||
750 | } | ||
751 | return 1; | ||
752 | } | ||
753 | 728 | ||
754 | static inline int is_si_special(const struct siginfo *info) | 729 | static inline int is_si_special(const struct siginfo *info) |
755 | { | 730 | { |
@@ -861,6 +836,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) | |||
861 | { | 836 | { |
862 | struct signal_struct *signal = p->signal; | 837 | struct signal_struct *signal = p->signal; |
863 | struct task_struct *t; | 838 | struct task_struct *t; |
839 | sigset_t flush; | ||
864 | 840 | ||
865 | if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { | 841 | if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { |
866 | if (signal->flags & SIGNAL_GROUP_COREDUMP) | 842 | if (signal->flags & SIGNAL_GROUP_COREDUMP) |
@@ -872,26 +848,25 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) | |||
872 | /* | 848 | /* |
873 | * This is a stop signal. Remove SIGCONT from all queues. | 849 | * This is a stop signal. Remove SIGCONT from all queues. |
874 | */ | 850 | */ |
875 | rm_from_queue(sigmask(SIGCONT), &signal->shared_pending); | 851 | siginitset(&flush, sigmask(SIGCONT)); |
876 | t = p; | 852 | flush_sigqueue_mask(&flush, &signal->shared_pending); |
877 | do { | 853 | for_each_thread(p, t) |
878 | rm_from_queue(sigmask(SIGCONT), &t->pending); | 854 | flush_sigqueue_mask(&flush, &t->pending); |
879 | } while_each_thread(p, t); | ||
880 | } else if (sig == SIGCONT) { | 855 | } else if (sig == SIGCONT) { |
881 | unsigned int why; | 856 | unsigned int why; |
882 | /* | 857 | /* |
883 | * Remove all stop signals from all queues, wake all threads. | 858 | * Remove all stop signals from all queues, wake all threads. |
884 | */ | 859 | */ |
885 | rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); | 860 | siginitset(&flush, SIG_KERNEL_STOP_MASK); |
886 | t = p; | 861 | flush_sigqueue_mask(&flush, &signal->shared_pending); |
887 | do { | 862 | for_each_thread(p, t) { |
863 | flush_sigqueue_mask(&flush, &t->pending); | ||
888 | task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); | 864 | task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); |
889 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); | ||
890 | if (likely(!(t->ptrace & PT_SEIZED))) | 865 | if (likely(!(t->ptrace & PT_SEIZED))) |
891 | wake_up_state(t, __TASK_STOPPED); | 866 | wake_up_state(t, __TASK_STOPPED); |
892 | else | 867 | else |
893 | ptrace_trap_notify(t); | 868 | ptrace_trap_notify(t); |
894 | } while_each_thread(p, t); | 869 | } |
895 | 870 | ||
896 | /* | 871 | /* |
897 | * Notify the parent with CLD_CONTINUED if we were stopped. | 872 | * Notify the parent with CLD_CONTINUED if we were stopped. |
@@ -2854,7 +2829,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, | |||
2854 | 2829 | ||
2855 | spin_lock_irq(&tsk->sighand->siglock); | 2830 | spin_lock_irq(&tsk->sighand->siglock); |
2856 | __set_task_blocked(tsk, &tsk->real_blocked); | 2831 | __set_task_blocked(tsk, &tsk->real_blocked); |
2857 | siginitset(&tsk->real_blocked, 0); | 2832 | sigemptyset(&tsk->real_blocked); |
2858 | sig = dequeue_signal(tsk, &mask, info); | 2833 | sig = dequeue_signal(tsk, &mask, info); |
2859 | } | 2834 | } |
2860 | spin_unlock_irq(&tsk->sighand->siglock); | 2835 | spin_unlock_irq(&tsk->sighand->siglock); |
@@ -3091,18 +3066,39 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, | |||
3091 | } | 3066 | } |
3092 | #endif | 3067 | #endif |
3093 | 3068 | ||
3069 | /* | ||
3070 | * For kthreads only, must not be used if cloned with CLONE_SIGHAND | ||
3071 | */ | ||
3072 | void kernel_sigaction(int sig, __sighandler_t action) | ||
3073 | { | ||
3074 | spin_lock_irq(¤t->sighand->siglock); | ||
3075 | current->sighand->action[sig - 1].sa.sa_handler = action; | ||
3076 | if (action == SIG_IGN) { | ||
3077 | sigset_t mask; | ||
3078 | |||
3079 | sigemptyset(&mask); | ||
3080 | sigaddset(&mask, sig); | ||
3081 | |||
3082 | flush_sigqueue_mask(&mask, ¤t->signal->shared_pending); | ||
3083 | flush_sigqueue_mask(&mask, ¤t->pending); | ||
3084 | recalc_sigpending(); | ||
3085 | } | ||
3086 | spin_unlock_irq(¤t->sighand->siglock); | ||
3087 | } | ||
3088 | EXPORT_SYMBOL(kernel_sigaction); | ||
3089 | |||
3094 | int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | 3090 | int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) |
3095 | { | 3091 | { |
3096 | struct task_struct *t = current; | 3092 | struct task_struct *p = current, *t; |
3097 | struct k_sigaction *k; | 3093 | struct k_sigaction *k; |
3098 | sigset_t mask; | 3094 | sigset_t mask; |
3099 | 3095 | ||
3100 | if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) | 3096 | if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) |
3101 | return -EINVAL; | 3097 | return -EINVAL; |
3102 | 3098 | ||
3103 | k = &t->sighand->action[sig-1]; | 3099 | k = &p->sighand->action[sig-1]; |
3104 | 3100 | ||
3105 | spin_lock_irq(¤t->sighand->siglock); | 3101 | spin_lock_irq(&p->sighand->siglock); |
3106 | if (oact) | 3102 | if (oact) |
3107 | *oact = *k; | 3103 | *oact = *k; |
3108 | 3104 | ||
@@ -3121,21 +3117,20 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | |||
3121 | * (for example, SIGCHLD), shall cause the pending signal to | 3117 | * (for example, SIGCHLD), shall cause the pending signal to |
3122 | * be discarded, whether or not it is blocked" | 3118 | * be discarded, whether or not it is blocked" |
3123 | */ | 3119 | */ |
3124 | if (sig_handler_ignored(sig_handler(t, sig), sig)) { | 3120 | if (sig_handler_ignored(sig_handler(p, sig), sig)) { |
3125 | sigemptyset(&mask); | 3121 | sigemptyset(&mask); |
3126 | sigaddset(&mask, sig); | 3122 | sigaddset(&mask, sig); |
3127 | rm_from_queue_full(&mask, &t->signal->shared_pending); | 3123 | flush_sigqueue_mask(&mask, &p->signal->shared_pending); |
3128 | do { | 3124 | for_each_thread(p, t) |
3129 | rm_from_queue_full(&mask, &t->pending); | 3125 | flush_sigqueue_mask(&mask, &t->pending); |
3130 | } while_each_thread(current, t); | ||
3131 | } | 3126 | } |
3132 | } | 3127 | } |
3133 | 3128 | ||
3134 | spin_unlock_irq(¤t->sighand->siglock); | 3129 | spin_unlock_irq(&p->sighand->siglock); |
3135 | return 0; | 3130 | return 0; |
3136 | } | 3131 | } |
3137 | 3132 | ||
3138 | static int | 3133 | static int |
3139 | do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) | 3134 | do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) |
3140 | { | 3135 | { |
3141 | stack_t oss; | 3136 | stack_t oss; |
@@ -3496,7 +3491,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, | |||
3496 | } | 3491 | } |
3497 | #endif | 3492 | #endif |
3498 | 3493 | ||
3499 | #ifdef __ARCH_WANT_SYS_SGETMASK | 3494 | #ifdef CONFIG_SGETMASK_SYSCALL |
3500 | 3495 | ||
3501 | /* | 3496 | /* |
3502 | * For backwards compatibility. Functionality superseded by sigprocmask. | 3497 | * For backwards compatibility. Functionality superseded by sigprocmask. |
@@ -3517,7 +3512,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) | |||
3517 | 3512 | ||
3518 | return old; | 3513 | return old; |
3519 | } | 3514 | } |
3520 | #endif /* __ARCH_WANT_SGETMASK */ | 3515 | #endif /* CONFIG_SGETMASK_SYSCALL */ |
3521 | 3516 | ||
3522 | #ifdef __ARCH_WANT_SYS_SIGNAL | 3517 | #ifdef __ARCH_WANT_SYS_SIGNAL |
3523 | /* | 3518 | /* |
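
A minimal usage sketch for the kernel_sigaction() helper added in the kernel/signal.c hunk above: a kernel thread installing SIG_IGN so the signal is flushed and never left pending. The thread body, the SIGHUP choice, and the assumption that the declaration is reachable through linux/signal.h are illustrative only; as the comment in the hunk says, the helper is for kthreads that do not share their sighand (no CLONE_SIGHAND).

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/signal.h>

/* Hypothetical kthread body: ignore SIGHUP up front so any instance already
 * queued is flushed and later ones are discarded at send time. */
static int example_kthread(void *unused)
{
	kernel_sigaction(SIGHUP, SIG_IGN);

	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}
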
diff --git a/kernel/smp.c b/kernel/smp.c index 06d574e42c72..306f8180b0d5 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -185,14 +185,26 @@ void generic_smp_call_function_single_interrupt(void) | |||
185 | { | 185 | { |
186 | struct llist_node *entry; | 186 | struct llist_node *entry; |
187 | struct call_single_data *csd, *csd_next; | 187 | struct call_single_data *csd, *csd_next; |
188 | static bool warned; | ||
189 | |||
190 | entry = llist_del_all(&__get_cpu_var(call_single_queue)); | ||
191 | entry = llist_reverse_order(entry); | ||
188 | 192 | ||
189 | /* | 193 | /* |
190 | * Shouldn't receive this interrupt on a cpu that is not yet online. | 194 | * Shouldn't receive this interrupt on a cpu that is not yet online. |
191 | */ | 195 | */ |
192 | WARN_ON_ONCE(!cpu_online(smp_processor_id())); | 196 | if (unlikely(!cpu_online(smp_processor_id()) && !warned)) { |
197 | warned = true; | ||
198 | WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); | ||
193 | 199 | ||
194 | entry = llist_del_all(&__get_cpu_var(call_single_queue)); | 200 | /* |
195 | entry = llist_reverse_order(entry); | 201 | * We don't have to use the _safe() variant here |
202 | * because we are not invoking the IPI handlers yet. | ||
203 | */ | ||
204 | llist_for_each_entry(csd, entry, llist) | ||
205 | pr_warn("IPI callback %pS sent to offline CPU\n", | ||
206 | csd->func); | ||
207 | } | ||
196 | 208 | ||
197 | llist_for_each_entry_safe(csd, csd_next, entry, llist) { | 209 | llist_for_each_entry_safe(csd, csd_next, entry, llist) { |
198 | csd->func(csd->info); | 210 | csd->func(csd->info); |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 33e4648ae0e7..5918d227730f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -223,7 +223,7 @@ static inline bool lockdep_softirq_start(void) { return false; } | |||
223 | static inline void lockdep_softirq_end(bool in_hardirq) { } | 223 | static inline void lockdep_softirq_end(bool in_hardirq) { } |
224 | #endif | 224 | #endif |
225 | 225 | ||
226 | asmlinkage void __do_softirq(void) | 226 | asmlinkage __visible void __do_softirq(void) |
227 | { | 227 | { |
228 | unsigned long end = jiffies + MAX_SOFTIRQ_TIME; | 228 | unsigned long end = jiffies + MAX_SOFTIRQ_TIME; |
229 | unsigned long old_flags = current->flags; | 229 | unsigned long old_flags = current->flags; |
@@ -232,7 +232,6 @@ asmlinkage void __do_softirq(void) | |||
232 | bool in_hardirq; | 232 | bool in_hardirq; |
233 | __u32 pending; | 233 | __u32 pending; |
234 | int softirq_bit; | 234 | int softirq_bit; |
235 | int cpu; | ||
236 | 235 | ||
237 | /* | 236 | /* |
238 | * Mask out PF_MEMALLOC as current task context is borrowed for the | 237 | * Mask out PF_MEMALLOC as current task context is borrowed for the |
@@ -247,7 +246,6 @@ asmlinkage void __do_softirq(void) | |||
247 | __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); | 246 | __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); |
248 | in_hardirq = lockdep_softirq_start(); | 247 | in_hardirq = lockdep_softirq_start(); |
249 | 248 | ||
250 | cpu = smp_processor_id(); | ||
251 | restart: | 249 | restart: |
252 | /* Reset the pending bitmask before enabling irqs */ | 250 | /* Reset the pending bitmask before enabling irqs */ |
253 | set_softirq_pending(0); | 251 | set_softirq_pending(0); |
@@ -276,11 +274,11 @@ restart: | |||
276 | prev_count, preempt_count()); | 274 | prev_count, preempt_count()); |
277 | preempt_count_set(prev_count); | 275 | preempt_count_set(prev_count); |
278 | } | 276 | } |
279 | rcu_bh_qs(cpu); | ||
280 | h++; | 277 | h++; |
281 | pending >>= softirq_bit; | 278 | pending >>= softirq_bit; |
282 | } | 279 | } |
283 | 280 | ||
281 | rcu_bh_qs(smp_processor_id()); | ||
284 | local_irq_disable(); | 282 | local_irq_disable(); |
285 | 283 | ||
286 | pending = local_softirq_pending(); | 284 | pending = local_softirq_pending(); |
@@ -299,7 +297,7 @@ restart: | |||
299 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); | 297 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); |
300 | } | 298 | } |
301 | 299 | ||
302 | asmlinkage void do_softirq(void) | 300 | asmlinkage __visible void do_softirq(void) |
303 | { | 301 | { |
304 | __u32 pending; | 302 | __u32 pending; |
305 | unsigned long flags; | 303 | unsigned long flags; |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 01fbae5b97b7..695f0c6cd169 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -307,6 +307,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * | |||
307 | * @cpu: cpu to stop | 307 | * @cpu: cpu to stop |
308 | * @fn: function to execute | 308 | * @fn: function to execute |
309 | * @arg: argument to @fn | 309 | * @arg: argument to @fn |
310 | * @work_buf: pointer to cpu_stop_work structure | ||
310 | * | 311 | * |
311 | * Similar to stop_one_cpu() but doesn't wait for completion. The | 312 | * Similar to stop_one_cpu() but doesn't wait for completion. The |
312 | * caller is responsible for ensuring @work_buf is currently unused | 313 | * caller is responsible for ensuring @work_buf is currently unused |
diff --git a/kernel/sys.c b/kernel/sys.c index fba0f29401ea..66a751ebf9d9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
250 | else | 250 | else |
251 | p = current; | 251 | p = current; |
252 | if (p) { | 252 | if (p) { |
253 | niceval = 20 - task_nice(p); | 253 | niceval = nice_to_rlimit(task_nice(p)); |
254 | if (niceval > retval) | 254 | if (niceval > retval) |
255 | retval = niceval; | 255 | retval = niceval; |
256 | } | 256 | } |
@@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
261 | else | 261 | else |
262 | pgrp = task_pgrp(current); | 262 | pgrp = task_pgrp(current); |
263 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { | 263 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { |
264 | niceval = 20 - task_nice(p); | 264 | niceval = nice_to_rlimit(task_nice(p)); |
265 | if (niceval > retval) | 265 | if (niceval > retval) |
266 | retval = niceval; | 266 | retval = niceval; |
267 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | 267 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); |
@@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
277 | 277 | ||
278 | do_each_thread(g, p) { | 278 | do_each_thread(g, p) { |
279 | if (uid_eq(task_uid(p), uid)) { | 279 | if (uid_eq(task_uid(p), uid)) { |
280 | niceval = 20 - task_nice(p); | 280 | niceval = nice_to_rlimit(task_nice(p)); |
281 | if (niceval > retval) | 281 | if (niceval > retval) |
282 | retval = niceval; | 282 | retval = niceval; |
283 | } | 283 | } |
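
The getpriority() hunks above swap the open-coded "20 - task_nice(p)" for nice_to_rlimit(). A standalone sketch of that mapping, assuming the usual kernel constant MAX_NICE = 19; the helper name matches the kernel's, while the demo program around it is invented for illustration.

#include <stdio.h>

#define MAX_NICE 19

/* rlimit-style nice runs 1..40: 40 for nice -20, 1 for nice 19.
 * MAX_NICE - nice + 1 is exactly the old "20 - nice". */
static long nice_to_rlimit(long nice)
{
	return MAX_NICE - nice + 1;
}

int main(void)
{
	long nice;

	for (nice = -20; nice <= 19; nice += 13)
		printf("nice %3ld -> rlimit %2ld\n", nice, nice_to_rlimit(nice));
	return 0;
}
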
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bc8d1b74a6b9..36441b51b5df 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -135,6 +135,8 @@ cond_syscall(sys_setresgid16); | |||
135 | cond_syscall(sys_setresuid16); | 135 | cond_syscall(sys_setresuid16); |
136 | cond_syscall(sys_setreuid16); | 136 | cond_syscall(sys_setreuid16); |
137 | cond_syscall(sys_setuid16); | 137 | cond_syscall(sys_setuid16); |
138 | cond_syscall(sys_sgetmask); | ||
139 | cond_syscall(sys_ssetmask); | ||
138 | cond_syscall(sys_vm86old); | 140 | cond_syscall(sys_vm86old); |
139 | cond_syscall(sys_vm86); | 141 | cond_syscall(sys_vm86); |
140 | cond_syscall(sys_ipc); | 142 | cond_syscall(sys_ipc); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe34..ba9ed453c4ed 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -173,6 +173,13 @@ extern int no_unaligned_warning; | |||
173 | #endif | 173 | #endif |
174 | 174 | ||
175 | #ifdef CONFIG_PROC_SYSCTL | 175 | #ifdef CONFIG_PROC_SYSCTL |
176 | |||
177 | #define SYSCTL_WRITES_LEGACY -1 | ||
178 | #define SYSCTL_WRITES_WARN 0 | ||
179 | #define SYSCTL_WRITES_STRICT 1 | ||
180 | |||
181 | static int sysctl_writes_strict = SYSCTL_WRITES_WARN; | ||
182 | |||
176 | static int proc_do_cad_pid(struct ctl_table *table, int write, | 183 | static int proc_do_cad_pid(struct ctl_table *table, int write, |
177 | void __user *buffer, size_t *lenp, loff_t *ppos); | 184 | void __user *buffer, size_t *lenp, loff_t *ppos); |
178 | static int proc_taint(struct ctl_table *table, int write, | 185 | static int proc_taint(struct ctl_table *table, int write, |
@@ -195,7 +202,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, | |||
195 | /* Note: sysrq code uses its own private copy */ | 202 | /* Note: sysrq code uses its own private copy */ |
196 | static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; | 203 | static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; |
197 | 204 | ||
198 | static int sysrq_sysctl_handler(ctl_table *table, int write, | 205 | static int sysrq_sysctl_handler(struct ctl_table *table, int write, |
199 | void __user *buffer, size_t *lenp, | 206 | void __user *buffer, size_t *lenp, |
200 | loff_t *ppos) | 207 | loff_t *ppos) |
201 | { | 208 | { |
@@ -495,6 +502,15 @@ static struct ctl_table kern_table[] = { | |||
495 | .mode = 0644, | 502 | .mode = 0644, |
496 | .proc_handler = proc_taint, | 503 | .proc_handler = proc_taint, |
497 | }, | 504 | }, |
505 | { | ||
506 | .procname = "sysctl_writes_strict", | ||
507 | .data = &sysctl_writes_strict, | ||
508 | .maxlen = sizeof(int), | ||
509 | .mode = 0644, | ||
510 | .proc_handler = proc_dointvec_minmax, | ||
511 | .extra1 = &neg_one, | ||
512 | .extra2 = &one, | ||
513 | }, | ||
498 | #endif | 514 | #endif |
499 | #ifdef CONFIG_LATENCYTOP | 515 | #ifdef CONFIG_LATENCYTOP |
500 | { | 516 | { |
@@ -643,7 +659,7 @@ static struct ctl_table kern_table[] = { | |||
643 | .extra2 = &one, | 659 | .extra2 = &one, |
644 | }, | 660 | }, |
645 | #endif | 661 | #endif |
646 | 662 | #ifdef CONFIG_UEVENT_HELPER | |
647 | { | 663 | { |
648 | .procname = "hotplug", | 664 | .procname = "hotplug", |
649 | .data = &uevent_helper, | 665 | .data = &uevent_helper, |
@@ -651,7 +667,7 @@ static struct ctl_table kern_table[] = { | |||
651 | .mode = 0644, | 667 | .mode = 0644, |
652 | .proc_handler = proc_dostring, | 668 | .proc_handler = proc_dostring, |
653 | }, | 669 | }, |
654 | 670 | #endif | |
655 | #ifdef CONFIG_CHR_DEV_SG | 671 | #ifdef CONFIG_CHR_DEV_SG |
656 | { | 672 | { |
657 | .procname = "sg-big-buff", | 673 | .procname = "sg-big-buff", |
@@ -1418,8 +1434,13 @@ static struct ctl_table vm_table[] = { | |||
1418 | (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) | 1434 | (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) |
1419 | { | 1435 | { |
1420 | .procname = "vdso_enabled", | 1436 | .procname = "vdso_enabled", |
1437 | #ifdef CONFIG_X86_32 | ||
1438 | .data = &vdso32_enabled, | ||
1439 | .maxlen = sizeof(vdso32_enabled), | ||
1440 | #else | ||
1421 | .data = &vdso_enabled, | 1441 | .data = &vdso_enabled, |
1422 | .maxlen = sizeof(vdso_enabled), | 1442 | .maxlen = sizeof(vdso_enabled), |
1443 | #endif | ||
1423 | .mode = 0644, | 1444 | .mode = 0644, |
1424 | .proc_handler = proc_dointvec, | 1445 | .proc_handler = proc_dointvec, |
1425 | .extra1 = &zero, | 1446 | .extra1 = &zero, |
@@ -1698,8 +1719,8 @@ int __init sysctl_init(void) | |||
1698 | 1719 | ||
1699 | #ifdef CONFIG_PROC_SYSCTL | 1720 | #ifdef CONFIG_PROC_SYSCTL |
1700 | 1721 | ||
1701 | static int _proc_do_string(void* data, int maxlen, int write, | 1722 | static int _proc_do_string(char *data, int maxlen, int write, |
1702 | void __user *buffer, | 1723 | char __user *buffer, |
1703 | size_t *lenp, loff_t *ppos) | 1724 | size_t *lenp, loff_t *ppos) |
1704 | { | 1725 | { |
1705 | size_t len; | 1726 | size_t len; |
@@ -1712,21 +1733,30 @@ static int _proc_do_string(void* data, int maxlen, int write, | |||
1712 | } | 1733 | } |
1713 | 1734 | ||
1714 | if (write) { | 1735 | if (write) { |
1715 | len = 0; | 1736 | if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) { |
1737 | /* Only continue writes not past the end of buffer. */ | ||
1738 | len = strlen(data); | ||
1739 | if (len > maxlen - 1) | ||
1740 | len = maxlen - 1; | ||
1741 | |||
1742 | if (*ppos > len) | ||
1743 | return 0; | ||
1744 | len = *ppos; | ||
1745 | } else { | ||
1746 | /* Start writing from beginning of buffer. */ | ||
1747 | len = 0; | ||
1748 | } | ||
1749 | |||
1750 | *ppos += *lenp; | ||
1716 | p = buffer; | 1751 | p = buffer; |
1717 | while (len < *lenp) { | 1752 | while ((p - buffer) < *lenp && len < maxlen - 1) { |
1718 | if (get_user(c, p++)) | 1753 | if (get_user(c, p++)) |
1719 | return -EFAULT; | 1754 | return -EFAULT; |
1720 | if (c == 0 || c == '\n') | 1755 | if (c == 0 || c == '\n') |
1721 | break; | 1756 | break; |
1722 | len++; | 1757 | data[len++] = c; |
1723 | } | 1758 | } |
1724 | if (len >= maxlen) | 1759 | data[len] = 0; |
1725 | len = maxlen-1; | ||
1726 | if(copy_from_user(data, buffer, len)) | ||
1727 | return -EFAULT; | ||
1728 | ((char *) data)[len] = 0; | ||
1729 | *ppos += *lenp; | ||
1730 | } else { | 1760 | } else { |
1731 | len = strlen(data); | 1761 | len = strlen(data); |
1732 | if (len > maxlen) | 1762 | if (len > maxlen) |
@@ -1743,10 +1773,10 @@ static int _proc_do_string(void* data, int maxlen, int write, | |||
1743 | if (len > *lenp) | 1773 | if (len > *lenp) |
1744 | len = *lenp; | 1774 | len = *lenp; |
1745 | if (len) | 1775 | if (len) |
1746 | if(copy_to_user(buffer, data, len)) | 1776 | if (copy_to_user(buffer, data, len)) |
1747 | return -EFAULT; | 1777 | return -EFAULT; |
1748 | if (len < *lenp) { | 1778 | if (len < *lenp) { |
1749 | if(put_user('\n', ((char __user *) buffer) + len)) | 1779 | if (put_user('\n', buffer + len)) |
1750 | return -EFAULT; | 1780 | return -EFAULT; |
1751 | len++; | 1781 | len++; |
1752 | } | 1782 | } |
@@ -1756,6 +1786,14 @@ static int _proc_do_string(void* data, int maxlen, int write, | |||
1756 | return 0; | 1786 | return 0; |
1757 | } | 1787 | } |
1758 | 1788 | ||
1789 | static void warn_sysctl_write(struct ctl_table *table) | ||
1790 | { | ||
1791 | pr_warn_once("%s wrote to %s when file position was not 0!\n" | ||
1792 | "This will not be supported in the future. To silence this\n" | ||
1793 | "warning, set kernel.sysctl_writes_strict = -1\n", | ||
1794 | current->comm, table->procname); | ||
1795 | } | ||
1796 | |||
1759 | /** | 1797 | /** |
1760 | * proc_dostring - read a string sysctl | 1798 | * proc_dostring - read a string sysctl |
1761 | * @table: the sysctl table | 1799 | * @table: the sysctl table |
@@ -1776,8 +1814,11 @@ static int _proc_do_string(void* data, int maxlen, int write, | |||
1776 | int proc_dostring(struct ctl_table *table, int write, | 1814 | int proc_dostring(struct ctl_table *table, int write, |
1777 | void __user *buffer, size_t *lenp, loff_t *ppos) | 1815 | void __user *buffer, size_t *lenp, loff_t *ppos) |
1778 | { | 1816 | { |
1779 | return _proc_do_string(table->data, table->maxlen, write, | 1817 | if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) |
1780 | buffer, lenp, ppos); | 1818 | warn_sysctl_write(table); |
1819 | |||
1820 | return _proc_do_string((char *)(table->data), table->maxlen, write, | ||
1821 | (char __user *)buffer, lenp, ppos); | ||
1781 | } | 1822 | } |
1782 | 1823 | ||
1783 | static size_t proc_skip_spaces(char **buf) | 1824 | static size_t proc_skip_spaces(char **buf) |
@@ -1951,6 +1992,18 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, | |||
1951 | conv = do_proc_dointvec_conv; | 1992 | conv = do_proc_dointvec_conv; |
1952 | 1993 | ||
1953 | if (write) { | 1994 | if (write) { |
1995 | if (*ppos) { | ||
1996 | switch (sysctl_writes_strict) { | ||
1997 | case SYSCTL_WRITES_STRICT: | ||
1998 | goto out; | ||
1999 | case SYSCTL_WRITES_WARN: | ||
2000 | warn_sysctl_write(table); | ||
2001 | break; | ||
2002 | default: | ||
2003 | break; | ||
2004 | } | ||
2005 | } | ||
2006 | |||
1954 | if (left > PAGE_SIZE - 1) | 2007 | if (left > PAGE_SIZE - 1) |
1955 | left = PAGE_SIZE - 1; | 2008 | left = PAGE_SIZE - 1; |
1956 | page = __get_free_page(GFP_TEMPORARY); | 2009 | page = __get_free_page(GFP_TEMPORARY); |
@@ -2008,6 +2061,7 @@ free: | |||
2008 | return err ? : -EINVAL; | 2061 | return err ? : -EINVAL; |
2009 | } | 2062 | } |
2010 | *lenp -= left; | 2063 | *lenp -= left; |
2064 | out: | ||
2011 | *ppos += *lenp; | 2065 | *ppos += *lenp; |
2012 | return err; | 2066 | return err; |
2013 | } | 2067 | } |
@@ -2200,6 +2254,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int | |||
2200 | left = *lenp; | 2254 | left = *lenp; |
2201 | 2255 | ||
2202 | if (write) { | 2256 | if (write) { |
2257 | if (*ppos) { | ||
2258 | switch (sysctl_writes_strict) { | ||
2259 | case SYSCTL_WRITES_STRICT: | ||
2260 | goto out; | ||
2261 | case SYSCTL_WRITES_WARN: | ||
2262 | warn_sysctl_write(table); | ||
2263 | break; | ||
2264 | default: | ||
2265 | break; | ||
2266 | } | ||
2267 | } | ||
2268 | |||
2203 | if (left > PAGE_SIZE - 1) | 2269 | if (left > PAGE_SIZE - 1) |
2204 | left = PAGE_SIZE - 1; | 2270 | left = PAGE_SIZE - 1; |
2205 | page = __get_free_page(GFP_TEMPORARY); | 2271 | page = __get_free_page(GFP_TEMPORARY); |
@@ -2255,6 +2321,7 @@ free: | |||
2255 | return err ? : -EINVAL; | 2321 | return err ? : -EINVAL; |
2256 | } | 2322 | } |
2257 | *lenp -= left; | 2323 | *lenp -= left; |
2324 | out: | ||
2258 | *ppos += *lenp; | 2325 | *ppos += *lenp; |
2259 | return err; | 2326 | return err; |
2260 | } | 2327 | } |
@@ -2501,11 +2568,11 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, | |||
2501 | bool first = 1; | 2568 | bool first = 1; |
2502 | size_t left = *lenp; | 2569 | size_t left = *lenp; |
2503 | unsigned long bitmap_len = table->maxlen; | 2570 | unsigned long bitmap_len = table->maxlen; |
2504 | unsigned long *bitmap = (unsigned long *) table->data; | 2571 | unsigned long *bitmap = *(unsigned long **) table->data; |
2505 | unsigned long *tmp_bitmap = NULL; | 2572 | unsigned long *tmp_bitmap = NULL; |
2506 | char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; | 2573 | char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; |
2507 | 2574 | ||
2508 | if (!bitmap_len || !left || (*ppos && !write)) { | 2575 | if (!bitmap || !bitmap_len || !left || (*ppos && !write)) { |
2509 | *lenp = 0; | 2576 | *lenp = 0; |
2510 | return 0; | 2577 | return 0; |
2511 | } | 2578 | } |
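
The kernel/sysctl.c changes above gate writes at a non-zero file offset on the new kernel.sysctl_writes_strict knob (-1 legacy, 0 warn-once, 1 strict). A userspace sketch of the behaviour being regulated, using pwrite() at offset 4 on a string sysctl: under legacy and warn modes the write still restarts at the beginning of the buffer, while strict mode honours the offset and drops writes past the end of the stored string (non-zero-offset writes to numeric sysctls are ignored outright). The choice of kernel.domainname is purely illustrative.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *val = "example";
	int fd = open("/proc/sys/kernel/domainname", O_WRONLY);
	ssize_t n;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Offset 4: treated as offset 0 under legacy/warn, honoured under strict. */
	n = pwrite(fd, val, strlen(val), 4);
	printf("pwrite at offset 4 returned %zd\n", n);

	close(fd);
	return 0;
}
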
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 419a52cecd20..33db43a39515 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -165,21 +165,21 @@ static inline void pps_set_freq(s64 freq) | |||
165 | 165 | ||
166 | static inline int is_error_status(int status) | 166 | static inline int is_error_status(int status) |
167 | { | 167 | { |
168 | return (time_status & (STA_UNSYNC|STA_CLOCKERR)) | 168 | return (status & (STA_UNSYNC|STA_CLOCKERR)) |
169 | /* PPS signal lost when either PPS time or | 169 | /* PPS signal lost when either PPS time or |
170 | * PPS frequency synchronization requested | 170 | * PPS frequency synchronization requested |
171 | */ | 171 | */ |
172 | || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) | 172 | || ((status & (STA_PPSFREQ|STA_PPSTIME)) |
173 | && !(time_status & STA_PPSSIGNAL)) | 173 | && !(status & STA_PPSSIGNAL)) |
174 | /* PPS jitter exceeded when | 174 | /* PPS jitter exceeded when |
175 | * PPS time synchronization requested */ | 175 | * PPS time synchronization requested */ |
176 | || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) | 176 | || ((status & (STA_PPSTIME|STA_PPSJITTER)) |
177 | == (STA_PPSTIME|STA_PPSJITTER)) | 177 | == (STA_PPSTIME|STA_PPSJITTER)) |
178 | /* PPS wander exceeded or calibration error when | 178 | /* PPS wander exceeded or calibration error when |
179 | * PPS frequency synchronization requested | 179 | * PPS frequency synchronization requested |
180 | */ | 180 | */ |
181 | || ((time_status & STA_PPSFREQ) | 181 | || ((status & STA_PPSFREQ) |
182 | && (time_status & (STA_PPSWANDER|STA_PPSERROR))); | 182 | && (status & (STA_PPSWANDER|STA_PPSERROR))); |
183 | } | 183 | } |
184 | 184 | ||
185 | static inline void pps_fill_timex(struct timex *txc) | 185 | static inline void pps_fill_timex(struct timex *txc) |
@@ -786,8 +786,9 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) | |||
786 | time_status |= STA_PPSERROR; | 786 | time_status |= STA_PPSERROR; |
787 | pps_errcnt++; | 787 | pps_errcnt++; |
788 | pps_dec_freq_interval(); | 788 | pps_dec_freq_interval(); |
789 | pr_err("hardpps: PPSERROR: interval too long - %ld s\n", | 789 | printk_deferred(KERN_ERR |
790 | freq_norm.sec); | 790 | "hardpps: PPSERROR: interval too long - %ld s\n", |
791 | freq_norm.sec); | ||
791 | return 0; | 792 | return 0; |
792 | } | 793 | } |
793 | 794 | ||
@@ -800,7 +801,8 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) | |||
800 | delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); | 801 | delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); |
801 | pps_freq = ftemp; | 802 | pps_freq = ftemp; |
802 | if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { | 803 | if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { |
803 | pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); | 804 | printk_deferred(KERN_WARNING |
805 | "hardpps: PPSWANDER: change=%ld\n", delta); | ||
804 | time_status |= STA_PPSWANDER; | 806 | time_status |= STA_PPSWANDER; |
805 | pps_stbcnt++; | 807 | pps_stbcnt++; |
806 | pps_dec_freq_interval(); | 808 | pps_dec_freq_interval(); |
@@ -844,8 +846,9 @@ static void hardpps_update_phase(long error) | |||
844 | * the time offset is updated. | 846 | * the time offset is updated. |
845 | */ | 847 | */ |
846 | if (jitter > (pps_jitter << PPS_POPCORN)) { | 848 | if (jitter > (pps_jitter << PPS_POPCORN)) { |
847 | pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", | 849 | printk_deferred(KERN_WARNING |
848 | jitter, (pps_jitter << PPS_POPCORN)); | 850 | "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", |
851 | jitter, (pps_jitter << PPS_POPCORN)); | ||
849 | time_status |= STA_PPSJITTER; | 852 | time_status |= STA_PPSJITTER; |
850 | pps_jitcnt++; | 853 | pps_jitcnt++; |
851 | } else if (time_status & STA_PPSTIME) { | 854 | } else if (time_status & STA_PPSTIME) { |
@@ -902,7 +905,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
902 | time_status |= STA_PPSJITTER; | 905 | time_status |= STA_PPSJITTER; |
903 | /* restart the frequency calibration interval */ | 906 | /* restart the frequency calibration interval */ |
904 | pps_fbase = *raw_ts; | 907 | pps_fbase = *raw_ts; |
905 | pr_err("hardpps: PPSJITTER: bad pulse\n"); | 908 | printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); |
906 | return; | 909 | return; |
907 | } | 910 | } |
908 | 911 | ||
@@ -923,7 +926,10 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
923 | 926 | ||
924 | static int __init ntp_tick_adj_setup(char *str) | 927 | static int __init ntp_tick_adj_setup(char *str) |
925 | { | 928 | { |
926 | ntp_tick_adj = simple_strtol(str, NULL, 0); | 929 | int rc = kstrtol(str, 0, (long *)&ntp_tick_adj); |
930 | |||
931 | if (rc) | ||
932 | return rc; | ||
927 | ntp_tick_adj <<= NTP_SCALE_SHIFT; | 933 | ntp_tick_adj <<= NTP_SCALE_SHIFT; |
928 | 934 | ||
929 | return 1; | 935 | return 1; |
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 4d23dc4d8139..445106d2c729 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c | |||
@@ -49,13 +49,6 @@ static u64 notrace jiffy_sched_clock_read(void) | |||
49 | return (u64)(jiffies - INITIAL_JIFFIES); | 49 | return (u64)(jiffies - INITIAL_JIFFIES); |
50 | } | 50 | } |
51 | 51 | ||
52 | static u32 __read_mostly (*read_sched_clock_32)(void); | ||
53 | |||
54 | static u64 notrace read_sched_clock_32_wrapper(void) | ||
55 | { | ||
56 | return read_sched_clock_32(); | ||
57 | } | ||
58 | |||
59 | static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; | 52 | static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; |
60 | 53 | ||
61 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | 54 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) |
@@ -176,12 +169,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits, | |||
176 | pr_debug("Registered %pF as sched_clock source\n", read); | 169 | pr_debug("Registered %pF as sched_clock source\n", read); |
177 | } | 170 | } |
178 | 171 | ||
179 | void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) | ||
180 | { | ||
181 | read_sched_clock_32 = read; | ||
182 | sched_clock_register(read_sched_clock_32_wrapper, bits, rate); | ||
183 | } | ||
184 | |||
185 | void __init sched_clock_postinit(void) | 172 | void __init sched_clock_postinit(void) |
186 | { | 173 | { |
187 | /* | 174 | /* |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f7df8ea21707..32d8d6aaedb8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -852,8 +852,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, | |||
852 | struct timespec *delta) | 852 | struct timespec *delta) |
853 | { | 853 | { |
854 | if (!timespec_valid_strict(delta)) { | 854 | if (!timespec_valid_strict(delta)) { |
855 | printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " | 855 | printk_deferred(KERN_WARNING |
856 | "sleep delta value!\n"); | 856 | "__timekeeping_inject_sleeptime: Invalid " |
857 | "sleep delta value!\n"); | ||
857 | return; | 858 | return; |
858 | } | 859 | } |
859 | tk_xtime_add(tk, delta); | 860 | tk_xtime_add(tk, delta); |
@@ -1157,7 +1158,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
1157 | 1158 | ||
1158 | if (unlikely(tk->clock->maxadj && | 1159 | if (unlikely(tk->clock->maxadj && |
1159 | (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { | 1160 | (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { |
1160 | printk_once(KERN_WARNING | 1161 | printk_deferred_once(KERN_WARNING |
1161 | "Adjusting %s more than 11%% (%ld vs %ld)\n", | 1162 | "Adjusting %s more than 11%% (%ld vs %ld)\n", |
1162 | tk->clock->name, (long)tk->mult + adj, | 1163 | tk->clock->name, (long)tk->mult + adj, |
1163 | (long)tk->clock->mult + tk->clock->maxadj); | 1164 | (long)tk->clock->mult + tk->clock->maxadj); |
diff --git a/kernel/torture.c b/kernel/torture.c index acc9afc2f26e..40bb511cca48 100644 --- a/kernel/torture.c +++ b/kernel/torture.c | |||
@@ -335,13 +335,8 @@ static void torture_shuffle_tasks(void) | |||
335 | shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask); | 335 | shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask); |
336 | if (shuffle_idle_cpu >= nr_cpu_ids) | 336 | if (shuffle_idle_cpu >= nr_cpu_ids) |
337 | shuffle_idle_cpu = -1; | 337 | shuffle_idle_cpu = -1; |
338 | if (shuffle_idle_cpu != -1) { | 338 | else |
339 | cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask); | 339 | cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask); |
340 | if (cpumask_empty(shuffle_tmp_mask)) { | ||
341 | put_online_cpus(); | ||
342 | return; | ||
343 | } | ||
344 | } | ||
345 | 340 | ||
346 | mutex_lock(&shuffle_task_mutex); | 341 | mutex_lock(&shuffle_task_mutex); |
347 | list_for_each_entry(stp, &shuffle_task_list, st_l) | 342 | list_for_each_entry(stp, &shuffle_task_list, st_l) |
@@ -533,7 +528,11 @@ void stutter_wait(const char *title) | |||
533 | while (ACCESS_ONCE(stutter_pause_test) || | 528 | while (ACCESS_ONCE(stutter_pause_test) || |
534 | (torture_runnable && !ACCESS_ONCE(*torture_runnable))) { | 529 | (torture_runnable && !ACCESS_ONCE(*torture_runnable))) { |
535 | if (stutter_pause_test) | 530 | if (stutter_pause_test) |
536 | schedule_timeout_interruptible(1); | 531 | if (ACCESS_ONCE(stutter_pause_test) == 1) |
532 | schedule_timeout_interruptible(1); | ||
533 | else | ||
534 | while (ACCESS_ONCE(stutter_pause_test)) | ||
535 | cond_resched(); | ||
537 | else | 536 | else |
538 | schedule_timeout_interruptible(round_jiffies_relative(HZ)); | 537 | schedule_timeout_interruptible(round_jiffies_relative(HZ)); |
539 | torture_shutdown_absorb(title); | 538 | torture_shutdown_absorb(title); |
@@ -550,7 +549,11 @@ static int torture_stutter(void *arg) | |||
550 | VERBOSE_TOROUT_STRING("torture_stutter task started"); | 549 | VERBOSE_TOROUT_STRING("torture_stutter task started"); |
551 | do { | 550 | do { |
552 | if (!torture_must_stop()) { | 551 | if (!torture_must_stop()) { |
553 | schedule_timeout_interruptible(stutter); | 552 | if (stutter > 1) { |
553 | schedule_timeout_interruptible(stutter - 1); | ||
554 | ACCESS_ONCE(stutter_pause_test) = 2; | ||
555 | } | ||
556 | schedule_timeout_interruptible(1); | ||
554 | ACCESS_ONCE(stutter_pause_test) = 1; | 557 | ACCESS_ONCE(stutter_pause_test) = 1; |
555 | } | 558 | } |
556 | if (!torture_must_stop()) | 559 | if (!torture_must_stop()) |
@@ -596,21 +599,27 @@ static void torture_stutter_cleanup(void) | |||
596 | * The runnable parameter points to a flag that controls whether or not | 599 | * The runnable parameter points to a flag that controls whether or not |
597 | * the test is currently runnable. If there is no such flag, pass in NULL. | 600 | * the test is currently runnable. If there is no such flag, pass in NULL. |
598 | */ | 601 | */ |
599 | void __init torture_init_begin(char *ttype, bool v, int *runnable) | 602 | bool torture_init_begin(char *ttype, bool v, int *runnable) |
600 | { | 603 | { |
601 | mutex_lock(&fullstop_mutex); | 604 | mutex_lock(&fullstop_mutex); |
605 | if (torture_type != NULL) { | ||
606 | pr_alert("torture_init_begin: refusing %s init: %s running", | ||
607 | ttype, torture_type); | ||
608 | mutex_unlock(&fullstop_mutex); | ||
609 | return false; | ||
610 | } | ||
602 | torture_type = ttype; | 611 | torture_type = ttype; |
603 | verbose = v; | 612 | verbose = v; |
604 | torture_runnable = runnable; | 613 | torture_runnable = runnable; |
605 | fullstop = FULLSTOP_DONTSTOP; | 614 | fullstop = FULLSTOP_DONTSTOP; |
606 | 615 | return true; | |
607 | } | 616 | } |
608 | EXPORT_SYMBOL_GPL(torture_init_begin); | 617 | EXPORT_SYMBOL_GPL(torture_init_begin); |
609 | 618 | ||
610 | /* | 619 | /* |
611 | * Tell the torture module that initialization is complete. | 620 | * Tell the torture module that initialization is complete. |
612 | */ | 621 | */ |
613 | void __init torture_init_end(void) | 622 | void torture_init_end(void) |
614 | { | 623 | { |
615 | mutex_unlock(&fullstop_mutex); | 624 | mutex_unlock(&fullstop_mutex); |
616 | register_reboot_notifier(&torture_shutdown_nb); | 625 | register_reboot_notifier(&torture_shutdown_nb); |
@@ -642,6 +651,9 @@ bool torture_cleanup(void) | |||
642 | torture_shuffle_cleanup(); | 651 | torture_shuffle_cleanup(); |
643 | torture_stutter_cleanup(); | 652 | torture_stutter_cleanup(); |
644 | torture_onoff_cleanup(); | 653 | torture_onoff_cleanup(); |
654 | mutex_lock(&fullstop_mutex); | ||
655 | torture_type = NULL; | ||
656 | mutex_unlock(&fullstop_mutex); | ||
645 | return false; | 657 | return false; |
646 | } | 658 | } |
647 | EXPORT_SYMBOL_GPL(torture_cleanup); | 659 | EXPORT_SYMBOL_GPL(torture_cleanup); |
@@ -674,8 +686,10 @@ EXPORT_SYMBOL_GPL(torture_must_stop_irq); | |||
674 | */ | 686 | */ |
675 | void torture_kthread_stopping(char *title) | 687 | void torture_kthread_stopping(char *title) |
676 | { | 688 | { |
677 | if (verbose) | 689 | char buf[128]; |
678 | VERBOSE_TOROUT_STRING(title); | 690 | |
691 | snprintf(buf, sizeof(buf), "Stopping %s", title); | ||
692 | VERBOSE_TOROUT_STRING(buf); | ||
679 | while (!kthread_should_stop()) { | 693 | while (!kthread_should_stop()) { |
680 | torture_shutdown_absorb(title); | 694 | torture_shutdown_absorb(title); |
681 | schedule_timeout_uninterruptible(1); | 695 | schedule_timeout_uninterruptible(1); |
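
torture_init_begin() above now refuses to start when another torture type already owns the test machinery, reporting that through its new bool return (and it is no longer marked __init). A sketch of how a caller would be expected to react; the module name, type string, and runnable flag are made up for illustration.

#include <linux/module.h>
#include <linux/torture.h>

static char example_type[] = "example";
static int example_runnable;

static int __init example_torture_init(void)
{
	/* Bail out cleanly if some other torture test is already running. */
	if (!torture_init_begin(example_type, true, &example_runnable))
		return -EBUSY;

	/* ... create kthreads, set up onoff/shuffle/stutter here ... */

	torture_init_end();
	return 0;
}
module_init(example_torture_init);
MODULE_LICENSE("GPL");
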
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8639819f6cef..d4409356f40d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -535,6 +535,36 @@ config MMIOTRACE_TEST | |||
535 | 535 | ||
536 | Say N, unless you absolutely know what you are doing. | 536 | Say N, unless you absolutely know what you are doing. |
537 | 537 | ||
538 | config TRACEPOINT_BENCHMARK | ||
539 | bool "Add tracepoint that benchmarks tracepoints" | ||
540 | help | ||
541 | This option creates the tracepoint "benchmark:benchmark_event". | ||
542 | When the tracepoint is enabled, it kicks off a kernel thread that | ||
543 | goes into an infinite loop (calling cond_sched() to let other tasks | ||
544 | run), and calls the tracepoint. Each iteration will record the time | ||
545 | it took to write to the tracepoint and the next iteration that | ||
546 | data will be passed to the tracepoint itself. That is, the tracepoint | ||
547 | will report the time it took to do the previous tracepoint. | ||
548 | The string written to the tracepoint is a static string of 128 bytes | ||
549 | to keep the time the same. The initial string is simply a write of | ||
550 | "START". The second string records the cold cache time of the first | ||
551 | write which is not added to the rest of the calculations. | ||
552 | |||
553 | As it is a tight loop, it benchmarks as hot cache. That's fine because | ||
554 | we care most about hot paths that are probably in cache already. | ||
555 | |||
556 | An example of the output: | ||
557 | |||
558 | START | ||
559 | first=3672 [COLD CACHED] | ||
560 | last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712 | ||
561 | last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337 | ||
562 | last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064 | ||
563 | last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411 | ||
564 | last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389 | ||
565 | last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666 | ||
566 | |||
567 | |||
538 | config RING_BUFFER_BENCHMARK | 568 | config RING_BUFFER_BENCHMARK |
539 | tristate "Ring buffer benchmark stress tester" | 569 | tristate "Ring buffer benchmark stress tester" |
540 | depends on RING_BUFFER | 570 | depends on RING_BUFFER |
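
The TRACEPOINT_BENCHMARK help text above describes output that appears once the benchmark:benchmark_event tracepoint is enabled. A small userspace sketch of one way to switch the event on and stream the results through tracefs; the /sys/kernel/debug/tracing mount point and the events/benchmark/benchmark_event path follow the usual tracefs layout and are assumptions, not something stated in the hunk.

#include <stdio.h>

#define TRACEFS "/sys/kernel/debug/tracing"

int main(void)
{
	FILE *en = fopen(TRACEFS "/events/benchmark/benchmark_event/enable", "w");
	FILE *tp;
	char line[256];

	if (!en) {
		perror("enable");
		return 1;
	}
	fputs("1\n", en);	/* kicks off the benchmark kthread */
	fclose(en);

	tp = fopen(TRACEFS "/trace_pipe", "r");
	if (!tp) {
		perror("trace_pipe");
		return 1;
	}
	while (fgets(line, sizeof(line), tp))	/* last=... first=... max=... lines */
		fputs(line, stdout);
	fclose(tp);
	return 0;
}
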
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 1378e84fbe39..2611613f14f1 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -17,6 +17,7 @@ ifdef CONFIG_TRACING_BRANCHES | |||
17 | KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING | 17 | KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING |
18 | endif | 18 | endif |
19 | 19 | ||
20 | CFLAGS_trace_benchmark.o := -I$(src) | ||
20 | CFLAGS_trace_events_filter.o := -I$(src) | 21 | CFLAGS_trace_events_filter.o := -I$(src) |
21 | 22 | ||
22 | obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o | 23 | obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o |
@@ -62,4 +63,6 @@ endif | |||
62 | obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o | 63 | obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o |
63 | obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o | 64 | obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o |
64 | 65 | ||
66 | obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o | ||
67 | |||
65 | libftrace-y := ftrace.o | 68 | libftrace-y := ftrace.o |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4a54a25afa2f..5b372e3ed675 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -62,7 +62,7 @@ | |||
62 | #define FTRACE_HASH_DEFAULT_BITS 10 | 62 | #define FTRACE_HASH_DEFAULT_BITS 10 |
63 | #define FTRACE_HASH_MAX_BITS 12 | 63 | #define FTRACE_HASH_MAX_BITS 12 |
64 | 64 | ||
65 | #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) | 65 | #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) |
66 | 66 | ||
67 | #ifdef CONFIG_DYNAMIC_FTRACE | 67 | #ifdef CONFIG_DYNAMIC_FTRACE |
68 | #define INIT_REGEX_LOCK(opsname) \ | 68 | #define INIT_REGEX_LOCK(opsname) \ |
@@ -103,7 +103,6 @@ static int ftrace_disabled __read_mostly; | |||
103 | 103 | ||
104 | static DEFINE_MUTEX(ftrace_lock); | 104 | static DEFINE_MUTEX(ftrace_lock); |
105 | 105 | ||
106 | static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; | ||
107 | static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; | 106 | static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; |
108 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; | 107 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; |
109 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; | 108 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; |
@@ -171,23 +170,6 @@ int ftrace_nr_registered_ops(void) | |||
171 | return cnt; | 170 | return cnt; |
172 | } | 171 | } |
173 | 172 | ||
174 | static void | ||
175 | ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, | ||
176 | struct ftrace_ops *op, struct pt_regs *regs) | ||
177 | { | ||
178 | int bit; | ||
179 | |||
180 | bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX); | ||
181 | if (bit < 0) | ||
182 | return; | ||
183 | |||
184 | do_for_each_ftrace_op(op, ftrace_global_list) { | ||
185 | op->func(ip, parent_ip, op, regs); | ||
186 | } while_for_each_ftrace_op(op); | ||
187 | |||
188 | trace_clear_recursion(bit); | ||
189 | } | ||
190 | |||
191 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, | 173 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, |
192 | struct ftrace_ops *op, struct pt_regs *regs) | 174 | struct ftrace_ops *op, struct pt_regs *regs) |
193 | { | 175 | { |
@@ -237,43 +219,6 @@ static int control_ops_alloc(struct ftrace_ops *ops) | |||
237 | return 0; | 219 | return 0; |
238 | } | 220 | } |
239 | 221 | ||
240 | static void update_global_ops(void) | ||
241 | { | ||
242 | ftrace_func_t func = ftrace_global_list_func; | ||
243 | void *private = NULL; | ||
244 | |||
245 | /* The list has its own recursion protection. */ | ||
246 | global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; | ||
247 | |||
248 | /* | ||
249 | * If there's only one function registered, then call that | ||
250 | * function directly. Otherwise, we need to iterate over the | ||
251 | * registered callers. | ||
252 | */ | ||
253 | if (ftrace_global_list == &ftrace_list_end || | ||
254 | ftrace_global_list->next == &ftrace_list_end) { | ||
255 | func = ftrace_global_list->func; | ||
256 | private = ftrace_global_list->private; | ||
257 | /* | ||
258 | * As we are calling the function directly. | ||
259 | * If it does not have recursion protection, | ||
260 | * the function_trace_op needs to be updated | ||
261 | * accordingly. | ||
262 | */ | ||
263 | if (!(ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)) | ||
264 | global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; | ||
265 | } | ||
266 | |||
267 | /* If we filter on pids, update to use the pid function */ | ||
268 | if (!list_empty(&ftrace_pids)) { | ||
269 | set_ftrace_pid_function(func); | ||
270 | func = ftrace_pid_func; | ||
271 | } | ||
272 | |||
273 | global_ops.func = func; | ||
274 | global_ops.private = private; | ||
275 | } | ||
276 | |||
277 | static void ftrace_sync(struct work_struct *work) | 222 | static void ftrace_sync(struct work_struct *work) |
278 | { | 223 | { |
279 | /* | 224 | /* |
@@ -301,8 +246,6 @@ static void update_ftrace_function(void) | |||
301 | { | 246 | { |
302 | ftrace_func_t func; | 247 | ftrace_func_t func; |
303 | 248 | ||
304 | update_global_ops(); | ||
305 | |||
306 | /* | 249 | /* |
307 | * If we are at the end of the list and this ops is | 250 | * If we are at the end of the list and this ops is |
308 | * recursion safe and not dynamic and the arch supports passing ops, | 251 | * recursion safe and not dynamic and the arch supports passing ops, |
@@ -314,10 +257,7 @@ static void update_ftrace_function(void) | |||
314 | (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && | 257 | (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && |
315 | !FTRACE_FORCE_LIST_FUNC)) { | 258 | !FTRACE_FORCE_LIST_FUNC)) { |
316 | /* Set the ftrace_ops that the arch callback uses */ | 259 | /* Set the ftrace_ops that the arch callback uses */ |
317 | if (ftrace_ops_list == &global_ops) | 260 | set_function_trace_op = ftrace_ops_list; |
318 | set_function_trace_op = ftrace_global_list; | ||
319 | else | ||
320 | set_function_trace_op = ftrace_ops_list; | ||
321 | func = ftrace_ops_list->func; | 261 | func = ftrace_ops_list->func; |
322 | } else { | 262 | } else { |
323 | /* Just use the default ftrace_ops */ | 263 | /* Just use the default ftrace_ops */ |
@@ -373,6 +313,11 @@ static void update_ftrace_function(void) | |||
373 | ftrace_trace_function = func; | 313 | ftrace_trace_function = func; |
374 | } | 314 | } |
375 | 315 | ||
316 | int using_ftrace_ops_list_func(void) | ||
317 | { | ||
318 | return ftrace_trace_function == ftrace_ops_list_func; | ||
319 | } | ||
320 | |||
376 | static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) | 321 | static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) |
377 | { | 322 | { |
378 | ops->next = *list; | 323 | ops->next = *list; |
@@ -434,16 +379,9 @@ static int __register_ftrace_function(struct ftrace_ops *ops) | |||
434 | if (ops->flags & FTRACE_OPS_FL_DELETED) | 379 | if (ops->flags & FTRACE_OPS_FL_DELETED) |
435 | return -EINVAL; | 380 | return -EINVAL; |
436 | 381 | ||
437 | if (FTRACE_WARN_ON(ops == &global_ops)) | ||
438 | return -EINVAL; | ||
439 | |||
440 | if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) | 382 | if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) |
441 | return -EBUSY; | 383 | return -EBUSY; |
442 | 384 | ||
443 | /* We don't support both control and global flags set. */ | ||
444 | if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) | ||
445 | return -EINVAL; | ||
446 | |||
447 | #ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS | 385 | #ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS |
448 | /* | 386 | /* |
449 | * If the ftrace_ops specifies SAVE_REGS, then it only can be used | 387 | * If the ftrace_ops specifies SAVE_REGS, then it only can be used |
@@ -461,10 +399,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) | |||
461 | if (!core_kernel_data((unsigned long)ops)) | 399 | if (!core_kernel_data((unsigned long)ops)) |
462 | ops->flags |= FTRACE_OPS_FL_DYNAMIC; | 400 | ops->flags |= FTRACE_OPS_FL_DYNAMIC; |
463 | 401 | ||
464 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | 402 | if (ops->flags & FTRACE_OPS_FL_CONTROL) { |
465 | add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops); | ||
466 | ops->flags |= FTRACE_OPS_FL_ENABLED; | ||
467 | } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { | ||
468 | if (control_ops_alloc(ops)) | 403 | if (control_ops_alloc(ops)) |
469 | return -ENOMEM; | 404 | return -ENOMEM; |
470 | add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); | 405 | add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); |
@@ -484,15 +419,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
484 | if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) | 419 | if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) |
485 | return -EBUSY; | 420 | return -EBUSY; |
486 | 421 | ||
487 | if (FTRACE_WARN_ON(ops == &global_ops)) | 422 | if (ops->flags & FTRACE_OPS_FL_CONTROL) { |
488 | return -EINVAL; | ||
489 | |||
490 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | ||
491 | ret = remove_ftrace_list_ops(&ftrace_global_list, | ||
492 | &global_ops, ops); | ||
493 | if (!ret) | ||
494 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | ||
495 | } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { | ||
496 | ret = remove_ftrace_list_ops(&ftrace_control_list, | 423 | ret = remove_ftrace_list_ops(&ftrace_control_list, |
497 | &control_ops, ops); | 424 | &control_ops, ops); |
498 | } else | 425 | } else |
@@ -895,7 +822,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, | |||
895 | 822 | ||
896 | local_irq_save(flags); | 823 | local_irq_save(flags); |
897 | 824 | ||
898 | stat = &__get_cpu_var(ftrace_profile_stats); | 825 | stat = this_cpu_ptr(&ftrace_profile_stats); |
899 | if (!stat->hash || !ftrace_profile_enabled) | 826 | if (!stat->hash || !ftrace_profile_enabled) |
900 | goto out; | 827 | goto out; |
901 | 828 | ||
@@ -926,7 +853,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) | |||
926 | unsigned long flags; | 853 | unsigned long flags; |
927 | 854 | ||
928 | local_irq_save(flags); | 855 | local_irq_save(flags); |
929 | stat = &__get_cpu_var(ftrace_profile_stats); | 856 | stat = this_cpu_ptr(&ftrace_profile_stats); |
930 | if (!stat->hash || !ftrace_profile_enabled) | 857 | if (!stat->hash || !ftrace_profile_enabled) |
931 | goto out; | 858 | goto out; |
932 | 859 | ||
@@ -1178,7 +1105,7 @@ struct ftrace_page { | |||
1178 | static struct ftrace_page *ftrace_pages_start; | 1105 | static struct ftrace_page *ftrace_pages_start; |
1179 | static struct ftrace_page *ftrace_pages; | 1106 | static struct ftrace_page *ftrace_pages; |
1180 | 1107 | ||
1181 | static bool ftrace_hash_empty(struct ftrace_hash *hash) | 1108 | static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash) |
1182 | { | 1109 | { |
1183 | return !hash || !hash->count; | 1110 | return !hash || !hash->count; |
1184 | } | 1111 | } |
@@ -1625,7 +1552,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
1625 | in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); | 1552 | in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); |
1626 | 1553 | ||
1627 | /* | 1554 | /* |
1555 | * If filter_hash is set, we want to match all functions | ||
1556 | * that are in the hash but not in the other hash. | ||
1628 | * | 1557 | * |
1558 | * If filter_hash is not set, then we are decrementing. | ||
1559 | * That means we match anything that is in the hash | ||
1560 | * and also in the other_hash. That is, we need to turn | ||
1561 | * off functions in the other hash because they are disabled | ||
1562 | * by this hash. | ||
1629 | */ | 1563 | */ |
1630 | if (filter_hash && in_hash && !in_other_hash) | 1564 | if (filter_hash && in_hash && !in_other_hash) |
1631 | match = 1; | 1565 | match = 1; |
@@ -1767,19 +1701,15 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) | |||
1767 | /* | 1701 | /* |
1768 | * If this record is being updated from a nop, then | 1702 | * If this record is being updated from a nop, then |
1769 | * return UPDATE_MAKE_CALL. | 1703 | * return UPDATE_MAKE_CALL. |
1770 | * Otherwise, if the EN flag is set, then return | ||
1771 | * UPDATE_MODIFY_CALL_REGS to tell the caller to convert | ||
1772 | * from the non-save regs, to a save regs function. | ||
1773 | * Otherwise, | 1704 | * Otherwise, |
1774 | * return UPDATE_MODIFY_CALL to tell the caller to convert | 1705 | * return UPDATE_MODIFY_CALL to tell the caller to convert |
1775 | * from the save regs, to a non-save regs function. | 1706 | * from the save regs, to a non-save regs function or |
1707 | * vice versa. | ||
1776 | */ | 1708 | */ |
1777 | if (flag & FTRACE_FL_ENABLED) | 1709 | if (flag & FTRACE_FL_ENABLED) |
1778 | return FTRACE_UPDATE_MAKE_CALL; | 1710 | return FTRACE_UPDATE_MAKE_CALL; |
1779 | else if (rec->flags & FTRACE_FL_REGS_EN) | 1711 | |
1780 | return FTRACE_UPDATE_MODIFY_CALL_REGS; | 1712 | return FTRACE_UPDATE_MODIFY_CALL; |
1781 | else | ||
1782 | return FTRACE_UPDATE_MODIFY_CALL; | ||
1783 | } | 1713 | } |
1784 | 1714 | ||
1785 | if (update) { | 1715 | if (update) { |
@@ -1821,6 +1751,42 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) | |||
1821 | return ftrace_check_record(rec, enable, 0); | 1751 | return ftrace_check_record(rec, enable, 0); |
1822 | } | 1752 | } |
1823 | 1753 | ||
1754 | /** | ||
1755 | * ftrace_get_addr_new - Get the call address to set to | ||
1756 | * @rec: The ftrace record descriptor | ||
1757 | * | ||
1758 | * If the record has the FTRACE_FL_REGS set, that means that it | ||
1759 | * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS | ||
1760 | * is not set, then it wants to convert to the normal callback. | ||
1761 | * | ||
1762 | * Returns the address of the trampoline to set the call to | ||
1763 | */ | ||
1764 | unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) | ||
1765 | { | ||
1766 | if (rec->flags & FTRACE_FL_REGS) | ||
1767 | return (unsigned long)FTRACE_REGS_ADDR; | ||
1768 | else | ||
1769 | return (unsigned long)FTRACE_ADDR; | ||
1770 | } | ||
1771 | |||
1772 | /** | ||
1773 | * ftrace_get_addr_curr - Get the call address that is already there | ||
1774 | * @rec: The ftrace record descriptor | ||
1775 | * | ||
1776 | * The FTRACE_FL_REGS_EN is set when the record already points to | ||
1777 | * a function that saves all the regs. Basically the '_EN' version | ||
1778 | * represents the current state of the function. | ||
1779 | * | ||
1780 | * Returns the address of the trampoline that is currently being called | ||
1781 | */ | ||
1782 | unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) | ||
1783 | { | ||
1784 | if (rec->flags & FTRACE_FL_REGS_EN) | ||
1785 | return (unsigned long)FTRACE_REGS_ADDR; | ||
1786 | else | ||
1787 | return (unsigned long)FTRACE_ADDR; | ||
1788 | } | ||
1789 | |||
1824 | static int | 1790 | static int |
1825 | __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | 1791 | __ftrace_replace_code(struct dyn_ftrace *rec, int enable) |
1826 | { | 1792 | { |
@@ -1828,12 +1794,12 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
1828 | unsigned long ftrace_addr; | 1794 | unsigned long ftrace_addr; |
1829 | int ret; | 1795 | int ret; |
1830 | 1796 | ||
1831 | ret = ftrace_update_record(rec, enable); | 1797 | ftrace_addr = ftrace_get_addr_new(rec); |
1832 | 1798 | ||
1833 | if (rec->flags & FTRACE_FL_REGS) | 1799 | /* This needs to be done before we call ftrace_update_record */ |
1834 | ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; | 1800 | ftrace_old_addr = ftrace_get_addr_curr(rec); |
1835 | else | 1801 | |
1836 | ftrace_addr = (unsigned long)FTRACE_ADDR; | 1802 | ret = ftrace_update_record(rec, enable); |
1837 | 1803 | ||
1838 | switch (ret) { | 1804 | switch (ret) { |
1839 | case FTRACE_UPDATE_IGNORE: | 1805 | case FTRACE_UPDATE_IGNORE: |
@@ -1845,13 +1811,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
1845 | case FTRACE_UPDATE_MAKE_NOP: | 1811 | case FTRACE_UPDATE_MAKE_NOP: |
1846 | return ftrace_make_nop(NULL, rec, ftrace_addr); | 1812 | return ftrace_make_nop(NULL, rec, ftrace_addr); |
1847 | 1813 | ||
1848 | case FTRACE_UPDATE_MODIFY_CALL_REGS: | ||
1849 | case FTRACE_UPDATE_MODIFY_CALL: | 1814 | case FTRACE_UPDATE_MODIFY_CALL: |
1850 | if (rec->flags & FTRACE_FL_REGS) | ||
1851 | ftrace_old_addr = (unsigned long)FTRACE_ADDR; | ||
1852 | else | ||
1853 | ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR; | ||
1854 | |||
1855 | return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); | 1815 | return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); |
1856 | } | 1816 | } |
1857 | 1817 | ||
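The pair of helpers added above (ftrace_get_addr_new() and ftrace_get_addr_curr()) reduces __ftrace_replace_code() to: compute the trampoline the call site should use next, compute the one it uses today (before ftrace_update_record() flips FTRACE_FL_REGS_EN), and hand both to ftrace_modify_call() when the verdict is FTRACE_UPDATE_MODIFY_CALL. A minimal user-space model of that address selection is sketched below; the flag bits and the printed names are illustrative stand-ins, not the kernel's real values.

    #include <stdio.h>

    /* Illustrative stand-ins for the dyn_ftrace flag bits (not the real values). */
    #define FL_REGS     (1u << 0)  /* a registered ops wants the regs-saving trampoline   */
    #define FL_REGS_EN  (1u << 1)  /* the regs-saving trampoline is what is installed now */

    /* Mirrors ftrace_get_addr_new(): the trampoline the site should call next. */
    static const char *addr_new(unsigned int flags)
    {
            return (flags & FL_REGS) ? "FTRACE_REGS_ADDR" : "FTRACE_ADDR";
    }

    /* Mirrors ftrace_get_addr_curr(): the trampoline the site calls today. */
    static const char *addr_curr(unsigned int flags)
    {
            return (flags & FL_REGS_EN) ? "FTRACE_REGS_ADDR" : "FTRACE_ADDR";
    }

    int main(void)
    {
            /* Site currently on the plain trampoline, but an ops now wants regs:
             * one FTRACE_UPDATE_MODIFY_CALL switches curr -> new in place. */
            unsigned int flags = FL_REGS;

            printf("modify call: %s -> %s\n", addr_curr(flags), addr_new(flags));
            return 0;
    }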
@@ -2115,7 +2075,6 @@ static void ftrace_startup_enable(int command) | |||
2115 | 2075 | ||
2116 | static int ftrace_startup(struct ftrace_ops *ops, int command) | 2076 | static int ftrace_startup(struct ftrace_ops *ops, int command) |
2117 | { | 2077 | { |
2118 | bool hash_enable = true; | ||
2119 | int ret; | 2078 | int ret; |
2120 | 2079 | ||
2121 | if (unlikely(ftrace_disabled)) | 2080 | if (unlikely(ftrace_disabled)) |
@@ -2128,18 +2087,9 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) | |||
2128 | ftrace_start_up++; | 2087 | ftrace_start_up++; |
2129 | command |= FTRACE_UPDATE_CALLS; | 2088 | command |= FTRACE_UPDATE_CALLS; |
2130 | 2089 | ||
2131 | /* ops marked global share the filter hashes */ | ||
2132 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | ||
2133 | ops = &global_ops; | ||
2134 | /* Don't update hash if global is already set */ | ||
2135 | if (global_start_up) | ||
2136 | hash_enable = false; | ||
2137 | global_start_up++; | ||
2138 | } | ||
2139 | |||
2140 | ops->flags |= FTRACE_OPS_FL_ENABLED; | 2090 | ops->flags |= FTRACE_OPS_FL_ENABLED; |
2141 | if (hash_enable) | 2091 | |
2142 | ftrace_hash_rec_enable(ops, 1); | 2092 | ftrace_hash_rec_enable(ops, 1); |
2143 | 2093 | ||
2144 | ftrace_startup_enable(command); | 2094 | ftrace_startup_enable(command); |
2145 | 2095 | ||
@@ -2148,7 +2098,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) | |||
2148 | 2098 | ||
2149 | static int ftrace_shutdown(struct ftrace_ops *ops, int command) | 2099 | static int ftrace_shutdown(struct ftrace_ops *ops, int command) |
2150 | { | 2100 | { |
2151 | bool hash_disable = true; | ||
2152 | int ret; | 2101 | int ret; |
2153 | 2102 | ||
2154 | if (unlikely(ftrace_disabled)) | 2103 | if (unlikely(ftrace_disabled)) |
@@ -2166,21 +2115,9 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
2166 | */ | 2115 | */ |
2167 | WARN_ON_ONCE(ftrace_start_up < 0); | 2116 | WARN_ON_ONCE(ftrace_start_up < 0); |
2168 | 2117 | ||
2169 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | 2118 | ftrace_hash_rec_disable(ops, 1); |
2170 | ops = &global_ops; | ||
2171 | global_start_up--; | ||
2172 | WARN_ON_ONCE(global_start_up < 0); | ||
2173 | /* Don't update hash if global still has users */ | ||
2174 | if (global_start_up) { | ||
2175 | WARN_ON_ONCE(!ftrace_start_up); | ||
2176 | hash_disable = false; | ||
2177 | } | ||
2178 | } | ||
2179 | |||
2180 | if (hash_disable) | ||
2181 | ftrace_hash_rec_disable(ops, 1); | ||
2182 | 2119 | ||
2183 | if (ops != &global_ops || !global_start_up) | 2120 | if (!global_start_up) |
2184 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | 2121 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; |
2185 | 2122 | ||
2186 | command |= FTRACE_UPDATE_CALLS; | 2123 | command |= FTRACE_UPDATE_CALLS; |
@@ -3524,10 +3461,6 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
3524 | struct ftrace_hash *hash; | 3461 | struct ftrace_hash *hash; |
3525 | int ret; | 3462 | int ret; |
3526 | 3463 | ||
3527 | /* All global ops uses the global ops filters */ | ||
3528 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) | ||
3529 | ops = &global_ops; | ||
3530 | |||
3531 | if (unlikely(ftrace_disabled)) | 3464 | if (unlikely(ftrace_disabled)) |
3532 | return -ENODEV; | 3465 | return -ENODEV; |
3533 | 3466 | ||
@@ -3639,8 +3572,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, | |||
3639 | } | 3572 | } |
3640 | EXPORT_SYMBOL_GPL(ftrace_set_notrace); | 3573 | EXPORT_SYMBOL_GPL(ftrace_set_notrace); |
3641 | /** | 3574 | /** |
3642 | * ftrace_set_filter - set a function to filter on in ftrace | 3575 | * ftrace_set_global_filter - set a function to filter on with global tracers |
3643 | * @ops - the ops to set the filter with | ||
3644 | * @buf - the string that holds the function filter text. | 3576 | * @buf - the string that holds the function filter text. |
3645 | * @len - the length of the string. | 3577 | * @len - the length of the string. |
3646 | * @reset - non zero to reset all filters before applying this filter. | 3578 | * @reset - non zero to reset all filters before applying this filter. |
@@ -3655,8 +3587,7 @@ void ftrace_set_global_filter(unsigned char *buf, int len, int reset) | |||
3655 | EXPORT_SYMBOL_GPL(ftrace_set_global_filter); | 3587 | EXPORT_SYMBOL_GPL(ftrace_set_global_filter); |
3656 | 3588 | ||
3657 | /** | 3589 | /** |
3658 | * ftrace_set_notrace - set a function to not trace in ftrace | 3590 | * ftrace_set_global_notrace - set a function to not trace with global tracers |
3659 | * @ops - the ops to set the notrace filter with | ||
3660 | * @buf - the string that holds the function notrace text. | 3591 | * @buf - the string that holds the function notrace text. |
3661 | * @len - the length of the string. | 3592 | * @len - the length of the string. |
3662 | * @reset - non zero to reset all filters before applying this filter. | 3593 | * @reset - non zero to reset all filters before applying this filter. |
@@ -4443,6 +4374,34 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) | |||
4443 | 4374 | ||
4444 | #endif /* CONFIG_DYNAMIC_FTRACE */ | 4375 | #endif /* CONFIG_DYNAMIC_FTRACE */ |
4445 | 4376 | ||
4377 | __init void ftrace_init_global_array_ops(struct trace_array *tr) | ||
4378 | { | ||
4379 | tr->ops = &global_ops; | ||
4380 | tr->ops->private = tr; | ||
4381 | } | ||
4382 | |||
4383 | void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) | ||
4384 | { | ||
4385 | /* If we filter on pids, update to use the pid function */ | ||
4386 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { | ||
4387 | if (WARN_ON(tr->ops->func != ftrace_stub)) | ||
4388 | printk("ftrace ops had %pS for function\n", | ||
4389 | tr->ops->func); | ||
4390 | /* Only the top level instance does pid tracing */ | ||
4391 | if (!list_empty(&ftrace_pids)) { | ||
4392 | set_ftrace_pid_function(func); | ||
4393 | func = ftrace_pid_func; | ||
4394 | } | ||
4395 | } | ||
4396 | tr->ops->func = func; | ||
4397 | tr->ops->private = tr; | ||
4398 | } | ||
4399 | |||
4400 | void ftrace_reset_array_ops(struct trace_array *tr) | ||
4401 | { | ||
4402 | tr->ops->func = ftrace_stub; | ||
4403 | } | ||
4404 | |||
4446 | static void | 4405 | static void |
4447 | ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, | 4406 | ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, |
4448 | struct ftrace_ops *op, struct pt_regs *regs) | 4407 | struct ftrace_ops *op, struct pt_regs *regs) |
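With the FTRACE_OPS_FL_GLOBAL special-casing gone, every trace_array owns its own ftrace_ops, and tracers are expected to go through the three helpers added above rather than poking global_ops directly. A hedged sketch of how a function-tracing tracer instance might use them follows; my_func_callback and the exact init/reset pairing are assumptions for illustration, while the real wiring lives in kernel/trace/trace_functions.c.

    /* Sketch only: my_func_callback is a hypothetical ftrace_func_t. */
    static int my_tracer_init(struct trace_array *tr)
    {
            /* Point tr->ops at our callback; the global instance also folds
             * in pid filtering, per ftrace_init_array_ops() above. */
            ftrace_init_array_ops(tr, my_func_callback);
            return register_ftrace_function(tr->ops);
    }

    static void my_tracer_reset(struct trace_array *tr)
    {
            unregister_ftrace_function(tr->ops);
            ftrace_reset_array_ops(tr);     /* tr->ops->func back to ftrace_stub */
    }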
@@ -4501,9 +4460,16 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, | |||
4501 | */ | 4460 | */ |
4502 | preempt_disable_notrace(); | 4461 | preempt_disable_notrace(); |
4503 | do_for_each_ftrace_op(op, ftrace_ops_list) { | 4462 | do_for_each_ftrace_op(op, ftrace_ops_list) { |
4504 | if (ftrace_ops_test(op, ip, regs)) | 4463 | if (ftrace_ops_test(op, ip, regs)) { |
4464 | if (WARN_ON(!op->func)) { | ||
4465 | function_trace_stop = 1; | ||
4466 | printk("op=%p %pS\n", op, op); | ||
4467 | goto out; | ||
4468 | } | ||
4505 | op->func(ip, parent_ip, op, regs); | 4469 | op->func(ip, parent_ip, op, regs); |
4470 | } | ||
4506 | } while_for_each_ftrace_op(op); | 4471 | } while_for_each_ftrace_op(op); |
4472 | out: | ||
4507 | preempt_enable_notrace(); | 4473 | preempt_enable_notrace(); |
4508 | trace_clear_recursion(bit); | 4474 | trace_clear_recursion(bit); |
4509 | } | 4475 | } |
@@ -4908,7 +4874,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, | |||
4908 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 4874 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
4909 | 4875 | ||
4910 | static int ftrace_graph_active; | 4876 | static int ftrace_graph_active; |
4911 | static struct notifier_block ftrace_suspend_notifier; | ||
4912 | 4877 | ||
4913 | int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) | 4878 | int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) |
4914 | { | 4879 | { |
@@ -5054,13 +5019,6 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, | |||
5054 | return NOTIFY_DONE; | 5019 | return NOTIFY_DONE; |
5055 | } | 5020 | } |
5056 | 5021 | ||
5057 | /* Just a place holder for function graph */ | ||
5058 | static struct ftrace_ops fgraph_ops __read_mostly = { | ||
5059 | .func = ftrace_stub, | ||
5060 | .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL | | ||
5061 | FTRACE_OPS_FL_RECURSION_SAFE, | ||
5062 | }; | ||
5063 | |||
5064 | static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) | 5022 | static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) |
5065 | { | 5023 | { |
5066 | if (!ftrace_ops_test(&global_ops, trace->func, NULL)) | 5024 | if (!ftrace_ops_test(&global_ops, trace->func, NULL)) |
@@ -5085,6 +5043,10 @@ static void update_function_graph_func(void) | |||
5085 | ftrace_graph_entry = ftrace_graph_entry_test; | 5043 | ftrace_graph_entry = ftrace_graph_entry_test; |
5086 | } | 5044 | } |
5087 | 5045 | ||
5046 | static struct notifier_block ftrace_suspend_notifier = { | ||
5047 | .notifier_call = ftrace_suspend_notifier_call, | ||
5048 | }; | ||
5049 | |||
5088 | int register_ftrace_graph(trace_func_graph_ret_t retfunc, | 5050 | int register_ftrace_graph(trace_func_graph_ret_t retfunc, |
5089 | trace_func_graph_ent_t entryfunc) | 5051 | trace_func_graph_ent_t entryfunc) |
5090 | { | 5052 | { |
@@ -5098,7 +5060,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, | |||
5098 | goto out; | 5060 | goto out; |
5099 | } | 5061 | } |
5100 | 5062 | ||
5101 | ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; | ||
5102 | register_pm_notifier(&ftrace_suspend_notifier); | 5063 | register_pm_notifier(&ftrace_suspend_notifier); |
5103 | 5064 | ||
5104 | ftrace_graph_active++; | 5065 | ftrace_graph_active++; |
@@ -5120,7 +5081,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, | |||
5120 | ftrace_graph_entry = ftrace_graph_entry_test; | 5081 | ftrace_graph_entry = ftrace_graph_entry_test; |
5121 | update_function_graph_func(); | 5082 | update_function_graph_func(); |
5122 | 5083 | ||
5123 | ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); | 5084 | /* Function graph doesn't use the .func field of global_ops */ |
5085 | global_ops.flags |= FTRACE_OPS_FL_STUB; | ||
5086 | |||
5087 | ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); | ||
5124 | 5088 | ||
5125 | out: | 5089 | out: |
5126 | mutex_unlock(&ftrace_lock); | 5090 | mutex_unlock(&ftrace_lock); |
@@ -5138,7 +5102,8 @@ void unregister_ftrace_graph(void) | |||
5138 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; | 5102 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; |
5139 | ftrace_graph_entry = ftrace_graph_entry_stub; | 5103 | ftrace_graph_entry = ftrace_graph_entry_stub; |
5140 | __ftrace_graph_entry = ftrace_graph_entry_stub; | 5104 | __ftrace_graph_entry = ftrace_graph_entry_stub; |
5141 | ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); | 5105 | ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); |
5106 | global_ops.flags &= ~FTRACE_OPS_FL_STUB; | ||
5142 | unregister_pm_notifier(&ftrace_suspend_notifier); | 5107 | unregister_pm_notifier(&ftrace_suspend_notifier); |
5143 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); | 5108 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); |
5144 | 5109 | ||
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c634868c2921..7c56c3d06943 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -543,7 +543,7 @@ static void rb_wake_up_waiters(struct irq_work *work) | |||
543 | * as data is added to any of the @buffer's cpu buffers. Otherwise | 543 | * as data is added to any of the @buffer's cpu buffers. Otherwise |
544 | * it will wait for data to be added to a specific cpu buffer. | 544 | * it will wait for data to be added to a specific cpu buffer. |
545 | */ | 545 | */ |
546 | void ring_buffer_wait(struct ring_buffer *buffer, int cpu) | 546 | int ring_buffer_wait(struct ring_buffer *buffer, int cpu) |
547 | { | 547 | { |
548 | struct ring_buffer_per_cpu *cpu_buffer; | 548 | struct ring_buffer_per_cpu *cpu_buffer; |
549 | DEFINE_WAIT(wait); | 549 | DEFINE_WAIT(wait); |
@@ -557,6 +557,8 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu) | |||
557 | if (cpu == RING_BUFFER_ALL_CPUS) | 557 | if (cpu == RING_BUFFER_ALL_CPUS) |
558 | work = &buffer->irq_work; | 558 | work = &buffer->irq_work; |
559 | else { | 559 | else { |
560 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | ||
561 | return -ENODEV; | ||
560 | cpu_buffer = buffer->buffers[cpu]; | 562 | cpu_buffer = buffer->buffers[cpu]; |
561 | work = &cpu_buffer->irq_work; | 563 | work = &cpu_buffer->irq_work; |
562 | } | 564 | } |
@@ -591,6 +593,7 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu) | |||
591 | schedule(); | 593 | schedule(); |
592 | 594 | ||
593 | finish_wait(&work->waiters, &wait); | 595 | finish_wait(&work->waiters, &wait); |
596 | return 0; | ||
594 | } | 597 | } |
595 | 598 | ||
596 | /** | 599 | /** |
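ring_buffer_wait() changing from void to int means callers now have to propagate a failure instead of assuming the sleep happened; -ENODEV is returned when @cpu is not in the buffer's cpumask. A minimal caller sketch, mirroring the wait_on_pipe() wrapper added to trace.c below:

    /* Sketch: bail out if the wait itself failed rather than looping forever. */
    static int wait_for_data(struct ring_buffer *buffer, int cpu)
    {
            int ret = ring_buffer_wait(buffer, cpu);

            if (ret)                /* e.g. -ENODEV for a cpu outside the mask */
                    return ret;

            return 0;               /* woken up normally; go consume events */
    }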
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 737b0efa1a62..384ede311717 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -275,7 +275,7 @@ int call_filter_check_discard(struct ftrace_event_call *call, void *rec, | |||
275 | } | 275 | } |
276 | EXPORT_SYMBOL_GPL(call_filter_check_discard); | 276 | EXPORT_SYMBOL_GPL(call_filter_check_discard); |
277 | 277 | ||
278 | cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) | 278 | static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) |
279 | { | 279 | { |
280 | u64 ts; | 280 | u64 ts; |
281 | 281 | ||
@@ -599,7 +599,7 @@ static int alloc_snapshot(struct trace_array *tr) | |||
599 | return 0; | 599 | return 0; |
600 | } | 600 | } |
601 | 601 | ||
602 | void free_snapshot(struct trace_array *tr) | 602 | static void free_snapshot(struct trace_array *tr) |
603 | { | 603 | { |
604 | /* | 604 | /* |
605 | * We don't free the ring buffer; instead, we resize it because | 605 | * We don't free the ring buffer; instead, we resize it because |
@@ -963,27 +963,9 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) | |||
963 | return cnt; | 963 | return cnt; |
964 | } | 964 | } |
965 | 965 | ||
966 | /* | ||
967 | * ftrace_max_lock is used to protect the swapping of buffers | ||
968 | * when taking a max snapshot. The buffers themselves are | ||
969 | * protected by per_cpu spinlocks. But the action of the swap | ||
970 | * needs its own lock. | ||
971 | * | ||
972 | * This is defined as a arch_spinlock_t in order to help | ||
973 | * with performance when lockdep debugging is enabled. | ||
974 | * | ||
975 | * It is also used in other places outside the update_max_tr | ||
976 | * so it needs to be defined outside of the | ||
977 | * CONFIG_TRACER_MAX_TRACE. | ||
978 | */ | ||
979 | static arch_spinlock_t ftrace_max_lock = | ||
980 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | ||
981 | |||
982 | unsigned long __read_mostly tracing_thresh; | 966 | unsigned long __read_mostly tracing_thresh; |
983 | 967 | ||
984 | #ifdef CONFIG_TRACER_MAX_TRACE | 968 | #ifdef CONFIG_TRACER_MAX_TRACE |
985 | unsigned long __read_mostly tracing_max_latency; | ||
986 | |||
987 | /* | 969 | /* |
988 | * Copy the new maximum trace into the separate maximum-trace | 970 | * Copy the new maximum trace into the separate maximum-trace |
989 | * structure. (this way the maximum trace is permanently saved, | 971 | * structure. (this way the maximum trace is permanently saved, |
@@ -1000,7 +982,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
1000 | max_buf->cpu = cpu; | 982 | max_buf->cpu = cpu; |
1001 | max_buf->time_start = data->preempt_timestamp; | 983 | max_buf->time_start = data->preempt_timestamp; |
1002 | 984 | ||
1003 | max_data->saved_latency = tracing_max_latency; | 985 | max_data->saved_latency = tr->max_latency; |
1004 | max_data->critical_start = data->critical_start; | 986 | max_data->critical_start = data->critical_start; |
1005 | max_data->critical_end = data->critical_end; | 987 | max_data->critical_end = data->critical_end; |
1006 | 988 | ||
@@ -1048,14 +1030,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
1048 | return; | 1030 | return; |
1049 | } | 1031 | } |
1050 | 1032 | ||
1051 | arch_spin_lock(&ftrace_max_lock); | 1033 | arch_spin_lock(&tr->max_lock); |
1052 | 1034 | ||
1053 | buf = tr->trace_buffer.buffer; | 1035 | buf = tr->trace_buffer.buffer; |
1054 | tr->trace_buffer.buffer = tr->max_buffer.buffer; | 1036 | tr->trace_buffer.buffer = tr->max_buffer.buffer; |
1055 | tr->max_buffer.buffer = buf; | 1037 | tr->max_buffer.buffer = buf; |
1056 | 1038 | ||
1057 | __update_max_tr(tr, tsk, cpu); | 1039 | __update_max_tr(tr, tsk, cpu); |
1058 | arch_spin_unlock(&ftrace_max_lock); | 1040 | arch_spin_unlock(&tr->max_lock); |
1059 | } | 1041 | } |
1060 | 1042 | ||
1061 | /** | 1043 | /** |
@@ -1081,7 +1063,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
1081 | return; | 1063 | return; |
1082 | } | 1064 | } |
1083 | 1065 | ||
1084 | arch_spin_lock(&ftrace_max_lock); | 1066 | arch_spin_lock(&tr->max_lock); |
1085 | 1067 | ||
1086 | ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); | 1068 | ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); |
1087 | 1069 | ||
@@ -1099,17 +1081,17 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
1099 | WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); | 1081 | WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); |
1100 | 1082 | ||
1101 | __update_max_tr(tr, tsk, cpu); | 1083 | __update_max_tr(tr, tsk, cpu); |
1102 | arch_spin_unlock(&ftrace_max_lock); | 1084 | arch_spin_unlock(&tr->max_lock); |
1103 | } | 1085 | } |
1104 | #endif /* CONFIG_TRACER_MAX_TRACE */ | 1086 | #endif /* CONFIG_TRACER_MAX_TRACE */ |
1105 | 1087 | ||
1106 | static void default_wait_pipe(struct trace_iterator *iter) | 1088 | static int wait_on_pipe(struct trace_iterator *iter) |
1107 | { | 1089 | { |
1108 | /* Iterators are static, they should be filled or empty */ | 1090 | /* Iterators are static, they should be filled or empty */ |
1109 | if (trace_buffer_iter(iter, iter->cpu_file)) | 1091 | if (trace_buffer_iter(iter, iter->cpu_file)) |
1110 | return; | 1092 | return 0; |
1111 | 1093 | ||
1112 | ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); | 1094 | return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); |
1113 | } | 1095 | } |
1114 | 1096 | ||
1115 | #ifdef CONFIG_FTRACE_STARTUP_TEST | 1097 | #ifdef CONFIG_FTRACE_STARTUP_TEST |
@@ -1220,8 +1202,6 @@ int register_tracer(struct tracer *type) | |||
1220 | else | 1202 | else |
1221 | if (!type->flags->opts) | 1203 | if (!type->flags->opts) |
1222 | type->flags->opts = dummy_tracer_opt; | 1204 | type->flags->opts = dummy_tracer_opt; |
1223 | if (!type->wait_pipe) | ||
1224 | type->wait_pipe = default_wait_pipe; | ||
1225 | 1205 | ||
1226 | ret = run_tracer_selftest(type); | 1206 | ret = run_tracer_selftest(type); |
1227 | if (ret < 0) | 1207 | if (ret < 0) |
@@ -1305,22 +1285,71 @@ void tracing_reset_all_online_cpus(void) | |||
1305 | } | 1285 | } |
1306 | } | 1286 | } |
1307 | 1287 | ||
1308 | #define SAVED_CMDLINES 128 | 1288 | #define SAVED_CMDLINES_DEFAULT 128 |
1309 | #define NO_CMDLINE_MAP UINT_MAX | 1289 | #define NO_CMDLINE_MAP UINT_MAX |
1310 | static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; | ||
1311 | static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; | ||
1312 | static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; | ||
1313 | static int cmdline_idx; | ||
1314 | static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; | 1290 | static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; |
1291 | struct saved_cmdlines_buffer { | ||
1292 | unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; | ||
1293 | unsigned *map_cmdline_to_pid; | ||
1294 | unsigned cmdline_num; | ||
1295 | int cmdline_idx; | ||
1296 | char *saved_cmdlines; | ||
1297 | }; | ||
1298 | static struct saved_cmdlines_buffer *savedcmd; | ||
1315 | 1299 | ||
1316 | /* temporarily disable recording */ | 1300 | /* temporarily disable recording */ |
1317 | static atomic_t trace_record_cmdline_disabled __read_mostly; | 1301 | static atomic_t trace_record_cmdline_disabled __read_mostly; |
1318 | 1302 | ||
1319 | static void trace_init_cmdlines(void) | 1303 | static inline char *get_saved_cmdlines(int idx) |
1304 | { | ||
1305 | return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; | ||
1306 | } | ||
1307 | |||
1308 | static inline void set_cmdline(int idx, const char *cmdline) | ||
1309 | { | ||
1310 | memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); | ||
1311 | } | ||
1312 | |||
1313 | static int allocate_cmdlines_buffer(unsigned int val, | ||
1314 | struct saved_cmdlines_buffer *s) | ||
1320 | { | 1315 | { |
1321 | memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); | 1316 | s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid), |
1322 | memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); | 1317 | GFP_KERNEL); |
1323 | cmdline_idx = 0; | 1318 | if (!s->map_cmdline_to_pid) |
1319 | return -ENOMEM; | ||
1320 | |||
1321 | s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL); | ||
1322 | if (!s->saved_cmdlines) { | ||
1323 | kfree(s->map_cmdline_to_pid); | ||
1324 | return -ENOMEM; | ||
1325 | } | ||
1326 | |||
1327 | s->cmdline_idx = 0; | ||
1328 | s->cmdline_num = val; | ||
1329 | memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, | ||
1330 | sizeof(s->map_pid_to_cmdline)); | ||
1331 | memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, | ||
1332 | val * sizeof(*s->map_cmdline_to_pid)); | ||
1333 | |||
1334 | return 0; | ||
1335 | } | ||
1336 | |||
1337 | static int trace_create_savedcmd(void) | ||
1338 | { | ||
1339 | int ret; | ||
1340 | |||
1341 | savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL); | ||
1342 | if (!savedcmd) | ||
1343 | return -ENOMEM; | ||
1344 | |||
1345 | ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd); | ||
1346 | if (ret < 0) { | ||
1347 | kfree(savedcmd); | ||
1348 | savedcmd = NULL; | ||
1349 | return -ENOMEM; | ||
1350 | } | ||
1351 | |||
1352 | return 0; | ||
1324 | } | 1353 | } |
1325 | 1354 | ||
1326 | int is_tracing_stopped(void) | 1355 | int is_tracing_stopped(void) |
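The fixed 128-entry arrays become a single heap-allocated saved_cmdlines_buffer so the comm cache can be resized at runtime. saved_cmdlines itself is a flat char array: entry idx starts at byte offset idx * TASK_COMM_LEN, which is exactly what get_saved_cmdlines() computes. A small stand-alone model of that layout (the entry count here is an arbitrary example value):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define TASK_COMM_LEN 16        /* same fixed comm length the kernel uses */

    int main(void)
    {
            unsigned int num = 4;   /* e.g. what gets written to saved_cmdlines_size */
            char *saved = calloc(num, TASK_COMM_LEN);

            if (!saved)
                    return 1;

            /* Entry i lives at offset i * TASK_COMM_LEN, like get_saved_cmdlines(i). */
            strncpy(&saved[2 * TASK_COMM_LEN], "kworker/0:1", TASK_COMM_LEN - 1);
            printf("slot 2 -> %s\n", &saved[2 * TASK_COMM_LEN]);

            free(saved);
            return 0;
    }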
@@ -1353,7 +1382,7 @@ void tracing_start(void) | |||
1353 | } | 1382 | } |
1354 | 1383 | ||
1355 | /* Prevent the buffers from switching */ | 1384 | /* Prevent the buffers from switching */ |
1356 | arch_spin_lock(&ftrace_max_lock); | 1385 | arch_spin_lock(&global_trace.max_lock); |
1357 | 1386 | ||
1358 | buffer = global_trace.trace_buffer.buffer; | 1387 | buffer = global_trace.trace_buffer.buffer; |
1359 | if (buffer) | 1388 | if (buffer) |
@@ -1365,7 +1394,7 @@ void tracing_start(void) | |||
1365 | ring_buffer_record_enable(buffer); | 1394 | ring_buffer_record_enable(buffer); |
1366 | #endif | 1395 | #endif |
1367 | 1396 | ||
1368 | arch_spin_unlock(&ftrace_max_lock); | 1397 | arch_spin_unlock(&global_trace.max_lock); |
1369 | 1398 | ||
1370 | ftrace_start(); | 1399 | ftrace_start(); |
1371 | out: | 1400 | out: |
@@ -1420,7 +1449,7 @@ void tracing_stop(void) | |||
1420 | goto out; | 1449 | goto out; |
1421 | 1450 | ||
1422 | /* Prevent the buffers from switching */ | 1451 | /* Prevent the buffers from switching */ |
1423 | arch_spin_lock(&ftrace_max_lock); | 1452 | arch_spin_lock(&global_trace.max_lock); |
1424 | 1453 | ||
1425 | buffer = global_trace.trace_buffer.buffer; | 1454 | buffer = global_trace.trace_buffer.buffer; |
1426 | if (buffer) | 1455 | if (buffer) |
@@ -1432,7 +1461,7 @@ void tracing_stop(void) | |||
1432 | ring_buffer_record_disable(buffer); | 1461 | ring_buffer_record_disable(buffer); |
1433 | #endif | 1462 | #endif |
1434 | 1463 | ||
1435 | arch_spin_unlock(&ftrace_max_lock); | 1464 | arch_spin_unlock(&global_trace.max_lock); |
1436 | 1465 | ||
1437 | out: | 1466 | out: |
1438 | raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); | 1467 | raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); |
@@ -1461,12 +1490,12 @@ static void tracing_stop_tr(struct trace_array *tr) | |||
1461 | 1490 | ||
1462 | void trace_stop_cmdline_recording(void); | 1491 | void trace_stop_cmdline_recording(void); |
1463 | 1492 | ||
1464 | static void trace_save_cmdline(struct task_struct *tsk) | 1493 | static int trace_save_cmdline(struct task_struct *tsk) |
1465 | { | 1494 | { |
1466 | unsigned pid, idx; | 1495 | unsigned pid, idx; |
1467 | 1496 | ||
1468 | if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) | 1497 | if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) |
1469 | return; | 1498 | return 0; |
1470 | 1499 | ||
1471 | /* | 1500 | /* |
1472 | * It's not the end of the world if we don't get | 1501 | * It's not the end of the world if we don't get |
@@ -1475,11 +1504,11 @@ static void trace_save_cmdline(struct task_struct *tsk) | |||
1475 | * so if we miss here, then better luck next time. | 1504 | * so if we miss here, then better luck next time. |
1476 | */ | 1505 | */ |
1477 | if (!arch_spin_trylock(&trace_cmdline_lock)) | 1506 | if (!arch_spin_trylock(&trace_cmdline_lock)) |
1478 | return; | 1507 | return 0; |
1479 | 1508 | ||
1480 | idx = map_pid_to_cmdline[tsk->pid]; | 1509 | idx = savedcmd->map_pid_to_cmdline[tsk->pid]; |
1481 | if (idx == NO_CMDLINE_MAP) { | 1510 | if (idx == NO_CMDLINE_MAP) { |
1482 | idx = (cmdline_idx + 1) % SAVED_CMDLINES; | 1511 | idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; |
1483 | 1512 | ||
1484 | /* | 1513 | /* |
1485 | * Check whether the cmdline buffer at idx has a pid | 1514 | * Check whether the cmdline buffer at idx has a pid |
@@ -1487,22 +1516,24 @@ static void trace_save_cmdline(struct task_struct *tsk) | |||
1487 | * need to clear the map_pid_to_cmdline. Otherwise we | 1516 | * need to clear the map_pid_to_cmdline. Otherwise we |
1488 | * would read the new comm for the old pid. | 1517 | * would read the new comm for the old pid. |
1489 | */ | 1518 | */ |
1490 | pid = map_cmdline_to_pid[idx]; | 1519 | pid = savedcmd->map_cmdline_to_pid[idx]; |
1491 | if (pid != NO_CMDLINE_MAP) | 1520 | if (pid != NO_CMDLINE_MAP) |
1492 | map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; | 1521 | savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; |
1493 | 1522 | ||
1494 | map_cmdline_to_pid[idx] = tsk->pid; | 1523 | savedcmd->map_cmdline_to_pid[idx] = tsk->pid; |
1495 | map_pid_to_cmdline[tsk->pid] = idx; | 1524 | savedcmd->map_pid_to_cmdline[tsk->pid] = idx; |
1496 | 1525 | ||
1497 | cmdline_idx = idx; | 1526 | savedcmd->cmdline_idx = idx; |
1498 | } | 1527 | } |
1499 | 1528 | ||
1500 | memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); | 1529 | set_cmdline(idx, tsk->comm); |
1501 | 1530 | ||
1502 | arch_spin_unlock(&trace_cmdline_lock); | 1531 | arch_spin_unlock(&trace_cmdline_lock); |
1532 | |||
1533 | return 1; | ||
1503 | } | 1534 | } |
1504 | 1535 | ||
1505 | void trace_find_cmdline(int pid, char comm[]) | 1536 | static void __trace_find_cmdline(int pid, char comm[]) |
1506 | { | 1537 | { |
1507 | unsigned map; | 1538 | unsigned map; |
1508 | 1539 | ||
@@ -1521,13 +1552,19 @@ void trace_find_cmdline(int pid, char comm[]) | |||
1521 | return; | 1552 | return; |
1522 | } | 1553 | } |
1523 | 1554 | ||
1524 | preempt_disable(); | 1555 | map = savedcmd->map_pid_to_cmdline[pid]; |
1525 | arch_spin_lock(&trace_cmdline_lock); | ||
1526 | map = map_pid_to_cmdline[pid]; | ||
1527 | if (map != NO_CMDLINE_MAP) | 1556 | if (map != NO_CMDLINE_MAP) |
1528 | strcpy(comm, saved_cmdlines[map]); | 1557 | strcpy(comm, get_saved_cmdlines(map)); |
1529 | else | 1558 | else |
1530 | strcpy(comm, "<...>"); | 1559 | strcpy(comm, "<...>"); |
1560 | } | ||
1561 | |||
1562 | void trace_find_cmdline(int pid, char comm[]) | ||
1563 | { | ||
1564 | preempt_disable(); | ||
1565 | arch_spin_lock(&trace_cmdline_lock); | ||
1566 | |||
1567 | __trace_find_cmdline(pid, comm); | ||
1531 | 1568 | ||
1532 | arch_spin_unlock(&trace_cmdline_lock); | 1569 | arch_spin_unlock(&trace_cmdline_lock); |
1533 | preempt_enable(); | 1570 | preempt_enable(); |
@@ -1541,9 +1578,8 @@ void tracing_record_cmdline(struct task_struct *tsk) | |||
1541 | if (!__this_cpu_read(trace_cmdline_save)) | 1578 | if (!__this_cpu_read(trace_cmdline_save)) |
1542 | return; | 1579 | return; |
1543 | 1580 | ||
1544 | __this_cpu_write(trace_cmdline_save, false); | 1581 | if (trace_save_cmdline(tsk)) |
1545 | 1582 | __this_cpu_write(trace_cmdline_save, false); | |
1546 | trace_save_cmdline(tsk); | ||
1547 | } | 1583 | } |
1548 | 1584 | ||
1549 | void | 1585 | void |
@@ -1746,7 +1782,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, | |||
1746 | */ | 1782 | */ |
1747 | barrier(); | 1783 | barrier(); |
1748 | if (use_stack == 1) { | 1784 | if (use_stack == 1) { |
1749 | trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; | 1785 | trace.entries = this_cpu_ptr(ftrace_stack.calls); |
1750 | trace.max_entries = FTRACE_STACK_MAX_ENTRIES; | 1786 | trace.max_entries = FTRACE_STACK_MAX_ENTRIES; |
1751 | 1787 | ||
1752 | if (regs) | 1788 | if (regs) |
@@ -1995,7 +2031,21 @@ void trace_printk_init_buffers(void) | |||
1995 | if (alloc_percpu_trace_buffer()) | 2031 | if (alloc_percpu_trace_buffer()) |
1996 | return; | 2032 | return; |
1997 | 2033 | ||
1998 | pr_info("ftrace: Allocated trace_printk buffers\n"); | 2034 | /* trace_printk() is for debug use only. Don't use it in production. */ |
2035 | |||
2036 | pr_warning("\n**********************************************************\n"); | ||
2037 | pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); | ||
2038 | pr_warning("** **\n"); | ||
2039 | pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); | ||
2040 | pr_warning("** **\n"); | ||
2041 | pr_warning("** This means that this is a DEBUG kernel and it is **\n"); | ||
2042 | pr_warning("** unsafe for production use.                           **\n"); | ||
2043 | pr_warning("** **\n"); | ||
2044 | pr_warning("** If you see this message and you are not debugging **\n"); | ||
2045 | pr_warning("** the kernel, report this immediately to your vendor! **\n"); | ||
2046 | pr_warning("** **\n"); | ||
2047 | pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); | ||
2048 | pr_warning("**********************************************************\n"); | ||
1999 | 2049 | ||
2000 | /* Expand the buffers to set size */ | 2050 | /* Expand the buffers to set size */ |
2001 | tracing_update_buffers(); | 2051 | tracing_update_buffers(); |
@@ -3333,7 +3383,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, | |||
3333 | mutex_lock(&tracing_cpumask_update_lock); | 3383 | mutex_lock(&tracing_cpumask_update_lock); |
3334 | 3384 | ||
3335 | local_irq_disable(); | 3385 | local_irq_disable(); |
3336 | arch_spin_lock(&ftrace_max_lock); | 3386 | arch_spin_lock(&tr->max_lock); |
3337 | for_each_tracing_cpu(cpu) { | 3387 | for_each_tracing_cpu(cpu) { |
3338 | /* | 3388 | /* |
3339 | * Increase/decrease the disabled counter if we are | 3389 | * Increase/decrease the disabled counter if we are |
@@ -3350,7 +3400,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, | |||
3350 | ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); | 3400 | ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); |
3351 | } | 3401 | } |
3352 | } | 3402 | } |
3353 | arch_spin_unlock(&ftrace_max_lock); | 3403 | arch_spin_unlock(&tr->max_lock); |
3354 | local_irq_enable(); | 3404 | local_irq_enable(); |
3355 | 3405 | ||
3356 | cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); | 3406 | cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); |
@@ -3592,6 +3642,7 @@ static const char readme_msg[] = | |||
3592 | " trace_options\t\t- Set format or modify how tracing happens\n" | 3642 | " trace_options\t\t- Set format or modify how tracing happens\n" |
3593 | "\t\t\t Disable an option by adding a suffix 'no' to the\n" | 3643 | "\t\t\t Disable an option by adding a suffix 'no' to the\n" |
3594 | "\t\t\t option name\n" | 3644 | "\t\t\t option name\n" |
3645 | " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" | ||
3595 | #ifdef CONFIG_DYNAMIC_FTRACE | 3646 | #ifdef CONFIG_DYNAMIC_FTRACE |
3596 | "\n available_filter_functions - list of functions that can be filtered on\n" | 3647 | "\n available_filter_functions - list of functions that can be filtered on\n" |
3597 | " set_ftrace_filter\t- echo function name in here to only trace these\n" | 3648 | " set_ftrace_filter\t- echo function name in here to only trace these\n" |
@@ -3705,55 +3756,153 @@ static const struct file_operations tracing_readme_fops = { | |||
3705 | .llseek = generic_file_llseek, | 3756 | .llseek = generic_file_llseek, |
3706 | }; | 3757 | }; |
3707 | 3758 | ||
3759 | static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) | ||
3760 | { | ||
3761 | unsigned int *ptr = v; | ||
3762 | |||
3763 | if (*pos || m->count) | ||
3764 | ptr++; | ||
3765 | |||
3766 | (*pos)++; | ||
3767 | |||
3768 | for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; | ||
3769 | ptr++) { | ||
3770 | if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) | ||
3771 | continue; | ||
3772 | |||
3773 | return ptr; | ||
3774 | } | ||
3775 | |||
3776 | return NULL; | ||
3777 | } | ||
3778 | |||
3779 | static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) | ||
3780 | { | ||
3781 | void *v; | ||
3782 | loff_t l = 0; | ||
3783 | |||
3784 | preempt_disable(); | ||
3785 | arch_spin_lock(&trace_cmdline_lock); | ||
3786 | |||
3787 | v = &savedcmd->map_cmdline_to_pid[0]; | ||
3788 | while (l <= *pos) { | ||
3789 | v = saved_cmdlines_next(m, v, &l); | ||
3790 | if (!v) | ||
3791 | return NULL; | ||
3792 | } | ||
3793 | |||
3794 | return v; | ||
3795 | } | ||
3796 | |||
3797 | static void saved_cmdlines_stop(struct seq_file *m, void *v) | ||
3798 | { | ||
3799 | arch_spin_unlock(&trace_cmdline_lock); | ||
3800 | preempt_enable(); | ||
3801 | } | ||
3802 | |||
3803 | static int saved_cmdlines_show(struct seq_file *m, void *v) | ||
3804 | { | ||
3805 | char buf[TASK_COMM_LEN]; | ||
3806 | unsigned int *pid = v; | ||
3807 | |||
3808 | __trace_find_cmdline(*pid, buf); | ||
3809 | seq_printf(m, "%d %s\n", *pid, buf); | ||
3810 | return 0; | ||
3811 | } | ||
3812 | |||
3813 | static const struct seq_operations tracing_saved_cmdlines_seq_ops = { | ||
3814 | .start = saved_cmdlines_start, | ||
3815 | .next = saved_cmdlines_next, | ||
3816 | .stop = saved_cmdlines_stop, | ||
3817 | .show = saved_cmdlines_show, | ||
3818 | }; | ||
3819 | |||
3820 | static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) | ||
3821 | { | ||
3822 | if (tracing_disabled) | ||
3823 | return -ENODEV; | ||
3824 | |||
3825 | return seq_open(filp, &tracing_saved_cmdlines_seq_ops); | ||
3826 | } | ||
3827 | |||
3828 | static const struct file_operations tracing_saved_cmdlines_fops = { | ||
3829 | .open = tracing_saved_cmdlines_open, | ||
3830 | .read = seq_read, | ||
3831 | .llseek = seq_lseek, | ||
3832 | .release = seq_release, | ||
3833 | }; | ||
3834 | |||
3708 | static ssize_t | 3835 | static ssize_t |
3709 | tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, | 3836 | tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, |
3710 | size_t cnt, loff_t *ppos) | 3837 | size_t cnt, loff_t *ppos) |
3711 | { | 3838 | { |
3712 | char *buf_comm; | 3839 | char buf[64]; |
3713 | char *file_buf; | 3840 | int r; |
3714 | char *buf; | ||
3715 | int len = 0; | ||
3716 | int pid; | ||
3717 | int i; | ||
3718 | 3841 | ||
3719 | file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); | 3842 | arch_spin_lock(&trace_cmdline_lock); |
3720 | if (!file_buf) | 3843 | r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); |
3844 | arch_spin_unlock(&trace_cmdline_lock); | ||
3845 | |||
3846 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | ||
3847 | } | ||
3848 | |||
3849 | static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) | ||
3850 | { | ||
3851 | kfree(s->saved_cmdlines); | ||
3852 | kfree(s->map_cmdline_to_pid); | ||
3853 | kfree(s); | ||
3854 | } | ||
3855 | |||
3856 | static int tracing_resize_saved_cmdlines(unsigned int val) | ||
3857 | { | ||
3858 | struct saved_cmdlines_buffer *s, *savedcmd_temp; | ||
3859 | |||
3860 | s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
3861 | if (!s) | ||
3721 | return -ENOMEM; | 3862 | return -ENOMEM; |
3722 | 3863 | ||
3723 | buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); | 3864 | if (allocate_cmdlines_buffer(val, s) < 0) { |
3724 | if (!buf_comm) { | 3865 | kfree(s); |
3725 | kfree(file_buf); | ||
3726 | return -ENOMEM; | 3866 | return -ENOMEM; |
3727 | } | 3867 | } |
3728 | 3868 | ||
3729 | buf = file_buf; | 3869 | arch_spin_lock(&trace_cmdline_lock); |
3870 | savedcmd_temp = savedcmd; | ||
3871 | savedcmd = s; | ||
3872 | arch_spin_unlock(&trace_cmdline_lock); | ||
3873 | free_saved_cmdlines_buffer(savedcmd_temp); | ||
3730 | 3874 | ||
3731 | for (i = 0; i < SAVED_CMDLINES; i++) { | 3875 | return 0; |
3732 | int r; | 3876 | } |
3733 | 3877 | ||
3734 | pid = map_cmdline_to_pid[i]; | 3878 | static ssize_t |
3735 | if (pid == -1 || pid == NO_CMDLINE_MAP) | 3879 | tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, |
3736 | continue; | 3880 | size_t cnt, loff_t *ppos) |
3881 | { | ||
3882 | unsigned long val; | ||
3883 | int ret; | ||
3737 | 3884 | ||
3738 | trace_find_cmdline(pid, buf_comm); | 3885 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
3739 | r = sprintf(buf, "%d %s\n", pid, buf_comm); | 3886 | if (ret) |
3740 | buf += r; | 3887 | return ret; |
3741 | len += r; | ||
3742 | } | ||
3743 | 3888 | ||
3744 | len = simple_read_from_buffer(ubuf, cnt, ppos, | 3889 | /* must have at least 1 entry or less than PID_MAX_DEFAULT */ |
3745 | file_buf, len); | 3890 | if (!val || val > PID_MAX_DEFAULT) |
3891 | return -EINVAL; | ||
3746 | 3892 | ||
3747 | kfree(file_buf); | 3893 | ret = tracing_resize_saved_cmdlines((unsigned int)val); |
3748 | kfree(buf_comm); | 3894 | if (ret < 0) |
3895 | return ret; | ||
3749 | 3896 | ||
3750 | return len; | 3897 | *ppos += cnt; |
3898 | |||
3899 | return cnt; | ||
3751 | } | 3900 | } |
3752 | 3901 | ||
3753 | static const struct file_operations tracing_saved_cmdlines_fops = { | 3902 | static const struct file_operations tracing_saved_cmdlines_size_fops = { |
3754 | .open = tracing_open_generic, | 3903 | .open = tracing_open_generic, |
3755 | .read = tracing_saved_cmdlines_read, | 3904 | .read = tracing_saved_cmdlines_size_read, |
3756 | .llseek = generic_file_llseek, | 3905 | .write = tracing_saved_cmdlines_size_write, |
3757 | }; | 3906 | }; |
3758 | 3907 | ||
3759 | static ssize_t | 3908 | static ssize_t |
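The resize path above builds the replacement buffer off-line, publishes it inside a short arch_spin_lock section, and frees the old copy only after dropping the lock. That ordering is safe because every reader (trace_find_cmdline(), the saved_cmdlines seq_file iterator, trace_save_cmdline()) only dereferences savedcmd while holding trace_cmdline_lock, so once the swap is visible no reader can still be using the old allocation. Condensed, with the error handling trimmed:

    /* Condensed from tracing_resize_saved_cmdlines() above. */
    s = kmalloc(sizeof(*s), GFP_KERNEL);
    if (!s)
            return -ENOMEM;
    if (allocate_cmdlines_buffer(val, s) < 0) {
            kfree(s);
            return -ENOMEM;
    }

    arch_spin_lock(&trace_cmdline_lock);
    savedcmd_temp = savedcmd;       /* old buffer: from here on only we reference it */
    savedcmd = s;                   /* readers under the lock now see the new buffer */
    arch_spin_unlock(&trace_cmdline_lock);

    free_saved_cmdlines_buffer(savedcmd_temp);      /* safe outside the lock */

From user space the knob is the new saved_cmdlines_size file in the tracing debugfs directory (commonly mounted at /sys/kernel/debug/tracing); writing e.g. 1024 grows the comm cache to 1024 entries, and reading the file reports the current size.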
@@ -4225,29 +4374,11 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) | |||
4225 | return trace_poll(iter, filp, poll_table); | 4374 | return trace_poll(iter, filp, poll_table); |
4226 | } | 4375 | } |
4227 | 4376 | ||
4228 | /* | ||
4229 | * This is a make-shift waitqueue. | ||
4230 | * A tracer might use this callback on some rare cases: | ||
4231 | * | ||
4232 | * 1) the current tracer might hold the runqueue lock when it wakes up | ||
4233 | * a reader, hence a deadlock (sched, function, and function graph tracers) | ||
4234 | * 2) the function tracers, trace all functions, we don't want | ||
4235 | * the overhead of calling wake_up and friends | ||
4236 | * (and tracing them too) | ||
4237 | * | ||
4238 | * Anyway, this is really very primitive wakeup. | ||
4239 | */ | ||
4240 | void poll_wait_pipe(struct trace_iterator *iter) | ||
4241 | { | ||
4242 | set_current_state(TASK_INTERRUPTIBLE); | ||
4243 | /* sleep for 100 msecs, and try again. */ | ||
4244 | schedule_timeout(HZ / 10); | ||
4245 | } | ||
4246 | |||
4247 | /* Must be called with trace_types_lock mutex held. */ | 4377 | /* Must be called with trace_types_lock mutex held. */ |
4248 | static int tracing_wait_pipe(struct file *filp) | 4378 | static int tracing_wait_pipe(struct file *filp) |
4249 | { | 4379 | { |
4250 | struct trace_iterator *iter = filp->private_data; | 4380 | struct trace_iterator *iter = filp->private_data; |
4381 | int ret; | ||
4251 | 4382 | ||
4252 | while (trace_empty(iter)) { | 4383 | while (trace_empty(iter)) { |
4253 | 4384 | ||
@@ -4255,15 +4386,6 @@ static int tracing_wait_pipe(struct file *filp) | |||
4255 | return -EAGAIN; | 4386 | return -EAGAIN; |
4256 | } | 4387 | } |
4257 | 4388 | ||
4258 | mutex_unlock(&iter->mutex); | ||
4259 | |||
4260 | iter->trace->wait_pipe(iter); | ||
4261 | |||
4262 | mutex_lock(&iter->mutex); | ||
4263 | |||
4264 | if (signal_pending(current)) | ||
4265 | return -EINTR; | ||
4266 | |||
4267 | /* | 4389 | /* |
4268 | * We block until we read something and tracing is disabled. | 4390 | * We block until we read something and tracing is disabled. |
4269 | * We still block if tracing is disabled, but we have never | 4391 | * We still block if tracing is disabled, but we have never |
@@ -4275,6 +4397,18 @@ static int tracing_wait_pipe(struct file *filp) | |||
4275 | */ | 4397 | */ |
4276 | if (!tracing_is_on() && iter->pos) | 4398 | if (!tracing_is_on() && iter->pos) |
4277 | break; | 4399 | break; |
4400 | |||
4401 | mutex_unlock(&iter->mutex); | ||
4402 | |||
4403 | ret = wait_on_pipe(iter); | ||
4404 | |||
4405 | mutex_lock(&iter->mutex); | ||
4406 | |||
4407 | if (ret) | ||
4408 | return ret; | ||
4409 | |||
4410 | if (signal_pending(current)) | ||
4411 | return -EINTR; | ||
4278 | } | 4412 | } |
4279 | 4413 | ||
4280 | return 1; | 4414 | return 1; |
@@ -5197,8 +5331,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
5197 | goto out_unlock; | 5331 | goto out_unlock; |
5198 | } | 5332 | } |
5199 | mutex_unlock(&trace_types_lock); | 5333 | mutex_unlock(&trace_types_lock); |
5200 | iter->trace->wait_pipe(iter); | 5334 | ret = wait_on_pipe(iter); |
5201 | mutex_lock(&trace_types_lock); | 5335 | mutex_lock(&trace_types_lock); |
5336 | if (ret) { | ||
5337 | size = ret; | ||
5338 | goto out_unlock; | ||
5339 | } | ||
5202 | if (signal_pending(current)) { | 5340 | if (signal_pending(current)) { |
5203 | size = -EINTR; | 5341 | size = -EINTR; |
5204 | goto out_unlock; | 5342 | goto out_unlock; |
@@ -5408,8 +5546,10 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
5408 | goto out; | 5546 | goto out; |
5409 | } | 5547 | } |
5410 | mutex_unlock(&trace_types_lock); | 5548 | mutex_unlock(&trace_types_lock); |
5411 | iter->trace->wait_pipe(iter); | 5549 | ret = wait_on_pipe(iter); |
5412 | mutex_lock(&trace_types_lock); | 5550 | mutex_lock(&trace_types_lock); |
5551 | if (ret) | ||
5552 | goto out; | ||
5413 | if (signal_pending(current)) { | 5553 | if (signal_pending(current)) { |
5414 | ret = -EINTR; | 5554 | ret = -EINTR; |
5415 | goto out; | 5555 | goto out; |
@@ -6102,6 +6242,28 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) | |||
6102 | return 0; | 6242 | return 0; |
6103 | } | 6243 | } |
6104 | 6244 | ||
6245 | static void free_trace_buffer(struct trace_buffer *buf) | ||
6246 | { | ||
6247 | if (buf->buffer) { | ||
6248 | ring_buffer_free(buf->buffer); | ||
6249 | buf->buffer = NULL; | ||
6250 | free_percpu(buf->data); | ||
6251 | buf->data = NULL; | ||
6252 | } | ||
6253 | } | ||
6254 | |||
6255 | static void free_trace_buffers(struct trace_array *tr) | ||
6256 | { | ||
6257 | if (!tr) | ||
6258 | return; | ||
6259 | |||
6260 | free_trace_buffer(&tr->trace_buffer); | ||
6261 | |||
6262 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
6263 | free_trace_buffer(&tr->max_buffer); | ||
6264 | #endif | ||
6265 | } | ||
6266 | |||
6105 | static int new_instance_create(const char *name) | 6267 | static int new_instance_create(const char *name) |
6106 | { | 6268 | { |
6107 | struct trace_array *tr; | 6269 | struct trace_array *tr; |
@@ -6131,6 +6293,8 @@ static int new_instance_create(const char *name) | |||
6131 | 6293 | ||
6132 | raw_spin_lock_init(&tr->start_lock); | 6294 | raw_spin_lock_init(&tr->start_lock); |
6133 | 6295 | ||
6296 | tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | ||
6297 | |||
6134 | tr->current_trace = &nop_trace; | 6298 | tr->current_trace = &nop_trace; |
6135 | 6299 | ||
6136 | INIT_LIST_HEAD(&tr->systems); | 6300 | INIT_LIST_HEAD(&tr->systems); |
@@ -6158,8 +6322,7 @@ static int new_instance_create(const char *name) | |||
6158 | return 0; | 6322 | return 0; |
6159 | 6323 | ||
6160 | out_free_tr: | 6324 | out_free_tr: |
6161 | if (tr->trace_buffer.buffer) | 6325 | free_trace_buffers(tr); |
6162 | ring_buffer_free(tr->trace_buffer.buffer); | ||
6163 | free_cpumask_var(tr->tracing_cpumask); | 6326 | free_cpumask_var(tr->tracing_cpumask); |
6164 | kfree(tr->name); | 6327 | kfree(tr->name); |
6165 | kfree(tr); | 6328 | kfree(tr); |
@@ -6199,8 +6362,7 @@ static int instance_delete(const char *name) | |||
6199 | event_trace_del_tracer(tr); | 6362 | event_trace_del_tracer(tr); |
6200 | ftrace_destroy_function_files(tr); | 6363 | ftrace_destroy_function_files(tr); |
6201 | debugfs_remove_recursive(tr->dir); | 6364 | debugfs_remove_recursive(tr->dir); |
6202 | free_percpu(tr->trace_buffer.data); | 6365 | free_trace_buffers(tr); |
6203 | ring_buffer_free(tr->trace_buffer.buffer); | ||
6204 | 6366 | ||
6205 | kfree(tr->name); | 6367 | kfree(tr->name); |
6206 | kfree(tr); | 6368 | kfree(tr); |
@@ -6328,6 +6490,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | |||
6328 | trace_create_file("tracing_on", 0644, d_tracer, | 6490 | trace_create_file("tracing_on", 0644, d_tracer, |
6329 | tr, &rb_simple_fops); | 6491 | tr, &rb_simple_fops); |
6330 | 6492 | ||
6493 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
6494 | trace_create_file("tracing_max_latency", 0644, d_tracer, | ||
6495 | &tr->max_latency, &tracing_max_lat_fops); | ||
6496 | #endif | ||
6497 | |||
6331 | if (ftrace_create_function_files(tr, d_tracer)) | 6498 | if (ftrace_create_function_files(tr, d_tracer)) |
6332 | WARN(1, "Could not allocate function filter files"); | 6499 | WARN(1, "Could not allocate function filter files"); |
6333 | 6500 | ||
@@ -6353,11 +6520,6 @@ static __init int tracer_init_debugfs(void) | |||
6353 | 6520 | ||
6354 | init_tracer_debugfs(&global_trace, d_tracer); | 6521 | init_tracer_debugfs(&global_trace, d_tracer); |
6355 | 6522 | ||
6356 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
6357 | trace_create_file("tracing_max_latency", 0644, d_tracer, | ||
6358 | &tracing_max_latency, &tracing_max_lat_fops); | ||
6359 | #endif | ||
6360 | |||
6361 | trace_create_file("tracing_thresh", 0644, d_tracer, | 6523 | trace_create_file("tracing_thresh", 0644, d_tracer, |
6362 | &tracing_thresh, &tracing_max_lat_fops); | 6524 | &tracing_thresh, &tracing_max_lat_fops); |
6363 | 6525 | ||
@@ -6367,6 +6529,9 @@ static __init int tracer_init_debugfs(void) | |||
6367 | trace_create_file("saved_cmdlines", 0444, d_tracer, | 6529 | trace_create_file("saved_cmdlines", 0444, d_tracer, |
6368 | NULL, &tracing_saved_cmdlines_fops); | 6530 | NULL, &tracing_saved_cmdlines_fops); |
6369 | 6531 | ||
6532 | trace_create_file("saved_cmdlines_size", 0644, d_tracer, | ||
6533 | NULL, &tracing_saved_cmdlines_size_fops); | ||
6534 | |||
6370 | #ifdef CONFIG_DYNAMIC_FTRACE | 6535 | #ifdef CONFIG_DYNAMIC_FTRACE |
6371 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, | 6536 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, |
6372 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); | 6537 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); |
@@ -6603,18 +6768,19 @@ __init static int tracer_alloc_buffers(void) | |||
6603 | if (!temp_buffer) | 6768 | if (!temp_buffer) |
6604 | goto out_free_cpumask; | 6769 | goto out_free_cpumask; |
6605 | 6770 | ||
6771 | if (trace_create_savedcmd() < 0) | ||
6772 | goto out_free_temp_buffer; | ||
6773 | |||
6606 | /* TODO: make the number of buffers hot pluggable with CPUS */ | 6774 | /* TODO: make the number of buffers hot pluggable with CPUS */ |
6607 | if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { | 6775 | if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { |
6608 | printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); | 6776 | printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); |
6609 | WARN_ON(1); | 6777 | WARN_ON(1); |
6610 | goto out_free_temp_buffer; | 6778 | goto out_free_savedcmd; |
6611 | } | 6779 | } |
6612 | 6780 | ||
6613 | if (global_trace.buffer_disabled) | 6781 | if (global_trace.buffer_disabled) |
6614 | tracing_off(); | 6782 | tracing_off(); |
6615 | 6783 | ||
6616 | trace_init_cmdlines(); | ||
6617 | |||
6618 | if (trace_boot_clock) { | 6784 | if (trace_boot_clock) { |
6619 | ret = tracing_set_clock(&global_trace, trace_boot_clock); | 6785 | ret = tracing_set_clock(&global_trace, trace_boot_clock); |
6620 | if (ret < 0) | 6786 | if (ret < 0) |
@@ -6629,6 +6795,10 @@ __init static int tracer_alloc_buffers(void) | |||
6629 | */ | 6795 | */ |
6630 | global_trace.current_trace = &nop_trace; | 6796 | global_trace.current_trace = &nop_trace; |
6631 | 6797 | ||
6798 | global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | ||
6799 | |||
6800 | ftrace_init_global_array_ops(&global_trace); | ||
6801 | |||
6632 | register_tracer(&nop_trace); | 6802 | register_tracer(&nop_trace); |
6633 | 6803 | ||
6634 | /* All seems OK, enable tracing */ | 6804 | /* All seems OK, enable tracing */ |
@@ -6656,13 +6826,11 @@ __init static int tracer_alloc_buffers(void) | |||
6656 | 6826 | ||
6657 | return 0; | 6827 | return 0; |
6658 | 6828 | ||
6829 | out_free_savedcmd: | ||
6830 | free_saved_cmdlines_buffer(savedcmd); | ||
6659 | out_free_temp_buffer: | 6831 | out_free_temp_buffer: |
6660 | ring_buffer_free(temp_buffer); | 6832 | ring_buffer_free(temp_buffer); |
6661 | out_free_cpumask: | 6833 | out_free_cpumask: |
6662 | free_percpu(global_trace.trace_buffer.data); | ||
6663 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
6664 | free_percpu(global_trace.max_buffer.data); | ||
6665 | #endif | ||
6666 | free_cpumask_var(global_trace.tracing_cpumask); | 6834 | free_cpumask_var(global_trace.tracing_cpumask); |
6667 | out_free_buffer_mask: | 6835 | out_free_buffer_mask: |
6668 | free_cpumask_var(tracing_buffer_mask); | 6836 | free_cpumask_var(tracing_buffer_mask); |
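The reworked failure path in tracer_alloc_buffers() keeps the usual kernel unwind convention: the goto labels run in reverse allocation order, so a failure at step N frees exactly the resources from steps 1..N-1 and nothing else, and trace_create_savedcmd() simply slots a new out_free_savedcmd label between the temp buffer and the ring buffers. A generic, self-contained illustration of that ladder (names and sizes are arbitrary):

    #include <stdlib.h>

    static int setup(void)
    {
            void *a, *b, *c;

            a = malloc(32);
            if (!a)
                    goto out;
            b = malloc(32);
            if (!b)
                    goto out_free_a;        /* only 'a' exists so far */
            c = malloc(32);
            if (!c)
                    goto out_free_b;        /* 'a' and 'b' exist */

            free(c);
            free(b);
            free(a);
            return 0;                       /* success path for this demo */

    out_free_b:
            free(b);
    out_free_a:
            free(a);
    out:
            return -1;
    }

    int main(void)
    {
            return setup() ? 1 : 0;
    }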
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2e29d7ba5a52..9258f5a815db 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -190,7 +190,22 @@ struct trace_array { | |||
190 | */ | 190 | */ |
191 | struct trace_buffer max_buffer; | 191 | struct trace_buffer max_buffer; |
192 | bool allocated_snapshot; | 192 | bool allocated_snapshot; |
193 | unsigned long max_latency; | ||
193 | #endif | 194 | #endif |
195 | /* | ||
196 | * max_lock is used to protect the swapping of buffers | ||
197 | * when taking a max snapshot. The buffers themselves are | ||
198 | * protected by per_cpu spinlocks. But the action of the swap | ||
199 | * needs its own lock. | ||
200 | * | ||
201 | * This is defined as an arch_spinlock_t in order to help | ||
202 | * with performance when lockdep debugging is enabled. | ||
203 | * | ||
204 | * It is also used in other places outside the update_max_tr | ||
205 | * so it needs to be defined outside of the | ||
206 | * CONFIG_TRACER_MAX_TRACE. | ||
207 | */ | ||
208 | arch_spinlock_t max_lock; | ||
194 | int buffer_disabled; | 209 | int buffer_disabled; |
195 | #ifdef CONFIG_FTRACE_SYSCALLS | 210 | #ifdef CONFIG_FTRACE_SYSCALLS |
196 | int sys_refcount_enter; | 211 | int sys_refcount_enter; |
@@ -237,6 +252,9 @@ static inline struct trace_array *top_trace_array(void) | |||
237 | { | 252 | { |
238 | struct trace_array *tr; | 253 | struct trace_array *tr; |
239 | 254 | ||
255 | if (list_empty(&ftrace_trace_arrays)) | ||
256 | return NULL; | ||
257 | |||
240 | tr = list_entry(ftrace_trace_arrays.prev, | 258 | tr = list_entry(ftrace_trace_arrays.prev, |
241 | typeof(*tr), list); | 259 | typeof(*tr), list); |
242 | WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); | 260 | WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); |
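Since top_trace_array() can now return NULL while ftrace_trace_arrays is still empty (callers may run before the global array is registered), every user needs a guard before touching the returned pointer. A hedged caller sketch, using only fields visible in this header:

    /* Sketch: tolerate a not-yet-initialized top-level trace array. */
    static int top_array_buffer_disabled(void)
    {
            struct trace_array *tr = top_trace_array();

            if (!tr)                /* list still empty: tracing core not up yet */
                    return -ENODEV;

            return tr->buffer_disabled;
    }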
@@ -323,7 +341,6 @@ struct tracer_flags { | |||
323 | * @stop: called when tracing is paused (echo 0 > tracing_enabled) | 341 | * @stop: called when tracing is paused (echo 0 > tracing_enabled) |
324 | * @open: called when the trace file is opened | 342 | * @open: called when the trace file is opened |
325 | * @pipe_open: called when the trace_pipe file is opened | 343 | * @pipe_open: called when the trace_pipe file is opened |
326 | * @wait_pipe: override how the user waits for traces on trace_pipe | ||
327 | * @close: called when the trace file is released | 344 | * @close: called when the trace file is released |
328 | * @pipe_close: called when the trace_pipe file is released | 345 | * @pipe_close: called when the trace_pipe file is released |
329 | * @read: override the default read callback on trace_pipe | 346 | * @read: override the default read callback on trace_pipe |
@@ -342,7 +359,6 @@ struct tracer { | |||
342 | void (*stop)(struct trace_array *tr); | 359 | void (*stop)(struct trace_array *tr); |
343 | void (*open)(struct trace_iterator *iter); | 360 | void (*open)(struct trace_iterator *iter); |
344 | void (*pipe_open)(struct trace_iterator *iter); | 361 | void (*pipe_open)(struct trace_iterator *iter); |
345 | void (*wait_pipe)(struct trace_iterator *iter); | ||
346 | void (*close)(struct trace_iterator *iter); | 362 | void (*close)(struct trace_iterator *iter); |
347 | void (*pipe_close)(struct trace_iterator *iter); | 363 | void (*pipe_close)(struct trace_iterator *iter); |
348 | ssize_t (*read)(struct trace_iterator *iter, | 364 | ssize_t (*read)(struct trace_iterator *iter, |
@@ -416,13 +432,7 @@ enum { | |||
416 | TRACE_FTRACE_IRQ_BIT, | 432 | TRACE_FTRACE_IRQ_BIT, |
417 | TRACE_FTRACE_SIRQ_BIT, | 433 | TRACE_FTRACE_SIRQ_BIT, |
418 | 434 | ||
419 | /* GLOBAL_BITs must be greater than FTRACE_BITs */ | 435 | /* INTERNAL_BITs must be greater than FTRACE_BITs */ |
420 | TRACE_GLOBAL_BIT, | ||
421 | TRACE_GLOBAL_NMI_BIT, | ||
422 | TRACE_GLOBAL_IRQ_BIT, | ||
423 | TRACE_GLOBAL_SIRQ_BIT, | ||
424 | |||
425 | /* INTERNAL_BITs must be greater than GLOBAL_BITs */ | ||
426 | TRACE_INTERNAL_BIT, | 436 | TRACE_INTERNAL_BIT, |
427 | TRACE_INTERNAL_NMI_BIT, | 437 | TRACE_INTERNAL_NMI_BIT, |
428 | TRACE_INTERNAL_IRQ_BIT, | 438 | TRACE_INTERNAL_IRQ_BIT, |
@@ -449,9 +459,6 @@ enum { | |||
449 | #define TRACE_FTRACE_START TRACE_FTRACE_BIT | 459 | #define TRACE_FTRACE_START TRACE_FTRACE_BIT |
450 | #define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) | 460 | #define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) |
451 | 461 | ||
452 | #define TRACE_GLOBAL_START TRACE_GLOBAL_BIT | ||
453 | #define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1) | ||
454 | |||
455 | #define TRACE_LIST_START TRACE_INTERNAL_BIT | 462 | #define TRACE_LIST_START TRACE_INTERNAL_BIT |
456 | #define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) | 463 | #define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) |
457 | 464 | ||
@@ -560,8 +567,6 @@ void trace_init_global_iter(struct trace_iterator *iter); | |||
560 | 567 | ||
561 | void tracing_iter_reset(struct trace_iterator *iter, int cpu); | 568 | void tracing_iter_reset(struct trace_iterator *iter, int cpu); |
562 | 569 | ||
563 | void poll_wait_pipe(struct trace_iterator *iter); | ||
564 | |||
565 | void tracing_sched_switch_trace(struct trace_array *tr, | 570 | void tracing_sched_switch_trace(struct trace_array *tr, |
566 | struct task_struct *prev, | 571 | struct task_struct *prev, |
567 | struct task_struct *next, | 572 | struct task_struct *next, |
@@ -608,8 +613,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); | |||
608 | extern unsigned long tracing_thresh; | 613 | extern unsigned long tracing_thresh; |
609 | 614 | ||
610 | #ifdef CONFIG_TRACER_MAX_TRACE | 615 | #ifdef CONFIG_TRACER_MAX_TRACE |
611 | extern unsigned long tracing_max_latency; | ||
612 | |||
613 | void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); | 616 | void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); |
614 | void update_max_tr_single(struct trace_array *tr, | 617 | void update_max_tr_single(struct trace_array *tr, |
615 | struct task_struct *tsk, int cpu); | 618 | struct task_struct *tsk, int cpu); |
@@ -724,6 +727,8 @@ extern unsigned long trace_flags; | |||
724 | #define TRACE_GRAPH_PRINT_PROC 0x8 | 727 | #define TRACE_GRAPH_PRINT_PROC 0x8 |
725 | #define TRACE_GRAPH_PRINT_DURATION 0x10 | 728 | #define TRACE_GRAPH_PRINT_DURATION 0x10 |
726 | #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 | 729 | #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 |
730 | #define TRACE_GRAPH_PRINT_IRQS 0x40 | ||
731 | #define TRACE_GRAPH_PRINT_TAIL 0x80 | ||
727 | #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 | 732 | #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 |
728 | #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) | 733 | #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) |
729 | 734 | ||
@@ -823,6 +828,10 @@ extern int ftrace_is_dead(void); | |||
823 | int ftrace_create_function_files(struct trace_array *tr, | 828 | int ftrace_create_function_files(struct trace_array *tr, |
824 | struct dentry *parent); | 829 | struct dentry *parent); |
825 | void ftrace_destroy_function_files(struct trace_array *tr); | 830 | void ftrace_destroy_function_files(struct trace_array *tr); |
831 | void ftrace_init_global_array_ops(struct trace_array *tr); | ||
832 | void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); | ||
833 | void ftrace_reset_array_ops(struct trace_array *tr); | ||
834 | int using_ftrace_ops_list_func(void); | ||
826 | #else | 835 | #else |
827 | static inline int ftrace_trace_task(struct task_struct *task) | 836 | static inline int ftrace_trace_task(struct task_struct *task) |
828 | { | 837 | { |
@@ -836,6 +845,11 @@ ftrace_create_function_files(struct trace_array *tr, | |||
836 | return 0; | 845 | return 0; |
837 | } | 846 | } |
838 | static inline void ftrace_destroy_function_files(struct trace_array *tr) { } | 847 | static inline void ftrace_destroy_function_files(struct trace_array *tr) { } |
848 | static inline __init void | ||
849 | ftrace_init_global_array_ops(struct trace_array *tr) { } | ||
850 | static inline void ftrace_reset_array_ops(struct trace_array *tr) { } | ||
851 | /* ftrace_func_t type is not defined, use macro instead of static inline */ | ||
852 | #define ftrace_init_array_ops(tr, func) do { } while (0) | ||
839 | #endif /* CONFIG_FUNCTION_TRACER */ | 853 | #endif /* CONFIG_FUNCTION_TRACER */ |
840 | 854 | ||
841 | #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) | 855 | #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) |
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c new file mode 100644 index 000000000000..40a14cbcf8e0 --- /dev/null +++ b/kernel/trace/trace_benchmark.c | |||
@@ -0,0 +1,198 @@ | |||
1 | #include <linux/delay.h> | ||
2 | #include <linux/module.h> | ||
3 | #include <linux/kthread.h> | ||
4 | #include <linux/trace_clock.h> | ||
5 | |||
6 | #define CREATE_TRACE_POINTS | ||
7 | #include "trace_benchmark.h" | ||
8 | |||
9 | static struct task_struct *bm_event_thread; | ||
10 | |||
11 | static char bm_str[BENCHMARK_EVENT_STRLEN] = "START"; | ||
12 | |||
13 | static u64 bm_total; | ||
14 | static u64 bm_totalsq; | ||
15 | static u64 bm_last; | ||
16 | static u64 bm_max; | ||
17 | static u64 bm_min; | ||
18 | static u64 bm_first; | ||
19 | static u64 bm_cnt; | ||
20 | static u64 bm_stddev; | ||
21 | static unsigned int bm_avg; | ||
22 | static unsigned int bm_std; | ||
23 | |||
24 | /* | ||
25 | * This gets called in a loop recording the time it took to write | ||
26 | * the tracepoint. What it writes is the time statistics of the last | ||
27 | * tracepoint write. As there is nothing to write the first time, | ||
28 | * it simply writes "START". As the first write is cold cache and | ||
29 | * the rest is hot, we save off that time in bm_first and it is | ||
30 | * reported as "first", which is shown in the second write to the | ||
31 | * tracepoint. The "first" field is written within the statistics from | ||
32 | * then on but never changes. | ||
33 | */ | ||
34 | static void trace_do_benchmark(void) | ||
35 | { | ||
36 | u64 start; | ||
37 | u64 stop; | ||
38 | u64 delta; | ||
39 | u64 stddev; | ||
40 | u64 seed; | ||
41 | u64 last_seed; | ||
42 | unsigned int avg; | ||
43 | unsigned int std = 0; | ||
44 | |||
45 | /* Only run if the tracepoint is actually active */ | ||
46 | if (!trace_benchmark_event_enabled()) | ||
47 | return; | ||
48 | |||
49 | local_irq_disable(); | ||
50 | start = trace_clock_local(); | ||
51 | trace_benchmark_event(bm_str); | ||
52 | stop = trace_clock_local(); | ||
53 | local_irq_enable(); | ||
54 | |||
55 | bm_cnt++; | ||
56 | |||
57 | delta = stop - start; | ||
58 | |||
59 | /* | ||
60 | * The first read is cold cached, keep it separate from the | ||
61 | * other calculations. | ||
62 | */ | ||
63 | if (bm_cnt == 1) { | ||
64 | bm_first = delta; | ||
65 | scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, | ||
66 | "first=%llu [COLD CACHED]", bm_first); | ||
67 | return; | ||
68 | } | ||
69 | |||
70 | bm_last = delta; | ||
71 | |||
72 | if (delta > bm_max) | ||
73 | bm_max = delta; | ||
74 | if (!bm_min || delta < bm_min) | ||
75 | bm_min = delta; | ||
76 | |||
77 | /* | ||
78 | * When bm_cnt is greater than UINT_MAX, it breaks the statistics | ||
79 | * accounting. Freeze the statistics when that happens. | ||
80 | * We should have enough data for the avg and stddev anyway. | ||
81 | */ | ||
82 | if (bm_cnt > UINT_MAX) { | ||
83 | scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, | ||
84 | "last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld", | ||
85 | bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev); | ||
86 | return; | ||
87 | } | ||
88 | |||
89 | bm_total += delta; | ||
90 | bm_totalsq += delta * delta; | ||
91 | |||
92 | |||
93 | if (bm_cnt > 1) { | ||
94 | /* | ||
95 | * Compute the sample variance (the squared standard deviation): | ||
96 | * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) | ||
97 | */ | ||
98 | stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total; | ||
99 | do_div(stddev, (u32)bm_cnt); | ||
100 | do_div(stddev, (u32)bm_cnt - 1); | ||
101 | } else | ||
102 | stddev = 0; | ||
103 | |||
104 | delta = bm_total; | ||
105 | do_div(delta, bm_cnt); | ||
106 | avg = delta; | ||
107 | |||
108 | if (stddev > 0) { | ||
109 | int i = 0; | ||
110 | /* | ||
111 | * stddev is the square of standard deviation but | ||
112 | * we want the actual number. Use the average | ||
113 | * as our seed to find the std. | ||
114 | * | ||
115 | * The next try is: | ||
116 | * x = (x + N/x) / 2 | ||
117 | * | ||
118 | * Where N is the squared number to find the square | ||
119 | * root of. | ||
120 | */ | ||
121 | seed = avg; | ||
122 | do { | ||
123 | last_seed = seed; | ||
124 | seed = stddev; | ||
125 | if (!last_seed) | ||
126 | break; | ||
127 | do_div(seed, last_seed); | ||
128 | seed += last_seed; | ||
129 | do_div(seed, 2); | ||
130 | } while (i++ < 10 && last_seed != seed); | ||
131 | |||
132 | std = seed; | ||
133 | } | ||
134 | |||
135 | scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, | ||
136 | "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld", | ||
137 | bm_last, bm_first, bm_max, bm_min, avg, std, stddev); | ||
138 | |||
139 | bm_std = std; | ||
140 | bm_avg = avg; | ||
141 | bm_stddev = stddev; | ||
142 | } | ||
143 | |||
144 | static int benchmark_event_kthread(void *arg) | ||
145 | { | ||
146 | /* sleep a bit to make sure the tracepoint gets activated */ | ||
147 | msleep(100); | ||
148 | |||
149 | while (!kthread_should_stop()) { | ||
150 | |||
151 | trace_do_benchmark(); | ||
152 | |||
153 | /* | ||
154 | * We don't go to sleep, but let others | ||
155 | * run as well. | ||
156 | */ | ||
157 | cond_resched(); | ||
158 | } | ||
159 | |||
160 | return 0; | ||
161 | } | ||
162 | |||
163 | /* | ||
164 | * When the benchmark tracepoint is enabled, it calls this | ||
165 | * function and the thread that calls the tracepoint is created. | ||
166 | */ | ||
167 | void trace_benchmark_reg(void) | ||
168 | { | ||
169 | bm_event_thread = kthread_run(benchmark_event_kthread, | ||
170 | NULL, "event_benchmark"); | ||
171 | WARN_ON(!bm_event_thread); | ||
172 | } | ||
173 | |||
174 | /* | ||
175 | * When the benchmark tracepoint is disabled, it calls this | ||
176 | * function and the thread that calls the tracepoint is deleted | ||
177 | * and all the numbers are reset. | ||
178 | */ | ||
179 | void trace_benchmark_unreg(void) | ||
180 | { | ||
181 | if (!bm_event_thread) | ||
182 | return; | ||
183 | |||
184 | kthread_stop(bm_event_thread); | ||
185 | |||
186 | strcpy(bm_str, "START"); | ||
187 | bm_total = 0; | ||
188 | bm_totalsq = 0; | ||
189 | bm_last = 0; | ||
190 | bm_max = 0; | ||
191 | bm_min = 0; | ||
192 | bm_cnt = 0; | ||
193 | /* These don't need to be reset but reset them anyway */ | ||
194 | bm_first = 0; | ||
195 | bm_std = 0; | ||
196 | bm_avg = 0; | ||
197 | bm_stddev = 0; | ||
198 | } | ||
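To make the arithmetic above easier to follow, here is a minimal standalone userspace sketch (illustration only, not kernel code; the sample latencies and the isqrt_seeded() name are made up). It applies the same bookkeeping as trace_do_benchmark(): a running sum and sum of squares, the sample-variance formula s^2 = (n * Sum(x_i^2) - (Sum x_i)^2) / (n * (n - 1)), and the x = (x + N/x) / 2 square-root iteration seeded with the average.

#include <stdio.h>
#include <inttypes.h>
#include <stdint.h>

/* Integer square root via the same x = (x + N/x) / 2 iteration,
 * seeded with an estimate (here: the average), capped at 10 rounds. */
static uint64_t isqrt_seeded(uint64_t n, uint64_t seed)
{
	uint64_t last;
	int i = 0;

	if (!n)
		return 0;
	if (!seed)
		seed = 1;
	do {
		last = seed;
		seed = (last + n / last) / 2;
	} while (i++ < 10 && last != seed);

	return seed;
}

int main(void)
{
	/* Made-up per-write latencies standing in for the measured deltas */
	uint64_t deltas[] = { 120, 130, 110, 150, 125, 140, 115, 135 };
	unsigned int n = sizeof(deltas) / sizeof(deltas[0]);
	uint64_t total = 0, totalsq = 0;
	uint64_t avg, var, std;
	unsigned int i;

	for (i = 0; i < n; i++) {
		total += deltas[i];
		totalsq += deltas[i] * deltas[i];
	}

	avg = total / n;
	/* s^2 = (n * sum(x_i^2) - (sum x_i)^2) / (n * (n - 1)) */
	var = (n * totalsq - total * total) / ((uint64_t)n * (n - 1));
	std = isqrt_seeded(var, avg);

	printf("avg=%" PRIu64 " std=%" PRIu64 " std^2=%" PRIu64 "\n",
	       avg, std, var);
	return 0;
}

The kernel code uses do_div() rather than plain 64-bit division because direct 64-by-32 division is not available on all 32-bit architectures; the math is otherwise the same.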
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h new file mode 100644 index 000000000000..3c1df1df4e29 --- /dev/null +++ b/kernel/trace/trace_benchmark.h | |||
@@ -0,0 +1,41 @@ | |||
1 | #undef TRACE_SYSTEM | ||
2 | #define TRACE_SYSTEM benchmark | ||
3 | |||
4 | #if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ) | ||
5 | #define _TRACE_BENCHMARK_H | ||
6 | |||
7 | #include <linux/tracepoint.h> | ||
8 | |||
9 | extern void trace_benchmark_reg(void); | ||
10 | extern void trace_benchmark_unreg(void); | ||
11 | |||
12 | #define BENCHMARK_EVENT_STRLEN 128 | ||
13 | |||
14 | TRACE_EVENT_FN(benchmark_event, | ||
15 | |||
16 | TP_PROTO(const char *str), | ||
17 | |||
18 | TP_ARGS(str), | ||
19 | |||
20 | TP_STRUCT__entry( | ||
21 | __array( char, str, BENCHMARK_EVENT_STRLEN ) | ||
22 | ), | ||
23 | |||
24 | TP_fast_assign( | ||
25 | memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN); | ||
26 | ), | ||
27 | |||
28 | TP_printk("%s", __entry->str), | ||
29 | |||
30 | trace_benchmark_reg, trace_benchmark_unreg | ||
31 | ); | ||
32 | |||
33 | #endif /* _TRACE_BENCHMARK_H */ | ||
34 | |||
35 | #undef TRACE_INCLUDE_FILE | ||
36 | #undef TRACE_INCLUDE_PATH | ||
37 | #define TRACE_INCLUDE_PATH . | ||
38 | #define TRACE_INCLUDE_FILE trace_benchmark | ||
39 | |||
40 | /* This part must be outside protection */ | ||
41 | #include <trace/define_trace.h> | ||
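The event above copies a fixed BENCHMARK_EVENT_STRLEN-byte array into the ring buffer on every hit. For contrast, a hypothetical variant (not part of this patch) could use the tracing infrastructure's dynamic-string helpers so that only the bytes actually used are recorded; the sketch below assumes the standard __string()/__assign_str()/__get_str() helpers, and the event name benchmark_event_dyn is made up.

/* Hypothetical sketch only -- not in this patch. */
TRACE_EVENT_FN(benchmark_event_dyn,

	TP_PROTO(const char *str),

	TP_ARGS(str),

	TP_STRUCT__entry(
		__string(	str,	str	)
	),

	TP_fast_assign(
		__assign_str(str, str);
	),

	TP_printk("%s", __get_str(str)),

	trace_benchmark_reg, trace_benchmark_unreg
);

The fixed array keeps the fast path to a single memcpy() of a known size, which suits a benchmark that measures the write itself; the dynamic form trades that for smaller ring-buffer entries.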
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index c894614de14d..5d12bb407b44 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
@@ -248,8 +248,8 @@ void perf_trace_del(struct perf_event *p_event, int flags) | |||
248 | tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); | 248 | tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); |
249 | } | 249 | } |
250 | 250 | ||
251 | __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, | 251 | void *perf_trace_buf_prepare(int size, unsigned short type, |
252 | struct pt_regs *regs, int *rctxp) | 252 | struct pt_regs *regs, int *rctxp) |
253 | { | 253 | { |
254 | struct trace_entry *entry; | 254 | struct trace_entry *entry; |
255 | unsigned long flags; | 255 | unsigned long flags; |
@@ -281,6 +281,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, | |||
281 | return raw_data; | 281 | return raw_data; |
282 | } | 282 | } |
283 | EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); | 283 | EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); |
284 | NOKPROBE_SYMBOL(perf_trace_buf_prepare); | ||
284 | 285 | ||
285 | #ifdef CONFIG_FUNCTION_TRACER | 286 | #ifdef CONFIG_FUNCTION_TRACER |
286 | static void | 287 | static void |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3ddfd8f62c05..f99e0b3bca8c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -574,6 +574,9 @@ int trace_set_clr_event(const char *system, const char *event, int set) | |||
574 | { | 574 | { |
575 | struct trace_array *tr = top_trace_array(); | 575 | struct trace_array *tr = top_trace_array(); |
576 | 576 | ||
577 | if (!tr) | ||
578 | return -ENODEV; | ||
579 | |||
577 | return __ftrace_set_clr_event(tr, NULL, system, event, set); | 580 | return __ftrace_set_clr_event(tr, NULL, system, event, set); |
578 | } | 581 | } |
579 | EXPORT_SYMBOL_GPL(trace_set_clr_event); | 582 | EXPORT_SYMBOL_GPL(trace_set_clr_event); |
@@ -2065,6 +2068,9 @@ event_enable_func(struct ftrace_hash *hash, | |||
2065 | bool enable; | 2068 | bool enable; |
2066 | int ret; | 2069 | int ret; |
2067 | 2070 | ||
2071 | if (!tr) | ||
2072 | return -ENODEV; | ||
2073 | |||
2068 | /* hash funcs only work with set_ftrace_filter */ | 2074 | /* hash funcs only work with set_ftrace_filter */ |
2069 | if (!enabled || !param) | 2075 | if (!enabled || !param) |
2070 | return -EINVAL; | 2076 | return -EINVAL; |
@@ -2396,6 +2402,9 @@ static __init int event_trace_enable(void) | |||
2396 | char *token; | 2402 | char *token; |
2397 | int ret; | 2403 | int ret; |
2398 | 2404 | ||
2405 | if (!tr) | ||
2406 | return -ENODEV; | ||
2407 | |||
2399 | for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { | 2408 | for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { |
2400 | 2409 | ||
2401 | call = *iter; | 2410 | call = *iter; |
@@ -2442,6 +2451,8 @@ static __init int event_trace_init(void) | |||
2442 | int ret; | 2451 | int ret; |
2443 | 2452 | ||
2444 | tr = top_trace_array(); | 2453 | tr = top_trace_array(); |
2454 | if (!tr) | ||
2455 | return -ENODEV; | ||
2445 | 2456 | ||
2446 | d_tracer = tracing_init_dentry(); | 2457 | d_tracer = tracing_init_dentry(); |
2447 | if (!d_tracer) | 2458 | if (!d_tracer) |
@@ -2535,6 +2546,8 @@ static __init void event_trace_self_tests(void) | |||
2535 | int ret; | 2546 | int ret; |
2536 | 2547 | ||
2537 | tr = top_trace_array(); | 2548 | tr = top_trace_array(); |
2549 | if (!tr) | ||
2550 | return; | ||
2538 | 2551 | ||
2539 | pr_info("Running tests on trace events:\n"); | 2552 | pr_info("Running tests on trace events:\n"); |
2540 | 2553 | ||
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index ffd56351b521..57f0ec962d2c 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
@@ -26,8 +26,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, | |||
26 | static void | 26 | static void |
27 | function_stack_trace_call(unsigned long ip, unsigned long parent_ip, | 27 | function_stack_trace_call(unsigned long ip, unsigned long parent_ip, |
28 | struct ftrace_ops *op, struct pt_regs *pt_regs); | 28 | struct ftrace_ops *op, struct pt_regs *pt_regs); |
29 | static struct ftrace_ops trace_ops; | ||
30 | static struct ftrace_ops trace_stack_ops; | ||
31 | static struct tracer_flags func_flags; | 29 | static struct tracer_flags func_flags; |
32 | 30 | ||
33 | /* Our option */ | 31 | /* Our option */ |
@@ -83,28 +81,24 @@ void ftrace_destroy_function_files(struct trace_array *tr) | |||
83 | 81 | ||
84 | static int function_trace_init(struct trace_array *tr) | 82 | static int function_trace_init(struct trace_array *tr) |
85 | { | 83 | { |
86 | struct ftrace_ops *ops; | 84 | ftrace_func_t func; |
87 | |||
88 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { | ||
89 | /* There's only one global tr */ | ||
90 | if (!trace_ops.private) { | ||
91 | trace_ops.private = tr; | ||
92 | trace_stack_ops.private = tr; | ||
93 | } | ||
94 | 85 | ||
95 | if (func_flags.val & TRACE_FUNC_OPT_STACK) | 86 | /* |
96 | ops = &trace_stack_ops; | 87 | * Instance trace_arrays get their ops allocated |
97 | else | 88 | * at instance creation. Unless it failed |
98 | ops = &trace_ops; | 89 | * the allocation. |
99 | tr->ops = ops; | 90 | */ |
100 | } else if (!tr->ops) { | 91 | if (!tr->ops) |
101 | /* | ||
102 | * Instance trace_arrays get their ops allocated | ||
103 | * at instance creation. Unless it failed | ||
104 | * the allocation. | ||
105 | */ | ||
106 | return -ENOMEM; | 92 | return -ENOMEM; |
107 | } | 93 | |
94 | /* Currently only the global instance can do stack tracing */ | ||
95 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL && | ||
96 | func_flags.val & TRACE_FUNC_OPT_STACK) | ||
97 | func = function_stack_trace_call; | ||
98 | else | ||
99 | func = function_trace_call; | ||
100 | |||
101 | ftrace_init_array_ops(tr, func); | ||
108 | 102 | ||
109 | tr->trace_buffer.cpu = get_cpu(); | 103 | tr->trace_buffer.cpu = get_cpu(); |
110 | put_cpu(); | 104 | put_cpu(); |
@@ -118,6 +112,7 @@ static void function_trace_reset(struct trace_array *tr) | |||
118 | { | 112 | { |
119 | tracing_stop_function_trace(tr); | 113 | tracing_stop_function_trace(tr); |
120 | tracing_stop_cmdline_record(); | 114 | tracing_stop_cmdline_record(); |
115 | ftrace_reset_array_ops(tr); | ||
121 | } | 116 | } |
122 | 117 | ||
123 | static void function_trace_start(struct trace_array *tr) | 118 | static void function_trace_start(struct trace_array *tr) |
@@ -199,18 +194,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, | |||
199 | local_irq_restore(flags); | 194 | local_irq_restore(flags); |
200 | } | 195 | } |
201 | 196 | ||
202 | static struct ftrace_ops trace_ops __read_mostly = | ||
203 | { | ||
204 | .func = function_trace_call, | ||
205 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, | ||
206 | }; | ||
207 | |||
208 | static struct ftrace_ops trace_stack_ops __read_mostly = | ||
209 | { | ||
210 | .func = function_stack_trace_call, | ||
211 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, | ||
212 | }; | ||
213 | |||
214 | static struct tracer_opt func_opts[] = { | 197 | static struct tracer_opt func_opts[] = { |
215 | #ifdef CONFIG_STACKTRACE | 198 | #ifdef CONFIG_STACKTRACE |
216 | { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, | 199 | { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, |
@@ -248,10 +231,10 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) | |||
248 | unregister_ftrace_function(tr->ops); | 231 | unregister_ftrace_function(tr->ops); |
249 | 232 | ||
250 | if (set) { | 233 | if (set) { |
251 | tr->ops = &trace_stack_ops; | 234 | tr->ops->func = function_stack_trace_call; |
252 | register_ftrace_function(tr->ops); | 235 | register_ftrace_function(tr->ops); |
253 | } else { | 236 | } else { |
254 | tr->ops = &trace_ops; | 237 | tr->ops->func = function_trace_call; |
255 | register_ftrace_function(tr->ops); | 238 | register_ftrace_function(tr->ops); |
256 | } | 239 | } |
257 | 240 | ||
@@ -269,7 +252,6 @@ static struct tracer function_trace __tracer_data = | |||
269 | .init = function_trace_init, | 252 | .init = function_trace_init, |
270 | .reset = function_trace_reset, | 253 | .reset = function_trace_reset, |
271 | .start = function_trace_start, | 254 | .start = function_trace_start, |
272 | .wait_pipe = poll_wait_pipe, | ||
273 | .flags = &func_flags, | 255 | .flags = &func_flags, |
274 | .set_flag = func_set_flag, | 256 | .set_flag = func_set_flag, |
275 | .allow_instances = true, | 257 | .allow_instances = true, |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index deff11200261..4de3e57f723c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -38,15 +38,6 @@ struct fgraph_data { | |||
38 | 38 | ||
39 | #define TRACE_GRAPH_INDENT 2 | 39 | #define TRACE_GRAPH_INDENT 2 |
40 | 40 | ||
41 | /* Flag options */ | ||
42 | #define TRACE_GRAPH_PRINT_OVERRUN 0x1 | ||
43 | #define TRACE_GRAPH_PRINT_CPU 0x2 | ||
44 | #define TRACE_GRAPH_PRINT_OVERHEAD 0x4 | ||
45 | #define TRACE_GRAPH_PRINT_PROC 0x8 | ||
46 | #define TRACE_GRAPH_PRINT_DURATION 0x10 | ||
47 | #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 | ||
48 | #define TRACE_GRAPH_PRINT_IRQS 0x40 | ||
49 | |||
50 | static unsigned int max_depth; | 41 | static unsigned int max_depth; |
51 | 42 | ||
52 | static struct tracer_opt trace_opts[] = { | 43 | static struct tracer_opt trace_opts[] = { |
@@ -64,11 +55,13 @@ static struct tracer_opt trace_opts[] = { | |||
64 | { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, | 55 | { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, |
65 | /* Display interrupts */ | 56 | /* Display interrupts */ |
66 | { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, | 57 | { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, |
58 | /* Display function name after trailing } */ | ||
59 | { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, | ||
67 | { } /* Empty entry */ | 60 | { } /* Empty entry */ |
68 | }; | 61 | }; |
69 | 62 | ||
70 | static struct tracer_flags tracer_flags = { | 63 | static struct tracer_flags tracer_flags = { |
71 | /* Don't display overruns and proc by default */ | 64 | /* Don't display overruns, proc, or tail by default */ |
72 | .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | | 65 | .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | |
73 | TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, | 66 | TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, |
74 | .opts = trace_opts | 67 | .opts = trace_opts |
@@ -1176,9 +1169,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
1176 | * If the return function does not have a matching entry, | 1169 | * If the return function does not have a matching entry, |
1177 | * then the entry was lost. Instead of just printing | 1170 | * then the entry was lost. Instead of just printing |
1178 | * the '}' and letting the user guess what function this | 1171 | * the '}' and letting the user guess what function this |
1179 | * belongs to, write out the function name. | 1172 | * belongs to, write out the function name. Always do |
1173 | * that if the funcgraph-tail option is enabled. | ||
1180 | */ | 1174 | */ |
1181 | if (func_match) { | 1175 | if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) { |
1182 | ret = trace_seq_puts(s, "}\n"); | 1176 | ret = trace_seq_puts(s, "}\n"); |
1183 | if (!ret) | 1177 | if (!ret) |
1184 | return TRACE_TYPE_PARTIAL_LINE; | 1178 | return TRACE_TYPE_PARTIAL_LINE; |
@@ -1505,7 +1499,6 @@ static struct tracer graph_trace __tracer_data = { | |||
1505 | .pipe_open = graph_trace_open, | 1499 | .pipe_open = graph_trace_open, |
1506 | .close = graph_trace_close, | 1500 | .close = graph_trace_close, |
1507 | .pipe_close = graph_trace_close, | 1501 | .pipe_close = graph_trace_close, |
1508 | .wait_pipe = poll_wait_pipe, | ||
1509 | .init = graph_trace_init, | 1502 | .init = graph_trace_init, |
1510 | .reset = graph_trace_reset, | 1503 | .reset = graph_trace_reset, |
1511 | .print_line = print_graph_function, | 1504 | .print_line = print_graph_function, |
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 8ff02cbb892f..9bb104f748d0 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -151,12 +151,6 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, | |||
151 | 151 | ||
152 | atomic_dec(&data->disabled); | 152 | atomic_dec(&data->disabled); |
153 | } | 153 | } |
154 | |||
155 | static struct ftrace_ops trace_ops __read_mostly = | ||
156 | { | ||
157 | .func = irqsoff_tracer_call, | ||
158 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, | ||
159 | }; | ||
160 | #endif /* CONFIG_FUNCTION_TRACER */ | 154 | #endif /* CONFIG_FUNCTION_TRACER */ |
161 | 155 | ||
162 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 156 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
@@ -176,7 +170,7 @@ irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) | |||
176 | for_each_possible_cpu(cpu) | 170 | for_each_possible_cpu(cpu) |
177 | per_cpu(tracing_cpu, cpu) = 0; | 171 | per_cpu(tracing_cpu, cpu) = 0; |
178 | 172 | ||
179 | tracing_max_latency = 0; | 173 | tr->max_latency = 0; |
180 | tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); | 174 | tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); |
181 | 175 | ||
182 | return start_irqsoff_tracer(irqsoff_trace, set); | 176 | return start_irqsoff_tracer(irqsoff_trace, set); |
@@ -303,13 +297,13 @@ static void irqsoff_print_header(struct seq_file *s) | |||
303 | /* | 297 | /* |
304 | * Should this new latency be reported/recorded? | 298 | * Should this new latency be reported/recorded? |
305 | */ | 299 | */ |
306 | static int report_latency(cycle_t delta) | 300 | static int report_latency(struct trace_array *tr, cycle_t delta) |
307 | { | 301 | { |
308 | if (tracing_thresh) { | 302 | if (tracing_thresh) { |
309 | if (delta < tracing_thresh) | 303 | if (delta < tracing_thresh) |
310 | return 0; | 304 | return 0; |
311 | } else { | 305 | } else { |
312 | if (delta <= tracing_max_latency) | 306 | if (delta <= tr->max_latency) |
313 | return 0; | 307 | return 0; |
314 | } | 308 | } |
315 | return 1; | 309 | return 1; |
@@ -333,13 +327,13 @@ check_critical_timing(struct trace_array *tr, | |||
333 | 327 | ||
334 | pc = preempt_count(); | 328 | pc = preempt_count(); |
335 | 329 | ||
336 | if (!report_latency(delta)) | 330 | if (!report_latency(tr, delta)) |
337 | goto out; | 331 | goto out; |
338 | 332 | ||
339 | raw_spin_lock_irqsave(&max_trace_lock, flags); | 333 | raw_spin_lock_irqsave(&max_trace_lock, flags); |
340 | 334 | ||
341 | /* check if we are still the max latency */ | 335 | /* check if we are still the max latency */ |
342 | if (!report_latency(delta)) | 336 | if (!report_latency(tr, delta)) |
343 | goto out_unlock; | 337 | goto out_unlock; |
344 | 338 | ||
345 | __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); | 339 | __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); |
@@ -352,7 +346,7 @@ check_critical_timing(struct trace_array *tr, | |||
352 | data->critical_end = parent_ip; | 346 | data->critical_end = parent_ip; |
353 | 347 | ||
354 | if (likely(!is_tracing_stopped())) { | 348 | if (likely(!is_tracing_stopped())) { |
355 | tracing_max_latency = delta; | 349 | tr->max_latency = delta; |
356 | update_max_tr_single(tr, current, cpu); | 350 | update_max_tr_single(tr, current, cpu); |
357 | } | 351 | } |
358 | 352 | ||
@@ -531,7 +525,7 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) | |||
531 | } | 525 | } |
532 | #endif /* CONFIG_PREEMPT_TRACER */ | 526 | #endif /* CONFIG_PREEMPT_TRACER */ |
533 | 527 | ||
534 | static int register_irqsoff_function(int graph, int set) | 528 | static int register_irqsoff_function(struct trace_array *tr, int graph, int set) |
535 | { | 529 | { |
536 | int ret; | 530 | int ret; |
537 | 531 | ||
@@ -543,7 +537,7 @@ static int register_irqsoff_function(int graph, int set) | |||
543 | ret = register_ftrace_graph(&irqsoff_graph_return, | 537 | ret = register_ftrace_graph(&irqsoff_graph_return, |
544 | &irqsoff_graph_entry); | 538 | &irqsoff_graph_entry); |
545 | else | 539 | else |
546 | ret = register_ftrace_function(&trace_ops); | 540 | ret = register_ftrace_function(tr->ops); |
547 | 541 | ||
548 | if (!ret) | 542 | if (!ret) |
549 | function_enabled = true; | 543 | function_enabled = true; |
@@ -551,7 +545,7 @@ static int register_irqsoff_function(int graph, int set) | |||
551 | return ret; | 545 | return ret; |
552 | } | 546 | } |
553 | 547 | ||
554 | static void unregister_irqsoff_function(int graph) | 548 | static void unregister_irqsoff_function(struct trace_array *tr, int graph) |
555 | { | 549 | { |
556 | if (!function_enabled) | 550 | if (!function_enabled) |
557 | return; | 551 | return; |
@@ -559,17 +553,17 @@ static void unregister_irqsoff_function(int graph) | |||
559 | if (graph) | 553 | if (graph) |
560 | unregister_ftrace_graph(); | 554 | unregister_ftrace_graph(); |
561 | else | 555 | else |
562 | unregister_ftrace_function(&trace_ops); | 556 | unregister_ftrace_function(tr->ops); |
563 | 557 | ||
564 | function_enabled = false; | 558 | function_enabled = false; |
565 | } | 559 | } |
566 | 560 | ||
567 | static void irqsoff_function_set(int set) | 561 | static void irqsoff_function_set(struct trace_array *tr, int set) |
568 | { | 562 | { |
569 | if (set) | 563 | if (set) |
570 | register_irqsoff_function(is_graph(), 1); | 564 | register_irqsoff_function(tr, is_graph(), 1); |
571 | else | 565 | else |
572 | unregister_irqsoff_function(is_graph()); | 566 | unregister_irqsoff_function(tr, is_graph()); |
573 | } | 567 | } |
574 | 568 | ||
575 | static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) | 569 | static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) |
@@ -577,7 +571,7 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) | |||
577 | struct tracer *tracer = tr->current_trace; | 571 | struct tracer *tracer = tr->current_trace; |
578 | 572 | ||
579 | if (mask & TRACE_ITER_FUNCTION) | 573 | if (mask & TRACE_ITER_FUNCTION) |
580 | irqsoff_function_set(set); | 574 | irqsoff_function_set(tr, set); |
581 | 575 | ||
582 | return trace_keep_overwrite(tracer, mask, set); | 576 | return trace_keep_overwrite(tracer, mask, set); |
583 | } | 577 | } |
@@ -586,7 +580,7 @@ static int start_irqsoff_tracer(struct trace_array *tr, int graph) | |||
586 | { | 580 | { |
587 | int ret; | 581 | int ret; |
588 | 582 | ||
589 | ret = register_irqsoff_function(graph, 0); | 583 | ret = register_irqsoff_function(tr, graph, 0); |
590 | 584 | ||
591 | if (!ret && tracing_is_enabled()) | 585 | if (!ret && tracing_is_enabled()) |
592 | tracer_enabled = 1; | 586 | tracer_enabled = 1; |
@@ -600,25 +594,37 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph) | |||
600 | { | 594 | { |
601 | tracer_enabled = 0; | 595 | tracer_enabled = 0; |
602 | 596 | ||
603 | unregister_irqsoff_function(graph); | 597 | unregister_irqsoff_function(tr, graph); |
604 | } | 598 | } |
605 | 599 | ||
606 | static void __irqsoff_tracer_init(struct trace_array *tr) | 600 | static bool irqsoff_busy; |
601 | |||
602 | static int __irqsoff_tracer_init(struct trace_array *tr) | ||
607 | { | 603 | { |
604 | if (irqsoff_busy) | ||
605 | return -EBUSY; | ||
606 | |||
608 | save_flags = trace_flags; | 607 | save_flags = trace_flags; |
609 | 608 | ||
610 | /* non overwrite screws up the latency tracers */ | 609 | /* non overwrite screws up the latency tracers */ |
611 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); | 610 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); |
612 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); | 611 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); |
613 | 612 | ||
614 | tracing_max_latency = 0; | 613 | tr->max_latency = 0; |
615 | irqsoff_trace = tr; | 614 | irqsoff_trace = tr; |
616 | /* make sure that the tracer is visible */ | 615 | /* make sure that the tracer is visible */ |
617 | smp_wmb(); | 616 | smp_wmb(); |
618 | tracing_reset_online_cpus(&tr->trace_buffer); | 617 | tracing_reset_online_cpus(&tr->trace_buffer); |
619 | 618 | ||
620 | if (start_irqsoff_tracer(tr, is_graph())) | 619 | ftrace_init_array_ops(tr, irqsoff_tracer_call); |
620 | |||
621 | /* Only toplevel instance supports graph tracing */ | ||
622 | if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL && | ||
623 | is_graph()))) | ||
621 | printk(KERN_ERR "failed to start irqsoff tracer\n"); | 624 | printk(KERN_ERR "failed to start irqsoff tracer\n"); |
625 | |||
626 | irqsoff_busy = true; | ||
627 | return 0; | ||
622 | } | 628 | } |
623 | 629 | ||
624 | static void irqsoff_tracer_reset(struct trace_array *tr) | 630 | static void irqsoff_tracer_reset(struct trace_array *tr) |
@@ -630,6 +636,9 @@ static void irqsoff_tracer_reset(struct trace_array *tr) | |||
630 | 636 | ||
631 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); | 637 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); |
632 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); | 638 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); |
639 | ftrace_reset_array_ops(tr); | ||
640 | |||
641 | irqsoff_busy = false; | ||
633 | } | 642 | } |
634 | 643 | ||
635 | static void irqsoff_tracer_start(struct trace_array *tr) | 644 | static void irqsoff_tracer_start(struct trace_array *tr) |
@@ -647,8 +656,7 @@ static int irqsoff_tracer_init(struct trace_array *tr) | |||
647 | { | 656 | { |
648 | trace_type = TRACER_IRQS_OFF; | 657 | trace_type = TRACER_IRQS_OFF; |
649 | 658 | ||
650 | __irqsoff_tracer_init(tr); | 659 | return __irqsoff_tracer_init(tr); |
651 | return 0; | ||
652 | } | 660 | } |
653 | static struct tracer irqsoff_tracer __read_mostly = | 661 | static struct tracer irqsoff_tracer __read_mostly = |
654 | { | 662 | { |
@@ -668,6 +676,7 @@ static struct tracer irqsoff_tracer __read_mostly = | |||
668 | #endif | 676 | #endif |
669 | .open = irqsoff_trace_open, | 677 | .open = irqsoff_trace_open, |
670 | .close = irqsoff_trace_close, | 678 | .close = irqsoff_trace_close, |
679 | .allow_instances = true, | ||
671 | .use_max_tr = true, | 680 | .use_max_tr = true, |
672 | }; | 681 | }; |
673 | # define register_irqsoff(trace) register_tracer(&trace) | 682 | # define register_irqsoff(trace) register_tracer(&trace) |
@@ -680,8 +689,7 @@ static int preemptoff_tracer_init(struct trace_array *tr) | |||
680 | { | 689 | { |
681 | trace_type = TRACER_PREEMPT_OFF; | 690 | trace_type = TRACER_PREEMPT_OFF; |
682 | 691 | ||
683 | __irqsoff_tracer_init(tr); | 692 | return __irqsoff_tracer_init(tr); |
684 | return 0; | ||
685 | } | 693 | } |
686 | 694 | ||
687 | static struct tracer preemptoff_tracer __read_mostly = | 695 | static struct tracer preemptoff_tracer __read_mostly = |
@@ -702,6 +710,7 @@ static struct tracer preemptoff_tracer __read_mostly = | |||
702 | #endif | 710 | #endif |
703 | .open = irqsoff_trace_open, | 711 | .open = irqsoff_trace_open, |
704 | .close = irqsoff_trace_close, | 712 | .close = irqsoff_trace_close, |
713 | .allow_instances = true, | ||
705 | .use_max_tr = true, | 714 | .use_max_tr = true, |
706 | }; | 715 | }; |
707 | # define register_preemptoff(trace) register_tracer(&trace) | 716 | # define register_preemptoff(trace) register_tracer(&trace) |
@@ -716,8 +725,7 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr) | |||
716 | { | 725 | { |
717 | trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; | 726 | trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; |
718 | 727 | ||
719 | __irqsoff_tracer_init(tr); | 728 | return __irqsoff_tracer_init(tr); |
720 | return 0; | ||
721 | } | 729 | } |
722 | 730 | ||
723 | static struct tracer preemptirqsoff_tracer __read_mostly = | 731 | static struct tracer preemptirqsoff_tracer __read_mostly = |
@@ -738,6 +746,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = | |||
738 | #endif | 746 | #endif |
739 | .open = irqsoff_trace_open, | 747 | .open = irqsoff_trace_open, |
740 | .close = irqsoff_trace_close, | 748 | .close = irqsoff_trace_close, |
749 | .allow_instances = true, | ||
741 | .use_max_tr = true, | 750 | .use_max_tr = true, |
742 | }; | 751 | }; |
743 | 752 | ||
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 903ae28962be..282f6e4e5539 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -40,27 +40,27 @@ struct trace_kprobe { | |||
40 | (sizeof(struct probe_arg) * (n))) | 40 | (sizeof(struct probe_arg) * (n))) |
41 | 41 | ||
42 | 42 | ||
43 | static __kprobes bool trace_kprobe_is_return(struct trace_kprobe *tk) | 43 | static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) |
44 | { | 44 | { |
45 | return tk->rp.handler != NULL; | 45 | return tk->rp.handler != NULL; |
46 | } | 46 | } |
47 | 47 | ||
48 | static __kprobes const char *trace_kprobe_symbol(struct trace_kprobe *tk) | 48 | static nokprobe_inline const char *trace_kprobe_symbol(struct trace_kprobe *tk) |
49 | { | 49 | { |
50 | return tk->symbol ? tk->symbol : "unknown"; | 50 | return tk->symbol ? tk->symbol : "unknown"; |
51 | } | 51 | } |
52 | 52 | ||
53 | static __kprobes unsigned long trace_kprobe_offset(struct trace_kprobe *tk) | 53 | static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk) |
54 | { | 54 | { |
55 | return tk->rp.kp.offset; | 55 | return tk->rp.kp.offset; |
56 | } | 56 | } |
57 | 57 | ||
58 | static __kprobes bool trace_kprobe_has_gone(struct trace_kprobe *tk) | 58 | static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) |
59 | { | 59 | { |
60 | return !!(kprobe_gone(&tk->rp.kp)); | 60 | return !!(kprobe_gone(&tk->rp.kp)); |
61 | } | 61 | } |
62 | 62 | ||
63 | static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk, | 63 | static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, |
64 | struct module *mod) | 64 | struct module *mod) |
65 | { | 65 | { |
66 | int len = strlen(mod->name); | 66 | int len = strlen(mod->name); |
@@ -68,7 +68,7 @@ static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk, | |||
68 | return strncmp(mod->name, name, len) == 0 && name[len] == ':'; | 68 | return strncmp(mod->name, name, len) == 0 && name[len] == ':'; |
69 | } | 69 | } |
70 | 70 | ||
71 | static __kprobes bool trace_kprobe_is_on_module(struct trace_kprobe *tk) | 71 | static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk) |
72 | { | 72 | { |
73 | return !!strchr(trace_kprobe_symbol(tk), ':'); | 73 | return !!strchr(trace_kprobe_symbol(tk), ':'); |
74 | } | 74 | } |
@@ -132,19 +132,21 @@ struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) | |||
132 | * Kprobes-specific fetch functions | 132 | * Kprobes-specific fetch functions |
133 | */ | 133 | */ |
134 | #define DEFINE_FETCH_stack(type) \ | 134 | #define DEFINE_FETCH_stack(type) \ |
135 | static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ | 135 | static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ |
136 | void *offset, void *dest) \ | 136 | void *offset, void *dest) \ |
137 | { \ | 137 | { \ |
138 | *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ | 138 | *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ |
139 | (unsigned int)((unsigned long)offset)); \ | 139 | (unsigned int)((unsigned long)offset)); \ |
140 | } | 140 | } \ |
141 | NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type)); | ||
142 | |||
141 | DEFINE_BASIC_FETCH_FUNCS(stack) | 143 | DEFINE_BASIC_FETCH_FUNCS(stack) |
142 | /* No string on the stack entry */ | 144 | /* No string on the stack entry */ |
143 | #define fetch_stack_string NULL | 145 | #define fetch_stack_string NULL |
144 | #define fetch_stack_string_size NULL | 146 | #define fetch_stack_string_size NULL |
145 | 147 | ||
146 | #define DEFINE_FETCH_memory(type) \ | 148 | #define DEFINE_FETCH_memory(type) \ |
147 | static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ | 149 | static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ |
148 | void *addr, void *dest) \ | 150 | void *addr, void *dest) \ |
149 | { \ | 151 | { \ |
150 | type retval; \ | 152 | type retval; \ |
@@ -152,14 +154,16 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ | |||
152 | *(type *)dest = 0; \ | 154 | *(type *)dest = 0; \ |
153 | else \ | 155 | else \ |
154 | *(type *)dest = retval; \ | 156 | *(type *)dest = retval; \ |
155 | } | 157 | } \ |
158 | NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type)); | ||
159 | |||
156 | DEFINE_BASIC_FETCH_FUNCS(memory) | 160 | DEFINE_BASIC_FETCH_FUNCS(memory) |
157 | /* | 161 | /* |
158 | * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max | 162 | * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max |
159 | * length and relative data location. | 163 | * length and relative data location. |
160 | */ | 164 | */ |
161 | static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, | 165 | static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, |
162 | void *addr, void *dest) | 166 | void *addr, void *dest) |
163 | { | 167 | { |
164 | long ret; | 168 | long ret; |
165 | int maxlen = get_rloc_len(*(u32 *)dest); | 169 | int maxlen = get_rloc_len(*(u32 *)dest); |
@@ -193,10 +197,11 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, | |||
193 | get_rloc_offs(*(u32 *)dest)); | 197 | get_rloc_offs(*(u32 *)dest)); |
194 | } | 198 | } |
195 | } | 199 | } |
200 | NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); | ||
196 | 201 | ||
197 | /* Return the length of string -- including null terminal byte */ | 202 | /* Return the length of string -- including null terminal byte */ |
198 | static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, | 203 | static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, |
199 | void *addr, void *dest) | 204 | void *addr, void *dest) |
200 | { | 205 | { |
201 | mm_segment_t old_fs; | 206 | mm_segment_t old_fs; |
202 | int ret, len = 0; | 207 | int ret, len = 0; |
@@ -219,17 +224,19 @@ static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, | |||
219 | else | 224 | else |
220 | *(u32 *)dest = len; | 225 | *(u32 *)dest = len; |
221 | } | 226 | } |
227 | NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size)); | ||
222 | 228 | ||
223 | #define DEFINE_FETCH_symbol(type) \ | 229 | #define DEFINE_FETCH_symbol(type) \ |
224 | __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, \ | 230 | void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\ |
225 | void *data, void *dest) \ | ||
226 | { \ | 231 | { \ |
227 | struct symbol_cache *sc = data; \ | 232 | struct symbol_cache *sc = data; \ |
228 | if (sc->addr) \ | 233 | if (sc->addr) \ |
229 | fetch_memory_##type(regs, (void *)sc->addr, dest); \ | 234 | fetch_memory_##type(regs, (void *)sc->addr, dest); \ |
230 | else \ | 235 | else \ |
231 | *(type *)dest = 0; \ | 236 | *(type *)dest = 0; \ |
232 | } | 237 | } \ |
238 | NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type)); | ||
239 | |||
233 | DEFINE_BASIC_FETCH_FUNCS(symbol) | 240 | DEFINE_BASIC_FETCH_FUNCS(symbol) |
234 | DEFINE_FETCH_symbol(string) | 241 | DEFINE_FETCH_symbol(string) |
235 | DEFINE_FETCH_symbol(string_size) | 242 | DEFINE_FETCH_symbol(string_size) |
@@ -907,7 +914,7 @@ static const struct file_operations kprobe_profile_ops = { | |||
907 | }; | 914 | }; |
908 | 915 | ||
909 | /* Kprobe handler */ | 916 | /* Kprobe handler */ |
910 | static __kprobes void | 917 | static nokprobe_inline void |
911 | __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, | 918 | __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, |
912 | struct ftrace_event_file *ftrace_file) | 919 | struct ftrace_event_file *ftrace_file) |
913 | { | 920 | { |
@@ -943,7 +950,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, | |||
943 | entry, irq_flags, pc, regs); | 950 | entry, irq_flags, pc, regs); |
944 | } | 951 | } |
945 | 952 | ||
946 | static __kprobes void | 953 | static void |
947 | kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) | 954 | kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) |
948 | { | 955 | { |
949 | struct event_file_link *link; | 956 | struct event_file_link *link; |
@@ -951,9 +958,10 @@ kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) | |||
951 | list_for_each_entry_rcu(link, &tk->tp.files, list) | 958 | list_for_each_entry_rcu(link, &tk->tp.files, list) |
952 | __kprobe_trace_func(tk, regs, link->file); | 959 | __kprobe_trace_func(tk, regs, link->file); |
953 | } | 960 | } |
961 | NOKPROBE_SYMBOL(kprobe_trace_func); | ||
954 | 962 | ||
955 | /* Kretprobe handler */ | 963 | /* Kretprobe handler */ |
956 | static __kprobes void | 964 | static nokprobe_inline void |
957 | __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, | 965 | __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, |
958 | struct pt_regs *regs, | 966 | struct pt_regs *regs, |
959 | struct ftrace_event_file *ftrace_file) | 967 | struct ftrace_event_file *ftrace_file) |
@@ -991,7 +999,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, | |||
991 | entry, irq_flags, pc, regs); | 999 | entry, irq_flags, pc, regs); |
992 | } | 1000 | } |
993 | 1001 | ||
994 | static __kprobes void | 1002 | static void |
995 | kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, | 1003 | kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, |
996 | struct pt_regs *regs) | 1004 | struct pt_regs *regs) |
997 | { | 1005 | { |
@@ -1000,6 +1008,7 @@ kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, | |||
1000 | list_for_each_entry_rcu(link, &tk->tp.files, list) | 1008 | list_for_each_entry_rcu(link, &tk->tp.files, list) |
1001 | __kretprobe_trace_func(tk, ri, regs, link->file); | 1009 | __kretprobe_trace_func(tk, ri, regs, link->file); |
1002 | } | 1010 | } |
1011 | NOKPROBE_SYMBOL(kretprobe_trace_func); | ||
1003 | 1012 | ||
1004 | /* Event entry printers */ | 1013 | /* Event entry printers */ |
1005 | static enum print_line_t | 1014 | static enum print_line_t |
@@ -1131,7 +1140,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) | |||
1131 | #ifdef CONFIG_PERF_EVENTS | 1140 | #ifdef CONFIG_PERF_EVENTS |
1132 | 1141 | ||
1133 | /* Kprobe profile handler */ | 1142 | /* Kprobe profile handler */ |
1134 | static __kprobes void | 1143 | static void |
1135 | kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) | 1144 | kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) |
1136 | { | 1145 | { |
1137 | struct ftrace_event_call *call = &tk->tp.call; | 1146 | struct ftrace_event_call *call = &tk->tp.call; |
@@ -1158,9 +1167,10 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) | |||
1158 | store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); | 1167 | store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); |
1159 | perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); | 1168 | perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); |
1160 | } | 1169 | } |
1170 | NOKPROBE_SYMBOL(kprobe_perf_func); | ||
1161 | 1171 | ||
1162 | /* Kretprobe profile handler */ | 1172 | /* Kretprobe profile handler */ |
1163 | static __kprobes void | 1173 | static void |
1164 | kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, | 1174 | kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, |
1165 | struct pt_regs *regs) | 1175 | struct pt_regs *regs) |
1166 | { | 1176 | { |
@@ -1188,6 +1198,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, | |||
1188 | store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); | 1198 | store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); |
1189 | perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); | 1199 | perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); |
1190 | } | 1200 | } |
1201 | NOKPROBE_SYMBOL(kretprobe_perf_func); | ||
1191 | #endif /* CONFIG_PERF_EVENTS */ | 1202 | #endif /* CONFIG_PERF_EVENTS */ |
1192 | 1203 | ||
1193 | /* | 1204 | /* |
@@ -1196,9 +1207,8 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, | |||
1196 | * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe | 1207 | * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe |
1197 | * lockless, but we can't race with this __init function. | 1208 | * lockless, but we can't race with this __init function. |
1198 | */ | 1209 | */ |
1199 | static __kprobes | 1210 | static int kprobe_register(struct ftrace_event_call *event, |
1200 | int kprobe_register(struct ftrace_event_call *event, | 1211 | enum trace_reg type, void *data) |
1201 | enum trace_reg type, void *data) | ||
1202 | { | 1212 | { |
1203 | struct trace_kprobe *tk = (struct trace_kprobe *)event->data; | 1213 | struct trace_kprobe *tk = (struct trace_kprobe *)event->data; |
1204 | struct ftrace_event_file *file = data; | 1214 | struct ftrace_event_file *file = data; |
@@ -1224,8 +1234,7 @@ int kprobe_register(struct ftrace_event_call *event, | |||
1224 | return 0; | 1234 | return 0; |
1225 | } | 1235 | } |
1226 | 1236 | ||
1227 | static __kprobes | 1237 | static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) |
1228 | int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) | ||
1229 | { | 1238 | { |
1230 | struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); | 1239 | struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); |
1231 | 1240 | ||
@@ -1239,9 +1248,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) | |||
1239 | #endif | 1248 | #endif |
1240 | return 0; /* We don't tweek kernel, so just return 0 */ | 1249 | return 0; /* We don't tweek kernel, so just return 0 */ |
1241 | } | 1250 | } |
1251 | NOKPROBE_SYMBOL(kprobe_dispatcher); | ||
1242 | 1252 | ||
1243 | static __kprobes | 1253 | static int |
1244 | int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) | 1254 | kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) |
1245 | { | 1255 | { |
1246 | struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); | 1256 | struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); |
1247 | 1257 | ||
@@ -1255,6 +1265,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) | |||
1255 | #endif | 1265 | #endif |
1256 | return 0; /* We don't tweek kernel, so just return 0 */ | 1266 | return 0; /* We don't tweek kernel, so just return 0 */ |
1257 | } | 1267 | } |
1268 | NOKPROBE_SYMBOL(kretprobe_dispatcher); | ||
1258 | 1269 | ||
1259 | static struct trace_event_functions kretprobe_funcs = { | 1270 | static struct trace_event_functions kretprobe_funcs = { |
1260 | .trace = print_kretprobe_event | 1271 | .trace = print_kretprobe_event |
@@ -1377,6 +1388,9 @@ static __init int kprobe_trace_self_tests_init(void) | |||
1377 | struct trace_kprobe *tk; | 1388 | struct trace_kprobe *tk; |
1378 | struct ftrace_event_file *file; | 1389 | struct ftrace_event_file *file; |
1379 | 1390 | ||
1391 | if (tracing_is_disabled()) | ||
1392 | return -ENODEV; | ||
1393 | |||
1380 | target = kprobe_trace_selftest_target; | 1394 | target = kprobe_trace_selftest_target; |
1381 | 1395 | ||
1382 | pr_info("Testing kprobe tracing: "); | 1396 | pr_info("Testing kprobe tracing: "); |
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 69a5cc94c01a..fcf0a9e48916 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c | |||
@@ -91,7 +91,6 @@ struct tracer nop_trace __read_mostly = | |||
91 | .name = "nop", | 91 | .name = "nop", |
92 | .init = nop_trace_init, | 92 | .init = nop_trace_init, |
93 | .reset = nop_trace_reset, | 93 | .reset = nop_trace_reset, |
94 | .wait_pipe = poll_wait_pipe, | ||
95 | #ifdef CONFIG_FTRACE_SELFTEST | 94 | #ifdef CONFIG_FTRACE_SELFTEST |
96 | .selftest = trace_selftest_startup_nop, | 95 | .selftest = trace_selftest_startup_nop, |
97 | #endif | 96 | #endif |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a436de18aa99..f3dad80c20b2 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -126,6 +126,34 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) | |||
126 | EXPORT_SYMBOL_GPL(trace_seq_printf); | 126 | EXPORT_SYMBOL_GPL(trace_seq_printf); |
127 | 127 | ||
128 | /** | 128 | /** |
129 | * trace_seq_bitmask - put a list of longs as a bitmask print output | ||
130 | * @s: trace sequence descriptor | ||
131 | * @maskp: points to an array of unsigned longs that represent a bitmask | ||
132 | * @nmaskbits: The number of bits that are valid in @maskp | ||
133 | * | ||
134 | * It returns 0 if the sequence buffer has no free space | ||
135 | * left, 1 otherwise. | ||
136 | * | ||
137 | * Writes an ASCII representation of the bitmask into @s. | ||
138 | */ | ||
139 | int | ||
140 | trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, | ||
141 | int nmaskbits) | ||
142 | { | ||
143 | int len = (PAGE_SIZE - 1) - s->len; | ||
144 | int ret; | ||
145 | |||
146 | if (s->full || !len) | ||
147 | return 0; | ||
148 | |||
149 | ret = bitmap_scnprintf(s->buffer + s->len, len, maskp, nmaskbits); | ||
150 | s->len += ret; | ||
151 | |||
152 | return 1; | ||
153 | } | ||
154 | EXPORT_SYMBOL_GPL(trace_seq_bitmask); | ||
155 | |||
156 | /** | ||
129 | * trace_seq_vprintf - sequence printing of trace information | 157 | * trace_seq_vprintf - sequence printing of trace information |
130 | * @s: trace sequence descriptor | 158 | * @s: trace sequence descriptor |
131 | * @fmt: printf format string | 159 | * @fmt: printf format string |
@@ -399,6 +427,19 @@ EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); | |||
399 | #endif | 427 | #endif |
400 | 428 | ||
401 | const char * | 429 | const char * |
430 | ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, | ||
431 | unsigned int bitmask_size) | ||
432 | { | ||
433 | const char *ret = p->buffer + p->len; | ||
434 | |||
435 | trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); | ||
436 | trace_seq_putc(p, 0); | ||
437 | |||
438 | return ret; | ||
439 | } | ||
440 | EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq); | ||
441 | |||
442 | const char * | ||
402 | ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) | 443 | ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) |
403 | { | 444 | { |
404 | int i; | 445 | int i; |
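As a rough illustration of how the new trace_seq_bitmask() helper might be called, the fragment below appends a cpumask to a trace_seq as a hex bitmask string. It is a sketch only, not part of this patch: print_cpus_seq() is a made-up name, while trace_seq_puts(), cpumask_bits() and nr_cpu_ids are existing kernel symbols, and the fragment assumes normal kernel tracing context.

/* Sketch: append "cpus=<hex mask>" to a trace_seq using the helper
 * added above.  Returns 0 if the sequence buffer ran out of room. */
static int print_cpus_seq(struct trace_seq *s, const struct cpumask *mask)
{
	if (!trace_seq_puts(s, "cpus="))
		return 0;

	return trace_seq_bitmask(s, cpumask_bits(mask), nr_cpu_ids);
}

ftrace_print_bitmask_seq() wraps the same helper for an event's print path: it records the current end of the sequence, emits the bitmask, NUL-terminates it, and returns a pointer to the start of what was written.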
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8364a421b4df..d4b9fc22cd27 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
@@ -37,13 +37,13 @@ const char *reserved_field_names[] = { | |||
37 | 37 | ||
38 | /* Printing in basic type function template */ | 38 | /* Printing in basic type function template */ |
39 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ | 39 | #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ |
40 | __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ | 40 | int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ |
41 | const char *name, \ | 41 | void *data, void *ent) \ |
42 | void *data, void *ent) \ | ||
43 | { \ | 42 | { \ |
44 | return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ | 43 | return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ |
45 | } \ | 44 | } \ |
46 | const char PRINT_TYPE_FMT_NAME(type)[] = fmt; | 45 | const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ |
46 | NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); | ||
47 | 47 | ||
48 | DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") | 48 | DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") |
49 | DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") | 49 | DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") |
@@ -55,9 +55,8 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") | |||
55 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") | 55 | DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") |
56 | 56 | ||
57 | /* Print type function for string type */ | 57 | /* Print type function for string type */ |
58 | __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, | 58 | int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, |
59 | const char *name, | 59 | void *data, void *ent) |
60 | void *data, void *ent) | ||
61 | { | 60 | { |
62 | int len = *(u32 *)data >> 16; | 61 | int len = *(u32 *)data >> 16; |
63 | 62 | ||
@@ -67,6 +66,7 @@ __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, | |||
67 | return trace_seq_printf(s, " %s=\"%s\"", name, | 66 | return trace_seq_printf(s, " %s=\"%s\"", name, |
68 | (const char *)get_loc_data(data, ent)); | 67 | (const char *)get_loc_data(data, ent)); |
69 | } | 68 | } |
69 | NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); | ||
70 | 70 | ||
71 | const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; | 71 | const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; |
72 | 72 | ||
@@ -81,23 +81,24 @@ const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; | |||
81 | 81 | ||
82 | /* Data fetch function templates */ | 82 | /* Data fetch function templates */ |
83 | #define DEFINE_FETCH_reg(type) \ | 83 | #define DEFINE_FETCH_reg(type) \ |
84 | __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ | 84 | void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \ |
85 | void *offset, void *dest) \ | ||
86 | { \ | 85 | { \ |
87 | *(type *)dest = (type)regs_get_register(regs, \ | 86 | *(type *)dest = (type)regs_get_register(regs, \ |
88 | (unsigned int)((unsigned long)offset)); \ | 87 | (unsigned int)((unsigned long)offset)); \ |
89 | } | 88 | } \ |
89 | NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type)); | ||
90 | DEFINE_BASIC_FETCH_FUNCS(reg) | 90 | DEFINE_BASIC_FETCH_FUNCS(reg) |
91 | /* No string on the register */ | 91 | /* No string on the register */ |
92 | #define fetch_reg_string NULL | 92 | #define fetch_reg_string NULL |
93 | #define fetch_reg_string_size NULL | 93 | #define fetch_reg_string_size NULL |
94 | 94 | ||
95 | #define DEFINE_FETCH_retval(type) \ | 95 | #define DEFINE_FETCH_retval(type) \ |
96 | __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ | 96 | void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ |
97 | void *dummy, void *dest) \ | 97 | void *dummy, void *dest) \ |
98 | { \ | 98 | { \ |
99 | *(type *)dest = (type)regs_return_value(regs); \ | 99 | *(type *)dest = (type)regs_return_value(regs); \ |
100 | } | 100 | } \ |
101 | NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type)); | ||
101 | DEFINE_BASIC_FETCH_FUNCS(retval) | 102 | DEFINE_BASIC_FETCH_FUNCS(retval) |
102 | /* No string on the retval */ | 103 | /* No string on the retval */ |
103 | #define fetch_retval_string NULL | 104 | #define fetch_retval_string NULL |
@@ -112,8 +113,8 @@ struct deref_fetch_param { | |||
112 | }; | 113 | }; |
113 | 114 | ||
114 | #define DEFINE_FETCH_deref(type) \ | 115 | #define DEFINE_FETCH_deref(type) \ |
115 | __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ | 116 | void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ |
116 | void *data, void *dest) \ | 117 | void *data, void *dest) \ |
117 | { \ | 118 | { \ |
118 | struct deref_fetch_param *dprm = data; \ | 119 | struct deref_fetch_param *dprm = data; \ |
119 | unsigned long addr; \ | 120 | unsigned long addr; \ |
@@ -123,12 +124,13 @@ __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ | |||
123 | dprm->fetch(regs, (void *)addr, dest); \ | 124 | dprm->fetch(regs, (void *)addr, dest); \ |
124 | } else \ | 125 | } else \ |
125 | *(type *)dest = 0; \ | 126 | *(type *)dest = 0; \ |
126 | } | 127 | } \ |
128 | NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type)); | ||
127 | DEFINE_BASIC_FETCH_FUNCS(deref) | 129 | DEFINE_BASIC_FETCH_FUNCS(deref) |
128 | DEFINE_FETCH_deref(string) | 130 | DEFINE_FETCH_deref(string) |
129 | 131 | ||
130 | __kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, | 132 | void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, |
131 | void *data, void *dest) | 133 | void *data, void *dest) |
132 | { | 134 | { |
133 | struct deref_fetch_param *dprm = data; | 135 | struct deref_fetch_param *dprm = data; |
134 | unsigned long addr; | 136 | unsigned long addr; |
@@ -140,16 +142,18 @@ __kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, | |||
140 | } else | 142 | } else |
141 | *(string_size *)dest = 0; | 143 | *(string_size *)dest = 0; |
142 | } | 144 | } |
145 | NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size)); | ||
143 | 146 | ||
144 | static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) | 147 | static void update_deref_fetch_param(struct deref_fetch_param *data) |
145 | { | 148 | { |
146 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | 149 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) |
147 | update_deref_fetch_param(data->orig.data); | 150 | update_deref_fetch_param(data->orig.data); |
148 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | 151 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) |
149 | update_symbol_cache(data->orig.data); | 152 | update_symbol_cache(data->orig.data); |
150 | } | 153 | } |
154 | NOKPROBE_SYMBOL(update_deref_fetch_param); | ||
151 | 155 | ||
152 | static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | 156 | static void free_deref_fetch_param(struct deref_fetch_param *data) |
153 | { | 157 | { |
154 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | 158 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) |
155 | free_deref_fetch_param(data->orig.data); | 159 | free_deref_fetch_param(data->orig.data); |
@@ -157,6 +161,7 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | |||
157 | free_symbol_cache(data->orig.data); | 161 | free_symbol_cache(data->orig.data); |
158 | kfree(data); | 162 | kfree(data); |
159 | } | 163 | } |
164 | NOKPROBE_SYMBOL(free_deref_fetch_param); | ||
160 | 165 | ||
161 | /* Bitfield fetch function */ | 166 | /* Bitfield fetch function */ |
162 | struct bitfield_fetch_param { | 167 | struct bitfield_fetch_param { |
@@ -166,8 +171,8 @@ struct bitfield_fetch_param { | |||
166 | }; | 171 | }; |
167 | 172 | ||
168 | #define DEFINE_FETCH_bitfield(type) \ | 173 | #define DEFINE_FETCH_bitfield(type) \ |
169 | __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ | 174 | void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ |
170 | void *data, void *dest) \ | 175 | void *data, void *dest) \ |
171 | { \ | 176 | { \ |
172 | struct bitfield_fetch_param *bprm = data; \ | 177 | struct bitfield_fetch_param *bprm = data; \ |
173 | type buf = 0; \ | 178 | type buf = 0; \ |
@@ -177,13 +182,13 @@ __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ | |||
177 | buf >>= bprm->low_shift; \ | 182 | buf >>= bprm->low_shift; \ |
178 | } \ | 183 | } \ |
179 | *(type *)dest = buf; \ | 184 | *(type *)dest = buf; \ |
180 | } | 185 | } \ |
181 | 186 | NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type)); | |
182 | DEFINE_BASIC_FETCH_FUNCS(bitfield) | 187 | DEFINE_BASIC_FETCH_FUNCS(bitfield) |
183 | #define fetch_bitfield_string NULL | 188 | #define fetch_bitfield_string NULL |
184 | #define fetch_bitfield_string_size NULL | 189 | #define fetch_bitfield_string_size NULL |
185 | 190 | ||
186 | static __kprobes void | 191 | static void |
187 | update_bitfield_fetch_param(struct bitfield_fetch_param *data) | 192 | update_bitfield_fetch_param(struct bitfield_fetch_param *data) |
188 | { | 193 | { |
189 | /* | 194 | /* |
@@ -196,7 +201,7 @@ update_bitfield_fetch_param(struct bitfield_fetch_param *data) | |||
196 | update_symbol_cache(data->orig.data); | 201 | update_symbol_cache(data->orig.data); |
197 | } | 202 | } |
198 | 203 | ||
199 | static __kprobes void | 204 | static void |
200 | free_bitfield_fetch_param(struct bitfield_fetch_param *data) | 205 | free_bitfield_fetch_param(struct bitfield_fetch_param *data) |
201 | { | 206 | { |
202 | /* | 207 | /* |
@@ -255,17 +260,17 @@ fail: | |||
255 | } | 260 | } |
256 | 261 | ||
257 | /* Special function : only accept unsigned long */ | 262 | /* Special function : only accept unsigned long */ |
258 | static __kprobes void fetch_kernel_stack_address(struct pt_regs *regs, | 263 | static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest) |
259 | void *dummy, void *dest) | ||
260 | { | 264 | { |
261 | *(unsigned long *)dest = kernel_stack_pointer(regs); | 265 | *(unsigned long *)dest = kernel_stack_pointer(regs); |
262 | } | 266 | } |
267 | NOKPROBE_SYMBOL(fetch_kernel_stack_address); | ||
263 | 268 | ||
264 | static __kprobes void fetch_user_stack_address(struct pt_regs *regs, | 269 | static void fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest) |
265 | void *dummy, void *dest) | ||
266 | { | 270 | { |
267 | *(unsigned long *)dest = user_stack_pointer(regs); | 271 | *(unsigned long *)dest = user_stack_pointer(regs); |
268 | } | 272 | } |
273 | NOKPROBE_SYMBOL(fetch_user_stack_address); | ||
269 | 274 | ||
270 | static fetch_func_t get_fetch_size_function(const struct fetch_type *type, | 275 | static fetch_func_t get_fetch_size_function(const struct fetch_type *type, |
271 | fetch_func_t orig_fn, | 276 | fetch_func_t orig_fn, |
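The trace_probe.c hunks above all apply the same conversion: the __kprobes attribute (which moves a function into the special .kprobes.text section) is dropped, and the symbol is instead recorded in the kprobe blacklist with NOKPROBE_SYMBOL(). A minimal sketch of the pattern, using a hypothetical helper rather than code from this file:

	#include <linux/kprobes.h>

	/*
	 * Hypothetical helper, shown only to illustrate the conversion.
	 * Previously it would have carried the __kprobes attribute, i.e.
	 * "static int __kprobes my_fetch_helper(...)", which places the
	 * function in .kprobes.text.
	 */
	static int my_fetch_helper(int x)
	{
		return x + 1;
	}
	/*
	 * The replacement: the function stays in normal .text and its
	 * address is added to the kprobe blacklist instead, which also
	 * works for functions generated by macros such as the fetch
	 * templates above.
	 */
	NOKPROBE_SYMBOL(my_fetch_helper);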
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index fb1ab5dfbd42..4f815fbce16d 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h | |||
@@ -81,13 +81,13 @@ | |||
81 | */ | 81 | */ |
82 | #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) | 82 | #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) |
83 | 83 | ||
84 | static inline void *get_rloc_data(u32 *dl) | 84 | static nokprobe_inline void *get_rloc_data(u32 *dl) |
85 | { | 85 | { |
86 | return (u8 *)dl + get_rloc_offs(*dl); | 86 | return (u8 *)dl + get_rloc_offs(*dl); |
87 | } | 87 | } |
88 | 88 | ||
89 | /* For data_loc conversion */ | 89 | /* For data_loc conversion */ |
90 | static inline void *get_loc_data(u32 *dl, void *ent) | 90 | static nokprobe_inline void *get_loc_data(u32 *dl, void *ent) |
91 | { | 91 | { |
92 | return (u8 *)ent + get_rloc_offs(*dl); | 92 | return (u8 *)ent + get_rloc_offs(*dl); |
93 | } | 93 | } |
@@ -136,9 +136,8 @@ typedef u32 string_size; | |||
136 | 136 | ||
137 | /* Printing in basic type function template */ | 137 | /* Printing in basic type function template */ |
138 | #define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ | 138 | #define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ |
139 | __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ | 139 | int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ |
140 | const char *name, \ | 140 | void *data, void *ent); \ |
141 | void *data, void *ent); \ | ||
142 | extern const char PRINT_TYPE_FMT_NAME(type)[] | 141 | extern const char PRINT_TYPE_FMT_NAME(type)[] |
143 | 142 | ||
144 | DECLARE_BASIC_PRINT_TYPE_FUNC(u8); | 143 | DECLARE_BASIC_PRINT_TYPE_FUNC(u8); |
@@ -303,7 +302,7 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp) | |||
303 | return !!(tp->flags & TP_FLAG_REGISTERED); | 302 | return !!(tp->flags & TP_FLAG_REGISTERED); |
304 | } | 303 | } |
305 | 304 | ||
306 | static inline __kprobes void call_fetch(struct fetch_param *fprm, | 305 | static nokprobe_inline void call_fetch(struct fetch_param *fprm, |
307 | struct pt_regs *regs, void *dest) | 306 | struct pt_regs *regs, void *dest) |
308 | { | 307 | { |
309 | return fprm->fn(regs, fprm->data, dest); | 308 | return fprm->fn(regs, fprm->data, dest); |
@@ -351,7 +350,7 @@ extern ssize_t traceprobe_probes_write(struct file *file, | |||
351 | extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); | 350 | extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); |
352 | 351 | ||
353 | /* Sum up total data length for dynamic arrays (strings) */ | 352 | /* Sum up total data length for dynamic arrays (strings) */ |
354 | static inline __kprobes int | 353 | static nokprobe_inline int |
355 | __get_data_size(struct trace_probe *tp, struct pt_regs *regs) | 354 | __get_data_size(struct trace_probe *tp, struct pt_regs *regs) |
356 | { | 355 | { |
357 | int i, ret = 0; | 356 | int i, ret = 0; |
@@ -367,7 +366,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs) | |||
367 | } | 366 | } |
368 | 367 | ||
369 | /* Store the value of each argument */ | 368 | /* Store the value of each argument */ |
370 | static inline __kprobes void | 369 | static nokprobe_inline void |
371 | store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, | 370 | store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, |
372 | u8 *data, int maxlen) | 371 | u8 *data, int maxlen) |
373 | { | 372 | { |
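The header-side counterpart: the small helpers here are inline, so they have no symbol of their own that NOKPROBE_SYMBOL() could blacklist. Switching them to nokprobe_inline forces them to be folded into their already-blacklisted callers whenever kprobes is enabled. Roughly how that macro is defined in <linux/kprobes.h> (paraphrased; the exact definition may differ between kernel versions):

	#ifdef CONFIG_KPROBES
	/* Guarantee the helper is inlined into its (blacklisted) caller. */
	# define nokprobe_inline	__always_inline
	#else
	/* Without kprobes there is nothing to protect against. */
	# define nokprobe_inline	inline
	#endif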
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e14da5e97a69..19bd8928ce94 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -130,15 +130,9 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, | |||
130 | atomic_dec(&data->disabled); | 130 | atomic_dec(&data->disabled); |
131 | preempt_enable_notrace(); | 131 | preempt_enable_notrace(); |
132 | } | 132 | } |
133 | |||
134 | static struct ftrace_ops trace_ops __read_mostly = | ||
135 | { | ||
136 | .func = wakeup_tracer_call, | ||
137 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, | ||
138 | }; | ||
139 | #endif /* CONFIG_FUNCTION_TRACER */ | 133 | #endif /* CONFIG_FUNCTION_TRACER */ |
140 | 134 | ||
141 | static int register_wakeup_function(int graph, int set) | 135 | static int register_wakeup_function(struct trace_array *tr, int graph, int set) |
142 | { | 136 | { |
143 | int ret; | 137 | int ret; |
144 | 138 | ||
@@ -150,7 +144,7 @@ static int register_wakeup_function(int graph, int set) | |||
150 | ret = register_ftrace_graph(&wakeup_graph_return, | 144 | ret = register_ftrace_graph(&wakeup_graph_return, |
151 | &wakeup_graph_entry); | 145 | &wakeup_graph_entry); |
152 | else | 146 | else |
153 | ret = register_ftrace_function(&trace_ops); | 147 | ret = register_ftrace_function(tr->ops); |
154 | 148 | ||
155 | if (!ret) | 149 | if (!ret) |
156 | function_enabled = true; | 150 | function_enabled = true; |
@@ -158,7 +152,7 @@ static int register_wakeup_function(int graph, int set) | |||
158 | return ret; | 152 | return ret; |
159 | } | 153 | } |
160 | 154 | ||
161 | static void unregister_wakeup_function(int graph) | 155 | static void unregister_wakeup_function(struct trace_array *tr, int graph) |
162 | { | 156 | { |
163 | if (!function_enabled) | 157 | if (!function_enabled) |
164 | return; | 158 | return; |
@@ -166,17 +160,17 @@ static void unregister_wakeup_function(int graph) | |||
166 | if (graph) | 160 | if (graph) |
167 | unregister_ftrace_graph(); | 161 | unregister_ftrace_graph(); |
168 | else | 162 | else |
169 | unregister_ftrace_function(&trace_ops); | 163 | unregister_ftrace_function(tr->ops); |
170 | 164 | ||
171 | function_enabled = false; | 165 | function_enabled = false; |
172 | } | 166 | } |
173 | 167 | ||
174 | static void wakeup_function_set(int set) | 168 | static void wakeup_function_set(struct trace_array *tr, int set) |
175 | { | 169 | { |
176 | if (set) | 170 | if (set) |
177 | register_wakeup_function(is_graph(), 1); | 171 | register_wakeup_function(tr, is_graph(), 1); |
178 | else | 172 | else |
179 | unregister_wakeup_function(is_graph()); | 173 | unregister_wakeup_function(tr, is_graph()); |
180 | } | 174 | } |
181 | 175 | ||
182 | static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) | 176 | static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) |
@@ -184,16 +178,16 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) | |||
184 | struct tracer *tracer = tr->current_trace; | 178 | struct tracer *tracer = tr->current_trace; |
185 | 179 | ||
186 | if (mask & TRACE_ITER_FUNCTION) | 180 | if (mask & TRACE_ITER_FUNCTION) |
187 | wakeup_function_set(set); | 181 | wakeup_function_set(tr, set); |
188 | 182 | ||
189 | return trace_keep_overwrite(tracer, mask, set); | 183 | return trace_keep_overwrite(tracer, mask, set); |
190 | } | 184 | } |
191 | 185 | ||
192 | static int start_func_tracer(int graph) | 186 | static int start_func_tracer(struct trace_array *tr, int graph) |
193 | { | 187 | { |
194 | int ret; | 188 | int ret; |
195 | 189 | ||
196 | ret = register_wakeup_function(graph, 0); | 190 | ret = register_wakeup_function(tr, graph, 0); |
197 | 191 | ||
198 | if (!ret && tracing_is_enabled()) | 192 | if (!ret && tracing_is_enabled()) |
199 | tracer_enabled = 1; | 193 | tracer_enabled = 1; |
@@ -203,11 +197,11 @@ static int start_func_tracer(int graph) | |||
203 | return ret; | 197 | return ret; |
204 | } | 198 | } |
205 | 199 | ||
206 | static void stop_func_tracer(int graph) | 200 | static void stop_func_tracer(struct trace_array *tr, int graph) |
207 | { | 201 | { |
208 | tracer_enabled = 0; | 202 | tracer_enabled = 0; |
209 | 203 | ||
210 | unregister_wakeup_function(graph); | 204 | unregister_wakeup_function(tr, graph); |
211 | } | 205 | } |
212 | 206 | ||
213 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 207 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
@@ -221,12 +215,12 @@ wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) | |||
221 | if (!(is_graph() ^ set)) | 215 | if (!(is_graph() ^ set)) |
222 | return 0; | 216 | return 0; |
223 | 217 | ||
224 | stop_func_tracer(!set); | 218 | stop_func_tracer(tr, !set); |
225 | 219 | ||
226 | wakeup_reset(wakeup_trace); | 220 | wakeup_reset(wakeup_trace); |
227 | tracing_max_latency = 0; | 221 | tr->max_latency = 0; |
228 | 222 | ||
229 | return start_func_tracer(set); | 223 | return start_func_tracer(tr, set); |
230 | } | 224 | } |
231 | 225 | ||
232 | static int wakeup_graph_entry(struct ftrace_graph_ent *trace) | 226 | static int wakeup_graph_entry(struct ftrace_graph_ent *trace) |
@@ -350,13 +344,13 @@ static void wakeup_print_header(struct seq_file *s) | |||
350 | /* | 344 | /* |
351 | * Should this new latency be reported/recorded? | 345 | * Should this new latency be reported/recorded? |
352 | */ | 346 | */ |
353 | static int report_latency(cycle_t delta) | 347 | static int report_latency(struct trace_array *tr, cycle_t delta) |
354 | { | 348 | { |
355 | if (tracing_thresh) { | 349 | if (tracing_thresh) { |
356 | if (delta < tracing_thresh) | 350 | if (delta < tracing_thresh) |
357 | return 0; | 351 | return 0; |
358 | } else { | 352 | } else { |
359 | if (delta <= tracing_max_latency) | 353 | if (delta <= tr->max_latency) |
360 | return 0; | 354 | return 0; |
361 | } | 355 | } |
362 | return 1; | 356 | return 1; |
@@ -424,11 +418,11 @@ probe_wakeup_sched_switch(void *ignore, | |||
424 | T1 = ftrace_now(cpu); | 418 | T1 = ftrace_now(cpu); |
425 | delta = T1-T0; | 419 | delta = T1-T0; |
426 | 420 | ||
427 | if (!report_latency(delta)) | 421 | if (!report_latency(wakeup_trace, delta)) |
428 | goto out_unlock; | 422 | goto out_unlock; |
429 | 423 | ||
430 | if (likely(!is_tracing_stopped())) { | 424 | if (likely(!is_tracing_stopped())) { |
431 | tracing_max_latency = delta; | 425 | wakeup_trace->max_latency = delta; |
432 | update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); | 426 | update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); |
433 | } | 427 | } |
434 | 428 | ||
@@ -587,7 +581,7 @@ static void start_wakeup_tracer(struct trace_array *tr) | |||
587 | */ | 581 | */ |
588 | smp_wmb(); | 582 | smp_wmb(); |
589 | 583 | ||
590 | if (start_func_tracer(is_graph())) | 584 | if (start_func_tracer(tr, is_graph())) |
591 | printk(KERN_ERR "failed to start wakeup tracer\n"); | 585 | printk(KERN_ERR "failed to start wakeup tracer\n"); |
592 | 586 | ||
593 | return; | 587 | return; |
@@ -600,13 +594,15 @@ fail_deprobe: | |||
600 | static void stop_wakeup_tracer(struct trace_array *tr) | 594 | static void stop_wakeup_tracer(struct trace_array *tr) |
601 | { | 595 | { |
602 | tracer_enabled = 0; | 596 | tracer_enabled = 0; |
603 | stop_func_tracer(is_graph()); | 597 | stop_func_tracer(tr, is_graph()); |
604 | unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); | 598 | unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); |
605 | unregister_trace_sched_wakeup_new(probe_wakeup, NULL); | 599 | unregister_trace_sched_wakeup_new(probe_wakeup, NULL); |
606 | unregister_trace_sched_wakeup(probe_wakeup, NULL); | 600 | unregister_trace_sched_wakeup(probe_wakeup, NULL); |
607 | unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); | 601 | unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); |
608 | } | 602 | } |
609 | 603 | ||
604 | static bool wakeup_busy; | ||
605 | |||
610 | static int __wakeup_tracer_init(struct trace_array *tr) | 606 | static int __wakeup_tracer_init(struct trace_array *tr) |
611 | { | 607 | { |
612 | save_flags = trace_flags; | 608 | save_flags = trace_flags; |
@@ -615,14 +611,20 @@ static int __wakeup_tracer_init(struct trace_array *tr) | |||
615 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); | 611 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); |
616 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); | 612 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); |
617 | 613 | ||
618 | tracing_max_latency = 0; | 614 | tr->max_latency = 0; |
619 | wakeup_trace = tr; | 615 | wakeup_trace = tr; |
616 | ftrace_init_array_ops(tr, wakeup_tracer_call); | ||
620 | start_wakeup_tracer(tr); | 617 | start_wakeup_tracer(tr); |
618 | |||
619 | wakeup_busy = true; | ||
621 | return 0; | 620 | return 0; |
622 | } | 621 | } |
623 | 622 | ||
624 | static int wakeup_tracer_init(struct trace_array *tr) | 623 | static int wakeup_tracer_init(struct trace_array *tr) |
625 | { | 624 | { |
625 | if (wakeup_busy) | ||
626 | return -EBUSY; | ||
627 | |||
626 | wakeup_dl = 0; | 628 | wakeup_dl = 0; |
627 | wakeup_rt = 0; | 629 | wakeup_rt = 0; |
628 | return __wakeup_tracer_init(tr); | 630 | return __wakeup_tracer_init(tr); |
@@ -630,6 +632,9 @@ static int wakeup_tracer_init(struct trace_array *tr) | |||
630 | 632 | ||
631 | static int wakeup_rt_tracer_init(struct trace_array *tr) | 633 | static int wakeup_rt_tracer_init(struct trace_array *tr) |
632 | { | 634 | { |
635 | if (wakeup_busy) | ||
636 | return -EBUSY; | ||
637 | |||
633 | wakeup_dl = 0; | 638 | wakeup_dl = 0; |
634 | wakeup_rt = 1; | 639 | wakeup_rt = 1; |
635 | return __wakeup_tracer_init(tr); | 640 | return __wakeup_tracer_init(tr); |
@@ -637,6 +642,9 @@ static int wakeup_rt_tracer_init(struct trace_array *tr) | |||
637 | 642 | ||
638 | static int wakeup_dl_tracer_init(struct trace_array *tr) | 643 | static int wakeup_dl_tracer_init(struct trace_array *tr) |
639 | { | 644 | { |
645 | if (wakeup_busy) | ||
646 | return -EBUSY; | ||
647 | |||
640 | wakeup_dl = 1; | 648 | wakeup_dl = 1; |
641 | wakeup_rt = 0; | 649 | wakeup_rt = 0; |
642 | return __wakeup_tracer_init(tr); | 650 | return __wakeup_tracer_init(tr); |
@@ -653,6 +661,8 @@ static void wakeup_tracer_reset(struct trace_array *tr) | |||
653 | 661 | ||
654 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); | 662 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); |
655 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); | 663 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); |
664 | ftrace_reset_array_ops(tr); | ||
665 | wakeup_busy = false; | ||
656 | } | 666 | } |
657 | 667 | ||
658 | static void wakeup_tracer_start(struct trace_array *tr) | 668 | static void wakeup_tracer_start(struct trace_array *tr) |
@@ -684,6 +694,7 @@ static struct tracer wakeup_tracer __read_mostly = | |||
684 | #endif | 694 | #endif |
685 | .open = wakeup_trace_open, | 695 | .open = wakeup_trace_open, |
686 | .close = wakeup_trace_close, | 696 | .close = wakeup_trace_close, |
697 | .allow_instances = true, | ||
687 | .use_max_tr = true, | 698 | .use_max_tr = true, |
688 | }; | 699 | }; |
689 | 700 | ||
@@ -694,7 +705,6 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
694 | .reset = wakeup_tracer_reset, | 705 | .reset = wakeup_tracer_reset, |
695 | .start = wakeup_tracer_start, | 706 | .start = wakeup_tracer_start, |
696 | .stop = wakeup_tracer_stop, | 707 | .stop = wakeup_tracer_stop, |
697 | .wait_pipe = poll_wait_pipe, | ||
698 | .print_max = true, | 708 | .print_max = true, |
699 | .print_header = wakeup_print_header, | 709 | .print_header = wakeup_print_header, |
700 | .print_line = wakeup_print_line, | 710 | .print_line = wakeup_print_line, |
@@ -706,6 +716,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
706 | #endif | 716 | #endif |
707 | .open = wakeup_trace_open, | 717 | .open = wakeup_trace_open, |
708 | .close = wakeup_trace_close, | 718 | .close = wakeup_trace_close, |
719 | .allow_instances = true, | ||
709 | .use_max_tr = true, | 720 | .use_max_tr = true, |
710 | }; | 721 | }; |
711 | 722 | ||
@@ -716,7 +727,6 @@ static struct tracer wakeup_dl_tracer __read_mostly = | |||
716 | .reset = wakeup_tracer_reset, | 727 | .reset = wakeup_tracer_reset, |
717 | .start = wakeup_tracer_start, | 728 | .start = wakeup_tracer_start, |
718 | .stop = wakeup_tracer_stop, | 729 | .stop = wakeup_tracer_stop, |
719 | .wait_pipe = poll_wait_pipe, | ||
720 | .print_max = true, | 730 | .print_max = true, |
721 | .print_header = wakeup_print_header, | 731 | .print_header = wakeup_print_header, |
722 | .print_line = wakeup_print_line, | 732 | .print_line = wakeup_print_line, |
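The wakeup tracer changes above remove the file-local trace_ops and the reliance on the global tracing_max_latency, hanging both off the trace_array instance instead; that, together with the wakeup_busy guard, is what makes .allow_instances = true safe. A condensed sketch of the resulting init/reset shape, with hypothetical names (my_tracer_call, my_busy, my_tracer_init/reset); ftrace_init_array_ops(), ftrace_reset_array_ops() and tr->max_latency are the helpers and field this series introduces:

	static bool my_busy;	/* one trace_array may use this tracer at a time */

	static void my_tracer_call(unsigned long ip, unsigned long parent_ip,
				   struct ftrace_ops *op, struct pt_regs *regs)
	{
		/* per-function-entry work would go here */
	}

	static int my_tracer_init(struct trace_array *tr)
	{
		if (my_busy)
			return -EBUSY;

		tr->max_latency = 0;			/* per-instance, not a global */
		ftrace_init_array_ops(tr, my_tracer_call);	/* set up tr->ops */
		register_ftrace_function(tr->ops);	/* instead of a static trace_ops */
		my_busy = true;
		return 0;
	}

	static void my_tracer_reset(struct trace_array *tr)
	{
		unregister_ftrace_function(tr->ops);
		ftrace_reset_array_ops(tr);		/* detach the callback again */
		my_busy = false;
	}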
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index e98fca60974f..5ef60499dc8e 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
@@ -65,7 +65,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) | |||
65 | 65 | ||
66 | /* Don't allow flipping of max traces now */ | 66 | /* Don't allow flipping of max traces now */ |
67 | local_irq_save(flags); | 67 | local_irq_save(flags); |
68 | arch_spin_lock(&ftrace_max_lock); | 68 | arch_spin_lock(&buf->tr->max_lock); |
69 | 69 | ||
70 | cnt = ring_buffer_entries(buf->buffer); | 70 | cnt = ring_buffer_entries(buf->buffer); |
71 | 71 | ||
@@ -83,7 +83,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) | |||
83 | break; | 83 | break; |
84 | } | 84 | } |
85 | tracing_on(); | 85 | tracing_on(); |
86 | arch_spin_unlock(&ftrace_max_lock); | 86 | arch_spin_unlock(&buf->tr->max_lock); |
87 | local_irq_restore(flags); | 87 | local_irq_restore(flags); |
88 | 88 | ||
89 | if (count) | 89 | if (count) |
@@ -161,11 +161,6 @@ static struct ftrace_ops test_probe3 = { | |||
161 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | 161 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, |
162 | }; | 162 | }; |
163 | 163 | ||
164 | static struct ftrace_ops test_global = { | ||
165 | .func = trace_selftest_test_global_func, | ||
166 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, | ||
167 | }; | ||
168 | |||
169 | static void print_counts(void) | 164 | static void print_counts(void) |
170 | { | 165 | { |
171 | printk("(%d %d %d %d %d) ", | 166 | printk("(%d %d %d %d %d) ", |
@@ -185,7 +180,7 @@ static void reset_counts(void) | |||
185 | trace_selftest_test_dyn_cnt = 0; | 180 | trace_selftest_test_dyn_cnt = 0; |
186 | } | 181 | } |
187 | 182 | ||
188 | static int trace_selftest_ops(int cnt) | 183 | static int trace_selftest_ops(struct trace_array *tr, int cnt) |
189 | { | 184 | { |
190 | int save_ftrace_enabled = ftrace_enabled; | 185 | int save_ftrace_enabled = ftrace_enabled; |
191 | struct ftrace_ops *dyn_ops; | 186 | struct ftrace_ops *dyn_ops; |
@@ -220,7 +215,11 @@ static int trace_selftest_ops(int cnt) | |||
220 | register_ftrace_function(&test_probe1); | 215 | register_ftrace_function(&test_probe1); |
221 | register_ftrace_function(&test_probe2); | 216 | register_ftrace_function(&test_probe2); |
222 | register_ftrace_function(&test_probe3); | 217 | register_ftrace_function(&test_probe3); |
223 | register_ftrace_function(&test_global); | 218 | /* First time we are running with main function */ |
219 | if (cnt > 1) { | ||
220 | ftrace_init_array_ops(tr, trace_selftest_test_global_func); | ||
221 | register_ftrace_function(tr->ops); | ||
222 | } | ||
224 | 223 | ||
225 | DYN_FTRACE_TEST_NAME(); | 224 | DYN_FTRACE_TEST_NAME(); |
226 | 225 | ||
@@ -232,8 +231,10 @@ static int trace_selftest_ops(int cnt) | |||
232 | goto out; | 231 | goto out; |
233 | if (trace_selftest_test_probe3_cnt != 1) | 232 | if (trace_selftest_test_probe3_cnt != 1) |
234 | goto out; | 233 | goto out; |
235 | if (trace_selftest_test_global_cnt == 0) | 234 | if (cnt > 1) { |
236 | goto out; | 235 | if (trace_selftest_test_global_cnt == 0) |
236 | goto out; | ||
237 | } | ||
237 | 238 | ||
238 | DYN_FTRACE_TEST_NAME2(); | 239 | DYN_FTRACE_TEST_NAME2(); |
239 | 240 | ||
@@ -269,8 +270,10 @@ static int trace_selftest_ops(int cnt) | |||
269 | goto out_free; | 270 | goto out_free; |
270 | if (trace_selftest_test_probe3_cnt != 3) | 271 | if (trace_selftest_test_probe3_cnt != 3) |
271 | goto out_free; | 272 | goto out_free; |
272 | if (trace_selftest_test_global_cnt == 0) | 273 | if (cnt > 1) { |
273 | goto out; | 274 | if (trace_selftest_test_global_cnt == 0) |
275 | goto out; | ||
276 | } | ||
274 | if (trace_selftest_test_dyn_cnt == 0) | 277 | if (trace_selftest_test_dyn_cnt == 0) |
275 | goto out_free; | 278 | goto out_free; |
276 | 279 | ||
@@ -295,7 +298,9 @@ static int trace_selftest_ops(int cnt) | |||
295 | unregister_ftrace_function(&test_probe1); | 298 | unregister_ftrace_function(&test_probe1); |
296 | unregister_ftrace_function(&test_probe2); | 299 | unregister_ftrace_function(&test_probe2); |
297 | unregister_ftrace_function(&test_probe3); | 300 | unregister_ftrace_function(&test_probe3); |
298 | unregister_ftrace_function(&test_global); | 301 | if (cnt > 1) |
302 | unregister_ftrace_function(tr->ops); | ||
303 | ftrace_reset_array_ops(tr); | ||
299 | 304 | ||
300 | /* Make sure everything is off */ | 305 | /* Make sure everything is off */ |
301 | reset_counts(); | 306 | reset_counts(); |
@@ -315,9 +320,9 @@ static int trace_selftest_ops(int cnt) | |||
315 | } | 320 | } |
316 | 321 | ||
317 | /* Test dynamic code modification and ftrace filters */ | 322 | /* Test dynamic code modification and ftrace filters */ |
318 | int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | 323 | static int trace_selftest_startup_dynamic_tracing(struct tracer *trace, |
319 | struct trace_array *tr, | 324 | struct trace_array *tr, |
320 | int (*func)(void)) | 325 | int (*func)(void)) |
321 | { | 326 | { |
322 | int save_ftrace_enabled = ftrace_enabled; | 327 | int save_ftrace_enabled = ftrace_enabled; |
323 | unsigned long count; | 328 | unsigned long count; |
@@ -388,7 +393,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
388 | } | 393 | } |
389 | 394 | ||
390 | /* Test the ops with global tracing running */ | 395 | /* Test the ops with global tracing running */ |
391 | ret = trace_selftest_ops(1); | 396 | ret = trace_selftest_ops(tr, 1); |
392 | trace->reset(tr); | 397 | trace->reset(tr); |
393 | 398 | ||
394 | out: | 399 | out: |
@@ -399,7 +404,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
399 | 404 | ||
400 | /* Test the ops with global tracing off */ | 405 | /* Test the ops with global tracing off */ |
401 | if (!ret) | 406 | if (!ret) |
402 | ret = trace_selftest_ops(2); | 407 | ret = trace_selftest_ops(tr, 2); |
403 | 408 | ||
404 | return ret; | 409 | return ret; |
405 | } | 410 | } |
@@ -802,7 +807,7 @@ out: | |||
802 | int | 807 | int |
803 | trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) | 808 | trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) |
804 | { | 809 | { |
805 | unsigned long save_max = tracing_max_latency; | 810 | unsigned long save_max = tr->max_latency; |
806 | unsigned long count; | 811 | unsigned long count; |
807 | int ret; | 812 | int ret; |
808 | 813 | ||
@@ -814,7 +819,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) | |||
814 | } | 819 | } |
815 | 820 | ||
816 | /* reset the max latency */ | 821 | /* reset the max latency */ |
817 | tracing_max_latency = 0; | 822 | tr->max_latency = 0; |
818 | /* disable interrupts for a bit */ | 823 | /* disable interrupts for a bit */ |
819 | local_irq_disable(); | 824 | local_irq_disable(); |
820 | udelay(100); | 825 | udelay(100); |
@@ -841,7 +846,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) | |||
841 | ret = -1; | 846 | ret = -1; |
842 | } | 847 | } |
843 | 848 | ||
844 | tracing_max_latency = save_max; | 849 | tr->max_latency = save_max; |
845 | 850 | ||
846 | return ret; | 851 | return ret; |
847 | } | 852 | } |
@@ -851,7 +856,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) | |||
851 | int | 856 | int |
852 | trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) | 857 | trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) |
853 | { | 858 | { |
854 | unsigned long save_max = tracing_max_latency; | 859 | unsigned long save_max = tr->max_latency; |
855 | unsigned long count; | 860 | unsigned long count; |
856 | int ret; | 861 | int ret; |
857 | 862 | ||
@@ -876,7 +881,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) | |||
876 | } | 881 | } |
877 | 882 | ||
878 | /* reset the max latency */ | 883 | /* reset the max latency */ |
879 | tracing_max_latency = 0; | 884 | tr->max_latency = 0; |
880 | /* disable preemption for a bit */ | 885 | /* disable preemption for a bit */ |
881 | preempt_disable(); | 886 | preempt_disable(); |
882 | udelay(100); | 887 | udelay(100); |
@@ -903,7 +908,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) | |||
903 | ret = -1; | 908 | ret = -1; |
904 | } | 909 | } |
905 | 910 | ||
906 | tracing_max_latency = save_max; | 911 | tr->max_latency = save_max; |
907 | 912 | ||
908 | return ret; | 913 | return ret; |
909 | } | 914 | } |
@@ -913,7 +918,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) | |||
913 | int | 918 | int |
914 | trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) | 919 | trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) |
915 | { | 920 | { |
916 | unsigned long save_max = tracing_max_latency; | 921 | unsigned long save_max = tr->max_latency; |
917 | unsigned long count; | 922 | unsigned long count; |
918 | int ret; | 923 | int ret; |
919 | 924 | ||
@@ -938,7 +943,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * | |||
938 | } | 943 | } |
939 | 944 | ||
940 | /* reset the max latency */ | 945 | /* reset the max latency */ |
941 | tracing_max_latency = 0; | 946 | tr->max_latency = 0; |
942 | 947 | ||
943 | /* disable preemption and interrupts for a bit */ | 948 | /* disable preemption and interrupts for a bit */ |
944 | preempt_disable(); | 949 | preempt_disable(); |
@@ -973,7 +978,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * | |||
973 | } | 978 | } |
974 | 979 | ||
975 | /* do the test by disabling interrupts first this time */ | 980 | /* do the test by disabling interrupts first this time */ |
976 | tracing_max_latency = 0; | 981 | tr->max_latency = 0; |
977 | tracing_start(); | 982 | tracing_start(); |
978 | trace->start(tr); | 983 | trace->start(tr); |
979 | 984 | ||
@@ -1004,7 +1009,7 @@ out: | |||
1004 | tracing_start(); | 1009 | tracing_start(); |
1005 | out_no_start: | 1010 | out_no_start: |
1006 | trace->reset(tr); | 1011 | trace->reset(tr); |
1007 | tracing_max_latency = save_max; | 1012 | tr->max_latency = save_max; |
1008 | 1013 | ||
1009 | return ret; | 1014 | return ret; |
1010 | } | 1015 | } |
@@ -1057,7 +1062,7 @@ static int trace_wakeup_test_thread(void *data) | |||
1057 | int | 1062 | int |
1058 | trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | 1063 | trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) |
1059 | { | 1064 | { |
1060 | unsigned long save_max = tracing_max_latency; | 1065 | unsigned long save_max = tr->max_latency; |
1061 | struct task_struct *p; | 1066 | struct task_struct *p; |
1062 | struct completion is_ready; | 1067 | struct completion is_ready; |
1063 | unsigned long count; | 1068 | unsigned long count; |
@@ -1083,7 +1088,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | |||
1083 | } | 1088 | } |
1084 | 1089 | ||
1085 | /* reset the max latency */ | 1090 | /* reset the max latency */ |
1086 | tracing_max_latency = 0; | 1091 | tr->max_latency = 0; |
1087 | 1092 | ||
1088 | while (p->on_rq) { | 1093 | while (p->on_rq) { |
1089 | /* | 1094 | /* |
@@ -1113,7 +1118,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | |||
1113 | trace->reset(tr); | 1118 | trace->reset(tr); |
1114 | tracing_start(); | 1119 | tracing_start(); |
1115 | 1120 | ||
1116 | tracing_max_latency = save_max; | 1121 | tr->max_latency = save_max; |
1117 | 1122 | ||
1118 | /* kill the thread */ | 1123 | /* kill the thread */ |
1119 | kthread_stop(p); | 1124 | kthread_stop(p); |
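The selftest conversions follow the same theme: the latency selftests now operate on the trace_array they are handed rather than on globals (tr->max_latency instead of tracing_max_latency, buf->tr->max_lock instead of ftrace_max_lock). The save/reset/restore idiom they all share looks roughly like this (hypothetical wrapper, test body elided):

	static int my_latency_selftest(struct trace_array *tr)
	{
		unsigned long save_max = tr->max_latency;	/* remember current maximum */
		int ret = 0;

		tr->max_latency = 0;	/* start from zero so a fresh value is recorded */

		/* ... provoke a latency and verify the buffer is non-empty ... */

		tr->max_latency = save_max;	/* put the previous value back */
		return ret;
	}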
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 21b320e5d163..8a4e5cb66a4c 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -51,11 +51,33 @@ static DEFINE_MUTEX(stack_sysctl_mutex); | |||
51 | int stack_tracer_enabled; | 51 | int stack_tracer_enabled; |
52 | static int last_stack_tracer_enabled; | 52 | static int last_stack_tracer_enabled; |
53 | 53 | ||
54 | static inline void print_max_stack(void) | ||
55 | { | ||
56 | long i; | ||
57 | int size; | ||
58 | |||
59 | pr_emerg(" Depth Size Location (%d entries)\n" | ||
60 | " ----- ---- --------\n", | ||
61 | max_stack_trace.nr_entries - 1); | ||
62 | |||
63 | for (i = 0; i < max_stack_trace.nr_entries; i++) { | ||
64 | if (stack_dump_trace[i] == ULONG_MAX) | ||
65 | break; | ||
66 | if (i+1 == max_stack_trace.nr_entries || | ||
67 | stack_dump_trace[i+1] == ULONG_MAX) | ||
68 | size = stack_dump_index[i]; | ||
69 | else | ||
70 | size = stack_dump_index[i] - stack_dump_index[i+1]; | ||
71 | |||
72 | pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i], | ||
73 | size, (void *)stack_dump_trace[i]); | ||
74 | } | ||
75 | } | ||
76 | |||
54 | static inline void | 77 | static inline void |
55 | check_stack(unsigned long ip, unsigned long *stack) | 78 | check_stack(unsigned long ip, unsigned long *stack) |
56 | { | 79 | { |
57 | unsigned long this_size, flags; | 80 | unsigned long this_size, flags; unsigned long *p, *top, *start; |
58 | unsigned long *p, *top, *start; | ||
59 | static int tracer_frame; | 81 | static int tracer_frame; |
60 | int frame_size = ACCESS_ONCE(tracer_frame); | 82 | int frame_size = ACCESS_ONCE(tracer_frame); |
61 | int i; | 83 | int i; |
@@ -85,8 +107,12 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
85 | 107 | ||
86 | max_stack_size = this_size; | 108 | max_stack_size = this_size; |
87 | 109 | ||
88 | max_stack_trace.nr_entries = 0; | 110 | max_stack_trace.nr_entries = 0; |
89 | max_stack_trace.skip = 3; | 111 | |
112 | if (using_ftrace_ops_list_func()) | ||
113 | max_stack_trace.skip = 4; | ||
114 | else | ||
115 | max_stack_trace.skip = 3; | ||
90 | 116 | ||
91 | save_stack_trace(&max_stack_trace); | 117 | save_stack_trace(&max_stack_trace); |
92 | 118 | ||
@@ -145,8 +171,12 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
145 | i++; | 171 | i++; |
146 | } | 172 | } |
147 | 173 | ||
148 | BUG_ON(current != &init_task && | 174 | if ((current != &init_task && |
149 | *(end_of_stack(current)) != STACK_END_MAGIC); | 175 | *(end_of_stack(current)) != STACK_END_MAGIC)) { |
176 | print_max_stack(); | ||
177 | BUG(); | ||
178 | } | ||
179 | |||
150 | out: | 180 | out: |
151 | arch_spin_unlock(&max_stack_lock); | 181 | arch_spin_unlock(&max_stack_lock); |
152 | local_irq_restore(flags); | 182 | local_irq_restore(flags); |
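The trace_stack.c change turns a bare BUG_ON() into "dump first, then BUG()": when stack-end corruption is detected, print_max_stack() emits the recorded Depth/Size/Location table so the resulting oops carries the deepest call chain seen so far (the extra skip of 4 accounts for the additional frame when the ftrace ops list function is in use). The core of the pattern, reduced from the hunk above:

	if (current != &init_task &&
	    *(end_of_stack(current)) != STACK_END_MAGIC) {
		print_max_stack();	/* report the table via pr_emerg() */
		BUG();			/* then crash, as the old BUG_ON() did */
	}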
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c082a7441345..04fdb5de823c 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -108,8 +108,8 @@ static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n) | |||
108 | * Uprobes-specific fetch functions | 108 | * Uprobes-specific fetch functions |
109 | */ | 109 | */ |
110 | #define DEFINE_FETCH_stack(type) \ | 110 | #define DEFINE_FETCH_stack(type) \ |
111 | static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ | 111 | static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \ |
112 | void *offset, void *dest) \ | 112 | void *offset, void *dest) \ |
113 | { \ | 113 | { \ |
114 | *(type *)dest = (type)get_user_stack_nth(regs, \ | 114 | *(type *)dest = (type)get_user_stack_nth(regs, \ |
115 | ((unsigned long)offset)); \ | 115 | ((unsigned long)offset)); \ |
@@ -120,8 +120,8 @@ DEFINE_BASIC_FETCH_FUNCS(stack) | |||
120 | #define fetch_stack_string_size NULL | 120 | #define fetch_stack_string_size NULL |
121 | 121 | ||
122 | #define DEFINE_FETCH_memory(type) \ | 122 | #define DEFINE_FETCH_memory(type) \ |
123 | static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ | 123 | static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \ |
124 | void *addr, void *dest) \ | 124 | void *addr, void *dest) \ |
125 | { \ | 125 | { \ |
126 | type retval; \ | 126 | type retval; \ |
127 | void __user *vaddr = (void __force __user *) addr; \ | 127 | void __user *vaddr = (void __force __user *) addr; \ |
@@ -136,8 +136,8 @@ DEFINE_BASIC_FETCH_FUNCS(memory) | |||
136 | * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max | 136 | * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max |
137 | * length and relative data location. | 137 | * length and relative data location. |
138 | */ | 138 | */ |
139 | static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, | 139 | static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, |
140 | void *addr, void *dest) | 140 | void *addr, void *dest) |
141 | { | 141 | { |
142 | long ret; | 142 | long ret; |
143 | u32 rloc = *(u32 *)dest; | 143 | u32 rloc = *(u32 *)dest; |
@@ -158,8 +158,8 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, | |||
158 | } | 158 | } |
159 | } | 159 | } |
160 | 160 | ||
161 | static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, | 161 | static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, |
162 | void *addr, void *dest) | 162 | void *addr, void *dest) |
163 | { | 163 | { |
164 | int len; | 164 | int len; |
165 | void __user *vaddr = (void __force __user *) addr; | 165 | void __user *vaddr = (void __force __user *) addr; |
@@ -184,8 +184,8 @@ static unsigned long translate_user_vaddr(void *file_offset) | |||
184 | } | 184 | } |
185 | 185 | ||
186 | #define DEFINE_FETCH_file_offset(type) \ | 186 | #define DEFINE_FETCH_file_offset(type) \ |
187 | static __kprobes void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,\ | 187 | static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \ |
188 | void *offset, void *dest) \ | 188 | void *offset, void *dest)\ |
189 | { \ | 189 | { \ |
190 | void *vaddr = (void *)translate_user_vaddr(offset); \ | 190 | void *vaddr = (void *)translate_user_vaddr(offset); \ |
191 | \ | 191 | \ |
@@ -1009,56 +1009,60 @@ uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) | |||
1009 | return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); | 1009 | return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); |
1010 | } | 1010 | } |
1011 | 1011 | ||
1012 | static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) | 1012 | static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) |
1013 | { | 1013 | { |
1014 | bool done; | 1014 | bool done; |
1015 | 1015 | ||
1016 | write_lock(&tu->filter.rwlock); | 1016 | write_lock(&tu->filter.rwlock); |
1017 | if (event->hw.tp_target) { | 1017 | if (event->hw.tp_target) { |
1018 | /* | 1018 | list_del(&event->hw.tp_list); |
1019 | * event->parent != NULL means copy_process(), we can avoid | ||
1020 | * uprobe_apply(). current->mm must be probed and we can rely | ||
1021 | * on dup_mmap() which preserves the already installed bp's. | ||
1022 | * | ||
1023 | * attr.enable_on_exec means that exec/mmap will install the | ||
1024 | * breakpoints we need. | ||
1025 | */ | ||
1026 | done = tu->filter.nr_systemwide || | 1019 | done = tu->filter.nr_systemwide || |
1027 | event->parent || event->attr.enable_on_exec || | 1020 | (event->hw.tp_target->flags & PF_EXITING) || |
1028 | uprobe_filter_event(tu, event); | 1021 | uprobe_filter_event(tu, event); |
1029 | list_add(&event->hw.tp_list, &tu->filter.perf_events); | ||
1030 | } else { | 1022 | } else { |
1023 | tu->filter.nr_systemwide--; | ||
1031 | done = tu->filter.nr_systemwide; | 1024 | done = tu->filter.nr_systemwide; |
1032 | tu->filter.nr_systemwide++; | ||
1033 | } | 1025 | } |
1034 | write_unlock(&tu->filter.rwlock); | 1026 | write_unlock(&tu->filter.rwlock); |
1035 | 1027 | ||
1036 | if (!done) | 1028 | if (!done) |
1037 | uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); | 1029 | return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); |
1038 | 1030 | ||
1039 | return 0; | 1031 | return 0; |
1040 | } | 1032 | } |
1041 | 1033 | ||
1042 | static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) | 1034 | static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) |
1043 | { | 1035 | { |
1044 | bool done; | 1036 | bool done; |
1037 | int err; | ||
1045 | 1038 | ||
1046 | write_lock(&tu->filter.rwlock); | 1039 | write_lock(&tu->filter.rwlock); |
1047 | if (event->hw.tp_target) { | 1040 | if (event->hw.tp_target) { |
1048 | list_del(&event->hw.tp_list); | 1041 | /* |
1042 | * event->parent != NULL means copy_process(), we can avoid | ||
1043 | * uprobe_apply(). current->mm must be probed and we can rely | ||
1044 | * on dup_mmap() which preserves the already installed bp's. | ||
1045 | * | ||
1046 | * attr.enable_on_exec means that exec/mmap will install the | ||
1047 | * breakpoints we need. | ||
1048 | */ | ||
1049 | done = tu->filter.nr_systemwide || | 1049 | done = tu->filter.nr_systemwide || |
1050 | (event->hw.tp_target->flags & PF_EXITING) || | 1050 | event->parent || event->attr.enable_on_exec || |
1051 | uprobe_filter_event(tu, event); | 1051 | uprobe_filter_event(tu, event); |
1052 | list_add(&event->hw.tp_list, &tu->filter.perf_events); | ||
1052 | } else { | 1053 | } else { |
1053 | tu->filter.nr_systemwide--; | ||
1054 | done = tu->filter.nr_systemwide; | 1054 | done = tu->filter.nr_systemwide; |
1055 | tu->filter.nr_systemwide++; | ||
1055 | } | 1056 | } |
1056 | write_unlock(&tu->filter.rwlock); | 1057 | write_unlock(&tu->filter.rwlock); |
1057 | 1058 | ||
1058 | if (!done) | 1059 | err = 0; |
1059 | uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); | 1060 | if (!done) { |
1060 | 1061 | err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); | |
1061 | return 0; | 1062 | if (err) |
1063 | uprobe_perf_close(tu, event); | ||
1064 | } | ||
1065 | return err; | ||
1062 | } | 1066 | } |
1063 | 1067 | ||
1064 | static bool uprobe_perf_filter(struct uprobe_consumer *uc, | 1068 | static bool uprobe_perf_filter(struct uprobe_consumer *uc, |
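The swap of uprobe_perf_open() and uprobe_perf_close() above exists so that open can undo itself when uprobe_apply() fails, instead of leaving the filter bookkeeping enabled for a breakpoint that was never installed. The error-handling shape it ends up with, simplified from the tail of the new uprobe_perf_open():

	err = 0;
	if (!done) {
		err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
		if (err)
			/* roll back the list/counter changes made above */
			uprobe_perf_close(tu, event);
	}
	return err;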
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index ac5b23cf7212..33cbd8c203f8 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
@@ -188,7 +188,6 @@ static int tracepoint_add_func(struct tracepoint *tp, | |||
188 | WARN_ON_ONCE(1); | 188 | WARN_ON_ONCE(1); |
189 | return PTR_ERR(old); | 189 | return PTR_ERR(old); |
190 | } | 190 | } |
191 | release_probes(old); | ||
192 | 191 | ||
193 | /* | 192 | /* |
194 | * rcu_assign_pointer has a smp_wmb() which makes sure that the new | 193 | * rcu_assign_pointer has a smp_wmb() which makes sure that the new |
@@ -200,6 +199,7 @@ static int tracepoint_add_func(struct tracepoint *tp, | |||
200 | rcu_assign_pointer(tp->funcs, tp_funcs); | 199 | rcu_assign_pointer(tp->funcs, tp_funcs); |
201 | if (!static_key_enabled(&tp->key)) | 200 | if (!static_key_enabled(&tp->key)) |
202 | static_key_slow_inc(&tp->key); | 201 | static_key_slow_inc(&tp->key); |
202 | release_probes(old); | ||
203 | return 0; | 203 | return 0; |
204 | } | 204 | } |
205 | 205 | ||
@@ -221,7 +221,6 @@ static int tracepoint_remove_func(struct tracepoint *tp, | |||
221 | WARN_ON_ONCE(1); | 221 | WARN_ON_ONCE(1); |
222 | return PTR_ERR(old); | 222 | return PTR_ERR(old); |
223 | } | 223 | } |
224 | release_probes(old); | ||
225 | 224 | ||
226 | if (!tp_funcs) { | 225 | if (!tp_funcs) { |
227 | /* Removed last function */ | 226 | /* Removed last function */ |
@@ -232,6 +231,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, | |||
232 | static_key_slow_dec(&tp->key); | 231 | static_key_slow_dec(&tp->key); |
233 | } | 232 | } |
234 | rcu_assign_pointer(tp->funcs, tp_funcs); | 233 | rcu_assign_pointer(tp->funcs, tp_funcs); |
234 | release_probes(old); | ||
235 | return 0; | 235 | return 0; |
236 | } | 236 | } |
237 | 237 | ||
@@ -239,6 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, | |||
239 | * tracepoint_probe_register - Connect a probe to a tracepoint | 239 | * tracepoint_probe_register - Connect a probe to a tracepoint |
240 | * @tp: tracepoint | 240 | * @tp: tracepoint |
241 | * @probe: probe handler | 241 | * @probe: probe handler |
242 | * @data: tracepoint data | ||
242 | * | 243 | * |
243 | * Returns 0 if ok, error value on error. | 244 | * Returns 0 if ok, error value on error. |
244 | * Note: if @tp is within a module, the caller is responsible for | 245 | * Note: if @tp is within a module, the caller is responsible for |
@@ -264,6 +265,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register); | |||
264 | * tracepoint_probe_unregister - Disconnect a probe from a tracepoint | 265 | * tracepoint_probe_unregister - Disconnect a probe from a tracepoint |
265 | * @tp: tracepoint | 266 | * @tp: tracepoint |
266 | * @probe: probe function pointer | 267 | * @probe: probe function pointer |
268 | * @data: tracepoint data | ||
267 | * | 269 | * |
268 | * Returns 0 if ok, error value on error. | 270 | * Returns 0 if ok, error value on error. |
269 | */ | 271 | */ |
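The tracepoint.c hunks fix an ordering bug: release_probes(old) queues the old probe array for freeing after an RCU grace period, so it must only run once tp->funcs no longer points at that array; otherwise a reader that fetches tp->funcs after the grace period has elapsed can walk freed memory. The resulting update pattern, in generic form (build_new_funcs() is a stand-in for the real add/remove helpers):

	old = tp->funcs;
	tp_funcs = build_new_funcs(old, probe, data);	/* illustrative helper */

	rcu_assign_pointer(tp->funcs, tp_funcs);	/* publish the new array first */
	release_probes(old);				/* only then queue the old one
							 * for freeing after a grace
							 * period */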
diff --git a/kernel/user.c b/kernel/user.c index 294fc6a94168..4efa39350e44 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -87,7 +87,6 @@ static DEFINE_SPINLOCK(uidhash_lock); | |||
87 | struct user_struct root_user = { | 87 | struct user_struct root_user = { |
88 | .__count = ATOMIC_INIT(1), | 88 | .__count = ATOMIC_INIT(1), |
89 | .processes = ATOMIC_INIT(1), | 89 | .processes = ATOMIC_INIT(1), |
90 | .files = ATOMIC_INIT(0), | ||
91 | .sigpending = ATOMIC_INIT(0), | 90 | .sigpending = ATOMIC_INIT(0), |
92 | .locked_shm = 0, | 91 | .locked_shm = 0, |
93 | .uid = GLOBAL_ROOT_UID, | 92 | .uid = GLOBAL_ROOT_UID, |
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index bf71b4b2d632..fcc02560fd6b 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -286,7 +286,7 @@ EXPORT_SYMBOL(from_kuid_munged); | |||
286 | /** | 286 | /** |
287 | * make_kgid - Map a user-namespace gid pair into a kgid. | 287 | * make_kgid - Map a user-namespace gid pair into a kgid. |
288 | * @ns: User namespace that the gid is in | 288 | * @ns: User namespace that the gid is in |
289 | * @uid: group identifier | 289 | * @gid: group identifier |
290 | * | 290 | * |
291 | * Maps a user-namespace gid pair into a kernel internal kgid, | 291 | * Maps a user-namespace gid pair into a kernel internal kgid, |
292 | * and returns that kgid. | 292 | * and returns that kgid. |
@@ -482,7 +482,8 @@ static int projid_m_show(struct seq_file *seq, void *v) | |||
482 | return 0; | 482 | return 0; |
483 | } | 483 | } |
484 | 484 | ||
485 | static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) | 485 | static void *m_start(struct seq_file *seq, loff_t *ppos, |
486 | struct uid_gid_map *map) | ||
486 | { | 487 | { |
487 | struct uid_gid_extent *extent = NULL; | 488 | struct uid_gid_extent *extent = NULL; |
488 | loff_t pos = *ppos; | 489 | loff_t pos = *ppos; |
@@ -546,7 +547,8 @@ struct seq_operations proc_projid_seq_operations = { | |||
546 | .show = projid_m_show, | 547 | .show = projid_m_show, |
547 | }; | 548 | }; |
548 | 549 | ||
549 | static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) | 550 | static bool mappings_overlap(struct uid_gid_map *new_map, |
551 | struct uid_gid_extent *extent) | ||
550 | { | 552 | { |
551 | u32 upper_first, lower_first, upper_last, lower_last; | 553 | u32 upper_first, lower_first, upper_last, lower_last; |
552 | unsigned idx; | 554 | unsigned idx; |
@@ -653,7 +655,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
653 | ret = -EINVAL; | 655 | ret = -EINVAL; |
654 | pos = kbuf; | 656 | pos = kbuf; |
655 | new_map.nr_extents = 0; | 657 | new_map.nr_extents = 0; |
656 | for (;pos; pos = next_line) { | 658 | for (; pos; pos = next_line) { |
657 | extent = &new_map.extent[new_map.nr_extents]; | 659 | extent = &new_map.extent[new_map.nr_extents]; |
658 | 660 | ||
659 | /* Find the end of line and ensure I don't look past it */ | 661 | /* Find the end of line and ensure I don't look past it */ |
@@ -687,13 +689,16 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
687 | 689 | ||
688 | /* Verify we have been given valid starting values */ | 690 | /* Verify we have been given valid starting values */ |
689 | if ((extent->first == (u32) -1) || | 691 | if ((extent->first == (u32) -1) || |
690 | (extent->lower_first == (u32) -1 )) | 692 | (extent->lower_first == (u32) -1)) |
691 | goto out; | 693 | goto out; |
692 | 694 | ||
693 | /* Verify count is not zero and does not cause the extent to wrap */ | 695 | /* Verify count is not zero and does not cause the |
696 | * extent to wrap | ||
697 | */ | ||
694 | if ((extent->first + extent->count) <= extent->first) | 698 | if ((extent->first + extent->count) <= extent->first) |
695 | goto out; | 699 | goto out; |
696 | if ((extent->lower_first + extent->count) <= extent->lower_first) | 700 | if ((extent->lower_first + extent->count) <= |
701 | extent->lower_first) | ||
697 | goto out; | 702 | goto out; |
698 | 703 | ||
699 | /* Do the ranges in extent overlap any previous extents? */ | 704 | /* Do the ranges in extent overlap any previous extents? */ |
@@ -751,7 +756,8 @@ out: | |||
751 | return ret; | 756 | return ret; |
752 | } | 757 | } |
753 | 758 | ||
754 | ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) | 759 | ssize_t proc_uid_map_write(struct file *file, const char __user *buf, |
760 | size_t size, loff_t *ppos) | ||
755 | { | 761 | { |
756 | struct seq_file *seq = file->private_data; | 762 | struct seq_file *seq = file->private_data; |
757 | struct user_namespace *ns = seq->private; | 763 | struct user_namespace *ns = seq->private; |
@@ -767,7 +773,8 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz | |||
767 | &ns->uid_map, &ns->parent->uid_map); | 773 | &ns->uid_map, &ns->parent->uid_map); |
768 | } | 774 | } |
769 | 775 | ||
770 | ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) | 776 | ssize_t proc_gid_map_write(struct file *file, const char __user *buf, |
777 | size_t size, loff_t *ppos) | ||
771 | { | 778 | { |
772 | struct seq_file *seq = file->private_data; | 779 | struct seq_file *seq = file->private_data; |
773 | struct user_namespace *ns = seq->private; | 780 | struct user_namespace *ns = seq->private; |
@@ -783,7 +790,8 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz | |||
783 | &ns->gid_map, &ns->parent->gid_map); | 790 | &ns->gid_map, &ns->parent->gid_map); |
784 | } | 791 | } |
785 | 792 | ||
786 | ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) | 793 | ssize_t proc_projid_map_write(struct file *file, const char __user *buf, |
794 | size_t size, loff_t *ppos) | ||
787 | { | 795 | { |
788 | struct seq_file *seq = file->private_data; | 796 | struct seq_file *seq = file->private_data; |
789 | struct user_namespace *ns = seq->private; | 797 | struct user_namespace *ns = seq->private; |
@@ -800,7 +808,7 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t | |||
800 | &ns->projid_map, &ns->parent->projid_map); | 808 | &ns->projid_map, &ns->parent->projid_map); |
801 | } | 809 | } |
802 | 810 | ||
803 | static bool new_idmap_permitted(const struct file *file, | 811 | static bool new_idmap_permitted(const struct file *file, |
804 | struct user_namespace *ns, int cap_setid, | 812 | struct user_namespace *ns, int cap_setid, |
805 | struct uid_gid_map *new_map) | 813 | struct uid_gid_map *new_map) |
806 | { | 814 | { |
@@ -811,8 +819,7 @@ static bool new_idmap_permitted(const struct file *file, | |||
811 | kuid_t uid = make_kuid(ns->parent, id); | 819 | kuid_t uid = make_kuid(ns->parent, id); |
812 | if (uid_eq(uid, file->f_cred->fsuid)) | 820 | if (uid_eq(uid, file->f_cred->fsuid)) |
813 | return true; | 821 | return true; |
814 | } | 822 | } else if (cap_setid == CAP_SETGID) { |
815 | else if (cap_setid == CAP_SETGID) { | ||
816 | kgid_t gid = make_kgid(ns->parent, id); | 823 | kgid_t gid = make_kgid(ns->parent, id); |
817 | if (gid_eq(gid, file->f_cred->fsgid)) | 824 | if (gid_eq(gid, file->f_cred->fsgid)) |
818 | return true; | 825 | return true; |
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4f69f9a5e221..c8eac43267e9 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c | |||
@@ -17,7 +17,7 @@ | |||
17 | 17 | ||
18 | #ifdef CONFIG_PROC_SYSCTL | 18 | #ifdef CONFIG_PROC_SYSCTL |
19 | 19 | ||
20 | static void *get_uts(ctl_table *table, int write) | 20 | static void *get_uts(struct ctl_table *table, int write) |
21 | { | 21 | { |
22 | char *which = table->data; | 22 | char *which = table->data; |
23 | struct uts_namespace *uts_ns; | 23 | struct uts_namespace *uts_ns; |
@@ -32,7 +32,7 @@ static void *get_uts(ctl_table *table, int write) | |||
32 | return which; | 32 | return which; |
33 | } | 33 | } |
34 | 34 | ||
35 | static void put_uts(ctl_table *table, int write, void *which) | 35 | static void put_uts(struct ctl_table *table, int write, void *which) |
36 | { | 36 | { |
37 | if (!write) | 37 | if (!write) |
38 | up_read(&uts_sem); | 38 | up_read(&uts_sem); |
@@ -44,14 +44,14 @@ static void put_uts(ctl_table *table, int write, void *which) | |||
44 | * Special case of dostring for the UTS structure. This has locks | 44 | * Special case of dostring for the UTS structure. This has locks |
45 | * to observe. Should this be in kernel/sys.c ???? | 45 | * to observe. Should this be in kernel/sys.c ???? |
46 | */ | 46 | */ |
47 | static int proc_do_uts_string(ctl_table *table, int write, | 47 | static int proc_do_uts_string(struct ctl_table *table, int write, |
48 | void __user *buffer, size_t *lenp, loff_t *ppos) | 48 | void __user *buffer, size_t *lenp, loff_t *ppos) |
49 | { | 49 | { |
50 | struct ctl_table uts_table; | 50 | struct ctl_table uts_table; |
51 | int r; | 51 | int r; |
52 | memcpy(&uts_table, table, sizeof(uts_table)); | 52 | memcpy(&uts_table, table, sizeof(uts_table)); |
53 | uts_table.data = get_uts(table, write); | 53 | uts_table.data = get_uts(table, write); |
54 | r = proc_dostring(&uts_table,write,buffer,lenp, ppos); | 54 | r = proc_dostring(&uts_table, write, buffer, lenp, ppos); |
55 | put_uts(table, write, uts_table.data); | 55 | put_uts(table, write, uts_table.data); |
56 | 56 | ||
57 | if (write) | 57 | if (write) |
@@ -135,4 +135,4 @@ static int __init utsname_sysctl_init(void) | |||
135 | return 0; | 135 | return 0; |
136 | } | 136 | } |
137 | 137 | ||
138 | __initcall(utsname_sysctl_init); | 138 | device_initcall(utsname_sysctl_init); |
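The proc_do_uts_string() handler above follows a reusable shape for sysctl entries whose backing storage needs locking or indirection: take a private copy of the struct ctl_table, repoint ->data at the storage the caller should actually see, and delegate to proc_dostring(). A minimal sketch of that shape, with illustrative names (my_string, my_lock and my_do_string are not kernel symbols; the registered table's .maxlen is assumed to be sizeof(my_string)):

#include <linux/rwsem.h>
#include <linux/sysctl.h>

static char my_string[64] = "example";
static DECLARE_RWSEM(my_lock);

static int my_do_string(struct ctl_table *table, int write,
			void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table tmp = *table;	/* private copy; the registered table is never modified */
	int ret;

	if (write)
		down_write(&my_lock);
	else
		down_read(&my_lock);

	tmp.data = my_string;		/* point ->data at the protected storage */
	ret = proc_dostring(&tmp, write, buffer, lenp, ppos);

	if (write)
		up_write(&my_lock);
	else
		up_read(&my_lock);
	return ret;
}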
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0ee63af30bd1..6203d2900877 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -65,15 +65,12 @@ enum { | |||
65 | * be executing on any CPU. The pool behaves as an unbound one. | 65 | * be executing on any CPU. The pool behaves as an unbound one. |
66 | * | 66 | * |
67 | * Note that DISASSOCIATED should be flipped only while holding | 67 | * Note that DISASSOCIATED should be flipped only while holding |
68 | * manager_mutex to avoid changing binding state while | 68 | * attach_mutex to avoid changing binding state while |
69 | * create_worker() is in progress. | 69 | * worker_attach_to_pool() is in progress. |
70 | */ | 70 | */ |
71 | POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ | ||
72 | POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ | 71 | POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ |
73 | POOL_FREEZING = 1 << 3, /* freeze in progress */ | ||
74 | 72 | ||
75 | /* worker flags */ | 73 | /* worker flags */ |
76 | WORKER_STARTED = 1 << 0, /* started */ | ||
77 | WORKER_DIE = 1 << 1, /* die die die */ | 74 | WORKER_DIE = 1 << 1, /* die die die */ |
78 | WORKER_IDLE = 1 << 2, /* is idle */ | 75 | WORKER_IDLE = 1 << 2, /* is idle */ |
79 | WORKER_PREP = 1 << 3, /* preparing to run works */ | 76 | WORKER_PREP = 1 << 3, /* preparing to run works */ |
@@ -100,10 +97,10 @@ enum { | |||
100 | 97 | ||
101 | /* | 98 | /* |
102 | * Rescue workers are used only on emergencies and shared by | 99 | * Rescue workers are used only on emergencies and shared by |
103 | * all cpus. Give -20. | 100 | * all cpus. Give MIN_NICE. |
104 | */ | 101 | */ |
105 | RESCUER_NICE_LEVEL = -20, | 102 | RESCUER_NICE_LEVEL = MIN_NICE, |
106 | HIGHPRI_NICE_LEVEL = -20, | 103 | HIGHPRI_NICE_LEVEL = MIN_NICE, |
107 | 104 | ||
108 | WQ_NAME_LEN = 24, | 105 | WQ_NAME_LEN = 24, |
109 | }; | 106 | }; |
@@ -124,8 +121,7 @@ enum { | |||
124 | * cpu or grabbing pool->lock is enough for read access. If | 121 | * cpu or grabbing pool->lock is enough for read access. If |
125 | * POOL_DISASSOCIATED is set, it's identical to L. | 122 | * POOL_DISASSOCIATED is set, it's identical to L. |
126 | * | 123 | * |
127 | * MG: pool->manager_mutex and pool->lock protected. Writes require both | 124 | * A: pool->attach_mutex protected. |
128 | * locks. Reads can happen under either lock. | ||
129 | * | 125 | * |
130 | * PL: wq_pool_mutex protected. | 126 | * PL: wq_pool_mutex protected. |
131 | * | 127 | * |
@@ -163,8 +159,11 @@ struct worker_pool { | |||
163 | 159 | ||
164 | /* see manage_workers() for details on the two manager mutexes */ | 160 | /* see manage_workers() for details on the two manager mutexes */ |
165 | struct mutex manager_arb; /* manager arbitration */ | 161 | struct mutex manager_arb; /* manager arbitration */ |
166 | struct mutex manager_mutex; /* manager exclusion */ | 162 | struct mutex attach_mutex; /* attach/detach exclusion */ |
167 | struct idr worker_idr; /* MG: worker IDs and iteration */ | 163 | struct list_head workers; /* A: attached workers */ |
164 | struct completion *detach_completion; /* all workers detached */ | ||
165 | |||
166 | struct ida worker_ida; /* worker IDs for task name */ | ||
168 | 167 | ||
169 | struct workqueue_attrs *attrs; /* I: worker attributes */ | 168 | struct workqueue_attrs *attrs; /* I: worker attributes */ |
170 | struct hlist_node hash_node; /* PL: unbound_pool_hash node */ | 169 | struct hlist_node hash_node; /* PL: unbound_pool_hash node */ |
@@ -340,16 +339,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, | |||
340 | lockdep_is_held(&wq->mutex), \ | 339 | lockdep_is_held(&wq->mutex), \ |
341 | "sched RCU or wq->mutex should be held") | 340 | "sched RCU or wq->mutex should be held") |
342 | 341 | ||
343 | #ifdef CONFIG_LOCKDEP | ||
344 | #define assert_manager_or_pool_lock(pool) \ | ||
345 | WARN_ONCE(debug_locks && \ | ||
346 | !lockdep_is_held(&(pool)->manager_mutex) && \ | ||
347 | !lockdep_is_held(&(pool)->lock), \ | ||
348 | "pool->manager_mutex or ->lock should be held") | ||
349 | #else | ||
350 | #define assert_manager_or_pool_lock(pool) do { } while (0) | ||
351 | #endif | ||
352 | |||
353 | #define for_each_cpu_worker_pool(pool, cpu) \ | 342 | #define for_each_cpu_worker_pool(pool, cpu) \ |
354 | for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ | 343 | for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ |
355 | (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ | 344 | (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ |
@@ -375,17 +364,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, | |||
375 | /** | 364 | /** |
376 | * for_each_pool_worker - iterate through all workers of a worker_pool | 365 | * for_each_pool_worker - iterate through all workers of a worker_pool |
377 | * @worker: iteration cursor | 366 | * @worker: iteration cursor |
378 | * @wi: integer used for iteration | ||
379 | * @pool: worker_pool to iterate workers of | 367 | * @pool: worker_pool to iterate workers of |
380 | * | 368 | * |
381 | * This must be called with either @pool->manager_mutex or ->lock held. | 369 | * This must be called with @pool->attach_mutex. |
382 | * | 370 | * |
383 | * The if/else clause exists only for the lockdep assertion and can be | 371 | * The if/else clause exists only for the lockdep assertion and can be |
384 | * ignored. | 372 | * ignored. |
385 | */ | 373 | */ |
386 | #define for_each_pool_worker(worker, wi, pool) \ | 374 | #define for_each_pool_worker(worker, pool) \ |
387 | idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \ | 375 | list_for_each_entry((worker), &(pool)->workers, node) \ |
388 | if (({ assert_manager_or_pool_lock((pool)); false; })) { } \ | 376 | if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \ |
389 | else | 377 | else |
390 | 378 | ||
391 | /** | 379 | /** |
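The odd-looking "if (({ ...; false; })) { } else" in the new for_each_pool_worker() is a lockdep idiom: the statement expression runs the assertion and then evaluates to false, so the caller's loop body lands in the else arm and control flow is unchanged, while the lock requirement is checked on every iteration. A self-contained illustration of the idiom with generic names (struct item and for_each_item_locked are examples, not workqueue code):

#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>

struct item {
	struct list_head node;
	int val;
};

/*
 * Iterate @pos over @head, asserting on each iteration that @lock is held.
 * The ({ lockdep_assert_held(lock); false; }) expression always yields
 * false, so the user's loop body, which binds to the dangling else, is
 * what actually runs.
 */
#define for_each_item_locked(pos, head, lock)				\
	list_for_each_entry((pos), (head), node)			\
		if (({ lockdep_assert_held(lock); false; })) { }	\
		else

static int sum_items(struct list_head *head, struct mutex *lock)
{
	struct item *it;
	int sum = 0;

	for_each_item_locked(it, head, lock)
		sum += it->val;
	return sum;
}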
@@ -763,13 +751,6 @@ static bool need_to_create_worker(struct worker_pool *pool) | |||
763 | return need_more_worker(pool) && !may_start_working(pool); | 751 | return need_more_worker(pool) && !may_start_working(pool); |
764 | } | 752 | } |
765 | 753 | ||
766 | /* Do I need to be the manager? */ | ||
767 | static bool need_to_manage_workers(struct worker_pool *pool) | ||
768 | { | ||
769 | return need_to_create_worker(pool) || | ||
770 | (pool->flags & POOL_MANAGE_WORKERS); | ||
771 | } | ||
772 | |||
773 | /* Do we have too many workers and should some go away? */ | 754 | /* Do we have too many workers and should some go away? */ |
774 | static bool too_many_workers(struct worker_pool *pool) | 755 | static bool too_many_workers(struct worker_pool *pool) |
775 | { | 756 | { |
@@ -791,8 +772,8 @@ static bool too_many_workers(struct worker_pool *pool) | |||
791 | * Wake up functions. | 772 | * Wake up functions. |
792 | */ | 773 | */ |
793 | 774 | ||
794 | /* Return the first worker. Safe with preemption disabled */ | 775 | /* Return the first idle worker. Safe with preemption disabled */ |
795 | static struct worker *first_worker(struct worker_pool *pool) | 776 | static struct worker *first_idle_worker(struct worker_pool *pool) |
796 | { | 777 | { |
797 | if (unlikely(list_empty(&pool->idle_list))) | 778 | if (unlikely(list_empty(&pool->idle_list))) |
798 | return NULL; | 779 | return NULL; |
@@ -811,7 +792,7 @@ static struct worker *first_worker(struct worker_pool *pool) | |||
811 | */ | 792 | */ |
812 | static void wake_up_worker(struct worker_pool *pool) | 793 | static void wake_up_worker(struct worker_pool *pool) |
813 | { | 794 | { |
814 | struct worker *worker = first_worker(pool); | 795 | struct worker *worker = first_idle_worker(pool); |
815 | 796 | ||
816 | if (likely(worker)) | 797 | if (likely(worker)) |
817 | wake_up_process(worker->task); | 798 | wake_up_process(worker->task); |
@@ -885,7 +866,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) | |||
885 | */ | 866 | */ |
886 | if (atomic_dec_and_test(&pool->nr_running) && | 867 | if (atomic_dec_and_test(&pool->nr_running) && |
887 | !list_empty(&pool->worklist)) | 868 | !list_empty(&pool->worklist)) |
888 | to_wakeup = first_worker(pool); | 869 | to_wakeup = first_idle_worker(pool); |
889 | return to_wakeup ? to_wakeup->task : NULL; | 870 | return to_wakeup ? to_wakeup->task : NULL; |
890 | } | 871 | } |
891 | 872 | ||
@@ -1621,70 +1602,6 @@ static void worker_leave_idle(struct worker *worker) | |||
1621 | list_del_init(&worker->entry); | 1602 | list_del_init(&worker->entry); |
1622 | } | 1603 | } |
1623 | 1604 | ||
1624 | /** | ||
1625 | * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it | ||
1626 | * @pool: target worker_pool | ||
1627 | * | ||
1628 | * Bind %current to the cpu of @pool if it is associated and lock @pool. | ||
1629 | * | ||
1630 | * Works which are scheduled while the cpu is online must at least be | ||
1631 | * scheduled to a worker which is bound to the cpu so that if they are | ||
1632 | * flushed from cpu callbacks while cpu is going down, they are | ||
1633 | * guaranteed to execute on the cpu. | ||
1634 | * | ||
1635 | * This function is to be used by unbound workers and rescuers to bind | ||
1636 | * themselves to the target cpu and may race with cpu going down or | ||
1637 | * coming online. kthread_bind() can't be used because it may put the | ||
1638 | * worker to already dead cpu and set_cpus_allowed_ptr() can't be used | ||
1639 | * verbatim as it's best effort and blocking and pool may be | ||
1640 | * [dis]associated in the meantime. | ||
1641 | * | ||
1642 | * This function tries set_cpus_allowed() and locks pool and verifies the | ||
1643 | * binding against %POOL_DISASSOCIATED which is set during | ||
1644 | * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker | ||
1645 | * enters idle state or fetches works without dropping lock, it can | ||
1646 | * guarantee the scheduling requirement described in the first paragraph. | ||
1647 | * | ||
1648 | * CONTEXT: | ||
1649 | * Might sleep. Called without any lock but returns with pool->lock | ||
1650 | * held. | ||
1651 | * | ||
1652 | * Return: | ||
1653 | * %true if the associated pool is online (@worker is successfully | ||
1654 | * bound), %false if offline. | ||
1655 | */ | ||
1656 | static bool worker_maybe_bind_and_lock(struct worker_pool *pool) | ||
1657 | __acquires(&pool->lock) | ||
1658 | { | ||
1659 | while (true) { | ||
1660 | /* | ||
1661 | * The following call may fail, succeed or succeed | ||
1662 | * without actually migrating the task to the cpu if | ||
1663 | * it races with cpu hotunplug operation. Verify | ||
1664 | * against POOL_DISASSOCIATED. | ||
1665 | */ | ||
1666 | if (!(pool->flags & POOL_DISASSOCIATED)) | ||
1667 | set_cpus_allowed_ptr(current, pool->attrs->cpumask); | ||
1668 | |||
1669 | spin_lock_irq(&pool->lock); | ||
1670 | if (pool->flags & POOL_DISASSOCIATED) | ||
1671 | return false; | ||
1672 | if (task_cpu(current) == pool->cpu && | ||
1673 | cpumask_equal(¤t->cpus_allowed, pool->attrs->cpumask)) | ||
1674 | return true; | ||
1675 | spin_unlock_irq(&pool->lock); | ||
1676 | |||
1677 | /* | ||
1678 | * We've raced with CPU hot[un]plug. Give it a breather | ||
1679 | * and retry migration. cond_resched() is required here; | ||
1680 | * otherwise, we might deadlock against cpu_stop trying to | ||
1681 | * bring down the CPU on non-preemptive kernel. | ||
1682 | */ | ||
1683 | cpu_relax(); | ||
1684 | cond_resched(); | ||
1685 | } | ||
1686 | } | ||
1687 | |||
1688 | static struct worker *alloc_worker(void) | 1605 | static struct worker *alloc_worker(void) |
1689 | { | 1606 | { |
1690 | struct worker *worker; | 1607 | struct worker *worker; |
@@ -1693,6 +1610,7 @@ static struct worker *alloc_worker(void) | |||
1693 | if (worker) { | 1610 | if (worker) { |
1694 | INIT_LIST_HEAD(&worker->entry); | 1611 | INIT_LIST_HEAD(&worker->entry); |
1695 | INIT_LIST_HEAD(&worker->scheduled); | 1612 | INIT_LIST_HEAD(&worker->scheduled); |
1613 | INIT_LIST_HEAD(&worker->node); | ||
1696 | /* on creation a worker is in !idle && prep state */ | 1614 | /* on creation a worker is in !idle && prep state */ |
1697 | worker->flags = WORKER_PREP; | 1615 | worker->flags = WORKER_PREP; |
1698 | } | 1616 | } |
@@ -1700,12 +1618,68 @@ static struct worker *alloc_worker(void) | |||
1700 | } | 1618 | } |
1701 | 1619 | ||
1702 | /** | 1620 | /** |
1621 | * worker_attach_to_pool() - attach a worker to a pool | ||
1622 | * @worker: worker to be attached | ||
1623 | * @pool: the target pool | ||
1624 | * | ||
1625 | * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and | ||
1626 | * cpu-binding of @worker are kept coordinated with the pool across | ||
1627 | * cpu-[un]hotplugs. | ||
1628 | */ | ||
1629 | static void worker_attach_to_pool(struct worker *worker, | ||
1630 | struct worker_pool *pool) | ||
1631 | { | ||
1632 | mutex_lock(&pool->attach_mutex); | ||
1633 | |||
1634 | /* | ||
1635 | * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any | ||
1636 | * online CPUs. It'll be re-applied when any of the CPUs come up. | ||
1637 | */ | ||
1638 | set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); | ||
1639 | |||
1640 | /* | ||
1641 | * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains | ||
1642 | * stable across this function. See the comments above the | ||
1643 | * flag definition for details. | ||
1644 | */ | ||
1645 | if (pool->flags & POOL_DISASSOCIATED) | ||
1646 | worker->flags |= WORKER_UNBOUND; | ||
1647 | |||
1648 | list_add_tail(&worker->node, &pool->workers); | ||
1649 | |||
1650 | mutex_unlock(&pool->attach_mutex); | ||
1651 | } | ||
1652 | |||
1653 | /** | ||
1654 | * worker_detach_from_pool() - detach a worker from its pool | ||
1655 | * @worker: worker which is attached to its pool | ||
1656 | * @pool: the pool @worker is attached to | ||
1657 | * | ||
1658 | * Undo the attachment made in worker_attach_to_pool(). The caller | ||
1659 | * worker shouldn't access the pool after detaching unless it holds | ||
1660 | * another reference to the pool. | ||
1661 | */ | ||
1662 | static void worker_detach_from_pool(struct worker *worker, | ||
1663 | struct worker_pool *pool) | ||
1664 | { | ||
1665 | struct completion *detach_completion = NULL; | ||
1666 | |||
1667 | mutex_lock(&pool->attach_mutex); | ||
1668 | list_del(&worker->node); | ||
1669 | if (list_empty(&pool->workers)) | ||
1670 | detach_completion = pool->detach_completion; | ||
1671 | mutex_unlock(&pool->attach_mutex); | ||
1672 | |||
1673 | if (detach_completion) | ||
1674 | complete(detach_completion); | ||
1675 | } | ||
1676 | |||
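worker_attach_to_pool() and worker_detach_from_pool() boil down to a small handshake: membership is a plain list under one mutex, and whoever tears the pool down arms a completion that the last detaching member fires. A condensed, self-contained sketch of that handshake (my_pool, my_member and the function names are illustrative stand-ins, not the workqueue types):

#include <linux/completion.h>
#include <linux/list.h>
#include <linux/mutex.h>

struct my_pool {
	struct mutex		attach_mutex;		/* protects ->members */
	struct list_head	members;		/* attached members */
	struct completion	*detach_completion;	/* armed by the destroyer */
};

struct my_member {
	struct list_head	node;			/* anchored at my_pool->members */
};

static void my_attach(struct my_member *m, struct my_pool *pool)
{
	mutex_lock(&pool->attach_mutex);
	list_add_tail(&m->node, &pool->members);
	mutex_unlock(&pool->attach_mutex);
}

static void my_detach(struct my_member *m, struct my_pool *pool)
{
	struct completion *done = NULL;

	mutex_lock(&pool->attach_mutex);
	list_del(&m->node);
	if (list_empty(&pool->members))
		done = pool->detach_completion;		/* last one out reports back */
	mutex_unlock(&pool->attach_mutex);

	if (done)
		complete(done);
}

/* Destroyer side: arm the completion, then wait until the list drains.
 * Reading ->detach_completion outside the mutex is fine here because,
 * as in put_unbound_pool(), only this single destroyer ever writes it. */
static void my_wait_for_detach(struct my_pool *pool)
{
	DECLARE_COMPLETION_ONSTACK(done);

	mutex_lock(&pool->attach_mutex);
	if (!list_empty(&pool->members))
		pool->detach_completion = &done;
	mutex_unlock(&pool->attach_mutex);

	if (pool->detach_completion)
		wait_for_completion(pool->detach_completion);
}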
1677 | /** | ||
1703 | * create_worker - create a new workqueue worker | 1678 | * create_worker - create a new workqueue worker |
1704 | * @pool: pool the new worker will belong to | 1679 | * @pool: pool the new worker will belong to |
1705 | * | 1680 | * |
1706 | * Create a new worker which is bound to @pool. The returned worker | 1681 | * Create a new worker which is attached to @pool. The new worker must be |
1707 | * can be started by calling start_worker() or destroyed using | 1682 | * started by start_worker(). |
1708 | * destroy_worker(). | ||
1709 | * | 1683 | * |
1710 | * CONTEXT: | 1684 | * CONTEXT: |
1711 | * Might sleep. Does GFP_KERNEL allocations. | 1685 | * Might sleep. Does GFP_KERNEL allocations. |
@@ -1719,19 +1693,8 @@ static struct worker *create_worker(struct worker_pool *pool) | |||
1719 | int id = -1; | 1693 | int id = -1; |
1720 | char id_buf[16]; | 1694 | char id_buf[16]; |
1721 | 1695 | ||
1722 | lockdep_assert_held(&pool->manager_mutex); | 1696 | /* ID is needed to determine kthread name */ |
1723 | 1697 | id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL); | |
1724 | /* | ||
1725 | * ID is needed to determine kthread name. Allocate ID first | ||
1726 | * without installing the pointer. | ||
1727 | */ | ||
1728 | idr_preload(GFP_KERNEL); | ||
1729 | spin_lock_irq(&pool->lock); | ||
1730 | |||
1731 | id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT); | ||
1732 | |||
1733 | spin_unlock_irq(&pool->lock); | ||
1734 | idr_preload_end(); | ||
1735 | if (id < 0) | 1698 | if (id < 0) |
1736 | goto fail; | 1699 | goto fail; |
1737 | 1700 | ||
@@ -1758,33 +1721,14 @@ static struct worker *create_worker(struct worker_pool *pool) | |||
1758 | /* prevent userland from meddling with cpumask of workqueue workers */ | 1721 | /* prevent userland from meddling with cpumask of workqueue workers */ |
1759 | worker->task->flags |= PF_NO_SETAFFINITY; | 1722 | worker->task->flags |= PF_NO_SETAFFINITY; |
1760 | 1723 | ||
1761 | /* | 1724 | /* successful, attach the worker to the pool */ |
1762 | * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any | 1725 | worker_attach_to_pool(worker, pool); |
1763 | * online CPUs. It'll be re-applied when any of the CPUs come up. | ||
1764 | */ | ||
1765 | set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); | ||
1766 | |||
1767 | /* | ||
1768 | * The caller is responsible for ensuring %POOL_DISASSOCIATED | ||
1769 | * remains stable across this function. See the comments above the | ||
1770 | * flag definition for details. | ||
1771 | */ | ||
1772 | if (pool->flags & POOL_DISASSOCIATED) | ||
1773 | worker->flags |= WORKER_UNBOUND; | ||
1774 | |||
1775 | /* successful, commit the pointer to idr */ | ||
1776 | spin_lock_irq(&pool->lock); | ||
1777 | idr_replace(&pool->worker_idr, worker, worker->id); | ||
1778 | spin_unlock_irq(&pool->lock); | ||
1779 | 1726 | ||
1780 | return worker; | 1727 | return worker; |
1781 | 1728 | ||
1782 | fail: | 1729 | fail: |
1783 | if (id >= 0) { | 1730 | if (id >= 0) |
1784 | spin_lock_irq(&pool->lock); | 1731 | ida_simple_remove(&pool->worker_ida, id); |
1785 | idr_remove(&pool->worker_idr, id); | ||
1786 | spin_unlock_irq(&pool->lock); | ||
1787 | } | ||
1788 | kfree(worker); | 1732 | kfree(worker); |
1789 | return NULL; | 1733 | return NULL; |
1790 | } | 1734 | } |
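The switch from worker_idr to worker_ida works because the ID is now used only to build the kthread's name; nothing ever has to be looked up by ID, so the lighter IDA is sufficient and no pool->lock dance around the allocation is needed. A small sketch of the allocate/name/rollback pattern (my_worker_ida and spawn_named_worker are illustrative names):

#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/idr.h>
#include <linux/kthread.h>

static DEFINE_IDA(my_worker_ida);

static struct task_struct *spawn_named_worker(int (*fn)(void *), void *arg,
					      int *id_out)
{
	struct task_struct *task;
	int id;

	/* the ID only feeds the task name; no pointer is stored against it */
	id = ida_simple_get(&my_worker_ida, 0, 0, GFP_KERNEL);
	if (id < 0)
		return ERR_PTR(id);

	task = kthread_create(fn, arg, "my_worker/%d", id);
	if (IS_ERR(task)) {
		ida_simple_remove(&my_worker_ida, id);	/* roll back on failure */
		return task;
	}

	/* caller records the ID and frees it with ida_simple_remove() when
	 * the task exits, as the dying kworker now does for pool->worker_ida */
	*id_out = id;
	return task;
}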
@@ -1800,7 +1744,6 @@ fail: | |||
1800 | */ | 1744 | */ |
1801 | static void start_worker(struct worker *worker) | 1745 | static void start_worker(struct worker *worker) |
1802 | { | 1746 | { |
1803 | worker->flags |= WORKER_STARTED; | ||
1804 | worker->pool->nr_workers++; | 1747 | worker->pool->nr_workers++; |
1805 | worker_enter_idle(worker); | 1748 | worker_enter_idle(worker); |
1806 | wake_up_process(worker->task); | 1749 | wake_up_process(worker->task); |
@@ -1818,8 +1761,6 @@ static int create_and_start_worker(struct worker_pool *pool) | |||
1818 | { | 1761 | { |
1819 | struct worker *worker; | 1762 | struct worker *worker; |
1820 | 1763 | ||
1821 | mutex_lock(&pool->manager_mutex); | ||
1822 | |||
1823 | worker = create_worker(pool); | 1764 | worker = create_worker(pool); |
1824 | if (worker) { | 1765 | if (worker) { |
1825 | spin_lock_irq(&pool->lock); | 1766 | spin_lock_irq(&pool->lock); |
@@ -1827,8 +1768,6 @@ static int create_and_start_worker(struct worker_pool *pool) | |||
1827 | spin_unlock_irq(&pool->lock); | 1768 | spin_unlock_irq(&pool->lock); |
1828 | } | 1769 | } |
1829 | 1770 | ||
1830 | mutex_unlock(&pool->manager_mutex); | ||
1831 | |||
1832 | return worker ? 0 : -ENOMEM; | 1771 | return worker ? 0 : -ENOMEM; |
1833 | } | 1772 | } |
1834 | 1773 | ||
@@ -1836,46 +1775,30 @@ static int create_and_start_worker(struct worker_pool *pool) | |||
1836 | * destroy_worker - destroy a workqueue worker | 1775 | * destroy_worker - destroy a workqueue worker |
1837 | * @worker: worker to be destroyed | 1776 | * @worker: worker to be destroyed |
1838 | * | 1777 | * |
1839 | * Destroy @worker and adjust @pool stats accordingly. | 1778 | * Destroy @worker and adjust @pool stats accordingly. The worker should |
1779 | * be idle. | ||
1840 | * | 1780 | * |
1841 | * CONTEXT: | 1781 | * CONTEXT: |
1842 | * spin_lock_irq(pool->lock) which is released and regrabbed. | 1782 | * spin_lock_irq(pool->lock). |
1843 | */ | 1783 | */ |
1844 | static void destroy_worker(struct worker *worker) | 1784 | static void destroy_worker(struct worker *worker) |
1845 | { | 1785 | { |
1846 | struct worker_pool *pool = worker->pool; | 1786 | struct worker_pool *pool = worker->pool; |
1847 | 1787 | ||
1848 | lockdep_assert_held(&pool->manager_mutex); | ||
1849 | lockdep_assert_held(&pool->lock); | 1788 | lockdep_assert_held(&pool->lock); |
1850 | 1789 | ||
1851 | /* sanity check frenzy */ | 1790 | /* sanity check frenzy */ |
1852 | if (WARN_ON(worker->current_work) || | 1791 | if (WARN_ON(worker->current_work) || |
1853 | WARN_ON(!list_empty(&worker->scheduled))) | 1792 | WARN_ON(!list_empty(&worker->scheduled)) || |
1793 | WARN_ON(!(worker->flags & WORKER_IDLE))) | ||
1854 | return; | 1794 | return; |
1855 | 1795 | ||
1856 | if (worker->flags & WORKER_STARTED) | 1796 | pool->nr_workers--; |
1857 | pool->nr_workers--; | 1797 | pool->nr_idle--; |
1858 | if (worker->flags & WORKER_IDLE) | ||
1859 | pool->nr_idle--; | ||
1860 | |||
1861 | /* | ||
1862 | * Once WORKER_DIE is set, the kworker may destroy itself at any | ||
1863 | * point. Pin to ensure the task stays until we're done with it. | ||
1864 | */ | ||
1865 | get_task_struct(worker->task); | ||
1866 | 1798 | ||
1867 | list_del_init(&worker->entry); | 1799 | list_del_init(&worker->entry); |
1868 | worker->flags |= WORKER_DIE; | 1800 | worker->flags |= WORKER_DIE; |
1869 | 1801 | wake_up_process(worker->task); | |
1870 | idr_remove(&pool->worker_idr, worker->id); | ||
1871 | |||
1872 | spin_unlock_irq(&pool->lock); | ||
1873 | |||
1874 | kthread_stop(worker->task); | ||
1875 | put_task_struct(worker->task); | ||
1876 | kfree(worker); | ||
1877 | |||
1878 | spin_lock_irq(&pool->lock); | ||
1879 | } | 1802 | } |
1880 | 1803 | ||
1881 | static void idle_worker_timeout(unsigned long __pool) | 1804 | static void idle_worker_timeout(unsigned long __pool) |
@@ -1884,7 +1807,7 @@ static void idle_worker_timeout(unsigned long __pool) | |||
1884 | 1807 | ||
1885 | spin_lock_irq(&pool->lock); | 1808 | spin_lock_irq(&pool->lock); |
1886 | 1809 | ||
1887 | if (too_many_workers(pool)) { | 1810 | while (too_many_workers(pool)) { |
1888 | struct worker *worker; | 1811 | struct worker *worker; |
1889 | unsigned long expires; | 1812 | unsigned long expires; |
1890 | 1813 | ||
@@ -1892,13 +1815,12 @@ static void idle_worker_timeout(unsigned long __pool) | |||
1892 | worker = list_entry(pool->idle_list.prev, struct worker, entry); | 1815 | worker = list_entry(pool->idle_list.prev, struct worker, entry); |
1893 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; | 1816 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; |
1894 | 1817 | ||
1895 | if (time_before(jiffies, expires)) | 1818 | if (time_before(jiffies, expires)) { |
1896 | mod_timer(&pool->idle_timer, expires); | 1819 | mod_timer(&pool->idle_timer, expires); |
1897 | else { | 1820 | break; |
1898 | /* it's been idle for too long, wake up manager */ | ||
1899 | pool->flags |= POOL_MANAGE_WORKERS; | ||
1900 | wake_up_worker(pool); | ||
1901 | } | 1821 | } |
1822 | |||
1823 | destroy_worker(worker); | ||
1902 | } | 1824 | } |
1903 | 1825 | ||
1904 | spin_unlock_irq(&pool->lock); | 1826 | spin_unlock_irq(&pool->lock); |
@@ -1916,6 +1838,12 @@ static void send_mayday(struct work_struct *work) | |||
1916 | 1838 | ||
1917 | /* mayday mayday mayday */ | 1839 | /* mayday mayday mayday */ |
1918 | if (list_empty(&pwq->mayday_node)) { | 1840 | if (list_empty(&pwq->mayday_node)) { |
1841 | /* | ||
1842 | * If @pwq is for an unbound wq, its base ref may be put at | ||
1843 | * any time due to an attribute change. Pin @pwq until the | ||
1844 | * rescuer is done with it. | ||
1845 | */ | ||
1846 | get_pwq(pwq); | ||
1919 | list_add_tail(&pwq->mayday_node, &wq->maydays); | 1847 | list_add_tail(&pwq->mayday_node, &wq->maydays); |
1920 | wake_up_process(wq->rescuer->task); | 1848 | wake_up_process(wq->rescuer->task); |
1921 | } | 1849 | } |
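The get_pwq() added to send_mayday() applies the usual "pin before publishing" rule: once an object is placed on a list that another context will walk later, it must carry an extra reference, and the consumer drops that reference only when it is completely done. A generic, self-contained sketch of the rule using a kref (struct req and these helpers are illustrative, not workqueue internals):

#include <linux/kref.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct req {
	struct kref		ref;
	struct list_head	node;	/* empty means "not queued" */
};

static LIST_HEAD(pending);
static DEFINE_SPINLOCK(pending_lock);

static struct req *req_alloc(void)
{
	struct req *r = kzalloc(sizeof(*r), GFP_KERNEL);

	if (r) {
		kref_init(&r->ref);		/* base reference */
		INIT_LIST_HEAD(&r->node);	/* keeps the list_empty() test valid */
	}
	return r;
}

static void req_release(struct kref *ref)
{
	kfree(container_of(ref, struct req, ref));
}

/* Producer: pin the object before exposing it on the shared list; without
 * the extra reference its base ref could be dropped while it still sits
 * on 'pending'. */
static void hand_off(struct req *r)
{
	spin_lock(&pending_lock);
	if (list_empty(&r->node)) {
		kref_get(&r->ref);
		list_add_tail(&r->node, &pending);
	}
	spin_unlock(&pending_lock);
}

/* Consumer: once finished with the request, drop the pin taken above. */
static void consume_one(void)
{
	struct req *r = NULL;

	spin_lock(&pending_lock);
	if (!list_empty(&pending)) {
		r = list_first_entry(&pending, struct req, node);
		list_del_init(&r->node);
	}
	spin_unlock(&pending_lock);

	if (r) {
		/* ... process r ... */
		kref_put(&r->ref, req_release);
	}
}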
@@ -2011,44 +1939,6 @@ restart: | |||
2011 | } | 1939 | } |
2012 | 1940 | ||
2013 | /** | 1941 | /** |
2014 | * maybe_destroy_worker - destroy workers which have been idle for a while | ||
2015 | * @pool: pool to destroy workers for | ||
2016 | * | ||
2017 | * Destroy @pool workers which have been idle for longer than | ||
2018 | * IDLE_WORKER_TIMEOUT. | ||
2019 | * | ||
2020 | * LOCKING: | ||
2021 | * spin_lock_irq(pool->lock) which may be released and regrabbed | ||
2022 | * multiple times. Called only from manager. | ||
2023 | * | ||
2024 | * Return: | ||
2025 | * %false if no action was taken and pool->lock stayed locked, %true | ||
2026 | * otherwise. | ||
2027 | */ | ||
2028 | static bool maybe_destroy_workers(struct worker_pool *pool) | ||
2029 | { | ||
2030 | bool ret = false; | ||
2031 | |||
2032 | while (too_many_workers(pool)) { | ||
2033 | struct worker *worker; | ||
2034 | unsigned long expires; | ||
2035 | |||
2036 | worker = list_entry(pool->idle_list.prev, struct worker, entry); | ||
2037 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; | ||
2038 | |||
2039 | if (time_before(jiffies, expires)) { | ||
2040 | mod_timer(&pool->idle_timer, expires); | ||
2041 | break; | ||
2042 | } | ||
2043 | |||
2044 | destroy_worker(worker); | ||
2045 | ret = true; | ||
2046 | } | ||
2047 | |||
2048 | return ret; | ||
2049 | } | ||
2050 | |||
2051 | /** | ||
2052 | * manage_workers - manage worker pool | 1942 | * manage_workers - manage worker pool |
2053 | * @worker: self | 1943 | * @worker: self |
2054 | * | 1944 | * |
@@ -2077,8 +1967,6 @@ static bool manage_workers(struct worker *worker) | |||
2077 | bool ret = false; | 1967 | bool ret = false; |
2078 | 1968 | ||
2079 | /* | 1969 | /* |
2080 | * Managership is governed by two mutexes - manager_arb and | ||
2081 | * manager_mutex. manager_arb handles arbitration of manager role. | ||
2082 | * Anyone who successfully grabs manager_arb wins the arbitration | 1970 | * Anyone who successfully grabs manager_arb wins the arbitration |
2083 | * and becomes the manager. mutex_trylock() on pool->manager_arb | 1971 | * and becomes the manager. mutex_trylock() on pool->manager_arb |
2084 | * failure while holding pool->lock reliably indicates that someone | 1972 | * failure while holding pool->lock reliably indicates that someone |
@@ -2087,40 +1975,12 @@ static bool manage_workers(struct worker *worker) | |||
2087 | * grabbing manager_arb is responsible for actually performing | 1975 | * grabbing manager_arb is responsible for actually performing |
2088 | * manager duties. If manager_arb is grabbed and released without | 1976 | * manager duties. If manager_arb is grabbed and released without |
2089 | * actual management, the pool may stall indefinitely. | 1977 | * actual management, the pool may stall indefinitely. |
2090 | * | ||
2091 | * manager_mutex is used for exclusion of actual management | ||
2092 | * operations. The holder of manager_mutex can be sure that none | ||
2093 | * of management operations, including creation and destruction of | ||
2094 | * workers, won't take place until the mutex is released. Because | ||
2095 | * manager_mutex doesn't interfere with manager role arbitration, | ||
2096 | * it is guaranteed that the pool's management, while may be | ||
2097 | * delayed, won't be disturbed by someone else grabbing | ||
2098 | * manager_mutex. | ||
2099 | */ | 1978 | */ |
2100 | if (!mutex_trylock(&pool->manager_arb)) | 1979 | if (!mutex_trylock(&pool->manager_arb)) |
2101 | return ret; | 1980 | return ret; |
2102 | 1981 | ||
2103 | /* | ||
2104 | * With manager arbitration won, manager_mutex would be free in | ||
2105 | * most cases. trylock first without dropping @pool->lock. | ||
2106 | */ | ||
2107 | if (unlikely(!mutex_trylock(&pool->manager_mutex))) { | ||
2108 | spin_unlock_irq(&pool->lock); | ||
2109 | mutex_lock(&pool->manager_mutex); | ||
2110 | spin_lock_irq(&pool->lock); | ||
2111 | ret = true; | ||
2112 | } | ||
2113 | |||
2114 | pool->flags &= ~POOL_MANAGE_WORKERS; | ||
2115 | |||
2116 | /* | ||
2117 | * Destroy and then create so that may_start_working() is true | ||
2118 | * on return. | ||
2119 | */ | ||
2120 | ret |= maybe_destroy_workers(pool); | ||
2121 | ret |= maybe_create_worker(pool); | 1982 | ret |= maybe_create_worker(pool); |
2122 | 1983 | ||
2123 | mutex_unlock(&pool->manager_mutex); | ||
2124 | mutex_unlock(&pool->manager_arb); | 1984 | mutex_unlock(&pool->manager_arb); |
2125 | return ret; | 1985 | return ret; |
2126 | } | 1986 | } |
@@ -2308,6 +2168,11 @@ woke_up: | |||
2308 | spin_unlock_irq(&pool->lock); | 2168 | spin_unlock_irq(&pool->lock); |
2309 | WARN_ON_ONCE(!list_empty(&worker->entry)); | 2169 | WARN_ON_ONCE(!list_empty(&worker->entry)); |
2310 | worker->task->flags &= ~PF_WQ_WORKER; | 2170 | worker->task->flags &= ~PF_WQ_WORKER; |
2171 | |||
2172 | set_task_comm(worker->task, "kworker/dying"); | ||
2173 | ida_simple_remove(&pool->worker_ida, worker->id); | ||
2174 | worker_detach_from_pool(worker, pool); | ||
2175 | kfree(worker); | ||
2311 | return 0; | 2176 | return 0; |
2312 | } | 2177 | } |
2313 | 2178 | ||
@@ -2355,9 +2220,6 @@ recheck: | |||
2355 | 2220 | ||
2356 | worker_set_flags(worker, WORKER_PREP, false); | 2221 | worker_set_flags(worker, WORKER_PREP, false); |
2357 | sleep: | 2222 | sleep: |
2358 | if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker)) | ||
2359 | goto recheck; | ||
2360 | |||
2361 | /* | 2223 | /* |
2362 | * pool->lock is held and there's no work to process and no need to | 2224 | * pool->lock is held and there's no work to process and no need to |
2363 | * manage, sleep. Workers are woken up only while holding | 2225 | * manage, sleep. Workers are woken up only while holding |
@@ -2398,6 +2260,7 @@ static int rescuer_thread(void *__rescuer) | |||
2398 | struct worker *rescuer = __rescuer; | 2260 | struct worker *rescuer = __rescuer; |
2399 | struct workqueue_struct *wq = rescuer->rescue_wq; | 2261 | struct workqueue_struct *wq = rescuer->rescue_wq; |
2400 | struct list_head *scheduled = &rescuer->scheduled; | 2262 | struct list_head *scheduled = &rescuer->scheduled; |
2263 | bool should_stop; | ||
2401 | 2264 | ||
2402 | set_user_nice(current, RESCUER_NICE_LEVEL); | 2265 | set_user_nice(current, RESCUER_NICE_LEVEL); |
2403 | 2266 | ||
@@ -2409,11 +2272,15 @@ static int rescuer_thread(void *__rescuer) | |||
2409 | repeat: | 2272 | repeat: |
2410 | set_current_state(TASK_INTERRUPTIBLE); | 2273 | set_current_state(TASK_INTERRUPTIBLE); |
2411 | 2274 | ||
2412 | if (kthread_should_stop()) { | 2275 | /* |
2413 | __set_current_state(TASK_RUNNING); | 2276 | * By the time the rescuer is requested to stop, the workqueue |
2414 | rescuer->task->flags &= ~PF_WQ_WORKER; | 2277 | * shouldn't have any work pending, but @wq->maydays may still have |
2415 | return 0; | 2278 | * pwq(s) queued. This can happen by non-rescuer workers consuming |
2416 | } | 2279 | * all the work items before the rescuer got to them. Go through |
2280 | * @wq->maydays processing before acting on should_stop so that the | ||
2281 | * list is always empty on exit. | ||
2282 | */ | ||
2283 | should_stop = kthread_should_stop(); | ||
2417 | 2284 | ||
2418 | /* see whether any pwq is asking for help */ | 2285 | /* see whether any pwq is asking for help */ |
2419 | spin_lock_irq(&wq_mayday_lock); | 2286 | spin_lock_irq(&wq_mayday_lock); |
@@ -2429,8 +2296,9 @@ repeat: | |||
2429 | 2296 | ||
2430 | spin_unlock_irq(&wq_mayday_lock); | 2297 | spin_unlock_irq(&wq_mayday_lock); |
2431 | 2298 | ||
2432 | /* migrate to the target cpu if possible */ | 2299 | worker_attach_to_pool(rescuer, pool); |
2433 | worker_maybe_bind_and_lock(pool); | 2300 | |
2301 | spin_lock_irq(&pool->lock); | ||
2434 | rescuer->pool = pool; | 2302 | rescuer->pool = pool; |
2435 | 2303 | ||
2436 | /* | 2304 | /* |
@@ -2443,6 +2311,17 @@ repeat: | |||
2443 | move_linked_works(work, scheduled, &n); | 2311 | move_linked_works(work, scheduled, &n); |
2444 | 2312 | ||
2445 | process_scheduled_works(rescuer); | 2313 | process_scheduled_works(rescuer); |
2314 | spin_unlock_irq(&pool->lock); | ||
2315 | |||
2316 | worker_detach_from_pool(rescuer, pool); | ||
2317 | |||
2318 | spin_lock_irq(&pool->lock); | ||
2319 | |||
2320 | /* | ||
2321 | * Put the reference grabbed by send_mayday(). @pool won't | ||
2322 | * go away while we're holding its lock. | ||
2323 | */ | ||
2324 | put_pwq(pwq); | ||
2446 | 2325 | ||
2447 | /* | 2326 | /* |
2448 | * Leave this pool. If keep_working() is %true, notify a | 2327 | * Leave this pool. If keep_working() is %true, notify a |
@@ -2459,6 +2338,12 @@ repeat: | |||
2459 | 2338 | ||
2460 | spin_unlock_irq(&wq_mayday_lock); | 2339 | spin_unlock_irq(&wq_mayday_lock); |
2461 | 2340 | ||
2341 | if (should_stop) { | ||
2342 | __set_current_state(TASK_RUNNING); | ||
2343 | rescuer->task->flags &= ~PF_WQ_WORKER; | ||
2344 | return 0; | ||
2345 | } | ||
2346 | |||
2462 | /* rescuers should never participate in concurrency management */ | 2347 | /* rescuers should never participate in concurrency management */ |
2463 | WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); | 2348 | WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); |
2464 | schedule(); | 2349 | schedule(); |
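The reworked rescuer exit path samples kthread_should_stop() once at the top and honors it only after the mayday list has been drained, which guarantees the list is empty when the thread finally exits. The shape of that "drain first, then obey the stop request" loop, with stand-in helpers (have_pending_units() and process_one_unit() are illustrative placeholders for the mayday handling):

#include <linux/kthread.h>
#include <linux/sched.h>

static bool have_pending_units(void) { return false; }	/* stand-in */
static void process_one_unit(void) { }			/* stand-in */

static int drain_thread(void *unused)
{
	for (;;) {
		bool should_stop;

		set_current_state(TASK_INTERRUPTIBLE);

		/* sample the stop request before draining ... */
		should_stop = kthread_should_stop();

		/* ... so everything queued up to this point gets handled */
		while (have_pending_units())
			process_one_unit();

		if (should_stop) {
			__set_current_state(TASK_RUNNING);
			return 0;
		}
		schedule();
	}
}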
@@ -3527,9 +3412,10 @@ static int init_worker_pool(struct worker_pool *pool) | |||
3527 | (unsigned long)pool); | 3412 | (unsigned long)pool); |
3528 | 3413 | ||
3529 | mutex_init(&pool->manager_arb); | 3414 | mutex_init(&pool->manager_arb); |
3530 | mutex_init(&pool->manager_mutex); | 3415 | mutex_init(&pool->attach_mutex); |
3531 | idr_init(&pool->worker_idr); | 3416 | INIT_LIST_HEAD(&pool->workers); |
3532 | 3417 | ||
3418 | ida_init(&pool->worker_ida); | ||
3533 | INIT_HLIST_NODE(&pool->hash_node); | 3419 | INIT_HLIST_NODE(&pool->hash_node); |
3534 | pool->refcnt = 1; | 3420 | pool->refcnt = 1; |
3535 | 3421 | ||
@@ -3544,7 +3430,7 @@ static void rcu_free_pool(struct rcu_head *rcu) | |||
3544 | { | 3430 | { |
3545 | struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); | 3431 | struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); |
3546 | 3432 | ||
3547 | idr_destroy(&pool->worker_idr); | 3433 | ida_destroy(&pool->worker_ida); |
3548 | free_workqueue_attrs(pool->attrs); | 3434 | free_workqueue_attrs(pool->attrs); |
3549 | kfree(pool); | 3435 | kfree(pool); |
3550 | } | 3436 | } |
@@ -3562,6 +3448,7 @@ static void rcu_free_pool(struct rcu_head *rcu) | |||
3562 | */ | 3448 | */ |
3563 | static void put_unbound_pool(struct worker_pool *pool) | 3449 | static void put_unbound_pool(struct worker_pool *pool) |
3564 | { | 3450 | { |
3451 | DECLARE_COMPLETION_ONSTACK(detach_completion); | ||
3565 | struct worker *worker; | 3452 | struct worker *worker; |
3566 | 3453 | ||
3567 | lockdep_assert_held(&wq_pool_mutex); | 3454 | lockdep_assert_held(&wq_pool_mutex); |
@@ -3582,18 +3469,24 @@ static void put_unbound_pool(struct worker_pool *pool) | |||
3582 | /* | 3469 | /* |
3583 | * Become the manager and destroy all workers. Grabbing | 3470 | * Become the manager and destroy all workers. Grabbing |
3584 | * manager_arb prevents @pool's workers from blocking on | 3471 | * manager_arb prevents @pool's workers from blocking on |
3585 | * manager_mutex. | 3472 | * attach_mutex. |
3586 | */ | 3473 | */ |
3587 | mutex_lock(&pool->manager_arb); | 3474 | mutex_lock(&pool->manager_arb); |
3588 | mutex_lock(&pool->manager_mutex); | ||
3589 | spin_lock_irq(&pool->lock); | ||
3590 | 3475 | ||
3591 | while ((worker = first_worker(pool))) | 3476 | spin_lock_irq(&pool->lock); |
3477 | while ((worker = first_idle_worker(pool))) | ||
3592 | destroy_worker(worker); | 3478 | destroy_worker(worker); |
3593 | WARN_ON(pool->nr_workers || pool->nr_idle); | 3479 | WARN_ON(pool->nr_workers || pool->nr_idle); |
3594 | |||
3595 | spin_unlock_irq(&pool->lock); | 3480 | spin_unlock_irq(&pool->lock); |
3596 | mutex_unlock(&pool->manager_mutex); | 3481 | |
3482 | mutex_lock(&pool->attach_mutex); | ||
3483 | if (!list_empty(&pool->workers)) | ||
3484 | pool->detach_completion = &detach_completion; | ||
3485 | mutex_unlock(&pool->attach_mutex); | ||
3486 | |||
3487 | if (pool->detach_completion) | ||
3488 | wait_for_completion(pool->detach_completion); | ||
3489 | |||
3597 | mutex_unlock(&pool->manager_arb); | 3490 | mutex_unlock(&pool->manager_arb); |
3598 | 3491 | ||
3599 | /* shut down the timers */ | 3492 | /* shut down the timers */ |
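put_unbound_pool() can keep detach_completion on its own stack because the waiter provably outlives every complete() call on it: the last worker signals and never touches the completion again. The same on-stack-completion pattern in isolation, using a short-lived kthread as the signalling context (waker_fn and spawn_and_wait are illustrative names):

#include <linux/completion.h>
#include <linux/err.h>
#include <linux/kthread.h>

static int waker_fn(void *data)
{
	struct completion *done = data;

	/* signal and exit; nothing touches 'done' after complete() */
	complete(done);
	return 0;
}

static void spawn_and_wait(void)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct task_struct *t;

	t = kthread_run(waker_fn, &done, "waker");
	if (IS_ERR(t))
		return;

	/* the on-stack completion stays valid until we return, which is
	 * only after the sole complete() call has happened */
	wait_for_completion(&done);
}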
@@ -3639,9 +3532,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) | |||
3639 | if (!pool || init_worker_pool(pool) < 0) | 3532 | if (!pool || init_worker_pool(pool) < 0) |
3640 | goto fail; | 3533 | goto fail; |
3641 | 3534 | ||
3642 | if (workqueue_freezing) | ||
3643 | pool->flags |= POOL_FREEZING; | ||
3644 | |||
3645 | lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ | 3535 | lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ |
3646 | copy_workqueue_attrs(pool->attrs, attrs); | 3536 | copy_workqueue_attrs(pool->attrs, attrs); |
3647 | 3537 | ||
@@ -3748,7 +3638,12 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) | |||
3748 | 3638 | ||
3749 | spin_lock_irq(&pwq->pool->lock); | 3639 | spin_lock_irq(&pwq->pool->lock); |
3750 | 3640 | ||
3751 | if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { | 3641 | /* |
3642 | * During [un]freezing, the caller is responsible for ensuring that | ||
3643 | * this function is called at least once after @workqueue_freezing | ||
3644 | * is updated and visible. | ||
3645 | */ | ||
3646 | if (!freezable || !workqueue_freezing) { | ||
3752 | pwq->max_active = wq->saved_max_active; | 3647 | pwq->max_active = wq->saved_max_active; |
3753 | 3648 | ||
3754 | while (!list_empty(&pwq->delayed_works) && | 3649 | while (!list_empty(&pwq->delayed_works) && |
@@ -4080,17 +3975,13 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, | |||
4080 | * Let's determine what needs to be done. If the target cpumask is | 3975 | * Let's determine what needs to be done. If the target cpumask is |
4081 | * different from wq's, we need to compare it to @pwq's and create | 3976 | * different from wq's, we need to compare it to @pwq's and create |
4082 | * a new one if they don't match. If the target cpumask equals | 3977 | * a new one if they don't match. If the target cpumask equals |
4083 | * wq's, the default pwq should be used. If @pwq is already the | 3978 | * wq's, the default pwq should be used. |
4084 | * default one, nothing to do; otherwise, install the default one. | ||
4085 | */ | 3979 | */ |
4086 | if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { | 3980 | if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { |
4087 | if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) | 3981 | if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) |
4088 | goto out_unlock; | 3982 | goto out_unlock; |
4089 | } else { | 3983 | } else { |
4090 | if (pwq == wq->dfl_pwq) | 3984 | goto use_dfl_pwq; |
4091 | goto out_unlock; | ||
4092 | else | ||
4093 | goto use_dfl_pwq; | ||
4094 | } | 3985 | } |
4095 | 3986 | ||
4096 | mutex_unlock(&wq->mutex); | 3987 | mutex_unlock(&wq->mutex); |
@@ -4098,9 +3989,10 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, | |||
4098 | /* create a new pwq */ | 3989 | /* create a new pwq */ |
4099 | pwq = alloc_unbound_pwq(wq, target_attrs); | 3990 | pwq = alloc_unbound_pwq(wq, target_attrs); |
4100 | if (!pwq) { | 3991 | if (!pwq) { |
4101 | pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", | 3992 | pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", |
4102 | wq->name); | 3993 | wq->name); |
4103 | goto out_unlock; | 3994 | mutex_lock(&wq->mutex); |
3995 | goto use_dfl_pwq; | ||
4104 | } | 3996 | } |
4105 | 3997 | ||
4106 | /* | 3998 | /* |
@@ -4575,28 +4467,27 @@ static void wq_unbind_fn(struct work_struct *work) | |||
4575 | int cpu = smp_processor_id(); | 4467 | int cpu = smp_processor_id(); |
4576 | struct worker_pool *pool; | 4468 | struct worker_pool *pool; |
4577 | struct worker *worker; | 4469 | struct worker *worker; |
4578 | int wi; | ||
4579 | 4470 | ||
4580 | for_each_cpu_worker_pool(pool, cpu) { | 4471 | for_each_cpu_worker_pool(pool, cpu) { |
4581 | WARN_ON_ONCE(cpu != smp_processor_id()); | 4472 | WARN_ON_ONCE(cpu != smp_processor_id()); |
4582 | 4473 | ||
4583 | mutex_lock(&pool->manager_mutex); | 4474 | mutex_lock(&pool->attach_mutex); |
4584 | spin_lock_irq(&pool->lock); | 4475 | spin_lock_irq(&pool->lock); |
4585 | 4476 | ||
4586 | /* | 4477 | /* |
4587 | * We've blocked all manager operations. Make all workers | 4478 | * We've blocked all attach/detach operations. Make all workers |
4588 | * unbound and set DISASSOCIATED. Before this, all workers | 4479 | * unbound and set DISASSOCIATED. Before this, all workers |
4589 | * except for the ones which are still executing works from | 4480 | * except for the ones which are still executing works from |
4590 | * before the last CPU down must be on the cpu. After | 4481 | * before the last CPU down must be on the cpu. After |
4591 | * this, they may become diasporas. | 4482 | * this, they may become diasporas. |
4592 | */ | 4483 | */ |
4593 | for_each_pool_worker(worker, wi, pool) | 4484 | for_each_pool_worker(worker, pool) |
4594 | worker->flags |= WORKER_UNBOUND; | 4485 | worker->flags |= WORKER_UNBOUND; |
4595 | 4486 | ||
4596 | pool->flags |= POOL_DISASSOCIATED; | 4487 | pool->flags |= POOL_DISASSOCIATED; |
4597 | 4488 | ||
4598 | spin_unlock_irq(&pool->lock); | 4489 | spin_unlock_irq(&pool->lock); |
4599 | mutex_unlock(&pool->manager_mutex); | 4490 | mutex_unlock(&pool->attach_mutex); |
4600 | 4491 | ||
4601 | /* | 4492 | /* |
4602 | * Call schedule() so that we cross rq->lock and thus can | 4493 | * Call schedule() so that we cross rq->lock and thus can |
@@ -4636,9 +4527,8 @@ static void wq_unbind_fn(struct work_struct *work) | |||
4636 | static void rebind_workers(struct worker_pool *pool) | 4527 | static void rebind_workers(struct worker_pool *pool) |
4637 | { | 4528 | { |
4638 | struct worker *worker; | 4529 | struct worker *worker; |
4639 | int wi; | ||
4640 | 4530 | ||
4641 | lockdep_assert_held(&pool->manager_mutex); | 4531 | lockdep_assert_held(&pool->attach_mutex); |
4642 | 4532 | ||
4643 | /* | 4533 | /* |
4644 | * Restore CPU affinity of all workers. As all idle workers should | 4534 | * Restore CPU affinity of all workers. As all idle workers should |
@@ -4647,13 +4537,13 @@ static void rebind_workers(struct worker_pool *pool) | |||
4647 | * of all workers first and then clear UNBOUND. As we're called | 4537 | * of all workers first and then clear UNBOUND. As we're called |
4648 | * from CPU_ONLINE, the following shouldn't fail. | 4538 | * from CPU_ONLINE, the following shouldn't fail. |
4649 | */ | 4539 | */ |
4650 | for_each_pool_worker(worker, wi, pool) | 4540 | for_each_pool_worker(worker, pool) |
4651 | WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, | 4541 | WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, |
4652 | pool->attrs->cpumask) < 0); | 4542 | pool->attrs->cpumask) < 0); |
4653 | 4543 | ||
4654 | spin_lock_irq(&pool->lock); | 4544 | spin_lock_irq(&pool->lock); |
4655 | 4545 | ||
4656 | for_each_pool_worker(worker, wi, pool) { | 4546 | for_each_pool_worker(worker, pool) { |
4657 | unsigned int worker_flags = worker->flags; | 4547 | unsigned int worker_flags = worker->flags; |
4658 | 4548 | ||
4659 | /* | 4549 | /* |
@@ -4705,9 +4595,8 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) | |||
4705 | { | 4595 | { |
4706 | static cpumask_t cpumask; | 4596 | static cpumask_t cpumask; |
4707 | struct worker *worker; | 4597 | struct worker *worker; |
4708 | int wi; | ||
4709 | 4598 | ||
4710 | lockdep_assert_held(&pool->manager_mutex); | 4599 | lockdep_assert_held(&pool->attach_mutex); |
4711 | 4600 | ||
4712 | /* is @cpu allowed for @pool? */ | 4601 | /* is @cpu allowed for @pool? */ |
4713 | if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) | 4602 | if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) |
@@ -4719,7 +4608,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) | |||
4719 | return; | 4608 | return; |
4720 | 4609 | ||
4721 | /* as we're called from CPU_ONLINE, the following shouldn't fail */ | 4610 | /* as we're called from CPU_ONLINE, the following shouldn't fail */ |
4722 | for_each_pool_worker(worker, wi, pool) | 4611 | for_each_pool_worker(worker, pool) |
4723 | WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, | 4612 | WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, |
4724 | pool->attrs->cpumask) < 0); | 4613 | pool->attrs->cpumask) < 0); |
4725 | } | 4614 | } |
@@ -4752,7 +4641,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, | |||
4752 | mutex_lock(&wq_pool_mutex); | 4641 | mutex_lock(&wq_pool_mutex); |
4753 | 4642 | ||
4754 | for_each_pool(pool, pi) { | 4643 | for_each_pool(pool, pi) { |
4755 | mutex_lock(&pool->manager_mutex); | 4644 | mutex_lock(&pool->attach_mutex); |
4756 | 4645 | ||
4757 | if (pool->cpu == cpu) { | 4646 | if (pool->cpu == cpu) { |
4758 | spin_lock_irq(&pool->lock); | 4647 | spin_lock_irq(&pool->lock); |
@@ -4764,7 +4653,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, | |||
4764 | restore_unbound_workers_cpumask(pool, cpu); | 4653 | restore_unbound_workers_cpumask(pool, cpu); |
4765 | } | 4654 | } |
4766 | 4655 | ||
4767 | mutex_unlock(&pool->manager_mutex); | 4656 | mutex_unlock(&pool->attach_mutex); |
4768 | } | 4657 | } |
4769 | 4658 | ||
4770 | /* update NUMA affinity of unbound workqueues */ | 4659 | /* update NUMA affinity of unbound workqueues */ |
@@ -4863,24 +4752,14 @@ EXPORT_SYMBOL_GPL(work_on_cpu); | |||
4863 | */ | 4752 | */ |
4864 | void freeze_workqueues_begin(void) | 4753 | void freeze_workqueues_begin(void) |
4865 | { | 4754 | { |
4866 | struct worker_pool *pool; | ||
4867 | struct workqueue_struct *wq; | 4755 | struct workqueue_struct *wq; |
4868 | struct pool_workqueue *pwq; | 4756 | struct pool_workqueue *pwq; |
4869 | int pi; | ||
4870 | 4757 | ||
4871 | mutex_lock(&wq_pool_mutex); | 4758 | mutex_lock(&wq_pool_mutex); |
4872 | 4759 | ||
4873 | WARN_ON_ONCE(workqueue_freezing); | 4760 | WARN_ON_ONCE(workqueue_freezing); |
4874 | workqueue_freezing = true; | 4761 | workqueue_freezing = true; |
4875 | 4762 | ||
4876 | /* set FREEZING */ | ||
4877 | for_each_pool(pool, pi) { | ||
4878 | spin_lock_irq(&pool->lock); | ||
4879 | WARN_ON_ONCE(pool->flags & POOL_FREEZING); | ||
4880 | pool->flags |= POOL_FREEZING; | ||
4881 | spin_unlock_irq(&pool->lock); | ||
4882 | } | ||
4883 | |||
4884 | list_for_each_entry(wq, &workqueues, list) { | 4763 | list_for_each_entry(wq, &workqueues, list) { |
4885 | mutex_lock(&wq->mutex); | 4764 | mutex_lock(&wq->mutex); |
4886 | for_each_pwq(pwq, wq) | 4765 | for_each_pwq(pwq, wq) |
@@ -4950,21 +4829,13 @@ void thaw_workqueues(void) | |||
4950 | { | 4829 | { |
4951 | struct workqueue_struct *wq; | 4830 | struct workqueue_struct *wq; |
4952 | struct pool_workqueue *pwq; | 4831 | struct pool_workqueue *pwq; |
4953 | struct worker_pool *pool; | ||
4954 | int pi; | ||
4955 | 4832 | ||
4956 | mutex_lock(&wq_pool_mutex); | 4833 | mutex_lock(&wq_pool_mutex); |
4957 | 4834 | ||
4958 | if (!workqueue_freezing) | 4835 | if (!workqueue_freezing) |
4959 | goto out_unlock; | 4836 | goto out_unlock; |
4960 | 4837 | ||
4961 | /* clear FREEZING */ | 4838 | workqueue_freezing = false; |
4962 | for_each_pool(pool, pi) { | ||
4963 | spin_lock_irq(&pool->lock); | ||
4964 | WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); | ||
4965 | pool->flags &= ~POOL_FREEZING; | ||
4966 | spin_unlock_irq(&pool->lock); | ||
4967 | } | ||
4968 | 4839 | ||
4969 | /* restore max_active and repopulate worklist */ | 4840 | /* restore max_active and repopulate worklist */ |
4970 | list_for_each_entry(wq, &workqueues, list) { | 4841 | list_for_each_entry(wq, &workqueues, list) { |
@@ -4974,7 +4845,6 @@ void thaw_workqueues(void) | |||
4974 | mutex_unlock(&wq->mutex); | 4845 | mutex_unlock(&wq->mutex); |
4975 | } | 4846 | } |
4976 | 4847 | ||
4977 | workqueue_freezing = false; | ||
4978 | out_unlock: | 4848 | out_unlock: |
4979 | mutex_unlock(&wq_pool_mutex); | 4849 | mutex_unlock(&wq_pool_mutex); |
4980 | } | 4850 | } |
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 7e2204db0b1a..45215870ac6c 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h | |||
@@ -37,6 +37,8 @@ struct worker { | |||
37 | struct task_struct *task; /* I: worker task */ | 37 | struct task_struct *task; /* I: worker task */ |
38 | struct worker_pool *pool; /* I: the associated pool */ | 38 | struct worker_pool *pool; /* I: the associated pool */ |
39 | /* L: for rescuers */ | 39 | /* L: for rescuers */ |
40 | struct list_head node; /* A: anchored at pool->workers */ | ||
41 | /* A: runs through worker->node */ | ||
40 | 42 | ||
41 | unsigned long last_active; /* L: last active timestamp */ | 43 | unsigned long last_active; /* L: last active timestamp */ |
42 | unsigned int flags; /* X: flags */ | 44 | unsigned int flags; /* X: flags */ |