Diffstat (limited to 'kernel'): 119 files changed, 5168 insertions, 3124 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 8d6e145138bb..808a86ff229d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
| @@ -55,7 +55,7 @@ | |||
| 55 | #include <linux/times.h> | 55 | #include <linux/times.h> |
| 56 | #include <linux/syscalls.h> | 56 | #include <linux/syscalls.h> |
| 57 | #include <linux/mount.h> | 57 | #include <linux/mount.h> |
| 58 | #include <asm/uaccess.h> | 58 | #include <linux/uaccess.h> |
| 59 | #include <asm/div64.h> | 59 | #include <asm/div64.h> |
| 60 | #include <linux/blkdev.h> /* sector_div */ | 60 | #include <linux/blkdev.h> /* sector_div */ |
| 61 | #include <linux/pid_namespace.h> | 61 | #include <linux/pid_namespace.h> |
| @@ -134,7 +134,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) | |||
| 134 | spin_lock(&acct_lock); | 134 | spin_lock(&acct_lock); |
| 135 | if (file != acct->file) { | 135 | if (file != acct->file) { |
| 136 | if (act) | 136 | if (act) |
| 137 | res = act>0; | 137 | res = act > 0; |
| 138 | goto out; | 138 | goto out; |
| 139 | } | 139 | } |
| 140 | 140 | ||
| @@ -262,7 +262,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) | |||
| 262 | if (name) { | 262 | if (name) { |
| 263 | struct filename *tmp = getname(name); | 263 | struct filename *tmp = getname(name); |
| 264 | if (IS_ERR(tmp)) | 264 | if (IS_ERR(tmp)) |
| 265 | return (PTR_ERR(tmp)); | 265 | return PTR_ERR(tmp); |
| 266 | error = acct_on(tmp); | 266 | error = acct_on(tmp); |
| 267 | putname(tmp); | 267 | putname(tmp); |
| 268 | } else { | 268 | } else { |
diff --git a/kernel/audit.c b/kernel/audit.c
index 7c2893602d06..f30106459a32 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
| @@ -44,7 +44,7 @@ | |||
| 44 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 44 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
| 45 | 45 | ||
| 46 | #include <linux/init.h> | 46 | #include <linux/init.h> |
| 47 | #include <asm/types.h> | 47 | #include <linux/types.h> |
| 48 | #include <linux/atomic.h> | 48 | #include <linux/atomic.h> |
| 49 | #include <linux/mm.h> | 49 | #include <linux/mm.h> |
| 50 | #include <linux/export.h> | 50 | #include <linux/export.h> |
| @@ -643,13 +643,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) | |||
| 643 | if ((task_active_pid_ns(current) != &init_pid_ns)) | 643 | if ((task_active_pid_ns(current) != &init_pid_ns)) |
| 644 | return -EPERM; | 644 | return -EPERM; |
| 645 | 645 | ||
| 646 | if (!capable(CAP_AUDIT_CONTROL)) | 646 | if (!netlink_capable(skb, CAP_AUDIT_CONTROL)) |
| 647 | err = -EPERM; | 647 | err = -EPERM; |
| 648 | break; | 648 | break; |
| 649 | case AUDIT_USER: | 649 | case AUDIT_USER: |
| 650 | case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: | 650 | case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: |
| 651 | case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: | 651 | case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: |
| 652 | if (!capable(CAP_AUDIT_WRITE)) | 652 | if (!netlink_capable(skb, CAP_AUDIT_WRITE)) |
| 653 | err = -EPERM; | 653 | err = -EPERM; |
| 654 | break; | 654 | break; |
| 655 | default: /* bad msg */ | 655 | default: /* bad msg */ |
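The audit.c hunk above swaps capable() for netlink_capable(): audit commands arrive over a netlink socket, so the permission check has to reflect the capabilities of the process that opened that socket rather than whichever task the message happens to be processed under. A minimal sketch of the same pattern in a hypothetical netlink handler follows; my_handle_msg() and MY_CMD_RESET are invented names, only netlink_capable() itself is the kernel API used by this patch.

    /*
     * Illustrative only: gate a privileged netlink command on the sending
     * socket's capabilities, mirroring the audit_netlink_ok() change above.
     * MY_CMD_RESET and my_handle_msg() are made-up names.
     */
    #include <linux/netlink.h>
    #include <linux/capability.h>
    #include <linux/skbuff.h>
    #include <linux/errno.h>

    #define MY_CMD_RESET 0x10    /* hypothetical message type */

    static int my_handle_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
    {
            switch (nlh->nlmsg_type) {
            case MY_CMD_RESET:
                    /* check the socket opener, not current */
                    if (!netlink_capable(skb, CAP_NET_ADMIN))
                            return -EPERM;
                    /* ... perform the privileged operation ... */
                    break;
            default:
                    break;
            }
            return 0;
    }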
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f251a5e8d17a..21eae3c05ec0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
| @@ -728,6 +728,22 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) | |||
| 728 | return AUDIT_BUILD_CONTEXT; | 728 | return AUDIT_BUILD_CONTEXT; |
| 729 | } | 729 | } |
| 730 | 730 | ||
| 731 | static int audit_in_mask(const struct audit_krule *rule, unsigned long val) | ||
| 732 | { | ||
| 733 | int word, bit; | ||
| 734 | |||
| 735 | if (val > 0xffffffff) | ||
| 736 | return false; | ||
| 737 | |||
| 738 | word = AUDIT_WORD(val); | ||
| 739 | if (word >= AUDIT_BITMASK_SIZE) | ||
| 740 | return false; | ||
| 741 | |||
| 742 | bit = AUDIT_BIT(val); | ||
| 743 | |||
| 744 | return rule->mask[word] & bit; | ||
| 745 | } | ||
| 746 | |||
| 731 | /* At syscall entry and exit time, this filter is called if the | 747 | /* At syscall entry and exit time, this filter is called if the |
| 732 | * audit_state is not low enough that auditing cannot take place, but is | 748 | * audit_state is not low enough that auditing cannot take place, but is |
| 733 | * also not high enough that we already know we have to write an audit | 749 | * also not high enough that we already know we have to write an audit |
| @@ -745,11 +761,8 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, | |||
| 745 | 761 | ||
| 746 | rcu_read_lock(); | 762 | rcu_read_lock(); |
| 747 | if (!list_empty(list)) { | 763 | if (!list_empty(list)) { |
| 748 | int word = AUDIT_WORD(ctx->major); | ||
| 749 | int bit = AUDIT_BIT(ctx->major); | ||
| 750 | |||
| 751 | list_for_each_entry_rcu(e, list, list) { | 764 | list_for_each_entry_rcu(e, list, list) { |
| 752 | if ((e->rule.mask[word] & bit) == bit && | 765 | if (audit_in_mask(&e->rule, ctx->major) && |
| 753 | audit_filter_rules(tsk, &e->rule, ctx, NULL, | 766 | audit_filter_rules(tsk, &e->rule, ctx, NULL, |
| 754 | &state, false)) { | 767 | &state, false)) { |
| 755 | rcu_read_unlock(); | 768 | rcu_read_unlock(); |
| @@ -769,20 +782,16 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, | |||
| 769 | static int audit_filter_inode_name(struct task_struct *tsk, | 782 | static int audit_filter_inode_name(struct task_struct *tsk, |
| 770 | struct audit_names *n, | 783 | struct audit_names *n, |
| 771 | struct audit_context *ctx) { | 784 | struct audit_context *ctx) { |
| 772 | int word, bit; | ||
| 773 | int h = audit_hash_ino((u32)n->ino); | 785 | int h = audit_hash_ino((u32)n->ino); |
| 774 | struct list_head *list = &audit_inode_hash[h]; | 786 | struct list_head *list = &audit_inode_hash[h]; |
| 775 | struct audit_entry *e; | 787 | struct audit_entry *e; |
| 776 | enum audit_state state; | 788 | enum audit_state state; |
| 777 | 789 | ||
| 778 | word = AUDIT_WORD(ctx->major); | ||
| 779 | bit = AUDIT_BIT(ctx->major); | ||
| 780 | |||
| 781 | if (list_empty(list)) | 790 | if (list_empty(list)) |
| 782 | return 0; | 791 | return 0; |
| 783 | 792 | ||
| 784 | list_for_each_entry_rcu(e, list, list) { | 793 | list_for_each_entry_rcu(e, list, list) { |
| 785 | if ((e->rule.mask[word] & bit) == bit && | 794 | if (audit_in_mask(&e->rule, ctx->major) && |
| 786 | audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { | 795 | audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { |
| 787 | ctx->current_state = state; | 796 | ctx->current_state = state; |
| 788 | return 1; | 797 | return 1; |
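The new audit_in_mask() helper centralizes the AUDIT_WORD()/AUDIT_BIT() lookup that the two call sites used to open-code, and additionally refuses syscall numbers that cannot be represented in the per-rule bitmask instead of indexing rule->mask[] out of range. The standalone sketch below restates that mapping on the assumption that the uapi macros reduce to nr/32 and 1 << (nr % 32); the sample syscall number is arbitrary.

    /* Standalone illustration of the word/bit mapping behind audit_in_mask().
     * The macro definitions are restated here and assumed to match
     * include/uapi/linux/audit.h. Compiles with any C compiler. */
    #include <stdio.h>

    #define AUDIT_BITMASK_SIZE 64
    #define AUDIT_WORD(nr) ((unsigned int)((nr) / 32))
    #define AUDIT_BIT(nr)  (1U << ((nr) % 32))

    int main(void)
    {
            unsigned long nr = 59;               /* arbitrary syscall number */
            unsigned int word = AUDIT_WORD(nr);  /* which 32-bit word of the mask */
            unsigned int bit  = AUDIT_BIT(nr);   /* which bit inside that word    */

            if (nr > 0xffffffff || word >= AUDIT_BITMASK_SIZE)
                    printf("syscall %lu can never match an audit rule\n", nr);
            else
                    printf("syscall %lu -> rule->mask[%u] & 0x%08x\n", nr, word, bit);
            return 0;
    }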
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a5e026bc45c4..1323360d90e3 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
| @@ -19,8 +19,8 @@ | |||
| 19 | 19 | ||
| 20 | static void backtrace_test_normal(void) | 20 | static void backtrace_test_normal(void) |
| 21 | { | 21 | { |
| 22 | printk("Testing a backtrace from process context.\n"); | 22 | pr_info("Testing a backtrace from process context.\n"); |
| 23 | printk("The following trace is a kernel self test and not a bug!\n"); | 23 | pr_info("The following trace is a kernel self test and not a bug!\n"); |
| 24 | 24 | ||
| 25 | dump_stack(); | 25 | dump_stack(); |
| 26 | } | 26 | } |
| @@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0); | |||
| 37 | 37 | ||
| 38 | static void backtrace_test_irq(void) | 38 | static void backtrace_test_irq(void) |
| 39 | { | 39 | { |
| 40 | printk("Testing a backtrace from irq context.\n"); | 40 | pr_info("Testing a backtrace from irq context.\n"); |
| 41 | printk("The following trace is a kernel self test and not a bug!\n"); | 41 | pr_info("The following trace is a kernel self test and not a bug!\n"); |
| 42 | 42 | ||
| 43 | init_completion(&backtrace_work); | 43 | init_completion(&backtrace_work); |
| 44 | tasklet_schedule(&backtrace_tasklet); | 44 | tasklet_schedule(&backtrace_tasklet); |
| @@ -51,8 +51,8 @@ static void backtrace_test_saved(void) | |||
| 51 | struct stack_trace trace; | 51 | struct stack_trace trace; |
| 52 | unsigned long entries[8]; | 52 | unsigned long entries[8]; |
| 53 | 53 | ||
| 54 | printk("Testing a saved backtrace.\n"); | 54 | pr_info("Testing a saved backtrace.\n"); |
| 55 | printk("The following trace is a kernel self test and not a bug!\n"); | 55 | pr_info("The following trace is a kernel self test and not a bug!\n"); |
| 56 | 56 | ||
| 57 | trace.nr_entries = 0; | 57 | trace.nr_entries = 0; |
| 58 | trace.max_entries = ARRAY_SIZE(entries); | 58 | trace.max_entries = ARRAY_SIZE(entries); |
| @@ -65,19 +65,19 @@ static void backtrace_test_saved(void) | |||
| 65 | #else | 65 | #else |
| 66 | static void backtrace_test_saved(void) | 66 | static void backtrace_test_saved(void) |
| 67 | { | 67 | { |
| 68 | printk("Saved backtrace test skipped.\n"); | 68 | pr_info("Saved backtrace test skipped.\n"); |
| 69 | } | 69 | } |
| 70 | #endif | 70 | #endif |
| 71 | 71 | ||
| 72 | static int backtrace_regression_test(void) | 72 | static int backtrace_regression_test(void) |
| 73 | { | 73 | { |
| 74 | printk("====[ backtrace testing ]===========\n"); | 74 | pr_info("====[ backtrace testing ]===========\n"); |
| 75 | 75 | ||
| 76 | backtrace_test_normal(); | 76 | backtrace_test_normal(); |
| 77 | backtrace_test_irq(); | 77 | backtrace_test_irq(); |
| 78 | backtrace_test_saved(); | 78 | backtrace_test_saved(); |
| 79 | 79 | ||
| 80 | printk("====[ end of backtrace testing ]====\n"); | 80 | pr_info("====[ end of backtrace testing ]====\n"); |
| 81 | return 0; | 81 | return 0; |
| 82 | } | 82 | } |
| 83 | 83 | ||
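The backtracetest.c conversion from bare printk() to pr_info() keeps every message at KERN_INFO while letting a file-local pr_fmt() add a consistent prefix; backtracetest.c relies on the default (empty) prefix, but the cgroup.c hunk further down adds exactly such a pr_fmt() definition. A small sketch of how the two interact, with an invented module body:

    /* Sketch of pr_fmt()/pr_info() interaction. The pr_fmt() line matches the
     * one the cgroup.c hunk below introduces; the module body is made up. */
    #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt   /* must precede the includes */

    #include <linux/module.h>
    #include <linux/printk.h>

    static int __init prefix_demo_init(void)
    {
            /* expands to printk(KERN_INFO KBUILD_MODNAME ": " "hello\n") */
            pr_info("hello\n");
            return 0;
    }
    module_init(prefix_demo_init);
    MODULE_LICENSE("GPL");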
diff --git a/kernel/capability.c b/kernel/capability.c
index a8d63df0c322..a5cf13c018ce 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
| @@ -24,7 +24,6 @@ | |||
| 24 | */ | 24 | */ |
| 25 | 25 | ||
| 26 | const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; | 26 | const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; |
| 27 | |||
| 28 | EXPORT_SYMBOL(__cap_empty_set); | 27 | EXPORT_SYMBOL(__cap_empty_set); |
| 29 | 28 | ||
| 30 | int file_caps_enabled = 1; | 29 | int file_caps_enabled = 1; |
| @@ -189,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) | |||
| 189 | * | 188 | * |
| 190 | * An alternative would be to return an error here | 189 | * An alternative would be to return an error here |
| 191 | * (-ERANGE), but that causes legacy applications to | 190 | * (-ERANGE), but that causes legacy applications to |
| 192 | * unexpectidly fail; the capget/modify/capset aborts | 191 | * unexpectedly fail; the capget/modify/capset aborts |
| 193 | * before modification is attempted and the application | 192 | * before modification is attempted and the application |
| 194 | * fails. | 193 | * fails. |
| 195 | */ | 194 | */ |
| @@ -395,7 +394,8 @@ EXPORT_SYMBOL(ns_capable); | |||
| 395 | * This does not set PF_SUPERPRIV because the caller may not | 394 | * This does not set PF_SUPERPRIV because the caller may not |
| 396 | * actually be privileged. | 395 | * actually be privileged. |
| 397 | */ | 396 | */ |
| 398 | bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) | 397 | bool file_ns_capable(const struct file *file, struct user_namespace *ns, |
| 398 | int cap) | ||
| 399 | { | 399 | { |
| 400 | if (WARN_ON_ONCE(!cap_valid(cap))) | 400 | if (WARN_ON_ONCE(!cap_valid(cap))) |
| 401 | return false; | 401 | return false; |
| @@ -424,23 +424,19 @@ bool capable(int cap) | |||
| 424 | EXPORT_SYMBOL(capable); | 424 | EXPORT_SYMBOL(capable); |
| 425 | 425 | ||
| 426 | /** | 426 | /** |
| 427 | * inode_capable - Check superior capability over inode | 427 | * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped |
| 428 | * @inode: The inode in question | 428 | * @inode: The inode in question |
| 429 | * @cap: The capability in question | 429 | * @cap: The capability in question |
| 430 | * | 430 | * |
| 431 | * Return true if the current task has the given superior capability | 431 | * Return true if the current task has the given capability targeted at |
| 432 | * targeted at it's own user namespace and that the given inode is owned | 432 | * its own user namespace and that the given inode's uid and gid are |
| 433 | * by the current user namespace or a child namespace. | 433 | * mapped into the current user namespace. |
| 434 | * | ||
| 435 | * Currently we check to see if an inode is owned by the current | ||
| 436 | * user namespace by seeing if the inode's owner maps into the | ||
| 437 | * current user namespace. | ||
| 438 | * | ||
| 439 | */ | 434 | */ |
| 440 | bool inode_capable(const struct inode *inode, int cap) | 435 | bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) |
| 441 | { | 436 | { |
| 442 | struct user_namespace *ns = current_user_ns(); | 437 | struct user_namespace *ns = current_user_ns(); |
| 443 | 438 | ||
| 444 | return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); | 439 | return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) && |
| 440 | kgid_has_mapping(ns, inode->i_gid); | ||
| 445 | } | 441 | } |
| 446 | EXPORT_SYMBOL(inode_capable); | 442 | EXPORT_SYMBOL(capable_wrt_inode_uidgid); |
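Beyond the rename, capable_wrt_inode_uidgid() tightens the old inode_capable() test: the inode's gid must now be mapped into the caller's user namespace as well as its uid, so a namespace that can see only one of the two identifiers no longer passes. The sketch below shows how a permission check might call the helper; my_chown_allowed() is an invented wrapper, loosely modelled on the VFS attribute-change checks, not a function from this patch.

    /* Illustrative caller of capable_wrt_inode_uidgid(); everything except
     * the helper itself is made up for the example. */
    #include <linux/fs.h>
    #include <linux/cred.h>
    #include <linux/capability.h>
    #include <linux/uidgid.h>

    static bool my_chown_allowed(const struct inode *inode, kuid_t new_uid)
    {
            /* the owner may "change" the uid to the value it already has */
            if (uid_eq(current_fsuid(), inode->i_uid) &&
                uid_eq(new_uid, inode->i_uid))
                    return true;

            /* otherwise require CAP_CHOWN *and* visible i_uid/i_gid mappings */
            return capable_wrt_inode_uidgid(inode, CAP_CHOWN);
    }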
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9fcdaa705b6c..7868fc3c0bc5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
| @@ -26,6 +26,8 @@ | |||
| 26 | * distribution for more details. | 26 | * distribution for more details. |
| 27 | */ | 27 | */ |
| 28 | 28 | ||
| 29 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
| 30 | |||
| 29 | #include <linux/cgroup.h> | 31 | #include <linux/cgroup.h> |
| 30 | #include <linux/cred.h> | 32 | #include <linux/cred.h> |
| 31 | #include <linux/ctype.h> | 33 | #include <linux/ctype.h> |
| @@ -33,6 +35,7 @@ | |||
| 33 | #include <linux/init_task.h> | 35 | #include <linux/init_task.h> |
| 34 | #include <linux/kernel.h> | 36 | #include <linux/kernel.h> |
| 35 | #include <linux/list.h> | 37 | #include <linux/list.h> |
| 38 | #include <linux/magic.h> | ||
| 36 | #include <linux/mm.h> | 39 | #include <linux/mm.h> |
| 37 | #include <linux/mutex.h> | 40 | #include <linux/mutex.h> |
| 38 | #include <linux/mount.h> | 41 | #include <linux/mount.h> |
| @@ -69,15 +72,6 @@ | |||
| 69 | MAX_CFTYPE_NAME + 2) | 72 | MAX_CFTYPE_NAME + 2) |
| 70 | 73 | ||
| 71 | /* | 74 | /* |
| 72 | * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file | ||
| 73 | * creation/removal and hierarchy changing operations including cgroup | ||
| 74 | * creation, removal, css association and controller rebinding. This outer | ||
| 75 | * lock is needed mainly to resolve the circular dependency between kernfs | ||
| 76 | * active ref and cgroup_mutex. cgroup_tree_mutex nests above both. | ||
| 77 | */ | ||
| 78 | static DEFINE_MUTEX(cgroup_tree_mutex); | ||
| 79 | |||
| 80 | /* | ||
| 81 | * cgroup_mutex is the master lock. Any modification to cgroup or its | 75 | * cgroup_mutex is the master lock. Any modification to cgroup or its |
| 82 | * hierarchy must be performed while holding it. | 76 | * hierarchy must be performed while holding it. |
| 83 | * | 77 | * |
| @@ -98,16 +92,21 @@ static DECLARE_RWSEM(css_set_rwsem); | |||
| 98 | #endif | 92 | #endif |
| 99 | 93 | ||
| 100 | /* | 94 | /* |
| 95 | * Protects cgroup_idr and css_idr so that IDs can be released without | ||
| 96 | * grabbing cgroup_mutex. | ||
| 97 | */ | ||
| 98 | static DEFINE_SPINLOCK(cgroup_idr_lock); | ||
| 99 | |||
| 100 | /* | ||
| 101 | * Protects cgroup_subsys->release_agent_path. Modifying it also requires | 101 | * Protects cgroup_subsys->release_agent_path. Modifying it also requires |
| 102 | * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. | 102 | * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. |
| 103 | */ | 103 | */ |
| 104 | static DEFINE_SPINLOCK(release_agent_path_lock); | 104 | static DEFINE_SPINLOCK(release_agent_path_lock); |
| 105 | 105 | ||
| 106 | #define cgroup_assert_mutexes_or_rcu_locked() \ | 106 | #define cgroup_assert_mutex_or_rcu_locked() \ |
| 107 | rcu_lockdep_assert(rcu_read_lock_held() || \ | 107 | rcu_lockdep_assert(rcu_read_lock_held() || \ |
| 108 | lockdep_is_held(&cgroup_tree_mutex) || \ | ||
| 109 | lockdep_is_held(&cgroup_mutex), \ | 108 | lockdep_is_held(&cgroup_mutex), \ |
| 110 | "cgroup_[tree_]mutex or RCU read lock required"); | 109 | "cgroup_mutex or RCU read lock required"); |
| 111 | 110 | ||
| 112 | /* | 111 | /* |
| 113 | * cgroup destruction makes heavy use of work items and there can be a lot | 112 | * cgroup destruction makes heavy use of work items and there can be a lot |
| @@ -150,6 +149,13 @@ struct cgroup_root cgrp_dfl_root; | |||
| 150 | */ | 149 | */ |
| 151 | static bool cgrp_dfl_root_visible; | 150 | static bool cgrp_dfl_root_visible; |
| 152 | 151 | ||
| 152 | /* some controllers are not supported in the default hierarchy */ | ||
| 153 | static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 | ||
| 154 | #ifdef CONFIG_CGROUP_DEBUG | ||
| 155 | | (1 << debug_cgrp_id) | ||
| 156 | #endif | ||
| 157 | ; | ||
| 158 | |||
| 153 | /* The list of hierarchy roots */ | 159 | /* The list of hierarchy roots */ |
| 154 | 160 | ||
| 155 | static LIST_HEAD(cgroup_roots); | 161 | static LIST_HEAD(cgroup_roots); |
| @@ -159,14 +165,13 @@ static int cgroup_root_count; | |||
| 159 | static DEFINE_IDR(cgroup_hierarchy_idr); | 165 | static DEFINE_IDR(cgroup_hierarchy_idr); |
| 160 | 166 | ||
| 161 | /* | 167 | /* |
| 162 | * Assign a monotonically increasing serial number to cgroups. It | 168 | * Assign a monotonically increasing serial number to csses. It guarantees |
| 163 | * guarantees cgroups with bigger numbers are newer than those with smaller | 169 | * cgroups with bigger numbers are newer than those with smaller numbers. |
| 164 | * numbers. Also, as cgroups are always appended to the parent's | 170 | * Also, as csses are always appended to the parent's ->children list, it |
| 165 | * ->children list, it guarantees that sibling cgroups are always sorted in | 171 | * guarantees that sibling csses are always sorted in the ascending serial |
| 166 | * the ascending serial number order on the list. Protected by | 172 | * number order on the list. Protected by cgroup_mutex. |
| 167 | * cgroup_mutex. | ||
| 168 | */ | 173 | */ |
| 169 | static u64 cgroup_serial_nr_next = 1; | 174 | static u64 css_serial_nr_next = 1; |
| 170 | 175 | ||
| 171 | /* This flag indicates whether tasks in the fork and exit paths should | 176 | /* This flag indicates whether tasks in the fork and exit paths should |
| 172 | * check for fork/exit handlers to call. This avoids us having to do | 177 | * check for fork/exit handlers to call. This avoids us having to do |
| @@ -179,17 +184,59 @@ static struct cftype cgroup_base_files[]; | |||
| 179 | 184 | ||
| 180 | static void cgroup_put(struct cgroup *cgrp); | 185 | static void cgroup_put(struct cgroup *cgrp); |
| 181 | static int rebind_subsystems(struct cgroup_root *dst_root, | 186 | static int rebind_subsystems(struct cgroup_root *dst_root, |
| 182 | unsigned long ss_mask); | 187 | unsigned int ss_mask); |
| 183 | static void cgroup_destroy_css_killed(struct cgroup *cgrp); | ||
| 184 | static int cgroup_destroy_locked(struct cgroup *cgrp); | 188 | static int cgroup_destroy_locked(struct cgroup *cgrp); |
| 189 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); | ||
| 190 | static void css_release(struct percpu_ref *ref); | ||
| 191 | static void kill_css(struct cgroup_subsys_state *css); | ||
| 185 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | 192 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], |
| 186 | bool is_add); | 193 | bool is_add); |
| 187 | static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); | 194 | static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); |
| 188 | 195 | ||
| 196 | /* IDR wrappers which synchronize using cgroup_idr_lock */ | ||
| 197 | static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, | ||
| 198 | gfp_t gfp_mask) | ||
| 199 | { | ||
| 200 | int ret; | ||
| 201 | |||
| 202 | idr_preload(gfp_mask); | ||
| 203 | spin_lock_bh(&cgroup_idr_lock); | ||
| 204 | ret = idr_alloc(idr, ptr, start, end, gfp_mask); | ||
| 205 | spin_unlock_bh(&cgroup_idr_lock); | ||
| 206 | idr_preload_end(); | ||
| 207 | return ret; | ||
| 208 | } | ||
| 209 | |||
| 210 | static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id) | ||
| 211 | { | ||
| 212 | void *ret; | ||
| 213 | |||
| 214 | spin_lock_bh(&cgroup_idr_lock); | ||
| 215 | ret = idr_replace(idr, ptr, id); | ||
| 216 | spin_unlock_bh(&cgroup_idr_lock); | ||
| 217 | return ret; | ||
| 218 | } | ||
| 219 | |||
| 220 | static void cgroup_idr_remove(struct idr *idr, int id) | ||
| 221 | { | ||
| 222 | spin_lock_bh(&cgroup_idr_lock); | ||
| 223 | idr_remove(idr, id); | ||
| 224 | spin_unlock_bh(&cgroup_idr_lock); | ||
| 225 | } | ||
| 226 | |||
| 227 | static struct cgroup *cgroup_parent(struct cgroup *cgrp) | ||
| 228 | { | ||
| 229 | struct cgroup_subsys_state *parent_css = cgrp->self.parent; | ||
| 230 | |||
| 231 | if (parent_css) | ||
| 232 | return container_of(parent_css, struct cgroup, self); | ||
| 233 | return NULL; | ||
| 234 | } | ||
| 235 | |||
| 189 | /** | 236 | /** |
| 190 | * cgroup_css - obtain a cgroup's css for the specified subsystem | 237 | * cgroup_css - obtain a cgroup's css for the specified subsystem |
| 191 | * @cgrp: the cgroup of interest | 238 | * @cgrp: the cgroup of interest |
| 192 | * @ss: the subsystem of interest (%NULL returns the dummy_css) | 239 | * @ss: the subsystem of interest (%NULL returns @cgrp->self) |
| 193 | * | 240 | * |
| 194 | * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This | 241 | * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This |
| 195 | * function must be called either under cgroup_mutex or rcu_read_lock() and | 242 | * function must be called either under cgroup_mutex or rcu_read_lock() and |
| @@ -202,23 +249,49 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, | |||
| 202 | { | 249 | { |
| 203 | if (ss) | 250 | if (ss) |
| 204 | return rcu_dereference_check(cgrp->subsys[ss->id], | 251 | return rcu_dereference_check(cgrp->subsys[ss->id], |
| 205 | lockdep_is_held(&cgroup_tree_mutex) || | ||
| 206 | lockdep_is_held(&cgroup_mutex)); | 252 | lockdep_is_held(&cgroup_mutex)); |
| 207 | else | 253 | else |
| 208 | return &cgrp->dummy_css; | 254 | return &cgrp->self; |
| 255 | } | ||
| 256 | |||
| 257 | /** | ||
| 258 | * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem | ||
| 259 | * @cgrp: the cgroup of interest | ||
| 260 | * @ss: the subsystem of interest (%NULL returns @cgrp->self) | ||
| 261 | * | ||
| 262 | * Similar to cgroup_css() but returns the effctive css, which is defined | ||
| 263 | * as the matching css of the nearest ancestor including self which has @ss | ||
| 264 | * enabled. If @ss is associated with the hierarchy @cgrp is on, this | ||
| 265 | * function is guaranteed to return non-NULL css. | ||
| 266 | */ | ||
| 267 | static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, | ||
| 268 | struct cgroup_subsys *ss) | ||
| 269 | { | ||
| 270 | lockdep_assert_held(&cgroup_mutex); | ||
| 271 | |||
| 272 | if (!ss) | ||
| 273 | return &cgrp->self; | ||
| 274 | |||
| 275 | if (!(cgrp->root->subsys_mask & (1 << ss->id))) | ||
| 276 | return NULL; | ||
| 277 | |||
| 278 | while (cgroup_parent(cgrp) && | ||
| 279 | !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) | ||
| 280 | cgrp = cgroup_parent(cgrp); | ||
| 281 | |||
| 282 | return cgroup_css(cgrp, ss); | ||
| 209 | } | 283 | } |
| 210 | 284 | ||
| 211 | /* convenient tests for these bits */ | 285 | /* convenient tests for these bits */ |
| 212 | static inline bool cgroup_is_dead(const struct cgroup *cgrp) | 286 | static inline bool cgroup_is_dead(const struct cgroup *cgrp) |
| 213 | { | 287 | { |
| 214 | return test_bit(CGRP_DEAD, &cgrp->flags); | 288 | return !(cgrp->self.flags & CSS_ONLINE); |
| 215 | } | 289 | } |
| 216 | 290 | ||
| 217 | struct cgroup_subsys_state *seq_css(struct seq_file *seq) | 291 | struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) |
| 218 | { | 292 | { |
| 219 | struct kernfs_open_file *of = seq->private; | ||
| 220 | struct cgroup *cgrp = of->kn->parent->priv; | 293 | struct cgroup *cgrp = of->kn->parent->priv; |
| 221 | struct cftype *cft = seq_cft(seq); | 294 | struct cftype *cft = of_cft(of); |
| 222 | 295 | ||
| 223 | /* | 296 | /* |
| 224 | * This is open and unprotected implementation of cgroup_css(). | 297 | * This is open and unprotected implementation of cgroup_css(). |
| @@ -231,9 +304,9 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq) | |||
| 231 | if (cft->ss) | 304 | if (cft->ss) |
| 232 | return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); | 305 | return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); |
| 233 | else | 306 | else |
| 234 | return &cgrp->dummy_css; | 307 | return &cgrp->self; |
| 235 | } | 308 | } |
| 236 | EXPORT_SYMBOL_GPL(seq_css); | 309 | EXPORT_SYMBOL_GPL(of_css); |
| 237 | 310 | ||
| 238 | /** | 311 | /** |
| 239 | * cgroup_is_descendant - test ancestry | 312 | * cgroup_is_descendant - test ancestry |
| @@ -249,7 +322,7 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) | |||
| 249 | while (cgrp) { | 322 | while (cgrp) { |
| 250 | if (cgrp == ancestor) | 323 | if (cgrp == ancestor) |
| 251 | return true; | 324 | return true; |
| 252 | cgrp = cgrp->parent; | 325 | cgrp = cgroup_parent(cgrp); |
| 253 | } | 326 | } |
| 254 | return false; | 327 | return false; |
| 255 | } | 328 | } |
| @@ -273,17 +346,30 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
| 273 | * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end | 346 | * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end |
| 274 | * @cgrp: the target cgroup to iterate css's of | 347 | * @cgrp: the target cgroup to iterate css's of |
| 275 | * | 348 | * |
| 276 | * Should be called under cgroup_mutex. | 349 | * Should be called under cgroup_[tree_]mutex. |
| 277 | */ | 350 | */ |
| 278 | #define for_each_css(css, ssid, cgrp) \ | 351 | #define for_each_css(css, ssid, cgrp) \ |
| 279 | for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ | 352 | for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ |
| 280 | if (!((css) = rcu_dereference_check( \ | 353 | if (!((css) = rcu_dereference_check( \ |
| 281 | (cgrp)->subsys[(ssid)], \ | 354 | (cgrp)->subsys[(ssid)], \ |
| 282 | lockdep_is_held(&cgroup_tree_mutex) || \ | ||
| 283 | lockdep_is_held(&cgroup_mutex)))) { } \ | 355 | lockdep_is_held(&cgroup_mutex)))) { } \ |
| 284 | else | 356 | else |
| 285 | 357 | ||
| 286 | /** | 358 | /** |
| 359 | * for_each_e_css - iterate all effective css's of a cgroup | ||
| 360 | * @css: the iteration cursor | ||
| 361 | * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end | ||
| 362 | * @cgrp: the target cgroup to iterate css's of | ||
| 363 | * | ||
| 364 | * Should be called under cgroup_[tree_]mutex. | ||
| 365 | */ | ||
| 366 | #define for_each_e_css(css, ssid, cgrp) \ | ||
| 367 | for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ | ||
| 368 | if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ | ||
| 369 | ; \ | ||
| 370 | else | ||
| 371 | |||
| 372 | /** | ||
| 287 | * for_each_subsys - iterate all enabled cgroup subsystems | 373 | * for_each_subsys - iterate all enabled cgroup subsystems |
| 288 | * @ss: the iteration cursor | 374 | * @ss: the iteration cursor |
| 289 | * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end | 375 | * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end |
| @@ -296,22 +382,13 @@ static int notify_on_release(const struct cgroup *cgrp) | |||
| 296 | #define for_each_root(root) \ | 382 | #define for_each_root(root) \ |
| 297 | list_for_each_entry((root), &cgroup_roots, root_list) | 383 | list_for_each_entry((root), &cgroup_roots, root_list) |
| 298 | 384 | ||
| 299 | /** | 385 | /* iterate over child cgrps, lock should be held throughout iteration */ |
| 300 | * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. | 386 | #define cgroup_for_each_live_child(child, cgrp) \ |
| 301 | * @cgrp: the cgroup to be checked for liveness | 387 | list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ |
| 302 | * | 388 | if (({ lockdep_assert_held(&cgroup_mutex); \ |
| 303 | * On success, returns true; the mutex should be later unlocked. On | 389 | cgroup_is_dead(child); })) \ |
| 304 | * failure returns false with no lock held. | 390 | ; \ |
| 305 | */ | 391 | else |
| 306 | static bool cgroup_lock_live_group(struct cgroup *cgrp) | ||
| 307 | { | ||
| 308 | mutex_lock(&cgroup_mutex); | ||
| 309 | if (cgroup_is_dead(cgrp)) { | ||
| 310 | mutex_unlock(&cgroup_mutex); | ||
| 311 | return false; | ||
| 312 | } | ||
| 313 | return true; | ||
| 314 | } | ||
| 315 | 392 | ||
| 316 | /* the list of cgroups eligible for automatic release. Protected by | 393 | /* the list of cgroups eligible for automatic release. Protected by |
| 317 | * release_list_lock */ | 394 | * release_list_lock */ |
| @@ -348,7 +425,7 @@ struct cgrp_cset_link { | |||
| 348 | * reference-counted, to improve performance when child cgroups | 425 | * reference-counted, to improve performance when child cgroups |
| 349 | * haven't been created. | 426 | * haven't been created. |
| 350 | */ | 427 | */ |
| 351 | static struct css_set init_css_set = { | 428 | struct css_set init_css_set = { |
| 352 | .refcount = ATOMIC_INIT(1), | 429 | .refcount = ATOMIC_INIT(1), |
| 353 | .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), | 430 | .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), |
| 354 | .tasks = LIST_HEAD_INIT(init_css_set.tasks), | 431 | .tasks = LIST_HEAD_INIT(init_css_set.tasks), |
| @@ -359,6 +436,43 @@ static struct css_set init_css_set = { | |||
| 359 | 436 | ||
| 360 | static int css_set_count = 1; /* 1 for init_css_set */ | 437 | static int css_set_count = 1; /* 1 for init_css_set */ |
| 361 | 438 | ||
| 439 | /** | ||
| 440 | * cgroup_update_populated - updated populated count of a cgroup | ||
| 441 | * @cgrp: the target cgroup | ||
| 442 | * @populated: inc or dec populated count | ||
| 443 | * | ||
| 444 | * @cgrp is either getting the first task (css_set) or losing the last. | ||
| 445 | * Update @cgrp->populated_cnt accordingly. The count is propagated | ||
| 446 | * towards root so that a given cgroup's populated_cnt is zero iff the | ||
| 447 | * cgroup and all its descendants are empty. | ||
| 448 | * | ||
| 449 | * @cgrp's interface file "cgroup.populated" is zero if | ||
| 450 | * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt | ||
| 451 | * changes from or to zero, userland is notified that the content of the | ||
| 452 | * interface file has changed. This can be used to detect when @cgrp and | ||
| 453 | * its descendants become populated or empty. | ||
| 454 | */ | ||
| 455 | static void cgroup_update_populated(struct cgroup *cgrp, bool populated) | ||
| 456 | { | ||
| 457 | lockdep_assert_held(&css_set_rwsem); | ||
| 458 | |||
| 459 | do { | ||
| 460 | bool trigger; | ||
| 461 | |||
| 462 | if (populated) | ||
| 463 | trigger = !cgrp->populated_cnt++; | ||
| 464 | else | ||
| 465 | trigger = !--cgrp->populated_cnt; | ||
| 466 | |||
| 467 | if (!trigger) | ||
| 468 | break; | ||
| 469 | |||
| 470 | if (cgrp->populated_kn) | ||
| 471 | kernfs_notify(cgrp->populated_kn); | ||
| 472 | cgrp = cgroup_parent(cgrp); | ||
| 473 | } while (cgrp); | ||
| 474 | } | ||
| 475 | |||
| 362 | /* | 476 | /* |
| 363 | * hash table for cgroup groups. This improves the performance to find | 477 | * hash table for cgroup groups. This improves the performance to find |
| 364 | * an existing css_set. This hash doesn't (currently) take into | 478 | * an existing css_set. This hash doesn't (currently) take into |
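cgroup_update_populated() is the bookkeeping behind the "cgroup.populated" interface file described in the comment above: the counter is propagated up the ancestry, and kernfs_notify() fires whenever it crosses zero, waking up anyone polling the file. A rough userspace sketch of consuming that notification follows; the mount point and group path are assumptions, and later kernels expose the same state through "cgroup.events".

    /* Userspace sketch: wait until a cgroup and its descendants become empty
     * by polling "cgroup.populated". The path below is only an example. */
    #include <fcntl.h>
    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/sys/fs/cgroup/mygroup/cgroup.populated", O_RDONLY);
            if (fd < 0)
                    return 1;

            for (;;) {
                    struct pollfd pfd = { .fd = fd, .events = POLLPRI };
                    char buf[8] = "";

                    if (pread(fd, buf, sizeof(buf) - 1, 0) <= 0)
                            break;
                    if (buf[0] == '0') {            /* populated_cnt hit zero */
                            puts("group is empty");
                            break;
                    }
                    poll(&pfd, 1, -1);              /* woken by kernfs_notify() */
            }
            close(fd);
            return 0;
    }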
| @@ -383,6 +497,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) | |||
| 383 | static void put_css_set_locked(struct css_set *cset, bool taskexit) | 497 | static void put_css_set_locked(struct css_set *cset, bool taskexit) |
| 384 | { | 498 | { |
| 385 | struct cgrp_cset_link *link, *tmp_link; | 499 | struct cgrp_cset_link *link, *tmp_link; |
| 500 | struct cgroup_subsys *ss; | ||
| 501 | int ssid; | ||
| 386 | 502 | ||
| 387 | lockdep_assert_held(&css_set_rwsem); | 503 | lockdep_assert_held(&css_set_rwsem); |
| 388 | 504 | ||
| @@ -390,6 +506,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) | |||
| 390 | return; | 506 | return; |
| 391 | 507 | ||
| 392 | /* This css_set is dead. unlink it and release cgroup refcounts */ | 508 | /* This css_set is dead. unlink it and release cgroup refcounts */ |
| 509 | for_each_subsys(ss, ssid) | ||
| 510 | list_del(&cset->e_cset_node[ssid]); | ||
| 393 | hash_del(&cset->hlist); | 511 | hash_del(&cset->hlist); |
| 394 | css_set_count--; | 512 | css_set_count--; |
| 395 | 513 | ||
| @@ -400,10 +518,13 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) | |||
| 400 | list_del(&link->cgrp_link); | 518 | list_del(&link->cgrp_link); |
| 401 | 519 | ||
| 402 | /* @cgrp can't go away while we're holding css_set_rwsem */ | 520 | /* @cgrp can't go away while we're holding css_set_rwsem */ |
| 403 | if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { | 521 | if (list_empty(&cgrp->cset_links)) { |
| 404 | if (taskexit) | 522 | cgroup_update_populated(cgrp, false); |
| 405 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 523 | if (notify_on_release(cgrp)) { |
| 406 | check_for_release(cgrp); | 524 | if (taskexit) |
| 525 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | ||
| 526 | check_for_release(cgrp); | ||
| 527 | } | ||
| 407 | } | 528 | } |
| 408 | 529 | ||
| 409 | kfree(link); | 530 | kfree(link); |
| @@ -452,20 +573,20 @@ static bool compare_css_sets(struct css_set *cset, | |||
| 452 | { | 573 | { |
| 453 | struct list_head *l1, *l2; | 574 | struct list_head *l1, *l2; |
| 454 | 575 | ||
| 455 | if (memcmp(template, cset->subsys, sizeof(cset->subsys))) { | 576 | /* |
| 456 | /* Not all subsystems matched */ | 577 | * On the default hierarchy, there can be csets which are |
| 578 | * associated with the same set of cgroups but different csses. | ||
| 579 | * Let's first ensure that csses match. | ||
| 580 | */ | ||
| 581 | if (memcmp(template, cset->subsys, sizeof(cset->subsys))) | ||
| 457 | return false; | 582 | return false; |
| 458 | } | ||
| 459 | 583 | ||
| 460 | /* | 584 | /* |
| 461 | * Compare cgroup pointers in order to distinguish between | 585 | * Compare cgroup pointers in order to distinguish between |
| 462 | * different cgroups in heirarchies with no subsystems. We | 586 | * different cgroups in hierarchies. As different cgroups may |
| 463 | * could get by with just this check alone (and skip the | 587 | * share the same effective css, this comparison is always |
| 464 | * memcmp above) but on most setups the memcmp check will | 588 | * necessary. |
| 465 | * avoid the need for this more expensive check on almost all | ||
| 466 | * candidates. | ||
| 467 | */ | 589 | */ |
| 468 | |||
| 469 | l1 = &cset->cgrp_links; | 590 | l1 = &cset->cgrp_links; |
| 470 | l2 = &old_cset->cgrp_links; | 591 | l2 = &old_cset->cgrp_links; |
| 471 | while (1) { | 592 | while (1) { |
| @@ -529,14 +650,17 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, | |||
| 529 | * won't change, so no need for locking. | 650 | * won't change, so no need for locking. |
| 530 | */ | 651 | */ |
| 531 | for_each_subsys(ss, i) { | 652 | for_each_subsys(ss, i) { |
| 532 | if (root->cgrp.subsys_mask & (1UL << i)) { | 653 | if (root->subsys_mask & (1UL << i)) { |
| 533 | /* Subsystem is in this hierarchy. So we want | 654 | /* |
| 534 | * the subsystem state from the new | 655 | * @ss is in this hierarchy, so we want the |
| 535 | * cgroup */ | 656 | * effective css from @cgrp. |
| 536 | template[i] = cgroup_css(cgrp, ss); | 657 | */ |
| 658 | template[i] = cgroup_e_css(cgrp, ss); | ||
| 537 | } else { | 659 | } else { |
| 538 | /* Subsystem is not in this hierarchy, so we | 660 | /* |
| 539 | * don't want to change the subsystem state */ | 661 | * @ss is not in this hierarchy, so we don't want |
| 662 | * to change the css. | ||
| 663 | */ | ||
| 540 | template[i] = old_cset->subsys[i]; | 664 | template[i] = old_cset->subsys[i]; |
| 541 | } | 665 | } |
| 542 | } | 666 | } |
| @@ -602,10 +726,18 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, | |||
| 602 | struct cgrp_cset_link *link; | 726 | struct cgrp_cset_link *link; |
| 603 | 727 | ||
| 604 | BUG_ON(list_empty(tmp_links)); | 728 | BUG_ON(list_empty(tmp_links)); |
| 729 | |||
| 730 | if (cgroup_on_dfl(cgrp)) | ||
| 731 | cset->dfl_cgrp = cgrp; | ||
| 732 | |||
| 605 | link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); | 733 | link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); |
| 606 | link->cset = cset; | 734 | link->cset = cset; |
| 607 | link->cgrp = cgrp; | 735 | link->cgrp = cgrp; |
| 736 | |||
| 737 | if (list_empty(&cgrp->cset_links)) | ||
| 738 | cgroup_update_populated(cgrp, true); | ||
| 608 | list_move(&link->cset_link, &cgrp->cset_links); | 739 | list_move(&link->cset_link, &cgrp->cset_links); |
| 740 | |||
| 609 | /* | 741 | /* |
| 610 | * Always add links to the tail of the list so that the list | 742 | * Always add links to the tail of the list so that the list |
| 611 | * is sorted by order of hierarchy creation | 743 | * is sorted by order of hierarchy creation |
| @@ -628,7 +760,9 @@ static struct css_set *find_css_set(struct css_set *old_cset, | |||
| 628 | struct css_set *cset; | 760 | struct css_set *cset; |
| 629 | struct list_head tmp_links; | 761 | struct list_head tmp_links; |
| 630 | struct cgrp_cset_link *link; | 762 | struct cgrp_cset_link *link; |
| 763 | struct cgroup_subsys *ss; | ||
| 631 | unsigned long key; | 764 | unsigned long key; |
| 765 | int ssid; | ||
| 632 | 766 | ||
| 633 | lockdep_assert_held(&cgroup_mutex); | 767 | lockdep_assert_held(&cgroup_mutex); |
| 634 | 768 | ||
| @@ -679,10 +813,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, | |||
| 679 | 813 | ||
| 680 | css_set_count++; | 814 | css_set_count++; |
| 681 | 815 | ||
| 682 | /* Add this cgroup group to the hash table */ | 816 | /* Add @cset to the hash table */ |
| 683 | key = css_set_hash(cset->subsys); | 817 | key = css_set_hash(cset->subsys); |
| 684 | hash_add(css_set_table, &cset->hlist, key); | 818 | hash_add(css_set_table, &cset->hlist, key); |
| 685 | 819 | ||
| 820 | for_each_subsys(ss, ssid) | ||
| 821 | list_add_tail(&cset->e_cset_node[ssid], | ||
| 822 | &cset->subsys[ssid]->cgroup->e_csets[ssid]); | ||
| 823 | |||
| 686 | up_write(&css_set_rwsem); | 824 | up_write(&css_set_rwsem); |
| 687 | 825 | ||
| 688 | return cset; | 826 | return cset; |
| @@ -735,14 +873,13 @@ static void cgroup_destroy_root(struct cgroup_root *root) | |||
| 735 | struct cgroup *cgrp = &root->cgrp; | 873 | struct cgroup *cgrp = &root->cgrp; |
| 736 | struct cgrp_cset_link *link, *tmp_link; | 874 | struct cgrp_cset_link *link, *tmp_link; |
| 737 | 875 | ||
| 738 | mutex_lock(&cgroup_tree_mutex); | ||
| 739 | mutex_lock(&cgroup_mutex); | 876 | mutex_lock(&cgroup_mutex); |
| 740 | 877 | ||
| 741 | BUG_ON(atomic_read(&root->nr_cgrps)); | 878 | BUG_ON(atomic_read(&root->nr_cgrps)); |
| 742 | BUG_ON(!list_empty(&cgrp->children)); | 879 | BUG_ON(!list_empty(&cgrp->self.children)); |
| 743 | 880 | ||
| 744 | /* Rebind all subsystems back to the default hierarchy */ | 881 | /* Rebind all subsystems back to the default hierarchy */ |
| 745 | rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); | 882 | rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); |
| 746 | 883 | ||
| 747 | /* | 884 | /* |
| 748 | * Release all the links from cset_links to this hierarchy's | 885 | * Release all the links from cset_links to this hierarchy's |
| @@ -765,7 +902,6 @@ static void cgroup_destroy_root(struct cgroup_root *root) | |||
| 765 | cgroup_exit_root_id(root); | 902 | cgroup_exit_root_id(root); |
| 766 | 903 | ||
| 767 | mutex_unlock(&cgroup_mutex); | 904 | mutex_unlock(&cgroup_mutex); |
| 768 | mutex_unlock(&cgroup_tree_mutex); | ||
| 769 | 905 | ||
| 770 | kernfs_destroy_root(root->kf_root); | 906 | kernfs_destroy_root(root->kf_root); |
| 771 | cgroup_free_root(root); | 907 | cgroup_free_root(root); |
| @@ -848,7 +984,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, | |||
| 848 | * update of a tasks cgroup pointer by cgroup_attach_task() | 984 | * update of a tasks cgroup pointer by cgroup_attach_task() |
| 849 | */ | 985 | */ |
| 850 | 986 | ||
| 851 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); | 987 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask); |
| 852 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops; | 988 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops; |
| 853 | static const struct file_operations proc_cgroupstats_operations; | 989 | static const struct file_operations proc_cgroupstats_operations; |
| 854 | 990 | ||
| @@ -883,79 +1019,95 @@ static umode_t cgroup_file_mode(const struct cftype *cft) | |||
| 883 | if (cft->read_u64 || cft->read_s64 || cft->seq_show) | 1019 | if (cft->read_u64 || cft->read_s64 || cft->seq_show) |
| 884 | mode |= S_IRUGO; | 1020 | mode |= S_IRUGO; |
| 885 | 1021 | ||
| 886 | if (cft->write_u64 || cft->write_s64 || cft->write_string || | 1022 | if (cft->write_u64 || cft->write_s64 || cft->write) |
| 887 | cft->trigger) | ||
| 888 | mode |= S_IWUSR; | 1023 | mode |= S_IWUSR; |
| 889 | 1024 | ||
| 890 | return mode; | 1025 | return mode; |
| 891 | } | 1026 | } |
| 892 | 1027 | ||
| 893 | static void cgroup_free_fn(struct work_struct *work) | 1028 | static void cgroup_get(struct cgroup *cgrp) |
| 894 | { | 1029 | { |
| 895 | struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); | 1030 | WARN_ON_ONCE(cgroup_is_dead(cgrp)); |
| 896 | 1031 | css_get(&cgrp->self); | |
| 897 | atomic_dec(&cgrp->root->nr_cgrps); | ||
| 898 | cgroup_pidlist_destroy_all(cgrp); | ||
| 899 | |||
| 900 | if (cgrp->parent) { | ||
| 901 | /* | ||
| 902 | * We get a ref to the parent, and put the ref when this | ||
| 903 | * cgroup is being freed, so it's guaranteed that the | ||
| 904 | * parent won't be destroyed before its children. | ||
| 905 | */ | ||
| 906 | cgroup_put(cgrp->parent); | ||
| 907 | kernfs_put(cgrp->kn); | ||
| 908 | kfree(cgrp); | ||
| 909 | } else { | ||
| 910 | /* | ||
| 911 | * This is root cgroup's refcnt reaching zero, which | ||
| 912 | * indicates that the root should be released. | ||
| 913 | */ | ||
| 914 | cgroup_destroy_root(cgrp->root); | ||
| 915 | } | ||
| 916 | } | 1032 | } |
| 917 | 1033 | ||
| 918 | static void cgroup_free_rcu(struct rcu_head *head) | 1034 | static void cgroup_put(struct cgroup *cgrp) |
| 919 | { | 1035 | { |
| 920 | struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); | 1036 | css_put(&cgrp->self); |
| 921 | |||
| 922 | INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); | ||
| 923 | queue_work(cgroup_destroy_wq, &cgrp->destroy_work); | ||
| 924 | } | 1037 | } |
| 925 | 1038 | ||
| 926 | static void cgroup_get(struct cgroup *cgrp) | 1039 | /** |
| 1040 | * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods | ||
| 1041 | * @kn: the kernfs_node being serviced | ||
| 1042 | * | ||
| 1043 | * This helper undoes cgroup_kn_lock_live() and should be invoked before | ||
| 1044 | * the method finishes if locking succeeded. Note that once this function | ||
| 1045 | * returns the cgroup returned by cgroup_kn_lock_live() may become | ||
| 1046 | * inaccessible any time. If the caller intends to continue to access the | ||
| 1047 | * cgroup, it should pin it before invoking this function. | ||
| 1048 | */ | ||
| 1049 | static void cgroup_kn_unlock(struct kernfs_node *kn) | ||
| 927 | { | 1050 | { |
| 928 | WARN_ON_ONCE(cgroup_is_dead(cgrp)); | 1051 | struct cgroup *cgrp; |
| 929 | WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); | 1052 | |
| 930 | atomic_inc(&cgrp->refcnt); | 1053 | if (kernfs_type(kn) == KERNFS_DIR) |
| 1054 | cgrp = kn->priv; | ||
| 1055 | else | ||
| 1056 | cgrp = kn->parent->priv; | ||
| 1057 | |||
| 1058 | mutex_unlock(&cgroup_mutex); | ||
| 1059 | |||
| 1060 | kernfs_unbreak_active_protection(kn); | ||
| 1061 | cgroup_put(cgrp); | ||
| 931 | } | 1062 | } |
| 932 | 1063 | ||
| 933 | static void cgroup_put(struct cgroup *cgrp) | 1064 | /** |
| 1065 | * cgroup_kn_lock_live - locking helper for cgroup kernfs methods | ||
| 1066 | * @kn: the kernfs_node being serviced | ||
| 1067 | * | ||
| 1068 | * This helper is to be used by a cgroup kernfs method currently servicing | ||
| 1069 | * @kn. It breaks the active protection, performs cgroup locking and | ||
| 1070 | * verifies that the associated cgroup is alive. Returns the cgroup if | ||
| 1071 | * alive; otherwise, %NULL. A successful return should be undone by a | ||
| 1072 | * matching cgroup_kn_unlock() invocation. | ||
| 1073 | * | ||
| 1074 | * Any cgroup kernfs method implementation which requires locking the | ||
| 1075 | * associated cgroup should use this helper. It avoids nesting cgroup | ||
| 1076 | * locking under kernfs active protection and allows all kernfs operations | ||
| 1077 | * including self-removal. | ||
| 1078 | */ | ||
| 1079 | static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) | ||
| 934 | { | 1080 | { |
| 935 | if (!atomic_dec_and_test(&cgrp->refcnt)) | 1081 | struct cgroup *cgrp; |
| 936 | return; | 1082 | |
| 937 | if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) | 1083 | if (kernfs_type(kn) == KERNFS_DIR) |
| 938 | return; | 1084 | cgrp = kn->priv; |
| 1085 | else | ||
| 1086 | cgrp = kn->parent->priv; | ||
| 939 | 1087 | ||
| 940 | /* | 1088 | /* |
| 941 | * XXX: cgrp->id is only used to look up css's. As cgroup and | 1089 | * We're gonna grab cgroup_mutex which nests outside kernfs |
| 942 | * css's lifetimes will be decoupled, it should be made | 1090 | * active_ref. cgroup liveliness check alone provides enough |
| 943 | * per-subsystem and moved to css->id so that lookups are | 1091 | * protection against removal. Ensure @cgrp stays accessible and |
| 944 | * successful until the target css is released. | 1092 | * break the active_ref protection. |
| 945 | */ | 1093 | */ |
| 1094 | cgroup_get(cgrp); | ||
| 1095 | kernfs_break_active_protection(kn); | ||
| 1096 | |||
| 946 | mutex_lock(&cgroup_mutex); | 1097 | mutex_lock(&cgroup_mutex); |
| 947 | idr_remove(&cgrp->root->cgroup_idr, cgrp->id); | ||
| 948 | mutex_unlock(&cgroup_mutex); | ||
| 949 | cgrp->id = -1; | ||
| 950 | 1098 | ||
| 951 | call_rcu(&cgrp->rcu_head, cgroup_free_rcu); | 1099 | if (!cgroup_is_dead(cgrp)) |
| 1100 | return cgrp; | ||
| 1101 | |||
| 1102 | cgroup_kn_unlock(kn); | ||
| 1103 | return NULL; | ||
| 952 | } | 1104 | } |
| 953 | 1105 | ||
| 954 | static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | 1106 | static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) |
| 955 | { | 1107 | { |
| 956 | char name[CGROUP_FILE_NAME_MAX]; | 1108 | char name[CGROUP_FILE_NAME_MAX]; |
| 957 | 1109 | ||
| 958 | lockdep_assert_held(&cgroup_tree_mutex); | 1110 | lockdep_assert_held(&cgroup_mutex); |
| 959 | kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); | 1111 | kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); |
| 960 | } | 1112 | } |
| 961 | 1113 | ||
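cgroup_kn_lock_live() and cgroup_kn_unlock() replace the old cgroup_lock_live_group() and encode the ordering the comments spell out: pin the cgroup, break kernfs active protection, then take cgroup_mutex and bail out if the cgroup has already been killed. A typical kernfs method inside cgroup.c would wrap its body as in the sketch below; my_cgroup_write() and its return-value handling are invented, only the two helpers come from this patch.

    /* Illustrative kernfs write handler built around the helpers added above.
     * my_cgroup_write() is a made-up example, not part of the patch. */
    static ssize_t my_cgroup_write(struct kernfs_open_file *of, char *buf,
                                   size_t nbytes, loff_t off)
    {
            struct cgroup *cgrp;
            int ret = 0;

            cgrp = cgroup_kn_lock_live(of->kn);   /* NULL if @cgrp is being removed */
            if (!cgrp)
                    return -ENODEV;

            /* ... work on @cgrp under cgroup_mutex; self-removal is now safe ... */

            cgroup_kn_unlock(of->kn);             /* @cgrp may go away after this */
            return ret ?: nbytes;
    }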
| @@ -964,7 +1116,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | |||
| 964 | * @cgrp: target cgroup | 1116 | * @cgrp: target cgroup |
| 965 | * @subsys_mask: mask of the subsystem ids whose files should be removed | 1117 | * @subsys_mask: mask of the subsystem ids whose files should be removed |
| 966 | */ | 1118 | */ |
| 967 | static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) | 1119 | static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask) |
| 968 | { | 1120 | { |
| 969 | struct cgroup_subsys *ss; | 1121 | struct cgroup_subsys *ss; |
| 970 | int i; | 1122 | int i; |
| @@ -972,40 +1124,40 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) | |||
| 972 | for_each_subsys(ss, i) { | 1124 | for_each_subsys(ss, i) { |
| 973 | struct cftype *cfts; | 1125 | struct cftype *cfts; |
| 974 | 1126 | ||
| 975 | if (!test_bit(i, &subsys_mask)) | 1127 | if (!(subsys_mask & (1 << i))) |
| 976 | continue; | 1128 | continue; |
| 977 | list_for_each_entry(cfts, &ss->cfts, node) | 1129 | list_for_each_entry(cfts, &ss->cfts, node) |
| 978 | cgroup_addrm_files(cgrp, cfts, false); | 1130 | cgroup_addrm_files(cgrp, cfts, false); |
| 979 | } | 1131 | } |
| 980 | } | 1132 | } |
| 981 | 1133 | ||
| 982 | static int rebind_subsystems(struct cgroup_root *dst_root, | 1134 | static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask) |
| 983 | unsigned long ss_mask) | ||
| 984 | { | 1135 | { |
| 985 | struct cgroup_subsys *ss; | 1136 | struct cgroup_subsys *ss; |
| 986 | int ssid, ret; | 1137 | unsigned int tmp_ss_mask; |
| 1138 | int ssid, i, ret; | ||
| 987 | 1139 | ||
| 988 | lockdep_assert_held(&cgroup_tree_mutex); | ||
| 989 | lockdep_assert_held(&cgroup_mutex); | 1140 | lockdep_assert_held(&cgroup_mutex); |
| 990 | 1141 | ||
| 991 | for_each_subsys(ss, ssid) { | 1142 | for_each_subsys(ss, ssid) { |
| 992 | if (!(ss_mask & (1 << ssid))) | 1143 | if (!(ss_mask & (1 << ssid))) |
| 993 | continue; | 1144 | continue; |
| 994 | 1145 | ||
| 995 | /* if @ss is on the dummy_root, we can always move it */ | 1146 | /* if @ss has non-root csses attached to it, can't move */ |
| 996 | if (ss->root == &cgrp_dfl_root) | 1147 | if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) |
| 997 | continue; | ||
| 998 | |||
| 999 | /* if @ss has non-root cgroups attached to it, can't move */ | ||
| 1000 | if (!list_empty(&ss->root->cgrp.children)) | ||
| 1001 | return -EBUSY; | 1148 | return -EBUSY; |
| 1002 | 1149 | ||
| 1003 | /* can't move between two non-dummy roots either */ | 1150 | /* can't move between two non-dummy roots either */ |
| 1004 | if (dst_root != &cgrp_dfl_root) | 1151 | if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) |
| 1005 | return -EBUSY; | 1152 | return -EBUSY; |
| 1006 | } | 1153 | } |
| 1007 | 1154 | ||
| 1008 | ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); | 1155 | /* skip creating root files on dfl_root for inhibited subsystems */ |
| 1156 | tmp_ss_mask = ss_mask; | ||
| 1157 | if (dst_root == &cgrp_dfl_root) | ||
| 1158 | tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; | ||
| 1159 | |||
| 1160 | ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask); | ||
| 1009 | if (ret) { | 1161 | if (ret) { |
| 1010 | if (dst_root != &cgrp_dfl_root) | 1162 | if (dst_root != &cgrp_dfl_root) |
| 1011 | return ret; | 1163 | return ret; |
| @@ -1017,9 +1169,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root, | |||
| 1017 | * Just warn about it and continue. | 1169 | * Just warn about it and continue. |
| 1018 | */ | 1170 | */ |
| 1019 | if (cgrp_dfl_root_visible) { | 1171 | if (cgrp_dfl_root_visible) { |
| 1020 | pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", | 1172 | pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n", |
| 1021 | ret, ss_mask); | 1173 | ret, ss_mask); |
| 1022 | pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); | 1174 | pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); |
| 1023 | } | 1175 | } |
| 1024 | } | 1176 | } |
| 1025 | 1177 | ||
| @@ -1027,15 +1179,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root, | |||
| 1027 | * Nothing can fail from this point on. Remove files for the | 1179 | * Nothing can fail from this point on. Remove files for the |
| 1028 | * removed subsystems and rebind each subsystem. | 1180 | * removed subsystems and rebind each subsystem. |
| 1029 | */ | 1181 | */ |
| 1030 | mutex_unlock(&cgroup_mutex); | ||
| 1031 | for_each_subsys(ss, ssid) | 1182 | for_each_subsys(ss, ssid) |
| 1032 | if (ss_mask & (1 << ssid)) | 1183 | if (ss_mask & (1 << ssid)) |
| 1033 | cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); | 1184 | cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); |
| 1034 | mutex_lock(&cgroup_mutex); | ||
| 1035 | 1185 | ||
| 1036 | for_each_subsys(ss, ssid) { | 1186 | for_each_subsys(ss, ssid) { |
| 1037 | struct cgroup_root *src_root; | 1187 | struct cgroup_root *src_root; |
| 1038 | struct cgroup_subsys_state *css; | 1188 | struct cgroup_subsys_state *css; |
| 1189 | struct css_set *cset; | ||
| 1039 | 1190 | ||
| 1040 | if (!(ss_mask & (1 << ssid))) | 1191 | if (!(ss_mask & (1 << ssid))) |
| 1041 | continue; | 1192 | continue; |
| @@ -1050,8 +1201,19 @@ static int rebind_subsystems(struct cgroup_root *dst_root, | |||
| 1050 | ss->root = dst_root; | 1201 | ss->root = dst_root; |
| 1051 | css->cgroup = &dst_root->cgrp; | 1202 | css->cgroup = &dst_root->cgrp; |
| 1052 | 1203 | ||
| 1053 | src_root->cgrp.subsys_mask &= ~(1 << ssid); | 1204 | down_write(&css_set_rwsem); |
| 1054 | dst_root->cgrp.subsys_mask |= 1 << ssid; | 1205 | hash_for_each(css_set_table, i, cset, hlist) |
| 1206 | list_move_tail(&cset->e_cset_node[ss->id], | ||
| 1207 | &dst_root->cgrp.e_csets[ss->id]); | ||
| 1208 | up_write(&css_set_rwsem); | ||
| 1209 | |||
| 1210 | src_root->subsys_mask &= ~(1 << ssid); | ||
| 1211 | src_root->cgrp.child_subsys_mask &= ~(1 << ssid); | ||
| 1212 | |||
| 1213 | /* default hierarchy doesn't enable controllers by default */ | ||
| 1214 | dst_root->subsys_mask |= 1 << ssid; | ||
| 1215 | if (dst_root != &cgrp_dfl_root) | ||
| 1216 | dst_root->cgrp.child_subsys_mask |= 1 << ssid; | ||
| 1055 | 1217 | ||
| 1056 | if (ss->bind) | 1218 | if (ss->bind) |
| 1057 | ss->bind(css); | 1219 | ss->bind(css); |
| @@ -1069,7 +1231,7 @@ static int cgroup_show_options(struct seq_file *seq, | |||
| 1069 | int ssid; | 1231 | int ssid; |
| 1070 | 1232 | ||
| 1071 | for_each_subsys(ss, ssid) | 1233 | for_each_subsys(ss, ssid) |
| 1072 | if (root->cgrp.subsys_mask & (1 << ssid)) | 1234 | if (root->subsys_mask & (1 << ssid)) |
| 1073 | seq_printf(seq, ",%s", ss->name); | 1235 | seq_printf(seq, ",%s", ss->name); |
| 1074 | if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) | 1236 | if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) |
| 1075 | seq_puts(seq, ",sane_behavior"); | 1237 | seq_puts(seq, ",sane_behavior"); |
| @@ -1091,8 +1253,8 @@ static int cgroup_show_options(struct seq_file *seq, | |||
| 1091 | } | 1253 | } |
| 1092 | 1254 | ||
| 1093 | struct cgroup_sb_opts { | 1255 | struct cgroup_sb_opts { |
| 1094 | unsigned long subsys_mask; | 1256 | unsigned int subsys_mask; |
| 1095 | unsigned long flags; | 1257 | unsigned int flags; |
| 1096 | char *release_agent; | 1258 | char *release_agent; |
| 1097 | bool cpuset_clone_children; | 1259 | bool cpuset_clone_children; |
| 1098 | char *name; | 1260 | char *name; |
| @@ -1100,24 +1262,16 @@ struct cgroup_sb_opts { | |||
| 1100 | bool none; | 1262 | bool none; |
| 1101 | }; | 1263 | }; |
| 1102 | 1264 | ||
| 1103 | /* | ||
| 1104 | * Convert a hierarchy specifier into a bitmask of subsystems and | ||
| 1105 | * flags. Call with cgroup_mutex held to protect the cgroup_subsys[] | ||
| 1106 | * array. This function takes refcounts on subsystems to be used, unless it | ||
| 1107 | * returns error, in which case no refcounts are taken. | ||
| 1108 | */ | ||
| 1109 | static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | 1265 | static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) |
| 1110 | { | 1266 | { |
| 1111 | char *token, *o = data; | 1267 | char *token, *o = data; |
| 1112 | bool all_ss = false, one_ss = false; | 1268 | bool all_ss = false, one_ss = false; |
| 1113 | unsigned long mask = (unsigned long)-1; | 1269 | unsigned int mask = -1U; |
| 1114 | struct cgroup_subsys *ss; | 1270 | struct cgroup_subsys *ss; |
| 1115 | int i; | 1271 | int i; |
| 1116 | 1272 | ||
| 1117 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); | ||
| 1118 | |||
| 1119 | #ifdef CONFIG_CPUSETS | 1273 | #ifdef CONFIG_CPUSETS |
| 1120 | mask = ~(1UL << cpuset_cgrp_id); | 1274 | mask = ~(1U << cpuset_cgrp_id); |
| 1121 | #endif | 1275 | #endif |
| 1122 | 1276 | ||
| 1123 | memset(opts, 0, sizeof(*opts)); | 1277 | memset(opts, 0, sizeof(*opts)); |
| @@ -1198,7 +1352,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
| 1198 | /* Mutually exclusive option 'all' + subsystem name */ | 1352 | /* Mutually exclusive option 'all' + subsystem name */ |
| 1199 | if (all_ss) | 1353 | if (all_ss) |
| 1200 | return -EINVAL; | 1354 | return -EINVAL; |
| 1201 | set_bit(i, &opts->subsys_mask); | 1355 | opts->subsys_mask |= (1 << i); |
| 1202 | one_ss = true; | 1356 | one_ss = true; |
| 1203 | 1357 | ||
| 1204 | break; | 1358 | break; |
| @@ -1210,12 +1364,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
| 1210 | /* Consistency checks */ | 1364 | /* Consistency checks */ |
| 1211 | 1365 | ||
| 1212 | if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { | 1366 | if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { |
| 1213 | pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); | 1367 | pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); |
| 1214 | 1368 | ||
| 1215 | if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || | 1369 | if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || |
| 1216 | opts->cpuset_clone_children || opts->release_agent || | 1370 | opts->cpuset_clone_children || opts->release_agent || |
| 1217 | opts->name) { | 1371 | opts->name) { |
| 1218 | pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); | 1372 | pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); |
| 1219 | return -EINVAL; | 1373 | return -EINVAL; |
| 1220 | } | 1374 | } |
| 1221 | } else { | 1375 | } else { |
| @@ -1227,7 +1381,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
| 1227 | if (all_ss || (!one_ss && !opts->none && !opts->name)) | 1381 | if (all_ss || (!one_ss && !opts->none && !opts->name)) |
| 1228 | for_each_subsys(ss, i) | 1382 | for_each_subsys(ss, i) |
| 1229 | if (!ss->disabled) | 1383 | if (!ss->disabled) |
| 1230 | set_bit(i, &opts->subsys_mask); | 1384 | opts->subsys_mask |= (1 << i); |
| 1231 | 1385 | ||
| 1232 | /* | 1386 | /* |
| 1233 | * We either have to specify by name or by subsystems. (So | 1387 | * We either have to specify by name or by subsystems. (So |
| @@ -1258,14 +1412,13 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) | |||
| 1258 | int ret = 0; | 1412 | int ret = 0; |
| 1259 | struct cgroup_root *root = cgroup_root_from_kf(kf_root); | 1413 | struct cgroup_root *root = cgroup_root_from_kf(kf_root); |
| 1260 | struct cgroup_sb_opts opts; | 1414 | struct cgroup_sb_opts opts; |
| 1261 | unsigned long added_mask, removed_mask; | 1415 | unsigned int added_mask, removed_mask; |
| 1262 | 1416 | ||
| 1263 | if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { | 1417 | if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { |
| 1264 | pr_err("cgroup: sane_behavior: remount is not allowed\n"); | 1418 | pr_err("sane_behavior: remount is not allowed\n"); |
| 1265 | return -EINVAL; | 1419 | return -EINVAL; |
| 1266 | } | 1420 | } |
| 1267 | 1421 | ||
| 1268 | mutex_lock(&cgroup_tree_mutex); | ||
| 1269 | mutex_lock(&cgroup_mutex); | 1422 | mutex_lock(&cgroup_mutex); |
| 1270 | 1423 | ||
| 1271 | /* See what subsystems are wanted */ | 1424 | /* See what subsystems are wanted */ |
| @@ -1273,17 +1426,17 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) | |||
| 1273 | if (ret) | 1426 | if (ret) |
| 1274 | goto out_unlock; | 1427 | goto out_unlock; |
| 1275 | 1428 | ||
| 1276 | if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) | 1429 | if (opts.subsys_mask != root->subsys_mask || opts.release_agent) |
| 1277 | pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", | 1430 | pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", |
| 1278 | task_tgid_nr(current), current->comm); | 1431 | task_tgid_nr(current), current->comm); |
| 1279 | 1432 | ||
| 1280 | added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; | 1433 | added_mask = opts.subsys_mask & ~root->subsys_mask; |
| 1281 | removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; | 1434 | removed_mask = root->subsys_mask & ~opts.subsys_mask; |
| 1282 | 1435 | ||
| 1283 | /* Don't allow flags or name to change at remount */ | 1436 | /* Don't allow flags or name to change at remount */ |
| 1284 | if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || | 1437 | if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || |
| 1285 | (opts.name && strcmp(opts.name, root->name))) { | 1438 | (opts.name && strcmp(opts.name, root->name))) { |
| 1286 | pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", | 1439 | pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", |
| 1287 | opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", | 1440 | opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", |
| 1288 | root->flags & CGRP_ROOT_OPTION_MASK, root->name); | 1441 | root->flags & CGRP_ROOT_OPTION_MASK, root->name); |
| 1289 | ret = -EINVAL; | 1442 | ret = -EINVAL; |
| @@ -1291,7 +1444,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) | |||
| 1291 | } | 1444 | } |
| 1292 | 1445 | ||
| 1293 | /* remounting is not allowed for populated hierarchies */ | 1446 | /* remounting is not allowed for populated hierarchies */ |
| 1294 | if (!list_empty(&root->cgrp.children)) { | 1447 | if (!list_empty(&root->cgrp.self.children)) { |
| 1295 | ret = -EBUSY; | 1448 | ret = -EBUSY; |
| 1296 | goto out_unlock; | 1449 | goto out_unlock; |
| 1297 | } | 1450 | } |
| @@ -1311,7 +1464,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) | |||
| 1311 | kfree(opts.release_agent); | 1464 | kfree(opts.release_agent); |
| 1312 | kfree(opts.name); | 1465 | kfree(opts.name); |
| 1313 | mutex_unlock(&cgroup_mutex); | 1466 | mutex_unlock(&cgroup_mutex); |
| 1314 | mutex_unlock(&cgroup_tree_mutex); | ||
| 1315 | return ret; | 1467 | return ret; |
| 1316 | } | 1468 | } |
| 1317 | 1469 | ||
| @@ -1369,14 +1521,22 @@ out_unlock: | |||
| 1369 | 1521 | ||
| 1370 | static void init_cgroup_housekeeping(struct cgroup *cgrp) | 1522 | static void init_cgroup_housekeeping(struct cgroup *cgrp) |
| 1371 | { | 1523 | { |
| 1372 | atomic_set(&cgrp->refcnt, 1); | 1524 | struct cgroup_subsys *ss; |
| 1373 | INIT_LIST_HEAD(&cgrp->sibling); | 1525 | int ssid; |
| 1374 | INIT_LIST_HEAD(&cgrp->children); | 1526 | |
| 1527 | INIT_LIST_HEAD(&cgrp->self.sibling); | ||
| 1528 | INIT_LIST_HEAD(&cgrp->self.children); | ||
| 1375 | INIT_LIST_HEAD(&cgrp->cset_links); | 1529 | INIT_LIST_HEAD(&cgrp->cset_links); |
| 1376 | INIT_LIST_HEAD(&cgrp->release_list); | 1530 | INIT_LIST_HEAD(&cgrp->release_list); |
| 1377 | INIT_LIST_HEAD(&cgrp->pidlists); | 1531 | INIT_LIST_HEAD(&cgrp->pidlists); |
| 1378 | mutex_init(&cgrp->pidlist_mutex); | 1532 | mutex_init(&cgrp->pidlist_mutex); |
| 1379 | cgrp->dummy_css.cgroup = cgrp; | 1533 | cgrp->self.cgroup = cgrp; |
| 1534 | cgrp->self.flags |= CSS_ONLINE; | ||
| 1535 | |||
| 1536 | for_each_subsys(ss, ssid) | ||
| 1537 | INIT_LIST_HEAD(&cgrp->e_csets[ssid]); | ||
| 1538 | |||
| 1539 | init_waitqueue_head(&cgrp->offline_waitq); | ||
| 1380 | } | 1540 | } |
| 1381 | 1541 | ||
| 1382 | static void init_cgroup_root(struct cgroup_root *root, | 1542 | static void init_cgroup_root(struct cgroup_root *root, |
| @@ -1399,21 +1559,24 @@ static void init_cgroup_root(struct cgroup_root *root, | |||
| 1399 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); | 1559 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); |
| 1400 | } | 1560 | } |
| 1401 | 1561 | ||
| 1402 | static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) | 1562 | static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) |
| 1403 | { | 1563 | { |
| 1404 | LIST_HEAD(tmp_links); | 1564 | LIST_HEAD(tmp_links); |
| 1405 | struct cgroup *root_cgrp = &root->cgrp; | 1565 | struct cgroup *root_cgrp = &root->cgrp; |
| 1406 | struct css_set *cset; | 1566 | struct css_set *cset; |
| 1407 | int i, ret; | 1567 | int i, ret; |
| 1408 | 1568 | ||
| 1409 | lockdep_assert_held(&cgroup_tree_mutex); | ||
| 1410 | lockdep_assert_held(&cgroup_mutex); | 1569 | lockdep_assert_held(&cgroup_mutex); |
| 1411 | 1570 | ||
| 1412 | ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); | 1571 | ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); |
| 1413 | if (ret < 0) | 1572 | if (ret < 0) |
| 1414 | goto out; | 1573 | goto out; |
| 1415 | root_cgrp->id = ret; | 1574 | root_cgrp->id = ret; |
| 1416 | 1575 | ||
| 1576 | ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); | ||
| 1577 | if (ret) | ||
| 1578 | goto out; | ||
| 1579 | |||
| 1417 | /* | 1580 | /* |
| 1418 | * We're accessing css_set_count without locking css_set_rwsem here, | 1581 | * We're accessing css_set_count without locking css_set_rwsem here, |
| 1419 | * but that's OK - it can only be increased by someone holding | 1582 | * but that's OK - it can only be increased by someone holding |
| @@ -1422,11 +1585,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) | |||
| 1422 | */ | 1585 | */ |
| 1423 | ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); | 1586 | ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); |
| 1424 | if (ret) | 1587 | if (ret) |
| 1425 | goto out; | 1588 | goto cancel_ref; |
| 1426 | 1589 | ||
| 1427 | ret = cgroup_init_root_id(root); | 1590 | ret = cgroup_init_root_id(root); |
| 1428 | if (ret) | 1591 | if (ret) |
| 1429 | goto out; | 1592 | goto cancel_ref; |
| 1430 | 1593 | ||
| 1431 | root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, | 1594 | root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, |
| 1432 | KERNFS_ROOT_CREATE_DEACTIVATED, | 1595 | KERNFS_ROOT_CREATE_DEACTIVATED, |
| @@ -1462,7 +1625,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) | |||
| 1462 | link_css_set(&tmp_links, cset, root_cgrp); | 1625 | link_css_set(&tmp_links, cset, root_cgrp); |
| 1463 | up_write(&css_set_rwsem); | 1626 | up_write(&css_set_rwsem); |
| 1464 | 1627 | ||
| 1465 | BUG_ON(!list_empty(&root_cgrp->children)); | 1628 | BUG_ON(!list_empty(&root_cgrp->self.children)); |
| 1466 | BUG_ON(atomic_read(&root->nr_cgrps) != 1); | 1629 | BUG_ON(atomic_read(&root->nr_cgrps) != 1); |
| 1467 | 1630 | ||
| 1468 | kernfs_activate(root_cgrp->kn); | 1631 | kernfs_activate(root_cgrp->kn); |
| @@ -1474,6 +1637,8 @@ destroy_root: | |||
| 1474 | root->kf_root = NULL; | 1637 | root->kf_root = NULL; |
| 1475 | exit_root_id: | 1638 | exit_root_id: |
| 1476 | cgroup_exit_root_id(root); | 1639 | cgroup_exit_root_id(root); |
| 1640 | cancel_ref: | ||
| 1641 | percpu_ref_cancel_init(&root_cgrp->self.refcnt); | ||
| 1477 | out: | 1642 | out: |
| 1478 | free_cgrp_cset_links(&tmp_links); | 1643 | free_cgrp_cset_links(&tmp_links); |
| 1479 | return ret; | 1644 | return ret; |
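
cgroup_setup_root() gains a cancel_ref unwind step for the newly added percpu_ref_init(). A small stand-alone C sketch of that goto-based staged cleanup pattern, with stub init/undo functions standing in for the kernel calls:

#include <stdio.h>

static int step(const char *what, int fail)
{
	if (fail) {
		printf("init %s -> failed\n", what);
		return -1;
	}
	printf("init %s -> ok\n", what);
	return 0;
}

static void undo(const char *what)
{
	printf("undo %s\n", what);
}

/* each failure unwinds exactly the steps that already succeeded */
static int setup(int fail_at)
{
	int ret;

	ret = step("idr slot", fail_at == 0);
	if (ret)
		goto out;
	ret = step("refcnt", fail_at == 1);
	if (ret)
		goto free_id;
	ret = step("cset links", fail_at == 2);
	if (ret)
		goto cancel_ref;
	ret = step("kernfs root", fail_at == 3);
	if (ret)
		goto free_links;
	return 0;

free_links:
	undo("cset links");
cancel_ref:
	undo("refcnt");
free_id:
	undo("idr slot");
out:
	return ret;
}

int main(void)
{
	setup(2);	/* fails at the third step and unwinds the first two */
	return 0;
}
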
| @@ -1495,8 +1660,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1495 | */ | 1660 | */ |
| 1496 | if (!use_task_css_set_links) | 1661 | if (!use_task_css_set_links) |
| 1497 | cgroup_enable_task_cg_lists(); | 1662 | cgroup_enable_task_cg_lists(); |
| 1498 | retry: | 1663 | |
| 1499 | mutex_lock(&cgroup_tree_mutex); | ||
| 1500 | mutex_lock(&cgroup_mutex); | 1664 | mutex_lock(&cgroup_mutex); |
| 1501 | 1665 | ||
| 1502 | /* First find the desired set of subsystems */ | 1666 | /* First find the desired set of subsystems */ |
| @@ -1535,7 +1699,7 @@ retry: | |||
| 1535 | * subsystems) then they must match. | 1699 | * subsystems) then they must match. |
| 1536 | */ | 1700 | */ |
| 1537 | if ((opts.subsys_mask || opts.none) && | 1701 | if ((opts.subsys_mask || opts.none) && |
| 1538 | (opts.subsys_mask != root->cgrp.subsys_mask)) { | 1702 | (opts.subsys_mask != root->subsys_mask)) { |
| 1539 | if (!name_match) | 1703 | if (!name_match) |
| 1540 | continue; | 1704 | continue; |
| 1541 | ret = -EBUSY; | 1705 | ret = -EBUSY; |
| @@ -1544,28 +1708,27 @@ retry: | |||
| 1544 | 1708 | ||
| 1545 | if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { | 1709 | if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { |
| 1546 | if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { | 1710 | if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { |
| 1547 | pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); | 1711 | pr_err("sane_behavior: new mount options should match the existing superblock\n"); |
| 1548 | ret = -EINVAL; | 1712 | ret = -EINVAL; |
| 1549 | goto out_unlock; | 1713 | goto out_unlock; |
| 1550 | } else { | 1714 | } else { |
| 1551 | pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); | 1715 | pr_warn("new mount options do not match the existing superblock, will be ignored\n"); |
| 1552 | } | 1716 | } |
| 1553 | } | 1717 | } |
| 1554 | 1718 | ||
| 1555 | /* | 1719 | /* |
| 1556 | * A root's lifetime is governed by its root cgroup. Zero | 1720 | * A root's lifetime is governed by its root cgroup. |
| 1557 | * ref indicate that the root is being destroyed. Wait for | 1721 | * tryget_live failure indicates that the root is being |
| 1558 | * destruction to complete so that the subsystems are free. | 1722 | * destroyed. Wait for destruction to complete so that the |
| 1559 | * We can use wait_queue for the wait but this path is | 1723 | * subsystems are free. We can use wait_queue for the wait |
| 1560 | * super cold. Let's just sleep for a bit and retry. | 1724 | * but this path is super cold. Let's just sleep for a bit |
| 1725 | * and retry. | ||
| 1561 | */ | 1726 | */ |
| 1562 | if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { | 1727 | if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { |
| 1563 | mutex_unlock(&cgroup_mutex); | 1728 | mutex_unlock(&cgroup_mutex); |
| 1564 | mutex_unlock(&cgroup_tree_mutex); | ||
| 1565 | kfree(opts.release_agent); | ||
| 1566 | kfree(opts.name); | ||
| 1567 | msleep(10); | 1729 | msleep(10); |
| 1568 | goto retry; | 1730 | ret = restart_syscall(); |
| 1731 | goto out_free; | ||
| 1569 | } | 1732 | } |
| 1570 | 1733 | ||
| 1571 | ret = 0; | 1734 | ret = 0; |
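
percpu_ref_tryget_live() refuses new references once the ref has been killed, which is what lets cgroup_mount() detect a dying root, sleep briefly and restart the syscall. A rough userspace model of that "tryget_live" behaviour with C11 atomics; the real percpu_ref is per-CPU and closes the race between the liveness check and the increment, which this sketch does not:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct ref {
	atomic_long count;  /* number of live references */
	atomic_bool dying;  /* once set, tryget_live() refuses new references */
};

static bool ref_tryget_live(struct ref *r)
{
	/* simplified: percpu_ref closes the race between this check and the add */
	if (atomic_load(&r->dying))
		return false;
	atomic_fetch_add(&r->count, 1);
	return true;
}

static void ref_put(struct ref *r)
{
	if (atomic_fetch_sub(&r->count, 1) == 1)
		printf("last reference dropped, object released\n");
}

static void ref_kill(struct ref *r)
{
	atomic_store(&r->dying, true);
	ref_put(r);	/* drop the initial reference taken at init time */
}

int main(void)
{
	struct ref r;

	atomic_init(&r.count, 1);
	atomic_init(&r.dying, false);

	printf("tryget while live: %d\n", ref_tryget_live(&r));	/* 1 */
	ref_put(&r);
	ref_kill(&r);
	printf("tryget after kill: %d\n", ref_tryget_live(&r));	/* 0 */
	return 0;
}

A mounter that sees the failed tryget simply backs off and retries, exactly what the msleep(10) plus restart_syscall() path above does.
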
| @@ -1596,15 +1759,15 @@ retry: | |||
| 1596 | 1759 | ||
| 1597 | out_unlock: | 1760 | out_unlock: |
| 1598 | mutex_unlock(&cgroup_mutex); | 1761 | mutex_unlock(&cgroup_mutex); |
| 1599 | mutex_unlock(&cgroup_tree_mutex); | 1762 | out_free: |
| 1600 | |||
| 1601 | kfree(opts.release_agent); | 1763 | kfree(opts.release_agent); |
| 1602 | kfree(opts.name); | 1764 | kfree(opts.name); |
| 1603 | 1765 | ||
| 1604 | if (ret) | 1766 | if (ret) |
| 1605 | return ERR_PTR(ret); | 1767 | return ERR_PTR(ret); |
| 1606 | 1768 | ||
| 1607 | dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); | 1769 | dentry = kernfs_mount(fs_type, flags, root->kf_root, |
| 1770 | CGROUP_SUPER_MAGIC, &new_sb); | ||
| 1608 | if (IS_ERR(dentry) || !new_sb) | 1771 | if (IS_ERR(dentry) || !new_sb) |
| 1609 | cgroup_put(&root->cgrp); | 1772 | cgroup_put(&root->cgrp); |
| 1610 | return dentry; | 1773 | return dentry; |
| @@ -1615,7 +1778,19 @@ static void cgroup_kill_sb(struct super_block *sb) | |||
| 1615 | struct kernfs_root *kf_root = kernfs_root_from_sb(sb); | 1778 | struct kernfs_root *kf_root = kernfs_root_from_sb(sb); |
| 1616 | struct cgroup_root *root = cgroup_root_from_kf(kf_root); | 1779 | struct cgroup_root *root = cgroup_root_from_kf(kf_root); |
| 1617 | 1780 | ||
| 1618 | cgroup_put(&root->cgrp); | 1781 | /* |
| 1782 | * If @root doesn't have any mounts or children, start killing it. | ||
| 1783 | * This prevents new mounts by disabling percpu_ref_tryget_live(). | ||
| 1784 | * cgroup_mount() may wait for @root's release. | ||
| 1785 | * | ||
| 1786 | * And don't kill the default root. | ||
| 1787 | */ | ||
| 1788 | if (css_has_online_children(&root->cgrp.self) || | ||
| 1789 | root == &cgrp_dfl_root) | ||
| 1790 | cgroup_put(&root->cgrp); | ||
| 1791 | else | ||
| 1792 | percpu_ref_kill(&root->cgrp.self.refcnt); | ||
| 1793 | |||
| 1619 | kernfs_kill_sb(sb); | 1794 | kernfs_kill_sb(sb); |
| 1620 | } | 1795 | } |
| 1621 | 1796 | ||
| @@ -1737,7 +1912,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) | |||
| 1737 | 1912 | ||
| 1738 | /** | 1913 | /** |
| 1739 | * cgroup_task_migrate - move a task from one cgroup to another. | 1914 | * cgroup_task_migrate - move a task from one cgroup to another. |
| 1740 | * @old_cgrp; the cgroup @tsk is being migrated from | 1915 | * @old_cgrp: the cgroup @tsk is being migrated from |
| 1741 | * @tsk: the task being migrated | 1916 | * @tsk: the task being migrated |
| 1742 | * @new_cset: the new css_set @tsk is being attached to | 1917 | * @new_cset: the new css_set @tsk is being attached to |
| 1743 | * | 1918 | * |
| @@ -1829,10 +2004,6 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, | |||
| 1829 | 2004 | ||
| 1830 | src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); | 2005 | src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); |
| 1831 | 2006 | ||
| 1832 | /* nothing to do if this cset already belongs to the cgroup */ | ||
| 1833 | if (src_cgrp == dst_cgrp) | ||
| 1834 | return; | ||
| 1835 | |||
| 1836 | if (!list_empty(&src_cset->mg_preload_node)) | 2007 | if (!list_empty(&src_cset->mg_preload_node)) |
| 1837 | return; | 2008 | return; |
| 1838 | 2009 | ||
| @@ -1847,13 +2018,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, | |||
| 1847 | 2018 | ||
| 1848 | /** | 2019 | /** |
| 1849 | * cgroup_migrate_prepare_dst - prepare destination css_sets for migration | 2020 | * cgroup_migrate_prepare_dst - prepare destination css_sets for migration |
| 1850 | * @dst_cgrp: the destination cgroup | 2021 | * @dst_cgrp: the destination cgroup (may be %NULL) |
| 1851 | * @preloaded_csets: list of preloaded source css_sets | 2022 | * @preloaded_csets: list of preloaded source css_sets |
| 1852 | * | 2023 | * |
| 1853 | * Tasks are about to be moved to @dst_cgrp and all the source css_sets | 2024 | * Tasks are about to be moved to @dst_cgrp and all the source css_sets |
| 1854 | * have been preloaded to @preloaded_csets. This function looks up and | 2025 | * have been preloaded to @preloaded_csets. This function looks up and |
| 1855 | * pins all destination css_sets, links each to its source, and put them on | 2026 | * pins all destination css_sets, links each to its source, and appends them |
| 1856 | * @preloaded_csets. | 2027 | * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each |
| 2028 | * source css_set is assumed to be its cgroup on the default hierarchy. | ||
| 1857 | * | 2029 | * |
| 1858 | * This function must be called after cgroup_migrate_add_src() has been | 2030 | * This function must be called after cgroup_migrate_add_src() has been |
| 1859 | * called on each migration source css_set. After migration is performed | 2031 | * called on each migration source css_set. After migration is performed |
| @@ -1864,19 +2036,42 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, | |||
| 1864 | struct list_head *preloaded_csets) | 2036 | struct list_head *preloaded_csets) |
| 1865 | { | 2037 | { |
| 1866 | LIST_HEAD(csets); | 2038 | LIST_HEAD(csets); |
| 1867 | struct css_set *src_cset; | 2039 | struct css_set *src_cset, *tmp_cset; |
| 1868 | 2040 | ||
| 1869 | lockdep_assert_held(&cgroup_mutex); | 2041 | lockdep_assert_held(&cgroup_mutex); |
| 1870 | 2042 | ||
| 2043 | /* | ||
| 2044 | * Except for the root, child_subsys_mask must be zero for a cgroup | ||
| 2045 | * with tasks so that child cgroups don't compete against tasks. | ||
| 2046 | */ | ||
| 2047 | if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && | ||
| 2048 | dst_cgrp->child_subsys_mask) | ||
| 2049 | return -EBUSY; | ||
| 2050 | |||
| 1871 | /* look up the dst cset for each src cset and link it to src */ | 2051 | /* look up the dst cset for each src cset and link it to src */ |
| 1872 | list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { | 2052 | list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { |
| 1873 | struct css_set *dst_cset; | 2053 | struct css_set *dst_cset; |
| 1874 | 2054 | ||
| 1875 | dst_cset = find_css_set(src_cset, dst_cgrp); | 2055 | dst_cset = find_css_set(src_cset, |
| 2056 | dst_cgrp ?: src_cset->dfl_cgrp); | ||
| 1876 | if (!dst_cset) | 2057 | if (!dst_cset) |
| 1877 | goto err; | 2058 | goto err; |
| 1878 | 2059 | ||
| 1879 | WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); | 2060 | WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); |
| 2061 | |||
| 2062 | /* | ||
| 2063 | * If src cset equals dst, it's noop. Drop the src. | ||
| 2064 | * cgroup_migrate() will skip the cset too. Note that we | ||
| 2065 | * can't handle src == dst as some nodes are used by both. | ||
| 2066 | */ | ||
| 2067 | if (src_cset == dst_cset) { | ||
| 2068 | src_cset->mg_src_cgrp = NULL; | ||
| 2069 | list_del_init(&src_cset->mg_preload_node); | ||
| 2070 | put_css_set(src_cset, false); | ||
| 2071 | put_css_set(dst_cset, false); | ||
| 2072 | continue; | ||
| 2073 | } | ||
| 2074 | |||
| 1880 | src_cset->mg_dst_cset = dst_cset; | 2075 | src_cset->mg_dst_cset = dst_cset; |
| 1881 | 2076 | ||
| 1882 | if (list_empty(&dst_cset->mg_preload_node)) | 2077 | if (list_empty(&dst_cset->mg_preload_node)) |
| @@ -1885,7 +2080,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, | |||
| 1885 | put_css_set(dst_cset, false); | 2080 | put_css_set(dst_cset, false); |
| 1886 | } | 2081 | } |
| 1887 | 2082 | ||
| 1888 | list_splice(&csets, preloaded_csets); | 2083 | list_splice_tail(&csets, preloaded_csets); |
| 1889 | return 0; | 2084 | return 0; |
| 1890 | err: | 2085 | err: |
| 1891 | cgroup_migrate_finish(&csets); | 2086 | cgroup_migrate_finish(&csets); |
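
The loop above switches to list_for_each_entry_safe() because a src css_set that already matches its destination is unlinked and dropped while the list is being walked. A toy singly-linked-list version of the same delete-during-iteration pattern (plain malloc'd nodes, not the kernel's list_head):

#include <stdio.h>
#include <stdlib.h>

struct node {
	int val;
	struct node *next;
};

static struct node *push(struct node *head, int val)
{
	struct node *n = malloc(sizeof(*n));

	n->val = val;
	n->next = head;
	return n;
}

int main(void)
{
	struct node *head = NULL, *pos, *next, **prevp;
	int i;

	for (i = 5; i >= 1; i--)
		head = push(head, i);	/* list is now 1 2 3 4 5 */

	/* drop even values: save ->next before the node may be freed */
	prevp = &head;
	for (pos = head; pos; pos = next) {
		next = pos->next;
		if (pos->val % 2 == 0) {
			*prevp = next;	/* unlink */
			free(pos);
		} else {
			prevp = &pos->next;
		}
	}

	for (pos = head; pos; pos = pos->next)
		printf("%d ", pos->val);
	printf("\n");			/* 1 3 5 */

	while (head) {			/* tidy up */
		next = head->next;
		free(head);
		head = next;
	}
	return 0;
}
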
| @@ -1966,7 +2161,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, | |||
| 1966 | return 0; | 2161 | return 0; |
| 1967 | 2162 | ||
| 1968 | /* check that we can legitimately attach to the cgroup */ | 2163 | /* check that we can legitimately attach to the cgroup */ |
| 1969 | for_each_css(css, i, cgrp) { | 2164 | for_each_e_css(css, i, cgrp) { |
| 1970 | if (css->ss->can_attach) { | 2165 | if (css->ss->can_attach) { |
| 1971 | ret = css->ss->can_attach(css, &tset); | 2166 | ret = css->ss->can_attach(css, &tset); |
| 1972 | if (ret) { | 2167 | if (ret) { |
| @@ -1996,7 +2191,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, | |||
| 1996 | */ | 2191 | */ |
| 1997 | tset.csets = &tset.dst_csets; | 2192 | tset.csets = &tset.dst_csets; |
| 1998 | 2193 | ||
| 1999 | for_each_css(css, i, cgrp) | 2194 | for_each_e_css(css, i, cgrp) |
| 2000 | if (css->ss->attach) | 2195 | if (css->ss->attach) |
| 2001 | css->ss->attach(css, &tset); | 2196 | css->ss->attach(css, &tset); |
| 2002 | 2197 | ||
| @@ -2004,7 +2199,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, | |||
| 2004 | goto out_release_tset; | 2199 | goto out_release_tset; |
| 2005 | 2200 | ||
| 2006 | out_cancel_attach: | 2201 | out_cancel_attach: |
| 2007 | for_each_css(css, i, cgrp) { | 2202 | for_each_e_css(css, i, cgrp) { |
| 2008 | if (css == failed_css) | 2203 | if (css == failed_css) |
| 2009 | break; | 2204 | break; |
| 2010 | if (css->ss->cancel_attach) | 2205 | if (css->ss->cancel_attach) |
| @@ -2063,13 +2258,20 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, | |||
| 2063 | * function to attach either it or all tasks in its threadgroup. Will lock | 2258 | * function to attach either it or all tasks in its threadgroup. Will lock |
| 2064 | * cgroup_mutex and threadgroup. | 2259 | * cgroup_mutex and threadgroup. |
| 2065 | */ | 2260 | */ |
| 2066 | static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) | 2261 | static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, |
| 2262 | size_t nbytes, loff_t off, bool threadgroup) | ||
| 2067 | { | 2263 | { |
| 2068 | struct task_struct *tsk; | 2264 | struct task_struct *tsk; |
| 2069 | const struct cred *cred = current_cred(), *tcred; | 2265 | const struct cred *cred = current_cred(), *tcred; |
| 2266 | struct cgroup *cgrp; | ||
| 2267 | pid_t pid; | ||
| 2070 | int ret; | 2268 | int ret; |
| 2071 | 2269 | ||
| 2072 | if (!cgroup_lock_live_group(cgrp)) | 2270 | if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) |
| 2271 | return -EINVAL; | ||
| 2272 | |||
| 2273 | cgrp = cgroup_kn_lock_live(of->kn); | ||
| 2274 | if (!cgrp) | ||
| 2073 | return -ENODEV; | 2275 | return -ENODEV; |
| 2074 | 2276 | ||
| 2075 | retry_find_task: | 2277 | retry_find_task: |
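
__cgroup_procs_write() now strips the buffer and parses the PID itself with kstrtoint(), rejecting negative values. A userspace approximation using strtol(), since kstrtoint() and strstrip() are kernel-only; the helper name parse_pid() is made up for the example:

#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* rough analogue of strstrip() + kstrtoint(): whole string, base 0, >= 0 */
static int parse_pid(char *buf, int *pid)
{
	char *end;
	long val;

	while (isspace((unsigned char)*buf))	/* strip leading whitespace */
		buf++;
	end = buf + strlen(buf);
	while (end > buf && isspace((unsigned char)end[-1]))
		*--end = '\0';			/* strip trailing whitespace */

	errno = 0;
	val = strtol(buf, &end, 0);
	if (errno || end == buf || *end != '\0' || val < 0 || val > INT_MAX)
		return -1;

	*pid = (int)val;
	return 0;
}

int main(void)
{
	char good[] = " 1234\n", bad[] = "12x4";
	int pid;

	printf("good: %s\n", parse_pid(good, &pid) ? "invalid" : "ok");
	printf("pid = %d\n", pid);
	printf("bad:  %s\n", parse_pid(bad, &pid) ? "invalid" : "ok");
	return 0;
}
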
| @@ -2135,8 +2337,8 @@ retry_find_task: | |||
| 2135 | 2337 | ||
| 2136 | put_task_struct(tsk); | 2338 | put_task_struct(tsk); |
| 2137 | out_unlock_cgroup: | 2339 | out_unlock_cgroup: |
| 2138 | mutex_unlock(&cgroup_mutex); | 2340 | cgroup_kn_unlock(of->kn); |
| 2139 | return ret; | 2341 | return ret ?: nbytes; |
| 2140 | } | 2342 | } |
| 2141 | 2343 | ||
| 2142 | /** | 2344 | /** |
| @@ -2170,43 +2372,44 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | |||
| 2170 | } | 2372 | } |
| 2171 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); | 2373 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); |
| 2172 | 2374 | ||
| 2173 | static int cgroup_tasks_write(struct cgroup_subsys_state *css, | 2375 | static ssize_t cgroup_tasks_write(struct kernfs_open_file *of, |
| 2174 | struct cftype *cft, u64 pid) | 2376 | char *buf, size_t nbytes, loff_t off) |
| 2175 | { | 2377 | { |
| 2176 | return attach_task_by_pid(css->cgroup, pid, false); | 2378 | return __cgroup_procs_write(of, buf, nbytes, off, false); |
| 2177 | } | 2379 | } |
| 2178 | 2380 | ||
| 2179 | static int cgroup_procs_write(struct cgroup_subsys_state *css, | 2381 | static ssize_t cgroup_procs_write(struct kernfs_open_file *of, |
| 2180 | struct cftype *cft, u64 tgid) | 2382 | char *buf, size_t nbytes, loff_t off) |
| 2181 | { | 2383 | { |
| 2182 | return attach_task_by_pid(css->cgroup, tgid, true); | 2384 | return __cgroup_procs_write(of, buf, nbytes, off, true); |
| 2183 | } | 2385 | } |
| 2184 | 2386 | ||
| 2185 | static int cgroup_release_agent_write(struct cgroup_subsys_state *css, | 2387 | static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, |
| 2186 | struct cftype *cft, char *buffer) | 2388 | char *buf, size_t nbytes, loff_t off) |
| 2187 | { | 2389 | { |
| 2188 | struct cgroup_root *root = css->cgroup->root; | 2390 | struct cgroup *cgrp; |
| 2189 | 2391 | ||
| 2190 | BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); | 2392 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); |
| 2191 | if (!cgroup_lock_live_group(css->cgroup)) | 2393 | |
| 2394 | cgrp = cgroup_kn_lock_live(of->kn); | ||
| 2395 | if (!cgrp) | ||
| 2192 | return -ENODEV; | 2396 | return -ENODEV; |
| 2193 | spin_lock(&release_agent_path_lock); | 2397 | spin_lock(&release_agent_path_lock); |
| 2194 | strlcpy(root->release_agent_path, buffer, | 2398 | strlcpy(cgrp->root->release_agent_path, strstrip(buf), |
| 2195 | sizeof(root->release_agent_path)); | 2399 | sizeof(cgrp->root->release_agent_path)); |
| 2196 | spin_unlock(&release_agent_path_lock); | 2400 | spin_unlock(&release_agent_path_lock); |
| 2197 | mutex_unlock(&cgroup_mutex); | 2401 | cgroup_kn_unlock(of->kn); |
| 2198 | return 0; | 2402 | return nbytes; |
| 2199 | } | 2403 | } |
| 2200 | 2404 | ||
| 2201 | static int cgroup_release_agent_show(struct seq_file *seq, void *v) | 2405 | static int cgroup_release_agent_show(struct seq_file *seq, void *v) |
| 2202 | { | 2406 | { |
| 2203 | struct cgroup *cgrp = seq_css(seq)->cgroup; | 2407 | struct cgroup *cgrp = seq_css(seq)->cgroup; |
| 2204 | 2408 | ||
| 2205 | if (!cgroup_lock_live_group(cgrp)) | 2409 | spin_lock(&release_agent_path_lock); |
| 2206 | return -ENODEV; | ||
| 2207 | seq_puts(seq, cgrp->root->release_agent_path); | 2410 | seq_puts(seq, cgrp->root->release_agent_path); |
| 2411 | spin_unlock(&release_agent_path_lock); | ||
| 2208 | seq_putc(seq, '\n'); | 2412 | seq_putc(seq, '\n'); |
| 2209 | mutex_unlock(&cgroup_mutex); | ||
| 2210 | return 0; | 2413 | return 0; |
| 2211 | } | 2414 | } |
| 2212 | 2415 | ||
| @@ -2218,6 +2421,320 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) | |||
| 2218 | return 0; | 2421 | return 0; |
| 2219 | } | 2422 | } |
| 2220 | 2423 | ||
| 2424 | static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask) | ||
| 2425 | { | ||
| 2426 | struct cgroup_subsys *ss; | ||
| 2427 | bool printed = false; | ||
| 2428 | int ssid; | ||
| 2429 | |||
| 2430 | for_each_subsys(ss, ssid) { | ||
| 2431 | if (ss_mask & (1 << ssid)) { | ||
| 2432 | if (printed) | ||
| 2433 | seq_putc(seq, ' '); | ||
| 2434 | seq_printf(seq, "%s", ss->name); | ||
| 2435 | printed = true; | ||
| 2436 | } | ||
| 2437 | } | ||
| 2438 | if (printed) | ||
| 2439 | seq_putc(seq, '\n'); | ||
| 2440 | } | ||
| 2441 | |||
| 2442 | /* show controllers which are currently attached to the default hierarchy */ | ||
| 2443 | static int cgroup_root_controllers_show(struct seq_file *seq, void *v) | ||
| 2444 | { | ||
| 2445 | struct cgroup *cgrp = seq_css(seq)->cgroup; | ||
| 2446 | |||
| 2447 | cgroup_print_ss_mask(seq, cgrp->root->subsys_mask & | ||
| 2448 | ~cgrp_dfl_root_inhibit_ss_mask); | ||
| 2449 | return 0; | ||
| 2450 | } | ||
| 2451 | |||
| 2452 | /* show controllers which are enabled from the parent */ | ||
| 2453 | static int cgroup_controllers_show(struct seq_file *seq, void *v) | ||
| 2454 | { | ||
| 2455 | struct cgroup *cgrp = seq_css(seq)->cgroup; | ||
| 2456 | |||
| 2457 | cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); | ||
| 2458 | return 0; | ||
| 2459 | } | ||
| 2460 | |||
| 2461 | /* show controllers which are enabled for a given cgroup's children */ | ||
| 2462 | static int cgroup_subtree_control_show(struct seq_file *seq, void *v) | ||
| 2463 | { | ||
| 2464 | struct cgroup *cgrp = seq_css(seq)->cgroup; | ||
| 2465 | |||
| 2466 | cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); | ||
| 2467 | return 0; | ||
| 2468 | } | ||
| 2469 | |||
| 2470 | /** | ||
| 2471 | * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy | ||
| 2472 | * @cgrp: root of the subtree to update csses for | ||
| 2473 | * | ||
| 2474 | * @cgrp's child_subsys_mask has changed and its subtree's (self excluded) | ||
| 2475 | * css associations need to be updated accordingly. This function looks up | ||
| 2476 | * all css_sets which are attached to the subtree, creates the matching | ||
| 2477 | * updated css_sets and migrates the tasks to the new ones. | ||
| 2478 | */ | ||
| 2479 | static int cgroup_update_dfl_csses(struct cgroup *cgrp) | ||
| 2480 | { | ||
| 2481 | LIST_HEAD(preloaded_csets); | ||
| 2482 | struct cgroup_subsys_state *css; | ||
| 2483 | struct css_set *src_cset; | ||
| 2484 | int ret; | ||
| 2485 | |||
| 2486 | lockdep_assert_held(&cgroup_mutex); | ||
| 2487 | |||
| 2488 | /* look up all csses currently attached to @cgrp's subtree */ | ||
| 2489 | down_read(&css_set_rwsem); | ||
| 2490 | css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { | ||
| 2491 | struct cgrp_cset_link *link; | ||
| 2492 | |||
| 2493 | /* self is not affected by child_subsys_mask change */ | ||
| 2494 | if (css->cgroup == cgrp) | ||
| 2495 | continue; | ||
| 2496 | |||
| 2497 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) | ||
| 2498 | cgroup_migrate_add_src(link->cset, cgrp, | ||
| 2499 | &preloaded_csets); | ||
| 2500 | } | ||
| 2501 | up_read(&css_set_rwsem); | ||
| 2502 | |||
| 2503 | /* NULL dst indicates self on default hierarchy */ | ||
| 2504 | ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); | ||
| 2505 | if (ret) | ||
| 2506 | goto out_finish; | ||
| 2507 | |||
| 2508 | list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { | ||
| 2509 | struct task_struct *last_task = NULL, *task; | ||
| 2510 | |||
| 2511 | /* src_csets precede dst_csets, break on the first dst_cset */ | ||
| 2512 | if (!src_cset->mg_src_cgrp) | ||
| 2513 | break; | ||
| 2514 | |||
| 2515 | /* | ||
| 2516 | * All tasks in src_cset need to be migrated to the | ||
| 2517 | * matching dst_cset. Empty it process by process. We | ||
| 2518 | * walk tasks but migrate processes. The leader might even | ||
| 2519 | * belong to a different cset but such src_cset would also | ||
| 2520 | * be among the target src_csets because the default | ||
| 2521 | * hierarchy enforces per-process membership. | ||
| 2522 | */ | ||
| 2523 | while (true) { | ||
| 2524 | down_read(&css_set_rwsem); | ||
| 2525 | task = list_first_entry_or_null(&src_cset->tasks, | ||
| 2526 | struct task_struct, cg_list); | ||
| 2527 | if (task) { | ||
| 2528 | task = task->group_leader; | ||
| 2529 | WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp); | ||
| 2530 | get_task_struct(task); | ||
| 2531 | } | ||
| 2532 | up_read(&css_set_rwsem); | ||
| 2533 | |||
| 2534 | if (!task) | ||
| 2535 | break; | ||
| 2536 | |||
| 2537 | /* guard against possible infinite loop */ | ||
| 2538 | if (WARN(last_task == task, | ||
| 2539 | "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n")) | ||
| 2540 | goto out_finish; | ||
| 2541 | last_task = task; | ||
| 2542 | |||
| 2543 | threadgroup_lock(task); | ||
| 2544 | /* raced against de_thread() from another thread? */ | ||
| 2545 | if (!thread_group_leader(task)) { | ||
| 2546 | threadgroup_unlock(task); | ||
| 2547 | put_task_struct(task); | ||
| 2548 | continue; | ||
| 2549 | } | ||
| 2550 | |||
| 2551 | ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); | ||
| 2552 | |||
| 2553 | threadgroup_unlock(task); | ||
| 2554 | put_task_struct(task); | ||
| 2555 | |||
| 2556 | if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) | ||
| 2557 | goto out_finish; | ||
| 2558 | } | ||
| 2559 | } | ||
| 2560 | |||
| 2561 | out_finish: | ||
| 2562 | cgroup_migrate_finish(&preloaded_csets); | ||
| 2563 | return ret; | ||
| 2564 | } | ||
| 2565 | |||
| 2566 | /* change the enabled child controllers for a cgroup in the default hierarchy */ | ||
| 2567 | static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | ||
| 2568 | char *buf, size_t nbytes, | ||
| 2569 | loff_t off) | ||
| 2570 | { | ||
| 2571 | unsigned int enable = 0, disable = 0; | ||
| 2572 | struct cgroup *cgrp, *child; | ||
| 2573 | struct cgroup_subsys *ss; | ||
| 2574 | char *tok; | ||
| 2575 | int ssid, ret; | ||
| 2576 | |||
| 2577 | /* | ||
| 2578 | * Parse input - space separated list of subsystem names prefixed | ||
| 2579 | * with either + or -. | ||
| 2580 | */ | ||
| 2581 | buf = strstrip(buf); | ||
| 2582 | while ((tok = strsep(&buf, " "))) { | ||
| 2583 | if (tok[0] == '\0') | ||
| 2584 | continue; | ||
| 2585 | for_each_subsys(ss, ssid) { | ||
| 2586 | if (ss->disabled || strcmp(tok + 1, ss->name) || | ||
| 2587 | ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask)) | ||
| 2588 | continue; | ||
| 2589 | |||
| 2590 | if (*tok == '+') { | ||
| 2591 | enable |= 1 << ssid; | ||
| 2592 | disable &= ~(1 << ssid); | ||
| 2593 | } else if (*tok == '-') { | ||
| 2594 | disable |= 1 << ssid; | ||
| 2595 | enable &= ~(1 << ssid); | ||
| 2596 | } else { | ||
| 2597 | return -EINVAL; | ||
| 2598 | } | ||
| 2599 | break; | ||
| 2600 | } | ||
| 2601 | if (ssid == CGROUP_SUBSYS_COUNT) | ||
| 2602 | return -EINVAL; | ||
| 2603 | } | ||
| 2604 | |||
| 2605 | cgrp = cgroup_kn_lock_live(of->kn); | ||
| 2606 | if (!cgrp) | ||
| 2607 | return -ENODEV; | ||
| 2608 | |||
| 2609 | for_each_subsys(ss, ssid) { | ||
| 2610 | if (enable & (1 << ssid)) { | ||
| 2611 | if (cgrp->child_subsys_mask & (1 << ssid)) { | ||
| 2612 | enable &= ~(1 << ssid); | ||
| 2613 | continue; | ||
| 2614 | } | ||
| 2615 | |||
| 2616 | /* | ||
| 2617 | * Because css offlining is asynchronous, userland | ||
| 2618 | * might try to re-enable the same controller while | ||
| 2619 | * the previous instance is still around. In such | ||
| 2620 | * cases, wait till it's gone using offline_waitq. | ||
| 2621 | */ | ||
| 2622 | cgroup_for_each_live_child(child, cgrp) { | ||
| 2623 | DEFINE_WAIT(wait); | ||
| 2624 | |||
| 2625 | if (!cgroup_css(child, ss)) | ||
| 2626 | continue; | ||
| 2627 | |||
| 2628 | cgroup_get(child); | ||
| 2629 | prepare_to_wait(&child->offline_waitq, &wait, | ||
| 2630 | TASK_UNINTERRUPTIBLE); | ||
| 2631 | cgroup_kn_unlock(of->kn); | ||
| 2632 | schedule(); | ||
| 2633 | finish_wait(&child->offline_waitq, &wait); | ||
| 2634 | cgroup_put(child); | ||
| 2635 | |||
| 2636 | return restart_syscall(); | ||
| 2637 | } | ||
| 2638 | |||
| 2639 | /* unavailable or not enabled on the parent? */ | ||
| 2640 | if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || | ||
| 2641 | (cgroup_parent(cgrp) && | ||
| 2642 | !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { | ||
| 2643 | ret = -ENOENT; | ||
| 2644 | goto out_unlock; | ||
| 2645 | } | ||
| 2646 | } else if (disable & (1 << ssid)) { | ||
| 2647 | if (!(cgrp->child_subsys_mask & (1 << ssid))) { | ||
| 2648 | disable &= ~(1 << ssid); | ||
| 2649 | continue; | ||
| 2650 | } | ||
| 2651 | |||
| 2652 | /* a child has it enabled? */ | ||
| 2653 | cgroup_for_each_live_child(child, cgrp) { | ||
| 2654 | if (child->child_subsys_mask & (1 << ssid)) { | ||
| 2655 | ret = -EBUSY; | ||
| 2656 | goto out_unlock; | ||
| 2657 | } | ||
| 2658 | } | ||
| 2659 | } | ||
| 2660 | } | ||
| 2661 | |||
| 2662 | if (!enable && !disable) { | ||
| 2663 | ret = 0; | ||
| 2664 | goto out_unlock; | ||
| 2665 | } | ||
| 2666 | |||
| 2667 | /* | ||
| 2668 | * Except for the root, child_subsys_mask must be zero for a cgroup | ||
| 2669 | * with tasks so that child cgroups don't compete against tasks. | ||
| 2670 | */ | ||
| 2671 | if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { | ||
| 2672 | ret = -EBUSY; | ||
| 2673 | goto out_unlock; | ||
| 2674 | } | ||
| 2675 | |||
| 2676 | /* | ||
| 2677 | * Create csses for enables and update child_subsys_mask. This | ||
| 2678 | * changes cgroup_e_css() results which in turn makes the | ||
| 2679 | * subsequent cgroup_update_dfl_csses() associate all tasks in the | ||
| 2680 | * subtree to the updated csses. | ||
| 2681 | */ | ||
| 2682 | for_each_subsys(ss, ssid) { | ||
| 2683 | if (!(enable & (1 << ssid))) | ||
| 2684 | continue; | ||
| 2685 | |||
| 2686 | cgroup_for_each_live_child(child, cgrp) { | ||
| 2687 | ret = create_css(child, ss); | ||
| 2688 | if (ret) | ||
| 2689 | goto err_undo_css; | ||
| 2690 | } | ||
| 2691 | } | ||
| 2692 | |||
| 2693 | cgrp->child_subsys_mask |= enable; | ||
| 2694 | cgrp->child_subsys_mask &= ~disable; | ||
| 2695 | |||
| 2696 | ret = cgroup_update_dfl_csses(cgrp); | ||
| 2697 | if (ret) | ||
| 2698 | goto err_undo_css; | ||
| 2699 | |||
| 2700 | /* all tasks are now migrated away from the old csses, kill them */ | ||
| 2701 | for_each_subsys(ss, ssid) { | ||
| 2702 | if (!(disable & (1 << ssid))) | ||
| 2703 | continue; | ||
| 2704 | |||
| 2705 | cgroup_for_each_live_child(child, cgrp) | ||
| 2706 | kill_css(cgroup_css(child, ss)); | ||
| 2707 | } | ||
| 2708 | |||
| 2709 | kernfs_activate(cgrp->kn); | ||
| 2710 | ret = 0; | ||
| 2711 | out_unlock: | ||
| 2712 | cgroup_kn_unlock(of->kn); | ||
| 2713 | return ret ?: nbytes; | ||
| 2714 | |||
| 2715 | err_undo_css: | ||
| 2716 | cgrp->child_subsys_mask &= ~enable; | ||
| 2717 | cgrp->child_subsys_mask |= disable; | ||
| 2718 | |||
| 2719 | for_each_subsys(ss, ssid) { | ||
| 2720 | if (!(enable & (1 << ssid))) | ||
| 2721 | continue; | ||
| 2722 | |||
| 2723 | cgroup_for_each_live_child(child, cgrp) { | ||
| 2724 | struct cgroup_subsys_state *css = cgroup_css(child, ss); | ||
| 2725 | if (css) | ||
| 2726 | kill_css(css); | ||
| 2727 | } | ||
| 2728 | } | ||
| 2729 | goto out_unlock; | ||
| 2730 | } | ||
| 2731 | |||
| 2732 | static int cgroup_populated_show(struct seq_file *seq, void *v) | ||
| 2733 | { | ||
| 2734 | seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); | ||
| 2735 | return 0; | ||
| 2736 | } | ||
| 2737 | |||
| 2221 | static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, | 2738 | static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, |
| 2222 | size_t nbytes, loff_t off) | 2739 | size_t nbytes, loff_t off) |
| 2223 | { | 2740 | { |
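
Among the additions above, cgroup_subtree_control_write() splits the written buffer on spaces and treats a leading '+' or '-' as enable/disable for the named controller. A stand-alone sketch of that tokenizing, with strtok() in place of the kernel's strsep() and a hard-coded name table standing in for cgroup_subsys[]:

#include <stdio.h>
#include <string.h>

static const char * const ss_names[] = { "cpu", "memory", "io" };
#define SS_COUNT 3

int main(void)
{
	char buf[] = "+cpu -memory +io";
	unsigned int enable = 0, disable = 0;
	char *tok;
	int ssid;

	/* strtok() stands in for the kernel's strsep() loop over spaces */
	for (tok = strtok(buf, " "); tok; tok = strtok(NULL, " ")) {
		for (ssid = 0; ssid < SS_COUNT; ssid++) {
			if (strcmp(tok + 1, ss_names[ssid]))
				continue;
			if (*tok == '+') {
				enable |= 1U << ssid;
				disable &= ~(1U << ssid);
			} else if (*tok == '-') {
				disable |= 1U << ssid;
				enable &= ~(1U << ssid);
			} else {
				fprintf(stderr, "bad prefix: %s\n", tok);
				return 1;
			}
			break;
		}
		if (ssid == SS_COUNT) {
			fprintf(stderr, "unknown controller: %s\n", tok);
			return 1;
		}
	}

	printf("enable=%#x disable=%#x\n", enable, disable);	/* 0x5 / 0x2 */
	return 0;
}

As in the patch, a later token for the same controller simply flips it between the two masks, so "+cpu -cpu" ends up disabling it.
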
| @@ -2226,6 +2743,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, | |||
| 2226 | struct cgroup_subsys_state *css; | 2743 | struct cgroup_subsys_state *css; |
| 2227 | int ret; | 2744 | int ret; |
| 2228 | 2745 | ||
| 2746 | if (cft->write) | ||
| 2747 | return cft->write(of, buf, nbytes, off); | ||
| 2748 | |||
| 2229 | /* | 2749 | /* |
| 2230 | * kernfs guarantees that a file isn't deleted with operations in | 2750 | * kernfs guarantees that a file isn't deleted with operations in |
| 2231 | * flight, which means that the matching css is and stays alive and | 2751 | * flight, which means that the matching css is and stays alive and |
| @@ -2236,9 +2756,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, | |||
| 2236 | css = cgroup_css(cgrp, cft->ss); | 2756 | css = cgroup_css(cgrp, cft->ss); |
| 2237 | rcu_read_unlock(); | 2757 | rcu_read_unlock(); |
| 2238 | 2758 | ||
| 2239 | if (cft->write_string) { | 2759 | if (cft->write_u64) { |
| 2240 | ret = cft->write_string(css, cft, strstrip(buf)); | ||
| 2241 | } else if (cft->write_u64) { | ||
| 2242 | unsigned long long v; | 2760 | unsigned long long v; |
| 2243 | ret = kstrtoull(buf, 0, &v); | 2761 | ret = kstrtoull(buf, 0, &v); |
| 2244 | if (!ret) | 2762 | if (!ret) |
| @@ -2248,8 +2766,6 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, | |||
| 2248 | ret = kstrtoll(buf, 0, &v); | 2766 | ret = kstrtoll(buf, 0, &v); |
| 2249 | if (!ret) | 2767 | if (!ret) |
| 2250 | ret = cft->write_s64(css, cft, v); | 2768 | ret = cft->write_s64(css, cft, v); |
| 2251 | } else if (cft->trigger) { | ||
| 2252 | ret = cft->trigger(css, (unsigned int)cft->private); | ||
| 2253 | } else { | 2769 | } else { |
| 2254 | ret = -EINVAL; | 2770 | ret = -EINVAL; |
| 2255 | } | 2771 | } |
| @@ -2326,20 +2842,18 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, | |||
| 2326 | return -EPERM; | 2842 | return -EPERM; |
| 2327 | 2843 | ||
| 2328 | /* | 2844 | /* |
| 2329 | * We're gonna grab cgroup_tree_mutex which nests outside kernfs | 2845 | * We're gonna grab cgroup_mutex which nests outside kernfs |
| 2330 | * active_ref. kernfs_rename() doesn't require active_ref | 2846 | * active_ref. kernfs_rename() doesn't require active_ref |
| 2331 | * protection. Break them before grabbing cgroup_tree_mutex. | 2847 | * protection. Break them before grabbing cgroup_mutex. |
| 2332 | */ | 2848 | */ |
| 2333 | kernfs_break_active_protection(new_parent); | 2849 | kernfs_break_active_protection(new_parent); |
| 2334 | kernfs_break_active_protection(kn); | 2850 | kernfs_break_active_protection(kn); |
| 2335 | 2851 | ||
| 2336 | mutex_lock(&cgroup_tree_mutex); | ||
| 2337 | mutex_lock(&cgroup_mutex); | 2852 | mutex_lock(&cgroup_mutex); |
| 2338 | 2853 | ||
| 2339 | ret = kernfs_rename(kn, new_parent, new_name_str); | 2854 | ret = kernfs_rename(kn, new_parent, new_name_str); |
| 2340 | 2855 | ||
| 2341 | mutex_unlock(&cgroup_mutex); | 2856 | mutex_unlock(&cgroup_mutex); |
| 2342 | mutex_unlock(&cgroup_tree_mutex); | ||
| 2343 | 2857 | ||
| 2344 | kernfs_unbreak_active_protection(kn); | 2858 | kernfs_unbreak_active_protection(kn); |
| 2345 | kernfs_unbreak_active_protection(new_parent); | 2859 | kernfs_unbreak_active_protection(new_parent); |
| @@ -2377,9 +2891,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) | |||
| 2377 | return PTR_ERR(kn); | 2891 | return PTR_ERR(kn); |
| 2378 | 2892 | ||
| 2379 | ret = cgroup_kn_set_ugid(kn); | 2893 | ret = cgroup_kn_set_ugid(kn); |
| 2380 | if (ret) | 2894 | if (ret) { |
| 2381 | kernfs_remove(kn); | 2895 | kernfs_remove(kn); |
| 2382 | return ret; | 2896 | return ret; |
| 2897 | } | ||
| 2898 | |||
| 2899 | if (cft->seq_show == cgroup_populated_show) | ||
| 2900 | cgrp->populated_kn = kn; | ||
| 2901 | return 0; | ||
| 2383 | } | 2902 | } |
| 2384 | 2903 | ||
| 2385 | /** | 2904 | /** |
| @@ -2399,7 +2918,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | |||
| 2399 | struct cftype *cft; | 2918 | struct cftype *cft; |
| 2400 | int ret; | 2919 | int ret; |
| 2401 | 2920 | ||
| 2402 | lockdep_assert_held(&cgroup_tree_mutex); | 2921 | lockdep_assert_held(&cgroup_mutex); |
| 2403 | 2922 | ||
| 2404 | for (cft = cfts; cft->name[0] != '\0'; cft++) { | 2923 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
| 2405 | /* does cft->flags tell us to skip this file on @cgrp? */ | 2924 | /* does cft->flags tell us to skip this file on @cgrp? */ |
| @@ -2407,16 +2926,16 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | |||
| 2407 | continue; | 2926 | continue; |
| 2408 | if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) | 2927 | if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) |
| 2409 | continue; | 2928 | continue; |
| 2410 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) | 2929 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) |
| 2411 | continue; | 2930 | continue; |
| 2412 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) | 2931 | if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp)) |
| 2413 | continue; | 2932 | continue; |
| 2414 | 2933 | ||
| 2415 | if (is_add) { | 2934 | if (is_add) { |
| 2416 | ret = cgroup_add_file(cgrp, cft); | 2935 | ret = cgroup_add_file(cgrp, cft); |
| 2417 | if (ret) { | 2936 | if (ret) { |
| 2418 | pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", | 2937 | pr_warn("%s: failed to add %s, err=%d\n", |
| 2419 | cft->name, ret); | 2938 | __func__, cft->name, ret); |
| 2420 | return ret; | 2939 | return ret; |
| 2421 | } | 2940 | } |
| 2422 | } else { | 2941 | } else { |
| @@ -2434,11 +2953,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) | |||
| 2434 | struct cgroup_subsys_state *css; | 2953 | struct cgroup_subsys_state *css; |
| 2435 | int ret = 0; | 2954 | int ret = 0; |
| 2436 | 2955 | ||
| 2437 | lockdep_assert_held(&cgroup_tree_mutex); | 2956 | lockdep_assert_held(&cgroup_mutex); |
| 2438 | |||
| 2439 | /* don't bother if @ss isn't attached */ | ||
| 2440 | if (ss->root == &cgrp_dfl_root) | ||
| 2441 | return 0; | ||
| 2442 | 2957 | ||
| 2443 | /* add/rm files for all cgroups created before */ | 2958 | /* add/rm files for all cgroups created before */ |
| 2444 | css_for_each_descendant_pre(css, cgroup_css(root, ss)) { | 2959 | css_for_each_descendant_pre(css, cgroup_css(root, ss)) { |
| @@ -2506,7 +3021,7 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
| 2506 | 3021 | ||
| 2507 | static int cgroup_rm_cftypes_locked(struct cftype *cfts) | 3022 | static int cgroup_rm_cftypes_locked(struct cftype *cfts) |
| 2508 | { | 3023 | { |
| 2509 | lockdep_assert_held(&cgroup_tree_mutex); | 3024 | lockdep_assert_held(&cgroup_mutex); |
| 2510 | 3025 | ||
| 2511 | if (!cfts || !cfts[0].ss) | 3026 | if (!cfts || !cfts[0].ss) |
| 2512 | return -ENOENT; | 3027 | return -ENOENT; |
| @@ -2532,9 +3047,9 @@ int cgroup_rm_cftypes(struct cftype *cfts) | |||
| 2532 | { | 3047 | { |
| 2533 | int ret; | 3048 | int ret; |
| 2534 | 3049 | ||
| 2535 | mutex_lock(&cgroup_tree_mutex); | 3050 | mutex_lock(&cgroup_mutex); |
| 2536 | ret = cgroup_rm_cftypes_locked(cfts); | 3051 | ret = cgroup_rm_cftypes_locked(cfts); |
| 2537 | mutex_unlock(&cgroup_tree_mutex); | 3052 | mutex_unlock(&cgroup_mutex); |
| 2538 | return ret; | 3053 | return ret; |
| 2539 | } | 3054 | } |
| 2540 | 3055 | ||
| @@ -2556,6 +3071,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
| 2556 | { | 3071 | { |
| 2557 | int ret; | 3072 | int ret; |
| 2558 | 3073 | ||
| 3074 | if (ss->disabled) | ||
| 3075 | return 0; | ||
| 3076 | |||
| 2559 | if (!cfts || cfts[0].name[0] == '\0') | 3077 | if (!cfts || cfts[0].name[0] == '\0') |
| 2560 | return 0; | 3078 | return 0; |
| 2561 | 3079 | ||
| @@ -2563,14 +3081,14 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
| 2563 | if (ret) | 3081 | if (ret) |
| 2564 | return ret; | 3082 | return ret; |
| 2565 | 3083 | ||
| 2566 | mutex_lock(&cgroup_tree_mutex); | 3084 | mutex_lock(&cgroup_mutex); |
| 2567 | 3085 | ||
| 2568 | list_add_tail(&cfts->node, &ss->cfts); | 3086 | list_add_tail(&cfts->node, &ss->cfts); |
| 2569 | ret = cgroup_apply_cftypes(cfts, true); | 3087 | ret = cgroup_apply_cftypes(cfts, true); |
| 2570 | if (ret) | 3088 | if (ret) |
| 2571 | cgroup_rm_cftypes_locked(cfts); | 3089 | cgroup_rm_cftypes_locked(cfts); |
| 2572 | 3090 | ||
| 2573 | mutex_unlock(&cgroup_tree_mutex); | 3091 | mutex_unlock(&cgroup_mutex); |
| 2574 | return ret; | 3092 | return ret; |
| 2575 | } | 3093 | } |
| 2576 | 3094 | ||
| @@ -2594,57 +3112,65 @@ static int cgroup_task_count(const struct cgroup *cgrp) | |||
| 2594 | 3112 | ||
| 2595 | /** | 3113 | /** |
| 2596 | * css_next_child - find the next child of a given css | 3114 | * css_next_child - find the next child of a given css |
| 2597 | * @pos_css: the current position (%NULL to initiate traversal) | 3115 | * @pos: the current position (%NULL to initiate traversal) |
| 2598 | * @parent_css: css whose children to walk | 3116 | * @parent: css whose children to walk |
| 2599 | * | 3117 | * |
| 2600 | * This function returns the next child of @parent_css and should be called | 3118 | * This function returns the next child of @parent and should be called |
| 2601 | * under either cgroup_mutex or RCU read lock. The only requirement is | 3119 | * under either cgroup_mutex or RCU read lock. The only requirement is |
| 2602 | * that @parent_css and @pos_css are accessible. The next sibling is | 3120 | * that @parent and @pos are accessible. The next sibling is guaranteed to |
| 2603 | * guaranteed to be returned regardless of their states. | 3121 | * be returned regardless of their states. |
| 3122 | * | ||
| 3123 | * If a subsystem synchronizes ->css_online() and the start of iteration, a | ||
| 3124 | * css which finished ->css_online() is guaranteed to be visible in the | ||
| 3125 | * future iterations and will stay visible until the last reference is put. | ||
| 3126 | * A css which hasn't finished ->css_online() or already finished | ||
| 3127 | * ->css_offline() may show up during traversal. It's each subsystem's | ||
| 3128 | * responsibility to synchronize against on/offlining. | ||
| 2604 | */ | 3129 | */ |
| 2605 | struct cgroup_subsys_state * | 3130 | struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, |
| 2606 | css_next_child(struct cgroup_subsys_state *pos_css, | 3131 | struct cgroup_subsys_state *parent) |
| 2607 | struct cgroup_subsys_state *parent_css) | ||
| 2608 | { | 3132 | { |
| 2609 | struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; | 3133 | struct cgroup_subsys_state *next; |
| 2610 | struct cgroup *cgrp = parent_css->cgroup; | ||
| 2611 | struct cgroup *next; | ||
| 2612 | 3134 | ||
| 2613 | cgroup_assert_mutexes_or_rcu_locked(); | 3135 | cgroup_assert_mutex_or_rcu_locked(); |
| 2614 | 3136 | ||
| 2615 | /* | 3137 | /* |
| 2616 | * @pos could already have been removed. Once a cgroup is removed, | 3138 | * @pos could already have been unlinked from the sibling list. |
| 2617 | * its ->sibling.next is no longer updated when its next sibling | 3139 | * Once a cgroup is removed, its ->sibling.next is no longer |
| 2618 | * changes. As CGRP_DEAD assertion is serialized and happens | 3140 | * updated when its next sibling changes. CSS_RELEASED is set when |
| 2619 | * before the cgroup is taken off the ->sibling list, if we see it | 3141 | * @pos is taken off list, at which time its next pointer is valid, |
| 2620 | * unasserted, it's guaranteed that the next sibling hasn't | 3142 | * and, as releases are serialized, the one pointed to by the next |
| 2621 | * finished its grace period even if it's already removed, and thus | 3143 | * pointer is guaranteed to not have started release yet. This |
| 2622 | * safe to dereference from this RCU critical section. If | 3144 | * implies that if we observe !CSS_RELEASED on @pos in this RCU |
| 2623 | * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed | 3145 | * critical section, the one pointed to by its next pointer is |
| 2624 | * to be visible as %true here. | 3146 | * guaranteed to not have finished its RCU grace period even if we |
| 3147 | * have dropped rcu_read_lock() inbetween iterations. | ||
| 2625 | * | 3148 | * |
| 2626 | * If @pos is dead, its next pointer can't be dereferenced; | 3149 | * If @pos has CSS_RELEASED set, its next pointer can't be |
| 2627 | * however, as each cgroup is given a monotonically increasing | 3150 | * dereferenced; however, as each css is given a monotonically |
| 2628 | * unique serial number and always appended to the sibling list, | 3151 | * increasing unique serial number and always appended to the |
| 2629 | * the next one can be found by walking the parent's children until | 3152 | * sibling list, the next one can be found by walking the parent's |
| 2630 | * we see a cgroup with higher serial number than @pos's. While | 3153 | * children until the first css with higher serial number than |
| 2631 | * this path can be slower, it's taken only when either the current | 3154 | * @pos's. While this path can be slower, it happens iff iteration |
| 2632 | * cgroup is removed or iteration and removal race. | 3155 | * races against release and the race window is very small. |
| 2633 | */ | 3156 | */ |
| 2634 | if (!pos) { | 3157 | if (!pos) { |
| 2635 | next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); | 3158 | next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling); |
| 2636 | } else if (likely(!cgroup_is_dead(pos))) { | 3159 | } else if (likely(!(pos->flags & CSS_RELEASED))) { |
| 2637 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); | 3160 | next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); |
| 2638 | } else { | 3161 | } else { |
| 2639 | list_for_each_entry_rcu(next, &cgrp->children, sibling) | 3162 | list_for_each_entry_rcu(next, &parent->children, sibling) |
| 2640 | if (next->serial_nr > pos->serial_nr) | 3163 | if (next->serial_nr > pos->serial_nr) |
| 2641 | break; | 3164 | break; |
| 2642 | } | 3165 | } |
| 2643 | 3166 | ||
| 2644 | if (&next->sibling == &cgrp->children) | 3167 | /* |
| 2645 | return NULL; | 3168 | * @next, if not pointing to the head, can be dereferenced and is |
| 2646 | 3169 | * the next sibling. | |
| 2647 | return cgroup_css(next, parent_css->ss); | 3170 | */ |
| 3171 | if (&next->sibling != &parent->children) | ||
| 3172 | return next; | ||
| 3173 | return NULL; | ||
| 2648 | } | 3174 | } |
| 2649 | 3175 | ||
| 2650 | /** | 3176 | /** |
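
The rewritten css_next_child() above works on the css sibling list directly and, when @pos already has CSS_RELEASED set, finds the next sibling by scanning for the first child with a larger serial number. A compact userspace model of that fallback; the struct css here is invented for the example and ignores the RCU protection the kernel relies on:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct css {
	unsigned long serial_nr;	/* monotonically increasing */
	bool released;			/* stand-in for CSS_RELEASED */
	struct css *next;		/* next sibling */
	struct css *first_child;
};

/* next sibling of @pos under @parent, tolerating a released @pos */
static struct css *next_child(struct css *pos, struct css *parent)
{
	struct css *next;

	if (!pos) {
		next = parent->first_child;
	} else if (!pos->released) {
		next = pos->next;
	} else {
		/* serials only grow, so the first larger one is the next sibling */
		for (next = parent->first_child; next; next = next->next)
			if (next->serial_nr > pos->serial_nr)
				break;
	}
	return next;
}

int main(void)
{
	struct css c3 = { 3, false, NULL, NULL };
	struct css c2 = { 2, false, &c3, NULL };
	struct css c1 = { 1, false, &c2, NULL };
	struct css parent = { 0, false, NULL, &c1 };
	struct css *pos;

	for (pos = next_child(NULL, &parent); pos; pos = next_child(pos, &parent))
		printf("%lu ", pos->serial_nr);		/* 1 2 3 */
	printf("\n");

	/* remove c2 while an iterator still points at it */
	pos = &c2;
	c2.released = true;
	c1.next = &c3;
	printf("next after released 2: %lu\n", next_child(pos, &parent)->serial_nr);
	return 0;
}
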
| @@ -2660,6 +3186,13 @@ css_next_child(struct cgroup_subsys_state *pos_css, | |||
| 2660 | * doesn't require the whole traversal to be contained in a single critical | 3186 | * doesn't require the whole traversal to be contained in a single critical |
| 2661 | * section. This function will return the correct next descendant as long | 3187 | * section. This function will return the correct next descendant as long |
| 2662 | * as both @pos and @root are accessible and @pos is a descendant of @root. | 3188 | * as both @pos and @root are accessible and @pos is a descendant of @root. |
| 3189 | * | ||
| 3190 | * If a subsystem synchronizes ->css_online() and the start of iteration, a | ||
| 3191 | * css which finished ->css_online() is guaranteed to be visible in the | ||
| 3192 | * future iterations and will stay visible until the last reference is put. | ||
| 3193 | * A css which hasn't finished ->css_online() or already finished | ||
| 3194 | * ->css_offline() may show up during traversal. It's each subsystem's | ||
| 3195 | * responsibility to synchronize against on/offlining. | ||
| 2663 | */ | 3196 | */ |
| 2664 | struct cgroup_subsys_state * | 3197 | struct cgroup_subsys_state * |
| 2665 | css_next_descendant_pre(struct cgroup_subsys_state *pos, | 3198 | css_next_descendant_pre(struct cgroup_subsys_state *pos, |
| @@ -2667,7 +3200,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, | |||
| 2667 | { | 3200 | { |
| 2668 | struct cgroup_subsys_state *next; | 3201 | struct cgroup_subsys_state *next; |
| 2669 | 3202 | ||
| 2670 | cgroup_assert_mutexes_or_rcu_locked(); | 3203 | cgroup_assert_mutex_or_rcu_locked(); |
| 2671 | 3204 | ||
| 2672 | /* if first iteration, visit @root */ | 3205 | /* if first iteration, visit @root */ |
| 2673 | if (!pos) | 3206 | if (!pos) |
| @@ -2680,10 +3213,10 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, | |||
| 2680 | 3213 | ||
| 2681 | /* no child, visit my or the closest ancestor's next sibling */ | 3214 | /* no child, visit my or the closest ancestor's next sibling */ |
| 2682 | while (pos != root) { | 3215 | while (pos != root) { |
| 2683 | next = css_next_child(pos, css_parent(pos)); | 3216 | next = css_next_child(pos, pos->parent); |
| 2684 | if (next) | 3217 | if (next) |
| 2685 | return next; | 3218 | return next; |
| 2686 | pos = css_parent(pos); | 3219 | pos = pos->parent; |
| 2687 | } | 3220 | } |
| 2688 | 3221 | ||
| 2689 | return NULL; | 3222 | return NULL; |
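The on/offlining caveat documented above is left to each controller. One plausible shape of that pattern is sketched here; every ex_* name is invented, and a real controller would adapt the locking to its own needs. The idea is simply to serialize ->css_online()/->css_offline() and the controller's own walks behind one mutex, and to skip csses that have not completed onlining.

#include <linux/cgroup.h>
#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>

struct ex_css {
        struct cgroup_subsys_state css;
        bool online;                    /* protected by ex_mutex */
};

static DEFINE_MUTEX(ex_mutex);          /* serializes on/offline vs. iteration */

static inline struct ex_css *to_ex(struct cgroup_subsys_state *css)
{
        return container_of(css, struct ex_css, css);
}

static int ex_css_online(struct cgroup_subsys_state *css)
{
        mutex_lock(&ex_mutex);
        to_ex(css)->online = true;      /* now guaranteed visible to ex_walk() */
        mutex_unlock(&ex_mutex);
        return 0;
}

static void ex_css_offline(struct cgroup_subsys_state *css)
{
        mutex_lock(&ex_mutex);
        to_ex(css)->online = false;
        mutex_unlock(&ex_mutex);
}

static void ex_walk(struct cgroup_subsys_state *root)
{
        struct cgroup_subsys_state *pos;

        mutex_lock(&ex_mutex);
        rcu_read_lock();
        css_for_each_descendant_pre(pos, root) {
                if (!to_ex(pos)->online)
                        continue;       /* not yet online or already offlined */
                /* ... operate on to_ex(pos) ... */
        }
        rcu_read_unlock();
        mutex_unlock(&ex_mutex);
}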
| @@ -2707,7 +3240,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos) | |||
| 2707 | { | 3240 | { |
| 2708 | struct cgroup_subsys_state *last, *tmp; | 3241 | struct cgroup_subsys_state *last, *tmp; |
| 2709 | 3242 | ||
| 2710 | cgroup_assert_mutexes_or_rcu_locked(); | 3243 | cgroup_assert_mutex_or_rcu_locked(); |
| 2711 | 3244 | ||
| 2712 | do { | 3245 | do { |
| 2713 | last = pos; | 3246 | last = pos; |
| @@ -2747,6 +3280,13 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) | |||
| 2747 | * section. This function will return the correct next descendant as long | 3280 | * section. This function will return the correct next descendant as long |
| 2748 | * as both @pos and @cgroup are accessible and @pos is a descendant of | 3281 | * as both @pos and @cgroup are accessible and @pos is a descendant of |
| 2749 | * @cgroup. | 3282 | * @cgroup. |
| 3283 | * | ||
| 3284 | * If a subsystem synchronizes ->css_online() and the start of iteration, a | ||
| 3285 | * css which finished ->css_online() is guaranteed to be visible in the | ||
| 3286 | * future iterations and will stay visible until the last reference is put. | ||
| 3287 | * A css which hasn't finished ->css_online() or already finished | ||
| 3288 | * ->css_offline() may show up during traversal. It's each subsystem's | ||
| 3289 | * responsibility to synchronize against on/offlining. | ||
| 2750 | */ | 3290 | */ |
| 2751 | struct cgroup_subsys_state * | 3291 | struct cgroup_subsys_state * |
| 2752 | css_next_descendant_post(struct cgroup_subsys_state *pos, | 3292 | css_next_descendant_post(struct cgroup_subsys_state *pos, |
| @@ -2754,7 +3294,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, | |||
| 2754 | { | 3294 | { |
| 2755 | struct cgroup_subsys_state *next; | 3295 | struct cgroup_subsys_state *next; |
| 2756 | 3296 | ||
| 2757 | cgroup_assert_mutexes_or_rcu_locked(); | 3297 | cgroup_assert_mutex_or_rcu_locked(); |
| 2758 | 3298 | ||
| 2759 | /* if first iteration, visit leftmost descendant which may be @root */ | 3299 | /* if first iteration, visit leftmost descendant which may be @root */ |
| 2760 | if (!pos) | 3300 | if (!pos) |
| @@ -2765,12 +3305,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, | |||
| 2765 | return NULL; | 3305 | return NULL; |
| 2766 | 3306 | ||
| 2767 | /* if there's an unvisited sibling, visit its leftmost descendant */ | 3307 | /* if there's an unvisited sibling, visit its leftmost descendant */ |
| 2768 | next = css_next_child(pos, css_parent(pos)); | 3308 | next = css_next_child(pos, pos->parent); |
| 2769 | if (next) | 3309 | if (next) |
| 2770 | return css_leftmost_descendant(next); | 3310 | return css_leftmost_descendant(next); |
| 2771 | 3311 | ||
| 2772 | /* no sibling left, visit parent */ | 3312 | /* no sibling left, visit parent */ |
| 2773 | return css_parent(pos); | 3313 | return pos->parent; |
| 3314 | } | ||
| 3315 | |||
| 3316 | /** | ||
| 3317 | * css_has_online_children - does a css have online children | ||
| 3318 | * @css: the target css | ||
| 3319 | * | ||
| 3320 | * Returns %true if @css has any online children; otherwise, %false. This | ||
| 3321 | * function can be called from any context but the caller is responsible | ||
| 3322 | * for synchronizing against on/offlining as necessary. | ||
| 3323 | */ | ||
| 3324 | bool css_has_online_children(struct cgroup_subsys_state *css) | ||
| 3325 | { | ||
| 3326 | struct cgroup_subsys_state *child; | ||
| 3327 | bool ret = false; | ||
| 3328 | |||
| 3329 | rcu_read_lock(); | ||
| 3330 | css_for_each_child(child, css) { | ||
| 3331 | if (child->flags & CSS_ONLINE) { | ||
| 3332 | ret = true; | ||
| 3333 | break; | ||
| 3334 | } | ||
| 3335 | } | ||
| 3336 | rcu_read_unlock(); | ||
| 3337 | return ret; | ||
| 2774 | } | 3338 | } |
| 2775 | 3339 | ||
| 2776 | /** | 3340 | /** |
| @@ -2781,27 +3345,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, | |||
| 2781 | */ | 3345 | */ |
| 2782 | static void css_advance_task_iter(struct css_task_iter *it) | 3346 | static void css_advance_task_iter(struct css_task_iter *it) |
| 2783 | { | 3347 | { |
| 2784 | struct list_head *l = it->cset_link; | 3348 | struct list_head *l = it->cset_pos; |
| 2785 | struct cgrp_cset_link *link; | 3349 | struct cgrp_cset_link *link; |
| 2786 | struct css_set *cset; | 3350 | struct css_set *cset; |
| 2787 | 3351 | ||
| 2788 | /* Advance to the next non-empty css_set */ | 3352 | /* Advance to the next non-empty css_set */ |
| 2789 | do { | 3353 | do { |
| 2790 | l = l->next; | 3354 | l = l->next; |
| 2791 | if (l == &it->origin_css->cgroup->cset_links) { | 3355 | if (l == it->cset_head) { |
| 2792 | it->cset_link = NULL; | 3356 | it->cset_pos = NULL; |
| 2793 | return; | 3357 | return; |
| 2794 | } | 3358 | } |
| 2795 | link = list_entry(l, struct cgrp_cset_link, cset_link); | 3359 | |
| 2796 | cset = link->cset; | 3360 | if (it->ss) { |
| 3361 | cset = container_of(l, struct css_set, | ||
| 3362 | e_cset_node[it->ss->id]); | ||
| 3363 | } else { | ||
| 3364 | link = list_entry(l, struct cgrp_cset_link, cset_link); | ||
| 3365 | cset = link->cset; | ||
| 3366 | } | ||
| 2797 | } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); | 3367 | } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); |
| 2798 | 3368 | ||
| 2799 | it->cset_link = l; | 3369 | it->cset_pos = l; |
| 2800 | 3370 | ||
| 2801 | if (!list_empty(&cset->tasks)) | 3371 | if (!list_empty(&cset->tasks)) |
| 2802 | it->task = cset->tasks.next; | 3372 | it->task_pos = cset->tasks.next; |
| 2803 | else | 3373 | else |
| 2804 | it->task = cset->mg_tasks.next; | 3374 | it->task_pos = cset->mg_tasks.next; |
| 3375 | |||
| 3376 | it->tasks_head = &cset->tasks; | ||
| 3377 | it->mg_tasks_head = &cset->mg_tasks; | ||
| 2805 | } | 3378 | } |
| 2806 | 3379 | ||
| 2807 | /** | 3380 | /** |
| @@ -2827,8 +3400,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css, | |||
| 2827 | 3400 | ||
| 2828 | down_read(&css_set_rwsem); | 3401 | down_read(&css_set_rwsem); |
| 2829 | 3402 | ||
| 2830 | it->origin_css = css; | 3403 | it->ss = css->ss; |
| 2831 | it->cset_link = &css->cgroup->cset_links; | 3404 | |
| 3405 | if (it->ss) | ||
| 3406 | it->cset_pos = &css->cgroup->e_csets[css->ss->id]; | ||
| 3407 | else | ||
| 3408 | it->cset_pos = &css->cgroup->cset_links; | ||
| 3409 | |||
| 3410 | it->cset_head = it->cset_pos; | ||
| 2832 | 3411 | ||
| 2833 | css_advance_task_iter(it); | 3412 | css_advance_task_iter(it); |
| 2834 | } | 3413 | } |
| @@ -2844,12 +3423,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css, | |||
| 2844 | struct task_struct *css_task_iter_next(struct css_task_iter *it) | 3423 | struct task_struct *css_task_iter_next(struct css_task_iter *it) |
| 2845 | { | 3424 | { |
| 2846 | struct task_struct *res; | 3425 | struct task_struct *res; |
| 2847 | struct list_head *l = it->task; | 3426 | struct list_head *l = it->task_pos; |
| 2848 | struct cgrp_cset_link *link = list_entry(it->cset_link, | ||
| 2849 | struct cgrp_cset_link, cset_link); | ||
| 2850 | 3427 | ||
| 2851 | /* If the iterator cg is NULL, we have no tasks */ | 3428 | /* If the iterator cg is NULL, we have no tasks */ |
| 2852 | if (!it->cset_link) | 3429 | if (!it->cset_pos) |
| 2853 | return NULL; | 3430 | return NULL; |
| 2854 | res = list_entry(l, struct task_struct, cg_list); | 3431 | res = list_entry(l, struct task_struct, cg_list); |
| 2855 | 3432 | ||
| @@ -2860,13 +3437,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) | |||
| 2860 | */ | 3437 | */ |
| 2861 | l = l->next; | 3438 | l = l->next; |
| 2862 | 3439 | ||
| 2863 | if (l == &link->cset->tasks) | 3440 | if (l == it->tasks_head) |
| 2864 | l = link->cset->mg_tasks.next; | 3441 | l = it->mg_tasks_head->next; |
| 2865 | 3442 | ||
| 2866 | if (l == &link->cset->mg_tasks) | 3443 | if (l == it->mg_tasks_head) |
| 2867 | css_advance_task_iter(it); | 3444 | css_advance_task_iter(it); |
| 2868 | else | 3445 | else |
| 2869 | it->task = l; | 3446 | it->task_pos = l; |
| 2870 | 3447 | ||
| 2871 | return res; | 3448 | return res; |
| 2872 | } | 3449 | } |
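For context, a consumer of this iterator follows the start/next/end pattern used by cgroup_transfer_tasks() and cgroupstats_build() further down. The fragment below is only a usage sketch (ex_scan_tasks() and ex_visit() are invented names); css_task_iter_end(), which drops css_set_rwsem again, is part of the existing iterator API even though it falls outside this hunk.

#include <linux/cgroup.h>
#include <linux/sched.h>

static void ex_visit(struct task_struct *task)
{
        /* per-task work; runs with css_set_rwsem read-held by the iterator */
}

static void ex_scan_tasks(struct cgroup_subsys_state *css)
{
        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(css, &it);
        while ((task = css_task_iter_next(&it)))
                ex_visit(task);
        css_task_iter_end(&it);
}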
| @@ -2919,7 +3496,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | |||
| 2919 | * ->can_attach() fails. | 3496 | * ->can_attach() fails. |
| 2920 | */ | 3497 | */ |
| 2921 | do { | 3498 | do { |
| 2922 | css_task_iter_start(&from->dummy_css, &it); | 3499 | css_task_iter_start(&from->self, &it); |
| 2923 | task = css_task_iter_next(&it); | 3500 | task = css_task_iter_next(&it); |
| 2924 | if (task) | 3501 | if (task) |
| 2925 | get_task_struct(task); | 3502 | get_task_struct(task); |
| @@ -3184,7 +3761,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
| 3184 | if (!array) | 3761 | if (!array) |
| 3185 | return -ENOMEM; | 3762 | return -ENOMEM; |
| 3186 | /* now, populate the array */ | 3763 | /* now, populate the array */ |
| 3187 | css_task_iter_start(&cgrp->dummy_css, &it); | 3764 | css_task_iter_start(&cgrp->self, &it); |
| 3188 | while ((tsk = css_task_iter_next(&it))) { | 3765 | while ((tsk = css_task_iter_next(&it))) { |
| 3189 | if (unlikely(n == length)) | 3766 | if (unlikely(n == length)) |
| 3190 | break; | 3767 | break; |
| @@ -3246,7 +3823,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
| 3246 | 3823 | ||
| 3247 | /* | 3824 | /* |
| 3248 | * We aren't being called from kernfs and there's no guarantee on | 3825 | * We aren't being called from kernfs and there's no guarantee on |
| 3249 | * @kn->priv's validity. For this and css_tryget_from_dir(), | 3826 | * @kn->priv's validity. For this and css_tryget_online_from_dir(), |
| 3250 | * @kn->priv is RCU safe. Let's do the RCU dancing. | 3827 | * @kn->priv is RCU safe. Let's do the RCU dancing. |
| 3251 | */ | 3828 | */ |
| 3252 | rcu_read_lock(); | 3829 | rcu_read_lock(); |
| @@ -3258,7 +3835,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
| 3258 | } | 3835 | } |
| 3259 | rcu_read_unlock(); | 3836 | rcu_read_unlock(); |
| 3260 | 3837 | ||
| 3261 | css_task_iter_start(&cgrp->dummy_css, &it); | 3838 | css_task_iter_start(&cgrp->self, &it); |
| 3262 | while ((tsk = css_task_iter_next(&it))) { | 3839 | while ((tsk = css_task_iter_next(&it))) { |
| 3263 | switch (tsk->state) { | 3840 | switch (tsk->state) { |
| 3264 | case TASK_RUNNING: | 3841 | case TASK_RUNNING: |
| @@ -3388,17 +3965,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v) | |||
| 3388 | return seq_printf(s, "%d\n", *(int *)v); | 3965 | return seq_printf(s, "%d\n", *(int *)v); |
| 3389 | } | 3966 | } |
| 3390 | 3967 | ||
| 3391 | /* | ||
| 3392 | * seq_operations functions for iterating on pidlists through seq_file - | ||
| 3393 | * independent of whether it's tasks or procs | ||
| 3394 | */ | ||
| 3395 | static const struct seq_operations cgroup_pidlist_seq_operations = { | ||
| 3396 | .start = cgroup_pidlist_start, | ||
| 3397 | .stop = cgroup_pidlist_stop, | ||
| 3398 | .next = cgroup_pidlist_next, | ||
| 3399 | .show = cgroup_pidlist_show, | ||
| 3400 | }; | ||
| 3401 | |||
| 3402 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, | 3968 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, |
| 3403 | struct cftype *cft) | 3969 | struct cftype *cft) |
| 3404 | { | 3970 | { |
| @@ -3440,7 +4006,7 @@ static struct cftype cgroup_base_files[] = { | |||
| 3440 | .seq_stop = cgroup_pidlist_stop, | 4006 | .seq_stop = cgroup_pidlist_stop, |
| 3441 | .seq_show = cgroup_pidlist_show, | 4007 | .seq_show = cgroup_pidlist_show, |
| 3442 | .private = CGROUP_FILE_PROCS, | 4008 | .private = CGROUP_FILE_PROCS, |
| 3443 | .write_u64 = cgroup_procs_write, | 4009 | .write = cgroup_procs_write, |
| 3444 | .mode = S_IRUGO | S_IWUSR, | 4010 | .mode = S_IRUGO | S_IWUSR, |
| 3445 | }, | 4011 | }, |
| 3446 | { | 4012 | { |
| @@ -3454,6 +4020,27 @@ static struct cftype cgroup_base_files[] = { | |||
| 3454 | .flags = CFTYPE_ONLY_ON_ROOT, | 4020 | .flags = CFTYPE_ONLY_ON_ROOT, |
| 3455 | .seq_show = cgroup_sane_behavior_show, | 4021 | .seq_show = cgroup_sane_behavior_show, |
| 3456 | }, | 4022 | }, |
| 4023 | { | ||
| 4024 | .name = "cgroup.controllers", | ||
| 4025 | .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, | ||
| 4026 | .seq_show = cgroup_root_controllers_show, | ||
| 4027 | }, | ||
| 4028 | { | ||
| 4029 | .name = "cgroup.controllers", | ||
| 4030 | .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, | ||
| 4031 | .seq_show = cgroup_controllers_show, | ||
| 4032 | }, | ||
| 4033 | { | ||
| 4034 | .name = "cgroup.subtree_control", | ||
| 4035 | .flags = CFTYPE_ONLY_ON_DFL, | ||
| 4036 | .seq_show = cgroup_subtree_control_show, | ||
| 4037 | .write = cgroup_subtree_control_write, | ||
| 4038 | }, | ||
| 4039 | { | ||
| 4040 | .name = "cgroup.populated", | ||
| 4041 | .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, | ||
| 4042 | .seq_show = cgroup_populated_show, | ||
| 4043 | }, | ||
| 3457 | 4044 | ||
| 3458 | /* | 4045 | /* |
| 3459 | * Historical crazy stuff. These don't have "cgroup." prefix and | 4046 | * Historical crazy stuff. These don't have "cgroup." prefix and |
| @@ -3468,7 +4055,7 @@ static struct cftype cgroup_base_files[] = { | |||
| 3468 | .seq_stop = cgroup_pidlist_stop, | 4055 | .seq_stop = cgroup_pidlist_stop, |
| 3469 | .seq_show = cgroup_pidlist_show, | 4056 | .seq_show = cgroup_pidlist_show, |
| 3470 | .private = CGROUP_FILE_TASKS, | 4057 | .private = CGROUP_FILE_TASKS, |
| 3471 | .write_u64 = cgroup_tasks_write, | 4058 | .write = cgroup_tasks_write, |
| 3472 | .mode = S_IRUGO | S_IWUSR, | 4059 | .mode = S_IRUGO | S_IWUSR, |
| 3473 | }, | 4060 | }, |
| 3474 | { | 4061 | { |
| @@ -3481,7 +4068,7 @@ static struct cftype cgroup_base_files[] = { | |||
| 3481 | .name = "release_agent", | 4068 | .name = "release_agent", |
| 3482 | .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, | 4069 | .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, |
| 3483 | .seq_show = cgroup_release_agent_show, | 4070 | .seq_show = cgroup_release_agent_show, |
| 3484 | .write_string = cgroup_release_agent_write, | 4071 | .write = cgroup_release_agent_write, |
| 3485 | .max_write_len = PATH_MAX - 1, | 4072 | .max_write_len = PATH_MAX - 1, |
| 3486 | }, | 4073 | }, |
| 3487 | { } /* terminate */ | 4074 | { } /* terminate */ |
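The .write_u64 and .write_string slots above are folded into the single ->write callback, which receives the kernfs buffer directly. A hypothetical controller file using that signature might look like the sketch below; the ex_* names and the global counter are purely illustrative, and the signature assumed here is the one the converted handlers in this patch use.

#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/kernel.h>
#include <linux/kernfs.h>
#include <linux/seq_file.h>
#include <linux/string.h>

static atomic_t ex_limit = ATOMIC_INIT(0);

static int ex_limit_show(struct seq_file *sf, void *v)
{
        seq_printf(sf, "%d\n", atomic_read(&ex_limit));
        return 0;
}

static ssize_t ex_limit_write(struct kernfs_open_file *of, char *buf,
                              size_t nbytes, loff_t off)
{
        int val, ret;

        ret = kstrtoint(strstrip(buf), 0, &val);
        if (ret)
                return ret;
        atomic_set(&ex_limit, val);
        return nbytes;                  /* consume the whole write */
}

static struct cftype ex_files[] = {
        {
                .name = "limit",
                .seq_show = ex_limit_show,
                .write = ex_limit_write,
        },
        { }     /* terminate */
};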
| @@ -3494,7 +4081,7 @@ static struct cftype cgroup_base_files[] = { | |||
| 3494 | * | 4081 | * |
| 3495 | * On failure, no file is added. | 4082 | * On failure, no file is added. |
| 3496 | */ | 4083 | */ |
| 3497 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) | 4084 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask) |
| 3498 | { | 4085 | { |
| 3499 | struct cgroup_subsys *ss; | 4086 | struct cgroup_subsys *ss; |
| 3500 | int i, ret = 0; | 4087 | int i, ret = 0; |
| @@ -3503,7 +4090,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) | |||
| 3503 | for_each_subsys(ss, i) { | 4090 | for_each_subsys(ss, i) { |
| 3504 | struct cftype *cfts; | 4091 | struct cftype *cfts; |
| 3505 | 4092 | ||
| 3506 | if (!test_bit(i, &subsys_mask)) | 4093 | if (!(subsys_mask & (1 << i))) |
| 3507 | continue; | 4094 | continue; |
| 3508 | 4095 | ||
| 3509 | list_for_each_entry(cfts, &ss->cfts, node) { | 4096 | list_for_each_entry(cfts, &ss->cfts, node) { |
| @@ -3525,9 +4112,9 @@ err: | |||
| 3525 | * Implemented in kill_css(). | 4112 | * Implemented in kill_css(). |
| 3526 | * | 4113 | * |
| 3527 | * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs | 4114 | * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs |
| 3528 | * and thus css_tryget() is guaranteed to fail, the css can be offlined | 4115 | * and thus css_tryget_online() is guaranteed to fail, the css can be |
| 3529 | * by invoking offline_css(). After offlining, the base ref is put. | 4116 | * offlined by invoking offline_css(). After offlining, the base ref is |
| 3530 | * Implemented in css_killed_work_fn(). | 4117 | * put. Implemented in css_killed_work_fn(). |
| 3531 | * | 4118 | * |
| 3532 | * 3. When the percpu_ref reaches zero, the only possible remaining | 4119 | * 3. When the percpu_ref reaches zero, the only possible remaining |
| 3533 | * accessors are inside RCU read sections. css_release() schedules the | 4120 | * accessors are inside RCU read sections. css_release() schedules the |
| @@ -3546,11 +4133,37 @@ static void css_free_work_fn(struct work_struct *work) | |||
| 3546 | container_of(work, struct cgroup_subsys_state, destroy_work); | 4133 | container_of(work, struct cgroup_subsys_state, destroy_work); |
| 3547 | struct cgroup *cgrp = css->cgroup; | 4134 | struct cgroup *cgrp = css->cgroup; |
| 3548 | 4135 | ||
| 3549 | if (css->parent) | 4136 | if (css->ss) { |
| 3550 | css_put(css->parent); | 4137 | /* css free path */ |
| 4138 | if (css->parent) | ||
| 4139 | css_put(css->parent); | ||
| 3551 | 4140 | ||
| 3552 | css->ss->css_free(css); | 4141 | css->ss->css_free(css); |
| 3553 | cgroup_put(cgrp); | 4142 | cgroup_put(cgrp); |
| 4143 | } else { | ||
| 4144 | /* cgroup free path */ | ||
| 4145 | atomic_dec(&cgrp->root->nr_cgrps); | ||
| 4146 | cgroup_pidlist_destroy_all(cgrp); | ||
| 4147 | |||
| 4148 | if (cgroup_parent(cgrp)) { | ||
| 4149 | /* | ||
| 4150 | * We get a ref to the parent, and put the ref when | ||
| 4151 | * this cgroup is being freed, so it's guaranteed | ||
| 4152 | * that the parent won't be destroyed before its | ||
| 4153 | * children. | ||
| 4154 | */ | ||
| 4155 | cgroup_put(cgroup_parent(cgrp)); | ||
| 4156 | kernfs_put(cgrp->kn); | ||
| 4157 | kfree(cgrp); | ||
| 4158 | } else { | ||
| 4159 | /* | ||
| 4160 | * This is root cgroup's refcnt reaching zero, | ||
| 4161 | * which indicates that the root should be | ||
| 4162 | * released. | ||
| 4163 | */ | ||
| 4164 | cgroup_destroy_root(cgrp->root); | ||
| 4165 | } | ||
| 4166 | } | ||
| 3554 | } | 4167 | } |
| 3555 | 4168 | ||
| 3556 | static void css_free_rcu_fn(struct rcu_head *rcu_head) | 4169 | static void css_free_rcu_fn(struct rcu_head *rcu_head) |
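The parent-pinning rule in the cgroup free path above (each child takes a reference on its parent at creation and drops it from its own free path) can be modelled in a few lines of plain C. Everything below (ex_node, ex_new(), ex_put()) is illustrative only.

#include <stdio.h>
#include <stdlib.h>

struct ex_node {
        int refcnt;
        struct ex_node *parent;
};

static void ex_put(struct ex_node *n)
{
        while (n && --n->refcnt == 0) {
                struct ex_node *parent = n->parent;

                printf("freeing %p\n", (void *)n);
                free(n);
                n = parent;     /* drop the ref this child held on its parent */
        }
}

static struct ex_node *ex_new(struct ex_node *parent)
{
        struct ex_node *n = calloc(1, sizeof(*n));

        if (!n)
                abort();
        n->refcnt = 1;                  /* base reference */
        n->parent = parent;
        if (parent)
                parent->refcnt++;       /* child keeps its parent alive */
        return n;
}

int main(void)
{
        struct ex_node *root = ex_new(NULL);
        struct ex_node *child = ex_new(root);

        ex_put(root);   /* root's base ref drops, but the child still pins it */
        ex_put(child);  /* frees the child, then finally the root */
        return 0;
}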
| @@ -3562,26 +4175,59 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) | |||
| 3562 | queue_work(cgroup_destroy_wq, &css->destroy_work); | 4175 | queue_work(cgroup_destroy_wq, &css->destroy_work); |
| 3563 | } | 4176 | } |
| 3564 | 4177 | ||
| 4178 | static void css_release_work_fn(struct work_struct *work) | ||
| 4179 | { | ||
| 4180 | struct cgroup_subsys_state *css = | ||
| 4181 | container_of(work, struct cgroup_subsys_state, destroy_work); | ||
| 4182 | struct cgroup_subsys *ss = css->ss; | ||
| 4183 | struct cgroup *cgrp = css->cgroup; | ||
| 4184 | |||
| 4185 | mutex_lock(&cgroup_mutex); | ||
| 4186 | |||
| 4187 | css->flags |= CSS_RELEASED; | ||
| 4188 | list_del_rcu(&css->sibling); | ||
| 4189 | |||
| 4190 | if (ss) { | ||
| 4191 | /* css release path */ | ||
| 4192 | cgroup_idr_remove(&ss->css_idr, css->id); | ||
| 4193 | } else { | ||
| 4194 | /* cgroup release path */ | ||
| 4195 | cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); | ||
| 4196 | cgrp->id = -1; | ||
| 4197 | } | ||
| 4198 | |||
| 4199 | mutex_unlock(&cgroup_mutex); | ||
| 4200 | |||
| 4201 | call_rcu(&css->rcu_head, css_free_rcu_fn); | ||
| 4202 | } | ||
| 4203 | |||
| 3565 | static void css_release(struct percpu_ref *ref) | 4204 | static void css_release(struct percpu_ref *ref) |
| 3566 | { | 4205 | { |
| 3567 | struct cgroup_subsys_state *css = | 4206 | struct cgroup_subsys_state *css = |
| 3568 | container_of(ref, struct cgroup_subsys_state, refcnt); | 4207 | container_of(ref, struct cgroup_subsys_state, refcnt); |
| 3569 | 4208 | ||
| 3570 | RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); | 4209 | INIT_WORK(&css->destroy_work, css_release_work_fn); |
| 3571 | call_rcu(&css->rcu_head, css_free_rcu_fn); | 4210 | queue_work(cgroup_destroy_wq, &css->destroy_work); |
| 3572 | } | 4211 | } |
| 3573 | 4212 | ||
| 3574 | static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, | 4213 | static void init_and_link_css(struct cgroup_subsys_state *css, |
| 3575 | struct cgroup *cgrp) | 4214 | struct cgroup_subsys *ss, struct cgroup *cgrp) |
| 3576 | { | 4215 | { |
| 4216 | lockdep_assert_held(&cgroup_mutex); | ||
| 4217 | |||
| 4218 | cgroup_get(cgrp); | ||
| 4219 | |||
| 4220 | memset(css, 0, sizeof(*css)); | ||
| 3577 | css->cgroup = cgrp; | 4221 | css->cgroup = cgrp; |
| 3578 | css->ss = ss; | 4222 | css->ss = ss; |
| 3579 | css->flags = 0; | 4223 | INIT_LIST_HEAD(&css->sibling); |
| 4224 | INIT_LIST_HEAD(&css->children); | ||
| 4225 | css->serial_nr = css_serial_nr_next++; | ||
| 3580 | 4226 | ||
| 3581 | if (cgrp->parent) | 4227 | if (cgroup_parent(cgrp)) { |
| 3582 | css->parent = cgroup_css(cgrp->parent, ss); | 4228 | css->parent = cgroup_css(cgroup_parent(cgrp), ss); |
| 3583 | else | 4229 | css_get(css->parent); |
| 3584 | css->flags |= CSS_ROOT; | 4230 | } |
| 3585 | 4231 | ||
| 3586 | BUG_ON(cgroup_css(cgrp, ss)); | 4232 | BUG_ON(cgroup_css(cgrp, ss)); |
| 3587 | } | 4233 | } |
| @@ -3592,14 +4238,12 @@ static int online_css(struct cgroup_subsys_state *css) | |||
| 3592 | struct cgroup_subsys *ss = css->ss; | 4238 | struct cgroup_subsys *ss = css->ss; |
| 3593 | int ret = 0; | 4239 | int ret = 0; |
| 3594 | 4240 | ||
| 3595 | lockdep_assert_held(&cgroup_tree_mutex); | ||
| 3596 | lockdep_assert_held(&cgroup_mutex); | 4241 | lockdep_assert_held(&cgroup_mutex); |
| 3597 | 4242 | ||
| 3598 | if (ss->css_online) | 4243 | if (ss->css_online) |
| 3599 | ret = ss->css_online(css); | 4244 | ret = ss->css_online(css); |
| 3600 | if (!ret) { | 4245 | if (!ret) { |
| 3601 | css->flags |= CSS_ONLINE; | 4246 | css->flags |= CSS_ONLINE; |
| 3602 | css->cgroup->nr_css++; | ||
| 3603 | rcu_assign_pointer(css->cgroup->subsys[ss->id], css); | 4247 | rcu_assign_pointer(css->cgroup->subsys[ss->id], css); |
| 3604 | } | 4248 | } |
| 3605 | return ret; | 4249 | return ret; |
| @@ -3610,7 +4254,6 @@ static void offline_css(struct cgroup_subsys_state *css) | |||
| 3610 | { | 4254 | { |
| 3611 | struct cgroup_subsys *ss = css->ss; | 4255 | struct cgroup_subsys *ss = css->ss; |
| 3612 | 4256 | ||
| 3613 | lockdep_assert_held(&cgroup_tree_mutex); | ||
| 3614 | lockdep_assert_held(&cgroup_mutex); | 4257 | lockdep_assert_held(&cgroup_mutex); |
| 3615 | 4258 | ||
| 3616 | if (!(css->flags & CSS_ONLINE)) | 4259 | if (!(css->flags & CSS_ONLINE)) |
| @@ -3620,8 +4263,9 @@ static void offline_css(struct cgroup_subsys_state *css) | |||
| 3620 | ss->css_offline(css); | 4263 | ss->css_offline(css); |
| 3621 | 4264 | ||
| 3622 | css->flags &= ~CSS_ONLINE; | 4265 | css->flags &= ~CSS_ONLINE; |
| 3623 | css->cgroup->nr_css--; | 4266 | RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); |
| 3624 | RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); | 4267 | |
| 4268 | wake_up_all(&css->cgroup->offline_waitq); | ||
| 3625 | } | 4269 | } |
| 3626 | 4270 | ||
| 3627 | /** | 4271 | /** |
| @@ -3635,111 +4279,102 @@ static void offline_css(struct cgroup_subsys_state *css) | |||
| 3635 | */ | 4279 | */ |
| 3636 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) | 4280 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) |
| 3637 | { | 4281 | { |
| 3638 | struct cgroup *parent = cgrp->parent; | 4282 | struct cgroup *parent = cgroup_parent(cgrp); |
| 4283 | struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); | ||
| 3639 | struct cgroup_subsys_state *css; | 4284 | struct cgroup_subsys_state *css; |
| 3640 | int err; | 4285 | int err; |
| 3641 | 4286 | ||
| 3642 | lockdep_assert_held(&cgroup_mutex); | 4287 | lockdep_assert_held(&cgroup_mutex); |
| 3643 | 4288 | ||
| 3644 | css = ss->css_alloc(cgroup_css(parent, ss)); | 4289 | css = ss->css_alloc(parent_css); |
| 3645 | if (IS_ERR(css)) | 4290 | if (IS_ERR(css)) |
| 3646 | return PTR_ERR(css); | 4291 | return PTR_ERR(css); |
| 3647 | 4292 | ||
| 4293 | init_and_link_css(css, ss, cgrp); | ||
| 4294 | |||
| 3648 | err = percpu_ref_init(&css->refcnt, css_release); | 4295 | err = percpu_ref_init(&css->refcnt, css_release); |
| 3649 | if (err) | 4296 | if (err) |
| 3650 | goto err_free_css; | 4297 | goto err_free_css; |
| 3651 | 4298 | ||
| 3652 | init_css(css, ss, cgrp); | 4299 | err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); |
| 4300 | if (err < 0) | ||
| 4301 | goto err_free_percpu_ref; | ||
| 4302 | css->id = err; | ||
| 3653 | 4303 | ||
| 3654 | err = cgroup_populate_dir(cgrp, 1 << ss->id); | 4304 | err = cgroup_populate_dir(cgrp, 1 << ss->id); |
| 3655 | if (err) | 4305 | if (err) |
| 3656 | goto err_free_percpu_ref; | 4306 | goto err_free_id; |
| 4307 | |||
| 4308 | /* @css is ready to be brought online now, make it visible */ | ||
| 4309 | list_add_tail_rcu(&css->sibling, &parent_css->children); | ||
| 4310 | cgroup_idr_replace(&ss->css_idr, css, css->id); | ||
| 3657 | 4311 | ||
| 3658 | err = online_css(css); | 4312 | err = online_css(css); |
| 3659 | if (err) | 4313 | if (err) |
| 3660 | goto err_clear_dir; | 4314 | goto err_list_del; |
| 3661 | |||
| 3662 | cgroup_get(cgrp); | ||
| 3663 | css_get(css->parent); | ||
| 3664 | |||
| 3665 | cgrp->subsys_mask |= 1 << ss->id; | ||
| 3666 | 4315 | ||
| 3667 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && | 4316 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && |
| 3668 | parent->parent) { | 4317 | cgroup_parent(parent)) { |
| 3669 | pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", | 4318 | pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", |
| 3670 | current->comm, current->pid, ss->name); | 4319 | current->comm, current->pid, ss->name); |
| 3671 | if (!strcmp(ss->name, "memory")) | 4320 | if (!strcmp(ss->name, "memory")) |
| 3672 | pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); | 4321 | pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n"); |
| 3673 | ss->warned_broken_hierarchy = true; | 4322 | ss->warned_broken_hierarchy = true; |
| 3674 | } | 4323 | } |
| 3675 | 4324 | ||
| 3676 | return 0; | 4325 | return 0; |
| 3677 | 4326 | ||
| 3678 | err_clear_dir: | 4327 | err_list_del: |
| 4328 | list_del_rcu(&css->sibling); | ||
| 3679 | cgroup_clear_dir(css->cgroup, 1 << css->ss->id); | 4329 | cgroup_clear_dir(css->cgroup, 1 << css->ss->id); |
| 4330 | err_free_id: | ||
| 4331 | cgroup_idr_remove(&ss->css_idr, css->id); | ||
| 3680 | err_free_percpu_ref: | 4332 | err_free_percpu_ref: |
| 3681 | percpu_ref_cancel_init(&css->refcnt); | 4333 | percpu_ref_cancel_init(&css->refcnt); |
| 3682 | err_free_css: | 4334 | err_free_css: |
| 3683 | ss->css_free(css); | 4335 | call_rcu(&css->rcu_head, css_free_rcu_fn); |
| 3684 | return err; | 4336 | return err; |
| 3685 | } | 4337 | } |
| 3686 | 4338 | ||
| 3687 | /** | 4339 | static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, |
| 3688 | * cgroup_create - create a cgroup | 4340 | umode_t mode) |
| 3689 | * @parent: cgroup that will be parent of the new cgroup | ||
| 3690 | * @name: name of the new cgroup | ||
| 3691 | * @mode: mode to set on new cgroup | ||
| 3692 | */ | ||
| 3693 | static long cgroup_create(struct cgroup *parent, const char *name, | ||
| 3694 | umode_t mode) | ||
| 3695 | { | 4341 | { |
| 3696 | struct cgroup *cgrp; | 4342 | struct cgroup *parent, *cgrp; |
| 3697 | struct cgroup_root *root = parent->root; | 4343 | struct cgroup_root *root; |
| 3698 | int ssid, err; | ||
| 3699 | struct cgroup_subsys *ss; | 4344 | struct cgroup_subsys *ss; |
| 3700 | struct kernfs_node *kn; | 4345 | struct kernfs_node *kn; |
| 4346 | int ssid, ret; | ||
| 3701 | 4347 | ||
| 3702 | /* | 4348 | parent = cgroup_kn_lock_live(parent_kn); |
| 3703 | * XXX: The default hierarchy isn't fully implemented yet. Block | 4349 | if (!parent) |
| 3704 | * !root cgroup creation on it for now. | 4350 | return -ENODEV; |
| 3705 | */ | 4351 | root = parent->root; |
| 3706 | if (root == &cgrp_dfl_root) | ||
| 3707 | return -EINVAL; | ||
| 3708 | 4352 | ||
| 3709 | /* allocate the cgroup and its ID, 0 is reserved for the root */ | 4353 | /* allocate the cgroup and its ID, 0 is reserved for the root */ |
| 3710 | cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); | 4354 | cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); |
| 3711 | if (!cgrp) | 4355 | if (!cgrp) { |
| 3712 | return -ENOMEM; | 4356 | ret = -ENOMEM; |
| 3713 | 4357 | goto out_unlock; | |
| 3714 | mutex_lock(&cgroup_tree_mutex); | ||
| 3715 | |||
| 3716 | /* | ||
| 3717 | * Only live parents can have children. Note that the liveliness | ||
| 3718 | * check isn't strictly necessary because cgroup_mkdir() and | ||
| 3719 | * cgroup_rmdir() are fully synchronized by i_mutex; however, do it | ||
| 3720 | * anyway so that locking is contained inside cgroup proper and we | ||
| 3721 | * don't get nasty surprises if we ever grow another caller. | ||
| 3722 | */ | ||
| 3723 | if (!cgroup_lock_live_group(parent)) { | ||
| 3724 | err = -ENODEV; | ||
| 3725 | goto err_unlock_tree; | ||
| 3726 | } | 4358 | } |
| 3727 | 4359 | ||
| 4360 | ret = percpu_ref_init(&cgrp->self.refcnt, css_release); | ||
| 4361 | if (ret) | ||
| 4362 | goto out_free_cgrp; | ||
| 4363 | |||
| 3728 | /* | 4364 | /* |
| 3729 | * Temporarily set the pointer to NULL, so idr_find() won't return | 4365 | * Temporarily set the pointer to NULL, so idr_find() won't return |
| 3730 | * a half-baked cgroup. | 4366 | * a half-baked cgroup. |
| 3731 | */ | 4367 | */ |
| 3732 | cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); | 4368 | cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); |
| 3733 | if (cgrp->id < 0) { | 4369 | if (cgrp->id < 0) { |
| 3734 | err = -ENOMEM; | 4370 | ret = -ENOMEM; |
| 3735 | goto err_unlock; | 4371 | goto out_cancel_ref; |
| 3736 | } | 4372 | } |
| 3737 | 4373 | ||
| 3738 | init_cgroup_housekeeping(cgrp); | 4374 | init_cgroup_housekeeping(cgrp); |
| 3739 | 4375 | ||
| 3740 | cgrp->parent = parent; | 4376 | cgrp->self.parent = &parent->self; |
| 3741 | cgrp->dummy_css.parent = &parent->dummy_css; | 4377 | cgrp->root = root; |
| 3742 | cgrp->root = parent->root; | ||
| 3743 | 4378 | ||
| 3744 | if (notify_on_release(parent)) | 4379 | if (notify_on_release(parent)) |
| 3745 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 4380 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
| @@ -3750,8 +4385,8 @@ static long cgroup_create(struct cgroup *parent, const char *name, | |||
| 3750 | /* create the directory */ | 4385 | /* create the directory */ |
| 3751 | kn = kernfs_create_dir(parent->kn, name, mode, cgrp); | 4386 | kn = kernfs_create_dir(parent->kn, name, mode, cgrp); |
| 3752 | if (IS_ERR(kn)) { | 4387 | if (IS_ERR(kn)) { |
| 3753 | err = PTR_ERR(kn); | 4388 | ret = PTR_ERR(kn); |
| 3754 | goto err_free_id; | 4389 | goto out_free_id; |
| 3755 | } | 4390 | } |
| 3756 | cgrp->kn = kn; | 4391 | cgrp->kn = kn; |
| 3757 | 4392 | ||
| @@ -3761,10 +4396,10 @@ static long cgroup_create(struct cgroup *parent, const char *name, | |||
| 3761 | */ | 4396 | */ |
| 3762 | kernfs_get(kn); | 4397 | kernfs_get(kn); |
| 3763 | 4398 | ||
| 3764 | cgrp->serial_nr = cgroup_serial_nr_next++; | 4399 | cgrp->self.serial_nr = css_serial_nr_next++; |
| 3765 | 4400 | ||
| 3766 | /* allocation complete, commit to creation */ | 4401 | /* allocation complete, commit to creation */ |
| 3767 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); | 4402 | list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); |
| 3768 | atomic_inc(&root->nr_cgrps); | 4403 | atomic_inc(&root->nr_cgrps); |
| 3769 | cgroup_get(parent); | 4404 | cgroup_get(parent); |
| 3770 | 4405 | ||
| @@ -3772,107 +4407,66 @@ static long cgroup_create(struct cgroup *parent, const char *name, | |||
| 3772 | * @cgrp is now fully operational. If something fails after this | 4407 | * @cgrp is now fully operational. If something fails after this |
| 3773 | * point, it'll be released via the normal destruction path. | 4408 | * point, it'll be released via the normal destruction path. |
| 3774 | */ | 4409 | */ |
| 3775 | idr_replace(&root->cgroup_idr, cgrp, cgrp->id); | 4410 | cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); |
| 3776 | 4411 | ||
| 3777 | err = cgroup_kn_set_ugid(kn); | 4412 | ret = cgroup_kn_set_ugid(kn); |
| 3778 | if (err) | 4413 | if (ret) |
| 3779 | goto err_destroy; | 4414 | goto out_destroy; |
| 3780 | 4415 | ||
| 3781 | err = cgroup_addrm_files(cgrp, cgroup_base_files, true); | 4416 | ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); |
| 3782 | if (err) | 4417 | if (ret) |
| 3783 | goto err_destroy; | 4418 | goto out_destroy; |
| 3784 | 4419 | ||
| 3785 | /* let's create and online css's */ | 4420 | /* let's create and online css's */ |
| 3786 | for_each_subsys(ss, ssid) { | 4421 | for_each_subsys(ss, ssid) { |
| 3787 | if (root->cgrp.subsys_mask & (1 << ssid)) { | 4422 | if (parent->child_subsys_mask & (1 << ssid)) { |
| 3788 | err = create_css(cgrp, ss); | 4423 | ret = create_css(cgrp, ss); |
| 3789 | if (err) | 4424 | if (ret) |
| 3790 | goto err_destroy; | 4425 | goto out_destroy; |
| 3791 | } | 4426 | } |
| 3792 | } | 4427 | } |
| 3793 | 4428 | ||
| 3794 | kernfs_activate(kn); | 4429 | /* |
| 4430 | * On the default hierarchy, a child doesn't automatically inherit | ||
| 4431 | * child_subsys_mask from the parent. Each is configured manually. | ||
| 4432 | */ | ||
| 4433 | if (!cgroup_on_dfl(cgrp)) | ||
| 4434 | cgrp->child_subsys_mask = parent->child_subsys_mask; | ||
| 3795 | 4435 | ||
| 3796 | mutex_unlock(&cgroup_mutex); | 4436 | kernfs_activate(kn); |
| 3797 | mutex_unlock(&cgroup_tree_mutex); | ||
| 3798 | 4437 | ||
| 3799 | return 0; | 4438 | ret = 0; |
| 4439 | goto out_unlock; | ||
| 3800 | 4440 | ||
| 3801 | err_free_id: | 4441 | out_free_id: |
| 3802 | idr_remove(&root->cgroup_idr, cgrp->id); | 4442 | cgroup_idr_remove(&root->cgroup_idr, cgrp->id); |
| 3803 | err_unlock: | 4443 | out_cancel_ref: |
| 3804 | mutex_unlock(&cgroup_mutex); | 4444 | percpu_ref_cancel_init(&cgrp->self.refcnt); |
| 3805 | err_unlock_tree: | 4445 | out_free_cgrp: |
| 3806 | mutex_unlock(&cgroup_tree_mutex); | ||
| 3807 | kfree(cgrp); | 4446 | kfree(cgrp); |
| 3808 | return err; | 4447 | out_unlock: |
| 4448 | cgroup_kn_unlock(parent_kn); | ||
| 4449 | return ret; | ||
| 3809 | 4450 | ||
| 3810 | err_destroy: | 4451 | out_destroy: |
| 3811 | cgroup_destroy_locked(cgrp); | 4452 | cgroup_destroy_locked(cgrp); |
| 3812 | mutex_unlock(&cgroup_mutex); | 4453 | goto out_unlock; |
| 3813 | mutex_unlock(&cgroup_tree_mutex); | ||
| 3814 | return err; | ||
| 3815 | } | ||
| 3816 | |||
| 3817 | static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | ||
| 3818 | umode_t mode) | ||
| 3819 | { | ||
| 3820 | struct cgroup *parent = parent_kn->priv; | ||
| 3821 | int ret; | ||
| 3822 | |||
| 3823 | /* | ||
| 3824 | * cgroup_create() grabs cgroup_tree_mutex which nests outside | ||
| 3825 | * kernfs active_ref and cgroup_create() already synchronizes | ||
| 3826 | * properly against removal through cgroup_lock_live_group(). | ||
| 3827 | * Break it before calling cgroup_create(). | ||
| 3828 | */ | ||
| 3829 | cgroup_get(parent); | ||
| 3830 | kernfs_break_active_protection(parent_kn); | ||
| 3831 | |||
| 3832 | ret = cgroup_create(parent, name, mode); | ||
| 3833 | |||
| 3834 | kernfs_unbreak_active_protection(parent_kn); | ||
| 3835 | cgroup_put(parent); | ||
| 3836 | return ret; | ||
| 3837 | } | 4454 | } |
| 3838 | 4455 | ||
| 3839 | /* | 4456 | /* |
| 3840 | * This is called when the refcnt of a css is confirmed to be killed. | 4457 | * This is called when the refcnt of a css is confirmed to be killed. |
| 3841 | * css_tryget() is now guaranteed to fail. | 4458 | * css_tryget_online() is now guaranteed to fail. Tell the subsystem to |
| 4459 | * initate destruction and put the css ref from kill_css(). | ||
| 3842 | */ | 4460 | */ |
| 3843 | static void css_killed_work_fn(struct work_struct *work) | 4461 | static void css_killed_work_fn(struct work_struct *work) |
| 3844 | { | 4462 | { |
| 3845 | struct cgroup_subsys_state *css = | 4463 | struct cgroup_subsys_state *css = |
| 3846 | container_of(work, struct cgroup_subsys_state, destroy_work); | 4464 | container_of(work, struct cgroup_subsys_state, destroy_work); |
| 3847 | struct cgroup *cgrp = css->cgroup; | ||
| 3848 | 4465 | ||
| 3849 | mutex_lock(&cgroup_tree_mutex); | ||
| 3850 | mutex_lock(&cgroup_mutex); | 4466 | mutex_lock(&cgroup_mutex); |
| 3851 | |||
| 3852 | /* | ||
| 3853 | * css_tryget() is guaranteed to fail now. Tell subsystems to | ||
| 3854 | * initate destruction. | ||
| 3855 | */ | ||
| 3856 | offline_css(css); | 4467 | offline_css(css); |
| 3857 | |||
| 3858 | /* | ||
| 3859 | * If @cgrp is marked dead, it's waiting for refs of all css's to | ||
| 3860 | * be disabled before proceeding to the second phase of cgroup | ||
| 3861 | * destruction. If we are the last one, kick it off. | ||
| 3862 | */ | ||
| 3863 | if (!cgrp->nr_css && cgroup_is_dead(cgrp)) | ||
| 3864 | cgroup_destroy_css_killed(cgrp); | ||
| 3865 | |||
| 3866 | mutex_unlock(&cgroup_mutex); | 4468 | mutex_unlock(&cgroup_mutex); |
| 3867 | mutex_unlock(&cgroup_tree_mutex); | ||
| 3868 | 4469 | ||
| 3869 | /* | ||
| 3870 | * Put the css refs from kill_css(). Each css holds an extra | ||
| 3871 | * reference to the cgroup's dentry and cgroup removal proceeds | ||
| 3872 | * regardless of css refs. On the last put of each css, whenever | ||
| 3873 | * that may be, the extra dentry ref is put so that dentry | ||
| 3874 | * destruction happens only after all css's are released. | ||
| 3875 | */ | ||
| 3876 | css_put(css); | 4470 | css_put(css); |
| 3877 | } | 4471 | } |
| 3878 | 4472 | ||
| @@ -3886,9 +4480,18 @@ static void css_killed_ref_fn(struct percpu_ref *ref) | |||
| 3886 | queue_work(cgroup_destroy_wq, &css->destroy_work); | 4480 | queue_work(cgroup_destroy_wq, &css->destroy_work); |
| 3887 | } | 4481 | } |
| 3888 | 4482 | ||
| 3889 | static void __kill_css(struct cgroup_subsys_state *css) | 4483 | /** |
| 4484 | * kill_css - destroy a css | ||
| 4485 | * @css: css to destroy | ||
| 4486 | * | ||
| 4487 | * This function initiates destruction of @css by removing cgroup interface | ||
| 4488 | * files and putting its base reference. ->css_offline() will be invoked | ||
| 4489 | * asynchronously once css_tryget_online() is guaranteed to fail and when | ||
| 4490 | * the reference count reaches zero, @css will be released. | ||
| 4491 | */ | ||
| 4492 | static void kill_css(struct cgroup_subsys_state *css) | ||
| 3890 | { | 4493 | { |
| 3891 | lockdep_assert_held(&cgroup_tree_mutex); | 4494 | lockdep_assert_held(&cgroup_mutex); |
| 3892 | 4495 | ||
| 3893 | /* | 4496 | /* |
| 3894 | * This must happen before css is disassociated with its cgroup. | 4497 | * This must happen before css is disassociated with its cgroup. |
| @@ -3905,7 +4508,7 @@ static void __kill_css(struct cgroup_subsys_state *css) | |||
| 3905 | /* | 4508 | /* |
| 3906 | * cgroup core guarantees that, by the time ->css_offline() is | 4509 | * cgroup core guarantees that, by the time ->css_offline() is |
| 3907 | * invoked, no new css reference will be given out via | 4510 | * invoked, no new css reference will be given out via |
| 3908 | * css_tryget(). We can't simply call percpu_ref_kill() and | 4511 | * css_tryget_online(). We can't simply call percpu_ref_kill() and |
| 3909 | * proceed to offlining css's because percpu_ref_kill() doesn't | 4512 | * proceed to offlining css's because percpu_ref_kill() doesn't |
| 3910 | * guarantee that the ref is seen as killed on all CPUs on return. | 4513 | * guarantee that the ref is seen as killed on all CPUs on return. |
| 3911 | * | 4514 | * |
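The percpu_ref choreography this comment prepares for (kill_css() ends, beyond this hunk, with a call to percpu_ref_kill_and_confirm()) boils down to the pattern sketched below. All ex_* names are invented, and percpu_ref_init() is shown in the two-argument form this file itself uses; later kernels add flags and gfp arguments.

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/workqueue.h>

struct ex_obj {
        struct percpu_ref ref;
        struct work_struct offline_work;
};

static void ex_offline_fn(struct work_struct *work)
{
        /* ->css_offline() equivalent: no new reference can be taken anymore */
}

/* invoked once all CPUs are guaranteed to see the ref as killed */
static void ex_confirm_kill(struct percpu_ref *ref)
{
        struct ex_obj *obj = container_of(ref, struct ex_obj, ref);

        schedule_work(&obj->offline_work);      /* offline in process context */
}

/* invoked when the count finally drops to zero */
static void ex_release(struct percpu_ref *ref)
{
        /* free the object, typically after an RCU grace period */
}

static int ex_init(struct ex_obj *obj)
{
        INIT_WORK(&obj->offline_work, ex_offline_fn);
        return percpu_ref_init(&obj->ref, ex_release);
}

static void ex_destroy(struct ex_obj *obj)
{
        /* stop handing out new refs; confirmation arrives asynchronously */
        percpu_ref_kill_and_confirm(&obj->ref, ex_confirm_kill);
}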
| @@ -3916,36 +4519,14 @@ static void __kill_css(struct cgroup_subsys_state *css) | |||
| 3916 | } | 4519 | } |
| 3917 | 4520 | ||
| 3918 | /** | 4521 | /** |
| 3919 | * kill_css - destroy a css | ||
| 3920 | * @css: css to destroy | ||
| 3921 | * | ||
| 3922 | * This function initiates destruction of @css by removing cgroup interface | ||
| 3923 | * files and putting its base reference. ->css_offline() will be invoked | ||
| 3924 | * asynchronously once css_tryget() is guaranteed to fail and when the | ||
| 3925 | * reference count reaches zero, @css will be released. | ||
| 3926 | */ | ||
| 3927 | static void kill_css(struct cgroup_subsys_state *css) | ||
| 3928 | { | ||
| 3929 | struct cgroup *cgrp = css->cgroup; | ||
| 3930 | |||
| 3931 | lockdep_assert_held(&cgroup_tree_mutex); | ||
| 3932 | |||
| 3933 | /* if already killed, noop */ | ||
| 3934 | if (cgrp->subsys_mask & (1 << css->ss->id)) { | ||
| 3935 | cgrp->subsys_mask &= ~(1 << css->ss->id); | ||
| 3936 | __kill_css(css); | ||
| 3937 | } | ||
| 3938 | } | ||
| 3939 | |||
| 3940 | /** | ||
| 3941 | * cgroup_destroy_locked - the first stage of cgroup destruction | 4522 | * cgroup_destroy_locked - the first stage of cgroup destruction |
| 3942 | * @cgrp: cgroup to be destroyed | 4523 | * @cgrp: cgroup to be destroyed |
| 3943 | * | 4524 | * |
| 3944 | * css's make use of percpu refcnts whose killing latency shouldn't be | 4525 | * css's make use of percpu refcnts whose killing latency shouldn't be |
| 3945 | * exposed to userland and are RCU protected. Also, cgroup core needs to | 4526 | * exposed to userland and are RCU protected. Also, cgroup core needs to |
| 3946 | * guarantee that css_tryget() won't succeed by the time ->css_offline() is | 4527 | * guarantee that css_tryget_online() won't succeed by the time |
| 3947 | * invoked. To satisfy all the requirements, destruction is implemented in | 4528 | * ->css_offline() is invoked. To satisfy all the requirements, |
| 3948 | * the following two steps. | 4529 | * destruction is implemented in the following two steps. |
| 3949 | * | 4530 | * |
| 3950 | * s1. Verify @cgrp can be destroyed and mark it dying. Remove all | 4531 | * s1. Verify @cgrp can be destroyed and mark it dying. Remove all |
| 3951 | * userland visible parts and start killing the percpu refcnts of | 4532 | * userland visible parts and start killing the percpu refcnts of |
| @@ -3964,12 +4545,10 @@ static void kill_css(struct cgroup_subsys_state *css) | |||
| 3964 | static int cgroup_destroy_locked(struct cgroup *cgrp) | 4545 | static int cgroup_destroy_locked(struct cgroup *cgrp) |
| 3965 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | 4546 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) |
| 3966 | { | 4547 | { |
| 3967 | struct cgroup *child; | ||
| 3968 | struct cgroup_subsys_state *css; | 4548 | struct cgroup_subsys_state *css; |
| 3969 | bool empty; | 4549 | bool empty; |
| 3970 | int ssid; | 4550 | int ssid; |
| 3971 | 4551 | ||
| 3972 | lockdep_assert_held(&cgroup_tree_mutex); | ||
| 3973 | lockdep_assert_held(&cgroup_mutex); | 4552 | lockdep_assert_held(&cgroup_mutex); |
| 3974 | 4553 | ||
| 3975 | /* | 4554 | /* |
| @@ -3983,127 +4562,68 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
| 3983 | return -EBUSY; | 4562 | return -EBUSY; |
| 3984 | 4563 | ||
| 3985 | /* | 4564 | /* |
| 3986 | * Make sure there are no live children. We can't test ->children | 4565 | * Make sure there are no live children. We can't test emptiness of |
| 3987 | * emptiness as dead children linger on it while being destroyed; | 4566 | * ->self.children as dead children linger on it while being |
| 3988 | * otherwise, "rmdir parent/child parent" may fail with -EBUSY. | 4567 | * drained; otherwise, "rmdir parent/child parent" may fail. |
| 3989 | */ | 4568 | */ |
| 3990 | empty = true; | 4569 | if (css_has_online_children(&cgrp->self)) |
| 3991 | rcu_read_lock(); | ||
| 3992 | list_for_each_entry_rcu(child, &cgrp->children, sibling) { | ||
| 3993 | empty = cgroup_is_dead(child); | ||
| 3994 | if (!empty) | ||
| 3995 | break; | ||
| 3996 | } | ||
| 3997 | rcu_read_unlock(); | ||
| 3998 | if (!empty) | ||
| 3999 | return -EBUSY; | 4570 | return -EBUSY; |
| 4000 | 4571 | ||
| 4001 | /* | 4572 | /* |
| 4002 | * Mark @cgrp dead. This prevents further task migration and child | 4573 | * Mark @cgrp dead. This prevents further task migration and child |
| 4003 | * creation by disabling cgroup_lock_live_group(). Note that | 4574 | * creation by disabling cgroup_lock_live_group(). |
| 4004 | * CGRP_DEAD assertion is depended upon by css_next_child() to | ||
| 4005 | * resume iteration after dropping RCU read lock. See | ||
| 4006 | * css_next_child() for details. | ||
| 4007 | */ | 4575 | */ |
| 4008 | set_bit(CGRP_DEAD, &cgrp->flags); | 4576 | cgrp->self.flags &= ~CSS_ONLINE; |
| 4009 | 4577 | ||
| 4010 | /* | 4578 | /* initiate massacre of all css's */ |
| 4011 | * Initiate massacre of all css's. cgroup_destroy_css_killed() | ||
| 4012 | * will be invoked to perform the rest of destruction once the | ||
| 4013 | * percpu refs of all css's are confirmed to be killed. This | ||
| 4014 | * involves removing the subsystem's files, drop cgroup_mutex. | ||
| 4015 | */ | ||
| 4016 | mutex_unlock(&cgroup_mutex); | ||
| 4017 | for_each_css(css, ssid, cgrp) | 4579 | for_each_css(css, ssid, cgrp) |
| 4018 | kill_css(css); | 4580 | kill_css(css); |
| 4019 | mutex_lock(&cgroup_mutex); | ||
| 4020 | 4581 | ||
| 4021 | /* CGRP_DEAD is set, remove from ->release_list for the last time */ | 4582 | /* CSS_ONLINE is clear, remove from ->release_list for the last time */ |
| 4022 | raw_spin_lock(&release_list_lock); | 4583 | raw_spin_lock(&release_list_lock); |
| 4023 | if (!list_empty(&cgrp->release_list)) | 4584 | if (!list_empty(&cgrp->release_list)) |
| 4024 | list_del_init(&cgrp->release_list); | 4585 | list_del_init(&cgrp->release_list); |
| 4025 | raw_spin_unlock(&release_list_lock); | 4586 | raw_spin_unlock(&release_list_lock); |
| 4026 | 4587 | ||
| 4027 | /* | 4588 | /* |
| 4028 | * If @cgrp has css's attached, the second stage of cgroup | 4589 | * Remove @cgrp directory along with the base files. @cgrp has an |
| 4029 | * destruction is kicked off from css_killed_work_fn() after the | 4590 | * extra ref on its kn. |
| 4030 | * refs of all attached css's are killed. If @cgrp doesn't have | ||
| 4031 | * any css, we kick it off here. | ||
| 4032 | */ | 4591 | */ |
| 4033 | if (!cgrp->nr_css) | 4592 | kernfs_remove(cgrp->kn); |
| 4034 | cgroup_destroy_css_killed(cgrp); | ||
| 4035 | |||
| 4036 | /* remove @cgrp directory along with the base files */ | ||
| 4037 | mutex_unlock(&cgroup_mutex); | ||
| 4038 | 4593 | ||
| 4039 | /* | 4594 | set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags); |
| 4040 | * There are two control paths which try to determine cgroup from | 4595 | check_for_release(cgroup_parent(cgrp)); |
| 4041 | * dentry without going through kernfs - cgroupstats_build() and | ||
| 4042 | * css_tryget_from_dir(). Those are supported by RCU protecting | ||
| 4043 | * clearing of cgrp->kn->priv backpointer, which should happen | ||
| 4044 | * after all files under it have been removed. | ||
| 4045 | */ | ||
| 4046 | kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ | ||
| 4047 | RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); | ||
| 4048 | 4596 | ||
| 4049 | mutex_lock(&cgroup_mutex); | 4597 | /* put the base reference */ |
| 4598 | percpu_ref_kill(&cgrp->self.refcnt); | ||
| 4050 | 4599 | ||
| 4051 | return 0; | 4600 | return 0; |
| 4052 | }; | 4601 | }; |
| 4053 | 4602 | ||
| 4054 | /** | ||
| 4055 | * cgroup_destroy_css_killed - the second step of cgroup destruction | ||
| 4056 | * @work: cgroup->destroy_free_work | ||
| 4057 | * | ||
| 4058 | * This function is invoked from a work item for a cgroup which is being | ||
| 4059 | * destroyed after all css's are offlined and performs the rest of | ||
| 4060 | * destruction. This is the second step of destruction described in the | ||
| 4061 | * comment above cgroup_destroy_locked(). | ||
| 4062 | */ | ||
| 4063 | static void cgroup_destroy_css_killed(struct cgroup *cgrp) | ||
| 4064 | { | ||
| 4065 | struct cgroup *parent = cgrp->parent; | ||
| 4066 | |||
| 4067 | lockdep_assert_held(&cgroup_tree_mutex); | ||
| 4068 | lockdep_assert_held(&cgroup_mutex); | ||
| 4069 | |||
| 4070 | /* delete this cgroup from parent->children */ | ||
| 4071 | list_del_rcu(&cgrp->sibling); | ||
| 4072 | |||
| 4073 | cgroup_put(cgrp); | ||
| 4074 | |||
| 4075 | set_bit(CGRP_RELEASABLE, &parent->flags); | ||
| 4076 | check_for_release(parent); | ||
| 4077 | } | ||
| 4078 | |||
| 4079 | static int cgroup_rmdir(struct kernfs_node *kn) | 4603 | static int cgroup_rmdir(struct kernfs_node *kn) |
| 4080 | { | 4604 | { |
| 4081 | struct cgroup *cgrp = kn->priv; | 4605 | struct cgroup *cgrp; |
| 4082 | int ret = 0; | 4606 | int ret = 0; |
| 4083 | 4607 | ||
| 4084 | /* | 4608 | cgrp = cgroup_kn_lock_live(kn); |
| 4085 | * This is self-destruction but @kn can't be removed while this | 4609 | if (!cgrp) |
| 4086 | * callback is in progress. Let's break active protection. Once | 4610 | return 0; |
| 4087 | * the protection is broken, @cgrp can be destroyed at any point. | 4611 | cgroup_get(cgrp); /* for @kn->priv clearing */ |
| 4088 | * Pin it so that it stays accessible. | ||
| 4089 | */ | ||
| 4090 | cgroup_get(cgrp); | ||
| 4091 | kernfs_break_active_protection(kn); | ||
| 4092 | 4612 | ||
| 4093 | mutex_lock(&cgroup_tree_mutex); | 4613 | ret = cgroup_destroy_locked(cgrp); |
| 4094 | mutex_lock(&cgroup_mutex); | 4614 | |
| 4615 | cgroup_kn_unlock(kn); | ||
| 4095 | 4616 | ||
| 4096 | /* | 4617 | /* |
| 4097 | * @cgrp might already have been destroyed while we're trying to | 4618 | * There are two control paths which try to determine cgroup from |
| 4098 | * grab the mutexes. | 4619 | * dentry without going through kernfs - cgroupstats_build() and |
| 4620 | * css_tryget_online_from_dir(). Those are supported by RCU | ||
| 4621 | * protecting clearing of cgrp->kn->priv backpointer, which should | ||
| 4622 | * happen after all files under it have been removed. | ||
| 4099 | */ | 4623 | */ |
| 4100 | if (!cgroup_is_dead(cgrp)) | 4624 | if (!ret) |
| 4101 | ret = cgroup_destroy_locked(cgrp); | 4625 | RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL); |
| 4102 | |||
| 4103 | mutex_unlock(&cgroup_mutex); | ||
| 4104 | mutex_unlock(&cgroup_tree_mutex); | ||
| 4105 | 4626 | ||
| 4106 | kernfs_unbreak_active_protection(kn); | ||
| 4107 | cgroup_put(cgrp); | 4627 | cgroup_put(cgrp); |
| 4108 | return ret; | 4628 | return ret; |
| 4109 | } | 4629 | } |
| @@ -4116,15 +4636,15 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { | |||
| 4116 | .rename = cgroup_rename, | 4636 | .rename = cgroup_rename, |
| 4117 | }; | 4637 | }; |
| 4118 | 4638 | ||
| 4119 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | 4639 | static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) |
| 4120 | { | 4640 | { |
| 4121 | struct cgroup_subsys_state *css; | 4641 | struct cgroup_subsys_state *css; |
| 4122 | 4642 | ||
| 4123 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); | 4643 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); |
| 4124 | 4644 | ||
| 4125 | mutex_lock(&cgroup_tree_mutex); | ||
| 4126 | mutex_lock(&cgroup_mutex); | 4645 | mutex_lock(&cgroup_mutex); |
| 4127 | 4646 | ||
| 4647 | idr_init(&ss->css_idr); | ||
| 4128 | INIT_LIST_HEAD(&ss->cfts); | 4648 | INIT_LIST_HEAD(&ss->cfts); |
| 4129 | 4649 | ||
| 4130 | /* Create the root cgroup state for this subsystem */ | 4650 | /* Create the root cgroup state for this subsystem */ |
| @@ -4132,7 +4652,21 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
| 4132 | css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); | 4652 | css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); |
| 4133 | /* We don't handle early failures gracefully */ | 4653 | /* We don't handle early failures gracefully */ |
| 4134 | BUG_ON(IS_ERR(css)); | 4654 | BUG_ON(IS_ERR(css)); |
| 4135 | init_css(css, ss, &cgrp_dfl_root.cgrp); | 4655 | init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); |
| 4656 | |||
| 4657 | /* | ||
| 4658 | * Root csses are never destroyed and we can't initialize | ||
| 4659 | * percpu_ref during early init. Disable refcnting. | ||
| 4660 | */ | ||
| 4661 | css->flags |= CSS_NO_REF; | ||
| 4662 | |||
| 4663 | if (early) { | ||
| 4664 | /* allocation can't be done safely during early init */ | ||
| 4665 | css->id = 1; | ||
| 4666 | } else { | ||
| 4667 | css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL); | ||
| 4668 | BUG_ON(css->id < 0); | ||
| 4669 | } | ||
| 4136 | 4670 | ||
| 4137 | /* Update the init_css_set to contain a subsys | 4671 | /* Update the init_css_set to contain a subsys |
| 4138 | * pointer to this state - since the subsystem is | 4672 | * pointer to this state - since the subsystem is |
| @@ -4149,10 +4683,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
| 4149 | 4683 | ||
| 4150 | BUG_ON(online_css(css)); | 4684 | BUG_ON(online_css(css)); |
| 4151 | 4685 | ||
| 4152 | cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id; | ||
| 4153 | |||
| 4154 | mutex_unlock(&cgroup_mutex); | 4686 | mutex_unlock(&cgroup_mutex); |
| 4155 | mutex_unlock(&cgroup_tree_mutex); | ||
| 4156 | } | 4687 | } |
| 4157 | 4688 | ||
| 4158 | /** | 4689 | /** |
| @@ -4169,6 +4700,8 @@ int __init cgroup_init_early(void) | |||
| 4169 | int i; | 4700 | int i; |
| 4170 | 4701 | ||
| 4171 | init_cgroup_root(&cgrp_dfl_root, &opts); | 4702 | init_cgroup_root(&cgrp_dfl_root, &opts); |
| 4703 | cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; | ||
| 4704 | |||
| 4172 | RCU_INIT_POINTER(init_task.cgroups, &init_css_set); | 4705 | RCU_INIT_POINTER(init_task.cgroups, &init_css_set); |
| 4173 | 4706 | ||
| 4174 | for_each_subsys(ss, i) { | 4707 | for_each_subsys(ss, i) { |
| @@ -4183,7 +4716,7 @@ int __init cgroup_init_early(void) | |||
| 4183 | ss->name = cgroup_subsys_name[i]; | 4716 | ss->name = cgroup_subsys_name[i]; |
| 4184 | 4717 | ||
| 4185 | if (ss->early_init) | 4718 | if (ss->early_init) |
| 4186 | cgroup_init_subsys(ss); | 4719 | cgroup_init_subsys(ss, true); |
| 4187 | } | 4720 | } |
| 4188 | return 0; | 4721 | return 0; |
| 4189 | } | 4722 | } |
| @@ -4202,7 +4735,6 @@ int __init cgroup_init(void) | |||
| 4202 | 4735 | ||
| 4203 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); | 4736 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); |
| 4204 | 4737 | ||
| 4205 | mutex_lock(&cgroup_tree_mutex); | ||
| 4206 | mutex_lock(&cgroup_mutex); | 4738 | mutex_lock(&cgroup_mutex); |
| 4207 | 4739 | ||
| 4208 | /* Add init_css_set to the hash table */ | 4740 | /* Add init_css_set to the hash table */ |
| @@ -4212,18 +4744,31 @@ int __init cgroup_init(void) | |||
| 4212 | BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); | 4744 | BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); |
| 4213 | 4745 | ||
| 4214 | mutex_unlock(&cgroup_mutex); | 4746 | mutex_unlock(&cgroup_mutex); |
| 4215 | mutex_unlock(&cgroup_tree_mutex); | ||
| 4216 | 4747 | ||
| 4217 | for_each_subsys(ss, ssid) { | 4748 | for_each_subsys(ss, ssid) { |
| 4218 | if (!ss->early_init) | 4749 | if (ss->early_init) { |
| 4219 | cgroup_init_subsys(ss); | 4750 | struct cgroup_subsys_state *css = |
| 4751 | init_css_set.subsys[ss->id]; | ||
| 4752 | |||
| 4753 | css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, | ||
| 4754 | GFP_KERNEL); | ||
| 4755 | BUG_ON(css->id < 0); | ||
| 4756 | } else { | ||
| 4757 | cgroup_init_subsys(ss, false); | ||
| 4758 | } | ||
| 4759 | |||
| 4760 | list_add_tail(&init_css_set.e_cset_node[ssid], | ||
| 4761 | &cgrp_dfl_root.cgrp.e_csets[ssid]); | ||
| 4220 | 4762 | ||
| 4221 | /* | 4763 | /* |
| 4222 | * cftype registration needs kmalloc and can't be done | 4764 | * Setting dfl_root subsys_mask needs to consider the |
| 4223 | * during early_init. Register base cftypes separately. | 4765 | * disabled flag and cftype registration needs kmalloc, |
| 4766 | * both of which aren't available during early_init. | ||
| 4224 | */ | 4767 | */ |
| 4225 | if (ss->base_cftypes) | 4768 | if (!ss->disabled) { |
| 4769 | cgrp_dfl_root.subsys_mask |= 1 << ss->id; | ||
| 4226 | WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); | 4770 | WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); |
| 4771 | } | ||
| 4227 | } | 4772 | } |
| 4228 | 4773 | ||
| 4229 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); | 4774 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); |
| @@ -4306,7 +4851,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) | |||
| 4306 | 4851 | ||
| 4307 | seq_printf(m, "%d:", root->hierarchy_id); | 4852 | seq_printf(m, "%d:", root->hierarchy_id); |
| 4308 | for_each_subsys(ss, ssid) | 4853 | for_each_subsys(ss, ssid) |
| 4309 | if (root->cgrp.subsys_mask & (1 << ssid)) | 4854 | if (root->subsys_mask & (1 << ssid)) |
| 4310 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); | 4855 | seq_printf(m, "%s%s", count++ ? "," : "", ss->name); |
| 4311 | if (strlen(root->name)) | 4856 | if (strlen(root->name)) |
| 4312 | seq_printf(m, "%sname=%s", count ? "," : "", | 4857 | seq_printf(m, "%sname=%s", count ? "," : "", |
| @@ -4501,8 +5046,8 @@ void cgroup_exit(struct task_struct *tsk) | |||
| 4501 | 5046 | ||
| 4502 | static void check_for_release(struct cgroup *cgrp) | 5047 | static void check_for_release(struct cgroup *cgrp) |
| 4503 | { | 5048 | { |
| 4504 | if (cgroup_is_releasable(cgrp) && | 5049 | if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) && |
| 4505 | list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { | 5050 | !css_has_online_children(&cgrp->self)) { |
| 4506 | /* | 5051 | /* |
| 4507 | * Control Group is currently removeable. If it's not | 5052 | * Control Group is currently removeable. If it's not |
| 4508 | * already queued for a userspace notification, queue | 5053 | * already queued for a userspace notification, queue |
| @@ -4619,7 +5164,7 @@ static int __init cgroup_disable(char *str) | |||
| 4619 | __setup("cgroup_disable=", cgroup_disable); | 5164 | __setup("cgroup_disable=", cgroup_disable); |
| 4620 | 5165 | ||
| 4621 | /** | 5166 | /** |
| 4622 | * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir | 5167 | * css_tryget_online_from_dir - get corresponding css from a cgroup dentry |
| 4623 | * @dentry: directory dentry of interest | 5168 | * @dentry: directory dentry of interest |
| 4624 | * @ss: subsystem of interest | 5169 | * @ss: subsystem of interest |
| 4625 | * | 5170 | * |
| @@ -4627,8 +5172,8 @@ __setup("cgroup_disable=", cgroup_disable); | |||
| 4627 | * to get the corresponding css and return it. If such css doesn't exist | 5172 | * to get the corresponding css and return it. If such css doesn't exist |
| 4628 | * or can't be pinned, an ERR_PTR value is returned. | 5173 | * or can't be pinned, an ERR_PTR value is returned. |
| 4629 | */ | 5174 | */ |
| 4630 | struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, | 5175 | struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, |
| 4631 | struct cgroup_subsys *ss) | 5176 | struct cgroup_subsys *ss) |
| 4632 | { | 5177 | { |
| 4633 | struct kernfs_node *kn = kernfs_node_from_dentry(dentry); | 5178 | struct kernfs_node *kn = kernfs_node_from_dentry(dentry); |
| 4634 | struct cgroup_subsys_state *css = NULL; | 5179 | struct cgroup_subsys_state *css = NULL; |
| @@ -4644,13 +5189,13 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, | |||
| 4644 | /* | 5189 | /* |
| 4645 | * This path doesn't originate from kernfs and @kn could already | 5190 | * This path doesn't originate from kernfs and @kn could already |
| 4646 | * have been or be removed at any point. @kn->priv is RCU | 5191 | * have been or be removed at any point. @kn->priv is RCU |
| 4647 | * protected for this access. See destroy_locked() for details. | 5192 | * protected for this access. See cgroup_rmdir() for details. |
| 4648 | */ | 5193 | */ |
| 4649 | cgrp = rcu_dereference(kn->priv); | 5194 | cgrp = rcu_dereference(kn->priv); |
| 4650 | if (cgrp) | 5195 | if (cgrp) |
| 4651 | css = cgroup_css(cgrp, ss); | 5196 | css = cgroup_css(cgrp, ss); |
| 4652 | 5197 | ||
| 4653 | if (!css || !css_tryget(css)) | 5198 | if (!css || !css_tryget_online(css)) |
| 4654 | css = ERR_PTR(-ENOENT); | 5199 | css = ERR_PTR(-ENOENT); |
| 4655 | 5200 | ||
| 4656 | rcu_read_unlock(); | 5201 | rcu_read_unlock(); |
| @@ -4667,14 +5212,8 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, | |||
| 4667 | */ | 5212 | */ |
| 4668 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) | 5213 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) |
| 4669 | { | 5214 | { |
| 4670 | struct cgroup *cgrp; | 5215 | WARN_ON_ONCE(!rcu_read_lock_held()); |
| 4671 | 5216 | return idr_find(&ss->css_idr, id); | |
| 4672 | cgroup_assert_mutexes_or_rcu_locked(); | ||
| 4673 | |||
| 4674 | cgrp = idr_find(&ss->root->cgroup_idr, id); | ||
| 4675 | if (cgrp) | ||
| 4676 | return cgroup_css(cgrp, ss); | ||
| 4677 | return NULL; | ||
| 4678 | } | 5217 | } |
| 4679 | 5218 | ||
| 4680 | #ifdef CONFIG_CGROUP_DEBUG | 5219 | #ifdef CONFIG_CGROUP_DEBUG |
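With the css_idr conversion above, css_from_id() is reduced to a bare idr_find() keyed by css->id: it only warns if the caller forgot rcu_read_lock() and it does not pin the returned css. A minimal caller sketch, assuming the ID and subsystem pointer come from elsewhere (the wrapper name is hypothetical, not part of this patch):

static struct cgroup_subsys_state *pin_css_by_id(int id, struct cgroup_subsys *ss)
{
        struct cgroup_subsys_state *css;

        rcu_read_lock();
        css = css_from_id(id, ss);              /* plain idr lookup, may return NULL */
        if (css && !css_tryget_online(css))     /* pin it before leaving the RCU section */
                css = NULL;
        rcu_read_unlock();

        return css;                             /* caller drops the reference with css_put() */
}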
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2bc4a2256444..a79e40f9d700 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/uaccess.h> | 21 | #include <linux/uaccess.h> |
| 22 | #include <linux/freezer.h> | 22 | #include <linux/freezer.h> |
| 23 | #include <linux/seq_file.h> | 23 | #include <linux/seq_file.h> |
| 24 | #include <linux/mutex.h> | ||
| 24 | 25 | ||
| 25 | /* | 26 | /* |
| 26 | * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is | 27 | * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is |
| @@ -42,9 +43,10 @@ enum freezer_state_flags { | |||
| 42 | struct freezer { | 43 | struct freezer { |
| 43 | struct cgroup_subsys_state css; | 44 | struct cgroup_subsys_state css; |
| 44 | unsigned int state; | 45 | unsigned int state; |
| 45 | spinlock_t lock; | ||
| 46 | }; | 46 | }; |
| 47 | 47 | ||
| 48 | static DEFINE_MUTEX(freezer_mutex); | ||
| 49 | |||
| 48 | static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) | 50 | static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) |
| 49 | { | 51 | { |
| 50 | return css ? container_of(css, struct freezer, css) : NULL; | 52 | return css ? container_of(css, struct freezer, css) : NULL; |
| @@ -57,7 +59,7 @@ static inline struct freezer *task_freezer(struct task_struct *task) | |||
| 57 | 59 | ||
| 58 | static struct freezer *parent_freezer(struct freezer *freezer) | 60 | static struct freezer *parent_freezer(struct freezer *freezer) |
| 59 | { | 61 | { |
| 60 | return css_freezer(css_parent(&freezer->css)); | 62 | return css_freezer(freezer->css.parent); |
| 61 | } | 63 | } |
| 62 | 64 | ||
| 63 | bool cgroup_freezing(struct task_struct *task) | 65 | bool cgroup_freezing(struct task_struct *task) |
| @@ -71,10 +73,6 @@ bool cgroup_freezing(struct task_struct *task) | |||
| 71 | return ret; | 73 | return ret; |
| 72 | } | 74 | } |
| 73 | 75 | ||
| 74 | /* | ||
| 75 | * cgroups_write_string() limits the size of freezer state strings to | ||
| 76 | * CGROUP_LOCAL_BUFFER_SIZE | ||
| 77 | */ | ||
| 78 | static const char *freezer_state_strs(unsigned int state) | 76 | static const char *freezer_state_strs(unsigned int state) |
| 79 | { | 77 | { |
| 80 | if (state & CGROUP_FROZEN) | 78 | if (state & CGROUP_FROZEN) |
| @@ -93,7 +91,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css) | |||
| 93 | if (!freezer) | 91 | if (!freezer) |
| 94 | return ERR_PTR(-ENOMEM); | 92 | return ERR_PTR(-ENOMEM); |
| 95 | 93 | ||
| 96 | spin_lock_init(&freezer->lock); | ||
| 97 | return &freezer->css; | 94 | return &freezer->css; |
| 98 | } | 95 | } |
| 99 | 96 | ||
| @@ -110,14 +107,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) | |||
| 110 | struct freezer *freezer = css_freezer(css); | 107 | struct freezer *freezer = css_freezer(css); |
| 111 | struct freezer *parent = parent_freezer(freezer); | 108 | struct freezer *parent = parent_freezer(freezer); |
| 112 | 109 | ||
| 113 | /* | 110 | mutex_lock(&freezer_mutex); |
| 114 | * The following double locking and freezing state inheritance | ||
| 115 | * guarantee that @cgroup can never escape ancestors' freezing | ||
| 116 | * states. See css_for_each_descendant_pre() for details. | ||
| 117 | */ | ||
| 118 | if (parent) | ||
| 119 | spin_lock_irq(&parent->lock); | ||
| 120 | spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING); | ||
| 121 | 111 | ||
| 122 | freezer->state |= CGROUP_FREEZER_ONLINE; | 112 | freezer->state |= CGROUP_FREEZER_ONLINE; |
| 123 | 113 | ||
| @@ -126,10 +116,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) | |||
| 126 | atomic_inc(&system_freezing_cnt); | 116 | atomic_inc(&system_freezing_cnt); |
| 127 | } | 117 | } |
| 128 | 118 | ||
| 129 | spin_unlock(&freezer->lock); | 119 | mutex_unlock(&freezer_mutex); |
| 130 | if (parent) | ||
| 131 | spin_unlock_irq(&parent->lock); | ||
| 132 | |||
| 133 | return 0; | 120 | return 0; |
| 134 | } | 121 | } |
| 135 | 122 | ||
| @@ -144,14 +131,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css) | |||
| 144 | { | 131 | { |
| 145 | struct freezer *freezer = css_freezer(css); | 132 | struct freezer *freezer = css_freezer(css); |
| 146 | 133 | ||
| 147 | spin_lock_irq(&freezer->lock); | 134 | mutex_lock(&freezer_mutex); |
| 148 | 135 | ||
| 149 | if (freezer->state & CGROUP_FREEZING) | 136 | if (freezer->state & CGROUP_FREEZING) |
| 150 | atomic_dec(&system_freezing_cnt); | 137 | atomic_dec(&system_freezing_cnt); |
| 151 | 138 | ||
| 152 | freezer->state = 0; | 139 | freezer->state = 0; |
| 153 | 140 | ||
| 154 | spin_unlock_irq(&freezer->lock); | 141 | mutex_unlock(&freezer_mutex); |
| 155 | } | 142 | } |
| 156 | 143 | ||
| 157 | static void freezer_css_free(struct cgroup_subsys_state *css) | 144 | static void freezer_css_free(struct cgroup_subsys_state *css) |
| @@ -175,7 +162,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, | |||
| 175 | struct task_struct *task; | 162 | struct task_struct *task; |
| 176 | bool clear_frozen = false; | 163 | bool clear_frozen = false; |
| 177 | 164 | ||
| 178 | spin_lock_irq(&freezer->lock); | 165 | mutex_lock(&freezer_mutex); |
| 179 | 166 | ||
| 180 | /* | 167 | /* |
| 181 | * Make the new tasks conform to the current state of @new_css. | 168 | * Make the new tasks conform to the current state of @new_css. |
| @@ -197,21 +184,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, | |||
| 197 | } | 184 | } |
| 198 | } | 185 | } |
| 199 | 186 | ||
| 200 | spin_unlock_irq(&freezer->lock); | 187 | /* propagate FROZEN clearing upwards */ |
| 201 | |||
| 202 | /* | ||
| 203 | * Propagate FROZEN clearing upwards. We may race with | ||
| 204 | * update_if_frozen(), but as long as both work bottom-up, either | ||
| 205 | * update_if_frozen() sees child's FROZEN cleared or we clear the | ||
| 206 | * parent's FROZEN later. No parent w/ !FROZEN children can be | ||
| 207 | * left FROZEN. | ||
| 208 | */ | ||
| 209 | while (clear_frozen && (freezer = parent_freezer(freezer))) { | 188 | while (clear_frozen && (freezer = parent_freezer(freezer))) { |
| 210 | spin_lock_irq(&freezer->lock); | ||
| 211 | freezer->state &= ~CGROUP_FROZEN; | 189 | freezer->state &= ~CGROUP_FROZEN; |
| 212 | clear_frozen = freezer->state & CGROUP_FREEZING; | 190 | clear_frozen = freezer->state & CGROUP_FREEZING; |
| 213 | spin_unlock_irq(&freezer->lock); | ||
| 214 | } | 191 | } |
| 192 | |||
| 193 | mutex_unlock(&freezer_mutex); | ||
| 215 | } | 194 | } |
| 216 | 195 | ||
| 217 | /** | 196 | /** |
| @@ -228,9 +207,6 @@ static void freezer_fork(struct task_struct *task) | |||
| 228 | { | 207 | { |
| 229 | struct freezer *freezer; | 208 | struct freezer *freezer; |
| 230 | 209 | ||
| 231 | rcu_read_lock(); | ||
| 232 | freezer = task_freezer(task); | ||
| 233 | |||
| 234 | /* | 210 | /* |
| 235 | * The root cgroup is non-freezable, so we can skip locking the | 211 | * The root cgroup is non-freezable, so we can skip locking the |
| 236 | * freezer. This is safe regardless of race with task migration. | 212 | * freezer. This is safe regardless of race with task migration. |
| @@ -238,24 +214,18 @@ static void freezer_fork(struct task_struct *task) | |||
| 238 | * to do. If we lost and root is the new cgroup, noop is still the | 214 | * to do. If we lost and root is the new cgroup, noop is still the |
| 239 | * right thing to do. | 215 | * right thing to do. |
| 240 | */ | 216 | */ |
| 241 | if (!parent_freezer(freezer)) | 217 | if (task_css_is_root(task, freezer_cgrp_id)) |
| 242 | goto out; | 218 | return; |
| 243 | 219 | ||
| 244 | /* | 220 | mutex_lock(&freezer_mutex); |
| 245 | * Grab @freezer->lock and freeze @task after verifying @task still | 221 | rcu_read_lock(); |
| 246 | * belongs to @freezer and it's freezing. The former is for the | 222 | |
| 247 | * case where we have raced against task migration and lost and | 223 | freezer = task_freezer(task); |
| 248 | * @task is already in a different cgroup which may not be frozen. | 224 | if (freezer->state & CGROUP_FREEZING) |
| 249 | * This isn't strictly necessary as freeze_task() is allowed to be | ||
| 250 | * called spuriously but let's do it anyway for, if nothing else, | ||
| 251 | * documentation. | ||
| 252 | */ | ||
| 253 | spin_lock_irq(&freezer->lock); | ||
| 254 | if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING)) | ||
| 255 | freeze_task(task); | 225 | freeze_task(task); |
| 256 | spin_unlock_irq(&freezer->lock); | 226 | |
| 257 | out: | ||
| 258 | rcu_read_unlock(); | 227 | rcu_read_unlock(); |
| 228 | mutex_unlock(&freezer_mutex); | ||
| 259 | } | 229 | } |
| 260 | 230 | ||
| 261 | /** | 231 | /** |
| @@ -281,22 +251,24 @@ static void update_if_frozen(struct cgroup_subsys_state *css) | |||
| 281 | struct css_task_iter it; | 251 | struct css_task_iter it; |
| 282 | struct task_struct *task; | 252 | struct task_struct *task; |
| 283 | 253 | ||
| 284 | WARN_ON_ONCE(!rcu_read_lock_held()); | 254 | lockdep_assert_held(&freezer_mutex); |
| 285 | |||
| 286 | spin_lock_irq(&freezer->lock); | ||
| 287 | 255 | ||
| 288 | if (!(freezer->state & CGROUP_FREEZING) || | 256 | if (!(freezer->state & CGROUP_FREEZING) || |
| 289 | (freezer->state & CGROUP_FROZEN)) | 257 | (freezer->state & CGROUP_FROZEN)) |
| 290 | goto out_unlock; | 258 | return; |
| 291 | 259 | ||
| 292 | /* are all (live) children frozen? */ | 260 | /* are all (live) children frozen? */ |
| 261 | rcu_read_lock(); | ||
| 293 | css_for_each_child(pos, css) { | 262 | css_for_each_child(pos, css) { |
| 294 | struct freezer *child = css_freezer(pos); | 263 | struct freezer *child = css_freezer(pos); |
| 295 | 264 | ||
| 296 | if ((child->state & CGROUP_FREEZER_ONLINE) && | 265 | if ((child->state & CGROUP_FREEZER_ONLINE) && |
| 297 | !(child->state & CGROUP_FROZEN)) | 266 | !(child->state & CGROUP_FROZEN)) { |
| 298 | goto out_unlock; | 267 | rcu_read_unlock(); |
| 268 | return; | ||
| 269 | } | ||
| 299 | } | 270 | } |
| 271 | rcu_read_unlock(); | ||
| 300 | 272 | ||
| 301 | /* are all tasks frozen? */ | 273 | /* are all tasks frozen? */ |
| 302 | css_task_iter_start(css, &it); | 274 | css_task_iter_start(css, &it); |
| @@ -317,21 +289,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css) | |||
| 317 | freezer->state |= CGROUP_FROZEN; | 289 | freezer->state |= CGROUP_FROZEN; |
| 318 | out_iter_end: | 290 | out_iter_end: |
| 319 | css_task_iter_end(&it); | 291 | css_task_iter_end(&it); |
| 320 | out_unlock: | ||
| 321 | spin_unlock_irq(&freezer->lock); | ||
| 322 | } | 292 | } |
| 323 | 293 | ||
| 324 | static int freezer_read(struct seq_file *m, void *v) | 294 | static int freezer_read(struct seq_file *m, void *v) |
| 325 | { | 295 | { |
| 326 | struct cgroup_subsys_state *css = seq_css(m), *pos; | 296 | struct cgroup_subsys_state *css = seq_css(m), *pos; |
| 327 | 297 | ||
| 298 | mutex_lock(&freezer_mutex); | ||
| 328 | rcu_read_lock(); | 299 | rcu_read_lock(); |
| 329 | 300 | ||
| 330 | /* update states bottom-up */ | 301 | /* update states bottom-up */ |
| 331 | css_for_each_descendant_post(pos, css) | 302 | css_for_each_descendant_post(pos, css) { |
| 303 | if (!css_tryget_online(pos)) | ||
| 304 | continue; | ||
| 305 | rcu_read_unlock(); | ||
| 306 | |||
| 332 | update_if_frozen(pos); | 307 | update_if_frozen(pos); |
| 333 | 308 | ||
| 309 | rcu_read_lock(); | ||
| 310 | css_put(pos); | ||
| 311 | } | ||
| 312 | |||
| 334 | rcu_read_unlock(); | 313 | rcu_read_unlock(); |
| 314 | mutex_unlock(&freezer_mutex); | ||
| 335 | 315 | ||
| 336 | seq_puts(m, freezer_state_strs(css_freezer(css)->state)); | 316 | seq_puts(m, freezer_state_strs(css_freezer(css)->state)); |
| 337 | seq_putc(m, '\n'); | 317 | seq_putc(m, '\n'); |
| @@ -373,7 +353,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, | |||
| 373 | unsigned int state) | 353 | unsigned int state) |
| 374 | { | 354 | { |
| 375 | /* also synchronizes against task migration, see freezer_attach() */ | 355 | /* also synchronizes against task migration, see freezer_attach() */ |
| 376 | lockdep_assert_held(&freezer->lock); | 356 | lockdep_assert_held(&freezer_mutex); |
| 377 | 357 | ||
| 378 | if (!(freezer->state & CGROUP_FREEZER_ONLINE)) | 358 | if (!(freezer->state & CGROUP_FREEZER_ONLINE)) |
| 379 | return; | 359 | return; |
| @@ -414,47 +394,47 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) | |||
| 414 | * descendant will try to inherit its parent's FREEZING state as | 394 | * descendant will try to inherit its parent's FREEZING state as |
| 415 | * CGROUP_FREEZING_PARENT. | 395 | * CGROUP_FREEZING_PARENT. |
| 416 | */ | 396 | */ |
| 397 | mutex_lock(&freezer_mutex); | ||
| 417 | rcu_read_lock(); | 398 | rcu_read_lock(); |
| 418 | css_for_each_descendant_pre(pos, &freezer->css) { | 399 | css_for_each_descendant_pre(pos, &freezer->css) { |
| 419 | struct freezer *pos_f = css_freezer(pos); | 400 | struct freezer *pos_f = css_freezer(pos); |
| 420 | struct freezer *parent = parent_freezer(pos_f); | 401 | struct freezer *parent = parent_freezer(pos_f); |
| 421 | 402 | ||
| 422 | spin_lock_irq(&pos_f->lock); | 403 | if (!css_tryget_online(pos)) |
| 404 | continue; | ||
| 405 | rcu_read_unlock(); | ||
| 423 | 406 | ||
| 424 | if (pos_f == freezer) { | 407 | if (pos_f == freezer) |
| 425 | freezer_apply_state(pos_f, freeze, | 408 | freezer_apply_state(pos_f, freeze, |
| 426 | CGROUP_FREEZING_SELF); | 409 | CGROUP_FREEZING_SELF); |
| 427 | } else { | 410 | else |
| 428 | /* | ||
| 429 | * Our update to @parent->state is already visible | ||
| 430 | * which is all we need. No need to lock @parent. | ||
| 431 | * For more info on synchronization, see | ||
| 432 | * freezer_post_create(). | ||
| 433 | */ | ||
| 434 | freezer_apply_state(pos_f, | 411 | freezer_apply_state(pos_f, |
| 435 | parent->state & CGROUP_FREEZING, | 412 | parent->state & CGROUP_FREEZING, |
| 436 | CGROUP_FREEZING_PARENT); | 413 | CGROUP_FREEZING_PARENT); |
| 437 | } | ||
| 438 | 414 | ||
| 439 | spin_unlock_irq(&pos_f->lock); | 415 | rcu_read_lock(); |
| 416 | css_put(pos); | ||
| 440 | } | 417 | } |
| 441 | rcu_read_unlock(); | 418 | rcu_read_unlock(); |
| 419 | mutex_unlock(&freezer_mutex); | ||
| 442 | } | 420 | } |
| 443 | 421 | ||
| 444 | static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, | 422 | static ssize_t freezer_write(struct kernfs_open_file *of, |
| 445 | char *buffer) | 423 | char *buf, size_t nbytes, loff_t off) |
| 446 | { | 424 | { |
| 447 | bool freeze; | 425 | bool freeze; |
| 448 | 426 | ||
| 449 | if (strcmp(buffer, freezer_state_strs(0)) == 0) | 427 | buf = strstrip(buf); |
| 428 | |||
| 429 | if (strcmp(buf, freezer_state_strs(0)) == 0) | ||
| 450 | freeze = false; | 430 | freeze = false; |
| 451 | else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) | 431 | else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) |
| 452 | freeze = true; | 432 | freeze = true; |
| 453 | else | 433 | else |
| 454 | return -EINVAL; | 434 | return -EINVAL; |
| 455 | 435 | ||
| 456 | freezer_change_state(css_freezer(css), freeze); | 436 | freezer_change_state(css_freezer(of_css(of)), freeze); |
| 457 | return 0; | 437 | return nbytes; |
| 458 | } | 438 | } |
| 459 | 439 | ||
| 460 | static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, | 440 | static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, |
| @@ -478,7 +458,7 @@ static struct cftype files[] = { | |||
| 478 | .name = "state", | 458 | .name = "state", |
| 479 | .flags = CFTYPE_NOT_ON_ROOT, | 459 | .flags = CFTYPE_NOT_ON_ROOT, |
| 480 | .seq_show = freezer_read, | 460 | .seq_show = freezer_read, |
| 481 | .write_string = freezer_write, | 461 | .write = freezer_write, |
| 482 | }, | 462 | }, |
| 483 | { | 463 | { |
| 484 | .name = "self_freezing", | 464 | .name = "self_freezing", |
diff --git a/kernel/compat.c b/kernel/compat.c index e40b0430b562..633394f442f8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
| @@ -157,7 +157,7 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp | |||
| 157 | int compat_get_timeval(struct timeval *tv, const void __user *utv) | 157 | int compat_get_timeval(struct timeval *tv, const void __user *utv) |
| 158 | { | 158 | { |
| 159 | if (COMPAT_USE_64BIT_TIME) | 159 | if (COMPAT_USE_64BIT_TIME) |
| 160 | return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; | 160 | return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0; |
| 161 | else | 161 | else |
| 162 | return __compat_get_timeval(tv, utv); | 162 | return __compat_get_timeval(tv, utv); |
| 163 | } | 163 | } |
| @@ -166,7 +166,7 @@ EXPORT_SYMBOL_GPL(compat_get_timeval); | |||
| 166 | int compat_put_timeval(const struct timeval *tv, void __user *utv) | 166 | int compat_put_timeval(const struct timeval *tv, void __user *utv) |
| 167 | { | 167 | { |
| 168 | if (COMPAT_USE_64BIT_TIME) | 168 | if (COMPAT_USE_64BIT_TIME) |
| 169 | return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; | 169 | return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0; |
| 170 | else | 170 | else |
| 171 | return __compat_put_timeval(tv, utv); | 171 | return __compat_put_timeval(tv, utv); |
| 172 | } | 172 | } |
| @@ -175,7 +175,7 @@ EXPORT_SYMBOL_GPL(compat_put_timeval); | |||
| 175 | int compat_get_timespec(struct timespec *ts, const void __user *uts) | 175 | int compat_get_timespec(struct timespec *ts, const void __user *uts) |
| 176 | { | 176 | { |
| 177 | if (COMPAT_USE_64BIT_TIME) | 177 | if (COMPAT_USE_64BIT_TIME) |
| 178 | return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; | 178 | return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; |
| 179 | else | 179 | else |
| 180 | return __compat_get_timespec(ts, uts); | 180 | return __compat_get_timespec(ts, uts); |
| 181 | } | 181 | } |
| @@ -184,7 +184,7 @@ EXPORT_SYMBOL_GPL(compat_get_timespec); | |||
| 184 | int compat_put_timespec(const struct timespec *ts, void __user *uts) | 184 | int compat_put_timespec(const struct timespec *ts, void __user *uts) |
| 185 | { | 185 | { |
| 186 | if (COMPAT_USE_64BIT_TIME) | 186 | if (COMPAT_USE_64BIT_TIME) |
| 187 | return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; | 187 | return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; |
| 188 | else | 188 | else |
| 189 | return __compat_put_timespec(ts, uts); | 189 | return __compat_put_timespec(ts, uts); |
| 190 | } | 190 | } |
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 6cb20d2e7ee0..019d45008448 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c | |||
| @@ -120,7 +120,7 @@ void context_tracking_user_enter(void) | |||
| 120 | * instead of preempt_schedule() to exit user context if needed before | 120 | * instead of preempt_schedule() to exit user context if needed before |
| 121 | * calling the scheduler. | 121 | * calling the scheduler. |
| 122 | */ | 122 | */ |
| 123 | asmlinkage void __sched notrace preempt_schedule_context(void) | 123 | asmlinkage __visible void __sched notrace preempt_schedule_context(void) |
| 124 | { | 124 | { |
| 125 | enum ctx_state prev_ctx; | 125 | enum ctx_state prev_ctx; |
| 126 | 126 | ||
diff --git a/kernel/cpu.c b/kernel/cpu.c index a9e710eef0e2..acf791c55b71 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -283,8 +283,7 @@ static inline void check_for_tasks(int cpu) | |||
| 283 | task_cputime(p, &utime, &stime); | 283 | task_cputime(p, &utime, &stime); |
| 284 | if (task_cpu(p) == cpu && p->state == TASK_RUNNING && | 284 | if (task_cpu(p) == cpu && p->state == TASK_RUNNING && |
| 285 | (utime || stime)) | 285 | (utime || stime)) |
| 286 | printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " | 286 | pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n", |
| 287 | "(state = %ld, flags = %x)\n", | ||
| 288 | p->comm, task_pid_nr(p), cpu, | 287 | p->comm, task_pid_nr(p), cpu, |
| 289 | p->state, p->flags); | 288 | p->state, p->flags); |
| 290 | } | 289 | } |
| @@ -336,8 +335,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 336 | if (err) { | 335 | if (err) { |
| 337 | nr_calls--; | 336 | nr_calls--; |
| 338 | __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); | 337 | __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); |
| 339 | printk("%s: attempt to take down CPU %u failed\n", | 338 | pr_warn("%s: attempt to take down CPU %u failed\n", |
| 340 | __func__, cpu); | 339 | __func__, cpu); |
| 341 | goto out_release; | 340 | goto out_release; |
| 342 | } | 341 | } |
| 343 | 342 | ||
| @@ -444,8 +443,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) | |||
| 444 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); | 443 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); |
| 445 | if (ret) { | 444 | if (ret) { |
| 446 | nr_calls--; | 445 | nr_calls--; |
| 447 | printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", | 446 | pr_warn("%s: attempt to bring up CPU %u failed\n", |
| 448 | __func__, cpu); | 447 | __func__, cpu); |
| 449 | goto out_notify; | 448 | goto out_notify; |
| 450 | } | 449 | } |
| 451 | 450 | ||
| @@ -475,11 +474,10 @@ int cpu_up(unsigned int cpu) | |||
| 475 | int err = 0; | 474 | int err = 0; |
| 476 | 475 | ||
| 477 | if (!cpu_possible(cpu)) { | 476 | if (!cpu_possible(cpu)) { |
| 478 | printk(KERN_ERR "can't online cpu %d because it is not " | 477 | pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n", |
| 479 | "configured as may-hotadd at boot time\n", cpu); | 478 | cpu); |
| 480 | #if defined(CONFIG_IA64) | 479 | #if defined(CONFIG_IA64) |
| 481 | printk(KERN_ERR "please check additional_cpus= boot " | 480 | pr_err("please check additional_cpus= boot parameter\n"); |
| 482 | "parameter\n"); | ||
| 483 | #endif | 481 | #endif |
| 484 | return -EINVAL; | 482 | return -EINVAL; |
| 485 | } | 483 | } |
| @@ -518,7 +516,7 @@ int disable_nonboot_cpus(void) | |||
| 518 | */ | 516 | */ |
| 519 | cpumask_clear(frozen_cpus); | 517 | cpumask_clear(frozen_cpus); |
| 520 | 518 | ||
| 521 | printk("Disabling non-boot CPUs ...\n"); | 519 | pr_info("Disabling non-boot CPUs ...\n"); |
| 522 | for_each_online_cpu(cpu) { | 520 | for_each_online_cpu(cpu) { |
| 523 | if (cpu == first_cpu) | 521 | if (cpu == first_cpu) |
| 524 | continue; | 522 | continue; |
| @@ -526,8 +524,7 @@ int disable_nonboot_cpus(void) | |||
| 526 | if (!error) | 524 | if (!error) |
| 527 | cpumask_set_cpu(cpu, frozen_cpus); | 525 | cpumask_set_cpu(cpu, frozen_cpus); |
| 528 | else { | 526 | else { |
| 529 | printk(KERN_ERR "Error taking CPU%d down: %d\n", | 527 | pr_err("Error taking CPU%d down: %d\n", cpu, error); |
| 530 | cpu, error); | ||
| 531 | break; | 528 | break; |
| 532 | } | 529 | } |
| 533 | } | 530 | } |
| @@ -537,7 +534,7 @@ int disable_nonboot_cpus(void) | |||
| 537 | /* Make sure the CPUs won't be enabled by someone else */ | 534 | /* Make sure the CPUs won't be enabled by someone else */ |
| 538 | cpu_hotplug_disabled = 1; | 535 | cpu_hotplug_disabled = 1; |
| 539 | } else { | 536 | } else { |
| 540 | printk(KERN_ERR "Non-boot CPUs are not disabled\n"); | 537 | pr_err("Non-boot CPUs are not disabled\n"); |
| 541 | } | 538 | } |
| 542 | cpu_maps_update_done(); | 539 | cpu_maps_update_done(); |
| 543 | return error; | 540 | return error; |
| @@ -561,17 +558,17 @@ void __ref enable_nonboot_cpus(void) | |||
| 561 | if (cpumask_empty(frozen_cpus)) | 558 | if (cpumask_empty(frozen_cpus)) |
| 562 | goto out; | 559 | goto out; |
| 563 | 560 | ||
| 564 | printk(KERN_INFO "Enabling non-boot CPUs ...\n"); | 561 | pr_info("Enabling non-boot CPUs ...\n"); |
| 565 | 562 | ||
| 566 | arch_enable_nonboot_cpus_begin(); | 563 | arch_enable_nonboot_cpus_begin(); |
| 567 | 564 | ||
| 568 | for_each_cpu(cpu, frozen_cpus) { | 565 | for_each_cpu(cpu, frozen_cpus) { |
| 569 | error = _cpu_up(cpu, 1); | 566 | error = _cpu_up(cpu, 1); |
| 570 | if (!error) { | 567 | if (!error) { |
| 571 | printk(KERN_INFO "CPU%d is up\n", cpu); | 568 | pr_info("CPU%d is up\n", cpu); |
| 572 | continue; | 569 | continue; |
| 573 | } | 570 | } |
| 574 | printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); | 571 | pr_warn("Error taking CPU%d up: %d\n", cpu, error); |
| 575 | } | 572 | } |
| 576 | 573 | ||
| 577 | arch_enable_nonboot_cpus_end(); | 574 | arch_enable_nonboot_cpus_end(); |
| @@ -726,10 +723,12 @@ void set_cpu_present(unsigned int cpu, bool present) | |||
| 726 | 723 | ||
| 727 | void set_cpu_online(unsigned int cpu, bool online) | 724 | void set_cpu_online(unsigned int cpu, bool online) |
| 728 | { | 725 | { |
| 729 | if (online) | 726 | if (online) { |
| 730 | cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); | 727 | cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); |
| 731 | else | 728 | cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); |
| 729 | } else { | ||
| 732 | cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); | 730 | cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); |
| 731 | } | ||
| 733 | } | 732 | } |
| 734 | 733 | ||
| 735 | void set_cpu_active(unsigned int cpu, bool active) | 734 | void set_cpu_active(unsigned int cpu, bool active) |
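The printk(KERN_*) conversions above also rejoin user-visible strings onto single lines so they stay greppable. pr_warn()/pr_err()/pr_info() expand to printk() with the matching level plus whatever pr_fmt() prefix the file defines (none by default); a minimal sketch, where the "cpu: " prefix and the wrapper function are illustrative only:

#define pr_fmt(fmt) "cpu: " fmt         /* must be defined before the printk.h include */
#include <linux/printk.h>

static void report_down_failure(int cpu, int error)
{
        /* emits e.g. "cpu: Error taking CPU3 down: -16" at KERN_ERR level */
        pr_err("Error taking CPU%d down: %d\n", cpu, error);
}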
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3d54c418bd06..f6b33c696224 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -61,12 +61,7 @@ | |||
| 61 | #include <linux/cgroup.h> | 61 | #include <linux/cgroup.h> |
| 62 | #include <linux/wait.h> | 62 | #include <linux/wait.h> |
| 63 | 63 | ||
| 64 | /* | 64 | struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; |
| 65 | * Tracks how many cpusets are currently defined in system. | ||
| 66 | * When there is only one cpuset (the root cpuset) we can | ||
| 67 | * short circuit some hooks. | ||
| 68 | */ | ||
| 69 | int number_of_cpusets __read_mostly; | ||
| 70 | 65 | ||
| 71 | /* See "Frequency meter" comments, below. */ | 66 | /* See "Frequency meter" comments, below. */ |
| 72 | 67 | ||
| @@ -124,7 +119,7 @@ static inline struct cpuset *task_cs(struct task_struct *task) | |||
| 124 | 119 | ||
| 125 | static inline struct cpuset *parent_cs(struct cpuset *cs) | 120 | static inline struct cpuset *parent_cs(struct cpuset *cs) |
| 126 | { | 121 | { |
| 127 | return css_cs(css_parent(&cs->css)); | 122 | return css_cs(cs->css.parent); |
| 128 | } | 123 | } |
| 129 | 124 | ||
| 130 | #ifdef CONFIG_NUMA | 125 | #ifdef CONFIG_NUMA |
| @@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
| 611 | goto done; | 606 | goto done; |
| 612 | } | 607 | } |
| 613 | 608 | ||
| 614 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); | 609 | csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL); |
| 615 | if (!csa) | 610 | if (!csa) |
| 616 | goto done; | 611 | goto done; |
| 617 | csn = 0; | 612 | csn = 0; |
| @@ -696,11 +691,8 @@ restart: | |||
| 696 | if (nslot == ndoms) { | 691 | if (nslot == ndoms) { |
| 697 | static int warnings = 10; | 692 | static int warnings = 10; |
| 698 | if (warnings) { | 693 | if (warnings) { |
| 699 | printk(KERN_WARNING | 694 | pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", |
| 700 | "rebuild_sched_domains confused:" | 695 | nslot, ndoms, csn, i, apn); |
| 701 | " nslot %d, ndoms %d, csn %d, i %d," | ||
| 702 | " apn %d\n", | ||
| 703 | nslot, ndoms, csn, i, apn); | ||
| 704 | warnings--; | 696 | warnings--; |
| 705 | } | 697 | } |
| 706 | continue; | 698 | continue; |
| @@ -875,7 +867,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) | |||
| 875 | continue; | 867 | continue; |
| 876 | } | 868 | } |
| 877 | } | 869 | } |
| 878 | if (!css_tryget(&cp->css)) | 870 | if (!css_tryget_online(&cp->css)) |
| 879 | continue; | 871 | continue; |
| 880 | rcu_read_unlock(); | 872 | rcu_read_unlock(); |
| 881 | 873 | ||
| @@ -890,6 +882,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) | |||
| 890 | /** | 882 | /** |
| 891 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it | 883 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it |
| 892 | * @cs: the cpuset to consider | 884 | * @cs: the cpuset to consider |
| 885 | * @trialcs: trial cpuset | ||
| 893 | * @buf: buffer of cpu numbers written to this cpuset | 886 | * @buf: buffer of cpu numbers written to this cpuset |
| 894 | */ | 887 | */ |
| 895 | static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | 888 | static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, |
| @@ -1110,7 +1103,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) | |||
| 1110 | continue; | 1103 | continue; |
| 1111 | } | 1104 | } |
| 1112 | } | 1105 | } |
| 1113 | if (!css_tryget(&cp->css)) | 1106 | if (!css_tryget_online(&cp->css)) |
| 1114 | continue; | 1107 | continue; |
| 1115 | rcu_read_unlock(); | 1108 | rcu_read_unlock(); |
| 1116 | 1109 | ||
| @@ -1605,13 +1598,15 @@ out_unlock: | |||
| 1605 | /* | 1598 | /* |
| 1606 | * Common handling for a write to a "cpus" or "mems" file. | 1599 | * Common handling for a write to a "cpus" or "mems" file. |
| 1607 | */ | 1600 | */ |
| 1608 | static int cpuset_write_resmask(struct cgroup_subsys_state *css, | 1601 | static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, |
| 1609 | struct cftype *cft, char *buf) | 1602 | char *buf, size_t nbytes, loff_t off) |
| 1610 | { | 1603 | { |
| 1611 | struct cpuset *cs = css_cs(css); | 1604 | struct cpuset *cs = css_cs(of_css(of)); |
| 1612 | struct cpuset *trialcs; | 1605 | struct cpuset *trialcs; |
| 1613 | int retval = -ENODEV; | 1606 | int retval = -ENODEV; |
| 1614 | 1607 | ||
| 1608 | buf = strstrip(buf); | ||
| 1609 | |||
| 1615 | /* | 1610 | /* |
| 1616 | * CPU or memory hotunplug may leave @cs w/o any execution | 1611 | * CPU or memory hotunplug may leave @cs w/o any execution |
| 1617 | * resources, in which case the hotplug code asynchronously updates | 1612 | * resources, in which case the hotplug code asynchronously updates |
| @@ -1635,7 +1630,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css, | |||
| 1635 | goto out_unlock; | 1630 | goto out_unlock; |
| 1636 | } | 1631 | } |
| 1637 | 1632 | ||
| 1638 | switch (cft->private) { | 1633 | switch (of_cft(of)->private) { |
| 1639 | case FILE_CPULIST: | 1634 | case FILE_CPULIST: |
| 1640 | retval = update_cpumask(cs, trialcs, buf); | 1635 | retval = update_cpumask(cs, trialcs, buf); |
| 1641 | break; | 1636 | break; |
| @@ -1650,7 +1645,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css, | |||
| 1650 | free_trial_cpuset(trialcs); | 1645 | free_trial_cpuset(trialcs); |
| 1651 | out_unlock: | 1646 | out_unlock: |
| 1652 | mutex_unlock(&cpuset_mutex); | 1647 | mutex_unlock(&cpuset_mutex); |
| 1653 | return retval; | 1648 | return retval ?: nbytes; |
| 1654 | } | 1649 | } |
| 1655 | 1650 | ||
| 1656 | /* | 1651 | /* |
| @@ -1752,7 +1747,7 @@ static struct cftype files[] = { | |||
| 1752 | { | 1747 | { |
| 1753 | .name = "cpus", | 1748 | .name = "cpus", |
| 1754 | .seq_show = cpuset_common_seq_show, | 1749 | .seq_show = cpuset_common_seq_show, |
| 1755 | .write_string = cpuset_write_resmask, | 1750 | .write = cpuset_write_resmask, |
| 1756 | .max_write_len = (100U + 6 * NR_CPUS), | 1751 | .max_write_len = (100U + 6 * NR_CPUS), |
| 1757 | .private = FILE_CPULIST, | 1752 | .private = FILE_CPULIST, |
| 1758 | }, | 1753 | }, |
| @@ -1760,7 +1755,7 @@ static struct cftype files[] = { | |||
| 1760 | { | 1755 | { |
| 1761 | .name = "mems", | 1756 | .name = "mems", |
| 1762 | .seq_show = cpuset_common_seq_show, | 1757 | .seq_show = cpuset_common_seq_show, |
| 1763 | .write_string = cpuset_write_resmask, | 1758 | .write = cpuset_write_resmask, |
| 1764 | .max_write_len = (100U + 6 * MAX_NUMNODES), | 1759 | .max_write_len = (100U + 6 * MAX_NUMNODES), |
| 1765 | .private = FILE_MEMLIST, | 1760 | .private = FILE_MEMLIST, |
| 1766 | }, | 1761 | }, |
| @@ -1888,7 +1883,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) | |||
| 1888 | if (is_spread_slab(parent)) | 1883 | if (is_spread_slab(parent)) |
| 1889 | set_bit(CS_SPREAD_SLAB, &cs->flags); | 1884 | set_bit(CS_SPREAD_SLAB, &cs->flags); |
| 1890 | 1885 | ||
| 1891 | number_of_cpusets++; | 1886 | cpuset_inc(); |
| 1892 | 1887 | ||
| 1893 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) | 1888 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) |
| 1894 | goto out_unlock; | 1889 | goto out_unlock; |
| @@ -1939,7 +1934,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) | |||
| 1939 | if (is_sched_load_balance(cs)) | 1934 | if (is_sched_load_balance(cs)) |
| 1940 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); | 1935 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); |
| 1941 | 1936 | ||
| 1942 | number_of_cpusets--; | 1937 | cpuset_dec(); |
| 1943 | clear_bit(CS_ONLINE, &cs->flags); | 1938 | clear_bit(CS_ONLINE, &cs->flags); |
| 1944 | 1939 | ||
| 1945 | mutex_unlock(&cpuset_mutex); | 1940 | mutex_unlock(&cpuset_mutex); |
| @@ -1992,7 +1987,6 @@ int __init cpuset_init(void) | |||
| 1992 | if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) | 1987 | if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) |
| 1993 | BUG(); | 1988 | BUG(); |
| 1994 | 1989 | ||
| 1995 | number_of_cpusets = 1; | ||
| 1996 | return 0; | 1990 | return 0; |
| 1997 | } | 1991 | } |
| 1998 | 1992 | ||
| @@ -2017,7 +2011,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | |||
| 2017 | parent = parent_cs(parent); | 2011 | parent = parent_cs(parent); |
| 2018 | 2012 | ||
| 2019 | if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { | 2013 | if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { |
| 2020 | printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset "); | 2014 | pr_err("cpuset: failed to transfer tasks out of empty cpuset "); |
| 2021 | pr_cont_cgroup_name(cs->css.cgroup); | 2015 | pr_cont_cgroup_name(cs->css.cgroup); |
| 2022 | pr_cont("\n"); | 2016 | pr_cont("\n"); |
| 2023 | } | 2017 | } |
| @@ -2155,7 +2149,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
| 2155 | 2149 | ||
| 2156 | rcu_read_lock(); | 2150 | rcu_read_lock(); |
| 2157 | cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { | 2151 | cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { |
| 2158 | if (cs == &top_cpuset || !css_tryget(&cs->css)) | 2152 | if (cs == &top_cpuset || !css_tryget_online(&cs->css)) |
| 2159 | continue; | 2153 | continue; |
| 2160 | rcu_read_unlock(); | 2154 | rcu_read_unlock(); |
| 2161 | 2155 | ||
| @@ -2536,7 +2530,7 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, | |||
| 2536 | 2530 | ||
| 2537 | /** | 2531 | /** |
| 2538 | * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed | 2532 | * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed |
| 2539 | * @task: pointer to task_struct of some task. | 2533 | * @tsk: pointer to task_struct of some task. |
| 2540 | * | 2534 | * |
| 2541 | * Description: Prints @task's name, cpuset name, and cached copy of its | 2535 | * Description: Prints @task's name, cpuset name, and cached copy of its |
| 2542 | * mems_allowed to the kernel log. | 2536 | * mems_allowed to the kernel log. |
| @@ -2554,7 +2548,7 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk) | |||
| 2554 | cgrp = task_cs(tsk)->css.cgroup; | 2548 | cgrp = task_cs(tsk)->css.cgroup; |
| 2555 | nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, | 2549 | nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, |
| 2556 | tsk->mems_allowed); | 2550 | tsk->mems_allowed); |
| 2557 | printk(KERN_INFO "%s cpuset=", tsk->comm); | 2551 | pr_info("%s cpuset=", tsk->comm); |
| 2558 | pr_cont_cgroup_name(cgrp); | 2552 | pr_cont_cgroup_name(cgrp); |
| 2559 | pr_cont(" mems_allowed=%s\n", cpuset_nodelist); | 2553 | pr_cont(" mems_allowed=%s\n", cpuset_nodelist); |
| 2560 | 2554 | ||
| @@ -2646,10 +2640,10 @@ out: | |||
| 2646 | /* Display task mems_allowed in /proc/<pid>/status file. */ | 2640 | /* Display task mems_allowed in /proc/<pid>/status file. */ |
| 2647 | void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) | 2641 | void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) |
| 2648 | { | 2642 | { |
| 2649 | seq_printf(m, "Mems_allowed:\t"); | 2643 | seq_puts(m, "Mems_allowed:\t"); |
| 2650 | seq_nodemask(m, &task->mems_allowed); | 2644 | seq_nodemask(m, &task->mems_allowed); |
| 2651 | seq_printf(m, "\n"); | 2645 | seq_puts(m, "\n"); |
| 2652 | seq_printf(m, "Mems_allowed_list:\t"); | 2646 | seq_puts(m, "Mems_allowed_list:\t"); |
| 2653 | seq_nodemask_list(m, &task->mems_allowed); | 2647 | seq_nodemask_list(m, &task->mems_allowed); |
| 2654 | seq_printf(m, "\n"); | 2648 | seq_puts(m, "\n"); |
| 2655 | } | 2649 | } |
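The number_of_cpusets counter dropped above is replaced by cpusets_enabled_key, a jump label that lets hot paths skip cpuset checks entirely while only the root cpuset exists. The header-side helpers are not part of this hunk; a plausible sketch of them, assuming the static_key API of this series (static_key_false(), static_key_count(), static_key_slow_inc()/_dec()):

extern struct static_key cpusets_enabled_key;

static inline bool cpusets_enabled(void)
{
        return static_key_false(&cpusets_enabled_key);  /* patched-out nop until a child cpuset exists */
}

static inline int nr_cpusets(void)
{
        /* the root cpuset is never accounted in the key */
        return static_key_count(&cpusets_enabled_key) + 1;
}

static inline void cpuset_inc(void)
{
        static_key_slow_inc(&cpusets_enabled_key);
}

static inline void cpuset_dec(void)
{
        static_key_slow_dec(&cpusets_enabled_key);
}

cpuset_css_online()/_offline() above call cpuset_inc()/cpuset_dec(), and generate_sched_domains() sizes its array with nr_cpusets().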
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 2956c8da1605..1adf62b39b96 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
| @@ -534,7 +534,7 @@ return_normal: | |||
| 534 | kgdb_info[cpu].exception_state &= | 534 | kgdb_info[cpu].exception_state &= |
| 535 | ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); | 535 | ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); |
| 536 | kgdb_info[cpu].enter_kgdb--; | 536 | kgdb_info[cpu].enter_kgdb--; |
| 537 | smp_mb__before_atomic_dec(); | 537 | smp_mb__before_atomic(); |
| 538 | atomic_dec(&slaves_in_kgdb); | 538 | atomic_dec(&slaves_in_kgdb); |
| 539 | dbg_touch_watchdogs(); | 539 | dbg_touch_watchdogs(); |
| 540 | local_irq_restore(flags); | 540 | local_irq_restore(flags); |
| @@ -662,7 +662,7 @@ kgdb_restore: | |||
| 662 | kgdb_info[cpu].exception_state &= | 662 | kgdb_info[cpu].exception_state &= |
| 663 | ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); | 663 | ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); |
| 664 | kgdb_info[cpu].enter_kgdb--; | 664 | kgdb_info[cpu].enter_kgdb--; |
| 665 | smp_mb__before_atomic_dec(); | 665 | smp_mb__before_atomic(); |
| 666 | atomic_dec(&masters_in_kgdb); | 666 | atomic_dec(&masters_in_kgdb); |
| 667 | /* Free kgdb_active */ | 667 | /* Free kgdb_active */ |
| 668 | atomic_set(&kgdb_active, -1); | 668 | atomic_set(&kgdb_active, -1); |
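The smp_mb__before_atomic_dec() calls above become the generic smp_mb__before_atomic(), which together with smp_mb__after_atomic() now covers every non-value-returning atomic and bitop instead of one macro per operation. A self-contained sketch of the same pairing (the counter here is a stand-in, not the real debug-core symbol):

static atomic_t nr_slaves = ATOMIC_INIT(0);

static void leave_slave_state(void)
{
        /* order this CPU's prior state updates before dropping the count */
        smp_mb__before_atomic();
        atomic_dec(&nr_slaves);
}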
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index b03e0e814e43..fe15fff5df53 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c | |||
| @@ -21,7 +21,7 @@ | |||
| 21 | static void kdb_show_stack(struct task_struct *p, void *addr) | 21 | static void kdb_show_stack(struct task_struct *p, void *addr) |
| 22 | { | 22 | { |
| 23 | int old_lvl = console_loglevel; | 23 | int old_lvl = console_loglevel; |
| 24 | console_loglevel = 15; | 24 | console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; |
| 25 | kdb_trap_printk++; | 25 | kdb_trap_printk++; |
| 26 | kdb_set_current_task(p); | 26 | kdb_set_current_task(p); |
| 27 | if (addr) { | 27 | if (addr) { |
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 14ff4849262c..7c70812caea5 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c | |||
| @@ -710,7 +710,7 @@ kdb_printit: | |||
| 710 | } | 710 | } |
| 711 | if (logging) { | 711 | if (logging) { |
| 712 | saved_loglevel = console_loglevel; | 712 | saved_loglevel = console_loglevel; |
| 713 | console_loglevel = 0; | 713 | console_loglevel = CONSOLE_LOGLEVEL_SILENT; |
| 714 | printk(KERN_INFO "%s", kdb_buffer); | 714 | printk(KERN_INFO "%s", kdb_buffer); |
| 715 | } | 715 | } |
| 716 | 716 | ||
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 0b097c8a1e50..2f7c760305ca 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
| @@ -1091,7 +1091,7 @@ static int kdb_reboot(int argc, const char **argv) | |||
| 1091 | static void kdb_dumpregs(struct pt_regs *regs) | 1091 | static void kdb_dumpregs(struct pt_regs *regs) |
| 1092 | { | 1092 | { |
| 1093 | int old_lvl = console_loglevel; | 1093 | int old_lvl = console_loglevel; |
| 1094 | console_loglevel = 15; | 1094 | console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; |
| 1095 | kdb_trap_printk++; | 1095 | kdb_trap_printk++; |
| 1096 | show_regs(regs); | 1096 | show_regs(regs); |
| 1097 | kdb_trap_printk--; | 1097 | kdb_trap_printk--; |
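The kdb hunks above replace the magic loglevels 15 and 0 with the named CONSOLE_LOGLEVEL_MOTORMOUTH and CONSOLE_LOGLEVEL_SILENT constants; the save/override/restore pattern around them is unchanged. A sketch (the wrapper name is hypothetical):

static void dump_with_full_console(struct pt_regs *regs)
{
        int old_lvl = console_loglevel;

        console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; /* let everything reach the console */
        show_regs(regs);
        console_loglevel = old_lvl;                     /* restore the saved level */
}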
diff --git a/kernel/events/core.c b/kernel/events/core.c index f83a71a3e46d..24d35cc38e42 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -39,6 +39,7 @@ | |||
| 39 | #include <linux/hw_breakpoint.h> | 39 | #include <linux/hw_breakpoint.h> |
| 40 | #include <linux/mm_types.h> | 40 | #include <linux/mm_types.h> |
| 41 | #include <linux/cgroup.h> | 41 | #include <linux/cgroup.h> |
| 42 | #include <linux/module.h> | ||
| 42 | 43 | ||
| 43 | #include "internal.h" | 44 | #include "internal.h" |
| 44 | 45 | ||
| @@ -607,7 +608,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
| 607 | if (!f.file) | 608 | if (!f.file) |
| 608 | return -EBADF; | 609 | return -EBADF; |
| 609 | 610 | ||
| 610 | css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); | 611 | css = css_tryget_online_from_dir(f.file->f_dentry, |
| 612 | &perf_event_cgrp_subsys); | ||
| 611 | if (IS_ERR(css)) { | 613 | if (IS_ERR(css)) { |
| 612 | ret = PTR_ERR(css); | 614 | ret = PTR_ERR(css); |
| 613 | goto out; | 615 | goto out; |
| @@ -1443,6 +1445,11 @@ group_sched_out(struct perf_event *group_event, | |||
| 1443 | cpuctx->exclusive = 0; | 1445 | cpuctx->exclusive = 0; |
| 1444 | } | 1446 | } |
| 1445 | 1447 | ||
| 1448 | struct remove_event { | ||
| 1449 | struct perf_event *event; | ||
| 1450 | bool detach_group; | ||
| 1451 | }; | ||
| 1452 | |||
| 1446 | /* | 1453 | /* |
| 1447 | * Cross CPU call to remove a performance event | 1454 | * Cross CPU call to remove a performance event |
| 1448 | * | 1455 | * |
| @@ -1451,12 +1458,15 @@ group_sched_out(struct perf_event *group_event, | |||
| 1451 | */ | 1458 | */ |
| 1452 | static int __perf_remove_from_context(void *info) | 1459 | static int __perf_remove_from_context(void *info) |
| 1453 | { | 1460 | { |
| 1454 | struct perf_event *event = info; | 1461 | struct remove_event *re = info; |
| 1462 | struct perf_event *event = re->event; | ||
| 1455 | struct perf_event_context *ctx = event->ctx; | 1463 | struct perf_event_context *ctx = event->ctx; |
| 1456 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | 1464 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
| 1457 | 1465 | ||
| 1458 | raw_spin_lock(&ctx->lock); | 1466 | raw_spin_lock(&ctx->lock); |
| 1459 | event_sched_out(event, cpuctx, ctx); | 1467 | event_sched_out(event, cpuctx, ctx); |
| 1468 | if (re->detach_group) | ||
| 1469 | perf_group_detach(event); | ||
| 1460 | list_del_event(event, ctx); | 1470 | list_del_event(event, ctx); |
| 1461 | if (!ctx->nr_events && cpuctx->task_ctx == ctx) { | 1471 | if (!ctx->nr_events && cpuctx->task_ctx == ctx) { |
| 1462 | ctx->is_active = 0; | 1472 | ctx->is_active = 0; |
| @@ -1481,10 +1491,14 @@ static int __perf_remove_from_context(void *info) | |||
| 1481 | * When called from perf_event_exit_task, it's OK because the | 1491 | * When called from perf_event_exit_task, it's OK because the |
| 1482 | * context has been detached from its task. | 1492 | * context has been detached from its task. |
| 1483 | */ | 1493 | */ |
| 1484 | static void perf_remove_from_context(struct perf_event *event) | 1494 | static void perf_remove_from_context(struct perf_event *event, bool detach_group) |
| 1485 | { | 1495 | { |
| 1486 | struct perf_event_context *ctx = event->ctx; | 1496 | struct perf_event_context *ctx = event->ctx; |
| 1487 | struct task_struct *task = ctx->task; | 1497 | struct task_struct *task = ctx->task; |
| 1498 | struct remove_event re = { | ||
| 1499 | .event = event, | ||
| 1500 | .detach_group = detach_group, | ||
| 1501 | }; | ||
| 1488 | 1502 | ||
| 1489 | lockdep_assert_held(&ctx->mutex); | 1503 | lockdep_assert_held(&ctx->mutex); |
| 1490 | 1504 | ||
| @@ -1493,12 +1507,12 @@ static void perf_remove_from_context(struct perf_event *event) | |||
| 1493 | * Per cpu events are removed via an smp call and | 1507 | * Per cpu events are removed via an smp call and |
| 1494 | * the removal is always successful. | 1508 | * the removal is always successful. |
| 1495 | */ | 1509 | */ |
| 1496 | cpu_function_call(event->cpu, __perf_remove_from_context, event); | 1510 | cpu_function_call(event->cpu, __perf_remove_from_context, &re); |
| 1497 | return; | 1511 | return; |
| 1498 | } | 1512 | } |
| 1499 | 1513 | ||
| 1500 | retry: | 1514 | retry: |
| 1501 | if (!task_function_call(task, __perf_remove_from_context, event)) | 1515 | if (!task_function_call(task, __perf_remove_from_context, &re)) |
| 1502 | return; | 1516 | return; |
| 1503 | 1517 | ||
| 1504 | raw_spin_lock_irq(&ctx->lock); | 1518 | raw_spin_lock_irq(&ctx->lock); |
| @@ -1515,6 +1529,8 @@ retry: | |||
| 1515 | * Since the task isn't running, its safe to remove the event, us | 1529 | * Since the task isn't running, its safe to remove the event, us |
| 1516 | * holding the ctx->lock ensures the task won't get scheduled in. | 1530 | * holding the ctx->lock ensures the task won't get scheduled in. |
| 1517 | */ | 1531 | */ |
| 1532 | if (detach_group) | ||
| 1533 | perf_group_detach(event); | ||
| 1518 | list_del_event(event, ctx); | 1534 | list_del_event(event, ctx); |
| 1519 | raw_spin_unlock_irq(&ctx->lock); | 1535 | raw_spin_unlock_irq(&ctx->lock); |
| 1520 | } | 1536 | } |
| @@ -1663,6 +1679,8 @@ event_sched_in(struct perf_event *event, | |||
| 1663 | u64 tstamp = perf_event_time(event); | 1679 | u64 tstamp = perf_event_time(event); |
| 1664 | int ret = 0; | 1680 | int ret = 0; |
| 1665 | 1681 | ||
| 1682 | lockdep_assert_held(&ctx->lock); | ||
| 1683 | |||
| 1666 | if (event->state <= PERF_EVENT_STATE_OFF) | 1684 | if (event->state <= PERF_EVENT_STATE_OFF) |
| 1667 | return 0; | 1685 | return 0; |
| 1668 | 1686 | ||
| @@ -3178,7 +3196,8 @@ static void free_event_rcu(struct rcu_head *head) | |||
| 3178 | } | 3196 | } |
| 3179 | 3197 | ||
| 3180 | static void ring_buffer_put(struct ring_buffer *rb); | 3198 | static void ring_buffer_put(struct ring_buffer *rb); |
| 3181 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); | 3199 | static void ring_buffer_attach(struct perf_event *event, |
| 3200 | struct ring_buffer *rb); | ||
| 3182 | 3201 | ||
| 3183 | static void unaccount_event_cpu(struct perf_event *event, int cpu) | 3202 | static void unaccount_event_cpu(struct perf_event *event, int cpu) |
| 3184 | { | 3203 | { |
| @@ -3229,17 +3248,19 @@ static void __free_event(struct perf_event *event) | |||
| 3229 | if (event->ctx) | 3248 | if (event->ctx) |
| 3230 | put_ctx(event->ctx); | 3249 | put_ctx(event->ctx); |
| 3231 | 3250 | ||
| 3251 | if (event->pmu) | ||
| 3252 | module_put(event->pmu->module); | ||
| 3253 | |||
| 3232 | call_rcu(&event->rcu_head, free_event_rcu); | 3254 | call_rcu(&event->rcu_head, free_event_rcu); |
| 3233 | } | 3255 | } |
| 3234 | static void free_event(struct perf_event *event) | 3256 | |
| 3257 | static void _free_event(struct perf_event *event) | ||
| 3235 | { | 3258 | { |
| 3236 | irq_work_sync(&event->pending); | 3259 | irq_work_sync(&event->pending); |
| 3237 | 3260 | ||
| 3238 | unaccount_event(event); | 3261 | unaccount_event(event); |
| 3239 | 3262 | ||
| 3240 | if (event->rb) { | 3263 | if (event->rb) { |
| 3241 | struct ring_buffer *rb; | ||
| 3242 | |||
| 3243 | /* | 3264 | /* |
| 3244 | * Can happen when we close an event with re-directed output. | 3265 | * Can happen when we close an event with re-directed output. |
| 3245 | * | 3266 | * |
| @@ -3247,57 +3268,38 @@ static void free_event(struct perf_event *event) | |||
| 3247 | * over us; possibly making our ring_buffer_put() the last. | 3268 | * over us; possibly making our ring_buffer_put() the last. |
| 3248 | */ | 3269 | */ |
| 3249 | mutex_lock(&event->mmap_mutex); | 3270 | mutex_lock(&event->mmap_mutex); |
| 3250 | rb = event->rb; | 3271 | ring_buffer_attach(event, NULL); |
| 3251 | if (rb) { | ||
| 3252 | rcu_assign_pointer(event->rb, NULL); | ||
| 3253 | ring_buffer_detach(event, rb); | ||
| 3254 | ring_buffer_put(rb); /* could be last */ | ||
| 3255 | } | ||
| 3256 | mutex_unlock(&event->mmap_mutex); | 3272 | mutex_unlock(&event->mmap_mutex); |
| 3257 | } | 3273 | } |
| 3258 | 3274 | ||
| 3259 | if (is_cgroup_event(event)) | 3275 | if (is_cgroup_event(event)) |
| 3260 | perf_detach_cgroup(event); | 3276 | perf_detach_cgroup(event); |
| 3261 | 3277 | ||
| 3262 | |||
| 3263 | __free_event(event); | 3278 | __free_event(event); |
| 3264 | } | 3279 | } |
| 3265 | 3280 | ||
| 3266 | int perf_event_release_kernel(struct perf_event *event) | 3281 | /* |
| 3282 | * Used to free events which have a known refcount of 1, such as in error paths | ||
| 3283 | * where the event isn't exposed yet and inherited events. | ||
| 3284 | */ | ||
| 3285 | static void free_event(struct perf_event *event) | ||
| 3267 | { | 3286 | { |
| 3268 | struct perf_event_context *ctx = event->ctx; | 3287 | if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, |
| 3269 | 3288 | "unexpected event refcount: %ld; ptr=%p\n", | |
| 3270 | WARN_ON_ONCE(ctx->parent_ctx); | 3289 | atomic_long_read(&event->refcount), event)) { |
| 3271 | /* | 3290 | /* leak to avoid use-after-free */ |
| 3272 | * There are two ways this annotation is useful: | 3291 | return; |
| 3273 | * | 3292 | } |
| 3274 | * 1) there is a lock recursion from perf_event_exit_task | ||
| 3275 | * see the comment there. | ||
| 3276 | * | ||
| 3277 | * 2) there is a lock-inversion with mmap_sem through | ||
| 3278 | * perf_event_read_group(), which takes faults while | ||
| 3279 | * holding ctx->mutex, however this is called after | ||
| 3280 | * the last filedesc died, so there is no possibility | ||
| 3281 | * to trigger the AB-BA case. | ||
| 3282 | */ | ||
| 3283 | mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); | ||
| 3284 | raw_spin_lock_irq(&ctx->lock); | ||
| 3285 | perf_group_detach(event); | ||
| 3286 | raw_spin_unlock_irq(&ctx->lock); | ||
| 3287 | perf_remove_from_context(event); | ||
| 3288 | mutex_unlock(&ctx->mutex); | ||
| 3289 | |||
| 3290 | free_event(event); | ||
| 3291 | 3293 | ||
| 3292 | return 0; | 3294 | _free_event(event); |
| 3293 | } | 3295 | } |
| 3294 | EXPORT_SYMBOL_GPL(perf_event_release_kernel); | ||
| 3295 | 3296 | ||
| 3296 | /* | 3297 | /* |
| 3297 | * Called when the last reference to the file is gone. | 3298 | * Called when the last reference to the file is gone. |
| 3298 | */ | 3299 | */ |
| 3299 | static void put_event(struct perf_event *event) | 3300 | static void put_event(struct perf_event *event) |
| 3300 | { | 3301 | { |
| 3302 | struct perf_event_context *ctx = event->ctx; | ||
| 3301 | struct task_struct *owner; | 3303 | struct task_struct *owner; |
| 3302 | 3304 | ||
| 3303 | if (!atomic_long_dec_and_test(&event->refcount)) | 3305 | if (!atomic_long_dec_and_test(&event->refcount)) |
| @@ -3336,9 +3338,33 @@ static void put_event(struct perf_event *event) | |||
| 3336 | put_task_struct(owner); | 3338 | put_task_struct(owner); |
| 3337 | } | 3339 | } |
| 3338 | 3340 | ||
| 3339 | perf_event_release_kernel(event); | 3341 | WARN_ON_ONCE(ctx->parent_ctx); |
| 3342 | /* | ||
| 3343 | * There are two ways this annotation is useful: | ||
| 3344 | * | ||
| 3345 | * 1) there is a lock recursion from perf_event_exit_task | ||
| 3346 | * see the comment there. | ||
| 3347 | * | ||
| 3348 | * 2) there is a lock-inversion with mmap_sem through | ||
| 3349 | * perf_event_read_group(), which takes faults while | ||
| 3350 | * holding ctx->mutex, however this is called after | ||
| 3351 | * the last filedesc died, so there is no possibility | ||
| 3352 | * to trigger the AB-BA case. | ||
| 3353 | */ | ||
| 3354 | mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); | ||
| 3355 | perf_remove_from_context(event, true); | ||
| 3356 | mutex_unlock(&ctx->mutex); | ||
| 3357 | |||
| 3358 | _free_event(event); | ||
| 3340 | } | 3359 | } |
| 3341 | 3360 | ||
| 3361 | int perf_event_release_kernel(struct perf_event *event) | ||
| 3362 | { | ||
| 3363 | put_event(event); | ||
| 3364 | return 0; | ||
| 3365 | } | ||
| 3366 | EXPORT_SYMBOL_GPL(perf_event_release_kernel); | ||
| 3367 | |||
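With this reshuffle, put_event() owns the whole teardown: it drops the reference, and only the thread that drops the last one detaches the event from its owner and context and frees it, while perf_event_release_kernel() becomes a thin wrapper. A minimal pthread/C11 sketch of "last reference does the teardown" (types and helpers here are illustrative, not kernel API):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdlib.h>

    struct context {
        pthread_mutex_t mutex;      /* protects the context's event list */
    };

    struct event {
        atomic_long refcount;
        struct context *ctx;
    };

    static void remove_from_context(struct event *e)
    {
        (void)e;                    /* unlink from ctx lists; elided here */
    }

    static void put_event(struct event *e)
    {
        if (atomic_fetch_sub(&e->refcount, 1) != 1)
            return;                 /* not the last reference */

        /* Last reference: unlink under the context mutex, then free. */
        pthread_mutex_lock(&e->ctx->mutex);
        remove_from_context(e);
        pthread_mutex_unlock(&e->ctx->mutex);
        free(e);
    }

    /* Thin wrapper, mirroring what perf_event_release_kernel() becomes. */
    static int event_release(struct event *e)
    {
        put_event(e);
        return 0;
    }

    int main(void)
    {
        struct context ctx = { .mutex = PTHREAD_MUTEX_INITIALIZER };
        struct event *e = calloc(1, sizeof(*e));

        e->ctx = &ctx;
        atomic_store(&e->refcount, 1);
        return event_release(e);
    }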
| 3342 | static int perf_release(struct inode *inode, struct file *file) | 3368 | static int perf_release(struct inode *inode, struct file *file) |
| 3343 | { | 3369 | { |
| 3344 | put_event(file->private_data); | 3370 | put_event(file->private_data); |
| @@ -3839,28 +3865,47 @@ unlock: | |||
| 3839 | static void ring_buffer_attach(struct perf_event *event, | 3865 | static void ring_buffer_attach(struct perf_event *event, |
| 3840 | struct ring_buffer *rb) | 3866 | struct ring_buffer *rb) |
| 3841 | { | 3867 | { |
| 3868 | struct ring_buffer *old_rb = NULL; | ||
| 3842 | unsigned long flags; | 3869 | unsigned long flags; |
| 3843 | 3870 | ||
| 3844 | if (!list_empty(&event->rb_entry)) | 3871 | if (event->rb) { |
| 3845 | return; | 3872 | /* |
| 3873 | * Should be impossible, we set this when removing | ||
| 3874 | * event->rb_entry and wait/clear when adding event->rb_entry. | ||
| 3875 | */ | ||
| 3876 | WARN_ON_ONCE(event->rcu_pending); | ||
| 3846 | 3877 | ||
| 3847 | spin_lock_irqsave(&rb->event_lock, flags); | 3878 | old_rb = event->rb; |
| 3848 | if (list_empty(&event->rb_entry)) | 3879 | event->rcu_batches = get_state_synchronize_rcu(); |
| 3849 | list_add(&event->rb_entry, &rb->event_list); | 3880 | event->rcu_pending = 1; |
| 3850 | spin_unlock_irqrestore(&rb->event_lock, flags); | ||
| 3851 | } | ||
| 3852 | 3881 | ||
| 3853 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) | 3882 | spin_lock_irqsave(&old_rb->event_lock, flags); |
| 3854 | { | 3883 | list_del_rcu(&event->rb_entry); |
| 3855 | unsigned long flags; | 3884 | spin_unlock_irqrestore(&old_rb->event_lock, flags); |
| 3885 | } | ||
| 3856 | 3886 | ||
| 3857 | if (list_empty(&event->rb_entry)) | 3887 | if (event->rcu_pending && rb) { |
| 3858 | return; | 3888 | cond_synchronize_rcu(event->rcu_batches); |
| 3889 | event->rcu_pending = 0; | ||
| 3890 | } | ||
| 3891 | |||
| 3892 | if (rb) { | ||
| 3893 | spin_lock_irqsave(&rb->event_lock, flags); | ||
| 3894 | list_add_rcu(&event->rb_entry, &rb->event_list); | ||
| 3895 | spin_unlock_irqrestore(&rb->event_lock, flags); | ||
| 3896 | } | ||
| 3897 | |||
| 3898 | rcu_assign_pointer(event->rb, rb); | ||
| 3859 | 3899 | ||
| 3860 | spin_lock_irqsave(&rb->event_lock, flags); | 3900 | if (old_rb) { |
| 3861 | list_del_init(&event->rb_entry); | 3901 | ring_buffer_put(old_rb); |
| 3862 | wake_up_all(&event->waitq); | 3902 | /* |
| 3863 | spin_unlock_irqrestore(&rb->event_lock, flags); | 3903 | * We detached before setting the new rb so that we |
| 3904 | * could attach the new rb; we could have missed a wakeup. |
| 3905 | * Provide it now. | ||
| 3906 | */ | ||
| 3907 | wake_up_all(&event->waitq); | ||
| 3908 | } | ||
| 3864 | } | 3909 | } |
| 3865 | 3910 | ||
| 3866 | static void ring_buffer_wakeup(struct perf_event *event) | 3911 | static void ring_buffer_wakeup(struct perf_event *event) |
| @@ -3929,7 +3974,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
| 3929 | { | 3974 | { |
| 3930 | struct perf_event *event = vma->vm_file->private_data; | 3975 | struct perf_event *event = vma->vm_file->private_data; |
| 3931 | 3976 | ||
| 3932 | struct ring_buffer *rb = event->rb; | 3977 | struct ring_buffer *rb = ring_buffer_get(event); |
| 3933 | struct user_struct *mmap_user = rb->mmap_user; | 3978 | struct user_struct *mmap_user = rb->mmap_user; |
| 3934 | int mmap_locked = rb->mmap_locked; | 3979 | int mmap_locked = rb->mmap_locked; |
| 3935 | unsigned long size = perf_data_size(rb); | 3980 | unsigned long size = perf_data_size(rb); |
| @@ -3937,18 +3982,14 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
| 3937 | atomic_dec(&rb->mmap_count); | 3982 | atomic_dec(&rb->mmap_count); |
| 3938 | 3983 | ||
| 3939 | if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) | 3984 | if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) |
| 3940 | return; | 3985 | goto out_put; |
| 3941 | 3986 | ||
| 3942 | /* Detach current event from the buffer. */ | 3987 | ring_buffer_attach(event, NULL); |
| 3943 | rcu_assign_pointer(event->rb, NULL); | ||
| 3944 | ring_buffer_detach(event, rb); | ||
| 3945 | mutex_unlock(&event->mmap_mutex); | 3988 | mutex_unlock(&event->mmap_mutex); |
| 3946 | 3989 | ||
| 3947 | /* If there's still other mmap()s of this buffer, we're done. */ | 3990 | /* If there's still other mmap()s of this buffer, we're done. */ |
| 3948 | if (atomic_read(&rb->mmap_count)) { | 3991 | if (atomic_read(&rb->mmap_count)) |
| 3949 | ring_buffer_put(rb); /* can't be last */ | 3992 | goto out_put; |
| 3950 | return; | ||
| 3951 | } | ||
| 3952 | 3993 | ||
| 3953 | /* | 3994 | /* |
| 3954 | * No other mmap()s, detach from all other events that might redirect | 3995 | * No other mmap()s, detach from all other events that might redirect |
| @@ -3978,11 +4019,9 @@ again: | |||
| 3978 | * still restart the iteration to make sure we're not now | 4019 | * still restart the iteration to make sure we're not now |
| 3979 | * iterating the wrong list. | 4020 | * iterating the wrong list. |
| 3980 | */ | 4021 | */ |
| 3981 | if (event->rb == rb) { | 4022 | if (event->rb == rb) |
| 3982 | rcu_assign_pointer(event->rb, NULL); | 4023 | ring_buffer_attach(event, NULL); |
| 3983 | ring_buffer_detach(event, rb); | 4024 | |
| 3984 | ring_buffer_put(rb); /* can't be last, we still have one */ | ||
| 3985 | } | ||
| 3986 | mutex_unlock(&event->mmap_mutex); | 4025 | mutex_unlock(&event->mmap_mutex); |
| 3987 | put_event(event); | 4026 | put_event(event); |
| 3988 | 4027 | ||
| @@ -4007,6 +4046,7 @@ again: | |||
| 4007 | vma->vm_mm->pinned_vm -= mmap_locked; | 4046 | vma->vm_mm->pinned_vm -= mmap_locked; |
| 4008 | free_uid(mmap_user); | 4047 | free_uid(mmap_user); |
| 4009 | 4048 | ||
| 4049 | out_put: | ||
| 4010 | ring_buffer_put(rb); /* could be last */ | 4050 | ring_buffer_put(rb); /* could be last */ |
| 4011 | } | 4051 | } |
| 4012 | 4052 | ||
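perf_mmap_close() now takes its own reference up front via ring_buffer_get() and funnels every early exit through a single out_put label, so the "could be last" ring_buffer_put() runs exactly once on every path. A userspace model of that shape follows; the atomic_dec_and_mutex_lock() step is simplified to a plain decrement plus lock, and all names are illustrative:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdlib.h>

    struct ring_buffer {
        atomic_int refcount;
        atomic_int mmap_count;
    };

    struct event {
        struct ring_buffer *rb;
        atomic_int mmap_count;
        pthread_mutex_t mmap_mutex;
    };

    static struct ring_buffer *rb_get(struct event *e)
    {
        atomic_fetch_add(&e->rb->refcount, 1);    /* our own reference for this call */
        return e->rb;
    }

    static void rb_put(struct ring_buffer *rb)
    {
        if (atomic_fetch_sub(&rb->refcount, 1) == 1)
            free(rb);                             /* could be last */
    }

    static void event_detach_rb(struct event *e)  /* ring_buffer_attach(event, NULL) */
    {
        struct ring_buffer *old = e->rb;

        e->rb = NULL;
        if (old)
            rb_put(old);                          /* drop the attach reference */
    }

    static void mmap_close(struct event *e)
    {
        struct ring_buffer *rb = rb_get(e);

        atomic_fetch_sub(&rb->mmap_count, 1);

        if (atomic_fetch_sub(&e->mmap_count, 1) != 1)
            goto out_put;                         /* not the last mmap of this event */

        pthread_mutex_lock(&e->mmap_mutex);
        event_detach_rb(e);
        pthread_mutex_unlock(&e->mmap_mutex);

        if (atomic_load(&rb->mmap_count))
            goto out_put;                         /* other mmap()s still alive */

        /* ... undo locked-memory accounting here ... */
    out_put:
        rb_put(rb);                               /* runs exactly once on every path */
    }

    int main(void)
    {
        struct event e = { .mmap_mutex = PTHREAD_MUTEX_INITIALIZER };

        e.rb = calloc(1, sizeof(*e.rb));
        atomic_store(&e.rb->refcount, 1);         /* the event's attach reference */
        atomic_store(&e.rb->mmap_count, 1);
        atomic_store(&e.mmap_count, 1);
        mmap_close(&e);
        return 0;
    }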
| @@ -4124,7 +4164,6 @@ again: | |||
| 4124 | vma->vm_mm->pinned_vm += extra; | 4164 | vma->vm_mm->pinned_vm += extra; |
| 4125 | 4165 | ||
| 4126 | ring_buffer_attach(event, rb); | 4166 | ring_buffer_attach(event, rb); |
| 4127 | rcu_assign_pointer(event->rb, rb); | ||
| 4128 | 4167 | ||
| 4129 | perf_event_init_userpage(event); | 4168 | perf_event_init_userpage(event); |
| 4130 | perf_event_update_userpage(event); | 4169 | perf_event_update_userpage(event); |
| @@ -5408,6 +5447,9 @@ struct swevent_htable { | |||
| 5408 | 5447 | ||
| 5409 | /* Recursion avoidance in each contexts */ | 5448 | /* Recursion avoidance in each contexts */ |
| 5410 | int recursion[PERF_NR_CONTEXTS]; | 5449 | int recursion[PERF_NR_CONTEXTS]; |
| 5450 | |||
| 5451 | /* Keeps track of cpu being initialized/exited */ | ||
| 5452 | bool online; | ||
| 5411 | }; | 5453 | }; |
| 5412 | 5454 | ||
| 5413 | static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); | 5455 | static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); |
| @@ -5654,8 +5696,14 @@ static int perf_swevent_add(struct perf_event *event, int flags) | |||
| 5654 | hwc->state = !(flags & PERF_EF_START); | 5696 | hwc->state = !(flags & PERF_EF_START); |
| 5655 | 5697 | ||
| 5656 | head = find_swevent_head(swhash, event); | 5698 | head = find_swevent_head(swhash, event); |
| 5657 | if (WARN_ON_ONCE(!head)) | 5699 | if (!head) { |
| 5700 | /* | ||
| 5701 | * We can race with cpu hotplug code. Do not | ||
| 5702 | * WARN if the cpu just got unplugged. | ||
| 5703 | */ | ||
| 5704 | WARN_ON_ONCE(swhash->online); | ||
| 5658 | return -EINVAL; | 5705 | return -EINVAL; |
| 5706 | } | ||
| 5659 | 5707 | ||
| 5660 | hlist_add_head_rcu(&event->hlist_entry, head); | 5708 | hlist_add_head_rcu(&event->hlist_entry, head); |
| 5661 | 5709 | ||
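The swevent hash now records whether its CPU is online, so perf_swevent_add() can tell an expected hotplug race (hlist missing because the CPU just went away) from a real bug, and only warns in the latter case. A trimmed-down sketch of "warn only if the backing structure should have existed" (the types are stand-ins, not kernel structures):

    #include <stdbool.h>
    #include <stdio.h>

    struct swevent_htable {
        void *hlist;        /* allocated while the CPU is online */
        bool online;        /* flipped in the CPU hotplug callbacks */
    };

    static int swevent_add(struct swevent_htable *swhash)
    {
        if (!swhash->hlist) {
            /* Racing with CPU unplug is fine; a missing hlist on an
             * online CPU is not. */
            if (swhash->online)
                fprintf(stderr, "WARN: missing hlist on online CPU\n");
            return -1;      /* -EINVAL in the kernel */
        }
        /* ... hlist_add_head_rcu() ... */
        return 0;
    }

    int main(void)
    {
        struct swevent_htable offline = { .hlist = NULL, .online = false };
        struct swevent_htable broken  = { .hlist = NULL, .online = true };

        swevent_add(&offline);  /* silent failure: expected hotplug race */
        swevent_add(&broken);   /* warns: this really is a bug */
        return 0;
    }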
| @@ -6551,6 +6599,7 @@ free_pdc: | |||
| 6551 | free_percpu(pmu->pmu_disable_count); | 6599 | free_percpu(pmu->pmu_disable_count); |
| 6552 | goto unlock; | 6600 | goto unlock; |
| 6553 | } | 6601 | } |
| 6602 | EXPORT_SYMBOL_GPL(perf_pmu_register); | ||
| 6554 | 6603 | ||
| 6555 | void perf_pmu_unregister(struct pmu *pmu) | 6604 | void perf_pmu_unregister(struct pmu *pmu) |
| 6556 | { | 6605 | { |
| @@ -6572,6 +6621,7 @@ void perf_pmu_unregister(struct pmu *pmu) | |||
| 6572 | put_device(pmu->dev); | 6621 | put_device(pmu->dev); |
| 6573 | free_pmu_context(pmu); | 6622 | free_pmu_context(pmu); |
| 6574 | } | 6623 | } |
| 6624 | EXPORT_SYMBOL_GPL(perf_pmu_unregister); | ||
| 6575 | 6625 | ||
| 6576 | struct pmu *perf_init_event(struct perf_event *event) | 6626 | struct pmu *perf_init_event(struct perf_event *event) |
| 6577 | { | 6627 | { |
| @@ -6585,6 +6635,10 @@ struct pmu *perf_init_event(struct perf_event *event) | |||
| 6585 | pmu = idr_find(&pmu_idr, event->attr.type); | 6635 | pmu = idr_find(&pmu_idr, event->attr.type); |
| 6586 | rcu_read_unlock(); | 6636 | rcu_read_unlock(); |
| 6587 | if (pmu) { | 6637 | if (pmu) { |
| 6638 | if (!try_module_get(pmu->module)) { | ||
| 6639 | pmu = ERR_PTR(-ENODEV); | ||
| 6640 | goto unlock; | ||
| 6641 | } | ||
| 6588 | event->pmu = pmu; | 6642 | event->pmu = pmu; |
| 6589 | ret = pmu->event_init(event); | 6643 | ret = pmu->event_init(event); |
| 6590 | if (ret) | 6644 | if (ret) |
| @@ -6593,6 +6647,10 @@ struct pmu *perf_init_event(struct perf_event *event) | |||
| 6593 | } | 6647 | } |
| 6594 | 6648 | ||
| 6595 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 6649 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
| 6650 | if (!try_module_get(pmu->module)) { | ||
| 6651 | pmu = ERR_PTR(-ENODEV); | ||
| 6652 | goto unlock; | ||
| 6653 | } | ||
| 6596 | event->pmu = pmu; | 6654 | event->pmu = pmu; |
| 6597 | ret = pmu->event_init(event); | 6655 | ret = pmu->event_init(event); |
| 6598 | if (!ret) | 6656 | if (!ret) |
| @@ -6771,6 +6829,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 6771 | err_pmu: | 6829 | err_pmu: |
| 6772 | if (event->destroy) | 6830 | if (event->destroy) |
| 6773 | event->destroy(event); | 6831 | event->destroy(event); |
| 6832 | module_put(pmu->module); | ||
| 6774 | err_ns: | 6833 | err_ns: |
| 6775 | if (event->ns) | 6834 | if (event->ns) |
| 6776 | put_pid_ns(event->ns); | 6835 | put_pid_ns(event->ns); |
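The two hunks above pin the candidate PMU's module with try_module_get() before calling its event_init() callback, and the err_pmu path in perf_event_alloc() drops that pin again, so a PMU module cannot be unloaded while one of its callbacks runs. A hedged userspace sketch of the take-ref-then-call pattern (module_try_get()/module_put() here are simplified stand-ins, not the kernel's lock-free implementation):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct module {
        atomic_int refcount;
        bool live;
    };

    struct pmu {
        struct module *module;
        int (*event_init)(void);
    };

    static bool module_try_get(struct module *m)
    {
        if (!m->live)
            return false;
        atomic_fetch_add(&m->refcount, 1);
        return true;
    }

    static void module_put(struct module *m)
    {
        atomic_fetch_sub(&m->refcount, 1);
    }

    static int init_event(struct pmu *pmu)
    {
        if (!module_try_get(pmu->module))
            return -19;                     /* -ENODEV: module going away */

        int ret = pmu->event_init();        /* callback runs with the module pinned */
        if (ret)
            module_put(pmu->module);        /* error path drops the pin again */
        return ret;                         /* on success the event keeps the pin */
    }

    static int dummy_event_init(void) { return 0; }

    int main(void)
    {
        struct module m = { .live = true };
        struct pmu pmu = { .module = &m, .event_init = dummy_event_init };

        printf("init=%d refs=%d\n", init_event(&pmu), atomic_load(&m.refcount));
        return 0;
    }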
| @@ -6914,7 +6973,7 @@ err_size: | |||
| 6914 | static int | 6973 | static int |
| 6915 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) | 6974 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) |
| 6916 | { | 6975 | { |
| 6917 | struct ring_buffer *rb = NULL, *old_rb = NULL; | 6976 | struct ring_buffer *rb = NULL; |
| 6918 | int ret = -EINVAL; | 6977 | int ret = -EINVAL; |
| 6919 | 6978 | ||
| 6920 | if (!output_event) | 6979 | if (!output_event) |
| @@ -6942,8 +7001,6 @@ set: | |||
| 6942 | if (atomic_read(&event->mmap_count)) | 7001 | if (atomic_read(&event->mmap_count)) |
| 6943 | goto unlock; | 7002 | goto unlock; |
| 6944 | 7003 | ||
| 6945 | old_rb = event->rb; | ||
| 6946 | |||
| 6947 | if (output_event) { | 7004 | if (output_event) { |
| 6948 | /* get the rb we want to redirect to */ | 7005 | /* get the rb we want to redirect to */ |
| 6949 | rb = ring_buffer_get(output_event); | 7006 | rb = ring_buffer_get(output_event); |
| @@ -6951,23 +7008,7 @@ set: | |||
| 6951 | goto unlock; | 7008 | goto unlock; |
| 6952 | } | 7009 | } |
| 6953 | 7010 | ||
| 6954 | if (old_rb) | 7011 | ring_buffer_attach(event, rb); |
| 6955 | ring_buffer_detach(event, old_rb); | ||
| 6956 | |||
| 6957 | if (rb) | ||
| 6958 | ring_buffer_attach(event, rb); | ||
| 6959 | |||
| 6960 | rcu_assign_pointer(event->rb, rb); | ||
| 6961 | |||
| 6962 | if (old_rb) { | ||
| 6963 | ring_buffer_put(old_rb); | ||
| 6964 | /* | ||
| 6965 | * Since we detached before setting the new rb, so that we | ||
| 6966 | * could attach the new rb, we could have missed a wakeup. | ||
| 6967 | * Provide it now. | ||
| 6968 | */ | ||
| 6969 | wake_up_all(&event->waitq); | ||
| 6970 | } | ||
| 6971 | 7012 | ||
| 6972 | ret = 0; | 7013 | ret = 0; |
| 6973 | unlock: | 7014 | unlock: |
| @@ -7018,6 +7059,9 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7018 | if (attr.freq) { | 7059 | if (attr.freq) { |
| 7019 | if (attr.sample_freq > sysctl_perf_event_sample_rate) | 7060 | if (attr.sample_freq > sysctl_perf_event_sample_rate) |
| 7020 | return -EINVAL; | 7061 | return -EINVAL; |
| 7062 | } else { | ||
| 7063 | if (attr.sample_period & (1ULL << 63)) | ||
| 7064 | return -EINVAL; | ||
| 7021 | } | 7065 | } |
| 7022 | 7066 | ||
| 7023 | /* | 7067 | /* |
| @@ -7055,20 +7099,26 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7055 | } | 7099 | } |
| 7056 | } | 7100 | } |
| 7057 | 7101 | ||
| 7102 | if (task && group_leader && | ||
| 7103 | group_leader->attr.inherit != attr.inherit) { | ||
| 7104 | err = -EINVAL; | ||
| 7105 | goto err_task; | ||
| 7106 | } | ||
| 7107 | |||
| 7058 | get_online_cpus(); | 7108 | get_online_cpus(); |
| 7059 | 7109 | ||
| 7060 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, | 7110 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, |
| 7061 | NULL, NULL); | 7111 | NULL, NULL); |
| 7062 | if (IS_ERR(event)) { | 7112 | if (IS_ERR(event)) { |
| 7063 | err = PTR_ERR(event); | 7113 | err = PTR_ERR(event); |
| 7064 | goto err_task; | 7114 | goto err_cpus; |
| 7065 | } | 7115 | } |
| 7066 | 7116 | ||
| 7067 | if (flags & PERF_FLAG_PID_CGROUP) { | 7117 | if (flags & PERF_FLAG_PID_CGROUP) { |
| 7068 | err = perf_cgroup_connect(pid, event, &attr, group_leader); | 7118 | err = perf_cgroup_connect(pid, event, &attr, group_leader); |
| 7069 | if (err) { | 7119 | if (err) { |
| 7070 | __free_event(event); | 7120 | __free_event(event); |
| 7071 | goto err_task; | 7121 | goto err_cpus; |
| 7072 | } | 7122 | } |
| 7073 | } | 7123 | } |
| 7074 | 7124 | ||
| @@ -7165,7 +7215,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7165 | struct perf_event_context *gctx = group_leader->ctx; | 7215 | struct perf_event_context *gctx = group_leader->ctx; |
| 7166 | 7216 | ||
| 7167 | mutex_lock(&gctx->mutex); | 7217 | mutex_lock(&gctx->mutex); |
| 7168 | perf_remove_from_context(group_leader); | 7218 | perf_remove_from_context(group_leader, false); |
| 7169 | 7219 | ||
| 7170 | /* | 7220 | /* |
| 7171 | * Removing from the context ends up with disabled | 7221 | * Removing from the context ends up with disabled |
| @@ -7175,7 +7225,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7175 | perf_event__state_init(group_leader); | 7225 | perf_event__state_init(group_leader); |
| 7176 | list_for_each_entry(sibling, &group_leader->sibling_list, | 7226 | list_for_each_entry(sibling, &group_leader->sibling_list, |
| 7177 | group_entry) { | 7227 | group_entry) { |
| 7178 | perf_remove_from_context(sibling); | 7228 | perf_remove_from_context(sibling, false); |
| 7179 | perf_event__state_init(sibling); | 7229 | perf_event__state_init(sibling); |
| 7180 | put_ctx(gctx); | 7230 | put_ctx(gctx); |
| 7181 | } | 7231 | } |
| @@ -7230,8 +7280,9 @@ err_context: | |||
| 7230 | put_ctx(ctx); | 7280 | put_ctx(ctx); |
| 7231 | err_alloc: | 7281 | err_alloc: |
| 7232 | free_event(event); | 7282 | free_event(event); |
| 7233 | err_task: | 7283 | err_cpus: |
| 7234 | put_online_cpus(); | 7284 | put_online_cpus(); |
| 7285 | err_task: | ||
| 7235 | if (task) | 7286 | if (task) |
| 7236 | put_task_struct(task); | 7287 | put_task_struct(task); |
| 7237 | err_group_fd: | 7288 | err_group_fd: |
| @@ -7305,7 +7356,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) | |||
| 7305 | mutex_lock(&src_ctx->mutex); | 7356 | mutex_lock(&src_ctx->mutex); |
| 7306 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, | 7357 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, |
| 7307 | event_entry) { | 7358 | event_entry) { |
| 7308 | perf_remove_from_context(event); | 7359 | perf_remove_from_context(event, false); |
| 7309 | unaccount_event_cpu(event, src_cpu); | 7360 | unaccount_event_cpu(event, src_cpu); |
| 7310 | put_ctx(src_ctx); | 7361 | put_ctx(src_ctx); |
| 7311 | list_add(&event->migrate_entry, &events); | 7362 | list_add(&event->migrate_entry, &events); |
| @@ -7367,13 +7418,7 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
| 7367 | struct perf_event_context *child_ctx, | 7418 | struct perf_event_context *child_ctx, |
| 7368 | struct task_struct *child) | 7419 | struct task_struct *child) |
| 7369 | { | 7420 | { |
| 7370 | if (child_event->parent) { | 7421 | perf_remove_from_context(child_event, true); |
| 7371 | raw_spin_lock_irq(&child_ctx->lock); | ||
| 7372 | perf_group_detach(child_event); | ||
| 7373 | raw_spin_unlock_irq(&child_ctx->lock); | ||
| 7374 | } | ||
| 7375 | |||
| 7376 | perf_remove_from_context(child_event); | ||
| 7377 | 7422 | ||
| 7378 | /* | 7423 | /* |
| 7379 | * It can happen that the parent exits first, and has events | 7424 | * It can happen that the parent exits first, and has events |
| @@ -7388,7 +7433,7 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
| 7388 | 7433 | ||
| 7389 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | 7434 | static void perf_event_exit_task_context(struct task_struct *child, int ctxn) |
| 7390 | { | 7435 | { |
| 7391 | struct perf_event *child_event, *tmp; | 7436 | struct perf_event *child_event; |
| 7392 | struct perf_event_context *child_ctx; | 7437 | struct perf_event_context *child_ctx; |
| 7393 | unsigned long flags; | 7438 | unsigned long flags; |
| 7394 | 7439 | ||
| @@ -7442,24 +7487,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | |||
| 7442 | */ | 7487 | */ |
| 7443 | mutex_lock(&child_ctx->mutex); | 7488 | mutex_lock(&child_ctx->mutex); |
| 7444 | 7489 | ||
| 7445 | again: | 7490 | list_for_each_entry_rcu(child_event, &child_ctx->event_list, event_entry) |
| 7446 | list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, | ||
| 7447 | group_entry) | ||
| 7448 | __perf_event_exit_task(child_event, child_ctx, child); | 7491 | __perf_event_exit_task(child_event, child_ctx, child); |
| 7449 | 7492 | ||
| 7450 | list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, | ||
| 7451 | group_entry) | ||
| 7452 | __perf_event_exit_task(child_event, child_ctx, child); | ||
| 7453 | |||
| 7454 | /* | ||
| 7455 | * If the last event was a group event, it will have appended all | ||
| 7456 | * its siblings to the list, but we obtained 'tmp' before that which | ||
| 7457 | * will still point to the list head terminating the iteration. | ||
| 7458 | */ | ||
| 7459 | if (!list_empty(&child_ctx->pinned_groups) || | ||
| 7460 | !list_empty(&child_ctx->flexible_groups)) | ||
| 7461 | goto again; | ||
| 7462 | |||
| 7463 | mutex_unlock(&child_ctx->mutex); | 7493 | mutex_unlock(&child_ctx->mutex); |
| 7464 | 7494 | ||
| 7465 | put_ctx(child_ctx); | 7495 | put_ctx(child_ctx); |
| @@ -7724,6 +7754,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
| 7724 | * swapped under us. | 7754 | * swapped under us. |
| 7725 | */ | 7755 | */ |
| 7726 | parent_ctx = perf_pin_task_context(parent, ctxn); | 7756 | parent_ctx = perf_pin_task_context(parent, ctxn); |
| 7757 | if (!parent_ctx) | ||
| 7758 | return 0; | ||
| 7727 | 7759 | ||
| 7728 | /* | 7760 | /* |
| 7729 | * No need to check if parent_ctx != NULL here; since we saw | 7761 | * No need to check if parent_ctx != NULL here; since we saw |
| @@ -7835,6 +7867,7 @@ static void perf_event_init_cpu(int cpu) | |||
| 7835 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); | 7867 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
| 7836 | 7868 | ||
| 7837 | mutex_lock(&swhash->hlist_mutex); | 7869 | mutex_lock(&swhash->hlist_mutex); |
| 7870 | swhash->online = true; | ||
| 7838 | if (swhash->hlist_refcount > 0) { | 7871 | if (swhash->hlist_refcount > 0) { |
| 7839 | struct swevent_hlist *hlist; | 7872 | struct swevent_hlist *hlist; |
| 7840 | 7873 | ||
| @@ -7857,14 +7890,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu) | |||
| 7857 | 7890 | ||
| 7858 | static void __perf_event_exit_context(void *__info) | 7891 | static void __perf_event_exit_context(void *__info) |
| 7859 | { | 7892 | { |
| 7893 | struct remove_event re = { .detach_group = false }; | ||
| 7860 | struct perf_event_context *ctx = __info; | 7894 | struct perf_event_context *ctx = __info; |
| 7861 | struct perf_event *event; | ||
| 7862 | 7895 | ||
| 7863 | perf_pmu_rotate_stop(ctx->pmu); | 7896 | perf_pmu_rotate_stop(ctx->pmu); |
| 7864 | 7897 | ||
| 7865 | rcu_read_lock(); | 7898 | rcu_read_lock(); |
| 7866 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) | 7899 | list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) |
| 7867 | __perf_remove_from_context(event); | 7900 | __perf_remove_from_context(&re); |
| 7868 | rcu_read_unlock(); | 7901 | rcu_read_unlock(); |
| 7869 | } | 7902 | } |
| 7870 | 7903 | ||
| @@ -7892,6 +7925,7 @@ static void perf_event_exit_cpu(int cpu) | |||
| 7892 | perf_event_exit_cpu_context(cpu); | 7925 | perf_event_exit_cpu_context(cpu); |
| 7893 | 7926 | ||
| 7894 | mutex_lock(&swhash->hlist_mutex); | 7927 | mutex_lock(&swhash->hlist_mutex); |
| 7928 | swhash->online = false; | ||
| 7895 | swevent_hlist_release(swhash); | 7929 | swevent_hlist_release(swhash); |
| 7896 | mutex_unlock(&swhash->hlist_mutex); | 7930 | mutex_unlock(&swhash->hlist_mutex); |
| 7897 | } | 7931 | } |
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 04709b66369d..adcd76a96839 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
| @@ -60,8 +60,6 @@ static struct percpu_rw_semaphore dup_mmap_sem; | |||
| 60 | 60 | ||
| 61 | /* Have a copy of original instruction */ | 61 | /* Have a copy of original instruction */ |
| 62 | #define UPROBE_COPY_INSN 0 | 62 | #define UPROBE_COPY_INSN 0 |
| 63 | /* Can skip singlestep */ | ||
| 64 | #define UPROBE_SKIP_SSTEP 1 | ||
| 65 | 63 | ||
| 66 | struct uprobe { | 64 | struct uprobe { |
| 67 | struct rb_node rb_node; /* node in the rb tree */ | 65 | struct rb_node rb_node; /* node in the rb tree */ |
| @@ -491,12 +489,9 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) | |||
| 491 | uprobe->offset = offset; | 489 | uprobe->offset = offset; |
| 492 | init_rwsem(&uprobe->register_rwsem); | 490 | init_rwsem(&uprobe->register_rwsem); |
| 493 | init_rwsem(&uprobe->consumer_rwsem); | 491 | init_rwsem(&uprobe->consumer_rwsem); |
| 494 | /* For now assume that the instruction need not be single-stepped */ | ||
| 495 | __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); | ||
| 496 | 492 | ||
| 497 | /* add to uprobes_tree, sorted on inode:offset */ | 493 | /* add to uprobes_tree, sorted on inode:offset */ |
| 498 | cur_uprobe = insert_uprobe(uprobe); | 494 | cur_uprobe = insert_uprobe(uprobe); |
| 499 | |||
| 500 | /* a uprobe exists for this inode:offset combination */ | 495 | /* a uprobe exists for this inode:offset combination */ |
| 501 | if (cur_uprobe) { | 496 | if (cur_uprobe) { |
| 502 | kfree(uprobe); | 497 | kfree(uprobe); |
| @@ -1296,14 +1291,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) | |||
| 1296 | if (unlikely(!xol_vaddr)) | 1291 | if (unlikely(!xol_vaddr)) |
| 1297 | return 0; | 1292 | return 0; |
| 1298 | 1293 | ||
| 1299 | /* Initialize the slot */ | 1294 | arch_uprobe_copy_ixol(area->page, xol_vaddr, |
| 1300 | copy_to_page(area->page, xol_vaddr, | 1295 | &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); |
| 1301 | &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); | ||
| 1302 | /* | ||
| 1303 | * We probably need flush_icache_user_range() but it needs vma. | ||
| 1304 | * This should work on supported architectures too. | ||
| 1305 | */ | ||
| 1306 | flush_dcache_page(area->page); | ||
| 1307 | 1296 | ||
| 1308 | return xol_vaddr; | 1297 | return xol_vaddr; |
| 1309 | } | 1298 | } |
| @@ -1346,6 +1335,21 @@ static void xol_free_insn_slot(struct task_struct *tsk) | |||
| 1346 | } | 1335 | } |
| 1347 | } | 1336 | } |
| 1348 | 1337 | ||
| 1338 | void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, | ||
| 1339 | void *src, unsigned long len) | ||
| 1340 | { | ||
| 1341 | /* Initialize the slot */ | ||
| 1342 | copy_to_page(page, vaddr, src, len); | ||
| 1343 | |||
| 1344 | /* | ||
| 1345 | * We probably need flush_icache_user_range() but it needs vma. | ||
| 1346 | * This should work on most of architectures by default. If | ||
| 1347 | * architecture needs to do something different it can define | ||
| 1348 | * its own version of the function. | ||
| 1349 | */ | ||
| 1350 | flush_dcache_page(page); | ||
| 1351 | } | ||
| 1352 | |||
| 1349 | /** | 1353 | /** |
| 1350 | * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs | 1354 | * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs |
| 1351 | * @regs: Reflects the saved state of the task after it has hit a breakpoint | 1355 | * @regs: Reflects the saved state of the task after it has hit a breakpoint |
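xol_get_insn_slot() now delegates slot initialization to arch_uprobe_copy_ixol(), added above as a __weak default (copy_to_page() plus flush_dcache_page()) that an architecture can replace with its own cache-maintenance sequence. The same weak-symbol override pattern works in ordinary GCC/Clang C; a minimal sketch, with copy_ixol() as a made-up name:

    #include <stdio.h>
    #include <string.h>

    /* Default implementation; a strong definition elsewhere wins at link time. */
    __attribute__((weak)) void copy_ixol(void *dst, const void *src, size_t len)
    {
        memcpy(dst, src, len);
        /* arch-specific cache maintenance would go here */
    }

    int main(void)
    {
        char slot[16] = { 0 };

        copy_ixol(slot, "\xcc", 1);     /* e.g. an int3-style breakpoint byte */
        printf("%#x\n", (unsigned char)slot[0]);
        return 0;
    }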
| @@ -1628,20 +1632,6 @@ bool uprobe_deny_signal(void) | |||
| 1628 | return true; | 1632 | return true; |
| 1629 | } | 1633 | } |
| 1630 | 1634 | ||
| 1631 | /* | ||
| 1632 | * Avoid singlestepping the original instruction if the original instruction | ||
| 1633 | * is a NOP or can be emulated. | ||
| 1634 | */ | ||
| 1635 | static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) | ||
| 1636 | { | ||
| 1637 | if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) { | ||
| 1638 | if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) | ||
| 1639 | return true; | ||
| 1640 | clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); | ||
| 1641 | } | ||
| 1642 | return false; | ||
| 1643 | } | ||
| 1644 | |||
| 1645 | static void mmf_recalc_uprobes(struct mm_struct *mm) | 1635 | static void mmf_recalc_uprobes(struct mm_struct *mm) |
| 1646 | { | 1636 | { |
| 1647 | struct vm_area_struct *vma; | 1637 | struct vm_area_struct *vma; |
| @@ -1868,13 +1858,13 @@ static void handle_swbp(struct pt_regs *regs) | |||
| 1868 | 1858 | ||
| 1869 | handler_chain(uprobe, regs); | 1859 | handler_chain(uprobe, regs); |
| 1870 | 1860 | ||
| 1871 | if (can_skip_sstep(uprobe, regs)) | 1861 | if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) |
| 1872 | goto out; | 1862 | goto out; |
| 1873 | 1863 | ||
| 1874 | if (!pre_ssout(uprobe, regs, bp_vaddr)) | 1864 | if (!pre_ssout(uprobe, regs, bp_vaddr)) |
| 1875 | return; | 1865 | return; |
| 1876 | 1866 | ||
| 1877 | /* can_skip_sstep() succeeded, or restart if can't singlestep */ | 1867 | /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ |
| 1878 | out: | 1868 | out: |
| 1879 | put_uprobe(uprobe); | 1869 | put_uprobe(uprobe); |
| 1880 | } | 1870 | } |
| @@ -1886,10 +1876,11 @@ out: | |||
| 1886 | static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) | 1876 | static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) |
| 1887 | { | 1877 | { |
| 1888 | struct uprobe *uprobe; | 1878 | struct uprobe *uprobe; |
| 1879 | int err = 0; | ||
| 1889 | 1880 | ||
| 1890 | uprobe = utask->active_uprobe; | 1881 | uprobe = utask->active_uprobe; |
| 1891 | if (utask->state == UTASK_SSTEP_ACK) | 1882 | if (utask->state == UTASK_SSTEP_ACK) |
| 1892 | arch_uprobe_post_xol(&uprobe->arch, regs); | 1883 | err = arch_uprobe_post_xol(&uprobe->arch, regs); |
| 1893 | else if (utask->state == UTASK_SSTEP_TRAPPED) | 1884 | else if (utask->state == UTASK_SSTEP_TRAPPED) |
| 1894 | arch_uprobe_abort_xol(&uprobe->arch, regs); | 1885 | arch_uprobe_abort_xol(&uprobe->arch, regs); |
| 1895 | else | 1886 | else |
| @@ -1903,6 +1894,11 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) | |||
| 1903 | spin_lock_irq(¤t->sighand->siglock); | 1894 | spin_lock_irq(¤t->sighand->siglock); |
| 1904 | recalc_sigpending(); /* see uprobe_deny_signal() */ | 1895 | recalc_sigpending(); /* see uprobe_deny_signal() */ |
| 1905 | spin_unlock_irq(¤t->sighand->siglock); | 1896 | spin_unlock_irq(¤t->sighand->siglock); |
| 1897 | |||
| 1898 | if (unlikely(err)) { | ||
| 1899 | uprobe_warn(current, "execute the probed insn, sending SIGILL."); | ||
| 1900 | force_sig_info(SIGILL, SEND_SIG_FORCED, current); | ||
| 1901 | } | ||
| 1906 | } | 1902 | } |
| 1907 | 1903 | ||
| 1908 | /* | 1904 | /* |
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 0dbeae374225..83d4382f5699 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c | |||
| @@ -37,7 +37,7 @@ static unsigned long ident_map[32] = { | |||
| 37 | struct exec_domain default_exec_domain = { | 37 | struct exec_domain default_exec_domain = { |
| 38 | .name = "Linux", /* name */ | 38 | .name = "Linux", /* name */ |
| 39 | .handler = default_handler, /* lcall7 causes a seg fault. */ | 39 | .handler = default_handler, /* lcall7 causes a seg fault. */ |
| 40 | .pers_low = 0, /* PER_LINUX personality. */ | 40 | .pers_low = 0, /* PER_LINUX personality. */ |
| 41 | .pers_high = 0, /* PER_LINUX personality. */ | 41 | .pers_high = 0, /* PER_LINUX personality. */ |
| 42 | .signal_map = ident_map, /* Identity map signals. */ | 42 | .signal_map = ident_map, /* Identity map signals. */ |
| 43 | .signal_invmap = ident_map, /* - both ways. */ | 43 | .signal_invmap = ident_map, /* - both ways. */ |
| @@ -83,7 +83,7 @@ lookup_exec_domain(unsigned int personality) | |||
| 83 | ep = &default_exec_domain; | 83 | ep = &default_exec_domain; |
| 84 | out: | 84 | out: |
| 85 | read_unlock(&exec_domains_lock); | 85 | read_unlock(&exec_domains_lock); |
| 86 | return (ep); | 86 | return ep; |
| 87 | } | 87 | } |
| 88 | 88 | ||
| 89 | int | 89 | int |
| @@ -110,8 +110,9 @@ register_exec_domain(struct exec_domain *ep) | |||
| 110 | 110 | ||
| 111 | out: | 111 | out: |
| 112 | write_unlock(&exec_domains_lock); | 112 | write_unlock(&exec_domains_lock); |
| 113 | return (err); | 113 | return err; |
| 114 | } | 114 | } |
| 115 | EXPORT_SYMBOL(register_exec_domain); | ||
| 115 | 116 | ||
| 116 | int | 117 | int |
| 117 | unregister_exec_domain(struct exec_domain *ep) | 118 | unregister_exec_domain(struct exec_domain *ep) |
| @@ -133,6 +134,7 @@ unregister: | |||
| 133 | write_unlock(&exec_domains_lock); | 134 | write_unlock(&exec_domains_lock); |
| 134 | return 0; | 135 | return 0; |
| 135 | } | 136 | } |
| 137 | EXPORT_SYMBOL(unregister_exec_domain); | ||
| 136 | 138 | ||
| 137 | int __set_personality(unsigned int personality) | 139 | int __set_personality(unsigned int personality) |
| 138 | { | 140 | { |
| @@ -144,6 +146,7 @@ int __set_personality(unsigned int personality) | |||
| 144 | 146 | ||
| 145 | return 0; | 147 | return 0; |
| 146 | } | 148 | } |
| 149 | EXPORT_SYMBOL(__set_personality); | ||
| 147 | 150 | ||
| 148 | #ifdef CONFIG_PROC_FS | 151 | #ifdef CONFIG_PROC_FS |
| 149 | static int execdomains_proc_show(struct seq_file *m, void *v) | 152 | static int execdomains_proc_show(struct seq_file *m, void *v) |
| @@ -188,8 +191,3 @@ SYSCALL_DEFINE1(personality, unsigned int, personality) | |||
| 188 | 191 | ||
| 189 | return old; | 192 | return old; |
| 190 | } | 193 | } |
| 191 | |||
| 192 | |||
| 193 | EXPORT_SYMBOL(register_exec_domain); | ||
| 194 | EXPORT_SYMBOL(unregister_exec_domain); | ||
| 195 | EXPORT_SYMBOL(__set_personality); | ||
diff --git a/kernel/exit.c b/kernel/exit.c index 6ed6a1d552b5..e5c4668f1799 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -313,46 +313,7 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) | |||
| 313 | } | 313 | } |
| 314 | } | 314 | } |
| 315 | 315 | ||
| 316 | /* | 316 | #ifdef CONFIG_MEMCG |
| 317 | * Let kernel threads use this to say that they allow a certain signal. | ||
| 318 | * Must not be used if kthread was cloned with CLONE_SIGHAND. | ||
| 319 | */ | ||
| 320 | int allow_signal(int sig) | ||
| 321 | { | ||
| 322 | if (!valid_signal(sig) || sig < 1) | ||
| 323 | return -EINVAL; | ||
| 324 | |||
| 325 | spin_lock_irq(¤t->sighand->siglock); | ||
| 326 | /* This is only needed for daemonize()'ed kthreads */ | ||
| 327 | sigdelset(¤t->blocked, sig); | ||
| 328 | /* | ||
| 329 | * Kernel threads handle their own signals. Let the signal code | ||
| 330 | * know it'll be handled, so that they don't get converted to | ||
| 331 | * SIGKILL or just silently dropped. | ||
| 332 | */ | ||
| 333 | current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; | ||
| 334 | recalc_sigpending(); | ||
| 335 | spin_unlock_irq(¤t->sighand->siglock); | ||
| 336 | return 0; | ||
| 337 | } | ||
| 338 | |||
| 339 | EXPORT_SYMBOL(allow_signal); | ||
| 340 | |||
| 341 | int disallow_signal(int sig) | ||
| 342 | { | ||
| 343 | if (!valid_signal(sig) || sig < 1) | ||
| 344 | return -EINVAL; | ||
| 345 | |||
| 346 | spin_lock_irq(¤t->sighand->siglock); | ||
| 347 | current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; | ||
| 348 | recalc_sigpending(); | ||
| 349 | spin_unlock_irq(¤t->sighand->siglock); | ||
| 350 | return 0; | ||
| 351 | } | ||
| 352 | |||
| 353 | EXPORT_SYMBOL(disallow_signal); | ||
| 354 | |||
| 355 | #ifdef CONFIG_MM_OWNER | ||
| 356 | /* | 317 | /* |
| 357 | * A task is exiting. If it owned this mm, find a new owner for the mm. | 318 | * A task is exiting. If it owned this mm, find a new owner for the mm. |
| 358 | */ | 319 | */ |
| @@ -395,14 +356,18 @@ retry: | |||
| 395 | } | 356 | } |
| 396 | 357 | ||
| 397 | /* | 358 | /* |
| 398 | * Search through everything else. We should not get | 359 | * Search through everything else; we should not get here often. |
| 399 | * here often | ||
| 400 | */ | 360 | */ |
| 401 | do_each_thread(g, c) { | 361 | for_each_process(g) { |
| 402 | if (c->mm == mm) | 362 | if (g->flags & PF_KTHREAD) |
| 403 | goto assign_new_owner; | 363 | continue; |
| 404 | } while_each_thread(g, c); | 364 | for_each_thread(g, c) { |
| 405 | 365 | if (c->mm == mm) | |
| 366 | goto assign_new_owner; | ||
| 367 | if (c->mm) | ||
| 368 | break; | ||
| 369 | } | ||
| 370 | } | ||
| 406 | read_unlock(&tasklist_lock); | 371 | read_unlock(&tasklist_lock); |
| 407 | /* | 372 | /* |
| 408 | * We found no owner yet mm_users > 1: this implies that we are | 373 | * We found no owner yet mm_users > 1: this implies that we are |
| @@ -434,7 +399,7 @@ assign_new_owner: | |||
| 434 | task_unlock(c); | 399 | task_unlock(c); |
| 435 | put_task_struct(c); | 400 | put_task_struct(c); |
| 436 | } | 401 | } |
| 437 | #endif /* CONFIG_MM_OWNER */ | 402 | #endif /* CONFIG_MEMCG */ |
| 438 | 403 | ||
| 439 | /* | 404 | /* |
| 440 | * Turn us into a lazy TLB process if we | 405 | * Turn us into a lazy TLB process if we |
diff --git a/kernel/fork.c b/kernel/fork.c index 54a8d26f612f..d2799d1fc952 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -150,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti) | |||
| 150 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, | 150 | static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, |
| 151 | int node) | 151 | int node) |
| 152 | { | 152 | { |
| 153 | struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, | 153 | struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, |
| 154 | THREAD_SIZE_ORDER); | 154 | THREAD_SIZE_ORDER); |
| 155 | 155 | ||
| 156 | return page ? page_address(page) : NULL; | 156 | return page ? page_address(page) : NULL; |
| 157 | } | 157 | } |
| 158 | 158 | ||
| 159 | static inline void free_thread_info(struct thread_info *ti) | 159 | static inline void free_thread_info(struct thread_info *ti) |
| 160 | { | 160 | { |
| 161 | free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); | 161 | free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); |
| 162 | } | 162 | } |
| 163 | # else | 163 | # else |
| 164 | static struct kmem_cache *thread_info_cache; | 164 | static struct kmem_cache *thread_info_cache; |
| @@ -1099,12 +1099,12 @@ static void rt_mutex_init_task(struct task_struct *p) | |||
| 1099 | #endif | 1099 | #endif |
| 1100 | } | 1100 | } |
| 1101 | 1101 | ||
| 1102 | #ifdef CONFIG_MM_OWNER | 1102 | #ifdef CONFIG_MEMCG |
| 1103 | void mm_init_owner(struct mm_struct *mm, struct task_struct *p) | 1103 | void mm_init_owner(struct mm_struct *mm, struct task_struct *p) |
| 1104 | { | 1104 | { |
| 1105 | mm->owner = p; | 1105 | mm->owner = p; |
| 1106 | } | 1106 | } |
| 1107 | #endif /* CONFIG_MM_OWNER */ | 1107 | #endif /* CONFIG_MEMCG */ |
| 1108 | 1108 | ||
| 1109 | /* | 1109 | /* |
| 1110 | * Initialize POSIX timer handling for a single task. | 1110 | * Initialize POSIX timer handling for a single task. |
| @@ -1606,10 +1606,12 @@ long do_fork(unsigned long clone_flags, | |||
| 1606 | */ | 1606 | */ |
| 1607 | if (!IS_ERR(p)) { | 1607 | if (!IS_ERR(p)) { |
| 1608 | struct completion vfork; | 1608 | struct completion vfork; |
| 1609 | struct pid *pid; | ||
| 1609 | 1610 | ||
| 1610 | trace_sched_process_fork(current, p); | 1611 | trace_sched_process_fork(current, p); |
| 1611 | 1612 | ||
| 1612 | nr = task_pid_vnr(p); | 1613 | pid = get_task_pid(p, PIDTYPE_PID); |
| 1614 | nr = pid_vnr(pid); | ||
| 1613 | 1615 | ||
| 1614 | if (clone_flags & CLONE_PARENT_SETTID) | 1616 | if (clone_flags & CLONE_PARENT_SETTID) |
| 1615 | put_user(nr, parent_tidptr); | 1617 | put_user(nr, parent_tidptr); |
| @@ -1624,12 +1626,14 @@ long do_fork(unsigned long clone_flags, | |||
| 1624 | 1626 | ||
| 1625 | /* forking complete and child started to run, tell ptracer */ | 1627 | /* forking complete and child started to run, tell ptracer */ |
| 1626 | if (unlikely(trace)) | 1628 | if (unlikely(trace)) |
| 1627 | ptrace_event(trace, nr); | 1629 | ptrace_event_pid(trace, pid); |
| 1628 | 1630 | ||
| 1629 | if (clone_flags & CLONE_VFORK) { | 1631 | if (clone_flags & CLONE_VFORK) { |
| 1630 | if (!wait_for_vfork_done(p, &vfork)) | 1632 | if (!wait_for_vfork_done(p, &vfork)) |
| 1631 | ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); | 1633 | ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); |
| 1632 | } | 1634 | } |
| 1635 | |||
| 1636 | put_pid(pid); | ||
| 1633 | } else { | 1637 | } else { |
| 1634 | nr = PTR_ERR(p); | 1638 | nr = PTR_ERR(p); |
| 1635 | } | 1639 | } |
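do_fork() now takes a reference on the child's struct pid (get_task_pid()/pid_vnr()) and reports ptrace events through ptrace_event_pid(), because the bare numeric nr could be recycled if the child exits before the tracer looks at it. A small refcount sketch of "hold the object, not just its number"; the struct and helper names mirror the kernel's but the implementation is purely illustrative:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct pid {
        atomic_int count;
        int nr;
    };

    static struct pid *get_pid(struct pid *p)
    {
        atomic_fetch_add(&p->count, 1);
        return p;
    }

    static void put_pid(struct pid *p)
    {
        if (atomic_fetch_sub(&p->count, 1) == 1)
            free(p);                        /* the number may now be recycled */
    }

    static void report_event(struct pid *p)
    {
        /* Safe: our reference keeps the object alive even if the task
         * it names has already exited. */
        printf("event for pid %d\n", p->nr);
    }

    int main(void)
    {
        struct pid *child = malloc(sizeof(*child));

        atomic_init(&child->count, 1);      /* the task's own reference */
        child->nr = 1234;

        struct pid *ref = get_pid(child);   /* do_fork(): get_task_pid() */
        put_pid(child);                     /* child exits, drops its reference */
        report_event(ref);                  /* still valid thanks to our ref */
        put_pid(ref);
        return 0;
    }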
diff --git a/kernel/futex.c b/kernel/futex.c index 5f589279e462..b632b5f3f094 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -267,7 +267,7 @@ static inline void futex_get_mm(union futex_key *key) | |||
| 267 | * get_futex_key() implies a full barrier. This is relied upon | 267 | * get_futex_key() implies a full barrier. This is relied upon |
| 268 | * as full barrier (B), see the ordering comment above. | 268 | * as full barrier (B), see the ordering comment above. |
| 269 | */ | 269 | */ |
| 270 | smp_mb__after_atomic_inc(); | 270 | smp_mb__after_atomic(); |
| 271 | } | 271 | } |
| 272 | 272 | ||
| 273 | /* | 273 | /* |
| @@ -280,7 +280,7 @@ static inline void hb_waiters_inc(struct futex_hash_bucket *hb) | |||
| 280 | /* | 280 | /* |
| 281 | * Full barrier (A), see the ordering comment above. | 281 | * Full barrier (A), see the ordering comment above. |
| 282 | */ | 282 | */ |
| 283 | smp_mb__after_atomic_inc(); | 283 | smp_mb__after_atomic(); |
| 284 | #endif | 284 | #endif |
| 285 | } | 285 | } |
| 286 | 286 | ||
| @@ -743,6 +743,55 @@ void exit_pi_state_list(struct task_struct *curr) | |||
| 743 | raw_spin_unlock_irq(&curr->pi_lock); | 743 | raw_spin_unlock_irq(&curr->pi_lock); |
| 744 | } | 744 | } |
| 745 | 745 | ||
| 746 | /* | ||
| 747 | * We need to check the following states: | ||
| 748 | * | ||
| 749 | * Waiter | pi_state | pi->owner | uTID | uODIED | ? | ||
| 750 | * | ||
| 751 | * [1] NULL | --- | --- | 0 | 0/1 | Valid | ||
| 752 | * [2] NULL | --- | --- | >0 | 0/1 | Valid | ||
| 753 | * | ||
| 754 | * [3] Found | NULL | -- | Any | 0/1 | Invalid | ||
| 755 | * | ||
| 756 | * [4] Found | Found | NULL | 0 | 1 | Valid | ||
| 757 | * [5] Found | Found | NULL | >0 | 1 | Invalid | ||
| 758 | * | ||
| 759 | * [6] Found | Found | task | 0 | 1 | Valid | ||
| 760 | * | ||
| 761 | * [7] Found | Found | NULL | Any | 0 | Invalid | ||
| 762 | * | ||
| 763 | * [8] Found | Found | task | ==taskTID | 0/1 | Valid | ||
| 764 | * [9] Found | Found | task | 0 | 0 | Invalid | ||
| 765 | * [10] Found | Found | task | !=taskTID | 0/1 | Invalid | ||
| 766 | * | ||
| 767 | * [1] Indicates that the kernel can acquire the futex atomically. We | ||
| 768 | * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. | ||
| 769 | * | ||
| 770 | * [2] Valid, if TID does not belong to a kernel thread. If no matching | ||
| 771 | * thread is found then it indicates that the owner TID has died. | ||
| 772 | * | ||
| 773 | * [3] Invalid. The waiter is queued on a non PI futex | ||
| 774 | * | ||
| 775 | * [4] Valid state after exit_robust_list(), which sets the user space | ||
| 776 | * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. | ||
| 777 | * | ||
| 778 | * [5] The user space value got manipulated between exit_robust_list() | ||
| 779 | * and exit_pi_state_list() | ||
| 780 | * | ||
| 781 | * [6] Valid state after exit_pi_state_list() which sets the new owner in | ||
| 782 | * the pi_state but cannot access the user space value. | ||
| 783 | * | ||
| 784 | * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. | ||
| 785 | * | ||
| 786 | * [8] Owner and user space value match | ||
| 787 | * | ||
| 788 | * [9] There is no transient state which sets the user space TID to 0 | ||
| 789 | * except exit_robust_list(), but this is indicated by the | ||
| 790 | * FUTEX_OWNER_DIED bit. See [4] | ||
| 791 | * | ||
| 792 | * [10] There is no transient state which leaves owner and user space | ||
| 793 | * TID out of sync. | ||
| 794 | */ | ||
| 746 | static int | 795 | static int |
| 747 | lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | 796 | lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, |
| 748 | union futex_key *key, struct futex_pi_state **ps) | 797 | union futex_key *key, struct futex_pi_state **ps) |
| @@ -755,12 +804,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
| 755 | plist_for_each_entry_safe(this, next, &hb->chain, list) { | 804 | plist_for_each_entry_safe(this, next, &hb->chain, list) { |
| 756 | if (match_futex(&this->key, key)) { | 805 | if (match_futex(&this->key, key)) { |
| 757 | /* | 806 | /* |
| 758 | * Another waiter already exists - bump up | 807 | * Sanity check the waiter before increasing |
| 759 | * the refcount and return its pi_state: | 808 | * the refcount and attaching to it. |
| 760 | */ | 809 | */ |
| 761 | pi_state = this->pi_state; | 810 | pi_state = this->pi_state; |
| 762 | /* | 811 | /* |
| 763 | * Userspace might have messed up non-PI and PI futexes | 812 | * Userspace might have messed up non-PI and |
| 813 | * PI futexes [3] | ||
| 764 | */ | 814 | */ |
| 765 | if (unlikely(!pi_state)) | 815 | if (unlikely(!pi_state)) |
| 766 | return -EINVAL; | 816 | return -EINVAL; |
| @@ -768,34 +818,70 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
| 768 | WARN_ON(!atomic_read(&pi_state->refcount)); | 818 | WARN_ON(!atomic_read(&pi_state->refcount)); |
| 769 | 819 | ||
| 770 | /* | 820 | /* |
| 771 | * When pi_state->owner is NULL then the owner died | 821 | * Handle the owner died case: |
| 772 | * and another waiter is on the fly. pi_state->owner | ||
| 773 | * is fixed up by the task which acquires | ||
| 774 | * pi_state->rt_mutex. | ||
| 775 | * | ||
| 776 | * We do not check for pid == 0 which can happen when | ||
| 777 | * the owner died and robust_list_exit() cleared the | ||
| 778 | * TID. | ||
| 779 | */ | 822 | */ |
| 780 | if (pid && pi_state->owner) { | 823 | if (uval & FUTEX_OWNER_DIED) { |
| 824 | /* | ||
| 825 | * exit_pi_state_list sets owner to NULL and | ||
| 826 | * wakes the topmost waiter. The task which | ||
| 827 | * acquires the pi_state->rt_mutex will fixup | ||
| 828 | * owner. | ||
| 829 | */ | ||
| 830 | if (!pi_state->owner) { | ||
| 831 | /* | ||
| 832 | * No pi state owner, but the user | ||
| 833 | * space TID is not 0. Inconsistent | ||
| 834 | * state. [5] | ||
| 835 | */ | ||
| 836 | if (pid) | ||
| 837 | return -EINVAL; | ||
| 838 | /* | ||
| 839 | * Take a ref on the state and | ||
| 840 | * return. [4] | ||
| 841 | */ | ||
| 842 | goto out_state; | ||
| 843 | } | ||
| 844 | |||
| 781 | /* | 845 | /* |
| 782 | * Bail out if user space manipulated the | 846 | * If TID is 0, then either the dying owner |
| 783 | * futex value. | 847 | * has not yet executed exit_pi_state_list() |
| 848 | * or some waiter acquired the rtmutex in the | ||
| 849 | * pi state, but did not yet fixup the TID in | ||
| 850 | * user space. | ||
| 851 | * | ||
| 852 | * Take a ref on the state and return. [6] | ||
| 784 | */ | 853 | */ |
| 785 | if (pid != task_pid_vnr(pi_state->owner)) | 854 | if (!pid) |
| 855 | goto out_state; | ||
| 856 | } else { | ||
| 857 | /* | ||
| 858 | * If the owner died bit is not set, | ||
| 859 | * then the pi_state must have an | ||
| 860 | * owner. [7] | ||
| 861 | */ | ||
| 862 | if (!pi_state->owner) | ||
| 786 | return -EINVAL; | 863 | return -EINVAL; |
| 787 | } | 864 | } |
| 788 | 865 | ||
| 866 | /* | ||
| 867 | * Bail out if user space manipulated the | ||
| 868 | * futex value. If pi state exists then the | ||
| 869 | * owner TID must be the same as the user | ||
| 870 | * space TID. [9/10] | ||
| 871 | */ | ||
| 872 | if (pid != task_pid_vnr(pi_state->owner)) | ||
| 873 | return -EINVAL; | ||
| 874 | |||
| 875 | out_state: | ||
| 789 | atomic_inc(&pi_state->refcount); | 876 | atomic_inc(&pi_state->refcount); |
| 790 | *ps = pi_state; | 877 | *ps = pi_state; |
| 791 | |||
| 792 | return 0; | 878 | return 0; |
| 793 | } | 879 | } |
| 794 | } | 880 | } |
| 795 | 881 | ||
| 796 | /* | 882 | /* |
| 797 | * We are the first waiter - try to look up the real owner and attach | 883 | * We are the first waiter - try to look up the real owner and attach |
| 798 | * the new pi_state to it, but bail out when TID = 0 | 884 | * the new pi_state to it, but bail out when TID = 0 [1] |
| 799 | */ | 885 | */ |
| 800 | if (!pid) | 886 | if (!pid) |
| 801 | return -ESRCH; | 887 | return -ESRCH; |
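The state table added in the hunk above drives the rewritten owner-died handling in lookup_pi_state(). One way to read it is as a pure predicate over the five columns; the following is just that table re-expressed in C for illustration, not kernel code:

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Re-encode the Waiter / pi_state / pi->owner / uTID / uODIED table:
     * returns true for the rows marked Valid, false for Invalid.
     */
    static bool pi_state_consistent(bool waiter, bool have_pi_state,
                                    bool have_owner, unsigned int utid,
                                    unsigned int owner_tid, bool owner_died)
    {
        if (!waiter)
            return true;                            /* [1] [2] */
        if (!have_pi_state)
            return false;                           /* [3] */
        if (!have_owner)
            return owner_died && utid == 0;         /* [4] valid; [5] [7] invalid */
        /* owner present: TID must match, or be 0 with OWNER_DIED set */
        return utid == owner_tid ||
               (owner_died && utid == 0);           /* [6] [8] valid; [9] [10] invalid */
    }

    int main(void)
    {
        /* row [5]: waiter found, pi_state, no owner, uTID > 0, owner died */
        printf("%d\n", pi_state_consistent(true, true, false, 42, 0, true));
        /* row [8]: owner and user-space TID agree */
        printf("%d\n", pi_state_consistent(true, true, true, 42, 42, false));
        return 0;
    }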
| @@ -803,6 +889,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
| 803 | if (!p) | 889 | if (!p) |
| 804 | return -ESRCH; | 890 | return -ESRCH; |
| 805 | 891 | ||
| 892 | if (!p->mm) { | ||
| 893 | put_task_struct(p); | ||
| 894 | return -EPERM; | ||
| 895 | } | ||
| 896 | |||
| 806 | /* | 897 | /* |
| 807 | * We need to look at the task state flags to figure out, | 898 | * We need to look at the task state flags to figure out, |
| 808 | * whether the task is exiting. To protect against the do_exit | 899 | * whether the task is exiting. To protect against the do_exit |
| @@ -823,6 +914,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
| 823 | return ret; | 914 | return ret; |
| 824 | } | 915 | } |
| 825 | 916 | ||
| 917 | /* | ||
| 918 | * No existing pi state. First waiter. [2] | ||
| 919 | */ | ||
| 826 | pi_state = alloc_pi_state(); | 920 | pi_state = alloc_pi_state(); |
| 827 | 921 | ||
| 828 | /* | 922 | /* |
| @@ -894,10 +988,18 @@ retry: | |||
| 894 | return -EDEADLK; | 988 | return -EDEADLK; |
| 895 | 989 | ||
| 896 | /* | 990 | /* |
| 897 | * Surprise - we got the lock. Just return to userspace: | 991 | * Surprise - we got the lock, but we do not trust user space at all. |
| 898 | */ | 992 | */ |
| 899 | if (unlikely(!curval)) | 993 | if (unlikely(!curval)) { |
| 900 | return 1; | 994 | /* |
| 995 | * We verify whether there is kernel state for this | ||
| 996 | * futex. If not, we can safely assume, that the 0 -> | ||
| 997 | * TID transition is correct. If state exists, we do | ||
| 998 | * not bother to fixup the user space state as it was | ||
| 999 | * corrupted already. | ||
| 1000 | */ | ||
| 1001 | return futex_top_waiter(hb, key) ? -EINVAL : 1; | ||
| 1002 | } | ||
| 901 | 1003 | ||
| 902 | uval = curval; | 1004 | uval = curval; |
| 903 | 1005 | ||
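The hunk above changes the "surprise, the futex was free" path: even after a successful 0 -> TID transition the kernel now cross-checks that no stale waiter/pi state exists for the futex instead of trusting the user-space word. A minimal C11 model of the uncontended acquisition plus that extra consistency check (the futex word is an atomic u32; have_top_waiter() and trylock_pi() are invented names):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define FUTEX_TID_MASK 0x3fffffffu

    /* Would consult the hash bucket's waiter list in the kernel. */
    static bool have_top_waiter(void) { return false; }

    /* Try the uncontended 0 -> TID transition on the futex word. */
    static int trylock_pi(_Atomic uint32_t *uaddr, uint32_t vpid)
    {
        uint32_t expected = 0;

        if (!atomic_compare_exchange_strong(uaddr, &expected, vpid))
            return 0;           /* contended: someone already owns it */

        /*
         * We got the lock, but we do not trust user space at all: if
         * kernel state says there is already a waiter, the word was
         * corrupted and we bail out instead of pretending all is well.
         */
        return have_top_waiter() ? -22 /* -EINVAL */ : 1;
    }

    int main(void)
    {
        _Atomic uint32_t futex_word = 0;

        printf("first : %d (word=%#x)\n", trylock_pi(&futex_word, 1000),
               atomic_load(&futex_word) & FUTEX_TID_MASK);
        printf("second: %d\n", trylock_pi(&futex_word, 1001));
        return 0;
    }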
| @@ -1028,6 +1130,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
| 1028 | struct task_struct *new_owner; | 1130 | struct task_struct *new_owner; |
| 1029 | struct futex_pi_state *pi_state = this->pi_state; | 1131 | struct futex_pi_state *pi_state = this->pi_state; |
| 1030 | u32 uninitialized_var(curval), newval; | 1132 | u32 uninitialized_var(curval), newval; |
| 1133 | int ret = 0; | ||
| 1031 | 1134 | ||
| 1032 | if (!pi_state) | 1135 | if (!pi_state) |
| 1033 | return -EINVAL; | 1136 | return -EINVAL; |
| @@ -1051,23 +1154,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
| 1051 | new_owner = this->task; | 1154 | new_owner = this->task; |
| 1052 | 1155 | ||
| 1053 | /* | 1156 | /* |
| 1054 | * We pass it to the next owner. (The WAITERS bit is always | 1157 | * We pass it to the next owner. The WAITERS bit is always |
| 1055 | * kept enabled while there is PI state around. We must also | 1158 | * kept enabled while there is PI state around. We cleanup the |
| 1056 | * preserve the owner died bit.) | 1159 | * owner died bit, because we are the owner. |
| 1057 | */ | 1160 | */ |
| 1058 | if (!(uval & FUTEX_OWNER_DIED)) { | 1161 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); |
| 1059 | int ret = 0; | ||
| 1060 | 1162 | ||
| 1061 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); | 1163 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) |
| 1062 | 1164 | ret = -EFAULT; | |
| 1063 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) | 1165 | else if (curval != uval) |
| 1064 | ret = -EFAULT; | 1166 | ret = -EINVAL; |
| 1065 | else if (curval != uval) | 1167 | if (ret) { |
| 1066 | ret = -EINVAL; | 1168 | raw_spin_unlock(&pi_state->pi_mutex.wait_lock); |
| 1067 | if (ret) { | 1169 | return ret; |
| 1068 | raw_spin_unlock(&pi_state->pi_mutex.wait_lock); | ||
| 1069 | return ret; | ||
| 1070 | } | ||
| 1071 | } | 1170 | } |
| 1072 | 1171 | ||
| 1073 | raw_spin_lock_irq(&pi_state->owner->pi_lock); | 1172 | raw_spin_lock_irq(&pi_state->owner->pi_lock); |
| @@ -1347,7 +1446,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, | |||
| 1347 | * | 1446 | * |
| 1348 | * Return: | 1447 | * Return: |
| 1349 | * 0 - failed to acquire the lock atomically; | 1448 | * 0 - failed to acquire the lock atomically; |
| 1350 | * 1 - acquired the lock; | 1449 | * >0 - acquired the lock, return value is vpid of the top_waiter |
| 1351 | * <0 - error | 1450 | * <0 - error |
| 1352 | */ | 1451 | */ |
| 1353 | static int futex_proxy_trylock_atomic(u32 __user *pifutex, | 1452 | static int futex_proxy_trylock_atomic(u32 __user *pifutex, |
| @@ -1358,7 +1457,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
| 1358 | { | 1457 | { |
| 1359 | struct futex_q *top_waiter = NULL; | 1458 | struct futex_q *top_waiter = NULL; |
| 1360 | u32 curval; | 1459 | u32 curval; |
| 1361 | int ret; | 1460 | int ret, vpid; |
| 1362 | 1461 | ||
| 1363 | if (get_futex_value_locked(&curval, pifutex)) | 1462 | if (get_futex_value_locked(&curval, pifutex)) |
| 1364 | return -EFAULT; | 1463 | return -EFAULT; |
| @@ -1386,11 +1485,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
| 1386 | * the contended case or if set_waiters is 1. The pi_state is returned | 1485 | * the contended case or if set_waiters is 1. The pi_state is returned |
| 1387 | * in ps in contended cases. | 1486 | * in ps in contended cases. |
| 1388 | */ | 1487 | */ |
| 1488 | vpid = task_pid_vnr(top_waiter->task); | ||
| 1389 | ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, | 1489 | ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, |
| 1390 | set_waiters); | 1490 | set_waiters); |
| 1391 | if (ret == 1) | 1491 | if (ret == 1) { |
| 1392 | requeue_pi_wake_futex(top_waiter, key2, hb2); | 1492 | requeue_pi_wake_futex(top_waiter, key2, hb2); |
| 1393 | 1493 | return vpid; | |
| 1494 | } | ||
| 1394 | return ret; | 1495 | return ret; |
| 1395 | } | 1496 | } |
| 1396 | 1497 | ||
| @@ -1421,10 +1522,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, | |||
| 1421 | struct futex_pi_state *pi_state = NULL; | 1522 | struct futex_pi_state *pi_state = NULL; |
| 1422 | struct futex_hash_bucket *hb1, *hb2; | 1523 | struct futex_hash_bucket *hb1, *hb2; |
| 1423 | struct futex_q *this, *next; | 1524 | struct futex_q *this, *next; |
| 1424 | u32 curval2; | ||
| 1425 | 1525 | ||
| 1426 | if (requeue_pi) { | 1526 | if (requeue_pi) { |
| 1427 | /* | 1527 | /* |
| 1528 | * Requeue PI only works on two distinct uaddrs. This | ||
| 1529 | * check is only valid for private futexes. See below. | ||
| 1530 | */ | ||
| 1531 | if (uaddr1 == uaddr2) | ||
| 1532 | return -EINVAL; | ||
| 1533 | |||
| 1534 | /* | ||
| 1428 | * requeue_pi requires a pi_state, try to allocate it now | 1535 | * requeue_pi requires a pi_state, try to allocate it now |
| 1429 | * without any locks in case it fails. | 1536 | * without any locks in case it fails. |
| 1430 | */ | 1537 | */ |
| @@ -1462,6 +1569,15 @@ retry: | |||
| 1462 | if (unlikely(ret != 0)) | 1569 | if (unlikely(ret != 0)) |
| 1463 | goto out_put_key1; | 1570 | goto out_put_key1; |
| 1464 | 1571 | ||
| 1572 | /* | ||
| 1573 | * The check above which compares uaddrs is not sufficient for | ||
| 1574 | * shared futexes. We need to compare the keys: | ||
| 1575 | */ | ||
| 1576 | if (requeue_pi && match_futex(&key1, &key2)) { | ||
| 1577 | ret = -EINVAL; | ||
| 1578 | goto out_put_keys; | ||
| 1579 | } | ||
| 1580 | |||
| 1465 | hb1 = hash_futex(&key1); | 1581 | hb1 = hash_futex(&key1); |
| 1466 | hb2 = hash_futex(&key2); | 1582 | hb2 = hash_futex(&key2); |
| 1467 | 1583 | ||
| @@ -1509,16 +1625,25 @@ retry_private: | |||
| 1509 | * At this point the top_waiter has either taken uaddr2 or is | 1625 | * At this point the top_waiter has either taken uaddr2 or is |
| 1510 | * waiting on it. If the former, then the pi_state will not | 1626 | * waiting on it. If the former, then the pi_state will not |
| 1511 | * exist yet, look it up one more time to ensure we have a | 1627 | * exist yet, look it up one more time to ensure we have a |
| 1512 | * reference to it. | 1628 | * reference to it. If the lock was taken, ret contains the |
| 1629 | * vpid of the top waiter task. | ||
| 1513 | */ | 1630 | */ |
| 1514 | if (ret == 1) { | 1631 | if (ret > 0) { |
| 1515 | WARN_ON(pi_state); | 1632 | WARN_ON(pi_state); |
| 1516 | drop_count++; | 1633 | drop_count++; |
| 1517 | task_count++; | 1634 | task_count++; |
| 1518 | ret = get_futex_value_locked(&curval2, uaddr2); | 1635 | /* |
| 1519 | if (!ret) | 1636 | * If we acquired the lock, then the user |
| 1520 | ret = lookup_pi_state(curval2, hb2, &key2, | 1637 | * space value of uaddr2 should be vpid. It |
| 1521 | &pi_state); | 1638 | * cannot be changed by the top waiter as it |
| 1639 | * is blocked on hb2 lock if it tries to do | ||
| 1640 | * so. If something fiddled with it behind our | ||
| 1641 | * back the pi state lookup might unearth | ||
| 1642 | * it. So we rather use the known value than | ||
| 1643 | * rereading and handing potential crap to | ||
| 1644 | * lookup_pi_state. | ||
| 1645 | */ | ||
| 1646 | ret = lookup_pi_state(ret, hb2, &key2, &pi_state); | ||
| 1522 | } | 1647 | } |
| 1523 | 1648 | ||
| 1524 | switch (ret) { | 1649 | switch (ret) { |
| @@ -2301,9 +2426,10 @@ retry: | |||
| 2301 | /* | 2426 | /* |
| 2302 | * To avoid races, try to do the TID -> 0 atomic transition | 2427 | * To avoid races, try to do the TID -> 0 atomic transition |
| 2303 | * again. If it succeeds then we can return without waking | 2428 | * again. If it succeeds then we can return without waking |
| 2304 | * anyone else up: | 2429 | * anyone else up. We only try this if neither the waiters nor |
| 2430 | * the owner died bit are set. | ||
| 2305 | */ | 2431 | */ |
| 2306 | if (!(uval & FUTEX_OWNER_DIED) && | 2432 | if (!(uval & ~FUTEX_TID_MASK) && |
| 2307 | cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) | 2433 | cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) |
| 2308 | goto pi_faulted; | 2434 | goto pi_faulted; |
| 2309 | /* | 2435 | /* |
| @@ -2333,11 +2459,9 @@ retry: | |||
| 2333 | /* | 2459 | /* |
| 2334 | * No waiters - kernel unlocks the futex: | 2460 | * No waiters - kernel unlocks the futex: |
| 2335 | */ | 2461 | */ |
| 2336 | if (!(uval & FUTEX_OWNER_DIED)) { | 2462 | ret = unlock_futex_pi(uaddr, uval); |
| 2337 | ret = unlock_futex_pi(uaddr, uval); | 2463 | if (ret == -EFAULT) |
| 2338 | if (ret == -EFAULT) | 2464 | goto pi_faulted; |
| 2339 | goto pi_faulted; | ||
| 2340 | } | ||
| 2341 | 2465 | ||
| 2342 | out_unlock: | 2466 | out_unlock: |
| 2343 | spin_unlock(&hb->lock); | 2467 | spin_unlock(&hb->lock); |
| @@ -2499,6 +2623,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
| 2499 | if (ret) | 2623 | if (ret) |
| 2500 | goto out_key2; | 2624 | goto out_key2; |
| 2501 | 2625 | ||
| 2626 | /* | ||
| 2627 | * The check above which compares uaddrs is not sufficient for | ||
| 2628 | * shared futexes. We need to compare the keys: | ||
| 2629 | */ | ||
| 2630 | if (match_futex(&q.key, &key2)) { | ||
| 2631 | ret = -EINVAL; | ||
| 2632 | goto out_put_keys; | ||
| 2633 | } | ||
| 2634 | |||
| 2502 | /* Queue the futex_q, drop the hb lock, wait for wakeup. */ | 2635 | /* Queue the futex_q, drop the hb lock, wait for wakeup. */ |
| 2503 | futex_wait_queue_me(hb, &q, to); | 2636 | futex_wait_queue_me(hb, &q, to); |
| 2504 | 2637 | ||
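
The uaddr1 == uaddr2 test added above only covers private futexes; for shared futexes two distinct user addresses can name the same futex word (for example the same file page mapped twice), which is why both hunks add a match_futex() comparison on the resolved keys. A minimal userspace sketch of that aliasing (the scratch file path is hypothetical; this is not kernel code and makes no futex calls):

    /* Two mappings of the same page: different addresses, same backing
     * object, so a shared futex key derived from either is identical. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/tmp/futex-demo", O_RDWR | O_CREAT, 0600);
        if (fd < 0 || ftruncate(fd, 4096) < 0) {
            perror("setup");
            return 1;
        }
        /* Map the same file offset twice. */
        int *a = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        int *b = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (a == MAP_FAILED || b == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        *a = 42;
        /* Different uaddrs, but both name the same futex word. */
        printf("uaddr1=%p uaddr2=%p same storage: %d\n",
               (void *)a, (void *)b, *b == 42);
        return 0;
    }
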
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index f45b75b713c0..b358a802fd18 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c | |||
| @@ -85,6 +85,12 @@ void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) | |||
| 85 | } | 85 | } |
| 86 | EXPORT_SYMBOL(__gcov_merge_ior); | 86 | EXPORT_SYMBOL(__gcov_merge_ior); |
| 87 | 87 | ||
| 88 | void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) | ||
| 89 | { | ||
| 90 | /* Unused. */ | ||
| 91 | } | ||
| 92 | EXPORT_SYMBOL(__gcov_merge_time_profile); | ||
| 93 | |||
| 88 | /** | 94 | /** |
| 89 | * gcov_enable_events - enable event reporting through gcov_event() | 95 | * gcov_enable_events - enable event reporting through gcov_event() |
| 90 | * | 96 | * |
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 2c6e4631c814..826ba9fb5e32 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c | |||
| @@ -18,7 +18,12 @@ | |||
| 18 | #include <linux/vmalloc.h> | 18 | #include <linux/vmalloc.h> |
| 19 | #include "gcov.h" | 19 | #include "gcov.h" |
| 20 | 20 | ||
| 21 | #if __GNUC__ == 4 && __GNUC_MINOR__ >= 9 | ||
| 22 | #define GCOV_COUNTERS 9 | ||
| 23 | #else | ||
| 21 | #define GCOV_COUNTERS 8 | 24 | #define GCOV_COUNTERS 8 |
| 25 | #endif | ||
| 26 | |||
| 22 | #define GCOV_TAG_FUNCTION_LENGTH 3 | 27 | #define GCOV_TAG_FUNCTION_LENGTH 3 |
| 23 | 28 | ||
| 24 | static struct gcov_info *gcov_info_head; | 29 | static struct gcov_info *gcov_info_head; |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index d55092ceee29..3ab28993f6e0 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
| @@ -234,6 +234,11 @@ again: | |||
| 234 | goto again; | 234 | goto again; |
| 235 | } | 235 | } |
| 236 | timer->base = new_base; | 236 | timer->base = new_base; |
| 237 | } else { | ||
| 238 | if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { | ||
| 239 | cpu = this_cpu; | ||
| 240 | goto again; | ||
| 241 | } | ||
| 237 | } | 242 | } |
| 238 | return new_base; | 243 | return new_base; |
| 239 | } | 244 | } |
| @@ -569,6 +574,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) | |||
| 569 | 574 | ||
| 570 | cpu_base->expires_next.tv64 = expires_next.tv64; | 575 | cpu_base->expires_next.tv64 = expires_next.tv64; |
| 571 | 576 | ||
| 577 | /* | ||
| 578 | * If a hang was detected in the last timer interrupt then we | ||
| 579 | * leave the hang delay active in the hardware. We want the | ||
| 580 | * system to make progress. That also prevents the following | ||
| 581 | * scenario: | ||
| 582 | * T1 expires 50ms from now | ||
| 583 | * T2 expires 5s from now | ||
| 584 | * | ||
| 585 | * T1 is removed, so this code is called and would reprogram | ||
| 586 | * the hardware to 5s from now. Any hrtimer_start after that | ||
| 587 | * will not reprogram the hardware due to hang_detected being | ||
| 588 | * set. So we'd effectively block all timers until the T2 event | ||
| 589 | * fires. | ||
| 590 | */ | ||
| 591 | if (cpu_base->hang_detected) | ||
| 592 | return; | ||
| 593 | |||
| 572 | if (cpu_base->expires_next.tv64 != KTIME_MAX) | 594 | if (cpu_base->expires_next.tv64 != KTIME_MAX) |
| 573 | tick_program_event(cpu_base->expires_next, 1); | 595 | tick_program_event(cpu_base->expires_next, 1); |
| 574 | } | 596 | } |
| @@ -968,11 +990,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, | |||
| 968 | /* Remove an active timer from the queue: */ | 990 | /* Remove an active timer from the queue: */ |
| 969 | ret = remove_hrtimer(timer, base); | 991 | ret = remove_hrtimer(timer, base); |
| 970 | 992 | ||
| 971 | /* Switch the timer base, if necessary: */ | ||
| 972 | new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); | ||
| 973 | |||
| 974 | if (mode & HRTIMER_MODE_REL) { | 993 | if (mode & HRTIMER_MODE_REL) { |
| 975 | tim = ktime_add_safe(tim, new_base->get_time()); | 994 | tim = ktime_add_safe(tim, base->get_time()); |
| 976 | /* | 995 | /* |
| 977 | * CONFIG_TIME_LOW_RES is a temporary way for architectures | 996 | * CONFIG_TIME_LOW_RES is a temporary way for architectures |
| 978 | * to signal that they simply return xtime in | 997 | * to signal that they simply return xtime in |
| @@ -987,6 +1006,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, | |||
| 987 | 1006 | ||
| 988 | hrtimer_set_expires_range_ns(timer, tim, delta_ns); | 1007 | hrtimer_set_expires_range_ns(timer, tim, delta_ns); |
| 989 | 1008 | ||
| 1009 | /* Switch the timer base, if necessary: */ | ||
| 1010 | new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); | ||
| 1011 | |||
| 990 | timer_stats_hrtimer_set_start_info(timer); | 1012 | timer_stats_hrtimer_set_start_info(timer); |
| 991 | 1013 | ||
| 992 | leftmost = enqueue_hrtimer(timer, new_base); | 1014 | leftmost = enqueue_hrtimer(timer, new_base); |
| @@ -1017,6 +1039,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, | |||
| 1017 | 1039 | ||
| 1018 | return ret; | 1040 | return ret; |
| 1019 | } | 1041 | } |
| 1042 | EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns); | ||
| 1020 | 1043 | ||
| 1021 | /** | 1044 | /** |
| 1022 | * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU | 1045 | * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU |
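
The hang_detected early return above is easiest to see in isolation: once a hang was detected, removing an earlier-expiring timer must not push the hardware event further out. A standalone model of that guard, with ktime reduced to a plain s64 and the "hardware" reduced to a printf (simplified types, not the kernel implementation):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct cpu_base {
        int64_t expires_next;   /* next event the "hardware" is armed for */
        bool hang_detected;     /* set when the last timer interrupt hung */
    };

    static void force_reprogram(struct cpu_base *base, int64_t next_expiry)
    {
        base->expires_next = next_expiry;

        /*
         * Leave the hang delay programmed in the hardware: reprogramming
         * to a far-away expiry here would block all timers until it
         * fires, because hrtimer_start skips reprogramming while
         * hang_detected is set.
         */
        if (base->hang_detected)
            return;

        printf("program hardware for %lld\n", (long long)next_expiry);
    }

    int main(void)
    {
        struct cpu_base base = { .hang_detected = true };

        /* T1 (50ms) was removed; T2 (5s) is next - hardware left alone. */
        force_reprogram(&base, 5000000000LL);

        base.hang_detected = false;
        force_reprogram(&base, 5000000000LL);   /* now reprograms */
        return 0;
    }
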
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 06bb1417b063..06db12434d72 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
| @@ -52,8 +52,10 @@ unsigned int __read_mostly sysctl_hung_task_panic = | |||
| 52 | 52 | ||
| 53 | static int __init hung_task_panic_setup(char *str) | 53 | static int __init hung_task_panic_setup(char *str) |
| 54 | { | 54 | { |
| 55 | sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); | 55 | int rc = kstrtouint(str, 0, &sysctl_hung_task_panic); |
| 56 | 56 | ||
| 57 | if (rc) | ||
| 58 | return rc; | ||
| 57 | return 1; | 59 | return 1; |
| 58 | } | 60 | } |
| 59 | __setup("hung_task_panic=", hung_task_panic_setup); | 61 | __setup("hung_task_panic=", hung_task_panic_setup); |
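
The switch from simple_strtoul() to kstrtouint() makes a malformed hung_task_panic= value fail instead of being silently truncated or misparsed. A rough userspace stand-in for the stricter behaviour (strtoul wrapped with the extra checks kstrtouint performs, minus its trailing-newline tolerance):

    #include <errno.h>
    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int parse_uint(const char *s, unsigned int base, unsigned int *res)
    {
        char *end;
        unsigned long val;

        errno = 0;
        val = strtoul(s, &end, base);
        if (end == s || *end != '\0')
            return -EINVAL;     /* nothing parsed, or trailing junk */
        if (errno == ERANGE || val > UINT_MAX)
            return -ERANGE;
        *res = (unsigned int)val;
        return 0;
    }

    int main(void)
    {
        unsigned int v;

        printf("\"1\"  -> %d\n", parse_uint("1", 0, &v));   /* 0, v == 1 */
        printf("\"1x\" -> %d\n", parse_uint("1x", 0, &v));  /* -EINVAL */
        printf("\"\"   -> %d\n", parse_uint("", 0, &v));    /* -EINVAL */
        return 0;
    }
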
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 07cbdfea9ae2..d269cecdfbf0 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
| @@ -5,6 +5,10 @@ menu "IRQ subsystem" | |||
| 5 | config MAY_HAVE_SPARSE_IRQ | 5 | config MAY_HAVE_SPARSE_IRQ |
| 6 | bool | 6 | bool |
| 7 | 7 | ||
| 8 | # Legacy support, required for itanic | ||
| 9 | config GENERIC_IRQ_LEGACY | ||
| 10 | bool | ||
| 11 | |||
| 8 | # Enable the generic irq autoprobe mechanism | 12 | # Enable the generic irq autoprobe mechanism |
| 9 | config GENERIC_IRQ_PROBE | 13 | config GENERIC_IRQ_PROBE |
| 10 | bool | 14 | bool |
| @@ -17,6 +21,11 @@ config GENERIC_IRQ_SHOW | |||
| 17 | config GENERIC_IRQ_SHOW_LEVEL | 21 | config GENERIC_IRQ_SHOW_LEVEL |
| 18 | bool | 22 | bool |
| 19 | 23 | ||
| 24 | # Facility to allocate a hardware interrupt. This is legacy support | ||
| 25 | # and should not be used in new code. Use irq domains instead. | ||
| 26 | config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ | ||
| 27 | bool | ||
| 28 | |||
| 20 | # Support for delayed migration from interrupt context | 29 | # Support for delayed migration from interrupt context |
| 21 | config GENERIC_PENDING_IRQ | 30 | config GENERIC_PENDING_IRQ |
| 22 | bool | 31 | bool |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6397df2d6945..a2b28a2fd7b1 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -40,10 +40,9 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) | |||
| 40 | irq_put_desc_unlock(desc, flags); | 40 | irq_put_desc_unlock(desc, flags); |
| 41 | /* | 41 | /* |
| 42 | * For !CONFIG_SPARSE_IRQ make the irq show up in | 42 | * For !CONFIG_SPARSE_IRQ make the irq show up in |
| 43 | * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is | 43 | * allocated_irqs. |
| 44 | * already marked, and this call is harmless. | ||
| 45 | */ | 44 | */ |
| 46 | irq_reserve_irq(irq); | 45 | irq_mark_irq(irq); |
| 47 | return 0; | 46 | return 0; |
| 48 | } | 47 | } |
| 49 | EXPORT_SYMBOL(irq_set_chip); | 48 | EXPORT_SYMBOL(irq_set_chip); |
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ddf1ffeb79f1..099ea2e0eb88 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
| @@ -33,7 +33,7 @@ enum { | |||
| 33 | }; | 33 | }; |
| 34 | 34 | ||
| 35 | /* | 35 | /* |
| 36 | * Bit masks for desc->state | 36 | * Bit masks for desc->core_internal_state__do_not_mess_with_it |
| 37 | * | 37 | * |
| 38 | * IRQS_AUTODETECT - autodetection in progress | 38 | * IRQS_AUTODETECT - autodetection in progress |
| 39 | * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt | 39 | * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt |
| @@ -76,6 +76,12 @@ extern void mask_irq(struct irq_desc *desc); | |||
| 76 | extern void unmask_irq(struct irq_desc *desc); | 76 | extern void unmask_irq(struct irq_desc *desc); |
| 77 | extern void unmask_threaded_irq(struct irq_desc *desc); | 77 | extern void unmask_threaded_irq(struct irq_desc *desc); |
| 78 | 78 | ||
| 79 | #ifdef CONFIG_SPARSE_IRQ | ||
| 80 | static inline void irq_mark_irq(unsigned int irq) { } | ||
| 81 | #else | ||
| 82 | extern void irq_mark_irq(unsigned int irq); | ||
| 83 | #endif | ||
| 84 | |||
| 79 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); | 85 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); |
| 80 | 86 | ||
| 81 | irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); | 87 | irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a7174617616b..7339e42a85ab 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
| @@ -278,7 +278,12 @@ EXPORT_SYMBOL(irq_to_desc); | |||
| 278 | 278 | ||
| 279 | static void free_desc(unsigned int irq) | 279 | static void free_desc(unsigned int irq) |
| 280 | { | 280 | { |
| 281 | dynamic_irq_cleanup(irq); | 281 | struct irq_desc *desc = irq_to_desc(irq); |
| 282 | unsigned long flags; | ||
| 283 | |||
| 284 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
| 285 | desc_set_defaults(irq, desc, desc_node(desc), NULL); | ||
| 286 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
| 282 | } | 287 | } |
| 283 | 288 | ||
| 284 | static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, | 289 | static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, |
| @@ -299,6 +304,20 @@ static int irq_expand_nr_irqs(unsigned int nr) | |||
| 299 | return -ENOMEM; | 304 | return -ENOMEM; |
| 300 | } | 305 | } |
| 301 | 306 | ||
| 307 | void irq_mark_irq(unsigned int irq) | ||
| 308 | { | ||
| 309 | mutex_lock(&sparse_irq_lock); | ||
| 310 | bitmap_set(allocated_irqs, irq, 1); | ||
| 311 | mutex_unlock(&sparse_irq_lock); | ||
| 312 | } | ||
| 313 | |||
| 314 | #ifdef CONFIG_GENERIC_IRQ_LEGACY | ||
| 315 | void irq_init_desc(unsigned int irq) | ||
| 316 | { | ||
| 317 | free_desc(irq); | ||
| 318 | } | ||
| 319 | #endif | ||
| 320 | |||
| 302 | #endif /* !CONFIG_SPARSE_IRQ */ | 321 | #endif /* !CONFIG_SPARSE_IRQ */ |
| 303 | 322 | ||
| 304 | /** | 323 | /** |
| @@ -363,6 +382,13 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, | |||
| 363 | if (from > irq) | 382 | if (from > irq) |
| 364 | return -EINVAL; | 383 | return -EINVAL; |
| 365 | from = irq; | 384 | from = irq; |
| 385 | } else { | ||
| 386 | /* | ||
| 387 | * For interrupts which are freely allocated the | ||
| 388 | * architecture can force a lower bound to the @from | ||
| 389 | * argument. x86 uses this to exclude the GSI space. | ||
| 390 | */ | ||
| 391 | from = arch_dynirq_lower_bound(from); | ||
| 366 | } | 392 | } |
| 367 | 393 | ||
| 368 | mutex_lock(&sparse_irq_lock); | 394 | mutex_lock(&sparse_irq_lock); |
| @@ -389,30 +415,56 @@ err: | |||
| 389 | } | 415 | } |
| 390 | EXPORT_SYMBOL_GPL(__irq_alloc_descs); | 416 | EXPORT_SYMBOL_GPL(__irq_alloc_descs); |
| 391 | 417 | ||
| 418 | #ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ | ||
| 392 | /** | 419 | /** |
| 393 | * irq_reserve_irqs - mark irqs allocated | 420 | * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware |
| 394 | * @from: mark from irq number | 421 | * @cnt: number of interrupts to allocate |
| 395 | * @cnt: number of irqs to mark | 422 | * @node: node on which to allocate |
| 396 | * | 423 | * |
| 397 | * Returns 0 on success or an appropriate error code | 424 | * Returns an interrupt number > 0 or 0, if the allocation fails. |
| 398 | */ | 425 | */ |
| 399 | int irq_reserve_irqs(unsigned int from, unsigned int cnt) | 426 | unsigned int irq_alloc_hwirqs(int cnt, int node) |
| 400 | { | 427 | { |
| 401 | unsigned int start; | 428 | int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL); |
| 402 | int ret = 0; | ||
| 403 | 429 | ||
| 404 | if (!cnt || (from + cnt) > nr_irqs) | 430 | if (irq < 0) |
| 405 | return -EINVAL; | 431 | return 0; |
| 406 | 432 | ||
| 407 | mutex_lock(&sparse_irq_lock); | 433 | for (i = irq; cnt > 0; i++, cnt--) { |
| 408 | start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); | 434 | if (arch_setup_hwirq(i, node)) |
| 409 | if (start == from) | 435 | goto err; |
| 410 | bitmap_set(allocated_irqs, start, cnt); | 436 | irq_clear_status_flags(i, _IRQ_NOREQUEST); |
| 411 | else | 437 | } |
| 412 | ret = -EEXIST; | 438 | return irq; |
| 413 | mutex_unlock(&sparse_irq_lock); | 439 | |
| 414 | return ret; | 440 | err: |
| 441 | for (i--; i >= irq; i--) { | ||
| 442 | irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); | ||
| 443 | arch_teardown_hwirq(i); | ||
| 444 | } | ||
| 445 | irq_free_descs(irq, cnt); | ||
| 446 | return 0; | ||
| 447 | } | ||
| 448 | EXPORT_SYMBOL_GPL(irq_alloc_hwirqs); | ||
| 449 | |||
| 450 | /** | ||
| 451 | * irq_free_hwirqs - Free irq descriptor and cleanup the hardware | ||
| 452 | * @from: Free from irq number | ||
| 453 | * @cnt: number of interrupts to free | ||
| 454 | * | ||
| 455 | */ | ||
| 456 | void irq_free_hwirqs(unsigned int from, int cnt) | ||
| 457 | { | ||
| 458 | int i; | ||
| 459 | |||
| 460 | for (i = from; cnt > 0; i++, cnt--) { | ||
| 461 | irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); | ||
| 462 | arch_teardown_hwirq(i); | ||
| 463 | } | ||
| 464 | irq_free_descs(from, cnt); | ||
| 415 | } | 465 | } |
| 466 | EXPORT_SYMBOL_GPL(irq_free_hwirqs); | ||
| 467 | #endif | ||
| 416 | 468 | ||
| 417 | /** | 469 | /** |
| 418 | * irq_get_next_irq - get next allocated irq number | 470 | * irq_get_next_irq - get next allocated irq number |
| @@ -475,20 +527,6 @@ int irq_set_percpu_devid(unsigned int irq) | |||
| 475 | return 0; | 527 | return 0; |
| 476 | } | 528 | } |
| 477 | 529 | ||
| 478 | /** | ||
| 479 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | ||
| 480 | * @irq: irq number to initialize | ||
| 481 | */ | ||
| 482 | void dynamic_irq_cleanup(unsigned int irq) | ||
| 483 | { | ||
| 484 | struct irq_desc *desc = irq_to_desc(irq); | ||
| 485 | unsigned long flags; | ||
| 486 | |||
| 487 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
| 488 | desc_set_defaults(irq, desc, desc_node(desc), NULL); | ||
| 489 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
| 490 | } | ||
| 491 | |||
| 492 | void kstat_incr_irq_this_cpu(unsigned int irq) | 530 | void kstat_incr_irq_this_cpu(unsigned int irq) |
| 493 | { | 531 | { |
| 494 | kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); | 532 | kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); |
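
irq_alloc_hwirqs() above follows the usual allocate-then-unwind pattern: set up cnt interrupts one by one and, if arch_setup_hwirq() fails midway, tear down only the ones already configured before freeing the descriptor range. A generic sketch of that pattern with stand-in hooks (setup_one/teardown_one are illustrative names, not kernel functions):

    #include <stdio.h>

    static int setup_one(int i)
    {
        if (i == 2)
            return -1;      /* simulate an arch setup failure */
        printf("setup %d\n", i);
        return 0;
    }

    static void teardown_one(int i)
    {
        printf("teardown %d\n", i);
    }

    static int alloc_range(int base, int cnt)
    {
        int i, n;

        for (i = base, n = cnt; n > 0; i++, n--) {
            if (setup_one(i))
                goto err;
        }
        return base;

    err:
        /* Undo only what was actually set up, in reverse order. */
        for (i--; i >= base; i--)
            teardown_one(i);
        return -1;
    }

    int main(void)
    {
        printf("alloc_range -> %d\n", alloc_range(16, 4));
        return 0;
    }
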
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f14033700c25..eb5e10e32e05 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
| @@ -27,14 +27,14 @@ static struct irq_domain *irq_default_domain; | |||
| 27 | * __irq_domain_add() - Allocate a new irq_domain data structure | 27 | * __irq_domain_add() - Allocate a new irq_domain data structure |
| 28 | * @of_node: optional device-tree node of the interrupt controller | 28 | * @of_node: optional device-tree node of the interrupt controller |
| 29 | * @size: Size of linear map; 0 for radix mapping only | 29 | * @size: Size of linear map; 0 for radix mapping only |
| 30 | * @hwirq_max: Maximum number of interrupts supported by controller | ||
| 30 | * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no | 31 | * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no |
| 31 | * direct mapping | 32 | * direct mapping |
| 32 | * @ops: map/unmap domain callbacks | 33 | * @ops: map/unmap domain callbacks |
| 33 | * @host_data: Controller private data pointer | 34 | * @host_data: Controller private data pointer |
| 34 | * | 35 | * |
| 35 | * Allocates and initializes an irq_domain structure. Caller is expected to | 36 | * Allocates and initializes an irq_domain structure. |
| 36 | * register allocated irq_domain with irq_domain_register(). Returns pointer | 37 | * Returns pointer to IRQ domain, or NULL on failure. |
| 37 | * to IRQ domain, or NULL on failure. | ||
| 38 | */ | 38 | */ |
| 39 | struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, | 39 | struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, |
| 40 | irq_hw_number_t hwirq_max, int direct_max, | 40 | irq_hw_number_t hwirq_max, int direct_max, |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2486a4c1a710..3dc6a61bf06a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -180,7 +180,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, | |||
| 180 | struct irq_chip *chip = irq_data_get_irq_chip(data); | 180 | struct irq_chip *chip = irq_data_get_irq_chip(data); |
| 181 | int ret; | 181 | int ret; |
| 182 | 182 | ||
| 183 | ret = chip->irq_set_affinity(data, mask, false); | 183 | ret = chip->irq_set_affinity(data, mask, force); |
| 184 | switch (ret) { | 184 | switch (ret) { |
| 185 | case IRQ_SET_MASK_OK: | 185 | case IRQ_SET_MASK_OK: |
| 186 | cpumask_copy(data->affinity, mask); | 186 | cpumask_copy(data->affinity, mask); |
| @@ -192,7 +192,8 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, | |||
| 192 | return ret; | 192 | return ret; |
| 193 | } | 193 | } |
| 194 | 194 | ||
| 195 | int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) | 195 | int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, |
| 196 | bool force) | ||
| 196 | { | 197 | { |
| 197 | struct irq_chip *chip = irq_data_get_irq_chip(data); | 198 | struct irq_chip *chip = irq_data_get_irq_chip(data); |
| 198 | struct irq_desc *desc = irq_data_to_desc(data); | 199 | struct irq_desc *desc = irq_data_to_desc(data); |
| @@ -202,7 +203,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) | |||
| 202 | return -EINVAL; | 203 | return -EINVAL; |
| 203 | 204 | ||
| 204 | if (irq_can_move_pcntxt(data)) { | 205 | if (irq_can_move_pcntxt(data)) { |
| 205 | ret = irq_do_set_affinity(data, mask, false); | 206 | ret = irq_do_set_affinity(data, mask, force); |
| 206 | } else { | 207 | } else { |
| 207 | irqd_set_move_pending(data); | 208 | irqd_set_move_pending(data); |
| 208 | irq_copy_pending(desc, mask); | 209 | irq_copy_pending(desc, mask); |
| @@ -217,13 +218,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) | |||
| 217 | return ret; | 218 | return ret; |
| 218 | } | 219 | } |
| 219 | 220 | ||
| 220 | /** | 221 | int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) |
| 221 | * irq_set_affinity - Set the irq affinity of a given irq | ||
| 222 | * @irq: Interrupt to set affinity | ||
| 223 | * @mask: cpumask | ||
| 224 | * | ||
| 225 | */ | ||
| 226 | int irq_set_affinity(unsigned int irq, const struct cpumask *mask) | ||
| 227 | { | 222 | { |
| 228 | struct irq_desc *desc = irq_to_desc(irq); | 223 | struct irq_desc *desc = irq_to_desc(irq); |
| 229 | unsigned long flags; | 224 | unsigned long flags; |
| @@ -233,7 +228,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) | |||
| 233 | return -EINVAL; | 228 | return -EINVAL; |
| 234 | 229 | ||
| 235 | raw_spin_lock_irqsave(&desc->lock, flags); | 230 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 236 | ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask); | 231 | ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force); |
| 237 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 232 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 238 | return ret; | 233 | return ret; |
| 239 | } | 234 | } |
| @@ -891,8 +886,8 @@ static int irq_thread(void *data) | |||
| 891 | irq_thread_check_affinity(desc, action); | 886 | irq_thread_check_affinity(desc, action); |
| 892 | 887 | ||
| 893 | action_ret = handler_fn(desc, action); | 888 | action_ret = handler_fn(desc, action); |
| 894 | if (!noirqdebug) | 889 | if (action_ret == IRQ_HANDLED) |
| 895 | note_interrupt(action->irq, desc, action_ret); | 890 | atomic_inc(&desc->threads_handled); |
| 896 | 891 | ||
| 897 | wake_threads_waitq(desc); | 892 | wake_threads_waitq(desc); |
| 898 | } | 893 | } |
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index a1d8cc63b56e..e2514b0e439e 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
| @@ -270,6 +270,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, | |||
| 270 | return action && (action->flags & IRQF_IRQPOLL); | 270 | return action && (action->flags & IRQF_IRQPOLL); |
| 271 | } | 271 | } |
| 272 | 272 | ||
| 273 | #define SPURIOUS_DEFERRED 0x80000000 | ||
| 274 | |||
| 273 | void note_interrupt(unsigned int irq, struct irq_desc *desc, | 275 | void note_interrupt(unsigned int irq, struct irq_desc *desc, |
| 274 | irqreturn_t action_ret) | 276 | irqreturn_t action_ret) |
| 275 | { | 277 | { |
| @@ -277,15 +279,111 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
| 277 | irq_settings_is_polled(desc)) | 279 | irq_settings_is_polled(desc)) |
| 278 | return; | 280 | return; |
| 279 | 281 | ||
| 280 | /* we get here again via the threaded handler */ | ||
| 281 | if (action_ret == IRQ_WAKE_THREAD) | ||
| 282 | return; | ||
| 283 | |||
| 284 | if (bad_action_ret(action_ret)) { | 282 | if (bad_action_ret(action_ret)) { |
| 285 | report_bad_irq(irq, desc, action_ret); | 283 | report_bad_irq(irq, desc, action_ret); |
| 286 | return; | 284 | return; |
| 287 | } | 285 | } |
| 288 | 286 | ||
| 287 | /* | ||
| 288 | * We cannot call note_interrupt from the threaded handler | ||
| 289 | * because we need to look at the compound of all handlers | ||
| 290 | * (primary and threaded). Aside of that in the threaded | ||
| 291 | * shared case we have no serialization against an incoming | ||
| 292 | * hardware interrupt while we are dealing with a threaded | ||
| 293 | * result. | ||
| 294 | * | ||
| 295 | * So in case a thread is woken, we just note the fact and | ||
| 296 | * defer the analysis to the next hardware interrupt. | ||
| 297 | * | ||
| 298 | * The threaded handlers store whether they successfully | ||
| 299 | * handled an interrupt and we check whether that number | ||
| 300 | * changed versus the last invocation. | ||
| 301 | * | ||
| 302 | * We could handle all interrupts with the delayed by one | ||
| 303 | * mechanism, but for the non forced threaded case we'd just | ||
| 304 | * add pointless overhead to the straight hardirq interrupts | ||
| 305 | * for the sake of a few lines less code. | ||
| 306 | */ | ||
| 307 | if (action_ret & IRQ_WAKE_THREAD) { | ||
| 308 | /* | ||
| 309 | * There is a thread woken. Check whether one of the | ||
| 310 | * shared primary handlers returned IRQ_HANDLED. If | ||
| 311 | * not we defer the spurious detection to the next | ||
| 312 | * interrupt. | ||
| 313 | */ | ||
| 314 | if (action_ret == IRQ_WAKE_THREAD) { | ||
| 315 | int handled; | ||
| 316 | /* | ||
| 317 | * We use bit 31 of thread_handled_last to | ||
| 318 | * denote the deferred spurious detection | ||
| 319 | * active. No locking necessary as | ||
| 320 | * thread_handled_last is only accessed here | ||
| 321 | * and we have the guarantee that hard | ||
| 322 | * interrupts are not reentrant. | ||
| 323 | */ | ||
| 324 | if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) { | ||
| 325 | desc->threads_handled_last |= SPURIOUS_DEFERRED; | ||
| 326 | return; | ||
| 327 | } | ||
| 328 | /* | ||
| 329 | * Check whether one of the threaded handlers | ||
| 330 | * returned IRQ_HANDLED since the last | ||
| 331 | * interrupt happened. | ||
| 332 | * | ||
| 333 | * For simplicity we just set bit 31, as it is | ||
| 334 | * set in threads_handled_last as well. So we | ||
| 335 | * avoid extra masking. And we really do not | ||
| 336 | * care about the high bits of the handled | ||
| 337 | * count. We just care about the count being | ||
| 338 | * different than the one we saw before. | ||
| 339 | */ | ||
| 340 | handled = atomic_read(&desc->threads_handled); | ||
| 341 | handled |= SPURIOUS_DEFERRED; | ||
| 342 | if (handled != desc->threads_handled_last) { | ||
| 343 | action_ret = IRQ_HANDLED; | ||
| 344 | /* | ||
| 345 | * Note: We keep the SPURIOUS_DEFERRED | ||
| 346 | * bit set. We are handling the | ||
| 347 | * previous invocation right now. | ||
| 348 | * Keep it for the current one, so the | ||
| 349 | * next hardware interrupt will | ||
| 350 | * account for it. | ||
| 351 | */ | ||
| 352 | desc->threads_handled_last = handled; | ||
| 353 | } else { | ||
| 354 | /* | ||
| 355 | * None of the threaded handlers felt | ||
| 356 | * responsible for the last interrupt | ||
| 357 | * | ||
| 358 | * We keep the SPURIOUS_DEFERRED bit | ||
| 359 | * set in threads_handled_last as we | ||
| 360 | * need to account for the current | ||
| 361 | * interrupt as well. | ||
| 362 | */ | ||
| 363 | action_ret = IRQ_NONE; | ||
| 364 | } | ||
| 365 | } else { | ||
| 366 | /* | ||
| 367 | * One of the primary handlers returned | ||
| 368 | * IRQ_HANDLED. So we don't care about the | ||
| 369 | * threaded handlers on the same line. Clear | ||
| 370 | * the deferred detection bit. | ||
| 371 | * | ||
| 372 | * In theory we could/should check whether the | ||
| 373 | * deferred bit is set and take the result of | ||
| 374 | * the previous run into account here as | ||
| 375 | * well. But it's really not worth the | ||
| 376 | * trouble. If every other interrupt is | ||
| 377 | * handled we never trigger the spurious | ||
| 378 | * detector. And if this is just the one out | ||
| 379 | * of 100k unhandled ones which is handled | ||
| 380 | * then we merely delay the spurious detection | ||
| 381 | * by one hard interrupt. Not a real problem. | ||
| 382 | */ | ||
| 383 | desc->threads_handled_last &= ~SPURIOUS_DEFERRED; | ||
| 384 | } | ||
| 385 | } | ||
| 386 | |||
| 289 | if (unlikely(action_ret == IRQ_NONE)) { | 387 | if (unlikely(action_ret == IRQ_NONE)) { |
| 290 | /* | 388 | /* |
| 291 | * If we are seeing only the odd spurious IRQ caused by | 389 | * If we are seeing only the odd spurious IRQ caused by |
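
The new IRQ_WAKE_THREAD branch is a small state machine: bit 31 of threads_handled_last marks "deferred verdict pending", and the verdict for interrupt N is delivered when interrupt N+1 arrives, by comparing the threads_handled counter against the last snapshot. A compact userspace simulation of just that bookkeeping (kernel atomics replaced with plain unsigned ints):

    #include <stdio.h>

    #define SPURIOUS_DEFERRED 0x80000000u

    static unsigned int threads_handled;        /* atomic_t in the kernel */
    static unsigned int threads_handled_last;

    /* Returns 1 for "handled", 0 for "unhandled", -1 for "verdict deferred". */
    static int note_wake_thread(void)
    {
        unsigned int handled;

        if (!(threads_handled_last & SPURIOUS_DEFERRED)) {
            threads_handled_last |= SPURIOUS_DEFERRED;
            return -1;      /* first wakeup: defer to the next interrupt */
        }

        handled = threads_handled | SPURIOUS_DEFERRED;
        if (handled != threads_handled_last) {
            threads_handled_last = handled;
            return 1;       /* a thread handled the previous interrupt */
        }
        return 0;           /* nobody felt responsible */
    }

    int main(void)
    {
        printf("irq 1: %d\n", note_wake_thread());  /* deferred */
        threads_handled++;                          /* thread handled it */
        printf("irq 2: %d\n", note_wake_thread());  /* handled */
        printf("irq 3: %d\n", note_wake_thread());  /* unhandled */
        return 0;
    }
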
diff --git a/kernel/kexec.c b/kernel/kexec.c index c8380ad203bc..6748688813d0 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -125,8 +125,8 @@ static struct page *kimage_alloc_page(struct kimage *image, | |||
| 125 | unsigned long dest); | 125 | unsigned long dest); |
| 126 | 126 | ||
| 127 | static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | 127 | static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, |
| 128 | unsigned long nr_segments, | 128 | unsigned long nr_segments, |
| 129 | struct kexec_segment __user *segments) | 129 | struct kexec_segment __user *segments) |
| 130 | { | 130 | { |
| 131 | size_t segment_bytes; | 131 | size_t segment_bytes; |
| 132 | struct kimage *image; | 132 | struct kimage *image; |
| @@ -257,13 +257,13 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, | |||
| 257 | image->control_code_page = kimage_alloc_control_pages(image, | 257 | image->control_code_page = kimage_alloc_control_pages(image, |
| 258 | get_order(KEXEC_CONTROL_PAGE_SIZE)); | 258 | get_order(KEXEC_CONTROL_PAGE_SIZE)); |
| 259 | if (!image->control_code_page) { | 259 | if (!image->control_code_page) { |
| 260 | printk(KERN_ERR "Could not allocate control_code_buffer\n"); | 260 | pr_err("Could not allocate control_code_buffer\n"); |
| 261 | goto out_free; | 261 | goto out_free; |
| 262 | } | 262 | } |
| 263 | 263 | ||
| 264 | image->swap_page = kimage_alloc_control_pages(image, 0); | 264 | image->swap_page = kimage_alloc_control_pages(image, 0); |
| 265 | if (!image->swap_page) { | 265 | if (!image->swap_page) { |
| 266 | printk(KERN_ERR "Could not allocate swap buffer\n"); | 266 | pr_err("Could not allocate swap buffer\n"); |
| 267 | goto out_free; | 267 | goto out_free; |
| 268 | } | 268 | } |
| 269 | 269 | ||
| @@ -332,7 +332,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, | |||
| 332 | image->control_code_page = kimage_alloc_control_pages(image, | 332 | image->control_code_page = kimage_alloc_control_pages(image, |
| 333 | get_order(KEXEC_CONTROL_PAGE_SIZE)); | 333 | get_order(KEXEC_CONTROL_PAGE_SIZE)); |
| 334 | if (!image->control_code_page) { | 334 | if (!image->control_code_page) { |
| 335 | printk(KERN_ERR "Could not allocate control_code_buffer\n"); | 335 | pr_err("Could not allocate control_code_buffer\n"); |
| 336 | goto out_free; | 336 | goto out_free; |
| 337 | } | 337 | } |
| 338 | 338 | ||
| @@ -621,8 +621,8 @@ static void kimage_terminate(struct kimage *image) | |||
| 621 | 621 | ||
| 622 | #define for_each_kimage_entry(image, ptr, entry) \ | 622 | #define for_each_kimage_entry(image, ptr, entry) \ |
| 623 | for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ | 623 | for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ |
| 624 | ptr = (entry & IND_INDIRECTION)? \ | 624 | ptr = (entry & IND_INDIRECTION) ? \ |
| 625 | phys_to_virt((entry & PAGE_MASK)): ptr +1) | 625 | phys_to_virt((entry & PAGE_MASK)) : ptr + 1) |
| 626 | 626 | ||
| 627 | static void kimage_free_entry(kimage_entry_t entry) | 627 | static void kimage_free_entry(kimage_entry_t entry) |
| 628 | { | 628 | { |
| @@ -650,8 +650,7 @@ static void kimage_free(struct kimage *image) | |||
| 650 | * done with it. | 650 | * done with it. |
| 651 | */ | 651 | */ |
| 652 | ind = entry; | 652 | ind = entry; |
| 653 | } | 653 | } else if (entry & IND_SOURCE) |
| 654 | else if (entry & IND_SOURCE) | ||
| 655 | kimage_free_entry(entry); | 654 | kimage_free_entry(entry); |
| 656 | } | 655 | } |
| 657 | /* Free the final indirection page */ | 656 | /* Free the final indirection page */ |
| @@ -774,8 +773,7 @@ static struct page *kimage_alloc_page(struct kimage *image, | |||
| 774 | addr = old_addr; | 773 | addr = old_addr; |
| 775 | page = old_page; | 774 | page = old_page; |
| 776 | break; | 775 | break; |
| 777 | } | 776 | } else { |
| 778 | else { | ||
| 779 | /* Place the page on the destination list I | 777 | /* Place the page on the destination list I |
| 780 | * will use it later. | 778 | * will use it later. |
| 781 | */ | 779 | */ |
| @@ -1059,7 +1057,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, | |||
| 1059 | return -EINVAL; | 1057 | return -EINVAL; |
| 1060 | 1058 | ||
| 1061 | ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); | 1059 | ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); |
| 1062 | for (i=0; i < nr_segments; i++) { | 1060 | for (i = 0; i < nr_segments; i++) { |
| 1063 | result = copy_from_user(&in, &segments[i], sizeof(in)); | 1061 | result = copy_from_user(&in, &segments[i], sizeof(in)); |
| 1064 | if (result) | 1062 | if (result) |
| 1065 | return -EFAULT; | 1063 | return -EFAULT; |
| @@ -1214,14 +1212,14 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) | |||
| 1214 | * squirrelled away. ELF notes happen to provide | 1212 | * squirrelled away. ELF notes happen to provide |
| 1215 | * all of that, so there is no need to invent something new. | 1213 | * all of that, so there is no need to invent something new. |
| 1216 | */ | 1214 | */ |
| 1217 | buf = (u32*)per_cpu_ptr(crash_notes, cpu); | 1215 | buf = (u32 *)per_cpu_ptr(crash_notes, cpu); |
| 1218 | if (!buf) | 1216 | if (!buf) |
| 1219 | return; | 1217 | return; |
| 1220 | memset(&prstatus, 0, sizeof(prstatus)); | 1218 | memset(&prstatus, 0, sizeof(prstatus)); |
| 1221 | prstatus.pr_pid = current->pid; | 1219 | prstatus.pr_pid = current->pid; |
| 1222 | elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); | 1220 | elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); |
| 1223 | buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, | 1221 | buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, |
| 1224 | &prstatus, sizeof(prstatus)); | 1222 | &prstatus, sizeof(prstatus)); |
| 1225 | final_note(buf); | 1223 | final_note(buf); |
| 1226 | } | 1224 | } |
| 1227 | 1225 | ||
| @@ -1230,8 +1228,7 @@ static int __init crash_notes_memory_init(void) | |||
| 1230 | /* Allocate memory for saving cpu registers. */ | 1228 | /* Allocate memory for saving cpu registers. */ |
| 1231 | crash_notes = alloc_percpu(note_buf_t); | 1229 | crash_notes = alloc_percpu(note_buf_t); |
| 1232 | if (!crash_notes) { | 1230 | if (!crash_notes) { |
| 1233 | printk("Kexec: Memory allocation for saving cpu register" | 1231 | pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); |
| 1234 | " states failed\n"); | ||
| 1235 | return -ENOMEM; | 1232 | return -ENOMEM; |
| 1236 | } | 1233 | } |
| 1237 | return 0; | 1234 | return 0; |
| @@ -1253,10 +1250,10 @@ subsys_initcall(crash_notes_memory_init); | |||
| 1253 | * | 1250 | * |
| 1254 | * The function returns 0 on success and -EINVAL on failure. | 1251 | * The function returns 0 on success and -EINVAL on failure. |
| 1255 | */ | 1252 | */ |
| 1256 | static int __init parse_crashkernel_mem(char *cmdline, | 1253 | static int __init parse_crashkernel_mem(char *cmdline, |
| 1257 | unsigned long long system_ram, | 1254 | unsigned long long system_ram, |
| 1258 | unsigned long long *crash_size, | 1255 | unsigned long long *crash_size, |
| 1259 | unsigned long long *crash_base) | 1256 | unsigned long long *crash_base) |
| 1260 | { | 1257 | { |
| 1261 | char *cur = cmdline, *tmp; | 1258 | char *cur = cmdline, *tmp; |
| 1262 | 1259 | ||
| @@ -1267,12 +1264,12 @@ static int __init parse_crashkernel_mem(char *cmdline, | |||
| 1267 | /* get the start of the range */ | 1264 | /* get the start of the range */ |
| 1268 | start = memparse(cur, &tmp); | 1265 | start = memparse(cur, &tmp); |
| 1269 | if (cur == tmp) { | 1266 | if (cur == tmp) { |
| 1270 | pr_warning("crashkernel: Memory value expected\n"); | 1267 | pr_warn("crashkernel: Memory value expected\n"); |
| 1271 | return -EINVAL; | 1268 | return -EINVAL; |
| 1272 | } | 1269 | } |
| 1273 | cur = tmp; | 1270 | cur = tmp; |
| 1274 | if (*cur != '-') { | 1271 | if (*cur != '-') { |
| 1275 | pr_warning("crashkernel: '-' expected\n"); | 1272 | pr_warn("crashkernel: '-' expected\n"); |
| 1276 | return -EINVAL; | 1273 | return -EINVAL; |
| 1277 | } | 1274 | } |
| 1278 | cur++; | 1275 | cur++; |
| @@ -1281,31 +1278,30 @@ static int __init parse_crashkernel_mem(char *cmdline, | |||
| 1281 | if (*cur != ':') { | 1278 | if (*cur != ':') { |
| 1282 | end = memparse(cur, &tmp); | 1279 | end = memparse(cur, &tmp); |
| 1283 | if (cur == tmp) { | 1280 | if (cur == tmp) { |
| 1284 | pr_warning("crashkernel: Memory " | 1281 | pr_warn("crashkernel: Memory value expected\n"); |
| 1285 | "value expected\n"); | ||
| 1286 | return -EINVAL; | 1282 | return -EINVAL; |
| 1287 | } | 1283 | } |
| 1288 | cur = tmp; | 1284 | cur = tmp; |
| 1289 | if (end <= start) { | 1285 | if (end <= start) { |
| 1290 | pr_warning("crashkernel: end <= start\n"); | 1286 | pr_warn("crashkernel: end <= start\n"); |
| 1291 | return -EINVAL; | 1287 | return -EINVAL; |
| 1292 | } | 1288 | } |
| 1293 | } | 1289 | } |
| 1294 | 1290 | ||
| 1295 | if (*cur != ':') { | 1291 | if (*cur != ':') { |
| 1296 | pr_warning("crashkernel: ':' expected\n"); | 1292 | pr_warn("crashkernel: ':' expected\n"); |
| 1297 | return -EINVAL; | 1293 | return -EINVAL; |
| 1298 | } | 1294 | } |
| 1299 | cur++; | 1295 | cur++; |
| 1300 | 1296 | ||
| 1301 | size = memparse(cur, &tmp); | 1297 | size = memparse(cur, &tmp); |
| 1302 | if (cur == tmp) { | 1298 | if (cur == tmp) { |
| 1303 | pr_warning("Memory value expected\n"); | 1299 | pr_warn("Memory value expected\n"); |
| 1304 | return -EINVAL; | 1300 | return -EINVAL; |
| 1305 | } | 1301 | } |
| 1306 | cur = tmp; | 1302 | cur = tmp; |
| 1307 | if (size >= system_ram) { | 1303 | if (size >= system_ram) { |
| 1308 | pr_warning("crashkernel: invalid size\n"); | 1304 | pr_warn("crashkernel: invalid size\n"); |
| 1309 | return -EINVAL; | 1305 | return -EINVAL; |
| 1310 | } | 1306 | } |
| 1311 | 1307 | ||
| @@ -1323,8 +1319,7 @@ static int __init parse_crashkernel_mem(char *cmdline, | |||
| 1323 | cur++; | 1319 | cur++; |
| 1324 | *crash_base = memparse(cur, &tmp); | 1320 | *crash_base = memparse(cur, &tmp); |
| 1325 | if (cur == tmp) { | 1321 | if (cur == tmp) { |
| 1326 | pr_warning("Memory value expected " | 1322 | pr_warn("Memory value expected after '@'\n"); |
| 1327 | "after '@'\n"); | ||
| 1328 | return -EINVAL; | 1323 | return -EINVAL; |
| 1329 | } | 1324 | } |
| 1330 | } | 1325 | } |
| @@ -1336,26 +1331,26 @@ static int __init parse_crashkernel_mem(char *cmdline, | |||
| 1336 | /* | 1331 | /* |
| 1337 | * That function parses "simple" (old) crashkernel command lines like | 1332 | * That function parses "simple" (old) crashkernel command lines like |
| 1338 | * | 1333 | * |
| 1339 | * crashkernel=size[@offset] | 1334 | * crashkernel=size[@offset] |
| 1340 | * | 1335 | * |
| 1341 | * It returns 0 on success and -EINVAL on failure. | 1336 | * It returns 0 on success and -EINVAL on failure. |
| 1342 | */ | 1337 | */ |
| 1343 | static int __init parse_crashkernel_simple(char *cmdline, | 1338 | static int __init parse_crashkernel_simple(char *cmdline, |
| 1344 | unsigned long long *crash_size, | 1339 | unsigned long long *crash_size, |
| 1345 | unsigned long long *crash_base) | 1340 | unsigned long long *crash_base) |
| 1346 | { | 1341 | { |
| 1347 | char *cur = cmdline; | 1342 | char *cur = cmdline; |
| 1348 | 1343 | ||
| 1349 | *crash_size = memparse(cmdline, &cur); | 1344 | *crash_size = memparse(cmdline, &cur); |
| 1350 | if (cmdline == cur) { | 1345 | if (cmdline == cur) { |
| 1351 | pr_warning("crashkernel: memory value expected\n"); | 1346 | pr_warn("crashkernel: memory value expected\n"); |
| 1352 | return -EINVAL; | 1347 | return -EINVAL; |
| 1353 | } | 1348 | } |
| 1354 | 1349 | ||
| 1355 | if (*cur == '@') | 1350 | if (*cur == '@') |
| 1356 | *crash_base = memparse(cur+1, &cur); | 1351 | *crash_base = memparse(cur+1, &cur); |
| 1357 | else if (*cur != ' ' && *cur != '\0') { | 1352 | else if (*cur != ' ' && *cur != '\0') { |
| 1358 | pr_warning("crashkernel: unrecognized char\n"); | 1353 | pr_warn("crashkernel: unrecognized char\n"); |
| 1359 | return -EINVAL; | 1354 | return -EINVAL; |
| 1360 | } | 1355 | } |
| 1361 | 1356 | ||
| @@ -1683,7 +1678,15 @@ int kernel_kexec(void) | |||
| 1683 | kexec_in_progress = true; | 1678 | kexec_in_progress = true; |
| 1684 | kernel_restart_prepare(NULL); | 1679 | kernel_restart_prepare(NULL); |
| 1685 | migrate_to_reboot_cpu(); | 1680 | migrate_to_reboot_cpu(); |
| 1686 | printk(KERN_EMERG "Starting new kernel\n"); | 1681 | |
| 1682 | /* | ||
| 1683 | * migrate_to_reboot_cpu() disables CPU hotplug assuming that | ||
| 1684 | * no further code needs to use CPU hotplug (which is true in | ||
| 1685 | * the reboot case). However, the kexec path depends on using | ||
| 1686 | * CPU hotplug again; so re-enable it here. | ||
| 1687 | */ | ||
| 1688 | cpu_hotplug_enable(); | ||
| 1689 | pr_emerg("Starting new kernel\n"); | ||
| 1687 | machine_shutdown(); | 1690 | machine_shutdown(); |
| 1688 | } | 1691 | } |
| 1689 | 1692 | ||
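
Most of the kexec.c hunks are style cleanups (pr_err/pr_warn conversions, spacing), but the crashkernel= parsers they touch are worth a concrete illustration. A simplified userspace parser for the "simple" syntax crashkernel=size[@offset], with a trimmed-down memparse() that only handles K/M/G suffixes (this is a sketch, not the kernel's memparse):

    #include <stdio.h>
    #include <stdlib.h>

    static unsigned long long memparse_lite(const char *s, char **retptr)
    {
        char *end;
        unsigned long long val = strtoull(s, &end, 0);

        switch (*end) {
        case 'G': case 'g': val <<= 10; /* fall through */
        case 'M': case 'm': val <<= 10; /* fall through */
        case 'K': case 'k': val <<= 10; end++;
        }
        *retptr = end;
        return val;
    }

    static int parse_crashkernel_simple(const char *cmdline,
                                        unsigned long long *size,
                                        unsigned long long *base)
    {
        char *cur;

        *size = memparse_lite(cmdline, &cur);
        if (cur == cmdline)
            return -1;      /* "crashkernel: memory value expected" */

        if (*cur == '@')
            *base = memparse_lite(cur + 1, &cur);
        else if (*cur != ' ' && *cur != '\0')
            return -1;      /* "crashkernel: unrecognized char" */
        return 0;
    }

    int main(void)
    {
        unsigned long long size = 0, base = 0;

        if (!parse_crashkernel_simple("128M@16M", &size, &base))
            printf("size=%llu base=%llu\n", size, base);
        return 0;
    }
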
diff --git a/kernel/kmod.c b/kernel/kmod.c index 6b375af4958d..8637e041a247 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
| @@ -285,10 +285,7 @@ static int wait_for_helper(void *data) | |||
| 285 | pid_t pid; | 285 | pid_t pid; |
| 286 | 286 | ||
| 287 | /* If SIGCLD is ignored sys_wait4 won't populate the status. */ | 287 | /* If SIGCLD is ignored sys_wait4 won't populate the status. */ |
| 288 | spin_lock_irq(¤t->sighand->siglock); | 288 | kernel_sigaction(SIGCHLD, SIG_DFL); |
| 289 | current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL; | ||
| 290 | spin_unlock_irq(¤t->sighand->siglock); | ||
| 291 | |||
| 292 | pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); | 289 | pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); |
| 293 | if (pid < 0) { | 290 | if (pid < 0) { |
| 294 | sub_info->retval = pid; | 291 | sub_info->retval = pid; |
| @@ -498,7 +495,7 @@ int __usermodehelper_disable(enum umh_disable_depth depth) | |||
| 498 | static void helper_lock(void) | 495 | static void helper_lock(void) |
| 499 | { | 496 | { |
| 500 | atomic_inc(&running_helpers); | 497 | atomic_inc(&running_helpers); |
| 501 | smp_mb__after_atomic_inc(); | 498 | smp_mb__after_atomic(); |
| 502 | } | 499 | } |
| 503 | 500 | ||
| 504 | static void helper_unlock(void) | 501 | static void helper_unlock(void) |
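
The kmod.c hunk replaces open-coded sighand fiddling with kernel_sigaction(SIGCHLD, SIG_DFL); the reason it matters is the rule the comment refers to: while SIGCHLD is ignored, wait() cannot report the child's status. A userspace demonstration of that rule (exact failure mode is implementation-dependent; on Linux waitpid typically fails with ECHILD):

    #include <errno.h>
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static void run_child_and_wait(const char *label)
    {
        pid_t pid = fork();
        int status;

        if (pid == 0)
            _exit(7);
        if (waitpid(pid, &status, 0) < 0)
            printf("%s: waitpid failed: %s\n", label, strerror(errno));
        else
            printf("%s: child exited with %d\n", label,
                   WEXITSTATUS(status));
    }

    int main(void)
    {
        signal(SIGCHLD, SIG_IGN);
        run_child_and_wait("SIGCHLD ignored");  /* typically ECHILD */

        signal(SIGCHLD, SIG_DFL);
        run_child_and_wait("SIGCHLD default");  /* reports status 7 */
        return 0;
    }
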
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 2495a9b14ac8..6683ccef9fff 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
| @@ -37,6 +37,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj, | |||
| 37 | } | 37 | } |
| 38 | KERNEL_ATTR_RO(uevent_seqnum); | 38 | KERNEL_ATTR_RO(uevent_seqnum); |
| 39 | 39 | ||
| 40 | #ifdef CONFIG_UEVENT_HELPER | ||
| 40 | /* uevent helper program, used during early boot */ | 41 | /* uevent helper program, used during early boot */ |
| 41 | static ssize_t uevent_helper_show(struct kobject *kobj, | 42 | static ssize_t uevent_helper_show(struct kobject *kobj, |
| 42 | struct kobj_attribute *attr, char *buf) | 43 | struct kobj_attribute *attr, char *buf) |
| @@ -56,7 +57,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj, | |||
| 56 | return count; | 57 | return count; |
| 57 | } | 58 | } |
| 58 | KERNEL_ATTR_RW(uevent_helper); | 59 | KERNEL_ATTR_RW(uevent_helper); |
| 59 | 60 | #endif | |
| 60 | 61 | ||
| 61 | #ifdef CONFIG_PROFILING | 62 | #ifdef CONFIG_PROFILING |
| 62 | static ssize_t profiling_show(struct kobject *kobj, | 63 | static ssize_t profiling_show(struct kobject *kobj, |
| @@ -189,7 +190,9 @@ EXPORT_SYMBOL_GPL(kernel_kobj); | |||
| 189 | static struct attribute * kernel_attrs[] = { | 190 | static struct attribute * kernel_attrs[] = { |
| 190 | &fscaps_attr.attr, | 191 | &fscaps_attr.attr, |
| 191 | &uevent_seqnum_attr.attr, | 192 | &uevent_seqnum_attr.attr, |
| 193 | #ifdef CONFIG_UEVENT_HELPER | ||
| 192 | &uevent_helper_attr.attr, | 194 | &uevent_helper_attr.attr, |
| 195 | #endif | ||
| 193 | #ifdef CONFIG_PROFILING | 196 | #ifdef CONFIG_PROFILING |
| 194 | &profiling_attr.attr, | 197 | &profiling_attr.attr, |
| 195 | #endif | 198 | #endif |
diff --git a/kernel/kthread.c b/kernel/kthread.c index 9a130ec06f7a..c2390f41307b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -262,7 +262,7 @@ static void create_kthread(struct kthread_create_info *create) | |||
| 262 | * kthread_stop() has been called). The return value should be zero | 262 | * kthread_stop() has been called). The return value should be zero |
| 263 | * or a negative error number; it will be passed to kthread_stop(). | 263 | * or a negative error number; it will be passed to kthread_stop(). |
| 264 | * | 264 | * |
| 265 | * Returns a task_struct or ERR_PTR(-ENOMEM). | 265 | * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). |
| 266 | */ | 266 | */ |
| 267 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | 267 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), |
| 268 | void *data, int node, | 268 | void *data, int node, |
| @@ -298,7 +298,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | |||
| 298 | * that thread. | 298 | * that thread. |
| 299 | */ | 299 | */ |
| 300 | if (xchg(&create->done, NULL)) | 300 | if (xchg(&create->done, NULL)) |
| 301 | return ERR_PTR(-ENOMEM); | 301 | return ERR_PTR(-EINTR); |
| 302 | /* | 302 | /* |
| 303 | * kthreadd (or new kernel thread) will call complete() | 303 | * kthreadd (or new kernel thread) will call complete() |
| 304 | * shortly. | 304 | * shortly. |
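
The kthread_create_on_node() fix returns ERR_PTR(-EINTR) rather than -ENOMEM when creation was aborted by a fatal signal, so callers can tell the two cases apart via PTR_ERR(). A userspace illustration of the ERR_PTR/IS_ERR/PTR_ERR encoding that convention relies on (simplified macros, not the kernel headers; fake_kthread_create is a made-up stand-in):

    #include <errno.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long error)
    {
        return (void *)error;
    }

    static inline long PTR_ERR(const void *ptr)
    {
        return (long)ptr;
    }

    static inline int IS_ERR(const void *ptr)
    {
        /* Small negative errnos live at the very top of the address range. */
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    static void *fake_kthread_create(int interrupted)
    {
        if (interrupted)
            return ERR_PTR(-EINTR);     /* creation aborted by a signal */
        return ERR_PTR(-ENOMEM);        /* allocation failed */
    }

    int main(void)
    {
        void *task = fake_kthread_create(1);

        if (IS_ERR(task))
            printf("kthread_create failed: %ld\n", PTR_ERR(task));
        return 0;
    }
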
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index a462b317f9a0..a02812743a7e 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c | |||
| @@ -88,7 +88,8 @@ static void clear_global_latency_tracing(void) | |||
| 88 | } | 88 | } |
| 89 | 89 | ||
| 90 | static void __sched | 90 | static void __sched |
| 91 | account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) | 91 | account_global_scheduler_latency(struct task_struct *tsk, |
| 92 | struct latency_record *lat) | ||
| 92 | { | 93 | { |
| 93 | int firstnonnull = MAXLR + 1; | 94 | int firstnonnull = MAXLR + 1; |
| 94 | int i; | 95 | int i; |
| @@ -255,7 +256,7 @@ static int lstats_show(struct seq_file *m, void *v) | |||
| 255 | break; | 256 | break; |
| 256 | seq_printf(m, " %ps", (void *)bt); | 257 | seq_printf(m, " %ps", (void *)bt); |
| 257 | } | 258 | } |
| 258 | seq_printf(m, "\n"); | 259 | seq_puts(m, "\n"); |
| 259 | } | 260 | } |
| 260 | } | 261 | } |
| 261 | return 0; | 262 | return 0; |
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b0e9467922e1..d24e4339b46d 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
| @@ -4188,7 +4188,7 @@ void debug_show_held_locks(struct task_struct *task) | |||
| 4188 | } | 4188 | } |
| 4189 | EXPORT_SYMBOL_GPL(debug_show_held_locks); | 4189 | EXPORT_SYMBOL_GPL(debug_show_held_locks); |
| 4190 | 4190 | ||
| 4191 | asmlinkage void lockdep_sys_exit(void) | 4191 | asmlinkage __visible void lockdep_sys_exit(void) |
| 4192 | { | 4192 | { |
| 4193 | struct task_struct *curr = current; | 4193 | struct task_struct *curr = current; |
| 4194 | 4194 | ||
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 4f560cfedc8f..51c4b24b6328 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h | |||
| @@ -54,9 +54,9 @@ enum { | |||
| 54 | * table (if it's not there yet), and we check it for lock order | 54 | * table (if it's not there yet), and we check it for lock order |
| 55 | * conflicts and deadlocks. | 55 | * conflicts and deadlocks. |
| 56 | */ | 56 | */ |
| 57 | #define MAX_LOCKDEP_ENTRIES 16384UL | 57 | #define MAX_LOCKDEP_ENTRIES 32768UL |
| 58 | 58 | ||
| 59 | #define MAX_LOCKDEP_CHAINS_BITS 15 | 59 | #define MAX_LOCKDEP_CHAINS_BITS 16 |
| 60 | #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) | 60 | #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) |
| 61 | 61 | ||
| 62 | #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) | 62 | #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) |
| @@ -65,7 +65,7 @@ enum { | |||
| 65 | * Stack-trace: tightly packed array of stack backtrace | 65 | * Stack-trace: tightly packed array of stack backtrace |
| 66 | * addresses. Protected by the hash_lock. | 66 | * addresses. Protected by the hash_lock. |
| 67 | */ | 67 | */ |
| 68 | #define MAX_STACK_TRACE_ENTRIES 262144UL | 68 | #define MAX_STACK_TRACE_ENTRIES 524288UL |
| 69 | 69 | ||
| 70 | extern struct list_head all_lock_classes; | 70 | extern struct list_head all_lock_classes; |
| 71 | extern struct lock_chain lock_chains[]; | 71 | extern struct lock_chain lock_chains[]; |
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f26b1a18e34e..0955b885d0dc 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c | |||
| @@ -82,14 +82,14 @@ struct lock_writer_stress_stats { | |||
| 82 | }; | 82 | }; |
| 83 | static struct lock_writer_stress_stats *lwsa; | 83 | static struct lock_writer_stress_stats *lwsa; |
| 84 | 84 | ||
| 85 | #if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE) | 85 | #if defined(MODULE) |
| 86 | #define LOCKTORTURE_RUNNABLE_INIT 1 | 86 | #define LOCKTORTURE_RUNNABLE_INIT 1 |
| 87 | #else | 87 | #else |
| 88 | #define LOCKTORTURE_RUNNABLE_INIT 0 | 88 | #define LOCKTORTURE_RUNNABLE_INIT 0 |
| 89 | #endif | 89 | #endif |
| 90 | int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; | 90 | int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; |
| 91 | module_param(locktorture_runnable, int, 0444); | 91 | module_param(locktorture_runnable, int, 0444); |
| 92 | MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot"); | 92 | MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init"); |
| 93 | 93 | ||
| 94 | /* Forward reference. */ | 94 | /* Forward reference. */ |
| 95 | static void lock_torture_cleanup(void); | 95 | static void lock_torture_cleanup(void); |
| @@ -216,10 +216,11 @@ static int lock_torture_writer(void *arg) | |||
| 216 | static DEFINE_TORTURE_RANDOM(rand); | 216 | static DEFINE_TORTURE_RANDOM(rand); |
| 217 | 217 | ||
| 218 | VERBOSE_TOROUT_STRING("lock_torture_writer task started"); | 218 | VERBOSE_TOROUT_STRING("lock_torture_writer task started"); |
| 219 | set_user_nice(current, 19); | 219 | set_user_nice(current, MAX_NICE); |
| 220 | 220 | ||
| 221 | do { | 221 | do { |
| 222 | schedule_timeout_uninterruptible(1); | 222 | if ((torture_random(&rand) & 0xfffff) == 0) |
| 223 | schedule_timeout_uninterruptible(1); | ||
| 223 | cur_ops->writelock(); | 224 | cur_ops->writelock(); |
| 224 | if (WARN_ON_ONCE(lock_is_write_held)) | 225 | if (WARN_ON_ONCE(lock_is_write_held)) |
| 225 | lwsp->n_write_lock_fail++; | 226 | lwsp->n_write_lock_fail++; |
| @@ -354,7 +355,8 @@ static int __init lock_torture_init(void) | |||
| 354 | &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, | 355 | &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, |
| 355 | }; | 356 | }; |
| 356 | 357 | ||
| 357 | torture_init_begin(torture_type, verbose, &locktorture_runnable); | 358 | if (!torture_init_begin(torture_type, verbose, &locktorture_runnable)) |
| 359 | return -EBUSY; | ||
| 358 | 360 | ||
| 359 | /* Process args and tell the world that the torturer is on the job. */ | 361 | /* Process args and tell the world that the torturer is on the job. */ |
| 360 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { | 362 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { |
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index e1191c996c59..5cf6731b98e9 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c | |||
| @@ -71,18 +71,17 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, | |||
| 71 | 71 | ||
| 72 | void debug_mutex_unlock(struct mutex *lock) | 72 | void debug_mutex_unlock(struct mutex *lock) |
| 73 | { | 73 | { |
| 74 | if (unlikely(!debug_locks)) | 74 | if (likely(debug_locks)) { |
| 75 | return; | 75 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); |
| 76 | 76 | ||
| 77 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); | 77 | if (!lock->owner) |
| 78 | DEBUG_LOCKS_WARN_ON(!lock->owner); | ||
| 79 | else | ||
| 80 | DEBUG_LOCKS_WARN_ON(lock->owner != current); | ||
| 78 | 81 | ||
| 79 | if (!lock->owner) | 82 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); |
| 80 | DEBUG_LOCKS_WARN_ON(!lock->owner); | 83 | mutex_clear_owner(lock); |
| 81 | else | 84 | } |
| 82 | DEBUG_LOCKS_WARN_ON(lock->owner != current); | ||
| 83 | |||
| 84 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); | ||
| 85 | mutex_clear_owner(lock); | ||
| 86 | 85 | ||
| 87 | /* | 86 | /* |
| 88 | * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug | 87 | * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug |
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index aa4dff04b594..a620d4d08ca6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
| @@ -343,9 +343,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
| 343 | * top_waiter can be NULL, when we are in the deboosting | 343 | * top_waiter can be NULL, when we are in the deboosting |
| 344 | * mode! | 344 | * mode! |
| 345 | */ | 345 | */ |
| 346 | if (top_waiter && (!task_has_pi_waiters(task) || | 346 | if (top_waiter) { |
| 347 | top_waiter != task_top_pi_waiter(task))) | 347 | if (!task_has_pi_waiters(task)) |
| 348 | goto out_unlock_pi; | 348 | goto out_unlock_pi; |
| 349 | /* | ||
| 350 | * If deadlock detection is off, we stop here if we | ||
| 351 | * are not the top pi waiter of the task. | ||
| 352 | */ | ||
| 353 | if (!detect_deadlock && top_waiter != task_top_pi_waiter(task)) | ||
| 354 | goto out_unlock_pi; | ||
| 355 | } | ||
| 349 | 356 | ||
| 350 | /* | 357 | /* |
| 351 | * When deadlock detection is off then we check, if further | 358 | * When deadlock detection is off then we check, if further |
| @@ -361,7 +368,12 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, | |||
| 361 | goto retry; | 368 | goto retry; |
| 362 | } | 369 | } |
| 363 | 370 | ||
| 364 | /* Deadlock detection */ | 371 | /* |
| 372 | * Deadlock detection. If the lock is the same as the original | ||
| 373 | * lock which caused us to walk the lock chain or if the | ||
| 374 | * current lock is owned by the task which initiated the chain | ||
| 375 | * walk, we detected a deadlock. | ||
| 376 | */ | ||
| 365 | if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { | 377 | if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { |
| 366 | debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); | 378 | debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); |
| 367 | raw_spin_unlock(&lock->wait_lock); | 379 | raw_spin_unlock(&lock->wait_lock); |
| @@ -527,6 +539,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | |||
| 527 | unsigned long flags; | 539 | unsigned long flags; |
| 528 | int chain_walk = 0, res; | 540 | int chain_walk = 0, res; |
| 529 | 541 | ||
| 542 | /* | ||
| 543 | * Early deadlock detection. We really don't want the task to | ||
| 544 | * enqueue on itself just to untangle the mess later. It's not | ||
| 545 | * only an optimization. We drop the locks, so another waiter | ||
| 546 | * can come in before the chain walk detects the deadlock. So | ||
| 547 | * the other will detect the deadlock and return -EDEADLOCK, | ||
| 548 | * which is wrong, as the other waiter is not in a deadlock | ||
| 549 | * situation. | ||
| 550 | */ | ||
| 551 | if (detect_deadlock && owner == task) | ||
| 552 | return -EDEADLK; | ||
| 553 | |||
| 530 | raw_spin_lock_irqsave(&task->pi_lock, flags); | 554 | raw_spin_lock_irqsave(&task->pi_lock, flags); |
| 531 | __rt_mutex_adjust_prio(task); | 555 | __rt_mutex_adjust_prio(task); |
| 532 | waiter->task = task; | 556 | waiter->task = task; |
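
The early -EDEADLK return added to task_blocks_on_rt_mutex() catches a task trying to block on a lock it already owns before it is ever enqueued. The same idea is visible from userspace with an error-checking pthread mutex (an analogy for the self-deadlock check, not the PI futex path itself; link with -lpthread):

    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        pthread_mutexattr_t attr;
        pthread_mutex_t lock;
        int err;

        pthread_mutexattr_init(&attr);
        pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
        pthread_mutex_init(&lock, &attr);

        pthread_mutex_lock(&lock);
        err = pthread_mutex_lock(&lock);            /* owner relocks */
        printf("second lock: %s\n", strerror(err)); /* EDEADLK */

        pthread_mutex_unlock(&lock);
        pthread_mutex_destroy(&lock);
        pthread_mutexattr_destroy(&attr);
        return 0;
    }
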
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 1d66e08e897d..b4219ff87b8c 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
| @@ -12,6 +12,55 @@ | |||
| 12 | #include <linux/export.h> | 12 | #include <linux/export.h> |
| 13 | 13 | ||
| 14 | /* | 14 | /* |
| 15 | * Guide to the rw_semaphore's count field for common values. | ||
| 16 | * (32-bit case illustrated, similar for 64-bit) | ||
| 17 | * | ||
| 18 | * 0x0000000X (1) X readers active or attempting lock, no writer waiting | ||
| 19 | * X = #active_readers + #readers attempting to lock | ||
| 20 | * (X*ACTIVE_BIAS) | ||
| 21 | * | ||
| 22 | * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or | ||
| 23 | * attempting to read lock or write lock. | ||
| 24 | * | ||
| 25 | * 0xffff000X (1) X readers active or attempting lock, with waiters for lock | ||
| 26 | * X = #active readers + # readers attempting lock | ||
| 27 | * (X*ACTIVE_BIAS + WAITING_BIAS) | ||
| 28 | * (2) 1 writer attempting lock, no waiters for lock | ||
| 29 | * X-1 = #active readers + #readers attempting lock | ||
| 30 | * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) | ||
| 31 | * (3) 1 writer active, no waiters for lock | ||
| 32 | * X-1 = #active readers + #readers attempting lock | ||
| 33 | * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) | ||
| 34 | * | ||
| 35 | * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock | ||
| 36 | * (WAITING_BIAS + ACTIVE_BIAS) | ||
| 37 | * (2) 1 writer active or attempting lock, no waiters for lock | ||
| 38 | * (ACTIVE_WRITE_BIAS) | ||
| 39 | * | ||
| 40 | * 0xffff0000 (1) There are writers or readers queued but none active | ||
| 41 | * or in the process of attempting lock. | ||
| 42 | * (WAITING_BIAS) | ||
| 43 | * Note: writer can attempt to steal lock for this count by adding | ||
| 44 | * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count | ||
| 45 | * | ||
| 46 | * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue. | ||
| 47 | * (ACTIVE_WRITE_BIAS + WAITING_BIAS) | ||
| 48 | * | ||
| 49 | * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking | ||
| 50 | * the count becomes more than 0 for successful lock acquisition, | ||
| 51 | * i.e. the case where there are only readers or nobody has lock. | ||
| 52 | * (1st and 2nd case above). | ||
| 53 | * | ||
| 54 | * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and | ||
| 55 | * checking the count becomes ACTIVE_WRITE_BIAS for successful lock | ||
| 56 | * acquisition (i.e. nobody else has lock or attempts lock). If | ||
| 57 | * unsuccessful, in rwsem_down_write_failed, we'll check to see if there | ||
| 58 | * are only waiters but none active (5th case above), and attempt to | ||
| 59 | * steal the lock. | ||
| 60 | * | ||
| 61 | */ | ||
| 62 | |||
| 63 | /* | ||
| 15 | * Initialize an rwsem: | 64 | * Initialize an rwsem: |
| 16 | */ | 65 | */ |
| 17 | void __init_rwsem(struct rw_semaphore *sem, const char *name, | 66 | void __init_rwsem(struct rw_semaphore *sem, const char *name, |
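The new header comment in rwsem-xadd.c documents how the count field encodes readers, writers and waiters through additive biases. A rough standalone illustration of that arithmetic is below; the bias values follow the common 32-bit layout the comment describes and are an assumption here, not copied from the rwsem headers.

#include <stdio.h>

/* Assumed 32-bit bias layout matching the comment above. */
#define RWSEM_ACTIVE_BIAS	0x00000001L
#define RWSEM_WAITING_BIAS	(-0x00010000L)
#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

int main(void)
{
	long count;

	/* Three readers hold the lock, nobody waits: 0x00000003. */
	count = 3 * RWSEM_ACTIVE_BIAS;
	printf("3 readers, no waiters:    0x%08lx\n",
	       (unsigned long)(count & 0xffffffff));

	/* One reader active with waiters queued: 0xffff0001. */
	count = RWSEM_ACTIVE_BIAS + RWSEM_WAITING_BIAS;
	printf("1 reader, waiters queued: 0x%08lx\n",
	       (unsigned long)(count & 0xffffffff));

	/* One writer active with waiters queued: 0xfffe0001. */
	count = RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS;
	printf("writer + waiters:         0x%08lx\n",
	       (unsigned long)(count & 0xffffffff));

	return 0;
}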
diff --git a/kernel/module.c b/kernel/module.c index 11869408f79b..079c4615607d 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -815,9 +815,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, | |||
| 815 | return -EFAULT; | 815 | return -EFAULT; |
| 816 | name[MODULE_NAME_LEN-1] = '\0'; | 816 | name[MODULE_NAME_LEN-1] = '\0'; |
| 817 | 817 | ||
| 818 | if (!(flags & O_NONBLOCK)) | ||
| 819 | pr_warn("waiting module removal not supported: please upgrade\n"); | ||
| 820 | |||
| 821 | if (mutex_lock_interruptible(&module_mutex) != 0) | 818 | if (mutex_lock_interruptible(&module_mutex) != 0) |
| 822 | return -EINTR; | 819 | return -EINTR; |
| 823 | 820 | ||
| @@ -3271,6 +3268,9 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
| 3271 | 3268 | ||
| 3272 | dynamic_debug_setup(info->debug, info->num_debug); | 3269 | dynamic_debug_setup(info->debug, info->num_debug); |
| 3273 | 3270 | ||
| 3271 | /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */ | ||
| 3272 | ftrace_module_init(mod); | ||
| 3273 | |||
| 3274 | /* Finally it's fully formed, ready to start executing. */ | 3274 | /* Finally it's fully formed, ready to start executing. */ |
| 3275 | err = complete_formation(mod, info); | 3275 | err = complete_formation(mod, info); |
| 3276 | if (err) | 3276 | if (err) |
diff --git a/kernel/panic.c b/kernel/panic.c index d02fa9fef46a..62e16cef9cc2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -32,6 +32,7 @@ static unsigned long tainted_mask; | |||
| 32 | static int pause_on_oops; | 32 | static int pause_on_oops; |
| 33 | static int pause_on_oops_flag; | 33 | static int pause_on_oops_flag; |
| 34 | static DEFINE_SPINLOCK(pause_on_oops_lock); | 34 | static DEFINE_SPINLOCK(pause_on_oops_lock); |
| 35 | static bool crash_kexec_post_notifiers; | ||
| 35 | 36 | ||
| 36 | int panic_timeout = CONFIG_PANIC_TIMEOUT; | 37 | int panic_timeout = CONFIG_PANIC_TIMEOUT; |
| 37 | EXPORT_SYMBOL_GPL(panic_timeout); | 38 | EXPORT_SYMBOL_GPL(panic_timeout); |
| @@ -112,9 +113,11 @@ void panic(const char *fmt, ...) | |||
| 112 | /* | 113 | /* |
| 113 | * If we have crashed and we have a crash kernel loaded let it handle | 114 | * If we have crashed and we have a crash kernel loaded let it handle |
| 114 | * everything else. | 115 | * everything else. |
| 115 | * Do we want to call this before we try to display a message? | 116 | * If we want to run this after calling panic_notifiers, pass |
| 117 | * the "crash_kexec_post_notifiers" option to the kernel. | ||
| 116 | */ | 118 | */ |
| 117 | crash_kexec(NULL); | 119 | if (!crash_kexec_post_notifiers) |
| 120 | crash_kexec(NULL); | ||
| 118 | 121 | ||
| 119 | /* | 122 | /* |
| 120 | * Note smp_send_stop is the usual smp shutdown function, which | 123 | * Note smp_send_stop is the usual smp shutdown function, which |
| @@ -131,6 +134,15 @@ void panic(const char *fmt, ...) | |||
| 131 | 134 | ||
| 132 | kmsg_dump(KMSG_DUMP_PANIC); | 135 | kmsg_dump(KMSG_DUMP_PANIC); |
| 133 | 136 | ||
| 137 | /* | ||
| 138 | * If you doubt that kdump always works correctly in any situation, | ||
| 139 | * "crash_kexec_post_notifiers" offers you a chance to run the | ||
| 140 | * panic_notifiers and dump kmsg before kdump. | ||
| 141 | * Note: since some panic_notifiers can make the crashed kernel | ||
| 142 | * more unstable, this can also increase the risk of kdump failure. | ||
| 143 | */ | ||
| 144 | crash_kexec(NULL); | ||
| 145 | |||
| 134 | bust_spinlocks(0); | 146 | bust_spinlocks(0); |
| 135 | 147 | ||
| 136 | if (!panic_blink) | 148 | if (!panic_blink) |
| @@ -472,6 +484,13 @@ EXPORT_SYMBOL(__stack_chk_fail); | |||
| 472 | core_param(panic, panic_timeout, int, 0644); | 484 | core_param(panic, panic_timeout, int, 0644); |
| 473 | core_param(pause_on_oops, pause_on_oops, int, 0644); | 485 | core_param(pause_on_oops, pause_on_oops, int, 0644); |
| 474 | 486 | ||
| 487 | static int __init setup_crash_kexec_post_notifiers(char *s) | ||
| 488 | { | ||
| 489 | crash_kexec_post_notifiers = true; | ||
| 490 | return 0; | ||
| 491 | } | ||
| 492 | early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers); | ||
| 493 | |||
| 475 | static int __init oops_setup(char *s) | 494 | static int __init oops_setup(char *s) |
| 476 | { | 495 | { |
| 477 | if (!s) | 496 | if (!s) |
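The panic() change above gates the first crash_kexec() call on the new "crash_kexec_post_notifiers" boot parameter, so the panic notifiers and the kmsg dump can run before kdump when requested. A small userspace model of that ordering follows; the helper names are placeholders for the kernel calls, and a successful crash_kexec() is modeled as not returning.

#include <stdbool.h>
#include <stdio.h>

static bool crash_kexec_post_notifiers;	/* set by the boot parameter */

static void crash_kexec(void)         { puts("crash_kexec"); }
static void run_panic_notifiers(void) { puts("panic notifiers"); }
static void kmsg_dump(void)           { puts("kmsg dump"); }

/* Mirrors the control flow added above. In the kernel a successful
 * crash_kexec() never returns; here that is modeled by returning
 * right after the call. */
static void panic_model(void)
{
	if (!crash_kexec_post_notifiers) {
		crash_kexec();
		return;
	}

	run_panic_notifiers();
	kmsg_dump();
	crash_kexec();
}

int main(void)
{
	puts("default ordering:");
	panic_model();

	crash_kexec_post_notifiers = true;
	puts("\nwith crash_kexec_post_notifiers:");
	panic_model();
	return 0;
}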
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 2fac9cc79b3d..9a83d780facd 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -257,8 +257,7 @@ config ARCH_HAS_OPP | |||
| 257 | bool | 257 | bool |
| 258 | 258 | ||
| 259 | config PM_OPP | 259 | config PM_OPP |
| 260 | bool "Operating Performance Point (OPP) Layer library" | 260 | bool |
| 261 | depends on ARCH_HAS_OPP | ||
| 262 | ---help--- | 261 | ---help--- |
| 263 | SOCs have a standard set of tuples consisting of frequency and | 262 | SOCs have a standard set of tuples consisting of frequency and |
| 264 | voltage pairs that the device will support per voltage domain. This | 263 | voltage pairs that the device will support per voltage domain. This |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index f4f2073711d3..df88d55dc436 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
| @@ -35,7 +35,7 @@ | |||
| 35 | static int nocompress; | 35 | static int nocompress; |
| 36 | static int noresume; | 36 | static int noresume; |
| 37 | static int resume_wait; | 37 | static int resume_wait; |
| 38 | static int resume_delay; | 38 | static unsigned int resume_delay; |
| 39 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; | 39 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; |
| 40 | dev_t swsusp_resume_device; | 40 | dev_t swsusp_resume_device; |
| 41 | sector_t swsusp_resume_block; | 41 | sector_t swsusp_resume_block; |
| @@ -228,19 +228,23 @@ static void platform_recover(int platform_mode) | |||
| 228 | void swsusp_show_speed(struct timeval *start, struct timeval *stop, | 228 | void swsusp_show_speed(struct timeval *start, struct timeval *stop, |
| 229 | unsigned nr_pages, char *msg) | 229 | unsigned nr_pages, char *msg) |
| 230 | { | 230 | { |
| 231 | s64 elapsed_centisecs64; | 231 | u64 elapsed_centisecs64; |
| 232 | int centisecs; | 232 | unsigned int centisecs; |
| 233 | int k; | 233 | unsigned int k; |
| 234 | int kps; | 234 | unsigned int kps; |
| 235 | 235 | ||
| 236 | elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); | 236 | elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); |
| 237 | /* | ||
| 238 | * If "(s64)elapsed_centisecs64 < 0", the printed elapsed time will be | ||
| 239 | * huge, which makes it obvious enough that something went wrong. | ||
| 240 | */ | ||
| 237 | do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); | 241 | do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); |
| 238 | centisecs = elapsed_centisecs64; | 242 | centisecs = elapsed_centisecs64; |
| 239 | if (centisecs == 0) | 243 | if (centisecs == 0) |
| 240 | centisecs = 1; /* avoid div-by-zero */ | 244 | centisecs = 1; /* avoid div-by-zero */ |
| 241 | k = nr_pages * (PAGE_SIZE / 1024); | 245 | k = nr_pages * (PAGE_SIZE / 1024); |
| 242 | kps = (k * 100) / centisecs; | 246 | kps = (k * 100) / centisecs; |
| 243 | printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", | 247 | printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n", |
| 244 | msg, k, | 248 | msg, k, |
| 245 | centisecs / 100, centisecs % 100, | 249 | centisecs / 100, centisecs % 100, |
| 246 | kps / 1000, (kps % 1000) / 10); | 250 | kps / 1000, (kps % 1000) / 10); |
| @@ -595,7 +599,8 @@ static void power_down(void) | |||
| 595 | case HIBERNATION_PLATFORM: | 599 | case HIBERNATION_PLATFORM: |
| 596 | hibernation_platform_enter(); | 600 | hibernation_platform_enter(); |
| 597 | case HIBERNATION_SHUTDOWN: | 601 | case HIBERNATION_SHUTDOWN: |
| 598 | kernel_power_off(); | 602 | if (pm_power_off) |
| 603 | kernel_power_off(); | ||
| 599 | break; | 604 | break; |
| 600 | #ifdef CONFIG_SUSPEND | 605 | #ifdef CONFIG_SUSPEND |
| 601 | case HIBERNATION_SUSPEND: | 606 | case HIBERNATION_SUSPEND: |
| @@ -623,7 +628,8 @@ static void power_down(void) | |||
| 623 | * corruption after resume. | 628 | * corruption after resume. |
| 624 | */ | 629 | */ |
| 625 | printk(KERN_CRIT "PM: Please power down manually\n"); | 630 | printk(KERN_CRIT "PM: Please power down manually\n"); |
| 626 | while(1); | 631 | while (1) |
| 632 | cpu_relax(); | ||
| 627 | } | 633 | } |
| 628 | 634 | ||
| 629 | /** | 635 | /** |
| @@ -1109,7 +1115,10 @@ static int __init resumewait_setup(char *str) | |||
| 1109 | 1115 | ||
| 1110 | static int __init resumedelay_setup(char *str) | 1116 | static int __init resumedelay_setup(char *str) |
| 1111 | { | 1117 | { |
| 1112 | resume_delay = simple_strtoul(str, NULL, 0); | 1118 | int rc = kstrtouint(str, 0, &resume_delay); |
| 1119 | |||
| 1120 | if (rc) | ||
| 1121 | return rc; | ||
| 1113 | return 1; | 1122 | return 1; |
| 1114 | } | 1123 | } |
| 1115 | 1124 | ||
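swsusp_show_speed() now does the throughput math in unsigned types. The formula itself is unchanged: elapsed time in centiseconds, data volume in KiB, and a rate scaled so that integer division still yields two decimal places. A worked standalone version (PAGE_SIZE is assumed to be 4096 here):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096u
#define NSEC_PER_SEC	1000000000ull

/* Same arithmetic as swsusp_show_speed(): centiseconds elapsed,
 * kilobytes moved, and hundredths of a kilobyte-per-second rate. */
static void show_speed(uint64_t elapsed_ns, unsigned int nr_pages,
		       const char *msg)
{
	unsigned int centisecs = elapsed_ns / (NSEC_PER_SEC / 100);
	unsigned int k, kps;

	if (centisecs == 0)
		centisecs = 1;			/* avoid div-by-zero */
	k = nr_pages * (PAGE_SIZE / 1024);
	kps = (k * 100) / centisecs;

	printf("PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
	       msg, k, centisecs / 100, centisecs % 100,
	       kps / 1000, (kps % 1000) / 10);
}

int main(void)
{
	/* 131072 pages (512 MiB) in 2.5 s -> "524288 kbytes in 2.50
	 * seconds (209.71 MB/s)". */
	show_speed(2500000000ull, 131072, "Wrote");
	return 0;
}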
diff --git a/kernel/power/main.c b/kernel/power/main.c index 6271bc4073ef..573410d6647e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -279,26 +279,26 @@ static inline void pm_print_times_init(void) {} | |||
| 279 | struct kobject *power_kobj; | 279 | struct kobject *power_kobj; |
| 280 | 280 | ||
| 281 | /** | 281 | /** |
| 282 | * state - control system power state. | 282 | * state - control system sleep states. |
| 283 | * | 283 | * |
| 284 | * show() returns what states are supported, which is hard-coded to | 284 | * show() returns available sleep state labels, which may be "mem", "standby", |
| 285 | * 'freeze' (Low-Power Idle), 'standby' (Power-On Suspend), | 285 | * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a |
| 286 | * 'mem' (Suspend-to-RAM), and 'disk' (Suspend-to-Disk). | 286 | * description of what they mean. |
| 287 | * | 287 | * |
| 288 | * store() accepts one of those strings, translates it into the | 288 | * store() accepts one of those strings, translates it into the proper |
| 289 | * proper enumerated value, and initiates a suspend transition. | 289 | * enumerated value, and initiates a suspend transition. |
| 290 | */ | 290 | */ |
| 291 | static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, | 291 | static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, |
| 292 | char *buf) | 292 | char *buf) |
| 293 | { | 293 | { |
| 294 | char *s = buf; | 294 | char *s = buf; |
| 295 | #ifdef CONFIG_SUSPEND | 295 | #ifdef CONFIG_SUSPEND |
| 296 | int i; | 296 | suspend_state_t i; |
| 297 | |||
| 298 | for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) | ||
| 299 | if (pm_states[i].state) | ||
| 300 | s += sprintf(s,"%s ", pm_states[i].label); | ||
| 297 | 301 | ||
| 298 | for (i = 0; i < PM_SUSPEND_MAX; i++) { | ||
| 299 | if (pm_states[i] && valid_state(i)) | ||
| 300 | s += sprintf(s,"%s ", pm_states[i]); | ||
| 301 | } | ||
| 302 | #endif | 302 | #endif |
| 303 | #ifdef CONFIG_HIBERNATION | 303 | #ifdef CONFIG_HIBERNATION |
| 304 | s += sprintf(s, "%s\n", "disk"); | 304 | s += sprintf(s, "%s\n", "disk"); |
| @@ -314,7 +314,7 @@ static suspend_state_t decode_state(const char *buf, size_t n) | |||
| 314 | { | 314 | { |
| 315 | #ifdef CONFIG_SUSPEND | 315 | #ifdef CONFIG_SUSPEND |
| 316 | suspend_state_t state = PM_SUSPEND_MIN; | 316 | suspend_state_t state = PM_SUSPEND_MIN; |
| 317 | const char * const *s; | 317 | struct pm_sleep_state *s; |
| 318 | #endif | 318 | #endif |
| 319 | char *p; | 319 | char *p; |
| 320 | int len; | 320 | int len; |
| @@ -328,8 +328,9 @@ static suspend_state_t decode_state(const char *buf, size_t n) | |||
| 328 | 328 | ||
| 329 | #ifdef CONFIG_SUSPEND | 329 | #ifdef CONFIG_SUSPEND |
| 330 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) | 330 | for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) |
| 331 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) | 331 | if (s->state && len == strlen(s->label) |
| 332 | return state; | 332 | && !strncmp(buf, s->label, len)) |
| 333 | return s->state; | ||
| 333 | #endif | 334 | #endif |
| 334 | 335 | ||
| 335 | return PM_SUSPEND_ON; | 336 | return PM_SUSPEND_ON; |
| @@ -447,8 +448,8 @@ static ssize_t autosleep_show(struct kobject *kobj, | |||
| 447 | 448 | ||
| 448 | #ifdef CONFIG_SUSPEND | 449 | #ifdef CONFIG_SUSPEND |
| 449 | if (state < PM_SUSPEND_MAX) | 450 | if (state < PM_SUSPEND_MAX) |
| 450 | return sprintf(buf, "%s\n", valid_state(state) ? | 451 | return sprintf(buf, "%s\n", pm_states[state].state ? |
| 451 | pm_states[state] : "error"); | 452 | pm_states[state].label : "error"); |
| 452 | #endif | 453 | #endif |
| 453 | #ifdef CONFIG_HIBERNATION | 454 | #ifdef CONFIG_HIBERNATION |
| 454 | return sprintf(buf, "disk\n"); | 455 | return sprintf(buf, "disk\n"); |
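With pm_states[] now holding {label, state} pairs, decode_state() compares the string written to /sys/power/state against each label and returns the stored state instead of the array index, so an entry whose state is zero is simply not offered. A simplified model of that lookup (the table contents below are illustrative):

#include <stdio.h>
#include <string.h>

typedef enum {
	PM_SUSPEND_ON = 0,
	PM_SUSPEND_FREEZE,
	PM_SUSPEND_STANDBY,
	PM_SUSPEND_MEM,
	PM_SUSPEND_MAX,
} suspend_state_t;

struct pm_sleep_state {
	const char *label;
	suspend_state_t state;	/* 0 means "not usable on this system" */
};

/* Illustrative table: standby unsupported, freeze and mem usable. */
static struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = {
	[PM_SUSPEND_FREEZE]  = { "freeze",  PM_SUSPEND_FREEZE },
	[PM_SUSPEND_STANDBY] = { "standby", 0 },
	[PM_SUSPEND_MEM]     = { "mem",     PM_SUSPEND_MEM },
};

static suspend_state_t decode_state(const char *buf)
{
	suspend_state_t i;

	for (i = PM_SUSPEND_FREEZE; i < PM_SUSPEND_MAX; i++)
		if (pm_states[i].state && !strcmp(buf, pm_states[i].label))
			return pm_states[i].state;

	return PM_SUSPEND_ON;	/* unknown or unsupported label */
}

int main(void)
{
	printf("mem     -> %d\n", decode_state("mem"));
	printf("standby -> %d\n", decode_state("standby"));
	return 0;
}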
diff --git a/kernel/power/power.h b/kernel/power/power.h index 15f37ea08719..c60f13b5270a 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
| @@ -178,17 +178,20 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, | |||
| 178 | unsigned int, char *); | 178 | unsigned int, char *); |
| 179 | 179 | ||
| 180 | #ifdef CONFIG_SUSPEND | 180 | #ifdef CONFIG_SUSPEND |
| 181 | struct pm_sleep_state { | ||
| 182 | const char *label; | ||
| 183 | suspend_state_t state; | ||
| 184 | }; | ||
| 185 | |||
| 181 | /* kernel/power/suspend.c */ | 186 | /* kernel/power/suspend.c */ |
| 182 | extern const char *const pm_states[]; | 187 | extern struct pm_sleep_state pm_states[]; |
| 183 | 188 | ||
| 184 | extern bool valid_state(suspend_state_t state); | ||
| 185 | extern int suspend_devices_and_enter(suspend_state_t state); | 189 | extern int suspend_devices_and_enter(suspend_state_t state); |
| 186 | #else /* !CONFIG_SUSPEND */ | 190 | #else /* !CONFIG_SUSPEND */ |
| 187 | static inline int suspend_devices_and_enter(suspend_state_t state) | 191 | static inline int suspend_devices_and_enter(suspend_state_t state) |
| 188 | { | 192 | { |
| 189 | return -ENOSYS; | 193 | return -ENOSYS; |
| 190 | } | 194 | } |
| 191 | static inline bool valid_state(suspend_state_t state) { return false; } | ||
| 192 | #endif /* !CONFIG_SUSPEND */ | 195 | #endif /* !CONFIG_SUSPEND */ |
| 193 | 196 | ||
| 194 | #ifdef CONFIG_PM_TEST_SUSPEND | 197 | #ifdef CONFIG_PM_TEST_SUSPEND |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 18fb7a2fb14b..1ea328aafdc9 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -1586,7 +1586,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, | |||
| 1586 | return -ENOMEM; | 1586 | return -ENOMEM; |
| 1587 | } | 1587 | } |
| 1588 | 1588 | ||
| 1589 | asmlinkage int swsusp_save(void) | 1589 | asmlinkage __visible int swsusp_save(void) |
| 1590 | { | 1590 | { |
| 1591 | unsigned int nr_pages, nr_highmem; | 1591 | unsigned int nr_pages, nr_highmem; |
| 1592 | 1592 | ||
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c3ad9cafe930..963e6d0f050b 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
| 15 | #include <linux/console.h> | 15 | #include <linux/console.h> |
| 16 | #include <linux/cpu.h> | 16 | #include <linux/cpu.h> |
| 17 | #include <linux/cpuidle.h> | ||
| 17 | #include <linux/syscalls.h> | 18 | #include <linux/syscalls.h> |
| 18 | #include <linux/gfp.h> | 19 | #include <linux/gfp.h> |
| 19 | #include <linux/io.h> | 20 | #include <linux/io.h> |
| @@ -30,13 +31,14 @@ | |||
| 30 | 31 | ||
| 31 | #include "power.h" | 32 | #include "power.h" |
| 32 | 33 | ||
| 33 | const char *const pm_states[PM_SUSPEND_MAX] = { | 34 | struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { |
| 34 | [PM_SUSPEND_FREEZE] = "freeze", | 35 | [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE }, |
| 35 | [PM_SUSPEND_STANDBY] = "standby", | 36 | [PM_SUSPEND_STANDBY] = { .label = "standby", }, |
| 36 | [PM_SUSPEND_MEM] = "mem", | 37 | [PM_SUSPEND_MEM] = { .label = "mem", }, |
| 37 | }; | 38 | }; |
| 38 | 39 | ||
| 39 | static const struct platform_suspend_ops *suspend_ops; | 40 | static const struct platform_suspend_ops *suspend_ops; |
| 41 | static const struct platform_freeze_ops *freeze_ops; | ||
| 40 | 42 | ||
| 41 | static bool need_suspend_ops(suspend_state_t state) | 43 | static bool need_suspend_ops(suspend_state_t state) |
| 42 | { | 44 | { |
| @@ -46,6 +48,13 @@ static bool need_suspend_ops(suspend_state_t state) | |||
| 46 | static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); | 48 | static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); |
| 47 | static bool suspend_freeze_wake; | 49 | static bool suspend_freeze_wake; |
| 48 | 50 | ||
| 51 | void freeze_set_ops(const struct platform_freeze_ops *ops) | ||
| 52 | { | ||
| 53 | lock_system_sleep(); | ||
| 54 | freeze_ops = ops; | ||
| 55 | unlock_system_sleep(); | ||
| 56 | } | ||
| 57 | |||
| 49 | static void freeze_begin(void) | 58 | static void freeze_begin(void) |
| 50 | { | 59 | { |
| 51 | suspend_freeze_wake = false; | 60 | suspend_freeze_wake = false; |
| @@ -53,7 +62,11 @@ static void freeze_begin(void) | |||
| 53 | 62 | ||
| 54 | static void freeze_enter(void) | 63 | static void freeze_enter(void) |
| 55 | { | 64 | { |
| 65 | cpuidle_use_deepest_state(true); | ||
| 66 | cpuidle_resume(); | ||
| 56 | wait_event(suspend_freeze_wait_head, suspend_freeze_wake); | 67 | wait_event(suspend_freeze_wait_head, suspend_freeze_wake); |
| 68 | cpuidle_pause(); | ||
| 69 | cpuidle_use_deepest_state(false); | ||
| 57 | } | 70 | } |
| 58 | 71 | ||
| 59 | void freeze_wake(void) | 72 | void freeze_wake(void) |
| @@ -63,42 +76,62 @@ void freeze_wake(void) | |||
| 63 | } | 76 | } |
| 64 | EXPORT_SYMBOL_GPL(freeze_wake); | 77 | EXPORT_SYMBOL_GPL(freeze_wake); |
| 65 | 78 | ||
| 79 | static bool valid_state(suspend_state_t state) | ||
| 80 | { | ||
| 81 | /* | ||
| 82 | * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level | ||
| 83 | * support and need to be valid to the low level | ||
| 84 | * implementation, no valid callback implies that none are valid. | ||
| 85 | */ | ||
| 86 | return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); | ||
| 87 | } | ||
| 88 | |||
| 89 | /* | ||
| 90 | * If this is set, the "mem" label always corresponds to the deepest sleep state | ||
| 91 | * available, the "standby" label corresponds to the second deepest sleep state | ||
| 92 | * available (if any), and the "freeze" label corresponds to the remaining | ||
| 93 | * available sleep state (if there is one). | ||
| 94 | */ | ||
| 95 | static bool relative_states; | ||
| 96 | |||
| 97 | static int __init sleep_states_setup(char *str) | ||
| 98 | { | ||
| 99 | relative_states = !strncmp(str, "1", 1); | ||
| 100 | if (relative_states) { | ||
| 101 | pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE; | ||
| 102 | pm_states[PM_SUSPEND_FREEZE].state = 0; | ||
| 103 | } | ||
| 104 | return 1; | ||
| 105 | } | ||
| 106 | |||
| 107 | __setup("relative_sleep_states=", sleep_states_setup); | ||
| 108 | |||
| 66 | /** | 109 | /** |
| 67 | * suspend_set_ops - Set the global suspend method table. | 110 | * suspend_set_ops - Set the global suspend method table. |
| 68 | * @ops: Suspend operations to use. | 111 | * @ops: Suspend operations to use. |
| 69 | */ | 112 | */ |
| 70 | void suspend_set_ops(const struct platform_suspend_ops *ops) | 113 | void suspend_set_ops(const struct platform_suspend_ops *ops) |
| 71 | { | 114 | { |
| 115 | suspend_state_t i; | ||
| 116 | int j = PM_SUSPEND_MAX - 1; | ||
| 117 | |||
| 72 | lock_system_sleep(); | 118 | lock_system_sleep(); |
| 119 | |||
| 73 | suspend_ops = ops; | 120 | suspend_ops = ops; |
| 121 | for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) | ||
| 122 | if (valid_state(i)) | ||
| 123 | pm_states[j--].state = i; | ||
| 124 | else if (!relative_states) | ||
| 125 | pm_states[j--].state = 0; | ||
| 126 | |||
| 127 | pm_states[j--].state = PM_SUSPEND_FREEZE; | ||
| 128 | while (j >= PM_SUSPEND_MIN) | ||
| 129 | pm_states[j--].state = 0; | ||
| 130 | |||
| 74 | unlock_system_sleep(); | 131 | unlock_system_sleep(); |
| 75 | } | 132 | } |
| 76 | EXPORT_SYMBOL_GPL(suspend_set_ops); | 133 | EXPORT_SYMBOL_GPL(suspend_set_ops); |
| 77 | 134 | ||
| 78 | bool valid_state(suspend_state_t state) | ||
| 79 | { | ||
| 80 | if (state == PM_SUSPEND_FREEZE) { | ||
| 81 | #ifdef CONFIG_PM_DEBUG | ||
| 82 | if (pm_test_level != TEST_NONE && | ||
| 83 | pm_test_level != TEST_FREEZER && | ||
| 84 | pm_test_level != TEST_DEVICES && | ||
| 85 | pm_test_level != TEST_PLATFORM) { | ||
| 86 | printk(KERN_WARNING "Unsupported pm_test mode for " | ||
| 87 | "freeze state, please choose " | ||
| 88 | "none/freezer/devices/platform.\n"); | ||
| 89 | return false; | ||
| 90 | } | ||
| 91 | #endif | ||
| 92 | return true; | ||
| 93 | } | ||
| 94 | /* | ||
| 95 | * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel | ||
| 96 | * support and need to be valid to the lowlevel | ||
| 97 | * implementation, no valid callback implies that none are valid. | ||
| 98 | */ | ||
| 99 | return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); | ||
| 100 | } | ||
| 101 | |||
| 102 | /** | 135 | /** |
| 103 | * suspend_valid_only_mem - Generic memory-only valid callback. | 136 | * suspend_valid_only_mem - Generic memory-only valid callback. |
| 104 | * | 137 | * |
| @@ -266,6 +299,10 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
| 266 | error = suspend_ops->begin(state); | 299 | error = suspend_ops->begin(state); |
| 267 | if (error) | 300 | if (error) |
| 268 | goto Close; | 301 | goto Close; |
| 302 | } else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) { | ||
| 303 | error = freeze_ops->begin(); | ||
| 304 | if (error) | ||
| 305 | goto Close; | ||
| 269 | } | 306 | } |
| 270 | suspend_console(); | 307 | suspend_console(); |
| 271 | suspend_test_start(); | 308 | suspend_test_start(); |
| @@ -291,6 +328,9 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
| 291 | Close: | 328 | Close: |
| 292 | if (need_suspend_ops(state) && suspend_ops->end) | 329 | if (need_suspend_ops(state) && suspend_ops->end) |
| 293 | suspend_ops->end(); | 330 | suspend_ops->end(); |
| 331 | else if (state == PM_SUSPEND_FREEZE && freeze_ops->end) | ||
| 332 | freeze_ops->end(); | ||
| 333 | |||
| 294 | trace_machine_suspend(PWR_EVENT_EXIT); | 334 | trace_machine_suspend(PWR_EVENT_EXIT); |
| 295 | return error; | 335 | return error; |
| 296 | 336 | ||
| @@ -325,9 +365,17 @@ static int enter_state(suspend_state_t state) | |||
| 325 | { | 365 | { |
| 326 | int error; | 366 | int error; |
| 327 | 367 | ||
| 328 | if (!valid_state(state)) | 368 | if (state == PM_SUSPEND_FREEZE) { |
| 329 | return -ENODEV; | 369 | #ifdef CONFIG_PM_DEBUG |
| 330 | 370 | if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { | |
| 371 | pr_warning("PM: Unsupported test mode for freeze state, " | ||
| 372 | "please choose none/freezer/devices/platform.\n"); | ||
| 373 | return -EAGAIN; | ||
| 374 | } | ||
| 375 | #endif | ||
| 376 | } else if (!valid_state(state)) { | ||
| 377 | return -EINVAL; | ||
| 378 | } | ||
| 331 | if (!mutex_trylock(&pm_mutex)) | 379 | if (!mutex_trylock(&pm_mutex)) |
| 332 | return -EBUSY; | 380 | return -EBUSY; |
| 333 | 381 | ||
| @@ -338,7 +386,7 @@ static int enter_state(suspend_state_t state) | |||
| 338 | sys_sync(); | 386 | sys_sync(); |
| 339 | printk("done.\n"); | 387 | printk("done.\n"); |
| 340 | 388 | ||
| 341 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); | 389 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); |
| 342 | error = suspend_prepare(state); | 390 | error = suspend_prepare(state); |
| 343 | if (error) | 391 | if (error) |
| 344 | goto Unlock; | 392 | goto Unlock; |
| @@ -346,7 +394,7 @@ static int enter_state(suspend_state_t state) | |||
| 346 | if (suspend_test(TEST_FREEZER)) | 394 | if (suspend_test(TEST_FREEZER)) |
| 347 | goto Finish; | 395 | goto Finish; |
| 348 | 396 | ||
| 349 | pr_debug("PM: Entering %s sleep\n", pm_states[state]); | 397 | pr_debug("PM: Entering %s sleep\n", pm_states[state].label); |
| 350 | pm_restrict_gfp_mask(); | 398 | pm_restrict_gfp_mask(); |
| 351 | error = suspend_devices_and_enter(state); | 399 | error = suspend_devices_and_enter(state); |
| 352 | pm_restore_gfp_mask(); | 400 | pm_restore_gfp_mask(); |
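suspend_set_ops() now walks the platform's states from deepest to shallowest and fills in pm_states[].state, which is how "relative_sleep_states=1" lets the "mem", "standby" and "freeze" labels slide onto whatever states actually exist. A standalone sketch of that remapping loop; valid_state() is faked here with a bitmask instead of suspend_ops->valid().

#include <stdio.h>

enum {
	PM_SUSPEND_FREEZE = 1,
	PM_SUSPEND_STANDBY,
	PM_SUSPEND_MEM,
	PM_SUSPEND_MAX,
};
#define PM_SUSPEND_MIN PM_SUSPEND_FREEZE

static const char *labels[PM_SUSPEND_MAX] = {
	[PM_SUSPEND_FREEZE]  = "freeze",
	[PM_SUSPEND_STANDBY] = "standby",
	[PM_SUSPEND_MEM]     = "mem",
};
static int pm_state[PM_SUSPEND_MAX];	/* 0 == label not usable */

/* Stand-in for suspend_ops->valid(): bit i set means state i works. */
static int valid_state(int state, unsigned int supported)
{
	return supported & (1u << state);
}

/* Same loop shape as the new suspend_set_ops(). */
static void set_ops(unsigned int supported, int relative_states)
{
	int i, j = PM_SUSPEND_MAX - 1;

	for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
		if (valid_state(i, supported))
			pm_state[j--] = i;
		else if (!relative_states)
			pm_state[j--] = 0;

	pm_state[j--] = PM_SUSPEND_FREEZE;
	while (j >= PM_SUSPEND_MIN)
		pm_state[j--] = 0;
}

int main(void)
{
	int i;

	/* Only "mem" is supported; with relative states the deepest state
	 * keeps the "mem" label, PM_SUSPEND_FREEZE moves up under
	 * "standby", and the "freeze" label is left unused. */
	set_ops(1u << PM_SUSPEND_MEM, 1);
	for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
		printf("%-8s -> state %d\n", labels[i], pm_state[i]);
	return 0;
}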
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 9b2a1d58558d..269b097e78ea 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c | |||
| @@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) | |||
| 92 | } | 92 | } |
| 93 | 93 | ||
| 94 | if (state == PM_SUSPEND_MEM) { | 94 | if (state == PM_SUSPEND_MEM) { |
| 95 | printk(info_test, pm_states[state]); | 95 | printk(info_test, pm_states[state].label); |
| 96 | status = pm_suspend(state); | 96 | status = pm_suspend(state); |
| 97 | if (status == -ENODEV) | 97 | if (status == -ENODEV) |
| 98 | state = PM_SUSPEND_STANDBY; | 98 | state = PM_SUSPEND_STANDBY; |
| 99 | } | 99 | } |
| 100 | if (state == PM_SUSPEND_STANDBY) { | 100 | if (state == PM_SUSPEND_STANDBY) { |
| 101 | printk(info_test, pm_states[state]); | 101 | printk(info_test, pm_states[state].label); |
| 102 | status = pm_suspend(state); | 102 | status = pm_suspend(state); |
| 103 | } | 103 | } |
| 104 | if (status < 0) | 104 | if (status < 0) |
| @@ -136,18 +136,16 @@ static char warn_bad_state[] __initdata = | |||
| 136 | 136 | ||
| 137 | static int __init setup_test_suspend(char *value) | 137 | static int __init setup_test_suspend(char *value) |
| 138 | { | 138 | { |
| 139 | unsigned i; | 139 | suspend_state_t i; |
| 140 | 140 | ||
| 141 | /* "=mem" ==> "mem" */ | 141 | /* "=mem" ==> "mem" */ |
| 142 | value++; | 142 | value++; |
| 143 | for (i = 0; i < PM_SUSPEND_MAX; i++) { | 143 | for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) |
| 144 | if (!pm_states[i]) | 144 | if (!strcmp(pm_states[i].label, value)) { |
| 145 | continue; | 145 | test_state = pm_states[i].state; |
| 146 | if (strcmp(pm_states[i], value) != 0) | 146 | return 0; |
| 147 | continue; | 147 | } |
| 148 | test_state = (__force suspend_state_t) i; | 148 | |
| 149 | return 0; | ||
| 150 | } | ||
| 151 | printk(warn_bad_state, value); | 149 | printk(warn_bad_state, value); |
| 152 | return 0; | 150 | return 0; |
| 153 | } | 151 | } |
| @@ -164,8 +162,8 @@ static int __init test_suspend(void) | |||
| 164 | /* PM is initialized by now; is that state testable? */ | 162 | /* PM is initialized by now; is that state testable? */ |
| 165 | if (test_state == PM_SUSPEND_ON) | 163 | if (test_state == PM_SUSPEND_ON) |
| 166 | goto done; | 164 | goto done; |
| 167 | if (!valid_state(test_state)) { | 165 | if (!pm_states[test_state].state) { |
| 168 | printk(warn_bad_state, pm_states[test_state]); | 166 | printk(warn_bad_state, pm_states[test_state].label); |
| 169 | goto done; | 167 | goto done; |
| 170 | } | 168 | } |
| 171 | 169 | ||
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8c9a4819f798..aaa3261dea5d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
| @@ -567,7 +567,7 @@ static int lzo_compress_threadfn(void *data) | |||
| 567 | 567 | ||
| 568 | /** | 568 | /** |
| 569 | * save_image_lzo - Save the suspend image data compressed with LZO. | 569 | * save_image_lzo - Save the suspend image data compressed with LZO. |
| 570 | * @handle: Swap mam handle to use for saving the image. | 570 | * @handle: Swap map handle to use for saving the image. |
| 571 | * @snapshot: Image to read data from. | 571 | * @snapshot: Image to read data from. |
| 572 | * @nr_to_write: Number of pages to save. | 572 | * @nr_to_write: Number of pages to save. |
| 573 | */ | 573 | */ |
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a45b50962295..ea2d5f6962ed 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
| @@ -54,20 +54,16 @@ | |||
| 54 | #include "console_cmdline.h" | 54 | #include "console_cmdline.h" |
| 55 | #include "braille.h" | 55 | #include "braille.h" |
| 56 | 56 | ||
| 57 | /* printk's without a loglevel use this.. */ | ||
| 58 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL | ||
| 59 | |||
| 60 | /* We show everything that is MORE important than this.. */ | ||
| 61 | #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ | ||
| 62 | #define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ | ||
| 63 | |||
| 64 | int console_printk[4] = { | 57 | int console_printk[4] = { |
| 65 | DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ | 58 | CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ |
| 66 | DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ | 59 | DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ |
| 67 | MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ | 60 | CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ |
| 68 | DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ | 61 | CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ |
| 69 | }; | 62 | }; |
| 70 | 63 | ||
| 64 | /* Deferred messaged from sched code are marked by this special level */ | ||
| 65 | #define SCHED_MESSAGE_LOGLEVEL -2 | ||
| 66 | |||
| 71 | /* | 67 | /* |
| 72 | * Low level drivers may need that to know if they can schedule in | 68 | * Low level drivers may need that to know if they can schedule in |
| 73 | * their unblank() callback or not. So let's export it. | 69 | * their unblank() callback or not. So let's export it. |
| @@ -91,6 +87,29 @@ static struct lockdep_map console_lock_dep_map = { | |||
| 91 | #endif | 87 | #endif |
| 92 | 88 | ||
| 93 | /* | 89 | /* |
| 90 | * Helper macros to handle lockdep when locking/unlocking console_sem. We use | ||
| 91 | * macros instead of functions so that _RET_IP_ contains useful information. | ||
| 92 | */ | ||
| 93 | #define down_console_sem() do { \ | ||
| 94 | down(&console_sem);\ | ||
| 95 | mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\ | ||
| 96 | } while (0) | ||
| 97 | |||
| 98 | static int __down_trylock_console_sem(unsigned long ip) | ||
| 99 | { | ||
| 100 | if (down_trylock(&console_sem)) | ||
| 101 | return 1; | ||
| 102 | mutex_acquire(&console_lock_dep_map, 0, 1, ip); | ||
| 103 | return 0; | ||
| 104 | } | ||
| 105 | #define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) | ||
| 106 | |||
| 107 | #define up_console_sem() do { \ | ||
| 108 | mutex_release(&console_lock_dep_map, 1, _RET_IP_);\ | ||
| 109 | up(&console_sem);\ | ||
| 110 | } while (0) | ||
| 111 | |||
| 112 | /* | ||
| 94 | * This is used for debugging the mess that is the VT code by | 113 | * This is used for debugging the mess that is the VT code by |
| 95 | * keeping track if we have the console semaphore held. It's | 114 | * keeping track if we have the console semaphore held. It's |
| 96 | * definitely not the perfect debug tool (we don't know if _WE_ | 115 | * definitely not the perfect debug tool (we don't know if _WE_ |
| @@ -206,8 +225,9 @@ struct printk_log { | |||
| 206 | }; | 225 | }; |
| 207 | 226 | ||
| 208 | /* | 227 | /* |
| 209 | * The logbuf_lock protects kmsg buffer, indices, counters. It is also | 228 | * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken |
| 210 | * used in interesting ways to provide interlocking in console_unlock(); | 229 | * within the scheduler's rq lock. It must be released before calling |
| 230 | * console_unlock() or anything else that might wake up a process. | ||
| 211 | */ | 231 | */ |
| 212 | static DEFINE_RAW_SPINLOCK(logbuf_lock); | 232 | static DEFINE_RAW_SPINLOCK(logbuf_lock); |
| 213 | 233 | ||
| @@ -250,9 +270,6 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); | |||
| 250 | static char *log_buf = __log_buf; | 270 | static char *log_buf = __log_buf; |
| 251 | static u32 log_buf_len = __LOG_BUF_LEN; | 271 | static u32 log_buf_len = __LOG_BUF_LEN; |
| 252 | 272 | ||
| 253 | /* cpu currently holding logbuf_lock */ | ||
| 254 | static volatile unsigned int logbuf_cpu = UINT_MAX; | ||
| 255 | |||
| 256 | /* human readable text of the record */ | 273 | /* human readable text of the record */ |
| 257 | static char *log_text(const struct printk_log *msg) | 274 | static char *log_text(const struct printk_log *msg) |
| 258 | { | 275 | { |
| @@ -297,34 +314,106 @@ static u32 log_next(u32 idx) | |||
| 297 | return idx + msg->len; | 314 | return idx + msg->len; |
| 298 | } | 315 | } |
| 299 | 316 | ||
| 300 | /* insert record into the buffer, discard old ones, update heads */ | 317 | /* |
| 301 | static void log_store(int facility, int level, | 318 | * Check whether there is enough free space for the given message. |
| 302 | enum log_flags flags, u64 ts_nsec, | 319 | * |
| 303 | const char *dict, u16 dict_len, | 320 | * The same values of first_idx and next_idx mean that the buffer |
| 304 | const char *text, u16 text_len) | 321 | * is either empty or full. |
| 322 | * | ||
| 323 | * If the buffer is empty, we must respect the position of the indexes. | ||
| 324 | * They cannot be reset to the beginning of the buffer. | ||
| 325 | */ | ||
| 326 | static int logbuf_has_space(u32 msg_size, bool empty) | ||
| 305 | { | 327 | { |
| 306 | struct printk_log *msg; | 328 | u32 free; |
| 307 | u32 size, pad_len; | ||
| 308 | 329 | ||
| 309 | /* number of '\0' padding bytes to next message */ | 330 | if (log_next_idx > log_first_idx || empty) |
| 310 | size = sizeof(struct printk_log) + text_len + dict_len; | 331 | free = max(log_buf_len - log_next_idx, log_first_idx); |
| 311 | pad_len = (-size) & (LOG_ALIGN - 1); | 332 | else |
| 312 | size += pad_len; | 333 | free = log_first_idx - log_next_idx; |
| 334 | |||
| 335 | /* | ||
| 336 | * We also need space for an empty header that signals wrapping | ||
| 337 | * of the buffer. | ||
| 338 | */ | ||
| 339 | return free >= msg_size + sizeof(struct printk_log); | ||
| 340 | } | ||
| 313 | 341 | ||
| 342 | static int log_make_free_space(u32 msg_size) | ||
| 343 | { | ||
| 314 | while (log_first_seq < log_next_seq) { | 344 | while (log_first_seq < log_next_seq) { |
| 315 | u32 free; | 345 | if (logbuf_has_space(msg_size, false)) |
| 346 | return 0; | ||
| 347 | /* drop old messages until we have enough continuous space */ | ||
| 348 | log_first_idx = log_next(log_first_idx); | ||
| 349 | log_first_seq++; | ||
| 350 | } | ||
| 316 | 351 | ||
| 317 | if (log_next_idx > log_first_idx) | 352 | /* sequence numbers are equal, so the log buffer is empty */ |
| 318 | free = max(log_buf_len - log_next_idx, log_first_idx); | 353 | if (logbuf_has_space(msg_size, true)) |
| 319 | else | 354 | return 0; |
| 320 | free = log_first_idx - log_next_idx; | ||
| 321 | 355 | ||
| 322 | if (free >= size + sizeof(struct printk_log)) | 356 | return -ENOMEM; |
| 323 | break; | 357 | } |
| 324 | 358 | ||
| 325 | /* drop old messages until we have enough contiuous space */ | 359 | /* compute the message size including the padding bytes */ |
| 326 | log_first_idx = log_next(log_first_idx); | 360 | static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) |
| 327 | log_first_seq++; | 361 | { |
| 362 | u32 size; | ||
| 363 | |||
| 364 | size = sizeof(struct printk_log) + text_len + dict_len; | ||
| 365 | *pad_len = (-size) & (LOG_ALIGN - 1); | ||
| 366 | size += *pad_len; | ||
| 367 | |||
| 368 | return size; | ||
| 369 | } | ||
| 370 | |||
| 371 | /* | ||
| 372 | * Define how much of the log buffer we could take at maximum. The value | ||
| 373 | * must be greater than two. Note that only half of the buffer is available | ||
| 374 | * when the index points to the middle. | ||
| 375 | */ | ||
| 376 | #define MAX_LOG_TAKE_PART 4 | ||
| 377 | static const char trunc_msg[] = "<truncated>"; | ||
| 378 | |||
| 379 | static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, | ||
| 380 | u16 *dict_len, u32 *pad_len) | ||
| 381 | { | ||
| 382 | /* | ||
| 383 | * The message should not take the whole buffer. Otherwise, it might | ||
| 384 | * get removed too soon. | ||
| 385 | */ | ||
| 386 | u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; | ||
| 387 | if (*text_len > max_text_len) | ||
| 388 | *text_len = max_text_len; | ||
| 389 | /* enable the warning message */ | ||
| 390 | *trunc_msg_len = strlen(trunc_msg); | ||
| 391 | /* disable the "dict" completely */ | ||
| 392 | *dict_len = 0; | ||
| 393 | /* compute the size again, count also the warning message */ | ||
| 394 | return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len); | ||
| 395 | } | ||
| 396 | |||
| 397 | /* insert record into the buffer, discard old ones, update heads */ | ||
| 398 | static int log_store(int facility, int level, | ||
| 399 | enum log_flags flags, u64 ts_nsec, | ||
| 400 | const char *dict, u16 dict_len, | ||
| 401 | const char *text, u16 text_len) | ||
| 402 | { | ||
| 403 | struct printk_log *msg; | ||
| 404 | u32 size, pad_len; | ||
| 405 | u16 trunc_msg_len = 0; | ||
| 406 | |||
| 407 | /* number of '\0' padding bytes to next message */ | ||
| 408 | size = msg_used_size(text_len, dict_len, &pad_len); | ||
| 409 | |||
| 410 | if (log_make_free_space(size)) { | ||
| 411 | /* truncate the message if it is too long for empty buffer */ | ||
| 412 | size = truncate_msg(&text_len, &trunc_msg_len, | ||
| 413 | &dict_len, &pad_len); | ||
| 414 | /* survive when the log buffer is too small for trunc_msg */ | ||
| 415 | if (log_make_free_space(size)) | ||
| 416 | return 0; | ||
| 328 | } | 417 | } |
| 329 | 418 | ||
| 330 | if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { | 419 | if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { |
| @@ -341,6 +430,10 @@ static void log_store(int facility, int level, | |||
| 341 | msg = (struct printk_log *)(log_buf + log_next_idx); | 430 | msg = (struct printk_log *)(log_buf + log_next_idx); |
| 342 | memcpy(log_text(msg), text, text_len); | 431 | memcpy(log_text(msg), text, text_len); |
| 343 | msg->text_len = text_len; | 432 | msg->text_len = text_len; |
| 433 | if (trunc_msg_len) { | ||
| 434 | memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len); | ||
| 435 | msg->text_len += trunc_msg_len; | ||
| 436 | } | ||
| 344 | memcpy(log_dict(msg), dict, dict_len); | 437 | memcpy(log_dict(msg), dict, dict_len); |
| 345 | msg->dict_len = dict_len; | 438 | msg->dict_len = dict_len; |
| 346 | msg->facility = facility; | 439 | msg->facility = facility; |
| @@ -356,6 +449,8 @@ static void log_store(int facility, int level, | |||
| 356 | /* insert message */ | 449 | /* insert message */ |
| 357 | log_next_idx += msg->len; | 450 | log_next_idx += msg->len; |
| 358 | log_next_seq++; | 451 | log_next_seq++; |
| 452 | |||
| 453 | return msg->text_len; | ||
| 359 | } | 454 | } |
| 360 | 455 | ||
| 361 | #ifdef CONFIG_SECURITY_DMESG_RESTRICT | 456 | #ifdef CONFIG_SECURITY_DMESG_RESTRICT |
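log_store() now delegates its space handling: msg_used_size() pads a record to LOG_ALIGN, logbuf_has_space() treats equal head and tail indices as either empty or full depending on the caller, and truncate_msg() caps an oversized message at a fraction of the buffer. A compact userspace model of the free-space test; LOG_ALIGN, LOG_BUF_LEN and HDR_SIZE are illustrative constants, with HDR_SIZE standing in for sizeof(struct printk_log).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LOG_ALIGN	4u
#define LOG_BUF_LEN	128u
#define HDR_SIZE	16u	/* stand-in for sizeof(struct printk_log) */

static uint32_t log_first_idx, log_next_idx;

/* Padded record size, as in msg_used_size(). */
static uint32_t msg_used_size(uint32_t text_len)
{
	uint32_t size = HDR_SIZE + text_len;

	return size + ((-size) & (LOG_ALIGN - 1));
}

/* Free-space check, as in logbuf_has_space(): when head equals tail the
 * buffer may be empty or full, so the caller has to say which. */
static bool logbuf_has_space(uint32_t msg_size, bool empty)
{
	uint32_t free;

	if (log_next_idx > log_first_idx || empty)
		free = (LOG_BUF_LEN - log_next_idx > log_first_idx) ?
			LOG_BUF_LEN - log_next_idx : log_first_idx;
	else
		free = log_first_idx - log_next_idx;

	/* Keep room for the empty header that marks a wrap-around. */
	return free >= msg_size + HDR_SIZE;
}

int main(void)
{
	uint32_t size = msg_used_size(10);	/* 26 bytes padded to 28 */

	log_first_idx = 60;
	log_next_idx = 100;	/* 28 bytes free at the end, 60 at the front */
	printf("size=%u fits=%d\n", size, logbuf_has_space(size, false));

	log_first_idx = log_next_idx = 60;	/* equal indices, buffer full */
	printf("size=%u fits=%d\n", size, logbuf_has_space(size, false));
	return 0;
}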
| @@ -1303,7 +1398,10 @@ static void zap_locks(void) | |||
| 1303 | sema_init(&console_sem, 1); | 1398 | sema_init(&console_sem, 1); |
| 1304 | } | 1399 | } |
| 1305 | 1400 | ||
| 1306 | /* Check if we have any console registered that can be called early in boot. */ | 1401 | /* |
| 1402 | * Check if we have any console that is capable of printing while cpu is | ||
| 1403 | * booting or shutting down. Requires console_sem. | ||
| 1404 | */ | ||
| 1307 | static int have_callable_console(void) | 1405 | static int have_callable_console(void) |
| 1308 | { | 1406 | { |
| 1309 | struct console *con; | 1407 | struct console *con; |
| @@ -1318,10 +1416,9 @@ static int have_callable_console(void) | |||
| 1318 | /* | 1416 | /* |
| 1319 | * Can we actually use the console at this time on this cpu? | 1417 | * Can we actually use the console at this time on this cpu? |
| 1320 | * | 1418 | * |
| 1321 | * Console drivers may assume that per-cpu resources have | 1419 | * Console drivers may assume that per-cpu resources have been allocated. So |
| 1322 | * been allocated. So unless they're explicitly marked as | 1420 | * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't |
| 1323 | * being able to cope (CON_ANYTIME) don't call them until | 1421 | * call them until this CPU is officially up. |
| 1324 | * this CPU is officially up. | ||
| 1325 | */ | 1422 | */ |
| 1326 | static inline int can_use_console(unsigned int cpu) | 1423 | static inline int can_use_console(unsigned int cpu) |
| 1327 | { | 1424 | { |
| @@ -1333,36 +1430,24 @@ static inline int can_use_console(unsigned int cpu) | |||
| 1333 | * messages from a 'printk'. Return true (and with the | 1430 | * messages from a 'printk'. Return true (and with the |
| 1334 | * console_lock held, and 'console_locked' set) if it | 1431 | * console_lock held, and 'console_locked' set) if it |
| 1335 | * is successful, false otherwise. | 1432 | * is successful, false otherwise. |
| 1336 | * | ||
| 1337 | * This gets called with the 'logbuf_lock' spinlock held and | ||
| 1338 | * interrupts disabled. It should return with 'lockbuf_lock' | ||
| 1339 | * released but interrupts still disabled. | ||
| 1340 | */ | 1433 | */ |
| 1341 | static int console_trylock_for_printk(unsigned int cpu) | 1434 | static int console_trylock_for_printk(void) |
| 1342 | __releases(&logbuf_lock) | ||
| 1343 | { | 1435 | { |
| 1344 | int retval = 0, wake = 0; | 1436 | unsigned int cpu = smp_processor_id(); |
| 1345 | 1437 | ||
| 1346 | if (console_trylock()) { | 1438 | if (!console_trylock()) |
| 1347 | retval = 1; | 1439 | return 0; |
| 1348 | 1440 | /* | |
| 1349 | /* | 1441 | * If we can't use the console, we need to release the console |
| 1350 | * If we can't use the console, we need to release | 1442 | * semaphore by hand to avoid flushing the buffer. We need to hold the |
| 1351 | * the console semaphore by hand to avoid flushing | 1443 | * console semaphore in order to do this test safely. |
| 1352 | * the buffer. We need to hold the console semaphore | 1444 | */ |
| 1353 | * in order to do this test safely. | 1445 | if (!can_use_console(cpu)) { |
| 1354 | */ | 1446 | console_locked = 0; |
| 1355 | if (!can_use_console(cpu)) { | 1447 | up_console_sem(); |
| 1356 | console_locked = 0; | 1448 | return 0; |
| 1357 | wake = 1; | ||
| 1358 | retval = 0; | ||
| 1359 | } | ||
| 1360 | } | 1449 | } |
| 1361 | logbuf_cpu = UINT_MAX; | 1450 | return 1; |
| 1362 | raw_spin_unlock(&logbuf_lock); | ||
| 1363 | if (wake) | ||
| 1364 | up(&console_sem); | ||
| 1365 | return retval; | ||
| 1366 | } | 1451 | } |
| 1367 | 1452 | ||
| 1368 | int printk_delay_msec __read_mostly; | 1453 | int printk_delay_msec __read_mostly; |
| @@ -1490,11 +1575,19 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1490 | static int recursion_bug; | 1575 | static int recursion_bug; |
| 1491 | static char textbuf[LOG_LINE_MAX]; | 1576 | static char textbuf[LOG_LINE_MAX]; |
| 1492 | char *text = textbuf; | 1577 | char *text = textbuf; |
| 1493 | size_t text_len; | 1578 | size_t text_len = 0; |
| 1494 | enum log_flags lflags = 0; | 1579 | enum log_flags lflags = 0; |
| 1495 | unsigned long flags; | 1580 | unsigned long flags; |
| 1496 | int this_cpu; | 1581 | int this_cpu; |
| 1497 | int printed_len = 0; | 1582 | int printed_len = 0; |
| 1583 | bool in_sched = false; | ||
| 1584 | /* cpu currently holding logbuf_lock in this function */ | ||
| 1585 | static volatile unsigned int logbuf_cpu = UINT_MAX; | ||
| 1586 | |||
| 1587 | if (level == SCHED_MESSAGE_LOGLEVEL) { | ||
| 1588 | level = -1; | ||
| 1589 | in_sched = true; | ||
| 1590 | } | ||
| 1498 | 1591 | ||
| 1499 | boot_delay_msec(level); | 1592 | boot_delay_msec(level); |
| 1500 | printk_delay(); | 1593 | printk_delay(); |
| @@ -1516,7 +1609,8 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1516 | */ | 1609 | */ |
| 1517 | if (!oops_in_progress && !lockdep_recursing(current)) { | 1610 | if (!oops_in_progress && !lockdep_recursing(current)) { |
| 1518 | recursion_bug = 1; | 1611 | recursion_bug = 1; |
| 1519 | goto out_restore_irqs; | 1612 | local_irq_restore(flags); |
| 1613 | return 0; | ||
| 1520 | } | 1614 | } |
| 1521 | zap_locks(); | 1615 | zap_locks(); |
| 1522 | } | 1616 | } |
| @@ -1530,17 +1624,22 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1530 | "BUG: recent printk recursion!"; | 1624 | "BUG: recent printk recursion!"; |
| 1531 | 1625 | ||
| 1532 | recursion_bug = 0; | 1626 | recursion_bug = 0; |
| 1533 | printed_len += strlen(recursion_msg); | 1627 | text_len = strlen(recursion_msg); |
| 1534 | /* emit KERN_CRIT message */ | 1628 | /* emit KERN_CRIT message */ |
| 1535 | log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, | 1629 | printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, |
| 1536 | NULL, 0, recursion_msg, printed_len); | 1630 | NULL, 0, recursion_msg, text_len); |
| 1537 | } | 1631 | } |
| 1538 | 1632 | ||
| 1539 | /* | 1633 | /* |
| 1540 | * The printf needs to come first; we need the syslog | 1634 | * The printf needs to come first; we need the syslog |
| 1541 | * prefix which might be passed-in as a parameter. | 1635 | * prefix which might be passed-in as a parameter. |
| 1542 | */ | 1636 | */ |
| 1543 | text_len = vscnprintf(text, sizeof(textbuf), fmt, args); | 1637 | if (in_sched) |
| 1638 | text_len = scnprintf(text, sizeof(textbuf), | ||
| 1639 | KERN_WARNING "[sched_delayed] "); | ||
| 1640 | |||
| 1641 | text_len += vscnprintf(text + text_len, | ||
| 1642 | sizeof(textbuf) - text_len, fmt, args); | ||
| 1544 | 1643 | ||
| 1545 | /* mark and strip a trailing newline */ | 1644 | /* mark and strip a trailing newline */ |
| 1546 | if (text_len && text[text_len-1] == '\n') { | 1645 | if (text_len && text[text_len-1] == '\n') { |
| @@ -1586,9 +1685,12 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1586 | cont_flush(LOG_NEWLINE); | 1685 | cont_flush(LOG_NEWLINE); |
| 1587 | 1686 | ||
| 1588 | /* buffer line if possible, otherwise store it right away */ | 1687 | /* buffer line if possible, otherwise store it right away */ |
| 1589 | if (!cont_add(facility, level, text, text_len)) | 1688 | if (cont_add(facility, level, text, text_len)) |
| 1590 | log_store(facility, level, lflags | LOG_CONT, 0, | 1689 | printed_len += text_len; |
| 1591 | dict, dictlen, text, text_len); | 1690 | else |
| 1691 | printed_len += log_store(facility, level, | ||
| 1692 | lflags | LOG_CONT, 0, | ||
| 1693 | dict, dictlen, text, text_len); | ||
| 1592 | } else { | 1694 | } else { |
| 1593 | bool stored = false; | 1695 | bool stored = false; |
| 1594 | 1696 | ||
| @@ -1607,26 +1709,35 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1607 | cont_flush(LOG_NEWLINE); | 1709 | cont_flush(LOG_NEWLINE); |
| 1608 | } | 1710 | } |
| 1609 | 1711 | ||
| 1610 | if (!stored) | 1712 | if (stored) |
| 1611 | log_store(facility, level, lflags, 0, | 1713 | printed_len += text_len; |
| 1612 | dict, dictlen, text, text_len); | 1714 | else |
| 1715 | printed_len += log_store(facility, level, lflags, 0, | ||
| 1716 | dict, dictlen, text, text_len); | ||
| 1613 | } | 1717 | } |
| 1614 | printed_len += text_len; | 1718 | |
| 1719 | logbuf_cpu = UINT_MAX; | ||
| 1720 | raw_spin_unlock(&logbuf_lock); | ||
| 1721 | lockdep_on(); | ||
| 1722 | local_irq_restore(flags); | ||
| 1723 | |||
| 1724 | /* If called from the scheduler, we can not call up(). */ | ||
| 1725 | if (in_sched) | ||
| 1726 | return printed_len; | ||
| 1615 | 1727 | ||
| 1616 | /* | 1728 | /* |
| 1729 | * Disable preemption to avoid being preempted while holding | ||
| 1730 | * console_sem which would prevent anyone from printing to console | ||
| 1731 | */ | ||
| 1732 | preempt_disable(); | ||
| 1733 | /* | ||
| 1617 | * Try to acquire and then immediately release the console semaphore. | 1734 | * Try to acquire and then immediately release the console semaphore. |
| 1618 | * The release will print out buffers and wake up /dev/kmsg and syslog() | 1735 | * The release will print out buffers and wake up /dev/kmsg and syslog() |
| 1619 | * users. | 1736 | * users. |
| 1620 | * | ||
| 1621 | * The console_trylock_for_printk() function will release 'logbuf_lock' | ||
| 1622 | * regardless of whether it actually gets the console semaphore or not. | ||
| 1623 | */ | 1737 | */ |
| 1624 | if (console_trylock_for_printk(this_cpu)) | 1738 | if (console_trylock_for_printk()) |
| 1625 | console_unlock(); | 1739 | console_unlock(); |
| 1626 | 1740 | preempt_enable(); | |
| 1627 | lockdep_on(); | ||
| 1628 | out_restore_irqs: | ||
| 1629 | local_irq_restore(flags); | ||
| 1630 | 1741 | ||
| 1631 | return printed_len; | 1742 | return printed_len; |
| 1632 | } | 1743 | } |
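The vprintk_emit() rework above separates storing a message from flushing the console: logbuf_lock is dropped before console_sem is taken, and scheduler-originated messages (level SCHED_MESSAGE_LOGLEVEL) are only stored, tagged "[sched_delayed]", and flushed later from irq_work. A rough model of that store-now/flush-later split; the helpers here are placeholders, not the kernel functions.

#include <stdbool.h>
#include <stdio.h>

#define SCHED_MESSAGE_LOGLEVEL	(-2)

static bool printk_pending_output;	/* models PRINTK_PENDING_OUTPUT */

static void log_store(const char *prefix, const char *text)
{
	printf("stored: %s%s\n", prefix, text);
}

static void console_flush(void)
{
	puts("console flushed");
}

/* Scheduler-context messages are stored with a "[sched_delayed]" prefix
 * and flushed later; everything else tries to flush right away. */
static void emit(int level, const char *text)
{
	bool in_sched = (level == SCHED_MESSAGE_LOGLEVEL);

	log_store(in_sched ? "[sched_delayed] " : "", text);

	if (in_sched) {
		printk_pending_output = true;	/* irq_work runs later */
		return;
	}
	console_flush();
}

/* Models wake_up_klogd_work_func(): the deferred flush. */
static void irq_work_run(void)
{
	if (printk_pending_output) {
		printk_pending_output = false;
		console_flush();
	}
}

int main(void)
{
	emit(4, "ordinary message");
	emit(SCHED_MESSAGE_LOGLEVEL, "message from scheduler context");
	irq_work_run();
	return 0;
}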
| @@ -1674,7 +1785,7 @@ EXPORT_SYMBOL(printk_emit); | |||
| 1674 | * | 1785 | * |
| 1675 | * See the vsnprintf() documentation for format string extensions over C99. | 1786 | * See the vsnprintf() documentation for format string extensions over C99. |
| 1676 | */ | 1787 | */ |
| 1677 | asmlinkage int printk(const char *fmt, ...) | 1788 | asmlinkage __visible int printk(const char *fmt, ...) |
| 1678 | { | 1789 | { |
| 1679 | va_list args; | 1790 | va_list args; |
| 1680 | int r; | 1791 | int r; |
| @@ -1737,7 +1848,7 @@ void early_vprintk(const char *fmt, va_list ap) | |||
| 1737 | } | 1848 | } |
| 1738 | } | 1849 | } |
| 1739 | 1850 | ||
| 1740 | asmlinkage void early_printk(const char *fmt, ...) | 1851 | asmlinkage __visible void early_printk(const char *fmt, ...) |
| 1741 | { | 1852 | { |
| 1742 | va_list ap; | 1853 | va_list ap; |
| 1743 | 1854 | ||
| @@ -1882,16 +1993,14 @@ void suspend_console(void) | |||
| 1882 | printk("Suspending console(s) (use no_console_suspend to debug)\n"); | 1993 | printk("Suspending console(s) (use no_console_suspend to debug)\n"); |
| 1883 | console_lock(); | 1994 | console_lock(); |
| 1884 | console_suspended = 1; | 1995 | console_suspended = 1; |
| 1885 | up(&console_sem); | 1996 | up_console_sem(); |
| 1886 | mutex_release(&console_lock_dep_map, 1, _RET_IP_); | ||
| 1887 | } | 1997 | } |
| 1888 | 1998 | ||
| 1889 | void resume_console(void) | 1999 | void resume_console(void) |
| 1890 | { | 2000 | { |
| 1891 | if (!console_suspend_enabled) | 2001 | if (!console_suspend_enabled) |
| 1892 | return; | 2002 | return; |
| 1893 | down(&console_sem); | 2003 | down_console_sem(); |
| 1894 | mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); | ||
| 1895 | console_suspended = 0; | 2004 | console_suspended = 0; |
| 1896 | console_unlock(); | 2005 | console_unlock(); |
| 1897 | } | 2006 | } |
| @@ -1933,12 +2042,11 @@ void console_lock(void) | |||
| 1933 | { | 2042 | { |
| 1934 | might_sleep(); | 2043 | might_sleep(); |
| 1935 | 2044 | ||
| 1936 | down(&console_sem); | 2045 | down_console_sem(); |
| 1937 | if (console_suspended) | 2046 | if (console_suspended) |
| 1938 | return; | 2047 | return; |
| 1939 | console_locked = 1; | 2048 | console_locked = 1; |
| 1940 | console_may_schedule = 1; | 2049 | console_may_schedule = 1; |
| 1941 | mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); | ||
| 1942 | } | 2050 | } |
| 1943 | EXPORT_SYMBOL(console_lock); | 2051 | EXPORT_SYMBOL(console_lock); |
| 1944 | 2052 | ||
| @@ -1952,15 +2060,14 @@ EXPORT_SYMBOL(console_lock); | |||
| 1952 | */ | 2060 | */ |
| 1953 | int console_trylock(void) | 2061 | int console_trylock(void) |
| 1954 | { | 2062 | { |
| 1955 | if (down_trylock(&console_sem)) | 2063 | if (down_trylock_console_sem()) |
| 1956 | return 0; | 2064 | return 0; |
| 1957 | if (console_suspended) { | 2065 | if (console_suspended) { |
| 1958 | up(&console_sem); | 2066 | up_console_sem(); |
| 1959 | return 0; | 2067 | return 0; |
| 1960 | } | 2068 | } |
| 1961 | console_locked = 1; | 2069 | console_locked = 1; |
| 1962 | console_may_schedule = 0; | 2070 | console_may_schedule = 0; |
| 1963 | mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); | ||
| 1964 | return 1; | 2071 | return 1; |
| 1965 | } | 2072 | } |
| 1966 | EXPORT_SYMBOL(console_trylock); | 2073 | EXPORT_SYMBOL(console_trylock); |
| @@ -2022,7 +2129,7 @@ void console_unlock(void) | |||
| 2022 | bool retry; | 2129 | bool retry; |
| 2023 | 2130 | ||
| 2024 | if (console_suspended) { | 2131 | if (console_suspended) { |
| 2025 | up(&console_sem); | 2132 | up_console_sem(); |
| 2026 | return; | 2133 | return; |
| 2027 | } | 2134 | } |
| 2028 | 2135 | ||
| @@ -2043,10 +2150,15 @@ again: | |||
| 2043 | } | 2150 | } |
| 2044 | 2151 | ||
| 2045 | if (console_seq < log_first_seq) { | 2152 | if (console_seq < log_first_seq) { |
| 2153 | len = sprintf(text, "** %u printk messages dropped ** ", | ||
| 2154 | (unsigned)(log_first_seq - console_seq)); | ||
| 2155 | |||
| 2046 | /* messages are gone, move to first one */ | 2156 | /* messages are gone, move to first one */ |
| 2047 | console_seq = log_first_seq; | 2157 | console_seq = log_first_seq; |
| 2048 | console_idx = log_first_idx; | 2158 | console_idx = log_first_idx; |
| 2049 | console_prev = 0; | 2159 | console_prev = 0; |
| 2160 | } else { | ||
| 2161 | len = 0; | ||
| 2050 | } | 2162 | } |
| 2051 | skip: | 2163 | skip: |
| 2052 | if (console_seq == log_next_seq) | 2164 | if (console_seq == log_next_seq) |
| @@ -2071,8 +2183,8 @@ skip: | |||
| 2071 | } | 2183 | } |
| 2072 | 2184 | ||
| 2073 | level = msg->level; | 2185 | level = msg->level; |
| 2074 | len = msg_print_text(msg, console_prev, false, | 2186 | len += msg_print_text(msg, console_prev, false, |
| 2075 | text, sizeof(text)); | 2187 | text + len, sizeof(text) - len); |
| 2076 | console_idx = log_next(console_idx); | 2188 | console_idx = log_next(console_idx); |
| 2077 | console_seq++; | 2189 | console_seq++; |
| 2078 | console_prev = msg->flags; | 2190 | console_prev = msg->flags; |
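console_unlock() now writes a "** N printk messages dropped **" notice into the same text buffer before appending the next record, so the notice and the record reach the console in one write. A small sketch of that buffer packing; snprintf() stands in for msg_print_text().

#include <stdio.h>

int main(void)
{
	char text[128];
	unsigned long log_first_seq = 57, console_seq = 42;
	size_t len = 0;

	/* If records were overwritten before we got to them, note how
	 * many were lost and catch up to the oldest surviving one. */
	if (console_seq < log_first_seq) {
		len = sprintf(text, "** %u printk messages dropped ** ",
			      (unsigned)(log_first_seq - console_seq));
		console_seq = log_first_seq;
	}

	/* Append the next record after the notice, bounded by the
	 * remaining space (msg_print_text() in the kernel). */
	len += snprintf(text + len, sizeof(text) - len,
			"example record text\n");

	fputs(text, stdout);
	return 0;
}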
| @@ -2084,7 +2196,6 @@ skip: | |||
| 2084 | local_irq_restore(flags); | 2196 | local_irq_restore(flags); |
| 2085 | } | 2197 | } |
| 2086 | console_locked = 0; | 2198 | console_locked = 0; |
| 2087 | mutex_release(&console_lock_dep_map, 1, _RET_IP_); | ||
| 2088 | 2199 | ||
| 2089 | /* Release the exclusive_console once it is used */ | 2200 | /* Release the exclusive_console once it is used */ |
| 2090 | if (unlikely(exclusive_console)) | 2201 | if (unlikely(exclusive_console)) |
| @@ -2092,7 +2203,7 @@ skip: | |||
| 2092 | 2203 | ||
| 2093 | raw_spin_unlock(&logbuf_lock); | 2204 | raw_spin_unlock(&logbuf_lock); |
| 2094 | 2205 | ||
| 2095 | up(&console_sem); | 2206 | up_console_sem(); |
| 2096 | 2207 | ||
| 2097 | /* | 2208 | /* |
| 2098 | * Someone could have filled up the buffer again, so re-check if there's | 2209 | * Someone could have filled up the buffer again, so re-check if there's |
| @@ -2137,7 +2248,7 @@ void console_unblank(void) | |||
| 2137 | * oops_in_progress is set to 1.. | 2248 | * oops_in_progress is set to 1.. |
| 2138 | */ | 2249 | */ |
| 2139 | if (oops_in_progress) { | 2250 | if (oops_in_progress) { |
| 2140 | if (down_trylock(&console_sem) != 0) | 2251 | if (down_trylock_console_sem() != 0) |
| 2141 | return; | 2252 | return; |
| 2142 | } else | 2253 | } else |
| 2143 | console_lock(); | 2254 | console_lock(); |
| @@ -2413,6 +2524,7 @@ int unregister_console(struct console *console) | |||
| 2413 | if (console_drivers != NULL && console->flags & CON_CONSDEV) | 2524 | if (console_drivers != NULL && console->flags & CON_CONSDEV) |
| 2414 | console_drivers->flags |= CON_CONSDEV; | 2525 | console_drivers->flags |= CON_CONSDEV; |
| 2415 | 2526 | ||
| 2527 | console->flags &= ~CON_ENABLED; | ||
| 2416 | console_unlock(); | 2528 | console_unlock(); |
| 2417 | console_sysfs_notify(); | 2529 | console_sysfs_notify(); |
| 2418 | return res; | 2530 | return res; |
| @@ -2437,21 +2549,19 @@ late_initcall(printk_late_init); | |||
| 2437 | /* | 2549 | /* |
| 2438 | * Delayed printk version, for scheduler-internal messages: | 2550 | * Delayed printk version, for scheduler-internal messages: |
| 2439 | */ | 2551 | */ |
| 2440 | #define PRINTK_BUF_SIZE 512 | ||
| 2441 | |||
| 2442 | #define PRINTK_PENDING_WAKEUP 0x01 | 2552 | #define PRINTK_PENDING_WAKEUP 0x01 |
| 2443 | #define PRINTK_PENDING_SCHED 0x02 | 2553 | #define PRINTK_PENDING_OUTPUT 0x02 |
| 2444 | 2554 | ||
| 2445 | static DEFINE_PER_CPU(int, printk_pending); | 2555 | static DEFINE_PER_CPU(int, printk_pending); |
| 2446 | static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); | ||
| 2447 | 2556 | ||
| 2448 | static void wake_up_klogd_work_func(struct irq_work *irq_work) | 2557 | static void wake_up_klogd_work_func(struct irq_work *irq_work) |
| 2449 | { | 2558 | { |
| 2450 | int pending = __this_cpu_xchg(printk_pending, 0); | 2559 | int pending = __this_cpu_xchg(printk_pending, 0); |
| 2451 | 2560 | ||
| 2452 | if (pending & PRINTK_PENDING_SCHED) { | 2561 | if (pending & PRINTK_PENDING_OUTPUT) { |
| 2453 | char *buf = __get_cpu_var(printk_sched_buf); | 2562 | /* If trylock fails, someone else is doing the printing */ |
| 2454 | pr_warn("[sched_delayed] %s", buf); | 2563 | if (console_trylock()) |
| 2564 | console_unlock(); | ||
| 2455 | } | 2565 | } |
| 2456 | 2566 | ||
| 2457 | if (pending & PRINTK_PENDING_WAKEUP) | 2567 | if (pending & PRINTK_PENDING_WAKEUP) |
| @@ -2473,23 +2583,19 @@ void wake_up_klogd(void) | |||
| 2473 | preempt_enable(); | 2583 | preempt_enable(); |
| 2474 | } | 2584 | } |
| 2475 | 2585 | ||
| 2476 | int printk_sched(const char *fmt, ...) | 2586 | int printk_deferred(const char *fmt, ...) |
| 2477 | { | 2587 | { |
| 2478 | unsigned long flags; | ||
| 2479 | va_list args; | 2588 | va_list args; |
| 2480 | char *buf; | ||
| 2481 | int r; | 2589 | int r; |
| 2482 | 2590 | ||
| 2483 | local_irq_save(flags); | 2591 | preempt_disable(); |
| 2484 | buf = __get_cpu_var(printk_sched_buf); | ||
| 2485 | |||
| 2486 | va_start(args, fmt); | 2592 | va_start(args, fmt); |
| 2487 | r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); | 2593 | r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); |
| 2488 | va_end(args); | 2594 | va_end(args); |
| 2489 | 2595 | ||
| 2490 | __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); | 2596 | __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); |
| 2491 | irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); | 2597 | irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); |
| 2492 | local_irq_restore(flags); | 2598 | preempt_enable(); |
| 2493 | 2599 | ||
| 2494 | return r; | 2600 | return r; |
| 2495 | } | 2601 | } |
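
Aside on the printk hunks above: printk_deferred() now feeds the message into the log buffer immediately through vprintk_emit() and only defers the console flush, by setting PRINTK_PENDING_OUTPUT and letting the irq-work handler do console_trylock()/console_unlock(); if the trylock fails, whoever already holds the console lock prints the message. The sketch below is a plain userspace model of that shape only (a pending flag set in the hot path, a trylock-or-skip flush in a worker); every identifier in it is invented for illustration and is not kernel API.

#include <pthread.h>
#include <stdio.h>
#include <stdatomic.h>

static pthread_mutex_t console_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int pending_output;   /* loosely models PRINTK_PENDING_OUTPUT */
static char logbuf[64][128];        /* tiny stand-in for the message log */
static atomic_int log_next;

/* Hot path: store the message now, never touch the console from here. */
static void log_deferred(const char *msg)
{
	int idx = atomic_fetch_add(&log_next, 1) % 64;

	snprintf(logbuf[idx], sizeof(logbuf[idx]), "%s", msg);
	atomic_store(&pending_output, 1);   /* ask the worker to flush */
}

/* Worker: if the trylock fails, someone else is doing the printing. */
static void flush_worker(void)
{
	int i;

	if (!atomic_exchange(&pending_output, 0))
		return;
	if (pthread_mutex_trylock(&console_lock) != 0)
		return;                      /* current lock holder flushes */
	for (i = 0; i < atomic_load(&log_next) && i < 64; i++)
		fputs(logbuf[i], stdout);
	pthread_mutex_unlock(&console_lock);
}

int main(void)
{
	log_deferred("[deferred] hello from a context that must not print\n");
	flush_worker();
	return 0;
}

The point of the pattern is that the deferring caller never takes the console path itself, so it stays usable from contexts that cannot safely do the printing directly.
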
diff --git a/kernel/profile.c b/kernel/profile.c index cb980f0c731b..54bf5ba26420 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
| @@ -52,9 +52,9 @@ static DEFINE_MUTEX(profile_flip_mutex); | |||
| 52 | 52 | ||
| 53 | int profile_setup(char *str) | 53 | int profile_setup(char *str) |
| 54 | { | 54 | { |
| 55 | static char schedstr[] = "schedule"; | 55 | static const char schedstr[] = "schedule"; |
| 56 | static char sleepstr[] = "sleep"; | 56 | static const char sleepstr[] = "sleep"; |
| 57 | static char kvmstr[] = "kvm"; | 57 | static const char kvmstr[] = "kvm"; |
| 58 | int par; | 58 | int par; |
| 59 | 59 | ||
| 60 | if (!strncmp(str, sleepstr, strlen(sleepstr))) { | 60 | if (!strncmp(str, sleepstr, strlen(sleepstr))) { |
| @@ -64,12 +64,10 @@ int profile_setup(char *str) | |||
| 64 | str += strlen(sleepstr) + 1; | 64 | str += strlen(sleepstr) + 1; |
| 65 | if (get_option(&str, &par)) | 65 | if (get_option(&str, &par)) |
| 66 | prof_shift = par; | 66 | prof_shift = par; |
| 67 | printk(KERN_INFO | 67 | pr_info("kernel sleep profiling enabled (shift: %ld)\n", |
| 68 | "kernel sleep profiling enabled (shift: %ld)\n", | ||
| 69 | prof_shift); | 68 | prof_shift); |
| 70 | #else | 69 | #else |
| 71 | printk(KERN_WARNING | 70 | pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); |
| 72 | "kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); | ||
| 73 | #endif /* CONFIG_SCHEDSTATS */ | 71 | #endif /* CONFIG_SCHEDSTATS */ |
| 74 | } else if (!strncmp(str, schedstr, strlen(schedstr))) { | 72 | } else if (!strncmp(str, schedstr, strlen(schedstr))) { |
| 75 | prof_on = SCHED_PROFILING; | 73 | prof_on = SCHED_PROFILING; |
| @@ -77,8 +75,7 @@ int profile_setup(char *str) | |||
| 77 | str += strlen(schedstr) + 1; | 75 | str += strlen(schedstr) + 1; |
| 78 | if (get_option(&str, &par)) | 76 | if (get_option(&str, &par)) |
| 79 | prof_shift = par; | 77 | prof_shift = par; |
| 80 | printk(KERN_INFO | 78 | pr_info("kernel schedule profiling enabled (shift: %ld)\n", |
| 81 | "kernel schedule profiling enabled (shift: %ld)\n", | ||
| 82 | prof_shift); | 79 | prof_shift); |
| 83 | } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { | 80 | } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { |
| 84 | prof_on = KVM_PROFILING; | 81 | prof_on = KVM_PROFILING; |
| @@ -86,13 +83,12 @@ int profile_setup(char *str) | |||
| 86 | str += strlen(kvmstr) + 1; | 83 | str += strlen(kvmstr) + 1; |
| 87 | if (get_option(&str, &par)) | 84 | if (get_option(&str, &par)) |
| 88 | prof_shift = par; | 85 | prof_shift = par; |
| 89 | printk(KERN_INFO | 86 | pr_info("kernel KVM profiling enabled (shift: %ld)\n", |
| 90 | "kernel KVM profiling enabled (shift: %ld)\n", | ||
| 91 | prof_shift); | 87 | prof_shift); |
| 92 | } else if (get_option(&str, &par)) { | 88 | } else if (get_option(&str, &par)) { |
| 93 | prof_shift = par; | 89 | prof_shift = par; |
| 94 | prof_on = CPU_PROFILING; | 90 | prof_on = CPU_PROFILING; |
| 95 | printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", | 91 | pr_info("kernel profiling enabled (shift: %ld)\n", |
| 96 | prof_shift); | 92 | prof_shift); |
| 97 | } | 93 | } |
| 98 | return 1; | 94 | return 1; |
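
The profile.c hunk is a mechanical switch from printk(KERN_INFO ...) and printk(KERN_WARNING ...) to pr_info()/pr_warn(), which bundle the log level with the per-file pr_fmt() prefix and keep each call site on one line. Below is a minimal userspace model of that convention; the macros are simplified stand-ins rather than the kernel's printk.h, and the ## __VA_ARGS__ form is the usual GNU C extension.

#include <stdio.h>

#define pr_fmt(fmt) "profile: " fmt     /* per-file prefix, kernel-style */
#define pr_info(fmt, ...) printf("INFO: " pr_fmt(fmt), ##__VA_ARGS__)
#define pr_warn(fmt, ...) printf("WARNING: " pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	long prof_shift = 2;

	pr_info("kernel profiling enabled (shift: %ld)\n", prof_shift);
	pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
	return 0;
}
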
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bd30bc61bc05..7fa34f86e5ba 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
| @@ -58,9 +58,11 @@ torture_param(int, fqs_duration, 0, | |||
| 58 | "Duration of fqs bursts (us), 0 to disable"); | 58 | "Duration of fqs bursts (us), 0 to disable"); |
| 59 | torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); | 59 | torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); |
| 60 | torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); | 60 | torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); |
| 61 | torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); | ||
| 61 | torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); | 62 | torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); |
| 62 | torture_param(bool, gp_normal, false, | 63 | torture_param(bool, gp_normal, false, |
| 63 | "Use normal (non-expedited) GP wait primitives"); | 64 | "Use normal (non-expedited) GP wait primitives"); |
| 65 | torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); | ||
| 64 | torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); | 66 | torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); |
| 65 | torture_param(int, n_barrier_cbs, 0, | 67 | torture_param(int, n_barrier_cbs, 0, |
| 66 | "# of callbacks/kthreads for barrier testing"); | 68 | "# of callbacks/kthreads for barrier testing"); |
| @@ -138,6 +140,18 @@ static long n_barrier_attempts; | |||
| 138 | static long n_barrier_successes; | 140 | static long n_barrier_successes; |
| 139 | static struct list_head rcu_torture_removed; | 141 | static struct list_head rcu_torture_removed; |
| 140 | 142 | ||
| 143 | static int rcu_torture_writer_state; | ||
| 144 | #define RTWS_FIXED_DELAY 0 | ||
| 145 | #define RTWS_DELAY 1 | ||
| 146 | #define RTWS_REPLACE 2 | ||
| 147 | #define RTWS_DEF_FREE 3 | ||
| 148 | #define RTWS_EXP_SYNC 4 | ||
| 149 | #define RTWS_COND_GET 5 | ||
| 150 | #define RTWS_COND_SYNC 6 | ||
| 151 | #define RTWS_SYNC 7 | ||
| 152 | #define RTWS_STUTTER 8 | ||
| 153 | #define RTWS_STOPPING 9 | ||
| 154 | |||
| 141 | #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) | 155 | #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) |
| 142 | #define RCUTORTURE_RUNNABLE_INIT 1 | 156 | #define RCUTORTURE_RUNNABLE_INIT 1 |
| 143 | #else | 157 | #else |
| @@ -214,6 +228,7 @@ rcu_torture_free(struct rcu_torture *p) | |||
| 214 | */ | 228 | */ |
| 215 | 229 | ||
| 216 | struct rcu_torture_ops { | 230 | struct rcu_torture_ops { |
| 231 | int ttype; | ||
| 217 | void (*init)(void); | 232 | void (*init)(void); |
| 218 | int (*readlock)(void); | 233 | int (*readlock)(void); |
| 219 | void (*read_delay)(struct torture_random_state *rrsp); | 234 | void (*read_delay)(struct torture_random_state *rrsp); |
| @@ -222,6 +237,8 @@ struct rcu_torture_ops { | |||
| 222 | void (*deferred_free)(struct rcu_torture *p); | 237 | void (*deferred_free)(struct rcu_torture *p); |
| 223 | void (*sync)(void); | 238 | void (*sync)(void); |
| 224 | void (*exp_sync)(void); | 239 | void (*exp_sync)(void); |
| 240 | unsigned long (*get_state)(void); | ||
| 241 | void (*cond_sync)(unsigned long oldstate); | ||
| 225 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 242 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
| 226 | void (*cb_barrier)(void); | 243 | void (*cb_barrier)(void); |
| 227 | void (*fqs)(void); | 244 | void (*fqs)(void); |
| @@ -273,10 +290,48 @@ static int rcu_torture_completed(void) | |||
| 273 | return rcu_batches_completed(); | 290 | return rcu_batches_completed(); |
| 274 | } | 291 | } |
| 275 | 292 | ||
| 293 | /* | ||
| 294 | * Update callback in the pipe. This should be invoked after a grace period. | ||
| 295 | */ | ||
| 296 | static bool | ||
| 297 | rcu_torture_pipe_update_one(struct rcu_torture *rp) | ||
| 298 | { | ||
| 299 | int i; | ||
| 300 | |||
| 301 | i = rp->rtort_pipe_count; | ||
| 302 | if (i > RCU_TORTURE_PIPE_LEN) | ||
| 303 | i = RCU_TORTURE_PIPE_LEN; | ||
| 304 | atomic_inc(&rcu_torture_wcount[i]); | ||
| 305 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
| 306 | rp->rtort_mbtest = 0; | ||
| 307 | return true; | ||
| 308 | } | ||
| 309 | return false; | ||
| 310 | } | ||
| 311 | |||
| 312 | /* | ||
| 313 | * Update all callbacks in the pipe. Suitable for synchronous grace-period | ||
| 314 | * primitives. | ||
| 315 | */ | ||
| 316 | static void | ||
| 317 | rcu_torture_pipe_update(struct rcu_torture *old_rp) | ||
| 318 | { | ||
| 319 | struct rcu_torture *rp; | ||
| 320 | struct rcu_torture *rp1; | ||
| 321 | |||
| 322 | if (old_rp) | ||
| 323 | list_add(&old_rp->rtort_free, &rcu_torture_removed); | ||
| 324 | list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { | ||
| 325 | if (rcu_torture_pipe_update_one(rp)) { | ||
| 326 | list_del(&rp->rtort_free); | ||
| 327 | rcu_torture_free(rp); | ||
| 328 | } | ||
| 329 | } | ||
| 330 | } | ||
| 331 | |||
| 276 | static void | 332 | static void |
| 277 | rcu_torture_cb(struct rcu_head *p) | 333 | rcu_torture_cb(struct rcu_head *p) |
| 278 | { | 334 | { |
| 279 | int i; | ||
| 280 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | 335 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); |
| 281 | 336 | ||
| 282 | if (torture_must_stop_irq()) { | 337 | if (torture_must_stop_irq()) { |
| @@ -284,16 +339,10 @@ rcu_torture_cb(struct rcu_head *p) | |||
| 284 | /* The next initialization will pick up the pieces. */ | 339 | /* The next initialization will pick up the pieces. */ |
| 285 | return; | 340 | return; |
| 286 | } | 341 | } |
| 287 | i = rp->rtort_pipe_count; | 342 | if (rcu_torture_pipe_update_one(rp)) |
| 288 | if (i > RCU_TORTURE_PIPE_LEN) | ||
| 289 | i = RCU_TORTURE_PIPE_LEN; | ||
| 290 | atomic_inc(&rcu_torture_wcount[i]); | ||
| 291 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
| 292 | rp->rtort_mbtest = 0; | ||
| 293 | rcu_torture_free(rp); | 343 | rcu_torture_free(rp); |
| 294 | } else { | 344 | else |
| 295 | cur_ops->deferred_free(rp); | 345 | cur_ops->deferred_free(rp); |
| 296 | } | ||
| 297 | } | 346 | } |
| 298 | 347 | ||
| 299 | static int rcu_no_completed(void) | 348 | static int rcu_no_completed(void) |
| @@ -312,6 +361,7 @@ static void rcu_sync_torture_init(void) | |||
| 312 | } | 361 | } |
| 313 | 362 | ||
| 314 | static struct rcu_torture_ops rcu_ops = { | 363 | static struct rcu_torture_ops rcu_ops = { |
| 364 | .ttype = RCU_FLAVOR, | ||
| 315 | .init = rcu_sync_torture_init, | 365 | .init = rcu_sync_torture_init, |
| 316 | .readlock = rcu_torture_read_lock, | 366 | .readlock = rcu_torture_read_lock, |
| 317 | .read_delay = rcu_read_delay, | 367 | .read_delay = rcu_read_delay, |
| @@ -320,6 +370,8 @@ static struct rcu_torture_ops rcu_ops = { | |||
| 320 | .deferred_free = rcu_torture_deferred_free, | 370 | .deferred_free = rcu_torture_deferred_free, |
| 321 | .sync = synchronize_rcu, | 371 | .sync = synchronize_rcu, |
| 322 | .exp_sync = synchronize_rcu_expedited, | 372 | .exp_sync = synchronize_rcu_expedited, |
| 373 | .get_state = get_state_synchronize_rcu, | ||
| 374 | .cond_sync = cond_synchronize_rcu, | ||
| 323 | .call = call_rcu, | 375 | .call = call_rcu, |
| 324 | .cb_barrier = rcu_barrier, | 376 | .cb_barrier = rcu_barrier, |
| 325 | .fqs = rcu_force_quiescent_state, | 377 | .fqs = rcu_force_quiescent_state, |
| @@ -355,6 +407,7 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | |||
| 355 | } | 407 | } |
| 356 | 408 | ||
| 357 | static struct rcu_torture_ops rcu_bh_ops = { | 409 | static struct rcu_torture_ops rcu_bh_ops = { |
| 410 | .ttype = RCU_BH_FLAVOR, | ||
| 358 | .init = rcu_sync_torture_init, | 411 | .init = rcu_sync_torture_init, |
| 359 | .readlock = rcu_bh_torture_read_lock, | 412 | .readlock = rcu_bh_torture_read_lock, |
| 360 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 413 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
| @@ -397,6 +450,7 @@ call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
| 397 | } | 450 | } |
| 398 | 451 | ||
| 399 | static struct rcu_torture_ops rcu_busted_ops = { | 452 | static struct rcu_torture_ops rcu_busted_ops = { |
| 453 | .ttype = INVALID_RCU_FLAVOR, | ||
| 400 | .init = rcu_sync_torture_init, | 454 | .init = rcu_sync_torture_init, |
| 401 | .readlock = rcu_torture_read_lock, | 455 | .readlock = rcu_torture_read_lock, |
| 402 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 456 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
| @@ -479,9 +533,11 @@ static void srcu_torture_stats(char *page) | |||
| 479 | page += sprintf(page, "%s%s per-CPU(idx=%d):", | 533 | page += sprintf(page, "%s%s per-CPU(idx=%d):", |
| 480 | torture_type, TORTURE_FLAG, idx); | 534 | torture_type, TORTURE_FLAG, idx); |
| 481 | for_each_possible_cpu(cpu) { | 535 | for_each_possible_cpu(cpu) { |
| 482 | page += sprintf(page, " %d(%lu,%lu)", cpu, | 536 | long c0, c1; |
| 483 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], | 537 | |
| 484 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); | 538 | c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; |
| 539 | c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; | ||
| 540 | page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1); | ||
| 485 | } | 541 | } |
| 486 | sprintf(page, "\n"); | 542 | sprintf(page, "\n"); |
| 487 | } | 543 | } |
| @@ -492,6 +548,7 @@ static void srcu_torture_synchronize_expedited(void) | |||
| 492 | } | 548 | } |
| 493 | 549 | ||
| 494 | static struct rcu_torture_ops srcu_ops = { | 550 | static struct rcu_torture_ops srcu_ops = { |
| 551 | .ttype = SRCU_FLAVOR, | ||
| 495 | .init = rcu_sync_torture_init, | 552 | .init = rcu_sync_torture_init, |
| 496 | .readlock = srcu_torture_read_lock, | 553 | .readlock = srcu_torture_read_lock, |
| 497 | .read_delay = srcu_read_delay, | 554 | .read_delay = srcu_read_delay, |
| @@ -527,6 +584,7 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p) | |||
| 527 | } | 584 | } |
| 528 | 585 | ||
| 529 | static struct rcu_torture_ops sched_ops = { | 586 | static struct rcu_torture_ops sched_ops = { |
| 587 | .ttype = RCU_SCHED_FLAVOR, | ||
| 530 | .init = rcu_sync_torture_init, | 588 | .init = rcu_sync_torture_init, |
| 531 | .readlock = sched_torture_read_lock, | 589 | .readlock = sched_torture_read_lock, |
| 532 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 590 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
| @@ -688,23 +746,59 @@ rcu_torture_fqs(void *arg) | |||
| 688 | static int | 746 | static int |
| 689 | rcu_torture_writer(void *arg) | 747 | rcu_torture_writer(void *arg) |
| 690 | { | 748 | { |
| 691 | bool exp; | 749 | unsigned long gp_snap; |
| 750 | bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; | ||
| 751 | bool gp_sync1 = gp_sync; | ||
| 692 | int i; | 752 | int i; |
| 693 | struct rcu_torture *rp; | 753 | struct rcu_torture *rp; |
| 694 | struct rcu_torture *rp1; | ||
| 695 | struct rcu_torture *old_rp; | 754 | struct rcu_torture *old_rp; |
| 696 | static DEFINE_TORTURE_RANDOM(rand); | 755 | static DEFINE_TORTURE_RANDOM(rand); |
| 756 | int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, | ||
| 757 | RTWS_COND_GET, RTWS_SYNC }; | ||
| 758 | int nsynctypes = 0; | ||
| 697 | 759 | ||
| 698 | VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); | 760 | VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); |
| 699 | set_user_nice(current, MAX_NICE); | 761 | |
| 762 | /* Initialize synctype[] array. If none set, take default. */ | ||
| 763 | if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) | ||
| 764 | gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; | ||
| 765 | if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) | ||
| 766 | synctype[nsynctypes++] = RTWS_COND_GET; | ||
| 767 | else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) | ||
| 768 | pr_alert("rcu_torture_writer: gp_cond without primitives.\n"); | ||
| 769 | if (gp_exp1 && cur_ops->exp_sync) | ||
| 770 | synctype[nsynctypes++] = RTWS_EXP_SYNC; | ||
| 771 | else if (gp_exp && !cur_ops->exp_sync) | ||
| 772 | pr_alert("rcu_torture_writer: gp_exp without primitives.\n"); | ||
| 773 | if (gp_normal1 && cur_ops->deferred_free) | ||
| 774 | synctype[nsynctypes++] = RTWS_DEF_FREE; | ||
| 775 | else if (gp_normal && !cur_ops->deferred_free) | ||
| 776 | pr_alert("rcu_torture_writer: gp_normal without primitives.\n"); | ||
| 777 | if (gp_sync1 && cur_ops->sync) | ||
| 778 | synctype[nsynctypes++] = RTWS_SYNC; | ||
| 779 | else if (gp_sync && !cur_ops->sync) | ||
| 780 | pr_alert("rcu_torture_writer: gp_sync without primitives.\n"); | ||
| 781 | if (WARN_ONCE(nsynctypes == 0, | ||
| 782 | "rcu_torture_writer: No update-side primitives.\n")) { | ||
| 783 | /* | ||
| 784 | * No update-side primitives, so don't try updating. | ||
| 785 | * The resulting test won't be testing much, hence the | ||
| 786 | * above WARN_ONCE(). | ||
| 787 | */ | ||
| 788 | rcu_torture_writer_state = RTWS_STOPPING; | ||
| 789 | torture_kthread_stopping("rcu_torture_writer"); | ||
| 790 | } | ||
| 700 | 791 | ||
| 701 | do { | 792 | do { |
| 793 | rcu_torture_writer_state = RTWS_FIXED_DELAY; | ||
| 702 | schedule_timeout_uninterruptible(1); | 794 | schedule_timeout_uninterruptible(1); |
| 703 | rp = rcu_torture_alloc(); | 795 | rp = rcu_torture_alloc(); |
| 704 | if (rp == NULL) | 796 | if (rp == NULL) |
| 705 | continue; | 797 | continue; |
| 706 | rp->rtort_pipe_count = 0; | 798 | rp->rtort_pipe_count = 0; |
| 799 | rcu_torture_writer_state = RTWS_DELAY; | ||
| 707 | udelay(torture_random(&rand) & 0x3ff); | 800 | udelay(torture_random(&rand) & 0x3ff); |
| 801 | rcu_torture_writer_state = RTWS_REPLACE; | ||
| 708 | old_rp = rcu_dereference_check(rcu_torture_current, | 802 | old_rp = rcu_dereference_check(rcu_torture_current, |
| 709 | current == writer_task); | 803 | current == writer_task); |
| 710 | rp->rtort_mbtest = 1; | 804 | rp->rtort_mbtest = 1; |
| @@ -716,35 +810,42 @@ rcu_torture_writer(void *arg) | |||
| 716 | i = RCU_TORTURE_PIPE_LEN; | 810 | i = RCU_TORTURE_PIPE_LEN; |
| 717 | atomic_inc(&rcu_torture_wcount[i]); | 811 | atomic_inc(&rcu_torture_wcount[i]); |
| 718 | old_rp->rtort_pipe_count++; | 812 | old_rp->rtort_pipe_count++; |
| 719 | if (gp_normal == gp_exp) | 813 | switch (synctype[torture_random(&rand) % nsynctypes]) { |
| 720 | exp = !!(torture_random(&rand) & 0x80); | 814 | case RTWS_DEF_FREE: |
| 721 | else | 815 | rcu_torture_writer_state = RTWS_DEF_FREE; |
| 722 | exp = gp_exp; | ||
| 723 | if (!exp) { | ||
| 724 | cur_ops->deferred_free(old_rp); | 816 | cur_ops->deferred_free(old_rp); |
| 725 | } else { | 817 | break; |
| 818 | case RTWS_EXP_SYNC: | ||
| 819 | rcu_torture_writer_state = RTWS_EXP_SYNC; | ||
| 726 | cur_ops->exp_sync(); | 820 | cur_ops->exp_sync(); |
| 727 | list_add(&old_rp->rtort_free, | 821 | rcu_torture_pipe_update(old_rp); |
| 728 | &rcu_torture_removed); | 822 | break; |
| 729 | list_for_each_entry_safe(rp, rp1, | 823 | case RTWS_COND_GET: |
| 730 | &rcu_torture_removed, | 824 | rcu_torture_writer_state = RTWS_COND_GET; |
| 731 | rtort_free) { | 825 | gp_snap = cur_ops->get_state(); |
| 732 | i = rp->rtort_pipe_count; | 826 | i = torture_random(&rand) % 16; |
| 733 | if (i > RCU_TORTURE_PIPE_LEN) | 827 | if (i != 0) |
| 734 | i = RCU_TORTURE_PIPE_LEN; | 828 | schedule_timeout_interruptible(i); |
| 735 | atomic_inc(&rcu_torture_wcount[i]); | 829 | udelay(torture_random(&rand) % 1000); |
| 736 | if (++rp->rtort_pipe_count >= | 830 | rcu_torture_writer_state = RTWS_COND_SYNC; |
| 737 | RCU_TORTURE_PIPE_LEN) { | 831 | cur_ops->cond_sync(gp_snap); |
| 738 | rp->rtort_mbtest = 0; | 832 | rcu_torture_pipe_update(old_rp); |
| 739 | list_del(&rp->rtort_free); | 833 | break; |
| 740 | rcu_torture_free(rp); | 834 | case RTWS_SYNC: |
| 741 | } | 835 | rcu_torture_writer_state = RTWS_SYNC; |
| 742 | } | 836 | cur_ops->sync(); |
| 837 | rcu_torture_pipe_update(old_rp); | ||
| 838 | break; | ||
| 839 | default: | ||
| 840 | WARN_ON_ONCE(1); | ||
| 841 | break; | ||
| 743 | } | 842 | } |
| 744 | } | 843 | } |
| 745 | rcutorture_record_progress(++rcu_torture_current_version); | 844 | rcutorture_record_progress(++rcu_torture_current_version); |
| 845 | rcu_torture_writer_state = RTWS_STUTTER; | ||
| 746 | stutter_wait("rcu_torture_writer"); | 846 | stutter_wait("rcu_torture_writer"); |
| 747 | } while (!torture_must_stop()); | 847 | } while (!torture_must_stop()); |
| 848 | rcu_torture_writer_state = RTWS_STOPPING; | ||
| 748 | torture_kthread_stopping("rcu_torture_writer"); | 849 | torture_kthread_stopping("rcu_torture_writer"); |
| 749 | return 0; | 850 | return 0; |
| 750 | } | 851 | } |
| @@ -784,7 +885,7 @@ rcu_torture_fakewriter(void *arg) | |||
| 784 | return 0; | 885 | return 0; |
| 785 | } | 886 | } |
| 786 | 887 | ||
| 787 | void rcutorture_trace_dump(void) | 888 | static void rcutorture_trace_dump(void) |
| 788 | { | 889 | { |
| 789 | static atomic_t beenhere = ATOMIC_INIT(0); | 890 | static atomic_t beenhere = ATOMIC_INIT(0); |
| 790 | 891 | ||
| @@ -918,11 +1019,13 @@ rcu_torture_reader(void *arg) | |||
| 918 | __this_cpu_inc(rcu_torture_batch[completed]); | 1019 | __this_cpu_inc(rcu_torture_batch[completed]); |
| 919 | preempt_enable(); | 1020 | preempt_enable(); |
| 920 | cur_ops->readunlock(idx); | 1021 | cur_ops->readunlock(idx); |
| 921 | schedule(); | 1022 | cond_resched(); |
| 922 | stutter_wait("rcu_torture_reader"); | 1023 | stutter_wait("rcu_torture_reader"); |
| 923 | } while (!torture_must_stop()); | 1024 | } while (!torture_must_stop()); |
| 924 | if (irqreader && cur_ops->irq_capable) | 1025 | if (irqreader && cur_ops->irq_capable) { |
| 925 | del_timer_sync(&t); | 1026 | del_timer_sync(&t); |
| 1027 | destroy_timer_on_stack(&t); | ||
| 1028 | } | ||
| 926 | torture_kthread_stopping("rcu_torture_reader"); | 1029 | torture_kthread_stopping("rcu_torture_reader"); |
| 927 | return 0; | 1030 | return 0; |
| 928 | } | 1031 | } |
| @@ -937,6 +1040,7 @@ rcu_torture_printk(char *page) | |||
| 937 | int i; | 1040 | int i; |
| 938 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 1041 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
| 939 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | 1042 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; |
| 1043 | static unsigned long rtcv_snap = ULONG_MAX; | ||
| 940 | 1044 | ||
| 941 | for_each_possible_cpu(cpu) { | 1045 | for_each_possible_cpu(cpu) { |
| 942 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 1046 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
| @@ -997,6 +1101,22 @@ rcu_torture_printk(char *page) | |||
| 997 | page += sprintf(page, "\n"); | 1101 | page += sprintf(page, "\n"); |
| 998 | if (cur_ops->stats) | 1102 | if (cur_ops->stats) |
| 999 | cur_ops->stats(page); | 1103 | cur_ops->stats(page); |
| 1104 | if (rtcv_snap == rcu_torture_current_version && | ||
| 1105 | rcu_torture_current != NULL) { | ||
| 1106 | int __maybe_unused flags; | ||
| 1107 | unsigned long __maybe_unused gpnum; | ||
| 1108 | unsigned long __maybe_unused completed; | ||
| 1109 | |||
| 1110 | rcutorture_get_gp_data(cur_ops->ttype, | ||
| 1111 | &flags, &gpnum, &completed); | ||
| 1112 | page += sprintf(page, | ||
| 1113 | "??? Writer stall state %d g%lu c%lu f%#x\n", | ||
| 1114 | rcu_torture_writer_state, | ||
| 1115 | gpnum, completed, flags); | ||
| 1116 | show_rcu_gp_kthreads(); | ||
| 1117 | rcutorture_trace_dump(); | ||
| 1118 | } | ||
| 1119 | rtcv_snap = rcu_torture_current_version; | ||
| 1000 | } | 1120 | } |
| 1001 | 1121 | ||
| 1002 | /* | 1122 | /* |
| @@ -1146,7 +1266,7 @@ static int __init rcu_torture_stall_init(void) | |||
| 1146 | } | 1266 | } |
| 1147 | 1267 | ||
| 1148 | /* Callback function for RCU barrier testing. */ | 1268 | /* Callback function for RCU barrier testing. */ |
| 1149 | void rcu_torture_barrier_cbf(struct rcu_head *rcu) | 1269 | static void rcu_torture_barrier_cbf(struct rcu_head *rcu) |
| 1150 | { | 1270 | { |
| 1151 | atomic_inc(&barrier_cbs_invoked); | 1271 | atomic_inc(&barrier_cbs_invoked); |
| 1152 | } | 1272 | } |
| @@ -1416,7 +1536,8 @@ rcu_torture_init(void) | |||
| 1416 | &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, | 1536 | &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, |
| 1417 | }; | 1537 | }; |
| 1418 | 1538 | ||
| 1419 | torture_init_begin(torture_type, verbose, &rcutorture_runnable); | 1539 | if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable)) |
| 1540 | return -EBUSY; | ||
| 1420 | 1541 | ||
| 1421 | /* Process args and tell the world that the torturer is on the job. */ | 1542 | /* Process args and tell the world that the torturer is on the job. */ |
| 1422 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { | 1543 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { |
| @@ -1441,10 +1562,13 @@ rcu_torture_init(void) | |||
| 1441 | if (cur_ops->init) | 1562 | if (cur_ops->init) |
| 1442 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ | 1563 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ |
| 1443 | 1564 | ||
| 1444 | if (nreaders >= 0) | 1565 | if (nreaders >= 0) { |
| 1445 | nrealreaders = nreaders; | 1566 | nrealreaders = nreaders; |
| 1446 | else | 1567 | } else { |
| 1447 | nrealreaders = 2 * num_online_cpus(); | 1568 | nrealreaders = num_online_cpus() - 1; |
| 1569 | if (nrealreaders <= 0) | ||
| 1570 | nrealreaders = 1; | ||
| 1571 | } | ||
| 1448 | rcu_torture_print_module_parms(cur_ops, "Start of test"); | 1572 | rcu_torture_print_module_parms(cur_ops, "Start of test"); |
| 1449 | 1573 | ||
| 1450 | /* Set up the freelist. */ | 1574 | /* Set up the freelist. */ |
| @@ -1533,7 +1657,8 @@ rcu_torture_init(void) | |||
| 1533 | fqs_duration = 0; | 1657 | fqs_duration = 0; |
| 1534 | if (fqs_duration) { | 1658 | if (fqs_duration) { |
| 1535 | /* Create the fqs thread */ | 1659 | /* Create the fqs thread */ |
| 1536 | torture_create_kthread(rcu_torture_fqs, NULL, fqs_task); | 1660 | firsterr = torture_create_kthread(rcu_torture_fqs, NULL, |
| 1661 | fqs_task); | ||
| 1537 | if (firsterr) | 1662 | if (firsterr) |
| 1538 | goto unwind; | 1663 | goto unwind; |
| 1539 | } | 1664 | } |
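
The rcutorture hunks above add conditional grace-period testing (the gp_cond parameter plus the get_state()/cond_sync() ops) and rework rcu_torture_writer() to build a synctype[] table of whichever update-side primitives the selected flavor actually provides, warn when a requested method has no backing primitive, and pick an entry at random on each pass. The sketch below reproduces only that selection idiom in plain C; the fake_* handlers and the struct are invented stand-ins, not kernel functions.

#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-ins for a flavor's update-side methods. */
static void fake_deferred_free(void) { puts("deferred free"); }
static void fake_exp_sync(void)      { puts("expedited sync"); }
static void fake_sync(void)          { puts("normal sync"); }
static unsigned long fake_get_state(void) { return 42; }
static void fake_cond_sync(unsigned long oldstate)
{
	printf("cond sync from %lu\n", oldstate);
}

struct ops {
	void (*deferred_free)(void);
	void (*exp_sync)(void);
	unsigned long (*get_state)(void);
	void (*cond_sync)(unsigned long);
	void (*sync)(void);
};

enum synctype { USE_DEF_FREE, USE_EXP_SYNC, USE_COND, USE_SYNC };

/* Advertise only the methods the flavor implements, then pick one. */
static void writer_pass(const struct ops *ops)
{
	enum synctype table[4];
	int n = 0;

	if (ops->deferred_free)
		table[n++] = USE_DEF_FREE;
	if (ops->exp_sync)
		table[n++] = USE_EXP_SYNC;
	if (ops->get_state && ops->cond_sync)
		table[n++] = USE_COND;
	if (ops->sync)
		table[n++] = USE_SYNC;
	if (n == 0) {
		fprintf(stderr, "no update-side primitives\n");
		return;
	}

	switch (table[rand() % n]) {
	case USE_DEF_FREE: ops->deferred_free();             break;
	case USE_EXP_SYNC: ops->exp_sync();                  break;
	case USE_COND:     ops->cond_sync(ops->get_state()); break;
	case USE_SYNC:     ops->sync();                      break;
	}
}

int main(void)
{
	struct ops ops = {
		.deferred_free = fake_deferred_free,
		.exp_sync      = fake_exp_sync,
		.get_state     = fake_get_state,
		.cond_sync     = fake_cond_sync,
		.sync          = fake_sync,
	};

	for (int i = 0; i < 4; i++)
		writer_pass(&ops);
	return 0;
}
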
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 431528520562..858c56569127 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h | |||
| @@ -144,7 +144,7 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) | |||
| 144 | return; | 144 | return; |
| 145 | rcp->ticks_this_gp++; | 145 | rcp->ticks_this_gp++; |
| 146 | j = jiffies; | 146 | j = jiffies; |
| 147 | js = rcp->jiffies_stall; | 147 | js = ACCESS_ONCE(rcp->jiffies_stall); |
| 148 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) { | 148 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) { |
| 149 | pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", | 149 | pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", |
| 150 | rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, | 150 | rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, |
| @@ -152,17 +152,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp) | |||
| 152 | dump_stack(); | 152 | dump_stack(); |
| 153 | } | 153 | } |
| 154 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) | 154 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) |
| 155 | rcp->jiffies_stall = jiffies + | 155 | ACCESS_ONCE(rcp->jiffies_stall) = jiffies + |
| 156 | 3 * rcu_jiffies_till_stall_check() + 3; | 156 | 3 * rcu_jiffies_till_stall_check() + 3; |
| 157 | else if (ULONG_CMP_GE(j, js)) | 157 | else if (ULONG_CMP_GE(j, js)) |
| 158 | rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); | 158 | ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); |
| 159 | } | 159 | } |
| 160 | 160 | ||
| 161 | static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) | 161 | static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) |
| 162 | { | 162 | { |
| 163 | rcp->ticks_this_gp = 0; | 163 | rcp->ticks_this_gp = 0; |
| 164 | rcp->gp_start = jiffies; | 164 | rcp->gp_start = jiffies; |
| 165 | rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); | 165 | ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check(); |
| 166 | } | 166 | } |
| 167 | 167 | ||
| 168 | static void check_cpu_stalls(void) | 168 | static void check_cpu_stalls(void) |
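
The tiny_plugin.h hunk wraps the lockless accesses to rcp->jiffies_stall in ACCESS_ONCE() so the compiler emits one load (or one store) for each access instead of re-reading or caching the field across the stall check. The model below uses what is essentially the kernel's historical volatile-cast definition, shown in userspace and relying on the GNU __typeof__ extension; the field name and the +100 refresh are placeholders.

#include <stdio.h>

#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static unsigned long jiffies_stall;

/* Lockless check: exactly one load, and at most one store, of the field. */
static void check_stall(unsigned long now)
{
	unsigned long js = ACCESS_ONCE(jiffies_stall);   /* single load */

	if (now >= js)
		ACCESS_ONCE(jiffies_stall) = now + 100;  /* single store */
}

int main(void)
{
	check_stall(5);
	printf("jiffies_stall = %lu\n", jiffies_stall);
	return 0;
}
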
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0c47e300210a..f1ba77363fbb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
| @@ -101,7 +101,7 @@ DEFINE_PER_CPU(struct rcu_data, sname##_data) | |||
| 101 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); | 101 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); |
| 102 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); | 102 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); |
| 103 | 103 | ||
| 104 | static struct rcu_state *rcu_state; | 104 | static struct rcu_state *rcu_state_p; |
| 105 | LIST_HEAD(rcu_struct_flavors); | 105 | LIST_HEAD(rcu_struct_flavors); |
| 106 | 106 | ||
| 107 | /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ | 107 | /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ |
| @@ -243,7 +243,7 @@ static ulong jiffies_till_next_fqs = ULONG_MAX; | |||
| 243 | module_param(jiffies_till_first_fqs, ulong, 0644); | 243 | module_param(jiffies_till_first_fqs, ulong, 0644); |
| 244 | module_param(jiffies_till_next_fqs, ulong, 0644); | 244 | module_param(jiffies_till_next_fqs, ulong, 0644); |
| 245 | 245 | ||
| 246 | static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | 246 | static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, |
| 247 | struct rcu_data *rdp); | 247 | struct rcu_data *rdp); |
| 248 | static void force_qs_rnp(struct rcu_state *rsp, | 248 | static void force_qs_rnp(struct rcu_state *rsp, |
| 249 | int (*f)(struct rcu_data *rsp, bool *isidle, | 249 | int (*f)(struct rcu_data *rsp, bool *isidle, |
| @@ -271,6 +271,15 @@ long rcu_batches_completed_bh(void) | |||
| 271 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | 271 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); |
| 272 | 272 | ||
| 273 | /* | 273 | /* |
| 274 | * Force a quiescent state. | ||
| 275 | */ | ||
| 276 | void rcu_force_quiescent_state(void) | ||
| 277 | { | ||
| 278 | force_quiescent_state(rcu_state_p); | ||
| 279 | } | ||
| 280 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | ||
| 281 | |||
| 282 | /* | ||
| 274 | * Force a quiescent state for RCU BH. | 283 | * Force a quiescent state for RCU BH. |
| 275 | */ | 284 | */ |
| 276 | void rcu_bh_force_quiescent_state(void) | 285 | void rcu_bh_force_quiescent_state(void) |
| @@ -280,6 +289,21 @@ void rcu_bh_force_quiescent_state(void) | |||
| 280 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); | 289 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); |
| 281 | 290 | ||
| 282 | /* | 291 | /* |
| 292 | * Show the state of the grace-period kthreads. | ||
| 293 | */ | ||
| 294 | void show_rcu_gp_kthreads(void) | ||
| 295 | { | ||
| 296 | struct rcu_state *rsp; | ||
| 297 | |||
| 298 | for_each_rcu_flavor(rsp) { | ||
| 299 | pr_info("%s: wait state: %d ->state: %#lx\n", | ||
| 300 | rsp->name, rsp->gp_state, rsp->gp_kthread->state); | ||
| 301 | /* sched_show_task(rsp->gp_kthread); */ | ||
| 302 | } | ||
| 303 | } | ||
| 304 | EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); | ||
| 305 | |||
| 306 | /* | ||
| 283 | * Record the number of times rcutorture tests have been initiated and | 307 | * Record the number of times rcutorture tests have been initiated and |
| 284 | * terminated. This information allows the debugfs tracing stats to be | 308 | * terminated. This information allows the debugfs tracing stats to be |
| 285 | * correlated to the rcutorture messages, even when the rcutorture module | 309 | * correlated to the rcutorture messages, even when the rcutorture module |
| @@ -294,6 +318,39 @@ void rcutorture_record_test_transition(void) | |||
| 294 | EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); | 318 | EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); |
| 295 | 319 | ||
| 296 | /* | 320 | /* |
| 321 | * Send along grace-period-related data for rcutorture diagnostics. | ||
| 322 | */ | ||
| 323 | void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, | ||
| 324 | unsigned long *gpnum, unsigned long *completed) | ||
| 325 | { | ||
| 326 | struct rcu_state *rsp = NULL; | ||
| 327 | |||
| 328 | switch (test_type) { | ||
| 329 | case RCU_FLAVOR: | ||
| 330 | rsp = rcu_state_p; | ||
| 331 | break; | ||
| 332 | case RCU_BH_FLAVOR: | ||
| 333 | rsp = &rcu_bh_state; | ||
| 334 | break; | ||
| 335 | case RCU_SCHED_FLAVOR: | ||
| 336 | rsp = &rcu_sched_state; | ||
| 337 | break; | ||
| 338 | default: | ||
| 339 | break; | ||
| 340 | } | ||
| 341 | if (rsp != NULL) { | ||
| 342 | *flags = ACCESS_ONCE(rsp->gp_flags); | ||
| 343 | *gpnum = ACCESS_ONCE(rsp->gpnum); | ||
| 344 | *completed = ACCESS_ONCE(rsp->completed); | ||
| 345 | return; | ||
| 346 | } | ||
| 347 | *flags = 0; | ||
| 348 | *gpnum = 0; | ||
| 349 | *completed = 0; | ||
| 350 | } | ||
| 351 | EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); | ||
| 352 | |||
| 353 | /* | ||
| 297 | * Record the number of writer passes through the current rcutorture test. | 354 | * Record the number of writer passes through the current rcutorture test. |
| 298 | * This is also used to correlate debugfs tracing stats with the rcutorture | 355 | * This is also used to correlate debugfs tracing stats with the rcutorture |
| 299 | * messages. | 356 | * messages. |
| @@ -324,6 +381,28 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) | |||
| 324 | } | 381 | } |
| 325 | 382 | ||
| 326 | /* | 383 | /* |
| 384 | * Return the root node of the specified rcu_state structure. | ||
| 385 | */ | ||
| 386 | static struct rcu_node *rcu_get_root(struct rcu_state *rsp) | ||
| 387 | { | ||
| 388 | return &rsp->node[0]; | ||
| 389 | } | ||
| 390 | |||
| 391 | /* | ||
| 392 | * Is there any need for future grace periods? | ||
| 393 | * Interrupts must be disabled. If the caller does not hold the root | ||
| 394 | * rnp_node structure's ->lock, the results are advisory only. | ||
| 395 | */ | ||
| 396 | static int rcu_future_needs_gp(struct rcu_state *rsp) | ||
| 397 | { | ||
| 398 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
| 399 | int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1; | ||
| 400 | int *fp = &rnp->need_future_gp[idx]; | ||
| 401 | |||
| 402 | return ACCESS_ONCE(*fp); | ||
| 403 | } | ||
| 404 | |||
| 405 | /* | ||
| 327 | * Does the current CPU require a not-yet-started grace period? | 406 | * Does the current CPU require a not-yet-started grace period? |
| 328 | * The caller must have disabled interrupts to prevent races with | 407 | * The caller must have disabled interrupts to prevent races with |
| 329 | * normal callback registry. | 408 | * normal callback registry. |
| @@ -335,7 +414,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 335 | 414 | ||
| 336 | if (rcu_gp_in_progress(rsp)) | 415 | if (rcu_gp_in_progress(rsp)) |
| 337 | return 0; /* No, a grace period is already in progress. */ | 416 | return 0; /* No, a grace period is already in progress. */ |
| 338 | if (rcu_nocb_needs_gp(rsp)) | 417 | if (rcu_future_needs_gp(rsp)) |
| 339 | return 1; /* Yes, a no-CBs CPU needs one. */ | 418 | return 1; /* Yes, a no-CBs CPU needs one. */ |
| 340 | if (!rdp->nxttail[RCU_NEXT_TAIL]) | 419 | if (!rdp->nxttail[RCU_NEXT_TAIL]) |
| 341 | return 0; /* No, this is a no-CBs (or offline) CPU. */ | 420 | return 0; /* No, this is a no-CBs (or offline) CPU. */ |
| @@ -350,14 +429,6 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 350 | } | 429 | } |
| 351 | 430 | ||
| 352 | /* | 431 | /* |
| 353 | * Return the root node of the specified rcu_state structure. | ||
| 354 | */ | ||
| 355 | static struct rcu_node *rcu_get_root(struct rcu_state *rsp) | ||
| 356 | { | ||
| 357 | return &rsp->node[0]; | ||
| 358 | } | ||
| 359 | |||
| 360 | /* | ||
| 361 | * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state | 432 | * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state |
| 362 | * | 433 | * |
| 363 | * If the new value of the ->dynticks_nesting counter now is zero, | 434 | * If the new value of the ->dynticks_nesting counter now is zero, |
| @@ -387,9 +458,9 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, | |||
| 387 | } | 458 | } |
| 388 | rcu_prepare_for_idle(smp_processor_id()); | 459 | rcu_prepare_for_idle(smp_processor_id()); |
| 389 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | 460 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ |
| 390 | smp_mb__before_atomic_inc(); /* See above. */ | 461 | smp_mb__before_atomic(); /* See above. */ |
| 391 | atomic_inc(&rdtp->dynticks); | 462 | atomic_inc(&rdtp->dynticks); |
| 392 | smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ | 463 | smp_mb__after_atomic(); /* Force ordering with next sojourn. */ |
| 393 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | 464 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); |
| 394 | 465 | ||
| 395 | /* | 466 | /* |
| @@ -507,10 +578,10 @@ void rcu_irq_exit(void) | |||
| 507 | static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, | 578 | static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, |
| 508 | int user) | 579 | int user) |
| 509 | { | 580 | { |
| 510 | smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ | 581 | smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ |
| 511 | atomic_inc(&rdtp->dynticks); | 582 | atomic_inc(&rdtp->dynticks); |
| 512 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | 583 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ |
| 513 | smp_mb__after_atomic_inc(); /* See above. */ | 584 | smp_mb__after_atomic(); /* See above. */ |
| 514 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 585 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); |
| 515 | rcu_cleanup_after_idle(smp_processor_id()); | 586 | rcu_cleanup_after_idle(smp_processor_id()); |
| 516 | trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); | 587 | trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); |
| @@ -635,10 +706,10 @@ void rcu_nmi_enter(void) | |||
| 635 | (atomic_read(&rdtp->dynticks) & 0x1)) | 706 | (atomic_read(&rdtp->dynticks) & 0x1)) |
| 636 | return; | 707 | return; |
| 637 | rdtp->dynticks_nmi_nesting++; | 708 | rdtp->dynticks_nmi_nesting++; |
| 638 | smp_mb__before_atomic_inc(); /* Force delay from prior write. */ | 709 | smp_mb__before_atomic(); /* Force delay from prior write. */ |
| 639 | atomic_inc(&rdtp->dynticks); | 710 | atomic_inc(&rdtp->dynticks); |
| 640 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | 711 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ |
| 641 | smp_mb__after_atomic_inc(); /* See above. */ | 712 | smp_mb__after_atomic(); /* See above. */ |
| 642 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 713 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); |
| 643 | } | 714 | } |
| 644 | 715 | ||
| @@ -657,9 +728,9 @@ void rcu_nmi_exit(void) | |||
| 657 | --rdtp->dynticks_nmi_nesting != 0) | 728 | --rdtp->dynticks_nmi_nesting != 0) |
| 658 | return; | 729 | return; |
| 659 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | 730 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ |
| 660 | smp_mb__before_atomic_inc(); /* See above. */ | 731 | smp_mb__before_atomic(); /* See above. */ |
| 661 | atomic_inc(&rdtp->dynticks); | 732 | atomic_inc(&rdtp->dynticks); |
| 662 | smp_mb__after_atomic_inc(); /* Force delay to next write. */ | 733 | smp_mb__after_atomic(); /* Force delay to next write. */ |
| 663 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | 734 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); |
| 664 | } | 735 | } |
| 665 | 736 | ||
| @@ -758,7 +829,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, | |||
| 758 | { | 829 | { |
| 759 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); | 830 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); |
| 760 | rcu_sysidle_check_cpu(rdp, isidle, maxj); | 831 | rcu_sysidle_check_cpu(rdp, isidle, maxj); |
| 761 | return (rdp->dynticks_snap & 0x1) == 0; | 832 | if ((rdp->dynticks_snap & 0x1) == 0) { |
| 833 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); | ||
| 834 | return 1; | ||
| 835 | } else { | ||
| 836 | return 0; | ||
| 837 | } | ||
| 762 | } | 838 | } |
| 763 | 839 | ||
| 764 | /* | 840 | /* |
| @@ -834,7 +910,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
| 834 | * we will beat on the first one until it gets unstuck, then move | 910 | * we will beat on the first one until it gets unstuck, then move |
| 835 | * to the next. Only do this for the primary flavor of RCU. | 911 | * to the next. Only do this for the primary flavor of RCU. |
| 836 | */ | 912 | */ |
| 837 | if (rdp->rsp == rcu_state && | 913 | if (rdp->rsp == rcu_state_p && |
| 838 | ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { | 914 | ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { |
| 839 | rdp->rsp->jiffies_resched += 5; | 915 | rdp->rsp->jiffies_resched += 5; |
| 840 | resched_cpu(rdp->cpu); | 916 | resched_cpu(rdp->cpu); |
| @@ -851,7 +927,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) | |||
| 851 | rsp->gp_start = j; | 927 | rsp->gp_start = j; |
| 852 | smp_wmb(); /* Record start time before stall time. */ | 928 | smp_wmb(); /* Record start time before stall time. */ |
| 853 | j1 = rcu_jiffies_till_stall_check(); | 929 | j1 = rcu_jiffies_till_stall_check(); |
| 854 | rsp->jiffies_stall = j + j1; | 930 | ACCESS_ONCE(rsp->jiffies_stall) = j + j1; |
| 855 | rsp->jiffies_resched = j + j1 / 2; | 931 | rsp->jiffies_resched = j + j1 / 2; |
| 856 | } | 932 | } |
| 857 | 933 | ||
| @@ -890,12 +966,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
| 890 | /* Only let one CPU complain about others per time interval. */ | 966 | /* Only let one CPU complain about others per time interval. */ |
| 891 | 967 | ||
| 892 | raw_spin_lock_irqsave(&rnp->lock, flags); | 968 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 893 | delta = jiffies - rsp->jiffies_stall; | 969 | delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); |
| 894 | if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { | 970 | if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { |
| 895 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 971 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 896 | return; | 972 | return; |
| 897 | } | 973 | } |
| 898 | rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; | 974 | ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; |
| 899 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 975 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 900 | 976 | ||
| 901 | /* | 977 | /* |
| @@ -932,9 +1008,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
| 932 | print_cpu_stall_info_end(); | 1008 | print_cpu_stall_info_end(); |
| 933 | for_each_possible_cpu(cpu) | 1009 | for_each_possible_cpu(cpu) |
| 934 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; | 1010 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; |
| 935 | pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", | 1011 | pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", |
| 936 | smp_processor_id(), (long)(jiffies - rsp->gp_start), | 1012 | smp_processor_id(), (long)(jiffies - rsp->gp_start), |
| 937 | rsp->gpnum, rsp->completed, totqlen); | 1013 | (long)rsp->gpnum, (long)rsp->completed, totqlen); |
| 938 | if (ndetected == 0) | 1014 | if (ndetected == 0) |
| 939 | pr_err("INFO: Stall ended before state dump start\n"); | 1015 | pr_err("INFO: Stall ended before state dump start\n"); |
| 940 | else if (!trigger_all_cpu_backtrace()) | 1016 | else if (!trigger_all_cpu_backtrace()) |
| @@ -947,12 +1023,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
| 947 | force_quiescent_state(rsp); /* Kick them all. */ | 1023 | force_quiescent_state(rsp); /* Kick them all. */ |
| 948 | } | 1024 | } |
| 949 | 1025 | ||
| 950 | /* | ||
| 951 | * This function really isn't for public consumption, but RCU is special in | ||
| 952 | * that context switches can allow the state machine to make progress. | ||
| 953 | */ | ||
| 954 | extern void resched_cpu(int cpu); | ||
| 955 | |||
| 956 | static void print_cpu_stall(struct rcu_state *rsp) | 1026 | static void print_cpu_stall(struct rcu_state *rsp) |
| 957 | { | 1027 | { |
| 958 | int cpu; | 1028 | int cpu; |
| @@ -971,14 +1041,15 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
| 971 | print_cpu_stall_info_end(); | 1041 | print_cpu_stall_info_end(); |
| 972 | for_each_possible_cpu(cpu) | 1042 | for_each_possible_cpu(cpu) |
| 973 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; | 1043 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; |
| 974 | pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", | 1044 | pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", |
| 975 | jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); | 1045 | jiffies - rsp->gp_start, |
| 1046 | (long)rsp->gpnum, (long)rsp->completed, totqlen); | ||
| 976 | if (!trigger_all_cpu_backtrace()) | 1047 | if (!trigger_all_cpu_backtrace()) |
| 977 | dump_stack(); | 1048 | dump_stack(); |
| 978 | 1049 | ||
| 979 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1050 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 980 | if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) | 1051 | if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) |
| 981 | rsp->jiffies_stall = jiffies + | 1052 | ACCESS_ONCE(rsp->jiffies_stall) = jiffies + |
| 982 | 3 * rcu_jiffies_till_stall_check() + 3; | 1053 | 3 * rcu_jiffies_till_stall_check() + 3; |
| 983 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1054 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 984 | 1055 | ||
| @@ -1062,7 +1133,7 @@ void rcu_cpu_stall_reset(void) | |||
| 1062 | struct rcu_state *rsp; | 1133 | struct rcu_state *rsp; |
| 1063 | 1134 | ||
| 1064 | for_each_rcu_flavor(rsp) | 1135 | for_each_rcu_flavor(rsp) |
| 1065 | rsp->jiffies_stall = jiffies + ULONG_MAX / 2; | 1136 | ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2; |
| 1066 | } | 1137 | } |
| 1067 | 1138 | ||
| 1068 | /* | 1139 | /* |
| @@ -1123,15 +1194,18 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | |||
| 1123 | /* | 1194 | /* |
| 1124 | * Start some future grace period, as needed to handle newly arrived | 1195 | * Start some future grace period, as needed to handle newly arrived |
| 1125 | * callbacks. The required future grace periods are recorded in each | 1196 | * callbacks. The required future grace periods are recorded in each |
| 1126 | * rcu_node structure's ->need_future_gp field. | 1197 | * rcu_node structure's ->need_future_gp field. Returns true if there |
| 1198 | * is reason to awaken the grace-period kthread. | ||
| 1127 | * | 1199 | * |
| 1128 | * The caller must hold the specified rcu_node structure's ->lock. | 1200 | * The caller must hold the specified rcu_node structure's ->lock. |
| 1129 | */ | 1201 | */ |
| 1130 | static unsigned long __maybe_unused | 1202 | static bool __maybe_unused |
| 1131 | rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | 1203 | rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, |
| 1204 | unsigned long *c_out) | ||
| 1132 | { | 1205 | { |
| 1133 | unsigned long c; | 1206 | unsigned long c; |
| 1134 | int i; | 1207 | int i; |
| 1208 | bool ret = false; | ||
| 1135 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); | 1209 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); |
| 1136 | 1210 | ||
| 1137 | /* | 1211 | /* |
| @@ -1142,7 +1216,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
| 1142 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); | 1216 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); |
| 1143 | if (rnp->need_future_gp[c & 0x1]) { | 1217 | if (rnp->need_future_gp[c & 0x1]) { |
| 1144 | trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); | 1218 | trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); |
| 1145 | return c; | 1219 | goto out; |
| 1146 | } | 1220 | } |
| 1147 | 1221 | ||
| 1148 | /* | 1222 | /* |
| @@ -1156,7 +1230,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
| 1156 | ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { | 1230 | ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { |
| 1157 | rnp->need_future_gp[c & 0x1]++; | 1231 | rnp->need_future_gp[c & 0x1]++; |
| 1158 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); | 1232 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); |
| 1159 | return c; | 1233 | goto out; |
| 1160 | } | 1234 | } |
| 1161 | 1235 | ||
| 1162 | /* | 1236 | /* |
| @@ -1197,12 +1271,15 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
| 1197 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); | 1271 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); |
| 1198 | } else { | 1272 | } else { |
| 1199 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); | 1273 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); |
| 1200 | rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); | 1274 | ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); |
| 1201 | } | 1275 | } |
| 1202 | unlock_out: | 1276 | unlock_out: |
| 1203 | if (rnp != rnp_root) | 1277 | if (rnp != rnp_root) |
| 1204 | raw_spin_unlock(&rnp_root->lock); | 1278 | raw_spin_unlock(&rnp_root->lock); |
| 1205 | return c; | 1279 | out: |
| 1280 | if (c_out != NULL) | ||
| 1281 | *c_out = c; | ||
| 1282 | return ret; | ||
| 1206 | } | 1283 | } |
| 1207 | 1284 | ||
| 1208 | /* | 1285 | /* |
| @@ -1226,25 +1303,43 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | |||
| 1226 | } | 1303 | } |
| 1227 | 1304 | ||
| 1228 | /* | 1305 | /* |
| 1306 | * Awaken the grace-period kthread for the specified flavor of RCU. | ||
| 1307 | * Don't do a self-awaken, and don't bother awakening when there is | ||
| 1308 | * nothing for the grace-period kthread to do (as in several CPUs | ||
| 1309 | * raced to awaken, and we lost), and finally don't try to awaken | ||
| 1310 | * a kthread that has not yet been created. | ||
| 1311 | */ | ||
| 1312 | static void rcu_gp_kthread_wake(struct rcu_state *rsp) | ||
| 1313 | { | ||
| 1314 | if (current == rsp->gp_kthread || | ||
| 1315 | !ACCESS_ONCE(rsp->gp_flags) || | ||
| 1316 | !rsp->gp_kthread) | ||
| 1317 | return; | ||
| 1318 | wake_up(&rsp->gp_wq); | ||
| 1319 | } | ||
| 1320 | |||
| 1321 | /* | ||
| 1229 | * If there is room, assign a ->completed number to any callbacks on | 1322 | * If there is room, assign a ->completed number to any callbacks on |
| 1230 | * this CPU that have not already been assigned. Also accelerate any | 1323 | * this CPU that have not already been assigned. Also accelerate any |
| 1231 | * callbacks that were previously assigned a ->completed number that has | 1324 | * callbacks that were previously assigned a ->completed number that has |
| 1232 | * since proven to be too conservative, which can happen if callbacks get | 1325 | * since proven to be too conservative, which can happen if callbacks get |
| 1233 | * assigned a ->completed number while RCU is idle, but with reference to | 1326 | * assigned a ->completed number while RCU is idle, but with reference to |
| 1234 | * a non-root rcu_node structure. This function is idempotent, so it does | 1327 | * a non-root rcu_node structure. This function is idempotent, so it does |
| 1235 | * not hurt to call it repeatedly. | 1328 | * not hurt to call it repeatedly. Returns an flag saying that we should |
| 1329 | * awaken the RCU grace-period kthread. | ||
| 1236 | * | 1330 | * |
| 1237 | * The caller must hold rnp->lock with interrupts disabled. | 1331 | * The caller must hold rnp->lock with interrupts disabled. |
| 1238 | */ | 1332 | */ |
| 1239 | static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | 1333 | static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, |
| 1240 | struct rcu_data *rdp) | 1334 | struct rcu_data *rdp) |
| 1241 | { | 1335 | { |
| 1242 | unsigned long c; | 1336 | unsigned long c; |
| 1243 | int i; | 1337 | int i; |
| 1338 | bool ret; | ||
| 1244 | 1339 | ||
| 1245 | /* If the CPU has no callbacks, nothing to do. */ | 1340 | /* If the CPU has no callbacks, nothing to do. */ |
| 1246 | if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) | 1341 | if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) |
| 1247 | return; | 1342 | return false; |
| 1248 | 1343 | ||
| 1249 | /* | 1344 | /* |
| 1250 | * Starting from the sublist containing the callbacks most | 1345 | * Starting from the sublist containing the callbacks most |
| @@ -1273,7 +1368,7 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1273 | * be grouped into. | 1368 | * be grouped into. |
| 1274 | */ | 1369 | */ |
| 1275 | if (++i >= RCU_NEXT_TAIL) | 1370 | if (++i >= RCU_NEXT_TAIL) |
| 1276 | return; | 1371 | return false; |
| 1277 | 1372 | ||
| 1278 | /* | 1373 | /* |
| 1279 | * Assign all subsequent callbacks' ->completed number to the next | 1374 | * Assign all subsequent callbacks' ->completed number to the next |
| @@ -1285,13 +1380,14 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1285 | rdp->nxtcompleted[i] = c; | 1380 | rdp->nxtcompleted[i] = c; |
| 1286 | } | 1381 | } |
| 1287 | /* Record any needed additional grace periods. */ | 1382 | /* Record any needed additional grace periods. */ |
| 1288 | rcu_start_future_gp(rnp, rdp); | 1383 | ret = rcu_start_future_gp(rnp, rdp, NULL); |
| 1289 | 1384 | ||
| 1290 | /* Trace depending on how much we were able to accelerate. */ | 1385 | /* Trace depending on how much we were able to accelerate. */ |
| 1291 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) | 1386 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) |
| 1292 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); | 1387 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); |
| 1293 | else | 1388 | else |
| 1294 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); | 1389 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); |
| 1390 | return ret; | ||
| 1295 | } | 1391 | } |
| 1296 | 1392 | ||
| 1297 | /* | 1393 | /* |
| @@ -1300,17 +1396,18 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1300 | * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL | 1396 | * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL |
| 1301 | * sublist. This function is idempotent, so it does not hurt to | 1397 | * sublist. This function is idempotent, so it does not hurt to |
| 1302 | * invoke it repeatedly. As long as it is not invoked -too- often... | 1398 | * invoke it repeatedly. As long as it is not invoked -too- often... |
| 1399 | * Returns true if the RCU grace-period kthread needs to be awakened. | ||
| 1303 | * | 1400 | * |
| 1304 | * The caller must hold rnp->lock with interrupts disabled. | 1401 | * The caller must hold rnp->lock with interrupts disabled. |
| 1305 | */ | 1402 | */ |
| 1306 | static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | 1403 | static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, |
| 1307 | struct rcu_data *rdp) | 1404 | struct rcu_data *rdp) |
| 1308 | { | 1405 | { |
| 1309 | int i, j; | 1406 | int i, j; |
| 1310 | 1407 | ||
| 1311 | /* If the CPU has no callbacks, nothing to do. */ | 1408 | /* If the CPU has no callbacks, nothing to do. */ |
| 1312 | if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) | 1409 | if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) |
| 1313 | return; | 1410 | return false; |
| 1314 | 1411 | ||
| 1315 | /* | 1412 | /* |
| 1316 | * Find all callbacks whose ->completed numbers indicate that they | 1413 | * Find all callbacks whose ->completed numbers indicate that they |
| @@ -1334,26 +1431,30 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1334 | } | 1431 | } |
| 1335 | 1432 | ||
| 1336 | /* Classify any remaining callbacks. */ | 1433 | /* Classify any remaining callbacks. */ |
| 1337 | rcu_accelerate_cbs(rsp, rnp, rdp); | 1434 | return rcu_accelerate_cbs(rsp, rnp, rdp); |
| 1338 | } | 1435 | } |
| 1339 | 1436 | ||
| 1340 | /* | 1437 | /* |
| 1341 | * Update CPU-local rcu_data state to record the beginnings and ends of | 1438 | * Update CPU-local rcu_data state to record the beginnings and ends of |
| 1342 | * grace periods. The caller must hold the ->lock of the leaf rcu_node | 1439 | * grace periods. The caller must hold the ->lock of the leaf rcu_node |
| 1343 | * structure corresponding to the current CPU, and must have irqs disabled. | 1440 | * structure corresponding to the current CPU, and must have irqs disabled. |
| 1441 | * Returns true if the grace-period kthread needs to be awakened. | ||
| 1344 | */ | 1442 | */ |
| 1345 | static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) | 1443 | static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, |
| 1444 | struct rcu_data *rdp) | ||
| 1346 | { | 1445 | { |
| 1446 | bool ret; | ||
| 1447 | |||
| 1347 | /* Handle the ends of any preceding grace periods first. */ | 1448 | /* Handle the ends of any preceding grace periods first. */ |
| 1348 | if (rdp->completed == rnp->completed) { | 1449 | if (rdp->completed == rnp->completed) { |
| 1349 | 1450 | ||
| 1350 | /* No grace period end, so just accelerate recent callbacks. */ | 1451 | /* No grace period end, so just accelerate recent callbacks. */ |
| 1351 | rcu_accelerate_cbs(rsp, rnp, rdp); | 1452 | ret = rcu_accelerate_cbs(rsp, rnp, rdp); |
| 1352 | 1453 | ||
| 1353 | } else { | 1454 | } else { |
| 1354 | 1455 | ||
| 1355 | /* Advance callbacks. */ | 1456 | /* Advance callbacks. */ |
| 1356 | rcu_advance_cbs(rsp, rnp, rdp); | 1457 | ret = rcu_advance_cbs(rsp, rnp, rdp); |
| 1357 | 1458 | ||
| 1358 | /* Remember that we saw this grace-period completion. */ | 1459 | /* Remember that we saw this grace-period completion. */ |
| 1359 | rdp->completed = rnp->completed; | 1460 | rdp->completed = rnp->completed; |
| @@ -1372,11 +1473,13 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc | |||
| 1372 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); | 1473 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); |
| 1373 | zero_cpu_stall_ticks(rdp); | 1474 | zero_cpu_stall_ticks(rdp); |
| 1374 | } | 1475 | } |
| 1476 | return ret; | ||
| 1375 | } | 1477 | } |
| 1376 | 1478 | ||
| 1377 | static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) | 1479 | static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) |
| 1378 | { | 1480 | { |
| 1379 | unsigned long flags; | 1481 | unsigned long flags; |
| 1482 | bool needwake; | ||
| 1380 | struct rcu_node *rnp; | 1483 | struct rcu_node *rnp; |
| 1381 | 1484 | ||
| 1382 | local_irq_save(flags); | 1485 | local_irq_save(flags); |
| @@ -1388,8 +1491,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1388 | return; | 1491 | return; |
| 1389 | } | 1492 | } |
| 1390 | smp_mb__after_unlock_lock(); | 1493 | smp_mb__after_unlock_lock(); |
| 1391 | __note_gp_changes(rsp, rnp, rdp); | 1494 | needwake = __note_gp_changes(rsp, rnp, rdp); |
| 1392 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1495 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 1496 | if (needwake) | ||
| 1497 | rcu_gp_kthread_wake(rsp); | ||
| 1393 | } | 1498 | } |
| 1394 | 1499 | ||
| 1395 | /* | 1500 | /* |
| @@ -1403,12 +1508,12 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1403 | rcu_bind_gp_kthread(); | 1508 | rcu_bind_gp_kthread(); |
| 1404 | raw_spin_lock_irq(&rnp->lock); | 1509 | raw_spin_lock_irq(&rnp->lock); |
| 1405 | smp_mb__after_unlock_lock(); | 1510 | smp_mb__after_unlock_lock(); |
| 1406 | if (rsp->gp_flags == 0) { | 1511 | if (!ACCESS_ONCE(rsp->gp_flags)) { |
| 1407 | /* Spurious wakeup, tell caller to go back to sleep. */ | 1512 | /* Spurious wakeup, tell caller to go back to sleep. */ |
| 1408 | raw_spin_unlock_irq(&rnp->lock); | 1513 | raw_spin_unlock_irq(&rnp->lock); |
| 1409 | return 0; | 1514 | return 0; |
| 1410 | } | 1515 | } |
| 1411 | rsp->gp_flags = 0; /* Clear all flags: New grace period. */ | 1516 | ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */ |
| 1412 | 1517 | ||
| 1413 | if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { | 1518 | if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { |
| 1414 | /* | 1519 | /* |
| @@ -1453,7 +1558,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1453 | WARN_ON_ONCE(rnp->completed != rsp->completed); | 1558 | WARN_ON_ONCE(rnp->completed != rsp->completed); |
| 1454 | ACCESS_ONCE(rnp->completed) = rsp->completed; | 1559 | ACCESS_ONCE(rnp->completed) = rsp->completed; |
| 1455 | if (rnp == rdp->mynode) | 1560 | if (rnp == rdp->mynode) |
| 1456 | __note_gp_changes(rsp, rnp, rdp); | 1561 | (void)__note_gp_changes(rsp, rnp, rdp); |
| 1457 | rcu_preempt_boost_start_gp(rnp); | 1562 | rcu_preempt_boost_start_gp(rnp); |
| 1458 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, | 1563 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, |
| 1459 | rnp->level, rnp->grplo, | 1564 | rnp->level, rnp->grplo, |
| @@ -1501,7 +1606,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
| 1501 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | 1606 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { |
| 1502 | raw_spin_lock_irq(&rnp->lock); | 1607 | raw_spin_lock_irq(&rnp->lock); |
| 1503 | smp_mb__after_unlock_lock(); | 1608 | smp_mb__after_unlock_lock(); |
| 1504 | rsp->gp_flags &= ~RCU_GP_FLAG_FQS; | 1609 | ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS; |
| 1505 | raw_spin_unlock_irq(&rnp->lock); | 1610 | raw_spin_unlock_irq(&rnp->lock); |
| 1506 | } | 1611 | } |
| 1507 | return fqs_state; | 1612 | return fqs_state; |
| @@ -1513,6 +1618,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
| 1513 | static void rcu_gp_cleanup(struct rcu_state *rsp) | 1618 | static void rcu_gp_cleanup(struct rcu_state *rsp) |
| 1514 | { | 1619 | { |
| 1515 | unsigned long gp_duration; | 1620 | unsigned long gp_duration; |
| 1621 | bool needgp = false; | ||
| 1516 | int nocb = 0; | 1622 | int nocb = 0; |
| 1517 | struct rcu_data *rdp; | 1623 | struct rcu_data *rdp; |
| 1518 | struct rcu_node *rnp = rcu_get_root(rsp); | 1624 | struct rcu_node *rnp = rcu_get_root(rsp); |
| @@ -1548,7 +1654,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 1548 | ACCESS_ONCE(rnp->completed) = rsp->gpnum; | 1654 | ACCESS_ONCE(rnp->completed) = rsp->gpnum; |
| 1549 | rdp = this_cpu_ptr(rsp->rda); | 1655 | rdp = this_cpu_ptr(rsp->rda); |
| 1550 | if (rnp == rdp->mynode) | 1656 | if (rnp == rdp->mynode) |
| 1551 | __note_gp_changes(rsp, rnp, rdp); | 1657 | needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; |
| 1552 | /* smp_mb() provided by prior unlock-lock pair. */ | 1658 | /* smp_mb() provided by prior unlock-lock pair. */ |
| 1553 | nocb += rcu_future_gp_cleanup(rsp, rnp); | 1659 | nocb += rcu_future_gp_cleanup(rsp, rnp); |
| 1554 | raw_spin_unlock_irq(&rnp->lock); | 1660 | raw_spin_unlock_irq(&rnp->lock); |
| @@ -1564,9 +1670,10 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 1564 | trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); | 1670 | trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); |
| 1565 | rsp->fqs_state = RCU_GP_IDLE; | 1671 | rsp->fqs_state = RCU_GP_IDLE; |
| 1566 | rdp = this_cpu_ptr(rsp->rda); | 1672 | rdp = this_cpu_ptr(rsp->rda); |
| 1567 | rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ | 1673 | /* Advance CBs to reduce false positives below. */ |
| 1568 | if (cpu_needs_another_gp(rsp, rdp)) { | 1674 | needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; |
| 1569 | rsp->gp_flags = RCU_GP_FLAG_INIT; | 1675 | if (needgp || cpu_needs_another_gp(rsp, rdp)) { |
| 1676 | ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; | ||
| 1570 | trace_rcu_grace_period(rsp->name, | 1677 | trace_rcu_grace_period(rsp->name, |
| 1571 | ACCESS_ONCE(rsp->gpnum), | 1678 | ACCESS_ONCE(rsp->gpnum), |
| 1572 | TPS("newreq")); | 1679 | TPS("newreq")); |
| @@ -1593,6 +1700,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1593 | trace_rcu_grace_period(rsp->name, | 1700 | trace_rcu_grace_period(rsp->name, |
| 1594 | ACCESS_ONCE(rsp->gpnum), | 1701 | ACCESS_ONCE(rsp->gpnum), |
| 1595 | TPS("reqwait")); | 1702 | TPS("reqwait")); |
| 1703 | rsp->gp_state = RCU_GP_WAIT_GPS; | ||
| 1596 | wait_event_interruptible(rsp->gp_wq, | 1704 | wait_event_interruptible(rsp->gp_wq, |
| 1597 | ACCESS_ONCE(rsp->gp_flags) & | 1705 | ACCESS_ONCE(rsp->gp_flags) & |
| 1598 | RCU_GP_FLAG_INIT); | 1706 | RCU_GP_FLAG_INIT); |
| @@ -1620,6 +1728,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1620 | trace_rcu_grace_period(rsp->name, | 1728 | trace_rcu_grace_period(rsp->name, |
| 1621 | ACCESS_ONCE(rsp->gpnum), | 1729 | ACCESS_ONCE(rsp->gpnum), |
| 1622 | TPS("fqswait")); | 1730 | TPS("fqswait")); |
| 1731 | rsp->gp_state = RCU_GP_WAIT_FQS; | ||
| 1623 | ret = wait_event_interruptible_timeout(rsp->gp_wq, | 1732 | ret = wait_event_interruptible_timeout(rsp->gp_wq, |
| 1624 | ((gf = ACCESS_ONCE(rsp->gp_flags)) & | 1733 | ((gf = ACCESS_ONCE(rsp->gp_flags)) & |
| 1625 | RCU_GP_FLAG_FQS) || | 1734 | RCU_GP_FLAG_FQS) || |
| @@ -1665,14 +1774,6 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1665 | } | 1774 | } |
| 1666 | } | 1775 | } |
| 1667 | 1776 | ||
| 1668 | static void rsp_wakeup(struct irq_work *work) | ||
| 1669 | { | ||
| 1670 | struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work); | ||
| 1671 | |||
| 1672 | /* Wake up rcu_gp_kthread() to start the grace period. */ | ||
| 1673 | wake_up(&rsp->gp_wq); | ||
| 1674 | } | ||
| 1675 | |||
| 1676 | /* | 1777 | /* |
| 1677 | * Start a new RCU grace period if warranted, re-initializing the hierarchy | 1778 | * Start a new RCU grace period if warranted, re-initializing the hierarchy |
| 1678 | * in preparation for detecting the next grace period. The caller must hold | 1779 | * in preparation for detecting the next grace period. The caller must hold |
| @@ -1681,8 +1782,10 @@ static void rsp_wakeup(struct irq_work *work) | |||
| 1681 | * Note that it is legal for a dying CPU (which is marked as offline) to | 1782 | * Note that it is legal for a dying CPU (which is marked as offline) to |
| 1682 | * invoke this function. This can happen when the dying CPU reports its | 1783 | * invoke this function. This can happen when the dying CPU reports its |
| 1683 | * quiescent state. | 1784 | * quiescent state. |
| 1785 | * | ||
| 1786 | * Returns true if the grace-period kthread must be awakened. | ||
| 1684 | */ | 1787 | */ |
| 1685 | static void | 1788 | static bool |
| 1686 | rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | 1789 | rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, |
| 1687 | struct rcu_data *rdp) | 1790 | struct rcu_data *rdp) |
| 1688 | { | 1791 | { |
| @@ -1693,20 +1796,18 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1693 | * or a grace period is already in progress. | 1796 | * or a grace period is already in progress. |
| 1694 | * Either way, don't start a new grace period. | 1797 | * Either way, don't start a new grace period. |
| 1695 | */ | 1798 | */ |
| 1696 | return; | 1799 | return false; |
| 1697 | } | 1800 | } |
| 1698 | rsp->gp_flags = RCU_GP_FLAG_INIT; | 1801 | ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; |
| 1699 | trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), | 1802 | trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), |
| 1700 | TPS("newreq")); | 1803 | TPS("newreq")); |
| 1701 | 1804 | ||
| 1702 | /* | 1805 | /* |
| 1703 | * We can't do wakeups while holding the rnp->lock, as that | 1806 | * We can't do wakeups while holding the rnp->lock, as that |
| 1704 | * could cause possible deadlocks with the rq->lock. Defer | 1807 | * could cause possible deadlocks with the rq->lock. Defer |
| 1705 | * the wakeup to interrupt context. And don't bother waking | 1808 | * the wakeup to our caller. |
| 1706 | * up the running kthread. | ||
| 1707 | */ | 1809 | */ |
| 1708 | if (current != rsp->gp_kthread) | 1810 | return true; |
| 1709 | irq_work_queue(&rsp->wakeup_work); | ||
| 1710 | } | 1811 | } |
| 1711 | 1812 | ||
| 1712 | /* | 1813 | /* |
| @@ -1715,12 +1816,14 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1715 | * is invoked indirectly from rcu_advance_cbs(), which would result in | 1816 | * is invoked indirectly from rcu_advance_cbs(), which would result in |
| 1716 | * endless recursion -- or would do so if it wasn't for the self-deadlock | 1817 | * endless recursion -- or would do so if it wasn't for the self-deadlock |
| 1717 | * that is encountered beforehand. | 1818 | * that is encountered beforehand. |
| 1819 | * | ||
| 1820 | * Returns true if the grace-period kthread needs to be awakened. | ||
| 1718 | */ | 1821 | */ |
| 1719 | static void | 1822 | static bool rcu_start_gp(struct rcu_state *rsp) |
| 1720 | rcu_start_gp(struct rcu_state *rsp) | ||
| 1721 | { | 1823 | { |
| 1722 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | 1824 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
| 1723 | struct rcu_node *rnp = rcu_get_root(rsp); | 1825 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1826 | bool ret = false; | ||
| 1724 | 1827 | ||
| 1725 | /* | 1828 | /* |
| 1726 | * If there is no grace period in progress right now, any | 1829 | * If there is no grace period in progress right now, any |
| @@ -1730,8 +1833,9 @@ rcu_start_gp(struct rcu_state *rsp) | |||
| 1730 | * resulting in pointless grace periods. So, advance callbacks | 1833 | * resulting in pointless grace periods. So, advance callbacks |
| 1731 | * then start the grace period! | 1834 | * then start the grace period! |
| 1732 | */ | 1835 | */ |
| 1733 | rcu_advance_cbs(rsp, rnp, rdp); | 1836 | ret = rcu_advance_cbs(rsp, rnp, rdp) || ret; |
| 1734 | rcu_start_gp_advanced(rsp, rnp, rdp); | 1837 | ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret; |
| 1838 | return ret; | ||
| 1735 | } | 1839 | } |
| 1736 | 1840 | ||
| 1737 | /* | 1841 | /* |
| @@ -1820,6 +1924,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1820 | { | 1924 | { |
| 1821 | unsigned long flags; | 1925 | unsigned long flags; |
| 1822 | unsigned long mask; | 1926 | unsigned long mask; |
| 1927 | bool needwake; | ||
| 1823 | struct rcu_node *rnp; | 1928 | struct rcu_node *rnp; |
| 1824 | 1929 | ||
| 1825 | rnp = rdp->mynode; | 1930 | rnp = rdp->mynode; |
| @@ -1848,9 +1953,11 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1848 | * This GP can't end until cpu checks in, so all of our | 1953 | * This GP can't end until cpu checks in, so all of our |
| 1849 | * callbacks can be processed during the next GP. | 1954 | * callbacks can be processed during the next GP. |
| 1850 | */ | 1955 | */ |
| 1851 | rcu_accelerate_cbs(rsp, rnp, rdp); | 1956 | needwake = rcu_accelerate_cbs(rsp, rnp, rdp); |
| 1852 | 1957 | ||
| 1853 | rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ | 1958 | rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ |
| 1959 | if (needwake) | ||
| 1960 | rcu_gp_kthread_wake(rsp); | ||
| 1854 | } | 1961 | } |
| 1855 | } | 1962 | } |
| 1856 | 1963 | ||
| @@ -1951,7 +2058,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
| 1951 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) | 2058 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) |
| 1952 | { | 2059 | { |
| 1953 | int i; | 2060 | int i; |
| 1954 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | 2061 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); |
| 1955 | 2062 | ||
| 1956 | /* No-CBs CPUs are handled specially. */ | 2063 | /* No-CBs CPUs are handled specially. */ |
| 1957 | if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) | 2064 | if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) |
| @@ -2320,7 +2427,7 @@ static void force_quiescent_state(struct rcu_state *rsp) | |||
| 2320 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); | 2427 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); |
| 2321 | return; /* Someone beat us to it. */ | 2428 | return; /* Someone beat us to it. */ |
| 2322 | } | 2429 | } |
| 2323 | rsp->gp_flags |= RCU_GP_FLAG_FQS; | 2430 | ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS; |
| 2324 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); | 2431 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); |
| 2325 | wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ | 2432 | wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ |
| 2326 | } | 2433 | } |
| @@ -2334,7 +2441,8 @@ static void | |||
| 2334 | __rcu_process_callbacks(struct rcu_state *rsp) | 2441 | __rcu_process_callbacks(struct rcu_state *rsp) |
| 2335 | { | 2442 | { |
| 2336 | unsigned long flags; | 2443 | unsigned long flags; |
| 2337 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | 2444 | bool needwake; |
| 2445 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); | ||
| 2338 | 2446 | ||
| 2339 | WARN_ON_ONCE(rdp->beenonline == 0); | 2447 | WARN_ON_ONCE(rdp->beenonline == 0); |
| 2340 | 2448 | ||
| @@ -2345,8 +2453,10 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
| 2345 | local_irq_save(flags); | 2453 | local_irq_save(flags); |
| 2346 | if (cpu_needs_another_gp(rsp, rdp)) { | 2454 | if (cpu_needs_another_gp(rsp, rdp)) { |
| 2347 | raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ | 2455 | raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ |
| 2348 | rcu_start_gp(rsp); | 2456 | needwake = rcu_start_gp(rsp); |
| 2349 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); | 2457 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); |
| 2458 | if (needwake) | ||
| 2459 | rcu_gp_kthread_wake(rsp); | ||
| 2350 | } else { | 2460 | } else { |
| 2351 | local_irq_restore(flags); | 2461 | local_irq_restore(flags); |
| 2352 | } | 2462 | } |
| @@ -2404,6 +2514,8 @@ static void invoke_rcu_core(void) | |||
| 2404 | static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | 2514 | static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, |
| 2405 | struct rcu_head *head, unsigned long flags) | 2515 | struct rcu_head *head, unsigned long flags) |
| 2406 | { | 2516 | { |
| 2517 | bool needwake; | ||
| 2518 | |||
| 2407 | /* | 2519 | /* |
| 2408 | * If called from an extended quiescent state, invoke the RCU | 2520 | * If called from an extended quiescent state, invoke the RCU |
| 2409 | * core in order to force a re-evaluation of RCU's idleness. | 2521 | * core in order to force a re-evaluation of RCU's idleness. |
| @@ -2433,8 +2545,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
| 2433 | 2545 | ||
| 2434 | raw_spin_lock(&rnp_root->lock); | 2546 | raw_spin_lock(&rnp_root->lock); |
| 2435 | smp_mb__after_unlock_lock(); | 2547 | smp_mb__after_unlock_lock(); |
| 2436 | rcu_start_gp(rsp); | 2548 | needwake = rcu_start_gp(rsp); |
| 2437 | raw_spin_unlock(&rnp_root->lock); | 2549 | raw_spin_unlock(&rnp_root->lock); |
| 2550 | if (needwake) | ||
| 2551 | rcu_gp_kthread_wake(rsp); | ||
| 2438 | } else { | 2552 | } else { |
| 2439 | /* Give the grace period a kick. */ | 2553 | /* Give the grace period a kick. */ |
| 2440 | rdp->blimit = LONG_MAX; | 2554 | rdp->blimit = LONG_MAX; |
| @@ -2537,6 +2651,20 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
| 2537 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 2651 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
| 2538 | 2652 | ||
| 2539 | /* | 2653 | /* |
| 2654 | * Queue an RCU callback for lazy invocation after a grace period. | ||
| 2655 | * This will likely be later named something like "call_rcu_lazy()", | ||
| 2656 | * but this change will require some way of tagging the lazy RCU | ||
| 2657 | * callbacks in the list of pending callbacks. Until then, this | ||
| 2658 | * function may only be called from __kfree_rcu(). | ||
| 2659 | */ | ||
| 2660 | void kfree_call_rcu(struct rcu_head *head, | ||
| 2661 | void (*func)(struct rcu_head *rcu)) | ||
| 2662 | { | ||
| 2663 | __call_rcu(head, func, rcu_state_p, -1, 1); | ||
| 2664 | } | ||
| 2665 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | ||
| 2666 | |||
| 2667 | /* | ||
| 2540 | * Because a context switch is a grace period for RCU-sched and RCU-bh, | 2668 | * Because a context switch is a grace period for RCU-sched and RCU-bh, |
| 2541 | * any blocking grace-period wait automatically implies a grace period | 2669 | * any blocking grace-period wait automatically implies a grace period |
| 2542 | * if there is only one CPU online at any point time during execution | 2670 | * if there is only one CPU online at any point time during execution |
| @@ -2659,7 +2787,7 @@ unsigned long get_state_synchronize_rcu(void) | |||
| 2659 | * time-consuming work between get_state_synchronize_rcu() | 2787 | * time-consuming work between get_state_synchronize_rcu() |
| 2660 | * and cond_synchronize_rcu(). | 2788 | * and cond_synchronize_rcu(). |
| 2661 | */ | 2789 | */ |
| 2662 | return smp_load_acquire(&rcu_state->gpnum); | 2790 | return smp_load_acquire(&rcu_state_p->gpnum); |
| 2663 | } | 2791 | } |
| 2664 | EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); | 2792 | EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); |
| 2665 | 2793 | ||
| @@ -2685,7 +2813,7 @@ void cond_synchronize_rcu(unsigned long oldstate) | |||
| 2685 | * Ensure that this load happens before any RCU-destructive | 2813 | * Ensure that this load happens before any RCU-destructive |
| 2686 | * actions the caller might carry out after we return. | 2814 | * actions the caller might carry out after we return. |
| 2687 | */ | 2815 | */ |
| 2688 | newstate = smp_load_acquire(&rcu_state->completed); | 2816 | newstate = smp_load_acquire(&rcu_state_p->completed); |
| 2689 | if (ULONG_CMP_GE(oldstate, newstate)) | 2817 | if (ULONG_CMP_GE(oldstate, newstate)) |
| 2690 | synchronize_rcu(); | 2818 | synchronize_rcu(); |
| 2691 | } | 2819 | } |
| @@ -2790,7 +2918,7 @@ void synchronize_sched_expedited(void) | |||
| 2790 | s = atomic_long_read(&rsp->expedited_done); | 2918 | s = atomic_long_read(&rsp->expedited_done); |
| 2791 | if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { | 2919 | if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { |
| 2792 | /* ensure test happens before caller kfree */ | 2920 | /* ensure test happens before caller kfree */ |
| 2793 | smp_mb__before_atomic_inc(); /* ^^^ */ | 2921 | smp_mb__before_atomic(); /* ^^^ */ |
| 2794 | atomic_long_inc(&rsp->expedited_workdone1); | 2922 | atomic_long_inc(&rsp->expedited_workdone1); |
| 2795 | return; | 2923 | return; |
| 2796 | } | 2924 | } |
| @@ -2808,7 +2936,7 @@ void synchronize_sched_expedited(void) | |||
| 2808 | s = atomic_long_read(&rsp->expedited_done); | 2936 | s = atomic_long_read(&rsp->expedited_done); |
| 2809 | if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { | 2937 | if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { |
| 2810 | /* ensure test happens before caller kfree */ | 2938 | /* ensure test happens before caller kfree */ |
| 2811 | smp_mb__before_atomic_inc(); /* ^^^ */ | 2939 | smp_mb__before_atomic(); /* ^^^ */ |
| 2812 | atomic_long_inc(&rsp->expedited_workdone2); | 2940 | atomic_long_inc(&rsp->expedited_workdone2); |
| 2813 | return; | 2941 | return; |
| 2814 | } | 2942 | } |
| @@ -2837,7 +2965,7 @@ void synchronize_sched_expedited(void) | |||
| 2837 | s = atomic_long_read(&rsp->expedited_done); | 2965 | s = atomic_long_read(&rsp->expedited_done); |
| 2838 | if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { | 2966 | if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { |
| 2839 | /* ensure test happens before caller kfree */ | 2967 | /* ensure test happens before caller kfree */ |
| 2840 | smp_mb__before_atomic_inc(); /* ^^^ */ | 2968 | smp_mb__before_atomic(); /* ^^^ */ |
| 2841 | atomic_long_inc(&rsp->expedited_done_lost); | 2969 | atomic_long_inc(&rsp->expedited_done_lost); |
| 2842 | break; | 2970 | break; |
| 2843 | } | 2971 | } |
| @@ -2988,7 +3116,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp) | |||
| 2988 | static void rcu_barrier_func(void *type) | 3116 | static void rcu_barrier_func(void *type) |
| 2989 | { | 3117 | { |
| 2990 | struct rcu_state *rsp = type; | 3118 | struct rcu_state *rsp = type; |
| 2991 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | 3119 | struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); |
| 2992 | 3120 | ||
| 2993 | _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); | 3121 | _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); |
| 2994 | atomic_inc(&rsp->barrier_cpu_count); | 3122 | atomic_inc(&rsp->barrier_cpu_count); |
| @@ -3160,7 +3288,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 3160 | * that this CPU cannot possibly have any RCU callbacks in flight yet. | 3288 | * that this CPU cannot possibly have any RCU callbacks in flight yet. |
| 3161 | */ | 3289 | */ |
| 3162 | static void | 3290 | static void |
| 3163 | rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | 3291 | rcu_init_percpu_data(int cpu, struct rcu_state *rsp) |
| 3164 | { | 3292 | { |
| 3165 | unsigned long flags; | 3293 | unsigned long flags; |
| 3166 | unsigned long mask; | 3294 | unsigned long mask; |
| @@ -3173,7 +3301,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
| 3173 | /* Set up local state, ensuring consistent view of global state. */ | 3301 | /* Set up local state, ensuring consistent view of global state. */ |
| 3174 | raw_spin_lock_irqsave(&rnp->lock, flags); | 3302 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 3175 | rdp->beenonline = 1; /* We have now been online. */ | 3303 | rdp->beenonline = 1; /* We have now been online. */ |
| 3176 | rdp->preemptible = preemptible; | ||
| 3177 | rdp->qlen_last_fqs_check = 0; | 3304 | rdp->qlen_last_fqs_check = 0; |
| 3178 | rdp->n_force_qs_snap = rsp->n_force_qs; | 3305 | rdp->n_force_qs_snap = rsp->n_force_qs; |
| 3179 | rdp->blimit = blimit; | 3306 | rdp->blimit = blimit; |
| @@ -3217,8 +3344,7 @@ static void rcu_prepare_cpu(int cpu) | |||
| 3217 | struct rcu_state *rsp; | 3344 | struct rcu_state *rsp; |
| 3218 | 3345 | ||
| 3219 | for_each_rcu_flavor(rsp) | 3346 | for_each_rcu_flavor(rsp) |
| 3220 | rcu_init_percpu_data(cpu, rsp, | 3347 | rcu_init_percpu_data(cpu, rsp); |
| 3221 | strcmp(rsp->name, "rcu_preempt") == 0); | ||
| 3222 | } | 3348 | } |
| 3223 | 3349 | ||
| 3224 | /* | 3350 | /* |
| @@ -3228,7 +3354,7 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
| 3228 | unsigned long action, void *hcpu) | 3354 | unsigned long action, void *hcpu) |
| 3229 | { | 3355 | { |
| 3230 | long cpu = (long)hcpu; | 3356 | long cpu = (long)hcpu; |
| 3231 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | 3357 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); |
| 3232 | struct rcu_node *rnp = rdp->mynode; | 3358 | struct rcu_node *rnp = rdp->mynode; |
| 3233 | struct rcu_state *rsp; | 3359 | struct rcu_state *rsp; |
| 3234 | 3360 | ||
| @@ -3402,8 +3528,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
| 3402 | rnp->qsmaskinit = 0; | 3528 | rnp->qsmaskinit = 0; |
| 3403 | rnp->grplo = j * cpustride; | 3529 | rnp->grplo = j * cpustride; |
| 3404 | rnp->grphi = (j + 1) * cpustride - 1; | 3530 | rnp->grphi = (j + 1) * cpustride - 1; |
| 3405 | if (rnp->grphi >= NR_CPUS) | 3531 | if (rnp->grphi >= nr_cpu_ids) |
| 3406 | rnp->grphi = NR_CPUS - 1; | 3532 | rnp->grphi = nr_cpu_ids - 1; |
| 3407 | if (i == 0) { | 3533 | if (i == 0) { |
| 3408 | rnp->grpnum = 0; | 3534 | rnp->grpnum = 0; |
| 3409 | rnp->grpmask = 0; | 3535 | rnp->grpmask = 0; |
| @@ -3422,7 +3548,6 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
| 3422 | 3548 | ||
| 3423 | rsp->rda = rda; | 3549 | rsp->rda = rda; |
| 3424 | init_waitqueue_head(&rsp->gp_wq); | 3550 | init_waitqueue_head(&rsp->gp_wq); |
| 3425 | init_irq_work(&rsp->wakeup_work, rsp_wakeup); | ||
| 3426 | rnp = rsp->level[rcu_num_lvls - 1]; | 3551 | rnp = rsp->level[rcu_num_lvls - 1]; |
| 3427 | for_each_possible_cpu(i) { | 3552 | for_each_possible_cpu(i) { |
| 3428 | while (i > rnp->grphi) | 3553 | while (i > rnp->grphi) |
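
The tree.c hunks above all follow one pattern: helpers that used to wake the grace-period kthread directly, or defer the wakeup to irq_work, now just return a needwake flag, and the caller invokes rcu_gp_kthread_wake() only after rnp->lock has been dropped, so no wakeup ever happens under a lock that could deadlock against rq->lock. Below is a minimal standalone pthread sketch of that discipline; it is not kernel code, and every name in it (gp_lock, gp_flags, gp_worker, request_gp) is illustrative. The only point is the ordering: the flag is computed while the lock is held, the wakeup is issued after it is released.

/*
 * Standalone pthread sketch (not kernel code) of the wake-after-unlock
 * pattern adopted by the tree.c hunks above: compute a "needwake" flag
 * while holding the lock, drop the lock, and only then wake the worker.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t gp_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t gp_wq = PTHREAD_COND_INITIALIZER;
static int gp_flags;            /* nonzero: the worker has something to do */
static bool worker_exit;

/* Worker thread: sleeps until gp_flags is set, then "runs a grace period". */
static void *gp_worker(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&gp_lock);
    while (!worker_exit) {
        while (!gp_flags && !worker_exit)
            pthread_cond_wait(&gp_wq, &gp_lock);
        if (gp_flags) {
            gp_flags = 0;
            pthread_mutex_unlock(&gp_lock);
            printf("worker: grace period\n");   /* work done unlocked */
            pthread_mutex_lock(&gp_lock);
        }
    }
    pthread_mutex_unlock(&gp_lock);
    return NULL;
}

/* Caller: set the flag under the lock, but wake only after unlocking. */
static void request_gp(void)
{
    bool needwake;

    pthread_mutex_lock(&gp_lock);
    needwake = !gp_flags;       /* skip redundant wakeups, as the kernel does */
    gp_flags = 1;
    pthread_mutex_unlock(&gp_lock);
    if (needwake)
        pthread_cond_signal(&gp_wq);
}

int main(void)
{
    pthread_t tid;

    pthread_create(&tid, NULL, gp_worker, NULL);
    request_gp();
    pthread_mutex_lock(&gp_lock);
    worker_exit = true;
    pthread_mutex_unlock(&gp_lock);
    pthread_cond_signal(&gp_wq);
    pthread_join(tid, NULL);
    return 0;
}
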
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 75dc3c39a02a..bf2c1e669691 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
| @@ -252,7 +252,6 @@ struct rcu_data { | |||
| 252 | bool passed_quiesce; /* User-mode/idle loop etc. */ | 252 | bool passed_quiesce; /* User-mode/idle loop etc. */ |
| 253 | bool qs_pending; /* Core waits for quiesc state. */ | 253 | bool qs_pending; /* Core waits for quiesc state. */ |
| 254 | bool beenonline; /* CPU online at least once. */ | 254 | bool beenonline; /* CPU online at least once. */ |
| 255 | bool preemptible; /* Preemptible RCU? */ | ||
| 256 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ | 255 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ |
| 257 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ | 256 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ |
| 258 | #ifdef CONFIG_RCU_CPU_STALL_INFO | 257 | #ifdef CONFIG_RCU_CPU_STALL_INFO |
| @@ -406,7 +405,8 @@ struct rcu_state { | |||
| 406 | unsigned long completed; /* # of last completed gp. */ | 405 | unsigned long completed; /* # of last completed gp. */ |
| 407 | struct task_struct *gp_kthread; /* Task for grace periods. */ | 406 | struct task_struct *gp_kthread; /* Task for grace periods. */ |
| 408 | wait_queue_head_t gp_wq; /* Where GP task waits. */ | 407 | wait_queue_head_t gp_wq; /* Where GP task waits. */ |
| 409 | int gp_flags; /* Commands for GP task. */ | 408 | short gp_flags; /* Commands for GP task. */ |
| 409 | short gp_state; /* GP kthread sleep state. */ | ||
| 410 | 410 | ||
| 411 | /* End of fields guarded by root rcu_node's lock. */ | 411 | /* End of fields guarded by root rcu_node's lock. */ |
| 412 | 412 | ||
| @@ -462,13 +462,17 @@ struct rcu_state { | |||
| 462 | const char *name; /* Name of structure. */ | 462 | const char *name; /* Name of structure. */ |
| 463 | char abbr; /* Abbreviated name. */ | 463 | char abbr; /* Abbreviated name. */ |
| 464 | struct list_head flavors; /* List of RCU flavors. */ | 464 | struct list_head flavors; /* List of RCU flavors. */ |
| 465 | struct irq_work wakeup_work; /* Postponed wakeups */ | ||
| 466 | }; | 465 | }; |
| 467 | 466 | ||
| 468 | /* Values for rcu_state structure's gp_flags field. */ | 467 | /* Values for rcu_state structure's gp_flags field. */ |
| 469 | #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ | 468 | #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ |
| 470 | #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ | 469 | #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ |
| 471 | 470 | ||
| 471 | /* Values for rcu_state structure's gp_state field. */ | ||
| 472 | #define RCU_GP_WAIT_INIT 0 /* Initial state. */ | ||
| 473 | #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ | ||
| 474 | #define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */ | ||
| 475 | |||
| 472 | extern struct list_head rcu_struct_flavors; | 476 | extern struct list_head rcu_struct_flavors; |
| 473 | 477 | ||
| 474 | /* Sequence through rcu_state structures for each RCU flavor. */ | 478 | /* Sequence through rcu_state structures for each RCU flavor. */ |
| @@ -547,7 +551,6 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | |||
| 547 | static void print_cpu_stall_info_end(void); | 551 | static void print_cpu_stall_info_end(void); |
| 548 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); | 552 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); |
| 549 | static void increment_cpu_stall_ticks(void); | 553 | static void increment_cpu_stall_ticks(void); |
| 550 | static int rcu_nocb_needs_gp(struct rcu_state *rsp); | ||
| 551 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); | 554 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); |
| 552 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); | 555 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); |
| 553 | static void rcu_init_one_nocb(struct rcu_node *rnp); | 556 | static void rcu_init_one_nocb(struct rcu_node *rnp); |
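
The new ->gp_state field and RCU_GP_WAIT_* values in the tree.h hunk above let the grace-period kthread record which of its two sleeps it is currently blocked in, which the tree.c hunks write just before each wait_event call. A small compilable sketch of that record-before-blocking idea follows; the names are illustrative, not the kernel's.

/* Sketch of the ->gp_state idea: a worker records which wait it is about
 * to block in, so a stall report or debugger can say where it is stuck. */
#include <stdio.h>

enum gp_wait_state {
    GP_WAIT_INIT = 0,    /* not yet waiting */
    GP_WAIT_GPS  = 1,    /* waiting for a grace-period start request */
    GP_WAIT_FQS  = 2,    /* waiting for the force-quiescent-state timeout */
};

static const char *const gp_state_names[] = { "init", "wait-gps", "wait-fqs" };

struct gp_worker {
    int gp_state;        /* last state recorded before blocking */
};

static void record_and_block(struct gp_worker *w, enum gp_wait_state s)
{
    w->gp_state = s;     /* publish the state first ... */
    /* ... then block; the kernel uses wait_event_interruptible() here. */
}

int main(void)
{
    struct gp_worker w = { .gp_state = GP_WAIT_INIT };

    record_and_block(&w, GP_WAIT_FQS);
    printf("worker is in state: %s\n", gp_state_names[w.gp_state]);
    return 0;
}
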
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 962d1d589929..cbc2c45265e2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -116,7 +116,7 @@ static void __init rcu_bootup_announce_oddness(void) | |||
| 116 | #ifdef CONFIG_TREE_PREEMPT_RCU | 116 | #ifdef CONFIG_TREE_PREEMPT_RCU |
| 117 | 117 | ||
| 118 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); | 118 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); |
| 119 | static struct rcu_state *rcu_state = &rcu_preempt_state; | 119 | static struct rcu_state *rcu_state_p = &rcu_preempt_state; |
| 120 | 120 | ||
| 121 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); | 121 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); |
| 122 | 122 | ||
| @@ -149,15 +149,6 @@ long rcu_batches_completed(void) | |||
| 149 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | 149 | EXPORT_SYMBOL_GPL(rcu_batches_completed); |
| 150 | 150 | ||
| 151 | /* | 151 | /* |
| 152 | * Force a quiescent state for preemptible RCU. | ||
| 153 | */ | ||
| 154 | void rcu_force_quiescent_state(void) | ||
| 155 | { | ||
| 156 | force_quiescent_state(&rcu_preempt_state); | ||
| 157 | } | ||
| 158 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | ||
| 159 | |||
| 160 | /* | ||
| 161 | * Record a preemptible-RCU quiescent state for the specified CPU. Note | 152 | * Record a preemptible-RCU quiescent state for the specified CPU. Note |
| 162 | * that this just means that the task currently running on the CPU is | 153 | * that this just means that the task currently running on the CPU is |
| 163 | * not in a quiescent state. There might be any number of tasks blocked | 154 | * not in a quiescent state. There might be any number of tasks blocked |
| @@ -688,20 +679,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
| 688 | } | 679 | } |
| 689 | EXPORT_SYMBOL_GPL(call_rcu); | 680 | EXPORT_SYMBOL_GPL(call_rcu); |
| 690 | 681 | ||
| 691 | /* | ||
| 692 | * Queue an RCU callback for lazy invocation after a grace period. | ||
| 693 | * This will likely be later named something like "call_rcu_lazy()", | ||
| 694 | * but this change will require some way of tagging the lazy RCU | ||
| 695 | * callbacks in the list of pending callbacks. Until then, this | ||
| 696 | * function may only be called from __kfree_rcu(). | ||
| 697 | */ | ||
| 698 | void kfree_call_rcu(struct rcu_head *head, | ||
| 699 | void (*func)(struct rcu_head *rcu)) | ||
| 700 | { | ||
| 701 | __call_rcu(head, func, &rcu_preempt_state, -1, 1); | ||
| 702 | } | ||
| 703 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | ||
| 704 | |||
| 705 | /** | 682 | /** |
| 706 | * synchronize_rcu - wait until a grace period has elapsed. | 683 | * synchronize_rcu - wait until a grace period has elapsed. |
| 707 | * | 684 | * |
| @@ -970,7 +947,7 @@ void exit_rcu(void) | |||
| 970 | 947 | ||
| 971 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 948 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
| 972 | 949 | ||
| 973 | static struct rcu_state *rcu_state = &rcu_sched_state; | 950 | static struct rcu_state *rcu_state_p = &rcu_sched_state; |
| 974 | 951 | ||
| 975 | /* | 952 | /* |
| 976 | * Tell them what RCU they are running. | 953 | * Tell them what RCU they are running. |
| @@ -991,16 +968,6 @@ long rcu_batches_completed(void) | |||
| 991 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | 968 | EXPORT_SYMBOL_GPL(rcu_batches_completed); |
| 992 | 969 | ||
| 993 | /* | 970 | /* |
| 994 | * Force a quiescent state for RCU, which, because there is no preemptible | ||
| 995 | * RCU, becomes the same as rcu-sched. | ||
| 996 | */ | ||
| 997 | void rcu_force_quiescent_state(void) | ||
| 998 | { | ||
| 999 | rcu_sched_force_quiescent_state(); | ||
| 1000 | } | ||
| 1001 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | ||
| 1002 | |||
| 1003 | /* | ||
| 1004 | * Because preemptible RCU does not exist, we never have to check for | 971 | * Because preemptible RCU does not exist, we never have to check for |
| 1005 | * CPUs being in quiescent states. | 972 | * CPUs being in quiescent states. |
| 1006 | */ | 973 | */ |
| @@ -1080,22 +1047,6 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
| 1080 | } | 1047 | } |
| 1081 | 1048 | ||
| 1082 | /* | 1049 | /* |
| 1083 | * Queue an RCU callback for lazy invocation after a grace period. | ||
| 1084 | * This will likely be later named something like "call_rcu_lazy()", | ||
| 1085 | * but this change will require some way of tagging the lazy RCU | ||
| 1086 | * callbacks in the list of pending callbacks. Until then, this | ||
| 1087 | * function may only be called from __kfree_rcu(). | ||
| 1088 | * | ||
| 1089 | * Because there is no preemptible RCU, we use RCU-sched instead. | ||
| 1090 | */ | ||
| 1091 | void kfree_call_rcu(struct rcu_head *head, | ||
| 1092 | void (*func)(struct rcu_head *rcu)) | ||
| 1093 | { | ||
| 1094 | __call_rcu(head, func, &rcu_sched_state, -1, 1); | ||
| 1095 | } | ||
| 1096 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | ||
| 1097 | |||
| 1098 | /* | ||
| 1099 | * Wait for an rcu-preempt grace period, but make it happen quickly. | 1050 | * Wait for an rcu-preempt grace period, but make it happen quickly. |
| 1100 | * But because preemptible RCU does not exist, map to rcu-sched. | 1051 | * But because preemptible RCU does not exist, map to rcu-sched. |
| 1101 | */ | 1052 | */ |
| @@ -1517,11 +1468,11 @@ static int __init rcu_spawn_kthreads(void) | |||
| 1517 | for_each_possible_cpu(cpu) | 1468 | for_each_possible_cpu(cpu) |
| 1518 | per_cpu(rcu_cpu_has_work, cpu) = 0; | 1469 | per_cpu(rcu_cpu_has_work, cpu) = 0; |
| 1519 | BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); | 1470 | BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); |
| 1520 | rnp = rcu_get_root(rcu_state); | 1471 | rnp = rcu_get_root(rcu_state_p); |
| 1521 | (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); | 1472 | (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); |
| 1522 | if (NUM_RCU_NODES > 1) { | 1473 | if (NUM_RCU_NODES > 1) { |
| 1523 | rcu_for_each_leaf_node(rcu_state, rnp) | 1474 | rcu_for_each_leaf_node(rcu_state_p, rnp) |
| 1524 | (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); | 1475 | (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); |
| 1525 | } | 1476 | } |
| 1526 | return 0; | 1477 | return 0; |
| 1527 | } | 1478 | } |
| @@ -1529,12 +1480,12 @@ early_initcall(rcu_spawn_kthreads); | |||
| 1529 | 1480 | ||
| 1530 | static void rcu_prepare_kthreads(int cpu) | 1481 | static void rcu_prepare_kthreads(int cpu) |
| 1531 | { | 1482 | { |
| 1532 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | 1483 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); |
| 1533 | struct rcu_node *rnp = rdp->mynode; | 1484 | struct rcu_node *rnp = rdp->mynode; |
| 1534 | 1485 | ||
| 1535 | /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ | 1486 | /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ |
| 1536 | if (rcu_scheduler_fully_active) | 1487 | if (rcu_scheduler_fully_active) |
| 1537 | (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); | 1488 | (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); |
| 1538 | } | 1489 | } |
| 1539 | 1490 | ||
| 1540 | #else /* #ifdef CONFIG_RCU_BOOST */ | 1491 | #else /* #ifdef CONFIG_RCU_BOOST */ |
| @@ -1744,6 +1695,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) | |||
| 1744 | static void rcu_prepare_for_idle(int cpu) | 1695 | static void rcu_prepare_for_idle(int cpu) |
| 1745 | { | 1696 | { |
| 1746 | #ifndef CONFIG_RCU_NOCB_CPU_ALL | 1697 | #ifndef CONFIG_RCU_NOCB_CPU_ALL |
| 1698 | bool needwake; | ||
| 1747 | struct rcu_data *rdp; | 1699 | struct rcu_data *rdp; |
| 1748 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | 1700 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
| 1749 | struct rcu_node *rnp; | 1701 | struct rcu_node *rnp; |
| @@ -1792,8 +1744,10 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 1792 | rnp = rdp->mynode; | 1744 | rnp = rdp->mynode; |
| 1793 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 1745 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
| 1794 | smp_mb__after_unlock_lock(); | 1746 | smp_mb__after_unlock_lock(); |
| 1795 | rcu_accelerate_cbs(rsp, rnp, rdp); | 1747 | needwake = rcu_accelerate_cbs(rsp, rnp, rdp); |
| 1796 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1748 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| 1749 | if (needwake) | ||
| 1750 | rcu_gp_kthread_wake(rsp); | ||
| 1797 | } | 1751 | } |
| 1798 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ | 1752 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ |
| 1799 | } | 1753 | } |
| @@ -1855,7 +1809,7 @@ static void rcu_oom_notify_cpu(void *unused) | |||
| 1855 | struct rcu_data *rdp; | 1809 | struct rcu_data *rdp; |
| 1856 | 1810 | ||
| 1857 | for_each_rcu_flavor(rsp) { | 1811 | for_each_rcu_flavor(rsp) { |
| 1858 | rdp = __this_cpu_ptr(rsp->rda); | 1812 | rdp = raw_cpu_ptr(rsp->rda); |
| 1859 | if (rdp->qlen_lazy != 0) { | 1813 | if (rdp->qlen_lazy != 0) { |
| 1860 | atomic_inc(&oom_callback_count); | 1814 | atomic_inc(&oom_callback_count); |
| 1861 | rsp->call(&rdp->oom_head, rcu_oom_callback); | 1815 | rsp->call(&rdp->oom_head, rcu_oom_callback); |
| @@ -1997,7 +1951,7 @@ static void increment_cpu_stall_ticks(void) | |||
| 1997 | struct rcu_state *rsp; | 1951 | struct rcu_state *rsp; |
| 1998 | 1952 | ||
| 1999 | for_each_rcu_flavor(rsp) | 1953 | for_each_rcu_flavor(rsp) |
| 2000 | __this_cpu_ptr(rsp->rda)->ticks_this_gp++; | 1954 | raw_cpu_inc(rsp->rda->ticks_this_gp); |
| 2001 | } | 1955 | } |
| 2002 | 1956 | ||
| 2003 | #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | 1957 | #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ |
| @@ -2068,19 +2022,6 @@ static int __init parse_rcu_nocb_poll(char *arg) | |||
| 2068 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); | 2022 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); |
| 2069 | 2023 | ||
| 2070 | /* | 2024 | /* |
| 2071 | * Do any no-CBs CPUs need another grace period? | ||
| 2072 | * | ||
| 2073 | * Interrupts must be disabled. If the caller does not hold the root | ||
| 2074 | * rnp_node structure's ->lock, the results are advisory only. | ||
| 2075 | */ | ||
| 2076 | static int rcu_nocb_needs_gp(struct rcu_state *rsp) | ||
| 2077 | { | ||
| 2078 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
| 2079 | |||
| 2080 | return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; | ||
| 2081 | } | ||
| 2082 | |||
| 2083 | /* | ||
| 2084 | * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended | 2025 | * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended |
| 2085 | * grace period. | 2026 | * grace period. |
| 2086 | */ | 2027 | */ |
| @@ -2109,7 +2050,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) | |||
| 2109 | } | 2050 | } |
| 2110 | 2051 | ||
| 2111 | #ifndef CONFIG_RCU_NOCB_CPU_ALL | 2052 | #ifndef CONFIG_RCU_NOCB_CPU_ALL |
| 2112 | /* Is the specified CPU a no-CPUs CPU? */ | 2053 | /* Is the specified CPU a no-CBs CPU? */ |
| 2113 | bool rcu_is_nocb_cpu(int cpu) | 2054 | bool rcu_is_nocb_cpu(int cpu) |
| 2114 | { | 2055 | { |
| 2115 | if (have_rcu_nocb_mask) | 2056 | if (have_rcu_nocb_mask) |
| @@ -2243,12 +2184,15 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
| 2243 | unsigned long c; | 2184 | unsigned long c; |
| 2244 | bool d; | 2185 | bool d; |
| 2245 | unsigned long flags; | 2186 | unsigned long flags; |
| 2187 | bool needwake; | ||
| 2246 | struct rcu_node *rnp = rdp->mynode; | 2188 | struct rcu_node *rnp = rdp->mynode; |
| 2247 | 2189 | ||
| 2248 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2190 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 2249 | smp_mb__after_unlock_lock(); | 2191 | smp_mb__after_unlock_lock(); |
| 2250 | c = rcu_start_future_gp(rnp, rdp); | 2192 | needwake = rcu_start_future_gp(rnp, rdp, &c); |
| 2251 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2193 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 2194 | if (needwake) | ||
| 2195 | rcu_gp_kthread_wake(rdp->rsp); | ||
| 2252 | 2196 | ||
| 2253 | /* | 2197 | /* |
| 2254 | * Wait for the grace period. Do so interruptibly to avoid messing | 2198 | * Wait for the grace period. Do so interruptibly to avoid messing |
| @@ -2402,11 +2346,6 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) | |||
| 2402 | 2346 | ||
| 2403 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | 2347 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ |
| 2404 | 2348 | ||
| 2405 | static int rcu_nocb_needs_gp(struct rcu_state *rsp) | ||
| 2406 | { | ||
| 2407 | return 0; | ||
| 2408 | } | ||
| 2409 | |||
| 2410 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | 2349 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) |
| 2411 | { | 2350 | { |
| 2412 | } | 2351 | } |
| @@ -2523,9 +2462,9 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | |||
| 2523 | /* Record start of fully idle period. */ | 2462 | /* Record start of fully idle period. */ |
| 2524 | j = jiffies; | 2463 | j = jiffies; |
| 2525 | ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; | 2464 | ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; |
| 2526 | smp_mb__before_atomic_inc(); | 2465 | smp_mb__before_atomic(); |
| 2527 | atomic_inc(&rdtp->dynticks_idle); | 2466 | atomic_inc(&rdtp->dynticks_idle); |
| 2528 | smp_mb__after_atomic_inc(); | 2467 | smp_mb__after_atomic(); |
| 2529 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); | 2468 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); |
| 2530 | } | 2469 | } |
| 2531 | 2470 | ||
| @@ -2590,9 +2529,9 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | |||
| 2590 | } | 2529 | } |
| 2591 | 2530 | ||
| 2592 | /* Record end of idle period. */ | 2531 | /* Record end of idle period. */ |
| 2593 | smp_mb__before_atomic_inc(); | 2532 | smp_mb__before_atomic(); |
| 2594 | atomic_inc(&rdtp->dynticks_idle); | 2533 | atomic_inc(&rdtp->dynticks_idle); |
| 2595 | smp_mb__after_atomic_inc(); | 2534 | smp_mb__after_atomic(); |
| 2596 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); | 2535 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); |
| 2597 | 2536 | ||
| 2598 | /* | 2537 | /* |
| @@ -2657,20 +2596,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp) | |||
| 2657 | } | 2596 | } |
| 2658 | 2597 | ||
| 2659 | /* | 2598 | /* |
| 2660 | * Bind the grace-period kthread for the sysidle flavor of RCU to the | ||
| 2661 | * timekeeping CPU. | ||
| 2662 | */ | ||
| 2663 | static void rcu_bind_gp_kthread(void) | ||
| 2664 | { | ||
| 2665 | int cpu = ACCESS_ONCE(tick_do_timer_cpu); | ||
| 2666 | |||
| 2667 | if (cpu < 0 || cpu >= nr_cpu_ids) | ||
| 2668 | return; | ||
| 2669 | if (raw_smp_processor_id() != cpu) | ||
| 2670 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
| 2671 | } | ||
| 2672 | |||
| 2673 | /* | ||
| 2674 | * Return a delay in jiffies based on the number of CPUs, rcu_node | 2599 | * Return a delay in jiffies based on the number of CPUs, rcu_node |
| 2675 | * leaf fanout, and jiffies tick rate. The idea is to allow larger | 2600 | * leaf fanout, and jiffies tick rate. The idea is to allow larger |
| 2676 | * systems more time to transition to full-idle state in order to | 2601 | * systems more time to transition to full-idle state in order to |
| @@ -2734,7 +2659,8 @@ static void rcu_sysidle(unsigned long j) | |||
| 2734 | static void rcu_sysidle_cancel(void) | 2659 | static void rcu_sysidle_cancel(void) |
| 2735 | { | 2660 | { |
| 2736 | smp_mb(); | 2661 | smp_mb(); |
| 2737 | ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; | 2662 | if (full_sysidle_state > RCU_SYSIDLE_SHORT) |
| 2663 | ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; | ||
| 2738 | } | 2664 | } |
| 2739 | 2665 | ||
| 2740 | /* | 2666 | /* |
| @@ -2880,10 +2806,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp) | |||
| 2880 | return false; | 2806 | return false; |
| 2881 | } | 2807 | } |
| 2882 | 2808 | ||
| 2883 | static void rcu_bind_gp_kthread(void) | ||
| 2884 | { | ||
| 2885 | } | ||
| 2886 | |||
| 2887 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | 2809 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, |
| 2888 | unsigned long maxj) | 2810 | unsigned long maxj) |
| 2889 | { | 2811 | { |
| @@ -2914,3 +2836,19 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) | |||
| 2914 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ | 2836 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ |
| 2915 | return 0; | 2837 | return 0; |
| 2916 | } | 2838 | } |
| 2839 | |||
| 2840 | /* | ||
| 2841 | * Bind the grace-period kthread for the sysidle flavor of RCU to the | ||
| 2842 | * timekeeping CPU. | ||
| 2843 | */ | ||
| 2844 | static void rcu_bind_gp_kthread(void) | ||
| 2845 | { | ||
| 2846 | #ifdef CONFIG_NO_HZ_FULL | ||
| 2847 | int cpu = ACCESS_ONCE(tick_do_timer_cpu); | ||
| 2848 | |||
| 2849 | if (cpu < 0 || cpu >= nr_cpu_ids) | ||
| 2850 | return; | ||
| 2851 | if (raw_smp_processor_id() != cpu) | ||
| 2852 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
| 2853 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ | ||
| 2854 | } | ||
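
The relocated rcu_bind_gp_kthread() above pins the grace-period kthread to the timekeeping CPU whenever CONFIG_NO_HZ_FULL is set, rather than only in the sysidle configuration. A rough userspace analogue of that binding is sketched below using sched_setaffinity(); bind_to_cpu() and the choice of CPU 0 are illustrative, and the range and already-there checks mirror the kernel function.

/*
 * Userspace analogue of rcu_bind_gp_kthread(): pin the calling thread to
 * one chosen housekeeping CPU so it stays off CPUs meant to run tick-free.
 * Assumes Linux + glibc.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

static int bind_to_cpu(int target_cpu)
{
    cpu_set_t set;

    if (target_cpu < 0 || target_cpu >= (int)sysconf(_SC_NPROCESSORS_CONF))
        return -1;                       /* out-of-range CPU, do nothing */
    if (sched_getcpu() == target_cpu)
        return 0;                        /* already there, nothing to do */

    CPU_ZERO(&set);
    CPU_SET(target_cpu, &set);
    return sched_setaffinity(0, sizeof(set), &set);  /* 0 == this thread */
}

int main(void)
{
    if (bind_to_cpu(0) == 0)
        printf("now running on CPU %d\n", sched_getcpu());
    return 0;
}
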
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4c0a9b0af469..a2aeb4df0f60 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
| @@ -320,6 +320,18 @@ int rcu_jiffies_till_stall_check(void) | |||
| 320 | return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; | 320 | return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; |
| 321 | } | 321 | } |
| 322 | 322 | ||
| 323 | void rcu_sysrq_start(void) | ||
| 324 | { | ||
| 325 | if (!rcu_cpu_stall_suppress) | ||
| 326 | rcu_cpu_stall_suppress = 2; | ||
| 327 | } | ||
| 328 | |||
| 329 | void rcu_sysrq_end(void) | ||
| 330 | { | ||
| 331 | if (rcu_cpu_stall_suppress == 2) | ||
| 332 | rcu_cpu_stall_suppress = 0; | ||
| 333 | } | ||
| 334 | |||
| 323 | static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) | 335 | static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) |
| 324 | { | 336 | { |
| 325 | rcu_cpu_stall_suppress = 1; | 337 | rcu_cpu_stall_suppress = 1; |
| @@ -338,3 +350,21 @@ static int __init check_cpu_stall_init(void) | |||
| 338 | early_initcall(check_cpu_stall_init); | 350 | early_initcall(check_cpu_stall_init); |
| 339 | 351 | ||
| 340 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ | 352 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ |
| 353 | |||
| 354 | /* | ||
| 355 | * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings. | ||
| 356 | */ | ||
| 357 | |||
| 358 | DEFINE_PER_CPU(int, rcu_cond_resched_count); | ||
| 359 | |||
| 360 | /* | ||
| 361 | * Report a set of RCU quiescent states, for use by cond_resched() | ||
| 362 | * and friends. Out of line due to being called infrequently. | ||
| 363 | */ | ||
| 364 | void rcu_resched(void) | ||
| 365 | { | ||
| 366 | preempt_disable(); | ||
| 367 | __this_cpu_write(rcu_cond_resched_count, 0); | ||
| 368 | rcu_note_context_switch(smp_processor_id()); | ||
| 369 | preempt_enable(); | ||
| 370 | } | ||
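
The rcu_sysrq_start()/rcu_sysrq_end() pair added above suppresses RCU CPU stall warnings around slow sysrq output, using 2 as a "temporarily suppressed" sentinel so the temporary path can never clobber an administrator's explicit suppression of 1. A minimal sketch of that sentinel idiom, with illustrative names:

/*
 * Suppression idiom: 0 = warnings on, 1 = user turned them off,
 * 2 = temporarily off.  The temporary path only toggles between 0 and 2,
 * so an explicit user setting always survives.
 */
#include <stdio.h>

static int stall_suppress;     /* 0, 1 (user), or 2 (temporary) */

static void noisy_op_start(void)
{
    if (!stall_suppress)
        stall_suppress = 2;    /* suppress only if not already suppressed */
}

static void noisy_op_end(void)
{
    if (stall_suppress == 2)
        stall_suppress = 0;    /* restore only our own suppression */
}

int main(void)
{
    stall_suppress = 1;        /* user turned warnings off explicitly */
    noisy_op_start();
    noisy_op_end();
    printf("suppress = %d\n", stall_suppress);  /* still 1: user setting wins */
    return 0;
}
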
diff --git a/kernel/reboot.c b/kernel/reboot.c index 662c83fc16b7..a3a9e240fcdb 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c | |||
| @@ -388,15 +388,22 @@ static int __init reboot_setup(char *str) | |||
| 388 | break; | 388 | break; |
| 389 | 389 | ||
| 390 | case 's': | 390 | case 's': |
| 391 | if (isdigit(*(str+1))) | 391 | { |
| 392 | reboot_cpu = simple_strtoul(str+1, NULL, 0); | 392 | int rc; |
| 393 | else if (str[1] == 'm' && str[2] == 'p' && | 393 | |
| 394 | isdigit(*(str+3))) | 394 | if (isdigit(*(str+1))) { |
| 395 | reboot_cpu = simple_strtoul(str+3, NULL, 0); | 395 | rc = kstrtoint(str+1, 0, &reboot_cpu); |
| 396 | else | 396 | if (rc) |
| 397 | return rc; | ||
| 398 | } else if (str[1] == 'm' && str[2] == 'p' && | ||
| 399 | isdigit(*(str+3))) { | ||
| 400 | rc = kstrtoint(str+3, 0, &reboot_cpu); | ||
| 401 | if (rc) | ||
| 402 | return rc; | ||
| 403 | } else | ||
| 397 | reboot_mode = REBOOT_SOFT; | 404 | reboot_mode = REBOOT_SOFT; |
| 398 | break; | 405 | break; |
| 399 | 406 | } | |
| 400 | case 'g': | 407 | case 'g': |
| 401 | reboot_mode = REBOOT_GPIO; | 408 | reboot_mode = REBOOT_GPIO; |
| 402 | break; | 409 | break; |
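
This hunk, like the res_counter.c one that follows, replaces simple_strtoul()-style parsing with kstrto*() calls that return an error on malformed input instead of silently accepting a numeric prefix. Below is a userspace analogue of that checked conversion; parse_int() is an illustrative wrapper around strtol(), not a kernel API.

/*
 * Userspace analogue of kstrtoint(): reject empty strings, trailing junk,
 * and out-of-range values instead of taking whatever prefix strtol() parses.
 */
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_int(const char *s, int base, int *out)
{
    char *end;
    long val;

    errno = 0;
    val = strtol(s, &end, base);
    if (end == s || *end != '\0')
        return -EINVAL;                 /* nothing parsed, or trailing junk */
    if (errno == ERANGE || val < INT_MIN || val > INT_MAX)
        return -ERANGE;                 /* out of range for int */
    *out = (int)val;
    return 0;
}

int main(void)
{
    int cpu;

    if (parse_int("3", 0, &cpu) == 0)
        printf("reboot cpu = %d\n", cpu);
    if (parse_int("3junk", 0, &cpu) != 0)
        printf("rejected malformed input\n");
    return 0;
}
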
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 51dbac6a3633..e791130f85a7 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
| @@ -186,8 +186,11 @@ int res_counter_memparse_write_strategy(const char *buf, | |||
| 186 | 186 | ||
| 187 | /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ | 187 | /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ |
| 188 | if (*buf == '-') { | 188 | if (*buf == '-') { |
| 189 | res = simple_strtoull(buf + 1, &end, 10); | 189 | int rc = kstrtoull(buf + 1, 10, &res); |
| 190 | if (res != 1 || *end != '\0') | 190 | |
| 191 | if (rc) | ||
| 192 | return rc; | ||
| 193 | if (res != 1) | ||
| 191 | return -EINVAL; | 194 | return -EINVAL; |
| 192 | *resp = RES_COUNTER_MAX; | 195 | *resp = RES_COUNTER_MAX; |
| 193 | return 0; | 196 | return 0; |
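Note: the explicit `*end != '\0'` test disappears because kstrtoull() already rejects trailing characters; only the "value must be exactly 1" check (so the full string was "-1") remains. Illustrative behaviour of the two parsers (not from the patch):

    char *end;
    unsigned long long v;
    int rc;

    rc = kstrtoull("1", 10, &v);          /* rc == 0, v == 1                    */
    rc = kstrtoull("1k", 10, &v);         /* rc == -EINVAL: trailing character  */
    v  = simple_strtoull("1k", &end, 10); /* v == 1, *end == 'k', no error code */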
diff --git a/kernel/resource.c b/kernel/resource.c index 8957d686e29b..3c2237ac32db 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -1288,13 +1288,10 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) | |||
| 1288 | if (p->flags & IORESOURCE_BUSY) | 1288 | if (p->flags & IORESOURCE_BUSY) |
| 1289 | continue; | 1289 | continue; |
| 1290 | 1290 | ||
| 1291 | printk(KERN_WARNING "resource map sanity check conflict: " | 1291 | printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n", |
| 1292 | "0x%llx 0x%llx 0x%llx 0x%llx %s\n", | ||
| 1293 | (unsigned long long)addr, | 1292 | (unsigned long long)addr, |
| 1294 | (unsigned long long)(addr + size - 1), | 1293 | (unsigned long long)(addr + size - 1), |
| 1295 | (unsigned long long)p->start, | 1294 | p->name, p); |
| 1296 | (unsigned long long)p->end, | ||
| 1297 | p->name); | ||
| 1298 | err = -1; | 1295 | err = -1; |
| 1299 | break; | 1296 | break; |
| 1300 | } | 1297 | } |
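Note: the rewritten warning leans on the kernel's %pR printk extension, which formats a struct resource (range plus type and flags) with one specifier, so the four hand-rolled %llx arguments and the separate name argument can be dropped. Illustrative use (not from this patch):

    #include <linux/ioport.h>
    #include <linux/printk.h>

    static void example_report(const struct resource *res)
    {
        /* Typical output: "example: can't use [mem 0xfed00000-0xfed003ff]" */
        pr_warn("example: can't use %pR\n", res);
    }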
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 268a45ea238c..c6b98793d647 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -90,6 +90,22 @@ | |||
| 90 | #define CREATE_TRACE_POINTS | 90 | #define CREATE_TRACE_POINTS |
| 91 | #include <trace/events/sched.h> | 91 | #include <trace/events/sched.h> |
| 92 | 92 | ||
| 93 | #ifdef smp_mb__before_atomic | ||
| 94 | void __smp_mb__before_atomic(void) | ||
| 95 | { | ||
| 96 | smp_mb__before_atomic(); | ||
| 97 | } | ||
| 98 | EXPORT_SYMBOL(__smp_mb__before_atomic); | ||
| 99 | #endif | ||
| 100 | |||
| 101 | #ifdef smp_mb__after_atomic | ||
| 102 | void __smp_mb__after_atomic(void) | ||
| 103 | { | ||
| 104 | smp_mb__after_atomic(); | ||
| 105 | } | ||
| 106 | EXPORT_SYMBOL(__smp_mb__after_atomic); | ||
| 107 | #endif | ||
| 108 | |||
| 93 | void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) | 109 | void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) |
| 94 | { | 110 | { |
| 95 | unsigned long delta; | 111 | unsigned long delta; |
| @@ -506,6 +522,39 @@ static inline void init_hrtick(void) | |||
| 506 | #endif /* CONFIG_SCHED_HRTICK */ | 522 | #endif /* CONFIG_SCHED_HRTICK */ |
| 507 | 523 | ||
| 508 | /* | 524 | /* |
| 525 | * cmpxchg based fetch_or, macro so it works for different integer types | ||
| 526 | */ | ||
| 527 | #define fetch_or(ptr, val) \ | ||
| 528 | ({ typeof(*(ptr)) __old, __val = *(ptr); \ | ||
| 529 | for (;;) { \ | ||
| 530 | __old = cmpxchg((ptr), __val, __val | (val)); \ | ||
| 531 | if (__old == __val) \ | ||
| 532 | break; \ | ||
| 533 | __val = __old; \ | ||
| 534 | } \ | ||
| 535 | __old; \ | ||
| 536 | }) | ||
| 537 | |||
| 538 | #ifdef TIF_POLLING_NRFLAG | ||
| 539 | /* | ||
| 540 | * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, | ||
| 541 | * this avoids any races wrt polling state changes and thereby avoids | ||
| 542 | * spurious IPIs. | ||
| 543 | */ | ||
| 544 | static bool set_nr_and_not_polling(struct task_struct *p) | ||
| 545 | { | ||
| 546 | struct thread_info *ti = task_thread_info(p); | ||
| 547 | return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); | ||
| 548 | } | ||
| 549 | #else | ||
| 550 | static bool set_nr_and_not_polling(struct task_struct *p) | ||
| 551 | { | ||
| 552 | set_tsk_need_resched(p); | ||
| 553 | return true; | ||
| 554 | } | ||
| 555 | #endif | ||
| 556 | |||
| 557 | /* | ||
| 509 | * resched_task - mark a task 'to be rescheduled now'. | 558 | * resched_task - mark a task 'to be rescheduled now'. |
| 510 | * | 559 | * |
| 511 | * On UP this means the setting of the need_resched flag, on SMP it | 560 | * On UP this means the setting of the need_resched flag, on SMP it |
| @@ -521,17 +570,15 @@ void resched_task(struct task_struct *p) | |||
| 521 | if (test_tsk_need_resched(p)) | 570 | if (test_tsk_need_resched(p)) |
| 522 | return; | 571 | return; |
| 523 | 572 | ||
| 524 | set_tsk_need_resched(p); | ||
| 525 | |||
| 526 | cpu = task_cpu(p); | 573 | cpu = task_cpu(p); |
| 574 | |||
| 527 | if (cpu == smp_processor_id()) { | 575 | if (cpu == smp_processor_id()) { |
| 576 | set_tsk_need_resched(p); | ||
| 528 | set_preempt_need_resched(); | 577 | set_preempt_need_resched(); |
| 529 | return; | 578 | return; |
| 530 | } | 579 | } |
| 531 | 580 | ||
| 532 | /* NEED_RESCHED must be visible before we test polling */ | 581 | if (set_nr_and_not_polling(p)) |
| 533 | smp_mb(); | ||
| 534 | if (!tsk_is_polling(p)) | ||
| 535 | smp_send_reschedule(cpu); | 582 | smp_send_reschedule(cpu); |
| 536 | } | 583 | } |
| 537 | 584 | ||
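Note: fetch_or() returns the flag word as it was before the OR, so set_nr_and_not_polling() learns in a single atomic RMW both that NEED_RESCHED is now set and whether the remote CPU was already polling that word — if it was, resched_task() skips the IPI. A userspace analogue with C11 atomics (illustrative; the bit values are made up):

    #include <stdatomic.h>
    #include <stdbool.h>

    #define TIF_NEED_RESCHED   (1u << 0)    /* illustrative bit positions */
    #define TIF_POLLING_NRFLAG (1u << 1)

    /* Returns true when the caller still needs to send an IPI. */
    static bool set_need_resched_and_not_polling(atomic_uint *flags)
    {
        unsigned int old = atomic_fetch_or(flags, TIF_NEED_RESCHED);

        return !(old & TIF_POLLING_NRFLAG);
    }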
| @@ -1320,7 +1367,7 @@ out: | |||
| 1320 | * leave kernel. | 1367 | * leave kernel. |
| 1321 | */ | 1368 | */ |
| 1322 | if (p->mm && printk_ratelimit()) { | 1369 | if (p->mm && printk_ratelimit()) { |
| 1323 | printk_sched("process %d (%s) no longer affine to cpu%d\n", | 1370 | printk_deferred("process %d (%s) no longer affine to cpu%d\n", |
| 1324 | task_pid_nr(p), p->comm, cpu); | 1371 | task_pid_nr(p), p->comm, cpu); |
| 1325 | } | 1372 | } |
| 1326 | } | 1373 | } |
| @@ -2192,7 +2239,7 @@ static inline void post_schedule(struct rq *rq) | |||
| 2192 | * schedule_tail - first thing a freshly forked thread must call. | 2239 | * schedule_tail - first thing a freshly forked thread must call. |
| 2193 | * @prev: the thread we just switched away from. | 2240 | * @prev: the thread we just switched away from. |
| 2194 | */ | 2241 | */ |
| 2195 | asmlinkage void schedule_tail(struct task_struct *prev) | 2242 | asmlinkage __visible void schedule_tail(struct task_struct *prev) |
| 2196 | __releases(rq->lock) | 2243 | __releases(rq->lock) |
| 2197 | { | 2244 | { |
| 2198 | struct rq *rq = this_rq(); | 2245 | struct rq *rq = this_rq(); |
| @@ -2592,8 +2639,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev) | |||
| 2592 | if (likely(prev->sched_class == class && | 2639 | if (likely(prev->sched_class == class && |
| 2593 | rq->nr_running == rq->cfs.h_nr_running)) { | 2640 | rq->nr_running == rq->cfs.h_nr_running)) { |
| 2594 | p = fair_sched_class.pick_next_task(rq, prev); | 2641 | p = fair_sched_class.pick_next_task(rq, prev); |
| 2595 | if (likely(p && p != RETRY_TASK)) | 2642 | if (unlikely(p == RETRY_TASK)) |
| 2596 | return p; | 2643 | goto again; |
| 2644 | |||
| 2645 | /* assumes fair_sched_class->next == idle_sched_class */ | ||
| 2646 | if (unlikely(!p)) | ||
| 2647 | p = idle_sched_class.pick_next_task(rq, prev); | ||
| 2648 | |||
| 2649 | return p; | ||
| 2597 | } | 2650 | } |
| 2598 | 2651 | ||
| 2599 | again: | 2652 | again: |
| @@ -2741,7 +2794,7 @@ static inline void sched_submit_work(struct task_struct *tsk) | |||
| 2741 | blk_schedule_flush_plug(tsk); | 2794 | blk_schedule_flush_plug(tsk); |
| 2742 | } | 2795 | } |
| 2743 | 2796 | ||
| 2744 | asmlinkage void __sched schedule(void) | 2797 | asmlinkage __visible void __sched schedule(void) |
| 2745 | { | 2798 | { |
| 2746 | struct task_struct *tsk = current; | 2799 | struct task_struct *tsk = current; |
| 2747 | 2800 | ||
| @@ -2751,7 +2804,7 @@ asmlinkage void __sched schedule(void) | |||
| 2751 | EXPORT_SYMBOL(schedule); | 2804 | EXPORT_SYMBOL(schedule); |
| 2752 | 2805 | ||
| 2753 | #ifdef CONFIG_CONTEXT_TRACKING | 2806 | #ifdef CONFIG_CONTEXT_TRACKING |
| 2754 | asmlinkage void __sched schedule_user(void) | 2807 | asmlinkage __visible void __sched schedule_user(void) |
| 2755 | { | 2808 | { |
| 2756 | /* | 2809 | /* |
| 2757 | * If we come here after a random call to set_need_resched(), | 2810 | * If we come here after a random call to set_need_resched(), |
| @@ -2783,7 +2836,7 @@ void __sched schedule_preempt_disabled(void) | |||
| 2783 | * off of preempt_enable. Kernel preemptions off return from interrupt | 2836 | * off of preempt_enable. Kernel preemptions off return from interrupt |
| 2784 | * occur there and call schedule directly. | 2837 | * occur there and call schedule directly. |
| 2785 | */ | 2838 | */ |
| 2786 | asmlinkage void __sched notrace preempt_schedule(void) | 2839 | asmlinkage __visible void __sched notrace preempt_schedule(void) |
| 2787 | { | 2840 | { |
| 2788 | /* | 2841 | /* |
| 2789 | * If there is a non-zero preempt_count or interrupts are disabled, | 2842 | * If there is a non-zero preempt_count or interrupts are disabled, |
| @@ -2813,7 +2866,7 @@ EXPORT_SYMBOL(preempt_schedule); | |||
| 2813 | * Note, that this is called and return with irqs disabled. This will | 2866 | * Note, that this is called and return with irqs disabled. This will |
| 2814 | * protect us against recursive calling from irq. | 2867 | * protect us against recursive calling from irq. |
| 2815 | */ | 2868 | */ |
| 2816 | asmlinkage void __sched preempt_schedule_irq(void) | 2869 | asmlinkage __visible void __sched preempt_schedule_irq(void) |
| 2817 | { | 2870 | { |
| 2818 | enum ctx_state prev_state; | 2871 | enum ctx_state prev_state; |
| 2819 | 2872 | ||
| @@ -2996,7 +3049,7 @@ EXPORT_SYMBOL(set_user_nice); | |||
| 2996 | int can_nice(const struct task_struct *p, const int nice) | 3049 | int can_nice(const struct task_struct *p, const int nice) |
| 2997 | { | 3050 | { |
| 2998 | /* convert nice value [19,-20] to rlimit style value [1,40] */ | 3051 | /* convert nice value [19,-20] to rlimit style value [1,40] */ |
| 2999 | int nice_rlim = 20 - nice; | 3052 | int nice_rlim = nice_to_rlimit(nice); |
| 3000 | 3053 | ||
| 3001 | return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || | 3054 | return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || |
| 3002 | capable(CAP_SYS_NICE)); | 3055 | capable(CAP_SYS_NICE)); |
| @@ -3020,17 +3073,10 @@ SYSCALL_DEFINE1(nice, int, increment) | |||
| 3020 | * We don't have to worry. Conceptually one call occurs first | 3073 | * We don't have to worry. Conceptually one call occurs first |
| 3021 | * and we have a single winner. | 3074 | * and we have a single winner. |
| 3022 | */ | 3075 | */ |
| 3023 | if (increment < -40) | 3076 | increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); |
| 3024 | increment = -40; | ||
| 3025 | if (increment > 40) | ||
| 3026 | increment = 40; | ||
| 3027 | |||
| 3028 | nice = task_nice(current) + increment; | 3077 | nice = task_nice(current) + increment; |
| 3029 | if (nice < MIN_NICE) | ||
| 3030 | nice = MIN_NICE; | ||
| 3031 | if (nice > MAX_NICE) | ||
| 3032 | nice = MAX_NICE; | ||
| 3033 | 3078 | ||
| 3079 | nice = clamp_val(nice, MIN_NICE, MAX_NICE); | ||
| 3034 | if (increment < 0 && !can_nice(current, nice)) | 3080 | if (increment < 0 && !can_nice(current, nice)) |
| 3035 | return -EPERM; | 3081 | return -EPERM; |
| 3036 | 3082 | ||
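Note: the two four-branch range checks collapse into clamp()/clamp_val() from linux/kernel.h; clamp() insists that all three arguments share a type, while clamp_val() coerces the bounds to the value's type, which is why the MIN_NICE/MAX_NICE line uses it. Equivalent sketch with the nice constants written out (assuming MIN_NICE=-20, MAX_NICE=19, NICE_WIDTH=40):

    static int example_new_nice(int cur_nice, int increment)
    {
        increment = clamp(increment, -40, 40);            /* -NICE_WIDTH..NICE_WIDTH */
        return clamp_val(cur_nice + increment, -20, 19);  /* MIN_NICE..MAX_NICE      */
    }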
| @@ -3124,6 +3170,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) | |||
| 3124 | dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); | 3170 | dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); |
| 3125 | dl_se->dl_throttled = 0; | 3171 | dl_se->dl_throttled = 0; |
| 3126 | dl_se->dl_new = 1; | 3172 | dl_se->dl_new = 1; |
| 3173 | dl_se->dl_yielded = 0; | ||
| 3127 | } | 3174 | } |
| 3128 | 3175 | ||
| 3129 | static void __setscheduler_params(struct task_struct *p, | 3176 | static void __setscheduler_params(struct task_struct *p, |
| @@ -3188,17 +3235,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr) | |||
| 3188 | * We ask for the deadline not being zero, and greater or equal | 3235 | * We ask for the deadline not being zero, and greater or equal |
| 3189 | * than the runtime, as well as the period of being zero or | 3236 | * than the runtime, as well as the period of being zero or |
| 3190 | * greater than deadline. Furthermore, we have to be sure that | 3237 | * greater than deadline. Furthermore, we have to be sure that |
| 3191 | * user parameters are above the internal resolution (1us); we | 3238 | * user parameters are above the internal resolution of 1us (we |
| 3192 | * check sched_runtime only since it is always the smaller one. | 3239 | * check sched_runtime only since it is always the smaller one) and |
| 3240 | * below 2^63 ns (we have to check both sched_deadline and | ||
| 3241 | * sched_period, as the latter can be zero). | ||
| 3193 | */ | 3242 | */ |
| 3194 | static bool | 3243 | static bool |
| 3195 | __checkparam_dl(const struct sched_attr *attr) | 3244 | __checkparam_dl(const struct sched_attr *attr) |
| 3196 | { | 3245 | { |
| 3197 | return attr && attr->sched_deadline != 0 && | 3246 | /* deadline != 0 */ |
| 3198 | (attr->sched_period == 0 || | 3247 | if (attr->sched_deadline == 0) |
| 3199 | (s64)(attr->sched_period - attr->sched_deadline) >= 0) && | 3248 | return false; |
| 3200 | (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && | 3249 | |
| 3201 | attr->sched_runtime >= (2 << (DL_SCALE - 1)); | 3250 | /* |
| 3251 | * Since we truncate DL_SCALE bits, make sure we're at least | ||
| 3252 | * that big. | ||
| 3253 | */ | ||
| 3254 | if (attr->sched_runtime < (1ULL << DL_SCALE)) | ||
| 3255 | return false; | ||
| 3256 | |||
| 3257 | /* | ||
| 3258 | * Since we use the MSB for wrap-around and sign issues, make | ||
| 3259 | * sure it's not set (mind that period can be equal to zero). | ||
| 3260 | */ | ||
| 3261 | if (attr->sched_deadline & (1ULL << 63) || | ||
| 3262 | attr->sched_period & (1ULL << 63)) | ||
| 3263 | return false; | ||
| 3264 | |||
| 3265 | /* runtime <= deadline <= period (if period != 0) */ | ||
| 3266 | if ((attr->sched_period != 0 && | ||
| 3267 | attr->sched_period < attr->sched_deadline) || | ||
| 3268 | attr->sched_deadline < attr->sched_runtime) | ||
| 3269 | return false; | ||
| 3270 | |||
| 3271 | return true; | ||
| 3202 | } | 3272 | } |
| 3203 | 3273 | ||
| 3204 | /* | 3274 | /* |
| @@ -3596,13 +3666,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr, | |||
| 3596 | */ | 3666 | */ |
| 3597 | attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); | 3667 | attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); |
| 3598 | 3668 | ||
| 3599 | out: | 3669 | return 0; |
| 3600 | return ret; | ||
| 3601 | 3670 | ||
| 3602 | err_size: | 3671 | err_size: |
| 3603 | put_user(sizeof(*attr), &uattr->size); | 3672 | put_user(sizeof(*attr), &uattr->size); |
| 3604 | ret = -E2BIG; | 3673 | return -E2BIG; |
| 3605 | goto out; | ||
| 3606 | } | 3674 | } |
| 3607 | 3675 | ||
| 3608 | /** | 3676 | /** |
| @@ -3639,6 +3707,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) | |||
| 3639 | * sys_sched_setattr - same as above, but with extended sched_attr | 3707 | * sys_sched_setattr - same as above, but with extended sched_attr |
| 3640 | * @pid: the pid in question. | 3708 | * @pid: the pid in question. |
| 3641 | * @uattr: structure containing the extended parameters. | 3709 | * @uattr: structure containing the extended parameters. |
| 3710 | * @flags: for future extension. | ||
| 3642 | */ | 3711 | */ |
| 3643 | SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, | 3712 | SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, |
| 3644 | unsigned int, flags) | 3713 | unsigned int, flags) |
| @@ -3650,8 +3719,12 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, | |||
| 3650 | if (!uattr || pid < 0 || flags) | 3719 | if (!uattr || pid < 0 || flags) |
| 3651 | return -EINVAL; | 3720 | return -EINVAL; |
| 3652 | 3721 | ||
| 3653 | if (sched_copy_attr(uattr, &attr)) | 3722 | retval = sched_copy_attr(uattr, &attr); |
| 3654 | return -EFAULT; | 3723 | if (retval) |
| 3724 | return retval; | ||
| 3725 | |||
| 3726 | if ((int)attr.sched_policy < 0) | ||
| 3727 | return -EINVAL; | ||
| 3655 | 3728 | ||
| 3656 | rcu_read_lock(); | 3729 | rcu_read_lock(); |
| 3657 | retval = -ESRCH; | 3730 | retval = -ESRCH; |
| @@ -3701,7 +3774,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | |||
| 3701 | */ | 3774 | */ |
| 3702 | SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) | 3775 | SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) |
| 3703 | { | 3776 | { |
| 3704 | struct sched_param lp; | 3777 | struct sched_param lp = { .sched_priority = 0 }; |
| 3705 | struct task_struct *p; | 3778 | struct task_struct *p; |
| 3706 | int retval; | 3779 | int retval; |
| 3707 | 3780 | ||
| @@ -3718,11 +3791,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) | |||
| 3718 | if (retval) | 3791 | if (retval) |
| 3719 | goto out_unlock; | 3792 | goto out_unlock; |
| 3720 | 3793 | ||
| 3721 | if (task_has_dl_policy(p)) { | 3794 | if (task_has_rt_policy(p)) |
| 3722 | retval = -EINVAL; | 3795 | lp.sched_priority = p->rt_priority; |
| 3723 | goto out_unlock; | ||
| 3724 | } | ||
| 3725 | lp.sched_priority = p->rt_priority; | ||
| 3726 | rcu_read_unlock(); | 3796 | rcu_read_unlock(); |
| 3727 | 3797 | ||
| 3728 | /* | 3798 | /* |
| @@ -3760,7 +3830,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, | |||
| 3760 | 3830 | ||
| 3761 | for (; addr < end; addr++) { | 3831 | for (; addr < end; addr++) { |
| 3762 | if (*addr) | 3832 | if (*addr) |
| 3763 | goto err_size; | 3833 | return -EFBIG; |
| 3764 | } | 3834 | } |
| 3765 | 3835 | ||
| 3766 | attr->size = usize; | 3836 | attr->size = usize; |
| @@ -3770,12 +3840,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, | |||
| 3770 | if (ret) | 3840 | if (ret) |
| 3771 | return -EFAULT; | 3841 | return -EFAULT; |
| 3772 | 3842 | ||
| 3773 | out: | 3843 | return 0; |
| 3774 | return ret; | ||
| 3775 | |||
| 3776 | err_size: | ||
| 3777 | ret = -E2BIG; | ||
| 3778 | goto out; | ||
| 3779 | } | 3844 | } |
| 3780 | 3845 | ||
| 3781 | /** | 3846 | /** |
| @@ -3783,6 +3848,7 @@ err_size: | |||
| 3783 | * @pid: the pid in question. | 3848 | * @pid: the pid in question. |
| 3784 | * @uattr: structure containing the extended parameters. | 3849 | * @uattr: structure containing the extended parameters. |
| 3785 | * @size: sizeof(attr) for fwd/bwd comp. | 3850 | * @size: sizeof(attr) for fwd/bwd comp. |
| 3851 | * @flags: for future extension. | ||
| 3786 | */ | 3852 | */ |
| 3787 | SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, | 3853 | SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, |
| 3788 | unsigned int, size, unsigned int, flags) | 3854 | unsigned int, size, unsigned int, flags) |
| @@ -4051,6 +4117,7 @@ static void __cond_resched(void) | |||
| 4051 | 4117 | ||
| 4052 | int __sched _cond_resched(void) | 4118 | int __sched _cond_resched(void) |
| 4053 | { | 4119 | { |
| 4120 | rcu_cond_resched(); | ||
| 4054 | if (should_resched()) { | 4121 | if (should_resched()) { |
| 4055 | __cond_resched(); | 4122 | __cond_resched(); |
| 4056 | return 1; | 4123 | return 1; |
| @@ -4069,15 +4136,18 @@ EXPORT_SYMBOL(_cond_resched); | |||
| 4069 | */ | 4136 | */ |
| 4070 | int __cond_resched_lock(spinlock_t *lock) | 4137 | int __cond_resched_lock(spinlock_t *lock) |
| 4071 | { | 4138 | { |
| 4139 | bool need_rcu_resched = rcu_should_resched(); | ||
| 4072 | int resched = should_resched(); | 4140 | int resched = should_resched(); |
| 4073 | int ret = 0; | 4141 | int ret = 0; |
| 4074 | 4142 | ||
| 4075 | lockdep_assert_held(lock); | 4143 | lockdep_assert_held(lock); |
| 4076 | 4144 | ||
| 4077 | if (spin_needbreak(lock) || resched) { | 4145 | if (spin_needbreak(lock) || resched || need_rcu_resched) { |
| 4078 | spin_unlock(lock); | 4146 | spin_unlock(lock); |
| 4079 | if (resched) | 4147 | if (resched) |
| 4080 | __cond_resched(); | 4148 | __cond_resched(); |
| 4149 | else if (unlikely(need_rcu_resched)) | ||
| 4150 | rcu_resched(); | ||
| 4081 | else | 4151 | else |
| 4082 | cpu_relax(); | 4152 | cpu_relax(); |
| 4083 | ret = 1; | 4153 | ret = 1; |
| @@ -4091,6 +4161,7 @@ int __sched __cond_resched_softirq(void) | |||
| 4091 | { | 4161 | { |
| 4092 | BUG_ON(!in_softirq()); | 4162 | BUG_ON(!in_softirq()); |
| 4093 | 4163 | ||
| 4164 | rcu_cond_resched(); /* BH disabled OK, just recording QSes. */ | ||
| 4094 | if (should_resched()) { | 4165 | if (should_resched()) { |
| 4095 | local_bh_enable(); | 4166 | local_bh_enable(); |
| 4096 | __cond_resched(); | 4167 | __cond_resched(); |
| @@ -5039,11 +5110,20 @@ static struct notifier_block migration_notifier = { | |||
| 5039 | .priority = CPU_PRI_MIGRATION, | 5110 | .priority = CPU_PRI_MIGRATION, |
| 5040 | }; | 5111 | }; |
| 5041 | 5112 | ||
| 5113 | static void __cpuinit set_cpu_rq_start_time(void) | ||
| 5114 | { | ||
| 5115 | int cpu = smp_processor_id(); | ||
| 5116 | struct rq *rq = cpu_rq(cpu); | ||
| 5117 | rq->age_stamp = sched_clock_cpu(cpu); | ||
| 5118 | } | ||
| 5119 | |||
| 5042 | static int sched_cpu_active(struct notifier_block *nfb, | 5120 | static int sched_cpu_active(struct notifier_block *nfb, |
| 5043 | unsigned long action, void *hcpu) | 5121 | unsigned long action, void *hcpu) |
| 5044 | { | 5122 | { |
| 5045 | switch (action & ~CPU_TASKS_FROZEN) { | 5123 | switch (action & ~CPU_TASKS_FROZEN) { |
| 5046 | case CPU_STARTING: | 5124 | case CPU_STARTING: |
| 5125 | set_cpu_rq_start_time(); | ||
| 5126 | return NOTIFY_OK; | ||
| 5047 | case CPU_DOWN_FAILED: | 5127 | case CPU_DOWN_FAILED: |
| 5048 | set_cpu_active((long)hcpu, true); | 5128 | set_cpu_active((long)hcpu, true); |
| 5049 | return NOTIFY_OK; | 5129 | return NOTIFY_OK; |
| @@ -5252,7 +5332,8 @@ static int sd_degenerate(struct sched_domain *sd) | |||
| 5252 | SD_BALANCE_FORK | | 5332 | SD_BALANCE_FORK | |
| 5253 | SD_BALANCE_EXEC | | 5333 | SD_BALANCE_EXEC | |
| 5254 | SD_SHARE_CPUPOWER | | 5334 | SD_SHARE_CPUPOWER | |
| 5255 | SD_SHARE_PKG_RESOURCES)) { | 5335 | SD_SHARE_PKG_RESOURCES | |
| 5336 | SD_SHARE_POWERDOMAIN)) { | ||
| 5256 | if (sd->groups != sd->groups->next) | 5337 | if (sd->groups != sd->groups->next) |
| 5257 | return 0; | 5338 | return 0; |
| 5258 | } | 5339 | } |
| @@ -5283,7 +5364,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
| 5283 | SD_BALANCE_EXEC | | 5364 | SD_BALANCE_EXEC | |
| 5284 | SD_SHARE_CPUPOWER | | 5365 | SD_SHARE_CPUPOWER | |
| 5285 | SD_SHARE_PKG_RESOURCES | | 5366 | SD_SHARE_PKG_RESOURCES | |
| 5286 | SD_PREFER_SIBLING); | 5367 | SD_PREFER_SIBLING | |
| 5368 | SD_SHARE_POWERDOMAIN); | ||
| 5287 | if (nr_node_ids == 1) | 5369 | if (nr_node_ids == 1) |
| 5288 | pflags &= ~SD_SERIALIZE; | 5370 | pflags &= ~SD_SERIALIZE; |
| 5289 | } | 5371 | } |
| @@ -5557,17 +5639,6 @@ static int __init isolated_cpu_setup(char *str) | |||
| 5557 | 5639 | ||
| 5558 | __setup("isolcpus=", isolated_cpu_setup); | 5640 | __setup("isolcpus=", isolated_cpu_setup); |
| 5559 | 5641 | ||
| 5560 | static const struct cpumask *cpu_cpu_mask(int cpu) | ||
| 5561 | { | ||
| 5562 | return cpumask_of_node(cpu_to_node(cpu)); | ||
| 5563 | } | ||
| 5564 | |||
| 5565 | struct sd_data { | ||
| 5566 | struct sched_domain **__percpu sd; | ||
| 5567 | struct sched_group **__percpu sg; | ||
| 5568 | struct sched_group_power **__percpu sgp; | ||
| 5569 | }; | ||
| 5570 | |||
| 5571 | struct s_data { | 5642 | struct s_data { |
| 5572 | struct sched_domain ** __percpu sd; | 5643 | struct sched_domain ** __percpu sd; |
| 5573 | struct root_domain *rd; | 5644 | struct root_domain *rd; |
| @@ -5580,21 +5651,6 @@ enum s_alloc { | |||
| 5580 | sa_none, | 5651 | sa_none, |
| 5581 | }; | 5652 | }; |
| 5582 | 5653 | ||
| 5583 | struct sched_domain_topology_level; | ||
| 5584 | |||
| 5585 | typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); | ||
| 5586 | typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); | ||
| 5587 | |||
| 5588 | #define SDTL_OVERLAP 0x01 | ||
| 5589 | |||
| 5590 | struct sched_domain_topology_level { | ||
| 5591 | sched_domain_init_f init; | ||
| 5592 | sched_domain_mask_f mask; | ||
| 5593 | int flags; | ||
| 5594 | int numa_level; | ||
| 5595 | struct sd_data data; | ||
| 5596 | }; | ||
| 5597 | |||
| 5598 | /* | 5654 | /* |
| 5599 | * Build an iteration mask that can exclude certain CPUs from the upwards | 5655 | * Build an iteration mask that can exclude certain CPUs from the upwards |
| 5600 | * domain traversal. | 5656 | * domain traversal. |
| @@ -5762,8 +5818,6 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5762 | continue; | 5818 | continue; |
| 5763 | 5819 | ||
| 5764 | group = get_group(i, sdd, &sg); | 5820 | group = get_group(i, sdd, &sg); |
| 5765 | cpumask_clear(sched_group_cpus(sg)); | ||
| 5766 | sg->sgp->power = 0; | ||
| 5767 | cpumask_setall(sched_group_mask(sg)); | 5821 | cpumask_setall(sched_group_mask(sg)); |
| 5768 | 5822 | ||
| 5769 | for_each_cpu(j, span) { | 5823 | for_each_cpu(j, span) { |
| @@ -5813,44 +5867,11 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
| 5813 | atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); | 5867 | atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); |
| 5814 | } | 5868 | } |
| 5815 | 5869 | ||
| 5816 | int __weak arch_sd_sibling_asym_packing(void) | ||
| 5817 | { | ||
| 5818 | return 0*SD_ASYM_PACKING; | ||
| 5819 | } | ||
| 5820 | |||
| 5821 | /* | 5870 | /* |
| 5822 | * Initializers for schedule domains | 5871 | * Initializers for schedule domains |
| 5823 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() | 5872 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() |
| 5824 | */ | 5873 | */ |
| 5825 | 5874 | ||
| 5826 | #ifdef CONFIG_SCHED_DEBUG | ||
| 5827 | # define SD_INIT_NAME(sd, type) sd->name = #type | ||
| 5828 | #else | ||
| 5829 | # define SD_INIT_NAME(sd, type) do { } while (0) | ||
| 5830 | #endif | ||
| 5831 | |||
| 5832 | #define SD_INIT_FUNC(type) \ | ||
| 5833 | static noinline struct sched_domain * \ | ||
| 5834 | sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ | ||
| 5835 | { \ | ||
| 5836 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ | ||
| 5837 | *sd = SD_##type##_INIT; \ | ||
| 5838 | SD_INIT_NAME(sd, type); \ | ||
| 5839 | sd->private = &tl->data; \ | ||
| 5840 | return sd; \ | ||
| 5841 | } | ||
| 5842 | |||
| 5843 | SD_INIT_FUNC(CPU) | ||
| 5844 | #ifdef CONFIG_SCHED_SMT | ||
| 5845 | SD_INIT_FUNC(SIBLING) | ||
| 5846 | #endif | ||
| 5847 | #ifdef CONFIG_SCHED_MC | ||
| 5848 | SD_INIT_FUNC(MC) | ||
| 5849 | #endif | ||
| 5850 | #ifdef CONFIG_SCHED_BOOK | ||
| 5851 | SD_INIT_FUNC(BOOK) | ||
| 5852 | #endif | ||
| 5853 | |||
| 5854 | static int default_relax_domain_level = -1; | 5875 | static int default_relax_domain_level = -1; |
| 5855 | int sched_domain_level_max; | 5876 | int sched_domain_level_max; |
| 5856 | 5877 | ||
| @@ -5938,97 +5959,154 @@ static void claim_allocations(int cpu, struct sched_domain *sd) | |||
| 5938 | *per_cpu_ptr(sdd->sgp, cpu) = NULL; | 5959 | *per_cpu_ptr(sdd->sgp, cpu) = NULL; |
| 5939 | } | 5960 | } |
| 5940 | 5961 | ||
| 5941 | #ifdef CONFIG_SCHED_SMT | ||
| 5942 | static const struct cpumask *cpu_smt_mask(int cpu) | ||
| 5943 | { | ||
| 5944 | return topology_thread_cpumask(cpu); | ||
| 5945 | } | ||
| 5946 | #endif | ||
| 5947 | |||
| 5948 | /* | ||
| 5949 | * Topology list, bottom-up. | ||
| 5950 | */ | ||
| 5951 | static struct sched_domain_topology_level default_topology[] = { | ||
| 5952 | #ifdef CONFIG_SCHED_SMT | ||
| 5953 | { sd_init_SIBLING, cpu_smt_mask, }, | ||
| 5954 | #endif | ||
| 5955 | #ifdef CONFIG_SCHED_MC | ||
| 5956 | { sd_init_MC, cpu_coregroup_mask, }, | ||
| 5957 | #endif | ||
| 5958 | #ifdef CONFIG_SCHED_BOOK | ||
| 5959 | { sd_init_BOOK, cpu_book_mask, }, | ||
| 5960 | #endif | ||
| 5961 | { sd_init_CPU, cpu_cpu_mask, }, | ||
| 5962 | { NULL, }, | ||
| 5963 | }; | ||
| 5964 | |||
| 5965 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | ||
| 5966 | |||
| 5967 | #define for_each_sd_topology(tl) \ | ||
| 5968 | for (tl = sched_domain_topology; tl->init; tl++) | ||
| 5969 | |||
| 5970 | #ifdef CONFIG_NUMA | 5962 | #ifdef CONFIG_NUMA |
| 5971 | |||
| 5972 | static int sched_domains_numa_levels; | 5963 | static int sched_domains_numa_levels; |
| 5973 | static int *sched_domains_numa_distance; | 5964 | static int *sched_domains_numa_distance; |
| 5974 | static struct cpumask ***sched_domains_numa_masks; | 5965 | static struct cpumask ***sched_domains_numa_masks; |
| 5975 | static int sched_domains_curr_level; | 5966 | static int sched_domains_curr_level; |
| 5967 | #endif | ||
| 5976 | 5968 | ||
| 5977 | static inline int sd_local_flags(int level) | 5969 | /* |
| 5978 | { | 5970 | * SD_flags allowed in topology descriptions. |
| 5979 | if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) | 5971 | * |
| 5980 | return 0; | 5972 | * SD_SHARE_CPUPOWER - describes SMT topologies |
| 5981 | 5973 | * SD_SHARE_PKG_RESOURCES - describes shared caches | |
| 5982 | return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; | 5974 | * SD_NUMA - describes NUMA topologies |
| 5983 | } | 5975 | * SD_SHARE_POWERDOMAIN - describes shared power domain |
| 5976 | * | ||
| 5977 | * Odd one out: | ||
| 5978 | * SD_ASYM_PACKING - describes SMT quirks | ||
| 5979 | */ | ||
| 5980 | #define TOPOLOGY_SD_FLAGS \ | ||
| 5981 | (SD_SHARE_CPUPOWER | \ | ||
| 5982 | SD_SHARE_PKG_RESOURCES | \ | ||
| 5983 | SD_NUMA | \ | ||
| 5984 | SD_ASYM_PACKING | \ | ||
| 5985 | SD_SHARE_POWERDOMAIN) | ||
| 5984 | 5986 | ||
| 5985 | static struct sched_domain * | 5987 | static struct sched_domain * |
| 5986 | sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | 5988 | sd_init(struct sched_domain_topology_level *tl, int cpu) |
| 5987 | { | 5989 | { |
| 5988 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); | 5990 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); |
| 5989 | int level = tl->numa_level; | 5991 | int sd_weight, sd_flags = 0; |
| 5990 | int sd_weight = cpumask_weight( | 5992 | |
| 5991 | sched_domains_numa_masks[level][cpu_to_node(cpu)]); | 5993 | #ifdef CONFIG_NUMA |
| 5994 | /* | ||
| 5995 | * Ugly hack to pass state to sd_numa_mask()... | ||
| 5996 | */ | ||
| 5997 | sched_domains_curr_level = tl->numa_level; | ||
| 5998 | #endif | ||
| 5999 | |||
| 6000 | sd_weight = cpumask_weight(tl->mask(cpu)); | ||
| 6001 | |||
| 6002 | if (tl->sd_flags) | ||
| 6003 | sd_flags = (*tl->sd_flags)(); | ||
| 6004 | if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, | ||
| 6005 | "wrong sd_flags in topology description\n")) | ||
| 6006 | sd_flags &= ~TOPOLOGY_SD_FLAGS; | ||
| 5992 | 6007 | ||
| 5993 | *sd = (struct sched_domain){ | 6008 | *sd = (struct sched_domain){ |
| 5994 | .min_interval = sd_weight, | 6009 | .min_interval = sd_weight, |
| 5995 | .max_interval = 2*sd_weight, | 6010 | .max_interval = 2*sd_weight, |
| 5996 | .busy_factor = 32, | 6011 | .busy_factor = 32, |
| 5997 | .imbalance_pct = 125, | 6012 | .imbalance_pct = 125, |
| 5998 | .cache_nice_tries = 2, | 6013 | |
| 5999 | .busy_idx = 3, | 6014 | .cache_nice_tries = 0, |
| 6000 | .idle_idx = 2, | 6015 | .busy_idx = 0, |
| 6016 | .idle_idx = 0, | ||
| 6001 | .newidle_idx = 0, | 6017 | .newidle_idx = 0, |
| 6002 | .wake_idx = 0, | 6018 | .wake_idx = 0, |
| 6003 | .forkexec_idx = 0, | 6019 | .forkexec_idx = 0, |
| 6004 | 6020 | ||
| 6005 | .flags = 1*SD_LOAD_BALANCE | 6021 | .flags = 1*SD_LOAD_BALANCE |
| 6006 | | 1*SD_BALANCE_NEWIDLE | 6022 | | 1*SD_BALANCE_NEWIDLE |
| 6007 | | 0*SD_BALANCE_EXEC | 6023 | | 1*SD_BALANCE_EXEC |
| 6008 | | 0*SD_BALANCE_FORK | 6024 | | 1*SD_BALANCE_FORK |
| 6009 | | 0*SD_BALANCE_WAKE | 6025 | | 0*SD_BALANCE_WAKE |
| 6010 | | 0*SD_WAKE_AFFINE | 6026 | | 1*SD_WAKE_AFFINE |
| 6011 | | 0*SD_SHARE_CPUPOWER | 6027 | | 0*SD_SHARE_CPUPOWER |
| 6012 | | 0*SD_SHARE_PKG_RESOURCES | 6028 | | 0*SD_SHARE_PKG_RESOURCES |
| 6013 | | 1*SD_SERIALIZE | 6029 | | 0*SD_SERIALIZE |
| 6014 | | 0*SD_PREFER_SIBLING | 6030 | | 0*SD_PREFER_SIBLING |
| 6015 | | 1*SD_NUMA | 6031 | | 0*SD_NUMA |
| 6016 | | sd_local_flags(level) | 6032 | | sd_flags |
| 6017 | , | 6033 | , |
| 6034 | |||
| 6018 | .last_balance = jiffies, | 6035 | .last_balance = jiffies, |
| 6019 | .balance_interval = sd_weight, | 6036 | .balance_interval = sd_weight, |
| 6037 | .smt_gain = 0, | ||
| 6038 | .max_newidle_lb_cost = 0, | ||
| 6039 | .next_decay_max_lb_cost = jiffies, | ||
| 6040 | #ifdef CONFIG_SCHED_DEBUG | ||
| 6041 | .name = tl->name, | ||
| 6042 | #endif | ||
| 6020 | }; | 6043 | }; |
| 6021 | SD_INIT_NAME(sd, NUMA); | ||
| 6022 | sd->private = &tl->data; | ||
| 6023 | 6044 | ||
| 6024 | /* | 6045 | /* |
| 6025 | * Ugly hack to pass state to sd_numa_mask()... | 6046 | * Convert topological properties into behaviour. |
| 6026 | */ | 6047 | */ |
| 6027 | sched_domains_curr_level = tl->numa_level; | 6048 | |
| 6049 | if (sd->flags & SD_SHARE_CPUPOWER) { | ||
| 6050 | sd->imbalance_pct = 110; | ||
| 6051 | sd->smt_gain = 1178; /* ~15% */ | ||
| 6052 | |||
| 6053 | } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
| 6054 | sd->imbalance_pct = 117; | ||
| 6055 | sd->cache_nice_tries = 1; | ||
| 6056 | sd->busy_idx = 2; | ||
| 6057 | |||
| 6058 | #ifdef CONFIG_NUMA | ||
| 6059 | } else if (sd->flags & SD_NUMA) { | ||
| 6060 | sd->cache_nice_tries = 2; | ||
| 6061 | sd->busy_idx = 3; | ||
| 6062 | sd->idle_idx = 2; | ||
| 6063 | |||
| 6064 | sd->flags |= SD_SERIALIZE; | ||
| 6065 | if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { | ||
| 6066 | sd->flags &= ~(SD_BALANCE_EXEC | | ||
| 6067 | SD_BALANCE_FORK | | ||
| 6068 | SD_WAKE_AFFINE); | ||
| 6069 | } | ||
| 6070 | |||
| 6071 | #endif | ||
| 6072 | } else { | ||
| 6073 | sd->flags |= SD_PREFER_SIBLING; | ||
| 6074 | sd->cache_nice_tries = 1; | ||
| 6075 | sd->busy_idx = 2; | ||
| 6076 | sd->idle_idx = 1; | ||
| 6077 | } | ||
| 6078 | |||
| 6079 | sd->private = &tl->data; | ||
| 6028 | 6080 | ||
| 6029 | return sd; | 6081 | return sd; |
| 6030 | } | 6082 | } |
| 6031 | 6083 | ||
| 6084 | /* | ||
| 6085 | * Topology list, bottom-up. | ||
| 6086 | */ | ||
| 6087 | static struct sched_domain_topology_level default_topology[] = { | ||
| 6088 | #ifdef CONFIG_SCHED_SMT | ||
| 6089 | { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, | ||
| 6090 | #endif | ||
| 6091 | #ifdef CONFIG_SCHED_MC | ||
| 6092 | { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, | ||
| 6093 | #endif | ||
| 6094 | { cpu_cpu_mask, SD_INIT_NAME(DIE) }, | ||
| 6095 | { NULL, }, | ||
| 6096 | }; | ||
| 6097 | |||
| 6098 | struct sched_domain_topology_level *sched_domain_topology = default_topology; | ||
| 6099 | |||
| 6100 | #define for_each_sd_topology(tl) \ | ||
| 6101 | for (tl = sched_domain_topology; tl->mask; tl++) | ||
| 6102 | |||
| 6103 | void set_sched_topology(struct sched_domain_topology_level *tl) | ||
| 6104 | { | ||
| 6105 | sched_domain_topology = tl; | ||
| 6106 | } | ||
| 6107 | |||
| 6108 | #ifdef CONFIG_NUMA | ||
| 6109 | |||
| 6032 | static const struct cpumask *sd_numa_mask(int cpu) | 6110 | static const struct cpumask *sd_numa_mask(int cpu) |
| 6033 | { | 6111 | { |
| 6034 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | 6112 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; |
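Note: the per-level sd_init_*() constructors are gone; a single sd_init() now derives all behaviour from the level's cpumask function plus an optional sd_flags() callback, and set_sched_topology() lets an architecture install its own level table before the domains are built. A hypothetical arch-side table mirroring the default one above (illustrative only):

    static struct sched_domain_topology_level example_topology[] = {
    #ifdef CONFIG_SCHED_SMT
        { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
    #endif
    #ifdef CONFIG_SCHED_MC
        { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
    #endif
        { cpu_cpu_mask, SD_INIT_NAME(DIE) },
        { NULL, },
    };

    static void __init example_set_topology(void)
    {
        /* Must run early, before build_sched_domains() walks the table. */
        set_sched_topology(example_topology);
    }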
| @@ -6172,7 +6250,10 @@ static void sched_init_numa(void) | |||
| 6172 | } | 6250 | } |
| 6173 | } | 6251 | } |
| 6174 | 6252 | ||
| 6175 | tl = kzalloc((ARRAY_SIZE(default_topology) + level) * | 6253 | /* Compute default topology size */ |
| 6254 | for (i = 0; sched_domain_topology[i].mask; i++); | ||
| 6255 | |||
| 6256 | tl = kzalloc((i + level + 1) * | ||
| 6176 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); | 6257 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); |
| 6177 | if (!tl) | 6258 | if (!tl) |
| 6178 | return; | 6259 | return; |
| @@ -6180,18 +6261,19 @@ static void sched_init_numa(void) | |||
| 6180 | /* | 6261 | /* |
| 6181 | * Copy the default topology bits.. | 6262 | * Copy the default topology bits.. |
| 6182 | */ | 6263 | */ |
| 6183 | for (i = 0; default_topology[i].init; i++) | 6264 | for (i = 0; sched_domain_topology[i].mask; i++) |
| 6184 | tl[i] = default_topology[i]; | 6265 | tl[i] = sched_domain_topology[i]; |
| 6185 | 6266 | ||
| 6186 | /* | 6267 | /* |
| 6187 | * .. and append 'j' levels of NUMA goodness. | 6268 | * .. and append 'j' levels of NUMA goodness. |
| 6188 | */ | 6269 | */ |
| 6189 | for (j = 0; j < level; i++, j++) { | 6270 | for (j = 0; j < level; i++, j++) { |
| 6190 | tl[i] = (struct sched_domain_topology_level){ | 6271 | tl[i] = (struct sched_domain_topology_level){ |
| 6191 | .init = sd_numa_init, | ||
| 6192 | .mask = sd_numa_mask, | 6272 | .mask = sd_numa_mask, |
| 6273 | .sd_flags = cpu_numa_flags, | ||
| 6193 | .flags = SDTL_OVERLAP, | 6274 | .flags = SDTL_OVERLAP, |
| 6194 | .numa_level = j, | 6275 | .numa_level = j, |
| 6276 | SD_INIT_NAME(NUMA) | ||
| 6195 | }; | 6277 | }; |
| 6196 | } | 6278 | } |
| 6197 | 6279 | ||
| @@ -6349,7 +6431,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
| 6349 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 6431 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
| 6350 | struct sched_domain *child, int cpu) | 6432 | struct sched_domain *child, int cpu) |
| 6351 | { | 6433 | { |
| 6352 | struct sched_domain *sd = tl->init(tl, cpu); | 6434 | struct sched_domain *sd = sd_init(tl, cpu); |
| 6353 | if (!sd) | 6435 | if (!sd) |
| 6354 | return child; | 6436 | return child; |
| 6355 | 6437 | ||
| @@ -6919,6 +7001,7 @@ void __init sched_init(void) | |||
| 6919 | if (cpu_isolated_map == NULL) | 7001 | if (cpu_isolated_map == NULL) |
| 6920 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 7002 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
| 6921 | idle_thread_set_boot_cpu(); | 7003 | idle_thread_set_boot_cpu(); |
| 7004 | set_cpu_rq_start_time(); | ||
| 6922 | #endif | 7005 | #endif |
| 6923 | init_sched_fair_class(); | 7006 | init_sched_fair_class(); |
| 6924 | 7007 | ||
| @@ -7586,7 +7669,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
| 7586 | static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) | 7669 | static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) |
| 7587 | { | 7670 | { |
| 7588 | struct task_group *tg = css_tg(css); | 7671 | struct task_group *tg = css_tg(css); |
| 7589 | struct task_group *parent = css_tg(css_parent(css)); | 7672 | struct task_group *parent = css_tg(css->parent); |
| 7590 | 7673 | ||
| 7591 | if (parent) | 7674 | if (parent) |
| 7592 | sched_online_group(tg, parent); | 7675 | sched_online_group(tg, parent); |
| @@ -7717,8 +7800,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
| 7717 | /* restart the period timer (if active) to handle new period expiry */ | 7800 | /* restart the period timer (if active) to handle new period expiry */ |
| 7718 | if (runtime_enabled && cfs_b->timer_active) { | 7801 | if (runtime_enabled && cfs_b->timer_active) { |
| 7719 | /* force a reprogram */ | 7802 | /* force a reprogram */ |
| 7720 | cfs_b->timer_active = 0; | 7803 | __start_cfs_bandwidth(cfs_b, true); |
| 7721 | __start_cfs_bandwidth(cfs_b); | ||
| 7722 | } | 7804 | } |
| 7723 | raw_spin_unlock_irq(&cfs_b->lock); | 7805 | raw_spin_unlock_irq(&cfs_b->lock); |
| 7724 | 7806 | ||
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index c143ee380e3a..9cf350c94ec4 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c | |||
| @@ -46,7 +46,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) | |||
| 46 | 46 | ||
| 47 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) | 47 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) |
| 48 | { | 48 | { |
| 49 | return css_ca(css_parent(&ca->css)); | 49 | return css_ca(ca->css.parent); |
| 50 | } | 50 | } |
| 51 | 51 | ||
| 52 | static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); | 52 | static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); |
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5b9bb42b2d47..bd95963dae80 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | 13 | ||
| 14 | #include <linux/gfp.h> | 14 | #include <linux/gfp.h> |
| 15 | #include <linux/kernel.h> | 15 | #include <linux/kernel.h> |
| 16 | #include <linux/slab.h> | ||
| 16 | #include "cpudeadline.h" | 17 | #include "cpudeadline.h" |
| 17 | 18 | ||
| 18 | static inline int parent(int i) | 19 | static inline int parent(int i) |
| @@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b) | |||
| 39 | { | 40 | { |
| 40 | int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; | 41 | int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; |
| 41 | 42 | ||
| 42 | swap(cp->elements[a], cp->elements[b]); | 43 | swap(cp->elements[a].cpu, cp->elements[b].cpu); |
| 43 | swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); | 44 | swap(cp->elements[a].dl , cp->elements[b].dl ); |
| 45 | |||
| 46 | swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); | ||
| 44 | } | 47 | } |
| 45 | 48 | ||
| 46 | static void cpudl_heapify(struct cpudl *cp, int idx) | 49 | static void cpudl_heapify(struct cpudl *cp, int idx) |
| @@ -140,7 +143,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) | |||
| 140 | WARN_ON(!cpu_present(cpu)); | 143 | WARN_ON(!cpu_present(cpu)); |
| 141 | 144 | ||
| 142 | raw_spin_lock_irqsave(&cp->lock, flags); | 145 | raw_spin_lock_irqsave(&cp->lock, flags); |
| 143 | old_idx = cp->cpu_to_idx[cpu]; | 146 | old_idx = cp->elements[cpu].idx; |
| 144 | if (!is_valid) { | 147 | if (!is_valid) { |
| 145 | /* remove item */ | 148 | /* remove item */ |
| 146 | if (old_idx == IDX_INVALID) { | 149 | if (old_idx == IDX_INVALID) { |
| @@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) | |||
| 155 | cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; | 158 | cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; |
| 156 | cp->elements[old_idx].cpu = new_cpu; | 159 | cp->elements[old_idx].cpu = new_cpu; |
| 157 | cp->size--; | 160 | cp->size--; |
| 158 | cp->cpu_to_idx[new_cpu] = old_idx; | 161 | cp->elements[new_cpu].idx = old_idx; |
| 159 | cp->cpu_to_idx[cpu] = IDX_INVALID; | 162 | cp->elements[cpu].idx = IDX_INVALID; |
| 160 | while (old_idx > 0 && dl_time_before( | 163 | while (old_idx > 0 && dl_time_before( |
| 161 | cp->elements[parent(old_idx)].dl, | 164 | cp->elements[parent(old_idx)].dl, |
| 162 | cp->elements[old_idx].dl)) { | 165 | cp->elements[old_idx].dl)) { |
| @@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) | |||
| 173 | cp->size++; | 176 | cp->size++; |
| 174 | cp->elements[cp->size - 1].dl = 0; | 177 | cp->elements[cp->size - 1].dl = 0; |
| 175 | cp->elements[cp->size - 1].cpu = cpu; | 178 | cp->elements[cp->size - 1].cpu = cpu; |
| 176 | cp->cpu_to_idx[cpu] = cp->size - 1; | 179 | cp->elements[cpu].idx = cp->size - 1; |
| 177 | cpudl_change_key(cp, cp->size - 1, dl); | 180 | cpudl_change_key(cp, cp->size - 1, dl); |
| 178 | cpumask_clear_cpu(cpu, cp->free_cpus); | 181 | cpumask_clear_cpu(cpu, cp->free_cpus); |
| 179 | } else { | 182 | } else { |
| @@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp) | |||
| 195 | memset(cp, 0, sizeof(*cp)); | 198 | memset(cp, 0, sizeof(*cp)); |
| 196 | raw_spin_lock_init(&cp->lock); | 199 | raw_spin_lock_init(&cp->lock); |
| 197 | cp->size = 0; | 200 | cp->size = 0; |
| 198 | for (i = 0; i < NR_CPUS; i++) | 201 | |
| 199 | cp->cpu_to_idx[i] = IDX_INVALID; | 202 | cp->elements = kcalloc(nr_cpu_ids, |
| 200 | if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) | 203 | sizeof(struct cpudl_item), |
| 204 | GFP_KERNEL); | ||
| 205 | if (!cp->elements) | ||
| 206 | return -ENOMEM; | ||
| 207 | |||
| 208 | if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { | ||
| 209 | kfree(cp->elements); | ||
| 201 | return -ENOMEM; | 210 | return -ENOMEM; |
| 211 | } | ||
| 212 | |||
| 213 | for_each_possible_cpu(i) | ||
| 214 | cp->elements[i].idx = IDX_INVALID; | ||
| 215 | |||
| 202 | cpumask_setall(cp->free_cpus); | 216 | cpumask_setall(cp->free_cpus); |
| 203 | 217 | ||
| 204 | return 0; | 218 | return 0; |
| @@ -210,7 +224,6 @@ int cpudl_init(struct cpudl *cp) | |||
| 210 | */ | 224 | */ |
| 211 | void cpudl_cleanup(struct cpudl *cp) | 225 | void cpudl_cleanup(struct cpudl *cp) |
| 212 | { | 226 | { |
| 213 | /* | 227 | free_cpumask_var(cp->free_cpus); |
| 214 | * nothing to do for the moment | 228 | kfree(cp->elements); |
| 215 | */ | ||
| 216 | } | 229 | } |
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index a202789a412c..538c9796ad4a 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h | |||
| @@ -5,17 +5,17 @@ | |||
| 5 | 5 | ||
| 6 | #define IDX_INVALID -1 | 6 | #define IDX_INVALID -1 |
| 7 | 7 | ||
| 8 | struct array_item { | 8 | struct cpudl_item { |
| 9 | u64 dl; | 9 | u64 dl; |
| 10 | int cpu; | 10 | int cpu; |
| 11 | int idx; | ||
| 11 | }; | 12 | }; |
| 12 | 13 | ||
| 13 | struct cpudl { | 14 | struct cpudl { |
| 14 | raw_spinlock_t lock; | 15 | raw_spinlock_t lock; |
| 15 | int size; | 16 | int size; |
| 16 | int cpu_to_idx[NR_CPUS]; | ||
| 17 | struct array_item elements[NR_CPUS]; | ||
| 18 | cpumask_var_t free_cpus; | 17 | cpumask_var_t free_cpus; |
| 18 | struct cpudl_item *elements; | ||
| 19 | }; | 19 | }; |
| 20 | 20 | ||
| 21 | 21 | ||
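Note: after this change cpudl keeps one dynamically allocated cpudl_item per possible CPU instead of two NR_CPUS-sized arrays: elements[0..size-1] is the max-heap ordered by deadline (the root carries the latest deadline that cpudl_find() compares against), and elements[cpu].idx maps a CPU back to its heap slot, IDX_INVALID when it is not queued. An illustrative consistency check of that back-pointer (not part of the patch):

    static bool cpudl_item_consistent(struct cpudl *cp, int cpu)
    {
        int idx = cp->elements[cpu].idx;

        if (idx == IDX_INVALID)
            return true;                           /* cpu not currently in the heap */
        return idx < cp->size && cp->elements[idx].cpu == cpu;
    }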
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 8b836b376d91..981fcd7dc394 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c | |||
| @@ -30,6 +30,7 @@ | |||
| 30 | #include <linux/gfp.h> | 30 | #include <linux/gfp.h> |
| 31 | #include <linux/sched.h> | 31 | #include <linux/sched.h> |
| 32 | #include <linux/sched/rt.h> | 32 | #include <linux/sched/rt.h> |
| 33 | #include <linux/slab.h> | ||
| 33 | #include "cpupri.h" | 34 | #include "cpupri.h" |
| 34 | 35 | ||
| 35 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ | 36 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ |
| @@ -70,8 +71,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, | |||
| 70 | int idx = 0; | 71 | int idx = 0; |
| 71 | int task_pri = convert_prio(p->prio); | 72 | int task_pri = convert_prio(p->prio); |
| 72 | 73 | ||
| 73 | if (task_pri >= MAX_RT_PRIO) | 74 | BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); |
| 74 | return 0; | ||
| 75 | 75 | ||
| 76 | for (idx = 0; idx < task_pri; idx++) { | 76 | for (idx = 0; idx < task_pri; idx++) { |
| 77 | struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; | 77 | struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; |
| @@ -165,7 +165,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
| 165 | * do a write memory barrier, and then update the count, to | 165 | * do a write memory barrier, and then update the count, to |
| 166 | * make sure the vector is visible when count is set. | 166 | * make sure the vector is visible when count is set. |
| 167 | */ | 167 | */ |
| 168 | smp_mb__before_atomic_inc(); | 168 | smp_mb__before_atomic(); |
| 169 | atomic_inc(&(vec)->count); | 169 | atomic_inc(&(vec)->count); |
| 170 | do_mb = 1; | 170 | do_mb = 1; |
| 171 | } | 171 | } |
| @@ -185,14 +185,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
| 185 | * the new priority vec. | 185 | * the new priority vec. |
| 186 | */ | 186 | */ |
| 187 | if (do_mb) | 187 | if (do_mb) |
| 188 | smp_mb__after_atomic_inc(); | 188 | smp_mb__after_atomic(); |
| 189 | 189 | ||
| 190 | /* | 190 | /* |
| 191 | * When removing from the vector, we decrement the counter first | 191 | * When removing from the vector, we decrement the counter first |
| 192 | * do a memory barrier and then clear the mask. | 192 | * do a memory barrier and then clear the mask. |
| 193 | */ | 193 | */ |
| 194 | atomic_dec(&(vec)->count); | 194 | atomic_dec(&(vec)->count); |
| 195 | smp_mb__after_atomic_inc(); | 195 | smp_mb__after_atomic(); |
| 196 | cpumask_clear_cpu(cpu, vec->mask); | 196 | cpumask_clear_cpu(cpu, vec->mask); |
| 197 | } | 197 | } |
| 198 | 198 | ||
| @@ -219,8 +219,13 @@ int cpupri_init(struct cpupri *cp) | |||
| 219 | goto cleanup; | 219 | goto cleanup; |
| 220 | } | 220 | } |
| 221 | 221 | ||
| 222 | cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL); | ||
| 223 | if (!cp->cpu_to_pri) | ||
| 224 | goto cleanup; | ||
| 225 | |||
| 222 | for_each_possible_cpu(i) | 226 | for_each_possible_cpu(i) |
| 223 | cp->cpu_to_pri[i] = CPUPRI_INVALID; | 227 | cp->cpu_to_pri[i] = CPUPRI_INVALID; |
| 228 | |||
| 224 | return 0; | 229 | return 0; |
| 225 | 230 | ||
| 226 | cleanup: | 231 | cleanup: |
| @@ -237,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp) | |||
| 237 | { | 242 | { |
| 238 | int i; | 243 | int i; |
| 239 | 244 | ||
| 245 | kfree(cp->cpu_to_pri); | ||
| 240 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) | 246 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) |
| 241 | free_cpumask_var(cp->pri_to_cpu[i].mask); | 247 | free_cpumask_var(cp->pri_to_cpu[i].mask); |
| 242 | } | 248 | } |
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index f6d756173491..6b033347fdfd 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h | |||
| @@ -17,7 +17,7 @@ struct cpupri_vec { | |||
| 17 | 17 | ||
| 18 | struct cpupri { | 18 | struct cpupri { |
| 19 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; | 19 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; |
| 20 | int cpu_to_pri[NR_CPUS]; | 20 | int *cpu_to_pri; |
| 21 | }; | 21 | }; |
| 22 | 22 | ||
| 23 | #ifdef CONFIG_SMP | 23 | #ifdef CONFIG_SMP |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a95097cb4591..72fdf06ef865 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
| @@ -332,50 +332,50 @@ out: | |||
| 332 | * softirq as those do not count in task exec_runtime any more. | 332 | * softirq as those do not count in task exec_runtime any more. |
| 333 | */ | 333 | */ |
| 334 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | 334 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, |
| 335 | struct rq *rq) | 335 | struct rq *rq, int ticks) |
| 336 | { | 336 | { |
| 337 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 337 | cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); |
| 338 | u64 cputime = (__force u64) cputime_one_jiffy; | ||
| 338 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 339 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
| 339 | 340 | ||
| 340 | if (steal_account_process_tick()) | 341 | if (steal_account_process_tick()) |
| 341 | return; | 342 | return; |
| 342 | 343 | ||
| 344 | cputime *= ticks; | ||
| 345 | scaled *= ticks; | ||
| 346 | |||
| 343 | if (irqtime_account_hi_update()) { | 347 | if (irqtime_account_hi_update()) { |
| 344 | cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; | 348 | cpustat[CPUTIME_IRQ] += cputime; |
| 345 | } else if (irqtime_account_si_update()) { | 349 | } else if (irqtime_account_si_update()) { |
| 346 | cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; | 350 | cpustat[CPUTIME_SOFTIRQ] += cputime; |
| 347 | } else if (this_cpu_ksoftirqd() == p) { | 351 | } else if (this_cpu_ksoftirqd() == p) { |
| 348 | /* | 352 | /* |
| 349 | * ksoftirqd time do not get accounted in cpu_softirq_time. | 353 | * ksoftirqd time do not get accounted in cpu_softirq_time. |
| 350 | * So, we have to handle it separately here. | 354 | * So, we have to handle it separately here. |
| 351 | * Also, p->stime needs to be updated for ksoftirqd. | 355 | * Also, p->stime needs to be updated for ksoftirqd. |
| 352 | */ | 356 | */ |
| 353 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | 357 | __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ); |
| 354 | CPUTIME_SOFTIRQ); | ||
| 355 | } else if (user_tick) { | 358 | } else if (user_tick) { |
| 356 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 359 | account_user_time(p, cputime, scaled); |
| 357 | } else if (p == rq->idle) { | 360 | } else if (p == rq->idle) { |
| 358 | account_idle_time(cputime_one_jiffy); | 361 | account_idle_time(cputime); |
| 359 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | 362 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ |
| 360 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | 363 | account_guest_time(p, cputime, scaled); |
| 361 | } else { | 364 | } else { |
| 362 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | 365 | __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM); |
| 363 | CPUTIME_SYSTEM); | ||
| 364 | } | 366 | } |
| 365 | } | 367 | } |
| 366 | 368 | ||
| 367 | static void irqtime_account_idle_ticks(int ticks) | 369 | static void irqtime_account_idle_ticks(int ticks) |
| 368 | { | 370 | { |
| 369 | int i; | ||
| 370 | struct rq *rq = this_rq(); | 371 | struct rq *rq = this_rq(); |
| 371 | 372 | ||
| 372 | for (i = 0; i < ticks; i++) | 373 | irqtime_account_process_tick(current, 0, rq, ticks); |
| 373 | irqtime_account_process_tick(current, 0, rq); | ||
| 374 | } | 374 | } |
| 375 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 375 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
| 376 | static inline void irqtime_account_idle_ticks(int ticks) {} | 376 | static inline void irqtime_account_idle_ticks(int ticks) {} |
| 377 | static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, | 377 | static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, |
| 378 | struct rq *rq) {} | 378 | struct rq *rq, int nr_ticks) {} |
| 379 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | 379 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
| 380 | 380 | ||
| 381 | /* | 381 | /* |
| @@ -464,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
| 464 | return; | 464 | return; |
| 465 | 465 | ||
| 466 | if (sched_clock_irqtime) { | 466 | if (sched_clock_irqtime) { |
| 467 | irqtime_account_process_tick(p, user_tick, rq); | 467 | irqtime_account_process_tick(p, user_tick, rq, 1); |
| 468 | return; | 468 | return; |
| 469 | } | 469 | } |
| 470 | 470 | ||
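Note: the idle path used to call irqtime_account_process_tick() once per pending tick; it now passes the tick count and the function scales cputime and its scaled counterpart once, which is equivalent because every branch only adds the amount to a running total. Written out (illustrative):

    /*
     * old: for (i = 0; i < ticks; i++)
     *          account(cputime_one_jiffy, one_jiffy_scaled);
     * new: account(ticks * cputime_one_jiffy, ticks * one_jiffy_scaled);
     */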
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 27ef40925525..2b8cbf09d1a4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
| @@ -348,12 +348,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, | |||
| 348 | * entity. | 348 | * entity. |
| 349 | */ | 349 | */ |
| 350 | if (dl_time_before(dl_se->deadline, rq_clock(rq))) { | 350 | if (dl_time_before(dl_se->deadline, rq_clock(rq))) { |
| 351 | static bool lag_once = false; | 351 | printk_deferred_once("sched: DL replenish lagged to much\n"); |
| 352 | |||
| 353 | if (!lag_once) { | ||
| 354 | lag_once = true; | ||
| 355 | printk_sched("sched: DL replenish lagged to much\n"); | ||
| 356 | } | ||
| 357 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; | 352 | dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; |
| 358 | dl_se->runtime = pi_se->dl_runtime; | 353 | dl_se->runtime = pi_se->dl_runtime; |
| 359 | } | 354 | } |
| @@ -513,14 +508,22 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 513 | struct sched_dl_entity, | 508 | struct sched_dl_entity, |
| 514 | dl_timer); | 509 | dl_timer); |
| 515 | struct task_struct *p = dl_task_of(dl_se); | 510 | struct task_struct *p = dl_task_of(dl_se); |
| 516 | struct rq *rq = task_rq(p); | 511 | struct rq *rq; |
| 512 | again: | ||
| 513 | rq = task_rq(p); | ||
| 517 | raw_spin_lock(&rq->lock); | 514 | raw_spin_lock(&rq->lock); |
| 518 | 515 | ||
| 516 | if (rq != task_rq(p)) { | ||
| 517 | /* Task was moved, retrying. */ | ||
| 518 | raw_spin_unlock(&rq->lock); | ||
| 519 | goto again; | ||
| 520 | } | ||
| 521 | |||
| 519 | /* | 522 | /* |
| 520 | * We need to take care of a possible races here. In fact, the | 523 | * We need to take care of a possible races here. In fact, the |
| 521 | * task might have changed its scheduling policy to something | 524 | * task might have changed its scheduling policy to something |
| 522 | * different from SCHED_DEADLINE or changed its reservation | 525 | * different from SCHED_DEADLINE or changed its reservation |
| 523 | * parameters (through sched_setscheduler()). | 526 | * parameters (through sched_setattr()). |
| 524 | */ | 527 | */ |
| 525 | if (!dl_task(p) || dl_se->dl_new) | 528 | if (!dl_task(p) || dl_se->dl_new) |
| 526 | goto unlock; | 529 | goto unlock; |
| @@ -528,6 +531,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 528 | sched_clock_tick(); | 531 | sched_clock_tick(); |
| 529 | update_rq_clock(rq); | 532 | update_rq_clock(rq); |
| 530 | dl_se->dl_throttled = 0; | 533 | dl_se->dl_throttled = 0; |
| 534 | dl_se->dl_yielded = 0; | ||
| 531 | if (p->on_rq) { | 535 | if (p->on_rq) { |
| 532 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); | 536 | enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); |
| 533 | if (task_has_dl_policy(rq->curr)) | 537 | if (task_has_dl_policy(rq->curr)) |
| @@ -740,7 +744,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | |||
| 740 | 744 | ||
| 741 | WARN_ON(!dl_prio(prio)); | 745 | WARN_ON(!dl_prio(prio)); |
| 742 | dl_rq->dl_nr_running++; | 746 | dl_rq->dl_nr_running++; |
| 743 | inc_nr_running(rq_of_dl_rq(dl_rq)); | 747 | add_nr_running(rq_of_dl_rq(dl_rq), 1); |
| 744 | 748 | ||
| 745 | inc_dl_deadline(dl_rq, deadline); | 749 | inc_dl_deadline(dl_rq, deadline); |
| 746 | inc_dl_migration(dl_se, dl_rq); | 750 | inc_dl_migration(dl_se, dl_rq); |
| @@ -754,7 +758,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) | |||
| 754 | WARN_ON(!dl_prio(prio)); | 758 | WARN_ON(!dl_prio(prio)); |
| 755 | WARN_ON(!dl_rq->dl_nr_running); | 759 | WARN_ON(!dl_rq->dl_nr_running); |
| 756 | dl_rq->dl_nr_running--; | 760 | dl_rq->dl_nr_running--; |
| 757 | dec_nr_running(rq_of_dl_rq(dl_rq)); | 761 | sub_nr_running(rq_of_dl_rq(dl_rq), 1); |
| 758 | 762 | ||
| 759 | dec_dl_deadline(dl_rq, dl_se->deadline); | 763 | dec_dl_deadline(dl_rq, dl_se->deadline); |
| 760 | dec_dl_migration(dl_se, dl_rq); | 764 | dec_dl_migration(dl_se, dl_rq); |
| @@ -893,10 +897,10 @@ static void yield_task_dl(struct rq *rq) | |||
| 893 | * We make the task go to sleep until its current deadline by | 897 | * We make the task go to sleep until its current deadline by |
| 894 | * forcing its runtime to zero. This way, update_curr_dl() stops | 898 | * forcing its runtime to zero. This way, update_curr_dl() stops |
| 895 | * it and the bandwidth timer will wake it up and will give it | 899 | * it and the bandwidth timer will wake it up and will give it |
| 896 | * new scheduling parameters (thanks to dl_new=1). | 900 | * new scheduling parameters (thanks to dl_yielded=1). |
| 897 | */ | 901 | */ |
| 898 | if (p->dl.runtime > 0) { | 902 | if (p->dl.runtime > 0) { |
| 899 | rq->curr->dl.dl_new = 1; | 903 | rq->curr->dl.dl_yielded = 1; |
| 900 | p->dl.runtime = 0; | 904 | p->dl.runtime = 0; |
| 901 | } | 905 | } |
| 902 | update_curr_dl(rq); | 906 | update_curr_dl(rq); |
| @@ -1021,8 +1025,17 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) | |||
| 1021 | 1025 | ||
| 1022 | dl_rq = &rq->dl; | 1026 | dl_rq = &rq->dl; |
| 1023 | 1027 | ||
| 1024 | if (need_pull_dl_task(rq, prev)) | 1028 | if (need_pull_dl_task(rq, prev)) { |
| 1025 | pull_dl_task(rq); | 1029 | pull_dl_task(rq); |
| 1030 | /* | ||
| 1031 | * pull_dl_task() can drop (and re-acquire) rq->lock; this | ||
| 1032 | * means a stop task can slip in, in which case we need to | ||
| 1033 | * re-start task selection. | ||
| 1034 | */ | ||
| 1035 | if (rq->stop && rq->stop->on_rq) | ||
| 1036 | return RETRY_TASK; | ||
| 1037 | } | ||
| 1038 | |||
| 1026 | /* | 1039 | /* |
| 1027 | * When prev is DL, we may throttle it in put_prev_task(). | 1040 | * When prev is DL, we may throttle it in put_prev_task(). |
| 1028 | * So, we update time before we check for dl_nr_running. | 1041 | * So, we update time before we check for dl_nr_running. |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7e9bd0b1fa9e..9855e87d671a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env, | |||
| 1095 | env->best_cpu = env->dst_cpu; | 1095 | env->best_cpu = env->dst_cpu; |
| 1096 | } | 1096 | } |
| 1097 | 1097 | ||
| 1098 | static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, | ||
| 1099 | long src_load, long dst_load, | ||
| 1100 | struct task_numa_env *env) | ||
| 1101 | { | ||
| 1102 | long imb, old_imb; | ||
| 1103 | |||
| 1104 | /* We care about the slope of the imbalance, not the direction. */ | ||
| 1105 | if (dst_load < src_load) | ||
| 1106 | swap(dst_load, src_load); | ||
| 1107 | |||
| 1108 | /* Is the difference below the threshold? */ | ||
| 1109 | imb = dst_load * 100 - src_load * env->imbalance_pct; | ||
| 1110 | if (imb <= 0) | ||
| 1111 | return false; | ||
| 1112 | |||
| 1113 | /* | ||
| 1114 | * The imbalance is above the allowed threshold. | ||
| 1115 | * Compare it with the old imbalance. | ||
| 1116 | */ | ||
| 1117 | if (orig_dst_load < orig_src_load) | ||
| 1118 | swap(orig_dst_load, orig_src_load); | ||
| 1119 | |||
| 1120 | old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; | ||
| 1121 | |||
| 1122 | /* Would this change make things worse? */ | ||
| 1123 | return (imb > old_imb); | ||
| 1124 | } | ||
| 1125 | |||
| 1098 | /* | 1126 | /* |
| 1099 | * This checks if the overall compute and NUMA accesses of the system would | 1127 | * This checks if the overall compute and NUMA accesses of the system would |
| 1100 | * be improved if the source task was migrated to the target dst_cpu taking | 1128 | * be improved if the source task was migrated to the target dst_cpu taking |
| @@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env, | |||
| 1107 | struct rq *src_rq = cpu_rq(env->src_cpu); | 1135 | struct rq *src_rq = cpu_rq(env->src_cpu); |
| 1108 | struct rq *dst_rq = cpu_rq(env->dst_cpu); | 1136 | struct rq *dst_rq = cpu_rq(env->dst_cpu); |
| 1109 | struct task_struct *cur; | 1137 | struct task_struct *cur; |
| 1110 | long dst_load, src_load; | 1138 | long orig_src_load, src_load; |
| 1139 | long orig_dst_load, dst_load; | ||
| 1111 | long load; | 1140 | long load; |
| 1112 | long imp = (groupimp > 0) ? groupimp : taskimp; | 1141 | long imp = (groupimp > 0) ? groupimp : taskimp; |
| 1113 | 1142 | ||
| @@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env, | |||
| 1181 | * In the overloaded case, try and keep the load balanced. | 1210 | * In the overloaded case, try and keep the load balanced. |
| 1182 | */ | 1211 | */ |
| 1183 | balance: | 1212 | balance: |
| 1184 | dst_load = env->dst_stats.load; | 1213 | orig_dst_load = env->dst_stats.load; |
| 1185 | src_load = env->src_stats.load; | 1214 | orig_src_load = env->src_stats.load; |
| 1186 | 1215 | ||
| 1187 | /* XXX missing power terms */ | 1216 | /* XXX missing power terms */ |
| 1188 | load = task_h_load(env->p); | 1217 | load = task_h_load(env->p); |
| 1189 | dst_load += load; | 1218 | dst_load = orig_dst_load + load; |
| 1190 | src_load -= load; | 1219 | src_load = orig_src_load - load; |
| 1191 | 1220 | ||
| 1192 | if (cur) { | 1221 | if (cur) { |
| 1193 | load = task_h_load(cur); | 1222 | load = task_h_load(cur); |
| @@ -1195,11 +1224,8 @@ balance: | |||
| 1195 | src_load += load; | 1224 | src_load += load; |
| 1196 | } | 1225 | } |
| 1197 | 1226 | ||
| 1198 | /* make src_load the smaller */ | 1227 | if (load_too_imbalanced(orig_src_load, orig_dst_load, |
| 1199 | if (dst_load < src_load) | 1228 | src_load, dst_load, env)) |
| 1200 | swap(dst_load, src_load); | ||
| 1201 | |||
| 1202 | if (src_load * env->imbalance_pct < dst_load * 100) | ||
| 1203 | goto unlock; | 1229 | goto unlock; |
| 1204 | 1230 | ||
| 1205 | assign: | 1231 | assign: |
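The new load_too_imbalanced() helper, and its use above in place of the old open-coded check, rejects a NUMA task swap only when the resulting imbalance both exceeds imbalance_pct and is worse than the imbalance that already existed. A standalone sketch of the same arithmetic with made-up load figures:

#include <stdio.h>

/* Swap helper for longs. */
static void swap_long(long *a, long *b) { long t = *a; *a = *b; *b = t; }

/* Mirrors the comparison in the hunks above: returns 1 if the new split is
 * both over the threshold and worse than the old one. */
static int too_imbalanced(long orig_src, long orig_dst,
                          long src, long dst, long imbalance_pct)
{
        long imb, old_imb;

        if (dst < src)
                swap_long(&dst, &src);
        imb = dst * 100 - src * imbalance_pct;
        if (imb <= 0)
                return 0;

        if (orig_dst < orig_src)
                swap_long(&orig_dst, &orig_src);
        old_imb = orig_dst * 100 - orig_src * imbalance_pct;

        return imb > old_imb;
}

int main(void)
{
        /* 1000 vs 1400 before, 1100 vs 1300 after, with a 125% threshold:
         * the move reduces the imbalance, so it is allowed (prints 0). */
        printf("%d\n", too_imbalanced(1000, 1400, 1100, 1300, 125));
        return 0;
}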
| @@ -1301,7 +1327,16 @@ static int task_numa_migrate(struct task_struct *p) | |||
| 1301 | if (env.best_cpu == -1) | 1327 | if (env.best_cpu == -1) |
| 1302 | return -EAGAIN; | 1328 | return -EAGAIN; |
| 1303 | 1329 | ||
| 1304 | sched_setnuma(p, env.dst_nid); | 1330 | /* |
| 1331 | * If the task is part of a workload that spans multiple NUMA nodes, | ||
| 1332 | * and is migrating into one of the workload's active nodes, remember | ||
| 1333 | * this node as the task's preferred numa node, so the workload can | ||
| 1334 | * settle down. | ||
| 1335 | * A task that migrated to a second choice node will be better off | ||
| 1336 | * trying for a better one later. Do not set the preferred node here. | ||
| 1337 | */ | ||
| 1338 | if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) | ||
| 1339 | sched_setnuma(p, env.dst_nid); | ||
| 1305 | 1340 | ||
| 1306 | /* | 1341 | /* |
| 1307 | * Reset the scan period if the task is being rescheduled on an | 1342 | * Reset the scan period if the task is being rescheduled on an |
| @@ -1326,12 +1361,15 @@ static int task_numa_migrate(struct task_struct *p) | |||
| 1326 | /* Attempt to migrate a task to a CPU on the preferred node. */ | 1361 | /* Attempt to migrate a task to a CPU on the preferred node. */ |
| 1327 | static void numa_migrate_preferred(struct task_struct *p) | 1362 | static void numa_migrate_preferred(struct task_struct *p) |
| 1328 | { | 1363 | { |
| 1364 | unsigned long interval = HZ; | ||
| 1365 | |||
| 1329 | /* This task has no NUMA fault statistics yet */ | 1366 | /* This task has no NUMA fault statistics yet */ |
| 1330 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) | 1367 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) |
| 1331 | return; | 1368 | return; |
| 1332 | 1369 | ||
| 1333 | /* Periodically retry migrating the task to the preferred node */ | 1370 | /* Periodically retry migrating the task to the preferred node */ |
| 1334 | p->numa_migrate_retry = jiffies + HZ; | 1371 | interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); |
| 1372 | p->numa_migrate_retry = jiffies + interval; | ||
| 1335 | 1373 | ||
| 1336 | /* Success if task is already running on preferred CPU */ | 1374 | /* Success if task is already running on preferred CPU */ |
| 1337 | if (task_node(p) == p->numa_preferred_nid) | 1375 | if (task_node(p) == p->numa_preferred_nid) |
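With the change above, the migrate-retry interval becomes the smaller of one second and a sixteenth of the NUMA scan period, so tasks with short scan periods retry sooner. A rough userspace calculation of that clamp, assuming HZ=1000 so milliseconds map 1:1 to jiffies:

#include <stdio.h>

#define HZ 1000 /* assumed tick rate for this sketch */

/* msecs_to_jiffies() degenerates to identity when HZ == 1000. */
static unsigned long msecs_to_jiffies(unsigned long ms) { return ms; }

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned long scan_period_ms = 4000;    /* hypothetical numa_scan_period */
        unsigned long interval = HZ;

        /* Same clamp as the hunk: retry no later than HZ, but sooner when the
         * scan period is short. */
        interval = min_ul(interval, msecs_to_jiffies(scan_period_ms) / 16);
        printf("retry in %lu jiffies\n", interval); /* prints 250 */
        return 0;
}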
| @@ -1497,7 +1535,7 @@ static void task_numa_placement(struct task_struct *p) | |||
| 1497 | /* If the task is part of a group prevent parallel updates to group stats */ | 1535 | /* If the task is part of a group prevent parallel updates to group stats */ |
| 1498 | if (p->numa_group) { | 1536 | if (p->numa_group) { |
| 1499 | group_lock = &p->numa_group->lock; | 1537 | group_lock = &p->numa_group->lock; |
| 1500 | spin_lock(group_lock); | 1538 | spin_lock_irq(group_lock); |
| 1501 | } | 1539 | } |
| 1502 | 1540 | ||
| 1503 | /* Find the node with the highest number of faults */ | 1541 | /* Find the node with the highest number of faults */ |
| @@ -1572,7 +1610,7 @@ static void task_numa_placement(struct task_struct *p) | |||
| 1572 | } | 1610 | } |
| 1573 | } | 1611 | } |
| 1574 | 1612 | ||
| 1575 | spin_unlock(group_lock); | 1613 | spin_unlock_irq(group_lock); |
| 1576 | } | 1614 | } |
| 1577 | 1615 | ||
| 1578 | /* Preferred node as the node with the most faults */ | 1616 | /* Preferred node as the node with the most faults */ |
| @@ -1677,7 +1715,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
| 1677 | if (!join) | 1715 | if (!join) |
| 1678 | return; | 1716 | return; |
| 1679 | 1717 | ||
| 1680 | double_lock(&my_grp->lock, &grp->lock); | 1718 | BUG_ON(irqs_disabled()); |
| 1719 | double_lock_irq(&my_grp->lock, &grp->lock); | ||
| 1681 | 1720 | ||
| 1682 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { | 1721 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { |
| 1683 | my_grp->faults[i] -= p->numa_faults_memory[i]; | 1722 | my_grp->faults[i] -= p->numa_faults_memory[i]; |
| @@ -1691,7 +1730,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
| 1691 | grp->nr_tasks++; | 1730 | grp->nr_tasks++; |
| 1692 | 1731 | ||
| 1693 | spin_unlock(&my_grp->lock); | 1732 | spin_unlock(&my_grp->lock); |
| 1694 | spin_unlock(&grp->lock); | 1733 | spin_unlock_irq(&grp->lock); |
| 1695 | 1734 | ||
| 1696 | rcu_assign_pointer(p->numa_group, grp); | 1735 | rcu_assign_pointer(p->numa_group, grp); |
| 1697 | 1736 | ||
| @@ -1706,18 +1745,19 @@ no_join: | |||
| 1706 | void task_numa_free(struct task_struct *p) | 1745 | void task_numa_free(struct task_struct *p) |
| 1707 | { | 1746 | { |
| 1708 | struct numa_group *grp = p->numa_group; | 1747 | struct numa_group *grp = p->numa_group; |
| 1709 | int i; | ||
| 1710 | void *numa_faults = p->numa_faults_memory; | 1748 | void *numa_faults = p->numa_faults_memory; |
| 1749 | unsigned long flags; | ||
| 1750 | int i; | ||
| 1711 | 1751 | ||
| 1712 | if (grp) { | 1752 | if (grp) { |
| 1713 | spin_lock(&grp->lock); | 1753 | spin_lock_irqsave(&grp->lock, flags); |
| 1714 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) | 1754 | for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) |
| 1715 | grp->faults[i] -= p->numa_faults_memory[i]; | 1755 | grp->faults[i] -= p->numa_faults_memory[i]; |
| 1716 | grp->total_faults -= p->total_numa_faults; | 1756 | grp->total_faults -= p->total_numa_faults; |
| 1717 | 1757 | ||
| 1718 | list_del(&p->numa_entry); | 1758 | list_del(&p->numa_entry); |
| 1719 | grp->nr_tasks--; | 1759 | grp->nr_tasks--; |
| 1720 | spin_unlock(&grp->lock); | 1760 | spin_unlock_irqrestore(&grp->lock, flags); |
| 1721 | rcu_assign_pointer(p->numa_group, NULL); | 1761 | rcu_assign_pointer(p->numa_group, NULL); |
| 1722 | put_numa_group(grp); | 1762 | put_numa_group(grp); |
| 1723 | } | 1763 | } |
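task_numa_free() now uses the irqsave/irqrestore locking variant so it is safe regardless of whether the caller already disabled interrupts. A toy illustration of the save/restore convention, with fake IRQ primitives standing in for the real ones:

#include <stdio.h>

/* Toy stand-ins: a real kernel uses raw IRQ flags, these are fakes. */
static int irqs_enabled = 1;

static unsigned long fake_irq_save(void)
{
        unsigned long flags = irqs_enabled;
        irqs_enabled = 0;               /* "disable interrupts" */
        return flags;
}

static void fake_irq_restore(unsigned long flags)
{
        irqs_enabled = (int)flags;      /* restore whatever the caller had */
}

int main(void)
{
        unsigned long flags;

        /* Pattern used by the hunk: save the IRQ state, do the work under the
         * lock, then restore the saved state, so the function works whether
         * or not the caller already had interrupts off. */
        flags = fake_irq_save();
        /* ... grp->faults[] bookkeeping would go here ... */
        fake_irq_restore(flags);

        printf("irqs_enabled=%d\n", irqs_enabled); /* back to 1 */
        return 0;
}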
| @@ -1737,6 +1777,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
| 1737 | struct task_struct *p = current; | 1777 | struct task_struct *p = current; |
| 1738 | bool migrated = flags & TNF_MIGRATED; | 1778 | bool migrated = flags & TNF_MIGRATED; |
| 1739 | int cpu_node = task_node(current); | 1779 | int cpu_node = task_node(current); |
| 1780 | int local = !!(flags & TNF_FAULT_LOCAL); | ||
| 1740 | int priv; | 1781 | int priv; |
| 1741 | 1782 | ||
| 1742 | if (!numabalancing_enabled) | 1783 | if (!numabalancing_enabled) |
| @@ -1785,6 +1826,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
| 1785 | task_numa_group(p, last_cpupid, flags, &priv); | 1826 | task_numa_group(p, last_cpupid, flags, &priv); |
| 1786 | } | 1827 | } |
| 1787 | 1828 | ||
| 1829 | /* | ||
| 1830 | * If a workload spans multiple NUMA nodes, a shared fault that | ||
| 1831 | * occurs wholly within the set of nodes that the workload is | ||
| 1832 | * actively using should be counted as local. This allows the | ||
| 1833 | * scan rate to slow down when a workload has settled down. | ||
| 1834 | */ | ||
| 1835 | if (!priv && !local && p->numa_group && | ||
| 1836 | node_isset(cpu_node, p->numa_group->active_nodes) && | ||
| 1837 | node_isset(mem_node, p->numa_group->active_nodes)) | ||
| 1838 | local = 1; | ||
| 1839 | |||
| 1788 | task_numa_placement(p); | 1840 | task_numa_placement(p); |
| 1789 | 1841 | ||
| 1790 | /* | 1842 | /* |
| @@ -1799,7 +1851,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
| 1799 | 1851 | ||
| 1800 | p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; | 1852 | p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; |
| 1801 | p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; | 1853 | p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; |
| 1802 | p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; | 1854 | p->numa_faults_locality[local] += pages; |
| 1803 | } | 1855 | } |
| 1804 | 1856 | ||
| 1805 | static void reset_ptenuma_scan(struct task_struct *p) | 1857 | static void reset_ptenuma_scan(struct task_struct *p) |
| @@ -3128,7 +3180,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
| 3128 | */ | 3180 | */ |
| 3129 | if (!cfs_b->timer_active) { | 3181 | if (!cfs_b->timer_active) { |
| 3130 | __refill_cfs_bandwidth_runtime(cfs_b); | 3182 | __refill_cfs_bandwidth_runtime(cfs_b); |
| 3131 | __start_cfs_bandwidth(cfs_b); | 3183 | __start_cfs_bandwidth(cfs_b, false); |
| 3132 | } | 3184 | } |
| 3133 | 3185 | ||
| 3134 | if (cfs_b->runtime > 0) { | 3186 | if (cfs_b->runtime > 0) { |
| @@ -3300,14 +3352,14 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
| 3300 | } | 3352 | } |
| 3301 | 3353 | ||
| 3302 | if (!se) | 3354 | if (!se) |
| 3303 | rq->nr_running -= task_delta; | 3355 | sub_nr_running(rq, task_delta); |
| 3304 | 3356 | ||
| 3305 | cfs_rq->throttled = 1; | 3357 | cfs_rq->throttled = 1; |
| 3306 | cfs_rq->throttled_clock = rq_clock(rq); | 3358 | cfs_rq->throttled_clock = rq_clock(rq); |
| 3307 | raw_spin_lock(&cfs_b->lock); | 3359 | raw_spin_lock(&cfs_b->lock); |
| 3308 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | 3360 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); |
| 3309 | if (!cfs_b->timer_active) | 3361 | if (!cfs_b->timer_active) |
| 3310 | __start_cfs_bandwidth(cfs_b); | 3362 | __start_cfs_bandwidth(cfs_b, false); |
| 3311 | raw_spin_unlock(&cfs_b->lock); | 3363 | raw_spin_unlock(&cfs_b->lock); |
| 3312 | } | 3364 | } |
| 3313 | 3365 | ||
| @@ -3351,7 +3403,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | |||
| 3351 | } | 3403 | } |
| 3352 | 3404 | ||
| 3353 | if (!se) | 3405 | if (!se) |
| 3354 | rq->nr_running += task_delta; | 3406 | add_nr_running(rq, task_delta); |
| 3355 | 3407 | ||
| 3356 | /* determine whether we need to wake up potentially idle cpu */ | 3408 | /* determine whether we need to wake up potentially idle cpu */ |
| 3357 | if (rq->curr == rq->idle && rq->cfs.nr_running) | 3409 | if (rq->curr == rq->idle && rq->cfs.nr_running) |
| @@ -3689,7 +3741,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
| 3689 | } | 3741 | } |
| 3690 | 3742 | ||
| 3691 | /* requires cfs_b->lock, may release to reprogram timer */ | 3743 | /* requires cfs_b->lock, may release to reprogram timer */ |
| 3692 | void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | 3744 | void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) |
| 3693 | { | 3745 | { |
| 3694 | /* | 3746 | /* |
| 3695 | * The timer may be active because we're trying to set a new bandwidth | 3747 | * The timer may be active because we're trying to set a new bandwidth |
| @@ -3704,7 +3756,7 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | |||
| 3704 | cpu_relax(); | 3756 | cpu_relax(); |
| 3705 | raw_spin_lock(&cfs_b->lock); | 3757 | raw_spin_lock(&cfs_b->lock); |
| 3706 | /* if someone else restarted the timer then we're done */ | 3758 | /* if someone else restarted the timer then we're done */ |
| 3707 | if (cfs_b->timer_active) | 3759 | if (!force && cfs_b->timer_active) |
| 3708 | return; | 3760 | return; |
| 3709 | } | 3761 | } |
| 3710 | 3762 | ||
| @@ -3883,7 +3935,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 3883 | 3935 | ||
| 3884 | if (!se) { | 3936 | if (!se) { |
| 3885 | update_rq_runnable_avg(rq, rq->nr_running); | 3937 | update_rq_runnable_avg(rq, rq->nr_running); |
| 3886 | inc_nr_running(rq); | 3938 | add_nr_running(rq, 1); |
| 3887 | } | 3939 | } |
| 3888 | hrtick_update(rq); | 3940 | hrtick_update(rq); |
| 3889 | } | 3941 | } |
| @@ -3943,7 +3995,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 3943 | } | 3995 | } |
| 3944 | 3996 | ||
| 3945 | if (!se) { | 3997 | if (!se) { |
| 3946 | dec_nr_running(rq); | 3998 | sub_nr_running(rq, 1); |
| 3947 | update_rq_runnable_avg(rq, 1); | 3999 | update_rq_runnable_avg(rq, 1); |
| 3948 | } | 4000 | } |
| 3949 | hrtick_update(rq); | 4001 | hrtick_update(rq); |
| @@ -4014,7 +4066,7 @@ static void record_wakee(struct task_struct *p) | |||
| 4014 | * about the loss. | 4066 | * about the loss. |
| 4015 | */ | 4067 | */ |
| 4016 | if (jiffies > current->wakee_flip_decay_ts + HZ) { | 4068 | if (jiffies > current->wakee_flip_decay_ts + HZ) { |
| 4017 | current->wakee_flips = 0; | 4069 | current->wakee_flips >>= 1; |
| 4018 | current->wakee_flip_decay_ts = jiffies; | 4070 | current->wakee_flip_decay_ts = jiffies; |
| 4019 | } | 4071 | } |
| 4020 | 4072 | ||
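Rather than zeroing wakee_flips once per second, the change above halves it, so the heuristic keeps a decaying memory of how often the task switched wakeup partners. A toy illustration of the decay:

#include <stdio.h>

int main(void)
{
        unsigned int flips = 64;
        int i;

        /* Halving every decay period keeps a fading history of past flips,
         * whereas the old behaviour dropped straight to zero. */
        for (i = 0; i < 4; i++) {
                flips >>= 1;
                printf("after %d decay period(s): %u\n", i + 1, flips);
        }
        return 0;
}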
| @@ -4448,10 +4500,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 4448 | sd = tmp; | 4500 | sd = tmp; |
| 4449 | } | 4501 | } |
| 4450 | 4502 | ||
| 4451 | if (affine_sd) { | 4503 | if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) |
| 4452 | if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) | 4504 | prev_cpu = cpu; |
| 4453 | prev_cpu = cpu; | ||
| 4454 | 4505 | ||
| 4506 | if (sd_flag & SD_BALANCE_WAKE) { | ||
| 4455 | new_cpu = select_idle_sibling(p, prev_cpu); | 4507 | new_cpu = select_idle_sibling(p, prev_cpu); |
| 4456 | goto unlock; | 4508 | goto unlock; |
| 4457 | } | 4509 | } |
| @@ -4519,6 +4571,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) | |||
| 4519 | atomic_long_add(se->avg.load_avg_contrib, | 4571 | atomic_long_add(se->avg.load_avg_contrib, |
| 4520 | &cfs_rq->removed_load); | 4572 | &cfs_rq->removed_load); |
| 4521 | } | 4573 | } |
| 4574 | |||
| 4575 | /* We have migrated, no longer consider this task hot */ | ||
| 4576 | se->exec_start = 0; | ||
| 4522 | } | 4577 | } |
| 4523 | #endif /* CONFIG_SMP */ | 4578 | #endif /* CONFIG_SMP */ |
| 4524 | 4579 | ||
| @@ -5069,6 +5124,7 @@ task_hot(struct task_struct *p, u64 now) | |||
| 5069 | /* Returns true if the destination node has incurred more faults */ | 5124 | /* Returns true if the destination node has incurred more faults */ |
| 5070 | static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) | 5125 | static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) |
| 5071 | { | 5126 | { |
| 5127 | struct numa_group *numa_group = rcu_dereference(p->numa_group); | ||
| 5072 | int src_nid, dst_nid; | 5128 | int src_nid, dst_nid; |
| 5073 | 5129 | ||
| 5074 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || | 5130 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || |
| @@ -5082,21 +5138,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) | |||
| 5082 | if (src_nid == dst_nid) | 5138 | if (src_nid == dst_nid) |
| 5083 | return false; | 5139 | return false; |
| 5084 | 5140 | ||
| 5085 | /* Always encourage migration to the preferred node. */ | 5141 | if (numa_group) { |
| 5086 | if (dst_nid == p->numa_preferred_nid) | 5142 | /* Task is already in the group's interleave set. */ |
| 5087 | return true; | 5143 | if (node_isset(src_nid, numa_group->active_nodes)) |
| 5144 | return false; | ||
| 5145 | |||
| 5146 | /* Task is moving into the group's interleave set. */ | ||
| 5147 | if (node_isset(dst_nid, numa_group->active_nodes)) | ||
| 5148 | return true; | ||
| 5149 | |||
| 5150 | return group_faults(p, dst_nid) > group_faults(p, src_nid); | ||
| 5151 | } | ||
| 5088 | 5152 | ||
| 5089 | /* If both task and group weight improve, this move is a winner. */ | 5153 | /* Encourage migration to the preferred node. */ |
| 5090 | if (task_weight(p, dst_nid) > task_weight(p, src_nid) && | 5154 | if (dst_nid == p->numa_preferred_nid) |
| 5091 | group_weight(p, dst_nid) > group_weight(p, src_nid)) | ||
| 5092 | return true; | 5155 | return true; |
| 5093 | 5156 | ||
| 5094 | return false; | 5157 | return task_faults(p, dst_nid) > task_faults(p, src_nid); |
| 5095 | } | 5158 | } |
| 5096 | 5159 | ||
| 5097 | 5160 | ||
| 5098 | static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | 5161 | static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) |
| 5099 | { | 5162 | { |
| 5163 | struct numa_group *numa_group = rcu_dereference(p->numa_group); | ||
| 5100 | int src_nid, dst_nid; | 5164 | int src_nid, dst_nid; |
| 5101 | 5165 | ||
| 5102 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) | 5166 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) |
| @@ -5111,16 +5175,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | |||
| 5111 | if (src_nid == dst_nid) | 5175 | if (src_nid == dst_nid) |
| 5112 | return false; | 5176 | return false; |
| 5113 | 5177 | ||
| 5178 | if (numa_group) { | ||
| 5179 | /* Task is moving within/into the group's interleave set. */ | ||
| 5180 | if (node_isset(dst_nid, numa_group->active_nodes)) | ||
| 5181 | return false; | ||
| 5182 | |||
| 5183 | /* Task is moving out of the group's interleave set. */ | ||
| 5184 | if (node_isset(src_nid, numa_group->active_nodes)) | ||
| 5185 | return true; | ||
| 5186 | |||
| 5187 | return group_faults(p, dst_nid) < group_faults(p, src_nid); | ||
| 5188 | } | ||
| 5189 | |||
| 5114 | /* Migrating away from the preferred node is always bad. */ | 5190 | /* Migrating away from the preferred node is always bad. */ |
| 5115 | if (src_nid == p->numa_preferred_nid) | 5191 | if (src_nid == p->numa_preferred_nid) |
| 5116 | return true; | 5192 | return true; |
| 5117 | 5193 | ||
| 5118 | /* If either task or group weight get worse, don't do it. */ | 5194 | return task_faults(p, dst_nid) < task_faults(p, src_nid); |
| 5119 | if (task_weight(p, dst_nid) < task_weight(p, src_nid) || | ||
| 5120 | group_weight(p, dst_nid) < group_weight(p, src_nid)) | ||
| 5121 | return true; | ||
| 5122 | |||
| 5123 | return false; | ||
| 5124 | } | 5195 | } |
| 5125 | 5196 | ||
| 5126 | #else | 5197 | #else |
| @@ -5563,6 +5634,7 @@ static unsigned long scale_rt_power(int cpu) | |||
| 5563 | { | 5634 | { |
| 5564 | struct rq *rq = cpu_rq(cpu); | 5635 | struct rq *rq = cpu_rq(cpu); |
| 5565 | u64 total, available, age_stamp, avg; | 5636 | u64 total, available, age_stamp, avg; |
| 5637 | s64 delta; | ||
| 5566 | 5638 | ||
| 5567 | /* | 5639 | /* |
| 5568 | * Since we're reading these variables without serialization make sure | 5640 | * Since we're reading these variables without serialization make sure |
| @@ -5571,7 +5643,11 @@ static unsigned long scale_rt_power(int cpu) | |||
| 5571 | age_stamp = ACCESS_ONCE(rq->age_stamp); | 5643 | age_stamp = ACCESS_ONCE(rq->age_stamp); |
| 5572 | avg = ACCESS_ONCE(rq->rt_avg); | 5644 | avg = ACCESS_ONCE(rq->rt_avg); |
| 5573 | 5645 | ||
| 5574 | total = sched_avg_period() + (rq_clock(rq) - age_stamp); | 5646 | delta = rq_clock(rq) - age_stamp; |
| 5647 | if (unlikely(delta < 0)) | ||
| 5648 | delta = 0; | ||
| 5649 | |||
| 5650 | total = sched_avg_period() + delta; | ||
| 5575 | 5651 | ||
| 5576 | if (unlikely(total < avg)) { | 5652 | if (unlikely(total < avg)) { |
| 5577 | /* Ensures that power won't end up being negative */ | 5653 | /* Ensures that power won't end up being negative */ |
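Because age_stamp and rq_clock() are read without serialization, their difference can transiently come out negative; the hunk clamps it to zero before adding it to the averaging period. A minimal sketch with int64_t standing in for the kernel's s64 (the period constant is made up):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical period constant for the sketch (the kernel derives its own). */
#define AVG_PERIOD 1000000ULL

static uint64_t safe_total(uint64_t clock, uint64_t age_stamp)
{
        int64_t delta = (int64_t)(clock - age_stamp);

        /* Unsynchronized reads can make age_stamp appear newer than the
         * clock; treat that as "no time elapsed" instead of a huge total. */
        if (delta < 0)
                delta = 0;

        return AVG_PERIOD + (uint64_t)delta;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)safe_total(100, 250));  /* clamped */
        printf("%llu\n", (unsigned long long)safe_total(250, 100));  /* +150 */
        return 0;
}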
| @@ -6639,27 +6715,62 @@ out: | |||
| 6639 | return ld_moved; | 6715 | return ld_moved; |
| 6640 | } | 6716 | } |
| 6641 | 6717 | ||
| 6718 | static inline unsigned long | ||
| 6719 | get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) | ||
| 6720 | { | ||
| 6721 | unsigned long interval = sd->balance_interval; | ||
| 6722 | |||
| 6723 | if (cpu_busy) | ||
| 6724 | interval *= sd->busy_factor; | ||
| 6725 | |||
| 6726 | /* scale ms to jiffies */ | ||
| 6727 | interval = msecs_to_jiffies(interval); | ||
| 6728 | interval = clamp(interval, 1UL, max_load_balance_interval); | ||
| 6729 | |||
| 6730 | return interval; | ||
| 6731 | } | ||
| 6732 | |||
| 6733 | static inline void | ||
| 6734 | update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) | ||
| 6735 | { | ||
| 6736 | unsigned long interval, next; | ||
| 6737 | |||
| 6738 | interval = get_sd_balance_interval(sd, cpu_busy); | ||
| 6739 | next = sd->last_balance + interval; | ||
| 6740 | |||
| 6741 | if (time_after(*next_balance, next)) | ||
| 6742 | *next_balance = next; | ||
| 6743 | } | ||
| 6744 | |||
| 6642 | /* | 6745 | /* |
| 6643 | * idle_balance is called by schedule() if this_cpu is about to become | 6746 | * idle_balance is called by schedule() if this_cpu is about to become |
| 6644 | * idle. Attempts to pull tasks from other CPUs. | 6747 | * idle. Attempts to pull tasks from other CPUs. |
| 6645 | */ | 6748 | */ |
| 6646 | static int idle_balance(struct rq *this_rq) | 6749 | static int idle_balance(struct rq *this_rq) |
| 6647 | { | 6750 | { |
| 6751 | unsigned long next_balance = jiffies + HZ; | ||
| 6752 | int this_cpu = this_rq->cpu; | ||
| 6648 | struct sched_domain *sd; | 6753 | struct sched_domain *sd; |
| 6649 | int pulled_task = 0; | 6754 | int pulled_task = 0; |
| 6650 | unsigned long next_balance = jiffies + HZ; | ||
| 6651 | u64 curr_cost = 0; | 6755 | u64 curr_cost = 0; |
| 6652 | int this_cpu = this_rq->cpu; | ||
| 6653 | 6756 | ||
| 6654 | idle_enter_fair(this_rq); | 6757 | idle_enter_fair(this_rq); |
| 6758 | |||
| 6655 | /* | 6759 | /* |
| 6656 | * We must set idle_stamp _before_ calling idle_balance(), such that we | 6760 | * We must set idle_stamp _before_ calling idle_balance(), such that we |
| 6657 | * measure the duration of idle_balance() as idle time. | 6761 | * measure the duration of idle_balance() as idle time. |
| 6658 | */ | 6762 | */ |
| 6659 | this_rq->idle_stamp = rq_clock(this_rq); | 6763 | this_rq->idle_stamp = rq_clock(this_rq); |
| 6660 | 6764 | ||
| 6661 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | 6765 | if (this_rq->avg_idle < sysctl_sched_migration_cost) { |
| 6766 | rcu_read_lock(); | ||
| 6767 | sd = rcu_dereference_check_sched_domain(this_rq->sd); | ||
| 6768 | if (sd) | ||
| 6769 | update_next_balance(sd, 0, &next_balance); | ||
| 6770 | rcu_read_unlock(); | ||
| 6771 | |||
| 6662 | goto out; | 6772 | goto out; |
| 6773 | } | ||
| 6663 | 6774 | ||
| 6664 | /* | 6775 | /* |
| 6665 | * Drop the rq->lock, but keep IRQ/preempt disabled. | 6776 | * Drop the rq->lock, but keep IRQ/preempt disabled. |
| @@ -6669,20 +6780,20 @@ static int idle_balance(struct rq *this_rq) | |||
| 6669 | update_blocked_averages(this_cpu); | 6780 | update_blocked_averages(this_cpu); |
| 6670 | rcu_read_lock(); | 6781 | rcu_read_lock(); |
| 6671 | for_each_domain(this_cpu, sd) { | 6782 | for_each_domain(this_cpu, sd) { |
| 6672 | unsigned long interval; | ||
| 6673 | int continue_balancing = 1; | 6783 | int continue_balancing = 1; |
| 6674 | u64 t0, domain_cost; | 6784 | u64 t0, domain_cost; |
| 6675 | 6785 | ||
| 6676 | if (!(sd->flags & SD_LOAD_BALANCE)) | 6786 | if (!(sd->flags & SD_LOAD_BALANCE)) |
| 6677 | continue; | 6787 | continue; |
| 6678 | 6788 | ||
| 6679 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) | 6789 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { |
| 6790 | update_next_balance(sd, 0, &next_balance); | ||
| 6680 | break; | 6791 | break; |
| 6792 | } | ||
| 6681 | 6793 | ||
| 6682 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 6794 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
| 6683 | t0 = sched_clock_cpu(this_cpu); | 6795 | t0 = sched_clock_cpu(this_cpu); |
| 6684 | 6796 | ||
| 6685 | /* If we've pulled tasks over stop searching: */ | ||
| 6686 | pulled_task = load_balance(this_cpu, this_rq, | 6797 | pulled_task = load_balance(this_cpu, this_rq, |
| 6687 | sd, CPU_NEWLY_IDLE, | 6798 | sd, CPU_NEWLY_IDLE, |
| 6688 | &continue_balancing); | 6799 | &continue_balancing); |
| @@ -6694,41 +6805,37 @@ static int idle_balance(struct rq *this_rq) | |||
| 6694 | curr_cost += domain_cost; | 6805 | curr_cost += domain_cost; |
| 6695 | } | 6806 | } |
| 6696 | 6807 | ||
| 6697 | interval = msecs_to_jiffies(sd->balance_interval); | 6808 | update_next_balance(sd, 0, &next_balance); |
| 6698 | if (time_after(next_balance, sd->last_balance + interval)) | 6809 | |
| 6699 | next_balance = sd->last_balance + interval; | 6810 | /* |
| 6700 | if (pulled_task) | 6811 | * Stop searching for tasks to pull if there are |
| 6812 | * now runnable tasks on this rq. | ||
| 6813 | */ | ||
| 6814 | if (pulled_task || this_rq->nr_running > 0) | ||
| 6701 | break; | 6815 | break; |
| 6702 | } | 6816 | } |
| 6703 | rcu_read_unlock(); | 6817 | rcu_read_unlock(); |
| 6704 | 6818 | ||
| 6705 | raw_spin_lock(&this_rq->lock); | 6819 | raw_spin_lock(&this_rq->lock); |
| 6706 | 6820 | ||
| 6821 | if (curr_cost > this_rq->max_idle_balance_cost) | ||
| 6822 | this_rq->max_idle_balance_cost = curr_cost; | ||
| 6823 | |||
| 6707 | /* | 6824 | /* |
| 6708 | * While browsing the domains, we released the rq lock. | 6825 | * While browsing the domains, we released the rq lock; a task could |
| 6709 | * A task could have be enqueued in the meantime | 6826 | * have been enqueued in the meantime. Since we're not going idle, |
| 6827 | * pretend we pulled a task. | ||
| 6710 | */ | 6828 | */ |
| 6711 | if (this_rq->cfs.h_nr_running && !pulled_task) { | 6829 | if (this_rq->cfs.h_nr_running && !pulled_task) |
| 6712 | pulled_task = 1; | 6830 | pulled_task = 1; |
| 6713 | goto out; | ||
| 6714 | } | ||
| 6715 | 6831 | ||
| 6716 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { | 6832 | out: |
| 6717 | /* | 6833 | /* Move the next balance forward */ |
| 6718 | * We are going idle. next_balance may be set based on | 6834 | if (time_after(this_rq->next_balance, next_balance)) |
| 6719 | * a busy processor. So reset next_balance. | ||
| 6720 | */ | ||
| 6721 | this_rq->next_balance = next_balance; | 6835 | this_rq->next_balance = next_balance; |
| 6722 | } | ||
| 6723 | |||
| 6724 | if (curr_cost > this_rq->max_idle_balance_cost) | ||
| 6725 | this_rq->max_idle_balance_cost = curr_cost; | ||
| 6726 | 6836 | ||
| 6727 | out: | ||
| 6728 | /* Is there a task of a high priority class? */ | 6837 | /* Is there a task of a high priority class? */ |
| 6729 | if (this_rq->nr_running != this_rq->cfs.h_nr_running && | 6838 | if (this_rq->nr_running != this_rq->cfs.h_nr_running) |
| 6730 | (this_rq->dl.dl_nr_running || | ||
| 6731 | (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) | ||
| 6732 | pulled_task = -1; | 6839 | pulled_task = -1; |
| 6733 | 6840 | ||
| 6734 | if (pulled_task) { | 6841 | if (pulled_task) { |
| @@ -7009,16 +7116,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) | |||
| 7009 | break; | 7116 | break; |
| 7010 | } | 7117 | } |
| 7011 | 7118 | ||
| 7012 | interval = sd->balance_interval; | 7119 | interval = get_sd_balance_interval(sd, idle != CPU_IDLE); |
| 7013 | if (idle != CPU_IDLE) | ||
| 7014 | interval *= sd->busy_factor; | ||
| 7015 | |||
| 7016 | /* scale ms to jiffies */ | ||
| 7017 | interval = msecs_to_jiffies(interval); | ||
| 7018 | interval = clamp(interval, 1UL, max_load_balance_interval); | ||
| 7019 | 7120 | ||
| 7020 | need_serialize = sd->flags & SD_SERIALIZE; | 7121 | need_serialize = sd->flags & SD_SERIALIZE; |
| 7021 | |||
| 7022 | if (need_serialize) { | 7122 | if (need_serialize) { |
| 7023 | if (!spin_trylock(&balancing)) | 7123 | if (!spin_trylock(&balancing)) |
| 7024 | goto out; | 7124 | goto out; |
| @@ -7034,6 +7134,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) | |||
| 7034 | idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; | 7134 | idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; |
| 7035 | } | 7135 | } |
| 7036 | sd->last_balance = jiffies; | 7136 | sd->last_balance = jiffies; |
| 7137 | interval = get_sd_balance_interval(sd, idle != CPU_IDLE); | ||
| 7037 | } | 7138 | } |
| 7038 | if (need_serialize) | 7139 | if (need_serialize) |
| 7039 | spin_unlock(&balancing); | 7140 | spin_unlock(&balancing); |
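The open-coded interval scaling in rebalance_domains() is replaced by the get_sd_balance_interval() helper added earlier in this patch. A userspace approximation of that helper's math, assuming HZ=1000 so msecs_to_jiffies() is the identity (the constants are illustrative):

#include <stdio.h>

#define MAX_LB_INTERVAL 100UL   /* stand-in for max_load_balance_interval */

static unsigned long msecs_to_jiffies(unsigned long ms) { return ms; } /* HZ=1000 */

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

/* Mirrors get_sd_balance_interval(): busy CPUs balance less often. */
static unsigned long balance_interval(unsigned long base_ms,
                                      unsigned int busy_factor, int cpu_busy)
{
        unsigned long interval = base_ms;

        if (cpu_busy)
                interval *= busy_factor;

        return clamp_ul(msecs_to_jiffies(interval), 1UL, MAX_LB_INTERVAL);
}

int main(void)
{
        printf("idle: %lu busy: %lu\n",
               balance_interval(8, 32, 0),     /* 8 jiffies */
               balance_interval(8, 32, 1));    /* 8*32=256, clamped to 100 */
        return 0;
}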
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8f4390a079c7..25b9423abce9 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
| @@ -67,24 +67,21 @@ void __weak arch_cpu_idle(void) | |||
| 67 | * cpuidle_idle_call - the main idle function | 67 | * cpuidle_idle_call - the main idle function |
| 68 | * | 68 | * |
| 69 | * NOTE: no locks or semaphores should be used here | 69 | * NOTE: no locks or semaphores should be used here |
| 70 | * return non-zero on failure | ||
| 71 | */ | 70 | */ |
| 72 | static int cpuidle_idle_call(void) | 71 | static void cpuidle_idle_call(void) |
| 73 | { | 72 | { |
| 74 | struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); | 73 | struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); |
| 75 | struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); | 74 | struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); |
| 76 | int next_state, entered_state, ret; | 75 | int next_state, entered_state; |
| 77 | bool broadcast; | 76 | bool broadcast; |
| 78 | 77 | ||
| 79 | /* | 78 | /* |
| 80 | * Check if the idle task must be rescheduled. If it is the | 79 | * Check if the idle task must be rescheduled. If it is the |
| 81 | * case, exit the function after re-enabling the local irq and | 80 | * case, exit the function after re-enabling the local irq. |
| 82 | * set again the polling flag | ||
| 83 | */ | 81 | */ |
| 84 | if (current_clr_polling_and_test()) { | 82 | if (need_resched()) { |
| 85 | local_irq_enable(); | 83 | local_irq_enable(); |
| 86 | __current_set_polling(); | 84 | return; |
| 87 | return 0; | ||
| 88 | } | 85 | } |
| 89 | 86 | ||
| 90 | /* | 87 | /* |
| @@ -101,96 +98,79 @@ static int cpuidle_idle_call(void) | |||
| 101 | rcu_idle_enter(); | 98 | rcu_idle_enter(); |
| 102 | 99 | ||
| 103 | /* | 100 | /* |
| 104 | * Check if the cpuidle framework is ready, otherwise fallback | 101 | * Ask the cpuidle framework to choose a convenient idle state. |
| 105 | * to the default arch specific idle method | 102 | * Fall back to the default arch idle method on errors. |
| 106 | */ | 103 | */ |
| 107 | ret = cpuidle_enabled(drv, dev); | 104 | next_state = cpuidle_select(drv, dev); |
| 108 | 105 | if (next_state < 0) { | |
| 109 | if (!ret) { | 106 | use_default: |
| 110 | /* | 107 | /* |
| 111 | * Ask the governor to choose an idle state it thinks | 108 | * We can't use the cpuidle framework, let's use the default |
| 112 | * it is convenient to go to. There is *always* a | 109 | * idle routine. |
| 113 | * convenient idle state | ||
| 114 | */ | 110 | */ |
| 115 | next_state = cpuidle_select(drv, dev); | 111 | if (current_clr_polling_and_test()) |
| 116 | |||
| 117 | /* | ||
| 118 | * The idle task must be scheduled, it is pointless to | ||
| 119 | * go to idle, just update no idle residency and get | ||
| 120 | * out of this function | ||
| 121 | */ | ||
| 122 | if (current_clr_polling_and_test()) { | ||
| 123 | dev->last_residency = 0; | ||
| 124 | entered_state = next_state; | ||
| 125 | local_irq_enable(); | 112 | local_irq_enable(); |
| 126 | } else { | 113 | else |
| 127 | broadcast = !!(drv->states[next_state].flags & | 114 | arch_cpu_idle(); |
| 128 | CPUIDLE_FLAG_TIMER_STOP); | 115 | |
| 129 | 116 | goto exit_idle; | |
| 130 | if (broadcast) | ||
| 131 | /* | ||
| 132 | * Tell the time framework to switch | ||
| 133 | * to a broadcast timer because our | ||
| 134 | * local timer will be shutdown. If a | ||
| 135 | * local timer is used from another | ||
| 136 | * cpu as a broadcast timer, this call | ||
| 137 | * may fail if it is not available | ||
| 138 | */ | ||
| 139 | ret = clockevents_notify( | ||
| 140 | CLOCK_EVT_NOTIFY_BROADCAST_ENTER, | ||
| 141 | &dev->cpu); | ||
| 142 | |||
| 143 | if (!ret) { | ||
| 144 | trace_cpu_idle_rcuidle(next_state, dev->cpu); | ||
| 145 | |||
| 146 | /* | ||
| 147 | * Enter the idle state previously | ||
| 148 | * returned by the governor | ||
| 149 | * decision. This function will block | ||
| 150 | * until an interrupt occurs and will | ||
| 151 | * take care of re-enabling the local | ||
| 152 | * interrupts | ||
| 153 | */ | ||
| 154 | entered_state = cpuidle_enter(drv, dev, | ||
| 155 | next_state); | ||
| 156 | |||
| 157 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, | ||
| 158 | dev->cpu); | ||
| 159 | |||
| 160 | if (broadcast) | ||
| 161 | clockevents_notify( | ||
| 162 | CLOCK_EVT_NOTIFY_BROADCAST_EXIT, | ||
| 163 | &dev->cpu); | ||
| 164 | |||
| 165 | /* | ||
| 166 | * Give the governor an opportunity to reflect on the | ||
| 167 | * outcome | ||
| 168 | */ | ||
| 169 | cpuidle_reflect(dev, entered_state); | ||
| 170 | } | ||
| 171 | } | ||
| 172 | } | 117 | } |
| 173 | 118 | ||
| 119 | |||
| 174 | /* | 120 | /* |
| 175 | * We can't use the cpuidle framework, let's use the default | 121 | * The idle task must be scheduled, it is pointless to |
| 176 | * idle routine | 122 | * go to idle, just update no idle residency and get |
| 123 | * out of this function | ||
| 177 | */ | 124 | */ |
| 178 | if (ret) | 125 | if (current_clr_polling_and_test()) { |
| 179 | arch_cpu_idle(); | 126 | dev->last_residency = 0; |
| 127 | entered_state = next_state; | ||
| 128 | local_irq_enable(); | ||
| 129 | goto exit_idle; | ||
| 130 | } | ||
| 131 | |||
| 132 | broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); | ||
| 180 | 133 | ||
| 134 | /* | ||
| 135 | * Tell the time framework to switch to a broadcast timer | ||
| 136 | * because our local timer will be shut down. If a local timer | ||
| 137 | * is used from another cpu as a broadcast timer, this call may | ||
| 138 | * fail if it is not available | ||
| 139 | */ | ||
| 140 | if (broadcast && | ||
| 141 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) | ||
| 142 | goto use_default; | ||
| 143 | |||
| 144 | trace_cpu_idle_rcuidle(next_state, dev->cpu); | ||
| 145 | |||
| 146 | /* | ||
| 147 | * Enter the idle state previously returned by the governor decision. | ||
| 148 | * This function will block until an interrupt occurs and will take | ||
| 149 | * care of re-enabling the local interrupts | ||
| 150 | */ | ||
| 151 | entered_state = cpuidle_enter(drv, dev, next_state); | ||
| 152 | |||
| 153 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); | ||
| 154 | |||
| 155 | if (broadcast) | ||
| 156 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); | ||
| 157 | |||
| 158 | /* | ||
| 159 | * Give the governor an opportunity to reflect on the outcome | ||
| 160 | */ | ||
| 161 | cpuidle_reflect(dev, entered_state); | ||
| 162 | |||
| 163 | exit_idle: | ||
| 181 | __current_set_polling(); | 164 | __current_set_polling(); |
| 182 | 165 | ||
| 183 | /* | 166 | /* |
| 184 | * It is up to the idle functions to enable back the local | 167 | * It is up to the idle functions to reenable local interrupts |
| 185 | * interrupt | ||
| 186 | */ | 168 | */ |
| 187 | if (WARN_ON_ONCE(irqs_disabled())) | 169 | if (WARN_ON_ONCE(irqs_disabled())) |
| 188 | local_irq_enable(); | 170 | local_irq_enable(); |
| 189 | 171 | ||
| 190 | rcu_idle_exit(); | 172 | rcu_idle_exit(); |
| 191 | start_critical_timings(); | 173 | start_critical_timings(); |
| 192 | |||
| 193 | return 0; | ||
| 194 | } | 174 | } |
| 195 | 175 | ||
| 196 | /* | 176 | /* |
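The rewritten cpuidle_idle_call() above replaces the nested error handling with two labels: any cpuidle failure (state selection or the broadcast-timer notify) jumps to a default-idle fallback, and all paths converge on a common exit. A compact control-flow sketch with stub functions standing in for the cpuidle/arch hooks (all names and return values here are made up):

#include <stdio.h>

/* Stubs in place of the cpuidle/arch hooks; behaviour is invented. */
static int  select_state(void)        { return -1; /* pretend cpuidle is unavailable */ }
static int  enter_broadcast(void)     { return 0; }
static void exit_broadcast(void)      { }
static void enter_state(int s)        { printf("enter state %d\n", s); }
static void default_idle(void)        { printf("default idle\n"); }

static void idle_call(void)
{
        int state = select_state();

        if (state < 0) {
use_default:
                /* cpuidle can't be used: fall back to the architecture's
                 * default idle routine and share the common exit path. */
                default_idle();
                goto exit_idle;
        }

        if (enter_broadcast())
                goto use_default;

        enter_state(state);
        exit_broadcast();

exit_idle:
        printf("common exit\n");
}

int main(void)
{
        idle_call();
        return 0;
}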
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d8cdf1618551..b3512f1afce9 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
| 79 | rt_rq->overloaded = 0; | 79 | rt_rq->overloaded = 0; |
| 80 | plist_head_init(&rt_rq->pushable_tasks); | 80 | plist_head_init(&rt_rq->pushable_tasks); |
| 81 | #endif | 81 | #endif |
| 82 | /* We start in dequeued state, because no RT tasks are queued */ | ||
| 83 | rt_rq->rt_queued = 0; | ||
| 82 | 84 | ||
| 83 | rt_rq->rt_time = 0; | 85 | rt_rq->rt_time = 0; |
| 84 | rt_rq->rt_throttled = 0; | 86 | rt_rq->rt_throttled = 0; |
| @@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | |||
| 112 | return rt_se->rt_rq; | 114 | return rt_se->rt_rq; |
| 113 | } | 115 | } |
| 114 | 116 | ||
| 117 | static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) | ||
| 118 | { | ||
| 119 | struct rt_rq *rt_rq = rt_se->rt_rq; | ||
| 120 | |||
| 121 | return rt_rq->rq; | ||
| 122 | } | ||
| 123 | |||
| 115 | void free_rt_sched_group(struct task_group *tg) | 124 | void free_rt_sched_group(struct task_group *tg) |
| 116 | { | 125 | { |
| 117 | int i; | 126 | int i; |
| @@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) | |||
| 211 | return container_of(rt_rq, struct rq, rt); | 220 | return container_of(rt_rq, struct rq, rt); |
| 212 | } | 221 | } |
| 213 | 222 | ||
| 214 | static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | 223 | static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) |
| 215 | { | 224 | { |
| 216 | struct task_struct *p = rt_task_of(rt_se); | 225 | struct task_struct *p = rt_task_of(rt_se); |
| 217 | struct rq *rq = task_rq(p); | 226 | |
| 227 | return task_rq(p); | ||
| 228 | } | ||
| 229 | |||
| 230 | static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | ||
| 231 | { | ||
| 232 | struct rq *rq = rq_of_rt_se(rt_se); | ||
| 218 | 233 | ||
| 219 | return &rq->rt; | 234 | return &rq->rt; |
| 220 | } | 235 | } |
| @@ -391,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq) | |||
| 391 | } | 406 | } |
| 392 | #endif /* CONFIG_SMP */ | 407 | #endif /* CONFIG_SMP */ |
| 393 | 408 | ||
| 409 | static void enqueue_top_rt_rq(struct rt_rq *rt_rq); | ||
| 410 | static void dequeue_top_rt_rq(struct rt_rq *rt_rq); | ||
| 411 | |||
| 394 | static inline int on_rt_rq(struct sched_rt_entity *rt_se) | 412 | static inline int on_rt_rq(struct sched_rt_entity *rt_se) |
| 395 | { | 413 | { |
| 396 | return !list_empty(&rt_se->run_list); | 414 | return !list_empty(&rt_se->run_list); |
| @@ -452,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | |||
| 452 | rt_se = rt_rq->tg->rt_se[cpu]; | 470 | rt_se = rt_rq->tg->rt_se[cpu]; |
| 453 | 471 | ||
| 454 | if (rt_rq->rt_nr_running) { | 472 | if (rt_rq->rt_nr_running) { |
| 455 | if (rt_se && !on_rt_rq(rt_se)) | 473 | if (!rt_se) |
| 474 | enqueue_top_rt_rq(rt_rq); | ||
| 475 | else if (!on_rt_rq(rt_se)) | ||
| 456 | enqueue_rt_entity(rt_se, false); | 476 | enqueue_rt_entity(rt_se, false); |
| 477 | |||
| 457 | if (rt_rq->highest_prio.curr < curr->prio) | 478 | if (rt_rq->highest_prio.curr < curr->prio) |
| 458 | resched_task(curr); | 479 | resched_task(curr); |
| 459 | } | 480 | } |
| @@ -466,10 +487,17 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) | |||
| 466 | 487 | ||
| 467 | rt_se = rt_rq->tg->rt_se[cpu]; | 488 | rt_se = rt_rq->tg->rt_se[cpu]; |
| 468 | 489 | ||
| 469 | if (rt_se && on_rt_rq(rt_se)) | 490 | if (!rt_se) |
| 491 | dequeue_top_rt_rq(rt_rq); | ||
| 492 | else if (on_rt_rq(rt_se)) | ||
| 470 | dequeue_rt_entity(rt_se); | 493 | dequeue_rt_entity(rt_se); |
| 471 | } | 494 | } |
| 472 | 495 | ||
| 496 | static inline int rt_rq_throttled(struct rt_rq *rt_rq) | ||
| 497 | { | ||
| 498 | return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; | ||
| 499 | } | ||
| 500 | |||
| 473 | static int rt_se_boosted(struct sched_rt_entity *rt_se) | 501 | static int rt_se_boosted(struct sched_rt_entity *rt_se) |
| 474 | { | 502 | { |
| 475 | struct rt_rq *rt_rq = group_rt_rq(rt_se); | 503 | struct rt_rq *rt_rq = group_rt_rq(rt_se); |
| @@ -532,12 +560,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) | |||
| 532 | 560 | ||
| 533 | static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) | 561 | static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) |
| 534 | { | 562 | { |
| 535 | if (rt_rq->rt_nr_running) | 563 | struct rq *rq = rq_of_rt_rq(rt_rq); |
| 536 | resched_task(rq_of_rt_rq(rt_rq)->curr); | 564 | |
| 565 | if (!rt_rq->rt_nr_running) | ||
| 566 | return; | ||
| 567 | |||
| 568 | enqueue_top_rt_rq(rt_rq); | ||
| 569 | resched_task(rq->curr); | ||
| 537 | } | 570 | } |
| 538 | 571 | ||
| 539 | static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) | 572 | static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) |
| 540 | { | 573 | { |
| 574 | dequeue_top_rt_rq(rt_rq); | ||
| 575 | } | ||
| 576 | |||
| 577 | static inline int rt_rq_throttled(struct rt_rq *rt_rq) | ||
| 578 | { | ||
| 579 | return rt_rq->rt_throttled; | ||
| 541 | } | 580 | } |
| 542 | 581 | ||
| 543 | static inline const struct cpumask *sched_rt_period_mask(void) | 582 | static inline const struct cpumask *sched_rt_period_mask(void) |
| @@ -851,14 +890,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
| 851 | * but accrue some time due to boosting. | 890 | * but accrue some time due to boosting. |
| 852 | */ | 891 | */ |
| 853 | if (likely(rt_b->rt_runtime)) { | 892 | if (likely(rt_b->rt_runtime)) { |
| 854 | static bool once = false; | ||
| 855 | |||
| 856 | rt_rq->rt_throttled = 1; | 893 | rt_rq->rt_throttled = 1; |
| 857 | 894 | printk_deferred_once("sched: RT throttling activated\n"); | |
| 858 | if (!once) { | ||
| 859 | once = true; | ||
| 860 | printk_sched("sched: RT throttling activated\n"); | ||
| 861 | } | ||
| 862 | } else { | 895 | } else { |
| 863 | /* | 896 | /* |
| 864 | * In case we did anyway, make it go away, | 897 | * In case we did anyway, make it go away, |
| @@ -922,6 +955,38 @@ static void update_curr_rt(struct rq *rq) | |||
| 922 | } | 955 | } |
| 923 | } | 956 | } |
| 924 | 957 | ||
| 958 | static void | ||
| 959 | dequeue_top_rt_rq(struct rt_rq *rt_rq) | ||
| 960 | { | ||
| 961 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
| 962 | |||
| 963 | BUG_ON(&rq->rt != rt_rq); | ||
| 964 | |||
| 965 | if (!rt_rq->rt_queued) | ||
| 966 | return; | ||
| 967 | |||
| 968 | BUG_ON(!rq->nr_running); | ||
| 969 | |||
| 970 | sub_nr_running(rq, rt_rq->rt_nr_running); | ||
| 971 | rt_rq->rt_queued = 0; | ||
| 972 | } | ||
| 973 | |||
| 974 | static void | ||
| 975 | enqueue_top_rt_rq(struct rt_rq *rt_rq) | ||
| 976 | { | ||
| 977 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
| 978 | |||
| 979 | BUG_ON(&rq->rt != rt_rq); | ||
| 980 | |||
| 981 | if (rt_rq->rt_queued) | ||
| 982 | return; | ||
| 983 | if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running) | ||
| 984 | return; | ||
| 985 | |||
| 986 | add_nr_running(rq, rt_rq->rt_nr_running); | ||
| 987 | rt_rq->rt_queued = 1; | ||
| 988 | } | ||
| 989 | |||
| 925 | #if defined CONFIG_SMP | 990 | #if defined CONFIG_SMP |
| 926 | 991 | ||
| 927 | static void | 992 | static void |
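The dequeue_top_rt_rq()/enqueue_top_rt_rq() pair above accounts a whole rt_rq's tasks into rq->nr_running at most once, guarded by the new rt_queued flag and skipped while the group is throttled or empty. A toy model of that idempotent accounting (simplified structs, not the kernel ones):

#include <stdio.h>

struct toy_rt_rq {
        unsigned int rt_nr_running;
        int rt_throttled;
        int rt_queued;          /* are this group's tasks counted in rq? */
};

struct toy_rq {
        unsigned int nr_running;
        struct toy_rt_rq rt;
};

static void dequeue_top(struct toy_rq *rq)
{
        if (!rq->rt.rt_queued)
                return;                         /* already uncounted */
        rq->nr_running -= rq->rt.rt_nr_running;
        rq->rt.rt_queued = 0;
}

static void enqueue_top(struct toy_rq *rq)
{
        if (rq->rt.rt_queued)
                return;                         /* already counted */
        if (rq->rt.rt_throttled || !rq->rt.rt_nr_running)
                return;                         /* nothing runnable to count */
        rq->nr_running += rq->rt.rt_nr_running;
        rq->rt.rt_queued = 1;
}

int main(void)
{
        struct toy_rq rq = { .nr_running = 3, .rt = { .rt_nr_running = 2 } };

        enqueue_top(&rq);
        enqueue_top(&rq);       /* second call is a no-op thanks to rt_queued */
        printf("nr_running=%u\n", rq.nr_running);       /* 5 */
        dequeue_top(&rq);
        printf("nr_running=%u\n", rq.nr_running);       /* 3 */
        return 0;
}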
| @@ -1045,12 +1110,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} | |||
| 1045 | #endif /* CONFIG_RT_GROUP_SCHED */ | 1110 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 1046 | 1111 | ||
| 1047 | static inline | 1112 | static inline |
| 1113 | unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) | ||
| 1114 | { | ||
| 1115 | struct rt_rq *group_rq = group_rt_rq(rt_se); | ||
| 1116 | |||
| 1117 | if (group_rq) | ||
| 1118 | return group_rq->rt_nr_running; | ||
| 1119 | else | ||
| 1120 | return 1; | ||
| 1121 | } | ||
| 1122 | |||
| 1123 | static inline | ||
| 1048 | void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | 1124 | void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) |
| 1049 | { | 1125 | { |
| 1050 | int prio = rt_se_prio(rt_se); | 1126 | int prio = rt_se_prio(rt_se); |
| 1051 | 1127 | ||
| 1052 | WARN_ON(!rt_prio(prio)); | 1128 | WARN_ON(!rt_prio(prio)); |
| 1053 | rt_rq->rt_nr_running++; | 1129 | rt_rq->rt_nr_running += rt_se_nr_running(rt_se); |
| 1054 | 1130 | ||
| 1055 | inc_rt_prio(rt_rq, prio); | 1131 | inc_rt_prio(rt_rq, prio); |
| 1056 | inc_rt_migration(rt_se, rt_rq); | 1132 | inc_rt_migration(rt_se, rt_rq); |
| @@ -1062,7 +1138,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
| 1062 | { | 1138 | { |
| 1063 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); | 1139 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); |
| 1064 | WARN_ON(!rt_rq->rt_nr_running); | 1140 | WARN_ON(!rt_rq->rt_nr_running); |
| 1065 | rt_rq->rt_nr_running--; | 1141 | rt_rq->rt_nr_running -= rt_se_nr_running(rt_se); |
| 1066 | 1142 | ||
| 1067 | dec_rt_prio(rt_rq, rt_se_prio(rt_se)); | 1143 | dec_rt_prio(rt_rq, rt_se_prio(rt_se)); |
| 1068 | dec_rt_migration(rt_se, rt_rq); | 1144 | dec_rt_migration(rt_se, rt_rq); |
| @@ -1119,6 +1195,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) | |||
| 1119 | back = rt_se; | 1195 | back = rt_se; |
| 1120 | } | 1196 | } |
| 1121 | 1197 | ||
| 1198 | dequeue_top_rt_rq(rt_rq_of_se(back)); | ||
| 1199 | |||
| 1122 | for (rt_se = back; rt_se; rt_se = rt_se->back) { | 1200 | for (rt_se = back; rt_se; rt_se = rt_se->back) { |
| 1123 | if (on_rt_rq(rt_se)) | 1201 | if (on_rt_rq(rt_se)) |
| 1124 | __dequeue_rt_entity(rt_se); | 1202 | __dequeue_rt_entity(rt_se); |
| @@ -1127,13 +1205,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) | |||
| 1127 | 1205 | ||
| 1128 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) | 1206 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) |
| 1129 | { | 1207 | { |
| 1208 | struct rq *rq = rq_of_rt_se(rt_se); | ||
| 1209 | |||
| 1130 | dequeue_rt_stack(rt_se); | 1210 | dequeue_rt_stack(rt_se); |
| 1131 | for_each_sched_rt_entity(rt_se) | 1211 | for_each_sched_rt_entity(rt_se) |
| 1132 | __enqueue_rt_entity(rt_se, head); | 1212 | __enqueue_rt_entity(rt_se, head); |
| 1213 | enqueue_top_rt_rq(&rq->rt); | ||
| 1133 | } | 1214 | } |
| 1134 | 1215 | ||
| 1135 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | 1216 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se) |
| 1136 | { | 1217 | { |
| 1218 | struct rq *rq = rq_of_rt_se(rt_se); | ||
| 1219 | |||
| 1137 | dequeue_rt_stack(rt_se); | 1220 | dequeue_rt_stack(rt_se); |
| 1138 | 1221 | ||
| 1139 | for_each_sched_rt_entity(rt_se) { | 1222 | for_each_sched_rt_entity(rt_se) { |
| @@ -1142,6 +1225,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | |||
| 1142 | if (rt_rq && rt_rq->rt_nr_running) | 1225 | if (rt_rq && rt_rq->rt_nr_running) |
| 1143 | __enqueue_rt_entity(rt_se, false); | 1226 | __enqueue_rt_entity(rt_se, false); |
| 1144 | } | 1227 | } |
| 1228 | enqueue_top_rt_rq(&rq->rt); | ||
| 1145 | } | 1229 | } |
| 1146 | 1230 | ||
| 1147 | /* | 1231 | /* |
| @@ -1159,8 +1243,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
| 1159 | 1243 | ||
| 1160 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) | 1244 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) |
| 1161 | enqueue_pushable_task(rq, p); | 1245 | enqueue_pushable_task(rq, p); |
| 1162 | |||
| 1163 | inc_nr_running(rq); | ||
| 1164 | } | 1246 | } |
| 1165 | 1247 | ||
| 1166 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | 1248 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) |
| @@ -1171,8 +1253,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
| 1171 | dequeue_rt_entity(rt_se); | 1253 | dequeue_rt_entity(rt_se); |
| 1172 | 1254 | ||
| 1173 | dequeue_pushable_task(rq, p); | 1255 | dequeue_pushable_task(rq, p); |
| 1174 | |||
| 1175 | dec_nr_running(rq); | ||
| 1176 | } | 1256 | } |
| 1177 | 1257 | ||
| 1178 | /* | 1258 | /* |
| @@ -1362,10 +1442,11 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) | |||
| 1362 | pull_rt_task(rq); | 1442 | pull_rt_task(rq); |
| 1363 | /* | 1443 | /* |
| 1364 | * pull_rt_task() can drop (and re-acquire) rq->lock; this | 1444 | * pull_rt_task() can drop (and re-acquire) rq->lock; this |
| 1365 | * means a dl task can slip in, in which case we need to | 1445 | * means a dl or stop task can slip in, in which case we need |
| 1366 | * re-start task selection. | 1446 | * to re-start task selection. |
| 1367 | */ | 1447 | */ |
| 1368 | if (unlikely(rq->dl.dl_nr_running)) | 1448 | if (unlikely((rq->stop && rq->stop->on_rq) || |
| 1449 | rq->dl.dl_nr_running)) | ||
| 1369 | return RETRY_TASK; | 1450 | return RETRY_TASK; |
| 1370 | } | 1451 | } |
| 1371 | 1452 | ||
| @@ -1376,10 +1457,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) | |||
| 1376 | if (prev->sched_class == &rt_sched_class) | 1457 | if (prev->sched_class == &rt_sched_class) |
| 1377 | update_curr_rt(rq); | 1458 | update_curr_rt(rq); |
| 1378 | 1459 | ||
| 1379 | if (!rt_rq->rt_nr_running) | 1460 | if (!rt_rq->rt_queued) |
| 1380 | return NULL; | ||
| 1381 | |||
| 1382 | if (rt_rq_throttled(rt_rq)) | ||
| 1383 | return NULL; | 1461 | return NULL; |
| 1384 | 1462 | ||
| 1385 | put_prev_task(rq, prev); | 1463 | put_prev_task(rq, prev); |
| @@ -1891,9 +1969,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
| 1891 | */ | 1969 | */ |
| 1892 | if (p->on_rq && rq->curr != p) { | 1970 | if (p->on_rq && rq->curr != p) { |
| 1893 | #ifdef CONFIG_SMP | 1971 | #ifdef CONFIG_SMP |
| 1894 | if (rq->rt.overloaded && push_rt_task(rq) && | 1972 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && |
| 1895 | /* Don't resched if we changed runqueues */ | 1973 | /* Don't resched if we changed runqueues */ |
| 1896 | rq != task_rq(p)) | 1974 | push_rt_task(rq) && rq != task_rq(p)) |
| 1897 | check_resched = 0; | 1975 | check_resched = 0; |
| 1898 | #endif /* CONFIG_SMP */ | 1976 | #endif /* CONFIG_SMP */ |
| 1899 | if (check_resched && p->prio < rq->curr->prio) | 1977 | if (check_resched && p->prio < rq->curr->prio) |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c9007f28d3a2..e47679b04d16 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -278,7 +278,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); | |||
| 278 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); | 278 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); |
| 279 | 279 | ||
| 280 | extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); | 280 | extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); |
| 281 | extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); | 281 | extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force); |
| 282 | extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); | 282 | extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); |
| 283 | 283 | ||
| 284 | extern void free_rt_sched_group(struct task_group *tg); | 284 | extern void free_rt_sched_group(struct task_group *tg); |
| @@ -409,6 +409,8 @@ struct rt_rq { | |||
| 409 | int overloaded; | 409 | int overloaded; |
| 410 | struct plist_head pushable_tasks; | 410 | struct plist_head pushable_tasks; |
| 411 | #endif | 411 | #endif |
| 412 | int rt_queued; | ||
| 413 | |||
| 412 | int rt_throttled; | 414 | int rt_throttled; |
| 413 | u64 rt_time; | 415 | u64 rt_time; |
| 414 | u64 rt_runtime; | 416 | u64 rt_runtime; |
| @@ -423,18 +425,6 @@ struct rt_rq { | |||
| 423 | #endif | 425 | #endif |
| 424 | }; | 426 | }; |
| 425 | 427 | ||
| 426 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 427 | static inline int rt_rq_throttled(struct rt_rq *rt_rq) | ||
| 428 | { | ||
| 429 | return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; | ||
| 430 | } | ||
| 431 | #else | ||
| 432 | static inline int rt_rq_throttled(struct rt_rq *rt_rq) | ||
| 433 | { | ||
| 434 | return rt_rq->rt_throttled; | ||
| 435 | } | ||
| 436 | #endif | ||
| 437 | |||
| 438 | /* Deadline class' related fields in a runqueue */ | 428 | /* Deadline class' related fields in a runqueue */ |
| 439 | struct dl_rq { | 429 | struct dl_rq { |
| 440 | /* runqueue is an rbtree, ordered by deadline */ | 430 | /* runqueue is an rbtree, ordered by deadline */ |
| @@ -1216,12 +1206,14 @@ extern void update_idle_cpu_load(struct rq *this_rq); | |||
| 1216 | 1206 | ||
| 1217 | extern void init_task_runnable_average(struct task_struct *p); | 1207 | extern void init_task_runnable_average(struct task_struct *p); |
| 1218 | 1208 | ||
| 1219 | static inline void inc_nr_running(struct rq *rq) | 1209 | static inline void add_nr_running(struct rq *rq, unsigned count) |
| 1220 | { | 1210 | { |
| 1221 | rq->nr_running++; | 1211 | unsigned prev_nr = rq->nr_running; |
| 1212 | |||
| 1213 | rq->nr_running = prev_nr + count; | ||
| 1222 | 1214 | ||
| 1223 | #ifdef CONFIG_NO_HZ_FULL | 1215 | #ifdef CONFIG_NO_HZ_FULL |
| 1224 | if (rq->nr_running == 2) { | 1216 | if (prev_nr < 2 && rq->nr_running >= 2) { |
| 1225 | if (tick_nohz_full_cpu(rq->cpu)) { | 1217 | if (tick_nohz_full_cpu(rq->cpu)) { |
| 1226 | /* Order rq->nr_running write against the IPI */ | 1218 | /* Order rq->nr_running write against the IPI */ |
| 1227 | smp_wmb(); | 1219 | smp_wmb(); |
| @@ -1231,9 +1223,9 @@ static inline void inc_nr_running(struct rq *rq) | |||
| 1231 | #endif | 1223 | #endif |
| 1232 | } | 1224 | } |
| 1233 | 1225 | ||
| 1234 | static inline void dec_nr_running(struct rq *rq) | 1226 | static inline void sub_nr_running(struct rq *rq, unsigned count) |
| 1235 | { | 1227 | { |
| 1236 | rq->nr_running--; | 1228 | rq->nr_running -= count; |
| 1237 | } | 1229 | } |
| 1238 | 1230 | ||
| 1239 | static inline void rq_last_tick_reset(struct rq *rq) | 1231 | static inline void rq_last_tick_reset(struct rq *rq) |
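Replacing inc_nr_running()/dec_nr_running() with count-taking helpers lets a scheduling class account a whole group of tasks in one step while still detecting the 1 to 2 transition that triggers the NO_HZ_FULL tick kick. A hedged sketch of a bulk caller; the real enqueue_top_rt_rq()/dequeue_top_rt_rq() helpers in kernel/sched/rt.c follow this shape but are not reproduced verbatim here:

	static void enqueue_top_rt_rq_sketch(struct rt_rq *rt_rq)
	{
		struct rq *rq = rq_of_rt_rq(rt_rq);	/* root rt_rq of this CPU */

		if (rt_rq->rt_queued)
			return;
		if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
			return;

		add_nr_running(rq, rt_rq->rt_nr_running);	/* whole group at once */
		rt_rq->rt_queued = 1;
	}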
| @@ -1385,6 +1377,15 @@ static inline void double_lock(spinlock_t *l1, spinlock_t *l2) | |||
| 1385 | spin_lock_nested(l2, SINGLE_DEPTH_NESTING); | 1377 | spin_lock_nested(l2, SINGLE_DEPTH_NESTING); |
| 1386 | } | 1378 | } |
| 1387 | 1379 | ||
| 1380 | static inline void double_lock_irq(spinlock_t *l1, spinlock_t *l2) | ||
| 1381 | { | ||
| 1382 | if (l1 > l2) | ||
| 1383 | swap(l1, l2); | ||
| 1384 | |||
| 1385 | spin_lock_irq(l1); | ||
| 1386 | spin_lock_nested(l2, SINGLE_DEPTH_NESTING); | ||
| 1387 | } | ||
| 1388 | |||
| 1388 | static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) | 1389 | static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) |
| 1389 | { | 1390 | { |
| 1390 | if (l1 > l2) | 1391 | if (l1 > l2) |
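double_lock_irq() mirrors the existing double_lock()/double_raw_lock() helpers: ordering the two locks by address guarantees that two CPUs taking the same pair in opposite roles cannot ABBA-deadlock, and spin_lock_irq() on the first lock keeps interrupts off while both are held. A minimal usage sketch with hypothetical locks a and b, illustrative only:

	static void example_double_lock_irq(spinlock_t *a, spinlock_t *b)
	{
		double_lock_irq(a, b);		/* IRQs off, locks taken in address order */
		/* ... operate on both protected structures ... */
		spin_unlock(b);
		spin_unlock_irq(a);		/* last unlock re-enables IRQs */
	}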
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index d6ce65dde541..bfe0edadbfbb 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
| @@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev) | |||
| 41 | static void | 41 | static void |
| 42 | enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) | 42 | enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) |
| 43 | { | 43 | { |
| 44 | inc_nr_running(rq); | 44 | add_nr_running(rq, 1); |
| 45 | } | 45 | } |
| 46 | 46 | ||
| 47 | static void | 47 | static void |
| 48 | dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) | 48 | dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) |
| 49 | { | 49 | { |
| 50 | dec_nr_running(rq); | 50 | sub_nr_running(rq, 1); |
| 51 | } | 51 | } |
| 52 | 52 | ||
| 53 | static void yield_task_stop(struct rq *rq) | 53 | static void yield_task_stop(struct rq *rq) |
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 7d50f794e248..0ffa20ae657b 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
| @@ -394,7 +394,7 @@ EXPORT_SYMBOL(__wake_up_bit); | |||
| 394 | * | 394 | * |
| 395 | * In order for this to function properly, as it uses waitqueue_active() | 395 | * In order for this to function properly, as it uses waitqueue_active() |
| 396 | * internally, some kind of memory barrier must be done prior to calling | 396 | * internally, some kind of memory barrier must be done prior to calling |
| 397 | * this. Typically, this will be smp_mb__after_clear_bit(), but in some | 397 | * this. Typically, this will be smp_mb__after_atomic(), but in some |
| 398 | * cases where bitflags are manipulated non-atomically under a lock, one | 398 | * cases where bitflags are manipulated non-atomically under a lock, one |
| 399 | * may need to use a less regular barrier, such as fs/inode.c's smp_mb(), | 399 | * may need to use a less regular barrier, such as fs/inode.c's smp_mb(), |
| 400 | * because spin_unlock() does not guarantee a memory barrier. | 400 | * because spin_unlock() does not guarantee a memory barrier. |
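The comment above documents the waker-side ordering contract; a sketch of the canonical pattern it describes follows. (The signal.c hunk further down uses a full smp_mb() instead, because the jobctl flag there is cleared non-atomically under siglock.)

	static void release_flag(unsigned long *word, int bit)
	{
		clear_bit(bit, word);
		/* Order the clear against waitqueue_active() inside wake_up_bit(). */
		smp_mb__after_atomic();
		wake_up_bit(word, bit);
	}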
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d8d046c0726a..f6d76bebe69f 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
| @@ -39,7 +39,7 @@ | |||
| 39 | * is only needed for handling filters shared across tasks. | 39 | * is only needed for handling filters shared across tasks. |
| 40 | * @prev: points to a previously installed, or inherited, filter | 40 | * @prev: points to a previously installed, or inherited, filter |
| 41 | * @len: the number of instructions in the program | 41 | * @len: the number of instructions in the program |
| 42 | * @insns: the BPF program instructions to evaluate | 42 | * @insnsi: the BPF program instructions to evaluate |
| 43 | * | 43 | * |
| 44 | * seccomp_filter objects are organized in a tree linked via the @prev | 44 | * seccomp_filter objects are organized in a tree linked via the @prev |
| 45 | * pointer. For any task, it appears to be a singly-linked list starting | 45 | * pointer. For any task, it appears to be a singly-linked list starting |
| @@ -69,18 +69,17 @@ static void populate_seccomp_data(struct seccomp_data *sd) | |||
| 69 | { | 69 | { |
| 70 | struct task_struct *task = current; | 70 | struct task_struct *task = current; |
| 71 | struct pt_regs *regs = task_pt_regs(task); | 71 | struct pt_regs *regs = task_pt_regs(task); |
| 72 | unsigned long args[6]; | ||
| 72 | 73 | ||
| 73 | sd->nr = syscall_get_nr(task, regs); | 74 | sd->nr = syscall_get_nr(task, regs); |
| 74 | sd->arch = syscall_get_arch(); | 75 | sd->arch = syscall_get_arch(); |
| 75 | 76 | syscall_get_arguments(task, regs, 0, 6, args); | |
| 76 | /* Unroll syscall_get_args to help gcc on arm. */ | 77 | sd->args[0] = args[0]; |
| 77 | syscall_get_arguments(task, regs, 0, 1, (unsigned long *) &sd->args[0]); | 78 | sd->args[1] = args[1]; |
| 78 | syscall_get_arguments(task, regs, 1, 1, (unsigned long *) &sd->args[1]); | 79 | sd->args[2] = args[2]; |
| 79 | syscall_get_arguments(task, regs, 2, 1, (unsigned long *) &sd->args[2]); | 80 | sd->args[3] = args[3]; |
| 80 | syscall_get_arguments(task, regs, 3, 1, (unsigned long *) &sd->args[3]); | 81 | sd->args[4] = args[4]; |
| 81 | syscall_get_arguments(task, regs, 4, 1, (unsigned long *) &sd->args[4]); | 82 | sd->args[5] = args[5]; |
| 82 | syscall_get_arguments(task, regs, 5, 1, (unsigned long *) &sd->args[5]); | ||
| 83 | |||
| 84 | sd->instruction_pointer = KSTK_EIP(task); | 83 | sd->instruction_pointer = KSTK_EIP(task); |
| 85 | } | 84 | } |
| 86 | 85 | ||
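The likely motivation for the bounce buffer: seccomp_data carries 64-bit argument slots, while syscall_get_arguments() stores unsigned long values, so the old per-element casts only filled half of each slot on 32-bit kernels. The layout below is paraphrased from include/uapi/linux/seccomp.h for illustration:

	struct seccomp_data_shape {		/* illustrative copy, not the uapi header */
		int	nr;
		__u32	arch;
		__u64	instruction_pointer;
		__u64	args[6];		/* always 8 bytes per element */
	};
	/*
	 * unsigned long args[6] is 4 bytes per element on a 32-bit kernel;
	 * assigning sd->args[i] = args[i] lets the compiler widen each value,
	 * which the old (unsigned long *)&sd->args[i] casts could not do.
	 */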
| @@ -221,7 +220,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) | |||
| 221 | return -ENOMEM; | 220 | return -ENOMEM; |
| 222 | 221 | ||
| 223 | /* | 222 | /* |
| 224 | * Installing a seccomp filter requires that the task have | 223 | * Installing a seccomp filter requires that the task has |
| 225 | * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. | 224 | * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. |
| 226 | * This avoids scenarios where unprivileged tasks can affect the | 225 | * This avoids scenarios where unprivileged tasks can affect the |
| 227 | * behavior of privileged children. | 226 | * behavior of privileged children. |
| @@ -256,6 +255,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) | |||
| 256 | goto free_prog; | 255 | goto free_prog; |
| 257 | 256 | ||
| 258 | /* Allocate a new seccomp_filter */ | 257 | /* Allocate a new seccomp_filter */ |
| 258 | ret = -ENOMEM; | ||
| 259 | filter = kzalloc(sizeof(struct seccomp_filter) + | 259 | filter = kzalloc(sizeof(struct seccomp_filter) + |
| 260 | sizeof(struct sock_filter_int) * new_len, | 260 | sizeof(struct sock_filter_int) * new_len, |
| 261 | GFP_KERNEL|__GFP_NOWARN); | 261 | GFP_KERNEL|__GFP_NOWARN); |
| @@ -265,6 +265,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) | |||
| 265 | ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len); | 265 | ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len); |
| 266 | if (ret) | 266 | if (ret) |
| 267 | goto free_filter; | 267 | goto free_filter; |
| 268 | kfree(fp); | ||
| 268 | 269 | ||
| 269 | atomic_set(&filter->usage, 1); | 270 | atomic_set(&filter->usage, 1); |
| 270 | filter->len = new_len; | 271 | filter->len = new_len; |
diff --git a/kernel/signal.c b/kernel/signal.c index 6ea13c09ae56..a4077e90f19f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -277,6 +277,7 @@ void task_clear_jobctl_trapping(struct task_struct *task) | |||
| 277 | { | 277 | { |
| 278 | if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { | 278 | if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { |
| 279 | task->jobctl &= ~JOBCTL_TRAPPING; | 279 | task->jobctl &= ~JOBCTL_TRAPPING; |
| 280 | smp_mb(); /* advised by wake_up_bit() */ | ||
| 280 | wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); | 281 | wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); |
| 281 | } | 282 | } |
| 282 | } | 283 | } |
| @@ -705,11 +706,8 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state) | |||
| 705 | * Returns 1 if any signals were found. | 706 | * Returns 1 if any signals were found. |
| 706 | * | 707 | * |
| 707 | * All callers must be holding the siglock. | 708 | * All callers must be holding the siglock. |
| 708 | * | ||
| 709 | * This version takes a sigset mask and looks at all signals, | ||
| 710 | * not just those in the first mask word. | ||
| 711 | */ | 709 | */ |
| 712 | static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) | 710 | static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) |
| 713 | { | 711 | { |
| 714 | struct sigqueue *q, *n; | 712 | struct sigqueue *q, *n; |
| 715 | sigset_t m; | 713 | sigset_t m; |
| @@ -727,29 +725,6 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) | |||
| 727 | } | 725 | } |
| 728 | return 1; | 726 | return 1; |
| 729 | } | 727 | } |
| 730 | /* | ||
| 731 | * Remove signals in mask from the pending set and queue. | ||
| 732 | * Returns 1 if any signals were found. | ||
| 733 | * | ||
| 734 | * All callers must be holding the siglock. | ||
| 735 | */ | ||
| 736 | static int rm_from_queue(unsigned long mask, struct sigpending *s) | ||
| 737 | { | ||
| 738 | struct sigqueue *q, *n; | ||
| 739 | |||
| 740 | if (!sigtestsetmask(&s->signal, mask)) | ||
| 741 | return 0; | ||
| 742 | |||
| 743 | sigdelsetmask(&s->signal, mask); | ||
| 744 | list_for_each_entry_safe(q, n, &s->list, list) { | ||
| 745 | if (q->info.si_signo < SIGRTMIN && | ||
| 746 | (mask & sigmask(q->info.si_signo))) { | ||
| 747 | list_del_init(&q->list); | ||
| 748 | __sigqueue_free(q); | ||
| 749 | } | ||
| 750 | } | ||
| 751 | return 1; | ||
| 752 | } | ||
| 753 | 728 | ||
| 754 | static inline int is_si_special(const struct siginfo *info) | 729 | static inline int is_si_special(const struct siginfo *info) |
| 755 | { | 730 | { |
| @@ -861,6 +836,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) | |||
| 861 | { | 836 | { |
| 862 | struct signal_struct *signal = p->signal; | 837 | struct signal_struct *signal = p->signal; |
| 863 | struct task_struct *t; | 838 | struct task_struct *t; |
| 839 | sigset_t flush; | ||
| 864 | 840 | ||
| 865 | if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { | 841 | if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { |
| 866 | if (signal->flags & SIGNAL_GROUP_COREDUMP) | 842 | if (signal->flags & SIGNAL_GROUP_COREDUMP) |
| @@ -872,26 +848,25 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) | |||
| 872 | /* | 848 | /* |
| 873 | * This is a stop signal. Remove SIGCONT from all queues. | 849 | * This is a stop signal. Remove SIGCONT from all queues. |
| 874 | */ | 850 | */ |
| 875 | rm_from_queue(sigmask(SIGCONT), &signal->shared_pending); | 851 | siginitset(&flush, sigmask(SIGCONT)); |
| 876 | t = p; | 852 | flush_sigqueue_mask(&flush, &signal->shared_pending); |
| 877 | do { | 853 | for_each_thread(p, t) |
| 878 | rm_from_queue(sigmask(SIGCONT), &t->pending); | 854 | flush_sigqueue_mask(&flush, &t->pending); |
| 879 | } while_each_thread(p, t); | ||
| 880 | } else if (sig == SIGCONT) { | 855 | } else if (sig == SIGCONT) { |
| 881 | unsigned int why; | 856 | unsigned int why; |
| 882 | /* | 857 | /* |
| 883 | * Remove all stop signals from all queues, wake all threads. | 858 | * Remove all stop signals from all queues, wake all threads. |
| 884 | */ | 859 | */ |
| 885 | rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); | 860 | siginitset(&flush, SIG_KERNEL_STOP_MASK); |
| 886 | t = p; | 861 | flush_sigqueue_mask(&flush, &signal->shared_pending); |
| 887 | do { | 862 | for_each_thread(p, t) { |
| 863 | flush_sigqueue_mask(&flush, &t->pending); | ||
| 888 | task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); | 864 | task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); |
| 889 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); | ||
| 890 | if (likely(!(t->ptrace & PT_SEIZED))) | 865 | if (likely(!(t->ptrace & PT_SEIZED))) |
| 891 | wake_up_state(t, __TASK_STOPPED); | 866 | wake_up_state(t, __TASK_STOPPED); |
| 892 | else | 867 | else |
| 893 | ptrace_trap_notify(t); | 868 | ptrace_trap_notify(t); |
| 894 | } while_each_thread(p, t); | 869 | } |
| 895 | 870 | ||
| 896 | /* | 871 | /* |
| 897 | * Notify the parent with CLD_CONTINUED if we were stopped. | 872 | * Notify the parent with CLD_CONTINUED if we were stopped. |
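The replaced rm_from_queue() took a bare unsigned long bitmask and therefore only covered the first word of signals; flush_sigqueue_mask() takes a full sigset_t, which is why callers now build one with siginitset(). A tiny illustrative helper showing the new calling convention (not present in the tree):

	static void flush_one_signal(struct sigpending *pending, int sig)
	{
		sigset_t flush;

		siginitset(&flush, sigmask(sig));	/* word 0 = mask, rest zeroed */
		flush_sigqueue_mask(&flush, pending);	/* caller holds siglock */
	}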
| @@ -2854,7 +2829,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info, | |||
| 2854 | 2829 | ||
| 2855 | spin_lock_irq(&tsk->sighand->siglock); | 2830 | spin_lock_irq(&tsk->sighand->siglock); |
| 2856 | __set_task_blocked(tsk, &tsk->real_blocked); | 2831 | __set_task_blocked(tsk, &tsk->real_blocked); |
| 2857 | siginitset(&tsk->real_blocked, 0); | 2832 | sigemptyset(&tsk->real_blocked); |
| 2858 | sig = dequeue_signal(tsk, &mask, info); | 2833 | sig = dequeue_signal(tsk, &mask, info); |
| 2859 | } | 2834 | } |
| 2860 | spin_unlock_irq(&tsk->sighand->siglock); | 2835 | spin_unlock_irq(&tsk->sighand->siglock); |
| @@ -3091,18 +3066,39 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, | |||
| 3091 | } | 3066 | } |
| 3092 | #endif | 3067 | #endif |
| 3093 | 3068 | ||
| 3069 | /* | ||
| 3070 | * For kthreads only, must not be used if cloned with CLONE_SIGHAND | ||
| 3071 | */ | ||
| 3072 | void kernel_sigaction(int sig, __sighandler_t action) | ||
| 3073 | { | ||
| 3074 | spin_lock_irq(¤t->sighand->siglock); | ||
| 3075 | current->sighand->action[sig - 1].sa.sa_handler = action; | ||
| 3076 | if (action == SIG_IGN) { | ||
| 3077 | sigset_t mask; | ||
| 3078 | |||
| 3079 | sigemptyset(&mask); | ||
| 3080 | sigaddset(&mask, sig); | ||
| 3081 | |||
| 3082 | flush_sigqueue_mask(&mask, ¤t->signal->shared_pending); | ||
| 3083 | flush_sigqueue_mask(&mask, ¤t->pending); | ||
| 3084 | recalc_sigpending(); | ||
| 3085 | } | ||
| 3086 | spin_unlock_irq(¤t->sighand->siglock); | ||
| 3087 | } | ||
| 3088 | EXPORT_SYMBOL(kernel_sigaction); | ||
| 3089 | |||
| 3094 | int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | 3090 | int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) |
| 3095 | { | 3091 | { |
| 3096 | struct task_struct *t = current; | 3092 | struct task_struct *p = current, *t; |
| 3097 | struct k_sigaction *k; | 3093 | struct k_sigaction *k; |
| 3098 | sigset_t mask; | 3094 | sigset_t mask; |
| 3099 | 3095 | ||
| 3100 | if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) | 3096 | if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) |
| 3101 | return -EINVAL; | 3097 | return -EINVAL; |
| 3102 | 3098 | ||
| 3103 | k = &t->sighand->action[sig-1]; | 3099 | k = &p->sighand->action[sig-1]; |
| 3104 | 3100 | ||
| 3105 | spin_lock_irq(¤t->sighand->siglock); | 3101 | spin_lock_irq(&p->sighand->siglock); |
| 3106 | if (oact) | 3102 | if (oact) |
| 3107 | *oact = *k; | 3103 | *oact = *k; |
| 3108 | 3104 | ||
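kernel_sigaction() gives kernel threads a compact way to install or reset a handler with the flush and sigpending recalculation done under siglock. A hedged usage sketch for a hypothetical kthread; threads spawned by kthreadd do not share a sighand with userspace, so the CLONE_SIGHAND caveat above is satisfied:

	static int my_service_thread(void *unused)
	{
		/* Drop any queued SIGHUP (e.g. after an earlier allow_signal(SIGHUP))
		 * and ignore it from now on. */
		kernel_sigaction(SIGHUP, SIG_IGN);

		while (!kthread_should_stop())
			schedule_timeout_interruptible(HZ);
		return 0;
	}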
| @@ -3121,21 +3117,20 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) | |||
| 3121 | * (for example, SIGCHLD), shall cause the pending signal to | 3117 | * (for example, SIGCHLD), shall cause the pending signal to |
| 3122 | * be discarded, whether or not it is blocked" | 3118 | * be discarded, whether or not it is blocked" |
| 3123 | */ | 3119 | */ |
| 3124 | if (sig_handler_ignored(sig_handler(t, sig), sig)) { | 3120 | if (sig_handler_ignored(sig_handler(p, sig), sig)) { |
| 3125 | sigemptyset(&mask); | 3121 | sigemptyset(&mask); |
| 3126 | sigaddset(&mask, sig); | 3122 | sigaddset(&mask, sig); |
| 3127 | rm_from_queue_full(&mask, &t->signal->shared_pending); | 3123 | flush_sigqueue_mask(&mask, &p->signal->shared_pending); |
| 3128 | do { | 3124 | for_each_thread(p, t) |
| 3129 | rm_from_queue_full(&mask, &t->pending); | 3125 | flush_sigqueue_mask(&mask, &t->pending); |
| 3130 | } while_each_thread(current, t); | ||
| 3131 | } | 3126 | } |
| 3132 | } | 3127 | } |
| 3133 | 3128 | ||
| 3134 | spin_unlock_irq(¤t->sighand->siglock); | 3129 | spin_unlock_irq(&p->sighand->siglock); |
| 3135 | return 0; | 3130 | return 0; |
| 3136 | } | 3131 | } |
| 3137 | 3132 | ||
| 3138 | static int | 3133 | static int |
| 3139 | do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) | 3134 | do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) |
| 3140 | { | 3135 | { |
| 3141 | stack_t oss; | 3136 | stack_t oss; |
| @@ -3496,7 +3491,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, | |||
| 3496 | } | 3491 | } |
| 3497 | #endif | 3492 | #endif |
| 3498 | 3493 | ||
| 3499 | #ifdef __ARCH_WANT_SYS_SGETMASK | 3494 | #ifdef CONFIG_SGETMASK_SYSCALL |
| 3500 | 3495 | ||
| 3501 | /* | 3496 | /* |
| 3502 | * For backwards compatibility. Functionality superseded by sigprocmask. | 3497 | * For backwards compatibility. Functionality superseded by sigprocmask. |
| @@ -3517,7 +3512,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) | |||
| 3517 | 3512 | ||
| 3518 | return old; | 3513 | return old; |
| 3519 | } | 3514 | } |
| 3520 | #endif /* __ARCH_WANT_SGETMASK */ | 3515 | #endif /* CONFIG_SGETMASK_SYSCALL */ |
| 3521 | 3516 | ||
| 3522 | #ifdef __ARCH_WANT_SYS_SIGNAL | 3517 | #ifdef __ARCH_WANT_SYS_SIGNAL |
| 3523 | /* | 3518 | /* |
diff --git a/kernel/smp.c b/kernel/smp.c index 06d574e42c72..306f8180b0d5 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -185,14 +185,26 @@ void generic_smp_call_function_single_interrupt(void) | |||
| 185 | { | 185 | { |
| 186 | struct llist_node *entry; | 186 | struct llist_node *entry; |
| 187 | struct call_single_data *csd, *csd_next; | 187 | struct call_single_data *csd, *csd_next; |
| 188 | static bool warned; | ||
| 189 | |||
| 190 | entry = llist_del_all(&__get_cpu_var(call_single_queue)); | ||
| 191 | entry = llist_reverse_order(entry); | ||
| 188 | 192 | ||
| 189 | /* | 193 | /* |
| 190 | * Shouldn't receive this interrupt on a cpu that is not yet online. | 194 | * Shouldn't receive this interrupt on a cpu that is not yet online. |
| 191 | */ | 195 | */ |
| 192 | WARN_ON_ONCE(!cpu_online(smp_processor_id())); | 196 | if (unlikely(!cpu_online(smp_processor_id()) && !warned)) { |
| 197 | warned = true; | ||
| 198 | WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); | ||
| 193 | 199 | ||
| 194 | entry = llist_del_all(&__get_cpu_var(call_single_queue)); | 200 | /* |
| 195 | entry = llist_reverse_order(entry); | 201 | * We don't have to use the _safe() variant here |
| 202 | * because we are not invoking the IPI handlers yet. | ||
| 203 | */ | ||
| 204 | llist_for_each_entry(csd, entry, llist) | ||
| 205 | pr_warn("IPI callback %pS sent to offline CPU\n", | ||
| 206 | csd->func); | ||
| 207 | } | ||
| 196 | 208 | ||
| 197 | llist_for_each_entry_safe(csd, csd_next, entry, llist) { | 209 | llist_for_each_entry_safe(csd, csd_next, entry, llist) { |
| 198 | csd->func(csd->info); | 210 | csd->func(csd->info); |
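Hoisting the llist_del_all()/llist_reverse_order() pair above the warning lets the diagnostic name the offending callbacks before any of them run. The reverse step matters because llist_add() pushes at the head, so the snapshot comes back newest-first; a short sketch of the idiom:

	static struct llist_node *drain_fifo(struct llist_head *queue)
	{
		struct llist_node *entry;

		entry = llist_del_all(queue);		/* atomically take the whole LIFO chain */
		return llist_reverse_order(entry);	/* restore submission (FIFO) order */
	}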
diff --git a/kernel/softirq.c b/kernel/softirq.c index b50990a5bea0..5918d227730f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -223,7 +223,7 @@ static inline bool lockdep_softirq_start(void) { return false; } | |||
| 223 | static inline void lockdep_softirq_end(bool in_hardirq) { } | 223 | static inline void lockdep_softirq_end(bool in_hardirq) { } |
| 224 | #endif | 224 | #endif |
| 225 | 225 | ||
| 226 | asmlinkage void __do_softirq(void) | 226 | asmlinkage __visible void __do_softirq(void) |
| 227 | { | 227 | { |
| 228 | unsigned long end = jiffies + MAX_SOFTIRQ_TIME; | 228 | unsigned long end = jiffies + MAX_SOFTIRQ_TIME; |
| 229 | unsigned long old_flags = current->flags; | 229 | unsigned long old_flags = current->flags; |
| @@ -232,7 +232,6 @@ asmlinkage void __do_softirq(void) | |||
| 232 | bool in_hardirq; | 232 | bool in_hardirq; |
| 233 | __u32 pending; | 233 | __u32 pending; |
| 234 | int softirq_bit; | 234 | int softirq_bit; |
| 235 | int cpu; | ||
| 236 | 235 | ||
| 237 | /* | 236 | /* |
| 238 | * Mask out PF_MEMALLOC as current task context is borrowed for the | 237 | * Mask out PF_MEMALLOC as current task context is borrowed for the |
| @@ -247,7 +246,6 @@ asmlinkage void __do_softirq(void) | |||
| 247 | __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); | 246 | __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); |
| 248 | in_hardirq = lockdep_softirq_start(); | 247 | in_hardirq = lockdep_softirq_start(); |
| 249 | 248 | ||
| 250 | cpu = smp_processor_id(); | ||
| 251 | restart: | 249 | restart: |
| 252 | /* Reset the pending bitmask before enabling irqs */ | 250 | /* Reset the pending bitmask before enabling irqs */ |
| 253 | set_softirq_pending(0); | 251 | set_softirq_pending(0); |
| @@ -276,11 +274,11 @@ restart: | |||
| 276 | prev_count, preempt_count()); | 274 | prev_count, preempt_count()); |
| 277 | preempt_count_set(prev_count); | 275 | preempt_count_set(prev_count); |
| 278 | } | 276 | } |
| 279 | rcu_bh_qs(cpu); | ||
| 280 | h++; | 277 | h++; |
| 281 | pending >>= softirq_bit; | 278 | pending >>= softirq_bit; |
| 282 | } | 279 | } |
| 283 | 280 | ||
| 281 | rcu_bh_qs(smp_processor_id()); | ||
| 284 | local_irq_disable(); | 282 | local_irq_disable(); |
| 285 | 283 | ||
| 286 | pending = local_softirq_pending(); | 284 | pending = local_softirq_pending(); |
| @@ -299,7 +297,7 @@ restart: | |||
| 299 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); | 297 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); |
| 300 | } | 298 | } |
| 301 | 299 | ||
| 302 | asmlinkage void do_softirq(void) | 300 | asmlinkage __visible void do_softirq(void) |
| 303 | { | 301 | { |
| 304 | __u32 pending; | 302 | __u32 pending; |
| 305 | unsigned long flags; | 303 | unsigned long flags; |
| @@ -779,3 +777,8 @@ int __init __weak arch_early_irq_init(void) | |||
| 779 | { | 777 | { |
| 780 | return 0; | 778 | return 0; |
| 781 | } | 779 | } |
| 780 | |||
| 781 | unsigned int __weak arch_dynirq_lower_bound(unsigned int from) | ||
| 782 | { | ||
| 783 | return from; | ||
| 784 | } | ||
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 01fbae5b97b7..695f0c6cd169 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -307,6 +307,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * | |||
| 307 | * @cpu: cpu to stop | 307 | * @cpu: cpu to stop |
| 308 | * @fn: function to execute | 308 | * @fn: function to execute |
| 309 | * @arg: argument to @fn | 309 | * @arg: argument to @fn |
| 310 | * @work_buf: pointer to cpu_stop_work structure | ||
| 310 | * | 311 | * |
| 311 | * Similar to stop_one_cpu() but doesn't wait for completion. The | 312 | * Similar to stop_one_cpu() but doesn't wait for completion. The |
| 312 | * caller is responsible for ensuring @work_buf is currently unused | 313 | * caller is responsible for ensuring @work_buf is currently unused |
diff --git a/kernel/sys.c b/kernel/sys.c index fba0f29401ea..66a751ebf9d9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
| 250 | else | 250 | else |
| 251 | p = current; | 251 | p = current; |
| 252 | if (p) { | 252 | if (p) { |
| 253 | niceval = 20 - task_nice(p); | 253 | niceval = nice_to_rlimit(task_nice(p)); |
| 254 | if (niceval > retval) | 254 | if (niceval > retval) |
| 255 | retval = niceval; | 255 | retval = niceval; |
| 256 | } | 256 | } |
| @@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
| 261 | else | 261 | else |
| 262 | pgrp = task_pgrp(current); | 262 | pgrp = task_pgrp(current); |
| 263 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { | 263 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { |
| 264 | niceval = 20 - task_nice(p); | 264 | niceval = nice_to_rlimit(task_nice(p)); |
| 265 | if (niceval > retval) | 265 | if (niceval > retval) |
| 266 | retval = niceval; | 266 | retval = niceval; |
| 267 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | 267 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); |
| @@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
| 277 | 277 | ||
| 278 | do_each_thread(g, p) { | 278 | do_each_thread(g, p) { |
| 279 | if (uid_eq(task_uid(p), uid)) { | 279 | if (uid_eq(task_uid(p), uid)) { |
| 280 | niceval = 20 - task_nice(p); | 280 | niceval = nice_to_rlimit(task_nice(p)); |
| 281 | if (niceval > retval) | 281 | if (niceval > retval) |
| 282 | retval = niceval; | 282 | retval = niceval; |
| 283 | } | 283 | } |
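nice_to_rlimit() is a readability refactor rather than a behaviour change: with MAX_NICE equal to 19 it reduces to the old open-coded expression. Shown here for illustration; the real definition lives in include/linux/sched/prio.h:

	static inline long nice_to_rlimit_illustrated(long nice)
	{
		return MAX_NICE - nice + 1;	/* -20 -> 40, 0 -> 20, 19 -> 1; same as "20 - nice" */
	}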
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bc8d1b74a6b9..36441b51b5df 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
| @@ -135,6 +135,8 @@ cond_syscall(sys_setresgid16); | |||
| 135 | cond_syscall(sys_setresuid16); | 135 | cond_syscall(sys_setresuid16); |
| 136 | cond_syscall(sys_setreuid16); | 136 | cond_syscall(sys_setreuid16); |
| 137 | cond_syscall(sys_setuid16); | 137 | cond_syscall(sys_setuid16); |
| 138 | cond_syscall(sys_sgetmask); | ||
| 139 | cond_syscall(sys_ssetmask); | ||
| 138 | cond_syscall(sys_vm86old); | 140 | cond_syscall(sys_vm86old); |
| 139 | cond_syscall(sys_vm86); | 141 | cond_syscall(sys_vm86); |
| 140 | cond_syscall(sys_ipc); | 142 | cond_syscall(sys_ipc); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe34..db19e3e2aa4b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -173,6 +173,13 @@ extern int no_unaligned_warning; | |||
| 173 | #endif | 173 | #endif |
| 174 | 174 | ||
| 175 | #ifdef CONFIG_PROC_SYSCTL | 175 | #ifdef CONFIG_PROC_SYSCTL |
| 176 | |||
| 177 | #define SYSCTL_WRITES_LEGACY -1 | ||
| 178 | #define SYSCTL_WRITES_WARN 0 | ||
| 179 | #define SYSCTL_WRITES_STRICT 1 | ||
| 180 | |||
| 181 | static int sysctl_writes_strict = SYSCTL_WRITES_WARN; | ||
| 182 | |||
| 176 | static int proc_do_cad_pid(struct ctl_table *table, int write, | 183 | static int proc_do_cad_pid(struct ctl_table *table, int write, |
| 177 | void __user *buffer, size_t *lenp, loff_t *ppos); | 184 | void __user *buffer, size_t *lenp, loff_t *ppos); |
| 178 | static int proc_taint(struct ctl_table *table, int write, | 185 | static int proc_taint(struct ctl_table *table, int write, |
| @@ -195,7 +202,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, | |||
| 195 | /* Note: sysrq code uses its own private copy */ | 202 | /* Note: sysrq code uses its own private copy */ |
| 196 | static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; | 203 | static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; |
| 197 | 204 | ||
| 198 | static int sysrq_sysctl_handler(ctl_table *table, int write, | 205 | static int sysrq_sysctl_handler(struct ctl_table *table, int write, |
| 199 | void __user *buffer, size_t *lenp, | 206 | void __user *buffer, size_t *lenp, |
| 200 | loff_t *ppos) | 207 | loff_t *ppos) |
| 201 | { | 208 | { |
| @@ -495,6 +502,15 @@ static struct ctl_table kern_table[] = { | |||
| 495 | .mode = 0644, | 502 | .mode = 0644, |
| 496 | .proc_handler = proc_taint, | 503 | .proc_handler = proc_taint, |
| 497 | }, | 504 | }, |
| 505 | { | ||
| 506 | .procname = "sysctl_writes_strict", | ||
| 507 | .data = &sysctl_writes_strict, | ||
| 508 | .maxlen = sizeof(int), | ||
| 509 | .mode = 0644, | ||
| 510 | .proc_handler = proc_dointvec_minmax, | ||
| 511 | .extra1 = &neg_one, | ||
| 512 | .extra2 = &one, | ||
| 513 | }, | ||
| 498 | #endif | 514 | #endif |
| 499 | #ifdef CONFIG_LATENCYTOP | 515 | #ifdef CONFIG_LATENCYTOP |
| 500 | { | 516 | { |
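What the new kernel.sysctl_writes_strict knob changes from userspace: -1 keeps the legacy behaviour (every write restarts at the beginning of the value), 0 does the same but warns once when the file position is non-zero, and 1 honours the file position (string handlers continue the value, numeric handlers ignore such writes). A hypothetical userspace illustration, not authoritative:

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/proc/sys/kernel/hostname", O_WRONLY);	/* needs root */

		pwrite(fd, "foo", 3, 0);	/* ordinary write at offset 0 */
		pwrite(fd, "bar", 3, 3);	/* offset 3: modes -1 and 0 restart at the
						 * beginning (0 also logs a warning); mode 1
						 * continues, leaving "foobar" */
		close(fd);
		return 0;
	}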
| @@ -643,7 +659,7 @@ static struct ctl_table kern_table[] = { | |||
| 643 | .extra2 = &one, | 659 | .extra2 = &one, |
| 644 | }, | 660 | }, |
| 645 | #endif | 661 | #endif |
| 646 | 662 | #ifdef CONFIG_UEVENT_HELPER | |
| 647 | { | 663 | { |
| 648 | .procname = "hotplug", | 664 | .procname = "hotplug", |
| 649 | .data = &uevent_helper, | 665 | .data = &uevent_helper, |
| @@ -651,7 +667,7 @@ static struct ctl_table kern_table[] = { | |||
| 651 | .mode = 0644, | 667 | .mode = 0644, |
| 652 | .proc_handler = proc_dostring, | 668 | .proc_handler = proc_dostring, |
| 653 | }, | 669 | }, |
| 654 | 670 | #endif | |
| 655 | #ifdef CONFIG_CHR_DEV_SG | 671 | #ifdef CONFIG_CHR_DEV_SG |
| 656 | { | 672 | { |
| 657 | .procname = "sg-big-buff", | 673 | .procname = "sg-big-buff", |
| @@ -1418,8 +1434,13 @@ static struct ctl_table vm_table[] = { | |||
| 1418 | (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) | 1434 | (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) |
| 1419 | { | 1435 | { |
| 1420 | .procname = "vdso_enabled", | 1436 | .procname = "vdso_enabled", |
| 1437 | #ifdef CONFIG_X86_32 | ||
| 1438 | .data = &vdso32_enabled, | ||
| 1439 | .maxlen = sizeof(vdso32_enabled), | ||
| 1440 | #else | ||
| 1421 | .data = &vdso_enabled, | 1441 | .data = &vdso_enabled, |
| 1422 | .maxlen = sizeof(vdso_enabled), | 1442 | .maxlen = sizeof(vdso_enabled), |
| 1443 | #endif | ||
| 1423 | .mode = 0644, | 1444 | .mode = 0644, |
| 1424 | .proc_handler = proc_dointvec, | 1445 | .proc_handler = proc_dointvec, |
| 1425 | .extra1 = &zero, | 1446 | .extra1 = &zero, |
| @@ -1698,8 +1719,8 @@ int __init sysctl_init(void) | |||
| 1698 | 1719 | ||
| 1699 | #ifdef CONFIG_PROC_SYSCTL | 1720 | #ifdef CONFIG_PROC_SYSCTL |
| 1700 | 1721 | ||
| 1701 | static int _proc_do_string(void* data, int maxlen, int write, | 1722 | static int _proc_do_string(char *data, int maxlen, int write, |
| 1702 | void __user *buffer, | 1723 | char __user *buffer, |
| 1703 | size_t *lenp, loff_t *ppos) | 1724 | size_t *lenp, loff_t *ppos) |
| 1704 | { | 1725 | { |
| 1705 | size_t len; | 1726 | size_t len; |
| @@ -1712,21 +1733,30 @@ static int _proc_do_string(void* data, int maxlen, int write, | |||
| 1712 | } | 1733 | } |
| 1713 | 1734 | ||
| 1714 | if (write) { | 1735 | if (write) { |
| 1715 | len = 0; | 1736 | if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) { |
| 1737 | /* Only continue writes not past the end of buffer. */ | ||
| 1738 | len = strlen(data); | ||
| 1739 | if (len > maxlen - 1) | ||
| 1740 | len = maxlen - 1; | ||
| 1741 | |||
| 1742 | if (*ppos > len) | ||
| 1743 | return 0; | ||
| 1744 | len = *ppos; | ||
| 1745 | } else { | ||
| 1746 | /* Start writing from beginning of buffer. */ | ||
| 1747 | len = 0; | ||
| 1748 | } | ||
| 1749 | |||
| 1750 | *ppos += *lenp; | ||
| 1716 | p = buffer; | 1751 | p = buffer; |
| 1717 | while (len < *lenp) { | 1752 | while ((p - buffer) < *lenp && len < maxlen - 1) { |
| 1718 | if (get_user(c, p++)) | 1753 | if (get_user(c, p++)) |
| 1719 | return -EFAULT; | 1754 | return -EFAULT; |
| 1720 | if (c == 0 || c == '\n') | 1755 | if (c == 0 || c == '\n') |
| 1721 | break; | 1756 | break; |
| 1722 | len++; | 1757 | data[len++] = c; |
| 1723 | } | 1758 | } |
| 1724 | if (len >= maxlen) | 1759 | data[len] = 0; |
| 1725 | len = maxlen-1; | ||
| 1726 | if(copy_from_user(data, buffer, len)) | ||
| 1727 | return -EFAULT; | ||
| 1728 | ((char *) data)[len] = 0; | ||
| 1729 | *ppos += *lenp; | ||
| 1730 | } else { | 1760 | } else { |
| 1731 | len = strlen(data); | 1761 | len = strlen(data); |
| 1732 | if (len > maxlen) | 1762 | if (len > maxlen) |
| @@ -1743,10 +1773,10 @@ static int _proc_do_string(void* data, int maxlen, int write, | |||
| 1743 | if (len > *lenp) | 1773 | if (len > *lenp) |
| 1744 | len = *lenp; | 1774 | len = *lenp; |
| 1745 | if (len) | 1775 | if (len) |
| 1746 | if(copy_to_user(buffer, data, len)) | 1776 | if (copy_to_user(buffer, data, len)) |
| 1747 | return -EFAULT; | 1777 | return -EFAULT; |
| 1748 | if (len < *lenp) { | 1778 | if (len < *lenp) { |
| 1749 | if(put_user('\n', ((char __user *) buffer) + len)) | 1779 | if (put_user('\n', buffer + len)) |
| 1750 | return -EFAULT; | 1780 | return -EFAULT; |
| 1751 | len++; | 1781 | len++; |
| 1752 | } | 1782 | } |
| @@ -1756,6 +1786,14 @@ static int _proc_do_string(void* data, int maxlen, int write, | |||
| 1756 | return 0; | 1786 | return 0; |
| 1757 | } | 1787 | } |
| 1758 | 1788 | ||
| 1789 | static void warn_sysctl_write(struct ctl_table *table) | ||
| 1790 | { | ||
| 1791 | pr_warn_once("%s wrote to %s when file position was not 0!\n" | ||
| 1792 | "This will not be supported in the future. To silence this\n" | ||
| 1793 | "warning, set kernel.sysctl_writes_strict = -1\n", | ||
| 1794 | current->comm, table->procname); | ||
| 1795 | } | ||
| 1796 | |||
| 1759 | /** | 1797 | /** |
| 1760 | * proc_dostring - read a string sysctl | 1798 | * proc_dostring - read a string sysctl |
| 1761 | * @table: the sysctl table | 1799 | * @table: the sysctl table |
| @@ -1776,8 +1814,11 @@ static int _proc_do_string(void* data, int maxlen, int write, | |||
| 1776 | int proc_dostring(struct ctl_table *table, int write, | 1814 | int proc_dostring(struct ctl_table *table, int write, |
| 1777 | void __user *buffer, size_t *lenp, loff_t *ppos) | 1815 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 1778 | { | 1816 | { |
| 1779 | return _proc_do_string(table->data, table->maxlen, write, | 1817 | if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) |
| 1780 | buffer, lenp, ppos); | 1818 | warn_sysctl_write(table); |
| 1819 | |||
| 1820 | return _proc_do_string((char *)(table->data), table->maxlen, write, | ||
| 1821 | (char __user *)buffer, lenp, ppos); | ||
| 1781 | } | 1822 | } |
| 1782 | 1823 | ||
| 1783 | static size_t proc_skip_spaces(char **buf) | 1824 | static size_t proc_skip_spaces(char **buf) |
| @@ -1951,6 +1992,18 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, | |||
| 1951 | conv = do_proc_dointvec_conv; | 1992 | conv = do_proc_dointvec_conv; |
| 1952 | 1993 | ||
| 1953 | if (write) { | 1994 | if (write) { |
| 1995 | if (*ppos) { | ||
| 1996 | switch (sysctl_writes_strict) { | ||
| 1997 | case SYSCTL_WRITES_STRICT: | ||
| 1998 | goto out; | ||
| 1999 | case SYSCTL_WRITES_WARN: | ||
| 2000 | warn_sysctl_write(table); | ||
| 2001 | break; | ||
| 2002 | default: | ||
| 2003 | break; | ||
| 2004 | } | ||
| 2005 | } | ||
| 2006 | |||
| 1954 | if (left > PAGE_SIZE - 1) | 2007 | if (left > PAGE_SIZE - 1) |
| 1955 | left = PAGE_SIZE - 1; | 2008 | left = PAGE_SIZE - 1; |
| 1956 | page = __get_free_page(GFP_TEMPORARY); | 2009 | page = __get_free_page(GFP_TEMPORARY); |
| @@ -2008,6 +2061,7 @@ free: | |||
| 2008 | return err ? : -EINVAL; | 2061 | return err ? : -EINVAL; |
| 2009 | } | 2062 | } |
| 2010 | *lenp -= left; | 2063 | *lenp -= left; |
| 2064 | out: | ||
| 2011 | *ppos += *lenp; | 2065 | *ppos += *lenp; |
| 2012 | return err; | 2066 | return err; |
| 2013 | } | 2067 | } |
| @@ -2200,6 +2254,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int | |||
| 2200 | left = *lenp; | 2254 | left = *lenp; |
| 2201 | 2255 | ||
| 2202 | if (write) { | 2256 | if (write) { |
| 2257 | if (*ppos) { | ||
| 2258 | switch (sysctl_writes_strict) { | ||
| 2259 | case SYSCTL_WRITES_STRICT: | ||
| 2260 | goto out; | ||
| 2261 | case SYSCTL_WRITES_WARN: | ||
| 2262 | warn_sysctl_write(table); | ||
| 2263 | break; | ||
| 2264 | default: | ||
| 2265 | break; | ||
| 2266 | } | ||
| 2267 | } | ||
| 2268 | |||
| 2203 | if (left > PAGE_SIZE - 1) | 2269 | if (left > PAGE_SIZE - 1) |
| 2204 | left = PAGE_SIZE - 1; | 2270 | left = PAGE_SIZE - 1; |
| 2205 | page = __get_free_page(GFP_TEMPORARY); | 2271 | page = __get_free_page(GFP_TEMPORARY); |
| @@ -2255,6 +2321,7 @@ free: | |||
| 2255 | return err ? : -EINVAL; | 2321 | return err ? : -EINVAL; |
| 2256 | } | 2322 | } |
| 2257 | *lenp -= left; | 2323 | *lenp -= left; |
| 2324 | out: | ||
| 2258 | *ppos += *lenp; | 2325 | *ppos += *lenp; |
| 2259 | return err; | 2326 | return err; |
| 2260 | } | 2327 | } |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 419a52cecd20..33db43a39515 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
| @@ -165,21 +165,21 @@ static inline void pps_set_freq(s64 freq) | |||
| 165 | 165 | ||
| 166 | static inline int is_error_status(int status) | 166 | static inline int is_error_status(int status) |
| 167 | { | 167 | { |
| 168 | return (time_status & (STA_UNSYNC|STA_CLOCKERR)) | 168 | return (status & (STA_UNSYNC|STA_CLOCKERR)) |
| 169 | /* PPS signal lost when either PPS time or | 169 | /* PPS signal lost when either PPS time or |
| 170 | * PPS frequency synchronization requested | 170 | * PPS frequency synchronization requested |
| 171 | */ | 171 | */ |
| 172 | || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) | 172 | || ((status & (STA_PPSFREQ|STA_PPSTIME)) |
| 173 | && !(time_status & STA_PPSSIGNAL)) | 173 | && !(status & STA_PPSSIGNAL)) |
| 174 | /* PPS jitter exceeded when | 174 | /* PPS jitter exceeded when |
| 175 | * PPS time synchronization requested */ | 175 | * PPS time synchronization requested */ |
| 176 | || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) | 176 | || ((status & (STA_PPSTIME|STA_PPSJITTER)) |
| 177 | == (STA_PPSTIME|STA_PPSJITTER)) | 177 | == (STA_PPSTIME|STA_PPSJITTER)) |
| 178 | /* PPS wander exceeded or calibration error when | 178 | /* PPS wander exceeded or calibration error when |
| 179 | * PPS frequency synchronization requested | 179 | * PPS frequency synchronization requested |
| 180 | */ | 180 | */ |
| 181 | || ((time_status & STA_PPSFREQ) | 181 | || ((status & STA_PPSFREQ) |
| 182 | && (time_status & (STA_PPSWANDER|STA_PPSERROR))); | 182 | && (status & (STA_PPSWANDER|STA_PPSERROR))); |
| 183 | } | 183 | } |
| 184 | 184 | ||
| 185 | static inline void pps_fill_timex(struct timex *txc) | 185 | static inline void pps_fill_timex(struct timex *txc) |
| @@ -786,8 +786,9 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) | |||
| 786 | time_status |= STA_PPSERROR; | 786 | time_status |= STA_PPSERROR; |
| 787 | pps_errcnt++; | 787 | pps_errcnt++; |
| 788 | pps_dec_freq_interval(); | 788 | pps_dec_freq_interval(); |
| 789 | pr_err("hardpps: PPSERROR: interval too long - %ld s\n", | 789 | printk_deferred(KERN_ERR |
| 790 | freq_norm.sec); | 790 | "hardpps: PPSERROR: interval too long - %ld s\n", |
| 791 | freq_norm.sec); | ||
| 791 | return 0; | 792 | return 0; |
| 792 | } | 793 | } |
| 793 | 794 | ||
| @@ -800,7 +801,8 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) | |||
| 800 | delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); | 801 | delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); |
| 801 | pps_freq = ftemp; | 802 | pps_freq = ftemp; |
| 802 | if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { | 803 | if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { |
| 803 | pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); | 804 | printk_deferred(KERN_WARNING |
| 805 | "hardpps: PPSWANDER: change=%ld\n", delta); | ||
| 804 | time_status |= STA_PPSWANDER; | 806 | time_status |= STA_PPSWANDER; |
| 805 | pps_stbcnt++; | 807 | pps_stbcnt++; |
| 806 | pps_dec_freq_interval(); | 808 | pps_dec_freq_interval(); |
| @@ -844,8 +846,9 @@ static void hardpps_update_phase(long error) | |||
| 844 | * the time offset is updated. | 846 | * the time offset is updated. |
| 845 | */ | 847 | */ |
| 846 | if (jitter > (pps_jitter << PPS_POPCORN)) { | 848 | if (jitter > (pps_jitter << PPS_POPCORN)) { |
| 847 | pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", | 849 | printk_deferred(KERN_WARNING |
| 848 | jitter, (pps_jitter << PPS_POPCORN)); | 850 | "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", |
| 851 | jitter, (pps_jitter << PPS_POPCORN)); | ||
| 849 | time_status |= STA_PPSJITTER; | 852 | time_status |= STA_PPSJITTER; |
| 850 | pps_jitcnt++; | 853 | pps_jitcnt++; |
| 851 | } else if (time_status & STA_PPSTIME) { | 854 | } else if (time_status & STA_PPSTIME) { |
| @@ -902,7 +905,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
| 902 | time_status |= STA_PPSJITTER; | 905 | time_status |= STA_PPSJITTER; |
| 903 | /* restart the frequency calibration interval */ | 906 | /* restart the frequency calibration interval */ |
| 904 | pps_fbase = *raw_ts; | 907 | pps_fbase = *raw_ts; |
| 905 | pr_err("hardpps: PPSJITTER: bad pulse\n"); | 908 | printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); |
| 906 | return; | 909 | return; |
| 907 | } | 910 | } |
| 908 | 911 | ||
| @@ -923,7 +926,10 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
| 923 | 926 | ||
| 924 | static int __init ntp_tick_adj_setup(char *str) | 927 | static int __init ntp_tick_adj_setup(char *str) |
| 925 | { | 928 | { |
| 926 | ntp_tick_adj = simple_strtol(str, NULL, 0); | 929 | int rc = kstrtol(str, 0, (long *)&ntp_tick_adj); |
| 930 | |||
| 931 | if (rc) | ||
| 932 | return rc; | ||
| 927 | ntp_tick_adj <<= NTP_SCALE_SHIFT; | 933 | ntp_tick_adj <<= NTP_SCALE_SHIFT; |
| 928 | 934 | ||
| 929 | return 1; | 935 | return 1; |
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 4d23dc4d8139..445106d2c729 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c | |||
| @@ -49,13 +49,6 @@ static u64 notrace jiffy_sched_clock_read(void) | |||
| 49 | return (u64)(jiffies - INITIAL_JIFFIES); | 49 | return (u64)(jiffies - INITIAL_JIFFIES); |
| 50 | } | 50 | } |
| 51 | 51 | ||
| 52 | static u32 __read_mostly (*read_sched_clock_32)(void); | ||
| 53 | |||
| 54 | static u64 notrace read_sched_clock_32_wrapper(void) | ||
| 55 | { | ||
| 56 | return read_sched_clock_32(); | ||
| 57 | } | ||
| 58 | |||
| 59 | static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; | 52 | static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; |
| 60 | 53 | ||
| 61 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | 54 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) |
| @@ -176,12 +169,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits, | |||
| 176 | pr_debug("Registered %pF as sched_clock source\n", read); | 169 | pr_debug("Registered %pF as sched_clock source\n", read); |
| 177 | } | 170 | } |
| 178 | 171 | ||
| 179 | void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) | ||
| 180 | { | ||
| 181 | read_sched_clock_32 = read; | ||
| 182 | sched_clock_register(read_sched_clock_32_wrapper, bits, rate); | ||
| 183 | } | ||
| 184 | |||
| 185 | void __init sched_clock_postinit(void) | 172 | void __init sched_clock_postinit(void) |
| 186 | { | 173 | { |
| 187 | /* | 174 | /* |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 015661279b68..0a0608edeb26 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
| @@ -276,7 +276,7 @@ static bool tick_check_preferred(struct clock_event_device *curdev, | |||
| 276 | bool tick_check_replacement(struct clock_event_device *curdev, | 276 | bool tick_check_replacement(struct clock_event_device *curdev, |
| 277 | struct clock_event_device *newdev) | 277 | struct clock_event_device *newdev) |
| 278 | { | 278 | { |
| 279 | if (tick_check_percpu(curdev, newdev, smp_processor_id())) | 279 | if (!tick_check_percpu(curdev, newdev, smp_processor_id())) |
| 280 | return false; | 280 | return false; |
| 281 | 281 | ||
| 282 | return tick_check_preferred(curdev, newdev); | 282 | return tick_check_preferred(curdev, newdev); |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 9f8af69c67ec..6558b7ac112d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -84,6 +84,9 @@ static void tick_do_update_jiffies64(ktime_t now) | |||
| 84 | 84 | ||
| 85 | /* Keep the tick_next_period variable up to date */ | 85 | /* Keep the tick_next_period variable up to date */ |
| 86 | tick_next_period = ktime_add(last_jiffies_update, tick_period); | 86 | tick_next_period = ktime_add(last_jiffies_update, tick_period); |
| 87 | } else { | ||
| 88 | write_sequnlock(&jiffies_lock); | ||
| 89 | return; | ||
| 87 | } | 90 | } |
| 88 | write_sequnlock(&jiffies_lock); | 91 | write_sequnlock(&jiffies_lock); |
| 89 | update_wall_time(); | 92 | update_wall_time(); |
| @@ -967,7 +970,7 @@ static void tick_nohz_switch_to_nohz(void) | |||
| 967 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 970 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
| 968 | ktime_t next; | 971 | ktime_t next; |
| 969 | 972 | ||
| 970 | if (!tick_nohz_active) | 973 | if (!tick_nohz_enabled) |
| 971 | return; | 974 | return; |
| 972 | 975 | ||
| 973 | local_irq_disable(); | 976 | local_irq_disable(); |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f7df8ea21707..32d8d6aaedb8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -852,8 +852,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, | |||
| 852 | struct timespec *delta) | 852 | struct timespec *delta) |
| 853 | { | 853 | { |
| 854 | if (!timespec_valid_strict(delta)) { | 854 | if (!timespec_valid_strict(delta)) { |
| 855 | printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " | 855 | printk_deferred(KERN_WARNING |
| 856 | "sleep delta value!\n"); | 856 | "__timekeeping_inject_sleeptime: Invalid " |
| 857 | "sleep delta value!\n"); | ||
| 857 | return; | 858 | return; |
| 858 | } | 859 | } |
| 859 | tk_xtime_add(tk, delta); | 860 | tk_xtime_add(tk, delta); |
| @@ -1157,7 +1158,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
| 1157 | 1158 | ||
| 1158 | if (unlikely(tk->clock->maxadj && | 1159 | if (unlikely(tk->clock->maxadj && |
| 1159 | (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { | 1160 | (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { |
| 1160 | printk_once(KERN_WARNING | 1161 | printk_deferred_once(KERN_WARNING |
| 1161 | "Adjusting %s more than 11%% (%ld vs %ld)\n", | 1162 | "Adjusting %s more than 11%% (%ld vs %ld)\n", |
| 1162 | tk->clock->name, (long)tk->mult + adj, | 1163 | tk->clock->name, (long)tk->mult + adj, |
| 1163 | (long)tk->clock->mult + tk->clock->maxadj); | 1164 | (long)tk->clock->mult + tk->clock->maxadj); |
diff --git a/kernel/timer.c b/kernel/timer.c index 87bd529879c2..3bb01a323b2a 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -838,7 +838,7 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) | |||
| 838 | 838 | ||
| 839 | bit = find_last_bit(&mask, BITS_PER_LONG); | 839 | bit = find_last_bit(&mask, BITS_PER_LONG); |
| 840 | 840 | ||
| 841 | mask = (1 << bit) - 1; | 841 | mask = (1UL << bit) - 1; |
| 842 | 842 | ||
| 843 | expires_limit = expires_limit & ~(mask); | 843 | expires_limit = expires_limit & ~(mask); |
| 844 | 844 | ||
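The 1UL matters because find_last_bit() can return an index of 31 or more on 64-bit kernels when the timeout is far in the future; shifting a plain int 1 that far is undefined and produced a bogus mask. A small contrast for illustration:

	static unsigned long slack_mask(unsigned int bit)
	{
		/* bit = 40: (1 << 40) overflows a 32-bit int (undefined behaviour),
		 * while (1UL << 40) - 1 gives the intended 0x000000ffffffffff. */
		return (1UL << bit) - 1;
	}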
diff --git a/kernel/torture.c b/kernel/torture.c index acc9afc2f26e..40bb511cca48 100644 --- a/kernel/torture.c +++ b/kernel/torture.c | |||
| @@ -335,13 +335,8 @@ static void torture_shuffle_tasks(void) | |||
| 335 | shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask); | 335 | shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask); |
| 336 | if (shuffle_idle_cpu >= nr_cpu_ids) | 336 | if (shuffle_idle_cpu >= nr_cpu_ids) |
| 337 | shuffle_idle_cpu = -1; | 337 | shuffle_idle_cpu = -1; |
| 338 | if (shuffle_idle_cpu != -1) { | 338 | else |
| 339 | cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask); | 339 | cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask); |
| 340 | if (cpumask_empty(shuffle_tmp_mask)) { | ||
| 341 | put_online_cpus(); | ||
| 342 | return; | ||
| 343 | } | ||
| 344 | } | ||
| 345 | 340 | ||
| 346 | mutex_lock(&shuffle_task_mutex); | 341 | mutex_lock(&shuffle_task_mutex); |
| 347 | list_for_each_entry(stp, &shuffle_task_list, st_l) | 342 | list_for_each_entry(stp, &shuffle_task_list, st_l) |
| @@ -533,7 +528,11 @@ void stutter_wait(const char *title) | |||
| 533 | while (ACCESS_ONCE(stutter_pause_test) || | 528 | while (ACCESS_ONCE(stutter_pause_test) || |
| 534 | (torture_runnable && !ACCESS_ONCE(*torture_runnable))) { | 529 | (torture_runnable && !ACCESS_ONCE(*torture_runnable))) { |
| 535 | if (stutter_pause_test) | 530 | if (stutter_pause_test) |
| 536 | schedule_timeout_interruptible(1); | 531 | if (ACCESS_ONCE(stutter_pause_test) == 1) |
| 532 | schedule_timeout_interruptible(1); | ||
| 533 | else | ||
| 534 | while (ACCESS_ONCE(stutter_pause_test)) | ||
| 535 | cond_resched(); | ||
| 537 | else | 536 | else |
| 538 | schedule_timeout_interruptible(round_jiffies_relative(HZ)); | 537 | schedule_timeout_interruptible(round_jiffies_relative(HZ)); |
| 539 | torture_shutdown_absorb(title); | 538 | torture_shutdown_absorb(title); |
| @@ -550,7 +549,11 @@ static int torture_stutter(void *arg) | |||
| 550 | VERBOSE_TOROUT_STRING("torture_stutter task started"); | 549 | VERBOSE_TOROUT_STRING("torture_stutter task started"); |
| 551 | do { | 550 | do { |
| 552 | if (!torture_must_stop()) { | 551 | if (!torture_must_stop()) { |
| 553 | schedule_timeout_interruptible(stutter); | 552 | if (stutter > 1) { |
| 553 | schedule_timeout_interruptible(stutter - 1); | ||
| 554 | ACCESS_ONCE(stutter_pause_test) = 2; | ||
| 555 | } | ||
| 556 | schedule_timeout_interruptible(1); | ||
| 554 | ACCESS_ONCE(stutter_pause_test) = 1; | 557 | ACCESS_ONCE(stutter_pause_test) = 1; |
| 555 | } | 558 | } |
| 556 | if (!torture_must_stop()) | 559 | if (!torture_must_stop()) |
| @@ -596,21 +599,27 @@ static void torture_stutter_cleanup(void) | |||
| 596 | * The runnable parameter points to a flag that controls whether or not | 599 | * The runnable parameter points to a flag that controls whether or not |
| 597 | * the test is currently runnable. If there is no such flag, pass in NULL. | 600 | * the test is currently runnable. If there is no such flag, pass in NULL. |
| 598 | */ | 601 | */ |
| 599 | void __init torture_init_begin(char *ttype, bool v, int *runnable) | 602 | bool torture_init_begin(char *ttype, bool v, int *runnable) |
| 600 | { | 603 | { |
| 601 | mutex_lock(&fullstop_mutex); | 604 | mutex_lock(&fullstop_mutex); |
| 605 | if (torture_type != NULL) { | ||
| 606 | pr_alert("torture_init_begin: refusing %s init: %s running", | ||
| 607 | ttype, torture_type); | ||
| 608 | mutex_unlock(&fullstop_mutex); | ||
| 609 | return false; | ||
| 610 | } | ||
| 602 | torture_type = ttype; | 611 | torture_type = ttype; |
| 603 | verbose = v; | 612 | verbose = v; |
| 604 | torture_runnable = runnable; | 613 | torture_runnable = runnable; |
| 605 | fullstop = FULLSTOP_DONTSTOP; | 614 | fullstop = FULLSTOP_DONTSTOP; |
| 606 | 615 | return true; | |
| 607 | } | 616 | } |
| 608 | EXPORT_SYMBOL_GPL(torture_init_begin); | 617 | EXPORT_SYMBOL_GPL(torture_init_begin); |
| 609 | 618 | ||
| 610 | /* | 619 | /* |
| 611 | * Tell the torture module that initialization is complete. | 620 | * Tell the torture module that initialization is complete. |
| 612 | */ | 621 | */ |
| 613 | void __init torture_init_end(void) | 622 | void torture_init_end(void) |
| 614 | { | 623 | { |
| 615 | mutex_unlock(&fullstop_mutex); | 624 | mutex_unlock(&fullstop_mutex); |
| 616 | register_reboot_notifier(&torture_shutdown_nb); | 625 | register_reboot_notifier(&torture_shutdown_nb); |
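With torture_init_begin() now returning bool, each torture client is expected to bail out when another test already owns the facility. A hedged sketch of the caller pattern; the mytorture_* names are placeholders, not the real rcutorture/locktorture code:

	static bool verbose = true;
	static int mytorture_runnable;

	static int __init mytorture_init(void)
	{
		if (!torture_init_begin("mytorture", verbose, &mytorture_runnable))
			return -EBUSY;	/* some other torture test is already running */

		/* ... spawn kthreads, register onoff/shuffle/stutter helpers ... */

		torture_init_end();
		return 0;
	}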
| @@ -642,6 +651,9 @@ bool torture_cleanup(void) | |||
| 642 | torture_shuffle_cleanup(); | 651 | torture_shuffle_cleanup(); |
| 643 | torture_stutter_cleanup(); | 652 | torture_stutter_cleanup(); |
| 644 | torture_onoff_cleanup(); | 653 | torture_onoff_cleanup(); |
| 654 | mutex_lock(&fullstop_mutex); | ||
| 655 | torture_type = NULL; | ||
| 656 | mutex_unlock(&fullstop_mutex); | ||
| 645 | return false; | 657 | return false; |
| 646 | } | 658 | } |
| 647 | EXPORT_SYMBOL_GPL(torture_cleanup); | 659 | EXPORT_SYMBOL_GPL(torture_cleanup); |
| @@ -674,8 +686,10 @@ EXPORT_SYMBOL_GPL(torture_must_stop_irq); | |||
| 674 | */ | 686 | */ |
| 675 | void torture_kthread_stopping(char *title) | 687 | void torture_kthread_stopping(char *title) |
| 676 | { | 688 | { |
| 677 | if (verbose) | 689 | char buf[128]; |
| 678 | VERBOSE_TOROUT_STRING(title); | 690 | |
| 691 | snprintf(buf, sizeof(buf), "Stopping %s", title); | ||
| 692 | VERBOSE_TOROUT_STRING(buf); | ||
| 679 | while (!kthread_should_stop()) { | 693 | while (!kthread_should_stop()) { |
| 680 | torture_shutdown_absorb(title); | 694 | torture_shutdown_absorb(title); |
| 681 | schedule_timeout_uninterruptible(1); | 695 | schedule_timeout_uninterruptible(1); |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8639819f6cef..d4409356f40d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
| @@ -535,6 +535,36 @@ config MMIOTRACE_TEST | |||
| 535 | 535 | ||
| 536 | Say N, unless you absolutely know what you are doing. | 536 | Say N, unless you absolutely know what you are doing. |
| 537 | 537 | ||
| 538 | config TRACEPOINT_BENCHMARK | ||
| 539 | bool "Add tracepoint that benchmarks tracepoints" | ||
| 540 | help | ||
| 541 | This option creates the tracepoint "benchmark:benchmark_event". | ||
| 542 | When the tracepoint is enabled, it kicks off a kernel thread that | ||
| 543 | goes into an infinite loop (calling cond_resched() to let other tasks | ||
| 544 | run), and calls the tracepoint. Each iteration will record the time | ||
| 545 | it took to write to the tracepoint and the next iteration that | ||
| 546 | data will be passed to the tracepoint itself. That is, the tracepoint | ||
| 547 | will report the time it took to do the previous tracepoint. | ||
| 548 | The string written to the tracepoint is a static string of 128 bytes | ||
| 549 | to keep the time the same. The initial string is simply a write of | ||
| 550 | "START". The second string records the cold cache time of the first | ||
| 551 | write which is not added to the rest of the calculations. | ||
| 552 | |||
| 553 | As it is a tight loop, it benchmarks as hot cache. That's fine because | ||
| 554 | we care most about hot paths that are probably in cache already. | ||
| 555 | |||
| 556 | An example of the output: | ||
| 557 | |||
| 558 | START | ||
| 559 | first=3672 [COLD CACHED] | ||
| 560 | last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712 | ||
| 561 | last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337 | ||
| 562 | last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064 | ||
| 563 | last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411 | ||
| 564 | last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389 | ||
| 565 | last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666 | ||
| 566 | |||
| 567 | |||
| 538 | config RING_BUFFER_BENCHMARK | 568 | config RING_BUFFER_BENCHMARK |
| 539 | tristate "Ring buffer benchmark stress tester" | 569 | tristate "Ring buffer benchmark stress tester" |
| 540 | depends on RING_BUFFER | 570 | depends on RING_BUFFER |
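To make the avg/std/std^2 columns in that sample output concrete, here is a hedged userspace sketch of one straightforward way to keep such running statistics over per-iteration timings; the sample values are the "last" numbers from the output above, and the in-kernel trace_benchmark code may compute the figures differently.

#include <inttypes.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
	uint64_t samples[] = { 632, 278, 277, 273, 273, 281 };	/* ns, illustrative */
	uint64_t sum = 0, sum_sq = 0, max = 0, min = UINT64_MAX;

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		uint64_t last = samples[i];
		uint64_t n = i + 1;
		uint64_t avg, var;

		sum += last;
		sum_sq += last * last;
		if (last > max)
			max = last;
		if (last < min)
			min = last;

		avg = sum / n;
		var = sum_sq / n - avg * avg;		/* the "std^2" column */
		printf("last=%" PRIu64 " max=%" PRIu64 " min=%" PRIu64
		       " avg=%" PRIu64 " std=%.0f std^2=%" PRIu64 "\n",
		       last, max, min, avg, sqrt((double)var), var);
	}
	return 0;
}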
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 1378e84fbe39..2611613f14f1 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
| @@ -17,6 +17,7 @@ ifdef CONFIG_TRACING_BRANCHES | |||
| 17 | KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING | 17 | KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING |
| 18 | endif | 18 | endif |
| 19 | 19 | ||
| 20 | CFLAGS_trace_benchmark.o := -I$(src) | ||
| 20 | CFLAGS_trace_events_filter.o := -I$(src) | 21 | CFLAGS_trace_events_filter.o := -I$(src) |
| 21 | 22 | ||
| 22 | obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o | 23 | obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o |
| @@ -62,4 +63,6 @@ endif | |||
| 62 | obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o | 63 | obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o |
| 63 | obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o | 64 | obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o |
| 64 | 65 | ||
| 66 | obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o | ||
| 67 | |||
| 65 | libftrace-y := ftrace.o | 68 | libftrace-y := ftrace.o |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1fd4b9479210..5b372e3ed675 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -62,7 +62,7 @@ | |||
| 62 | #define FTRACE_HASH_DEFAULT_BITS 10 | 62 | #define FTRACE_HASH_DEFAULT_BITS 10 |
| 63 | #define FTRACE_HASH_MAX_BITS 12 | 63 | #define FTRACE_HASH_MAX_BITS 12 |
| 64 | 64 | ||
| 65 | #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) | 65 | #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) |
| 66 | 66 | ||
| 67 | #ifdef CONFIG_DYNAMIC_FTRACE | 67 | #ifdef CONFIG_DYNAMIC_FTRACE |
| 68 | #define INIT_REGEX_LOCK(opsname) \ | 68 | #define INIT_REGEX_LOCK(opsname) \ |
| @@ -103,7 +103,6 @@ static int ftrace_disabled __read_mostly; | |||
| 103 | 103 | ||
| 104 | static DEFINE_MUTEX(ftrace_lock); | 104 | static DEFINE_MUTEX(ftrace_lock); |
| 105 | 105 | ||
| 106 | static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; | ||
| 107 | static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; | 106 | static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; |
| 108 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; | 107 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; |
| 109 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; | 108 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; |
| @@ -171,23 +170,6 @@ int ftrace_nr_registered_ops(void) | |||
| 171 | return cnt; | 170 | return cnt; |
| 172 | } | 171 | } |
| 173 | 172 | ||
| 174 | static void | ||
| 175 | ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, | ||
| 176 | struct ftrace_ops *op, struct pt_regs *regs) | ||
| 177 | { | ||
| 178 | int bit; | ||
| 179 | |||
| 180 | bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX); | ||
| 181 | if (bit < 0) | ||
| 182 | return; | ||
| 183 | |||
| 184 | do_for_each_ftrace_op(op, ftrace_global_list) { | ||
| 185 | op->func(ip, parent_ip, op, regs); | ||
| 186 | } while_for_each_ftrace_op(op); | ||
| 187 | |||
| 188 | trace_clear_recursion(bit); | ||
| 189 | } | ||
| 190 | |||
| 191 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, | 173 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, |
| 192 | struct ftrace_ops *op, struct pt_regs *regs) | 174 | struct ftrace_ops *op, struct pt_regs *regs) |
| 193 | { | 175 | { |
| @@ -237,43 +219,6 @@ static int control_ops_alloc(struct ftrace_ops *ops) | |||
| 237 | return 0; | 219 | return 0; |
| 238 | } | 220 | } |
| 239 | 221 | ||
| 240 | static void update_global_ops(void) | ||
| 241 | { | ||
| 242 | ftrace_func_t func = ftrace_global_list_func; | ||
| 243 | void *private = NULL; | ||
| 244 | |||
| 245 | /* The list has its own recursion protection. */ | ||
| 246 | global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; | ||
| 247 | |||
| 248 | /* | ||
| 249 | * If there's only one function registered, then call that | ||
| 250 | * function directly. Otherwise, we need to iterate over the | ||
| 251 | * registered callers. | ||
| 252 | */ | ||
| 253 | if (ftrace_global_list == &ftrace_list_end || | ||
| 254 | ftrace_global_list->next == &ftrace_list_end) { | ||
| 255 | func = ftrace_global_list->func; | ||
| 256 | private = ftrace_global_list->private; | ||
| 257 | /* | ||
| 258 | * As we are calling the function directly. | ||
| 259 | * If it does not have recursion protection, | ||
| 260 | * the function_trace_op needs to be updated | ||
| 261 | * accordingly. | ||
| 262 | */ | ||
| 263 | if (!(ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)) | ||
| 264 | global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; | ||
| 265 | } | ||
| 266 | |||
| 267 | /* If we filter on pids, update to use the pid function */ | ||
| 268 | if (!list_empty(&ftrace_pids)) { | ||
| 269 | set_ftrace_pid_function(func); | ||
| 270 | func = ftrace_pid_func; | ||
| 271 | } | ||
| 272 | |||
| 273 | global_ops.func = func; | ||
| 274 | global_ops.private = private; | ||
| 275 | } | ||
| 276 | |||
| 277 | static void ftrace_sync(struct work_struct *work) | 222 | static void ftrace_sync(struct work_struct *work) |
| 278 | { | 223 | { |
| 279 | /* | 224 | /* |
| @@ -301,8 +246,6 @@ static void update_ftrace_function(void) | |||
| 301 | { | 246 | { |
| 302 | ftrace_func_t func; | 247 | ftrace_func_t func; |
| 303 | 248 | ||
| 304 | update_global_ops(); | ||
| 305 | |||
| 306 | /* | 249 | /* |
| 307 | * If we are at the end of the list and this ops is | 250 | * If we are at the end of the list and this ops is |
| 308 | * recursion safe and not dynamic and the arch supports passing ops, | 251 | * recursion safe and not dynamic and the arch supports passing ops, |
| @@ -314,10 +257,7 @@ static void update_ftrace_function(void) | |||
| 314 | (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && | 257 | (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && |
| 315 | !FTRACE_FORCE_LIST_FUNC)) { | 258 | !FTRACE_FORCE_LIST_FUNC)) { |
| 316 | /* Set the ftrace_ops that the arch callback uses */ | 259 | /* Set the ftrace_ops that the arch callback uses */ |
| 317 | if (ftrace_ops_list == &global_ops) | 260 | set_function_trace_op = ftrace_ops_list; |
| 318 | set_function_trace_op = ftrace_global_list; | ||
| 319 | else | ||
| 320 | set_function_trace_op = ftrace_ops_list; | ||
| 321 | func = ftrace_ops_list->func; | 261 | func = ftrace_ops_list->func; |
| 322 | } else { | 262 | } else { |
| 323 | /* Just use the default ftrace_ops */ | 263 | /* Just use the default ftrace_ops */ |
| @@ -373,6 +313,11 @@ static void update_ftrace_function(void) | |||
| 373 | ftrace_trace_function = func; | 313 | ftrace_trace_function = func; |
| 374 | } | 314 | } |
| 375 | 315 | ||
| 316 | int using_ftrace_ops_list_func(void) | ||
| 317 | { | ||
| 318 | return ftrace_trace_function == ftrace_ops_list_func; | ||
| 319 | } | ||
| 320 | |||
| 376 | static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) | 321 | static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) |
| 377 | { | 322 | { |
| 378 | ops->next = *list; | 323 | ops->next = *list; |
| @@ -434,16 +379,9 @@ static int __register_ftrace_function(struct ftrace_ops *ops) | |||
| 434 | if (ops->flags & FTRACE_OPS_FL_DELETED) | 379 | if (ops->flags & FTRACE_OPS_FL_DELETED) |
| 435 | return -EINVAL; | 380 | return -EINVAL; |
| 436 | 381 | ||
| 437 | if (FTRACE_WARN_ON(ops == &global_ops)) | ||
| 438 | return -EINVAL; | ||
| 439 | |||
| 440 | if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) | 382 | if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) |
| 441 | return -EBUSY; | 383 | return -EBUSY; |
| 442 | 384 | ||
| 443 | /* We don't support both control and global flags set. */ | ||
| 444 | if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) | ||
| 445 | return -EINVAL; | ||
| 446 | |||
| 447 | #ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS | 385 | #ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS |
| 448 | /* | 386 | /* |
| 449 | * If the ftrace_ops specifies SAVE_REGS, then it only can be used | 387 | * If the ftrace_ops specifies SAVE_REGS, then it only can be used |
| @@ -461,10 +399,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) | |||
| 461 | if (!core_kernel_data((unsigned long)ops)) | 399 | if (!core_kernel_data((unsigned long)ops)) |
| 462 | ops->flags |= FTRACE_OPS_FL_DYNAMIC; | 400 | ops->flags |= FTRACE_OPS_FL_DYNAMIC; |
| 463 | 401 | ||
| 464 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | 402 | if (ops->flags & FTRACE_OPS_FL_CONTROL) { |
| 465 | add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops); | ||
| 466 | ops->flags |= FTRACE_OPS_FL_ENABLED; | ||
| 467 | } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { | ||
| 468 | if (control_ops_alloc(ops)) | 403 | if (control_ops_alloc(ops)) |
| 469 | return -ENOMEM; | 404 | return -ENOMEM; |
| 470 | add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); | 405 | add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); |
| @@ -484,15 +419,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
| 484 | if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) | 419 | if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) |
| 485 | return -EBUSY; | 420 | return -EBUSY; |
| 486 | 421 | ||
| 487 | if (FTRACE_WARN_ON(ops == &global_ops)) | 422 | if (ops->flags & FTRACE_OPS_FL_CONTROL) { |
| 488 | return -EINVAL; | ||
| 489 | |||
| 490 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | ||
| 491 | ret = remove_ftrace_list_ops(&ftrace_global_list, | ||
| 492 | &global_ops, ops); | ||
| 493 | if (!ret) | ||
| 494 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | ||
| 495 | } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { | ||
| 496 | ret = remove_ftrace_list_ops(&ftrace_control_list, | 423 | ret = remove_ftrace_list_ops(&ftrace_control_list, |
| 497 | &control_ops, ops); | 424 | &control_ops, ops); |
| 498 | } else | 425 | } else |
| @@ -895,7 +822,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, | |||
| 895 | 822 | ||
| 896 | local_irq_save(flags); | 823 | local_irq_save(flags); |
| 897 | 824 | ||
| 898 | stat = &__get_cpu_var(ftrace_profile_stats); | 825 | stat = this_cpu_ptr(&ftrace_profile_stats); |
| 899 | if (!stat->hash || !ftrace_profile_enabled) | 826 | if (!stat->hash || !ftrace_profile_enabled) |
| 900 | goto out; | 827 | goto out; |
| 901 | 828 | ||
| @@ -926,7 +853,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) | |||
| 926 | unsigned long flags; | 853 | unsigned long flags; |
| 927 | 854 | ||
| 928 | local_irq_save(flags); | 855 | local_irq_save(flags); |
| 929 | stat = &__get_cpu_var(ftrace_profile_stats); | 856 | stat = this_cpu_ptr(&ftrace_profile_stats); |
| 930 | if (!stat->hash || !ftrace_profile_enabled) | 857 | if (!stat->hash || !ftrace_profile_enabled) |
| 931 | goto out; | 858 | goto out; |
| 932 | 859 | ||
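The two hunks above are part of the tree-wide migration from __get_cpu_var() to the this_cpu accessors; a minimal sketch of the idiom, with example_stats and its struct made up for illustration:

struct example_stats {
	unsigned long hits;
};
static DEFINE_PER_CPU(struct example_stats, example_stats);

static void example_count_hit(void)
{
	struct example_stats *stat;
	unsigned long flags;

	local_irq_save(flags);			/* as in the profiling paths above */
	stat = this_cpu_ptr(&example_stats);	/* was: &__get_cpu_var(example_stats) */
	stat->hits++;
	local_irq_restore(flags);
}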
| @@ -1178,7 +1105,7 @@ struct ftrace_page { | |||
| 1178 | static struct ftrace_page *ftrace_pages_start; | 1105 | static struct ftrace_page *ftrace_pages_start; |
| 1179 | static struct ftrace_page *ftrace_pages; | 1106 | static struct ftrace_page *ftrace_pages; |
| 1180 | 1107 | ||
| 1181 | static bool ftrace_hash_empty(struct ftrace_hash *hash) | 1108 | static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash) |
| 1182 | { | 1109 | { |
| 1183 | return !hash || !hash->count; | 1110 | return !hash || !hash->count; |
| 1184 | } | 1111 | } |
| @@ -1625,7 +1552,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
| 1625 | in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); | 1552 | in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); |
| 1626 | 1553 | ||
| 1627 | /* | 1554 | /* |
| 1555 | * If filter_hash is set, we want to match all functions | ||
| 1556 | * that are in the hash but not in the other hash. | ||
| 1628 | * | 1557 | * |
| 1558 | * If filter_hash is not set, then we are decrementing. | ||
| 1559 | * That means we match anything that is in the hash | ||
| 1560 | * and also in the other_hash. That is, we need to turn | ||
| 1561 | * off functions in the other hash because they are disabled | ||
| 1562 | * by this hash. | ||
| 1629 | */ | 1563 | */ |
| 1630 | if (filter_hash && in_hash && !in_other_hash) | 1564 | if (filter_hash && in_hash && !in_other_hash) |
| 1631 | match = 1; | 1565 | match = 1; |
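Written out as a single expression, the rule that the new comment spells out looks roughly like the sketch below; this is a paraphrase only, and the real __ftrace_hash_rec_update() also handles corner cases (for example an empty other hash) that the comment does not mention.

match = filter_hash ? (in_hash && !in_other_hash)	/* incrementing */
		    : (in_hash &&  in_other_hash);	/* decrementing */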
| @@ -1767,19 +1701,15 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) | |||
| 1767 | /* | 1701 | /* |
| 1768 | * If this record is being updated from a nop, then | 1702 | * If this record is being updated from a nop, then |
| 1769 | * return UPDATE_MAKE_CALL. | 1703 | * return UPDATE_MAKE_CALL. |
| 1770 | * Otherwise, if the EN flag is set, then return | ||
| 1771 | * UPDATE_MODIFY_CALL_REGS to tell the caller to convert | ||
| 1772 | * from the non-save regs, to a save regs function. | ||
| 1773 | * Otherwise, | 1704 | * Otherwise, |
| 1774 | * return UPDATE_MODIFY_CALL to tell the caller to convert | 1705 | * return UPDATE_MODIFY_CALL to tell the caller to convert |
| 1775 | * from the save regs, to a non-save regs function. | 1706 | * from the save regs, to a non-save regs function or |
| 1707 | * vice versa. | ||
| 1776 | */ | 1708 | */ |
| 1777 | if (flag & FTRACE_FL_ENABLED) | 1709 | if (flag & FTRACE_FL_ENABLED) |
| 1778 | return FTRACE_UPDATE_MAKE_CALL; | 1710 | return FTRACE_UPDATE_MAKE_CALL; |
| 1779 | else if (rec->flags & FTRACE_FL_REGS_EN) | 1711 | |
| 1780 | return FTRACE_UPDATE_MODIFY_CALL_REGS; | 1712 | return FTRACE_UPDATE_MODIFY_CALL; |
| 1781 | else | ||
| 1782 | return FTRACE_UPDATE_MODIFY_CALL; | ||
| 1783 | } | 1713 | } |
| 1784 | 1714 | ||
| 1785 | if (update) { | 1715 | if (update) { |
| @@ -1821,6 +1751,42 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) | |||
| 1821 | return ftrace_check_record(rec, enable, 0); | 1751 | return ftrace_check_record(rec, enable, 0); |
| 1822 | } | 1752 | } |
| 1823 | 1753 | ||
| 1754 | /** | ||
| 1755 | * ftrace_get_addr_new - Get the call address to set to | ||
| 1756 | * @rec: The ftrace record descriptor | ||
| 1757 | * | ||
| 1758 | * If the record has FTRACE_FL_REGS set, that means that it | ||
| 1759 | * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS | ||
| 1760 | * is not set, then it wants to convert to the normal callback. | ||
| 1761 | * | ||
| 1762 | * Returns the address of the trampoline to set to | ||
| 1763 | */ | ||
| 1764 | unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) | ||
| 1765 | { | ||
| 1766 | if (rec->flags & FTRACE_FL_REGS) | ||
| 1767 | return (unsigned long)FTRACE_REGS_ADDR; | ||
| 1768 | else | ||
| 1769 | return (unsigned long)FTRACE_ADDR; | ||
| 1770 | } | ||
| 1771 | |||
| 1772 | /** | ||
| 1773 | * ftrace_get_addr_curr - Get the call address that is already there | ||
| 1774 | * @rec: The ftrace record descriptor | ||
| 1775 | * | ||
| 1776 | * The FTRACE_FL_REGS_EN is set when the record already points to | ||
| 1777 | * a function that saves all the regs. Basically the '_EN' version | ||
| 1778 | * represents the current state of the function. | ||
| 1779 | * | ||
| 1780 | * Returns the address of the trampoline that is currently being called | ||
| 1781 | */ | ||
| 1782 | unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) | ||
| 1783 | { | ||
| 1784 | if (rec->flags & FTRACE_FL_REGS_EN) | ||
| 1785 | return (unsigned long)FTRACE_REGS_ADDR; | ||
| 1786 | else | ||
| 1787 | return (unsigned long)FTRACE_ADDR; | ||
| 1788 | } | ||
| 1789 | |||
| 1824 | static int | 1790 | static int |
| 1825 | __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | 1791 | __ftrace_replace_code(struct dyn_ftrace *rec, int enable) |
| 1826 | { | 1792 | { |
| @@ -1828,12 +1794,12 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
| 1828 | unsigned long ftrace_addr; | 1794 | unsigned long ftrace_addr; |
| 1829 | int ret; | 1795 | int ret; |
| 1830 | 1796 | ||
| 1831 | ret = ftrace_update_record(rec, enable); | 1797 | ftrace_addr = ftrace_get_addr_new(rec); |
| 1832 | 1798 | ||
| 1833 | if (rec->flags & FTRACE_FL_REGS) | 1799 | /* This needs to be done before we call ftrace_update_record */ |
| 1834 | ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; | 1800 | ftrace_old_addr = ftrace_get_addr_curr(rec); |
| 1835 | else | 1801 | |
| 1836 | ftrace_addr = (unsigned long)FTRACE_ADDR; | 1802 | ret = ftrace_update_record(rec, enable); |
| 1837 | 1803 | ||
| 1838 | switch (ret) { | 1804 | switch (ret) { |
| 1839 | case FTRACE_UPDATE_IGNORE: | 1805 | case FTRACE_UPDATE_IGNORE: |
| @@ -1845,13 +1811,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
| 1845 | case FTRACE_UPDATE_MAKE_NOP: | 1811 | case FTRACE_UPDATE_MAKE_NOP: |
| 1846 | return ftrace_make_nop(NULL, rec, ftrace_addr); | 1812 | return ftrace_make_nop(NULL, rec, ftrace_addr); |
| 1847 | 1813 | ||
| 1848 | case FTRACE_UPDATE_MODIFY_CALL_REGS: | ||
| 1849 | case FTRACE_UPDATE_MODIFY_CALL: | 1814 | case FTRACE_UPDATE_MODIFY_CALL: |
| 1850 | if (rec->flags & FTRACE_FL_REGS) | ||
| 1851 | ftrace_old_addr = (unsigned long)FTRACE_ADDR; | ||
| 1852 | else | ||
| 1853 | ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR; | ||
| 1854 | |||
| 1855 | return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); | 1815 | return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); |
| 1856 | } | 1816 | } |
| 1857 | 1817 | ||
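A worked example of how __ftrace_replace_code() now combines the two helpers, with flag values chosen purely for illustration: a record whose installed trampoline saves registers (FTRACE_FL_REGS_EN set) but whose remaining ops no longer need that (FTRACE_FL_REGS clear) resolves to a modify transition from the regs-saving trampoline back to the plain one.

rec->flags = FTRACE_FL_ENABLED | FTRACE_FL_REGS_EN;	/* FTRACE_FL_REGS clear */
ftrace_old_addr = ftrace_get_addr_curr(rec);		/* -> FTRACE_REGS_ADDR */
ftrace_addr     = ftrace_get_addr_new(rec);		/* -> FTRACE_ADDR */
ret = ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);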
| @@ -2115,7 +2075,6 @@ static void ftrace_startup_enable(int command) | |||
| 2115 | 2075 | ||
| 2116 | static int ftrace_startup(struct ftrace_ops *ops, int command) | 2076 | static int ftrace_startup(struct ftrace_ops *ops, int command) |
| 2117 | { | 2077 | { |
| 2118 | bool hash_enable = true; | ||
| 2119 | int ret; | 2078 | int ret; |
| 2120 | 2079 | ||
| 2121 | if (unlikely(ftrace_disabled)) | 2080 | if (unlikely(ftrace_disabled)) |
| @@ -2128,18 +2087,9 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) | |||
| 2128 | ftrace_start_up++; | 2087 | ftrace_start_up++; |
| 2129 | command |= FTRACE_UPDATE_CALLS; | 2088 | command |= FTRACE_UPDATE_CALLS; |
| 2130 | 2089 | ||
| 2131 | /* ops marked global share the filter hashes */ | ||
| 2132 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | ||
| 2133 | ops = &global_ops; | ||
| 2134 | /* Don't update hash if global is already set */ | ||
| 2135 | if (global_start_up) | ||
| 2136 | hash_enable = false; | ||
| 2137 | global_start_up++; | ||
| 2138 | } | ||
| 2139 | |||
| 2140 | ops->flags |= FTRACE_OPS_FL_ENABLED; | 2090 | ops->flags |= FTRACE_OPS_FL_ENABLED; |
| 2141 | if (hash_enable) | 2091 | |
| 2142 | ftrace_hash_rec_enable(ops, 1); | 2092 | ftrace_hash_rec_enable(ops, 1); |
| 2143 | 2093 | ||
| 2144 | ftrace_startup_enable(command); | 2094 | ftrace_startup_enable(command); |
| 2145 | 2095 | ||
| @@ -2148,7 +2098,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) | |||
| 2148 | 2098 | ||
| 2149 | static int ftrace_shutdown(struct ftrace_ops *ops, int command) | 2099 | static int ftrace_shutdown(struct ftrace_ops *ops, int command) |
| 2150 | { | 2100 | { |
| 2151 | bool hash_disable = true; | ||
| 2152 | int ret; | 2101 | int ret; |
| 2153 | 2102 | ||
| 2154 | if (unlikely(ftrace_disabled)) | 2103 | if (unlikely(ftrace_disabled)) |
| @@ -2166,21 +2115,9 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) | |||
| 2166 | */ | 2115 | */ |
| 2167 | WARN_ON_ONCE(ftrace_start_up < 0); | 2116 | WARN_ON_ONCE(ftrace_start_up < 0); |
| 2168 | 2117 | ||
| 2169 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | 2118 | ftrace_hash_rec_disable(ops, 1); |
| 2170 | ops = &global_ops; | ||
| 2171 | global_start_up--; | ||
| 2172 | WARN_ON_ONCE(global_start_up < 0); | ||
| 2173 | /* Don't update hash if global still has users */ | ||
| 2174 | if (global_start_up) { | ||
| 2175 | WARN_ON_ONCE(!ftrace_start_up); | ||
| 2176 | hash_disable = false; | ||
| 2177 | } | ||
| 2178 | } | ||
| 2179 | 2119 | ||
| 2180 | if (hash_disable) | 2120 | if (!global_start_up) |
| 2181 | ftrace_hash_rec_disable(ops, 1); | ||
| 2182 | |||
| 2183 | if (ops != &global_ops || !global_start_up) | ||
| 2184 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | 2121 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; |
| 2185 | 2122 | ||
| 2186 | command |= FTRACE_UPDATE_CALLS; | 2123 | command |= FTRACE_UPDATE_CALLS; |
| @@ -3524,10 +3461,6 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
| 3524 | struct ftrace_hash *hash; | 3461 | struct ftrace_hash *hash; |
| 3525 | int ret; | 3462 | int ret; |
| 3526 | 3463 | ||
| 3527 | /* All global ops uses the global ops filters */ | ||
| 3528 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) | ||
| 3529 | ops = &global_ops; | ||
| 3530 | |||
| 3531 | if (unlikely(ftrace_disabled)) | 3464 | if (unlikely(ftrace_disabled)) |
| 3532 | return -ENODEV; | 3465 | return -ENODEV; |
| 3533 | 3466 | ||
| @@ -3639,8 +3572,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, | |||
| 3639 | } | 3572 | } |
| 3640 | EXPORT_SYMBOL_GPL(ftrace_set_notrace); | 3573 | EXPORT_SYMBOL_GPL(ftrace_set_notrace); |
| 3641 | /** | 3574 | /** |
| 3642 | * ftrace_set_filter - set a function to filter on in ftrace | 3575 | * ftrace_set_global_filter - set a function to filter on with global tracers |
| 3643 | * @ops - the ops to set the filter with | ||
| 3644 | * @buf - the string that holds the function filter text. | 3576 | * @buf - the string that holds the function filter text. |
| 3645 | * @len - the length of the string. | 3577 | * @len - the length of the string. |
| 3646 | * @reset - non zero to reset all filters before applying this filter. | 3578 | * @reset - non zero to reset all filters before applying this filter. |
| @@ -3655,8 +3587,7 @@ void ftrace_set_global_filter(unsigned char *buf, int len, int reset) | |||
| 3655 | EXPORT_SYMBOL_GPL(ftrace_set_global_filter); | 3587 | EXPORT_SYMBOL_GPL(ftrace_set_global_filter); |
| 3656 | 3588 | ||
| 3657 | /** | 3589 | /** |
| 3658 | * ftrace_set_notrace - set a function to not trace in ftrace | 3590 | * ftrace_set_global_notrace - set a function to not trace with global tracers |
| 3659 | * @ops - the ops to set the notrace filter with | ||
| 3660 | * @buf - the string that holds the function notrace text. | 3591 | * @buf - the string that holds the function notrace text. |
| 3661 | * @len - the length of the string. | 3592 | * @len - the length of the string. |
| 3662 | * @reset - non zero to reset all filters before applying this filter. | 3593 | * @reset - non zero to reset all filters before applying this filter. |
| @@ -4330,16 +4261,11 @@ static void ftrace_init_module(struct module *mod, | |||
| 4330 | ftrace_process_locs(mod, start, end); | 4261 | ftrace_process_locs(mod, start, end); |
| 4331 | } | 4262 | } |
| 4332 | 4263 | ||
| 4333 | static int ftrace_module_notify_enter(struct notifier_block *self, | 4264 | void ftrace_module_init(struct module *mod) |
| 4334 | unsigned long val, void *data) | ||
| 4335 | { | 4265 | { |
| 4336 | struct module *mod = data; | 4266 | ftrace_init_module(mod, mod->ftrace_callsites, |
| 4337 | 4267 | mod->ftrace_callsites + | |
| 4338 | if (val == MODULE_STATE_COMING) | 4268 | mod->num_ftrace_callsites); |
| 4339 | ftrace_init_module(mod, mod->ftrace_callsites, | ||
| 4340 | mod->ftrace_callsites + | ||
| 4341 | mod->num_ftrace_callsites); | ||
| 4342 | return 0; | ||
| 4343 | } | 4269 | } |
| 4344 | 4270 | ||
| 4345 | static int ftrace_module_notify_exit(struct notifier_block *self, | 4271 | static int ftrace_module_notify_exit(struct notifier_block *self, |
| @@ -4353,11 +4279,6 @@ static int ftrace_module_notify_exit(struct notifier_block *self, | |||
| 4353 | return 0; | 4279 | return 0; |
| 4354 | } | 4280 | } |
| 4355 | #else | 4281 | #else |
| 4356 | static int ftrace_module_notify_enter(struct notifier_block *self, | ||
| 4357 | unsigned long val, void *data) | ||
| 4358 | { | ||
| 4359 | return 0; | ||
| 4360 | } | ||
| 4361 | static int ftrace_module_notify_exit(struct notifier_block *self, | 4282 | static int ftrace_module_notify_exit(struct notifier_block *self, |
| 4362 | unsigned long val, void *data) | 4283 | unsigned long val, void *data) |
| 4363 | { | 4284 | { |
| @@ -4365,11 +4286,6 @@ static int ftrace_module_notify_exit(struct notifier_block *self, | |||
| 4365 | } | 4286 | } |
| 4366 | #endif /* CONFIG_MODULES */ | 4287 | #endif /* CONFIG_MODULES */ |
| 4367 | 4288 | ||
| 4368 | struct notifier_block ftrace_module_enter_nb = { | ||
| 4369 | .notifier_call = ftrace_module_notify_enter, | ||
| 4370 | .priority = INT_MAX, /* Run before anything that can use kprobes */ | ||
| 4371 | }; | ||
| 4372 | |||
| 4373 | struct notifier_block ftrace_module_exit_nb = { | 4289 | struct notifier_block ftrace_module_exit_nb = { |
| 4374 | .notifier_call = ftrace_module_notify_exit, | 4290 | .notifier_call = ftrace_module_notify_exit, |
| 4375 | .priority = INT_MIN, /* Run after anything that can remove kprobes */ | 4291 | .priority = INT_MIN, /* Run after anything that can remove kprobes */ |
| @@ -4403,10 +4319,6 @@ void __init ftrace_init(void) | |||
| 4403 | __start_mcount_loc, | 4319 | __start_mcount_loc, |
| 4404 | __stop_mcount_loc); | 4320 | __stop_mcount_loc); |
| 4405 | 4321 | ||
| 4406 | ret = register_module_notifier(&ftrace_module_enter_nb); | ||
| 4407 | if (ret) | ||
| 4408 | pr_warning("Failed to register trace ftrace module enter notifier\n"); | ||
| 4409 | |||
| 4410 | ret = register_module_notifier(&ftrace_module_exit_nb); | 4322 | ret = register_module_notifier(&ftrace_module_exit_nb); |
| 4411 | if (ret) | 4323 | if (ret) |
| 4412 | pr_warning("Failed to register trace ftrace module exit notifier\n"); | 4324 | pr_warning("Failed to register trace ftrace module exit notifier\n"); |
| @@ -4462,6 +4374,34 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) | |||
| 4462 | 4374 | ||
| 4463 | #endif /* CONFIG_DYNAMIC_FTRACE */ | 4375 | #endif /* CONFIG_DYNAMIC_FTRACE */ |
| 4464 | 4376 | ||
| 4377 | __init void ftrace_init_global_array_ops(struct trace_array *tr) | ||
| 4378 | { | ||
| 4379 | tr->ops = &global_ops; | ||
| 4380 | tr->ops->private = tr; | ||
| 4381 | } | ||
| 4382 | |||
| 4383 | void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) | ||
| 4384 | { | ||
| 4385 | /* If we filter on pids, update to use the pid function */ | ||
| 4386 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { | ||
| 4387 | if (WARN_ON(tr->ops->func != ftrace_stub)) | ||
| 4388 | printk("ftrace ops had %pS for function\n", | ||
| 4389 | tr->ops->func); | ||
| 4390 | /* Only the top level instance does pid tracing */ | ||
| 4391 | if (!list_empty(&ftrace_pids)) { | ||
| 4392 | set_ftrace_pid_function(func); | ||
| 4393 | func = ftrace_pid_func; | ||
| 4394 | } | ||
| 4395 | } | ||
| 4396 | tr->ops->func = func; | ||
| 4397 | tr->ops->private = tr; | ||
| 4398 | } | ||
| 4399 | |||
| 4400 | void ftrace_reset_array_ops(struct trace_array *tr) | ||
| 4401 | { | ||
| 4402 | tr->ops->func = ftrace_stub; | ||
| 4403 | } | ||
| 4404 | |||
| 4465 | static void | 4405 | static void |
| 4466 | ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, | 4406 | ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, |
| 4467 | struct ftrace_ops *op, struct pt_regs *regs) | 4407 | struct ftrace_ops *op, struct pt_regs *regs) |
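A hedged sketch of how a tracer is expected to use the new per-instance helpers; example_tracer_init/reset and example_trace_call are hypothetical, and the pattern is simply: hook a callback into tr->ops at init, detach it and restore the stub on reset.

static void example_trace_call(unsigned long ip, unsigned long parent_ip,
			       struct ftrace_ops *op, struct pt_regs *regs)
{
	struct trace_array *tr = op->private;	/* set by ftrace_init_array_ops() */

	/* ... record the hit into tr's ring buffer ... */
}

static int example_tracer_init(struct trace_array *tr)
{
	ftrace_init_array_ops(tr, example_trace_call);
	register_ftrace_function(tr->ops);
	return 0;
}

static void example_tracer_reset(struct trace_array *tr)
{
	unregister_ftrace_function(tr->ops);
	ftrace_reset_array_ops(tr);
}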
| @@ -4520,9 +4460,16 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, | |||
| 4520 | */ | 4460 | */ |
| 4521 | preempt_disable_notrace(); | 4461 | preempt_disable_notrace(); |
| 4522 | do_for_each_ftrace_op(op, ftrace_ops_list) { | 4462 | do_for_each_ftrace_op(op, ftrace_ops_list) { |
| 4523 | if (ftrace_ops_test(op, ip, regs)) | 4463 | if (ftrace_ops_test(op, ip, regs)) { |
| 4464 | if (WARN_ON(!op->func)) { | ||
| 4465 | function_trace_stop = 1; | ||
| 4466 | printk("op=%p %pS\n", op, op); | ||
| 4467 | goto out; | ||
| 4468 | } | ||
| 4524 | op->func(ip, parent_ip, op, regs); | 4469 | op->func(ip, parent_ip, op, regs); |
| 4470 | } | ||
| 4525 | } while_for_each_ftrace_op(op); | 4471 | } while_for_each_ftrace_op(op); |
| 4472 | out: | ||
| 4526 | preempt_enable_notrace(); | 4473 | preempt_enable_notrace(); |
| 4527 | trace_clear_recursion(bit); | 4474 | trace_clear_recursion(bit); |
| 4528 | } | 4475 | } |
| @@ -4927,7 +4874,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, | |||
| 4927 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 4874 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
| 4928 | 4875 | ||
| 4929 | static int ftrace_graph_active; | 4876 | static int ftrace_graph_active; |
| 4930 | static struct notifier_block ftrace_suspend_notifier; | ||
| 4931 | 4877 | ||
| 4932 | int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) | 4878 | int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) |
| 4933 | { | 4879 | { |
| @@ -5073,13 +5019,6 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, | |||
| 5073 | return NOTIFY_DONE; | 5019 | return NOTIFY_DONE; |
| 5074 | } | 5020 | } |
| 5075 | 5021 | ||
| 5076 | /* Just a place holder for function graph */ | ||
| 5077 | static struct ftrace_ops fgraph_ops __read_mostly = { | ||
| 5078 | .func = ftrace_stub, | ||
| 5079 | .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL | | ||
| 5080 | FTRACE_OPS_FL_RECURSION_SAFE, | ||
| 5081 | }; | ||
| 5082 | |||
| 5083 | static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) | 5022 | static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) |
| 5084 | { | 5023 | { |
| 5085 | if (!ftrace_ops_test(&global_ops, trace->func, NULL)) | 5024 | if (!ftrace_ops_test(&global_ops, trace->func, NULL)) |
| @@ -5104,6 +5043,10 @@ static void update_function_graph_func(void) | |||
| 5104 | ftrace_graph_entry = ftrace_graph_entry_test; | 5043 | ftrace_graph_entry = ftrace_graph_entry_test; |
| 5105 | } | 5044 | } |
| 5106 | 5045 | ||
| 5046 | static struct notifier_block ftrace_suspend_notifier = { | ||
| 5047 | .notifier_call = ftrace_suspend_notifier_call, | ||
| 5048 | }; | ||
| 5049 | |||
| 5107 | int register_ftrace_graph(trace_func_graph_ret_t retfunc, | 5050 | int register_ftrace_graph(trace_func_graph_ret_t retfunc, |
| 5108 | trace_func_graph_ent_t entryfunc) | 5051 | trace_func_graph_ent_t entryfunc) |
| 5109 | { | 5052 | { |
| @@ -5117,7 +5060,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, | |||
| 5117 | goto out; | 5060 | goto out; |
| 5118 | } | 5061 | } |
| 5119 | 5062 | ||
| 5120 | ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; | ||
| 5121 | register_pm_notifier(&ftrace_suspend_notifier); | 5063 | register_pm_notifier(&ftrace_suspend_notifier); |
| 5122 | 5064 | ||
| 5123 | ftrace_graph_active++; | 5065 | ftrace_graph_active++; |
| @@ -5139,7 +5081,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, | |||
| 5139 | ftrace_graph_entry = ftrace_graph_entry_test; | 5081 | ftrace_graph_entry = ftrace_graph_entry_test; |
| 5140 | update_function_graph_func(); | 5082 | update_function_graph_func(); |
| 5141 | 5083 | ||
| 5142 | ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); | 5084 | /* Function graph doesn't use the .func field of global_ops */ |
| 5085 | global_ops.flags |= FTRACE_OPS_FL_STUB; | ||
| 5086 | |||
| 5087 | ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); | ||
| 5143 | 5088 | ||
| 5144 | out: | 5089 | out: |
| 5145 | mutex_unlock(&ftrace_lock); | 5090 | mutex_unlock(&ftrace_lock); |
| @@ -5157,7 +5102,8 @@ void unregister_ftrace_graph(void) | |||
| 5157 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; | 5102 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; |
| 5158 | ftrace_graph_entry = ftrace_graph_entry_stub; | 5103 | ftrace_graph_entry = ftrace_graph_entry_stub; |
| 5159 | __ftrace_graph_entry = ftrace_graph_entry_stub; | 5104 | __ftrace_graph_entry = ftrace_graph_entry_stub; |
| 5160 | ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); | 5105 | ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); |
| 5106 | global_ops.flags &= ~FTRACE_OPS_FL_STUB; | ||
| 5161 | unregister_pm_notifier(&ftrace_suspend_notifier); | 5107 | unregister_pm_notifier(&ftrace_suspend_notifier); |
| 5162 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); | 5108 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); |
| 5163 | 5109 | ||
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 737b0efa1a62..16f7038d1f4d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -275,7 +275,7 @@ int call_filter_check_discard(struct ftrace_event_call *call, void *rec, | |||
| 275 | } | 275 | } |
| 276 | EXPORT_SYMBOL_GPL(call_filter_check_discard); | 276 | EXPORT_SYMBOL_GPL(call_filter_check_discard); |
| 277 | 277 | ||
| 278 | cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) | 278 | static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) |
| 279 | { | 279 | { |
| 280 | u64 ts; | 280 | u64 ts; |
| 281 | 281 | ||
| @@ -599,7 +599,7 @@ static int alloc_snapshot(struct trace_array *tr) | |||
| 599 | return 0; | 599 | return 0; |
| 600 | } | 600 | } |
| 601 | 601 | ||
| 602 | void free_snapshot(struct trace_array *tr) | 602 | static void free_snapshot(struct trace_array *tr) |
| 603 | { | 603 | { |
| 604 | /* | 604 | /* |
| 605 | * We don't free the ring buffer. instead, resize it because | 605 | * We don't free the ring buffer. instead, resize it because |
| @@ -963,27 +963,9 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) | |||
| 963 | return cnt; | 963 | return cnt; |
| 964 | } | 964 | } |
| 965 | 965 | ||
| 966 | /* | ||
| 967 | * ftrace_max_lock is used to protect the swapping of buffers | ||
| 968 | * when taking a max snapshot. The buffers themselves are | ||
| 969 | * protected by per_cpu spinlocks. But the action of the swap | ||
| 970 | * needs its own lock. | ||
| 971 | * | ||
| 972 | * This is defined as a arch_spinlock_t in order to help | ||
| 973 | * with performance when lockdep debugging is enabled. | ||
| 974 | * | ||
| 975 | * It is also used in other places outside the update_max_tr | ||
| 976 | * so it needs to be defined outside of the | ||
| 977 | * CONFIG_TRACER_MAX_TRACE. | ||
| 978 | */ | ||
| 979 | static arch_spinlock_t ftrace_max_lock = | ||
| 980 | (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | ||
| 981 | |||
| 982 | unsigned long __read_mostly tracing_thresh; | 966 | unsigned long __read_mostly tracing_thresh; |
| 983 | 967 | ||
| 984 | #ifdef CONFIG_TRACER_MAX_TRACE | 968 | #ifdef CONFIG_TRACER_MAX_TRACE |
| 985 | unsigned long __read_mostly tracing_max_latency; | ||
| 986 | |||
| 987 | /* | 969 | /* |
| 988 | * Copy the new maximum trace into the separate maximum-trace | 970 | * Copy the new maximum trace into the separate maximum-trace |
| 989 | * structure. (this way the maximum trace is permanently saved, | 971 | * structure. (this way the maximum trace is permanently saved, |
| @@ -1000,7 +982,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
| 1000 | max_buf->cpu = cpu; | 982 | max_buf->cpu = cpu; |
| 1001 | max_buf->time_start = data->preempt_timestamp; | 983 | max_buf->time_start = data->preempt_timestamp; |
| 1002 | 984 | ||
| 1003 | max_data->saved_latency = tracing_max_latency; | 985 | max_data->saved_latency = tr->max_latency; |
| 1004 | max_data->critical_start = data->critical_start; | 986 | max_data->critical_start = data->critical_start; |
| 1005 | max_data->critical_end = data->critical_end; | 987 | max_data->critical_end = data->critical_end; |
| 1006 | 988 | ||
| @@ -1048,14 +1030,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
| 1048 | return; | 1030 | return; |
| 1049 | } | 1031 | } |
| 1050 | 1032 | ||
| 1051 | arch_spin_lock(&ftrace_max_lock); | 1033 | arch_spin_lock(&tr->max_lock); |
| 1052 | 1034 | ||
| 1053 | buf = tr->trace_buffer.buffer; | 1035 | buf = tr->trace_buffer.buffer; |
| 1054 | tr->trace_buffer.buffer = tr->max_buffer.buffer; | 1036 | tr->trace_buffer.buffer = tr->max_buffer.buffer; |
| 1055 | tr->max_buffer.buffer = buf; | 1037 | tr->max_buffer.buffer = buf; |
| 1056 | 1038 | ||
| 1057 | __update_max_tr(tr, tsk, cpu); | 1039 | __update_max_tr(tr, tsk, cpu); |
| 1058 | arch_spin_unlock(&ftrace_max_lock); | 1040 | arch_spin_unlock(&tr->max_lock); |
| 1059 | } | 1041 | } |
| 1060 | 1042 | ||
| 1061 | /** | 1043 | /** |
| @@ -1081,7 +1063,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
| 1081 | return; | 1063 | return; |
| 1082 | } | 1064 | } |
| 1083 | 1065 | ||
| 1084 | arch_spin_lock(&ftrace_max_lock); | 1066 | arch_spin_lock(&tr->max_lock); |
| 1085 | 1067 | ||
| 1086 | ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); | 1068 | ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); |
| 1087 | 1069 | ||
| @@ -1099,11 +1081,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) | |||
| 1099 | WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); | 1081 | WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); |
| 1100 | 1082 | ||
| 1101 | __update_max_tr(tr, tsk, cpu); | 1083 | __update_max_tr(tr, tsk, cpu); |
| 1102 | arch_spin_unlock(&ftrace_max_lock); | 1084 | arch_spin_unlock(&tr->max_lock); |
| 1103 | } | 1085 | } |
| 1104 | #endif /* CONFIG_TRACER_MAX_TRACE */ | 1086 | #endif /* CONFIG_TRACER_MAX_TRACE */ |
| 1105 | 1087 | ||
| 1106 | static void default_wait_pipe(struct trace_iterator *iter) | 1088 | static void wait_on_pipe(struct trace_iterator *iter) |
| 1107 | { | 1089 | { |
| 1108 | /* Iterators are static, they should be filled or empty */ | 1090 | /* Iterators are static, they should be filled or empty */ |
| 1109 | if (trace_buffer_iter(iter, iter->cpu_file)) | 1091 | if (trace_buffer_iter(iter, iter->cpu_file)) |
| @@ -1220,8 +1202,6 @@ int register_tracer(struct tracer *type) | |||
| 1220 | else | 1202 | else |
| 1221 | if (!type->flags->opts) | 1203 | if (!type->flags->opts) |
| 1222 | type->flags->opts = dummy_tracer_opt; | 1204 | type->flags->opts = dummy_tracer_opt; |
| 1223 | if (!type->wait_pipe) | ||
| 1224 | type->wait_pipe = default_wait_pipe; | ||
| 1225 | 1205 | ||
| 1226 | ret = run_tracer_selftest(type); | 1206 | ret = run_tracer_selftest(type); |
| 1227 | if (ret < 0) | 1207 | if (ret < 0) |
| @@ -1305,22 +1285,71 @@ void tracing_reset_all_online_cpus(void) | |||
| 1305 | } | 1285 | } |
| 1306 | } | 1286 | } |
| 1307 | 1287 | ||
| 1308 | #define SAVED_CMDLINES 128 | 1288 | #define SAVED_CMDLINES_DEFAULT 128 |
| 1309 | #define NO_CMDLINE_MAP UINT_MAX | 1289 | #define NO_CMDLINE_MAP UINT_MAX |
| 1310 | static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; | ||
| 1311 | static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; | ||
| 1312 | static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; | ||
| 1313 | static int cmdline_idx; | ||
| 1314 | static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; | 1290 | static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; |
| 1291 | struct saved_cmdlines_buffer { | ||
| 1292 | unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; | ||
| 1293 | unsigned *map_cmdline_to_pid; | ||
| 1294 | unsigned cmdline_num; | ||
| 1295 | int cmdline_idx; | ||
| 1296 | char *saved_cmdlines; | ||
| 1297 | }; | ||
| 1298 | static struct saved_cmdlines_buffer *savedcmd; | ||
| 1315 | 1299 | ||
| 1316 | /* temporary disable recording */ | 1300 | /* temporary disable recording */ |
| 1317 | static atomic_t trace_record_cmdline_disabled __read_mostly; | 1301 | static atomic_t trace_record_cmdline_disabled __read_mostly; |
| 1318 | 1302 | ||
| 1319 | static void trace_init_cmdlines(void) | 1303 | static inline char *get_saved_cmdlines(int idx) |
| 1320 | { | 1304 | { |
| 1321 | memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); | 1305 | return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; |
| 1322 | memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); | 1306 | } |
| 1323 | cmdline_idx = 0; | 1307 | |
| 1308 | static inline void set_cmdline(int idx, const char *cmdline) | ||
| 1309 | { | ||
| 1310 | memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); | ||
| 1311 | } | ||
| 1312 | |||
| 1313 | static int allocate_cmdlines_buffer(unsigned int val, | ||
| 1314 | struct saved_cmdlines_buffer *s) | ||
| 1315 | { | ||
| 1316 | s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid), | ||
| 1317 | GFP_KERNEL); | ||
| 1318 | if (!s->map_cmdline_to_pid) | ||
| 1319 | return -ENOMEM; | ||
| 1320 | |||
| 1321 | s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL); | ||
| 1322 | if (!s->saved_cmdlines) { | ||
| 1323 | kfree(s->map_cmdline_to_pid); | ||
| 1324 | return -ENOMEM; | ||
| 1325 | } | ||
| 1326 | |||
| 1327 | s->cmdline_idx = 0; | ||
| 1328 | s->cmdline_num = val; | ||
| 1329 | memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, | ||
| 1330 | sizeof(s->map_pid_to_cmdline)); | ||
| 1331 | memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, | ||
| 1332 | val * sizeof(*s->map_cmdline_to_pid)); | ||
| 1333 | |||
| 1334 | return 0; | ||
| 1335 | } | ||
| 1336 | |||
| 1337 | static int trace_create_savedcmd(void) | ||
| 1338 | { | ||
| 1339 | int ret; | ||
| 1340 | |||
| 1341 | savedcmd = kmalloc(sizeof(struct saved_cmdlines_buffer), GFP_KERNEL); | ||
| 1342 | if (!savedcmd) | ||
| 1343 | return -ENOMEM; | ||
| 1344 | |||
| 1345 | ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd); | ||
| 1346 | if (ret < 0) { | ||
| 1347 | kfree(savedcmd); | ||
| 1348 | savedcmd = NULL; | ||
| 1349 | return -ENOMEM; | ||
| 1350 | } | ||
| 1351 | |||
| 1352 | return 0; | ||
| 1324 | } | 1353 | } |
| 1325 | 1354 | ||
| 1326 | int is_tracing_stopped(void) | 1355 | int is_tracing_stopped(void) |
| @@ -1353,7 +1382,7 @@ void tracing_start(void) | |||
| 1353 | } | 1382 | } |
| 1354 | 1383 | ||
| 1355 | /* Prevent the buffers from switching */ | 1384 | /* Prevent the buffers from switching */ |
| 1356 | arch_spin_lock(&ftrace_max_lock); | 1385 | arch_spin_lock(&global_trace.max_lock); |
| 1357 | 1386 | ||
| 1358 | buffer = global_trace.trace_buffer.buffer; | 1387 | buffer = global_trace.trace_buffer.buffer; |
| 1359 | if (buffer) | 1388 | if (buffer) |
| @@ -1365,7 +1394,7 @@ void tracing_start(void) | |||
| 1365 | ring_buffer_record_enable(buffer); | 1394 | ring_buffer_record_enable(buffer); |
| 1366 | #endif | 1395 | #endif |
| 1367 | 1396 | ||
| 1368 | arch_spin_unlock(&ftrace_max_lock); | 1397 | arch_spin_unlock(&global_trace.max_lock); |
| 1369 | 1398 | ||
| 1370 | ftrace_start(); | 1399 | ftrace_start(); |
| 1371 | out: | 1400 | out: |
| @@ -1420,7 +1449,7 @@ void tracing_stop(void) | |||
| 1420 | goto out; | 1449 | goto out; |
| 1421 | 1450 | ||
| 1422 | /* Prevent the buffers from switching */ | 1451 | /* Prevent the buffers from switching */ |
| 1423 | arch_spin_lock(&ftrace_max_lock); | 1452 | arch_spin_lock(&global_trace.max_lock); |
| 1424 | 1453 | ||
| 1425 | buffer = global_trace.trace_buffer.buffer; | 1454 | buffer = global_trace.trace_buffer.buffer; |
| 1426 | if (buffer) | 1455 | if (buffer) |
| @@ -1432,7 +1461,7 @@ void tracing_stop(void) | |||
| 1432 | ring_buffer_record_disable(buffer); | 1461 | ring_buffer_record_disable(buffer); |
| 1433 | #endif | 1462 | #endif |
| 1434 | 1463 | ||
| 1435 | arch_spin_unlock(&ftrace_max_lock); | 1464 | arch_spin_unlock(&global_trace.max_lock); |
| 1436 | 1465 | ||
| 1437 | out: | 1466 | out: |
| 1438 | raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); | 1467 | raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); |
| @@ -1461,12 +1490,12 @@ static void tracing_stop_tr(struct trace_array *tr) | |||
| 1461 | 1490 | ||
| 1462 | void trace_stop_cmdline_recording(void); | 1491 | void trace_stop_cmdline_recording(void); |
| 1463 | 1492 | ||
| 1464 | static void trace_save_cmdline(struct task_struct *tsk) | 1493 | static int trace_save_cmdline(struct task_struct *tsk) |
| 1465 | { | 1494 | { |
| 1466 | unsigned pid, idx; | 1495 | unsigned pid, idx; |
| 1467 | 1496 | ||
| 1468 | if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) | 1497 | if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) |
| 1469 | return; | 1498 | return 0; |
| 1470 | 1499 | ||
| 1471 | /* | 1500 | /* |
| 1472 | * It's not the end of the world if we don't get | 1501 | * It's not the end of the world if we don't get |
| @@ -1475,11 +1504,11 @@ static void trace_save_cmdline(struct task_struct *tsk) | |||
| 1475 | * so if we miss here, then better luck next time. | 1504 | * so if we miss here, then better luck next time. |
| 1476 | */ | 1505 | */ |
| 1477 | if (!arch_spin_trylock(&trace_cmdline_lock)) | 1506 | if (!arch_spin_trylock(&trace_cmdline_lock)) |
| 1478 | return; | 1507 | return 0; |
| 1479 | 1508 | ||
| 1480 | idx = map_pid_to_cmdline[tsk->pid]; | 1509 | idx = savedcmd->map_pid_to_cmdline[tsk->pid]; |
| 1481 | if (idx == NO_CMDLINE_MAP) { | 1510 | if (idx == NO_CMDLINE_MAP) { |
| 1482 | idx = (cmdline_idx + 1) % SAVED_CMDLINES; | 1511 | idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; |
| 1483 | 1512 | ||
| 1484 | /* | 1513 | /* |
| 1485 | * Check whether the cmdline buffer at idx has a pid | 1514 | * Check whether the cmdline buffer at idx has a pid |
| @@ -1487,22 +1516,24 @@ static void trace_save_cmdline(struct task_struct *tsk) | |||
| 1487 | * need to clear the map_pid_to_cmdline. Otherwise we | 1516 | * need to clear the map_pid_to_cmdline. Otherwise we |
| 1488 | * would read the new comm for the old pid. | 1517 | * would read the new comm for the old pid. |
| 1489 | */ | 1518 | */ |
| 1490 | pid = map_cmdline_to_pid[idx]; | 1519 | pid = savedcmd->map_cmdline_to_pid[idx]; |
| 1491 | if (pid != NO_CMDLINE_MAP) | 1520 | if (pid != NO_CMDLINE_MAP) |
| 1492 | map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; | 1521 | savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; |
| 1493 | 1522 | ||
| 1494 | map_cmdline_to_pid[idx] = tsk->pid; | 1523 | savedcmd->map_cmdline_to_pid[idx] = tsk->pid; |
| 1495 | map_pid_to_cmdline[tsk->pid] = idx; | 1524 | savedcmd->map_pid_to_cmdline[tsk->pid] = idx; |
| 1496 | 1525 | ||
| 1497 | cmdline_idx = idx; | 1526 | savedcmd->cmdline_idx = idx; |
| 1498 | } | 1527 | } |
| 1499 | 1528 | ||
| 1500 | memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); | 1529 | set_cmdline(idx, tsk->comm); |
| 1501 | 1530 | ||
| 1502 | arch_spin_unlock(&trace_cmdline_lock); | 1531 | arch_spin_unlock(&trace_cmdline_lock); |
| 1532 | |||
| 1533 | return 1; | ||
| 1503 | } | 1534 | } |
| 1504 | 1535 | ||
| 1505 | void trace_find_cmdline(int pid, char comm[]) | 1536 | static void __trace_find_cmdline(int pid, char comm[]) |
| 1506 | { | 1537 | { |
| 1507 | unsigned map; | 1538 | unsigned map; |
| 1508 | 1539 | ||
| @@ -1521,13 +1552,19 @@ void trace_find_cmdline(int pid, char comm[]) | |||
| 1521 | return; | 1552 | return; |
| 1522 | } | 1553 | } |
| 1523 | 1554 | ||
| 1524 | preempt_disable(); | 1555 | map = savedcmd->map_pid_to_cmdline[pid]; |
| 1525 | arch_spin_lock(&trace_cmdline_lock); | ||
| 1526 | map = map_pid_to_cmdline[pid]; | ||
| 1527 | if (map != NO_CMDLINE_MAP) | 1556 | if (map != NO_CMDLINE_MAP) |
| 1528 | strcpy(comm, saved_cmdlines[map]); | 1557 | strcpy(comm, get_saved_cmdlines(map)); |
| 1529 | else | 1558 | else |
| 1530 | strcpy(comm, "<...>"); | 1559 | strcpy(comm, "<...>"); |
| 1560 | } | ||
| 1561 | |||
| 1562 | void trace_find_cmdline(int pid, char comm[]) | ||
| 1563 | { | ||
| 1564 | preempt_disable(); | ||
| 1565 | arch_spin_lock(&trace_cmdline_lock); | ||
| 1566 | |||
| 1567 | __trace_find_cmdline(pid, comm); | ||
| 1531 | 1568 | ||
| 1532 | arch_spin_unlock(&trace_cmdline_lock); | 1569 | arch_spin_unlock(&trace_cmdline_lock); |
| 1533 | preempt_enable(); | 1570 | preempt_enable(); |
| @@ -1541,9 +1578,8 @@ void tracing_record_cmdline(struct task_struct *tsk) | |||
| 1541 | if (!__this_cpu_read(trace_cmdline_save)) | 1578 | if (!__this_cpu_read(trace_cmdline_save)) |
| 1542 | return; | 1579 | return; |
| 1543 | 1580 | ||
| 1544 | __this_cpu_write(trace_cmdline_save, false); | 1581 | if (trace_save_cmdline(tsk)) |
| 1545 | 1582 | __this_cpu_write(trace_cmdline_save, false); | |
| 1546 | trace_save_cmdline(tsk); | ||
| 1547 | } | 1583 | } |
| 1548 | 1584 | ||
| 1549 | void | 1585 | void |
| @@ -1746,7 +1782,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, | |||
| 1746 | */ | 1782 | */ |
| 1747 | barrier(); | 1783 | barrier(); |
| 1748 | if (use_stack == 1) { | 1784 | if (use_stack == 1) { |
| 1749 | trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; | 1785 | trace.entries = this_cpu_ptr(ftrace_stack.calls); |
| 1750 | trace.max_entries = FTRACE_STACK_MAX_ENTRIES; | 1786 | trace.max_entries = FTRACE_STACK_MAX_ENTRIES; |
| 1751 | 1787 | ||
| 1752 | if (regs) | 1788 | if (regs) |
| @@ -1995,7 +2031,21 @@ void trace_printk_init_buffers(void) | |||
| 1995 | if (alloc_percpu_trace_buffer()) | 2031 | if (alloc_percpu_trace_buffer()) |
| 1996 | return; | 2032 | return; |
| 1997 | 2033 | ||
| 1998 | pr_info("ftrace: Allocated trace_printk buffers\n"); | 2034 | /* trace_printk() is for debug use only. Don't use it in production. */ |
| 2035 | |||
| 2036 | pr_warning("\n**********************************************************\n"); | ||
| 2037 | pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); | ||
| 2038 | pr_warning("** **\n"); | ||
| 2039 | pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); | ||
| 2040 | pr_warning("** **\n"); | ||
| 2041 | pr_warning("** This means that this is a DEBUG kernel and it is **\n"); | ||
| 2042 | pr_warning("** unsafe for produciton use. **\n"); | ||
| 2043 | pr_warning("** **\n"); | ||
| 2044 | pr_warning("** If you see this message and you are not debugging **\n"); | ||
| 2045 | pr_warning("** the kernel, report this immediately to your vendor! **\n"); | ||
| 2046 | pr_warning("** **\n"); | ||
| 2047 | pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); | ||
| 2048 | pr_warning("**********************************************************\n"); | ||
| 1999 | 2049 | ||
| 2000 | /* Expand the buffers to set size */ | 2050 | /* Expand the buffers to set size */ |
| 2001 | tracing_update_buffers(); | 2051 | tracing_update_buffers(); |
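The banner is emitted from trace_printk_init_buffers(), i.e. when a kernel containing a trace_printk() call site allocates the extra per-CPU buffers. A typical debug-only call site looks like the sketch below (identifiers made up):

static void example_debug_hook(int value)
{
	/* Debug-only instrumentation; writes into the ftrace ring buffer. */
	trace_printk("example: value=%d\n", value);
}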
| @@ -3333,7 +3383,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, | |||
| 3333 | mutex_lock(&tracing_cpumask_update_lock); | 3383 | mutex_lock(&tracing_cpumask_update_lock); |
| 3334 | 3384 | ||
| 3335 | local_irq_disable(); | 3385 | local_irq_disable(); |
| 3336 | arch_spin_lock(&ftrace_max_lock); | 3386 | arch_spin_lock(&tr->max_lock); |
| 3337 | for_each_tracing_cpu(cpu) { | 3387 | for_each_tracing_cpu(cpu) { |
| 3338 | /* | 3388 | /* |
| 3339 | * Increase/decrease the disabled counter if we are | 3389 | * Increase/decrease the disabled counter if we are |
| @@ -3350,7 +3400,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, | |||
| 3350 | ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); | 3400 | ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); |
| 3351 | } | 3401 | } |
| 3352 | } | 3402 | } |
| 3353 | arch_spin_unlock(&ftrace_max_lock); | 3403 | arch_spin_unlock(&tr->max_lock); |
| 3354 | local_irq_enable(); | 3404 | local_irq_enable(); |
| 3355 | 3405 | ||
| 3356 | cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); | 3406 | cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); |
| @@ -3592,6 +3642,7 @@ static const char readme_msg[] = | |||
| 3592 | " trace_options\t\t- Set format or modify how tracing happens\n" | 3642 | " trace_options\t\t- Set format or modify how tracing happens\n" |
| 3593 | "\t\t\t Disable an option by adding a suffix 'no' to the\n" | 3643 | "\t\t\t Disable an option by adding a suffix 'no' to the\n" |
| 3594 | "\t\t\t option name\n" | 3644 | "\t\t\t option name\n" |
| 3645 | " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" | ||
| 3595 | #ifdef CONFIG_DYNAMIC_FTRACE | 3646 | #ifdef CONFIG_DYNAMIC_FTRACE |
| 3596 | "\n available_filter_functions - list of functions that can be filtered on\n" | 3647 | "\n available_filter_functions - list of functions that can be filtered on\n" |
| 3597 | " set_ftrace_filter\t- echo function name in here to only trace these\n" | 3648 | " set_ftrace_filter\t- echo function name in here to only trace these\n" |
| @@ -3705,55 +3756,153 @@ static const struct file_operations tracing_readme_fops = { | |||
| 3705 | .llseek = generic_file_llseek, | 3756 | .llseek = generic_file_llseek, |
| 3706 | }; | 3757 | }; |
| 3707 | 3758 | ||
| 3759 | static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) | ||
| 3760 | { | ||
| 3761 | unsigned int *ptr = v; | ||
| 3762 | |||
| 3763 | if (*pos || m->count) | ||
| 3764 | ptr++; | ||
| 3765 | |||
| 3766 | (*pos)++; | ||
| 3767 | |||
| 3768 | for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; | ||
| 3769 | ptr++) { | ||
| 3770 | if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) | ||
| 3771 | continue; | ||
| 3772 | |||
| 3773 | return ptr; | ||
| 3774 | } | ||
| 3775 | |||
| 3776 | return NULL; | ||
| 3777 | } | ||
| 3778 | |||
| 3779 | static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) | ||
| 3780 | { | ||
| 3781 | void *v; | ||
| 3782 | loff_t l = 0; | ||
| 3783 | |||
| 3784 | preempt_disable(); | ||
| 3785 | arch_spin_lock(&trace_cmdline_lock); | ||
| 3786 | |||
| 3787 | v = &savedcmd->map_cmdline_to_pid[0]; | ||
| 3788 | while (l <= *pos) { | ||
| 3789 | v = saved_cmdlines_next(m, v, &l); | ||
| 3790 | if (!v) | ||
| 3791 | return NULL; | ||
| 3792 | } | ||
| 3793 | |||
| 3794 | return v; | ||
| 3795 | } | ||
| 3796 | |||
| 3797 | static void saved_cmdlines_stop(struct seq_file *m, void *v) | ||
| 3798 | { | ||
| 3799 | arch_spin_unlock(&trace_cmdline_lock); | ||
| 3800 | preempt_enable(); | ||
| 3801 | } | ||
| 3802 | |||
| 3803 | static int saved_cmdlines_show(struct seq_file *m, void *v) | ||
| 3804 | { | ||
| 3805 | char buf[TASK_COMM_LEN]; | ||
| 3806 | unsigned int *pid = v; | ||
| 3807 | |||
| 3808 | __trace_find_cmdline(*pid, buf); | ||
| 3809 | seq_printf(m, "%d %s\n", *pid, buf); | ||
| 3810 | return 0; | ||
| 3811 | } | ||
| 3812 | |||
| 3813 | static const struct seq_operations tracing_saved_cmdlines_seq_ops = { | ||
| 3814 | .start = saved_cmdlines_start, | ||
| 3815 | .next = saved_cmdlines_next, | ||
| 3816 | .stop = saved_cmdlines_stop, | ||
| 3817 | .show = saved_cmdlines_show, | ||
| 3818 | }; | ||
| 3819 | |||
| 3820 | static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) | ||
| 3821 | { | ||
| 3822 | if (tracing_disabled) | ||
| 3823 | return -ENODEV; | ||
| 3824 | |||
| 3825 | return seq_open(filp, &tracing_saved_cmdlines_seq_ops); | ||
| 3826 | } | ||
| 3827 | |||
| 3828 | static const struct file_operations tracing_saved_cmdlines_fops = { | ||
| 3829 | .open = tracing_saved_cmdlines_open, | ||
| 3830 | .read = seq_read, | ||
| 3831 | .llseek = seq_lseek, | ||
| 3832 | .release = seq_release, | ||
| 3833 | }; | ||
| 3834 | |||
| 3708 | static ssize_t | 3835 | static ssize_t |
| 3709 | tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, | 3836 | tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, |
| 3710 | size_t cnt, loff_t *ppos) | 3837 | size_t cnt, loff_t *ppos) |
| 3711 | { | 3838 | { |
| 3712 | char *buf_comm; | 3839 | char buf[64]; |
| 3713 | char *file_buf; | 3840 | int r; |
| 3714 | char *buf; | 3841 | |
| 3715 | int len = 0; | 3842 | arch_spin_lock(&trace_cmdline_lock); |
| 3716 | int pid; | 3843 | r = sprintf(buf, "%u\n", savedcmd->cmdline_num); |
| 3717 | int i; | 3844 | arch_spin_unlock(&trace_cmdline_lock); |
| 3845 | |||
| 3846 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | ||
| 3847 | } | ||
| 3848 | |||
| 3849 | static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) | ||
| 3850 | { | ||
| 3851 | kfree(s->saved_cmdlines); | ||
| 3852 | kfree(s->map_cmdline_to_pid); | ||
| 3853 | kfree(s); | ||
| 3854 | } | ||
| 3855 | |||
| 3856 | static int tracing_resize_saved_cmdlines(unsigned int val) | ||
| 3857 | { | ||
| 3858 | struct saved_cmdlines_buffer *s, *savedcmd_temp; | ||
| 3718 | 3859 | ||
| 3719 | file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); | 3860 | s = kmalloc(sizeof(struct saved_cmdlines_buffer), GFP_KERNEL); |
| 3720 | if (!file_buf) | 3861 | if (!s) |
| 3721 | return -ENOMEM; | 3862 | return -ENOMEM; |
| 3722 | 3863 | ||
| 3723 | buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); | 3864 | if (allocate_cmdlines_buffer(val, s) < 0) { |
| 3724 | if (!buf_comm) { | 3865 | kfree(s); |
| 3725 | kfree(file_buf); | ||
| 3726 | return -ENOMEM; | 3866 | return -ENOMEM; |
| 3727 | } | 3867 | } |
| 3728 | 3868 | ||
| 3729 | buf = file_buf; | 3869 | arch_spin_lock(&trace_cmdline_lock); |
| 3870 | savedcmd_temp = savedcmd; | ||
| 3871 | savedcmd = s; | ||
| 3872 | arch_spin_unlock(&trace_cmdline_lock); | ||
| 3873 | free_saved_cmdlines_buffer(savedcmd_temp); | ||
| 3730 | 3874 | ||
| 3731 | for (i = 0; i < SAVED_CMDLINES; i++) { | 3875 | return 0; |
| 3732 | int r; | 3876 | } |
| 3733 | 3877 | ||
| 3734 | pid = map_cmdline_to_pid[i]; | 3878 | static ssize_t |
| 3735 | if (pid == -1 || pid == NO_CMDLINE_MAP) | 3879 | tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, |
| 3736 | continue; | 3880 | size_t cnt, loff_t *ppos) |
| 3881 | { | ||
| 3882 | unsigned long val; | ||
| 3883 | int ret; | ||
| 3737 | 3884 | ||
| 3738 | trace_find_cmdline(pid, buf_comm); | 3885 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
| 3739 | r = sprintf(buf, "%d %s\n", pid, buf_comm); | 3886 | if (ret) |
| 3740 | buf += r; | 3887 | return ret; |
| 3741 | len += r; | ||
| 3742 | } | ||
| 3743 | 3888 | ||
| 3744 | len = simple_read_from_buffer(ubuf, cnt, ppos, | 3889 | /* must have at least 1 entry and at most PID_MAX_DEFAULT */
| 3745 | file_buf, len); | 3890 | if (!val || val > PID_MAX_DEFAULT) |
| 3891 | return -EINVAL; | ||
| 3746 | 3892 | ||
| 3747 | kfree(file_buf); | 3893 | ret = tracing_resize_saved_cmdlines((unsigned int)val); |
| 3748 | kfree(buf_comm); | 3894 | if (ret < 0) |
| 3895 | return ret; | ||
| 3749 | 3896 | ||
| 3750 | return len; | 3897 | *ppos += cnt; |
| 3898 | |||
| 3899 | return cnt; | ||
| 3751 | } | 3900 | } |
| 3752 | 3901 | ||
| 3753 | static const struct file_operations tracing_saved_cmdlines_fops = { | 3902 | static const struct file_operations tracing_saved_cmdlines_size_fops = { |
| 3754 | .open = tracing_open_generic, | 3903 | .open = tracing_open_generic, |
| 3755 | .read = tracing_saved_cmdlines_read, | 3904 | .read = tracing_saved_cmdlines_size_read, |
| 3756 | .llseek = generic_file_llseek, | 3905 | .write = tracing_saved_cmdlines_size_write, |
| 3757 | }; | 3906 | }; |
| 3758 | 3907 | ||
| 3759 | static ssize_t | 3908 | static ssize_t |
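
The hunk above replaces the one-shot kmalloc'd saved_cmdlines snapshot with the seq_file iterator interface. A minimal sketch of how the four callbacks cooperate, using a hypothetical fixed pid array in place of the real savedcmd map (illustrative only, not code from this patch):

/* Hypothetical seq_file iterator over a small pid array. */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static int demo_pids[] = { 1, 42, 1000 };

static void *demo_start(struct seq_file *m, loff_t *pos)
{
        /* ->start() may take locks; return the element at *pos, or NULL at the end. */
        if (*pos >= ARRAY_SIZE(demo_pids))
                return NULL;
        return &demo_pids[*pos];
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
        (*pos)++;
        if (*pos >= ARRAY_SIZE(demo_pids))
                return NULL;
        return &demo_pids[*pos];
}

static void demo_stop(struct seq_file *m, void *v)
{
        /* Locks taken in ->start() are dropped here, as saved_cmdlines_stop() does. */
}

static int demo_show(struct seq_file *m, void *v)
{
        seq_printf(m, "%d\n", *(int *)v);
        return 0;
}

static const struct seq_operations demo_seq_ops = {
        .start = demo_start,
        .next  = demo_next,
        .stop  = demo_stop,
        .show  = demo_show,
};

static int demo_open(struct inode *inode, struct file *filp)
{
        return seq_open(filp, &demo_seq_ops);
}

static const struct file_operations demo_fops = {
        .open    = demo_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

static struct dentry *demo_file;

static int __init demo_init(void)
{
        demo_file = debugfs_create_file("demo_pids", 0444, NULL, NULL, &demo_fops);
        return demo_file ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
        debugfs_remove(demo_file);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The real saved_cmdlines_start() additionally takes trace_cmdline_lock with preemption disabled, which is why the matching unlock lives in the ->stop() callback. The resize path (tracing_resize_saved_cmdlines) follows the allocate-new, swap-under-lock, free-old-outside-the-lock pattern so readers never observe a half-built buffer.
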
| @@ -4225,25 +4374,6 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) | |||
| 4225 | return trace_poll(iter, filp, poll_table); | 4374 | return trace_poll(iter, filp, poll_table); |
| 4226 | } | 4375 | } |
| 4227 | 4376 | ||
| 4228 | /* | ||
| 4229 | * This is a make-shift waitqueue. | ||
| 4230 | * A tracer might use this callback on some rare cases: | ||
| 4231 | * | ||
| 4232 | * 1) the current tracer might hold the runqueue lock when it wakes up | ||
| 4233 | * a reader, hence a deadlock (sched, function, and function graph tracers) | ||
| 4234 | * 2) the function tracers, trace all functions, we don't want | ||
| 4235 | * the overhead of calling wake_up and friends | ||
| 4236 | * (and tracing them too) | ||
| 4237 | * | ||
| 4238 | * Anyway, this is really very primitive wakeup. | ||
| 4239 | */ | ||
| 4240 | void poll_wait_pipe(struct trace_iterator *iter) | ||
| 4241 | { | ||
| 4242 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 4243 | /* sleep for 100 msecs, and try again. */ | ||
| 4244 | schedule_timeout(HZ / 10); | ||
| 4245 | } | ||
| 4246 | |||
| 4247 | /* Must be called with trace_types_lock mutex held. */ | 4377 | /* Must be called with trace_types_lock mutex held. */ |
| 4248 | static int tracing_wait_pipe(struct file *filp) | 4378 | static int tracing_wait_pipe(struct file *filp) |
| 4249 | { | 4379 | { |
| @@ -4255,15 +4385,6 @@ static int tracing_wait_pipe(struct file *filp) | |||
| 4255 | return -EAGAIN; | 4385 | return -EAGAIN; |
| 4256 | } | 4386 | } |
| 4257 | 4387 | ||
| 4258 | mutex_unlock(&iter->mutex); | ||
| 4259 | |||
| 4260 | iter->trace->wait_pipe(iter); | ||
| 4261 | |||
| 4262 | mutex_lock(&iter->mutex); | ||
| 4263 | |||
| 4264 | if (signal_pending(current)) | ||
| 4265 | return -EINTR; | ||
| 4266 | |||
| 4267 | /* | 4388 | /* |
| 4268 | * We block until we read something and tracing is disabled. | 4389 | * We block until we read something and tracing is disabled. |
| 4269 | * We still block if tracing is disabled, but we have never | 4390 | * We still block if tracing is disabled, but we have never |
| @@ -4275,6 +4396,15 @@ static int tracing_wait_pipe(struct file *filp) | |||
| 4275 | */ | 4396 | */ |
| 4276 | if (!tracing_is_on() && iter->pos) | 4397 | if (!tracing_is_on() && iter->pos) |
| 4277 | break; | 4398 | break; |
| 4399 | |||
| 4400 | mutex_unlock(&iter->mutex); | ||
| 4401 | |||
| 4402 | wait_on_pipe(iter); | ||
| 4403 | |||
| 4404 | mutex_lock(&iter->mutex); | ||
| 4405 | |||
| 4406 | if (signal_pending(current)) | ||
| 4407 | return -EINTR; | ||
| 4278 | } | 4408 | } |
| 4279 | 4409 | ||
| 4280 | return 1; | 4410 | return 1; |
| @@ -5197,7 +5327,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
| 5197 | goto out_unlock; | 5327 | goto out_unlock; |
| 5198 | } | 5328 | } |
| 5199 | mutex_unlock(&trace_types_lock); | 5329 | mutex_unlock(&trace_types_lock); |
| 5200 | iter->trace->wait_pipe(iter); | 5330 | wait_on_pipe(iter); |
| 5201 | mutex_lock(&trace_types_lock); | 5331 | mutex_lock(&trace_types_lock); |
| 5202 | if (signal_pending(current)) { | 5332 | if (signal_pending(current)) { |
| 5203 | size = -EINTR; | 5333 | size = -EINTR; |
| @@ -5408,7 +5538,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 5408 | goto out; | 5538 | goto out; |
| 5409 | } | 5539 | } |
| 5410 | mutex_unlock(&trace_types_lock); | 5540 | mutex_unlock(&trace_types_lock); |
| 5411 | iter->trace->wait_pipe(iter); | 5541 | wait_on_pipe(iter); |
| 5412 | mutex_lock(&trace_types_lock); | 5542 | mutex_lock(&trace_types_lock); |
| 5413 | if (signal_pending(current)) { | 5543 | if (signal_pending(current)) { |
| 5414 | ret = -EINTR; | 5544 | ret = -EINTR; |
| @@ -6102,6 +6232,25 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) | |||
| 6102 | return 0; | 6232 | return 0; |
| 6103 | } | 6233 | } |
| 6104 | 6234 | ||
| 6235 | static void free_trace_buffers(struct trace_array *tr) | ||
| 6236 | { | ||
| 6237 | if (!tr) | ||
| 6238 | return; | ||
| 6239 | |||
| 6240 | if (tr->trace_buffer.buffer) { | ||
| 6241 | ring_buffer_free(tr->trace_buffer.buffer); | ||
| 6242 | tr->trace_buffer.buffer = NULL; | ||
| 6243 | free_percpu(tr->trace_buffer.data); | ||
| 6244 | } | ||
| 6245 | |||
| 6246 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
| 6247 | if (tr->max_buffer.buffer) { | ||
| 6248 | ring_buffer_free(tr->max_buffer.buffer); | ||
| 6249 | tr->max_buffer.buffer = NULL; | ||
| 6250 | } | ||
| 6251 | #endif | ||
| 6252 | } | ||
| 6253 | |||
| 6105 | static int new_instance_create(const char *name) | 6254 | static int new_instance_create(const char *name) |
| 6106 | { | 6255 | { |
| 6107 | struct trace_array *tr; | 6256 | struct trace_array *tr; |
| @@ -6131,6 +6280,8 @@ static int new_instance_create(const char *name) | |||
| 6131 | 6280 | ||
| 6132 | raw_spin_lock_init(&tr->start_lock); | 6281 | raw_spin_lock_init(&tr->start_lock); |
| 6133 | 6282 | ||
| 6283 | tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | ||
| 6284 | |||
| 6134 | tr->current_trace = &nop_trace; | 6285 | tr->current_trace = &nop_trace; |
| 6135 | 6286 | ||
| 6136 | INIT_LIST_HEAD(&tr->systems); | 6287 | INIT_LIST_HEAD(&tr->systems); |
| @@ -6158,8 +6309,7 @@ static int new_instance_create(const char *name) | |||
| 6158 | return 0; | 6309 | return 0; |
| 6159 | 6310 | ||
| 6160 | out_free_tr: | 6311 | out_free_tr: |
| 6161 | if (tr->trace_buffer.buffer) | 6312 | free_trace_buffers(tr); |
| 6162 | ring_buffer_free(tr->trace_buffer.buffer); | ||
| 6163 | free_cpumask_var(tr->tracing_cpumask); | 6313 | free_cpumask_var(tr->tracing_cpumask); |
| 6164 | kfree(tr->name); | 6314 | kfree(tr->name); |
| 6165 | kfree(tr); | 6315 | kfree(tr); |
| @@ -6199,8 +6349,7 @@ static int instance_delete(const char *name) | |||
| 6199 | event_trace_del_tracer(tr); | 6349 | event_trace_del_tracer(tr); |
| 6200 | ftrace_destroy_function_files(tr); | 6350 | ftrace_destroy_function_files(tr); |
| 6201 | debugfs_remove_recursive(tr->dir); | 6351 | debugfs_remove_recursive(tr->dir); |
| 6202 | free_percpu(tr->trace_buffer.data); | 6352 | free_trace_buffers(tr); |
| 6203 | ring_buffer_free(tr->trace_buffer.buffer); | ||
| 6204 | 6353 | ||
| 6205 | kfree(tr->name); | 6354 | kfree(tr->name); |
| 6206 | kfree(tr); | 6355 | kfree(tr); |
| @@ -6328,6 +6477,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | |||
| 6328 | trace_create_file("tracing_on", 0644, d_tracer, | 6477 | trace_create_file("tracing_on", 0644, d_tracer, |
| 6329 | tr, &rb_simple_fops); | 6478 | tr, &rb_simple_fops); |
| 6330 | 6479 | ||
| 6480 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
| 6481 | trace_create_file("tracing_max_latency", 0644, d_tracer, | ||
| 6482 | &tr->max_latency, &tracing_max_lat_fops); | ||
| 6483 | #endif | ||
| 6484 | |||
| 6331 | if (ftrace_create_function_files(tr, d_tracer)) | 6485 | if (ftrace_create_function_files(tr, d_tracer)) |
| 6332 | WARN(1, "Could not allocate function filter files"); | 6486 | WARN(1, "Could not allocate function filter files"); |
| 6333 | 6487 | ||
| @@ -6353,11 +6507,6 @@ static __init int tracer_init_debugfs(void) | |||
| 6353 | 6507 | ||
| 6354 | init_tracer_debugfs(&global_trace, d_tracer); | 6508 | init_tracer_debugfs(&global_trace, d_tracer); |
| 6355 | 6509 | ||
| 6356 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
| 6357 | trace_create_file("tracing_max_latency", 0644, d_tracer, | ||
| 6358 | &tracing_max_latency, &tracing_max_lat_fops); | ||
| 6359 | #endif | ||
| 6360 | |||
| 6361 | trace_create_file("tracing_thresh", 0644, d_tracer, | 6510 | trace_create_file("tracing_thresh", 0644, d_tracer, |
| 6362 | &tracing_thresh, &tracing_max_lat_fops); | 6511 | &tracing_thresh, &tracing_max_lat_fops); |
| 6363 | 6512 | ||
| @@ -6367,6 +6516,9 @@ static __init int tracer_init_debugfs(void) | |||
| 6367 | trace_create_file("saved_cmdlines", 0444, d_tracer, | 6516 | trace_create_file("saved_cmdlines", 0444, d_tracer, |
| 6368 | NULL, &tracing_saved_cmdlines_fops); | 6517 | NULL, &tracing_saved_cmdlines_fops); |
| 6369 | 6518 | ||
| 6519 | trace_create_file("saved_cmdlines_size", 0644, d_tracer, | ||
| 6520 | NULL, &tracing_saved_cmdlines_size_fops); | ||
| 6521 | |||
| 6370 | #ifdef CONFIG_DYNAMIC_FTRACE | 6522 | #ifdef CONFIG_DYNAMIC_FTRACE |
| 6371 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, | 6523 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, |
| 6372 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); | 6524 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); |
| @@ -6603,18 +6755,19 @@ __init static int tracer_alloc_buffers(void) | |||
| 6603 | if (!temp_buffer) | 6755 | if (!temp_buffer) |
| 6604 | goto out_free_cpumask; | 6756 | goto out_free_cpumask; |
| 6605 | 6757 | ||
| 6758 | if (trace_create_savedcmd() < 0) | ||
| 6759 | goto out_free_temp_buffer; | ||
| 6760 | |||
| 6606 | /* TODO: make the number of buffers hot pluggable with CPUS */ | 6761 | /* TODO: make the number of buffers hot pluggable with CPUS */ |
| 6607 | if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { | 6762 | if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { |
| 6608 | printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); | 6763 | printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); |
| 6609 | WARN_ON(1); | 6764 | WARN_ON(1); |
| 6610 | goto out_free_temp_buffer; | 6765 | goto out_free_savedcmd; |
| 6611 | } | 6766 | } |
| 6612 | 6767 | ||
| 6613 | if (global_trace.buffer_disabled) | 6768 | if (global_trace.buffer_disabled) |
| 6614 | tracing_off(); | 6769 | tracing_off(); |
| 6615 | 6770 | ||
| 6616 | trace_init_cmdlines(); | ||
| 6617 | |||
| 6618 | if (trace_boot_clock) { | 6771 | if (trace_boot_clock) { |
| 6619 | ret = tracing_set_clock(&global_trace, trace_boot_clock); | 6772 | ret = tracing_set_clock(&global_trace, trace_boot_clock); |
| 6620 | if (ret < 0) | 6773 | if (ret < 0) |
| @@ -6629,6 +6782,10 @@ __init static int tracer_alloc_buffers(void) | |||
| 6629 | */ | 6782 | */ |
| 6630 | global_trace.current_trace = &nop_trace; | 6783 | global_trace.current_trace = &nop_trace; |
| 6631 | 6784 | ||
| 6785 | global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | ||
| 6786 | |||
| 6787 | ftrace_init_global_array_ops(&global_trace); | ||
| 6788 | |||
| 6632 | register_tracer(&nop_trace); | 6789 | register_tracer(&nop_trace); |
| 6633 | 6790 | ||
| 6634 | /* All seems OK, enable tracing */ | 6791 | /* All seems OK, enable tracing */ |
| @@ -6656,13 +6813,11 @@ __init static int tracer_alloc_buffers(void) | |||
| 6656 | 6813 | ||
| 6657 | return 0; | 6814 | return 0; |
| 6658 | 6815 | ||
| 6816 | out_free_savedcmd: | ||
| 6817 | free_saved_cmdlines_buffer(savedcmd); | ||
| 6659 | out_free_temp_buffer: | 6818 | out_free_temp_buffer: |
| 6660 | ring_buffer_free(temp_buffer); | 6819 | ring_buffer_free(temp_buffer); |
| 6661 | out_free_cpumask: | 6820 | out_free_cpumask: |
| 6662 | free_percpu(global_trace.trace_buffer.data); | ||
| 6663 | #ifdef CONFIG_TRACER_MAX_TRACE | ||
| 6664 | free_percpu(global_trace.max_buffer.data); | ||
| 6665 | #endif | ||
| 6666 | free_cpumask_var(global_trace.tracing_cpumask); | 6821 | free_cpumask_var(global_trace.tracing_cpumask); |
| 6667 | out_free_buffer_mask: | 6822 | out_free_buffer_mask: |
| 6668 | free_cpumask_var(tracing_buffer_mask); | 6823 | free_cpumask_var(tracing_buffer_mask); |
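
The new out_free_savedcmd label keeps tracer_alloc_buffers() unwinding its allocations in the reverse order they were made. A userspace sketch of the same goto-unwind idiom, with hypothetical malloc'd buffers standing in for the cpumask, temp buffer and savedcmd allocations (illustrative only):

/* Reverse-order goto unwind: each failure jumps to the label that frees everything allocated so far. */
#include <stdio.h>
#include <stdlib.h>

static char *cpumask_buf, *temp_buf, *savedcmd_buf;

static int demo_alloc_buffers(void)
{
        cpumask_buf = malloc(64);
        if (!cpumask_buf)
                goto out;

        temp_buf = malloc(4096);
        if (!temp_buf)
                goto out_free_cpumask;

        savedcmd_buf = malloc(128);
        if (!savedcmd_buf)
                goto out_free_temp;

        /* A fourth allocation failing here would jump to a new label freeing savedcmd_buf first. */
        return 0;

out_free_temp:
        free(temp_buf);
out_free_cpumask:
        free(cpumask_buf);
out:
        return -1;
}

int main(void)
{
        return demo_alloc_buffers() ? EXIT_FAILURE : EXIT_SUCCESS;
}

When a later step fails, it jumps to the label that frees the newest successful allocation and falls through to the older ones, which is exactly the role of the out_free_savedcmd label added above.
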
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2e29d7ba5a52..9e82551dd566 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -190,7 +190,22 @@ struct trace_array { | |||
| 190 | */ | 190 | */ |
| 191 | struct trace_buffer max_buffer; | 191 | struct trace_buffer max_buffer; |
| 192 | bool allocated_snapshot; | 192 | bool allocated_snapshot; |
| 193 | unsigned long max_latency; | ||
| 193 | #endif | 194 | #endif |
| 195 | /* | ||
| 196 | * max_lock is used to protect the swapping of buffers | ||
| 197 | * when taking a max snapshot. The buffers themselves are | ||
| 198 | * protected by per_cpu spinlocks. But the action of the swap | ||
| 199 | * needs its own lock. | ||
| 200 | * | ||
| 201 | * This is defined as an arch_spinlock_t in order to help | ||
| 202 | * with performance when lockdep debugging is enabled. | ||
| 203 | * | ||
| 204 | * It is also used in places outside of update_max_tr(), | ||
| 205 | * so it needs to be defined outside of | ||
| 206 | * CONFIG_TRACER_MAX_TRACE. | ||
| 207 | */ | ||
| 208 | arch_spinlock_t max_lock; | ||
| 194 | int buffer_disabled; | 209 | int buffer_disabled; |
| 195 | #ifdef CONFIG_FTRACE_SYSCALLS | 210 | #ifdef CONFIG_FTRACE_SYSCALLS |
| 196 | int sys_refcount_enter; | 211 | int sys_refcount_enter; |
| @@ -237,6 +252,9 @@ static inline struct trace_array *top_trace_array(void) | |||
| 237 | { | 252 | { |
| 238 | struct trace_array *tr; | 253 | struct trace_array *tr; |
| 239 | 254 | ||
| 255 | if (list_empty(ftrace_trace_arrays.prev)) | ||
| 256 | return NULL; | ||
| 257 | |||
| 240 | tr = list_entry(ftrace_trace_arrays.prev, | 258 | tr = list_entry(ftrace_trace_arrays.prev, |
| 241 | typeof(*tr), list); | 259 | typeof(*tr), list); |
| 242 | WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); | 260 | WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); |
| @@ -323,7 +341,6 @@ struct tracer_flags { | |||
| 323 | * @stop: called when tracing is paused (echo 0 > tracing_enabled) | 341 | * @stop: called when tracing is paused (echo 0 > tracing_enabled) |
| 324 | * @open: called when the trace file is opened | 342 | * @open: called when the trace file is opened |
| 325 | * @pipe_open: called when the trace_pipe file is opened | 343 | * @pipe_open: called when the trace_pipe file is opened |
| 326 | * @wait_pipe: override how the user waits for traces on trace_pipe | ||
| 327 | * @close: called when the trace file is released | 344 | * @close: called when the trace file is released |
| 328 | * @pipe_close: called when the trace_pipe file is released | 345 | * @pipe_close: called when the trace_pipe file is released |
| 329 | * @read: override the default read callback on trace_pipe | 346 | * @read: override the default read callback on trace_pipe |
| @@ -342,7 +359,6 @@ struct tracer { | |||
| 342 | void (*stop)(struct trace_array *tr); | 359 | void (*stop)(struct trace_array *tr); |
| 343 | void (*open)(struct trace_iterator *iter); | 360 | void (*open)(struct trace_iterator *iter); |
| 344 | void (*pipe_open)(struct trace_iterator *iter); | 361 | void (*pipe_open)(struct trace_iterator *iter); |
| 345 | void (*wait_pipe)(struct trace_iterator *iter); | ||
| 346 | void (*close)(struct trace_iterator *iter); | 362 | void (*close)(struct trace_iterator *iter); |
| 347 | void (*pipe_close)(struct trace_iterator *iter); | 363 | void (*pipe_close)(struct trace_iterator *iter); |
| 348 | ssize_t (*read)(struct trace_iterator *iter, | 364 | ssize_t (*read)(struct trace_iterator *iter, |
| @@ -416,13 +432,7 @@ enum { | |||
| 416 | TRACE_FTRACE_IRQ_BIT, | 432 | TRACE_FTRACE_IRQ_BIT, |
| 417 | TRACE_FTRACE_SIRQ_BIT, | 433 | TRACE_FTRACE_SIRQ_BIT, |
| 418 | 434 | ||
| 419 | /* GLOBAL_BITs must be greater than FTRACE_BITs */ | 435 | /* INTERNAL_BITs must be greater than FTRACE_BITs */ |
| 420 | TRACE_GLOBAL_BIT, | ||
| 421 | TRACE_GLOBAL_NMI_BIT, | ||
| 422 | TRACE_GLOBAL_IRQ_BIT, | ||
| 423 | TRACE_GLOBAL_SIRQ_BIT, | ||
| 424 | |||
| 425 | /* INTERNAL_BITs must be greater than GLOBAL_BITs */ | ||
| 426 | TRACE_INTERNAL_BIT, | 436 | TRACE_INTERNAL_BIT, |
| 427 | TRACE_INTERNAL_NMI_BIT, | 437 | TRACE_INTERNAL_NMI_BIT, |
| 428 | TRACE_INTERNAL_IRQ_BIT, | 438 | TRACE_INTERNAL_IRQ_BIT, |
| @@ -449,9 +459,6 @@ enum { | |||
| 449 | #define TRACE_FTRACE_START TRACE_FTRACE_BIT | 459 | #define TRACE_FTRACE_START TRACE_FTRACE_BIT |
| 450 | #define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) | 460 | #define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) |
| 451 | 461 | ||
| 452 | #define TRACE_GLOBAL_START TRACE_GLOBAL_BIT | ||
| 453 | #define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1) | ||
| 454 | |||
| 455 | #define TRACE_LIST_START TRACE_INTERNAL_BIT | 462 | #define TRACE_LIST_START TRACE_INTERNAL_BIT |
| 456 | #define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) | 463 | #define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) |
| 457 | 464 | ||
| @@ -560,8 +567,6 @@ void trace_init_global_iter(struct trace_iterator *iter); | |||
| 560 | 567 | ||
| 561 | void tracing_iter_reset(struct trace_iterator *iter, int cpu); | 568 | void tracing_iter_reset(struct trace_iterator *iter, int cpu); |
| 562 | 569 | ||
| 563 | void poll_wait_pipe(struct trace_iterator *iter); | ||
| 564 | |||
| 565 | void tracing_sched_switch_trace(struct trace_array *tr, | 570 | void tracing_sched_switch_trace(struct trace_array *tr, |
| 566 | struct task_struct *prev, | 571 | struct task_struct *prev, |
| 567 | struct task_struct *next, | 572 | struct task_struct *next, |
| @@ -608,8 +613,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs); | |||
| 608 | extern unsigned long tracing_thresh; | 613 | extern unsigned long tracing_thresh; |
| 609 | 614 | ||
| 610 | #ifdef CONFIG_TRACER_MAX_TRACE | 615 | #ifdef CONFIG_TRACER_MAX_TRACE |
| 611 | extern unsigned long tracing_max_latency; | ||
| 612 | |||
| 613 | void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); | 616 | void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); |
| 614 | void update_max_tr_single(struct trace_array *tr, | 617 | void update_max_tr_single(struct trace_array *tr, |
| 615 | struct task_struct *tsk, int cpu); | 618 | struct task_struct *tsk, int cpu); |
| @@ -724,6 +727,8 @@ extern unsigned long trace_flags; | |||
| 724 | #define TRACE_GRAPH_PRINT_PROC 0x8 | 727 | #define TRACE_GRAPH_PRINT_PROC 0x8 |
| 725 | #define TRACE_GRAPH_PRINT_DURATION 0x10 | 728 | #define TRACE_GRAPH_PRINT_DURATION 0x10 |
| 726 | #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 | 729 | #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 |
| 730 | #define TRACE_GRAPH_PRINT_IRQS 0x40 | ||
| 731 | #define TRACE_GRAPH_PRINT_TAIL 0x80 | ||
| 727 | #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 | 732 | #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 |
| 728 | #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) | 733 | #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) |
| 729 | 734 | ||
| @@ -823,6 +828,10 @@ extern int ftrace_is_dead(void); | |||
| 823 | int ftrace_create_function_files(struct trace_array *tr, | 828 | int ftrace_create_function_files(struct trace_array *tr, |
| 824 | struct dentry *parent); | 829 | struct dentry *parent); |
| 825 | void ftrace_destroy_function_files(struct trace_array *tr); | 830 | void ftrace_destroy_function_files(struct trace_array *tr); |
| 831 | void ftrace_init_global_array_ops(struct trace_array *tr); | ||
| 832 | void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); | ||
| 833 | void ftrace_reset_array_ops(struct trace_array *tr); | ||
| 834 | int using_ftrace_ops_list_func(void); | ||
| 826 | #else | 835 | #else |
| 827 | static inline int ftrace_trace_task(struct task_struct *task) | 836 | static inline int ftrace_trace_task(struct task_struct *task) |
| 828 | { | 837 | { |
| @@ -836,6 +845,11 @@ ftrace_create_function_files(struct trace_array *tr, | |||
| 836 | return 0; | 845 | return 0; |
| 837 | } | 846 | } |
| 838 | static inline void ftrace_destroy_function_files(struct trace_array *tr) { } | 847 | static inline void ftrace_destroy_function_files(struct trace_array *tr) { } |
| 848 | static inline __init void | ||
| 849 | ftrace_init_global_array_ops(struct trace_array *tr) { } | ||
| 850 | static inline void ftrace_reset_array_ops(struct trace_array *tr) { } | ||
| 851 | /* ftrace_func_t type is not defined, use macro instead of static inline */ | ||
| 852 | #define ftrace_init_array_ops(tr, func) do { } while (0) | ||
| 839 | #endif /* CONFIG_FUNCTION_TRACER */ | 853 | #endif /* CONFIG_FUNCTION_TRACER */ |
| 840 | 854 | ||
| 841 | #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) | 855 | #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) |
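
The max_lock comment above relies on the fact that an arch_spinlock_t is invisible to lockdep and does not disable preemption on its own. A minimal usage sketch with a hypothetical lock and counter, mirroring how trace_cmdline_lock is taken in the saved_cmdlines hunks earlier (illustrative only):

/* Hypothetical arch_spinlock_t usage: the caller handles preemption, lockdep is bypassed. */
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/preempt.h>

static arch_spinlock_t demo_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static unsigned long demo_hits;

static void demo_count_hit(void)
{
        preempt_disable();              /* arch_spin_lock() does not do this for us */
        arch_spin_lock(&demo_lock);
        demo_hits++;
        arch_spin_unlock(&demo_lock);
        preempt_enable();
}

static int __init demo_init(void)
{
        demo_count_hit();
        pr_info("demo_hits=%lu\n", demo_hits);
        return 0;
}
module_init(demo_init);
MODULE_LICENSE("GPL");

Because none of this is visible to lockdep, the raw lock is only really suitable for short, leaf-level critical sections such as the buffer swap described in the comment.
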
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c new file mode 100644 index 000000000000..40a14cbcf8e0 --- /dev/null +++ b/kernel/trace/trace_benchmark.c | |||
| @@ -0,0 +1,198 @@ | |||
| 1 | #include <linux/delay.h> | ||
| 2 | #include <linux/module.h> | ||
| 3 | #include <linux/kthread.h> | ||
| 4 | #include <linux/trace_clock.h> | ||
| 5 | |||
| 6 | #define CREATE_TRACE_POINTS | ||
| 7 | #include "trace_benchmark.h" | ||
| 8 | |||
| 9 | static struct task_struct *bm_event_thread; | ||
| 10 | |||
| 11 | static char bm_str[BENCHMARK_EVENT_STRLEN] = "START"; | ||
| 12 | |||
| 13 | static u64 bm_total; | ||
| 14 | static u64 bm_totalsq; | ||
| 15 | static u64 bm_last; | ||
| 16 | static u64 bm_max; | ||
| 17 | static u64 bm_min; | ||
| 18 | static u64 bm_first; | ||
| 19 | static u64 bm_cnt; | ||
| 20 | static u64 bm_stddev; | ||
| 21 | static unsigned int bm_avg; | ||
| 22 | static unsigned int bm_std; | ||
| 23 | |||
| 24 | /* | ||
| 25 | * This gets called in a loop recording the time it took to write | ||
| 26 | * the tracepoint. What it writes is the time statistics of the last | ||
| 27 | * tracepoint write. As there is nothing to write the first time | ||
| 28 | * it simply writes "START". As the first write is cold cache and | ||
| 29 | * the rest is hot, we save off that time in bm_first and it is | ||
| 30 | * reported as "first", which is shown in the second write to the | ||
| 31 | * tracepoint. The "first" field is written within the statistics from | ||
| 32 | * then on but never changes. | ||
| 33 | */ | ||
| 34 | static void trace_do_benchmark(void) | ||
| 35 | { | ||
| 36 | u64 start; | ||
| 37 | u64 stop; | ||
| 38 | u64 delta; | ||
| 39 | u64 stddev; | ||
| 40 | u64 seed; | ||
| 41 | u64 last_seed; | ||
| 42 | unsigned int avg; | ||
| 43 | unsigned int std = 0; | ||
| 44 | |||
| 45 | /* Only run if the tracepoint is actually active */ | ||
| 46 | if (!trace_benchmark_event_enabled()) | ||
| 47 | return; | ||
| 48 | |||
| 49 | local_irq_disable(); | ||
| 50 | start = trace_clock_local(); | ||
| 51 | trace_benchmark_event(bm_str); | ||
| 52 | stop = trace_clock_local(); | ||
| 53 | local_irq_enable(); | ||
| 54 | |||
| 55 | bm_cnt++; | ||
| 56 | |||
| 57 | delta = stop - start; | ||
| 58 | |||
| 59 | /* | ||
| 60 | * The first read is cold cached, keep it separate from the | ||
| 61 | * other calculations. | ||
| 62 | */ | ||
| 63 | if (bm_cnt == 1) { | ||
| 64 | bm_first = delta; | ||
| 65 | scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, | ||
| 66 | "first=%llu [COLD CACHED]", bm_first); | ||
| 67 | return; | ||
| 68 | } | ||
| 69 | |||
| 70 | bm_last = delta; | ||
| 71 | |||
| 72 | if (delta > bm_max) | ||
| 73 | bm_max = delta; | ||
| 74 | if (!bm_min || delta < bm_min) | ||
| 75 | bm_min = delta; | ||
| 76 | |||
| 77 | /* | ||
| 78 | * When bm_cnt is greater than UINT_MAX, it breaks the statistics | ||
| 79 | * accounting. Freeze the statistics when that happens. | ||
| 80 | * We should have enough data for the avg and stddev anyway. | ||
| 81 | */ | ||
| 82 | if (bm_cnt > UINT_MAX) { | ||
| 83 | scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, | ||
| 84 | "last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld", | ||
| 85 | bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev); | ||
| 86 | return; | ||
| 87 | } | ||
| 88 | |||
| 89 | bm_total += delta; | ||
| 90 | bm_totalsq += delta * delta; | ||
| 91 | |||
| 92 | |||
| 93 | if (bm_cnt > 1) { | ||
| 94 | /* | ||
| 95 | * Compute the sample variance from the running sums: | ||
| 96 | * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) | ||
| 97 | */ | ||
| 98 | stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total; | ||
| 99 | do_div(stddev, (u32)bm_cnt); | ||
| 100 | do_div(stddev, (u32)bm_cnt - 1); | ||
| 101 | } else | ||
| 102 | stddev = 0; | ||
| 103 | |||
| 104 | delta = bm_total; | ||
| 105 | do_div(delta, bm_cnt); | ||
| 106 | avg = delta; | ||
| 107 | |||
| 108 | if (stddev > 0) { | ||
| 109 | int i = 0; | ||
| 110 | /* | ||
| 111 | * stddev is the square of standard deviation but | ||
| 112 | * we want the actual number. Use the average | ||
| 113 | * as our seed to find the std. | ||
| 114 | * | ||
| 115 | * The next try is: | ||
| 116 | * x = (x + N/x) / 2 | ||
| 117 | * | ||
| 118 | * Where N is the squared number to find the square | ||
| 119 | * root of. | ||
| 120 | */ | ||
| 121 | seed = avg; | ||
| 122 | do { | ||
| 123 | last_seed = seed; | ||
| 124 | seed = stddev; | ||
| 125 | if (!last_seed) | ||
| 126 | break; | ||
| 127 | do_div(seed, last_seed); | ||
| 128 | seed += last_seed; | ||
| 129 | do_div(seed, 2); | ||
| 130 | } while (i++ < 10 && last_seed != seed); | ||
| 131 | |||
| 132 | std = seed; | ||
| 133 | } | ||
| 134 | |||
| 135 | scnprintf(bm_str, BENCHMARK_EVENT_STRLEN, | ||
| 136 | "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld", | ||
| 137 | bm_last, bm_first, bm_max, bm_min, avg, std, stddev); | ||
| 138 | |||
| 139 | bm_std = std; | ||
| 140 | bm_avg = avg; | ||
| 141 | bm_stddev = stddev; | ||
| 142 | } | ||
| 143 | |||
| 144 | static int benchmark_event_kthread(void *arg) | ||
| 145 | { | ||
| 146 | /* sleep a bit to make sure the tracepoint gets activated */ | ||
| 147 | msleep(100); | ||
| 148 | |||
| 149 | while (!kthread_should_stop()) { | ||
| 150 | |||
| 151 | trace_do_benchmark(); | ||
| 152 | |||
| 153 | /* | ||
| 154 | * We don't go to sleep, but let others | ||
| 155 | * run as well. | ||
| 156 | */ | ||
| 157 | cond_resched(); | ||
| 158 | } | ||
| 159 | |||
| 160 | return 0; | ||
| 161 | } | ||
| 162 | |||
| 163 | /* | ||
| 164 | * When the benchmark tracepoint is enabled, it calls this | ||
| 165 | * function and the thread that calls the tracepoint is created. | ||
| 166 | */ | ||
| 167 | void trace_benchmark_reg(void) | ||
| 168 | { | ||
| 169 | bm_event_thread = kthread_run(benchmark_event_kthread, | ||
| 170 | NULL, "event_benchmark"); | ||
| 171 | WARN_ON(!bm_event_thread); | ||
| 172 | } | ||
| 173 | |||
| 174 | /* | ||
| 175 | * When the benchmark tracepoint is disabled, it calls this | ||
| 176 | * function and the thread that calls the tracepoint is deleted | ||
| 177 | * and all the numbers are reset. | ||
| 178 | */ | ||
| 179 | void trace_benchmark_unreg(void) | ||
| 180 | { | ||
| 181 | if (!bm_event_thread) | ||
| 182 | return; | ||
| 183 | |||
| 184 | kthread_stop(bm_event_thread); | ||
| 185 | |||
| 186 | strcpy(bm_str, "START"); | ||
| 187 | bm_total = 0; | ||
| 188 | bm_totalsq = 0; | ||
| 189 | bm_last = 0; | ||
| 190 | bm_max = 0; | ||
| 191 | bm_min = 0; | ||
| 192 | bm_cnt = 0; | ||
| 193 | /* These don't need to be reset but reset them anyway */ | ||
| 194 | bm_first = 0; | ||
| 195 | bm_std = 0; | ||
| 196 | bm_avg = 0; | ||
| 197 | bm_stddev = 0; | ||
| 198 | } | ||
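
The std calculation in trace_do_benchmark() finds an integer square root of the variance with a few rounds of Newton's iteration, seeded with the running average. A userspace sketch of the same loop with a worked example (hypothetical numbers, not taken from a real trace):

/* Newton's iteration x_{k+1} = (x_k + N/x_k) / 2, seeded the way trace_do_benchmark() seeds it. */
#include <stdio.h>
#include <stdint.h>

static uint64_t isqrt_newton(uint64_t n, uint64_t seed)
{
        uint64_t last;
        int i = 0;

        if (!n)
                return 0;
        do {
                last = seed;
                if (!last)
                        break;
                seed = (last + n / last) / 2;
        } while (i++ < 10 && last != seed);

        return seed;
}

int main(void)
{
        /* variance 2500 ns^2, average 120 ns: 120 -> 70 -> 52 -> 50 -> 50 */
        printf("std = %llu\n", (unsigned long long)isqrt_newton(2500, 120));
        return 0;
}

With variance 2500 and seed 120 the iteration converges to 50 after a few steps, which is why the kernel loop caps itself at roughly ten rounds.
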
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h new file mode 100644 index 000000000000..3c1df1df4e29 --- /dev/null +++ b/kernel/trace/trace_benchmark.h | |||
| @@ -0,0 +1,41 @@ | |||
| 1 | #undef TRACE_SYSTEM | ||
| 2 | #define TRACE_SYSTEM benchmark | ||
| 3 | |||
| 4 | #if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ) | ||
| 5 | #define _TRACE_BENCHMARK_H | ||
| 6 | |||
| 7 | #include <linux/tracepoint.h> | ||
| 8 | |||
| 9 | extern void trace_benchmark_reg(void); | ||
| 10 | extern void trace_benchmark_unreg(void); | ||
| 11 | |||
| 12 | #define BENCHMARK_EVENT_STRLEN 128 | ||
| 13 | |||
| 14 | TRACE_EVENT_FN(benchmark_event, | ||
| 15 | |||
| 16 | TP_PROTO(const char *str), | ||
| 17 | |||
| 18 | TP_ARGS(str), | ||
| 19 | |||
| 20 | TP_STRUCT__entry( | ||
| 21 | __array( char, str, BENCHMARK_EVENT_STRLEN ) | ||
| 22 | ), | ||
| 23 | |||
| 24 | TP_fast_assign( | ||
| 25 | memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN); | ||
| 26 | ), | ||
| 27 | |||
| 28 | TP_printk("%s", __entry->str), | ||
| 29 | |||
| 30 | trace_benchmark_reg, trace_benchmark_unreg | ||
| 31 | ); | ||
| 32 | |||
| 33 | #endif /* _TRACE_BENCHMARK_H */ | ||
| 34 | |||
| 35 | #undef TRACE_INCLUDE_FILE | ||
| 36 | #undef TRACE_INCLUDE_PATH | ||
| 37 | #define TRACE_INCLUDE_PATH . | ||
| 38 | #define TRACE_INCLUDE_FILE trace_benchmark | ||
| 39 | |||
| 40 | /* This part must be outside protection */ | ||
| 41 | #include <trace/define_trace.h> | ||
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3ddfd8f62c05..f99e0b3bca8c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -574,6 +574,9 @@ int trace_set_clr_event(const char *system, const char *event, int set) | |||
| 574 | { | 574 | { |
| 575 | struct trace_array *tr = top_trace_array(); | 575 | struct trace_array *tr = top_trace_array(); |
| 576 | 576 | ||
| 577 | if (!tr) | ||
| 578 | return -ENODEV; | ||
| 579 | |||
| 577 | return __ftrace_set_clr_event(tr, NULL, system, event, set); | 580 | return __ftrace_set_clr_event(tr, NULL, system, event, set); |
| 578 | } | 581 | } |
| 579 | EXPORT_SYMBOL_GPL(trace_set_clr_event); | 582 | EXPORT_SYMBOL_GPL(trace_set_clr_event); |
| @@ -2065,6 +2068,9 @@ event_enable_func(struct ftrace_hash *hash, | |||
| 2065 | bool enable; | 2068 | bool enable; |
| 2066 | int ret; | 2069 | int ret; |
| 2067 | 2070 | ||
| 2071 | if (!tr) | ||
| 2072 | return -ENODEV; | ||
| 2073 | |||
| 2068 | /* hash funcs only work with set_ftrace_filter */ | 2074 | /* hash funcs only work with set_ftrace_filter */ |
| 2069 | if (!enabled || !param) | 2075 | if (!enabled || !param) |
| 2070 | return -EINVAL; | 2076 | return -EINVAL; |
| @@ -2396,6 +2402,9 @@ static __init int event_trace_enable(void) | |||
| 2396 | char *token; | 2402 | char *token; |
| 2397 | int ret; | 2403 | int ret; |
| 2398 | 2404 | ||
| 2405 | if (!tr) | ||
| 2406 | return -ENODEV; | ||
| 2407 | |||
| 2399 | for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { | 2408 | for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { |
| 2400 | 2409 | ||
| 2401 | call = *iter; | 2410 | call = *iter; |
| @@ -2442,6 +2451,8 @@ static __init int event_trace_init(void) | |||
| 2442 | int ret; | 2451 | int ret; |
| 2443 | 2452 | ||
| 2444 | tr = top_trace_array(); | 2453 | tr = top_trace_array(); |
| 2454 | if (!tr) | ||
| 2455 | return -ENODEV; | ||
| 2445 | 2456 | ||
| 2446 | d_tracer = tracing_init_dentry(); | 2457 | d_tracer = tracing_init_dentry(); |
| 2447 | if (!d_tracer) | 2458 | if (!d_tracer) |
| @@ -2535,6 +2546,8 @@ static __init void event_trace_self_tests(void) | |||
| 2535 | int ret; | 2546 | int ret; |
| 2536 | 2547 | ||
| 2537 | tr = top_trace_array(); | 2548 | tr = top_trace_array(); |
| 2549 | if (!tr) | ||
| 2550 | return; | ||
| 2538 | 2551 | ||
| 2539 | pr_info("Running tests on trace events:\n"); | 2552 | pr_info("Running tests on trace events:\n"); |
| 2540 | 2553 | ||
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 925f537f07d1..4747b476a030 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c | |||
| @@ -77,7 +77,7 @@ event_triggers_call(struct ftrace_event_file *file, void *rec) | |||
| 77 | data->ops->func(data); | 77 | data->ops->func(data); |
| 78 | continue; | 78 | continue; |
| 79 | } | 79 | } |
| 80 | filter = rcu_dereference(data->filter); | 80 | filter = rcu_dereference_sched(data->filter); |
| 81 | if (filter && !filter_match_preds(filter, rec)) | 81 | if (filter && !filter_match_preds(filter, rec)) |
| 82 | continue; | 82 | continue; |
| 83 | if (data->cmd_ops->post_trigger) { | 83 | if (data->cmd_ops->post_trigger) { |
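
The change from rcu_dereference() to rcu_dereference_sched() suggests the trigger list is read under sched-RCU, i.e. with preemption disabled rather than inside rcu_read_lock(), so the sched flavor keeps the lockdep-RCU checks consistent with the callers. A minimal sketch of that read side with a hypothetical filter pointer (not the trigger code itself):

/* Sched-RCU read side: preemption disabled instead of rcu_read_lock(). */
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/preempt.h>
#include <linux/slab.h>

struct demo_filter {
        int threshold;
};

static struct demo_filter __rcu *demo_filter_ptr;

static bool demo_event_passes(int value)
{
        struct demo_filter *f;
        bool ok = true;

        preempt_disable();                      /* sched-RCU read-side critical section */
        f = rcu_dereference_sched(demo_filter_ptr);
        if (f && value < f->threshold)
                ok = false;
        preempt_enable();

        return ok;
}

static int __init demo_init(void)
{
        struct demo_filter *f = kzalloc(sizeof(*f), GFP_KERNEL);

        if (!f)
                return -ENOMEM;
        f->threshold = 10;
        rcu_assign_pointer(demo_filter_ptr, f);

        pr_info("value 5 passes: %d\n", demo_event_passes(5));
        return 0;
}
module_init(demo_init);
MODULE_LICENSE("GPL");

Writers still publish a new filter with rcu_assign_pointer() and can only reclaim the old one after the matching grace period (synchronize_sched() for this flavor).
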
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5b781d2be383..57f0ec962d2c 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
| @@ -26,8 +26,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, | |||
| 26 | static void | 26 | static void |
| 27 | function_stack_trace_call(unsigned long ip, unsigned long parent_ip, | 27 | function_stack_trace_call(unsigned long ip, unsigned long parent_ip, |
| 28 | struct ftrace_ops *op, struct pt_regs *pt_regs); | 28 | struct ftrace_ops *op, struct pt_regs *pt_regs); |
| 29 | static struct ftrace_ops trace_ops; | ||
| 30 | static struct ftrace_ops trace_stack_ops; | ||
| 31 | static struct tracer_flags func_flags; | 29 | static struct tracer_flags func_flags; |
| 32 | 30 | ||
| 33 | /* Our option */ | 31 | /* Our option */ |
| @@ -58,12 +56,16 @@ int ftrace_create_function_files(struct trace_array *tr, | |||
| 58 | { | 56 | { |
| 59 | int ret; | 57 | int ret; |
| 60 | 58 | ||
| 61 | /* The top level array uses the "global_ops". */ | 59 | /* |
| 62 | if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) { | 60 | * The top level array uses the "global_ops", and the files are |
| 63 | ret = allocate_ftrace_ops(tr); | 61 | * created on boot up. |
| 64 | if (ret) | 62 | */ |
| 65 | return ret; | 63 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) |
| 66 | } | 64 | return 0; |
| 65 | |||
| 66 | ret = allocate_ftrace_ops(tr); | ||
| 67 | if (ret) | ||
| 68 | return ret; | ||
| 67 | 69 | ||
| 68 | ftrace_create_filter_files(tr->ops, parent); | 70 | ftrace_create_filter_files(tr->ops, parent); |
| 69 | 71 | ||
| @@ -79,28 +81,24 @@ void ftrace_destroy_function_files(struct trace_array *tr) | |||
| 79 | 81 | ||
| 80 | static int function_trace_init(struct trace_array *tr) | 82 | static int function_trace_init(struct trace_array *tr) |
| 81 | { | 83 | { |
| 82 | struct ftrace_ops *ops; | 84 | ftrace_func_t func; |
| 83 | |||
| 84 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { | ||
| 85 | /* There's only one global tr */ | ||
| 86 | if (!trace_ops.private) { | ||
| 87 | trace_ops.private = tr; | ||
| 88 | trace_stack_ops.private = tr; | ||
| 89 | } | ||
| 90 | 85 | ||
| 91 | if (func_flags.val & TRACE_FUNC_OPT_STACK) | 86 | /* |
| 92 | ops = &trace_stack_ops; | 87 | * Instance trace_arrays get their ops allocated |
| 93 | else | 88 | * at instance creation. Unless it failed |
| 94 | ops = &trace_ops; | 89 | * the allocation. |
| 95 | tr->ops = ops; | 90 | */ |
| 96 | } else if (!tr->ops) { | 91 | if (!tr->ops) |
| 97 | /* | ||
| 98 | * Instance trace_arrays get their ops allocated | ||
| 99 | * at instance creation. Unless it failed | ||
| 100 | * the allocation. | ||
| 101 | */ | ||
| 102 | return -ENOMEM; | 92 | return -ENOMEM; |
| 103 | } | 93 | |
| 94 | /* Currently only the global instance can do stack tracing */ | ||
| 95 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL && | ||
| 96 | func_flags.val & TRACE_FUNC_OPT_STACK) | ||
| 97 | func = function_stack_trace_call; | ||
| 98 | else | ||
| 99 | func = function_trace_call; | ||
| 100 | |||
| 101 | ftrace_init_array_ops(tr, func); | ||
| 104 | 102 | ||
| 105 | tr->trace_buffer.cpu = get_cpu(); | 103 | tr->trace_buffer.cpu = get_cpu(); |
| 106 | put_cpu(); | 104 | put_cpu(); |
| @@ -114,6 +112,7 @@ static void function_trace_reset(struct trace_array *tr) | |||
| 114 | { | 112 | { |
| 115 | tracing_stop_function_trace(tr); | 113 | tracing_stop_function_trace(tr); |
| 116 | tracing_stop_cmdline_record(); | 114 | tracing_stop_cmdline_record(); |
| 115 | ftrace_reset_array_ops(tr); | ||
| 117 | } | 116 | } |
| 118 | 117 | ||
| 119 | static void function_trace_start(struct trace_array *tr) | 118 | static void function_trace_start(struct trace_array *tr) |
| @@ -195,18 +194,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, | |||
| 195 | local_irq_restore(flags); | 194 | local_irq_restore(flags); |
| 196 | } | 195 | } |
| 197 | 196 | ||
| 198 | static struct ftrace_ops trace_ops __read_mostly = | ||
| 199 | { | ||
| 200 | .func = function_trace_call, | ||
| 201 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, | ||
| 202 | }; | ||
| 203 | |||
| 204 | static struct ftrace_ops trace_stack_ops __read_mostly = | ||
| 205 | { | ||
| 206 | .func = function_stack_trace_call, | ||
| 207 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, | ||
| 208 | }; | ||
| 209 | |||
| 210 | static struct tracer_opt func_opts[] = { | 197 | static struct tracer_opt func_opts[] = { |
| 211 | #ifdef CONFIG_STACKTRACE | 198 | #ifdef CONFIG_STACKTRACE |
| 212 | { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, | 199 | { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, |
| @@ -244,10 +231,10 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) | |||
| 244 | unregister_ftrace_function(tr->ops); | 231 | unregister_ftrace_function(tr->ops); |
| 245 | 232 | ||
| 246 | if (set) { | 233 | if (set) { |
| 247 | tr->ops = &trace_stack_ops; | 234 | tr->ops->func = function_stack_trace_call; |
| 248 | register_ftrace_function(tr->ops); | 235 | register_ftrace_function(tr->ops); |
| 249 | } else { | 236 | } else { |
| 250 | tr->ops = &trace_ops; | 237 | tr->ops->func = function_trace_call; |
| 251 | register_ftrace_function(tr->ops); | 238 | register_ftrace_function(tr->ops); |
| 252 | } | 239 | } |
| 253 | 240 | ||
| @@ -265,7 +252,6 @@ static struct tracer function_trace __tracer_data = | |||
| 265 | .init = function_trace_init, | 252 | .init = function_trace_init, |
| 266 | .reset = function_trace_reset, | 253 | .reset = function_trace_reset, |
| 267 | .start = function_trace_start, | 254 | .start = function_trace_start, |
| 268 | .wait_pipe = poll_wait_pipe, | ||
| 269 | .flags = &func_flags, | 255 | .flags = &func_flags, |
| 270 | .set_flag = func_set_flag, | 256 | .set_flag = func_set_flag, |
| 271 | .allow_instances = true, | 257 | .allow_instances = true, |
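
With the static trace_ops and trace_stack_ops gone, each trace_array owns a single ftrace_ops and only its ->func pointer is swapped when the func_stack_trace option changes; func_set_flag() above unregisters tr->ops, flips the function, and registers it again. A sketch of the callback shape such an ops uses, with the owning instance recovered from ->private (hypothetical and simplified, not this patch's function_trace_call()):

/* Hypothetical per-instance ftrace callback: the owning instance is stashed in op->private. */
#include <linux/module.h>
#include <linux/ftrace.h>

static void demo_func_call(unsigned long ip, unsigned long parent_ip,
                           struct ftrace_ops *op, struct pt_regs *pt_regs)
{
        void *instance = op->private;   /* the tracer stores its struct trace_array here */

        if (!instance)
                return;
        /* record ip/parent_ip into the instance's ring buffer here */
}

static struct ftrace_ops demo_ops = {
        .func  = demo_func_call,
        .flags = FTRACE_OPS_FL_RECURSION_SAFE,
        /* .private would be set to the instance before registration */
};

static int __init demo_init(void)
{
        return register_ftrace_function(&demo_ops);
}

static void __exit demo_exit(void)
{
        unregister_ftrace_function(&demo_ops);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Since the ops object itself is reused, any filters attached to it stay in place across the option flip.
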
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index deff11200261..4de3e57f723c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -38,15 +38,6 @@ struct fgraph_data { | |||
| 38 | 38 | ||
| 39 | #define TRACE_GRAPH_INDENT 2 | 39 | #define TRACE_GRAPH_INDENT 2 |
| 40 | 40 | ||
| 41 | /* Flag options */ | ||
| 42 | #define TRACE_GRAPH_PRINT_OVERRUN 0x1 | ||
| 43 | #define TRACE_GRAPH_PRINT_CPU 0x2 | ||
| 44 | #define TRACE_GRAPH_PRINT_OVERHEAD 0x4 | ||
| 45 | #define TRACE_GRAPH_PRINT_PROC 0x8 | ||
| 46 | #define TRACE_GRAPH_PRINT_DURATION 0x10 | ||
| 47 | #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 | ||
| 48 | #define TRACE_GRAPH_PRINT_IRQS 0x40 | ||
| 49 | |||
| 50 | static unsigned int max_depth; | 41 | static unsigned int max_depth; |
| 51 | 42 | ||
| 52 | static struct tracer_opt trace_opts[] = { | 43 | static struct tracer_opt trace_opts[] = { |
| @@ -64,11 +55,13 @@ static struct tracer_opt trace_opts[] = { | |||
| 64 | { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, | 55 | { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, |
| 65 | /* Display interrupts */ | 56 | /* Display interrupts */ |
| 66 | { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, | 57 | { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, |
| 58 | /* Display function name after trailing } */ | ||
| 59 | { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) }, | ||
| 67 | { } /* Empty entry */ | 60 | { } /* Empty entry */ |
| 68 | }; | 61 | }; |
| 69 | 62 | ||
| 70 | static struct tracer_flags tracer_flags = { | 63 | static struct tracer_flags tracer_flags = { |
| 71 | /* Don't display overruns and proc by default */ | 64 | /* Don't display overruns, proc, or tail by default */ |
| 72 | .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | | 65 | .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | |
| 73 | TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, | 66 | TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, |
| 74 | .opts = trace_opts | 67 | .opts = trace_opts |
| @@ -1176,9 +1169,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
| 1176 | * If the return function does not have a matching entry, | 1169 | * If the return function does not have a matching entry, |
| 1177 | * then the entry was lost. Instead of just printing | 1170 | * then the entry was lost. Instead of just printing |
| 1178 | * the '}' and letting the user guess what function this | 1171 | * the '}' and letting the user guess what function this |
| 1179 | * belongs to, write out the function name. | 1172 | * belongs to, write out the function name. Always do |
| 1173 | * that if the funcgraph-tail option is enabled. | ||
| 1180 | */ | 1174 | */ |
| 1181 | if (func_match) { | 1175 | if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) { |
| 1182 | ret = trace_seq_puts(s, "}\n"); | 1176 | ret = trace_seq_puts(s, "}\n"); |
| 1183 | if (!ret) | 1177 | if (!ret) |
| 1184 | return TRACE_TYPE_PARTIAL_LINE; | 1178 | return TRACE_TYPE_PARTIAL_LINE; |
| @@ -1505,7 +1499,6 @@ static struct tracer graph_trace __tracer_data = { | |||
| 1505 | .pipe_open = graph_trace_open, | 1499 | .pipe_open = graph_trace_open, |
| 1506 | .close = graph_trace_close, | 1500 | .close = graph_trace_close, |
| 1507 | .pipe_close = graph_trace_close, | 1501 | .pipe_close = graph_trace_close, |
| 1508 | .wait_pipe = poll_wait_pipe, | ||
| 1509 | .init = graph_trace_init, | 1502 | .init = graph_trace_init, |
| 1510 | .reset = graph_trace_reset, | 1503 | .reset = graph_trace_reset, |
| 1511 | .print_line = print_graph_function, | 1504 | .print_line = print_graph_function, |
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 8ff02cbb892f..9bb104f748d0 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
| @@ -151,12 +151,6 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, | |||
| 151 | 151 | ||
| 152 | atomic_dec(&data->disabled); | 152 | atomic_dec(&data->disabled); |
| 153 | } | 153 | } |
| 154 | |||
| 155 | static struct ftrace_ops trace_ops __read_mostly = | ||
| 156 | { | ||
| 157 | .func = irqsoff_tracer_call, | ||
| 158 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, | ||
| 159 | }; | ||
| 160 | #endif /* CONFIG_FUNCTION_TRACER */ | 154 | #endif /* CONFIG_FUNCTION_TRACER */ |
| 161 | 155 | ||
| 162 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 156 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
| @@ -176,7 +170,7 @@ irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) | |||
| 176 | for_each_possible_cpu(cpu) | 170 | for_each_possible_cpu(cpu) |
| 177 | per_cpu(tracing_cpu, cpu) = 0; | 171 | per_cpu(tracing_cpu, cpu) = 0; |
| 178 | 172 | ||
| 179 | tracing_max_latency = 0; | 173 | tr->max_latency = 0; |
| 180 | tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); | 174 | tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); |
| 181 | 175 | ||
| 182 | return start_irqsoff_tracer(irqsoff_trace, set); | 176 | return start_irqsoff_tracer(irqsoff_trace, set); |
| @@ -303,13 +297,13 @@ static void irqsoff_print_header(struct seq_file *s) | |||
| 303 | /* | 297 | /* |
| 304 | * Should this new latency be reported/recorded? | 298 | * Should this new latency be reported/recorded? |
| 305 | */ | 299 | */ |
| 306 | static int report_latency(cycle_t delta) | 300 | static int report_latency(struct trace_array *tr, cycle_t delta) |
| 307 | { | 301 | { |
| 308 | if (tracing_thresh) { | 302 | if (tracing_thresh) { |
| 309 | if (delta < tracing_thresh) | 303 | if (delta < tracing_thresh) |
| 310 | return 0; | 304 | return 0; |
| 311 | } else { | 305 | } else { |
| 312 | if (delta <= tracing_max_latency) | 306 | if (delta <= tr->max_latency) |
| 313 | return 0; | 307 | return 0; |
| 314 | } | 308 | } |
| 315 | return 1; | 309 | return 1; |
| @@ -333,13 +327,13 @@ check_critical_timing(struct trace_array *tr, | |||
| 333 | 327 | ||
| 334 | pc = preempt_count(); | 328 | pc = preempt_count(); |
| 335 | 329 | ||
| 336 | if (!report_latency(delta)) | 330 | if (!report_latency(tr, delta)) |
| 337 | goto out; | 331 | goto out; |
| 338 | 332 | ||
| 339 | raw_spin_lock_irqsave(&max_trace_lock, flags); | 333 | raw_spin_lock_irqsave(&max_trace_lock, flags); |
| 340 | 334 | ||
| 341 | /* check if we are still the max latency */ | 335 | /* check if we are still the max latency */ |
| 342 | if (!report_latency(delta)) | 336 | if (!report_latency(tr, delta)) |
| 343 | goto out_unlock; | 337 | goto out_unlock; |
| 344 | 338 | ||
| 345 | __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); | 339 | __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); |
| @@ -352,7 +346,7 @@ check_critical_timing(struct trace_array *tr, | |||
| 352 | data->critical_end = parent_ip; | 346 | data->critical_end = parent_ip; |
| 353 | 347 | ||
| 354 | if (likely(!is_tracing_stopped())) { | 348 | if (likely(!is_tracing_stopped())) { |
| 355 | tracing_max_latency = delta; | 349 | tr->max_latency = delta; |
| 356 | update_max_tr_single(tr, current, cpu); | 350 | update_max_tr_single(tr, current, cpu); |
| 357 | } | 351 | } |
| 358 | 352 | ||
| @@ -531,7 +525,7 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) | |||
| 531 | } | 525 | } |
| 532 | #endif /* CONFIG_PREEMPT_TRACER */ | 526 | #endif /* CONFIG_PREEMPT_TRACER */ |
| 533 | 527 | ||
| 534 | static int register_irqsoff_function(int graph, int set) | 528 | static int register_irqsoff_function(struct trace_array *tr, int graph, int set) |
| 535 | { | 529 | { |
| 536 | int ret; | 530 | int ret; |
| 537 | 531 | ||
| @@ -543,7 +537,7 @@ static int register_irqsoff_function(int graph, int set) | |||
| 543 | ret = register_ftrace_graph(&irqsoff_graph_return, | 537 | ret = register_ftrace_graph(&irqsoff_graph_return, |
| 544 | &irqsoff_graph_entry); | 538 | &irqsoff_graph_entry); |
| 545 | else | 539 | else |
| 546 | ret = register_ftrace_function(&trace_ops); | 540 | ret = register_ftrace_function(tr->ops); |
| 547 | 541 | ||
| 548 | if (!ret) | 542 | if (!ret) |
| 549 | function_enabled = true; | 543 | function_enabled = true; |
| @@ -551,7 +545,7 @@ static int register_irqsoff_function(int graph, int set) | |||
| 551 | return ret; | 545 | return ret; |
| 552 | } | 546 | } |
| 553 | 547 | ||
| 554 | static void unregister_irqsoff_function(int graph) | 548 | static void unregister_irqsoff_function(struct trace_array *tr, int graph) |
| 555 | { | 549 | { |
| 556 | if (!function_enabled) | 550 | if (!function_enabled) |
| 557 | return; | 551 | return; |
| @@ -559,17 +553,17 @@ static void unregister_irqsoff_function(int graph) | |||
| 559 | if (graph) | 553 | if (graph) |
| 560 | unregister_ftrace_graph(); | 554 | unregister_ftrace_graph(); |
| 561 | else | 555 | else |
| 562 | unregister_ftrace_function(&trace_ops); | 556 | unregister_ftrace_function(tr->ops); |
| 563 | 557 | ||
| 564 | function_enabled = false; | 558 | function_enabled = false; |
| 565 | } | 559 | } |
| 566 | 560 | ||
| 567 | static void irqsoff_function_set(int set) | 561 | static void irqsoff_function_set(struct trace_array *tr, int set) |
| 568 | { | 562 | { |
| 569 | if (set) | 563 | if (set) |
| 570 | register_irqsoff_function(is_graph(), 1); | 564 | register_irqsoff_function(tr, is_graph(), 1); |
| 571 | else | 565 | else |
| 572 | unregister_irqsoff_function(is_graph()); | 566 | unregister_irqsoff_function(tr, is_graph()); |
| 573 | } | 567 | } |
| 574 | 568 | ||
| 575 | static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) | 569 | static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) |
| @@ -577,7 +571,7 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) | |||
| 577 | struct tracer *tracer = tr->current_trace; | 571 | struct tracer *tracer = tr->current_trace; |
| 578 | 572 | ||
| 579 | if (mask & TRACE_ITER_FUNCTION) | 573 | if (mask & TRACE_ITER_FUNCTION) |
| 580 | irqsoff_function_set(set); | 574 | irqsoff_function_set(tr, set); |
| 581 | 575 | ||
| 582 | return trace_keep_overwrite(tracer, mask, set); | 576 | return trace_keep_overwrite(tracer, mask, set); |
| 583 | } | 577 | } |
| @@ -586,7 +580,7 @@ static int start_irqsoff_tracer(struct trace_array *tr, int graph) | |||
| 586 | { | 580 | { |
| 587 | int ret; | 581 | int ret; |
| 588 | 582 | ||
| 589 | ret = register_irqsoff_function(graph, 0); | 583 | ret = register_irqsoff_function(tr, graph, 0); |
| 590 | 584 | ||
| 591 | if (!ret && tracing_is_enabled()) | 585 | if (!ret && tracing_is_enabled()) |
| 592 | tracer_enabled = 1; | 586 | tracer_enabled = 1; |
| @@ -600,25 +594,37 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph) | |||
| 600 | { | 594 | { |
| 601 | tracer_enabled = 0; | 595 | tracer_enabled = 0; |
| 602 | 596 | ||
| 603 | unregister_irqsoff_function(graph); | 597 | unregister_irqsoff_function(tr, graph); |
| 604 | } | 598 | } |
| 605 | 599 | ||
| 606 | static void __irqsoff_tracer_init(struct trace_array *tr) | 600 | static bool irqsoff_busy; |
| 601 | |||
| 602 | static int __irqsoff_tracer_init(struct trace_array *tr) | ||
| 607 | { | 603 | { |
| 604 | if (irqsoff_busy) | ||
| 605 | return -EBUSY; | ||
| 606 | |||
| 608 | save_flags = trace_flags; | 607 | save_flags = trace_flags; |
| 609 | 608 | ||
| 610 | /* non overwrite screws up the latency tracers */ | 609 | /* non overwrite screws up the latency tracers */ |
| 611 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); | 610 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); |
| 612 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); | 611 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); |
| 613 | 612 | ||
| 614 | tracing_max_latency = 0; | 613 | tr->max_latency = 0; |
| 615 | irqsoff_trace = tr; | 614 | irqsoff_trace = tr; |
| 616 | /* make sure that the tracer is visible */ | 615 | /* make sure that the tracer is visible */ |
| 617 | smp_wmb(); | 616 | smp_wmb(); |
| 618 | tracing_reset_online_cpus(&tr->trace_buffer); | 617 | tracing_reset_online_cpus(&tr->trace_buffer); |
| 619 | 618 | ||
| 620 | if (start_irqsoff_tracer(tr, is_graph())) | 619 | ftrace_init_array_ops(tr, irqsoff_tracer_call); |
| 620 | |||
| 621 | /* Only toplevel instance supports graph tracing */ | ||
| 622 | if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL && | ||
| 623 | is_graph()))) | ||
| 621 | printk(KERN_ERR "failed to start irqsoff tracer\n"); | 624 | printk(KERN_ERR "failed to start irqsoff tracer\n"); |
| 625 | |||
| 626 | irqsoff_busy = true; | ||
| 627 | return 0; | ||
| 622 | } | 628 | } |
| 623 | 629 | ||
| 624 | static void irqsoff_tracer_reset(struct trace_array *tr) | 630 | static void irqsoff_tracer_reset(struct trace_array *tr) |
| @@ -630,6 +636,9 @@ static void irqsoff_tracer_reset(struct trace_array *tr) | |||
| 630 | 636 | ||
| 631 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); | 637 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); |
| 632 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); | 638 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); |
| 639 | ftrace_reset_array_ops(tr); | ||
| 640 | |||
| 641 | irqsoff_busy = false; | ||
| 633 | } | 642 | } |
| 634 | 643 | ||
| 635 | static void irqsoff_tracer_start(struct trace_array *tr) | 644 | static void irqsoff_tracer_start(struct trace_array *tr) |
| @@ -647,8 +656,7 @@ static int irqsoff_tracer_init(struct trace_array *tr) | |||
| 647 | { | 656 | { |
| 648 | trace_type = TRACER_IRQS_OFF; | 657 | trace_type = TRACER_IRQS_OFF; |
| 649 | 658 | ||
| 650 | __irqsoff_tracer_init(tr); | 659 | return __irqsoff_tracer_init(tr); |
| 651 | return 0; | ||
| 652 | } | 660 | } |
| 653 | static struct tracer irqsoff_tracer __read_mostly = | 661 | static struct tracer irqsoff_tracer __read_mostly = |
| 654 | { | 662 | { |
| @@ -668,6 +676,7 @@ static struct tracer irqsoff_tracer __read_mostly = | |||
| 668 | #endif | 676 | #endif |
| 669 | .open = irqsoff_trace_open, | 677 | .open = irqsoff_trace_open, |
| 670 | .close = irqsoff_trace_close, | 678 | .close = irqsoff_trace_close, |
| 679 | .allow_instances = true, | ||
| 671 | .use_max_tr = true, | 680 | .use_max_tr = true, |
| 672 | }; | 681 | }; |
| 673 | # define register_irqsoff(trace) register_tracer(&trace) | 682 | # define register_irqsoff(trace) register_tracer(&trace) |
| @@ -680,8 +689,7 @@ static int preemptoff_tracer_init(struct trace_array *tr) | |||
| 680 | { | 689 | { |
| 681 | trace_type = TRACER_PREEMPT_OFF; | 690 | trace_type = TRACER_PREEMPT_OFF; |
| 682 | 691 | ||
| 683 | __irqsoff_tracer_init(tr); | 692 | return __irqsoff_tracer_init(tr); |
| 684 | return 0; | ||
| 685 | } | 693 | } |
| 686 | 694 | ||
| 687 | static struct tracer preemptoff_tracer __read_mostly = | 695 | static struct tracer preemptoff_tracer __read_mostly = |
| @@ -702,6 +710,7 @@ static struct tracer preemptoff_tracer __read_mostly = | |||
| 702 | #endif | 710 | #endif |
| 703 | .open = irqsoff_trace_open, | 711 | .open = irqsoff_trace_open, |
| 704 | .close = irqsoff_trace_close, | 712 | .close = irqsoff_trace_close, |
| 713 | .allow_instances = true, | ||
| 705 | .use_max_tr = true, | 714 | .use_max_tr = true, |
| 706 | }; | 715 | }; |
| 707 | # define register_preemptoff(trace) register_tracer(&trace) | 716 | # define register_preemptoff(trace) register_tracer(&trace) |
| @@ -716,8 +725,7 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr) | |||
| 716 | { | 725 | { |
| 717 | trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; | 726 | trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; |
| 718 | 727 | ||
| 719 | __irqsoff_tracer_init(tr); | 728 | return __irqsoff_tracer_init(tr); |
| 720 | return 0; | ||
| 721 | } | 729 | } |
| 722 | 730 | ||
| 723 | static struct tracer preemptirqsoff_tracer __read_mostly = | 731 | static struct tracer preemptirqsoff_tracer __read_mostly = |
| @@ -738,6 +746,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = | |||
| 738 | #endif | 746 | #endif |
| 739 | .open = irqsoff_trace_open, | 747 | .open = irqsoff_trace_open, |
| 740 | .close = irqsoff_trace_close, | 748 | .close = irqsoff_trace_close, |
| 749 | .allow_instances = true, | ||
| 741 | .use_max_tr = true, | 750 | .use_max_tr = true, |
| 742 | }; | 751 | }; |
| 743 | 752 | ||
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 903ae28962be..ef2fba1f46b5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -1377,6 +1377,9 @@ static __init int kprobe_trace_self_tests_init(void) | |||
| 1377 | struct trace_kprobe *tk; | 1377 | struct trace_kprobe *tk; |
| 1378 | struct ftrace_event_file *file; | 1378 | struct ftrace_event_file *file; |
| 1379 | 1379 | ||
| 1380 | if (tracing_is_disabled()) | ||
| 1381 | return -ENODEV; | ||
| 1382 | |||
| 1380 | target = kprobe_trace_selftest_target; | 1383 | target = kprobe_trace_selftest_target; |
| 1381 | 1384 | ||
| 1382 | pr_info("Testing kprobe tracing: "); | 1385 | pr_info("Testing kprobe tracing: "); |
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 69a5cc94c01a..fcf0a9e48916 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c | |||
| @@ -91,7 +91,6 @@ struct tracer nop_trace __read_mostly = | |||
| 91 | .name = "nop", | 91 | .name = "nop", |
| 92 | .init = nop_trace_init, | 92 | .init = nop_trace_init, |
| 93 | .reset = nop_trace_reset, | 93 | .reset = nop_trace_reset, |
| 94 | .wait_pipe = poll_wait_pipe, | ||
| 95 | #ifdef CONFIG_FTRACE_SELFTEST | 94 | #ifdef CONFIG_FTRACE_SELFTEST |
| 96 | .selftest = trace_selftest_startup_nop, | 95 | .selftest = trace_selftest_startup_nop, |
| 97 | #endif | 96 | #endif |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a436de18aa99..f3dad80c20b2 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
| @@ -126,6 +126,34 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) | |||
| 126 | EXPORT_SYMBOL_GPL(trace_seq_printf); | 126 | EXPORT_SYMBOL_GPL(trace_seq_printf); |
| 127 | 127 | ||
| 128 | /** | 128 | /** |
| 129 | * trace_seq_bitmask - put a list of longs as a bitmask print output | ||
| 130 | * @s: trace sequence descriptor | ||
| 131 | * @maskp: points to an array of unsigned longs that represent a bitmask | ||
| 132 | * @nmaskbits: The number of bits that are valid in @maskp | ||
| 133 | * | ||
| 134 | * Returns 0 if the output would exceed the buffer's free | ||
| 135 | * space, 1 otherwise. | ||
| 136 | * | ||
| 137 | * Writes an ASCII representation of the bitmask into @s. | ||
| 138 | */ | ||
| 139 | int | ||
| 140 | trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, | ||
| 141 | int nmaskbits) | ||
| 142 | { | ||
| 143 | int len = (PAGE_SIZE - 1) - s->len; | ||
| 144 | int ret; | ||
| 145 | |||
| 146 | if (s->full || !len) | ||
| 147 | return 0; | ||
| 148 | |||
| 149 | ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); | ||
| 150 | s->len += ret; | ||
| 151 | |||
| 152 | return 1; | ||
| 153 | } | ||
| 154 | EXPORT_SYMBOL_GPL(trace_seq_bitmask); | ||
| 155 | |||
| 156 | /** | ||
| 129 | * trace_seq_vprintf - sequence printing of trace information | 157 | * trace_seq_vprintf - sequence printing of trace information |
| 130 | * @s: trace sequence descriptor | 158 | * @s: trace sequence descriptor |
| 131 | * @fmt: printf format string | 159 | * @fmt: printf format string |
| @@ -399,6 +427,19 @@ EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); | |||
| 399 | #endif | 427 | #endif |
| 400 | 428 | ||
| 401 | const char * | 429 | const char * |
| 430 | ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, | ||
| 431 | unsigned int bitmask_size) | ||
| 432 | { | ||
| 433 | const char *ret = p->buffer + p->len; | ||
| 434 | |||
| 435 | trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); | ||
| 436 | trace_seq_putc(p, 0); | ||
| 437 | |||
| 438 | return ret; | ||
| 439 | } | ||
| 440 | EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq); | ||
| 441 | |||
| 442 | const char * | ||
| 402 | ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) | 443 | ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) |
| 403 | { | 444 | { |
| 404 | int i; | 445 | int i; |
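
The trace_seq_bitmask()/ftrace_print_bitmask_seq() helpers added above append a bitmap of nmaskbits valid bits to the bounded sequence buffer and refuse to write once the buffer is full. Below is a minimal userspace sketch of that bounded-append idea; struct seq, seq_printf() and the set-bit-position output are illustrative stand-ins, not the kernel API (the kernel helper emits bitmap_scnprintf()'s hex form instead).

#include <stdio.h>

#define SEQ_SIZE 4096
#define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

struct seq {                        /* stand-in for struct trace_seq */
    char buf[SEQ_SIZE];
    unsigned int len;
    int full;                       /* set once the buffer would overflow */
};

/* Append one formatted number; on overflow, mark the buffer full and
 * keep the previously written contents (roughly how trace_seq behaves). */
static int seq_printf(struct seq *s, const char *fmt, unsigned long val)
{
    int avail = (int)sizeof(s->buf) - 1 - (int)s->len;
    int ret;

    if (s->full || avail <= 0)
        return 0;
    ret = snprintf(s->buf + s->len, (size_t)avail + 1, fmt, val);
    if (ret < 0 || ret > avail) {
        s->buf[s->len] = '\0';      /* drop the truncated tail */
        s->full = 1;
        return 0;
    }
    s->len += (unsigned int)ret;
    return 1;
}

/* Append the positions of the set bits in maskp[0..nmaskbits) to @s,
 * e.g. "0,3,65". Returns 1 if everything fit, 0 otherwise. */
static int seq_bitmask(struct seq *s, const unsigned long *maskp, int nmaskbits)
{
    int bit, first = 1;

    for (bit = 0; bit < nmaskbits; bit++) {
        if (!(maskp[bit / BITS_PER_LONG] & (1UL << (bit % BITS_PER_LONG))))
            continue;
        if (!seq_printf(s, first ? "%lu" : ",%lu", (unsigned long)bit))
            return 0;
        first = 0;
    }
    return 1;
}

int main(void)
{
    unsigned long mask[2] = { (1UL << 0) | (1UL << 3), 1UL << 1 };
    struct seq s = { .len = 0 };

    seq_bitmask(&s, mask, 2 * BITS_PER_LONG);
    printf("%s\n", s.buf);          /* prints "0,3,65" on a 64-bit box */
    return 0;
}
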
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e14da5e97a69..19bd8928ce94 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
| @@ -130,15 +130,9 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, | |||
| 130 | atomic_dec(&data->disabled); | 130 | atomic_dec(&data->disabled); |
| 131 | preempt_enable_notrace(); | 131 | preempt_enable_notrace(); |
| 132 | } | 132 | } |
| 133 | |||
| 134 | static struct ftrace_ops trace_ops __read_mostly = | ||
| 135 | { | ||
| 136 | .func = wakeup_tracer_call, | ||
| 137 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, | ||
| 138 | }; | ||
| 139 | #endif /* CONFIG_FUNCTION_TRACER */ | 133 | #endif /* CONFIG_FUNCTION_TRACER */ |
| 140 | 134 | ||
| 141 | static int register_wakeup_function(int graph, int set) | 135 | static int register_wakeup_function(struct trace_array *tr, int graph, int set) |
| 142 | { | 136 | { |
| 143 | int ret; | 137 | int ret; |
| 144 | 138 | ||
| @@ -150,7 +144,7 @@ static int register_wakeup_function(int graph, int set) | |||
| 150 | ret = register_ftrace_graph(&wakeup_graph_return, | 144 | ret = register_ftrace_graph(&wakeup_graph_return, |
| 151 | &wakeup_graph_entry); | 145 | &wakeup_graph_entry); |
| 152 | else | 146 | else |
| 153 | ret = register_ftrace_function(&trace_ops); | 147 | ret = register_ftrace_function(tr->ops); |
| 154 | 148 | ||
| 155 | if (!ret) | 149 | if (!ret) |
| 156 | function_enabled = true; | 150 | function_enabled = true; |
| @@ -158,7 +152,7 @@ static int register_wakeup_function(int graph, int set) | |||
| 158 | return ret; | 152 | return ret; |
| 159 | } | 153 | } |
| 160 | 154 | ||
| 161 | static void unregister_wakeup_function(int graph) | 155 | static void unregister_wakeup_function(struct trace_array *tr, int graph) |
| 162 | { | 156 | { |
| 163 | if (!function_enabled) | 157 | if (!function_enabled) |
| 164 | return; | 158 | return; |
| @@ -166,17 +160,17 @@ static void unregister_wakeup_function(int graph) | |||
| 166 | if (graph) | 160 | if (graph) |
| 167 | unregister_ftrace_graph(); | 161 | unregister_ftrace_graph(); |
| 168 | else | 162 | else |
| 169 | unregister_ftrace_function(&trace_ops); | 163 | unregister_ftrace_function(tr->ops); |
| 170 | 164 | ||
| 171 | function_enabled = false; | 165 | function_enabled = false; |
| 172 | } | 166 | } |
| 173 | 167 | ||
| 174 | static void wakeup_function_set(int set) | 168 | static void wakeup_function_set(struct trace_array *tr, int set) |
| 175 | { | 169 | { |
| 176 | if (set) | 170 | if (set) |
| 177 | register_wakeup_function(is_graph(), 1); | 171 | register_wakeup_function(tr, is_graph(), 1); |
| 178 | else | 172 | else |
| 179 | unregister_wakeup_function(is_graph()); | 173 | unregister_wakeup_function(tr, is_graph()); |
| 180 | } | 174 | } |
| 181 | 175 | ||
| 182 | static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) | 176 | static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) |
| @@ -184,16 +178,16 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) | |||
| 184 | struct tracer *tracer = tr->current_trace; | 178 | struct tracer *tracer = tr->current_trace; |
| 185 | 179 | ||
| 186 | if (mask & TRACE_ITER_FUNCTION) | 180 | if (mask & TRACE_ITER_FUNCTION) |
| 187 | wakeup_function_set(set); | 181 | wakeup_function_set(tr, set); |
| 188 | 182 | ||
| 189 | return trace_keep_overwrite(tracer, mask, set); | 183 | return trace_keep_overwrite(tracer, mask, set); |
| 190 | } | 184 | } |
| 191 | 185 | ||
| 192 | static int start_func_tracer(int graph) | 186 | static int start_func_tracer(struct trace_array *tr, int graph) |
| 193 | { | 187 | { |
| 194 | int ret; | 188 | int ret; |
| 195 | 189 | ||
| 196 | ret = register_wakeup_function(graph, 0); | 190 | ret = register_wakeup_function(tr, graph, 0); |
| 197 | 191 | ||
| 198 | if (!ret && tracing_is_enabled()) | 192 | if (!ret && tracing_is_enabled()) |
| 199 | tracer_enabled = 1; | 193 | tracer_enabled = 1; |
| @@ -203,11 +197,11 @@ static int start_func_tracer(int graph) | |||
| 203 | return ret; | 197 | return ret; |
| 204 | } | 198 | } |
| 205 | 199 | ||
| 206 | static void stop_func_tracer(int graph) | 200 | static void stop_func_tracer(struct trace_array *tr, int graph) |
| 207 | { | 201 | { |
| 208 | tracer_enabled = 0; | 202 | tracer_enabled = 0; |
| 209 | 203 | ||
| 210 | unregister_wakeup_function(graph); | 204 | unregister_wakeup_function(tr, graph); |
| 211 | } | 205 | } |
| 212 | 206 | ||
| 213 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 207 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
| @@ -221,12 +215,12 @@ wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) | |||
| 221 | if (!(is_graph() ^ set)) | 215 | if (!(is_graph() ^ set)) |
| 222 | return 0; | 216 | return 0; |
| 223 | 217 | ||
| 224 | stop_func_tracer(!set); | 218 | stop_func_tracer(tr, !set); |
| 225 | 219 | ||
| 226 | wakeup_reset(wakeup_trace); | 220 | wakeup_reset(wakeup_trace); |
| 227 | tracing_max_latency = 0; | 221 | tr->max_latency = 0; |
| 228 | 222 | ||
| 229 | return start_func_tracer(set); | 223 | return start_func_tracer(tr, set); |
| 230 | } | 224 | } |
| 231 | 225 | ||
| 232 | static int wakeup_graph_entry(struct ftrace_graph_ent *trace) | 226 | static int wakeup_graph_entry(struct ftrace_graph_ent *trace) |
| @@ -350,13 +344,13 @@ static void wakeup_print_header(struct seq_file *s) | |||
| 350 | /* | 344 | /* |
| 351 | * Should this new latency be reported/recorded? | 345 | * Should this new latency be reported/recorded? |
| 352 | */ | 346 | */ |
| 353 | static int report_latency(cycle_t delta) | 347 | static int report_latency(struct trace_array *tr, cycle_t delta) |
| 354 | { | 348 | { |
| 355 | if (tracing_thresh) { | 349 | if (tracing_thresh) { |
| 356 | if (delta < tracing_thresh) | 350 | if (delta < tracing_thresh) |
| 357 | return 0; | 351 | return 0; |
| 358 | } else { | 352 | } else { |
| 359 | if (delta <= tracing_max_latency) | 353 | if (delta <= tr->max_latency) |
| 360 | return 0; | 354 | return 0; |
| 361 | } | 355 | } |
| 362 | return 1; | 356 | return 1; |
| @@ -424,11 +418,11 @@ probe_wakeup_sched_switch(void *ignore, | |||
| 424 | T1 = ftrace_now(cpu); | 418 | T1 = ftrace_now(cpu); |
| 425 | delta = T1-T0; | 419 | delta = T1-T0; |
| 426 | 420 | ||
| 427 | if (!report_latency(delta)) | 421 | if (!report_latency(wakeup_trace, delta)) |
| 428 | goto out_unlock; | 422 | goto out_unlock; |
| 429 | 423 | ||
| 430 | if (likely(!is_tracing_stopped())) { | 424 | if (likely(!is_tracing_stopped())) { |
| 431 | tracing_max_latency = delta; | 425 | wakeup_trace->max_latency = delta; |
| 432 | update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); | 426 | update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); |
| 433 | } | 427 | } |
| 434 | 428 | ||
| @@ -587,7 +581,7 @@ static void start_wakeup_tracer(struct trace_array *tr) | |||
| 587 | */ | 581 | */ |
| 588 | smp_wmb(); | 582 | smp_wmb(); |
| 589 | 583 | ||
| 590 | if (start_func_tracer(is_graph())) | 584 | if (start_func_tracer(tr, is_graph())) |
| 591 | printk(KERN_ERR "failed to start wakeup tracer\n"); | 585 | printk(KERN_ERR "failed to start wakeup tracer\n"); |
| 592 | 586 | ||
| 593 | return; | 587 | return; |
| @@ -600,13 +594,15 @@ fail_deprobe: | |||
| 600 | static void stop_wakeup_tracer(struct trace_array *tr) | 594 | static void stop_wakeup_tracer(struct trace_array *tr) |
| 601 | { | 595 | { |
| 602 | tracer_enabled = 0; | 596 | tracer_enabled = 0; |
| 603 | stop_func_tracer(is_graph()); | 597 | stop_func_tracer(tr, is_graph()); |
| 604 | unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); | 598 | unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); |
| 605 | unregister_trace_sched_wakeup_new(probe_wakeup, NULL); | 599 | unregister_trace_sched_wakeup_new(probe_wakeup, NULL); |
| 606 | unregister_trace_sched_wakeup(probe_wakeup, NULL); | 600 | unregister_trace_sched_wakeup(probe_wakeup, NULL); |
| 607 | unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); | 601 | unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); |
| 608 | } | 602 | } |
| 609 | 603 | ||
| 604 | static bool wakeup_busy; | ||
| 605 | |||
| 610 | static int __wakeup_tracer_init(struct trace_array *tr) | 606 | static int __wakeup_tracer_init(struct trace_array *tr) |
| 611 | { | 607 | { |
| 612 | save_flags = trace_flags; | 608 | save_flags = trace_flags; |
| @@ -615,14 +611,20 @@ static int __wakeup_tracer_init(struct trace_array *tr) | |||
| 615 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); | 611 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); |
| 616 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); | 612 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); |
| 617 | 613 | ||
| 618 | tracing_max_latency = 0; | 614 | tr->max_latency = 0; |
| 619 | wakeup_trace = tr; | 615 | wakeup_trace = tr; |
| 616 | ftrace_init_array_ops(tr, wakeup_tracer_call); | ||
| 620 | start_wakeup_tracer(tr); | 617 | start_wakeup_tracer(tr); |
| 618 | |||
| 619 | wakeup_busy = true; | ||
| 621 | return 0; | 620 | return 0; |
| 622 | } | 621 | } |
| 623 | 622 | ||
| 624 | static int wakeup_tracer_init(struct trace_array *tr) | 623 | static int wakeup_tracer_init(struct trace_array *tr) |
| 625 | { | 624 | { |
| 625 | if (wakeup_busy) | ||
| 626 | return -EBUSY; | ||
| 627 | |||
| 626 | wakeup_dl = 0; | 628 | wakeup_dl = 0; |
| 627 | wakeup_rt = 0; | 629 | wakeup_rt = 0; |
| 628 | return __wakeup_tracer_init(tr); | 630 | return __wakeup_tracer_init(tr); |
| @@ -630,6 +632,9 @@ static int wakeup_tracer_init(struct trace_array *tr) | |||
| 630 | 632 | ||
| 631 | static int wakeup_rt_tracer_init(struct trace_array *tr) | 633 | static int wakeup_rt_tracer_init(struct trace_array *tr) |
| 632 | { | 634 | { |
| 635 | if (wakeup_busy) | ||
| 636 | return -EBUSY; | ||
| 637 | |||
| 633 | wakeup_dl = 0; | 638 | wakeup_dl = 0; |
| 634 | wakeup_rt = 1; | 639 | wakeup_rt = 1; |
| 635 | return __wakeup_tracer_init(tr); | 640 | return __wakeup_tracer_init(tr); |
| @@ -637,6 +642,9 @@ static int wakeup_rt_tracer_init(struct trace_array *tr) | |||
| 637 | 642 | ||
| 638 | static int wakeup_dl_tracer_init(struct trace_array *tr) | 643 | static int wakeup_dl_tracer_init(struct trace_array *tr) |
| 639 | { | 644 | { |
| 645 | if (wakeup_busy) | ||
| 646 | return -EBUSY; | ||
| 647 | |||
| 640 | wakeup_dl = 1; | 648 | wakeup_dl = 1; |
| 641 | wakeup_rt = 0; | 649 | wakeup_rt = 0; |
| 642 | return __wakeup_tracer_init(tr); | 650 | return __wakeup_tracer_init(tr); |
| @@ -653,6 +661,8 @@ static void wakeup_tracer_reset(struct trace_array *tr) | |||
| 653 | 661 | ||
| 654 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); | 662 | set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); |
| 655 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); | 663 | set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); |
| 664 | ftrace_reset_array_ops(tr); | ||
| 665 | wakeup_busy = false; | ||
| 656 | } | 666 | } |
| 657 | 667 | ||
| 658 | static void wakeup_tracer_start(struct trace_array *tr) | 668 | static void wakeup_tracer_start(struct trace_array *tr) |
| @@ -684,6 +694,7 @@ static struct tracer wakeup_tracer __read_mostly = | |||
| 684 | #endif | 694 | #endif |
| 685 | .open = wakeup_trace_open, | 695 | .open = wakeup_trace_open, |
| 686 | .close = wakeup_trace_close, | 696 | .close = wakeup_trace_close, |
| 697 | .allow_instances = true, | ||
| 687 | .use_max_tr = true, | 698 | .use_max_tr = true, |
| 688 | }; | 699 | }; |
| 689 | 700 | ||
| @@ -694,7 +705,6 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
| 694 | .reset = wakeup_tracer_reset, | 705 | .reset = wakeup_tracer_reset, |
| 695 | .start = wakeup_tracer_start, | 706 | .start = wakeup_tracer_start, |
| 696 | .stop = wakeup_tracer_stop, | 707 | .stop = wakeup_tracer_stop, |
| 697 | .wait_pipe = poll_wait_pipe, | ||
| 698 | .print_max = true, | 708 | .print_max = true, |
| 699 | .print_header = wakeup_print_header, | 709 | .print_header = wakeup_print_header, |
| 700 | .print_line = wakeup_print_line, | 710 | .print_line = wakeup_print_line, |
| @@ -706,6 +716,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = | |||
| 706 | #endif | 716 | #endif |
| 707 | .open = wakeup_trace_open, | 717 | .open = wakeup_trace_open, |
| 708 | .close = wakeup_trace_close, | 718 | .close = wakeup_trace_close, |
| 719 | .allow_instances = true, | ||
| 709 | .use_max_tr = true, | 720 | .use_max_tr = true, |
| 710 | }; | 721 | }; |
| 711 | 722 | ||
| @@ -716,7 +727,6 @@ static struct tracer wakeup_dl_tracer __read_mostly = | |||
| 716 | .reset = wakeup_tracer_reset, | 727 | .reset = wakeup_tracer_reset, |
| 717 | .start = wakeup_tracer_start, | 728 | .start = wakeup_tracer_start, |
| 718 | .stop = wakeup_tracer_stop, | 729 | .stop = wakeup_tracer_stop, |
| 719 | .wait_pipe = poll_wait_pipe, | ||
| 720 | .print_max = true, | 730 | .print_max = true, |
| 721 | .print_header = wakeup_print_header, | 731 | .print_header = wakeup_print_header, |
| 722 | .print_line = wakeup_print_line, | 732 | .print_line = wakeup_print_line, |
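
The wakeup tracers above, like the irqsoff family earlier, now keep their latency state per trace_array (tr->max_latency, tr->ops via ftrace_init_array_ops()) and refuse to run more than one live instance, returning -EBUSY while wakeup_busy is set. A rough sketch of the threshold check and the single-instance guard follows; struct trace_instance and the helper names are invented for the example.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct trace_instance {                 /* stand-in for struct trace_array */
    unsigned long long max_latency;     /* worst latency seen so far */
    unsigned long long thresh;          /* 0 means "track the maximum" */
};

static bool wakeup_busy;                /* only one live instance allowed */

/* Mirror of report_latency(): with a threshold, report anything at or
 * above it; without one, report only new maxima. */
static int report_latency(struct trace_instance *tr, unsigned long long delta)
{
    if (tr->thresh)
        return delta >= tr->thresh;
    return delta > tr->max_latency;
}

static int tracer_init(struct trace_instance *tr)
{
    if (wakeup_busy)
        return -EBUSY;                  /* a second instance is rejected */
    tr->max_latency = 0;
    wakeup_busy = true;
    return 0;
}

static void tracer_reset(struct trace_instance *tr)
{
    (void)tr;
    wakeup_busy = false;                /* let the next instance start */
}

int main(void)
{
    struct trace_instance a = { 0, 0 }, b = { 0, 0 };
    unsigned long long samples[] = { 120, 80, 400, 350 };

    printf("init a: %d\n", tracer_init(&a));        /* 0 */
    printf("init b: %d\n", tracer_init(&b));        /* -16 (-EBUSY) */

    for (unsigned i = 0; i < 4; i++)
        if (report_latency(&a, samples[i]))
            a.max_latency = samples[i];             /* record new maximum */

    printf("max latency: %llu\n", a.max_latency);   /* 400 */
    tracer_reset(&a);
    return 0;
}

Compared with the old global tracing_max_latency, keeping the maximum in the trace_array is what lets the selftests below save and restore tr->max_latency per instance.
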
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index e98fca60974f..5ef60499dc8e 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
| @@ -65,7 +65,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) | |||
| 65 | 65 | ||
| 66 | /* Don't allow flipping of max traces now */ | 66 | /* Don't allow flipping of max traces now */ |
| 67 | local_irq_save(flags); | 67 | local_irq_save(flags); |
| 68 | arch_spin_lock(&ftrace_max_lock); | 68 | arch_spin_lock(&buf->tr->max_lock); |
| 69 | 69 | ||
| 70 | cnt = ring_buffer_entries(buf->buffer); | 70 | cnt = ring_buffer_entries(buf->buffer); |
| 71 | 71 | ||
| @@ -83,7 +83,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) | |||
| 83 | break; | 83 | break; |
| 84 | } | 84 | } |
| 85 | tracing_on(); | 85 | tracing_on(); |
| 86 | arch_spin_unlock(&ftrace_max_lock); | 86 | arch_spin_unlock(&buf->tr->max_lock); |
| 87 | local_irq_restore(flags); | 87 | local_irq_restore(flags); |
| 88 | 88 | ||
| 89 | if (count) | 89 | if (count) |
| @@ -161,11 +161,6 @@ static struct ftrace_ops test_probe3 = { | |||
| 161 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | 161 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, |
| 162 | }; | 162 | }; |
| 163 | 163 | ||
| 164 | static struct ftrace_ops test_global = { | ||
| 165 | .func = trace_selftest_test_global_func, | ||
| 166 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, | ||
| 167 | }; | ||
| 168 | |||
| 169 | static void print_counts(void) | 164 | static void print_counts(void) |
| 170 | { | 165 | { |
| 171 | printk("(%d %d %d %d %d) ", | 166 | printk("(%d %d %d %d %d) ", |
| @@ -185,7 +180,7 @@ static void reset_counts(void) | |||
| 185 | trace_selftest_test_dyn_cnt = 0; | 180 | trace_selftest_test_dyn_cnt = 0; |
| 186 | } | 181 | } |
| 187 | 182 | ||
| 188 | static int trace_selftest_ops(int cnt) | 183 | static int trace_selftest_ops(struct trace_array *tr, int cnt) |
| 189 | { | 184 | { |
| 190 | int save_ftrace_enabled = ftrace_enabled; | 185 | int save_ftrace_enabled = ftrace_enabled; |
| 191 | struct ftrace_ops *dyn_ops; | 186 | struct ftrace_ops *dyn_ops; |
| @@ -220,7 +215,11 @@ static int trace_selftest_ops(int cnt) | |||
| 220 | register_ftrace_function(&test_probe1); | 215 | register_ftrace_function(&test_probe1); |
| 221 | register_ftrace_function(&test_probe2); | 216 | register_ftrace_function(&test_probe2); |
| 222 | register_ftrace_function(&test_probe3); | 217 | register_ftrace_function(&test_probe3); |
| 223 | register_ftrace_function(&test_global); | 218 | /* First time we are running with main function */ |
| 219 | if (cnt > 1) { | ||
| 220 | ftrace_init_array_ops(tr, trace_selftest_test_global_func); | ||
| 221 | register_ftrace_function(tr->ops); | ||
| 222 | } | ||
| 224 | 223 | ||
| 225 | DYN_FTRACE_TEST_NAME(); | 224 | DYN_FTRACE_TEST_NAME(); |
| 226 | 225 | ||
| @@ -232,8 +231,10 @@ static int trace_selftest_ops(int cnt) | |||
| 232 | goto out; | 231 | goto out; |
| 233 | if (trace_selftest_test_probe3_cnt != 1) | 232 | if (trace_selftest_test_probe3_cnt != 1) |
| 234 | goto out; | 233 | goto out; |
| 235 | if (trace_selftest_test_global_cnt == 0) | 234 | if (cnt > 1) { |
| 236 | goto out; | 235 | if (trace_selftest_test_global_cnt == 0) |
| 236 | goto out; | ||
| 237 | } | ||
| 237 | 238 | ||
| 238 | DYN_FTRACE_TEST_NAME2(); | 239 | DYN_FTRACE_TEST_NAME2(); |
| 239 | 240 | ||
| @@ -269,8 +270,10 @@ static int trace_selftest_ops(int cnt) | |||
| 269 | goto out_free; | 270 | goto out_free; |
| 270 | if (trace_selftest_test_probe3_cnt != 3) | 271 | if (trace_selftest_test_probe3_cnt != 3) |
| 271 | goto out_free; | 272 | goto out_free; |
| 272 | if (trace_selftest_test_global_cnt == 0) | 273 | if (cnt > 1) { |
| 273 | goto out; | 274 | if (trace_selftest_test_global_cnt == 0) |
| 275 | goto out; | ||
| 276 | } | ||
| 274 | if (trace_selftest_test_dyn_cnt == 0) | 277 | if (trace_selftest_test_dyn_cnt == 0) |
| 275 | goto out_free; | 278 | goto out_free; |
| 276 | 279 | ||
| @@ -295,7 +298,9 @@ static int trace_selftest_ops(int cnt) | |||
| 295 | unregister_ftrace_function(&test_probe1); | 298 | unregister_ftrace_function(&test_probe1); |
| 296 | unregister_ftrace_function(&test_probe2); | 299 | unregister_ftrace_function(&test_probe2); |
| 297 | unregister_ftrace_function(&test_probe3); | 300 | unregister_ftrace_function(&test_probe3); |
| 298 | unregister_ftrace_function(&test_global); | 301 | if (cnt > 1) |
| 302 | unregister_ftrace_function(tr->ops); | ||
| 303 | ftrace_reset_array_ops(tr); | ||
| 299 | 304 | ||
| 300 | /* Make sure everything is off */ | 305 | /* Make sure everything is off */ |
| 301 | reset_counts(); | 306 | reset_counts(); |
| @@ -315,9 +320,9 @@ static int trace_selftest_ops(int cnt) | |||
| 315 | } | 320 | } |
| 316 | 321 | ||
| 317 | /* Test dynamic code modification and ftrace filters */ | 322 | /* Test dynamic code modification and ftrace filters */ |
| 318 | int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | 323 | static int trace_selftest_startup_dynamic_tracing(struct tracer *trace, |
| 319 | struct trace_array *tr, | 324 | struct trace_array *tr, |
| 320 | int (*func)(void)) | 325 | int (*func)(void)) |
| 321 | { | 326 | { |
| 322 | int save_ftrace_enabled = ftrace_enabled; | 327 | int save_ftrace_enabled = ftrace_enabled; |
| 323 | unsigned long count; | 328 | unsigned long count; |
| @@ -388,7 +393,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
| 388 | } | 393 | } |
| 389 | 394 | ||
| 390 | /* Test the ops with global tracing running */ | 395 | /* Test the ops with global tracing running */ |
| 391 | ret = trace_selftest_ops(1); | 396 | ret = trace_selftest_ops(tr, 1); |
| 392 | trace->reset(tr); | 397 | trace->reset(tr); |
| 393 | 398 | ||
| 394 | out: | 399 | out: |
| @@ -399,7 +404,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
| 399 | 404 | ||
| 400 | /* Test the ops with global tracing off */ | 405 | /* Test the ops with global tracing off */ |
| 401 | if (!ret) | 406 | if (!ret) |
| 402 | ret = trace_selftest_ops(2); | 407 | ret = trace_selftest_ops(tr, 2); |
| 403 | 408 | ||
| 404 | return ret; | 409 | return ret; |
| 405 | } | 410 | } |
| @@ -802,7 +807,7 @@ out: | |||
| 802 | int | 807 | int |
| 803 | trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) | 808 | trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) |
| 804 | { | 809 | { |
| 805 | unsigned long save_max = tracing_max_latency; | 810 | unsigned long save_max = tr->max_latency; |
| 806 | unsigned long count; | 811 | unsigned long count; |
| 807 | int ret; | 812 | int ret; |
| 808 | 813 | ||
| @@ -814,7 +819,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) | |||
| 814 | } | 819 | } |
| 815 | 820 | ||
| 816 | /* reset the max latency */ | 821 | /* reset the max latency */ |
| 817 | tracing_max_latency = 0; | 822 | tr->max_latency = 0; |
| 818 | /* disable interrupts for a bit */ | 823 | /* disable interrupts for a bit */ |
| 819 | local_irq_disable(); | 824 | local_irq_disable(); |
| 820 | udelay(100); | 825 | udelay(100); |
| @@ -841,7 +846,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) | |||
| 841 | ret = -1; | 846 | ret = -1; |
| 842 | } | 847 | } |
| 843 | 848 | ||
| 844 | tracing_max_latency = save_max; | 849 | tr->max_latency = save_max; |
| 845 | 850 | ||
| 846 | return ret; | 851 | return ret; |
| 847 | } | 852 | } |
| @@ -851,7 +856,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) | |||
| 851 | int | 856 | int |
| 852 | trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) | 857 | trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) |
| 853 | { | 858 | { |
| 854 | unsigned long save_max = tracing_max_latency; | 859 | unsigned long save_max = tr->max_latency; |
| 855 | unsigned long count; | 860 | unsigned long count; |
| 856 | int ret; | 861 | int ret; |
| 857 | 862 | ||
| @@ -876,7 +881,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) | |||
| 876 | } | 881 | } |
| 877 | 882 | ||
| 878 | /* reset the max latency */ | 883 | /* reset the max latency */ |
| 879 | tracing_max_latency = 0; | 884 | tr->max_latency = 0; |
| 880 | /* disable preemption for a bit */ | 885 | /* disable preemption for a bit */ |
| 881 | preempt_disable(); | 886 | preempt_disable(); |
| 882 | udelay(100); | 887 | udelay(100); |
| @@ -903,7 +908,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) | |||
| 903 | ret = -1; | 908 | ret = -1; |
| 904 | } | 909 | } |
| 905 | 910 | ||
| 906 | tracing_max_latency = save_max; | 911 | tr->max_latency = save_max; |
| 907 | 912 | ||
| 908 | return ret; | 913 | return ret; |
| 909 | } | 914 | } |
| @@ -913,7 +918,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) | |||
| 913 | int | 918 | int |
| 914 | trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) | 919 | trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) |
| 915 | { | 920 | { |
| 916 | unsigned long save_max = tracing_max_latency; | 921 | unsigned long save_max = tr->max_latency; |
| 917 | unsigned long count; | 922 | unsigned long count; |
| 918 | int ret; | 923 | int ret; |
| 919 | 924 | ||
| @@ -938,7 +943,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * | |||
| 938 | } | 943 | } |
| 939 | 944 | ||
| 940 | /* reset the max latency */ | 945 | /* reset the max latency */ |
| 941 | tracing_max_latency = 0; | 946 | tr->max_latency = 0; |
| 942 | 947 | ||
| 943 | /* disable preemption and interrupts for a bit */ | 948 | /* disable preemption and interrupts for a bit */ |
| 944 | preempt_disable(); | 949 | preempt_disable(); |
| @@ -973,7 +978,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * | |||
| 973 | } | 978 | } |
| 974 | 979 | ||
| 975 | /* do the test by disabling interrupts first this time */ | 980 | /* do the test by disabling interrupts first this time */ |
| 976 | tracing_max_latency = 0; | 981 | tr->max_latency = 0; |
| 977 | tracing_start(); | 982 | tracing_start(); |
| 978 | trace->start(tr); | 983 | trace->start(tr); |
| 979 | 984 | ||
| @@ -1004,7 +1009,7 @@ out: | |||
| 1004 | tracing_start(); | 1009 | tracing_start(); |
| 1005 | out_no_start: | 1010 | out_no_start: |
| 1006 | trace->reset(tr); | 1011 | trace->reset(tr); |
| 1007 | tracing_max_latency = save_max; | 1012 | tr->max_latency = save_max; |
| 1008 | 1013 | ||
| 1009 | return ret; | 1014 | return ret; |
| 1010 | } | 1015 | } |
| @@ -1057,7 +1062,7 @@ static int trace_wakeup_test_thread(void *data) | |||
| 1057 | int | 1062 | int |
| 1058 | trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | 1063 | trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) |
| 1059 | { | 1064 | { |
| 1060 | unsigned long save_max = tracing_max_latency; | 1065 | unsigned long save_max = tr->max_latency; |
| 1061 | struct task_struct *p; | 1066 | struct task_struct *p; |
| 1062 | struct completion is_ready; | 1067 | struct completion is_ready; |
| 1063 | unsigned long count; | 1068 | unsigned long count; |
| @@ -1083,7 +1088,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | |||
| 1083 | } | 1088 | } |
| 1084 | 1089 | ||
| 1085 | /* reset the max latency */ | 1090 | /* reset the max latency */ |
| 1086 | tracing_max_latency = 0; | 1091 | tr->max_latency = 0; |
| 1087 | 1092 | ||
| 1088 | while (p->on_rq) { | 1093 | while (p->on_rq) { |
| 1089 | /* | 1094 | /* |
| @@ -1113,7 +1118,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | |||
| 1113 | trace->reset(tr); | 1118 | trace->reset(tr); |
| 1114 | tracing_start(); | 1119 | tracing_start(); |
| 1115 | 1120 | ||
| 1116 | tracing_max_latency = save_max; | 1121 | tr->max_latency = save_max; |
| 1117 | 1122 | ||
| 1118 | /* kill the thread */ | 1123 | /* kill the thread */ |
| 1119 | kthread_stop(p); | 1124 | kthread_stop(p); |

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 21b320e5d163..8a4e5cb66a4c 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
| @@ -51,11 +51,33 @@ static DEFINE_MUTEX(stack_sysctl_mutex); | |||
| 51 | int stack_tracer_enabled; | 51 | int stack_tracer_enabled; |
| 52 | static int last_stack_tracer_enabled; | 52 | static int last_stack_tracer_enabled; |
| 53 | 53 | ||
| 54 | static inline void print_max_stack(void) | ||
| 55 | { | ||
| 56 | long i; | ||
| 57 | int size; | ||
| 58 | |||
| 59 | pr_emerg(" Depth Size Location (%d entries)\n" | ||
| 60 | " ----- ---- --------\n", | ||
| 61 | max_stack_trace.nr_entries - 1); | ||
| 62 | |||
| 63 | for (i = 0; i < max_stack_trace.nr_entries; i++) { | ||
| 64 | if (stack_dump_trace[i] == ULONG_MAX) | ||
| 65 | break; | ||
| 66 | if (i+1 == max_stack_trace.nr_entries || | ||
| 67 | stack_dump_trace[i+1] == ULONG_MAX) | ||
| 68 | size = stack_dump_index[i]; | ||
| 69 | else | ||
| 70 | size = stack_dump_index[i] - stack_dump_index[i+1]; | ||
| 71 | |||
| 72 | pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i], | ||
| 73 | size, (void *)stack_dump_trace[i]); | ||
| 74 | } | ||
| 75 | } | ||
| 76 | |||
| 54 | static inline void | 77 | static inline void |
| 55 | check_stack(unsigned long ip, unsigned long *stack) | 78 | check_stack(unsigned long ip, unsigned long *stack) |
| 56 | { | 79 | { |
| 57 | unsigned long this_size, flags; | 80 | unsigned long this_size, flags; unsigned long *p, *top, *start; |
| 58 | unsigned long *p, *top, *start; | ||
| 59 | static int tracer_frame; | 81 | static int tracer_frame; |
| 60 | int frame_size = ACCESS_ONCE(tracer_frame); | 82 | int frame_size = ACCESS_ONCE(tracer_frame); |
| 61 | int i; | 83 | int i; |
| @@ -85,8 +107,12 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
| 85 | 107 | ||
| 86 | max_stack_size = this_size; | 108 | max_stack_size = this_size; |
| 87 | 109 | ||
| 88 | max_stack_trace.nr_entries = 0; | 110 | max_stack_trace.nr_entries = 0; |
| 89 | max_stack_trace.skip = 3; | 111 | |
| 112 | if (using_ftrace_ops_list_func()) | ||
| 113 | max_stack_trace.skip = 4; | ||
| 114 | else | ||
| 115 | max_stack_trace.skip = 3; | ||
| 90 | 116 | ||
| 91 | save_stack_trace(&max_stack_trace); | 117 | save_stack_trace(&max_stack_trace); |
| 92 | 118 | ||
| @@ -145,8 +171,12 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
| 145 | i++; | 171 | i++; |
| 146 | } | 172 | } |
| 147 | 173 | ||
| 148 | BUG_ON(current != &init_task && | 174 | if ((current != &init_task && |
| 149 | *(end_of_stack(current)) != STACK_END_MAGIC); | 175 | *(end_of_stack(current)) != STACK_END_MAGIC)) { |
| 176 | print_max_stack(); | ||
| 177 | BUG(); | ||
| 178 | } | ||
| 179 | |||
| 150 | out: | 180 | out: |
| 151 | arch_spin_unlock(&max_stack_lock); | 181 | arch_spin_unlock(&max_stack_lock); |
| 152 | local_irq_restore(flags); | 182 | local_irq_restore(flags); |
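
print_max_stack() added above derives each frame's size by differencing adjacent entries of stack_dump_index, using the entry itself for the deepest frame and stopping at the ULONG_MAX terminator. The same arithmetic, with made-up sample data standing in for the tracer's arrays, looks like this:

#include <limits.h>
#include <stdio.h>

int main(void)
{
    /* Return addresses per frame, terminated with ULONG_MAX like
     * stack_dump_trace; the values here are invented. */
    unsigned long trace[] = { 0x1000, 0x2000, 0x3000, ULONG_MAX };
    int stack_index[] = { 480, 352, 128 };  /* bytes of stack still in use */
    int nr_entries = 4;
    int i, size;

    printf("        Depth    Size   Location\n");
    printf("        -----    ----   --------\n");

    for (i = 0; i < nr_entries; i++) {
        if (trace[i] == ULONG_MAX)
            break;
        if (i + 1 == nr_entries || trace[i + 1] == ULONG_MAX)
            size = stack_index[i];                    /* deepest frame */
        else
            size = stack_index[i] - stack_index[i + 1]; /* depth delta */
        printf("%3d) %8d %5d %#lx\n", i, stack_index[i], size, trace[i]);
    }
    return 0;
}
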
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 930e51462dc8..c082a7441345 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
| @@ -732,9 +732,15 @@ static int uprobe_buffer_enable(void) | |||
| 732 | 732 | ||
| 733 | static void uprobe_buffer_disable(void) | 733 | static void uprobe_buffer_disable(void) |
| 734 | { | 734 | { |
| 735 | int cpu; | ||
| 736 | |||
| 735 | BUG_ON(!mutex_is_locked(&event_mutex)); | 737 | BUG_ON(!mutex_is_locked(&event_mutex)); |
| 736 | 738 | ||
| 737 | if (--uprobe_buffer_refcnt == 0) { | 739 | if (--uprobe_buffer_refcnt == 0) { |
| 740 | for_each_possible_cpu(cpu) | ||
| 741 | free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, | ||
| 742 | cpu)->buf); | ||
| 743 | |||
| 738 | free_percpu(uprobe_cpu_buffer); | 744 | free_percpu(uprobe_cpu_buffer); |
| 739 | uprobe_cpu_buffer = NULL; | 745 | uprobe_cpu_buffer = NULL; |
| 740 | } | 746 | } |
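
The uprobe change above fixes a leak: the per-cpu container was freed while the page each CPU's ->buf pointed to was not. The pairing it restores, releasing everything stored in the per-CPU slots before releasing the slot array itself, is sketched below with malloc/free standing in for the page and per-cpu allocators; the names are illustrative only.

#include <stdlib.h>

#define NR_CPUS 4
#define BUF_SIZE 4096

struct cpu_buffer {
    char *buf;                      /* analogue of uprobe_cpu_buffer->buf */
};

static struct cpu_buffer *cpu_buffers;

static int buffers_enable(void)
{
    int cpu;

    cpu_buffers = calloc(NR_CPUS, sizeof(*cpu_buffers));
    if (!cpu_buffers)
        return -1;
    for (cpu = 0; cpu < NR_CPUS; cpu++) {
        cpu_buffers[cpu].buf = malloc(BUF_SIZE);
        if (!cpu_buffers[cpu].buf)
            goto err;               /* unwind what was already allocated */
    }
    return 0;
err:
    while (cpu--)
        free(cpu_buffers[cpu].buf);
    free(cpu_buffers);
    cpu_buffers = NULL;
    return -1;
}

static void buffers_disable(void)
{
    int cpu;

    if (!cpu_buffers)
        return;
    for (cpu = 0; cpu < NR_CPUS; cpu++)
        free(cpu_buffers[cpu].buf); /* the step the fix adds */
    free(cpu_buffers);              /* then drop the container itself */
    cpu_buffers = NULL;
}

int main(void)
{
    if (buffers_enable() == 0)
        buffers_disable();
    return 0;
}
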
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index ac5b23cf7212..33cbd8c203f8 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
| @@ -188,7 +188,6 @@ static int tracepoint_add_func(struct tracepoint *tp, | |||
| 188 | WARN_ON_ONCE(1); | 188 | WARN_ON_ONCE(1); |
| 189 | return PTR_ERR(old); | 189 | return PTR_ERR(old); |
| 190 | } | 190 | } |
| 191 | release_probes(old); | ||
| 192 | 191 | ||
| 193 | /* | 192 | /* |
| 194 | * rcu_assign_pointer has a smp_wmb() which makes sure that the new | 193 | * rcu_assign_pointer has a smp_wmb() which makes sure that the new |
| @@ -200,6 +199,7 @@ static int tracepoint_add_func(struct tracepoint *tp, | |||
| 200 | rcu_assign_pointer(tp->funcs, tp_funcs); | 199 | rcu_assign_pointer(tp->funcs, tp_funcs); |
| 201 | if (!static_key_enabled(&tp->key)) | 200 | if (!static_key_enabled(&tp->key)) |
| 202 | static_key_slow_inc(&tp->key); | 201 | static_key_slow_inc(&tp->key); |
| 202 | release_probes(old); | ||
| 203 | return 0; | 203 | return 0; |
| 204 | } | 204 | } |
| 205 | 205 | ||
| @@ -221,7 +221,6 @@ static int tracepoint_remove_func(struct tracepoint *tp, | |||
| 221 | WARN_ON_ONCE(1); | 221 | WARN_ON_ONCE(1); |
| 222 | return PTR_ERR(old); | 222 | return PTR_ERR(old); |
| 223 | } | 223 | } |
| 224 | release_probes(old); | ||
| 225 | 224 | ||
| 226 | if (!tp_funcs) { | 225 | if (!tp_funcs) { |
| 227 | /* Removed last function */ | 226 | /* Removed last function */ |
| @@ -232,6 +231,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, | |||
| 232 | static_key_slow_dec(&tp->key); | 231 | static_key_slow_dec(&tp->key); |
| 233 | } | 232 | } |
| 234 | rcu_assign_pointer(tp->funcs, tp_funcs); | 233 | rcu_assign_pointer(tp->funcs, tp_funcs); |
| 234 | release_probes(old); | ||
| 235 | return 0; | 235 | return 0; |
| 236 | } | 236 | } |
| 237 | 237 | ||
| @@ -239,6 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, | |||
| 239 | * tracepoint_probe_register - Connect a probe to a tracepoint | 239 | * tracepoint_probe_register - Connect a probe to a tracepoint |
| 240 | * @tp: tracepoint | 240 | * @tp: tracepoint |
| 241 | * @probe: probe handler | 241 | * @probe: probe handler |
| 242 | * @data: tracepoint data | ||
| 242 | * | 243 | * |
| 243 | * Returns 0 if ok, error value on error. | 244 | * Returns 0 if ok, error value on error. |
| 244 | * Note: if @tp is within a module, the caller is responsible for | 245 | * Note: if @tp is within a module, the caller is responsible for |
| @@ -264,6 +265,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register); | |||
| 264 | * tracepoint_probe_unregister - Disconnect a probe from a tracepoint | 265 | * tracepoint_probe_unregister - Disconnect a probe from a tracepoint |
| 265 | * @tp: tracepoint | 266 | * @tp: tracepoint |
| 266 | * @probe: probe function pointer | 267 | * @probe: probe function pointer |
| 268 | * @data: tracepoint data | ||
| 267 | * | 269 | * |
| 268 | * Returns 0 if ok, error value on error. | 270 | * Returns 0 if ok, error value on error. |
| 269 | */ | 271 | */ |
diff --git a/kernel/user.c b/kernel/user.c index 294fc6a94168..4efa39350e44 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
| @@ -87,7 +87,6 @@ static DEFINE_SPINLOCK(uidhash_lock); | |||
| 87 | struct user_struct root_user = { | 87 | struct user_struct root_user = { |
| 88 | .__count = ATOMIC_INIT(1), | 88 | .__count = ATOMIC_INIT(1), |
| 89 | .processes = ATOMIC_INIT(1), | 89 | .processes = ATOMIC_INIT(1), |
| 90 | .files = ATOMIC_INIT(0), | ||
| 91 | .sigpending = ATOMIC_INIT(0), | 90 | .sigpending = ATOMIC_INIT(0), |
| 92 | .locked_shm = 0, | 91 | .locked_shm = 0, |
| 93 | .uid = GLOBAL_ROOT_UID, | 92 | .uid = GLOBAL_ROOT_UID, |
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0d8f6023fd8d..fcc02560fd6b 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
| @@ -152,7 +152,7 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) | |||
| 152 | 152 | ||
| 153 | /* Find the matching extent */ | 153 | /* Find the matching extent */ |
| 154 | extents = map->nr_extents; | 154 | extents = map->nr_extents; |
| 155 | smp_read_barrier_depends(); | 155 | smp_rmb(); |
| 156 | for (idx = 0; idx < extents; idx++) { | 156 | for (idx = 0; idx < extents; idx++) { |
| 157 | first = map->extent[idx].first; | 157 | first = map->extent[idx].first; |
| 158 | last = first + map->extent[idx].count - 1; | 158 | last = first + map->extent[idx].count - 1; |
| @@ -176,7 +176,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) | |||
| 176 | 176 | ||
| 177 | /* Find the matching extent */ | 177 | /* Find the matching extent */ |
| 178 | extents = map->nr_extents; | 178 | extents = map->nr_extents; |
| 179 | smp_read_barrier_depends(); | 179 | smp_rmb(); |
| 180 | for (idx = 0; idx < extents; idx++) { | 180 | for (idx = 0; idx < extents; idx++) { |
| 181 | first = map->extent[idx].first; | 181 | first = map->extent[idx].first; |
| 182 | last = first + map->extent[idx].count - 1; | 182 | last = first + map->extent[idx].count - 1; |
| @@ -199,7 +199,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) | |||
| 199 | 199 | ||
| 200 | /* Find the matching extent */ | 200 | /* Find the matching extent */ |
| 201 | extents = map->nr_extents; | 201 | extents = map->nr_extents; |
| 202 | smp_read_barrier_depends(); | 202 | smp_rmb(); |
| 203 | for (idx = 0; idx < extents; idx++) { | 203 | for (idx = 0; idx < extents; idx++) { |
| 204 | first = map->extent[idx].lower_first; | 204 | first = map->extent[idx].lower_first; |
| 205 | last = first + map->extent[idx].count - 1; | 205 | last = first + map->extent[idx].count - 1; |
| @@ -286,7 +286,7 @@ EXPORT_SYMBOL(from_kuid_munged); | |||
| 286 | /** | 286 | /** |
| 287 | * make_kgid - Map a user-namespace gid pair into a kgid. | 287 | * make_kgid - Map a user-namespace gid pair into a kgid. |
| 288 | * @ns: User namespace that the gid is in | 288 | * @ns: User namespace that the gid is in |
| 289 | * @uid: group identifier | 289 | * @gid: group identifier |
| 290 | * | 290 | * |
| 291 | * Maps a user-namespace gid pair into a kernel internal kgid, | 291 | * Maps a user-namespace gid pair into a kernel internal kgid, |
| 292 | * and returns that kgid. | 292 | * and returns that kgid. |
| @@ -482,7 +482,8 @@ static int projid_m_show(struct seq_file *seq, void *v) | |||
| 482 | return 0; | 482 | return 0; |
| 483 | } | 483 | } |
| 484 | 484 | ||
| 485 | static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) | 485 | static void *m_start(struct seq_file *seq, loff_t *ppos, |
| 486 | struct uid_gid_map *map) | ||
| 486 | { | 487 | { |
| 487 | struct uid_gid_extent *extent = NULL; | 488 | struct uid_gid_extent *extent = NULL; |
| 488 | loff_t pos = *ppos; | 489 | loff_t pos = *ppos; |
| @@ -546,7 +547,8 @@ struct seq_operations proc_projid_seq_operations = { | |||
| 546 | .show = projid_m_show, | 547 | .show = projid_m_show, |
| 547 | }; | 548 | }; |
| 548 | 549 | ||
| 549 | static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) | 550 | static bool mappings_overlap(struct uid_gid_map *new_map, |
| 551 | struct uid_gid_extent *extent) | ||
| 550 | { | 552 | { |
| 551 | u32 upper_first, lower_first, upper_last, lower_last; | 553 | u32 upper_first, lower_first, upper_last, lower_last; |
| 552 | unsigned idx; | 554 | unsigned idx; |
| @@ -615,9 +617,8 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
| 615 | * were written before the count of the extents. | 617 | * were written before the count of the extents. |
| 616 | * | 618 | * |
| 617 | * To achieve this smp_wmb() is used on guarantee the write | 619 | * To achieve this smp_wmb() is used on guarantee the write |
| 618 | * order and smp_read_barrier_depends() is guaranteed that we | 620 | * order and smp_rmb() is guaranteed that we don't have crazy |
| 619 | * don't have crazy architectures returning stale data. | 621 | * architectures returning stale data. |
| 620 | * | ||
| 621 | */ | 622 | */ |
| 622 | mutex_lock(&id_map_mutex); | 623 | mutex_lock(&id_map_mutex); |
| 623 | 624 | ||
| @@ -654,7 +655,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
| 654 | ret = -EINVAL; | 655 | ret = -EINVAL; |
| 655 | pos = kbuf; | 656 | pos = kbuf; |
| 656 | new_map.nr_extents = 0; | 657 | new_map.nr_extents = 0; |
| 657 | for (;pos; pos = next_line) { | 658 | for (; pos; pos = next_line) { |
| 658 | extent = &new_map.extent[new_map.nr_extents]; | 659 | extent = &new_map.extent[new_map.nr_extents]; |
| 659 | 660 | ||
| 660 | /* Find the end of line and ensure I don't look past it */ | 661 | /* Find the end of line and ensure I don't look past it */ |
| @@ -688,13 +689,16 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
| 688 | 689 | ||
| 689 | /* Verify we have been given valid starting values */ | 690 | /* Verify we have been given valid starting values */ |
| 690 | if ((extent->first == (u32) -1) || | 691 | if ((extent->first == (u32) -1) || |
| 691 | (extent->lower_first == (u32) -1 )) | 692 | (extent->lower_first == (u32) -1)) |
| 692 | goto out; | 693 | goto out; |
| 693 | 694 | ||
| 694 | /* Verify count is not zero and does not cause the extent to wrap */ | 695 | /* Verify count is not zero and does not cause the |
| 696 | * extent to wrap | ||
| 697 | */ | ||
| 695 | if ((extent->first + extent->count) <= extent->first) | 698 | if ((extent->first + extent->count) <= extent->first) |
| 696 | goto out; | 699 | goto out; |
| 697 | if ((extent->lower_first + extent->count) <= extent->lower_first) | 700 | if ((extent->lower_first + extent->count) <= |
| 701 | extent->lower_first) | ||
| 698 | goto out; | 702 | goto out; |
| 699 | 703 | ||
| 700 | /* Do the ranges in extent overlap any previous extents? */ | 704 | /* Do the ranges in extent overlap any previous extents? */ |
| @@ -752,7 +756,8 @@ out: | |||
| 752 | return ret; | 756 | return ret; |
| 753 | } | 757 | } |
| 754 | 758 | ||
| 755 | ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) | 759 | ssize_t proc_uid_map_write(struct file *file, const char __user *buf, |
| 760 | size_t size, loff_t *ppos) | ||
| 756 | { | 761 | { |
| 757 | struct seq_file *seq = file->private_data; | 762 | struct seq_file *seq = file->private_data; |
| 758 | struct user_namespace *ns = seq->private; | 763 | struct user_namespace *ns = seq->private; |
| @@ -768,7 +773,8 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz | |||
| 768 | &ns->uid_map, &ns->parent->uid_map); | 773 | &ns->uid_map, &ns->parent->uid_map); |
| 769 | } | 774 | } |
| 770 | 775 | ||
| 771 | ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) | 776 | ssize_t proc_gid_map_write(struct file *file, const char __user *buf, |
| 777 | size_t size, loff_t *ppos) | ||
| 772 | { | 778 | { |
| 773 | struct seq_file *seq = file->private_data; | 779 | struct seq_file *seq = file->private_data; |
| 774 | struct user_namespace *ns = seq->private; | 780 | struct user_namespace *ns = seq->private; |
| @@ -784,7 +790,8 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz | |||
| 784 | &ns->gid_map, &ns->parent->gid_map); | 790 | &ns->gid_map, &ns->parent->gid_map); |
| 785 | } | 791 | } |
| 786 | 792 | ||
| 787 | ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) | 793 | ssize_t proc_projid_map_write(struct file *file, const char __user *buf, |
| 794 | size_t size, loff_t *ppos) | ||
| 788 | { | 795 | { |
| 789 | struct seq_file *seq = file->private_data; | 796 | struct seq_file *seq = file->private_data; |
| 790 | struct user_namespace *ns = seq->private; | 797 | struct user_namespace *ns = seq->private; |
| @@ -801,7 +808,7 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t | |||
| 801 | &ns->projid_map, &ns->parent->projid_map); | 808 | &ns->projid_map, &ns->parent->projid_map); |
| 802 | } | 809 | } |
| 803 | 810 | ||
| 804 | static bool new_idmap_permitted(const struct file *file, | 811 | static bool new_idmap_permitted(const struct file *file, |
| 805 | struct user_namespace *ns, int cap_setid, | 812 | struct user_namespace *ns, int cap_setid, |
| 806 | struct uid_gid_map *new_map) | 813 | struct uid_gid_map *new_map) |
| 807 | { | 814 | { |
| @@ -812,8 +819,7 @@ static bool new_idmap_permitted(const struct file *file, | |||
| 812 | kuid_t uid = make_kuid(ns->parent, id); | 819 | kuid_t uid = make_kuid(ns->parent, id); |
| 813 | if (uid_eq(uid, file->f_cred->fsuid)) | 820 | if (uid_eq(uid, file->f_cred->fsuid)) |
| 814 | return true; | 821 | return true; |
| 815 | } | 822 | } else if (cap_setid == CAP_SETGID) { |
| 816 | else if (cap_setid == CAP_SETGID) { | ||
| 817 | kgid_t gid = make_kgid(ns->parent, id); | 823 | kgid_t gid = make_kgid(ns->parent, id); |
| 818 | if (gid_eq(gid, file->f_cred->fsgid)) | 824 | if (gid_eq(gid, file->f_cred->fsgid)) |
| 819 | return true; | 825 | return true; |
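
map_id_down() and friends translate an id by locating the extent whose [first, first + count) range contains it and rebasing it onto lower_first. A self-contained version of that lookup follows, with an invented one-extent table as the example.

#include <stdint.h>
#include <stdio.h>

struct extent {
    uint32_t first;        /* start of the range inside the namespace */
    uint32_t lower_first;  /* start of the corresponding parent range */
    uint32_t count;        /* number of ids in the range */
};

/* Return the parent-namespace id for @id, or (uint32_t)-1 if unmapped. */
static uint32_t map_id_down(const struct extent *map, unsigned nr, uint32_t id)
{
    for (unsigned idx = 0; idx < nr; idx++) {
        uint32_t first = map[idx].first;
        uint32_t last = first + map[idx].count - 1;

        if (id >= first && id <= last)
            return id - first + map[idx].lower_first;
    }
    return (uint32_t)-1;
}

int main(void)
{
    /* "0 100000 65536": namespace ids 0..65535 map to parent 100000.. */
    struct extent map[] = { { 0, 100000, 65536 } };

    printf("%u\n", (unsigned)map_id_down(map, 1, 0));      /* 100000 */
    printf("%u\n", (unsigned)map_id_down(map, 1, 1000));   /* 101000 */
    printf("%u\n", (unsigned)map_id_down(map, 1, 70000));  /* unmapped */
    return 0;
}
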
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4f69f9a5e221..c8eac43267e9 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c | |||
| @@ -17,7 +17,7 @@ | |||
| 17 | 17 | ||
| 18 | #ifdef CONFIG_PROC_SYSCTL | 18 | #ifdef CONFIG_PROC_SYSCTL |
| 19 | 19 | ||
| 20 | static void *get_uts(ctl_table *table, int write) | 20 | static void *get_uts(struct ctl_table *table, int write) |
| 21 | { | 21 | { |
| 22 | char *which = table->data; | 22 | char *which = table->data; |
| 23 | struct uts_namespace *uts_ns; | 23 | struct uts_namespace *uts_ns; |
| @@ -32,7 +32,7 @@ static void *get_uts(ctl_table *table, int write) | |||
| 32 | return which; | 32 | return which; |
| 33 | } | 33 | } |
| 34 | 34 | ||
| 35 | static void put_uts(ctl_table *table, int write, void *which) | 35 | static void put_uts(struct ctl_table *table, int write, void *which) |
| 36 | { | 36 | { |
| 37 | if (!write) | 37 | if (!write) |
| 38 | up_read(&uts_sem); | 38 | up_read(&uts_sem); |
| @@ -44,14 +44,14 @@ static void put_uts(ctl_table *table, int write, void *which) | |||
| 44 | * Special case of dostring for the UTS structure. This has locks | 44 | * Special case of dostring for the UTS structure. This has locks |
| 45 | * to observe. Should this be in kernel/sys.c ???? | 45 | * to observe. Should this be in kernel/sys.c ???? |
| 46 | */ | 46 | */ |
| 47 | static int proc_do_uts_string(ctl_table *table, int write, | 47 | static int proc_do_uts_string(struct ctl_table *table, int write, |
| 48 | void __user *buffer, size_t *lenp, loff_t *ppos) | 48 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 49 | { | 49 | { |
| 50 | struct ctl_table uts_table; | 50 | struct ctl_table uts_table; |
| 51 | int r; | 51 | int r; |
| 52 | memcpy(&uts_table, table, sizeof(uts_table)); | 52 | memcpy(&uts_table, table, sizeof(uts_table)); |
| 53 | uts_table.data = get_uts(table, write); | 53 | uts_table.data = get_uts(table, write); |
| 54 | r = proc_dostring(&uts_table,write,buffer,lenp, ppos); | 54 | r = proc_dostring(&uts_table, write, buffer, lenp, ppos); |
| 55 | put_uts(table, write, uts_table.data); | 55 | put_uts(table, write, uts_table.data); |
| 56 | 56 | ||
| 57 | if (write) | 57 | if (write) |
| @@ -135,4 +135,4 @@ static int __init utsname_sysctl_init(void) | |||
| 135 | return 0; | 135 | return 0; |
| 136 | } | 136 | } |
| 137 | 137 | ||
| 138 | __initcall(utsname_sysctl_init); | 138 | device_initcall(utsname_sysctl_init); |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e90089fd78e0..516203e665fc 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
| @@ -138,7 +138,11 @@ static void __touch_watchdog(void) | |||
| 138 | 138 | ||
| 139 | void touch_softlockup_watchdog(void) | 139 | void touch_softlockup_watchdog(void) |
| 140 | { | 140 | { |
| 141 | __this_cpu_write(watchdog_touch_ts, 0); | 141 | /* |
| 142 | * Preemption can be enabled. It doesn't matter which CPU's timestamp | ||
| 143 | * gets zeroed here, so use the raw_ operation. | ||
| 144 | */ | ||
| 145 | raw_cpu_write(watchdog_touch_ts, 0); | ||
| 142 | } | 146 | } |
| 143 | EXPORT_SYMBOL(touch_softlockup_watchdog); | 147 | EXPORT_SYMBOL(touch_softlockup_watchdog); |
| 144 | 148 | ||
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0ee63af30bd1..6203d2900877 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -65,15 +65,12 @@ enum { | |||
| 65 | * be executing on any CPU. The pool behaves as an unbound one. | 65 | * be executing on any CPU. The pool behaves as an unbound one. |
| 66 | * | 66 | * |
| 67 | * Note that DISASSOCIATED should be flipped only while holding | 67 | * Note that DISASSOCIATED should be flipped only while holding |
| 68 | * manager_mutex to avoid changing binding state while | 68 | * attach_mutex to avoid changing binding state while |
| 69 | * create_worker() is in progress. | 69 | * worker_attach_to_pool() is in progress. |
| 70 | */ | 70 | */ |
| 71 | POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ | ||
| 72 | POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ | 71 | POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ |
| 73 | POOL_FREEZING = 1 << 3, /* freeze in progress */ | ||
| 74 | 72 | ||
| 75 | /* worker flags */ | 73 | /* worker flags */ |
| 76 | WORKER_STARTED = 1 << 0, /* started */ | ||
| 77 | WORKER_DIE = 1 << 1, /* die die die */ | 74 | WORKER_DIE = 1 << 1, /* die die die */ |
| 78 | WORKER_IDLE = 1 << 2, /* is idle */ | 75 | WORKER_IDLE = 1 << 2, /* is idle */ |
| 79 | WORKER_PREP = 1 << 3, /* preparing to run works */ | 76 | WORKER_PREP = 1 << 3, /* preparing to run works */ |
| @@ -100,10 +97,10 @@ enum { | |||
| 100 | 97 | ||
| 101 | /* | 98 | /* |
| 102 | * Rescue workers are used only on emergencies and shared by | 99 | * Rescue workers are used only on emergencies and shared by |
| 103 | * all cpus. Give -20. | 100 | * all cpus. Give MIN_NICE. |
| 104 | */ | 101 | */ |
| 105 | RESCUER_NICE_LEVEL = -20, | 102 | RESCUER_NICE_LEVEL = MIN_NICE, |
| 106 | HIGHPRI_NICE_LEVEL = -20, | 103 | HIGHPRI_NICE_LEVEL = MIN_NICE, |
| 107 | 104 | ||
| 108 | WQ_NAME_LEN = 24, | 105 | WQ_NAME_LEN = 24, |
| 109 | }; | 106 | }; |
| @@ -124,8 +121,7 @@ enum { | |||
| 124 | * cpu or grabbing pool->lock is enough for read access. If | 121 | * cpu or grabbing pool->lock is enough for read access. If |
| 125 | * POOL_DISASSOCIATED is set, it's identical to L. | 122 | * POOL_DISASSOCIATED is set, it's identical to L. |
| 126 | * | 123 | * |
| 127 | * MG: pool->manager_mutex and pool->lock protected. Writes require both | 124 | * A: pool->attach_mutex protected. |
| 128 | * locks. Reads can happen under either lock. | ||
| 129 | * | 125 | * |
| 130 | * PL: wq_pool_mutex protected. | 126 | * PL: wq_pool_mutex protected. |
| 131 | * | 127 | * |
| @@ -163,8 +159,11 @@ struct worker_pool { | |||
| 163 | 159 | ||
| 164 | /* see manage_workers() for details on the two manager mutexes */ | 160 | /* see manage_workers() for details on the two manager mutexes */ |
| 165 | struct mutex manager_arb; /* manager arbitration */ | 161 | struct mutex manager_arb; /* manager arbitration */ |
| 166 | struct mutex manager_mutex; /* manager exclusion */ | 162 | struct mutex attach_mutex; /* attach/detach exclusion */ |
| 167 | struct idr worker_idr; /* MG: worker IDs and iteration */ | 163 | struct list_head workers; /* A: attached workers */ |
| 164 | struct completion *detach_completion; /* all workers detached */ | ||
| 165 | |||
| 166 | struct ida worker_ida; /* worker IDs for task name */ | ||
| 168 | 167 | ||
| 169 | struct workqueue_attrs *attrs; /* I: worker attributes */ | 168 | struct workqueue_attrs *attrs; /* I: worker attributes */ |
| 170 | struct hlist_node hash_node; /* PL: unbound_pool_hash node */ | 169 | struct hlist_node hash_node; /* PL: unbound_pool_hash node */ |
| @@ -340,16 +339,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, | |||
| 340 | lockdep_is_held(&wq->mutex), \ | 339 | lockdep_is_held(&wq->mutex), \ |
| 341 | "sched RCU or wq->mutex should be held") | 340 | "sched RCU or wq->mutex should be held") |
| 342 | 341 | ||
| 343 | #ifdef CONFIG_LOCKDEP | ||
| 344 | #define assert_manager_or_pool_lock(pool) \ | ||
| 345 | WARN_ONCE(debug_locks && \ | ||
| 346 | !lockdep_is_held(&(pool)->manager_mutex) && \ | ||
| 347 | !lockdep_is_held(&(pool)->lock), \ | ||
| 348 | "pool->manager_mutex or ->lock should be held") | ||
| 349 | #else | ||
| 350 | #define assert_manager_or_pool_lock(pool) do { } while (0) | ||
| 351 | #endif | ||
| 352 | |||
| 353 | #define for_each_cpu_worker_pool(pool, cpu) \ | 342 | #define for_each_cpu_worker_pool(pool, cpu) \ |
| 354 | for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ | 343 | for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ |
| 355 | (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ | 344 | (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ |
| @@ -375,17 +364,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, | |||
| 375 | /** | 364 | /** |
| 376 | * for_each_pool_worker - iterate through all workers of a worker_pool | 365 | * for_each_pool_worker - iterate through all workers of a worker_pool |
| 377 | * @worker: iteration cursor | 366 | * @worker: iteration cursor |
| 378 | * @wi: integer used for iteration | ||
| 379 | * @pool: worker_pool to iterate workers of | 367 | * @pool: worker_pool to iterate workers of |
| 380 | * | 368 | * |
| 381 | * This must be called with either @pool->manager_mutex or ->lock held. | 369 | * This must be called with @pool->attach_mutex. |
| 382 | * | 370 | * |
| 383 | * The if/else clause exists only for the lockdep assertion and can be | 371 | * The if/else clause exists only for the lockdep assertion and can be |
| 384 | * ignored. | 372 | * ignored. |
| 385 | */ | 373 | */ |
| 386 | #define for_each_pool_worker(worker, wi, pool) \ | 374 | #define for_each_pool_worker(worker, pool) \ |
| 387 | idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \ | 375 | list_for_each_entry((worker), &(pool)->workers, node) \ |
| 388 | if (({ assert_manager_or_pool_lock((pool)); false; })) { } \ | 376 | if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \ |
| 389 | else | 377 | else |
| 390 | 378 | ||
| 391 | /** | 379 | /** |
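
for_each_pool_worker() now walks a plain pool->workers list under pool->attach_mutex instead of an IDR guarded by manager_mutex or pool->lock. A small userspace analogue of that attach/iterate pattern using pthreads is sketched below; the struct and function names are assumptions, not the workqueue API.

#include <pthread.h>
#include <stdio.h>

struct worker {
    int id;
    struct worker *next;
};

struct pool {
    pthread_mutex_t attach_mutex;    /* serialises attach, detach, iteration */
    struct worker *workers;          /* analogue of pool->workers */
};

static void attach_worker(struct pool *pool, struct worker *w)
{
    pthread_mutex_lock(&pool->attach_mutex);
    w->next = pool->workers;
    pool->workers = w;
    pthread_mutex_unlock(&pool->attach_mutex);
}

static void for_each_pool_worker(struct pool *pool, void (*fn)(struct worker *))
{
    /* The kernel macro expects the caller to hold attach_mutex;
     * this helper simply takes it itself for the example. */
    pthread_mutex_lock(&pool->attach_mutex);
    for (struct worker *w = pool->workers; w; w = w->next)
        fn(w);
    pthread_mutex_unlock(&pool->attach_mutex);
}

static void print_worker(struct worker *w)
{
    printf("worker %d\n", w->id);
}

int main(void)
{
    struct pool pool = { .attach_mutex = PTHREAD_MUTEX_INITIALIZER,
                         .workers = NULL };
    struct worker a = { 1, NULL }, b = { 2, NULL };

    attach_worker(&pool, &a);
    attach_worker(&pool, &b);
    for_each_pool_worker(&pool, print_worker);
    return 0;
}

Guarding attach, detach and iteration with a single mutex is what lets the diff drop the dual-lock assertion that the removed assert_manager_or_pool_lock() used to provide.
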
| @@ -763,13 +751,6 @@ static bool need_to_create_worker(struct worker_pool *pool) | |||
| 763 | return need_more_worker(pool) && !may_start_working(pool); | 751 | return need_more_worker(pool) && !may_start_working(pool); |
| 764 | } | 752 | } |
| 765 | 753 | ||
| 766 | /* Do I need to be the manager? */ | ||
| 767 | static bool need_to_manage_workers(struct worker_pool *pool) | ||
| 768 | { | ||
| 769 | return need_to_create_worker(pool) || | ||
| 770 | (pool->flags & POOL_MANAGE_WORKERS); | ||
| 771 | } | ||
| 772 | |||
| 773 | /* Do we have too many workers and should some go away? */ | 754 | /* Do we have too many workers and should some go away? */ |
| 774 | static bool too_many_workers(struct worker_pool *pool) | 755 | static bool too_many_workers(struct worker_pool *pool) |
| 775 | { | 756 | { |
| @@ -791,8 +772,8 @@ static bool too_many_workers(struct worker_pool *pool) | |||
| 791 | * Wake up functions. | 772 | * Wake up functions. |
| 792 | */ | 773 | */ |
| 793 | 774 | ||
| 794 | /* Return the first worker. Safe with preemption disabled */ | 775 | /* Return the first idle worker. Safe with preemption disabled */ |
| 795 | static struct worker *first_worker(struct worker_pool *pool) | 776 | static struct worker *first_idle_worker(struct worker_pool *pool) |
| 796 | { | 777 | { |
| 797 | if (unlikely(list_empty(&pool->idle_list))) | 778 | if (unlikely(list_empty(&pool->idle_list))) |
| 798 | return NULL; | 779 | return NULL; |
| @@ -811,7 +792,7 @@ static struct worker *first_worker(struct worker_pool *pool) | |||
| 811 | */ | 792 | */ |
| 812 | static void wake_up_worker(struct worker_pool *pool) | 793 | static void wake_up_worker(struct worker_pool *pool) |
| 813 | { | 794 | { |
| 814 | struct worker *worker = first_worker(pool); | 795 | struct worker *worker = first_idle_worker(pool); |
| 815 | 796 | ||
| 816 | if (likely(worker)) | 797 | if (likely(worker)) |
| 817 | wake_up_process(worker->task); | 798 | wake_up_process(worker->task); |
| @@ -885,7 +866,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) | |||
| 885 | */ | 866 | */ |
| 886 | if (atomic_dec_and_test(&pool->nr_running) && | 867 | if (atomic_dec_and_test(&pool->nr_running) && |
| 887 | !list_empty(&pool->worklist)) | 868 | !list_empty(&pool->worklist)) |
| 888 | to_wakeup = first_worker(pool); | 869 | to_wakeup = first_idle_worker(pool); |
| 889 | return to_wakeup ? to_wakeup->task : NULL; | 870 | return to_wakeup ? to_wakeup->task : NULL; |
| 890 | } | 871 | } |
| 891 | 872 | ||
| @@ -1621,70 +1602,6 @@ static void worker_leave_idle(struct worker *worker) | |||
| 1621 | list_del_init(&worker->entry); | 1602 | list_del_init(&worker->entry); |
| 1622 | } | 1603 | } |
| 1623 | 1604 | ||
| 1624 | /** | ||
| 1625 | * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it | ||
| 1626 | * @pool: target worker_pool | ||
| 1627 | * | ||
| 1628 | * Bind %current to the cpu of @pool if it is associated and lock @pool. | ||
| 1629 | * | ||
| 1630 | * Works which are scheduled while the cpu is online must at least be | ||
| 1631 | * scheduled to a worker which is bound to the cpu so that if they are | ||
| 1632 | * flushed from cpu callbacks while cpu is going down, they are | ||
| 1633 | * guaranteed to execute on the cpu. | ||
| 1634 | * | ||
| 1635 | * This function is to be used by unbound workers and rescuers to bind | ||
| 1636 | * themselves to the target cpu and may race with cpu going down or | ||
| 1637 | * coming online. kthread_bind() can't be used because it may put the | ||
| 1638 | * worker to already dead cpu and set_cpus_allowed_ptr() can't be used | ||
| 1639 | * verbatim as it's best effort and blocking and pool may be | ||
| 1640 | * [dis]associated in the meantime. | ||
| 1641 | * | ||
| 1642 | * This function tries set_cpus_allowed() and locks pool and verifies the | ||
| 1643 | * binding against %POOL_DISASSOCIATED which is set during | ||
| 1644 | * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker | ||
| 1645 | * enters idle state or fetches works without dropping lock, it can | ||
| 1646 | * guarantee the scheduling requirement described in the first paragraph. | ||
| 1647 | * | ||
| 1648 | * CONTEXT: | ||
| 1649 | * Might sleep. Called without any lock but returns with pool->lock | ||
| 1650 | * held. | ||
| 1651 | * | ||
| 1652 | * Return: | ||
| 1653 | * %true if the associated pool is online (@worker is successfully | ||
| 1654 | * bound), %false if offline. | ||
| 1655 | */ | ||
| 1656 | static bool worker_maybe_bind_and_lock(struct worker_pool *pool) | ||
| 1657 | __acquires(&pool->lock) | ||
| 1658 | { | ||
| 1659 | while (true) { | ||
| 1660 | /* | ||
| 1661 | * The following call may fail, succeed or succeed | ||
| 1662 | * without actually migrating the task to the cpu if | ||
| 1663 | * it races with cpu hotunplug operation. Verify | ||
| 1664 | * against POOL_DISASSOCIATED. | ||
| 1665 | */ | ||
| 1666 | if (!(pool->flags & POOL_DISASSOCIATED)) | ||
| 1667 | set_cpus_allowed_ptr(current, pool->attrs->cpumask); | ||
| 1668 | |||
| 1669 | spin_lock_irq(&pool->lock); | ||
| 1670 | if (pool->flags & POOL_DISASSOCIATED) | ||
| 1671 | return false; | ||
| 1672 | if (task_cpu(current) == pool->cpu && | ||
| 1673 | cpumask_equal(¤t->cpus_allowed, pool->attrs->cpumask)) | ||
| 1674 | return true; | ||
| 1675 | spin_unlock_irq(&pool->lock); | ||
| 1676 | |||
| 1677 | /* | ||
| 1678 | * We've raced with CPU hot[un]plug. Give it a breather | ||
| 1679 | * and retry migration. cond_resched() is required here; | ||
| 1680 | * otherwise, we might deadlock against cpu_stop trying to | ||
| 1681 | * bring down the CPU on non-preemptive kernel. | ||
| 1682 | */ | ||
| 1683 | cpu_relax(); | ||
| 1684 | cond_resched(); | ||
| 1685 | } | ||
| 1686 | } | ||
| 1687 | |||
| 1688 | static struct worker *alloc_worker(void) | 1605 | static struct worker *alloc_worker(void) |
| 1689 | { | 1606 | { |
| 1690 | struct worker *worker; | 1607 | struct worker *worker; |
| @@ -1693,6 +1610,7 @@ static struct worker *alloc_worker(void) | |||
| 1693 | if (worker) { | 1610 | if (worker) { |
| 1694 | INIT_LIST_HEAD(&worker->entry); | 1611 | INIT_LIST_HEAD(&worker->entry); |
| 1695 | INIT_LIST_HEAD(&worker->scheduled); | 1612 | INIT_LIST_HEAD(&worker->scheduled); |
| 1613 | INIT_LIST_HEAD(&worker->node); | ||
| 1696 | /* on creation a worker is in !idle && prep state */ | 1614 | /* on creation a worker is in !idle && prep state */ |
| 1697 | worker->flags = WORKER_PREP; | 1615 | worker->flags = WORKER_PREP; |
| 1698 | } | 1616 | } |
| @@ -1700,12 +1618,68 @@ static struct worker *alloc_worker(void) | |||
| 1700 | } | 1618 | } |
| 1701 | 1619 | ||
| 1702 | /** | 1620 | /** |
| 1621 | * worker_attach_to_pool() - attach a worker to a pool | ||
| 1622 | * @worker: worker to be attached | ||
| 1623 | * @pool: the target pool | ||
| 1624 | * | ||
| 1625 | * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and | ||
| 1626 | * cpu-binding of @worker are kept coordinated with the pool across | ||
| 1627 | * cpu-[un]hotplugs. | ||
| 1628 | */ | ||
| 1629 | static void worker_attach_to_pool(struct worker *worker, | ||
| 1630 | struct worker_pool *pool) | ||
| 1631 | { | ||
| 1632 | mutex_lock(&pool->attach_mutex); | ||
| 1633 | |||
| 1634 | /* | ||
| 1635 | * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any | ||
| 1636 | * online CPUs. It'll be re-applied when any of the CPUs come up. | ||
| 1637 | */ | ||
| 1638 | set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); | ||
| 1639 | |||
| 1640 | /* | ||
| 1641 | * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains | ||
| 1642 | * stable across this function. See the comments above the | ||
| 1643 | * flag definition for details. | ||
| 1644 | */ | ||
| 1645 | if (pool->flags & POOL_DISASSOCIATED) | ||
| 1646 | worker->flags |= WORKER_UNBOUND; | ||
| 1647 | |||
| 1648 | list_add_tail(&worker->node, &pool->workers); | ||
| 1649 | |||
| 1650 | mutex_unlock(&pool->attach_mutex); | ||
| 1651 | } | ||
| 1652 | |||
| 1653 | /** | ||
| 1654 | * worker_detach_from_pool() - detach a worker from its pool | ||
| 1655 | * @worker: worker which is attached to its pool | ||
| 1656 | * @pool: the pool @worker is attached to | ||
| 1657 | * | ||
| 1658 | * Undo the attaching which had been done in worker_attach_to_pool(). The | ||
| 1659 | * caller worker shouldn't access the pool after detaching unless it holds | ||
| 1660 | * another reference to the pool. | ||
| 1661 | */ | ||
| 1662 | static void worker_detach_from_pool(struct worker *worker, | ||
| 1663 | struct worker_pool *pool) | ||
| 1664 | { | ||
| 1665 | struct completion *detach_completion = NULL; | ||
| 1666 | |||
| 1667 | mutex_lock(&pool->attach_mutex); | ||
| 1668 | list_del(&worker->node); | ||
| 1669 | if (list_empty(&pool->workers)) | ||
| 1670 | detach_completion = pool->detach_completion; | ||
| 1671 | mutex_unlock(&pool->attach_mutex); | ||
| 1672 | |||
| 1673 | if (detach_completion) | ||
| 1674 | complete(detach_completion); | ||
| 1675 | } | ||
| 1676 | |||
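worker_detach_from_pool() lets the last worker that detaches wake whoever is waiting in put_unbound_pool() through pool->detach_completion. Below is a rough userspace analog of that handshake, with a pthread condition variable standing in for the kernel's struct completion; all names and fields here are hypothetical, and the sketch ignores everything else the real functions do (cpumask binding, WORKER_UNBOUND, and so on).

    #include <pthread.h>
    #include <stdbool.h>

    struct pool {
        pthread_mutex_t attach_mutex;
        pthread_cond_t  detach_cond;   /* ~pool->detach_completion */
        int             nr_attached;
        bool            draining;      /* set by the destroyer before waiting */
    };

    static void worker_attach(struct pool *pool)
    {
        pthread_mutex_lock(&pool->attach_mutex);
        pool->nr_attached++;           /* ~list_add_tail(&worker->node, ...) */
        pthread_mutex_unlock(&pool->attach_mutex);
    }

    static void worker_detach(struct pool *pool)
    {
        pthread_mutex_lock(&pool->attach_mutex);
        if (--pool->nr_attached == 0 && pool->draining)
            pthread_cond_signal(&pool->detach_cond);   /* ~complete() */
        pthread_mutex_unlock(&pool->attach_mutex);
    }

    /* Destroyer side, mirroring put_unbound_pool(): wait until everyone is gone. */
    static void wait_for_workers_to_detach(struct pool *pool)
    {
        pthread_mutex_lock(&pool->attach_mutex);
        pool->draining = true;
        while (pool->nr_attached)      /* ~wait_for_completion() */
            pthread_cond_wait(&pool->detach_cond, &pool->attach_mutex);
        pthread_mutex_unlock(&pool->attach_mutex);
    }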
| 1677 | /** | ||
| 1703 | * create_worker - create a new workqueue worker | 1678 | * create_worker - create a new workqueue worker |
| 1704 | * @pool: pool the new worker will belong to | 1679 | * @pool: pool the new worker will belong to |
| 1705 | * | 1680 | * |
| 1706 | * Create a new worker which is bound to @pool. The returned worker | 1681 | * Create a new worker which is attached to @pool. The new worker must be |
| 1707 | * can be started by calling start_worker() or destroyed using | 1682 | * started by start_worker(). |
| 1708 | * destroy_worker(). | ||
| 1709 | * | 1683 | * |
| 1710 | * CONTEXT: | 1684 | * CONTEXT: |
| 1711 | * Might sleep. Does GFP_KERNEL allocations. | 1685 | * Might sleep. Does GFP_KERNEL allocations. |
| @@ -1719,19 +1693,8 @@ static struct worker *create_worker(struct worker_pool *pool) | |||
| 1719 | int id = -1; | 1693 | int id = -1; |
| 1720 | char id_buf[16]; | 1694 | char id_buf[16]; |
| 1721 | 1695 | ||
| 1722 | lockdep_assert_held(&pool->manager_mutex); | 1696 | /* ID is needed to determine kthread name */ |
| 1723 | 1697 | id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL); | |
| 1724 | /* | ||
| 1725 | * ID is needed to determine kthread name. Allocate ID first | ||
| 1726 | * without installing the pointer. | ||
| 1727 | */ | ||
| 1728 | idr_preload(GFP_KERNEL); | ||
| 1729 | spin_lock_irq(&pool->lock); | ||
| 1730 | |||
| 1731 | id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT); | ||
| 1732 | |||
| 1733 | spin_unlock_irq(&pool->lock); | ||
| 1734 | idr_preload_end(); | ||
| 1735 | if (id < 0) | 1698 | if (id < 0) |
| 1736 | goto fail; | 1699 | goto fail; |
| 1737 | 1700 | ||
| @@ -1758,33 +1721,14 @@ static struct worker *create_worker(struct worker_pool *pool) | |||
| 1758 | /* prevent userland from meddling with cpumask of workqueue workers */ | 1721 | /* prevent userland from meddling with cpumask of workqueue workers */ |
| 1759 | worker->task->flags |= PF_NO_SETAFFINITY; | 1722 | worker->task->flags |= PF_NO_SETAFFINITY; |
| 1760 | 1723 | ||
| 1761 | /* | 1724 | /* successful, attach the worker to the pool */ |
| 1762 | * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any | 1725 | worker_attach_to_pool(worker, pool); |
| 1763 | * online CPUs. It'll be re-applied when any of the CPUs come up. | ||
| 1764 | */ | ||
| 1765 | set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); | ||
| 1766 | |||
| 1767 | /* | ||
| 1768 | * The caller is responsible for ensuring %POOL_DISASSOCIATED | ||
| 1769 | * remains stable across this function. See the comments above the | ||
| 1770 | * flag definition for details. | ||
| 1771 | */ | ||
| 1772 | if (pool->flags & POOL_DISASSOCIATED) | ||
| 1773 | worker->flags |= WORKER_UNBOUND; | ||
| 1774 | |||
| 1775 | /* successful, commit the pointer to idr */ | ||
| 1776 | spin_lock_irq(&pool->lock); | ||
| 1777 | idr_replace(&pool->worker_idr, worker, worker->id); | ||
| 1778 | spin_unlock_irq(&pool->lock); | ||
| 1779 | 1726 | ||
| 1780 | return worker; | 1727 | return worker; |
| 1781 | 1728 | ||
| 1782 | fail: | 1729 | fail: |
| 1783 | if (id >= 0) { | 1730 | if (id >= 0) |
| 1784 | spin_lock_irq(&pool->lock); | 1731 | ida_simple_remove(&pool->worker_ida, id); |
| 1785 | idr_remove(&pool->worker_idr, id); | ||
| 1786 | spin_unlock_irq(&pool->lock); | ||
| 1787 | } | ||
| 1788 | kfree(worker); | 1732 | kfree(worker); |
| 1789 | return NULL; | 1733 | return NULL; |
| 1790 | } | 1734 | } |
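create_worker() now needs the ID only to build the "kworker/%d:%d" task name, so a plain IDA replaces the IDR that used to map IDs back to worker pointers, and the preload/lock dance around allocation disappears; the fail path simply returns the ID. The toy allocator below only illustrates that smallest-free-ID-plus-release behaviour; the fixed-size bitmap is purely illustrative and is not how ida_simple_get() works internally.

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_WORKERS 64

    static bool id_used[MAX_WORKERS];

    static int id_alloc(void)          /* ~ida_simple_get(&ida, 0, 0, GFP_KERNEL) */
    {
        for (int id = 0; id < MAX_WORKERS; id++) {
            if (!id_used[id]) {
                id_used[id] = true;
                return id;
            }
        }
        return -1;
    }

    static void id_free(int id)        /* ~ida_simple_remove() */
    {
        id_used[id] = false;
    }

    int main(void)
    {
        int id = id_alloc();
        if (id < 0)
            return 1;

        char name[16];
        snprintf(name, sizeof(name), "kworker/%d", id);
        printf("%s\n", name);

        id_free(id);                   /* the create_worker() fail path */
        return 0;
    }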
| @@ -1800,7 +1744,6 @@ fail: | |||
| 1800 | */ | 1744 | */ |
| 1801 | static void start_worker(struct worker *worker) | 1745 | static void start_worker(struct worker *worker) |
| 1802 | { | 1746 | { |
| 1803 | worker->flags |= WORKER_STARTED; | ||
| 1804 | worker->pool->nr_workers++; | 1747 | worker->pool->nr_workers++; |
| 1805 | worker_enter_idle(worker); | 1748 | worker_enter_idle(worker); |
| 1806 | wake_up_process(worker->task); | 1749 | wake_up_process(worker->task); |
| @@ -1818,8 +1761,6 @@ static int create_and_start_worker(struct worker_pool *pool) | |||
| 1818 | { | 1761 | { |
| 1819 | struct worker *worker; | 1762 | struct worker *worker; |
| 1820 | 1763 | ||
| 1821 | mutex_lock(&pool->manager_mutex); | ||
| 1822 | |||
| 1823 | worker = create_worker(pool); | 1764 | worker = create_worker(pool); |
| 1824 | if (worker) { | 1765 | if (worker) { |
| 1825 | spin_lock_irq(&pool->lock); | 1766 | spin_lock_irq(&pool->lock); |
| @@ -1827,8 +1768,6 @@ static int create_and_start_worker(struct worker_pool *pool) | |||
| 1827 | spin_unlock_irq(&pool->lock); | 1768 | spin_unlock_irq(&pool->lock); |
| 1828 | } | 1769 | } |
| 1829 | 1770 | ||
| 1830 | mutex_unlock(&pool->manager_mutex); | ||
| 1831 | |||
| 1832 | return worker ? 0 : -ENOMEM; | 1771 | return worker ? 0 : -ENOMEM; |
| 1833 | } | 1772 | } |
| 1834 | 1773 | ||
| @@ -1836,46 +1775,30 @@ static int create_and_start_worker(struct worker_pool *pool) | |||
| 1836 | * destroy_worker - destroy a workqueue worker | 1775 | * destroy_worker - destroy a workqueue worker |
| 1837 | * @worker: worker to be destroyed | 1776 | * @worker: worker to be destroyed |
| 1838 | * | 1777 | * |
| 1839 | * Destroy @worker and adjust @pool stats accordingly. | 1778 | * Destroy @worker and adjust @pool stats accordingly. The worker should |
| 1779 | * be idle. | ||
| 1840 | * | 1780 | * |
| 1841 | * CONTEXT: | 1781 | * CONTEXT: |
| 1842 | * spin_lock_irq(pool->lock) which is released and regrabbed. | 1782 | * spin_lock_irq(pool->lock). |
| 1843 | */ | 1783 | */ |
| 1844 | static void destroy_worker(struct worker *worker) | 1784 | static void destroy_worker(struct worker *worker) |
| 1845 | { | 1785 | { |
| 1846 | struct worker_pool *pool = worker->pool; | 1786 | struct worker_pool *pool = worker->pool; |
| 1847 | 1787 | ||
| 1848 | lockdep_assert_held(&pool->manager_mutex); | ||
| 1849 | lockdep_assert_held(&pool->lock); | 1788 | lockdep_assert_held(&pool->lock); |
| 1850 | 1789 | ||
| 1851 | /* sanity check frenzy */ | 1790 | /* sanity check frenzy */ |
| 1852 | if (WARN_ON(worker->current_work) || | 1791 | if (WARN_ON(worker->current_work) || |
| 1853 | WARN_ON(!list_empty(&worker->scheduled))) | 1792 | WARN_ON(!list_empty(&worker->scheduled)) || |
| 1793 | WARN_ON(!(worker->flags & WORKER_IDLE))) | ||
| 1854 | return; | 1794 | return; |
| 1855 | 1795 | ||
| 1856 | if (worker->flags & WORKER_STARTED) | 1796 | pool->nr_workers--; |
| 1857 | pool->nr_workers--; | 1797 | pool->nr_idle--; |
| 1858 | if (worker->flags & WORKER_IDLE) | ||
| 1859 | pool->nr_idle--; | ||
| 1860 | |||
| 1861 | /* | ||
| 1862 | * Once WORKER_DIE is set, the kworker may destroy itself at any | ||
| 1863 | * point. Pin to ensure the task stays until we're done with it. | ||
| 1864 | */ | ||
| 1865 | get_task_struct(worker->task); | ||
| 1866 | 1798 | ||
| 1867 | list_del_init(&worker->entry); | 1799 | list_del_init(&worker->entry); |
| 1868 | worker->flags |= WORKER_DIE; | 1800 | worker->flags |= WORKER_DIE; |
| 1869 | 1801 | wake_up_process(worker->task); | |
| 1870 | idr_remove(&pool->worker_idr, worker->id); | ||
| 1871 | |||
| 1872 | spin_unlock_irq(&pool->lock); | ||
| 1873 | |||
| 1874 | kthread_stop(worker->task); | ||
| 1875 | put_task_struct(worker->task); | ||
| 1876 | kfree(worker); | ||
| 1877 | |||
| 1878 | spin_lock_irq(&pool->lock); | ||
| 1879 | } | 1802 | } |
| 1880 | 1803 | ||
| 1881 | static void idle_worker_timeout(unsigned long __pool) | 1804 | static void idle_worker_timeout(unsigned long __pool) |
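destroy_worker() has become cheap and callable with just pool->lock held: it marks the worker WORKER_DIE and wakes it, and the kworker finishes the job itself (detach from the pool, free its struct) on the way out of worker_thread(). A stripped-down pthread sketch of that handoff, under stated assumptions: the names are hypothetical, one per-worker lock stands in for pool->lock, and the dying thread may free its own state because nothing else touches it once the flag is set.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdlib.h>

    struct worker {
        pthread_mutex_t lock;   /* stand-in for pool->lock */
        pthread_cond_t  wake;
        bool            die;    /* ~WORKER_DIE */
    };

    static void *worker_thread(void *arg)
    {
        struct worker *w = arg;

        pthread_mutex_lock(&w->lock);
        while (!w->die)                 /* idle until told to die */
            pthread_cond_wait(&w->wake, &w->lock);
        pthread_mutex_unlock(&w->lock);

        /* ~worker_detach_from_pool() + kfree(worker) at the top of worker_thread() */
        pthread_detach(pthread_self());
        free(w);
        return NULL;
    }

    /* ~destroy_worker(): mark and wake; no kthread_stop(), no freeing here. */
    static void destroy_worker(struct worker *w)
    {
        pthread_mutex_lock(&w->lock);
        w->die = true;
        pthread_cond_signal(&w->wake);
        pthread_mutex_unlock(&w->lock);
    }

    int main(void)
    {
        struct worker *w = calloc(1, sizeof(*w));
        pthread_t tid;

        if (!w)
            return 1;
        pthread_mutex_init(&w->lock, NULL);
        pthread_cond_init(&w->wake, NULL);
        pthread_create(&tid, NULL, worker_thread, w);

        destroy_worker(w);
        pthread_exit(NULL);     /* keep the process alive until the worker exits */
    }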
| @@ -1884,7 +1807,7 @@ static void idle_worker_timeout(unsigned long __pool) | |||
| 1884 | 1807 | ||
| 1885 | spin_lock_irq(&pool->lock); | 1808 | spin_lock_irq(&pool->lock); |
| 1886 | 1809 | ||
| 1887 | if (too_many_workers(pool)) { | 1810 | while (too_many_workers(pool)) { |
| 1888 | struct worker *worker; | 1811 | struct worker *worker; |
| 1889 | unsigned long expires; | 1812 | unsigned long expires; |
| 1890 | 1813 | ||
| @@ -1892,13 +1815,12 @@ static void idle_worker_timeout(unsigned long __pool) | |||
| 1892 | worker = list_entry(pool->idle_list.prev, struct worker, entry); | 1815 | worker = list_entry(pool->idle_list.prev, struct worker, entry); |
| 1893 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; | 1816 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; |
| 1894 | 1817 | ||
| 1895 | if (time_before(jiffies, expires)) | 1818 | if (time_before(jiffies, expires)) { |
| 1896 | mod_timer(&pool->idle_timer, expires); | 1819 | mod_timer(&pool->idle_timer, expires); |
| 1897 | else { | 1820 | break; |
| 1898 | /* it's been idle for too long, wake up manager */ | ||
| 1899 | pool->flags |= POOL_MANAGE_WORKERS; | ||
| 1900 | wake_up_worker(pool); | ||
| 1901 | } | 1821 | } |
| 1822 | |||
| 1823 | destroy_worker(worker); | ||
| 1902 | } | 1824 | } |
| 1903 | 1825 | ||
| 1904 | spin_unlock_irq(&pool->lock); | 1826 | spin_unlock_irq(&pool->lock); |
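With the manager out of the reaping business, idle_worker_timeout() now loops on its own: while there are too many workers it examines the least recently idle one at the tail of idle_list, and either re-arms the timer at that worker's expiry time or destroys it and moves on to the next. The compact sketch below models that loop with an oldest-first array standing in for the idle list and a plain counter standing in for jiffies; every name and constant is illustrative only. With the sample values it reaps one worker and re-arms the timer at 1100.

    #include <stdbool.h>
    #include <stdio.h>

    #define IDLE_TIMEOUT 300   /* ~IDLE_WORKER_TIMEOUT, arbitrary units */
    #define MIN_IDLE     2     /* keep at least this many idle workers */

    struct pool {
        unsigned long last_active[8];  /* oldest first: the tail of idle_list */
        int           nr_idle;
        unsigned long timer_expires;   /* ~mod_timer() target, 0 = not re-armed */
    };

    static bool too_many_workers(const struct pool *p)
    {
        return p->nr_idle > MIN_IDLE;
    }

    static void idle_timeout(struct pool *p, unsigned long now)
    {
        while (too_many_workers(p)) {
            unsigned long expires = p->last_active[0] + IDLE_TIMEOUT;

            if (now < expires) {            /* ~time_before(jiffies, expires) */
                p->timer_expires = expires; /* ~mod_timer() */
                break;
            }

            /* ~destroy_worker(): reap the longest-idle worker */
            for (int i = 1; i < p->nr_idle; i++)
                p->last_active[i - 1] = p->last_active[i];
            p->nr_idle--;
        }
    }

    int main(void)
    {
        struct pool p = { .last_active = { 100, 800, 900, 950 }, .nr_idle = 4 };

        idle_timeout(&p, 1000);
        printf("idle left: %d, timer at %lu\n", p.nr_idle, p.timer_expires);
        return 0;
    }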
| @@ -1916,6 +1838,12 @@ static void send_mayday(struct work_struct *work) | |||
| 1916 | 1838 | ||
| 1917 | /* mayday mayday mayday */ | 1839 | /* mayday mayday mayday */ |
| 1918 | if (list_empty(&pwq->mayday_node)) { | 1840 | if (list_empty(&pwq->mayday_node)) { |
| 1841 | /* | ||
| 1842 | * If @pwq is for an unbound wq, its base ref may be put at | ||
| 1843 | * any time due to an attribute change. Pin @pwq until the | ||
| 1844 | * rescuer is done with it. | ||
| 1845 | */ | ||
| 1846 | get_pwq(pwq); | ||
| 1919 | list_add_tail(&pwq->mayday_node, &wq->maydays); | 1847 | list_add_tail(&pwq->mayday_node, &wq->maydays); |
| 1920 | wake_up_process(wq->rescuer->task); | 1848 | wake_up_process(wq->rescuer->task); |
| 1921 | } | 1849 | } |
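send_mayday() now pins the pwq with get_pwq() before putting it on wq->maydays, and the rescuer drops that reference once it is done, so an unbound pwq whose base reference goes away mid-flight cannot be freed while queued. The sketch below shows only that refcount rule in a single-threaded userspace form; the kernel does all of this under pool->lock and wq_mayday_lock, and every name here is a stand-in rather than the real get_pwq()/put_pwq().

    #include <stdlib.h>

    struct pwq {
        int refcnt;
        /* ... pending work, pool pointer, etc. ... */
    };

    static void get_pwq(struct pwq *pwq)
    {
        pwq->refcnt++;
    }

    static void put_pwq(struct pwq *pwq)
    {
        if (--pwq->refcnt == 0)
            free(pwq);                  /* last reference gone */
    }

    /* Producer: pin before publishing on the mayday list. */
    static void send_mayday(struct pwq *pwq, struct pwq **mayday_slot)
    {
        get_pwq(pwq);                   /* keep it alive for the rescuer */
        *mayday_slot = pwq;
    }

    /* Consumer (rescuer): unpublish, process, then drop the pin. */
    static void rescue(struct pwq **mayday_slot)
    {
        struct pwq *pwq = *mayday_slot;

        *mayday_slot = NULL;
        /* ... process the pwq's pending work ... */
        put_pwq(pwq);
    }

    int main(void)
    {
        struct pwq *slot = NULL;
        struct pwq *pwq = calloc(1, sizeof(*pwq));

        if (!pwq)
            return 1;
        pwq->refcnt = 1;   /* base reference held by the workqueue */
        send_mayday(pwq, &slot);
        put_pwq(pwq);      /* base ref dropped, e.g. by an attribute change */
        rescue(&slot);     /* still safe: the mayday pin kept it alive */
        return 0;
    }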
| @@ -2011,44 +1939,6 @@ restart: | |||
| 2011 | } | 1939 | } |
| 2012 | 1940 | ||
| 2013 | /** | 1941 | /** |
| 2014 | * maybe_destroy_worker - destroy workers which have been idle for a while | ||
| 2015 | * @pool: pool to destroy workers for | ||
| 2016 | * | ||
| 2017 | * Destroy @pool workers which have been idle for longer than | ||
| 2018 | * IDLE_WORKER_TIMEOUT. | ||
| 2019 | * | ||
| 2020 | * LOCKING: | ||
| 2021 | * spin_lock_irq(pool->lock) which may be released and regrabbed | ||
| 2022 | * multiple times. Called only from manager. | ||
| 2023 | * | ||
| 2024 | * Return: | ||
| 2025 | * %false if no action was taken and pool->lock stayed locked, %true | ||
| 2026 | * otherwise. | ||
| 2027 | */ | ||
| 2028 | static bool maybe_destroy_workers(struct worker_pool *pool) | ||
| 2029 | { | ||
| 2030 | bool ret = false; | ||
| 2031 | |||
| 2032 | while (too_many_workers(pool)) { | ||
| 2033 | struct worker *worker; | ||
| 2034 | unsigned long expires; | ||
| 2035 | |||
| 2036 | worker = list_entry(pool->idle_list.prev, struct worker, entry); | ||
| 2037 | expires = worker->last_active + IDLE_WORKER_TIMEOUT; | ||
| 2038 | |||
| 2039 | if (time_before(jiffies, expires)) { | ||
| 2040 | mod_timer(&pool->idle_timer, expires); | ||
| 2041 | break; | ||
| 2042 | } | ||
| 2043 | |||
| 2044 | destroy_worker(worker); | ||
| 2045 | ret = true; | ||
| 2046 | } | ||
| 2047 | |||
| 2048 | return ret; | ||
| 2049 | } | ||
| 2050 | |||
| 2051 | /** | ||
| 2052 | * manage_workers - manage worker pool | 1942 | * manage_workers - manage worker pool |
| 2053 | * @worker: self | 1943 | * @worker: self |
| 2054 | * | 1944 | * |
| @@ -2077,8 +1967,6 @@ static bool manage_workers(struct worker *worker) | |||
| 2077 | bool ret = false; | 1967 | bool ret = false; |
| 2078 | 1968 | ||
| 2079 | /* | 1969 | /* |
| 2080 | * Managership is governed by two mutexes - manager_arb and | ||
| 2081 | * manager_mutex. manager_arb handles arbitration of manager role. | ||
| 2082 | * Anyone who successfully grabs manager_arb wins the arbitration | 1970 | * Anyone who successfully grabs manager_arb wins the arbitration |
| 2083 | * and becomes the manager. mutex_trylock() on pool->manager_arb | 1971 | * and becomes the manager. mutex_trylock() on pool->manager_arb |
| 2084 | * failure while holding pool->lock reliably indicates that someone | 1972 | * failure while holding pool->lock reliably indicates that someone |
| @@ -2087,40 +1975,12 @@ static bool manage_workers(struct worker *worker) | |||
| 2087 | * grabbing manager_arb is responsible for actually performing | 1975 | * grabbing manager_arb is responsible for actually performing |
| 2088 | * manager duties. If manager_arb is grabbed and released without | 1976 | * manager duties. If manager_arb is grabbed and released without |
| 2089 | * actual management, the pool may stall indefinitely. | 1977 | * actual management, the pool may stall indefinitely. |
| 2090 | * | ||
| 2091 | * manager_mutex is used for exclusion of actual management | ||
| 2092 | * operations. The holder of manager_mutex can be sure that none | ||
| 2093 | * of management operations, including creation and destruction of | ||
| 2094 | * workers, won't take place until the mutex is released. Because | ||
| 2095 | * manager_mutex doesn't interfere with manager role arbitration, | ||
| 2096 | * it is guaranteed that the pool's management, while may be | ||
| 2097 | * delayed, won't be disturbed by someone else grabbing | ||
| 2098 | * manager_mutex. | ||
| 2099 | */ | 1978 | */ |
| 2100 | if (!mutex_trylock(&pool->manager_arb)) | 1979 | if (!mutex_trylock(&pool->manager_arb)) |
| 2101 | return ret; | 1980 | return ret; |
| 2102 | 1981 | ||
| 2103 | /* | ||
| 2104 | * With manager arbitration won, manager_mutex would be free in | ||
| 2105 | * most cases. trylock first without dropping @pool->lock. | ||
| 2106 | */ | ||
| 2107 | if (unlikely(!mutex_trylock(&pool->manager_mutex))) { | ||
| 2108 | spin_unlock_irq(&pool->lock); | ||
| 2109 | mutex_lock(&pool->manager_mutex); | ||
| 2110 | spin_lock_irq(&pool->lock); | ||
| 2111 | ret = true; | ||
| 2112 | } | ||
| 2113 | |||
| 2114 | pool->flags &= ~POOL_MANAGE_WORKERS; | ||
| 2115 | |||
| 2116 | /* | ||
| 2117 | * Destroy and then create so that may_start_working() is true | ||
| 2118 | * on return. | ||
| 2119 | */ | ||
| 2120 | ret |= maybe_destroy_workers(pool); | ||
| 2121 | ret |= maybe_create_worker(pool); | 1982 | ret |= maybe_create_worker(pool); |
| 2122 | 1983 | ||
| 2123 | mutex_unlock(&pool->manager_mutex); | ||
| 2124 | mutex_unlock(&pool->manager_arb); | 1984 | mutex_unlock(&pool->manager_arb); |
| 2125 | return ret; | 1985 | return ret; |
| 2126 | } | 1986 | } |
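With manager_mutex gone, manage_workers() reduces to the arbitration itself: whoever wins the manager_arb trylock creates workers, everyone else returns immediately and keeps processing work. A bare-bones pthread_mutex_trylock sketch of that pattern follows; the names are hypothetical and the worker-creation step is elided.

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t manager_arb = PTHREAD_MUTEX_INITIALIZER;

    static void maybe_create_worker(void)
    {
        /* ... retry worker creation until the pool can start working ... */
    }

    /* Returns true if this caller performed the manager duties. */
    static bool manage_workers(void)
    {
        if (pthread_mutex_trylock(&manager_arb) != 0)
            return false;    /* someone else is already managing */

        maybe_create_worker();
        pthread_mutex_unlock(&manager_arb);
        return true;
    }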
| @@ -2308,6 +2168,11 @@ woke_up: | |||
| 2308 | spin_unlock_irq(&pool->lock); | 2168 | spin_unlock_irq(&pool->lock); |
| 2309 | WARN_ON_ONCE(!list_empty(&worker->entry)); | 2169 | WARN_ON_ONCE(!list_empty(&worker->entry)); |
| 2310 | worker->task->flags &= ~PF_WQ_WORKER; | 2170 | worker->task->flags &= ~PF_WQ_WORKER; |
| 2171 | |||
| 2172 | set_task_comm(worker->task, "kworker/dying"); | ||
| 2173 | ida_simple_remove(&pool->worker_ida, worker->id); | ||
| 2174 | worker_detach_from_pool(worker, pool); | ||
| 2175 | kfree(worker); | ||
| 2311 | return 0; | 2176 | return 0; |
| 2312 | } | 2177 | } |
| 2313 | 2178 | ||
| @@ -2355,9 +2220,6 @@ recheck: | |||
| 2355 | 2220 | ||
| 2356 | worker_set_flags(worker, WORKER_PREP, false); | 2221 | worker_set_flags(worker, WORKER_PREP, false); |
| 2357 | sleep: | 2222 | sleep: |
| 2358 | if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker)) | ||
| 2359 | goto recheck; | ||
| 2360 | |||
| 2361 | /* | 2223 | /* |
| 2362 | * pool->lock is held and there's no work to process and no need to | 2224 | * pool->lock is held and there's no work to process and no need to |
| 2363 | * manage, sleep. Workers are woken up only while holding | 2225 | * manage, sleep. Workers are woken up only while holding |
| @@ -2398,6 +2260,7 @@ static int rescuer_thread(void *__rescuer) | |||
| 2398 | struct worker *rescuer = __rescuer; | 2260 | struct worker *rescuer = __rescuer; |
| 2399 | struct workqueue_struct *wq = rescuer->rescue_wq; | 2261 | struct workqueue_struct *wq = rescuer->rescue_wq; |
| 2400 | struct list_head *scheduled = &rescuer->scheduled; | 2262 | struct list_head *scheduled = &rescuer->scheduled; |
| 2263 | bool should_stop; | ||
| 2401 | 2264 | ||
| 2402 | set_user_nice(current, RESCUER_NICE_LEVEL); | 2265 | set_user_nice(current, RESCUER_NICE_LEVEL); |
| 2403 | 2266 | ||
| @@ -2409,11 +2272,15 @@ static int rescuer_thread(void *__rescuer) | |||
| 2409 | repeat: | 2272 | repeat: |
| 2410 | set_current_state(TASK_INTERRUPTIBLE); | 2273 | set_current_state(TASK_INTERRUPTIBLE); |
| 2411 | 2274 | ||
| 2412 | if (kthread_should_stop()) { | 2275 | /* |
| 2413 | __set_current_state(TASK_RUNNING); | 2276 | * By the time the rescuer is requested to stop, the workqueue |
| 2414 | rescuer->task->flags &= ~PF_WQ_WORKER; | 2277 | * shouldn't have any work pending, but @wq->maydays may still have |
| 2415 | return 0; | 2278 | * pwq(s) queued. This can happen when non-rescuer workers consume |
| 2416 | } | 2279 | * all the work items before the rescuer gets to them. Go through |
| 2280 | * @wq->maydays processing before acting on should_stop so that the | ||
| 2281 | * list is always empty on exit. | ||
| 2282 | */ | ||
| 2283 | should_stop = kthread_should_stop(); | ||
| 2417 | 2284 | ||
| 2418 | /* see whether any pwq is asking for help */ | 2285 | /* see whether any pwq is asking for help */ |
| 2419 | spin_lock_irq(&wq_mayday_lock); | 2286 | spin_lock_irq(&wq_mayday_lock); |
| @@ -2429,8 +2296,9 @@ repeat: | |||
| 2429 | 2296 | ||
| 2430 | spin_unlock_irq(&wq_mayday_lock); | 2297 | spin_unlock_irq(&wq_mayday_lock); |
| 2431 | 2298 | ||
| 2432 | /* migrate to the target cpu if possible */ | 2299 | worker_attach_to_pool(rescuer, pool); |
| 2433 | worker_maybe_bind_and_lock(pool); | 2300 | |
| 2301 | spin_lock_irq(&pool->lock); | ||
| 2434 | rescuer->pool = pool; | 2302 | rescuer->pool = pool; |
| 2435 | 2303 | ||
| 2436 | /* | 2304 | /* |
| @@ -2443,6 +2311,17 @@ repeat: | |||
| 2443 | move_linked_works(work, scheduled, &n); | 2311 | move_linked_works(work, scheduled, &n); |
| 2444 | 2312 | ||
| 2445 | process_scheduled_works(rescuer); | 2313 | process_scheduled_works(rescuer); |
| 2314 | spin_unlock_irq(&pool->lock); | ||
| 2315 | |||
| 2316 | worker_detach_from_pool(rescuer, pool); | ||
| 2317 | |||
| 2318 | spin_lock_irq(&pool->lock); | ||
| 2319 | |||
| 2320 | /* | ||
| 2321 | * Put the reference grabbed by send_mayday(). @pool won't | ||
| 2322 | * go away while we're holding its lock. | ||
| 2323 | */ | ||
| 2324 | put_pwq(pwq); | ||
| 2446 | 2325 | ||
| 2447 | /* | 2326 | /* |
| 2448 | * Leave this pool. If keep_working() is %true, notify a | 2327 | * Leave this pool. If keep_working() is %true, notify a |
| @@ -2459,6 +2338,12 @@ repeat: | |||
| 2459 | 2338 | ||
| 2460 | spin_unlock_irq(&wq_mayday_lock); | 2339 | spin_unlock_irq(&wq_mayday_lock); |
| 2461 | 2340 | ||
| 2341 | if (should_stop) { | ||
| 2342 | __set_current_state(TASK_RUNNING); | ||
| 2343 | rescuer->task->flags &= ~PF_WQ_WORKER; | ||
| 2344 | return 0; | ||
| 2345 | } | ||
| 2346 | |||
| 2462 | /* rescuers should never participate in concurrency management */ | 2347 | /* rescuers should never participate in concurrency management */ |
| 2463 | WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); | 2348 | WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); |
| 2464 | schedule(); | 2349 | schedule(); |
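The rescuer's exit path changed from "return as soon as kthread_should_stop()" to "sample the stop request, drain wq->maydays, then act on the sample", which guarantees the mayday list is empty when the rescuer exits. A tiny single-threaded sketch of that ordering, where the array and flag are stand-ins for wq->maydays and kthread_should_stop() and no locking or sleeping is modelled:

    #include <stdbool.h>
    #include <stdio.h>

    static int  maydays[4] = { 3, 7 };  /* pending mayday requests (pwq ids) */
    static int  nr_maydays = 2;
    static bool stop_requested = true;  /* ~kthread_should_stop() */

    static void rescuer_loop(void)
    {
        for (;;) {
            /* Sample first; do not act on it until the list is drained. */
            bool should_stop = stop_requested;

            while (nr_maydays > 0)
                printf("rescuing pwq %d\n", maydays[--nr_maydays]);

            if (should_stop)
                return;     /* the mayday list is guaranteed empty here */

            /* ... otherwise sleep until woken by send_mayday() ... */
        }
    }

    int main(void)
    {
        rescuer_loop();
        return 0;
    }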
| @@ -3527,9 +3412,10 @@ static int init_worker_pool(struct worker_pool *pool) | |||
| 3527 | (unsigned long)pool); | 3412 | (unsigned long)pool); |
| 3528 | 3413 | ||
| 3529 | mutex_init(&pool->manager_arb); | 3414 | mutex_init(&pool->manager_arb); |
| 3530 | mutex_init(&pool->manager_mutex); | 3415 | mutex_init(&pool->attach_mutex); |
| 3531 | idr_init(&pool->worker_idr); | 3416 | INIT_LIST_HEAD(&pool->workers); |
| 3532 | 3417 | ||
| 3418 | ida_init(&pool->worker_ida); | ||
| 3533 | INIT_HLIST_NODE(&pool->hash_node); | 3419 | INIT_HLIST_NODE(&pool->hash_node); |
| 3534 | pool->refcnt = 1; | 3420 | pool->refcnt = 1; |
| 3535 | 3421 | ||
| @@ -3544,7 +3430,7 @@ static void rcu_free_pool(struct rcu_head *rcu) | |||
| 3544 | { | 3430 | { |
| 3545 | struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); | 3431 | struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); |
| 3546 | 3432 | ||
| 3547 | idr_destroy(&pool->worker_idr); | 3433 | ida_destroy(&pool->worker_ida); |
| 3548 | free_workqueue_attrs(pool->attrs); | 3434 | free_workqueue_attrs(pool->attrs); |
| 3549 | kfree(pool); | 3435 | kfree(pool); |
| 3550 | } | 3436 | } |
| @@ -3562,6 +3448,7 @@ static void rcu_free_pool(struct rcu_head *rcu) | |||
| 3562 | */ | 3448 | */ |
| 3563 | static void put_unbound_pool(struct worker_pool *pool) | 3449 | static void put_unbound_pool(struct worker_pool *pool) |
| 3564 | { | 3450 | { |
| 3451 | DECLARE_COMPLETION_ONSTACK(detach_completion); | ||
| 3565 | struct worker *worker; | 3452 | struct worker *worker; |
| 3566 | 3453 | ||
| 3567 | lockdep_assert_held(&wq_pool_mutex); | 3454 | lockdep_assert_held(&wq_pool_mutex); |
| @@ -3582,18 +3469,24 @@ static void put_unbound_pool(struct worker_pool *pool) | |||
| 3582 | /* | 3469 | /* |
| 3583 | * Become the manager and destroy all workers. Grabbing | 3470 | * Become the manager and destroy all workers. Grabbing |
| 3584 | * manager_arb prevents @pool's workers from blocking on | 3471 | * manager_arb prevents @pool's workers from blocking on |
| 3585 | * manager_mutex. | 3472 | * attach_mutex. |
| 3586 | */ | 3473 | */ |
| 3587 | mutex_lock(&pool->manager_arb); | 3474 | mutex_lock(&pool->manager_arb); |
| 3588 | mutex_lock(&pool->manager_mutex); | ||
| 3589 | spin_lock_irq(&pool->lock); | ||
| 3590 | 3475 | ||
| 3591 | while ((worker = first_worker(pool))) | 3476 | spin_lock_irq(&pool->lock); |
| 3477 | while ((worker = first_idle_worker(pool))) | ||
| 3592 | destroy_worker(worker); | 3478 | destroy_worker(worker); |
| 3593 | WARN_ON(pool->nr_workers || pool->nr_idle); | 3479 | WARN_ON(pool->nr_workers || pool->nr_idle); |
| 3594 | |||
| 3595 | spin_unlock_irq(&pool->lock); | 3480 | spin_unlock_irq(&pool->lock); |
| 3596 | mutex_unlock(&pool->manager_mutex); | 3481 | |
| 3482 | mutex_lock(&pool->attach_mutex); | ||
| 3483 | if (!list_empty(&pool->workers)) | ||
| 3484 | pool->detach_completion = &detach_completion; | ||
| 3485 | mutex_unlock(&pool->attach_mutex); | ||
| 3486 | |||
| 3487 | if (pool->detach_completion) | ||
| 3488 | wait_for_completion(pool->detach_completion); | ||
| 3489 | |||
| 3597 | mutex_unlock(&pool->manager_arb); | 3490 | mutex_unlock(&pool->manager_arb); |
| 3598 | 3491 | ||
| 3599 | /* shut down the timers */ | 3492 | /* shut down the timers */ |
| @@ -3639,9 +3532,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) | |||
| 3639 | if (!pool || init_worker_pool(pool) < 0) | 3532 | if (!pool || init_worker_pool(pool) < 0) |
| 3640 | goto fail; | 3533 | goto fail; |
| 3641 | 3534 | ||
| 3642 | if (workqueue_freezing) | ||
| 3643 | pool->flags |= POOL_FREEZING; | ||
| 3644 | |||
| 3645 | lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ | 3535 | lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ |
| 3646 | copy_workqueue_attrs(pool->attrs, attrs); | 3536 | copy_workqueue_attrs(pool->attrs, attrs); |
| 3647 | 3537 | ||
| @@ -3748,7 +3638,12 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) | |||
| 3748 | 3638 | ||
| 3749 | spin_lock_irq(&pwq->pool->lock); | 3639 | spin_lock_irq(&pwq->pool->lock); |
| 3750 | 3640 | ||
| 3751 | if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { | 3641 | /* |
| 3642 | * During [un]freezing, the caller is responsible for ensuring that | ||
| 3643 | * this function is called at least once after @workqueue_freezing | ||
| 3644 | * is updated and visible. | ||
| 3645 | */ | ||
| 3646 | if (!freezable || !workqueue_freezing) { | ||
| 3752 | pwq->max_active = wq->saved_max_active; | 3647 | pwq->max_active = wq->saved_max_active; |
| 3753 | 3648 | ||
| 3754 | while (!list_empty(&pwq->delayed_works) && | 3649 | while (!list_empty(&pwq->delayed_works) && |
| @@ -4080,17 +3975,13 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, | |||
| 4080 | * Let's determine what needs to be done. If the target cpumask is | 3975 | * Let's determine what needs to be done. If the target cpumask is |
| 4081 | * different from wq's, we need to compare it to @pwq's and create | 3976 | * different from wq's, we need to compare it to @pwq's and create |
| 4082 | * a new one if they don't match. If the target cpumask equals | 3977 | * a new one if they don't match. If the target cpumask equals |
| 4083 | * wq's, the default pwq should be used. If @pwq is already the | 3978 | * wq's, the default pwq should be used. |
| 4084 | * default one, nothing to do; otherwise, install the default one. | ||
| 4085 | */ | 3979 | */ |
| 4086 | if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { | 3980 | if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { |
| 4087 | if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) | 3981 | if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) |
| 4088 | goto out_unlock; | 3982 | goto out_unlock; |
| 4089 | } else { | 3983 | } else { |
| 4090 | if (pwq == wq->dfl_pwq) | 3984 | goto use_dfl_pwq; |
| 4091 | goto out_unlock; | ||
| 4092 | else | ||
| 4093 | goto use_dfl_pwq; | ||
| 4094 | } | 3985 | } |
| 4095 | 3986 | ||
| 4096 | mutex_unlock(&wq->mutex); | 3987 | mutex_unlock(&wq->mutex); |
| @@ -4098,9 +3989,10 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, | |||
| 4098 | /* create a new pwq */ | 3989 | /* create a new pwq */ |
| 4099 | pwq = alloc_unbound_pwq(wq, target_attrs); | 3990 | pwq = alloc_unbound_pwq(wq, target_attrs); |
| 4100 | if (!pwq) { | 3991 | if (!pwq) { |
| 4101 | pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", | 3992 | pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", |
| 4102 | wq->name); | 3993 | wq->name); |
| 4103 | goto out_unlock; | 3994 | mutex_lock(&wq->mutex); |
| 3995 | goto use_dfl_pwq; | ||
| 4104 | } | 3996 | } |
| 4105 | 3997 | ||
| 4106 | /* | 3998 | /* |
| @@ -4575,28 +4467,27 @@ static void wq_unbind_fn(struct work_struct *work) | |||
| 4575 | int cpu = smp_processor_id(); | 4467 | int cpu = smp_processor_id(); |
| 4576 | struct worker_pool *pool; | 4468 | struct worker_pool *pool; |
| 4577 | struct worker *worker; | 4469 | struct worker *worker; |
| 4578 | int wi; | ||
| 4579 | 4470 | ||
| 4580 | for_each_cpu_worker_pool(pool, cpu) { | 4471 | for_each_cpu_worker_pool(pool, cpu) { |
| 4581 | WARN_ON_ONCE(cpu != smp_processor_id()); | 4472 | WARN_ON_ONCE(cpu != smp_processor_id()); |
| 4582 | 4473 | ||
| 4583 | mutex_lock(&pool->manager_mutex); | 4474 | mutex_lock(&pool->attach_mutex); |
| 4584 | spin_lock_irq(&pool->lock); | 4475 | spin_lock_irq(&pool->lock); |
| 4585 | 4476 | ||
| 4586 | /* | 4477 | /* |
| 4587 | * We've blocked all manager operations. Make all workers | 4478 | * We've blocked all attach/detach operations. Make all workers |
| 4588 | * unbound and set DISASSOCIATED. Before this, all workers | 4479 | * unbound and set DISASSOCIATED. Before this, all workers |
| 4589 | * except for the ones which are still executing works from | 4480 | * except for the ones which are still executing works from |
| 4590 | * before the last CPU down must be on the cpu. After | 4481 | * before the last CPU down must be on the cpu. After |
| 4591 | * this, they may become diasporas. | 4482 | * this, they may become diasporas. |
| 4592 | */ | 4483 | */ |
| 4593 | for_each_pool_worker(worker, wi, pool) | 4484 | for_each_pool_worker(worker, pool) |
| 4594 | worker->flags |= WORKER_UNBOUND; | 4485 | worker->flags |= WORKER_UNBOUND; |
| 4595 | 4486 | ||
| 4596 | pool->flags |= POOL_DISASSOCIATED; | 4487 | pool->flags |= POOL_DISASSOCIATED; |
| 4597 | 4488 | ||
| 4598 | spin_unlock_irq(&pool->lock); | 4489 | spin_unlock_irq(&pool->lock); |
| 4599 | mutex_unlock(&pool->manager_mutex); | 4490 | mutex_unlock(&pool->attach_mutex); |
| 4600 | 4491 | ||
| 4601 | /* | 4492 | /* |
| 4602 | * Call schedule() so that we cross rq->lock and thus can | 4493 | * Call schedule() so that we cross rq->lock and thus can |
| @@ -4636,9 +4527,8 @@ static void wq_unbind_fn(struct work_struct *work) | |||
| 4636 | static void rebind_workers(struct worker_pool *pool) | 4527 | static void rebind_workers(struct worker_pool *pool) |
| 4637 | { | 4528 | { |
| 4638 | struct worker *worker; | 4529 | struct worker *worker; |
| 4639 | int wi; | ||
| 4640 | 4530 | ||
| 4641 | lockdep_assert_held(&pool->manager_mutex); | 4531 | lockdep_assert_held(&pool->attach_mutex); |
| 4642 | 4532 | ||
| 4643 | /* | 4533 | /* |
| 4644 | * Restore CPU affinity of all workers. As all idle workers should | 4534 | * Restore CPU affinity of all workers. As all idle workers should |
| @@ -4647,13 +4537,13 @@ static void rebind_workers(struct worker_pool *pool) | |||
| 4647 | * of all workers first and then clear UNBOUND. As we're called | 4537 | * of all workers first and then clear UNBOUND. As we're called |
| 4648 | * from CPU_ONLINE, the following shouldn't fail. | 4538 | * from CPU_ONLINE, the following shouldn't fail. |
| 4649 | */ | 4539 | */ |
| 4650 | for_each_pool_worker(worker, wi, pool) | 4540 | for_each_pool_worker(worker, pool) |
| 4651 | WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, | 4541 | WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, |
| 4652 | pool->attrs->cpumask) < 0); | 4542 | pool->attrs->cpumask) < 0); |
| 4653 | 4543 | ||
| 4654 | spin_lock_irq(&pool->lock); | 4544 | spin_lock_irq(&pool->lock); |
| 4655 | 4545 | ||
| 4656 | for_each_pool_worker(worker, wi, pool) { | 4546 | for_each_pool_worker(worker, pool) { |
| 4657 | unsigned int worker_flags = worker->flags; | 4547 | unsigned int worker_flags = worker->flags; |
| 4658 | 4548 | ||
| 4659 | /* | 4549 | /* |
| @@ -4705,9 +4595,8 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) | |||
| 4705 | { | 4595 | { |
| 4706 | static cpumask_t cpumask; | 4596 | static cpumask_t cpumask; |
| 4707 | struct worker *worker; | 4597 | struct worker *worker; |
| 4708 | int wi; | ||
| 4709 | 4598 | ||
| 4710 | lockdep_assert_held(&pool->manager_mutex); | 4599 | lockdep_assert_held(&pool->attach_mutex); |
| 4711 | 4600 | ||
| 4712 | /* is @cpu allowed for @pool? */ | 4601 | /* is @cpu allowed for @pool? */ |
| 4713 | if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) | 4602 | if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) |
| @@ -4719,7 +4608,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) | |||
| 4719 | return; | 4608 | return; |
| 4720 | 4609 | ||
| 4721 | /* as we're called from CPU_ONLINE, the following shouldn't fail */ | 4610 | /* as we're called from CPU_ONLINE, the following shouldn't fail */ |
| 4722 | for_each_pool_worker(worker, wi, pool) | 4611 | for_each_pool_worker(worker, pool) |
| 4723 | WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, | 4612 | WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, |
| 4724 | pool->attrs->cpumask) < 0); | 4613 | pool->attrs->cpumask) < 0); |
| 4725 | } | 4614 | } |
| @@ -4752,7 +4641,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, | |||
| 4752 | mutex_lock(&wq_pool_mutex); | 4641 | mutex_lock(&wq_pool_mutex); |
| 4753 | 4642 | ||
| 4754 | for_each_pool(pool, pi) { | 4643 | for_each_pool(pool, pi) { |
| 4755 | mutex_lock(&pool->manager_mutex); | 4644 | mutex_lock(&pool->attach_mutex); |
| 4756 | 4645 | ||
| 4757 | if (pool->cpu == cpu) { | 4646 | if (pool->cpu == cpu) { |
| 4758 | spin_lock_irq(&pool->lock); | 4647 | spin_lock_irq(&pool->lock); |
| @@ -4764,7 +4653,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, | |||
| 4764 | restore_unbound_workers_cpumask(pool, cpu); | 4653 | restore_unbound_workers_cpumask(pool, cpu); |
| 4765 | } | 4654 | } |
| 4766 | 4655 | ||
| 4767 | mutex_unlock(&pool->manager_mutex); | 4656 | mutex_unlock(&pool->attach_mutex); |
| 4768 | } | 4657 | } |
| 4769 | 4658 | ||
| 4770 | /* update NUMA affinity of unbound workqueues */ | 4659 | /* update NUMA affinity of unbound workqueues */ |
| @@ -4863,24 +4752,14 @@ EXPORT_SYMBOL_GPL(work_on_cpu); | |||
| 4863 | */ | 4752 | */ |
| 4864 | void freeze_workqueues_begin(void) | 4753 | void freeze_workqueues_begin(void) |
| 4865 | { | 4754 | { |
| 4866 | struct worker_pool *pool; | ||
| 4867 | struct workqueue_struct *wq; | 4755 | struct workqueue_struct *wq; |
| 4868 | struct pool_workqueue *pwq; | 4756 | struct pool_workqueue *pwq; |
| 4869 | int pi; | ||
| 4870 | 4757 | ||
| 4871 | mutex_lock(&wq_pool_mutex); | 4758 | mutex_lock(&wq_pool_mutex); |
| 4872 | 4759 | ||
| 4873 | WARN_ON_ONCE(workqueue_freezing); | 4760 | WARN_ON_ONCE(workqueue_freezing); |
| 4874 | workqueue_freezing = true; | 4761 | workqueue_freezing = true; |
| 4875 | 4762 | ||
| 4876 | /* set FREEZING */ | ||
| 4877 | for_each_pool(pool, pi) { | ||
| 4878 | spin_lock_irq(&pool->lock); | ||
| 4879 | WARN_ON_ONCE(pool->flags & POOL_FREEZING); | ||
| 4880 | pool->flags |= POOL_FREEZING; | ||
| 4881 | spin_unlock_irq(&pool->lock); | ||
| 4882 | } | ||
| 4883 | |||
| 4884 | list_for_each_entry(wq, &workqueues, list) { | 4763 | list_for_each_entry(wq, &workqueues, list) { |
| 4885 | mutex_lock(&wq->mutex); | 4764 | mutex_lock(&wq->mutex); |
| 4886 | for_each_pwq(pwq, wq) | 4765 | for_each_pwq(pwq, wq) |
| @@ -4950,21 +4829,13 @@ void thaw_workqueues(void) | |||
| 4950 | { | 4829 | { |
| 4951 | struct workqueue_struct *wq; | 4830 | struct workqueue_struct *wq; |
| 4952 | struct pool_workqueue *pwq; | 4831 | struct pool_workqueue *pwq; |
| 4953 | struct worker_pool *pool; | ||
| 4954 | int pi; | ||
| 4955 | 4832 | ||
| 4956 | mutex_lock(&wq_pool_mutex); | 4833 | mutex_lock(&wq_pool_mutex); |
| 4957 | 4834 | ||
| 4958 | if (!workqueue_freezing) | 4835 | if (!workqueue_freezing) |
| 4959 | goto out_unlock; | 4836 | goto out_unlock; |
| 4960 | 4837 | ||
| 4961 | /* clear FREEZING */ | 4838 | workqueue_freezing = false; |
| 4962 | for_each_pool(pool, pi) { | ||
| 4963 | spin_lock_irq(&pool->lock); | ||
| 4964 | WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); | ||
| 4965 | pool->flags &= ~POOL_FREEZING; | ||
| 4966 | spin_unlock_irq(&pool->lock); | ||
| 4967 | } | ||
| 4968 | 4839 | ||
| 4969 | /* restore max_active and repopulate worklist */ | 4840 | /* restore max_active and repopulate worklist */ |
| 4970 | list_for_each_entry(wq, &workqueues, list) { | 4841 | list_for_each_entry(wq, &workqueues, list) { |
| @@ -4974,7 +4845,6 @@ void thaw_workqueues(void) | |||
| 4974 | mutex_unlock(&wq->mutex); | 4845 | mutex_unlock(&wq->mutex); |
| 4975 | } | 4846 | } |
| 4976 | 4847 | ||
| 4977 | workqueue_freezing = false; | ||
| 4978 | out_unlock: | 4848 | out_unlock: |
| 4979 | mutex_unlock(&wq_pool_mutex); | 4849 | mutex_unlock(&wq_pool_mutex); |
| 4980 | } | 4850 | } |
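With POOL_FREEZING removed, freezing is a single global: freeze_workqueues_begin() and thaw_workqueues() flip workqueue_freezing under wq_pool_mutex and then walk the workqueues so pwq_adjust_max_active() re-evaluates each pwq. The condensed sketch below shows only the max_active rule that flag drives; the fields and the lack of locking are simplifications, not the kernel's data structures.

    #include <stdbool.h>
    #include <stdio.h>

    static bool workqueue_freezing;    /* the single global flag */

    struct pwq {
        bool freezable;
        int  saved_max_active;
        int  max_active;
    };

    static void pwq_adjust_max_active(struct pwq *pwq)
    {
        if (!pwq->freezable || !workqueue_freezing)
            pwq->max_active = pwq->saved_max_active;  /* running or thawed */
        else
            pwq->max_active = 0;                      /* frozen: queue only */
    }

    int main(void)
    {
        struct pwq pwq = { .freezable = true, .saved_max_active = 16 };

        workqueue_freezing = true;     /* ~freeze_workqueues_begin() */
        pwq_adjust_max_active(&pwq);
        printf("frozen: max_active=%d\n", pwq.max_active);

        workqueue_freezing = false;    /* ~thaw_workqueues() */
        pwq_adjust_max_active(&pwq);
        printf("thawed: max_active=%d\n", pwq.max_active);
        return 0;
    }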
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 7e2204db0b1a..45215870ac6c 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h | |||
| @@ -37,6 +37,8 @@ struct worker { | |||
| 37 | struct task_struct *task; /* I: worker task */ | 37 | struct task_struct *task; /* I: worker task */ |
| 38 | struct worker_pool *pool; /* I: the associated pool */ | 38 | struct worker_pool *pool; /* I: the associated pool */ |
| 39 | /* L: for rescuers */ | 39 | /* L: for rescuers */ |
| 40 | struct list_head node; /* A: anchored at pool->workers */ | ||
| 41 | /* A: runs through worker->node */ | ||
| 40 | 42 | ||
| 41 | unsigned long last_active; /* L: last active timestamp */ | 43 | unsigned long last_active; /* L: last active timestamp */ |
| 42 | unsigned int flags; /* X: flags */ | 44 | unsigned int flags; /* X: flags */ |
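The new worker->node field is an intrusive list link anchored at pool->workers, replacing the per-pool IDR as the way to reach every attached worker. A self-contained sketch of that intrusive-list layout in plain C; the container_of() and list_add_tail() below are simplified stand-ins for the kernel's <linux/list.h> helpers, not their actual implementations.

    #include <stddef.h>
    #include <stdio.h>

    struct list_head { struct list_head *prev, *next; };

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    static void list_add_tail(struct list_head *entry, struct list_head *head)
    {
        entry->prev = head->prev;
        entry->next = head;
        head->prev->next = entry;
        head->prev = entry;
    }

    struct worker {
        int              id;
        struct list_head node;      /* anchored at pool->workers */
    };

    struct pool {
        struct list_head workers;   /* list head; entries are worker->node */
    };

    int main(void)
    {
        struct pool pool = { .workers = { &pool.workers, &pool.workers } };
        struct worker w = { .id = 7 };

        list_add_tail(&w.node, &pool.workers);

        for (struct list_head *p = pool.workers.next; p != &pool.workers; p = p->next)
            printf("worker %d\n", container_of(p, struct worker, node)->id);
        return 0;
    }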
