Diffstat (limited to 'kernel')
114 files changed, 6839 insertions, 4213 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 35ef1185e359..a4d1aa8da9bc 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -6,9 +6,9 @@ obj-y = fork.o exec_domain.o panic.o \
	    cpu.o exit.o itimer.o time.o softirq.o resource.o \
	    sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
	    signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
-	    rcupdate.o extable.o params.o posix-timers.o \
-	    kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
-	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
+	    extable.o params.o posix-timers.o \
+	    kthread.o sys_ni.o posix-cpu-timers.o mutex.o \
+	    hrtimer.o rwsem.o nsproxy.o semaphore.o \
	    notifier.o ksysfs.o cred.o reboot.o \
	    async.o range.o groups.o lglock.o smpboot.o

@@ -26,6 +26,8 @@ obj-y += sched/
 obj-y += power/
 obj-y += printk/
 obj-y += cpu/
+obj-y += irq/
+obj-y += rcu/

 obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -79,14 +81,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
-obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
-obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
-obj-$(CONFIG_TREE_RCU) += rcutree.o
-obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
-obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
-obj-$(CONFIG_TINY_RCU) += rcutiny.o
-obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 91e53d04b6a9..7b0e23a740ce 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1117,9 +1117,10 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,

			sleep_time = timeout_start + audit_backlog_wait_time -
					jiffies;
-			if ((long)sleep_time > 0)
+			if ((long)sleep_time > 0) {
				wait_for_auditd(sleep_time);
-			continue;
+				continue;
+			}
		}
		if (audit_rate_check() && printk_ratelimit())
			printk(KERN_WARNING
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 0c9b862292b2..e8ca97b5c386 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -10,6 +10,7 @@
 #include <linux/mmzone.h>
 #include <linux/kbuild.h>
 #include <linux/page_cgroup.h>
+#include <linux/log2.h>

 void foo(void)
 {
@@ -17,5 +18,8 @@ void foo(void)
	DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
	DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
	DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
+#ifdef CONFIG_SMP
+	DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
+#endif
	/* End of constants */
 }
diff --git a/kernel/capability.c b/kernel/capability.c
index f6c2ce5701e1..4e66bf9275b0 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -433,18 +433,6 @@ bool capable(int cap)
 EXPORT_SYMBOL(capable);

 /**
- * nsown_capable - Check superior capability to one's own user_ns
- * @cap: The capability in question
- *
- * Return true if the current task has the given superior capability
- * targeted at its own user namespace.
- */
-bool nsown_capable(int cap)
-{
-	return ns_capable(current_user_ns(), cap);
-}
-
-/**
  * inode_capable - Check superior capability over inode
  * @inode: The inode in question
  * @cap: The capability in question
@@ -464,3 +452,4 @@ bool inode_capable(const struct inode *inode, int cap)

	return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
 }
+EXPORT_SYMBOL(inode_capable);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 781845a013ab..e0839bcd48c8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,6 +60,7 @@
 #include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_task */
 #include <linux/kthread.h>
+#include <linux/file.h>

 #include <linux/atomic.h>

@@ -81,7 +82,7 @@
  */
 #ifdef CONFIG_PROVE_RCU
 DEFINE_MUTEX(cgroup_mutex);
-EXPORT_SYMBOL_GPL(cgroup_mutex);	/* only for task_subsys_state_check() */
+EXPORT_SYMBOL_GPL(cgroup_mutex);	/* only for lockdep */
 #else
 static DEFINE_MUTEX(cgroup_mutex);
 #endif
@@ -117,51 +118,20 @@ struct cfent {
	struct list_head		node;
	struct dentry			*dentry;
	struct cftype			*type;
+	struct cgroup_subsys_state	*css;

	/* file xattrs */
	struct simple_xattrs		xattrs;
 };

 /*
- * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
- * cgroup_subsys->use_id != 0.
- */
-#define CSS_ID_MAX	(65535)
-struct css_id {
-	/*
-	 * The css to which this ID points. This pointer is set to valid value
-	 * after cgroup is populated. If cgroup is removed, this will be NULL.
-	 * This pointer is expected to be RCU-safe because destroy()
-	 * is called after synchronize_rcu(). But for safe use, css_tryget()
-	 * should be used for avoiding race.
-	 */
-	struct cgroup_subsys_state __rcu *css;
-	/*
-	 * ID of this css.
-	 */
-	unsigned short id;
-	/*
-	 * Depth in hierarchy which this ID belongs to.
-	 */
-	unsigned short depth;
-	/*
-	 * ID is freed by RCU. (and lookup routine is RCU safe.)
-	 */
-	struct rcu_head rcu_head;
-	/*
-	 * Hierarchy of CSS ID belongs to.
-	 */
-	unsigned short stack[0]; /* Array of Length (depth+1) */
-};
-
-/*
  * cgroup_event represents events which userspace want to receive.
  */
 struct cgroup_event {
	/*
-	 * Cgroup which the event belongs to.
+	 * css which the event belongs to.
	 */
-	struct cgroup *cgrp;
+	struct cgroup_subsys_state *css;
	/*
	 * Control file which the event associated.
	 */
@@ -215,10 +185,33 @@ static u64 cgroup_serial_nr_next = 1;
  */
 static int need_forkexit_callback __read_mostly;

-static void cgroup_offline_fn(struct work_struct *work);
+static struct cftype cgroup_base_files[];
+
+static void cgroup_destroy_css_killed(struct cgroup *cgrp);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-			      struct cftype cfts[], bool is_add);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+			      bool is_add);
+
+/**
+ * cgroup_css - obtain a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns the dummy_css)
+ *
+ * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
+ * function must be called either under cgroup_mutex or rcu_read_lock() and
+ * the caller is responsible for pinning the returned css if it wants to
+ * keep accessing it outside the said locks.  This function may return
+ * %NULL if @cgrp doesn't have @subsys_id enabled.
+ */
+static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
+					      struct cgroup_subsys *ss)
+{
+	if (ss)
+		return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
+					     lockdep_is_held(&cgroup_mutex));
+	else
+		return &cgrp->dummy_css;
+}

 /* convenient tests for these bits */
 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
@@ -362,12 +355,11 @@ struct cgrp_cset_link {
 static struct css_set init_css_set;
 static struct cgrp_cset_link init_cgrp_cset_link;

-static int cgroup_init_idr(struct cgroup_subsys *ss,
-			   struct cgroup_subsys_state *css);
-
-/* css_set_lock protects the list of css_set objects, and the
- * chain of tasks off each css_set. Nests outside task->alloc_lock
- * due to cgroup_iter_start() */
+/*
+ * css_set_lock protects the list of css_set objects, and the chain of
+ * tasks off each css_set.  Nests outside task->alloc_lock due to
+ * css_task_iter_start().
+ */
 static DEFINE_RWLOCK(css_set_lock);
 static int css_set_count;

@@ -392,10 +384,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
	return key;
 }

-/* We don't maintain the lists running through each css_set to its
- * task until after the first call to cgroup_iter_start(). This
- * reduces the fork()/exit() overhead for people who have cgroups
- * compiled into their kernel but not actually in use */
+/*
+ * We don't maintain the lists running through each css_set to its task
+ * until after the first call to css_task_iter_start().  This reduces the
+ * fork()/exit() overhead for people who have cgroups compiled into their
+ * kernel but not actually in use.
+ */
 static int use_task_css_set_links __read_mostly;

 static void __put_css_set(struct css_set *cset, int taskexit)
@@ -464,7 +458,7 @@ static inline void put_css_set_taskexit(struct css_set *cset)
  * @new_cgrp: cgroup that's being entered by the task
  * @template: desired set of css pointers in css_set (pre-calculated)
  *
- * Returns true if "cg" matches "old_cg" except for the hierarchy
+ * Returns true if "cset" matches "old_cset" except for the hierarchy
  * which "new_cgrp" belongs to, for which it should match "new_cgrp".
  */
 static bool compare_css_sets(struct css_set *cset,
@@ -555,7 +549,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
			/* Subsystem is in this hierarchy. So we want
			 * the subsystem state from the new
			 * cgroup */
-			template[i] = cgrp->subsys[i];
+			template[i] = cgroup_css(cgrp, ss);
		} else {
			/* Subsystem is not in this hierarchy, so we
			 * don't want to change the subsystem state */
@@ -803,8 +797,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,

 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
-static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
-			       unsigned long subsys_mask);
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
 static const struct inode_operations cgroup_dir_inode_operations;
 static const struct file_operations proc_cgroupstats_operations;

@@ -813,9 +806,6 @@ static struct backing_dev_info cgroup_backing_dev_info = {
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };

-static int alloc_css_id(struct cgroup_subsys *ss,
-			struct cgroup *parent, struct cgroup *child);
-
 static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
 {
	struct inode *inode = new_inode(sb);
| @@ -845,15 +835,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) | |||
| 845 | static void cgroup_free_fn(struct work_struct *work) | 835 | static void cgroup_free_fn(struct work_struct *work) |
| 846 | { | 836 | { |
| 847 | struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); | 837 | struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); |
| 848 | struct cgroup_subsys *ss; | ||
| 849 | 838 | ||
| 850 | mutex_lock(&cgroup_mutex); | 839 | mutex_lock(&cgroup_mutex); |
| 851 | /* | ||
| 852 | * Release the subsystem state objects. | ||
| 853 | */ | ||
| 854 | for_each_root_subsys(cgrp->root, ss) | ||
| 855 | ss->css_free(cgrp); | ||
| 856 | |||
| 857 | cgrp->root->number_of_cgroups--; | 840 | cgrp->root->number_of_cgroups--; |
| 858 | mutex_unlock(&cgroup_mutex); | 841 | mutex_unlock(&cgroup_mutex); |
| 859 | 842 | ||
| @@ -864,8 +847,6 @@ static void cgroup_free_fn(struct work_struct *work) | |||
| 864 | */ | 847 | */ |
| 865 | dput(cgrp->parent->dentry); | 848 | dput(cgrp->parent->dentry); |
| 866 | 849 | ||
| 867 | ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | ||
| 868 | |||
| 869 | /* | 850 | /* |
| 870 | * Drop the active superblock reference that we took when we | 851 | * Drop the active superblock reference that we took when we |
| 871 | * created the cgroup. This will free cgrp->root, if we are | 852 | * created the cgroup. This will free cgrp->root, if we are |
| @@ -956,27 +937,22 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | |||
| 956 | } | 937 | } |
| 957 | 938 | ||
| 958 | /** | 939 | /** |
| 959 | * cgroup_clear_directory - selective removal of base and subsystem files | 940 | * cgroup_clear_dir - remove subsys files in a cgroup directory |
| 960 | * @dir: directory containing the files | 941 | * @cgrp: target cgroup |
| 961 | * @base_files: true if the base files should be removed | ||
| 962 | * @subsys_mask: mask of the subsystem ids whose files should be removed | 942 | * @subsys_mask: mask of the subsystem ids whose files should be removed |
| 963 | */ | 943 | */ |
| 964 | static void cgroup_clear_directory(struct dentry *dir, bool base_files, | 944 | static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) |
| 965 | unsigned long subsys_mask) | ||
| 966 | { | 945 | { |
| 967 | struct cgroup *cgrp = __d_cgrp(dir); | ||
| 968 | struct cgroup_subsys *ss; | 946 | struct cgroup_subsys *ss; |
| 947 | int i; | ||
| 969 | 948 | ||
| 970 | for_each_root_subsys(cgrp->root, ss) { | 949 | for_each_subsys(ss, i) { |
| 971 | struct cftype_set *set; | 950 | struct cftype_set *set; |
| 972 | if (!test_bit(ss->subsys_id, &subsys_mask)) | 951 | |
| 952 | if (!test_bit(i, &subsys_mask)) | ||
| 973 | continue; | 953 | continue; |
| 974 | list_for_each_entry(set, &ss->cftsets, node) | 954 | list_for_each_entry(set, &ss->cftsets, node) |
| 975 | cgroup_addrm_files(cgrp, NULL, set->cfts, false); | 955 | cgroup_addrm_files(cgrp, set->cfts, false); |
| 976 | } | ||
| 977 | if (base_files) { | ||
| 978 | while (!list_empty(&cgrp->files)) | ||
| 979 | cgroup_rm_file(cgrp, NULL); | ||
| 980 | } | 956 | } |
| 981 | } | 957 | } |
| 982 | 958 | ||
| @@ -986,9 +962,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files, | |||
| 986 | static void cgroup_d_remove_dir(struct dentry *dentry) | 962 | static void cgroup_d_remove_dir(struct dentry *dentry) |
| 987 | { | 963 | { |
| 988 | struct dentry *parent; | 964 | struct dentry *parent; |
| 989 | struct cgroupfs_root *root = dentry->d_sb->s_fs_info; | ||
| 990 | |||
| 991 | cgroup_clear_directory(dentry, true, root->subsys_mask); | ||
| 992 | 965 | ||
| 993 | parent = dentry->d_parent; | 966 | parent = dentry->d_parent; |
| 994 | spin_lock(&parent->d_lock); | 967 | spin_lock(&parent->d_lock); |
| @@ -1009,79 +982,84 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
| 1009 | { | 982 | { |
| 1010 | struct cgroup *cgrp = &root->top_cgroup; | 983 | struct cgroup *cgrp = &root->top_cgroup; |
| 1011 | struct cgroup_subsys *ss; | 984 | struct cgroup_subsys *ss; |
| 1012 | int i; | 985 | unsigned long pinned = 0; |
| 986 | int i, ret; | ||
| 1013 | 987 | ||
| 1014 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); | 988 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); |
| 1015 | BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); | 989 | BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); |
| 1016 | 990 | ||
| 1017 | /* Check that any added subsystems are currently free */ | 991 | /* Check that any added subsystems are currently free */ |
| 1018 | for_each_subsys(ss, i) { | 992 | for_each_subsys(ss, i) { |
| 1019 | unsigned long bit = 1UL << i; | 993 | if (!(added_mask & (1 << i))) |
| 1020 | |||
| 1021 | if (!(bit & added_mask)) | ||
| 1022 | continue; | 994 | continue; |
| 1023 | 995 | ||
| 996 | /* is the subsystem mounted elsewhere? */ | ||
| 1024 | if (ss->root != &cgroup_dummy_root) { | 997 | if (ss->root != &cgroup_dummy_root) { |
| 1025 | /* Subsystem isn't free */ | 998 | ret = -EBUSY; |
| 1026 | return -EBUSY; | 999 | goto out_put; |
| 1000 | } | ||
| 1001 | |||
| 1002 | /* pin the module */ | ||
| 1003 | if (!try_module_get(ss->module)) { | ||
| 1004 | ret = -ENOENT; | ||
| 1005 | goto out_put; | ||
| 1027 | } | 1006 | } |
| 1007 | pinned |= 1 << i; | ||
| 1028 | } | 1008 | } |
| 1029 | 1009 | ||
| 1030 | /* Currently we don't handle adding/removing subsystems when | 1010 | /* subsys could be missing if unloaded between parsing and here */ |
| 1031 | * any child cgroups exist. This is theoretically supportable | 1011 | if (added_mask != pinned) { |
| 1032 | * but involves complex error handling, so it's being left until | 1012 | ret = -ENOENT; |
| 1033 | * later */ | 1013 | goto out_put; |
| 1034 | if (root->number_of_cgroups > 1) | 1014 | } |
| 1035 | return -EBUSY; | 1015 | |
| 1016 | ret = cgroup_populate_dir(cgrp, added_mask); | ||
| 1017 | if (ret) | ||
| 1018 | goto out_put; | ||
| 1019 | |||
| 1020 | /* | ||
| 1021 | * Nothing can fail from this point on. Remove files for the | ||
| 1022 | * removed subsystems and rebind each subsystem. | ||
| 1023 | */ | ||
| 1024 | cgroup_clear_dir(cgrp, removed_mask); | ||
| 1036 | 1025 | ||
| 1037 | /* Process each subsystem */ | ||
| 1038 | for_each_subsys(ss, i) { | 1026 | for_each_subsys(ss, i) { |
| 1039 | unsigned long bit = 1UL << i; | 1027 | unsigned long bit = 1UL << i; |
| 1040 | 1028 | ||
| 1041 | if (bit & added_mask) { | 1029 | if (bit & added_mask) { |
| 1042 | /* We're binding this subsystem to this hierarchy */ | 1030 | /* We're binding this subsystem to this hierarchy */ |
| 1043 | BUG_ON(cgrp->subsys[i]); | 1031 | BUG_ON(cgroup_css(cgrp, ss)); |
| 1044 | BUG_ON(!cgroup_dummy_top->subsys[i]); | 1032 | BUG_ON(!cgroup_css(cgroup_dummy_top, ss)); |
| 1045 | BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); | 1033 | BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top); |
| 1034 | |||
| 1035 | rcu_assign_pointer(cgrp->subsys[i], | ||
| 1036 | cgroup_css(cgroup_dummy_top, ss)); | ||
| 1037 | cgroup_css(cgrp, ss)->cgroup = cgrp; | ||
| 1046 | 1038 | ||
| 1047 | cgrp->subsys[i] = cgroup_dummy_top->subsys[i]; | ||
| 1048 | cgrp->subsys[i]->cgroup = cgrp; | ||
| 1049 | list_move(&ss->sibling, &root->subsys_list); | 1039 | list_move(&ss->sibling, &root->subsys_list); |
| 1050 | ss->root = root; | 1040 | ss->root = root; |
| 1051 | if (ss->bind) | 1041 | if (ss->bind) |
| 1052 | ss->bind(cgrp); | 1042 | ss->bind(cgroup_css(cgrp, ss)); |
| 1053 | 1043 | ||
| 1054 | /* refcount was already taken, and we're keeping it */ | 1044 | /* refcount was already taken, and we're keeping it */ |
| 1055 | root->subsys_mask |= bit; | 1045 | root->subsys_mask |= bit; |
| 1056 | } else if (bit & removed_mask) { | 1046 | } else if (bit & removed_mask) { |
| 1057 | /* We're removing this subsystem */ | 1047 | /* We're removing this subsystem */ |
| 1058 | BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); | 1048 | BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss)); |
| 1059 | BUG_ON(cgrp->subsys[i]->cgroup != cgrp); | 1049 | BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp); |
| 1060 | 1050 | ||
| 1061 | if (ss->bind) | 1051 | if (ss->bind) |
| 1062 | ss->bind(cgroup_dummy_top); | 1052 | ss->bind(cgroup_css(cgroup_dummy_top, ss)); |
| 1063 | cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; | 1053 | |
| 1064 | cgrp->subsys[i] = NULL; | 1054 | cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; |
| 1055 | RCU_INIT_POINTER(cgrp->subsys[i], NULL); | ||
| 1056 | |||
| 1065 | cgroup_subsys[i]->root = &cgroup_dummy_root; | 1057 | cgroup_subsys[i]->root = &cgroup_dummy_root; |
| 1066 | list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); | 1058 | list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); |
| 1067 | 1059 | ||
| 1068 | /* subsystem is now free - drop reference on module */ | 1060 | /* subsystem is now free - drop reference on module */ |
| 1069 | module_put(ss->module); | 1061 | module_put(ss->module); |
| 1070 | root->subsys_mask &= ~bit; | 1062 | root->subsys_mask &= ~bit; |
| 1071 | } else if (bit & root->subsys_mask) { | ||
| 1072 | /* Subsystem state should already exist */ | ||
| 1073 | BUG_ON(!cgrp->subsys[i]); | ||
| 1074 | /* | ||
| 1075 | * a refcount was taken, but we already had one, so | ||
| 1076 | * drop the extra reference. | ||
| 1077 | */ | ||
| 1078 | module_put(ss->module); | ||
| 1079 | #ifdef CONFIG_MODULE_UNLOAD | ||
| 1080 | BUG_ON(ss->module && !module_refcount(ss->module)); | ||
| 1081 | #endif | ||
| 1082 | } else { | ||
| 1083 | /* Subsystem state shouldn't exist */ | ||
| 1084 | BUG_ON(cgrp->subsys[i]); | ||
| 1085 | } | 1063 | } |
| 1086 | } | 1064 | } |
| 1087 | 1065 | ||
| @@ -1092,6 +1070,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
| 1092 | root->flags |= CGRP_ROOT_SUBSYS_BOUND; | 1070 | root->flags |= CGRP_ROOT_SUBSYS_BOUND; |
| 1093 | 1071 | ||
| 1094 | return 0; | 1072 | return 0; |
| 1073 | |||
| 1074 | out_put: | ||
| 1075 | for_each_subsys(ss, i) | ||
| 1076 | if (pinned & (1 << i)) | ||
| 1077 | module_put(ss->module); | ||
| 1078 | return ret; | ||
| 1095 | } | 1079 | } |
| 1096 | 1080 | ||
| 1097 | static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) | 1081 | static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) |
| @@ -1142,7 +1126,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
| 1142 | char *token, *o = data; | 1126 | char *token, *o = data; |
| 1143 | bool all_ss = false, one_ss = false; | 1127 | bool all_ss = false, one_ss = false; |
| 1144 | unsigned long mask = (unsigned long)-1; | 1128 | unsigned long mask = (unsigned long)-1; |
| 1145 | bool module_pin_failed = false; | ||
| 1146 | struct cgroup_subsys *ss; | 1129 | struct cgroup_subsys *ss; |
| 1147 | int i; | 1130 | int i; |
| 1148 | 1131 | ||
| @@ -1285,52 +1268,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
| 1285 | if (!opts->subsys_mask && !opts->name) | 1268 | if (!opts->subsys_mask && !opts->name) |
| 1286 | return -EINVAL; | 1269 | return -EINVAL; |
| 1287 | 1270 | ||
| 1288 | /* | ||
| 1289 | * Grab references on all the modules we'll need, so the subsystems | ||
| 1290 | * don't dance around before rebind_subsystems attaches them. This may | ||
| 1291 | * take duplicate reference counts on a subsystem that's already used, | ||
| 1292 | * but rebind_subsystems handles this case. | ||
| 1293 | */ | ||
| 1294 | for_each_subsys(ss, i) { | ||
| 1295 | if (!(opts->subsys_mask & (1UL << i))) | ||
| 1296 | continue; | ||
| 1297 | if (!try_module_get(cgroup_subsys[i]->module)) { | ||
| 1298 | module_pin_failed = true; | ||
| 1299 | break; | ||
| 1300 | } | ||
| 1301 | } | ||
| 1302 | if (module_pin_failed) { | ||
| 1303 | /* | ||
| 1304 | * oops, one of the modules was going away. this means that we | ||
| 1305 | * raced with a module_delete call, and to the user this is | ||
| 1306 | * essentially a "subsystem doesn't exist" case. | ||
| 1307 | */ | ||
| 1308 | for (i--; i >= 0; i--) { | ||
| 1309 | /* drop refcounts only on the ones we took */ | ||
| 1310 | unsigned long bit = 1UL << i; | ||
| 1311 | |||
| 1312 | if (!(bit & opts->subsys_mask)) | ||
| 1313 | continue; | ||
| 1314 | module_put(cgroup_subsys[i]->module); | ||
| 1315 | } | ||
| 1316 | return -ENOENT; | ||
| 1317 | } | ||
| 1318 | |||
| 1319 | return 0; | 1271 | return 0; |
| 1320 | } | 1272 | } |
| 1321 | 1273 | ||
| 1322 | static void drop_parsed_module_refcounts(unsigned long subsys_mask) | ||
| 1323 | { | ||
| 1324 | struct cgroup_subsys *ss; | ||
| 1325 | int i; | ||
| 1326 | |||
| 1327 | mutex_lock(&cgroup_mutex); | ||
| 1328 | for_each_subsys(ss, i) | ||
| 1329 | if (subsys_mask & (1UL << i)) | ||
| 1330 | module_put(cgroup_subsys[i]->module); | ||
| 1331 | mutex_unlock(&cgroup_mutex); | ||
| 1332 | } | ||
| 1333 | |||
| 1334 | static int cgroup_remount(struct super_block *sb, int *flags, char *data) | 1274 | static int cgroup_remount(struct super_block *sb, int *flags, char *data) |
| 1335 | { | 1275 | { |
| 1336 | int ret = 0; | 1276 | int ret = 0; |
| @@ -1370,22 +1310,15 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
| 1370 | goto out_unlock; | 1310 | goto out_unlock; |
| 1371 | } | 1311 | } |
| 1372 | 1312 | ||
| 1373 | /* | 1313 | /* remounting is not allowed for populated hierarchies */ |
| 1374 | * Clear out the files of subsystems that should be removed, do | 1314 | if (root->number_of_cgroups > 1) { |
| 1375 | * this before rebind_subsystems, since rebind_subsystems may | 1315 | ret = -EBUSY; |
| 1376 | * change this hierarchy's subsys_list. | ||
| 1377 | */ | ||
| 1378 | cgroup_clear_directory(cgrp->dentry, false, removed_mask); | ||
| 1379 | |||
| 1380 | ret = rebind_subsystems(root, added_mask, removed_mask); | ||
| 1381 | if (ret) { | ||
| 1382 | /* rebind_subsystems failed, re-populate the removed files */ | ||
| 1383 | cgroup_populate_dir(cgrp, false, removed_mask); | ||
| 1384 | goto out_unlock; | 1316 | goto out_unlock; |
| 1385 | } | 1317 | } |
| 1386 | 1318 | ||
| 1387 | /* re-populate subsystem files */ | 1319 | ret = rebind_subsystems(root, added_mask, removed_mask); |
| 1388 | cgroup_populate_dir(cgrp, false, added_mask); | 1320 | if (ret) |
| 1321 | goto out_unlock; | ||
| 1389 | 1322 | ||
| 1390 | if (opts.release_agent) | 1323 | if (opts.release_agent) |
| 1391 | strcpy(root->release_agent_path, opts.release_agent); | 1324 | strcpy(root->release_agent_path, opts.release_agent); |
| @@ -1395,8 +1328,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
| 1395 | mutex_unlock(&cgroup_root_mutex); | 1328 | mutex_unlock(&cgroup_root_mutex); |
| 1396 | mutex_unlock(&cgroup_mutex); | 1329 | mutex_unlock(&cgroup_mutex); |
| 1397 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | 1330 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); |
| 1398 | if (ret) | ||
| 1399 | drop_parsed_module_refcounts(opts.subsys_mask); | ||
| 1400 | return ret; | 1331 | return ret; |
| 1401 | } | 1332 | } |
| 1402 | 1333 | ||
| @@ -1416,6 +1347,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
| 1416 | INIT_LIST_HEAD(&cgrp->release_list); | 1347 | INIT_LIST_HEAD(&cgrp->release_list); |
| 1417 | INIT_LIST_HEAD(&cgrp->pidlists); | 1348 | INIT_LIST_HEAD(&cgrp->pidlists); |
| 1418 | mutex_init(&cgrp->pidlist_mutex); | 1349 | mutex_init(&cgrp->pidlist_mutex); |
| 1350 | cgrp->dummy_css.cgroup = cgrp; | ||
| 1419 | INIT_LIST_HEAD(&cgrp->event_list); | 1351 | INIT_LIST_HEAD(&cgrp->event_list); |
| 1420 | spin_lock_init(&cgrp->event_list_lock); | 1352 | spin_lock_init(&cgrp->event_list_lock); |
| 1421 | simple_xattrs_init(&cgrp->xattrs); | 1353 | simple_xattrs_init(&cgrp->xattrs); |
| @@ -1431,6 +1363,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) | |||
| 1431 | cgrp->root = root; | 1363 | cgrp->root = root; |
| 1432 | RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); | 1364 | RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); |
| 1433 | init_cgroup_housekeeping(cgrp); | 1365 | init_cgroup_housekeeping(cgrp); |
| 1366 | idr_init(&root->cgroup_idr); | ||
| 1434 | } | 1367 | } |
| 1435 | 1368 | ||
| 1436 | static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) | 1369 | static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) |
| @@ -1503,7 +1436,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) | |||
| 1503 | */ | 1436 | */ |
| 1504 | root->subsys_mask = opts->subsys_mask; | 1437 | root->subsys_mask = opts->subsys_mask; |
| 1505 | root->flags = opts->flags; | 1438 | root->flags = opts->flags; |
| 1506 | ida_init(&root->cgroup_ida); | ||
| 1507 | if (opts->release_agent) | 1439 | if (opts->release_agent) |
| 1508 | strcpy(root->release_agent_path, opts->release_agent); | 1440 | strcpy(root->release_agent_path, opts->release_agent); |
| 1509 | if (opts->name) | 1441 | if (opts->name) |
| @@ -1519,7 +1451,7 @@ static void cgroup_free_root(struct cgroupfs_root *root) | |||
| 1519 | /* hierarhcy ID shoulid already have been released */ | 1451 | /* hierarhcy ID shoulid already have been released */ |
| 1520 | WARN_ON_ONCE(root->hierarchy_id); | 1452 | WARN_ON_ONCE(root->hierarchy_id); |
| 1521 | 1453 | ||
| 1522 | ida_destroy(&root->cgroup_ida); | 1454 | idr_destroy(&root->cgroup_idr); |
| 1523 | kfree(root); | 1455 | kfree(root); |
| 1524 | } | 1456 | } |
| 1525 | } | 1457 | } |
| @@ -1584,7 +1516,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1584 | int ret = 0; | 1516 | int ret = 0; |
| 1585 | struct super_block *sb; | 1517 | struct super_block *sb; |
| 1586 | struct cgroupfs_root *new_root; | 1518 | struct cgroupfs_root *new_root; |
| 1519 | struct list_head tmp_links; | ||
| 1587 | struct inode *inode; | 1520 | struct inode *inode; |
| 1521 | const struct cred *cred; | ||
| 1588 | 1522 | ||
| 1589 | /* First find the desired set of subsystems */ | 1523 | /* First find the desired set of subsystems */ |
| 1590 | mutex_lock(&cgroup_mutex); | 1524 | mutex_lock(&cgroup_mutex); |
| @@ -1600,7 +1534,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1600 | new_root = cgroup_root_from_opts(&opts); | 1534 | new_root = cgroup_root_from_opts(&opts); |
| 1601 | if (IS_ERR(new_root)) { | 1535 | if (IS_ERR(new_root)) { |
| 1602 | ret = PTR_ERR(new_root); | 1536 | ret = PTR_ERR(new_root); |
| 1603 | goto drop_modules; | 1537 | goto out_err; |
| 1604 | } | 1538 | } |
| 1605 | opts.new_root = new_root; | 1539 | opts.new_root = new_root; |
| 1606 | 1540 | ||
| @@ -1609,17 +1543,15 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1609 | if (IS_ERR(sb)) { | 1543 | if (IS_ERR(sb)) { |
| 1610 | ret = PTR_ERR(sb); | 1544 | ret = PTR_ERR(sb); |
| 1611 | cgroup_free_root(opts.new_root); | 1545 | cgroup_free_root(opts.new_root); |
| 1612 | goto drop_modules; | 1546 | goto out_err; |
| 1613 | } | 1547 | } |
| 1614 | 1548 | ||
| 1615 | root = sb->s_fs_info; | 1549 | root = sb->s_fs_info; |
| 1616 | BUG_ON(!root); | 1550 | BUG_ON(!root); |
| 1617 | if (root == opts.new_root) { | 1551 | if (root == opts.new_root) { |
| 1618 | /* We used the new root structure, so this is a new hierarchy */ | 1552 | /* We used the new root structure, so this is a new hierarchy */ |
| 1619 | struct list_head tmp_links; | ||
| 1620 | struct cgroup *root_cgrp = &root->top_cgroup; | 1553 | struct cgroup *root_cgrp = &root->top_cgroup; |
| 1621 | struct cgroupfs_root *existing_root; | 1554 | struct cgroupfs_root *existing_root; |
| 1622 | const struct cred *cred; | ||
| 1623 | int i; | 1555 | int i; |
| 1624 | struct css_set *cset; | 1556 | struct css_set *cset; |
| 1625 | 1557 | ||
| @@ -1634,6 +1566,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1634 | mutex_lock(&cgroup_mutex); | 1566 | mutex_lock(&cgroup_mutex); |
| 1635 | mutex_lock(&cgroup_root_mutex); | 1567 | mutex_lock(&cgroup_root_mutex); |
| 1636 | 1568 | ||
| 1569 | root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, | ||
| 1570 | 0, 1, GFP_KERNEL); | ||
| 1571 | if (root_cgrp->id < 0) | ||
| 1572 | goto unlock_drop; | ||
| 1573 | |||
| 1637 | /* Check for name clashes with existing mounts */ | 1574 | /* Check for name clashes with existing mounts */ |
| 1638 | ret = -EBUSY; | 1575 | ret = -EBUSY; |
| 1639 | if (strlen(root->name)) | 1576 | if (strlen(root->name)) |
| @@ -1657,26 +1594,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1657 | if (ret) | 1594 | if (ret) |
| 1658 | goto unlock_drop; | 1595 | goto unlock_drop; |
| 1659 | 1596 | ||
| 1597 | sb->s_root->d_fsdata = root_cgrp; | ||
| 1598 | root_cgrp->dentry = sb->s_root; | ||
| 1599 | |||
| 1600 | /* | ||
| 1601 | * We're inside get_sb() and will call lookup_one_len() to | ||
| 1602 | * create the root files, which doesn't work if SELinux is | ||
| 1603 | * in use. The following cred dancing somehow works around | ||
| 1604 | * it. See 2ce9738ba ("cgroupfs: use init_cred when | ||
| 1605 | * populating new cgroupfs mount") for more details. | ||
| 1606 | */ | ||
| 1607 | cred = override_creds(&init_cred); | ||
| 1608 | |||
| 1609 | ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); | ||
| 1610 | if (ret) | ||
| 1611 | goto rm_base_files; | ||
| 1612 | |||
| 1660 | ret = rebind_subsystems(root, root->subsys_mask, 0); | 1613 | ret = rebind_subsystems(root, root->subsys_mask, 0); |
| 1661 | if (ret == -EBUSY) { | 1614 | if (ret) |
| 1662 | free_cgrp_cset_links(&tmp_links); | 1615 | goto rm_base_files; |
| 1663 | goto unlock_drop; | 1616 | |
| 1664 | } | 1617 | revert_creds(cred); |
| 1618 | |||
| 1665 | /* | 1619 | /* |
| 1666 | * There must be no failure case after here, since rebinding | 1620 | * There must be no failure case after here, since rebinding |
| 1667 | * takes care of subsystems' refcounts, which are explicitly | 1621 | * takes care of subsystems' refcounts, which are explicitly |
| 1668 | * dropped in the failure exit path. | 1622 | * dropped in the failure exit path. |
| 1669 | */ | 1623 | */ |
| 1670 | 1624 | ||
| 1671 | /* EBUSY should be the only error here */ | ||
| 1672 | BUG_ON(ret); | ||
| 1673 | |||
| 1674 | list_add(&root->root_list, &cgroup_roots); | 1625 | list_add(&root->root_list, &cgroup_roots); |
| 1675 | cgroup_root_count++; | 1626 | cgroup_root_count++; |
| 1676 | 1627 | ||
| 1677 | sb->s_root->d_fsdata = root_cgrp; | ||
| 1678 | root->top_cgroup.dentry = sb->s_root; | ||
| 1679 | |||
| 1680 | /* Link the top cgroup in this hierarchy into all | 1628 | /* Link the top cgroup in this hierarchy into all |
| 1681 | * the css_set objects */ | 1629 | * the css_set objects */ |
| 1682 | write_lock(&css_set_lock); | 1630 | write_lock(&css_set_lock); |
| @@ -1689,9 +1637,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1689 | BUG_ON(!list_empty(&root_cgrp->children)); | 1637 | BUG_ON(!list_empty(&root_cgrp->children)); |
| 1690 | BUG_ON(root->number_of_cgroups != 1); | 1638 | BUG_ON(root->number_of_cgroups != 1); |
| 1691 | 1639 | ||
| 1692 | cred = override_creds(&init_cred); | ||
| 1693 | cgroup_populate_dir(root_cgrp, true, root->subsys_mask); | ||
| 1694 | revert_creds(cred); | ||
| 1695 | mutex_unlock(&cgroup_root_mutex); | 1640 | mutex_unlock(&cgroup_root_mutex); |
| 1696 | mutex_unlock(&cgroup_mutex); | 1641 | mutex_unlock(&cgroup_mutex); |
| 1697 | mutex_unlock(&inode->i_mutex); | 1642 | mutex_unlock(&inode->i_mutex); |
| @@ -1711,15 +1656,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1711 | pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); | 1656 | pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); |
| 1712 | } | 1657 | } |
| 1713 | } | 1658 | } |
| 1714 | |||
| 1715 | /* no subsys rebinding, so refcounts don't change */ | ||
| 1716 | drop_parsed_module_refcounts(opts.subsys_mask); | ||
| 1717 | } | 1659 | } |
| 1718 | 1660 | ||
| 1719 | kfree(opts.release_agent); | 1661 | kfree(opts.release_agent); |
| 1720 | kfree(opts.name); | 1662 | kfree(opts.name); |
| 1721 | return dget(sb->s_root); | 1663 | return dget(sb->s_root); |
| 1722 | 1664 | ||
| 1665 | rm_base_files: | ||
| 1666 | free_cgrp_cset_links(&tmp_links); | ||
| 1667 | cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); | ||
| 1668 | revert_creds(cred); | ||
| 1723 | unlock_drop: | 1669 | unlock_drop: |
| 1724 | cgroup_exit_root_id(root); | 1670 | cgroup_exit_root_id(root); |
| 1725 | mutex_unlock(&cgroup_root_mutex); | 1671 | mutex_unlock(&cgroup_root_mutex); |
| @@ -1727,8 +1673,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1727 | mutex_unlock(&inode->i_mutex); | 1673 | mutex_unlock(&inode->i_mutex); |
| 1728 | drop_new_super: | 1674 | drop_new_super: |
| 1729 | deactivate_locked_super(sb); | 1675 | deactivate_locked_super(sb); |
| 1730 | drop_modules: | ||
| 1731 | drop_parsed_module_refcounts(opts.subsys_mask); | ||
| 1732 | out_err: | 1676 | out_err: |
| 1733 | kfree(opts.release_agent); | 1677 | kfree(opts.release_agent); |
| 1734 | kfree(opts.name); | 1678 | kfree(opts.name); |
| @@ -1746,6 +1690,7 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
| 1746 | BUG_ON(root->number_of_cgroups != 1); | 1690 | BUG_ON(root->number_of_cgroups != 1); |
| 1747 | BUG_ON(!list_empty(&cgrp->children)); | 1691 | BUG_ON(!list_empty(&cgrp->children)); |
| 1748 | 1692 | ||
| 1693 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); | ||
| 1749 | mutex_lock(&cgroup_mutex); | 1694 | mutex_lock(&cgroup_mutex); |
| 1750 | mutex_lock(&cgroup_root_mutex); | 1695 | mutex_lock(&cgroup_root_mutex); |
| 1751 | 1696 | ||
| @@ -1778,6 +1723,7 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
| 1778 | 1723 | ||
| 1779 | mutex_unlock(&cgroup_root_mutex); | 1724 | mutex_unlock(&cgroup_root_mutex); |
| 1780 | mutex_unlock(&cgroup_mutex); | 1725 | mutex_unlock(&cgroup_mutex); |
| 1726 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | ||
| 1781 | 1727 | ||
| 1782 | simple_xattrs_free(&cgrp->xattrs); | 1728 | simple_xattrs_free(&cgrp->xattrs); |
| 1783 | 1729 | ||
| @@ -1889,7 +1835,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path); | |||
| 1889 | struct task_and_cgroup { | 1835 | struct task_and_cgroup { |
| 1890 | struct task_struct *task; | 1836 | struct task_struct *task; |
| 1891 | struct cgroup *cgrp; | 1837 | struct cgroup *cgrp; |
| 1892 | struct css_set *cg; | 1838 | struct css_set *cset; |
| 1893 | }; | 1839 | }; |
| 1894 | 1840 | ||
| 1895 | struct cgroup_taskset { | 1841 | struct cgroup_taskset { |
| @@ -1939,18 +1885,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) | |||
| 1939 | EXPORT_SYMBOL_GPL(cgroup_taskset_next); | 1885 | EXPORT_SYMBOL_GPL(cgroup_taskset_next); |
| 1940 | 1886 | ||
| 1941 | /** | 1887 | /** |
| 1942 | * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task | 1888 | * cgroup_taskset_cur_css - return the matching css for the current task |
| 1943 | * @tset: taskset of interest | 1889 | * @tset: taskset of interest |
| 1890 | * @subsys_id: the ID of the target subsystem | ||
| 1944 | * | 1891 | * |
| 1945 | * Return the cgroup for the current (last returned) task of @tset. This | 1892 | * Return the css for the current (last returned) task of @tset for |
| 1946 | * function must be preceded by either cgroup_taskset_first() or | 1893 | * subsystem specified by @subsys_id. This function must be preceded by |
| 1947 | * cgroup_taskset_next(). | 1894 | * either cgroup_taskset_first() or cgroup_taskset_next(). |
| 1948 | */ | 1895 | */ |
| 1949 | struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) | 1896 | struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, |
| 1897 | int subsys_id) | ||
| 1950 | { | 1898 | { |
| 1951 | return tset->cur_cgrp; | 1899 | return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]); |
| 1952 | } | 1900 | } |
| 1953 | EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); | 1901 | EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); |
| 1954 | 1902 | ||
| 1955 | /** | 1903 | /** |
| 1956 | * cgroup_taskset_size - return the number of tasks in taskset | 1904 | * cgroup_taskset_size - return the number of tasks in taskset |
| @@ -2054,7 +2002,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
| 2054 | 2002 | ||
| 2055 | /* @tsk either already exited or can't exit until the end */ | 2003 | /* @tsk either already exited or can't exit until the end */ |
| 2056 | if (tsk->flags & PF_EXITING) | 2004 | if (tsk->flags & PF_EXITING) |
| 2057 | continue; | 2005 | goto next; |
| 2058 | 2006 | ||
| 2059 | /* as per above, nr_threads may decrease, but not increase. */ | 2007 | /* as per above, nr_threads may decrease, but not increase. */ |
| 2060 | BUG_ON(i >= group_size); | 2008 | BUG_ON(i >= group_size); |
| @@ -2062,7 +2010,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
| 2062 | ent.cgrp = task_cgroup_from_root(tsk, root); | 2010 | ent.cgrp = task_cgroup_from_root(tsk, root); |
| 2063 | /* nothing to do if this task is already in the cgroup */ | 2011 | /* nothing to do if this task is already in the cgroup */ |
| 2064 | if (ent.cgrp == cgrp) | 2012 | if (ent.cgrp == cgrp) |
| 2065 | continue; | 2013 | goto next; |
| 2066 | /* | 2014 | /* |
| 2067 | * saying GFP_ATOMIC has no effect here because we did prealloc | 2015 | * saying GFP_ATOMIC has no effect here because we did prealloc |
| 2068 | * earlier, but it's good form to communicate our expectations. | 2016 | * earlier, but it's good form to communicate our expectations. |
| @@ -2070,7 +2018,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
| 2070 | retval = flex_array_put(group, i, &ent, GFP_ATOMIC); | 2018 | retval = flex_array_put(group, i, &ent, GFP_ATOMIC); |
| 2071 | BUG_ON(retval != 0); | 2019 | BUG_ON(retval != 0); |
| 2072 | i++; | 2020 | i++; |
| 2073 | 2021 | next: | |
| 2074 | if (!threadgroup) | 2022 | if (!threadgroup) |
| 2075 | break; | 2023 | break; |
| 2076 | } while_each_thread(leader, tsk); | 2024 | } while_each_thread(leader, tsk); |
| @@ -2089,8 +2037,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
| 2089 | * step 1: check that we can legitimately attach to the cgroup. | 2037 | * step 1: check that we can legitimately attach to the cgroup. |
| 2090 | */ | 2038 | */ |
| 2091 | for_each_root_subsys(root, ss) { | 2039 | for_each_root_subsys(root, ss) { |
| 2040 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); | ||
| 2041 | |||
| 2092 | if (ss->can_attach) { | 2042 | if (ss->can_attach) { |
| 2093 | retval = ss->can_attach(cgrp, &tset); | 2043 | retval = ss->can_attach(css, &tset); |
| 2094 | if (retval) { | 2044 | if (retval) { |
| 2095 | failed_ss = ss; | 2045 | failed_ss = ss; |
| 2096 | goto out_cancel_attach; | 2046 | goto out_cancel_attach; |
| @@ -2107,8 +2057,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
| 2107 | 2057 | ||
| 2108 | tc = flex_array_get(group, i); | 2058 | tc = flex_array_get(group, i); |
| 2109 | old_cset = task_css_set(tc->task); | 2059 | old_cset = task_css_set(tc->task); |
| 2110 | tc->cg = find_css_set(old_cset, cgrp); | 2060 | tc->cset = find_css_set(old_cset, cgrp); |
| 2111 | if (!tc->cg) { | 2061 | if (!tc->cset) { |
| 2112 | retval = -ENOMEM; | 2062 | retval = -ENOMEM; |
| 2113 | goto out_put_css_set_refs; | 2063 | goto out_put_css_set_refs; |
| 2114 | } | 2064 | } |
| @@ -2121,7 +2071,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
| 2121 | */ | 2071 | */ |
| 2122 | for (i = 0; i < group_size; i++) { | 2072 | for (i = 0; i < group_size; i++) { |
| 2123 | tc = flex_array_get(group, i); | 2073 | tc = flex_array_get(group, i); |
| 2124 | cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); | 2074 | cgroup_task_migrate(tc->cgrp, tc->task, tc->cset); |
| 2125 | } | 2075 | } |
| 2126 | /* nothing is sensitive to fork() after this point. */ | 2076 | /* nothing is sensitive to fork() after this point. */ |
| 2127 | 2077 | ||
| @@ -2129,8 +2079,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
| 2129 | * step 4: do subsystem attach callbacks. | 2079 | * step 4: do subsystem attach callbacks. |
| 2130 | */ | 2080 | */ |
| 2131 | for_each_root_subsys(root, ss) { | 2081 | for_each_root_subsys(root, ss) { |
| 2082 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); | ||
| 2083 | |||
| 2132 | if (ss->attach) | 2084 | if (ss->attach) |
| 2133 | ss->attach(cgrp, &tset); | 2085 | ss->attach(css, &tset); |
| 2134 | } | 2086 | } |
| 2135 | 2087 | ||
| 2136 | /* | 2088 | /* |
| @@ -2141,18 +2093,20 @@ out_put_css_set_refs: | |||
| 2141 | if (retval) { | 2093 | if (retval) { |
| 2142 | for (i = 0; i < group_size; i++) { | 2094 | for (i = 0; i < group_size; i++) { |
| 2143 | tc = flex_array_get(group, i); | 2095 | tc = flex_array_get(group, i); |
| 2144 | if (!tc->cg) | 2096 | if (!tc->cset) |
| 2145 | break; | 2097 | break; |
| 2146 | put_css_set(tc->cg); | 2098 | put_css_set(tc->cset); |
| 2147 | } | 2099 | } |
| 2148 | } | 2100 | } |
| 2149 | out_cancel_attach: | 2101 | out_cancel_attach: |
| 2150 | if (retval) { | 2102 | if (retval) { |
| 2151 | for_each_root_subsys(root, ss) { | 2103 | for_each_root_subsys(root, ss) { |
| 2104 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); | ||
| 2105 | |||
| 2152 | if (ss == failed_ss) | 2106 | if (ss == failed_ss) |
| 2153 | break; | 2107 | break; |
| 2154 | if (ss->cancel_attach) | 2108 | if (ss->cancel_attach) |
| 2155 | ss->cancel_attach(cgrp, &tset); | 2109 | ss->cancel_attach(css, &tset); |
| 2156 | } | 2110 | } |
| 2157 | } | 2111 | } |
| 2158 | out_free_group_list: | 2112 | out_free_group_list: |
| @@ -2253,9 +2207,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | |||
| 2253 | 2207 | ||
| 2254 | mutex_lock(&cgroup_mutex); | 2208 | mutex_lock(&cgroup_mutex); |
| 2255 | for_each_active_root(root) { | 2209 | for_each_active_root(root) { |
| 2256 | struct cgroup *from_cg = task_cgroup_from_root(from, root); | 2210 | struct cgroup *from_cgrp = task_cgroup_from_root(from, root); |
| 2257 | 2211 | ||
| 2258 | retval = cgroup_attach_task(from_cg, tsk, false); | 2212 | retval = cgroup_attach_task(from_cgrp, tsk, false); |
| 2259 | if (retval) | 2213 | if (retval) |
| 2260 | break; | 2214 | break; |
| 2261 | } | 2215 | } |
| @@ -2265,34 +2219,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | |||
| 2265 | } | 2219 | } |
| 2266 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); | 2220 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); |
| 2267 | 2221 | ||
| 2268 | static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) | 2222 | static int cgroup_tasks_write(struct cgroup_subsys_state *css, |
| 2223 | struct cftype *cft, u64 pid) | ||
| 2269 | { | 2224 | { |
| 2270 | return attach_task_by_pid(cgrp, pid, false); | 2225 | return attach_task_by_pid(css->cgroup, pid, false); |
| 2271 | } | 2226 | } |
| 2272 | 2227 | ||
| 2273 | static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) | 2228 | static int cgroup_procs_write(struct cgroup_subsys_state *css, |
| 2229 | struct cftype *cft, u64 tgid) | ||
| 2274 | { | 2230 | { |
| 2275 | return attach_task_by_pid(cgrp, tgid, true); | 2231 | return attach_task_by_pid(css->cgroup, tgid, true); |
| 2276 | } | 2232 | } |
| 2277 | 2233 | ||
| 2278 | static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, | 2234 | static int cgroup_release_agent_write(struct cgroup_subsys_state *css, |
| 2279 | const char *buffer) | 2235 | struct cftype *cft, const char *buffer) |
| 2280 | { | 2236 | { |
| 2281 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); | 2237 | BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); |
| 2282 | if (strlen(buffer) >= PATH_MAX) | 2238 | if (strlen(buffer) >= PATH_MAX) |
| 2283 | return -EINVAL; | 2239 | return -EINVAL; |
| 2284 | if (!cgroup_lock_live_group(cgrp)) | 2240 | if (!cgroup_lock_live_group(css->cgroup)) |
| 2285 | return -ENODEV; | 2241 | return -ENODEV; |
| 2286 | mutex_lock(&cgroup_root_mutex); | 2242 | mutex_lock(&cgroup_root_mutex); |
| 2287 | strcpy(cgrp->root->release_agent_path, buffer); | 2243 | strcpy(css->cgroup->root->release_agent_path, buffer); |
| 2288 | mutex_unlock(&cgroup_root_mutex); | 2244 | mutex_unlock(&cgroup_root_mutex); |
| 2289 | mutex_unlock(&cgroup_mutex); | 2245 | mutex_unlock(&cgroup_mutex); |
| 2290 | return 0; | 2246 | return 0; |
| 2291 | } | 2247 | } |
| 2292 | 2248 | ||
| 2293 | static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, | 2249 | static int cgroup_release_agent_show(struct cgroup_subsys_state *css, |
| 2294 | struct seq_file *seq) | 2250 | struct cftype *cft, struct seq_file *seq) |
| 2295 | { | 2251 | { |
| 2252 | struct cgroup *cgrp = css->cgroup; | ||
| 2253 | |||
| 2296 | if (!cgroup_lock_live_group(cgrp)) | 2254 | if (!cgroup_lock_live_group(cgrp)) |
| 2297 | return -ENODEV; | 2255 | return -ENODEV; |
| 2298 | seq_puts(seq, cgrp->root->release_agent_path); | 2256 | seq_puts(seq, cgrp->root->release_agent_path); |
| @@ -2301,20 +2259,20 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, | |||
| 2301 | return 0; | 2259 | return 0; |
| 2302 | } | 2260 | } |
| 2303 | 2261 | ||
| 2304 | static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, | 2262 | static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, |
| 2305 | struct seq_file *seq) | 2263 | struct cftype *cft, struct seq_file *seq) |
| 2306 | { | 2264 | { |
| 2307 | seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); | 2265 | seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); |
| 2308 | return 0; | 2266 | return 0; |
| 2309 | } | 2267 | } |
| 2310 | 2268 | ||
| 2311 | /* A buffer size big enough for numbers or short strings */ | 2269 | /* A buffer size big enough for numbers or short strings */ |
| 2312 | #define CGROUP_LOCAL_BUFFER_SIZE 64 | 2270 | #define CGROUP_LOCAL_BUFFER_SIZE 64 |
| 2313 | 2271 | ||
| 2314 | static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, | 2272 | static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, |
| 2315 | struct file *file, | 2273 | struct cftype *cft, struct file *file, |
| 2316 | const char __user *userbuf, | 2274 | const char __user *userbuf, size_t nbytes, |
| 2317 | size_t nbytes, loff_t *unused_ppos) | 2275 | loff_t *unused_ppos) |
| 2318 | { | 2276 | { |
| 2319 | char buffer[CGROUP_LOCAL_BUFFER_SIZE]; | 2277 | char buffer[CGROUP_LOCAL_BUFFER_SIZE]; |
| 2320 | int retval = 0; | 2278 | int retval = 0; |
| @@ -2332,22 +2290,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, | |||
| 2332 | u64 val = simple_strtoull(strstrip(buffer), &end, 0); | 2290 | u64 val = simple_strtoull(strstrip(buffer), &end, 0); |
| 2333 | if (*end) | 2291 | if (*end) |
| 2334 | return -EINVAL; | 2292 | return -EINVAL; |
| 2335 | retval = cft->write_u64(cgrp, cft, val); | 2293 | retval = cft->write_u64(css, cft, val); |
| 2336 | } else { | 2294 | } else { |
| 2337 | s64 val = simple_strtoll(strstrip(buffer), &end, 0); | 2295 | s64 val = simple_strtoll(strstrip(buffer), &end, 0); |
| 2338 | if (*end) | 2296 | if (*end) |
| 2339 | return -EINVAL; | 2297 | return -EINVAL; |
| 2340 | retval = cft->write_s64(cgrp, cft, val); | 2298 | retval = cft->write_s64(css, cft, val); |
| 2341 | } | 2299 | } |
| 2342 | if (!retval) | 2300 | if (!retval) |
| 2343 | retval = nbytes; | 2301 | retval = nbytes; |
| 2344 | return retval; | 2302 | return retval; |
| 2345 | } | 2303 | } |
| 2346 | 2304 | ||
| 2347 | static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, | 2305 | static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, |
| 2348 | struct file *file, | 2306 | struct cftype *cft, struct file *file, |
| 2349 | const char __user *userbuf, | 2307 | const char __user *userbuf, size_t nbytes, |
| 2350 | size_t nbytes, loff_t *unused_ppos) | 2308 | loff_t *unused_ppos) |
| 2351 | { | 2309 | { |
| 2352 | char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; | 2310 | char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; |
| 2353 | int retval = 0; | 2311 | int retval = 0; |
| @@ -2370,7 +2328,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, | |||
| 2370 | } | 2328 | } |
| 2371 | 2329 | ||
| 2372 | buffer[nbytes] = 0; /* nul-terminate */ | 2330 | buffer[nbytes] = 0; /* nul-terminate */ |
| 2373 | retval = cft->write_string(cgrp, cft, strstrip(buffer)); | 2331 | retval = cft->write_string(css, cft, strstrip(buffer)); |
| 2374 | if (!retval) | 2332 | if (!retval) |
| 2375 | retval = nbytes; | 2333 | retval = nbytes; |
| 2376 | out: | 2334 | out: |
| @@ -2380,65 +2338,60 @@ out: | |||
| 2380 | } | 2338 | } |
| 2381 | 2339 | ||
| 2382 | static ssize_t cgroup_file_write(struct file *file, const char __user *buf, | 2340 | static ssize_t cgroup_file_write(struct file *file, const char __user *buf, |
| 2383 | size_t nbytes, loff_t *ppos) | 2341 | size_t nbytes, loff_t *ppos) |
| 2384 | { | 2342 | { |
| 2343 | struct cfent *cfe = __d_cfe(file->f_dentry); | ||
| 2385 | struct cftype *cft = __d_cft(file->f_dentry); | 2344 | struct cftype *cft = __d_cft(file->f_dentry); |
| 2386 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2345 | struct cgroup_subsys_state *css = cfe->css; |
| 2387 | 2346 | ||
| 2388 | if (cgroup_is_dead(cgrp)) | ||
| 2389 | return -ENODEV; | ||
| 2390 | if (cft->write) | 2347 | if (cft->write) |
| 2391 | return cft->write(cgrp, cft, file, buf, nbytes, ppos); | 2348 | return cft->write(css, cft, file, buf, nbytes, ppos); |
| 2392 | if (cft->write_u64 || cft->write_s64) | 2349 | if (cft->write_u64 || cft->write_s64) |
| 2393 | return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); | 2350 | return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); |
| 2394 | if (cft->write_string) | 2351 | if (cft->write_string) |
| 2395 | return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); | 2352 | return cgroup_write_string(css, cft, file, buf, nbytes, ppos); |
| 2396 | if (cft->trigger) { | 2353 | if (cft->trigger) { |
| 2397 | int ret = cft->trigger(cgrp, (unsigned int)cft->private); | 2354 | int ret = cft->trigger(css, (unsigned int)cft->private); |
| 2398 | return ret ? ret : nbytes; | 2355 | return ret ? ret : nbytes; |
| 2399 | } | 2356 | } |
| 2400 | return -EINVAL; | 2357 | return -EINVAL; |
| 2401 | } | 2358 | } |
| 2402 | 2359 | ||
| 2403 | static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, | 2360 | static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, |
| 2404 | struct file *file, | 2361 | struct cftype *cft, struct file *file, |
| 2405 | char __user *buf, size_t nbytes, | 2362 | char __user *buf, size_t nbytes, loff_t *ppos) |
| 2406 | loff_t *ppos) | ||
| 2407 | { | 2363 | { |
| 2408 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; | 2364 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; |
| 2409 | u64 val = cft->read_u64(cgrp, cft); | 2365 | u64 val = cft->read_u64(css, cft); |
| 2410 | int len = sprintf(tmp, "%llu\n", (unsigned long long) val); | 2366 | int len = sprintf(tmp, "%llu\n", (unsigned long long) val); |
| 2411 | 2367 | ||
| 2412 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | 2368 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); |
| 2413 | } | 2369 | } |
| 2414 | 2370 | ||
| 2415 | static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, | 2371 | static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, |
| 2416 | struct file *file, | 2372 | struct cftype *cft, struct file *file, |
| 2417 | char __user *buf, size_t nbytes, | 2373 | char __user *buf, size_t nbytes, loff_t *ppos) |
| 2418 | loff_t *ppos) | ||
| 2419 | { | 2374 | { |
| 2420 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; | 2375 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; |
| 2421 | s64 val = cft->read_s64(cgrp, cft); | 2376 | s64 val = cft->read_s64(css, cft); |
| 2422 | int len = sprintf(tmp, "%lld\n", (long long) val); | 2377 | int len = sprintf(tmp, "%lld\n", (long long) val); |
| 2423 | 2378 | ||
| 2424 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | 2379 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); |
| 2425 | } | 2380 | } |
| 2426 | 2381 | ||
| 2427 | static ssize_t cgroup_file_read(struct file *file, char __user *buf, | 2382 | static ssize_t cgroup_file_read(struct file *file, char __user *buf, |
| 2428 | size_t nbytes, loff_t *ppos) | 2383 | size_t nbytes, loff_t *ppos) |
| 2429 | { | 2384 | { |
| 2385 | struct cfent *cfe = __d_cfe(file->f_dentry); | ||
| 2430 | struct cftype *cft = __d_cft(file->f_dentry); | 2386 | struct cftype *cft = __d_cft(file->f_dentry); |
| 2431 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2387 | struct cgroup_subsys_state *css = cfe->css; |
| 2432 | |||
| 2433 | if (cgroup_is_dead(cgrp)) | ||
| 2434 | return -ENODEV; | ||
| 2435 | 2388 | ||
| 2436 | if (cft->read) | 2389 | if (cft->read) |
| 2437 | return cft->read(cgrp, cft, file, buf, nbytes, ppos); | 2390 | return cft->read(css, cft, file, buf, nbytes, ppos); |
| 2438 | if (cft->read_u64) | 2391 | if (cft->read_u64) |
| 2439 | return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); | 2392 | return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); |
| 2440 | if (cft->read_s64) | 2393 | if (cft->read_s64) |
| 2441 | return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); | 2394 | return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); |
| 2442 | return -EINVAL; | 2395 | return -EINVAL; |
| 2443 | } | 2396 | } |
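For orientation, here is a minimal sketch of a controller-side cftype against the converted handler prototypes dispatched above (read_u64/write_u64 now receive a css rather than a cgroup). Everything named "demo", the weight field and its 1..1000 range are assumptions for illustration; only the callback signatures and the empty-name terminator come from the code in this patch.

#include <linux/cgroup.h>

/* hypothetical per-css state; the css is assumed to be embedded in it */
struct demo_state {
	struct cgroup_subsys_state css;
	u64 weight;
};

static inline struct demo_state *css_demo(struct cgroup_subsys_state *css)
{
	return container_of(css, struct demo_state, css);
}

static u64 demo_weight_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return css_demo(css)->weight;
}

static int demo_weight_write(struct cgroup_subsys_state *css,
			     struct cftype *cft, u64 val)
{
	if (!val || val > 1000)
		return -EINVAL;		/* propagated by cgroup_write_X64() */
	css_demo(css)->weight = val;
	return 0;			/* success; caller reports nbytes */
}

static struct cftype demo_files[] = {
	{
		.name = "weight",
		.read_u64 = demo_weight_read,
		.write_u64 = demo_weight_write,
	},
	{ }	/* empty name terminates the array */
};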
| 2444 | 2397 | ||
| @@ -2447,11 +2400,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, | |||
| 2447 | * supports string->u64 maps, but can be extended in future. | 2400 | * supports string->u64 maps, but can be extended in future. |
| 2448 | */ | 2401 | */ |
| 2449 | 2402 | ||
| 2450 | struct cgroup_seqfile_state { | ||
| 2451 | struct cftype *cft; | ||
| 2452 | struct cgroup *cgroup; | ||
| 2453 | }; | ||
| 2454 | |||
| 2455 | static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) | 2403 | static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) |
| 2456 | { | 2404 | { |
| 2457 | struct seq_file *sf = cb->state; | 2405 | struct seq_file *sf = cb->state; |
| @@ -2460,69 +2408,86 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) | |||
| 2460 | 2408 | ||
| 2461 | static int cgroup_seqfile_show(struct seq_file *m, void *arg) | 2409 | static int cgroup_seqfile_show(struct seq_file *m, void *arg) |
| 2462 | { | 2410 | { |
| 2463 | struct cgroup_seqfile_state *state = m->private; | 2411 | struct cfent *cfe = m->private; |
| 2464 | struct cftype *cft = state->cft; | 2412 | struct cftype *cft = cfe->type; |
| 2413 | struct cgroup_subsys_state *css = cfe->css; | ||
| 2414 | |||
| 2465 | if (cft->read_map) { | 2415 | if (cft->read_map) { |
| 2466 | struct cgroup_map_cb cb = { | 2416 | struct cgroup_map_cb cb = { |
| 2467 | .fill = cgroup_map_add, | 2417 | .fill = cgroup_map_add, |
| 2468 | .state = m, | 2418 | .state = m, |
| 2469 | }; | 2419 | }; |
| 2470 | return cft->read_map(state->cgroup, cft, &cb); | 2420 | return cft->read_map(css, cft, &cb); |
| 2471 | } | 2421 | } |
| 2472 | return cft->read_seq_string(state->cgroup, cft, m); | 2422 | return cft->read_seq_string(css, cft, m); |
| 2473 | } | ||
| 2474 | |||
| 2475 | static int cgroup_seqfile_release(struct inode *inode, struct file *file) | ||
| 2476 | { | ||
| 2477 | struct seq_file *seq = file->private_data; | ||
| 2478 | kfree(seq->private); | ||
| 2479 | return single_release(inode, file); | ||
| 2480 | } | 2423 | } |
| 2481 | 2424 | ||
| 2482 | static const struct file_operations cgroup_seqfile_operations = { | 2425 | static const struct file_operations cgroup_seqfile_operations = { |
| 2483 | .read = seq_read, | 2426 | .read = seq_read, |
| 2484 | .write = cgroup_file_write, | 2427 | .write = cgroup_file_write, |
| 2485 | .llseek = seq_lseek, | 2428 | .llseek = seq_lseek, |
| 2486 | .release = cgroup_seqfile_release, | 2429 | .release = single_release, |
| 2487 | }; | 2430 | }; |
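A seq_file based handler plugs into the same machinery; a sketch reusing the assumed demo_state/css_demo helpers from the earlier example. cgroup_seqfile_show() above resolves the css from the cfent before calling it.

static int demo_stats_show(struct cgroup_subsys_state *css,
			   struct cftype *cft, struct seq_file *m)
{
	seq_printf(m, "weight %llu\n",
		   (unsigned long long)css_demo(css)->weight);
	return 0;
}

/* hooked up as .read_seq_string = demo_stats_show in a cftype entry */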
| 2488 | 2431 | ||
| 2489 | static int cgroup_file_open(struct inode *inode, struct file *file) | 2432 | static int cgroup_file_open(struct inode *inode, struct file *file) |
| 2490 | { | 2433 | { |
| 2434 | struct cfent *cfe = __d_cfe(file->f_dentry); | ||
| 2435 | struct cftype *cft = __d_cft(file->f_dentry); | ||
| 2436 | struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); | ||
| 2437 | struct cgroup_subsys_state *css; | ||
| 2491 | int err; | 2438 | int err; |
| 2492 | struct cftype *cft; | ||
| 2493 | 2439 | ||
| 2494 | err = generic_file_open(inode, file); | 2440 | err = generic_file_open(inode, file); |
| 2495 | if (err) | 2441 | if (err) |
| 2496 | return err; | 2442 | return err; |
| 2497 | cft = __d_cft(file->f_dentry); | ||
| 2498 | 2443 | ||
| 2499 | if (cft->read_map || cft->read_seq_string) { | 2444 | /* |
| 2500 | struct cgroup_seqfile_state *state; | 2445 | * If the file belongs to a subsystem, pin the css. Will be |
| 2446 | * unpinned either on open failure or release. This ensures that | ||
| 2447 | * @css stays alive for all file operations. | ||
| 2448 | */ | ||
| 2449 | rcu_read_lock(); | ||
| 2450 | css = cgroup_css(cgrp, cft->ss); | ||
| 2451 | if (cft->ss && !css_tryget(css)) | ||
| 2452 | css = NULL; | ||
| 2453 | rcu_read_unlock(); | ||
| 2501 | 2454 | ||
| 2502 | state = kzalloc(sizeof(*state), GFP_USER); | 2455 | if (!css) |
| 2503 | if (!state) | 2456 | return -ENODEV; |
| 2504 | return -ENOMEM; | ||
| 2505 | 2457 | ||
| 2506 | state->cft = cft; | 2458 | /* |
| 2507 | state->cgroup = __d_cgrp(file->f_dentry->d_parent); | 2459 | * @cfe->css is used by read/write/close to determine the |
| 2460 | * associated css. @file->private_data would be a better place but | ||
| 2461 | * that's already used by seqfile. Multiple accessors may use it | ||
| 2462 | * simultaneously which is okay as the association never changes. | ||
| 2463 | */ | ||
| 2464 | WARN_ON_ONCE(cfe->css && cfe->css != css); | ||
| 2465 | cfe->css = css; | ||
| 2466 | |||
| 2467 | if (cft->read_map || cft->read_seq_string) { | ||
| 2508 | file->f_op = &cgroup_seqfile_operations; | 2468 | file->f_op = &cgroup_seqfile_operations; |
| 2509 | err = single_open(file, cgroup_seqfile_show, state); | 2469 | err = single_open(file, cgroup_seqfile_show, cfe); |
| 2510 | if (err < 0) | 2470 | } else if (cft->open) { |
| 2511 | kfree(state); | ||
| 2512 | } else if (cft->open) | ||
| 2513 | err = cft->open(inode, file); | 2471 | err = cft->open(inode, file); |
| 2514 | else | 2472 | } |
| 2515 | err = 0; | ||
| 2516 | 2473 | ||
| 2474 | if (css->ss && err) | ||
| 2475 | css_put(css); | ||
| 2517 | return err; | 2476 | return err; |
| 2518 | } | 2477 | } |
| 2519 | 2478 | ||
| 2520 | static int cgroup_file_release(struct inode *inode, struct file *file) | 2479 | static int cgroup_file_release(struct inode *inode, struct file *file) |
| 2521 | { | 2480 | { |
| 2481 | struct cfent *cfe = __d_cfe(file->f_dentry); | ||
| 2522 | struct cftype *cft = __d_cft(file->f_dentry); | 2482 | struct cftype *cft = __d_cft(file->f_dentry); |
| 2483 | struct cgroup_subsys_state *css = cfe->css; | ||
| 2484 | int ret = 0; | ||
| 2485 | |||
| 2523 | if (cft->release) | 2486 | if (cft->release) |
| 2524 | return cft->release(inode, file); | 2487 | ret = cft->release(inode, file); |
| 2525 | return 0; | 2488 | if (css->ss) |
| 2489 | css_put(css); | ||
| 2490 | return ret; | ||
| 2526 | } | 2491 | } |
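The open/release pair above is the usual pin/unpin pattern around a css looked up under RCU. Condensed into one sketch: cgroup_css(), css_tryget() and css_put() are the helpers the patch itself uses, while demo_pin_css() and its assumption of a real (non-dummy) subsystem css are illustrative only.

static struct cgroup_subsys_state *
demo_pin_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = cgroup_css(cgrp, ss);	/* may be NULL or about to go away */
	if (css && !css_tryget(css))	/* pin only while still alive */
		css = NULL;
	rcu_read_unlock();

	return css;			/* caller drops the pin with css_put() */
}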
| 2527 | 2492 | ||
| 2528 | /* | 2493 | /* |
| @@ -2736,8 +2701,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) | |||
| 2736 | return mode; | 2701 | return mode; |
| 2737 | } | 2702 | } |
| 2738 | 2703 | ||
| 2739 | static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, | 2704 | static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) |
| 2740 | struct cftype *cft) | ||
| 2741 | { | 2705 | { |
| 2742 | struct dentry *dir = cgrp->dentry; | 2706 | struct dentry *dir = cgrp->dentry; |
| 2743 | struct cgroup *parent = __d_cgrp(dir); | 2707 | struct cgroup *parent = __d_cgrp(dir); |
| @@ -2747,8 +2711,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
| 2747 | umode_t mode; | 2711 | umode_t mode; |
| 2748 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; | 2712 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; |
| 2749 | 2713 | ||
| 2750 | if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { | 2714 | if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && |
| 2751 | strcpy(name, subsys->name); | 2715 | !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { |
| 2716 | strcpy(name, cft->ss->name); | ||
| 2752 | strcat(name, "."); | 2717 | strcat(name, "."); |
| 2753 | } | 2718 | } |
| 2754 | strcat(name, cft->name); | 2719 | strcat(name, cft->name); |
| @@ -2782,11 +2747,25 @@ out: | |||
| 2782 | return error; | 2747 | return error; |
| 2783 | } | 2748 | } |
| 2784 | 2749 | ||
| 2785 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | 2750 | /** |
| 2786 | struct cftype cfts[], bool is_add) | 2751 | * cgroup_addrm_files - add or remove files to a cgroup directory |
| 2752 | * @cgrp: the target cgroup | ||
| 2753 | * @cfts: array of cftypes to be added | ||
| 2754 | * @is_add: whether to add or remove | ||
| 2755 | * | ||
| 2756 | * Depending on @is_add, add or remove files defined by @cfts on @cgrp. | ||
| 2757 | * For removals, this function never fails. If addition fails, this | ||
| 2758 | * function doesn't remove files already added. The caller is responsible | ||
| 2759 | * for cleaning up. | ||
| 2760 | */ | ||
| 2761 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | ||
| 2762 | bool is_add) | ||
| 2787 | { | 2763 | { |
| 2788 | struct cftype *cft; | 2764 | struct cftype *cft; |
| 2789 | int err, ret = 0; | 2765 | int ret; |
| 2766 | |||
| 2767 | lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); | ||
| 2768 | lockdep_assert_held(&cgroup_mutex); | ||
| 2790 | 2769 | ||
| 2791 | for (cft = cfts; cft->name[0] != '\0'; cft++) { | 2770 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
| 2792 | /* does cft->flags tell us to skip this file on @cgrp? */ | 2771 | /* does cft->flags tell us to skip this file on @cgrp? */ |
| @@ -2798,16 +2777,17 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
| 2798 | continue; | 2777 | continue; |
| 2799 | 2778 | ||
| 2800 | if (is_add) { | 2779 | if (is_add) { |
| 2801 | err = cgroup_add_file(cgrp, subsys, cft); | 2780 | ret = cgroup_add_file(cgrp, cft); |
| 2802 | if (err) | 2781 | if (ret) { |
| 2803 | pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", | 2782 | pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", |
| 2804 | cft->name, err); | 2783 | cft->name, ret); |
| 2805 | ret = err; | 2784 | return ret; |
| 2785 | } | ||
| 2806 | } else { | 2786 | } else { |
| 2807 | cgroup_rm_file(cgrp, cft); | 2787 | cgroup_rm_file(cgrp, cft); |
| 2808 | } | 2788 | } |
| 2809 | } | 2789 | } |
| 2810 | return ret; | 2790 | return 0; |
| 2811 | } | 2791 | } |
| 2812 | 2792 | ||
| 2813 | static void cgroup_cfts_prepare(void) | 2793 | static void cgroup_cfts_prepare(void) |
| @@ -2816,28 +2796,30 @@ static void cgroup_cfts_prepare(void) | |||
| 2816 | /* | 2796 | /* |
| 2817 | * Thanks to the entanglement with vfs inode locking, we can't walk | 2797 | * Thanks to the entanglement with vfs inode locking, we can't walk |
| 2818 | * the existing cgroups under cgroup_mutex and create files. | 2798 | * the existing cgroups under cgroup_mutex and create files. |
| 2819 | * Instead, we use cgroup_for_each_descendant_pre() and drop RCU | 2799 | * Instead, we use css_for_each_descendant_pre() and drop RCU read |
| 2820 | * read lock before calling cgroup_addrm_files(). | 2800 | * lock before calling cgroup_addrm_files(). |
| 2821 | */ | 2801 | */ |
| 2822 | mutex_lock(&cgroup_mutex); | 2802 | mutex_lock(&cgroup_mutex); |
| 2823 | } | 2803 | } |
| 2824 | 2804 | ||
| 2825 | static void cgroup_cfts_commit(struct cgroup_subsys *ss, | 2805 | static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) |
| 2826 | struct cftype *cfts, bool is_add) | ||
| 2827 | __releases(&cgroup_mutex) | 2806 | __releases(&cgroup_mutex) |
| 2828 | { | 2807 | { |
| 2829 | LIST_HEAD(pending); | 2808 | LIST_HEAD(pending); |
| 2830 | struct cgroup *cgrp, *root = &ss->root->top_cgroup; | 2809 | struct cgroup_subsys *ss = cfts[0].ss; |
| 2810 | struct cgroup *root = &ss->root->top_cgroup; | ||
| 2831 | struct super_block *sb = ss->root->sb; | 2811 | struct super_block *sb = ss->root->sb; |
| 2832 | struct dentry *prev = NULL; | 2812 | struct dentry *prev = NULL; |
| 2833 | struct inode *inode; | 2813 | struct inode *inode; |
| 2814 | struct cgroup_subsys_state *css; | ||
| 2834 | u64 update_before; | 2815 | u64 update_before; |
| 2816 | int ret = 0; | ||
| 2835 | 2817 | ||
| 2836 | /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ | 2818 | /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ |
| 2837 | if (!cfts || ss->root == &cgroup_dummy_root || | 2819 | if (!cfts || ss->root == &cgroup_dummy_root || |
| 2838 | !atomic_inc_not_zero(&sb->s_active)) { | 2820 | !atomic_inc_not_zero(&sb->s_active)) { |
| 2839 | mutex_unlock(&cgroup_mutex); | 2821 | mutex_unlock(&cgroup_mutex); |
| 2840 | return; | 2822 | return 0; |
| 2841 | } | 2823 | } |
| 2842 | 2824 | ||
| 2843 | /* | 2825 | /* |
| @@ -2849,17 +2831,11 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, | |||
| 2849 | 2831 | ||
| 2850 | mutex_unlock(&cgroup_mutex); | 2832 | mutex_unlock(&cgroup_mutex); |
| 2851 | 2833 | ||
| 2852 | /* @root always needs to be updated */ | ||
| 2853 | inode = root->dentry->d_inode; | ||
| 2854 | mutex_lock(&inode->i_mutex); | ||
| 2855 | mutex_lock(&cgroup_mutex); | ||
| 2856 | cgroup_addrm_files(root, ss, cfts, is_add); | ||
| 2857 | mutex_unlock(&cgroup_mutex); | ||
| 2858 | mutex_unlock(&inode->i_mutex); | ||
| 2859 | |||
| 2860 | /* add/rm files for all cgroups created before */ | 2834 | /* add/rm files for all cgroups created before */ |
| 2861 | rcu_read_lock(); | 2835 | rcu_read_lock(); |
| 2862 | cgroup_for_each_descendant_pre(cgrp, root) { | 2836 | css_for_each_descendant_pre(css, cgroup_css(root, ss)) { |
| 2837 | struct cgroup *cgrp = css->cgroup; | ||
| 2838 | |||
| 2863 | if (cgroup_is_dead(cgrp)) | 2839 | if (cgroup_is_dead(cgrp)) |
| 2864 | continue; | 2840 | continue; |
| 2865 | 2841 | ||
| @@ -2873,15 +2849,18 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, | |||
| 2873 | mutex_lock(&inode->i_mutex); | 2849 | mutex_lock(&inode->i_mutex); |
| 2874 | mutex_lock(&cgroup_mutex); | 2850 | mutex_lock(&cgroup_mutex); |
| 2875 | if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) | 2851 | if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) |
| 2876 | cgroup_addrm_files(cgrp, ss, cfts, is_add); | 2852 | ret = cgroup_addrm_files(cgrp, cfts, is_add); |
| 2877 | mutex_unlock(&cgroup_mutex); | 2853 | mutex_unlock(&cgroup_mutex); |
| 2878 | mutex_unlock(&inode->i_mutex); | 2854 | mutex_unlock(&inode->i_mutex); |
| 2879 | 2855 | ||
| 2880 | rcu_read_lock(); | 2856 | rcu_read_lock(); |
| 2857 | if (ret) | ||
| 2858 | break; | ||
| 2881 | } | 2859 | } |
| 2882 | rcu_read_unlock(); | 2860 | rcu_read_unlock(); |
| 2883 | dput(prev); | 2861 | dput(prev); |
| 2884 | deactivate_super(sb); | 2862 | deactivate_super(sb); |
| 2863 | return ret; | ||
| 2885 | } | 2864 | } |
| 2886 | 2865 | ||
| 2887 | /** | 2866 | /** |
| @@ -2901,49 +2880,56 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, | |||
| 2901 | int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | 2880 | int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) |
| 2902 | { | 2881 | { |
| 2903 | struct cftype_set *set; | 2882 | struct cftype_set *set; |
| 2883 | struct cftype *cft; | ||
| 2884 | int ret; | ||
| 2904 | 2885 | ||
| 2905 | set = kzalloc(sizeof(*set), GFP_KERNEL); | 2886 | set = kzalloc(sizeof(*set), GFP_KERNEL); |
| 2906 | if (!set) | 2887 | if (!set) |
| 2907 | return -ENOMEM; | 2888 | return -ENOMEM; |
| 2908 | 2889 | ||
| 2890 | for (cft = cfts; cft->name[0] != '\0'; cft++) | ||
| 2891 | cft->ss = ss; | ||
| 2892 | |||
| 2909 | cgroup_cfts_prepare(); | 2893 | cgroup_cfts_prepare(); |
| 2910 | set->cfts = cfts; | 2894 | set->cfts = cfts; |
| 2911 | list_add_tail(&set->node, &ss->cftsets); | 2895 | list_add_tail(&set->node, &ss->cftsets); |
| 2912 | cgroup_cfts_commit(ss, cfts, true); | 2896 | ret = cgroup_cfts_commit(cfts, true); |
| 2913 | 2897 | if (ret) | |
| 2914 | return 0; | 2898 | cgroup_rm_cftypes(cfts); |
| 2899 | return ret; | ||
| 2915 | } | 2900 | } |
| 2916 | EXPORT_SYMBOL_GPL(cgroup_add_cftypes); | 2901 | EXPORT_SYMBOL_GPL(cgroup_add_cftypes); |
| 2917 | 2902 | ||
| 2918 | /** | 2903 | /** |
| 2919 | * cgroup_rm_cftypes - remove an array of cftypes from a subsystem | 2904 | * cgroup_rm_cftypes - remove an array of cftypes from a subsystem |
| 2920 | * @ss: target cgroup subsystem | ||
| 2921 | * @cfts: zero-length name terminated array of cftypes | 2905 | * @cfts: zero-length name terminated array of cftypes |
| 2922 | * | 2906 | * |
| 2923 | * Unregister @cfts from @ss. Files described by @cfts are removed from | 2907 | * Unregister @cfts. Files described by @cfts are removed from all |
| 2924 | * all existing cgroups to which @ss is attached and all future cgroups | 2908 | * existing cgroups and all future cgroups won't have them either. This |
| 2925 | * won't have them either. This function can be called anytime whether @ss | 2909 | * function can be called anytime whether @cfts' subsys is attached or not. |
| 2926 | * is attached or not. | ||
| 2927 | * | 2910 | * |
| 2928 | * Returns 0 on successful unregistration, -ENOENT if @cfts is not | 2911 | * Returns 0 on successful unregistration, -ENOENT if @cfts is not |
| 2929 | * registered with @ss. | 2912 | * registered. |
| 2930 | */ | 2913 | */ |
| 2931 | int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | 2914 | int cgroup_rm_cftypes(struct cftype *cfts) |
| 2932 | { | 2915 | { |
| 2933 | struct cftype_set *set; | 2916 | struct cftype_set *set; |
| 2934 | 2917 | ||
| 2918 | if (!cfts || !cfts[0].ss) | ||
| 2919 | return -ENOENT; | ||
| 2920 | |||
| 2935 | cgroup_cfts_prepare(); | 2921 | cgroup_cfts_prepare(); |
| 2936 | 2922 | ||
| 2937 | list_for_each_entry(set, &ss->cftsets, node) { | 2923 | list_for_each_entry(set, &cfts[0].ss->cftsets, node) { |
| 2938 | if (set->cfts == cfts) { | 2924 | if (set->cfts == cfts) { |
| 2939 | list_del(&set->node); | 2925 | list_del(&set->node); |
| 2940 | kfree(set); | 2926 | kfree(set); |
| 2941 | cgroup_cfts_commit(ss, cfts, false); | 2927 | cgroup_cfts_commit(cfts, false); |
| 2942 | return 0; | 2928 | return 0; |
| 2943 | } | 2929 | } |
| 2944 | } | 2930 | } |
| 2945 | 2931 | ||
| 2946 | cgroup_cfts_commit(ss, NULL, false); | 2932 | cgroup_cfts_commit(NULL, false); |
| 2947 | return -ENOENT; | 2933 | return -ENOENT; |
| 2948 | } | 2934 | } |
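From a controller's point of view, registration now looks roughly like this; demo_subsys and demo_files are assumed to exist, and note that cgroup_rm_cftypes() no longer takes the subsystem because each cftype carries cft->ss after cgroup_add_cftypes().

static int __init demo_register_files(void)
{
	int ret;

	ret = cgroup_add_cftypes(&demo_subsys, demo_files);
	if (ret)
		pr_err("demo: cgroup_add_cftypes failed: %d\n", ret);
	return ret;
}

static void demo_unregister_files(void)
{
	cgroup_rm_cftypes(demo_files);	/* subsystem argument is gone */
}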
| 2949 | 2935 | ||
| @@ -2966,34 +2952,10 @@ int cgroup_task_count(const struct cgroup *cgrp) | |||
| 2966 | } | 2952 | } |
| 2967 | 2953 | ||
| 2968 | /* | 2954 | /* |
| 2969 | * Advance a list_head iterator. The iterator should be positioned at | 2955 | * To reduce the fork() overhead for systems that are not actually using |
| 2970 | * the start of a css_set | 2956 | * their cgroups capability, we don't maintain the lists running through |
| 2971 | */ | 2957 | * each css_set to its tasks until we see the list actually used - in other |
| 2972 | static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) | 2958 | * words after the first call to css_task_iter_start(). |
| 2973 | { | ||
| 2974 | struct list_head *l = it->cset_link; | ||
| 2975 | struct cgrp_cset_link *link; | ||
| 2976 | struct css_set *cset; | ||
| 2977 | |||
| 2978 | /* Advance to the next non-empty css_set */ | ||
| 2979 | do { | ||
| 2980 | l = l->next; | ||
| 2981 | if (l == &cgrp->cset_links) { | ||
| 2982 | it->cset_link = NULL; | ||
| 2983 | return; | ||
| 2984 | } | ||
| 2985 | link = list_entry(l, struct cgrp_cset_link, cset_link); | ||
| 2986 | cset = link->cset; | ||
| 2987 | } while (list_empty(&cset->tasks)); | ||
| 2988 | it->cset_link = l; | ||
| 2989 | it->task = cset->tasks.next; | ||
| 2990 | } | ||
| 2991 | |||
| 2992 | /* | ||
| 2993 | * To reduce the fork() overhead for systems that are not actually | ||
| 2994 | * using their cgroups capability, we don't maintain the lists running | ||
| 2995 | * through each css_set to its tasks until we see the list actually | ||
| 2996 | * used - in other words after the first call to cgroup_iter_start(). | ||
| 2997 | */ | 2959 | */ |
| 2998 | static void cgroup_enable_task_cg_lists(void) | 2960 | static void cgroup_enable_task_cg_lists(void) |
| 2999 | { | 2961 | { |
| @@ -3024,16 +2986,21 @@ static void cgroup_enable_task_cg_lists(void) | |||
| 3024 | } | 2986 | } |
| 3025 | 2987 | ||
| 3026 | /** | 2988 | /** |
| 3027 | * cgroup_next_sibling - find the next sibling of a given cgroup | 2989 | * css_next_child - find the next child of a given css |
| 3028 | * @pos: the current cgroup | 2990 | * @pos_css: the current position (%NULL to initiate traversal) |
| 2991 | * @parent_css: css whose children to walk | ||
| 3029 | * | 2992 | * |
| 3030 | * This function returns the next sibling of @pos and should be called | 2993 | * This function returns the next child of @parent_css and should be called |
| 3031 | * under RCU read lock. The only requirement is that @pos is accessible. | 2994 | * under RCU read lock. The only requirement is that @parent_css and |
| 3032 | * The next sibling is guaranteed to be returned regardless of @pos's | 2995 | * @pos_css are accessible. The next sibling is guaranteed to be returned |
| 3033 | * state. | 2996 | * regardless of their states. |
| 3034 | */ | 2997 | */ |
| 3035 | struct cgroup *cgroup_next_sibling(struct cgroup *pos) | 2998 | struct cgroup_subsys_state * |
| 2999 | css_next_child(struct cgroup_subsys_state *pos_css, | ||
| 3000 | struct cgroup_subsys_state *parent_css) | ||
| 3036 | { | 3001 | { |
| 3002 | struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; | ||
| 3003 | struct cgroup *cgrp = parent_css->cgroup; | ||
| 3037 | struct cgroup *next; | 3004 | struct cgroup *next; |
| 3038 | 3005 | ||
| 3039 | WARN_ON_ONCE(!rcu_read_lock_held()); | 3006 | WARN_ON_ONCE(!rcu_read_lock_held()); |
| @@ -3048,78 +3015,81 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos) | |||
| 3048 | * safe to dereference from this RCU critical section. If | 3015 | * safe to dereference from this RCU critical section. If |
| 3049 | * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed | 3016 | * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed |
| 3050 | * to be visible as %true here. | 3017 | * to be visible as %true here. |
| 3018 | * | ||
| 3019 | * If @pos is dead, its next pointer can't be dereferenced; | ||
| 3020 | * however, as each cgroup is given a monotonically increasing | ||
| 3021 | * unique serial number and always appended to the sibling list, | ||
| 3022 | * the next one can be found by walking the parent's children until | ||
| 3023 | * we see a cgroup with higher serial number than @pos's. While | ||
| 3024 | * this path can be slower, it's taken only when either the current | ||
| 3025 | * cgroup is removed or iteration and removal race. | ||
| 3051 | */ | 3026 | */ |
| 3052 | if (likely(!cgroup_is_dead(pos))) { | 3027 | if (!pos) { |
| 3028 | next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); | ||
| 3029 | } else if (likely(!cgroup_is_dead(pos))) { | ||
| 3053 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); | 3030 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); |
| 3054 | if (&next->sibling != &pos->parent->children) | 3031 | } else { |
| 3055 | return next; | 3032 | list_for_each_entry_rcu(next, &cgrp->children, sibling) |
| 3056 | return NULL; | 3033 | if (next->serial_nr > pos->serial_nr) |
| 3034 | break; | ||
| 3057 | } | 3035 | } |
| 3058 | 3036 | ||
| 3059 | /* | 3037 | if (&next->sibling == &cgrp->children) |
| 3060 | * Can't dereference the next pointer. Each cgroup is given a | 3038 | return NULL; |
| 3061 | * monotonically increasing unique serial number and always | 3039 | |
| 3062 | * appended to the sibling list, so the next one can be found by | 3040 | return cgroup_css(next, parent_css->ss); |
| 3063 | * walking the parent's children until we see a cgroup with higher | ||
| 3064 | * serial number than @pos's. | ||
| 3065 | * | ||
| 3066 | * While this path can be slow, it's taken only when either the | ||
| 3067 | * current cgroup is removed or iteration and removal race. | ||
| 3068 | */ | ||
| 3069 | list_for_each_entry_rcu(next, &pos->parent->children, sibling) | ||
| 3070 | if (next->serial_nr > pos->serial_nr) | ||
| 3071 | return next; | ||
| 3072 | return NULL; | ||
| 3073 | } | 3041 | } |
| 3074 | EXPORT_SYMBOL_GPL(cgroup_next_sibling); | 3042 | EXPORT_SYMBOL_GPL(css_next_child); |
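css_for_each_child() (referenced elsewhere in this patch) is the loop form of css_next_child(); a sketch that counts the direct children of a css under RCU, with the demo_* name assumed.

static int demo_count_children(struct cgroup_subsys_state *parent_css)
{
	struct cgroup_subsys_state *child;
	int n = 0;

	rcu_read_lock();
	css_for_each_child(child, parent_css)
		n++;		/* children may be offline; filter if needed */
	rcu_read_unlock();

	return n;
}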
| 3075 | 3043 | ||
| 3076 | /** | 3044 | /** |
| 3077 | * cgroup_next_descendant_pre - find the next descendant for pre-order walk | 3045 | * css_next_descendant_pre - find the next descendant for pre-order walk |
| 3078 | * @pos: the current position (%NULL to initiate traversal) | 3046 | * @pos: the current position (%NULL to initiate traversal) |
| 3079 | * @cgroup: cgroup whose descendants to walk | 3047 | * @root: css whose descendants to walk |
| 3080 | * | 3048 | * |
| 3081 | * To be used by cgroup_for_each_descendant_pre(). Find the next | 3049 | * To be used by css_for_each_descendant_pre(). Find the next descendant |
| 3082 | * descendant to visit for pre-order traversal of @cgroup's descendants. | 3050 | * to visit for pre-order traversal of @root's descendants. @root is |
| 3051 | * included in the iteration and the first node to be visited. | ||
| 3083 | * | 3052 | * |
| 3084 | * While this function requires RCU read locking, it doesn't require the | 3053 | * While this function requires RCU read locking, it doesn't require the |
| 3085 | * whole traversal to be contained in a single RCU critical section. This | 3054 | * whole traversal to be contained in a single RCU critical section. This |
| 3086 | * function will return the correct next descendant as long as both @pos | 3055 | * function will return the correct next descendant as long as both @pos |
| 3087 | * and @cgroup are accessible and @pos is a descendant of @cgroup. | 3056 | * and @root are accessible and @pos is a descendant of @root. |
| 3088 | */ | 3057 | */ |
| 3089 | struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, | 3058 | struct cgroup_subsys_state * |
| 3090 | struct cgroup *cgroup) | 3059 | css_next_descendant_pre(struct cgroup_subsys_state *pos, |
| 3060 | struct cgroup_subsys_state *root) | ||
| 3091 | { | 3061 | { |
| 3092 | struct cgroup *next; | 3062 | struct cgroup_subsys_state *next; |
| 3093 | 3063 | ||
| 3094 | WARN_ON_ONCE(!rcu_read_lock_held()); | 3064 | WARN_ON_ONCE(!rcu_read_lock_held()); |
| 3095 | 3065 | ||
| 3096 | /* if first iteration, pretend we just visited @cgroup */ | 3066 | /* if first iteration, visit @root */ |
| 3097 | if (!pos) | 3067 | if (!pos) |
| 3098 | pos = cgroup; | 3068 | return root; |
| 3099 | 3069 | ||
| 3100 | /* visit the first child if exists */ | 3070 | /* visit the first child if exists */ |
| 3101 | next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); | 3071 | next = css_next_child(NULL, pos); |
| 3102 | if (next) | 3072 | if (next) |
| 3103 | return next; | 3073 | return next; |
| 3104 | 3074 | ||
| 3105 | /* no child, visit my or the closest ancestor's next sibling */ | 3075 | /* no child, visit my or the closest ancestor's next sibling */ |
| 3106 | while (pos != cgroup) { | 3076 | while (pos != root) { |
| 3107 | next = cgroup_next_sibling(pos); | 3077 | next = css_next_child(pos, css_parent(pos)); |
| 3108 | if (next) | 3078 | if (next) |
| 3109 | return next; | 3079 | return next; |
| 3110 | pos = pos->parent; | 3080 | pos = css_parent(pos); |
| 3111 | } | 3081 | } |
| 3112 | 3082 | ||
| 3113 | return NULL; | 3083 | return NULL; |
| 3114 | } | 3084 | } |
| 3115 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); | 3085 | EXPORT_SYMBOL_GPL(css_next_descendant_pre); |
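A typical pre-order walk built on the helper above, through the css_for_each_descendant_pre() wrapper the comment refers to (demo_* name assumed).

static void demo_walk_pre(struct cgroup_subsys_state *root)
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root) {
		/* @root comes first, then every parent before its children */
	}
	rcu_read_unlock();
}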
| 3116 | 3086 | ||
| 3117 | /** | 3087 | /** |
| 3118 | * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup | 3088 | * css_rightmost_descendant - return the rightmost descendant of a css |
| 3119 | * @pos: cgroup of interest | 3089 | * @pos: css of interest |
| 3120 | * | 3090 | * |
| 3121 | * Return the rightmost descendant of @pos. If there's no descendant, | 3091 | * Return the rightmost descendant of @pos. If there's no descendant, @pos |
| 3122 | * @pos is returned. This can be used during pre-order traversal to skip | 3092 | * is returned. This can be used during pre-order traversal to skip |
| 3123 | * subtree of @pos. | 3093 | * subtree of @pos. |
| 3124 | * | 3094 | * |
| 3125 | * While this function requires RCU read locking, it doesn't require the | 3095 | * While this function requires RCU read locking, it doesn't require the |
| @@ -3127,9 +3097,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); | |||
| 3127 | * function will return the correct rightmost descendant as long as @pos is | 3097 | * function will return the correct rightmost descendant as long as @pos is |
| 3128 | * accessible. | 3098 | * accessible. |
| 3129 | */ | 3099 | */ |
| 3130 | struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) | 3100 | struct cgroup_subsys_state * |
| 3101 | css_rightmost_descendant(struct cgroup_subsys_state *pos) | ||
| 3131 | { | 3102 | { |
| 3132 | struct cgroup *last, *tmp; | 3103 | struct cgroup_subsys_state *last, *tmp; |
| 3133 | 3104 | ||
| 3134 | WARN_ON_ONCE(!rcu_read_lock_held()); | 3105 | WARN_ON_ONCE(!rcu_read_lock_held()); |
| 3135 | 3106 | ||
| @@ -3137,82 +3108,136 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) | |||
| 3137 | last = pos; | 3108 | last = pos; |
| 3138 | /* ->prev isn't RCU safe, walk ->next till the end */ | 3109 | /* ->prev isn't RCU safe, walk ->next till the end */ |
| 3139 | pos = NULL; | 3110 | pos = NULL; |
| 3140 | list_for_each_entry_rcu(tmp, &last->children, sibling) | 3111 | css_for_each_child(tmp, last) |
| 3141 | pos = tmp; | 3112 | pos = tmp; |
| 3142 | } while (pos); | 3113 | } while (pos); |
| 3143 | 3114 | ||
| 3144 | return last; | 3115 | return last; |
| 3145 | } | 3116 | } |
| 3146 | EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); | 3117 | EXPORT_SYMBOL_GPL(css_rightmost_descendant); |
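The subtree-skipping use mentioned in the comment looks roughly like this inside a pre-order walk; should_visit() and demo_walk_pruned() are assumed.

static void demo_walk_pruned(struct cgroup_subsys_state *root,
			     bool (*should_visit)(struct cgroup_subsys_state *))
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root) {
		if (!should_visit(pos)) {
			/* jump to the last node below @pos so the next
			 * iteration resumes after the whole subtree */
			pos = css_rightmost_descendant(pos);
			continue;
		}
		/* ... process @pos ... */
	}
	rcu_read_unlock();
}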
| 3147 | 3118 | ||
| 3148 | static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) | 3119 | static struct cgroup_subsys_state * |
| 3120 | css_leftmost_descendant(struct cgroup_subsys_state *pos) | ||
| 3149 | { | 3121 | { |
| 3150 | struct cgroup *last; | 3122 | struct cgroup_subsys_state *last; |
| 3151 | 3123 | ||
| 3152 | do { | 3124 | do { |
| 3153 | last = pos; | 3125 | last = pos; |
| 3154 | pos = list_first_or_null_rcu(&pos->children, struct cgroup, | 3126 | pos = css_next_child(NULL, pos); |
| 3155 | sibling); | ||
| 3156 | } while (pos); | 3127 | } while (pos); |
| 3157 | 3128 | ||
| 3158 | return last; | 3129 | return last; |
| 3159 | } | 3130 | } |
| 3160 | 3131 | ||
| 3161 | /** | 3132 | /** |
| 3162 | * cgroup_next_descendant_post - find the next descendant for post-order walk | 3133 | * css_next_descendant_post - find the next descendant for post-order walk |
| 3163 | * @pos: the current position (%NULL to initiate traversal) | 3134 | * @pos: the current position (%NULL to initiate traversal) |
| 3164 | * @cgroup: cgroup whose descendants to walk | 3135 | * @root: css whose descendants to walk |
| 3165 | * | 3136 | * |
| 3166 | * To be used by cgroup_for_each_descendant_post(). Find the next | 3137 | * To be used by css_for_each_descendant_post(). Find the next descendant |
| 3167 | * descendant to visit for post-order traversal of @cgroup's descendants. | 3138 | * to visit for post-order traversal of @root's descendants. @root is |
| 3139 | * included in the iteration and the last node to be visited. | ||
| 3168 | * | 3140 | * |
| 3169 | * While this function requires RCU read locking, it doesn't require the | 3141 | * While this function requires RCU read locking, it doesn't require the |
| 3170 | * whole traversal to be contained in a single RCU critical section. This | 3142 | * whole traversal to be contained in a single RCU critical section. This |
| 3171 | * function will return the correct next descendant as long as both @pos | 3143 | * function will return the correct next descendant as long as both @pos |
| 3172 | * and @cgroup are accessible and @pos is a descendant of @cgroup. | 3144 | * and @cgroup are accessible and @pos is a descendant of @cgroup. |
| 3173 | */ | 3145 | */ |
| 3174 | struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, | 3146 | struct cgroup_subsys_state * |
| 3175 | struct cgroup *cgroup) | 3147 | css_next_descendant_post(struct cgroup_subsys_state *pos, |
| 3148 | struct cgroup_subsys_state *root) | ||
| 3176 | { | 3149 | { |
| 3177 | struct cgroup *next; | 3150 | struct cgroup_subsys_state *next; |
| 3178 | 3151 | ||
| 3179 | WARN_ON_ONCE(!rcu_read_lock_held()); | 3152 | WARN_ON_ONCE(!rcu_read_lock_held()); |
| 3180 | 3153 | ||
| 3181 | /* if first iteration, visit the leftmost descendant */ | 3154 | /* if first iteration, visit leftmost descendant which may be @root */ |
| 3182 | if (!pos) { | 3155 | if (!pos) |
| 3183 | next = cgroup_leftmost_descendant(cgroup); | 3156 | return css_leftmost_descendant(root); |
| 3184 | return next != cgroup ? next : NULL; | 3157 | |
| 3185 | } | 3158 | /* if we visited @root, we're done */ |
| 3159 | if (pos == root) | ||
| 3160 | return NULL; | ||
| 3186 | 3161 | ||
| 3187 | /* if there's an unvisited sibling, visit its leftmost descendant */ | 3162 | /* if there's an unvisited sibling, visit its leftmost descendant */ |
| 3188 | next = cgroup_next_sibling(pos); | 3163 | next = css_next_child(pos, css_parent(pos)); |
| 3189 | if (next) | 3164 | if (next) |
| 3190 | return cgroup_leftmost_descendant(next); | 3165 | return css_leftmost_descendant(next); |
| 3191 | 3166 | ||
| 3192 | /* no sibling left, visit parent */ | 3167 | /* no sibling left, visit parent */ |
| 3193 | next = pos->parent; | 3168 | return css_parent(pos); |
| 3194 | return next != cgroup ? next : NULL; | 3169 | } |
| 3170 | EXPORT_SYMBOL_GPL(css_next_descendant_post); | ||
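And the post-order counterpart via css_for_each_descendant_post(), handy for bottom-up aggregation since every child is visited before its parent and @root comes last (demo_* name assumed).

static void demo_walk_post(struct cgroup_subsys_state *root)
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_post(pos, root) {
		/* all descendants of @pos have already been visited here */
	}
	rcu_read_unlock();
}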
| 3171 | |||
| 3172 | /** | ||
| 3173 | * css_advance_task_iter - advance a task iterator to the next css_set | ||
| 3174 | * @it: the iterator to advance | ||
| 3175 | * | ||
| 3176 | * Advance @it to the next css_set to walk. | ||
| 3177 | */ | ||
| 3178 | static void css_advance_task_iter(struct css_task_iter *it) | ||
| 3179 | { | ||
| 3180 | struct list_head *l = it->cset_link; | ||
| 3181 | struct cgrp_cset_link *link; | ||
| 3182 | struct css_set *cset; | ||
| 3183 | |||
| 3184 | /* Advance to the next non-empty css_set */ | ||
| 3185 | do { | ||
| 3186 | l = l->next; | ||
| 3187 | if (l == &it->origin_css->cgroup->cset_links) { | ||
| 3188 | it->cset_link = NULL; | ||
| 3189 | return; | ||
| 3190 | } | ||
| 3191 | link = list_entry(l, struct cgrp_cset_link, cset_link); | ||
| 3192 | cset = link->cset; | ||
| 3193 | } while (list_empty(&cset->tasks)); | ||
| 3194 | it->cset_link = l; | ||
| 3195 | it->task = cset->tasks.next; | ||
| 3195 | } | 3196 | } |
| 3196 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); | ||
| 3197 | 3197 | ||
| 3198 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) | 3198 | /** |
| 3199 | * css_task_iter_start - initiate task iteration | ||
| 3200 | * @css: the css to walk tasks of | ||
| 3201 | * @it: the task iterator to use | ||
| 3202 | * | ||
| 3203 | * Initiate iteration through the tasks of @css. The caller can call | ||
| 3204 | * css_task_iter_next() to walk through the tasks until the function | ||
| 3205 | * returns NULL. On completion of iteration, css_task_iter_end() must be | ||
| 3206 | * called. | ||
| 3207 | * | ||
| 3208 | * Note that this function acquires a lock which is released when the | ||
| 3209 | * iteration finishes. The caller can't sleep while iteration is in | ||
| 3210 | * progress. | ||
| 3211 | */ | ||
| 3212 | void css_task_iter_start(struct cgroup_subsys_state *css, | ||
| 3213 | struct css_task_iter *it) | ||
| 3199 | __acquires(css_set_lock) | 3214 | __acquires(css_set_lock) |
| 3200 | { | 3215 | { |
| 3201 | /* | 3216 | /* |
| 3202 | * The first time anyone tries to iterate across a cgroup, | 3217 | * The first time anyone tries to iterate across a css, we need to |
| 3203 | * we need to enable the list linking each css_set to its | 3218 | * enable the list linking each css_set to its tasks, and fix up |
| 3204 | * tasks, and fix up all existing tasks. | 3219 | * all existing tasks. |
| 3205 | */ | 3220 | */ |
| 3206 | if (!use_task_css_set_links) | 3221 | if (!use_task_css_set_links) |
| 3207 | cgroup_enable_task_cg_lists(); | 3222 | cgroup_enable_task_cg_lists(); |
| 3208 | 3223 | ||
| 3209 | read_lock(&css_set_lock); | 3224 | read_lock(&css_set_lock); |
| 3210 | it->cset_link = &cgrp->cset_links; | 3225 | |
| 3211 | cgroup_advance_iter(cgrp, it); | 3226 | it->origin_css = css; |
| 3227 | it->cset_link = &css->cgroup->cset_links; | ||
| 3228 | |||
| 3229 | css_advance_task_iter(it); | ||
| 3212 | } | 3230 | } |
| 3213 | 3231 | ||
| 3214 | struct task_struct *cgroup_iter_next(struct cgroup *cgrp, | 3232 | /** |
| 3215 | struct cgroup_iter *it) | 3233 | * css_task_iter_next - return the next task for the iterator |
| 3234 | * @it: the task iterator being iterated | ||
| 3235 | * | ||
| 3236 | * The "next" function for task iteration. @it should have been | ||
| 3237 | * initialized via css_task_iter_start(). Returns NULL when the iteration | ||
| 3238 | * reaches the end. | ||
| 3239 | */ | ||
| 3240 | struct task_struct *css_task_iter_next(struct css_task_iter *it) | ||
| 3216 | { | 3241 | { |
| 3217 | struct task_struct *res; | 3242 | struct task_struct *res; |
| 3218 | struct list_head *l = it->task; | 3243 | struct list_head *l = it->task; |
| @@ -3226,16 +3251,24 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, | |||
| 3226 | l = l->next; | 3251 | l = l->next; |
| 3227 | link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); | 3252 | link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); |
| 3228 | if (l == &link->cset->tasks) { | 3253 | if (l == &link->cset->tasks) { |
| 3229 | /* We reached the end of this task list - move on to | 3254 | /* |
| 3230 | * the next cg_cgroup_link */ | 3255 | * We reached the end of this task list - move on to the |
| 3231 | cgroup_advance_iter(cgrp, it); | 3256 | * next cgrp_cset_link. |
| 3257 | */ | ||
| 3258 | css_advance_task_iter(it); | ||
| 3232 | } else { | 3259 | } else { |
| 3233 | it->task = l; | 3260 | it->task = l; |
| 3234 | } | 3261 | } |
| 3235 | return res; | 3262 | return res; |
| 3236 | } | 3263 | } |
| 3237 | 3264 | ||
| 3238 | void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) | 3265 | /** |
| 3266 | * css_task_iter_end - finish task iteration | ||
| 3267 | * @it: the task iterator to finish | ||
| 3268 | * | ||
| 3269 | * Finish task iteration started by css_task_iter_start(). | ||
| 3270 | */ | ||
| 3271 | void css_task_iter_end(struct css_task_iter *it) | ||
| 3239 | __releases(css_set_lock) | 3272 | __releases(css_set_lock) |
| 3240 | { | 3273 | { |
| 3241 | read_unlock(&css_set_lock); | 3274 | read_unlock(&css_set_lock); |
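Put together, the start/next/end triple gives the canonical task walk; a sketch that counts the tasks of a css (css_set_lock stays read-held for the whole loop, so the body must not sleep). demo_count_tasks() is assumed.

static int demo_count_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	int n = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		n++;
	css_task_iter_end(&it);

	return n;
}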
| @@ -3276,46 +3309,49 @@ static inline int started_after(void *p1, void *p2) | |||
| 3276 | } | 3309 | } |
| 3277 | 3310 | ||
| 3278 | /** | 3311 | /** |
| 3279 | * cgroup_scan_tasks - iterate though all the tasks in a cgroup | 3312 | * css_scan_tasks - iterate though all the tasks in a css |
| 3280 | * @scan: struct cgroup_scanner containing arguments for the scan | 3313 | * @css: the css to iterate tasks of |
| 3314 | * @test: optional test callback | ||
| 3315 | * @process: process callback | ||
| 3316 | * @data: data passed to @test and @process | ||
| 3317 | * @heap: optional pre-allocated heap used for task iteration | ||
| 3318 | * | ||
| 3319 | * Iterate through all the tasks in @css, calling @test for each, and if it | ||
| 3320 | * returns %true, call @process for it also. | ||
| 3281 | * | 3321 | * |
| 3282 | * Arguments include pointers to callback functions test_task() and | 3322 | * @test may be NULL, meaning always true (select all tasks), which |
| 3283 | * process_task(). | 3323 | * effectively duplicates css_task_iter_{start,next,end}() but does not |
| 3284 | * Iterate through all the tasks in a cgroup, calling test_task() for each, | 3324 | * lock css_set_lock for the call to @process. |
| 3285 | * and if it returns true, call process_task() for it also. | ||
| 3286 | * The test_task pointer may be NULL, meaning always true (select all tasks). | ||
| 3287 | * Effectively duplicates cgroup_iter_{start,next,end}() | ||
| 3288 | * but does not lock css_set_lock for the call to process_task(). | ||
| 3289 | * The struct cgroup_scanner may be embedded in any structure of the caller's | ||
| 3290 | * creation. | ||
| 3291 | * It is guaranteed that process_task() will act on every task that | ||
| 3292 | * is a member of the cgroup for the duration of this call. This | ||
| 3293 | * function may or may not call process_task() for tasks that exit | ||
| 3294 | * or move to a different cgroup during the call, or are forked or | ||
| 3295 | * move into the cgroup during the call. | ||
| 3296 | * | 3325 | * |
| 3297 | * Note that test_task() may be called with locks held, and may in some | 3326 | * It is guaranteed that @process will act on every task that is a member |
| 3298 | * situations be called multiple times for the same task, so it should | 3327 | * of @css for the duration of this call. This function may or may not |
| 3299 | * be cheap. | 3328 | * call @process for tasks that exit or move to a different css during the |
| 3300 | * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been | 3329 | * call, or are forked or move into the css during the call. |
| 3301 | * pre-allocated and will be used for heap operations (and its "gt" member will | 3330 | * |
| 3302 | * be overwritten), else a temporary heap will be used (allocation of which | 3331 | * Note that @test may be called with locks held, and may in some |
| 3303 | * may cause this function to fail). | 3332 | * situations be called multiple times for the same task, so it should be |
| 3333 | * cheap. | ||
| 3334 | * | ||
| 3335 | * If @heap is non-NULL, a heap has been pre-allocated and will be used for | ||
| 3336 | * heap operations (and its "gt" member will be overwritten), else a | ||
| 3337 | * temporary heap will be used (allocation of which may cause this function | ||
| 3338 | * to fail). | ||
| 3304 | */ | 3339 | */ |
| 3305 | int cgroup_scan_tasks(struct cgroup_scanner *scan) | 3340 | int css_scan_tasks(struct cgroup_subsys_state *css, |
| 3341 | bool (*test)(struct task_struct *, void *), | ||
| 3342 | void (*process)(struct task_struct *, void *), | ||
| 3343 | void *data, struct ptr_heap *heap) | ||
| 3306 | { | 3344 | { |
| 3307 | int retval, i; | 3345 | int retval, i; |
| 3308 | struct cgroup_iter it; | 3346 | struct css_task_iter it; |
| 3309 | struct task_struct *p, *dropped; | 3347 | struct task_struct *p, *dropped; |
| 3310 | /* Never dereference latest_task, since it's not refcounted */ | 3348 | /* Never dereference latest_task, since it's not refcounted */ |
| 3311 | struct task_struct *latest_task = NULL; | 3349 | struct task_struct *latest_task = NULL; |
| 3312 | struct ptr_heap tmp_heap; | 3350 | struct ptr_heap tmp_heap; |
| 3313 | struct ptr_heap *heap; | ||
| 3314 | struct timespec latest_time = { 0, 0 }; | 3351 | struct timespec latest_time = { 0, 0 }; |
| 3315 | 3352 | ||
| 3316 | if (scan->heap) { | 3353 | if (heap) { |
| 3317 | /* The caller supplied our heap and pre-allocated its memory */ | 3354 | /* The caller supplied our heap and pre-allocated its memory */ |
| 3318 | heap = scan->heap; | ||
| 3319 | heap->gt = &started_after; | 3355 | heap->gt = &started_after; |
| 3320 | } else { | 3356 | } else { |
| 3321 | /* We need to allocate our own heap memory */ | 3357 | /* We need to allocate our own heap memory */ |
| @@ -3328,25 +3364,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
| 3328 | 3364 | ||
| 3329 | again: | 3365 | again: |
| 3330 | /* | 3366 | /* |
| 3331 | * Scan tasks in the cgroup, using the scanner's "test_task" callback | 3367 | * Scan tasks in the css, using the @test callback to determine |
| 3332 | * to determine which are of interest, and using the scanner's | 3368 | * which are of interest, and invoking @process callback on the |
| 3333 | * "process_task" callback to process any of them that need an update. | 3369 | * ones which need an update. Since we don't want to hold any |
| 3334 | * Since we don't want to hold any locks during the task updates, | 3370 | * locks during the task updates, gather tasks to be processed in a |
| 3335 | * gather tasks to be processed in a heap structure. | 3371 | * heap structure. The heap is sorted by descending task start |
| 3336 | * The heap is sorted by descending task start time. | 3372 | * time. If the statically-sized heap fills up, we overflow tasks |
| 3337 | * If the statically-sized heap fills up, we overflow tasks that | 3373 | * that started later, and in future iterations only consider tasks |
| 3338 | * started later, and in future iterations only consider tasks that | 3374 | * that started after the latest task in the previous pass. This |
| 3339 | * started after the latest task in the previous pass. This | ||
| 3340 | * guarantees forward progress and that we don't miss any tasks. | 3375 | * guarantees forward progress and that we don't miss any tasks. |
| 3341 | */ | 3376 | */ |
| 3342 | heap->size = 0; | 3377 | heap->size = 0; |
| 3343 | cgroup_iter_start(scan->cg, &it); | 3378 | css_task_iter_start(css, &it); |
| 3344 | while ((p = cgroup_iter_next(scan->cg, &it))) { | 3379 | while ((p = css_task_iter_next(&it))) { |
| 3345 | /* | 3380 | /* |
| 3346 | * Only affect tasks that qualify per the caller's callback, | 3381 | * Only affect tasks that qualify per the caller's callback, |
| 3347 | * if he provided one | 3382 | * if he provided one |
| 3348 | */ | 3383 | */ |
| 3349 | if (scan->test_task && !scan->test_task(p, scan)) | 3384 | if (test && !test(p, data)) |
| 3350 | continue; | 3385 | continue; |
| 3351 | /* | 3386 | /* |
| 3352 | * Only process tasks that started after the last task | 3387 | * Only process tasks that started after the last task |
| @@ -3374,7 +3409,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
| 3374 | * the heap and wasn't inserted | 3409 | * the heap and wasn't inserted |
| 3375 | */ | 3410 | */ |
| 3376 | } | 3411 | } |
| 3377 | cgroup_iter_end(scan->cg, &it); | 3412 | css_task_iter_end(&it); |
| 3378 | 3413 | ||
| 3379 | if (heap->size) { | 3414 | if (heap->size) { |
| 3380 | for (i = 0; i < heap->size; i++) { | 3415 | for (i = 0; i < heap->size; i++) { |
| @@ -3384,7 +3419,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
| 3384 | latest_task = q; | 3419 | latest_task = q; |
| 3385 | } | 3420 | } |
| 3386 | /* Process the task per the caller's callback */ | 3421 | /* Process the task per the caller's callback */ |
| 3387 | scan->process_task(q, scan); | 3422 | process(q, data); |
| 3388 | put_task_struct(q); | 3423 | put_task_struct(q); |
| 3389 | } | 3424 | } |
| 3390 | /* | 3425 | /* |
| @@ -3401,10 +3436,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
| 3401 | return 0; | 3436 | return 0; |
| 3402 | } | 3437 | } |
| 3403 | 3438 | ||
| 3404 | static void cgroup_transfer_one_task(struct task_struct *task, | 3439 | static void cgroup_transfer_one_task(struct task_struct *task, void *data) |
| 3405 | struct cgroup_scanner *scan) | ||
| 3406 | { | 3440 | { |
| 3407 | struct cgroup *new_cgroup = scan->data; | 3441 | struct cgroup *new_cgroup = data; |
| 3408 | 3442 | ||
| 3409 | mutex_lock(&cgroup_mutex); | 3443 | mutex_lock(&cgroup_mutex); |
| 3410 | cgroup_attach_task(new_cgroup, task, false); | 3444 | cgroup_attach_task(new_cgroup, task, false); |
| @@ -3418,15 +3452,8 @@ static void cgroup_transfer_one_task(struct task_struct *task, | |||
| 3418 | */ | 3452 | */ |
| 3419 | int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | 3453 | int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) |
| 3420 | { | 3454 | { |
| 3421 | struct cgroup_scanner scan; | 3455 | return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task, |
| 3422 | 3456 | to, NULL); | |
| 3423 | scan.cg = from; | ||
| 3424 | scan.test_task = NULL; /* select all tasks in cgroup */ | ||
| 3425 | scan.process_task = cgroup_transfer_one_task; | ||
| 3426 | scan.heap = NULL; | ||
| 3427 | scan.data = to; | ||
| 3428 | |||
| 3429 | return cgroup_scan_tasks(&scan); | ||
| 3430 | } | 3457 | } |
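cgroup_transfer_tasks() above is the simplest css_scan_tasks() user (no @test, no pre-allocated heap). With a selection callback the shape is roughly the following; the "count runnable tasks" purpose and all demo_* names are assumptions.

static bool demo_test_running(struct task_struct *task, void *data)
{
	return task->state == TASK_RUNNING;	/* may run under locks, keep cheap */
}

static void demo_count_one(struct task_struct *task, void *data)
{
	atomic_inc(data);
}

static int demo_count_running(struct cgroup_subsys_state *css, atomic_t *nr)
{
	/* NULL @heap: css_scan_tasks() allocates a temporary heap itself */
	return css_scan_tasks(css, demo_test_running, demo_count_one, nr, NULL);
}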
| 3431 | 3458 | ||
| 3432 | /* | 3459 | /* |
| @@ -3468,7 +3495,7 @@ struct cgroup_pidlist { | |||
| 3468 | /* pointer to the cgroup we belong to, for list removal purposes */ | 3495 | /* pointer to the cgroup we belong to, for list removal purposes */ |
| 3469 | struct cgroup *owner; | 3496 | struct cgroup *owner; |
| 3470 | /* protects the other fields */ | 3497 | /* protects the other fields */ |
| 3471 | struct rw_semaphore mutex; | 3498 | struct rw_semaphore rwsem; |
| 3472 | }; | 3499 | }; |
| 3473 | 3500 | ||
| 3474 | /* | 3501 | /* |
| @@ -3541,7 +3568,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
| 3541 | struct pid_namespace *ns = task_active_pid_ns(current); | 3568 | struct pid_namespace *ns = task_active_pid_ns(current); |
| 3542 | 3569 | ||
| 3543 | /* | 3570 | /* |
| 3544 | * We can't drop the pidlist_mutex before taking the l->mutex in case | 3571 | * We can't drop the pidlist_mutex before taking the l->rwsem in case |
| 3545 | * the last ref-holder is trying to remove l from the list at the same | 3572 | * the last ref-holder is trying to remove l from the list at the same |
| 3546 | * time. Holding the pidlist_mutex precludes somebody taking whichever | 3573 | * time. Holding the pidlist_mutex precludes somebody taking whichever |
| 3547 | * list we find out from under us - compare release_pid_array(). | 3574 | * list we find out from under us - compare release_pid_array(). |
| @@ -3550,7 +3577,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
| 3550 | list_for_each_entry(l, &cgrp->pidlists, links) { | 3577 | list_for_each_entry(l, &cgrp->pidlists, links) { |
| 3551 | if (l->key.type == type && l->key.ns == ns) { | 3578 | if (l->key.type == type && l->key.ns == ns) { |
| 3552 | /* make sure l doesn't vanish out from under us */ | 3579 | /* make sure l doesn't vanish out from under us */ |
| 3553 | down_write(&l->mutex); | 3580 | down_write(&l->rwsem); |
| 3554 | mutex_unlock(&cgrp->pidlist_mutex); | 3581 | mutex_unlock(&cgrp->pidlist_mutex); |
| 3555 | return l; | 3582 | return l; |
| 3556 | } | 3583 | } |
| @@ -3561,8 +3588,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
| 3561 | mutex_unlock(&cgrp->pidlist_mutex); | 3588 | mutex_unlock(&cgrp->pidlist_mutex); |
| 3562 | return l; | 3589 | return l; |
| 3563 | } | 3590 | } |
| 3564 | init_rwsem(&l->mutex); | 3591 | init_rwsem(&l->rwsem); |
| 3565 | down_write(&l->mutex); | 3592 | down_write(&l->rwsem); |
| 3566 | l->key.type = type; | 3593 | l->key.type = type; |
| 3567 | l->key.ns = get_pid_ns(ns); | 3594 | l->key.ns = get_pid_ns(ns); |
| 3568 | l->owner = cgrp; | 3595 | l->owner = cgrp; |
| @@ -3580,7 +3607,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
| 3580 | pid_t *array; | 3607 | pid_t *array; |
| 3581 | int length; | 3608 | int length; |
| 3582 | int pid, n = 0; /* used for populating the array */ | 3609 | int pid, n = 0; /* used for populating the array */ |
| 3583 | struct cgroup_iter it; | 3610 | struct css_task_iter it; |
| 3584 | struct task_struct *tsk; | 3611 | struct task_struct *tsk; |
| 3585 | struct cgroup_pidlist *l; | 3612 | struct cgroup_pidlist *l; |
| 3586 | 3613 | ||
| @@ -3595,8 +3622,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
| 3595 | if (!array) | 3622 | if (!array) |
| 3596 | return -ENOMEM; | 3623 | return -ENOMEM; |
| 3597 | /* now, populate the array */ | 3624 | /* now, populate the array */ |
| 3598 | cgroup_iter_start(cgrp, &it); | 3625 | css_task_iter_start(&cgrp->dummy_css, &it); |
| 3599 | while ((tsk = cgroup_iter_next(cgrp, &it))) { | 3626 | while ((tsk = css_task_iter_next(&it))) { |
| 3600 | if (unlikely(n == length)) | 3627 | if (unlikely(n == length)) |
| 3601 | break; | 3628 | break; |
| 3602 | /* get tgid or pid for procs or tasks file respectively */ | 3629 | /* get tgid or pid for procs or tasks file respectively */ |
| @@ -3607,7 +3634,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
| 3607 | if (pid > 0) /* make sure to only use valid results */ | 3634 | if (pid > 0) /* make sure to only use valid results */ |
| 3608 | array[n++] = pid; | 3635 | array[n++] = pid; |
| 3609 | } | 3636 | } |
| 3610 | cgroup_iter_end(cgrp, &it); | 3637 | css_task_iter_end(&it); |
| 3611 | length = n; | 3638 | length = n; |
| 3612 | /* now sort & (if procs) strip out duplicates */ | 3639 | /* now sort & (if procs) strip out duplicates */ |
| 3613 | sort(array, length, sizeof(pid_t), cmppid, NULL); | 3640 | sort(array, length, sizeof(pid_t), cmppid, NULL); |
| @@ -3623,7 +3650,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
| 3623 | l->list = array; | 3650 | l->list = array; |
| 3624 | l->length = length; | 3651 | l->length = length; |
| 3625 | l->use_count++; | 3652 | l->use_count++; |
| 3626 | up_write(&l->mutex); | 3653 | up_write(&l->rwsem); |
| 3627 | *lp = l; | 3654 | *lp = l; |
| 3628 | return 0; | 3655 | return 0; |
| 3629 | } | 3656 | } |
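pidlist_array_load() now walks tasks through the css-based iterator instead of the old cgroup_iter_* calls, using the cgroup's dummy_css to mean "every task in this cgroup regardless of subsystem". As a hedged illustration only (not part of the patch), a caller that merely counts the tasks attached to a cgroup would look like this:

#include <linux/cgroup.h>
#include <linux/sched.h>

/* Illustrative only: count tasks attached to @cgrp via its dummy css. */
static int count_cgroup_tasks(struct cgroup *cgrp)
{
	struct css_task_iter it;
	struct task_struct *tsk;
	int n = 0;

	css_task_iter_start(&cgrp->dummy_css, &it);
	while ((tsk = css_task_iter_next(&it)))
		n++;
	css_task_iter_end(&it);

	return n;
}
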
| @@ -3641,7 +3668,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
| 3641 | { | 3668 | { |
| 3642 | int ret = -EINVAL; | 3669 | int ret = -EINVAL; |
| 3643 | struct cgroup *cgrp; | 3670 | struct cgroup *cgrp; |
| 3644 | struct cgroup_iter it; | 3671 | struct css_task_iter it; |
| 3645 | struct task_struct *tsk; | 3672 | struct task_struct *tsk; |
| 3646 | 3673 | ||
| 3647 | /* | 3674 | /* |
| @@ -3655,8 +3682,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
| 3655 | ret = 0; | 3682 | ret = 0; |
| 3656 | cgrp = dentry->d_fsdata; | 3683 | cgrp = dentry->d_fsdata; |
| 3657 | 3684 | ||
| 3658 | cgroup_iter_start(cgrp, &it); | 3685 | css_task_iter_start(&cgrp->dummy_css, &it); |
| 3659 | while ((tsk = cgroup_iter_next(cgrp, &it))) { | 3686 | while ((tsk = css_task_iter_next(&it))) { |
| 3660 | switch (tsk->state) { | 3687 | switch (tsk->state) { |
| 3661 | case TASK_RUNNING: | 3688 | case TASK_RUNNING: |
| 3662 | stats->nr_running++; | 3689 | stats->nr_running++; |
| @@ -3676,7 +3703,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
| 3676 | break; | 3703 | break; |
| 3677 | } | 3704 | } |
| 3678 | } | 3705 | } |
| 3679 | cgroup_iter_end(cgrp, &it); | 3706 | css_task_iter_end(&it); |
| 3680 | 3707 | ||
| 3681 | err: | 3708 | err: |
| 3682 | return ret; | 3709 | return ret; |
| @@ -3701,7 +3728,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) | |||
| 3701 | int index = 0, pid = *pos; | 3728 | int index = 0, pid = *pos; |
| 3702 | int *iter; | 3729 | int *iter; |
| 3703 | 3730 | ||
| 3704 | down_read(&l->mutex); | 3731 | down_read(&l->rwsem); |
| 3705 | if (pid) { | 3732 | if (pid) { |
| 3706 | int end = l->length; | 3733 | int end = l->length; |
| 3707 | 3734 | ||
| @@ -3728,7 +3755,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) | |||
| 3728 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) | 3755 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) |
| 3729 | { | 3756 | { |
| 3730 | struct cgroup_pidlist *l = s->private; | 3757 | struct cgroup_pidlist *l = s->private; |
| 3731 | up_read(&l->mutex); | 3758 | up_read(&l->rwsem); |
| 3732 | } | 3759 | } |
| 3733 | 3760 | ||
| 3734 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) | 3761 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) |
| @@ -3774,7 +3801,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) | |||
| 3774 | * pidlist_mutex, we have to take pidlist_mutex first. | 3801 | * pidlist_mutex, we have to take pidlist_mutex first. |
| 3775 | */ | 3802 | */ |
| 3776 | mutex_lock(&l->owner->pidlist_mutex); | 3803 | mutex_lock(&l->owner->pidlist_mutex); |
| 3777 | down_write(&l->mutex); | 3804 | down_write(&l->rwsem); |
| 3778 | BUG_ON(!l->use_count); | 3805 | BUG_ON(!l->use_count); |
| 3779 | if (!--l->use_count) { | 3806 | if (!--l->use_count) { |
| 3780 | /* we're the last user if refcount is 0; remove and free */ | 3807 | /* we're the last user if refcount is 0; remove and free */ |
| @@ -3782,12 +3809,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) | |||
| 3782 | mutex_unlock(&l->owner->pidlist_mutex); | 3809 | mutex_unlock(&l->owner->pidlist_mutex); |
| 3783 | pidlist_free(l->list); | 3810 | pidlist_free(l->list); |
| 3784 | put_pid_ns(l->key.ns); | 3811 | put_pid_ns(l->key.ns); |
| 3785 | up_write(&l->mutex); | 3812 | up_write(&l->rwsem); |
| 3786 | kfree(l); | 3813 | kfree(l); |
| 3787 | return; | 3814 | return; |
| 3788 | } | 3815 | } |
| 3789 | mutex_unlock(&l->owner->pidlist_mutex); | 3816 | mutex_unlock(&l->owner->pidlist_mutex); |
| 3790 | up_write(&l->mutex); | 3817 | up_write(&l->rwsem); |
| 3791 | } | 3818 | } |
| 3792 | 3819 | ||
| 3793 | static int cgroup_pidlist_release(struct inode *inode, struct file *file) | 3820 | static int cgroup_pidlist_release(struct inode *inode, struct file *file) |
| @@ -3851,21 +3878,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file) | |||
| 3851 | return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); | 3878 | return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); |
| 3852 | } | 3879 | } |
| 3853 | 3880 | ||
| 3854 | static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, | 3881 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, |
| 3855 | struct cftype *cft) | 3882 | struct cftype *cft) |
| 3856 | { | 3883 | { |
| 3857 | return notify_on_release(cgrp); | 3884 | return notify_on_release(css->cgroup); |
| 3858 | } | 3885 | } |
| 3859 | 3886 | ||
| 3860 | static int cgroup_write_notify_on_release(struct cgroup *cgrp, | 3887 | static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, |
| 3861 | struct cftype *cft, | 3888 | struct cftype *cft, u64 val) |
| 3862 | u64 val) | ||
| 3863 | { | 3889 | { |
| 3864 | clear_bit(CGRP_RELEASABLE, &cgrp->flags); | 3890 | clear_bit(CGRP_RELEASABLE, &css->cgroup->flags); |
| 3865 | if (val) | 3891 | if (val) |
| 3866 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 3892 | set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); |
| 3867 | else | 3893 | else |
| 3868 | clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 3894 | clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); |
| 3869 | return 0; | 3895 | return 0; |
| 3870 | } | 3896 | } |
| 3871 | 3897 | ||
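The notify_on_release handlers show the signature change that runs through this whole series: cftype read/write callbacks now take the cgroup_subsys_state instead of the cgroup, and the cgroup is reached through css->cgroup when needed. For a controller that keeps its own per-css state, the handler pair would plausibly look like the sketch below; the demo_* names and the knob field are invented for illustration.

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/cgroup.h>

struct demo_css {
	struct cgroup_subsys_state css;
	u64 knob;
};

static inline struct demo_css *to_demo(struct cgroup_subsys_state *css)
{
	return container_of(css, struct demo_css, css);
}

/* read_u64 handler in the new css-based form */
static u64 demo_knob_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return to_demo(css)->knob;
}

/* write_u64 handler in the new css-based form */
static int demo_knob_write(struct cgroup_subsys_state *css, struct cftype *cft,
			   u64 val)
{
	to_demo(css)->knob = val;
	return 0;
}
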
| @@ -3895,18 +3921,18 @@ static void cgroup_event_remove(struct work_struct *work) | |||
| 3895 | { | 3921 | { |
| 3896 | struct cgroup_event *event = container_of(work, struct cgroup_event, | 3922 | struct cgroup_event *event = container_of(work, struct cgroup_event, |
| 3897 | remove); | 3923 | remove); |
| 3898 | struct cgroup *cgrp = event->cgrp; | 3924 | struct cgroup_subsys_state *css = event->css; |
| 3899 | 3925 | ||
| 3900 | remove_wait_queue(event->wqh, &event->wait); | 3926 | remove_wait_queue(event->wqh, &event->wait); |
| 3901 | 3927 | ||
| 3902 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); | 3928 | event->cft->unregister_event(css, event->cft, event->eventfd); |
| 3903 | 3929 | ||
| 3904 | /* Notify userspace the event is going away. */ | 3930 | /* Notify userspace the event is going away. */ |
| 3905 | eventfd_signal(event->eventfd, 1); | 3931 | eventfd_signal(event->eventfd, 1); |
| 3906 | 3932 | ||
| 3907 | eventfd_ctx_put(event->eventfd); | 3933 | eventfd_ctx_put(event->eventfd); |
| 3908 | kfree(event); | 3934 | kfree(event); |
| 3909 | cgroup_dput(cgrp); | 3935 | css_put(css); |
| 3910 | } | 3936 | } |
| 3911 | 3937 | ||
| 3912 | /* | 3938 | /* |
| @@ -3919,7 +3945,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | |||
| 3919 | { | 3945 | { |
| 3920 | struct cgroup_event *event = container_of(wait, | 3946 | struct cgroup_event *event = container_of(wait, |
| 3921 | struct cgroup_event, wait); | 3947 | struct cgroup_event, wait); |
| 3922 | struct cgroup *cgrp = event->cgrp; | 3948 | struct cgroup *cgrp = event->css->cgroup; |
| 3923 | unsigned long flags = (unsigned long)key; | 3949 | unsigned long flags = (unsigned long)key; |
| 3924 | 3950 | ||
| 3925 | if (flags & POLLHUP) { | 3951 | if (flags & POLLHUP) { |
| @@ -3963,14 +3989,15 @@ static void cgroup_event_ptable_queue_proc(struct file *file, | |||
| 3963 | * Input must be in format '<event_fd> <control_fd> <args>'. | 3989 | * Input must be in format '<event_fd> <control_fd> <args>'. |
| 3964 | * Interpretation of args is defined by control file implementation. | 3990 | * Interpretation of args is defined by control file implementation. |
| 3965 | */ | 3991 | */ |
| 3966 | static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | 3992 | static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, |
| 3967 | const char *buffer) | 3993 | struct cftype *cft, const char *buffer) |
| 3968 | { | 3994 | { |
| 3969 | struct cgroup_event *event = NULL; | 3995 | struct cgroup *cgrp = dummy_css->cgroup; |
| 3970 | struct cgroup *cgrp_cfile; | 3996 | struct cgroup_event *event; |
| 3997 | struct cgroup_subsys_state *cfile_css; | ||
| 3971 | unsigned int efd, cfd; | 3998 | unsigned int efd, cfd; |
| 3972 | struct file *efile = NULL; | 3999 | struct fd efile; |
| 3973 | struct file *cfile = NULL; | 4000 | struct fd cfile; |
| 3974 | char *endp; | 4001 | char *endp; |
| 3975 | int ret; | 4002 | int ret; |
| 3976 | 4003 | ||
| @@ -3987,109 +4014,113 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
| 3987 | event = kzalloc(sizeof(*event), GFP_KERNEL); | 4014 | event = kzalloc(sizeof(*event), GFP_KERNEL); |
| 3988 | if (!event) | 4015 | if (!event) |
| 3989 | return -ENOMEM; | 4016 | return -ENOMEM; |
| 3990 | event->cgrp = cgrp; | 4017 | |
| 3991 | INIT_LIST_HEAD(&event->list); | 4018 | INIT_LIST_HEAD(&event->list); |
| 3992 | init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); | 4019 | init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); |
| 3993 | init_waitqueue_func_entry(&event->wait, cgroup_event_wake); | 4020 | init_waitqueue_func_entry(&event->wait, cgroup_event_wake); |
| 3994 | INIT_WORK(&event->remove, cgroup_event_remove); | 4021 | INIT_WORK(&event->remove, cgroup_event_remove); |
| 3995 | 4022 | ||
| 3996 | efile = eventfd_fget(efd); | 4023 | efile = fdget(efd); |
| 3997 | if (IS_ERR(efile)) { | 4024 | if (!efile.file) { |
| 3998 | ret = PTR_ERR(efile); | 4025 | ret = -EBADF; |
| 3999 | goto fail; | 4026 | goto out_kfree; |
| 4000 | } | 4027 | } |
| 4001 | 4028 | ||
| 4002 | event->eventfd = eventfd_ctx_fileget(efile); | 4029 | event->eventfd = eventfd_ctx_fileget(efile.file); |
| 4003 | if (IS_ERR(event->eventfd)) { | 4030 | if (IS_ERR(event->eventfd)) { |
| 4004 | ret = PTR_ERR(event->eventfd); | 4031 | ret = PTR_ERR(event->eventfd); |
| 4005 | goto fail; | 4032 | goto out_put_efile; |
| 4006 | } | 4033 | } |
| 4007 | 4034 | ||
| 4008 | cfile = fget(cfd); | 4035 | cfile = fdget(cfd); |
| 4009 | if (!cfile) { | 4036 | if (!cfile.file) { |
| 4010 | ret = -EBADF; | 4037 | ret = -EBADF; |
| 4011 | goto fail; | 4038 | goto out_put_eventfd; |
| 4012 | } | 4039 | } |
| 4013 | 4040 | ||
| 4014 | /* the process need read permission on control file */ | 4041 | /* the process need read permission on control file */ |
| 4015 | /* AV: shouldn't we check that it's been opened for read instead? */ | 4042 | /* AV: shouldn't we check that it's been opened for read instead? */ |
| 4016 | ret = inode_permission(file_inode(cfile), MAY_READ); | 4043 | ret = inode_permission(file_inode(cfile.file), MAY_READ); |
| 4017 | if (ret < 0) | 4044 | if (ret < 0) |
| 4018 | goto fail; | 4045 | goto out_put_cfile; |
| 4019 | 4046 | ||
| 4020 | event->cft = __file_cft(cfile); | 4047 | event->cft = __file_cft(cfile.file); |
| 4021 | if (IS_ERR(event->cft)) { | 4048 | if (IS_ERR(event->cft)) { |
| 4022 | ret = PTR_ERR(event->cft); | 4049 | ret = PTR_ERR(event->cft); |
| 4023 | goto fail; | 4050 | goto out_put_cfile; |
| 4051 | } | ||
| 4052 | |||
| 4053 | if (!event->cft->ss) { | ||
| 4054 | ret = -EBADF; | ||
| 4055 | goto out_put_cfile; | ||
| 4024 | } | 4056 | } |
| 4025 | 4057 | ||
| 4026 | /* | 4058 | /* |
| 4027 | * The file to be monitored must be in the same cgroup as | 4059 | * Determine the css of @cfile, verify it belongs to the same |
| 4028 | * cgroup.event_control is. | 4060 | * cgroup as cgroup.event_control, and associate @event with it. |
| 4061 | * Remaining events are automatically removed on cgroup destruction | ||
| 4062 | * but the removal is asynchronous, so take an extra ref. | ||
| 4029 | */ | 4063 | */ |
| 4030 | cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); | 4064 | rcu_read_lock(); |
| 4031 | if (cgrp_cfile != cgrp) { | 4065 | |
| 4032 | ret = -EINVAL; | 4066 | ret = -EINVAL; |
| 4033 | goto fail; | 4067 | event->css = cgroup_css(cgrp, event->cft->ss); |
| 4034 | } | 4068 | cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); |
| 4069 | if (event->css && event->css == cfile_css && css_tryget(event->css)) | ||
| 4070 | ret = 0; | ||
| 4071 | |||
| 4072 | rcu_read_unlock(); | ||
| 4073 | if (ret) | ||
| 4074 | goto out_put_cfile; | ||
| 4035 | 4075 | ||
| 4036 | if (!event->cft->register_event || !event->cft->unregister_event) { | 4076 | if (!event->cft->register_event || !event->cft->unregister_event) { |
| 4037 | ret = -EINVAL; | 4077 | ret = -EINVAL; |
| 4038 | goto fail; | 4078 | goto out_put_css; |
| 4039 | } | 4079 | } |
| 4040 | 4080 | ||
| 4041 | ret = event->cft->register_event(cgrp, event->cft, | 4081 | ret = event->cft->register_event(event->css, event->cft, |
| 4042 | event->eventfd, buffer); | 4082 | event->eventfd, buffer); |
| 4043 | if (ret) | 4083 | if (ret) |
| 4044 | goto fail; | 4084 | goto out_put_css; |
| 4045 | |||
| 4046 | efile->f_op->poll(efile, &event->pt); | ||
| 4047 | 4085 | ||
| 4048 | /* | 4086 | efile.file->f_op->poll(efile.file, &event->pt); |
| 4049 | * Events should be removed after rmdir of cgroup directory, but before | ||
| 4050 | * destroying subsystem state objects. Let's take reference to cgroup | ||
| 4051 | * directory dentry to do that. | ||
| 4052 | */ | ||
| 4053 | dget(cgrp->dentry); | ||
| 4054 | 4087 | ||
| 4055 | spin_lock(&cgrp->event_list_lock); | 4088 | spin_lock(&cgrp->event_list_lock); |
| 4056 | list_add(&event->list, &cgrp->event_list); | 4089 | list_add(&event->list, &cgrp->event_list); |
| 4057 | spin_unlock(&cgrp->event_list_lock); | 4090 | spin_unlock(&cgrp->event_list_lock); |
| 4058 | 4091 | ||
| 4059 | fput(cfile); | 4092 | fdput(cfile); |
| 4060 | fput(efile); | 4093 | fdput(efile); |
| 4061 | 4094 | ||
| 4062 | return 0; | 4095 | return 0; |
| 4063 | 4096 | ||
| 4064 | fail: | 4097 | out_put_css: |
| 4065 | if (cfile) | 4098 | css_put(event->css); |
| 4066 | fput(cfile); | 4099 | out_put_cfile: |
| 4067 | 4100 | fdput(cfile); | |
| 4068 | if (event && event->eventfd && !IS_ERR(event->eventfd)) | 4101 | out_put_eventfd: |
| 4069 | eventfd_ctx_put(event->eventfd); | 4102 | eventfd_ctx_put(event->eventfd); |
| 4070 | 4103 | out_put_efile: | |
| 4071 | if (!IS_ERR_OR_NULL(efile)) | 4104 | fdput(efile); |
| 4072 | fput(efile); | 4105 | out_kfree: |
| 4073 | |||
| 4074 | kfree(event); | 4106 | kfree(event); |
| 4075 | 4107 | ||
| 4076 | return ret; | 4108 | return ret; |
| 4077 | } | 4109 | } |
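cgroup_write_event_control() is rewritten around struct fd: eventfd_fget()/fget() become fdget()/fdput(), and the single catch-all "fail:" label is replaced by one unwind label per acquired resource, so each error path releases exactly what has been taken so far. A reduced, hedged sketch of that acquire/unwind shape, with the cgroup-specific steps left out and a hypothetical demo_use_eventfd() wrapper:

#include <linux/file.h>
#include <linux/eventfd.h>
#include <linux/err.h>
#include <linux/errno.h>

/* Illustrative: grab an eventfd by number, poke it, release in reverse order. */
static int demo_use_eventfd(unsigned int efd)
{
	struct eventfd_ctx *ctx;
	struct fd f;
	int ret;

	f = fdget(efd);
	if (!f.file)
		return -EBADF;

	ctx = eventfd_ctx_fileget(f.file);
	if (IS_ERR(ctx)) {
		ret = PTR_ERR(ctx);
		goto out_put_fd;
	}

	eventfd_signal(ctx, 1);		/* ... the real work goes here ... */
	ret = 0;

	eventfd_ctx_put(ctx);
out_put_fd:
	fdput(f);
	return ret;
}
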
| 4078 | 4110 | ||
| 4079 | static u64 cgroup_clone_children_read(struct cgroup *cgrp, | 4111 | static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, |
| 4080 | struct cftype *cft) | 4112 | struct cftype *cft) |
| 4081 | { | 4113 | { |
| 4082 | return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); | 4114 | return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); |
| 4083 | } | 4115 | } |
| 4084 | 4116 | ||
| 4085 | static int cgroup_clone_children_write(struct cgroup *cgrp, | 4117 | static int cgroup_clone_children_write(struct cgroup_subsys_state *css, |
| 4086 | struct cftype *cft, | 4118 | struct cftype *cft, u64 val) |
| 4087 | u64 val) | ||
| 4088 | { | 4119 | { |
| 4089 | if (val) | 4120 | if (val) |
| 4090 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); | 4121 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); |
| 4091 | else | 4122 | else |
| 4092 | clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); | 4123 | clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); |
| 4093 | return 0; | 4124 | return 0; |
| 4094 | } | 4125 | } |
| 4095 | 4126 | ||
| @@ -4148,56 +4179,82 @@ static struct cftype cgroup_base_files[] = { | |||
| 4148 | }; | 4179 | }; |
| 4149 | 4180 | ||
| 4150 | /** | 4181 | /** |
| 4151 | * cgroup_populate_dir - selectively creation of files in a directory | 4182 | * cgroup_populate_dir - create subsys files in a cgroup directory |
| 4152 | * @cgrp: target cgroup | 4183 | * @cgrp: target cgroup |
| 4153 | * @base_files: true if the base files should be added | ||
| 4154 | * @subsys_mask: mask of the subsystem ids whose files should be added | 4184 | * @subsys_mask: mask of the subsystem ids whose files should be added |
| 4185 | * | ||
| 4186 | * On failure, no file is added. | ||
| 4155 | */ | 4187 | */ |
| 4156 | static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, | 4188 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) |
| 4157 | unsigned long subsys_mask) | ||
| 4158 | { | 4189 | { |
| 4159 | int err; | ||
| 4160 | struct cgroup_subsys *ss; | 4190 | struct cgroup_subsys *ss; |
| 4161 | 4191 | int i, ret = 0; | |
| 4162 | if (base_files) { | ||
| 4163 | err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); | ||
| 4164 | if (err < 0) | ||
| 4165 | return err; | ||
| 4166 | } | ||
| 4167 | 4192 | ||
| 4168 | /* process cftsets of each subsystem */ | 4193 | /* process cftsets of each subsystem */ |
| 4169 | for_each_root_subsys(cgrp->root, ss) { | 4194 | for_each_subsys(ss, i) { |
| 4170 | struct cftype_set *set; | 4195 | struct cftype_set *set; |
| 4171 | if (!test_bit(ss->subsys_id, &subsys_mask)) | 4196 | |
| 4197 | if (!test_bit(i, &subsys_mask)) | ||
| 4172 | continue; | 4198 | continue; |
| 4173 | 4199 | ||
| 4174 | list_for_each_entry(set, &ss->cftsets, node) | 4200 | list_for_each_entry(set, &ss->cftsets, node) { |
| 4175 | cgroup_addrm_files(cgrp, ss, set->cfts, true); | 4201 | ret = cgroup_addrm_files(cgrp, set->cfts, true); |
| 4202 | if (ret < 0) | ||
| 4203 | goto err; | ||
| 4204 | } | ||
| 4176 | } | 4205 | } |
| 4206 | return 0; | ||
| 4207 | err: | ||
| 4208 | cgroup_clear_dir(cgrp, subsys_mask); | ||
| 4209 | return ret; | ||
| 4210 | } | ||
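cgroup_populate_dir() no longer deals with the base files (those are now added separately in cgroup_create()) and becomes all-or-nothing: if adding any subsystem's cftypes fails, everything created for subsys_mask so far is cleared again via cgroup_clear_dir() before the error is returned. The generic shape of that idiom is sketched below with invented add_one()/remove_one() helpers; the real function undoes the work with one bulk clear rather than item by item.

#include <linux/types.h>
#include <linux/errno.h>

struct item {
	bool added;
};

/* stand-ins for "create one interface file" / "remove it again" */
static int add_one(struct item *it)
{
	it->added = true;
	return 0;
}

static void remove_one(struct item *it)
{
	it->added = false;
}

/* add every item, or leave nothing behind on failure */
static int add_all(struct item *items, int nr)
{
	int i, ret;

	for (i = 0; i < nr; i++) {
		ret = add_one(&items[i]);
		if (ret)
			goto err;
	}
	return 0;
err:
	while (i-- > 0)
		remove_one(&items[i]);
	return ret;
}
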
| 4177 | 4211 | ||
| 4178 | /* This cgroup is ready now */ | 4212 | /* |
| 4179 | for_each_root_subsys(cgrp->root, ss) { | 4213 | * css destruction is four-stage process. |
| 4180 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4214 | * |
| 4181 | struct css_id *id = rcu_dereference_protected(css->id, true); | 4215 | * 1. Destruction starts. Killing of the percpu_ref is initiated. |
| 4216 | * Implemented in kill_css(). | ||
| 4217 | * | ||
| 4218 | * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs | ||
| 4219 | * and thus css_tryget() is guaranteed to fail, the css can be offlined | ||
| 4220 | * by invoking offline_css(). After offlining, the base ref is put. | ||
| 4221 | * Implemented in css_killed_work_fn(). | ||
| 4222 | * | ||
| 4223 | * 3. When the percpu_ref reaches zero, the only possible remaining | ||
| 4224 | * accessors are inside RCU read sections. css_release() schedules the | ||
| 4225 | * RCU callback. | ||
| 4226 | * | ||
| 4227 | * 4. After the grace period, the css can be freed. Implemented in | ||
| 4228 | * css_free_work_fn(). | ||
| 4229 | * | ||
| 4230 | * It is actually hairier because both steps 2 and 4 require process context | ||
| 4231 | * and thus involve punting to css->destroy_work adding two additional | ||
| 4232 | * steps to the already complex sequence. | ||
| 4233 | */ | ||
| 4234 | static void css_free_work_fn(struct work_struct *work) | ||
| 4235 | { | ||
| 4236 | struct cgroup_subsys_state *css = | ||
| 4237 | container_of(work, struct cgroup_subsys_state, destroy_work); | ||
| 4238 | struct cgroup *cgrp = css->cgroup; | ||
| 4182 | 4239 | ||
| 4183 | /* | 4240 | if (css->parent) |
| 4184 | * Update id->css pointer and make this css visible from | 4241 | css_put(css->parent); |
| 4185 | * CSS ID functions. This pointer will be dereferened | ||
| 4186 | * from RCU-read-side without locks. | ||
| 4187 | */ | ||
| 4188 | if (id) | ||
| 4189 | rcu_assign_pointer(id->css, css); | ||
| 4190 | } | ||
| 4191 | 4242 | ||
| 4192 | return 0; | 4243 | css->ss->css_free(css); |
| 4244 | cgroup_dput(cgrp); | ||
| 4193 | } | 4245 | } |
| 4194 | 4246 | ||
| 4195 | static void css_dput_fn(struct work_struct *work) | 4247 | static void css_free_rcu_fn(struct rcu_head *rcu_head) |
| 4196 | { | 4248 | { |
| 4197 | struct cgroup_subsys_state *css = | 4249 | struct cgroup_subsys_state *css = |
| 4198 | container_of(work, struct cgroup_subsys_state, dput_work); | 4250 | container_of(rcu_head, struct cgroup_subsys_state, rcu_head); |
| 4199 | 4251 | ||
| 4200 | cgroup_dput(css->cgroup); | 4252 | /* |
| 4253 | * css holds an extra ref to @cgrp->dentry which is put on the last | ||
| 4254 | * css_put(). dput() requires process context which we don't have. | ||
| 4255 | */ | ||
| 4256 | INIT_WORK(&css->destroy_work, css_free_work_fn); | ||
| 4257 | schedule_work(&css->destroy_work); | ||
| 4201 | } | 4258 | } |
| 4202 | 4259 | ||
| 4203 | static void css_release(struct percpu_ref *ref) | 4260 | static void css_release(struct percpu_ref *ref) |
| @@ -4205,49 +4262,46 @@ static void css_release(struct percpu_ref *ref) | |||
| 4205 | struct cgroup_subsys_state *css = | 4262 | struct cgroup_subsys_state *css = |
| 4206 | container_of(ref, struct cgroup_subsys_state, refcnt); | 4263 | container_of(ref, struct cgroup_subsys_state, refcnt); |
| 4207 | 4264 | ||
| 4208 | schedule_work(&css->dput_work); | 4265 | call_rcu(&css->rcu_head, css_free_rcu_fn); |
| 4209 | } | 4266 | } |
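css_release() now kicks off exactly the stage 3 to stage 4 transition described in the comment above: the percpu ref hitting zero schedules an RCU callback, and because the final cleanup (css_free(), dput()) needs process context, that callback only bounces the work to a workqueue. A minimal sketch of the grace-period-then-workqueue chain for a hypothetical demo_obj:

#include <linux/rcupdate.h>
#include <linux/workqueue.h>
#include <linux/slab.h>

struct demo_obj {
	struct rcu_head rcu_head;
	struct work_struct destroy_work;
	/* ... payload ... */
};

/* stage 4: process context, safe to sleep, take mutexes, dput(), etc. */
static void demo_free_work_fn(struct work_struct *work)
{
	struct demo_obj *obj =
		container_of(work, struct demo_obj, destroy_work);

	kfree(obj);
}

/* stage 3 to 4: runs after the RCU grace period, but in softirq context,
 * so punt the actual freeing to a workqueue */
static void demo_free_rcu_fn(struct rcu_head *rcu_head)
{
	struct demo_obj *obj =
		container_of(rcu_head, struct demo_obj, rcu_head);

	INIT_WORK(&obj->destroy_work, demo_free_work_fn);
	schedule_work(&obj->destroy_work);
}

/* stage 3: called when the last reference is gone */
static void demo_release(struct demo_obj *obj)
{
	call_rcu(&obj->rcu_head, demo_free_rcu_fn);
}
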
| 4210 | 4267 | ||
| 4211 | static void init_cgroup_css(struct cgroup_subsys_state *css, | 4268 | static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, |
| 4212 | struct cgroup_subsys *ss, | 4269 | struct cgroup *cgrp) |
| 4213 | struct cgroup *cgrp) | ||
| 4214 | { | 4270 | { |
| 4215 | css->cgroup = cgrp; | 4271 | css->cgroup = cgrp; |
| 4272 | css->ss = ss; | ||
| 4216 | css->flags = 0; | 4273 | css->flags = 0; |
| 4217 | css->id = NULL; | 4274 | |
| 4218 | if (cgrp == cgroup_dummy_top) | 4275 | if (cgrp->parent) |
| 4276 | css->parent = cgroup_css(cgrp->parent, ss); | ||
| 4277 | else | ||
| 4219 | css->flags |= CSS_ROOT; | 4278 | css->flags |= CSS_ROOT; |
| 4220 | BUG_ON(cgrp->subsys[ss->subsys_id]); | ||
| 4221 | cgrp->subsys[ss->subsys_id] = css; | ||
| 4222 | 4279 | ||
| 4223 | /* | 4280 | BUG_ON(cgroup_css(cgrp, ss)); |
| 4224 | * css holds an extra ref to @cgrp->dentry which is put on the last | ||
| 4225 | * css_put(). dput() requires process context, which css_put() may | ||
| 4226 | * be called without. @css->dput_work will be used to invoke | ||
| 4227 | * dput() asynchronously from css_put(). | ||
| 4228 | */ | ||
| 4229 | INIT_WORK(&css->dput_work, css_dput_fn); | ||
| 4230 | } | 4281 | } |
| 4231 | 4282 | ||
| 4232 | /* invoke ->post_create() on a new CSS and mark it online if successful */ | 4283 | /* invoke ->css_online() on a new CSS and mark it online if successful */ |
| 4233 | static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | 4284 | static int online_css(struct cgroup_subsys_state *css) |
| 4234 | { | 4285 | { |
| 4286 | struct cgroup_subsys *ss = css->ss; | ||
| 4235 | int ret = 0; | 4287 | int ret = 0; |
| 4236 | 4288 | ||
| 4237 | lockdep_assert_held(&cgroup_mutex); | 4289 | lockdep_assert_held(&cgroup_mutex); |
| 4238 | 4290 | ||
| 4239 | if (ss->css_online) | 4291 | if (ss->css_online) |
| 4240 | ret = ss->css_online(cgrp); | 4292 | ret = ss->css_online(css); |
| 4241 | if (!ret) | 4293 | if (!ret) { |
| 4242 | cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; | 4294 | css->flags |= CSS_ONLINE; |
| 4295 | css->cgroup->nr_css++; | ||
| 4296 | rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); | ||
| 4297 | } | ||
| 4243 | return ret; | 4298 | return ret; |
| 4244 | } | 4299 | } |
| 4245 | 4300 | ||
| 4246 | /* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ | 4301 | /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ |
| 4247 | static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | 4302 | static void offline_css(struct cgroup_subsys_state *css) |
| 4248 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | ||
| 4249 | { | 4303 | { |
| 4250 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4304 | struct cgroup_subsys *ss = css->ss; |
| 4251 | 4305 | ||
| 4252 | lockdep_assert_held(&cgroup_mutex); | 4306 | lockdep_assert_held(&cgroup_mutex); |
| 4253 | 4307 | ||
| @@ -4255,9 +4309,11 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
| 4255 | return; | 4309 | return; |
| 4256 | 4310 | ||
| 4257 | if (ss->css_offline) | 4311 | if (ss->css_offline) |
| 4258 | ss->css_offline(cgrp); | 4312 | ss->css_offline(css); |
| 4259 | 4313 | ||
| 4260 | cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; | 4314 | css->flags &= ~CSS_ONLINE; |
| 4315 | css->cgroup->nr_css--; | ||
| 4316 | RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); | ||
| 4261 | } | 4317 | } |
| 4262 | 4318 | ||
| 4263 | /* | 4319 | /* |
| @@ -4271,6 +4327,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
| 4271 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | 4327 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, |
| 4272 | umode_t mode) | 4328 | umode_t mode) |
| 4273 | { | 4329 | { |
| 4330 | struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { }; | ||
| 4274 | struct cgroup *cgrp; | 4331 | struct cgroup *cgrp; |
| 4275 | struct cgroup_name *name; | 4332 | struct cgroup_name *name; |
| 4276 | struct cgroupfs_root *root = parent->root; | 4333 | struct cgroupfs_root *root = parent->root; |
| @@ -4288,7 +4345,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
| 4288 | goto err_free_cgrp; | 4345 | goto err_free_cgrp; |
| 4289 | rcu_assign_pointer(cgrp->name, name); | 4346 | rcu_assign_pointer(cgrp->name, name); |
| 4290 | 4347 | ||
| 4291 | cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); | 4348 | /* |
| 4349 | * Temporarily set the pointer to NULL, so idr_find() won't return | ||
| 4350 | * a half-baked cgroup. | ||
| 4351 | */ | ||
| 4352 | cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); | ||
| 4292 | if (cgrp->id < 0) | 4353 | if (cgrp->id < 0) |
| 4293 | goto err_free_name; | 4354 | goto err_free_name; |
| 4294 | 4355 | ||
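Cgroup IDs switch from an ida to an idr so the ID can also be used for lookup, and allocation is split in two: idr_alloc() reserves the ID with a NULL pointer so a concurrent idr_find() can never return a half-initialized cgroup, and the slot is only filled in with idr_replace() further down once the cgroup is fully set up (and emptied again with idr_remove() before the object can be freed). A hedged sketch of that allocate-then-publish pattern for a hypothetical demo_obj; the real code serializes idr modifications with cgroup_mutex.

#include <linux/idr.h>
#include <linux/slab.h>

struct demo_obj {
	int id;
	/* ... */
};

static DEFINE_IDR(demo_idr);

static struct demo_obj *demo_create(void)
{
	struct demo_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (!obj)
		return NULL;

	/* reserve an ID but leave the slot NULL: lookups cannot see us yet;
	 * callers must serialize modifications to the idr */
	obj->id = idr_alloc(&demo_idr, NULL, 1, 0, GFP_KERNEL);
	if (obj->id < 0) {
		kfree(obj);
		return NULL;
	}

	/* ... finish initialization ... */

	/* publish: from here on idr_find(&demo_idr, obj->id) returns obj */
	idr_replace(&demo_idr, obj, obj->id);
	return obj;
}

static void demo_destroy(struct demo_obj *obj)
{
	/* unpublish before freeing so lookups never race with the free */
	idr_remove(&demo_idr, obj->id);
	kfree(obj);
}
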
| @@ -4317,6 +4378,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
| 4317 | cgrp->dentry = dentry; | 4378 | cgrp->dentry = dentry; |
| 4318 | 4379 | ||
| 4319 | cgrp->parent = parent; | 4380 | cgrp->parent = parent; |
| 4381 | cgrp->dummy_css.parent = &parent->dummy_css; | ||
| 4320 | cgrp->root = parent->root; | 4382 | cgrp->root = parent->root; |
| 4321 | 4383 | ||
| 4322 | if (notify_on_release(parent)) | 4384 | if (notify_on_release(parent)) |
| @@ -4328,25 +4390,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
| 4328 | for_each_root_subsys(root, ss) { | 4390 | for_each_root_subsys(root, ss) { |
| 4329 | struct cgroup_subsys_state *css; | 4391 | struct cgroup_subsys_state *css; |
| 4330 | 4392 | ||
| 4331 | css = ss->css_alloc(cgrp); | 4393 | css = ss->css_alloc(cgroup_css(parent, ss)); |
| 4332 | if (IS_ERR(css)) { | 4394 | if (IS_ERR(css)) { |
| 4333 | err = PTR_ERR(css); | 4395 | err = PTR_ERR(css); |
| 4334 | goto err_free_all; | 4396 | goto err_free_all; |
| 4335 | } | 4397 | } |
| 4398 | css_ar[ss->subsys_id] = css; | ||
| 4336 | 4399 | ||
| 4337 | err = percpu_ref_init(&css->refcnt, css_release); | 4400 | err = percpu_ref_init(&css->refcnt, css_release); |
| 4338 | if (err) { | 4401 | if (err) |
| 4339 | ss->css_free(cgrp); | ||
| 4340 | goto err_free_all; | 4402 | goto err_free_all; |
| 4341 | } | ||
| 4342 | |||
| 4343 | init_cgroup_css(css, ss, cgrp); | ||
| 4344 | 4403 | ||
| 4345 | if (ss->use_id) { | 4404 | init_css(css, ss, cgrp); |
| 4346 | err = alloc_css_id(ss, parent, cgrp); | ||
| 4347 | if (err) | ||
| 4348 | goto err_free_all; | ||
| 4349 | } | ||
| 4350 | } | 4405 | } |
| 4351 | 4406 | ||
| 4352 | /* | 4407 | /* |
| @@ -4365,16 +4420,22 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
| 4365 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); | 4420 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); |
| 4366 | root->number_of_cgroups++; | 4421 | root->number_of_cgroups++; |
| 4367 | 4422 | ||
| 4368 | /* each css holds a ref to the cgroup's dentry */ | 4423 | /* each css holds a ref to the cgroup's dentry and the parent css */ |
| 4369 | for_each_root_subsys(root, ss) | 4424 | for_each_root_subsys(root, ss) { |
| 4425 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; | ||
| 4426 | |||
| 4370 | dget(dentry); | 4427 | dget(dentry); |
| 4428 | css_get(css->parent); | ||
| 4429 | } | ||
| 4371 | 4430 | ||
| 4372 | /* hold a ref to the parent's dentry */ | 4431 | /* hold a ref to the parent's dentry */ |
| 4373 | dget(parent->dentry); | 4432 | dget(parent->dentry); |
| 4374 | 4433 | ||
| 4375 | /* creation succeeded, notify subsystems */ | 4434 | /* creation succeeded, notify subsystems */ |
| 4376 | for_each_root_subsys(root, ss) { | 4435 | for_each_root_subsys(root, ss) { |
| 4377 | err = online_css(ss, cgrp); | 4436 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; |
| 4437 | |||
| 4438 | err = online_css(css); | ||
| 4378 | if (err) | 4439 | if (err) |
| 4379 | goto err_destroy; | 4440 | goto err_destroy; |
| 4380 | 4441 | ||
| @@ -4388,7 +4449,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
| 4388 | } | 4449 | } |
| 4389 | } | 4450 | } |
| 4390 | 4451 | ||
| 4391 | err = cgroup_populate_dir(cgrp, true, root->subsys_mask); | 4452 | idr_replace(&root->cgroup_idr, cgrp, cgrp->id); |
| 4453 | |||
| 4454 | err = cgroup_addrm_files(cgrp, cgroup_base_files, true); | ||
| 4455 | if (err) | ||
| 4456 | goto err_destroy; | ||
| 4457 | |||
| 4458 | err = cgroup_populate_dir(cgrp, root->subsys_mask); | ||
| 4392 | if (err) | 4459 | if (err) |
| 4393 | goto err_destroy; | 4460 | goto err_destroy; |
| 4394 | 4461 | ||
| @@ -4399,18 +4466,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
| 4399 | 4466 | ||
| 4400 | err_free_all: | 4467 | err_free_all: |
| 4401 | for_each_root_subsys(root, ss) { | 4468 | for_each_root_subsys(root, ss) { |
| 4402 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4469 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; |
| 4403 | 4470 | ||
| 4404 | if (css) { | 4471 | if (css) { |
| 4405 | percpu_ref_cancel_init(&css->refcnt); | 4472 | percpu_ref_cancel_init(&css->refcnt); |
| 4406 | ss->css_free(cgrp); | 4473 | ss->css_free(css); |
| 4407 | } | 4474 | } |
| 4408 | } | 4475 | } |
| 4409 | mutex_unlock(&cgroup_mutex); | 4476 | mutex_unlock(&cgroup_mutex); |
| 4410 | /* Release the reference count that we took on the superblock */ | 4477 | /* Release the reference count that we took on the superblock */ |
| 4411 | deactivate_super(sb); | 4478 | deactivate_super(sb); |
| 4412 | err_free_id: | 4479 | err_free_id: |
| 4413 | ida_simple_remove(&root->cgroup_ida, cgrp->id); | 4480 | idr_remove(&root->cgroup_idr, cgrp->id); |
| 4414 | err_free_name: | 4481 | err_free_name: |
| 4415 | kfree(rcu_dereference_raw(cgrp->name)); | 4482 | kfree(rcu_dereference_raw(cgrp->name)); |
| 4416 | err_free_cgrp: | 4483 | err_free_cgrp: |
| @@ -4432,22 +4499,84 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
| 4432 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); | 4499 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); |
| 4433 | } | 4500 | } |
| 4434 | 4501 | ||
| 4435 | static void cgroup_css_killed(struct cgroup *cgrp) | 4502 | /* |
| 4503 | * This is called when the refcnt of a css is confirmed to be killed. | ||
| 4504 | * css_tryget() is now guaranteed to fail. | ||
| 4505 | */ | ||
| 4506 | static void css_killed_work_fn(struct work_struct *work) | ||
| 4436 | { | 4507 | { |
| 4437 | if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) | 4508 | struct cgroup_subsys_state *css = |
| 4438 | return; | 4509 | container_of(work, struct cgroup_subsys_state, destroy_work); |
| 4510 | struct cgroup *cgrp = css->cgroup; | ||
| 4439 | 4511 | ||
| 4440 | /* percpu ref's of all css's are killed, kick off the next step */ | 4512 | mutex_lock(&cgroup_mutex); |
| 4441 | INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); | 4513 | |
| 4442 | schedule_work(&cgrp->destroy_work); | 4514 | /* |
| 4515 | * css_tryget() is guaranteed to fail now. Tell subsystems to | ||
| 4516 | * initiate destruction. | ||
| 4517 | */ | ||
| 4518 | offline_css(css); | ||
| 4519 | |||
| 4520 | /* | ||
| 4521 | * If @cgrp is marked dead, it's waiting for refs of all css's to | ||
| 4522 | * be disabled before proceeding to the second phase of cgroup | ||
| 4523 | * destruction. If we are the last one, kick it off. | ||
| 4524 | */ | ||
| 4525 | if (!cgrp->nr_css && cgroup_is_dead(cgrp)) | ||
| 4526 | cgroup_destroy_css_killed(cgrp); | ||
| 4527 | |||
| 4528 | mutex_unlock(&cgroup_mutex); | ||
| 4529 | |||
| 4530 | /* | ||
| 4531 | * Put the css refs from kill_css(). Each css holds an extra | ||
| 4532 | * reference to the cgroup's dentry and cgroup removal proceeds | ||
| 4533 | * regardless of css refs. On the last put of each css, whenever | ||
| 4534 | * that may be, the extra dentry ref is put so that dentry | ||
| 4535 | * destruction happens only after all css's are released. | ||
| 4536 | */ | ||
| 4537 | css_put(css); | ||
| 4443 | } | 4538 | } |
| 4444 | 4539 | ||
| 4445 | static void css_ref_killed_fn(struct percpu_ref *ref) | 4540 | /* css kill confirmation processing requires process context, bounce */ |
| 4541 | static void css_killed_ref_fn(struct percpu_ref *ref) | ||
| 4446 | { | 4542 | { |
| 4447 | struct cgroup_subsys_state *css = | 4543 | struct cgroup_subsys_state *css = |
| 4448 | container_of(ref, struct cgroup_subsys_state, refcnt); | 4544 | container_of(ref, struct cgroup_subsys_state, refcnt); |
| 4449 | 4545 | ||
| 4450 | cgroup_css_killed(css->cgroup); | 4546 | INIT_WORK(&css->destroy_work, css_killed_work_fn); |
| 4547 | schedule_work(&css->destroy_work); | ||
| 4548 | } | ||
| 4549 | |||
| 4550 | /** | ||
| 4551 | * kill_css - destroy a css | ||
| 4552 | * @css: css to destroy | ||
| 4553 | * | ||
| 4554 | * This function initiates destruction of @css by removing cgroup interface | ||
| 4555 | * files and putting its base reference. ->css_offline() will be invoked | ||
| 4556 | * asynchronously once css_tryget() is guaranteed to fail and when the | ||
| 4557 | * reference count reaches zero, @css will be released. | ||
| 4558 | */ | ||
| 4559 | static void kill_css(struct cgroup_subsys_state *css) | ||
| 4560 | { | ||
| 4561 | cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); | ||
| 4562 | |||
| 4563 | /* | ||
| 4564 | * Killing would put the base ref, but we need to keep it alive | ||
| 4565 | * until after ->css_offline(). | ||
| 4566 | */ | ||
| 4567 | css_get(css); | ||
| 4568 | |||
| 4569 | /* | ||
| 4570 | * cgroup core guarantees that, by the time ->css_offline() is | ||
| 4571 | * invoked, no new css reference will be given out via | ||
| 4572 | * css_tryget(). We can't simply call percpu_ref_kill() and | ||
| 4573 | * proceed to offlining css's because percpu_ref_kill() doesn't | ||
| 4574 | * guarantee that the ref is seen as killed on all CPUs on return. | ||
| 4575 | * | ||
| 4576 | * Use percpu_ref_kill_and_confirm() to get notifications as each | ||
| 4577 | * css is confirmed to be seen as killed on all CPUs. | ||
| 4578 | */ | ||
| 4579 | percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); | ||
| 4451 | } | 4580 | } |
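kill_css() relies on percpu_ref_kill_and_confirm(): killing the reference stops new css_tryget() successes, but only when the confirm callback fires is that guaranteed to be visible on every CPU, which is why ->css_offline() is deferred to css_killed_work_fn() instead of being called here. Stripped of the cgroup specifics, the lifecycle of such a reference looks roughly like the sketch below (the demo_* names are hypothetical):

#include <linux/percpu-refcount.h>

struct demo_obj {
	struct percpu_ref refcnt;
	/* ... */
};

/* called once the refcount reaches zero: last-stage release */
static void demo_release(struct percpu_ref *ref)
{
	/* free demo_obj (in the cgroup code, only after an RCU grace period) */
}

/* called once the kill is visible on all CPUs: tryget now always fails */
static void demo_confirm_kill(struct percpu_ref *ref)
{
	/* safe to start teardown that assumes no new references appear */
}

static int demo_init(struct demo_obj *obj)
{
	return percpu_ref_init(&obj->refcnt, demo_release);
}

static void demo_kill(struct demo_obj *obj)
{
	/* percpu_ref_get()/percpu_ref_put() may be used freely before this */
	percpu_ref_kill_and_confirm(&obj->refcnt, demo_confirm_kill);
}
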
| 4452 | 4581 | ||
| 4453 | /** | 4582 | /** |
| @@ -4480,6 +4609,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
| 4480 | struct dentry *d = cgrp->dentry; | 4609 | struct dentry *d = cgrp->dentry; |
| 4481 | struct cgroup_event *event, *tmp; | 4610 | struct cgroup_event *event, *tmp; |
| 4482 | struct cgroup_subsys *ss; | 4611 | struct cgroup_subsys *ss; |
| 4612 | struct cgroup *child; | ||
| 4483 | bool empty; | 4613 | bool empty; |
| 4484 | 4614 | ||
| 4485 | lockdep_assert_held(&d->d_inode->i_mutex); | 4615 | lockdep_assert_held(&d->d_inode->i_mutex); |
| @@ -4490,47 +4620,41 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
| 4490 | * @cgrp from being removed while __put_css_set() is in progress. | 4620 | * @cgrp from being removed while __put_css_set() is in progress. |
| 4491 | */ | 4621 | */ |
| 4492 | read_lock(&css_set_lock); | 4622 | read_lock(&css_set_lock); |
| 4493 | empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children); | 4623 | empty = list_empty(&cgrp->cset_links); |
| 4494 | read_unlock(&css_set_lock); | 4624 | read_unlock(&css_set_lock); |
| 4495 | if (!empty) | 4625 | if (!empty) |
| 4496 | return -EBUSY; | 4626 | return -EBUSY; |
| 4497 | 4627 | ||
| 4498 | /* | 4628 | /* |
| 4499 | * Block new css_tryget() by killing css refcnts. cgroup core | 4629 | * Make sure there are no live children. We can't test ->children |
| 4500 | * guarantees that, by the time ->css_offline() is invoked, no new | 4630 | * emptiness as dead children linger on it while being destroyed; |
| 4501 | * css reference will be given out via css_tryget(). We can't | 4631 | * otherwise, "rmdir parent/child parent" may fail with -EBUSY. |
| 4502 | * simply call percpu_ref_kill() and proceed to offlining css's | ||
| 4503 | * because percpu_ref_kill() doesn't guarantee that the ref is seen | ||
| 4504 | * as killed on all CPUs on return. | ||
| 4505 | * | ||
| 4506 | * Use percpu_ref_kill_and_confirm() to get notifications as each | ||
| 4507 | * css is confirmed to be seen as killed on all CPUs. The | ||
| 4508 | * notification callback keeps track of the number of css's to be | ||
| 4509 | * killed and schedules cgroup_offline_fn() to perform the rest of | ||
| 4510 | * destruction once the percpu refs of all css's are confirmed to | ||
| 4511 | * be killed. | ||
| 4512 | */ | 4632 | */ |
| 4513 | atomic_set(&cgrp->css_kill_cnt, 1); | 4633 | empty = true; |
| 4514 | for_each_root_subsys(cgrp->root, ss) { | 4634 | rcu_read_lock(); |
| 4515 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4635 | list_for_each_entry_rcu(child, &cgrp->children, sibling) { |
| 4516 | 4636 | empty = cgroup_is_dead(child); | |
| 4517 | /* | 4637 | if (!empty) |
| 4518 | * Killing would put the base ref, but we need to keep it | 4638 | break; |
| 4519 | * alive until after ->css_offline. | ||
| 4520 | */ | ||
| 4521 | percpu_ref_get(&css->refcnt); | ||
| 4522 | |||
| 4523 | atomic_inc(&cgrp->css_kill_cnt); | ||
| 4524 | percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn); | ||
| 4525 | } | 4639 | } |
| 4526 | cgroup_css_killed(cgrp); | 4640 | rcu_read_unlock(); |
| 4641 | if (!empty) | ||
| 4642 | return -EBUSY; | ||
| 4643 | |||
| 4644 | /* | ||
| 4645 | * Initiate massacre of all css's. cgroup_destroy_css_killed() | ||
| 4646 | * will be invoked to perform the rest of destruction once the | ||
| 4647 | * percpu refs of all css's are confirmed to be killed. | ||
| 4648 | */ | ||
| 4649 | for_each_root_subsys(cgrp->root, ss) | ||
| 4650 | kill_css(cgroup_css(cgrp, ss)); | ||
| 4527 | 4651 | ||
| 4528 | /* | 4652 | /* |
| 4529 | * Mark @cgrp dead. This prevents further task migration and child | 4653 | * Mark @cgrp dead. This prevents further task migration and child |
| 4530 | * creation by disabling cgroup_lock_live_group(). Note that | 4654 | * creation by disabling cgroup_lock_live_group(). Note that |
| 4531 | * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to | 4655 | * CGRP_DEAD assertion is depended upon by css_next_child() to |
| 4532 | * resume iteration after dropping RCU read lock. See | 4656 | * resume iteration after dropping RCU read lock. See |
| 4533 | * cgroup_next_sibling() for details. | 4657 | * css_next_child() for details. |
| 4534 | */ | 4658 | */ |
| 4535 | set_bit(CGRP_DEAD, &cgrp->flags); | 4659 | set_bit(CGRP_DEAD, &cgrp->flags); |
| 4536 | 4660 | ||
| @@ -4541,9 +4665,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
| 4541 | raw_spin_unlock(&release_list_lock); | 4665 | raw_spin_unlock(&release_list_lock); |
| 4542 | 4666 | ||
| 4543 | /* | 4667 | /* |
| 4544 | * Remove @cgrp directory. The removal puts the base ref but we | 4668 | * If @cgrp has css's attached, the second stage of cgroup |
| 4545 | * aren't quite done with @cgrp yet, so hold onto it. | 4669 | * destruction is kicked off from css_killed_work_fn() after the |
| 4670 | * refs of all attached css's are killed. If @cgrp doesn't have | ||
| 4671 | * any css, we kick it off here. | ||
| 4546 | */ | 4672 | */ |
| 4673 | if (!cgrp->nr_css) | ||
| 4674 | cgroup_destroy_css_killed(cgrp); | ||
| 4675 | |||
| 4676 | /* | ||
| 4677 | * Clear the base files and remove @cgrp directory. The removal | ||
| 4678 | * puts the base ref but we aren't quite done with @cgrp yet, so | ||
| 4679 | * hold onto it. | ||
| 4680 | */ | ||
| 4681 | cgroup_addrm_files(cgrp, cgroup_base_files, false); | ||
| 4547 | dget(d); | 4682 | dget(d); |
| 4548 | cgroup_d_remove_dir(d); | 4683 | cgroup_d_remove_dir(d); |
| 4549 | 4684 | ||
| @@ -4563,50 +4698,36 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
| 4563 | }; | 4698 | }; |
| 4564 | 4699 | ||
| 4565 | /** | 4700 | /** |
| 4566 | * cgroup_offline_fn - the second step of cgroup destruction | 4701 | * cgroup_destroy_css_killed - the second step of cgroup destruction |
| 4567 | * @work: cgroup->destroy_free_work | 4702 | * @work: cgroup->destroy_free_work |
| 4568 | * | 4703 | * |
| 4569 | * This function is invoked from a work item for a cgroup which is being | 4704 | * This function is invoked from a work item for a cgroup which is being |
| 4570 | * destroyed after the percpu refcnts of all css's are guaranteed to be | 4705 | * destroyed after all css's are offlined and performs the rest of |
| 4571 | * seen as killed on all CPUs, and performs the rest of destruction. This | 4706 | * destruction. This is the second step of destruction described in the |
| 4572 | * is the second step of destruction described in the comment above | 4707 | * comment above cgroup_destroy_locked(). |
| 4573 | * cgroup_destroy_locked(). | ||
| 4574 | */ | 4708 | */ |
| 4575 | static void cgroup_offline_fn(struct work_struct *work) | 4709 | static void cgroup_destroy_css_killed(struct cgroup *cgrp) |
| 4576 | { | 4710 | { |
| 4577 | struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); | ||
| 4578 | struct cgroup *parent = cgrp->parent; | 4711 | struct cgroup *parent = cgrp->parent; |
| 4579 | struct dentry *d = cgrp->dentry; | 4712 | struct dentry *d = cgrp->dentry; |
| 4580 | struct cgroup_subsys *ss; | ||
| 4581 | 4713 | ||
| 4582 | mutex_lock(&cgroup_mutex); | 4714 | lockdep_assert_held(&cgroup_mutex); |
| 4583 | 4715 | ||
| 4584 | /* | 4716 | /* delete this cgroup from parent->children */ |
| 4585 | * css_tryget() is guaranteed to fail now. Tell subsystems to | 4717 | list_del_rcu(&cgrp->sibling); |
| 4586 | * initate destruction. | ||
| 4587 | */ | ||
| 4588 | for_each_root_subsys(cgrp->root, ss) | ||
| 4589 | offline_css(ss, cgrp); | ||
| 4590 | 4718 | ||
| 4591 | /* | 4719 | /* |
| 4592 | * Put the css refs from cgroup_destroy_locked(). Each css holds | 4720 | * We should remove the cgroup object from idr before its grace |
| 4593 | * an extra reference to the cgroup's dentry and cgroup removal | 4721 | * period starts, so we won't be looking up a cgroup while the |
| 4594 | * proceeds regardless of css refs. On the last put of each css, | 4722 | * cgroup is being freed. |
| 4595 | * whenever that may be, the extra dentry ref is put so that dentry | ||
| 4596 | * destruction happens only after all css's are released. | ||
| 4597 | */ | 4723 | */ |
| 4598 | for_each_root_subsys(cgrp->root, ss) | 4724 | idr_remove(&cgrp->root->cgroup_idr, cgrp->id); |
| 4599 | css_put(cgrp->subsys[ss->subsys_id]); | 4725 | cgrp->id = -1; |
| 4600 | |||
| 4601 | /* delete this cgroup from parent->children */ | ||
| 4602 | list_del_rcu(&cgrp->sibling); | ||
| 4603 | 4726 | ||
| 4604 | dput(d); | 4727 | dput(d); |
| 4605 | 4728 | ||
| 4606 | set_bit(CGRP_RELEASABLE, &parent->flags); | 4729 | set_bit(CGRP_RELEASABLE, &parent->flags); |
| 4607 | check_for_release(parent); | 4730 | check_for_release(parent); |
| 4608 | |||
| 4609 | mutex_unlock(&cgroup_mutex); | ||
| 4610 | } | 4731 | } |
| 4611 | 4732 | ||
| 4612 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | 4733 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) |
| @@ -4629,6 +4750,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) | |||
| 4629 | * deregistration. | 4750 | * deregistration. |
| 4630 | */ | 4751 | */ |
| 4631 | if (ss->base_cftypes) { | 4752 | if (ss->base_cftypes) { |
| 4753 | struct cftype *cft; | ||
| 4754 | |||
| 4755 | for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) | ||
| 4756 | cft->ss = ss; | ||
| 4757 | |||
| 4632 | ss->base_cftset.cfts = ss->base_cftypes; | 4758 | ss->base_cftset.cfts = ss->base_cftypes; |
| 4633 | list_add_tail(&ss->base_cftset.node, &ss->cftsets); | 4759 | list_add_tail(&ss->base_cftset.node, &ss->cftsets); |
| 4634 | } | 4760 | } |
| @@ -4648,10 +4774,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
| 4648 | /* Create the top cgroup state for this subsystem */ | 4774 | /* Create the top cgroup state for this subsystem */ |
| 4649 | list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); | 4775 | list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); |
| 4650 | ss->root = &cgroup_dummy_root; | 4776 | ss->root = &cgroup_dummy_root; |
| 4651 | css = ss->css_alloc(cgroup_dummy_top); | 4777 | css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); |
| 4652 | /* We don't handle early failures gracefully */ | 4778 | /* We don't handle early failures gracefully */ |
| 4653 | BUG_ON(IS_ERR(css)); | 4779 | BUG_ON(IS_ERR(css)); |
| 4654 | init_cgroup_css(css, ss, cgroup_dummy_top); | 4780 | init_css(css, ss, cgroup_dummy_top); |
| 4655 | 4781 | ||
| 4656 | /* Update the init_css_set to contain a subsys | 4782 | /* Update the init_css_set to contain a subsys |
| 4657 | * pointer to this state - since the subsystem is | 4783 | * pointer to this state - since the subsystem is |
| @@ -4666,7 +4792,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
| 4666 | * need to invoke fork callbacks here. */ | 4792 | * need to invoke fork callbacks here. */ |
| 4667 | BUG_ON(!list_empty(&init_task.tasks)); | 4793 | BUG_ON(!list_empty(&init_task.tasks)); |
| 4668 | 4794 | ||
| 4669 | BUG_ON(online_css(ss, cgroup_dummy_top)); | 4795 | BUG_ON(online_css(css)); |
| 4670 | 4796 | ||
| 4671 | mutex_unlock(&cgroup_mutex); | 4797 | mutex_unlock(&cgroup_mutex); |
| 4672 | 4798 | ||
| @@ -4727,7 +4853,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
| 4727 | * struct, so this can happen first (i.e. before the dummy root | 4853 | * struct, so this can happen first (i.e. before the dummy root |
| 4728 | * attachment). | 4854 | * attachment). |
| 4729 | */ | 4855 | */ |
| 4730 | css = ss->css_alloc(cgroup_dummy_top); | 4856 | css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); |
| 4731 | if (IS_ERR(css)) { | 4857 | if (IS_ERR(css)) { |
| 4732 | /* failure case - need to deassign the cgroup_subsys[] slot. */ | 4858 | /* failure case - need to deassign the cgroup_subsys[] slot. */ |
| 4733 | cgroup_subsys[ss->subsys_id] = NULL; | 4859 | cgroup_subsys[ss->subsys_id] = NULL; |
| @@ -4739,13 +4865,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
| 4739 | ss->root = &cgroup_dummy_root; | 4865 | ss->root = &cgroup_dummy_root; |
| 4740 | 4866 | ||
| 4741 | /* our new subsystem will be attached to the dummy hierarchy. */ | 4867 | /* our new subsystem will be attached to the dummy hierarchy. */ |
| 4742 | init_cgroup_css(css, ss, cgroup_dummy_top); | 4868 | init_css(css, ss, cgroup_dummy_top); |
| 4743 | /* init_idr must be after init_cgroup_css because it sets css->id. */ | ||
| 4744 | if (ss->use_id) { | ||
| 4745 | ret = cgroup_init_idr(ss, css); | ||
| 4746 | if (ret) | ||
| 4747 | goto err_unload; | ||
| 4748 | } | ||
| 4749 | 4869 | ||
| 4750 | /* | 4870 | /* |
| 4751 | * Now we need to entangle the css into the existing css_sets. unlike | 4871 | * Now we need to entangle the css into the existing css_sets. unlike |
| @@ -4770,7 +4890,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
| 4770 | } | 4890 | } |
| 4771 | write_unlock(&css_set_lock); | 4891 | write_unlock(&css_set_lock); |
| 4772 | 4892 | ||
| 4773 | ret = online_css(ss, cgroup_dummy_top); | 4893 | ret = online_css(css); |
| 4774 | if (ret) | 4894 | if (ret) |
| 4775 | goto err_unload; | 4895 | goto err_unload; |
| 4776 | 4896 | ||
| @@ -4802,17 +4922,14 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
| 4802 | 4922 | ||
| 4803 | /* | 4923 | /* |
| 4804 | * we shouldn't be called if the subsystem is in use, and the use of | 4924 | * we shouldn't be called if the subsystem is in use, and the use of |
| 4805 | * try_module_get in parse_cgroupfs_options should ensure that it | 4925 | * try_module_get() in rebind_subsystems() should ensure that it |
| 4806 | * doesn't start being used while we're killing it off. | 4926 | * doesn't start being used while we're killing it off. |
| 4807 | */ | 4927 | */ |
| 4808 | BUG_ON(ss->root != &cgroup_dummy_root); | 4928 | BUG_ON(ss->root != &cgroup_dummy_root); |
| 4809 | 4929 | ||
| 4810 | mutex_lock(&cgroup_mutex); | 4930 | mutex_lock(&cgroup_mutex); |
| 4811 | 4931 | ||
| 4812 | offline_css(ss, cgroup_dummy_top); | 4932 | offline_css(cgroup_css(cgroup_dummy_top, ss)); |
| 4813 | |||
| 4814 | if (ss->use_id) | ||
| 4815 | idr_destroy(&ss->idr); | ||
| 4816 | 4933 | ||
| 4817 | /* deassign the subsys_id */ | 4934 | /* deassign the subsys_id */ |
| 4818 | cgroup_subsys[ss->subsys_id] = NULL; | 4935 | cgroup_subsys[ss->subsys_id] = NULL; |
| @@ -4840,11 +4957,10 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
| 4840 | /* | 4957 | /* |
| 4841 | * remove subsystem's css from the cgroup_dummy_top and free it - | 4958 | * remove subsystem's css from the cgroup_dummy_top and free it - |
| 4842 | * need to free before marking as null because ss->css_free needs | 4959 | * need to free before marking as null because ss->css_free needs |
| 4843 | * the cgrp->subsys pointer to find their state. note that this | 4960 | * the cgrp->subsys pointer to find their state. |
| 4844 | * also takes care of freeing the css_id. | ||
| 4845 | */ | 4961 | */ |
| 4846 | ss->css_free(cgroup_dummy_top); | 4962 | ss->css_free(cgroup_css(cgroup_dummy_top, ss)); |
| 4847 | cgroup_dummy_top->subsys[ss->subsys_id] = NULL; | 4963 | RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); |
| 4848 | 4964 | ||
| 4849 | mutex_unlock(&cgroup_mutex); | 4965 | mutex_unlock(&cgroup_mutex); |
| 4850 | } | 4966 | } |
| @@ -4912,8 +5028,6 @@ int __init cgroup_init(void) | |||
| 4912 | for_each_builtin_subsys(ss, i) { | 5028 | for_each_builtin_subsys(ss, i) { |
| 4913 | if (!ss->early_init) | 5029 | if (!ss->early_init) |
| 4914 | cgroup_init_subsys(ss); | 5030 | cgroup_init_subsys(ss); |
| 4915 | if (ss->use_id) | ||
| 4916 | cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); | ||
| 4917 | } | 5031 | } |
| 4918 | 5032 | ||
| 4919 | /* allocate id for the dummy hierarchy */ | 5033 | /* allocate id for the dummy hierarchy */ |
| @@ -4926,6 +5040,10 @@ int __init cgroup_init(void) | |||
| 4926 | 5040 | ||
| 4927 | BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); | 5041 | BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); |
| 4928 | 5042 | ||
| 5043 | err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top, | ||
| 5044 | 0, 1, GFP_KERNEL); | ||
| 5045 | BUG_ON(err < 0); | ||
| 5046 | |||
| 4929 | mutex_unlock(&cgroup_root_mutex); | 5047 | mutex_unlock(&cgroup_root_mutex); |
| 4930 | mutex_unlock(&cgroup_mutex); | 5048 | mutex_unlock(&cgroup_mutex); |
| 4931 | 5049 | ||
| @@ -5082,7 +5200,7 @@ void cgroup_fork(struct task_struct *child) | |||
| 5082 | * Adds the task to the list running through its css_set if necessary and | 5200 | * Adds the task to the list running through its css_set if necessary and |
| 5083 | * call the subsystem fork() callbacks. Has to be after the task is | 5201 | * call the subsystem fork() callbacks. Has to be after the task is |
| 5084 | * visible on the task list in case we race with the first call to | 5202 | * visible on the task list in case we race with the first call to |
| 5085 | * cgroup_iter_start() - to guarantee that the new task ends up on its | 5203 | * cgroup_task_iter_start() - to guarantee that the new task ends up on its |
| 5086 | * list. | 5204 | * list. |
| 5087 | */ | 5205 | */ |
| 5088 | void cgroup_post_fork(struct task_struct *child) | 5206 | void cgroup_post_fork(struct task_struct *child) |
| @@ -5195,10 +5313,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
| 5195 | */ | 5313 | */ |
| 5196 | for_each_builtin_subsys(ss, i) { | 5314 | for_each_builtin_subsys(ss, i) { |
| 5197 | if (ss->exit) { | 5315 | if (ss->exit) { |
| 5198 | struct cgroup *old_cgrp = cset->subsys[i]->cgroup; | 5316 | struct cgroup_subsys_state *old_css = cset->subsys[i]; |
| 5199 | struct cgroup *cgrp = task_cgroup(tsk, i); | 5317 | struct cgroup_subsys_state *css = task_css(tsk, i); |
| 5200 | 5318 | ||
| 5201 | ss->exit(cgrp, old_cgrp, tsk); | 5319 | ss->exit(css, old_css, tsk); |
| 5202 | } | 5320 | } |
| 5203 | } | 5321 | } |
| 5204 | } | 5322 | } |
| @@ -5329,210 +5447,56 @@ static int __init cgroup_disable(char *str) | |||
| 5329 | } | 5447 | } |
| 5330 | __setup("cgroup_disable=", cgroup_disable); | 5448 | __setup("cgroup_disable=", cgroup_disable); |
| 5331 | 5449 | ||
| 5332 | /* | ||
| 5333 | * Functons for CSS ID. | ||
| 5334 | */ | ||
| 5335 | |||
| 5336 | /* to get ID other than 0, this should be called when !cgroup_is_dead() */ | ||
| 5337 | unsigned short css_id(struct cgroup_subsys_state *css) | ||
| 5338 | { | ||
| 5339 | struct css_id *cssid; | ||
| 5340 | |||
| 5341 | /* | ||
| 5342 | * This css_id() can return correct value when somone has refcnt | ||
| 5343 | * on this or this is under rcu_read_lock(). Once css->id is allocated, | ||
| 5344 | * it's unchanged until freed. | ||
| 5345 | */ | ||
| 5346 | cssid = rcu_dereference_raw(css->id); | ||
| 5347 | |||
| 5348 | if (cssid) | ||
| 5349 | return cssid->id; | ||
| 5350 | return 0; | ||
| 5351 | } | ||
| 5352 | EXPORT_SYMBOL_GPL(css_id); | ||
| 5353 | |||
| 5354 | /** | 5450 | /** |
| 5355 | * css_is_ancestor - test "root" css is an ancestor of "child" | 5451 | * css_from_dir - get corresponding css from the dentry of a cgroup dir |
| 5356 | * @child: the css to be tested. | 5452 | * @dentry: directory dentry of interest |
| 5357 | * @root: the css supporsed to be an ancestor of the child. | 5453 | * @ss: subsystem of interest |
| 5358 | * | 5454 | * |
| 5359 | * Returns true if "root" is an ancestor of "child" in its hierarchy. Because | 5455 | * Must be called under RCU read lock. The caller is responsible for |
| 5360 | * this function reads css->id, the caller must hold rcu_read_lock(). | 5456 | * pinning the returned css if it needs to be accessed outside the RCU |
| 5361 | * But, considering usual usage, the csses should be valid objects after test. | 5457 | * critical section. |
| 5362 | * Assuming that the caller will do some action to the child if this returns | ||
| 5363 | * true, the caller must take "child"'s reference count. | ||
| 5364 | * If "child" is a valid object and this returns true, "root" is valid, too. | ||
| 5365 | */ | ||
| 5366 | |||
| 5367 | bool css_is_ancestor(struct cgroup_subsys_state *child, | ||
| 5368 | const struct cgroup_subsys_state *root) | ||
| 5369 | { | ||
| 5370 | struct css_id *child_id; | ||
| 5371 | struct css_id *root_id; | ||
| 5372 | |||
| 5373 | child_id = rcu_dereference(child->id); | ||
| 5374 | if (!child_id) | ||
| 5375 | return false; | ||
| 5376 | root_id = rcu_dereference(root->id); | ||
| 5377 | if (!root_id) | ||
| 5378 | return false; | ||
| 5379 | if (child_id->depth < root_id->depth) | ||
| 5380 | return false; | ||
| 5381 | if (child_id->stack[root_id->depth] != root_id->id) | ||
| 5382 | return false; | ||
| 5383 | return true; | ||
| 5384 | } | ||
| 5385 | |||
| 5386 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | ||
| 5387 | { | ||
| 5388 | struct css_id *id = rcu_dereference_protected(css->id, true); | ||
| 5389 | |||
| 5390 | /* When this is called before css_id initialization, id can be NULL */ | ||
| 5391 | if (!id) | ||
| 5392 | return; | ||
| 5393 | |||
| 5394 | BUG_ON(!ss->use_id); | ||
| 5395 | |||
| 5396 | rcu_assign_pointer(id->css, NULL); | ||
| 5397 | rcu_assign_pointer(css->id, NULL); | ||
| 5398 | spin_lock(&ss->id_lock); | ||
| 5399 | idr_remove(&ss->idr, id->id); | ||
| 5400 | spin_unlock(&ss->id_lock); | ||
| 5401 | kfree_rcu(id, rcu_head); | ||
| 5402 | } | ||
| 5403 | EXPORT_SYMBOL_GPL(free_css_id); | ||
| 5404 | |||
| 5405 | /* | ||
| 5406 | * This is called by init or create(). Then, calls to this function are | ||
| 5407 | * always serialized (By cgroup_mutex() at create()). | ||
| 5408 | */ | 5458 | */ |
| 5409 | 5459 | struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, | |
| 5410 | static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) | 5460 | struct cgroup_subsys *ss) |
| 5411 | { | 5461 | { |
| 5412 | struct css_id *newid; | 5462 | struct cgroup *cgrp; |
| 5413 | int ret, size; | ||
| 5414 | |||
| 5415 | BUG_ON(!ss->use_id); | ||
| 5416 | |||
| 5417 | size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); | ||
| 5418 | newid = kzalloc(size, GFP_KERNEL); | ||
| 5419 | if (!newid) | ||
| 5420 | return ERR_PTR(-ENOMEM); | ||
| 5421 | |||
| 5422 | idr_preload(GFP_KERNEL); | ||
| 5423 | spin_lock(&ss->id_lock); | ||
| 5424 | /* Don't use 0. allocates an ID of 1-65535 */ | ||
| 5425 | ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT); | ||
| 5426 | spin_unlock(&ss->id_lock); | ||
| 5427 | idr_preload_end(); | ||
| 5428 | |||
| 5429 | /* Returns error when there are no free spaces for new ID.*/ | ||
| 5430 | if (ret < 0) | ||
| 5431 | goto err_out; | ||
| 5432 | |||
| 5433 | newid->id = ret; | ||
| 5434 | newid->depth = depth; | ||
| 5435 | return newid; | ||
| 5436 | err_out: | ||
| 5437 | kfree(newid); | ||
| 5438 | return ERR_PTR(ret); | ||
| 5439 | |||
| 5440 | } | ||
| 5441 | |||
| 5442 | static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, | ||
| 5443 | struct cgroup_subsys_state *rootcss) | ||
| 5444 | { | ||
| 5445 | struct css_id *newid; | ||
| 5446 | |||
| 5447 | spin_lock_init(&ss->id_lock); | ||
| 5448 | idr_init(&ss->idr); | ||
| 5449 | |||
| 5450 | newid = get_new_cssid(ss, 0); | ||
| 5451 | if (IS_ERR(newid)) | ||
| 5452 | return PTR_ERR(newid); | ||
| 5453 | |||
| 5454 | newid->stack[0] = newid->id; | ||
| 5455 | RCU_INIT_POINTER(newid->css, rootcss); | ||
| 5456 | RCU_INIT_POINTER(rootcss->id, newid); | ||
| 5457 | return 0; | ||
| 5458 | } | ||
| 5459 | |||
| 5460 | static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, | ||
| 5461 | struct cgroup *child) | ||
| 5462 | { | ||
| 5463 | int subsys_id, i, depth = 0; | ||
| 5464 | struct cgroup_subsys_state *parent_css, *child_css; | ||
| 5465 | struct css_id *child_id, *parent_id; | ||
| 5466 | |||
| 5467 | subsys_id = ss->subsys_id; | ||
| 5468 | parent_css = parent->subsys[subsys_id]; | ||
| 5469 | child_css = child->subsys[subsys_id]; | ||
| 5470 | parent_id = rcu_dereference_protected(parent_css->id, true); | ||
| 5471 | depth = parent_id->depth + 1; | ||
| 5472 | 5463 | ||
| 5473 | child_id = get_new_cssid(ss, depth); | 5464 | WARN_ON_ONCE(!rcu_read_lock_held()); |
| 5474 | if (IS_ERR(child_id)) | ||
| 5475 | return PTR_ERR(child_id); | ||
| 5476 | 5465 | ||
| 5477 | for (i = 0; i < depth; i++) | 5466 | /* is @dentry a cgroup dir? */ |
| 5478 | child_id->stack[i] = parent_id->stack[i]; | 5467 | if (!dentry->d_inode || |
| 5479 | child_id->stack[depth] = child_id->id; | 5468 | dentry->d_inode->i_op != &cgroup_dir_inode_operations) |
| 5480 | /* | 5469 | return ERR_PTR(-EBADF); |
| 5481 | * child_id->css pointer will be set after this cgroup is available | ||
| 5482 | * see cgroup_populate_dir() | ||
| 5483 | */ | ||
| 5484 | rcu_assign_pointer(child_css->id, child_id); | ||
| 5485 | 5470 | ||
| 5486 | return 0; | 5471 | cgrp = __d_cgrp(dentry); |
| 5472 | return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); | ||
| 5487 | } | 5473 | } |
| 5488 | 5474 | ||
| 5489 | /** | 5475 | /** |
| 5490 | * css_lookup - lookup css by id | 5476 | * css_from_id - lookup css by id |
| 5491 | * @ss: cgroup subsys to be looked into. | 5477 | * @id: the cgroup id |
| 5492 | * @id: the id | 5478 | * @ss: cgroup subsys to be looked into |
| 5493 | * | 5479 | * |
| 5494 | * Returns pointer to cgroup_subsys_state if there is valid one with id. | 5480 | * Returns the css if there's valid one with @id, otherwise returns NULL. |
| 5495 | * NULL if not. Should be called under rcu_read_lock() | 5481 | * Should be called under rcu_read_lock(). |
| 5496 | */ | 5482 | */ |
| 5497 | struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) | 5483 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) |
| 5498 | { | ||
| 5499 | struct css_id *cssid = NULL; | ||
| 5500 | |||
| 5501 | BUG_ON(!ss->use_id); | ||
| 5502 | cssid = idr_find(&ss->idr, id); | ||
| 5503 | |||
| 5504 | if (unlikely(!cssid)) | ||
| 5505 | return NULL; | ||
| 5506 | |||
| 5507 | return rcu_dereference(cssid->css); | ||
| 5508 | } | ||
| 5509 | EXPORT_SYMBOL_GPL(css_lookup); | ||
| 5510 | |||
| 5511 | /* | ||
| 5512 | * get corresponding css from file open on cgroupfs directory | ||
| 5513 | */ | ||
| 5514 | struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | ||
| 5515 | { | 5484 | { |
| 5516 | struct cgroup *cgrp; | 5485 | struct cgroup *cgrp; |
| 5517 | struct inode *inode; | ||
| 5518 | struct cgroup_subsys_state *css; | ||
| 5519 | 5486 | ||
| 5520 | inode = file_inode(f); | 5487 | rcu_lockdep_assert(rcu_read_lock_held() || |
| 5521 | /* check in cgroup filesystem dir */ | 5488 | lockdep_is_held(&cgroup_mutex), |
| 5522 | if (inode->i_op != &cgroup_dir_inode_operations) | 5489 | "css_from_id() needs proper protection"); |
| 5523 | return ERR_PTR(-EBADF); | ||
| 5524 | 5490 | ||
| 5525 | if (id < 0 || id >= CGROUP_SUBSYS_COUNT) | 5491 | cgrp = idr_find(&ss->root->cgroup_idr, id); |
| 5526 | return ERR_PTR(-EINVAL); | 5492 | if (cgrp) |
| 5527 | 5493 | return cgroup_css(cgrp, ss); | |
| 5528 | /* get cgroup */ | 5494 | return NULL; |
| 5529 | cgrp = __d_cgrp(f->f_dentry); | ||
| 5530 | css = cgrp->subsys[id]; | ||
| 5531 | return css ? css : ERR_PTR(-ENOENT); | ||
| 5532 | } | 5495 | } |
| 5533 | 5496 | ||
| 5534 | #ifdef CONFIG_CGROUP_DEBUG | 5497 | #ifdef CONFIG_CGROUP_DEBUG |
| 5535 | static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) | 5498 | static struct cgroup_subsys_state * |
| 5499 | debug_css_alloc(struct cgroup_subsys_state *parent_css) | ||
| 5536 | { | 5500 | { |
| 5537 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | 5501 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); |
| 5538 | 5502 | ||
| @@ -5542,22 +5506,24 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) | |||
| 5542 | return css; | 5506 | return css; |
| 5543 | } | 5507 | } |
| 5544 | 5508 | ||
| 5545 | static void debug_css_free(struct cgroup *cgrp) | 5509 | static void debug_css_free(struct cgroup_subsys_state *css) |
| 5546 | { | 5510 | { |
| 5547 | kfree(cgrp->subsys[debug_subsys_id]); | 5511 | kfree(css); |
| 5548 | } | 5512 | } |
| 5549 | 5513 | ||
| 5550 | static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) | 5514 | static u64 debug_taskcount_read(struct cgroup_subsys_state *css, |
| 5515 | struct cftype *cft) | ||
| 5551 | { | 5516 | { |
| 5552 | return cgroup_task_count(cgrp); | 5517 | return cgroup_task_count(css->cgroup); |
| 5553 | } | 5518 | } |
| 5554 | 5519 | ||
| 5555 | static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) | 5520 | static u64 current_css_set_read(struct cgroup_subsys_state *css, |
| 5521 | struct cftype *cft) | ||
| 5556 | { | 5522 | { |
| 5557 | return (u64)(unsigned long)current->cgroups; | 5523 | return (u64)(unsigned long)current->cgroups; |
| 5558 | } | 5524 | } |
| 5559 | 5525 | ||
| 5560 | static u64 current_css_set_refcount_read(struct cgroup *cgrp, | 5526 | static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, |
| 5561 | struct cftype *cft) | 5527 | struct cftype *cft) |
| 5562 | { | 5528 | { |
| 5563 | u64 count; | 5529 | u64 count; |
| @@ -5568,7 +5534,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp, | |||
| 5568 | return count; | 5534 | return count; |
| 5569 | } | 5535 | } |
| 5570 | 5536 | ||
| 5571 | static int current_css_set_cg_links_read(struct cgroup *cgrp, | 5537 | static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, |
| 5572 | struct cftype *cft, | 5538 | struct cftype *cft, |
| 5573 | struct seq_file *seq) | 5539 | struct seq_file *seq) |
| 5574 | { | 5540 | { |
| @@ -5595,14 +5561,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp, | |||
| 5595 | } | 5561 | } |
| 5596 | 5562 | ||
| 5597 | #define MAX_TASKS_SHOWN_PER_CSS 25 | 5563 | #define MAX_TASKS_SHOWN_PER_CSS 25 |
| 5598 | static int cgroup_css_links_read(struct cgroup *cgrp, | 5564 | static int cgroup_css_links_read(struct cgroup_subsys_state *css, |
| 5599 | struct cftype *cft, | 5565 | struct cftype *cft, struct seq_file *seq) |
| 5600 | struct seq_file *seq) | ||
| 5601 | { | 5566 | { |
| 5602 | struct cgrp_cset_link *link; | 5567 | struct cgrp_cset_link *link; |
| 5603 | 5568 | ||
| 5604 | read_lock(&css_set_lock); | 5569 | read_lock(&css_set_lock); |
| 5605 | list_for_each_entry(link, &cgrp->cset_links, cset_link) { | 5570 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { |
| 5606 | struct css_set *cset = link->cset; | 5571 | struct css_set *cset = link->cset; |
| 5607 | struct task_struct *task; | 5572 | struct task_struct *task; |
| 5608 | int count = 0; | 5573 | int count = 0; |
| @@ -5621,9 +5586,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp, | |||
| 5621 | return 0; | 5586 | return 0; |
| 5622 | } | 5587 | } |
| 5623 | 5588 | ||
| 5624 | static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) | 5589 | static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) |
| 5625 | { | 5590 | { |
| 5626 | return test_bit(CGRP_RELEASABLE, &cgrp->flags); | 5591 | return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); |
| 5627 | } | 5592 | } |
| 5628 | 5593 | ||
| 5629 | static struct cftype debug_files[] = { | 5594 | static struct cftype debug_files[] = { |
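A minimal usage sketch of the css_from_dir()/css_from_id() helpers introduced above; the wrapper function and its parameters are illustrative assumptions, not part of this patch. Both helpers require the protection documented in their kernel-doc (RCU read lock, or cgroup_mutex for css_from_id()).

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

/* hypothetical helper: does a css of @ss exist for the cgroup with @id? */
static bool example_css_exists(int id, struct cgroup_subsys *ss)
{
        struct cgroup_subsys_state *css;
        bool found;

        rcu_read_lock();
        css = css_from_id(id, ss);      /* NULL if no such cgroup in ss->root */
        found = (css != NULL);
        rcu_read_unlock();

        return found;
}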
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 75dda1ea5026..f0ff64d0ebaa 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
| @@ -45,25 +45,19 @@ struct freezer { | |||
| 45 | spinlock_t lock; | 45 | spinlock_t lock; |
| 46 | }; | 46 | }; |
| 47 | 47 | ||
| 48 | static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) | 48 | static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) |
| 49 | { | 49 | { |
| 50 | return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), | 50 | return css ? container_of(css, struct freezer, css) : NULL; |
| 51 | struct freezer, css); | ||
| 52 | } | 51 | } |
| 53 | 52 | ||
| 54 | static inline struct freezer *task_freezer(struct task_struct *task) | 53 | static inline struct freezer *task_freezer(struct task_struct *task) |
| 55 | { | 54 | { |
| 56 | return container_of(task_subsys_state(task, freezer_subsys_id), | 55 | return css_freezer(task_css(task, freezer_subsys_id)); |
| 57 | struct freezer, css); | ||
| 58 | } | 56 | } |
| 59 | 57 | ||
| 60 | static struct freezer *parent_freezer(struct freezer *freezer) | 58 | static struct freezer *parent_freezer(struct freezer *freezer) |
| 61 | { | 59 | { |
| 62 | struct cgroup *pcg = freezer->css.cgroup->parent; | 60 | return css_freezer(css_parent(&freezer->css)); |
| 63 | |||
| 64 | if (pcg) | ||
| 65 | return cgroup_freezer(pcg); | ||
| 66 | return NULL; | ||
| 67 | } | 61 | } |
| 68 | 62 | ||
| 69 | bool cgroup_freezing(struct task_struct *task) | 63 | bool cgroup_freezing(struct task_struct *task) |
| @@ -92,7 +86,8 @@ static const char *freezer_state_strs(unsigned int state) | |||
| 92 | 86 | ||
| 93 | struct cgroup_subsys freezer_subsys; | 87 | struct cgroup_subsys freezer_subsys; |
| 94 | 88 | ||
| 95 | static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) | 89 | static struct cgroup_subsys_state * |
| 90 | freezer_css_alloc(struct cgroup_subsys_state *parent_css) | ||
| 96 | { | 91 | { |
| 97 | struct freezer *freezer; | 92 | struct freezer *freezer; |
| 98 | 93 | ||
| @@ -105,22 +100,22 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) | |||
| 105 | } | 100 | } |
| 106 | 101 | ||
| 107 | /** | 102 | /** |
| 108 | * freezer_css_online - commit creation of a freezer cgroup | 103 | * freezer_css_online - commit creation of a freezer css |
| 109 | * @cgroup: cgroup being created | 104 | * @css: css being created |
| 110 | * | 105 | * |
| 111 | * We're committing to creation of @cgroup. Mark it online and inherit | 106 | * We're committing to creation of @css. Mark it online and inherit |
| 112 | * parent's freezing state while holding both parent's and our | 107 | * parent's freezing state while holding both parent's and our |
| 113 | * freezer->lock. | 108 | * freezer->lock. |
| 114 | */ | 109 | */ |
| 115 | static int freezer_css_online(struct cgroup *cgroup) | 110 | static int freezer_css_online(struct cgroup_subsys_state *css) |
| 116 | { | 111 | { |
| 117 | struct freezer *freezer = cgroup_freezer(cgroup); | 112 | struct freezer *freezer = css_freezer(css); |
| 118 | struct freezer *parent = parent_freezer(freezer); | 113 | struct freezer *parent = parent_freezer(freezer); |
| 119 | 114 | ||
| 120 | /* | 115 | /* |
| 121 | * The following double locking and freezing state inheritance | 116 | * The following double locking and freezing state inheritance |
| 122 | * guarantee that @cgroup can never escape ancestors' freezing | 117 | * guarantee that @cgroup can never escape ancestors' freezing |
| 123 | * states. See cgroup_for_each_descendant_pre() for details. | 118 | * states. See css_for_each_descendant_pre() for details. |
| 124 | */ | 119 | */ |
| 125 | if (parent) | 120 | if (parent) |
| 126 | spin_lock_irq(&parent->lock); | 121 | spin_lock_irq(&parent->lock); |
| @@ -141,15 +136,15 @@ static int freezer_css_online(struct cgroup *cgroup) | |||
| 141 | } | 136 | } |
| 142 | 137 | ||
| 143 | /** | 138 | /** |
| 144 | * freezer_css_offline - initiate destruction of @cgroup | 139 | * freezer_css_offline - initiate destruction of a freezer css |
| 145 | * @cgroup: cgroup being destroyed | 140 | * @css: css being destroyed |
| 146 | * | 141 | * |
| 147 | * @cgroup is going away. Mark it dead and decrement system_freezing_count | 142 | * @css is going away. Mark it dead and decrement system_freezing_count if |
| 148 | * if it was holding one. | 143 | * it was holding one. |
| 149 | */ | 144 | */ |
| 150 | static void freezer_css_offline(struct cgroup *cgroup) | 145 | static void freezer_css_offline(struct cgroup_subsys_state *css) |
| 151 | { | 146 | { |
| 152 | struct freezer *freezer = cgroup_freezer(cgroup); | 147 | struct freezer *freezer = css_freezer(css); |
| 153 | 148 | ||
| 154 | spin_lock_irq(&freezer->lock); | 149 | spin_lock_irq(&freezer->lock); |
| 155 | 150 | ||
| @@ -161,9 +156,9 @@ static void freezer_css_offline(struct cgroup *cgroup) | |||
| 161 | spin_unlock_irq(&freezer->lock); | 156 | spin_unlock_irq(&freezer->lock); |
| 162 | } | 157 | } |
| 163 | 158 | ||
| 164 | static void freezer_css_free(struct cgroup *cgroup) | 159 | static void freezer_css_free(struct cgroup_subsys_state *css) |
| 165 | { | 160 | { |
| 166 | kfree(cgroup_freezer(cgroup)); | 161 | kfree(css_freezer(css)); |
| 167 | } | 162 | } |
| 168 | 163 | ||
| 169 | /* | 164 | /* |
| @@ -175,25 +170,26 @@ static void freezer_css_free(struct cgroup *cgroup) | |||
| 175 | * @freezer->lock. freezer_attach() makes the new tasks conform to the | 170 | * @freezer->lock. freezer_attach() makes the new tasks conform to the |
| 176 | * current state and all following state changes can see the new tasks. | 171 | * current state and all following state changes can see the new tasks. |
| 177 | */ | 172 | */ |
| 178 | static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) | 173 | static void freezer_attach(struct cgroup_subsys_state *new_css, |
| 174 | struct cgroup_taskset *tset) | ||
| 179 | { | 175 | { |
| 180 | struct freezer *freezer = cgroup_freezer(new_cgrp); | 176 | struct freezer *freezer = css_freezer(new_css); |
| 181 | struct task_struct *task; | 177 | struct task_struct *task; |
| 182 | bool clear_frozen = false; | 178 | bool clear_frozen = false; |
| 183 | 179 | ||
| 184 | spin_lock_irq(&freezer->lock); | 180 | spin_lock_irq(&freezer->lock); |
| 185 | 181 | ||
| 186 | /* | 182 | /* |
| 187 | * Make the new tasks conform to the current state of @new_cgrp. | 183 | * Make the new tasks conform to the current state of @new_css. |
| 188 | * For simplicity, when migrating any task to a FROZEN cgroup, we | 184 | * For simplicity, when migrating any task to a FROZEN cgroup, we |
| 189 | * revert it to FREEZING and let update_if_frozen() determine the | 185 | * revert it to FREEZING and let update_if_frozen() determine the |
| 190 | * correct state later. | 186 | * correct state later. |
| 191 | * | 187 | * |
| 192 | * Tasks in @tset are on @new_cgrp but may not conform to its | 188 | * Tasks in @tset are on @new_css but may not conform to its |
| 193 | * current state before executing the following - !frozen tasks may | 189 | * current state before executing the following - !frozen tasks may |
| 194 | * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. | 190 | * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. |
| 195 | */ | 191 | */ |
| 196 | cgroup_taskset_for_each(task, new_cgrp, tset) { | 192 | cgroup_taskset_for_each(task, new_css, tset) { |
| 197 | if (!(freezer->state & CGROUP_FREEZING)) { | 193 | if (!(freezer->state & CGROUP_FREEZING)) { |
| 198 | __thaw_task(task); | 194 | __thaw_task(task); |
| 199 | } else { | 195 | } else { |
| @@ -231,7 +227,7 @@ static void freezer_fork(struct task_struct *task) | |||
| 231 | * The root cgroup is non-freezable, so we can skip the | 227 | * The root cgroup is non-freezable, so we can skip the |
| 232 | * following check. | 228 | * following check. |
| 233 | */ | 229 | */ |
| 234 | if (!freezer->css.cgroup->parent) | 230 | if (!parent_freezer(freezer)) |
| 235 | goto out; | 231 | goto out; |
| 236 | 232 | ||
| 237 | spin_lock_irq(&freezer->lock); | 233 | spin_lock_irq(&freezer->lock); |
| @@ -244,7 +240,7 @@ out: | |||
| 244 | 240 | ||
| 245 | /** | 241 | /** |
| 246 | * update_if_frozen - update whether a cgroup finished freezing | 242 | * update_if_frozen - update whether a cgroup finished freezing |
| 247 | * @cgroup: cgroup of interest | 243 | * @css: css of interest |
| 248 | * | 244 | * |
| 249 | * Once FREEZING is initiated, transition to FROZEN is lazily updated by | 245 | * Once FREEZING is initiated, transition to FROZEN is lazily updated by |
| 250 | * calling this function. If the current state is FREEZING but not FROZEN, | 246 | * calling this function. If the current state is FREEZING but not FROZEN, |
| @@ -255,14 +251,14 @@ out: | |||
| 255 | * update_if_frozen() on all descendants prior to invoking this function. | 251 | * update_if_frozen() on all descendants prior to invoking this function. |
| 256 | * | 252 | * |
| 257 | * Task states and freezer state might disagree while tasks are being | 253 | * Task states and freezer state might disagree while tasks are being |
| 258 | * migrated into or out of @cgroup, so we can't verify task states against | 254 | * migrated into or out of @css, so we can't verify task states against |
| 259 | * @freezer state here. See freezer_attach() for details. | 255 | * @freezer state here. See freezer_attach() for details. |
| 260 | */ | 256 | */ |
| 261 | static void update_if_frozen(struct cgroup *cgroup) | 257 | static void update_if_frozen(struct cgroup_subsys_state *css) |
| 262 | { | 258 | { |
| 263 | struct freezer *freezer = cgroup_freezer(cgroup); | 259 | struct freezer *freezer = css_freezer(css); |
| 264 | struct cgroup *pos; | 260 | struct cgroup_subsys_state *pos; |
| 265 | struct cgroup_iter it; | 261 | struct css_task_iter it; |
| 266 | struct task_struct *task; | 262 | struct task_struct *task; |
| 267 | 263 | ||
| 268 | WARN_ON_ONCE(!rcu_read_lock_held()); | 264 | WARN_ON_ONCE(!rcu_read_lock_held()); |
| @@ -274,8 +270,8 @@ static void update_if_frozen(struct cgroup *cgroup) | |||
| 274 | goto out_unlock; | 270 | goto out_unlock; |
| 275 | 271 | ||
| 276 | /* are all (live) children frozen? */ | 272 | /* are all (live) children frozen? */ |
| 277 | cgroup_for_each_child(pos, cgroup) { | 273 | css_for_each_child(pos, css) { |
| 278 | struct freezer *child = cgroup_freezer(pos); | 274 | struct freezer *child = css_freezer(pos); |
| 279 | 275 | ||
| 280 | if ((child->state & CGROUP_FREEZER_ONLINE) && | 276 | if ((child->state & CGROUP_FREEZER_ONLINE) && |
| 281 | !(child->state & CGROUP_FROZEN)) | 277 | !(child->state & CGROUP_FROZEN)) |
| @@ -283,9 +279,9 @@ static void update_if_frozen(struct cgroup *cgroup) | |||
| 283 | } | 279 | } |
| 284 | 280 | ||
| 285 | /* are all tasks frozen? */ | 281 | /* are all tasks frozen? */ |
| 286 | cgroup_iter_start(cgroup, &it); | 282 | css_task_iter_start(css, &it); |
| 287 | 283 | ||
| 288 | while ((task = cgroup_iter_next(cgroup, &it))) { | 284 | while ((task = css_task_iter_next(&it))) { |
| 289 | if (freezing(task)) { | 285 | if (freezing(task)) { |
| 290 | /* | 286 | /* |
| 291 | * freezer_should_skip() indicates that the task | 287 | * freezer_should_skip() indicates that the task |
| @@ -300,52 +296,49 @@ static void update_if_frozen(struct cgroup *cgroup) | |||
| 300 | 296 | ||
| 301 | freezer->state |= CGROUP_FROZEN; | 297 | freezer->state |= CGROUP_FROZEN; |
| 302 | out_iter_end: | 298 | out_iter_end: |
| 303 | cgroup_iter_end(cgroup, &it); | 299 | css_task_iter_end(&it); |
| 304 | out_unlock: | 300 | out_unlock: |
| 305 | spin_unlock_irq(&freezer->lock); | 301 | spin_unlock_irq(&freezer->lock); |
| 306 | } | 302 | } |
| 307 | 303 | ||
| 308 | static int freezer_read(struct cgroup *cgroup, struct cftype *cft, | 304 | static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, |
| 309 | struct seq_file *m) | 305 | struct seq_file *m) |
| 310 | { | 306 | { |
| 311 | struct cgroup *pos; | 307 | struct cgroup_subsys_state *pos; |
| 312 | 308 | ||
| 313 | rcu_read_lock(); | 309 | rcu_read_lock(); |
| 314 | 310 | ||
| 315 | /* update states bottom-up */ | 311 | /* update states bottom-up */ |
| 316 | cgroup_for_each_descendant_post(pos, cgroup) | 312 | css_for_each_descendant_post(pos, css) |
| 317 | update_if_frozen(pos); | 313 | update_if_frozen(pos); |
| 318 | update_if_frozen(cgroup); | ||
| 319 | 314 | ||
| 320 | rcu_read_unlock(); | 315 | rcu_read_unlock(); |
| 321 | 316 | ||
| 322 | seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); | 317 | seq_puts(m, freezer_state_strs(css_freezer(css)->state)); |
| 323 | seq_putc(m, '\n'); | 318 | seq_putc(m, '\n'); |
| 324 | return 0; | 319 | return 0; |
| 325 | } | 320 | } |
| 326 | 321 | ||
| 327 | static void freeze_cgroup(struct freezer *freezer) | 322 | static void freeze_cgroup(struct freezer *freezer) |
| 328 | { | 323 | { |
| 329 | struct cgroup *cgroup = freezer->css.cgroup; | 324 | struct css_task_iter it; |
| 330 | struct cgroup_iter it; | ||
| 331 | struct task_struct *task; | 325 | struct task_struct *task; |
| 332 | 326 | ||
| 333 | cgroup_iter_start(cgroup, &it); | 327 | css_task_iter_start(&freezer->css, &it); |
| 334 | while ((task = cgroup_iter_next(cgroup, &it))) | 328 | while ((task = css_task_iter_next(&it))) |
| 335 | freeze_task(task); | 329 | freeze_task(task); |
| 336 | cgroup_iter_end(cgroup, &it); | 330 | css_task_iter_end(&it); |
| 337 | } | 331 | } |
| 338 | 332 | ||
| 339 | static void unfreeze_cgroup(struct freezer *freezer) | 333 | static void unfreeze_cgroup(struct freezer *freezer) |
| 340 | { | 334 | { |
| 341 | struct cgroup *cgroup = freezer->css.cgroup; | 335 | struct css_task_iter it; |
| 342 | struct cgroup_iter it; | ||
| 343 | struct task_struct *task; | 336 | struct task_struct *task; |
| 344 | 337 | ||
| 345 | cgroup_iter_start(cgroup, &it); | 338 | css_task_iter_start(&freezer->css, &it); |
| 346 | while ((task = cgroup_iter_next(cgroup, &it))) | 339 | while ((task = css_task_iter_next(&it))) |
| 347 | __thaw_task(task); | 340 | __thaw_task(task); |
| 348 | cgroup_iter_end(cgroup, &it); | 341 | css_task_iter_end(&it); |
| 349 | } | 342 | } |
| 350 | 343 | ||
| 351 | /** | 344 | /** |
| @@ -395,12 +388,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, | |||
| 395 | */ | 388 | */ |
| 396 | static void freezer_change_state(struct freezer *freezer, bool freeze) | 389 | static void freezer_change_state(struct freezer *freezer, bool freeze) |
| 397 | { | 390 | { |
| 398 | struct cgroup *pos; | 391 | struct cgroup_subsys_state *pos; |
| 399 | |||
| 400 | /* update @freezer */ | ||
| 401 | spin_lock_irq(&freezer->lock); | ||
| 402 | freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); | ||
| 403 | spin_unlock_irq(&freezer->lock); | ||
| 404 | 392 | ||
| 405 | /* | 393 | /* |
| 406 | * Update all its descendants in pre-order traversal. Each | 394 | * Update all its descendants in pre-order traversal. Each |
| @@ -408,24 +396,33 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) | |||
| 408 | * CGROUP_FREEZING_PARENT. | 396 | * CGROUP_FREEZING_PARENT. |
| 409 | */ | 397 | */ |
| 410 | rcu_read_lock(); | 398 | rcu_read_lock(); |
| 411 | cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { | 399 | css_for_each_descendant_pre(pos, &freezer->css) { |
| 412 | struct freezer *pos_f = cgroup_freezer(pos); | 400 | struct freezer *pos_f = css_freezer(pos); |
| 413 | struct freezer *parent = parent_freezer(pos_f); | 401 | struct freezer *parent = parent_freezer(pos_f); |
| 414 | 402 | ||
| 415 | /* | ||
| 416 | * Our update to @parent->state is already visible which is | ||
| 417 | * all we need. No need to lock @parent. For more info on | ||
| 418 | * synchronization, see freezer_post_create(). | ||
| 419 | */ | ||
| 420 | spin_lock_irq(&pos_f->lock); | 403 | spin_lock_irq(&pos_f->lock); |
| 421 | freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, | 404 | |
| 422 | CGROUP_FREEZING_PARENT); | 405 | if (pos_f == freezer) { |
| 406 | freezer_apply_state(pos_f, freeze, | ||
| 407 | CGROUP_FREEZING_SELF); | ||
| 408 | } else { | ||
| 409 | /* | ||
| 410 | * Our update to @parent->state is already visible | ||
| 411 | * which is all we need. No need to lock @parent. | ||
| 412 | * For more info on synchronization, see | ||
| 413 | * freezer_post_create(). | ||
| 414 | */ | ||
| 415 | freezer_apply_state(pos_f, | ||
| 416 | parent->state & CGROUP_FREEZING, | ||
| 417 | CGROUP_FREEZING_PARENT); | ||
| 418 | } | ||
| 419 | |||
| 423 | spin_unlock_irq(&pos_f->lock); | 420 | spin_unlock_irq(&pos_f->lock); |
| 424 | } | 421 | } |
| 425 | rcu_read_unlock(); | 422 | rcu_read_unlock(); |
| 426 | } | 423 | } |
| 427 | 424 | ||
| 428 | static int freezer_write(struct cgroup *cgroup, struct cftype *cft, | 425 | static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, |
| 429 | const char *buffer) | 426 | const char *buffer) |
| 430 | { | 427 | { |
| 431 | bool freeze; | 428 | bool freeze; |
| @@ -437,20 +434,22 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft, | |||
| 437 | else | 434 | else |
| 438 | return -EINVAL; | 435 | return -EINVAL; |
| 439 | 436 | ||
| 440 | freezer_change_state(cgroup_freezer(cgroup), freeze); | 437 | freezer_change_state(css_freezer(css), freeze); |
| 441 | return 0; | 438 | return 0; |
| 442 | } | 439 | } |
| 443 | 440 | ||
| 444 | static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) | 441 | static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, |
| 442 | struct cftype *cft) | ||
| 445 | { | 443 | { |
| 446 | struct freezer *freezer = cgroup_freezer(cgroup); | 444 | struct freezer *freezer = css_freezer(css); |
| 447 | 445 | ||
| 448 | return (bool)(freezer->state & CGROUP_FREEZING_SELF); | 446 | return (bool)(freezer->state & CGROUP_FREEZING_SELF); |
| 449 | } | 447 | } |
| 450 | 448 | ||
| 451 | static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) | 449 | static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, |
| 450 | struct cftype *cft) | ||
| 452 | { | 451 | { |
| 453 | struct freezer *freezer = cgroup_freezer(cgroup); | 452 | struct freezer *freezer = css_freezer(css); |
| 454 | 453 | ||
| 455 | return (bool)(freezer->state & CGROUP_FREEZING_PARENT); | 454 | return (bool)(freezer->state & CGROUP_FREEZING_PARENT); |
| 456 | } | 455 | } |
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 383f8231e436..e5f3917aa05b 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c | |||
| @@ -20,26 +20,46 @@ | |||
| 20 | #include <linux/hardirq.h> | 20 | #include <linux/hardirq.h> |
| 21 | #include <linux/export.h> | 21 | #include <linux/export.h> |
| 22 | 22 | ||
| 23 | DEFINE_PER_CPU(struct context_tracking, context_tracking) = { | 23 | #define CREATE_TRACE_POINTS |
| 24 | #ifdef CONFIG_CONTEXT_TRACKING_FORCE | 24 | #include <trace/events/context_tracking.h> |
| 25 | .active = true, | 25 | |
| 26 | #endif | 26 | struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE; |
| 27 | }; | 27 | EXPORT_SYMBOL_GPL(context_tracking_enabled); |
| 28 | |||
| 29 | DEFINE_PER_CPU(struct context_tracking, context_tracking); | ||
| 30 | EXPORT_SYMBOL_GPL(context_tracking); | ||
| 31 | |||
| 32 | void context_tracking_cpu_set(int cpu) | ||
| 33 | { | ||
| 34 | if (!per_cpu(context_tracking.active, cpu)) { | ||
| 35 | per_cpu(context_tracking.active, cpu) = true; | ||
| 36 | static_key_slow_inc(&context_tracking_enabled); | ||
| 37 | } | ||
| 38 | } | ||
| 28 | 39 | ||
| 29 | /** | 40 | /** |
| 30 | * user_enter - Inform the context tracking that the CPU is going to | 41 | * context_tracking_user_enter - Inform the context tracking that the CPU is going to |
| 31 | * enter userspace mode. | 42 | * enter userspace mode. |
| 32 | * | 43 | * |
| 33 | * This function must be called right before we switch from the kernel | 44 | * This function must be called right before we switch from the kernel |
| 34 | * to userspace, when it's guaranteed the remaining kernel instructions | 45 | * to userspace, when it's guaranteed the remaining kernel instructions |
| 35 | * to execute won't use any RCU read side critical section because this | 46 | * to execute won't use any RCU read side critical section because this |
| 36 | * function sets RCU in extended quiescent state. | 47 | * function sets RCU in extended quiescent state. |
| 37 | */ | 48 | */ |
| 38 | void user_enter(void) | 49 | void context_tracking_user_enter(void) |
| 39 | { | 50 | { |
| 40 | unsigned long flags; | 51 | unsigned long flags; |
| 41 | 52 | ||
| 42 | /* | 53 | /* |
| 54 | * Repeat the user_enter() check here because some archs may be calling | ||
| 55 | * this from asm and if no CPU needs context tracking, they shouldn't | ||
| 56 | * go further. Repeat the check here until they support the static key | ||
| 57 | * check. | ||
| 58 | */ | ||
| 59 | if (!static_key_false(&context_tracking_enabled)) | ||
| 60 | return; | ||
| 61 | |||
| 62 | /* | ||
| 43 | * Some contexts may involve an exception occuring in an irq, | 63 | * Some contexts may involve an exception occuring in an irq, |
| 44 | * leading to that nesting: | 64 | * leading to that nesting: |
| 45 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() | 65 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() |
| @@ -54,17 +74,32 @@ void user_enter(void) | |||
| 54 | WARN_ON_ONCE(!current->mm); | 74 | WARN_ON_ONCE(!current->mm); |
| 55 | 75 | ||
| 56 | local_irq_save(flags); | 76 | local_irq_save(flags); |
| 57 | if (__this_cpu_read(context_tracking.active) && | 77 | if ( __this_cpu_read(context_tracking.state) != IN_USER) { |
| 58 | __this_cpu_read(context_tracking.state) != IN_USER) { | 78 | if (__this_cpu_read(context_tracking.active)) { |
| 79 | trace_user_enter(0); | ||
| 80 | /* | ||
| 81 | * At this stage, only low level arch entry code remains and | ||
| 82 | * then we'll run in userspace. We can assume there won't be | ||
| 83 | * any RCU read-side critical section until the next call to | ||
| 84 | * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency | ||
| 85 | * on the tick. | ||
| 86 | */ | ||
| 87 | vtime_user_enter(current); | ||
| 88 | rcu_user_enter(); | ||
| 89 | } | ||
| 59 | /* | 90 | /* |
| 60 | * At this stage, only low level arch entry code remains and | 91 | * Even if context tracking is disabled on this CPU, because it's outside |
| 61 | * then we'll run in userspace. We can assume there won't be | 92 | * the full dynticks mask for example, we still have to keep track of the |
| 62 | * any RCU read-side critical section until the next call to | 93 | * context transitions and states to prevent inconsistency on those of |
| 63 | * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency | 94 | * other CPUs. |
| 64 | * on the tick. | 95 | * If a task triggers an exception in userspace, sleep on the exception |
| 96 | * handler and then migrate to another CPU, that new CPU must know where | ||
| 97 | * the exception returns by the time we call exception_exit(). | ||
| 98 | * This information can only be provided by the previous CPU when it called | ||
| 99 | * exception_enter(). | ||
| 100 | * OTOH we can spare the calls to vtime and RCU when context_tracking.active | ||
| 101 | * is false because we know that CPU is not tickless. | ||
| 65 | */ | 102 | */ |
| 66 | vtime_user_enter(current); | ||
| 67 | rcu_user_enter(); | ||
| 68 | __this_cpu_write(context_tracking.state, IN_USER); | 103 | __this_cpu_write(context_tracking.state, IN_USER); |
| 69 | } | 104 | } |
| 70 | local_irq_restore(flags); | 105 | local_irq_restore(flags); |
| @@ -85,12 +120,11 @@ void user_enter(void) | |||
| 85 | * instead of preempt_schedule() to exit user context if needed before | 120 | * instead of preempt_schedule() to exit user context if needed before |
| 86 | * calling the scheduler. | 121 | * calling the scheduler. |
| 87 | */ | 122 | */ |
| 88 | void __sched notrace preempt_schedule_context(void) | 123 | asmlinkage void __sched notrace preempt_schedule_context(void) |
| 89 | { | 124 | { |
| 90 | struct thread_info *ti = current_thread_info(); | ||
| 91 | enum ctx_state prev_ctx; | 125 | enum ctx_state prev_ctx; |
| 92 | 126 | ||
| 93 | if (likely(ti->preempt_count || irqs_disabled())) | 127 | if (likely(!preemptible())) |
| 94 | return; | 128 | return; |
| 95 | 129 | ||
| 96 | /* | 130 | /* |
| @@ -112,8 +146,8 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context); | |||
| 112 | #endif /* CONFIG_PREEMPT */ | 146 | #endif /* CONFIG_PREEMPT */ |
| 113 | 147 | ||
| 114 | /** | 148 | /** |
| 115 | * user_exit - Inform the context tracking that the CPU is | 149 | * context_tracking_user_exit - Inform the context tracking that the CPU is |
| 116 | * exiting userspace mode and entering the kernel. | 150 | * exiting userspace mode and entering the kernel. |
| 117 | * | 151 | * |
| 118 | * This function must be called after we entered the kernel from userspace | 152 | * This function must be called after we entered the kernel from userspace |
| 119 | * before any use of RCU read side critical section. This potentially include | 153 | * before any use of RCU read side critical section. This potentially include |
| @@ -122,47 +156,34 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context); | |||
| 122 | * This call supports re-entrancy. This way it can be called from any exception | 156 | * This call supports re-entrancy. This way it can be called from any exception |
| 123 | * handler without needing to know if we came from userspace or not. | 157 | * handler without needing to know if we came from userspace or not. |
| 124 | */ | 158 | */ |
| 125 | void user_exit(void) | 159 | void context_tracking_user_exit(void) |
| 126 | { | 160 | { |
| 127 | unsigned long flags; | 161 | unsigned long flags; |
| 128 | 162 | ||
| 163 | if (!static_key_false(&context_tracking_enabled)) | ||
| 164 | return; | ||
| 165 | |||
| 129 | if (in_interrupt()) | 166 | if (in_interrupt()) |
| 130 | return; | 167 | return; |
| 131 | 168 | ||
| 132 | local_irq_save(flags); | 169 | local_irq_save(flags); |
| 133 | if (__this_cpu_read(context_tracking.state) == IN_USER) { | 170 | if (__this_cpu_read(context_tracking.state) == IN_USER) { |
| 134 | /* | 171 | if (__this_cpu_read(context_tracking.active)) { |
| 135 | * We are going to run code that may use RCU. Inform | 172 | /* |
| 136 | * RCU core about that (ie: we may need the tick again). | 173 | * We are going to run code that may use RCU. Inform |
| 137 | */ | 174 | * RCU core about that (ie: we may need the tick again). |
| 138 | rcu_user_exit(); | 175 | */ |
| 139 | vtime_user_exit(current); | 176 | rcu_user_exit(); |
| 177 | vtime_user_exit(current); | ||
| 178 | trace_user_exit(0); | ||
| 179 | } | ||
| 140 | __this_cpu_write(context_tracking.state, IN_KERNEL); | 180 | __this_cpu_write(context_tracking.state, IN_KERNEL); |
| 141 | } | 181 | } |
| 142 | local_irq_restore(flags); | 182 | local_irq_restore(flags); |
| 143 | } | 183 | } |
| 144 | 184 | ||
| 145 | void guest_enter(void) | ||
| 146 | { | ||
| 147 | if (vtime_accounting_enabled()) | ||
| 148 | vtime_guest_enter(current); | ||
| 149 | else | ||
| 150 | __guest_enter(); | ||
| 151 | } | ||
| 152 | EXPORT_SYMBOL_GPL(guest_enter); | ||
| 153 | |||
| 154 | void guest_exit(void) | ||
| 155 | { | ||
| 156 | if (vtime_accounting_enabled()) | ||
| 157 | vtime_guest_exit(current); | ||
| 158 | else | ||
| 159 | __guest_exit(); | ||
| 160 | } | ||
| 161 | EXPORT_SYMBOL_GPL(guest_exit); | ||
| 162 | |||
| 163 | |||
| 164 | /** | 185 | /** |
| 165 | * context_tracking_task_switch - context switch the syscall callbacks | 186 | * __context_tracking_task_switch - context switch the syscall callbacks |
| 166 | * @prev: the task that is being switched out | 187 | * @prev: the task that is being switched out |
| 167 | * @next: the task that is being switched in | 188 | * @next: the task that is being switched in |
| 168 | * | 189 | * |
| @@ -174,11 +195,19 @@ EXPORT_SYMBOL_GPL(guest_exit); | |||
| 174 | * migrate to some CPU that doesn't do the context tracking. As such the TIF | 195 | * migrate to some CPU that doesn't do the context tracking. As such the TIF |
| 175 | * flag may not be desired there. | 196 | * flag may not be desired there. |
| 176 | */ | 197 | */ |
| 177 | void context_tracking_task_switch(struct task_struct *prev, | 198 | void __context_tracking_task_switch(struct task_struct *prev, |
| 178 | struct task_struct *next) | 199 | struct task_struct *next) |
| 179 | { | 200 | { |
| 180 | if (__this_cpu_read(context_tracking.active)) { | 201 | clear_tsk_thread_flag(prev, TIF_NOHZ); |
| 181 | clear_tsk_thread_flag(prev, TIF_NOHZ); | 202 | set_tsk_thread_flag(next, TIF_NOHZ); |
| 182 | set_tsk_thread_flag(next, TIF_NOHZ); | ||
| 183 | } | ||
| 184 | } | 203 | } |
| 204 | |||
| 205 | #ifdef CONFIG_CONTEXT_TRACKING_FORCE | ||
| 206 | void __init context_tracking_init(void) | ||
| 207 | { | ||
| 208 | int cpu; | ||
| 209 | |||
| 210 | for_each_possible_cpu(cpu) | ||
| 211 | context_tracking_cpu_set(cpu); | ||
| 212 | } | ||
| 213 | #endif | ||
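A sketch of how a CPU gets enrolled with the new context_tracking_cpu_set() interface added above; the boot-time function and the cpumask argument are hypothetical, and the declaration is assumed to be exposed via <linux/context_tracking.h> as in this series.

#include <linux/context_tracking.h>
#include <linux/cpumask.h>
#include <linux/init.h>

/*
 * Mark every CPU in @mask as needing user/kernel transition tracking;
 * the first call flips the context_tracking_enabled static key on.
 */
static void __init example_enable_context_tracking(const struct cpumask *mask)
{
        int cpu;

        for_each_cpu(cpu, mask)
                context_tracking_cpu_set(cpu);
}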
diff --git a/kernel/cpu.c b/kernel/cpu.c index b2b227b82123..63aa50d7ce1e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus); | |||
| 113 | * get_online_cpus() not an api which is called all that often. | 113 | * get_online_cpus() not an api which is called all that often. |
| 114 | * | 114 | * |
| 115 | */ | 115 | */ |
| 116 | static void cpu_hotplug_begin(void) | 116 | void cpu_hotplug_begin(void) |
| 117 | { | 117 | { |
| 118 | cpu_hotplug.active_writer = current; | 118 | cpu_hotplug.active_writer = current; |
| 119 | 119 | ||
| @@ -127,7 +127,7 @@ static void cpu_hotplug_begin(void) | |||
| 127 | } | 127 | } |
| 128 | } | 128 | } |
| 129 | 129 | ||
| 130 | static void cpu_hotplug_done(void) | 130 | void cpu_hotplug_done(void) |
| 131 | { | 131 | { |
| 132 | cpu_hotplug.active_writer = NULL; | 132 | cpu_hotplug.active_writer = NULL; |
| 133 | mutex_unlock(&cpu_hotplug.lock); | 133 | mutex_unlock(&cpu_hotplug.lock); |
| @@ -154,10 +154,7 @@ void cpu_hotplug_enable(void) | |||
| 154 | cpu_maps_update_done(); | 154 | cpu_maps_update_done(); |
| 155 | } | 155 | } |
| 156 | 156 | ||
| 157 | #else /* #if CONFIG_HOTPLUG_CPU */ | 157 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 158 | static void cpu_hotplug_begin(void) {} | ||
| 159 | static void cpu_hotplug_done(void) {} | ||
| 160 | #endif /* #else #if CONFIG_HOTPLUG_CPU */ | ||
| 161 | 158 | ||
| 162 | /* Need to know about CPUs going up/down? */ | 159 | /* Need to know about CPUs going up/down? */ |
| 163 | int __ref register_cpu_notifier(struct notifier_block *nb) | 160 | int __ref register_cpu_notifier(struct notifier_block *nb) |
| @@ -311,6 +308,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 311 | } | 308 | } |
| 312 | smpboot_park_threads(cpu); | 309 | smpboot_park_threads(cpu); |
| 313 | 310 | ||
| 311 | /* | ||
| 312 | * By now we've cleared cpu_active_mask, wait for all preempt-disabled | ||
| 313 | * and RCU users of this state to go away such that all new such users | ||
| 314 | * will observe it. | ||
| 315 | * | ||
| 316 | * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might | ||
| 317 | * not imply sync_sched(), so explicitly call both. | ||
| 318 | */ | ||
| 319 | #ifdef CONFIG_PREEMPT | ||
| 320 | synchronize_sched(); | ||
| 321 | #endif | ||
| 322 | synchronize_rcu(); | ||
| 323 | |||
| 324 | /* | ||
| 325 | * So now all preempt/rcu users must observe !cpu_active(). | ||
| 326 | */ | ||
| 327 | |||
| 314 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); | 328 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); |
| 315 | if (err) { | 329 | if (err) { |
| 316 | /* CPU didn't die: tell everyone. Can't complain. */ | 330 | /* CPU didn't die: tell everyone. Can't complain. */ |
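The _cpu_down() hunk above waits for both RCU flavors so that every preempt-disabled or rcu_read_lock() reader observes !cpu_active() before the CPU is stopped. A generic sketch of the same pattern for a hypothetical flag:

#include <linux/rcupdate.h>

static bool example_active = true;      /* read under rcu_read_lock() or preempt_disable() */

static void example_deactivate(void)
{
        example_active = false;

        /*
         * With CONFIG_PREEMPT, synchronize_rcu() does not imply
         * synchronize_sched(), so wait for both grace periods; without
         * it, synchronize_rcu() already covers preempt-disabled readers.
         */
#ifdef CONFIG_PREEMPT
        synchronize_sched();
#endif
        synchronize_rcu();

        /* every reader that starts from here on observes !example_active */
}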
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index e695c0a0bcb5..988573a9a387 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c | |||
| @@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void) | |||
| 44 | rcu_idle_enter(); | 44 | rcu_idle_enter(); |
| 45 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | 45 | trace_cpu_idle_rcuidle(0, smp_processor_id()); |
| 46 | local_irq_enable(); | 46 | local_irq_enable(); |
| 47 | while (!need_resched()) | 47 | while (!tif_need_resched()) |
| 48 | cpu_relax(); | 48 | cpu_relax(); |
| 49 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); | 49 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
| 50 | rcu_idle_exit(); | 50 | rcu_idle_exit(); |
| @@ -92,8 +92,7 @@ static void cpu_idle_loop(void) | |||
| 92 | if (cpu_idle_force_poll || tick_check_broadcast_expired()) { | 92 | if (cpu_idle_force_poll || tick_check_broadcast_expired()) { |
| 93 | cpu_idle_poll(); | 93 | cpu_idle_poll(); |
| 94 | } else { | 94 | } else { |
| 95 | current_clr_polling(); | 95 | if (!current_clr_polling_and_test()) { |
| 96 | if (!need_resched()) { | ||
| 97 | stop_critical_timings(); | 96 | stop_critical_timings(); |
| 98 | rcu_idle_enter(); | 97 | rcu_idle_enter(); |
| 99 | arch_cpu_idle(); | 98 | arch_cpu_idle(); |
| @@ -103,9 +102,16 @@ static void cpu_idle_loop(void) | |||
| 103 | } else { | 102 | } else { |
| 104 | local_irq_enable(); | 103 | local_irq_enable(); |
| 105 | } | 104 | } |
| 106 | current_set_polling(); | 105 | __current_set_polling(); |
| 107 | } | 106 | } |
| 108 | arch_cpu_idle_exit(); | 107 | arch_cpu_idle_exit(); |
| 108 | /* | ||
| 109 | * We need to test and propagate the TIF_NEED_RESCHED | ||
| 110 | * bit here because we might not have sent the | ||
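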
| 111 | * reschedule IPI to idle tasks. | ||
| 112 | */ | ||
| 113 | if (tif_need_resched()) | ||
| 114 | set_preempt_need_resched(); | ||
| 109 | } | 115 | } |
| 110 | tick_nohz_idle_exit(); | 116 | tick_nohz_idle_exit(); |
| 111 | schedule_preempt_disabled(); | 117 | schedule_preempt_disabled(); |
| @@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state) | |||
| 129 | */ | 135 | */ |
| 130 | boot_init_stack_canary(); | 136 | boot_init_stack_canary(); |
| 131 | #endif | 137 | #endif |
| 132 | current_set_polling(); | 138 | __current_set_polling(); |
| 133 | arch_cpu_idle_prepare(); | 139 | arch_cpu_idle_prepare(); |
| 134 | cpu_idle_loop(); | 140 | cpu_idle_loop(); |
| 135 | } | 141 | } |
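A sketch of the polling idle pattern the hunks above switch to: spin directly on the thread-info TIF_NEED_RESCHED flag via tif_need_resched() instead of need_resched(). The function name is made up; real callers also bracket this with RCU idle enter/exit and tracing, as cpu_idle_poll() does.

#include <linux/irqflags.h>
#include <linux/thread_info.h>
#include <asm/processor.h>

static void example_poll_idle(void)
{
        local_irq_enable();
        /* wait for the scheduler to flag this CPU for rescheduling */
        while (!tif_need_resched())
                cpu_relax();
}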
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e5657788fedd..6bf981e13c43 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -68,10 +68,6 @@ | |||
| 68 | */ | 68 | */ |
| 69 | int number_of_cpusets __read_mostly; | 69 | int number_of_cpusets __read_mostly; |
| 70 | 70 | ||
| 71 | /* Forward declare cgroup structures */ | ||
| 72 | struct cgroup_subsys cpuset_subsys; | ||
| 73 | struct cpuset; | ||
| 74 | |||
| 75 | /* See "Frequency meter" comments, below. */ | 71 | /* See "Frequency meter" comments, below. */ |
| 76 | 72 | ||
| 77 | struct fmeter { | 73 | struct fmeter { |
| @@ -115,27 +111,20 @@ struct cpuset { | |||
| 115 | int relax_domain_level; | 111 | int relax_domain_level; |
| 116 | }; | 112 | }; |
| 117 | 113 | ||
| 118 | /* Retrieve the cpuset for a cgroup */ | 114 | static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) |
| 119 | static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) | ||
| 120 | { | 115 | { |
| 121 | return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), | 116 | return css ? container_of(css, struct cpuset, css) : NULL; |
| 122 | struct cpuset, css); | ||
| 123 | } | 117 | } |
| 124 | 118 | ||
| 125 | /* Retrieve the cpuset for a task */ | 119 | /* Retrieve the cpuset for a task */ |
| 126 | static inline struct cpuset *task_cs(struct task_struct *task) | 120 | static inline struct cpuset *task_cs(struct task_struct *task) |
| 127 | { | 121 | { |
| 128 | return container_of(task_subsys_state(task, cpuset_subsys_id), | 122 | return css_cs(task_css(task, cpuset_subsys_id)); |
| 129 | struct cpuset, css); | ||
| 130 | } | 123 | } |
| 131 | 124 | ||
| 132 | static inline struct cpuset *parent_cs(const struct cpuset *cs) | 125 | static inline struct cpuset *parent_cs(struct cpuset *cs) |
| 133 | { | 126 | { |
| 134 | struct cgroup *pcgrp = cs->css.cgroup->parent; | 127 | return css_cs(css_parent(&cs->css)); |
| 135 | |||
| 136 | if (pcgrp) | ||
| 137 | return cgroup_cs(pcgrp); | ||
| 138 | return NULL; | ||
| 139 | } | 128 | } |
| 140 | 129 | ||
| 141 | #ifdef CONFIG_NUMA | 130 | #ifdef CONFIG_NUMA |
| @@ -212,29 +201,30 @@ static struct cpuset top_cpuset = { | |||
| 212 | /** | 201 | /** |
| 213 | * cpuset_for_each_child - traverse online children of a cpuset | 202 | * cpuset_for_each_child - traverse online children of a cpuset |
| 214 | * @child_cs: loop cursor pointing to the current child | 203 | * @child_cs: loop cursor pointing to the current child |
| 215 | * @pos_cgrp: used for iteration | 204 | * @pos_css: used for iteration |
| 216 | * @parent_cs: target cpuset to walk children of | 205 | * @parent_cs: target cpuset to walk children of |
| 217 | * | 206 | * |
| 218 | * Walk @child_cs through the online children of @parent_cs. Must be used | 207 | * Walk @child_cs through the online children of @parent_cs. Must be used |
| 219 | * with RCU read locked. | 208 | * with RCU read locked. |
| 220 | */ | 209 | */ |
| 221 | #define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ | 210 | #define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ |
| 222 | cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ | 211 | css_for_each_child((pos_css), &(parent_cs)->css) \ |
| 223 | if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) | 212 | if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) |
| 224 | 213 | ||
| 225 | /** | 214 | /** |
| 226 | * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants | 215 | * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants |
| 227 | * @des_cs: loop cursor pointing to the current descendant | 216 | * @des_cs: loop cursor pointing to the current descendant |
| 228 | * @pos_cgrp: used for iteration | 217 | * @pos_css: used for iteration |
| 229 | * @root_cs: target cpuset to walk ancestor of | 218 | * @root_cs: target cpuset to walk ancestor of |
| 230 | * | 219 | * |
| 231 | * Walk @des_cs through the online descendants of @root_cs. Must be used | 220 | * Walk @des_cs through the online descendants of @root_cs. Must be used |
| 232 | * with RCU read locked. The caller may modify @pos_cgrp by calling | 221 | * with RCU read locked. The caller may modify @pos_css by calling |
| 233 | * cgroup_rightmost_descendant() to skip subtree. | 222 | * css_rightmost_descendant() to skip subtree. @root_cs is included in the |
| 223 | * iteration and the first node to be visited. | ||
| 234 | */ | 224 | */ |
| 235 | #define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ | 225 | #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ |
| 236 | cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ | 226 | css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ |
| 237 | if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) | 227 | if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) |
| 238 | 228 | ||
| 239 | /* | 229 | /* |
| 240 | * There are two global mutexes guarding cpuset structures - cpuset_mutex | 230 | * There are two global mutexes guarding cpuset structures - cpuset_mutex |
| @@ -320,8 +310,7 @@ static struct file_system_type cpuset_fs_type = { | |||
| 320 | * | 310 | * |
| 321 | * Call with callback_mutex held. | 311 | * Call with callback_mutex held. |
| 322 | */ | 312 | */ |
| 323 | static void guarantee_online_cpus(const struct cpuset *cs, | 313 | static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) |
| 324 | struct cpumask *pmask) | ||
| 325 | { | 314 | { |
| 326 | while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) | 315 | while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) |
| 327 | cs = parent_cs(cs); | 316 | cs = parent_cs(cs); |
| @@ -339,7 +328,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, | |||
| 339 | * | 328 | * |
| 340 | * Call with callback_mutex held. | 329 | * Call with callback_mutex held. |
| 341 | */ | 330 | */ |
| 342 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 331 | static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) |
| 343 | { | 332 | { |
| 344 | while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) | 333 | while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) |
| 345 | cs = parent_cs(cs); | 334 | cs = parent_cs(cs); |
| @@ -384,7 +373,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | |||
| 384 | * alloc_trial_cpuset - allocate a trial cpuset | 373 | * alloc_trial_cpuset - allocate a trial cpuset |
| 385 | * @cs: the cpuset that the trial cpuset duplicates | 374 | * @cs: the cpuset that the trial cpuset duplicates |
| 386 | */ | 375 | */ |
| 387 | static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) | 376 | static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) |
| 388 | { | 377 | { |
| 389 | struct cpuset *trial; | 378 | struct cpuset *trial; |
| 390 | 379 | ||
| @@ -431,9 +420,9 @@ static void free_trial_cpuset(struct cpuset *trial) | |||
| 431 | * Return 0 if valid, -errno if not. | 420 | * Return 0 if valid, -errno if not. |
| 432 | */ | 421 | */ |
| 433 | 422 | ||
| 434 | static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | 423 | static int validate_change(struct cpuset *cur, struct cpuset *trial) |
| 435 | { | 424 | { |
| 436 | struct cgroup *cgrp; | 425 | struct cgroup_subsys_state *css; |
| 437 | struct cpuset *c, *par; | 426 | struct cpuset *c, *par; |
| 438 | int ret; | 427 | int ret; |
| 439 | 428 | ||
| @@ -441,7 +430,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
| 441 | 430 | ||
| 442 | /* Each of our child cpusets must be a subset of us */ | 431 | /* Each of our child cpusets must be a subset of us */ |
| 443 | ret = -EBUSY; | 432 | ret = -EBUSY; |
| 444 | cpuset_for_each_child(c, cgrp, cur) | 433 | cpuset_for_each_child(c, css, cur) |
| 445 | if (!is_cpuset_subset(c, trial)) | 434 | if (!is_cpuset_subset(c, trial)) |
| 446 | goto out; | 435 | goto out; |
| 447 | 436 | ||
| @@ -462,7 +451,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
| 462 | * overlap | 451 | * overlap |
| 463 | */ | 452 | */ |
| 464 | ret = -EINVAL; | 453 | ret = -EINVAL; |
| 465 | cpuset_for_each_child(c, cgrp, par) { | 454 | cpuset_for_each_child(c, css, par) { |
| 466 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && | 455 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && |
| 467 | c != cur && | 456 | c != cur && |
| 468 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) | 457 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) |
| @@ -475,13 +464,17 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
| 475 | 464 | ||
| 476 | /* | 465 | /* |
| 477 | * Cpusets with tasks - existing or newly being attached - can't | 466 | * Cpusets with tasks - existing or newly being attached - can't |
| 478 | * have empty cpus_allowed or mems_allowed. | 467 | * be changed to have empty cpus_allowed or mems_allowed. |
| 479 | */ | 468 | */ |
| 480 | ret = -ENOSPC; | 469 | ret = -ENOSPC; |
| 481 | if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && | 470 | if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) { |
| 482 | (cpumask_empty(trial->cpus_allowed) && | 471 | if (!cpumask_empty(cur->cpus_allowed) && |
| 483 | nodes_empty(trial->mems_allowed))) | 472 | cpumask_empty(trial->cpus_allowed)) |
| 484 | goto out; | 473 | goto out; |
| 474 | if (!nodes_empty(cur->mems_allowed) && | ||
| 475 | nodes_empty(trial->mems_allowed)) | ||
| 476 | goto out; | ||
| 477 | } | ||
| 485 | 478 | ||
| 486 | ret = 0; | 479 | ret = 0; |
| 487 | out: | 480 | out: |
| @@ -511,13 +504,16 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, | |||
| 511 | struct cpuset *root_cs) | 504 | struct cpuset *root_cs) |
| 512 | { | 505 | { |
| 513 | struct cpuset *cp; | 506 | struct cpuset *cp; |
| 514 | struct cgroup *pos_cgrp; | 507 | struct cgroup_subsys_state *pos_css; |
| 515 | 508 | ||
| 516 | rcu_read_lock(); | 509 | rcu_read_lock(); |
| 517 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | 510 | cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { |
| 511 | if (cp == root_cs) | ||
| 512 | continue; | ||
| 513 | |||
| 518 | /* skip the whole subtree if @cp doesn't have any CPU */ | 514 | /* skip the whole subtree if @cp doesn't have any CPU */ |
| 519 | if (cpumask_empty(cp->cpus_allowed)) { | 515 | if (cpumask_empty(cp->cpus_allowed)) { |
| 520 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | 516 | pos_css = css_rightmost_descendant(pos_css); |
| 521 | continue; | 517 | continue; |
| 522 | } | 518 | } |
| 523 | 519 | ||
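The css-based pre-order iterator used above visits @root_cs itself as the first element, which the old cgroup-based iterator did not, so walkers that only want proper descendants now skip the root explicitly. A minimal sketch of the resulting walk, with a hypothetical function name and the same subtree-pruning idiom as update_domain_attr_tree():

    static void walk_nonempty_descendants(struct cpuset *root_cs)
    {
            struct cpuset *cp;
            struct cgroup_subsys_state *pos_css;

            rcu_read_lock();
            cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
                    if (cp == root_cs)
                            continue;       /* the new iterator includes the root itself */

                    /* prune: children of an empty cpuset cannot have CPUs either */
                    if (cpumask_empty(cp->cpus_allowed)) {
                            pos_css = css_rightmost_descendant(pos_css);
                            continue;
                    }

                    /* ... per-cpuset work ... */
            }
            rcu_read_unlock();
    }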
| @@ -592,7 +588,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
| 592 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | 588 | struct sched_domain_attr *dattr; /* attributes for custom domains */ |
| 593 | int ndoms = 0; /* number of sched domains in result */ | 589 | int ndoms = 0; /* number of sched domains in result */ |
| 594 | int nslot; /* next empty doms[] struct cpumask slot */ | 590 | int nslot; /* next empty doms[] struct cpumask slot */ |
| 595 | struct cgroup *pos_cgrp; | 591 | struct cgroup_subsys_state *pos_css; |
| 596 | 592 | ||
| 597 | doms = NULL; | 593 | doms = NULL; |
| 598 | dattr = NULL; | 594 | dattr = NULL; |
| @@ -621,7 +617,9 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
| 621 | csn = 0; | 617 | csn = 0; |
| 622 | 618 | ||
| 623 | rcu_read_lock(); | 619 | rcu_read_lock(); |
| 624 | cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { | 620 | cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { |
| 621 | if (cp == &top_cpuset) | ||
| 622 | continue; | ||
| 625 | /* | 623 | /* |
| 626 | * Continue traversing beyond @cp iff @cp has some CPUs and | 624 | * Continue traversing beyond @cp iff @cp has some CPUs and |
| 627 | * isn't load balancing. The former is obvious. The | 625 | * isn't load balancing. The former is obvious. The |
| @@ -638,7 +636,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
| 638 | csa[csn++] = cp; | 636 | csa[csn++] = cp; |
| 639 | 637 | ||
| 640 | /* skip @cp's subtree */ | 638 | /* skip @cp's subtree */ |
| 641 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | 639 | pos_css = css_rightmost_descendant(pos_css); |
| 642 | } | 640 | } |
| 643 | rcu_read_unlock(); | 641 | rcu_read_unlock(); |
| 644 | 642 | ||
| @@ -833,52 +831,45 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) | |||
| 833 | /** | 831 | /** |
| 834 | * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's | 832 | * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's |
| 835 | * @tsk: task to test | 833 | * @tsk: task to test |
| 836 | * @tsk: task to test | 834 | * @data: cpuset @tsk belongs to |
| 837 | * | 835 | * |
| 838 | * Called by cgroup_scan_tasks() for each task in a cgroup whose | 836 | * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed |
| 839 | * cpus_allowed mask needs to be changed. | 837 | * mask needs to be changed. |
| 840 | * | 838 | * |
| 841 | * We don't need to re-check for the cgroup/cpuset membership, since we're | 839 | * We don't need to re-check for the cgroup/cpuset membership, since we're |
| 842 | * holding cpuset_mutex at this point. | 840 | * holding cpuset_mutex at this point. |
| 843 | */ | 841 | */ |
| 844 | static void cpuset_change_cpumask(struct task_struct *tsk, | 842 | static void cpuset_change_cpumask(struct task_struct *tsk, void *data) |
| 845 | struct cgroup_scanner *scan) | ||
| 846 | { | 843 | { |
| 847 | struct cpuset *cpus_cs; | 844 | struct cpuset *cs = data; |
| 845 | struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); | ||
| 848 | 846 | ||
| 849 | cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg)); | ||
| 850 | set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); | 847 | set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); |
| 851 | } | 848 | } |
| 852 | 849 | ||
| 853 | /** | 850 | /** |
| 854 | * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. | 851 | * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. |
| 855 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed | 852 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed |
| 856 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 853 | * @heap: if NULL, defer allocating heap memory to css_scan_tasks() |
| 857 | * | 854 | * |
| 858 | * Called with cpuset_mutex held | 855 | * Called with cpuset_mutex held |
| 859 | * | 856 | * |
| 860 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 857 | * The css_scan_tasks() function will scan all the tasks in a cgroup, |
| 861 | * calling callback functions for each. | 858 | * calling callback functions for each. |
| 862 | * | 859 | * |
| 863 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | 860 | * No return value. It's guaranteed that css_scan_tasks() always returns 0 |
| 864 | * if @heap != NULL. | 861 | * if @heap != NULL. |
| 865 | */ | 862 | */ |
| 866 | static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) | 863 | static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) |
| 867 | { | 864 | { |
| 868 | struct cgroup_scanner scan; | 865 | css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); |
| 869 | |||
| 870 | scan.cg = cs->css.cgroup; | ||
| 871 | scan.test_task = NULL; | ||
| 872 | scan.process_task = cpuset_change_cpumask; | ||
| 873 | scan.heap = heap; | ||
| 874 | cgroup_scan_tasks(&scan); | ||
| 875 | } | 866 | } |
| 876 | 867 | ||
| 877 | /* | 868 | /* |
| 878 | * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. | 869 | * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. |
| 879 | * @root_cs: the root cpuset of the hierarchy | 870 | * @root_cs: the root cpuset of the hierarchy |
| 880 | * @update_root: update root cpuset or not? | 871 | * @update_root: update root cpuset or not? |
| 881 | * @heap: the heap used by cgroup_scan_tasks() | 872 | * @heap: the heap used by css_scan_tasks() |
| 882 | * | 873 | * |
| 883 | * This will update cpumasks of tasks in @root_cs and all other empty cpusets | 874 | * This will update cpumasks of tasks in @root_cs and all other empty cpusets |
| 884 | * which take on cpumask of @root_cs. | 875 | * which take on cpumask of @root_cs. |
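struct cgroup_scanner is gone: the per-task callback now takes a bare void *data, and the whole scan collapses into one css_scan_tasks() call with the signature used above (css, optional test callback, process callback, data, heap). A minimal sketch of the new shape, with hypothetical names:

    /* process callback: @data is whatever the caller handed to css_scan_tasks() */
    static void my_change_cpumask(struct task_struct *task, void *data)
    {
            struct cpuset *cs = data;

            set_cpus_allowed_ptr(task, effective_cpumask_cpuset(cs)->cpus_allowed);
    }

    static void my_update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
    {
            /* NULL test callback means "process every task in the css" */
            css_scan_tasks(&cs->css, NULL, my_change_cpumask, cs, heap);
    }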
| @@ -889,17 +880,19 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, | |||
| 889 | bool update_root, struct ptr_heap *heap) | 880 | bool update_root, struct ptr_heap *heap) |
| 890 | { | 881 | { |
| 891 | struct cpuset *cp; | 882 | struct cpuset *cp; |
| 892 | struct cgroup *pos_cgrp; | 883 | struct cgroup_subsys_state *pos_css; |
| 893 | |||
| 894 | if (update_root) | ||
| 895 | update_tasks_cpumask(root_cs, heap); | ||
| 896 | 884 | ||
| 897 | rcu_read_lock(); | 885 | rcu_read_lock(); |
| 898 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | 886 | cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { |
| 899 | /* skip the whole subtree if @cp have some CPU */ | 887 | if (cp == root_cs) { |
| 900 | if (!cpumask_empty(cp->cpus_allowed)) { | 888 | if (!update_root) |
| 901 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | 889 | continue; |
| 902 | continue; | 890 | } else { |
| 891 | /* skip the whole subtree if @cp has some CPUs */ | ||
| 892 | if (!cpumask_empty(cp->cpus_allowed)) { | ||
| 893 | pos_css = css_rightmost_descendant(pos_css); | ||
| 894 | continue; | ||
| 895 | } | ||
| 903 | } | 896 | } |
| 904 | if (!css_tryget(&cp->css)) | 897 | if (!css_tryget(&cp->css)) |
| 905 | continue; | 898 | continue; |
| @@ -1055,20 +1048,24 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, | |||
| 1055 | task_unlock(tsk); | 1048 | task_unlock(tsk); |
| 1056 | } | 1049 | } |
| 1057 | 1050 | ||
| 1051 | struct cpuset_change_nodemask_arg { | ||
| 1052 | struct cpuset *cs; | ||
| 1053 | nodemask_t *newmems; | ||
| 1054 | }; | ||
| 1055 | |||
| 1058 | /* | 1056 | /* |
| 1059 | * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy | 1057 | * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy |
| 1060 | * of it to cpuset's new mems_allowed, and migrate pages to new nodes if | 1058 | * of it to cpuset's new mems_allowed, and migrate pages to new nodes if |
| 1061 | * memory_migrate flag is set. Called with cpuset_mutex held. | 1059 | * memory_migrate flag is set. Called with cpuset_mutex held. |
| 1062 | */ | 1060 | */ |
| 1063 | static void cpuset_change_nodemask(struct task_struct *p, | 1061 | static void cpuset_change_nodemask(struct task_struct *p, void *data) |
| 1064 | struct cgroup_scanner *scan) | ||
| 1065 | { | 1062 | { |
| 1066 | struct cpuset *cs = cgroup_cs(scan->cg); | 1063 | struct cpuset_change_nodemask_arg *arg = data; |
| 1064 | struct cpuset *cs = arg->cs; | ||
| 1067 | struct mm_struct *mm; | 1065 | struct mm_struct *mm; |
| 1068 | int migrate; | 1066 | int migrate; |
| 1069 | nodemask_t *newmems = scan->data; | ||
| 1070 | 1067 | ||
| 1071 | cpuset_change_task_nodemask(p, newmems); | 1068 | cpuset_change_task_nodemask(p, arg->newmems); |
| 1072 | 1069 | ||
| 1073 | mm = get_task_mm(p); | 1070 | mm = get_task_mm(p); |
| 1074 | if (!mm) | 1071 | if (!mm) |
| @@ -1078,7 +1075,7 @@ static void cpuset_change_nodemask(struct task_struct *p, | |||
| 1078 | 1075 | ||
| 1079 | mpol_rebind_mm(mm, &cs->mems_allowed); | 1076 | mpol_rebind_mm(mm, &cs->mems_allowed); |
| 1080 | if (migrate) | 1077 | if (migrate) |
| 1081 | cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); | 1078 | cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems); |
| 1082 | mmput(mm); | 1079 | mmput(mm); |
| 1083 | } | 1080 | } |
| 1084 | 1081 | ||
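When a callback needs more than one value, the void *data slot carries a small on-stack argument struct instead of the old scan->cg / scan->data pair, exactly as cpuset_change_nodemask_arg does above. A minimal sketch of the pattern with hypothetical names:

    struct my_scan_arg {
            struct cpuset *cs;
            nodemask_t *newmems;
    };

    static void my_change_nodemask(struct task_struct *p, void *data)
    {
            struct my_scan_arg *arg = data;

            cpuset_change_task_nodemask(p, arg->newmems);
            /* arg->cs stays available for the mm/mempolicy rebinding step */
    }

    static void my_update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
    {
            static nodemask_t newmems;      /* protected by cpuset_mutex */
            struct my_scan_arg arg = { .cs = cs, .newmems = &newmems };

            guarantee_online_mems(effective_nodemask_cpuset(cs), &newmems);
            css_scan_tasks(&cs->css, NULL, my_change_nodemask, &arg, heap);
    }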
| @@ -1087,28 +1084,22 @@ static void *cpuset_being_rebound; | |||
| 1087 | /** | 1084 | /** |
| 1088 | * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. | 1085 | * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. |
| 1089 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed | 1086 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed |
| 1090 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 1087 | * @heap: if NULL, defer allocating heap memory to css_scan_tasks() |
| 1091 | * | 1088 | * |
| 1092 | * Called with cpuset_mutex held | 1089 | * Called with cpuset_mutex held. No return value. It's guaranteed that |
| 1093 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | 1090 | * css_scan_tasks() always returns 0 if @heap != NULL. |
| 1094 | * if @heap != NULL. | ||
| 1095 | */ | 1091 | */ |
| 1096 | static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) | 1092 | static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) |
| 1097 | { | 1093 | { |
| 1098 | static nodemask_t newmems; /* protected by cpuset_mutex */ | 1094 | static nodemask_t newmems; /* protected by cpuset_mutex */ |
| 1099 | struct cgroup_scanner scan; | ||
| 1100 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); | 1095 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); |
| 1096 | struct cpuset_change_nodemask_arg arg = { .cs = cs, | ||
| 1097 | .newmems = &newmems }; | ||
| 1101 | 1098 | ||
| 1102 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ | 1099 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ |
| 1103 | 1100 | ||
| 1104 | guarantee_online_mems(mems_cs, &newmems); | 1101 | guarantee_online_mems(mems_cs, &newmems); |
| 1105 | 1102 | ||
| 1106 | scan.cg = cs->css.cgroup; | ||
| 1107 | scan.test_task = NULL; | ||
| 1108 | scan.process_task = cpuset_change_nodemask; | ||
| 1109 | scan.heap = heap; | ||
| 1110 | scan.data = &newmems; | ||
| 1111 | |||
| 1112 | /* | 1103 | /* |
| 1113 | * The mpol_rebind_mm() call takes mmap_sem, which we couldn't | 1104 | * The mpol_rebind_mm() call takes mmap_sem, which we couldn't |
| 1114 | * take while holding tasklist_lock. Forks can happen - the | 1105 | * take while holding tasklist_lock. Forks can happen - the |
| @@ -1119,7 +1110,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) | |||
| 1119 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() | 1110 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() |
| 1120 | * is idempotent. Also migrate pages in each mm to new nodes. | 1111 | * is idempotent. Also migrate pages in each mm to new nodes. |
| 1121 | */ | 1112 | */ |
| 1122 | cgroup_scan_tasks(&scan); | 1113 | css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); |
| 1123 | 1114 | ||
| 1124 | /* | 1115 | /* |
| 1125 | * All the tasks' nodemasks have been updated, update | 1116 | * All the tasks' nodemasks have been updated, update |
| @@ -1135,7 +1126,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) | |||
| 1135 | * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. | 1126 | * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. |
| 1136 | * @cs: the root cpuset of the hierarchy | 1127 | * @cs: the root cpuset of the hierarchy |
| 1137 | * @update_root: update the root cpuset or not? | 1128 | * @update_root: update the root cpuset or not? |
| 1138 | * @heap: the heap used by cgroup_scan_tasks() | 1129 | * @heap: the heap used by css_scan_tasks() |
| 1139 | * | 1130 | * |
| 1140 | * This will update nodemasks of tasks in @root_cs and all other empty cpusets | 1131 | * This will update nodemasks of tasks in @root_cs and all other empty cpusets |
| 1141 | * which take on nodemask of @root_cs. | 1132 | * which take on nodemask of @root_cs. |
| @@ -1146,17 +1137,19 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, | |||
| 1146 | bool update_root, struct ptr_heap *heap) | 1137 | bool update_root, struct ptr_heap *heap) |
| 1147 | { | 1138 | { |
| 1148 | struct cpuset *cp; | 1139 | struct cpuset *cp; |
| 1149 | struct cgroup *pos_cgrp; | 1140 | struct cgroup_subsys_state *pos_css; |
| 1150 | |||
| 1151 | if (update_root) | ||
| 1152 | update_tasks_nodemask(root_cs, heap); | ||
| 1153 | 1141 | ||
| 1154 | rcu_read_lock(); | 1142 | rcu_read_lock(); |
| 1155 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | 1143 | cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { |
| 1156 | /* skip the whole subtree if @cp have some CPU */ | 1144 | if (cp == root_cs) { |
| 1157 | if (!nodes_empty(cp->mems_allowed)) { | 1145 | if (!update_root) |
| 1158 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | 1146 | continue; |
| 1159 | continue; | 1147 | } else { |
| 1148 | /* skip the whole subtree if @cp has some memory nodes */ | ||
| 1149 | if (!nodes_empty(cp->mems_allowed)) { | ||
| 1150 | pos_css = css_rightmost_descendant(pos_css); | ||
| 1151 | continue; | ||
| 1152 | } | ||
| 1160 | } | 1153 | } |
| 1161 | if (!css_tryget(&cp->css)) | 1154 | if (!css_tryget(&cp->css)) |
| 1162 | continue; | 1155 | continue; |
| @@ -1263,44 +1256,39 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
| 1263 | return 0; | 1256 | return 0; |
| 1264 | } | 1257 | } |
| 1265 | 1258 | ||
| 1266 | /* | 1259 | /** |
| 1267 | * cpuset_change_flag - make a task's spread flags the same as its cpuset's | 1260 | * cpuset_change_flag - make a task's spread flags the same as its cpuset's |
| 1268 | * @tsk: task to be updated | 1261 | * @tsk: task to be updated |
| 1269 | * @scan: struct cgroup_scanner containing the cgroup of the task | 1262 | * @data: cpuset @tsk belongs to |
| 1270 | * | 1263 | * |
| 1271 | * Called by cgroup_scan_tasks() for each task in a cgroup. | 1264 | * Called by css_scan_tasks() for each task in a cgroup. |
| 1272 | * | 1265 | * |
| 1273 | * We don't need to re-check for the cgroup/cpuset membership, since we're | 1266 | * We don't need to re-check for the cgroup/cpuset membership, since we're |
| 1274 | * holding cpuset_mutex at this point. | 1267 | * holding cpuset_mutex at this point. |
| 1275 | */ | 1268 | */ |
| 1276 | static void cpuset_change_flag(struct task_struct *tsk, | 1269 | static void cpuset_change_flag(struct task_struct *tsk, void *data) |
| 1277 | struct cgroup_scanner *scan) | ||
| 1278 | { | 1270 | { |
| 1279 | cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); | 1271 | struct cpuset *cs = data; |
| 1272 | |||
| 1273 | cpuset_update_task_spread_flag(cs, tsk); | ||
| 1280 | } | 1274 | } |
| 1281 | 1275 | ||
| 1282 | /* | 1276 | /** |
| 1283 | * update_tasks_flags - update the spread flags of tasks in the cpuset. | 1277 | * update_tasks_flags - update the spread flags of tasks in the cpuset. |
| 1284 | * @cs: the cpuset in which each task's spread flags needs to be changed | 1278 | * @cs: the cpuset in which each task's spread flags needs to be changed |
| 1285 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 1279 | * @heap: if NULL, defer allocating heap memory to css_scan_tasks() |
| 1286 | * | 1280 | * |
| 1287 | * Called with cpuset_mutex held | 1281 | * Called with cpuset_mutex held |
| 1288 | * | 1282 | * |
| 1289 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 1283 | * The css_scan_tasks() function will scan all the tasks in a cgroup, |
| 1290 | * calling callback functions for each. | 1284 | * calling callback functions for each. |
| 1291 | * | 1285 | * |
| 1292 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | 1286 | * No return value. It's guaranteed that css_scan_tasks() always returns 0 |
| 1293 | * if @heap != NULL. | 1287 | * if @heap != NULL. |
| 1294 | */ | 1288 | */ |
| 1295 | static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) | 1289 | static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) |
| 1296 | { | 1290 | { |
| 1297 | struct cgroup_scanner scan; | 1291 | css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); |
| 1298 | |||
| 1299 | scan.cg = cs->css.cgroup; | ||
| 1300 | scan.test_task = NULL; | ||
| 1301 | scan.process_task = cpuset_change_flag; | ||
| 1302 | scan.heap = heap; | ||
| 1303 | cgroup_scan_tasks(&scan); | ||
| 1304 | } | 1292 | } |
| 1305 | 1293 | ||
| 1306 | /* | 1294 | /* |
| @@ -1458,9 +1446,10 @@ static int fmeter_getrate(struct fmeter *fmp) | |||
| 1458 | } | 1446 | } |
| 1459 | 1447 | ||
| 1460 | /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ | 1448 | /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ |
| 1461 | static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 1449 | static int cpuset_can_attach(struct cgroup_subsys_state *css, |
| 1450 | struct cgroup_taskset *tset) | ||
| 1462 | { | 1451 | { |
| 1463 | struct cpuset *cs = cgroup_cs(cgrp); | 1452 | struct cpuset *cs = css_cs(css); |
| 1464 | struct task_struct *task; | 1453 | struct task_struct *task; |
| 1465 | int ret; | 1454 | int ret; |
| 1466 | 1455 | ||
| @@ -1471,11 +1460,11 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
| 1471 | * flag is set. | 1460 | * flag is set. |
| 1472 | */ | 1461 | */ |
| 1473 | ret = -ENOSPC; | 1462 | ret = -ENOSPC; |
| 1474 | if (!cgroup_sane_behavior(cgrp) && | 1463 | if (!cgroup_sane_behavior(css->cgroup) && |
| 1475 | (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) | 1464 | (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) |
| 1476 | goto out_unlock; | 1465 | goto out_unlock; |
| 1477 | 1466 | ||
| 1478 | cgroup_taskset_for_each(task, cgrp, tset) { | 1467 | cgroup_taskset_for_each(task, css, tset) { |
| 1479 | /* | 1468 | /* |
| 1480 | * Kthreads which disallow setaffinity shouldn't be moved | 1469 | * Kthreads which disallow setaffinity shouldn't be moved |
| 1481 | * to a new cpuset; we don't want to change their cpu | 1470 | * to a new cpuset; we don't want to change their cpu |
| @@ -1504,11 +1493,11 @@ out_unlock: | |||
| 1504 | return ret; | 1493 | return ret; |
| 1505 | } | 1494 | } |
| 1506 | 1495 | ||
| 1507 | static void cpuset_cancel_attach(struct cgroup *cgrp, | 1496 | static void cpuset_cancel_attach(struct cgroup_subsys_state *css, |
| 1508 | struct cgroup_taskset *tset) | 1497 | struct cgroup_taskset *tset) |
| 1509 | { | 1498 | { |
| 1510 | mutex_lock(&cpuset_mutex); | 1499 | mutex_lock(&cpuset_mutex); |
| 1511 | cgroup_cs(cgrp)->attach_in_progress--; | 1500 | css_cs(css)->attach_in_progress--; |
| 1512 | mutex_unlock(&cpuset_mutex); | 1501 | mutex_unlock(&cpuset_mutex); |
| 1513 | } | 1502 | } |
| 1514 | 1503 | ||
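Every cgroup callback in this file is converted from struct cgroup * to struct cgroup_subsys_state *, and cgroup_taskset_for_each() is keyed by css as well. A minimal sketch of a converted can_attach/cancel_attach pair; the cpuset_mutex locking and most checks are elided, only one representative per-task test is shown, and the names are hypothetical:

    static int my_can_attach(struct cgroup_subsys_state *css,
                             struct cgroup_taskset *tset)
    {
            struct cpuset *cs = css_cs(css);
            struct task_struct *task;

            cgroup_taskset_for_each(task, css, tset) {
                    /* kthreads that forbid setaffinity must not be moved */
                    if (task->flags & PF_NO_SETAFFINITY)
                            return -EINVAL;
            }

            cs->attach_in_progress++;
            return 0;
    }

    static void my_cancel_attach(struct cgroup_subsys_state *css,
                                 struct cgroup_taskset *tset)
    {
            css_cs(css)->attach_in_progress--;
    }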
| @@ -1519,16 +1508,18 @@ static void cpuset_cancel_attach(struct cgroup *cgrp, | |||
| 1519 | */ | 1508 | */ |
| 1520 | static cpumask_var_t cpus_attach; | 1509 | static cpumask_var_t cpus_attach; |
| 1521 | 1510 | ||
| 1522 | static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 1511 | static void cpuset_attach(struct cgroup_subsys_state *css, |
| 1512 | struct cgroup_taskset *tset) | ||
| 1523 | { | 1513 | { |
| 1524 | /* static buf protected by cpuset_mutex */ | 1514 | /* static buf protected by cpuset_mutex */ |
| 1525 | static nodemask_t cpuset_attach_nodemask_to; | 1515 | static nodemask_t cpuset_attach_nodemask_to; |
| 1526 | struct mm_struct *mm; | 1516 | struct mm_struct *mm; |
| 1527 | struct task_struct *task; | 1517 | struct task_struct *task; |
| 1528 | struct task_struct *leader = cgroup_taskset_first(tset); | 1518 | struct task_struct *leader = cgroup_taskset_first(tset); |
| 1529 | struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); | 1519 | struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset, |
| 1530 | struct cpuset *cs = cgroup_cs(cgrp); | 1520 | cpuset_subsys_id); |
| 1531 | struct cpuset *oldcs = cgroup_cs(oldcgrp); | 1521 | struct cpuset *cs = css_cs(css); |
| 1522 | struct cpuset *oldcs = css_cs(oldcss); | ||
| 1532 | struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); | 1523 | struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); |
| 1533 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); | 1524 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); |
| 1534 | 1525 | ||
| @@ -1542,7 +1533,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
| 1542 | 1533 | ||
| 1543 | guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); | 1534 | guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); |
| 1544 | 1535 | ||
| 1545 | cgroup_taskset_for_each(task, cgrp, tset) { | 1536 | cgroup_taskset_for_each(task, css, tset) { |
| 1546 | /* | 1537 | /* |
| 1547 | * can_attach beforehand should guarantee that this doesn't | 1538 | * can_attach beforehand should guarantee that this doesn't |
| 1548 | * fail. TODO: have a better way to handle failure here | 1539 | * fail. TODO: have a better way to handle failure here |
| @@ -1604,15 +1595,18 @@ typedef enum { | |||
| 1604 | FILE_SPREAD_SLAB, | 1595 | FILE_SPREAD_SLAB, |
| 1605 | } cpuset_filetype_t; | 1596 | } cpuset_filetype_t; |
| 1606 | 1597 | ||
| 1607 | static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | 1598 | static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, |
| 1599 | u64 val) | ||
| 1608 | { | 1600 | { |
| 1609 | struct cpuset *cs = cgroup_cs(cgrp); | 1601 | struct cpuset *cs = css_cs(css); |
| 1610 | cpuset_filetype_t type = cft->private; | 1602 | cpuset_filetype_t type = cft->private; |
| 1611 | int retval = -ENODEV; | 1603 | int retval = 0; |
| 1612 | 1604 | ||
| 1613 | mutex_lock(&cpuset_mutex); | 1605 | mutex_lock(&cpuset_mutex); |
| 1614 | if (!is_cpuset_online(cs)) | 1606 | if (!is_cpuset_online(cs)) { |
| 1607 | retval = -ENODEV; | ||
| 1615 | goto out_unlock; | 1608 | goto out_unlock; |
| 1609 | } | ||
| 1616 | 1610 | ||
| 1617 | switch (type) { | 1611 | switch (type) { |
| 1618 | case FILE_CPU_EXCLUSIVE: | 1612 | case FILE_CPU_EXCLUSIVE: |
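retval now starts at 0 and -ENODEV is assigned only inside the offline check, so switch cases that legitimately leave retval untouched no longer return a stale -ENODEV on success. A sketch of the resulting control flow, with a hypothetical wrapper name and one case body standing in for the rest:

    static int my_write_u64(struct cpuset *cs, cpuset_filetype_t type, u64 val)
    {
            int retval = 0;

            mutex_lock(&cpuset_mutex);
            if (!is_cpuset_online(cs)) {
                    retval = -ENODEV;       /* error is scoped to this branch */
                    goto out_unlock;
            }

            switch (type) {
            case FILE_CPU_EXCLUSIVE:
                    retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
                    break;
            /* ... cases that never fail simply leave retval == 0 ... */
            default:
                    retval = -EINVAL;
                    break;
            }
    out_unlock:
            mutex_unlock(&cpuset_mutex);
            return retval;
    }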
| @@ -1651,9 +1645,10 @@ out_unlock: | |||
| 1651 | return retval; | 1645 | return retval; |
| 1652 | } | 1646 | } |
| 1653 | 1647 | ||
| 1654 | static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) | 1648 | static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, |
| 1649 | s64 val) | ||
| 1655 | { | 1650 | { |
| 1656 | struct cpuset *cs = cgroup_cs(cgrp); | 1651 | struct cpuset *cs = css_cs(css); |
| 1657 | cpuset_filetype_t type = cft->private; | 1652 | cpuset_filetype_t type = cft->private; |
| 1658 | int retval = -ENODEV; | 1653 | int retval = -ENODEV; |
| 1659 | 1654 | ||
| @@ -1677,10 +1672,10 @@ out_unlock: | |||
| 1677 | /* | 1672 | /* |
| 1678 | * Common handling for a write to a "cpus" or "mems" file. | 1673 | * Common handling for a write to a "cpus" or "mems" file. |
| 1679 | */ | 1674 | */ |
| 1680 | static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | 1675 | static int cpuset_write_resmask(struct cgroup_subsys_state *css, |
| 1681 | const char *buf) | 1676 | struct cftype *cft, const char *buf) |
| 1682 | { | 1677 | { |
| 1683 | struct cpuset *cs = cgroup_cs(cgrp); | 1678 | struct cpuset *cs = css_cs(css); |
| 1684 | struct cpuset *trialcs; | 1679 | struct cpuset *trialcs; |
| 1685 | int retval = -ENODEV; | 1680 | int retval = -ENODEV; |
| 1686 | 1681 | ||
| @@ -1759,13 +1754,12 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
| 1759 | return count; | 1754 | return count; |
| 1760 | } | 1755 | } |
| 1761 | 1756 | ||
| 1762 | static ssize_t cpuset_common_file_read(struct cgroup *cgrp, | 1757 | static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css, |
| 1763 | struct cftype *cft, | 1758 | struct cftype *cft, struct file *file, |
| 1764 | struct file *file, | 1759 | char __user *buf, size_t nbytes, |
| 1765 | char __user *buf, | 1760 | loff_t *ppos) |
| 1766 | size_t nbytes, loff_t *ppos) | ||
| 1767 | { | 1761 | { |
| 1768 | struct cpuset *cs = cgroup_cs(cgrp); | 1762 | struct cpuset *cs = css_cs(css); |
| 1769 | cpuset_filetype_t type = cft->private; | 1763 | cpuset_filetype_t type = cft->private; |
| 1770 | char *page; | 1764 | char *page; |
| 1771 | ssize_t retval = 0; | 1765 | ssize_t retval = 0; |
| @@ -1795,9 +1789,9 @@ out: | |||
| 1795 | return retval; | 1789 | return retval; |
| 1796 | } | 1790 | } |
| 1797 | 1791 | ||
| 1798 | static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) | 1792 | static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) |
| 1799 | { | 1793 | { |
| 1800 | struct cpuset *cs = cgroup_cs(cgrp); | 1794 | struct cpuset *cs = css_cs(css); |
| 1801 | cpuset_filetype_t type = cft->private; | 1795 | cpuset_filetype_t type = cft->private; |
| 1802 | switch (type) { | 1796 | switch (type) { |
| 1803 | case FILE_CPU_EXCLUSIVE: | 1797 | case FILE_CPU_EXCLUSIVE: |
| @@ -1826,9 +1820,9 @@ static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) | |||
| 1826 | return 0; | 1820 | return 0; |
| 1827 | } | 1821 | } |
| 1828 | 1822 | ||
| 1829 | static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) | 1823 | static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) |
| 1830 | { | 1824 | { |
| 1831 | struct cpuset *cs = cgroup_cs(cgrp); | 1825 | struct cpuset *cs = css_cs(css); |
| 1832 | cpuset_filetype_t type = cft->private; | 1826 | cpuset_filetype_t type = cft->private; |
| 1833 | switch (type) { | 1827 | switch (type) { |
| 1834 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | 1828 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: |
| @@ -1943,11 +1937,12 @@ static struct cftype files[] = { | |||
| 1943 | * cgrp: control group that the new cpuset will be part of | 1937 | * cgrp: control group that the new cpuset will be part of |
| 1944 | */ | 1938 | */ |
| 1945 | 1939 | ||
| 1946 | static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) | 1940 | static struct cgroup_subsys_state * |
| 1941 | cpuset_css_alloc(struct cgroup_subsys_state *parent_css) | ||
| 1947 | { | 1942 | { |
| 1948 | struct cpuset *cs; | 1943 | struct cpuset *cs; |
| 1949 | 1944 | ||
| 1950 | if (!cgrp->parent) | 1945 | if (!parent_css) |
| 1951 | return &top_cpuset.css; | 1946 | return &top_cpuset.css; |
| 1952 | 1947 | ||
| 1953 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); | 1948 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); |
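css_alloc now receives the parent's css instead of the new cgroup, and the root subsystem state is recognised by a NULL parent. A minimal sketch of the converted allocator, with the remaining field initialisation compressed into a comment:

    static struct cgroup_subsys_state *
    my_css_alloc(struct cgroup_subsys_state *parent_css)
    {
            struct cpuset *cs;

            if (!parent_css)        /* root cgroup: use the static top_cpuset */
                    return &top_cpuset.css;

            cs = kzalloc(sizeof(*cs), GFP_KERNEL);
            if (!cs)
                    return ERR_PTR(-ENOMEM);
            if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
                    kfree(cs);
                    return ERR_PTR(-ENOMEM);
            }

            /* ... set flags, clear masks, init fmeter as cpuset_css_alloc() does ... */
            return &cs->css;
    }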
| @@ -1967,12 +1962,12 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) | |||
| 1967 | return &cs->css; | 1962 | return &cs->css; |
| 1968 | } | 1963 | } |
| 1969 | 1964 | ||
| 1970 | static int cpuset_css_online(struct cgroup *cgrp) | 1965 | static int cpuset_css_online(struct cgroup_subsys_state *css) |
| 1971 | { | 1966 | { |
| 1972 | struct cpuset *cs = cgroup_cs(cgrp); | 1967 | struct cpuset *cs = css_cs(css); |
| 1973 | struct cpuset *parent = parent_cs(cs); | 1968 | struct cpuset *parent = parent_cs(cs); |
| 1974 | struct cpuset *tmp_cs; | 1969 | struct cpuset *tmp_cs; |
| 1975 | struct cgroup *pos_cg; | 1970 | struct cgroup_subsys_state *pos_css; |
| 1976 | 1971 | ||
| 1977 | if (!parent) | 1972 | if (!parent) |
| 1978 | return 0; | 1973 | return 0; |
| @@ -1987,7 +1982,7 @@ static int cpuset_css_online(struct cgroup *cgrp) | |||
| 1987 | 1982 | ||
| 1988 | number_of_cpusets++; | 1983 | number_of_cpusets++; |
| 1989 | 1984 | ||
| 1990 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) | 1985 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) |
| 1991 | goto out_unlock; | 1986 | goto out_unlock; |
| 1992 | 1987 | ||
| 1993 | /* | 1988 | /* |
| @@ -2004,7 +1999,7 @@ static int cpuset_css_online(struct cgroup *cgrp) | |||
| 2004 | * (and likewise for mems) to the new cgroup. | 1999 | * (and likewise for mems) to the new cgroup. |
| 2005 | */ | 2000 | */ |
| 2006 | rcu_read_lock(); | 2001 | rcu_read_lock(); |
| 2007 | cpuset_for_each_child(tmp_cs, pos_cg, parent) { | 2002 | cpuset_for_each_child(tmp_cs, pos_css, parent) { |
| 2008 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { | 2003 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { |
| 2009 | rcu_read_unlock(); | 2004 | rcu_read_unlock(); |
| 2010 | goto out_unlock; | 2005 | goto out_unlock; |
| @@ -2021,9 +2016,15 @@ out_unlock: | |||
| 2021 | return 0; | 2016 | return 0; |
| 2022 | } | 2017 | } |
| 2023 | 2018 | ||
| 2024 | static void cpuset_css_offline(struct cgroup *cgrp) | 2019 | /* |
| 2020 | * If the cpuset being removed has its flag 'sched_load_balance' | ||
| 2021 | * enabled, then simulate turning sched_load_balance off, which | ||
| 2022 | * will call rebuild_sched_domains_locked(). | ||
| 2023 | */ | ||
| 2024 | |||
| 2025 | static void cpuset_css_offline(struct cgroup_subsys_state *css) | ||
| 2025 | { | 2026 | { |
| 2026 | struct cpuset *cs = cgroup_cs(cgrp); | 2027 | struct cpuset *cs = css_cs(css); |
| 2027 | 2028 | ||
| 2028 | mutex_lock(&cpuset_mutex); | 2029 | mutex_lock(&cpuset_mutex); |
| 2029 | 2030 | ||
| @@ -2036,15 +2037,9 @@ static void cpuset_css_offline(struct cgroup *cgrp) | |||
| 2036 | mutex_unlock(&cpuset_mutex); | 2037 | mutex_unlock(&cpuset_mutex); |
| 2037 | } | 2038 | } |
| 2038 | 2039 | ||
| 2039 | /* | 2040 | static void cpuset_css_free(struct cgroup_subsys_state *css) |
| 2040 | * If the cpuset being removed has its flag 'sched_load_balance' | ||
| 2041 | * enabled, then simulate turning sched_load_balance off, which | ||
| 2042 | * will call rebuild_sched_domains_locked(). | ||
| 2043 | */ | ||
| 2044 | |||
| 2045 | static void cpuset_css_free(struct cgroup *cgrp) | ||
| 2046 | { | 2041 | { |
| 2047 | struct cpuset *cs = cgroup_cs(cgrp); | 2042 | struct cpuset *cs = css_cs(css); |
| 2048 | 2043 | ||
| 2049 | free_cpumask_var(cs->cpus_allowed); | 2044 | free_cpumask_var(cs->cpus_allowed); |
| 2050 | kfree(cs); | 2045 | kfree(cs); |
| @@ -2251,11 +2246,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
| 2251 | /* if cpus or mems changed, we need to propagate to descendants */ | 2246 | /* if cpus or mems changed, we need to propagate to descendants */ |
| 2252 | if (cpus_updated || mems_updated) { | 2247 | if (cpus_updated || mems_updated) { |
| 2253 | struct cpuset *cs; | 2248 | struct cpuset *cs; |
| 2254 | struct cgroup *pos_cgrp; | 2249 | struct cgroup_subsys_state *pos_css; |
| 2255 | 2250 | ||
| 2256 | rcu_read_lock(); | 2251 | rcu_read_lock(); |
| 2257 | cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { | 2252 | cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { |
| 2258 | if (!css_tryget(&cs->css)) | 2253 | if (cs == &top_cpuset || !css_tryget(&cs->css)) |
| 2259 | continue; | 2254 | continue; |
| 2260 | rcu_read_unlock(); | 2255 | rcu_read_unlock(); |
| 2261 | 2256 | ||
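The hotplug loop above shows the idiom for descendant walks whose body must sleep: skip the root, pin each cpuset with css_tryget() while still under rcu_read_lock(), drop the lock for the real work, then re-lock and release the reference. A minimal sketch, assuming css_put() as the matching release (it is not visible in this hunk) and a hypothetical wrapper name:

    static void my_propagate_hotplug(void)
    {
            struct cpuset *cs;
            struct cgroup_subsys_state *pos_css;

            rcu_read_lock();
            cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
                    if (cs == &top_cpuset || !css_tryget(&cs->css))
                            continue;       /* skip the root and dying cpusets */
                    rcu_read_unlock();

                    /* ... sleepable per-cpuset hotplug handling ... */

                    rcu_read_lock();
                    css_put(&cs->css);
            }
            rcu_read_unlock();
    }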
| @@ -2344,7 +2339,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) | |||
| 2344 | 2339 | ||
| 2345 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) | 2340 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) |
| 2346 | { | 2341 | { |
| 2347 | const struct cpuset *cpus_cs; | 2342 | struct cpuset *cpus_cs; |
| 2348 | 2343 | ||
| 2349 | rcu_read_lock(); | 2344 | rcu_read_lock(); |
| 2350 | cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); | 2345 | cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); |
| @@ -2417,7 +2412,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) | |||
| 2417 | * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall | 2412 | * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall |
| 2418 | * (an unusual configuration), then returns the root cpuset. | 2413 | * (an unusual configuration), then returns the root cpuset. |
| 2419 | */ | 2414 | */ |
| 2420 | static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) | 2415 | static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) |
| 2421 | { | 2416 | { |
| 2422 | while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) | 2417 | while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) |
| 2423 | cs = parent_cs(cs); | 2418 | cs = parent_cs(cs); |
| @@ -2487,7 +2482,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) | |||
| 2487 | */ | 2482 | */ |
| 2488 | int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) | 2483 | int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) |
| 2489 | { | 2484 | { |
| 2490 | const struct cpuset *cs; /* current cpuset ancestors */ | 2485 | struct cpuset *cs; /* current cpuset ancestors */ |
| 2491 | int allowed; /* is allocation in zone z allowed? */ | 2486 | int allowed; /* is allocation in zone z allowed? */ |
| 2492 | 2487 | ||
| 2493 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) | 2488 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) |
| @@ -2725,7 +2720,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) | |||
| 2725 | goto out_free; | 2720 | goto out_free; |
| 2726 | 2721 | ||
| 2727 | rcu_read_lock(); | 2722 | rcu_read_lock(); |
| 2728 | css = task_subsys_state(tsk, cpuset_subsys_id); | 2723 | css = task_css(tsk, cpuset_subsys_id); |
| 2729 | retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); | 2724 | retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); |
| 2730 | rcu_read_unlock(); | 2725 | rcu_read_unlock(); |
| 2731 | if (retval < 0) | 2726 | if (retval < 0) |
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0506d447aed2..7d2f35e5df2f 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
| @@ -575,8 +575,12 @@ return_normal: | |||
| 575 | raw_spin_lock(&dbg_slave_lock); | 575 | raw_spin_lock(&dbg_slave_lock); |
| 576 | 576 | ||
| 577 | #ifdef CONFIG_SMP | 577 | #ifdef CONFIG_SMP |
| 578 | /* If send_ready set, slaves are already waiting */ | ||
| 579 | if (ks->send_ready) | ||
| 580 | atomic_set(ks->send_ready, 1); | ||
| 581 | |||
| 578 | /* Signal the other CPUs to enter kgdb_wait() */ | 582 | /* Signal the other CPUs to enter kgdb_wait() */ |
| 579 | if ((!kgdb_single_step) && kgdb_do_roundup) | 583 | else if ((!kgdb_single_step) && kgdb_do_roundup) |
| 580 | kgdb_roundup_cpus(flags); | 584 | kgdb_roundup_cpus(flags); |
| 581 | #endif | 585 | #endif |
| 582 | 586 | ||
| @@ -678,11 +682,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) | |||
| 678 | if (arch_kgdb_ops.enable_nmi) | 682 | if (arch_kgdb_ops.enable_nmi) |
| 679 | arch_kgdb_ops.enable_nmi(0); | 683 | arch_kgdb_ops.enable_nmi(0); |
| 680 | 684 | ||
| 685 | memset(ks, 0, sizeof(struct kgdb_state)); | ||
| 681 | ks->cpu = raw_smp_processor_id(); | 686 | ks->cpu = raw_smp_processor_id(); |
| 682 | ks->ex_vector = evector; | 687 | ks->ex_vector = evector; |
| 683 | ks->signo = signo; | 688 | ks->signo = signo; |
| 684 | ks->err_code = ecode; | 689 | ks->err_code = ecode; |
| 685 | ks->kgdb_usethreadid = 0; | ||
| 686 | ks->linux_regs = regs; | 690 | ks->linux_regs = regs; |
| 687 | 691 | ||
| 688 | if (kgdb_reenter_check(ks)) | 692 | if (kgdb_reenter_check(ks)) |
| @@ -732,6 +736,30 @@ int kgdb_nmicallback(int cpu, void *regs) | |||
| 732 | return 1; | 736 | return 1; |
| 733 | } | 737 | } |
| 734 | 738 | ||
| 739 | int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready) | ||
| 740 | { | ||
| 741 | #ifdef CONFIG_SMP | ||
| 742 | if (!kgdb_io_ready(0) || !send_ready) | ||
| 743 | return 1; | ||
| 744 | |||
| 745 | if (kgdb_info[cpu].enter_kgdb == 0) { | ||
| 746 | struct kgdb_state kgdb_var; | ||
| 747 | struct kgdb_state *ks = &kgdb_var; | ||
| 748 | |||
| 749 | memset(ks, 0, sizeof(struct kgdb_state)); | ||
| 750 | ks->cpu = cpu; | ||
| 751 | ks->ex_vector = trapnr; | ||
| 752 | ks->signo = SIGTRAP; | ||
| 753 | ks->err_code = KGDB_KDB_REASON_SYSTEM_NMI; | ||
| 754 | ks->linux_regs = regs; | ||
| 755 | ks->send_ready = send_ready; | ||
| 756 | kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); | ||
| 757 | return 0; | ||
| 758 | } | ||
| 759 | #endif | ||
| 760 | return 1; | ||
| 761 | } | ||
| 762 | |||
| 735 | static void kgdb_console_write(struct console *co, const char *s, | 763 | static void kgdb_console_write(struct console *co, const char *s, |
| 736 | unsigned count) | 764 | unsigned count) |
| 737 | { | 765 | { |
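kgdb_nmicallin() gives an architecture a way to enter the debugger as master from a system NMI; the caller owns the atomic_t, and the master sets it through the new send_ready branch in kgdb_cpu_enter() above to release slave CPUs that are already spinning in the same NMI, instead of sending a roundup IPI. A minimal, hypothetical sketch of an arch-side caller; the function name and trap number are placeholders, not an existing interface:

    static atomic_t kgdb_sysnmi_send_ready;

    /* hypothetical hook called from the platform's system-NMI handler */
    static int my_arch_system_nmi(struct pt_regs *regs)
    {
            /* returns 0 if this CPU entered kgdb as master, 1 if it declined */
            return kgdb_nmicallin(raw_smp_processor_id(),
                                  2 /* arch-specific trap number */,
                                  regs, &kgdb_sysnmi_send_ready);
    }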
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index 2235967e78b0..572aa4f5677c 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h | |||
| @@ -26,6 +26,7 @@ struct kgdb_state { | |||
| 26 | unsigned long threadid; | 26 | unsigned long threadid; |
| 27 | long kgdb_usethreadid; | 27 | long kgdb_usethreadid; |
| 28 | struct pt_regs *linux_regs; | 28 | struct pt_regs *linux_regs; |
| 29 | atomic_t *send_ready; | ||
| 29 | }; | 30 | }; |
| 30 | 31 | ||
| 31 | /* Exception state values */ | 32 | /* Exception state values */ |
| @@ -74,11 +75,13 @@ extern int kdb_stub(struct kgdb_state *ks); | |||
| 74 | extern int kdb_parse(const char *cmdstr); | 75 | extern int kdb_parse(const char *cmdstr); |
| 75 | extern int kdb_common_init_state(struct kgdb_state *ks); | 76 | extern int kdb_common_init_state(struct kgdb_state *ks); |
| 76 | extern int kdb_common_deinit_state(void); | 77 | extern int kdb_common_deinit_state(void); |
| 78 | #define KGDB_KDB_REASON_SYSTEM_NMI KDB_REASON_SYSTEM_NMI | ||
| 77 | #else /* ! CONFIG_KGDB_KDB */ | 79 | #else /* ! CONFIG_KGDB_KDB */ |
| 78 | static inline int kdb_stub(struct kgdb_state *ks) | 80 | static inline int kdb_stub(struct kgdb_state *ks) |
| 79 | { | 81 | { |
| 80 | return DBG_PASS_EVENT; | 82 | return DBG_PASS_EVENT; |
| 81 | } | 83 | } |
| 84 | #define KGDB_KDB_REASON_SYSTEM_NMI 0 | ||
| 82 | #endif /* CONFIG_KGDB_KDB */ | 85 | #endif /* CONFIG_KGDB_KDB */ |
| 83 | 86 | ||
| 84 | #endif /* _DEBUG_CORE_H_ */ | 87 | #endif /* _DEBUG_CORE_H_ */ |
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 328d18ef31e4..8859ca34dcfe 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c | |||
| @@ -69,7 +69,10 @@ int kdb_stub(struct kgdb_state *ks) | |||
| 69 | if (atomic_read(&kgdb_setting_breakpoint)) | 69 | if (atomic_read(&kgdb_setting_breakpoint)) |
| 70 | reason = KDB_REASON_KEYBOARD; | 70 | reason = KDB_REASON_KEYBOARD; |
| 71 | 71 | ||
| 72 | if (in_nmi()) | 72 | if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP) |
| 73 | reason = KDB_REASON_SYSTEM_NMI; | ||
| 74 | |||
| 75 | else if (in_nmi()) | ||
| 73 | reason = KDB_REASON_NMI; | 76 | reason = KDB_REASON_NMI; |
| 74 | 77 | ||
| 75 | for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { | 78 | for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 00eb8f7fbf41..0b097c8a1e50 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
| @@ -1200,6 +1200,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, | |||
| 1200 | instruction_pointer(regs)); | 1200 | instruction_pointer(regs)); |
| 1201 | kdb_dumpregs(regs); | 1201 | kdb_dumpregs(regs); |
| 1202 | break; | 1202 | break; |
| 1203 | case KDB_REASON_SYSTEM_NMI: | ||
| 1204 | kdb_printf("due to System NonMaskable Interrupt\n"); | ||
| 1205 | break; | ||
| 1203 | case KDB_REASON_NMI: | 1206 | case KDB_REASON_NMI: |
| 1204 | kdb_printf("due to NonMaskable Interrupt @ " | 1207 | kdb_printf("due to NonMaskable Interrupt @ " |
| 1205 | kdb_machreg_fmt "\n", | 1208 | kdb_machreg_fmt "\n", |
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c77206184b8b..97b67df8fbfe 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c | |||
| @@ -116,6 +116,9 @@ int get_callchain_buffers(void) | |||
| 116 | 116 | ||
| 117 | err = alloc_callchain_buffers(); | 117 | err = alloc_callchain_buffers(); |
| 118 | exit: | 118 | exit: |
| 119 | if (err) | ||
| 120 | atomic_dec(&nr_callchain_events); | ||
| 121 | |||
| 119 | mutex_unlock(&callchain_mutex); | 122 | mutex_unlock(&callchain_mutex); |
| 120 | 123 | ||
| 121 | return err; | 124 | return err; |
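get_callchain_buffers() increments nr_callchain_events optimistically before trying to allocate the per-cpu buffers; the added hunk rolls that increment back whenever the function fails, so a failed getter no longer leaves the counter inflated with no buffers allocated for later callers. The get/undo pattern inside the function, sketched with the intermediate checks elided:

    int err = 0;
    int count;

    mutex_lock(&callchain_mutex);

    count = atomic_inc_return(&nr_callchain_events);    /* optimistic get */

    /* ... sanity and already-allocated short cuts elided ... */

    err = alloc_callchain_buffers();
exit:
    if (err)
            atomic_dec(&nr_callchain_events);           /* roll the get back */

    mutex_unlock(&callchain_mutex);
    return err;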
diff --git a/kernel/events/core.c b/kernel/events/core.c index f86599e8c123..8c875ef6e120 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -145,6 +145,7 @@ static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); | |||
| 145 | static atomic_t nr_mmap_events __read_mostly; | 145 | static atomic_t nr_mmap_events __read_mostly; |
| 146 | static atomic_t nr_comm_events __read_mostly; | 146 | static atomic_t nr_comm_events __read_mostly; |
| 147 | static atomic_t nr_task_events __read_mostly; | 147 | static atomic_t nr_task_events __read_mostly; |
| 148 | static atomic_t nr_freq_events __read_mostly; | ||
| 148 | 149 | ||
| 149 | static LIST_HEAD(pmus); | 150 | static LIST_HEAD(pmus); |
| 150 | static DEFINE_MUTEX(pmus_lock); | 151 | static DEFINE_MUTEX(pmus_lock); |
| @@ -174,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; | |||
| 174 | static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); | 175 | static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); |
| 175 | static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; | 176 | static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; |
| 176 | 177 | ||
| 177 | static atomic_t perf_sample_allowed_ns __read_mostly = | 178 | static int perf_sample_allowed_ns __read_mostly = |
| 178 | ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); | 179 | DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; |
| 179 | 180 | ||
| 180 | void update_perf_cpu_limits(void) | 181 | void update_perf_cpu_limits(void) |
| 181 | { | 182 | { |
| @@ -183,7 +184,7 @@ void update_perf_cpu_limits(void) | |||
| 183 | 184 | ||
| 184 | tmp *= sysctl_perf_cpu_time_max_percent; | 185 | tmp *= sysctl_perf_cpu_time_max_percent; |
| 185 | do_div(tmp, 100); | 186 | do_div(tmp, 100); |
| 186 | atomic_set(&perf_sample_allowed_ns, tmp); | 187 | ACCESS_ONCE(perf_sample_allowed_ns) = tmp; |
| 187 | } | 188 | } |
| 188 | 189 | ||
| 189 | static int perf_rotate_context(struct perf_cpu_context *cpuctx); | 190 | static int perf_rotate_context(struct perf_cpu_context *cpuctx); |
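perf_sample_allowed_ns only ever needs a plain load on the sampling path and a plain store from the sysctl path, not read-modify-write atomicity, so the atomic_t becomes an int accessed through ACCESS_ONCE() to keep the compiler from reloading or tearing the value. A minimal sketch of the resulting writer/reader pair, with hypothetical names:

    static int my_allowed_ns __read_mostly;

    /* writer (sysctl path): one plain store */
    static void my_update_limit(u64 tmp)
    {
            ACCESS_ONCE(my_allowed_ns) = tmp;
    }

    /* reader (sample path): snapshot once, reuse the local copy */
    static void my_check_sample(u64 avg_sample_len_ns)
    {
            u64 allowed_ns = ACCESS_ONCE(my_allowed_ns);

            if (allowed_ns == 0 || avg_sample_len_ns <= allowed_ns)
                    return;

            /* over budget: lower kernel.perf_event_max_sample_rate */
    }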
| @@ -192,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write, | |||
| 192 | void __user *buffer, size_t *lenp, | 193 | void __user *buffer, size_t *lenp, |
| 193 | loff_t *ppos) | 194 | loff_t *ppos) |
| 194 | { | 195 | { |
| 195 | int ret = proc_dointvec(table, write, buffer, lenp, ppos); | 196 | int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
| 196 | 197 | ||
| 197 | if (ret || !write) | 198 | if (ret || !write) |
| 198 | return ret; | 199 | return ret; |
| @@ -227,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, | |||
| 227 | * we detect that events are taking too long. | 228 | * we detect that events are taking too long. |
| 228 | */ | 229 | */ |
| 229 | #define NR_ACCUMULATED_SAMPLES 128 | 230 | #define NR_ACCUMULATED_SAMPLES 128 |
| 230 | DEFINE_PER_CPU(u64, running_sample_length); | 231 | static DEFINE_PER_CPU(u64, running_sample_length); |
| 231 | 232 | ||
| 232 | void perf_sample_event_took(u64 sample_len_ns) | 233 | void perf_sample_event_took(u64 sample_len_ns) |
| 233 | { | 234 | { |
| 234 | u64 avg_local_sample_len; | 235 | u64 avg_local_sample_len; |
| 235 | u64 local_samples_len; | 236 | u64 local_samples_len; |
| 237 | u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); | ||
| 236 | 238 | ||
| 237 | if (atomic_read(&perf_sample_allowed_ns) == 0) | 239 | if (allowed_ns == 0) |
| 238 | return; | 240 | return; |
| 239 | 241 | ||
| 240 | /* decay the counter by 1 average sample */ | 242 | /* decay the counter by 1 average sample */ |
| @@ -250,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns) | |||
| 250 | */ | 252 | */ |
| 251 | avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; | 253 | avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; |
| 252 | 254 | ||
| 253 | if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) | 255 | if (avg_local_sample_len <= allowed_ns) |
| 254 | return; | 256 | return; |
| 255 | 257 | ||
| 256 | if (max_samples_per_tick <= 1) | 258 | if (max_samples_per_tick <= 1) |
| @@ -261,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns) | |||
| 261 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; | 263 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; |
| 262 | 264 | ||
| 263 | printk_ratelimited(KERN_WARNING | 265 | printk_ratelimited(KERN_WARNING |
| 264 | "perf samples too long (%lld > %d), lowering " | 266 | "perf samples too long (%lld > %lld), lowering " |
| 265 | "kernel.perf_event_max_sample_rate to %d\n", | 267 | "kernel.perf_event_max_sample_rate to %d\n", |
| 266 | avg_local_sample_len, | 268 | avg_local_sample_len, allowed_ns, |
| 267 | atomic_read(&perf_sample_allowed_ns), | ||
| 268 | sysctl_perf_event_sample_rate); | 269 | sysctl_perf_event_sample_rate); |
| 269 | 270 | ||
| 270 | update_perf_cpu_limits(); | 271 | update_perf_cpu_limits(); |
| @@ -340,8 +341,8 @@ struct perf_cgroup { | |||
| 340 | static inline struct perf_cgroup * | 341 | static inline struct perf_cgroup * |
| 341 | perf_cgroup_from_task(struct task_struct *task) | 342 | perf_cgroup_from_task(struct task_struct *task) |
| 342 | { | 343 | { |
| 343 | return container_of(task_subsys_state(task, perf_subsys_id), | 344 | return container_of(task_css(task, perf_subsys_id), |
| 344 | struct perf_cgroup, css); | 345 | struct perf_cgroup, css); |
| 345 | } | 346 | } |
| 346 | 347 | ||
| 347 | static inline bool | 348 | static inline bool |
| @@ -591,7 +592,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
| 591 | if (!f.file) | 592 | if (!f.file) |
| 592 | return -EBADF; | 593 | return -EBADF; |
| 593 | 594 | ||
| 594 | css = cgroup_css_from_dir(f.file, perf_subsys_id); | 595 | rcu_read_lock(); |
| 596 | |||
| 597 | css = css_from_dir(f.file->f_dentry, &perf_subsys); | ||
| 595 | if (IS_ERR(css)) { | 598 | if (IS_ERR(css)) { |
| 596 | ret = PTR_ERR(css); | 599 | ret = PTR_ERR(css); |
| 597 | goto out; | 600 | goto out; |
| @@ -617,6 +620,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
| 617 | ret = -EINVAL; | 620 | ret = -EINVAL; |
| 618 | } | 621 | } |
| 619 | out: | 622 | out: |
| 623 | rcu_read_unlock(); | ||
| 620 | fdput(f); | 624 | fdput(f); |
| 621 | return ret; | 625 | return ret; |
| 622 | } | 626 | } |
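css_from_dir() replaces cgroup_css_from_dir() and only guarantees the returned css under rcu_read_lock(), so the connect path now wraps the lookup (and the reference grab that follows in the unchanged lines) in an RCU read-side section before fput()-ing the file. A minimal sketch of the shape, with the perf-specific attachment reduced to a comment and a hypothetical function name:

    static int my_cgroup_connect(int fd, struct perf_event *event)
    {
            struct cgroup_subsys_state *css;
            struct fd f = fdget(fd);
            int ret = 0;

            if (!f.file)
                    return -EBADF;

            rcu_read_lock();

            css = css_from_dir(f.file->f_dentry, &perf_subsys);
            if (IS_ERR(css)) {
                    ret = PTR_ERR(css);
                    goto out;
            }

            /* ... pin the css and attach it to @event before dropping the lock ... */
    out:
            rcu_read_unlock();
            fdput(f);
            return ret;
    }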
| @@ -869,12 +873,8 @@ static void perf_pmu_rotate_start(struct pmu *pmu) | |||
| 869 | 873 | ||
| 870 | WARN_ON(!irqs_disabled()); | 874 | WARN_ON(!irqs_disabled()); |
| 871 | 875 | ||
| 872 | if (list_empty(&cpuctx->rotation_list)) { | 876 | if (list_empty(&cpuctx->rotation_list)) |
| 873 | int was_empty = list_empty(head); | ||
| 874 | list_add(&cpuctx->rotation_list, head); | 877 | list_add(&cpuctx->rotation_list, head); |
| 875 | if (was_empty) | ||
| 876 | tick_nohz_full_kick(); | ||
| 877 | } | ||
| 878 | } | 878 | } |
| 879 | 879 | ||
| 880 | static void get_ctx(struct perf_event_context *ctx) | 880 | static void get_ctx(struct perf_event_context *ctx) |
| @@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx) | |||
| 899 | put_ctx(ctx->parent_ctx); | 899 | put_ctx(ctx->parent_ctx); |
| 900 | ctx->parent_ctx = NULL; | 900 | ctx->parent_ctx = NULL; |
| 901 | } | 901 | } |
| 902 | ctx->generation++; | ||
| 902 | } | 903 | } |
| 903 | 904 | ||
| 904 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) | 905 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) |
| @@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 1136 | ctx->nr_events++; | 1137 | ctx->nr_events++; |
| 1137 | if (event->attr.inherit_stat) | 1138 | if (event->attr.inherit_stat) |
| 1138 | ctx->nr_stat++; | 1139 | ctx->nr_stat++; |
| 1140 | |||
| 1141 | ctx->generation++; | ||
| 1139 | } | 1142 | } |
| 1140 | 1143 | ||
| 1141 | /* | 1144 | /* |
| @@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event) | |||
| 1201 | if (sample_type & PERF_SAMPLE_DATA_SRC) | 1204 | if (sample_type & PERF_SAMPLE_DATA_SRC) |
| 1202 | size += sizeof(data->data_src.val); | 1205 | size += sizeof(data->data_src.val); |
| 1203 | 1206 | ||
| 1207 | if (sample_type & PERF_SAMPLE_TRANSACTION) | ||
| 1208 | size += sizeof(data->txn); | ||
| 1209 | |||
| 1204 | event->header_size = size; | 1210 | event->header_size = size; |
| 1205 | } | 1211 | } |
| 1206 | 1212 | ||
| @@ -1216,6 +1222,9 @@ static void perf_event__id_header_size(struct perf_event *event) | |||
| 1216 | if (sample_type & PERF_SAMPLE_TIME) | 1222 | if (sample_type & PERF_SAMPLE_TIME) |
| 1217 | size += sizeof(data->time); | 1223 | size += sizeof(data->time); |
| 1218 | 1224 | ||
| 1225 | if (sample_type & PERF_SAMPLE_IDENTIFIER) | ||
| 1226 | size += sizeof(data->id); | ||
| 1227 | |||
| 1219 | if (sample_type & PERF_SAMPLE_ID) | 1228 | if (sample_type & PERF_SAMPLE_ID) |
| 1220 | size += sizeof(data->id); | 1229 | size += sizeof(data->id); |
| 1221 | 1230 | ||
| @@ -1307,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 1307 | */ | 1316 | */ |
| 1308 | if (event->state > PERF_EVENT_STATE_OFF) | 1317 | if (event->state > PERF_EVENT_STATE_OFF) |
| 1309 | event->state = PERF_EVENT_STATE_OFF; | 1318 | event->state = PERF_EVENT_STATE_OFF; |
| 1319 | |||
| 1320 | ctx->generation++; | ||
| 1310 | } | 1321 | } |
| 1311 | 1322 | ||
| 1312 | static void perf_group_detach(struct perf_event *event) | 1323 | static void perf_group_detach(struct perf_event *event) |
| @@ -2143,22 +2154,38 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
| 2143 | } | 2154 | } |
| 2144 | 2155 | ||
| 2145 | /* | 2156 | /* |
| 2146 | * Test whether two contexts are equivalent, i.e. whether they | 2157 | * Test whether two contexts are equivalent, i.e. whether they have both been |
| 2147 | * have both been cloned from the same version of the same context | 2158 | * cloned from the same version of the same context. |
| 2148 | * and they both have the same number of enabled events. | 2159 | * |
| 2149 | * If the number of enabled events is the same, then the set | 2160 | * Equivalence is measured using a generation number in the context that is |
| 2150 | * of enabled events should be the same, because these are both | 2161 | * incremented on each modification to it; see unclone_ctx(), list_add_event() |
| 2151 | * inherited contexts, therefore we can't access individual events | 2162 | * and list_del_event(). |
| 2152 | * in them directly with an fd; we can only enable/disable all | ||
| 2153 | * events via prctl, or enable/disable all events in a family | ||
| 2154 | * via ioctl, which will have the same effect on both contexts. | ||
| 2155 | */ | 2163 | */ |
| 2156 | static int context_equiv(struct perf_event_context *ctx1, | 2164 | static int context_equiv(struct perf_event_context *ctx1, |
| 2157 | struct perf_event_context *ctx2) | 2165 | struct perf_event_context *ctx2) |
| 2158 | { | 2166 | { |
| 2159 | return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx | 2167 | /* Pinning disables the swap optimization */ |
| 2160 | && ctx1->parent_gen == ctx2->parent_gen | 2168 | if (ctx1->pin_count || ctx2->pin_count) |
| 2161 | && !ctx1->pin_count && !ctx2->pin_count; | 2169 | return 0; |
| 2170 | |||
| 2171 | /* If ctx1 is the parent of ctx2 */ | ||
| 2172 | if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) | ||
| 2173 | return 1; | ||
| 2174 | |||
| 2175 | /* If ctx2 is the parent of ctx1 */ | ||
| 2176 | if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) | ||
| 2177 | return 1; | ||
| 2178 | |||
| 2179 | /* | ||
| 2180 | * If ctx1 and ctx2 have the same parent; we flatten the parent | ||
| 2181 | * hierarchy, see perf_event_init_context(). | ||
| 2182 | */ | ||
| 2183 | if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && | ||
| 2184 | ctx1->parent_gen == ctx2->parent_gen) | ||
| 2185 | return 1; | ||
| 2186 | |||
| 2187 | /* Unmatched */ | ||
| 2188 | return 0; | ||
| 2162 | } | 2189 | } |
| 2163 | 2190 | ||
| 2164 | static void __perf_event_sync_stat(struct perf_event *event, | 2191 | static void __perf_event_sync_stat(struct perf_event *event, |
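The equivalence test above now keys off a generation counter that every modifying operation bumps (unclone_ctx(), list_add_event() and list_del_event() in the earlier hunks). Two contexts still count as clones only if the parent link is intact and the parent generation recorded at clone time matches the parent's current generation, meaning neither side has gained or lost events since the clone. A worked sketch of the parent/child branch, as a hypothetical helper:

    /*
     * At clone time (inherited events), the child records:
     *     child->parent_ctx == parent
     *     child->parent_gen == parent->generation
     *
     * Any later list_add_event(), list_del_event() or unclone_ctx() on the
     * parent bumps parent->generation, so the saved parent_gen goes stale
     * and the lazy context-switch optimization is refused.
     */
    static bool ctx_still_a_clone(struct perf_event_context *parent,
                                  struct perf_event_context *child)
    {
            return child->parent_ctx == parent &&
                   child->parent_gen == parent->generation;
    }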
| @@ -2241,7 +2268,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
| 2241 | { | 2268 | { |
| 2242 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; | 2269 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; |
| 2243 | struct perf_event_context *next_ctx; | 2270 | struct perf_event_context *next_ctx; |
| 2244 | struct perf_event_context *parent; | 2271 | struct perf_event_context *parent, *next_parent; |
| 2245 | struct perf_cpu_context *cpuctx; | 2272 | struct perf_cpu_context *cpuctx; |
| 2246 | int do_switch = 1; | 2273 | int do_switch = 1; |
| 2247 | 2274 | ||
| @@ -2253,10 +2280,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
| 2253 | return; | 2280 | return; |
| 2254 | 2281 | ||
| 2255 | rcu_read_lock(); | 2282 | rcu_read_lock(); |
| 2256 | parent = rcu_dereference(ctx->parent_ctx); | ||
| 2257 | next_ctx = next->perf_event_ctxp[ctxn]; | 2283 | next_ctx = next->perf_event_ctxp[ctxn]; |
| 2258 | if (parent && next_ctx && | 2284 | if (!next_ctx) |
| 2259 | rcu_dereference(next_ctx->parent_ctx) == parent) { | 2285 | goto unlock; |
| 2286 | |||
| 2287 | parent = rcu_dereference(ctx->parent_ctx); | ||
| 2288 | next_parent = rcu_dereference(next_ctx->parent_ctx); | ||
| 2289 | |||
| 2290 | /* If neither context have a parent context; they cannot be clones. */ | ||
| 2291 | if (!parent && !next_parent) | ||
| 2292 | goto unlock; | ||
| 2293 | |||
| 2294 | if (next_parent == ctx || next_ctx == parent || next_parent == parent) { | ||
| 2260 | /* | 2295 | /* |
| 2261 | * Looks like the two contexts are clones, so we might be | 2296 | * Looks like the two contexts are clones, so we might be |
| 2262 | * able to optimize the context switch. We lock both | 2297 | * able to optimize the context switch. We lock both |
| @@ -2284,6 +2319,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
| 2284 | raw_spin_unlock(&next_ctx->lock); | 2319 | raw_spin_unlock(&next_ctx->lock); |
| 2285 | raw_spin_unlock(&ctx->lock); | 2320 | raw_spin_unlock(&ctx->lock); |
| 2286 | } | 2321 | } |
| 2322 | unlock: | ||
| 2287 | rcu_read_unlock(); | 2323 | rcu_read_unlock(); |
| 2288 | 2324 | ||
| 2289 | if (do_switch) { | 2325 | if (do_switch) { |
| @@ -2712,7 +2748,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, | |||
| 2712 | 2748 | ||
| 2713 | hwc = &event->hw; | 2749 | hwc = &event->hw; |
| 2714 | 2750 | ||
| 2715 | if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { | 2751 | if (hwc->interrupts == MAX_INTERRUPTS) { |
| 2716 | hwc->interrupts = 0; | 2752 | hwc->interrupts = 0; |
| 2717 | perf_log_throttle(event, 1); | 2753 | perf_log_throttle(event, 1); |
| 2718 | event->pmu->start(event, 0); | 2754 | event->pmu->start(event, 0); |
| @@ -2811,10 +2847,11 @@ done: | |||
| 2811 | #ifdef CONFIG_NO_HZ_FULL | 2847 | #ifdef CONFIG_NO_HZ_FULL |
| 2812 | bool perf_event_can_stop_tick(void) | 2848 | bool perf_event_can_stop_tick(void) |
| 2813 | { | 2849 | { |
| 2814 | if (list_empty(&__get_cpu_var(rotation_list))) | 2850 | if (atomic_read(&nr_freq_events) || |
| 2815 | return true; | 2851 | __this_cpu_read(perf_throttled_count)) |
| 2816 | else | ||
| 2817 | return false; | 2852 | return false; |
| 2853 | else | ||
| 2854 | return true; | ||
| 2818 | } | 2855 | } |
| 2819 | #endif | 2856 | #endif |
| 2820 | 2857 | ||
| @@ -3128,36 +3165,63 @@ static void free_event_rcu(struct rcu_head *head) | |||
| 3128 | static void ring_buffer_put(struct ring_buffer *rb); | 3165 | static void ring_buffer_put(struct ring_buffer *rb); |
| 3129 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); | 3166 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); |
| 3130 | 3167 | ||
| 3131 | static void free_event(struct perf_event *event) | 3168 | static void unaccount_event_cpu(struct perf_event *event, int cpu) |
| 3132 | { | 3169 | { |
| 3133 | irq_work_sync(&event->pending); | 3170 | if (event->parent) |
| 3171 | return; | ||
| 3172 | |||
| 3173 | if (has_branch_stack(event)) { | ||
| 3174 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
| 3175 | atomic_dec(&per_cpu(perf_branch_stack_events, cpu)); | ||
| 3176 | } | ||
| 3177 | if (is_cgroup_event(event)) | ||
| 3178 | atomic_dec(&per_cpu(perf_cgroup_events, cpu)); | ||
| 3179 | } | ||
| 3180 | |||
| 3181 | static void unaccount_event(struct perf_event *event) | ||
| 3182 | { | ||
| 3183 | if (event->parent) | ||
| 3184 | return; | ||
| 3134 | 3185 | ||
| 3186 | if (event->attach_state & PERF_ATTACH_TASK) | ||
| 3187 | static_key_slow_dec_deferred(&perf_sched_events); | ||
| 3188 | if (event->attr.mmap || event->attr.mmap_data) | ||
| 3189 | atomic_dec(&nr_mmap_events); | ||
| 3190 | if (event->attr.comm) | ||
| 3191 | atomic_dec(&nr_comm_events); | ||
| 3192 | if (event->attr.task) | ||
| 3193 | atomic_dec(&nr_task_events); | ||
| 3194 | if (event->attr.freq) | ||
| 3195 | atomic_dec(&nr_freq_events); | ||
| 3196 | if (is_cgroup_event(event)) | ||
| 3197 | static_key_slow_dec_deferred(&perf_sched_events); | ||
| 3198 | if (has_branch_stack(event)) | ||
| 3199 | static_key_slow_dec_deferred(&perf_sched_events); | ||
| 3200 | |||
| 3201 | unaccount_event_cpu(event, event->cpu); | ||
| 3202 | } | ||
| 3203 | |||
| 3204 | static void __free_event(struct perf_event *event) | ||
| 3205 | { | ||
| 3135 | if (!event->parent) { | 3206 | if (!event->parent) { |
| 3136 | if (event->attach_state & PERF_ATTACH_TASK) | ||
| 3137 | static_key_slow_dec_deferred(&perf_sched_events); | ||
| 3138 | if (event->attr.mmap || event->attr.mmap_data) | ||
| 3139 | atomic_dec(&nr_mmap_events); | ||
| 3140 | if (event->attr.comm) | ||
| 3141 | atomic_dec(&nr_comm_events); | ||
| 3142 | if (event->attr.task) | ||
| 3143 | atomic_dec(&nr_task_events); | ||
| 3144 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) | 3207 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) |
| 3145 | put_callchain_buffers(); | 3208 | put_callchain_buffers(); |
| 3146 | if (is_cgroup_event(event)) { | ||
| 3147 | atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); | ||
| 3148 | static_key_slow_dec_deferred(&perf_sched_events); | ||
| 3149 | } | ||
| 3150 | |||
| 3151 | if (has_branch_stack(event)) { | ||
| 3152 | static_key_slow_dec_deferred(&perf_sched_events); | ||
| 3153 | /* is system-wide event */ | ||
| 3154 | if (!(event->attach_state & PERF_ATTACH_TASK)) { | ||
| 3155 | atomic_dec(&per_cpu(perf_branch_stack_events, | ||
| 3156 | event->cpu)); | ||
| 3157 | } | ||
| 3158 | } | ||
| 3159 | } | 3209 | } |
| 3160 | 3210 | ||
| 3211 | if (event->destroy) | ||
| 3212 | event->destroy(event); | ||
| 3213 | |||
| 3214 | if (event->ctx) | ||
| 3215 | put_ctx(event->ctx); | ||
| 3216 | |||
| 3217 | call_rcu(&event->rcu_head, free_event_rcu); | ||
| 3218 | } | ||
| 3219 | static void free_event(struct perf_event *event) | ||
| 3220 | { | ||
| 3221 | irq_work_sync(&event->pending); | ||
| 3222 | |||
| 3223 | unaccount_event(event); | ||
| 3224 | |||
| 3161 | if (event->rb) { | 3225 | if (event->rb) { |
| 3162 | struct ring_buffer *rb; | 3226 | struct ring_buffer *rb; |
| 3163 | 3227 | ||
| @@ -3180,13 +3244,8 @@ static void free_event(struct perf_event *event) | |||
| 3180 | if (is_cgroup_event(event)) | 3244 | if (is_cgroup_event(event)) |
| 3181 | perf_detach_cgroup(event); | 3245 | perf_detach_cgroup(event); |
| 3182 | 3246 | ||
| 3183 | if (event->destroy) | ||
| 3184 | event->destroy(event); | ||
| 3185 | |||
| 3186 | if (event->ctx) | ||
| 3187 | put_ctx(event->ctx); | ||
| 3188 | 3247 | ||
| 3189 | call_rcu(&event->rcu_head, free_event_rcu); | 3248 | __free_event(event); |
| 3190 | } | 3249 | } |
| 3191 | 3250 | ||
| 3192 | int perf_event_release_kernel(struct perf_event *event) | 3251 | int perf_event_release_kernel(struct perf_event *event) |
| @@ -3544,6 +3603,15 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
| 3544 | case PERF_EVENT_IOC_PERIOD: | 3603 | case PERF_EVENT_IOC_PERIOD: |
| 3545 | return perf_event_period(event, (u64 __user *)arg); | 3604 | return perf_event_period(event, (u64 __user *)arg); |
| 3546 | 3605 | ||
| 3606 | case PERF_EVENT_IOC_ID: | ||
| 3607 | { | ||
| 3608 | u64 id = primary_event_id(event); | ||
| 3609 | |||
| 3610 | if (copy_to_user((void __user *)arg, &id, sizeof(id))) | ||
| 3611 | return -EFAULT; | ||
| 3612 | return 0; | ||
| 3613 | } | ||
| 3614 | |||
| 3547 | case PERF_EVENT_IOC_SET_OUTPUT: | 3615 | case PERF_EVENT_IOC_SET_OUTPUT: |
| 3548 | { | 3616 | { |
| 3549 | int ret; | 3617 | int ret; |
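The new PERF_EVENT_IOC_ID ioctl copies the primary event ID out through a u64 pointer; this is the same value that later shows up in PERF_SAMPLE_ID / PERF_SAMPLE_IDENTIFIER fields. A minimal userspace sketch (the already-opened perf_fd is an assumption, not part of this patch):

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/perf_event.h>

    /* Sketch: map an already-open perf event fd to the ID used in samples. */
    static int read_event_id(int perf_fd, uint64_t *id)
    {
            if (ioctl(perf_fd, PERF_EVENT_IOC_ID, id) < 0) {
                    perror("PERF_EVENT_IOC_ID");
                    return -1;
            }
            return 0;
    }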
| @@ -3625,6 +3693,26 @@ static void calc_timer_values(struct perf_event *event, | |||
| 3625 | *running = ctx_time - event->tstamp_running; | 3693 | *running = ctx_time - event->tstamp_running; |
| 3626 | } | 3694 | } |
| 3627 | 3695 | ||
| 3696 | static void perf_event_init_userpage(struct perf_event *event) | ||
| 3697 | { | ||
| 3698 | struct perf_event_mmap_page *userpg; | ||
| 3699 | struct ring_buffer *rb; | ||
| 3700 | |||
| 3701 | rcu_read_lock(); | ||
| 3702 | rb = rcu_dereference(event->rb); | ||
| 3703 | if (!rb) | ||
| 3704 | goto unlock; | ||
| 3705 | |||
| 3706 | userpg = rb->user_page; | ||
| 3707 | |||
| 3708 | /* Allow new userspace to detect that bit 0 is deprecated */ | ||
| 3709 | userpg->cap_bit0_is_deprecated = 1; | ||
| 3710 | userpg->size = offsetof(struct perf_event_mmap_page, __reserved); | ||
| 3711 | |||
| 3712 | unlock: | ||
| 3713 | rcu_read_unlock(); | ||
| 3714 | } | ||
| 3715 | |||
| 3628 | void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) | 3716 | void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) |
| 3629 | { | 3717 | { |
| 3630 | } | 3718 | } |
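perf_event_init_userpage() gives newer userspace a way to tell the old capability scheme (bit 0 meant both rdpmc and user time were usable) apart from the split cap_user_* bits. A reader of the mmap'ed control page might check it roughly like this (mapped_page and use_rdpmc() are illustrative placeholders, not part of this patch):

    /* Sketch: interpret the capability bits in struct perf_event_mmap_page. */
    struct perf_event_mmap_page *pc = mapped_page; /* from mmap() of the event fd */

    if (pc->cap_bit0_is_deprecated) {
            /* newer kernel: the individual cap_user_* bits are valid */
            if (pc->cap_user_rdpmc)
                    use_rdpmc();
    } else {
            /* older kernel: bit 0 covered rdpmc and user time together */
            if (pc->cap_bit0)
                    use_rdpmc();
    }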
| @@ -3641,6 +3729,10 @@ void perf_event_update_userpage(struct perf_event *event) | |||
| 3641 | u64 enabled, running, now; | 3729 | u64 enabled, running, now; |
| 3642 | 3730 | ||
| 3643 | rcu_read_lock(); | 3731 | rcu_read_lock(); |
| 3732 | rb = rcu_dereference(event->rb); | ||
| 3733 | if (!rb) | ||
| 3734 | goto unlock; | ||
| 3735 | |||
| 3644 | /* | 3736 | /* |
| 3645 | * compute total_time_enabled, total_time_running | 3737 | * compute total_time_enabled, total_time_running |
| 3646 | * based on snapshot values taken when the event | 3738 | * based on snapshot values taken when the event |
| @@ -3651,12 +3743,8 @@ void perf_event_update_userpage(struct perf_event *event) | |||
| 3651 | * NMI context | 3743 | * NMI context |
| 3652 | */ | 3744 | */ |
| 3653 | calc_timer_values(event, &now, &enabled, &running); | 3745 | calc_timer_values(event, &now, &enabled, &running); |
| 3654 | rb = rcu_dereference(event->rb); | ||
| 3655 | if (!rb) | ||
| 3656 | goto unlock; | ||
| 3657 | 3746 | ||
| 3658 | userpg = rb->user_page; | 3747 | userpg = rb->user_page; |
| 3659 | |||
| 3660 | /* | 3748 | /* |
| 3661 | * Disable preemption so as to not let the corresponding user-space | 3749 | * Disable preemption so as to not let the corresponding user-space |
| 3662 | * spin too long if we get preempted. | 3750 | * spin too long if we get preempted. |
| @@ -4009,6 +4097,7 @@ again: | |||
| 4009 | ring_buffer_attach(event, rb); | 4097 | ring_buffer_attach(event, rb); |
| 4010 | rcu_assign_pointer(event->rb, rb); | 4098 | rcu_assign_pointer(event->rb, rb); |
| 4011 | 4099 | ||
| 4100 | perf_event_init_userpage(event); | ||
| 4012 | perf_event_update_userpage(event); | 4101 | perf_event_update_userpage(event); |
| 4013 | 4102 | ||
| 4014 | unlock: | 4103 | unlock: |
| @@ -4251,7 +4340,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header, | |||
| 4251 | if (sample_type & PERF_SAMPLE_TIME) | 4340 | if (sample_type & PERF_SAMPLE_TIME) |
| 4252 | data->time = perf_clock(); | 4341 | data->time = perf_clock(); |
| 4253 | 4342 | ||
| 4254 | if (sample_type & PERF_SAMPLE_ID) | 4343 | if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) |
| 4255 | data->id = primary_event_id(event); | 4344 | data->id = primary_event_id(event); |
| 4256 | 4345 | ||
| 4257 | if (sample_type & PERF_SAMPLE_STREAM_ID) | 4346 | if (sample_type & PERF_SAMPLE_STREAM_ID) |
| @@ -4290,6 +4379,9 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle, | |||
| 4290 | 4379 | ||
| 4291 | if (sample_type & PERF_SAMPLE_CPU) | 4380 | if (sample_type & PERF_SAMPLE_CPU) |
| 4292 | perf_output_put(handle, data->cpu_entry); | 4381 | perf_output_put(handle, data->cpu_entry); |
| 4382 | |||
| 4383 | if (sample_type & PERF_SAMPLE_IDENTIFIER) | ||
| 4384 | perf_output_put(handle, data->id); | ||
| 4293 | } | 4385 | } |
| 4294 | 4386 | ||
| 4295 | void perf_event__output_id_sample(struct perf_event *event, | 4387 | void perf_event__output_id_sample(struct perf_event *event, |
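PERF_SAMPLE_IDENTIFIER exists so the ID can be found at a fixed position regardless of the rest of sample_type: it is written first in PERF_RECORD_SAMPLE and last in the sample_id trailer appended to other record types. A rough consumer-side sketch under that assumption:

    #include <stdint.h>
    #include <linux/perf_event.h>

    /* Sketch: pull the identifier out of a record, assuming sample_type
     * included PERF_SAMPLE_IDENTIFIER and sample_id_all was set. */
    static uint64_t record_id(const struct perf_event_header *hdr)
    {
            if (hdr->type == PERF_RECORD_SAMPLE)
                    return *(const uint64_t *)(hdr + 1); /* first field after the header */

            /* other records: last u64 of the trailing sample_id block */
            return *(const uint64_t *)((const char *)hdr + hdr->size - sizeof(uint64_t));
    }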
| @@ -4355,7 +4447,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
| 4355 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { | 4447 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { |
| 4356 | n = 0; | 4448 | n = 0; |
| 4357 | 4449 | ||
| 4358 | if (sub != event) | 4450 | if ((sub != event) && |
| 4451 | (sub->state == PERF_EVENT_STATE_ACTIVE)) | ||
| 4359 | sub->pmu->read(sub); | 4452 | sub->pmu->read(sub); |
| 4360 | 4453 | ||
| 4361 | values[n++] = perf_event_count(sub); | 4454 | values[n++] = perf_event_count(sub); |
| @@ -4402,6 +4495,9 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
| 4402 | 4495 | ||
| 4403 | perf_output_put(handle, *header); | 4496 | perf_output_put(handle, *header); |
| 4404 | 4497 | ||
| 4498 | if (sample_type & PERF_SAMPLE_IDENTIFIER) | ||
| 4499 | perf_output_put(handle, data->id); | ||
| 4500 | |||
| 4405 | if (sample_type & PERF_SAMPLE_IP) | 4501 | if (sample_type & PERF_SAMPLE_IP) |
| 4406 | perf_output_put(handle, data->ip); | 4502 | perf_output_put(handle, data->ip); |
| 4407 | 4503 | ||
| @@ -4462,20 +4558,6 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
| 4462 | } | 4558 | } |
| 4463 | } | 4559 | } |
| 4464 | 4560 | ||
| 4465 | if (!event->attr.watermark) { | ||
| 4466 | int wakeup_events = event->attr.wakeup_events; | ||
| 4467 | |||
| 4468 | if (wakeup_events) { | ||
| 4469 | struct ring_buffer *rb = handle->rb; | ||
| 4470 | int events = local_inc_return(&rb->events); | ||
| 4471 | |||
| 4472 | if (events >= wakeup_events) { | ||
| 4473 | local_sub(wakeup_events, &rb->events); | ||
| 4474 | local_inc(&rb->wakeup); | ||
| 4475 | } | ||
| 4476 | } | ||
| 4477 | } | ||
| 4478 | |||
| 4479 | if (sample_type & PERF_SAMPLE_BRANCH_STACK) { | 4561 | if (sample_type & PERF_SAMPLE_BRANCH_STACK) { |
| 4480 | if (data->br_stack) { | 4562 | if (data->br_stack) { |
| 4481 | size_t size; | 4563 | size_t size; |
| @@ -4511,16 +4593,34 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
| 4511 | } | 4593 | } |
| 4512 | } | 4594 | } |
| 4513 | 4595 | ||
| 4514 | if (sample_type & PERF_SAMPLE_STACK_USER) | 4596 | if (sample_type & PERF_SAMPLE_STACK_USER) { |
| 4515 | perf_output_sample_ustack(handle, | 4597 | perf_output_sample_ustack(handle, |
| 4516 | data->stack_user_size, | 4598 | data->stack_user_size, |
| 4517 | data->regs_user.regs); | 4599 | data->regs_user.regs); |
| 4600 | } | ||
| 4518 | 4601 | ||
| 4519 | if (sample_type & PERF_SAMPLE_WEIGHT) | 4602 | if (sample_type & PERF_SAMPLE_WEIGHT) |
| 4520 | perf_output_put(handle, data->weight); | 4603 | perf_output_put(handle, data->weight); |
| 4521 | 4604 | ||
| 4522 | if (sample_type & PERF_SAMPLE_DATA_SRC) | 4605 | if (sample_type & PERF_SAMPLE_DATA_SRC) |
| 4523 | perf_output_put(handle, data->data_src.val); | 4606 | perf_output_put(handle, data->data_src.val); |
| 4607 | |||
| 4608 | if (sample_type & PERF_SAMPLE_TRANSACTION) | ||
| 4609 | perf_output_put(handle, data->txn); | ||
| 4610 | |||
| 4611 | if (!event->attr.watermark) { | ||
| 4612 | int wakeup_events = event->attr.wakeup_events; | ||
| 4613 | |||
| 4614 | if (wakeup_events) { | ||
| 4615 | struct ring_buffer *rb = handle->rb; | ||
| 4616 | int events = local_inc_return(&rb->events); | ||
| 4617 | |||
| 4618 | if (events >= wakeup_events) { | ||
| 4619 | local_sub(wakeup_events, &rb->events); | ||
| 4620 | local_inc(&rb->wakeup); | ||
| 4621 | } | ||
| 4622 | } | ||
| 4623 | } | ||
| 4524 | } | 4624 | } |
| 4525 | 4625 | ||
| 4526 | void perf_prepare_sample(struct perf_event_header *header, | 4626 | void perf_prepare_sample(struct perf_event_header *header, |
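Moving the wakeup_events accounting to the tail of perf_output_sample() means a record only counts toward the wakeup threshold once it has been written in full. The userspace knob is unchanged; a hedged sketch of how a tool might use it (the opened perf_fd is assumed):

    #include <string.h>
    #include <poll.h>
    #include <linux/perf_event.h>

    /* Sketch: configure a sampling event that wakes its reader roughly
     * every 64 complete sample records. */
    static void setup_wakeup(struct perf_event_attr *attr)
    {
            memset(attr, 0, sizeof(*attr));
            attr->size          = sizeof(*attr);
            attr->type          = PERF_TYPE_HARDWARE;
            attr->config        = PERF_COUNT_HW_CPU_CYCLES;
            attr->sample_period = 100000;
            attr->sample_type   = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
            attr->wakeup_events = 64;
    }

    /* Block until the kernel signals a wakeup on the event fd. */
    static void wait_for_data(int perf_fd)
    {
            struct pollfd pfd = { .fd = perf_fd, .events = POLLIN };

            poll(&pfd, 1, -1);
    }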
| @@ -4680,12 +4780,10 @@ perf_event_read_event(struct perf_event *event, | |||
| 4680 | perf_output_end(&handle); | 4780 | perf_output_end(&handle); |
| 4681 | } | 4781 | } |
| 4682 | 4782 | ||
| 4683 | typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data); | ||
| 4684 | typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); | 4783 | typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); |
| 4685 | 4784 | ||
| 4686 | static void | 4785 | static void |
| 4687 | perf_event_aux_ctx(struct perf_event_context *ctx, | 4786 | perf_event_aux_ctx(struct perf_event_context *ctx, |
| 4688 | perf_event_aux_match_cb match, | ||
| 4689 | perf_event_aux_output_cb output, | 4787 | perf_event_aux_output_cb output, |
| 4690 | void *data) | 4788 | void *data) |
| 4691 | { | 4789 | { |
| @@ -4696,15 +4794,12 @@ perf_event_aux_ctx(struct perf_event_context *ctx, | |||
| 4696 | continue; | 4794 | continue; |
| 4697 | if (!event_filter_match(event)) | 4795 | if (!event_filter_match(event)) |
| 4698 | continue; | 4796 | continue; |
| 4699 | if (match(event, data)) | 4797 | output(event, data); |
| 4700 | output(event, data); | ||
| 4701 | } | 4798 | } |
| 4702 | } | 4799 | } |
| 4703 | 4800 | ||
| 4704 | static void | 4801 | static void |
| 4705 | perf_event_aux(perf_event_aux_match_cb match, | 4802 | perf_event_aux(perf_event_aux_output_cb output, void *data, |
| 4706 | perf_event_aux_output_cb output, | ||
| 4707 | void *data, | ||
| 4708 | struct perf_event_context *task_ctx) | 4803 | struct perf_event_context *task_ctx) |
| 4709 | { | 4804 | { |
| 4710 | struct perf_cpu_context *cpuctx; | 4805 | struct perf_cpu_context *cpuctx; |
| @@ -4717,7 +4812,7 @@ perf_event_aux(perf_event_aux_match_cb match, | |||
| 4717 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 4812 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
| 4718 | if (cpuctx->unique_pmu != pmu) | 4813 | if (cpuctx->unique_pmu != pmu) |
| 4719 | goto next; | 4814 | goto next; |
| 4720 | perf_event_aux_ctx(&cpuctx->ctx, match, output, data); | 4815 | perf_event_aux_ctx(&cpuctx->ctx, output, data); |
| 4721 | if (task_ctx) | 4816 | if (task_ctx) |
| 4722 | goto next; | 4817 | goto next; |
| 4723 | ctxn = pmu->task_ctx_nr; | 4818 | ctxn = pmu->task_ctx_nr; |
| @@ -4725,14 +4820,14 @@ perf_event_aux(perf_event_aux_match_cb match, | |||
| 4725 | goto next; | 4820 | goto next; |
| 4726 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | 4821 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); |
| 4727 | if (ctx) | 4822 | if (ctx) |
| 4728 | perf_event_aux_ctx(ctx, match, output, data); | 4823 | perf_event_aux_ctx(ctx, output, data); |
| 4729 | next: | 4824 | next: |
| 4730 | put_cpu_ptr(pmu->pmu_cpu_context); | 4825 | put_cpu_ptr(pmu->pmu_cpu_context); |
| 4731 | } | 4826 | } |
| 4732 | 4827 | ||
| 4733 | if (task_ctx) { | 4828 | if (task_ctx) { |
| 4734 | preempt_disable(); | 4829 | preempt_disable(); |
| 4735 | perf_event_aux_ctx(task_ctx, match, output, data); | 4830 | perf_event_aux_ctx(task_ctx, output, data); |
| 4736 | preempt_enable(); | 4831 | preempt_enable(); |
| 4737 | } | 4832 | } |
| 4738 | rcu_read_unlock(); | 4833 | rcu_read_unlock(); |
| @@ -4741,7 +4836,7 @@ next: | |||
| 4741 | /* | 4836 | /* |
| 4742 | * task tracking -- fork/exit | 4837 | * task tracking -- fork/exit |
| 4743 | * | 4838 | * |
| 4744 | * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task | 4839 | * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task |
| 4745 | */ | 4840 | */ |
| 4746 | 4841 | ||
| 4747 | struct perf_task_event { | 4842 | struct perf_task_event { |
| @@ -4759,6 +4854,13 @@ struct perf_task_event { | |||
| 4759 | } event_id; | 4854 | } event_id; |
| 4760 | }; | 4855 | }; |
| 4761 | 4856 | ||
| 4857 | static int perf_event_task_match(struct perf_event *event) | ||
| 4858 | { | ||
| 4859 | return event->attr.comm || event->attr.mmap || | ||
| 4860 | event->attr.mmap2 || event->attr.mmap_data || | ||
| 4861 | event->attr.task; | ||
| 4862 | } | ||
| 4863 | |||
| 4762 | static void perf_event_task_output(struct perf_event *event, | 4864 | static void perf_event_task_output(struct perf_event *event, |
| 4763 | void *data) | 4865 | void *data) |
| 4764 | { | 4866 | { |
| @@ -4768,6 +4870,9 @@ static void perf_event_task_output(struct perf_event *event, | |||
| 4768 | struct task_struct *task = task_event->task; | 4870 | struct task_struct *task = task_event->task; |
| 4769 | int ret, size = task_event->event_id.header.size; | 4871 | int ret, size = task_event->event_id.header.size; |
| 4770 | 4872 | ||
| 4873 | if (!perf_event_task_match(event)) | ||
| 4874 | return; | ||
| 4875 | |||
| 4771 | perf_event_header__init_id(&task_event->event_id.header, &sample, event); | 4876 | perf_event_header__init_id(&task_event->event_id.header, &sample, event); |
| 4772 | 4877 | ||
| 4773 | ret = perf_output_begin(&handle, event, | 4878 | ret = perf_output_begin(&handle, event, |
| @@ -4790,13 +4895,6 @@ out: | |||
| 4790 | task_event->event_id.header.size = size; | 4895 | task_event->event_id.header.size = size; |
| 4791 | } | 4896 | } |
| 4792 | 4897 | ||
| 4793 | static int perf_event_task_match(struct perf_event *event, | ||
| 4794 | void *data __maybe_unused) | ||
| 4795 | { | ||
| 4796 | return event->attr.comm || event->attr.mmap || | ||
| 4797 | event->attr.mmap_data || event->attr.task; | ||
| 4798 | } | ||
| 4799 | |||
| 4800 | static void perf_event_task(struct task_struct *task, | 4898 | static void perf_event_task(struct task_struct *task, |
| 4801 | struct perf_event_context *task_ctx, | 4899 | struct perf_event_context *task_ctx, |
| 4802 | int new) | 4900 | int new) |
| @@ -4825,8 +4923,7 @@ static void perf_event_task(struct task_struct *task, | |||
| 4825 | }, | 4923 | }, |
| 4826 | }; | 4924 | }; |
| 4827 | 4925 | ||
| 4828 | perf_event_aux(perf_event_task_match, | 4926 | perf_event_aux(perf_event_task_output, |
| 4829 | perf_event_task_output, | ||
| 4830 | &task_event, | 4927 | &task_event, |
| 4831 | task_ctx); | 4928 | task_ctx); |
| 4832 | } | 4929 | } |
| @@ -4853,6 +4950,11 @@ struct perf_comm_event { | |||
| 4853 | } event_id; | 4950 | } event_id; |
| 4854 | }; | 4951 | }; |
| 4855 | 4952 | ||
| 4953 | static int perf_event_comm_match(struct perf_event *event) | ||
| 4954 | { | ||
| 4955 | return event->attr.comm; | ||
| 4956 | } | ||
| 4957 | |||
| 4856 | static void perf_event_comm_output(struct perf_event *event, | 4958 | static void perf_event_comm_output(struct perf_event *event, |
| 4857 | void *data) | 4959 | void *data) |
| 4858 | { | 4960 | { |
| @@ -4862,6 +4964,9 @@ static void perf_event_comm_output(struct perf_event *event, | |||
| 4862 | int size = comm_event->event_id.header.size; | 4964 | int size = comm_event->event_id.header.size; |
| 4863 | int ret; | 4965 | int ret; |
| 4864 | 4966 | ||
| 4967 | if (!perf_event_comm_match(event)) | ||
| 4968 | return; | ||
| 4969 | |||
| 4865 | perf_event_header__init_id(&comm_event->event_id.header, &sample, event); | 4970 | perf_event_header__init_id(&comm_event->event_id.header, &sample, event); |
| 4866 | ret = perf_output_begin(&handle, event, | 4971 | ret = perf_output_begin(&handle, event, |
| 4867 | comm_event->event_id.header.size); | 4972 | comm_event->event_id.header.size); |
| @@ -4883,12 +4988,6 @@ out: | |||
| 4883 | comm_event->event_id.header.size = size; | 4988 | comm_event->event_id.header.size = size; |
| 4884 | } | 4989 | } |
| 4885 | 4990 | ||
| 4886 | static int perf_event_comm_match(struct perf_event *event, | ||
| 4887 | void *data __maybe_unused) | ||
| 4888 | { | ||
| 4889 | return event->attr.comm; | ||
| 4890 | } | ||
| 4891 | |||
| 4892 | static void perf_event_comm_event(struct perf_comm_event *comm_event) | 4991 | static void perf_event_comm_event(struct perf_comm_event *comm_event) |
| 4893 | { | 4992 | { |
| 4894 | char comm[TASK_COMM_LEN]; | 4993 | char comm[TASK_COMM_LEN]; |
| @@ -4903,8 +5002,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
| 4903 | 5002 | ||
| 4904 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; | 5003 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; |
| 4905 | 5004 | ||
| 4906 | perf_event_aux(perf_event_comm_match, | 5005 | perf_event_aux(perf_event_comm_output, |
| 4907 | perf_event_comm_output, | ||
| 4908 | comm_event, | 5006 | comm_event, |
| 4909 | NULL); | 5007 | NULL); |
| 4910 | } | 5008 | } |
| @@ -4955,6 +5053,9 @@ struct perf_mmap_event { | |||
| 4955 | 5053 | ||
| 4956 | const char *file_name; | 5054 | const char *file_name; |
| 4957 | int file_size; | 5055 | int file_size; |
| 5056 | int maj, min; | ||
| 5057 | u64 ino; | ||
| 5058 | u64 ino_generation; | ||
| 4958 | 5059 | ||
| 4959 | struct { | 5060 | struct { |
| 4960 | struct perf_event_header header; | 5061 | struct perf_event_header header; |
| @@ -4967,6 +5068,17 @@ struct perf_mmap_event { | |||
| 4967 | } event_id; | 5068 | } event_id; |
| 4968 | }; | 5069 | }; |
| 4969 | 5070 | ||
| 5071 | static int perf_event_mmap_match(struct perf_event *event, | ||
| 5072 | void *data) | ||
| 5073 | { | ||
| 5074 | struct perf_mmap_event *mmap_event = data; | ||
| 5075 | struct vm_area_struct *vma = mmap_event->vma; | ||
| 5076 | int executable = vma->vm_flags & VM_EXEC; | ||
| 5077 | |||
| 5078 | return (!executable && event->attr.mmap_data) || | ||
| 5079 | (executable && (event->attr.mmap || event->attr.mmap2)); | ||
| 5080 | } | ||
| 5081 | |||
| 4970 | static void perf_event_mmap_output(struct perf_event *event, | 5082 | static void perf_event_mmap_output(struct perf_event *event, |
| 4971 | void *data) | 5083 | void *data) |
| 4972 | { | 5084 | { |
| @@ -4976,6 +5088,17 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
| 4976 | int size = mmap_event->event_id.header.size; | 5088 | int size = mmap_event->event_id.header.size; |
| 4977 | int ret; | 5089 | int ret; |
| 4978 | 5090 | ||
| 5091 | if (!perf_event_mmap_match(event, data)) | ||
| 5092 | return; | ||
| 5093 | |||
| 5094 | if (event->attr.mmap2) { | ||
| 5095 | mmap_event->event_id.header.type = PERF_RECORD_MMAP2; | ||
| 5096 | mmap_event->event_id.header.size += sizeof(mmap_event->maj); | ||
| 5097 | mmap_event->event_id.header.size += sizeof(mmap_event->min); | ||
| 5098 | mmap_event->event_id.header.size += sizeof(mmap_event->ino); | ||
| 5099 | mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); | ||
| 5100 | } | ||
| 5101 | |||
| 4979 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); | 5102 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); |
| 4980 | ret = perf_output_begin(&handle, event, | 5103 | ret = perf_output_begin(&handle, event, |
| 4981 | mmap_event->event_id.header.size); | 5104 | mmap_event->event_id.header.size); |
| @@ -4986,6 +5109,14 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
| 4986 | mmap_event->event_id.tid = perf_event_tid(event, current); | 5109 | mmap_event->event_id.tid = perf_event_tid(event, current); |
| 4987 | 5110 | ||
| 4988 | perf_output_put(&handle, mmap_event->event_id); | 5111 | perf_output_put(&handle, mmap_event->event_id); |
| 5112 | |||
| 5113 | if (event->attr.mmap2) { | ||
| 5114 | perf_output_put(&handle, mmap_event->maj); | ||
| 5115 | perf_output_put(&handle, mmap_event->min); | ||
| 5116 | perf_output_put(&handle, mmap_event->ino); | ||
| 5117 | perf_output_put(&handle, mmap_event->ino_generation); | ||
| 5118 | } | ||
| 5119 | |||
| 4989 | __output_copy(&handle, mmap_event->file_name, | 5120 | __output_copy(&handle, mmap_event->file_name, |
| 4990 | mmap_event->file_size); | 5121 | mmap_event->file_size); |
| 4991 | 5122 | ||
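For stream parsers, the mmap2 variant extends PERF_RECORD_MMAP with device and inode information, emitted in exactly the order of the perf_output_put() calls above. The implied record body (field names here are illustrative, not copied from a UAPI header):

    #include <stdint.h>
    #include <linux/perf_event.h>

    /* Implied PERF_RECORD_MMAP2 layout, per the output order above. */
    struct perf_record_mmap2 {
            struct perf_event_header header;   /* .type == PERF_RECORD_MMAP2 */
            uint32_t pid, tid;
            uint64_t addr, len, pgoff;
            uint32_t maj, min;
            uint64_t ino, ino_generation;
            char     filename[];               /* NUL-terminated, padded to 8 bytes */
            /* optional sample_id trailer follows when attr.sample_id_all is set */
    };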
| @@ -4996,82 +5127,89 @@ out: | |||
| 4996 | mmap_event->event_id.header.size = size; | 5127 | mmap_event->event_id.header.size = size; |
| 4997 | } | 5128 | } |
| 4998 | 5129 | ||
| 4999 | static int perf_event_mmap_match(struct perf_event *event, | ||
| 5000 | void *data) | ||
| 5001 | { | ||
| 5002 | struct perf_mmap_event *mmap_event = data; | ||
| 5003 | struct vm_area_struct *vma = mmap_event->vma; | ||
| 5004 | int executable = vma->vm_flags & VM_EXEC; | ||
| 5005 | |||
| 5006 | return (!executable && event->attr.mmap_data) || | ||
| 5007 | (executable && event->attr.mmap); | ||
| 5008 | } | ||
| 5009 | |||
| 5010 | static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | 5130 | static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) |
| 5011 | { | 5131 | { |
| 5012 | struct vm_area_struct *vma = mmap_event->vma; | 5132 | struct vm_area_struct *vma = mmap_event->vma; |
| 5013 | struct file *file = vma->vm_file; | 5133 | struct file *file = vma->vm_file; |
| 5134 | int maj = 0, min = 0; | ||
| 5135 | u64 ino = 0, gen = 0; | ||
| 5014 | unsigned int size; | 5136 | unsigned int size; |
| 5015 | char tmp[16]; | 5137 | char tmp[16]; |
| 5016 | char *buf = NULL; | 5138 | char *buf = NULL; |
| 5017 | const char *name; | 5139 | char *name; |
| 5018 | |||
| 5019 | memset(tmp, 0, sizeof(tmp)); | ||
| 5020 | 5140 | ||
| 5021 | if (file) { | 5141 | if (file) { |
| 5142 | struct inode *inode; | ||
| 5143 | dev_t dev; | ||
| 5144 | |||
| 5145 | buf = kmalloc(PATH_MAX, GFP_KERNEL); | ||
| 5146 | if (!buf) { | ||
| 5147 | name = "//enomem"; | ||
| 5148 | goto cpy_name; | ||
| 5149 | } | ||
| 5022 | /* | 5150 | /* |
| 5023 | * d_path works from the end of the rb backwards, so we | 5151 | * d_path() works from the end of the rb backwards, so we |
| 5024 | * need to add enough zero bytes after the string to handle | 5152 | * need to add enough zero bytes after the string to handle |
| 5025 | * the 64bit alignment we do later. | 5153 | * the 64bit alignment we do later. |
| 5026 | */ | 5154 | */ |
| 5027 | buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); | 5155 | name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64)); |
| 5028 | if (!buf) { | ||
| 5029 | name = strncpy(tmp, "//enomem", sizeof(tmp)); | ||
| 5030 | goto got_name; | ||
| 5031 | } | ||
| 5032 | name = d_path(&file->f_path, buf, PATH_MAX); | ||
| 5033 | if (IS_ERR(name)) { | 5156 | if (IS_ERR(name)) { |
| 5034 | name = strncpy(tmp, "//toolong", sizeof(tmp)); | 5157 | name = "//toolong"; |
| 5035 | goto got_name; | 5158 | goto cpy_name; |
| 5036 | } | 5159 | } |
| 5160 | inode = file_inode(vma->vm_file); | ||
| 5161 | dev = inode->i_sb->s_dev; | ||
| 5162 | ino = inode->i_ino; | ||
| 5163 | gen = inode->i_generation; | ||
| 5164 | maj = MAJOR(dev); | ||
| 5165 | min = MINOR(dev); | ||
| 5166 | goto got_name; | ||
| 5037 | } else { | 5167 | } else { |
| 5038 | if (arch_vma_name(mmap_event->vma)) { | 5168 | name = (char *)arch_vma_name(vma); |
| 5039 | name = strncpy(tmp, arch_vma_name(mmap_event->vma), | 5169 | if (name) |
| 5040 | sizeof(tmp) - 1); | 5170 | goto cpy_name; |
| 5041 | tmp[sizeof(tmp) - 1] = '\0'; | ||
| 5042 | goto got_name; | ||
| 5043 | } | ||
| 5044 | 5171 | ||
| 5045 | if (!vma->vm_mm) { | 5172 | if (vma->vm_start <= vma->vm_mm->start_brk && |
| 5046 | name = strncpy(tmp, "[vdso]", sizeof(tmp)); | ||
| 5047 | goto got_name; | ||
| 5048 | } else if (vma->vm_start <= vma->vm_mm->start_brk && | ||
| 5049 | vma->vm_end >= vma->vm_mm->brk) { | 5173 | vma->vm_end >= vma->vm_mm->brk) { |
| 5050 | name = strncpy(tmp, "[heap]", sizeof(tmp)); | 5174 | name = "[heap]"; |
| 5051 | goto got_name; | 5175 | goto cpy_name; |
| 5052 | } else if (vma->vm_start <= vma->vm_mm->start_stack && | 5176 | } |
| 5177 | if (vma->vm_start <= vma->vm_mm->start_stack && | ||
| 5053 | vma->vm_end >= vma->vm_mm->start_stack) { | 5178 | vma->vm_end >= vma->vm_mm->start_stack) { |
| 5054 | name = strncpy(tmp, "[stack]", sizeof(tmp)); | 5179 | name = "[stack]"; |
| 5055 | goto got_name; | 5180 | goto cpy_name; |
| 5056 | } | 5181 | } |
| 5057 | 5182 | ||
| 5058 | name = strncpy(tmp, "//anon", sizeof(tmp)); | 5183 | name = "//anon"; |
| 5059 | goto got_name; | 5184 | goto cpy_name; |
| 5060 | } | 5185 | } |
| 5061 | 5186 | ||
| 5187 | cpy_name: | ||
| 5188 | strlcpy(tmp, name, sizeof(tmp)); | ||
| 5189 | name = tmp; | ||
| 5062 | got_name: | 5190 | got_name: |
| 5063 | size = ALIGN(strlen(name)+1, sizeof(u64)); | 5191 | /* |
| 5192 | * Since our buffer works in 8 byte units we need to align our string | ||
| 5193 | * size to a multiple of 8. However, we must guarantee the tail end is | ||
| 5194 | * zero'd out to avoid leaking random bits to userspace. | ||
| 5195 | */ | ||
| 5196 | size = strlen(name)+1; | ||
| 5197 | while (!IS_ALIGNED(size, sizeof(u64))) | ||
| 5198 | name[size++] = '\0'; | ||
| 5064 | 5199 | ||
| 5065 | mmap_event->file_name = name; | 5200 | mmap_event->file_name = name; |
| 5066 | mmap_event->file_size = size; | 5201 | mmap_event->file_size = size; |
| 5202 | mmap_event->maj = maj; | ||
| 5203 | mmap_event->min = min; | ||
| 5204 | mmap_event->ino = ino; | ||
| 5205 | mmap_event->ino_generation = gen; | ||
| 5067 | 5206 | ||
| 5068 | if (!(vma->vm_flags & VM_EXEC)) | 5207 | if (!(vma->vm_flags & VM_EXEC)) |
| 5069 | mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; | 5208 | mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; |
| 5070 | 5209 | ||
| 5071 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; | 5210 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; |
| 5072 | 5211 | ||
| 5073 | perf_event_aux(perf_event_mmap_match, | 5212 | perf_event_aux(perf_event_mmap_output, |
| 5074 | perf_event_mmap_output, | ||
| 5075 | mmap_event, | 5213 | mmap_event, |
| 5076 | NULL); | 5214 | NULL); |
| 5077 | 5215 | ||
| @@ -5101,6 +5239,10 @@ void perf_event_mmap(struct vm_area_struct *vma) | |||
| 5101 | .len = vma->vm_end - vma->vm_start, | 5239 | .len = vma->vm_end - vma->vm_start, |
| 5102 | .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, | 5240 | .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, |
| 5103 | }, | 5241 | }, |
| 5242 | /* .maj (attr_mmap2 only) */ | ||
| 5243 | /* .min (attr_mmap2 only) */ | ||
| 5244 | /* .ino (attr_mmap2 only) */ | ||
| 5245 | /* .ino_generation (attr_mmap2 only) */ | ||
| 5104 | }; | 5246 | }; |
| 5105 | 5247 | ||
| 5106 | perf_event_mmap_event(&mmap_event); | 5248 | perf_event_mmap_event(&mmap_event); |
| @@ -5178,6 +5320,7 @@ static int __perf_event_overflow(struct perf_event *event, | |||
| 5178 | __this_cpu_inc(perf_throttled_count); | 5320 | __this_cpu_inc(perf_throttled_count); |
| 5179 | hwc->interrupts = MAX_INTERRUPTS; | 5321 | hwc->interrupts = MAX_INTERRUPTS; |
| 5180 | perf_log_throttle(event, 0); | 5322 | perf_log_throttle(event, 0); |
| 5323 | tick_nohz_full_kick(); | ||
| 5181 | ret = 1; | 5324 | ret = 1; |
| 5182 | } | 5325 | } |
| 5183 | } | 5326 | } |
| @@ -6189,6 +6332,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) | |||
| 6189 | 6332 | ||
| 6190 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); | 6333 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); |
| 6191 | } | 6334 | } |
| 6335 | static DEVICE_ATTR_RO(type); | ||
| 6192 | 6336 | ||
| 6193 | static ssize_t | 6337 | static ssize_t |
| 6194 | perf_event_mux_interval_ms_show(struct device *dev, | 6338 | perf_event_mux_interval_ms_show(struct device *dev, |
| @@ -6233,17 +6377,19 @@ perf_event_mux_interval_ms_store(struct device *dev, | |||
| 6233 | 6377 | ||
| 6234 | return count; | 6378 | return count; |
| 6235 | } | 6379 | } |
| 6380 | static DEVICE_ATTR_RW(perf_event_mux_interval_ms); | ||
| 6236 | 6381 | ||
| 6237 | static struct device_attribute pmu_dev_attrs[] = { | 6382 | static struct attribute *pmu_dev_attrs[] = { |
| 6238 | __ATTR_RO(type), | 6383 | &dev_attr_type.attr, |
| 6239 | __ATTR_RW(perf_event_mux_interval_ms), | 6384 | &dev_attr_perf_event_mux_interval_ms.attr, |
| 6240 | __ATTR_NULL, | 6385 | NULL, |
| 6241 | }; | 6386 | }; |
| 6387 | ATTRIBUTE_GROUPS(pmu_dev); | ||
| 6242 | 6388 | ||
| 6243 | static int pmu_bus_running; | 6389 | static int pmu_bus_running; |
| 6244 | static struct bus_type pmu_bus = { | 6390 | static struct bus_type pmu_bus = { |
| 6245 | .name = "event_source", | 6391 | .name = "event_source", |
| 6246 | .dev_attrs = pmu_dev_attrs, | 6392 | .dev_groups = pmu_dev_groups, |
| 6247 | }; | 6393 | }; |
| 6248 | 6394 | ||
| 6249 | static void pmu_dev_release(struct device *dev) | 6395 | static void pmu_dev_release(struct device *dev) |
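This is the stock conversion away from bus_type.dev_attrs: each attribute becomes a DEVICE_ATTR_RO()/DEVICE_ATTR_RW() declaration, the pointers are collected in an attribute array, and ATTRIBUTE_GROUPS() generates the *_groups table that .dev_groups expects. The same pattern for a made-up driver (names are illustrative):

    #include <linux/device.h>

    /* Generic sketch of the dev_attrs -> dev_groups conversion. */
    static ssize_t foo_show(struct device *dev,
                            struct device_attribute *attr, char *page)
    {
            return snprintf(page, PAGE_SIZE - 1, "%d\n", 42);
    }
    static DEVICE_ATTR_RO(foo);                /* emits dev_attr_foo */

    static struct attribute *foo_dev_attrs[] = {
            &dev_attr_foo.attr,
            NULL,
    };
    ATTRIBUTE_GROUPS(foo_dev);                 /* emits foo_dev_groups */

    static struct bus_type foo_bus = {
            .name       = "foo",
            .dev_groups = foo_dev_groups,
    };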
| @@ -6443,6 +6589,44 @@ unlock: | |||
| 6443 | return pmu; | 6589 | return pmu; |
| 6444 | } | 6590 | } |
| 6445 | 6591 | ||
| 6592 | static void account_event_cpu(struct perf_event *event, int cpu) | ||
| 6593 | { | ||
| 6594 | if (event->parent) | ||
| 6595 | return; | ||
| 6596 | |||
| 6597 | if (has_branch_stack(event)) { | ||
| 6598 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
| 6599 | atomic_inc(&per_cpu(perf_branch_stack_events, cpu)); | ||
| 6600 | } | ||
| 6601 | if (is_cgroup_event(event)) | ||
| 6602 | atomic_inc(&per_cpu(perf_cgroup_events, cpu)); | ||
| 6603 | } | ||
| 6604 | |||
| 6605 | static void account_event(struct perf_event *event) | ||
| 6606 | { | ||
| 6607 | if (event->parent) | ||
| 6608 | return; | ||
| 6609 | |||
| 6610 | if (event->attach_state & PERF_ATTACH_TASK) | ||
| 6611 | static_key_slow_inc(&perf_sched_events.key); | ||
| 6612 | if (event->attr.mmap || event->attr.mmap_data) | ||
| 6613 | atomic_inc(&nr_mmap_events); | ||
| 6614 | if (event->attr.comm) | ||
| 6615 | atomic_inc(&nr_comm_events); | ||
| 6616 | if (event->attr.task) | ||
| 6617 | atomic_inc(&nr_task_events); | ||
| 6618 | if (event->attr.freq) { | ||
| 6619 | if (atomic_inc_return(&nr_freq_events) == 1) | ||
| 6620 | tick_nohz_full_kick_all(); | ||
| 6621 | } | ||
| 6622 | if (has_branch_stack(event)) | ||
| 6623 | static_key_slow_inc(&perf_sched_events.key); | ||
| 6624 | if (is_cgroup_event(event)) | ||
| 6625 | static_key_slow_inc(&perf_sched_events.key); | ||
| 6626 | |||
| 6627 | account_event_cpu(event, event->cpu); | ||
| 6628 | } | ||
| 6629 | |||
| 6446 | /* | 6630 | /* |
| 6447 | * Allocate and initialize an event structure | 6631 | * Allocate and initialize an event structure |
| 6448 | */ | 6632 | */ |
| @@ -6457,7 +6641,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 6457 | struct pmu *pmu; | 6641 | struct pmu *pmu; |
| 6458 | struct perf_event *event; | 6642 | struct perf_event *event; |
| 6459 | struct hw_perf_event *hwc; | 6643 | struct hw_perf_event *hwc; |
| 6460 | long err; | 6644 | long err = -EINVAL; |
| 6461 | 6645 | ||
| 6462 | if ((unsigned)cpu >= nr_cpu_ids) { | 6646 | if ((unsigned)cpu >= nr_cpu_ids) { |
| 6463 | if (!task || cpu != -1) | 6647 | if (!task || cpu != -1) |
| @@ -6540,49 +6724,35 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 6540 | * we currently do not support PERF_FORMAT_GROUP on inherited events | 6724 | * we currently do not support PERF_FORMAT_GROUP on inherited events |
| 6541 | */ | 6725 | */ |
| 6542 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) | 6726 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) |
| 6543 | goto done; | 6727 | goto err_ns; |
| 6544 | 6728 | ||
| 6545 | pmu = perf_init_event(event); | 6729 | pmu = perf_init_event(event); |
| 6546 | |||
| 6547 | done: | ||
| 6548 | err = 0; | ||
| 6549 | if (!pmu) | 6730 | if (!pmu) |
| 6550 | err = -EINVAL; | 6731 | goto err_ns; |
| 6551 | else if (IS_ERR(pmu)) | 6732 | else if (IS_ERR(pmu)) { |
| 6552 | err = PTR_ERR(pmu); | 6733 | err = PTR_ERR(pmu); |
| 6553 | 6734 | goto err_ns; | |
| 6554 | if (err) { | ||
| 6555 | if (event->ns) | ||
| 6556 | put_pid_ns(event->ns); | ||
| 6557 | kfree(event); | ||
| 6558 | return ERR_PTR(err); | ||
| 6559 | } | 6735 | } |
| 6560 | 6736 | ||
| 6561 | if (!event->parent) { | 6737 | if (!event->parent) { |
| 6562 | if (event->attach_state & PERF_ATTACH_TASK) | ||
| 6563 | static_key_slow_inc(&perf_sched_events.key); | ||
| 6564 | if (event->attr.mmap || event->attr.mmap_data) | ||
| 6565 | atomic_inc(&nr_mmap_events); | ||
| 6566 | if (event->attr.comm) | ||
| 6567 | atomic_inc(&nr_comm_events); | ||
| 6568 | if (event->attr.task) | ||
| 6569 | atomic_inc(&nr_task_events); | ||
| 6570 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { | 6738 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { |
| 6571 | err = get_callchain_buffers(); | 6739 | err = get_callchain_buffers(); |
| 6572 | if (err) { | 6740 | if (err) |
| 6573 | free_event(event); | 6741 | goto err_pmu; |
| 6574 | return ERR_PTR(err); | ||
| 6575 | } | ||
| 6576 | } | ||
| 6577 | if (has_branch_stack(event)) { | ||
| 6578 | static_key_slow_inc(&perf_sched_events.key); | ||
| 6579 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
| 6580 | atomic_inc(&per_cpu(perf_branch_stack_events, | ||
| 6581 | event->cpu)); | ||
| 6582 | } | 6742 | } |
| 6583 | } | 6743 | } |
| 6584 | 6744 | ||
| 6585 | return event; | 6745 | return event; |
| 6746 | |||
| 6747 | err_pmu: | ||
| 6748 | if (event->destroy) | ||
| 6749 | event->destroy(event); | ||
| 6750 | err_ns: | ||
| 6751 | if (event->ns) | ||
| 6752 | put_pid_ns(event->ns); | ||
| 6753 | kfree(event); | ||
| 6754 | |||
| 6755 | return ERR_PTR(err); | ||
| 6586 | } | 6756 | } |
| 6587 | 6757 | ||
| 6588 | static int perf_copy_attr(struct perf_event_attr __user *uattr, | 6758 | static int perf_copy_attr(struct perf_event_attr __user *uattr, |
| @@ -6640,6 +6810,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
| 6640 | if (ret) | 6810 | if (ret) |
| 6641 | return -EFAULT; | 6811 | return -EFAULT; |
| 6642 | 6812 | ||
| 6813 | /* disabled for now */ | ||
| 6814 | if (attr->mmap2) | ||
| 6815 | return -EINVAL; | ||
| 6816 | |||
| 6643 | if (attr->__reserved_1) | 6817 | if (attr->__reserved_1) |
| 6644 | return -EINVAL; | 6818 | return -EINVAL; |
| 6645 | 6819 | ||
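Because attr.mmap2 is rejected with -EINVAL for the moment, a tool that wants the richer records has to probe and fall back. A hedged userspace sketch (the attr setup and error handling around it are assumptions):

    #include <errno.h>
    #include <unistd.h>
    #include <sys/types.h>
    #include <sys/syscall.h>
    #include <linux/perf_event.h>

    /* Sketch: prefer MMAP2 records, fall back to plain MMAP on EINVAL. */
    static int open_with_mmap2(struct perf_event_attr *attr, pid_t pid, int cpu)
    {
            int fd;

            attr->mmap2 = 1;
            fd = syscall(__NR_perf_event_open, attr, pid, cpu, -1, 0);
            if (fd < 0 && errno == EINVAL) {
                    attr->mmap2 = 0;
                    attr->mmap  = 1;
                    fd = syscall(__NR_perf_event_open, attr, pid, cpu, -1, 0);
            }
            return fd;
    }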
| @@ -6864,17 +7038,14 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 6864 | 7038 | ||
| 6865 | if (flags & PERF_FLAG_PID_CGROUP) { | 7039 | if (flags & PERF_FLAG_PID_CGROUP) { |
| 6866 | err = perf_cgroup_connect(pid, event, &attr, group_leader); | 7040 | err = perf_cgroup_connect(pid, event, &attr, group_leader); |
| 6867 | if (err) | 7041 | if (err) { |
| 6868 | goto err_alloc; | 7042 | __free_event(event); |
| 6869 | /* | 7043 | goto err_task; |
| 6870 | * one more event: | 7044 | } |
| 6871 | * - that has cgroup constraint on event->cpu | ||
| 6872 | * - that may need work on context switch | ||
| 6873 | */ | ||
| 6874 | atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); | ||
| 6875 | static_key_slow_inc(&perf_sched_events.key); | ||
| 6876 | } | 7045 | } |
| 6877 | 7046 | ||
| 7047 | account_event(event); | ||
| 7048 | |||
| 6878 | /* | 7049 | /* |
| 6879 | * Special case software events and allow them to be part of | 7050 | * Special case software events and allow them to be part of |
| 6880 | * any hardware group. | 7051 | * any hardware group. |
| @@ -6998,7 +7169,6 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 6998 | } | 7169 | } |
| 6999 | 7170 | ||
| 7000 | perf_install_in_context(ctx, event, event->cpu); | 7171 | perf_install_in_context(ctx, event, event->cpu); |
| 7001 | ++ctx->generation; | ||
| 7002 | perf_unpin_context(ctx); | 7172 | perf_unpin_context(ctx); |
| 7003 | mutex_unlock(&ctx->mutex); | 7173 | mutex_unlock(&ctx->mutex); |
| 7004 | 7174 | ||
| @@ -7070,6 +7240,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
| 7070 | goto err; | 7240 | goto err; |
| 7071 | } | 7241 | } |
| 7072 | 7242 | ||
| 7243 | account_event(event); | ||
| 7244 | |||
| 7073 | ctx = find_get_context(event->pmu, task, cpu); | 7245 | ctx = find_get_context(event->pmu, task, cpu); |
| 7074 | if (IS_ERR(ctx)) { | 7246 | if (IS_ERR(ctx)) { |
| 7075 | err = PTR_ERR(ctx); | 7247 | err = PTR_ERR(ctx); |
| @@ -7079,7 +7251,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
| 7079 | WARN_ON_ONCE(ctx->parent_ctx); | 7251 | WARN_ON_ONCE(ctx->parent_ctx); |
| 7080 | mutex_lock(&ctx->mutex); | 7252 | mutex_lock(&ctx->mutex); |
| 7081 | perf_install_in_context(ctx, event, cpu); | 7253 | perf_install_in_context(ctx, event, cpu); |
| 7082 | ++ctx->generation; | ||
| 7083 | perf_unpin_context(ctx); | 7254 | perf_unpin_context(ctx); |
| 7084 | mutex_unlock(&ctx->mutex); | 7255 | mutex_unlock(&ctx->mutex); |
| 7085 | 7256 | ||
| @@ -7106,18 +7277,20 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) | |||
| 7106 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, | 7277 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, |
| 7107 | event_entry) { | 7278 | event_entry) { |
| 7108 | perf_remove_from_context(event); | 7279 | perf_remove_from_context(event); |
| 7280 | unaccount_event_cpu(event, src_cpu); | ||
| 7109 | put_ctx(src_ctx); | 7281 | put_ctx(src_ctx); |
| 7110 | list_add(&event->event_entry, &events); | 7282 | list_add(&event->migrate_entry, &events); |
| 7111 | } | 7283 | } |
| 7112 | mutex_unlock(&src_ctx->mutex); | 7284 | mutex_unlock(&src_ctx->mutex); |
| 7113 | 7285 | ||
| 7114 | synchronize_rcu(); | 7286 | synchronize_rcu(); |
| 7115 | 7287 | ||
| 7116 | mutex_lock(&dst_ctx->mutex); | 7288 | mutex_lock(&dst_ctx->mutex); |
| 7117 | list_for_each_entry_safe(event, tmp, &events, event_entry) { | 7289 | list_for_each_entry_safe(event, tmp, &events, migrate_entry) { |
| 7118 | list_del(&event->event_entry); | 7290 | list_del(&event->migrate_entry); |
| 7119 | if (event->state >= PERF_EVENT_STATE_OFF) | 7291 | if (event->state >= PERF_EVENT_STATE_OFF) |
| 7120 | event->state = PERF_EVENT_STATE_INACTIVE; | 7292 | event->state = PERF_EVENT_STATE_INACTIVE; |
| 7293 | account_event_cpu(event, dst_cpu); | ||
| 7121 | perf_install_in_context(dst_ctx, event, dst_cpu); | 7294 | perf_install_in_context(dst_ctx, event, dst_cpu); |
| 7122 | get_ctx(dst_ctx); | 7295 | get_ctx(dst_ctx); |
| 7123 | } | 7296 | } |
| @@ -7798,7 +7971,8 @@ unlock: | |||
| 7798 | device_initcall(perf_event_sysfs_init); | 7971 | device_initcall(perf_event_sysfs_init); |
| 7799 | 7972 | ||
| 7800 | #ifdef CONFIG_CGROUP_PERF | 7973 | #ifdef CONFIG_CGROUP_PERF |
| 7801 | static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) | 7974 | static struct cgroup_subsys_state * |
| 7975 | perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | ||
| 7802 | { | 7976 | { |
| 7803 | struct perf_cgroup *jc; | 7977 | struct perf_cgroup *jc; |
| 7804 | 7978 | ||
| @@ -7815,11 +7989,10 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) | |||
| 7815 | return &jc->css; | 7989 | return &jc->css; |
| 7816 | } | 7990 | } |
| 7817 | 7991 | ||
| 7818 | static void perf_cgroup_css_free(struct cgroup *cont) | 7992 | static void perf_cgroup_css_free(struct cgroup_subsys_state *css) |
| 7819 | { | 7993 | { |
| 7820 | struct perf_cgroup *jc; | 7994 | struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css); |
| 7821 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), | 7995 | |
| 7822 | struct perf_cgroup, css); | ||
| 7823 | free_percpu(jc->info); | 7996 | free_percpu(jc->info); |
| 7824 | kfree(jc); | 7997 | kfree(jc); |
| 7825 | } | 7998 | } |
| @@ -7831,15 +8004,17 @@ static int __perf_cgroup_move(void *info) | |||
| 7831 | return 0; | 8004 | return 0; |
| 7832 | } | 8005 | } |
| 7833 | 8006 | ||
| 7834 | static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 8007 | static void perf_cgroup_attach(struct cgroup_subsys_state *css, |
| 8008 | struct cgroup_taskset *tset) | ||
| 7835 | { | 8009 | { |
| 7836 | struct task_struct *task; | 8010 | struct task_struct *task; |
| 7837 | 8011 | ||
| 7838 | cgroup_taskset_for_each(task, cgrp, tset) | 8012 | cgroup_taskset_for_each(task, css, tset) |
| 7839 | task_function_call(task, __perf_cgroup_move, task); | 8013 | task_function_call(task, __perf_cgroup_move, task); |
| 7840 | } | 8014 | } |
| 7841 | 8015 | ||
| 7842 | static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, | 8016 | static void perf_cgroup_exit(struct cgroup_subsys_state *css, |
| 8017 | struct cgroup_subsys_state *old_css, | ||
| 7843 | struct task_struct *task) | 8018 | struct task_struct *task) |
| 7844 | { | 8019 | { |
| 7845 | /* | 8020 | /* |
diff --git a/kernel/events/internal.h b/kernel/events/internal.h index ca6599723be5..569b218782ad 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h | |||
| @@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) | |||
| 82 | } | 82 | } |
| 83 | 83 | ||
| 84 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ | 84 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ |
| 85 | static inline unsigned int \ | 85 | static inline unsigned long \ |
| 86 | func_name(struct perf_output_handle *handle, \ | 86 | func_name(struct perf_output_handle *handle, \ |
| 87 | const void *buf, unsigned int len) \ | 87 | const void *buf, unsigned long len) \ |
| 88 | { \ | 88 | { \ |
| 89 | unsigned long size, written; \ | 89 | unsigned long size, written; \ |
| 90 | \ | 90 | \ |
| 91 | do { \ | 91 | do { \ |
| 92 | size = min_t(unsigned long, handle->size, len); \ | 92 | size = min(handle->size, len); \ |
| 93 | \ | ||
| 94 | written = memcpy_func(handle->addr, buf, size); \ | 93 | written = memcpy_func(handle->addr, buf, size); \ |
| 94 | written = size - written; \ | ||
| 95 | \ | 95 | \ |
| 96 | len -= written; \ | 96 | len -= written; \ |
| 97 | handle->addr += written; \ | 97 | handle->addr += written; \ |
| @@ -110,20 +110,37 @@ func_name(struct perf_output_handle *handle, \ | |||
| 110 | return len; \ | 110 | return len; \ |
| 111 | } | 111 | } |
| 112 | 112 | ||
| 113 | static inline int memcpy_common(void *dst, const void *src, size_t n) | 113 | static inline unsigned long |
| 114 | memcpy_common(void *dst, const void *src, unsigned long n) | ||
| 114 | { | 115 | { |
| 115 | memcpy(dst, src, n); | 116 | memcpy(dst, src, n); |
| 116 | return n; | 117 | return 0; |
| 117 | } | 118 | } |
| 118 | 119 | ||
| 119 | DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) | 120 | DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) |
| 120 | 121 | ||
| 121 | #define MEMCPY_SKIP(dst, src, n) (n) | 122 | static inline unsigned long |
| 123 | memcpy_skip(void *dst, const void *src, unsigned long n) | ||
| 124 | { | ||
| 125 | return 0; | ||
| 126 | } | ||
| 122 | 127 | ||
| 123 | DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) | 128 | DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip) |
| 124 | 129 | ||
| 125 | #ifndef arch_perf_out_copy_user | 130 | #ifndef arch_perf_out_copy_user |
| 126 | #define arch_perf_out_copy_user __copy_from_user_inatomic | 131 | #define arch_perf_out_copy_user arch_perf_out_copy_user |
| 132 | |||
| 133 | static inline unsigned long | ||
| 134 | arch_perf_out_copy_user(void *dst, const void *src, unsigned long n) | ||
| 135 | { | ||
| 136 | unsigned long ret; | ||
| 137 | |||
| 138 | pagefault_disable(); | ||
| 139 | ret = __copy_from_user_inatomic(dst, src, n); | ||
| 140 | pagefault_enable(); | ||
| 141 | |||
| 142 | return ret; | ||
| 143 | } | ||
| 127 | #endif | 144 | #endif |
| 128 | 145 | ||
| 129 | DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) | 146 | DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) |
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index cd55144270b5..e8b168af135b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
| @@ -12,40 +12,10 @@ | |||
| 12 | #include <linux/perf_event.h> | 12 | #include <linux/perf_event.h> |
| 13 | #include <linux/vmalloc.h> | 13 | #include <linux/vmalloc.h> |
| 14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
| 15 | #include <linux/circ_buf.h> | ||
| 15 | 16 | ||
| 16 | #include "internal.h" | 17 | #include "internal.h" |
| 17 | 18 | ||
| 18 | static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, | ||
| 19 | unsigned long offset, unsigned long head) | ||
| 20 | { | ||
| 21 | unsigned long sz = perf_data_size(rb); | ||
| 22 | unsigned long mask = sz - 1; | ||
| 23 | |||
| 24 | /* | ||
| 25 | * check if user-writable | ||
| 26 | * overwrite : over-write its own tail | ||
| 27 | * !overwrite: buffer possibly drops events. | ||
| 28 | */ | ||
| 29 | if (rb->overwrite) | ||
| 30 | return true; | ||
| 31 | |||
| 32 | /* | ||
| 33 | * verify that payload is not bigger than buffer | ||
| 34 | * otherwise masking logic may fail to detect | ||
| 35 | * the "not enough space" condition | ||
| 36 | */ | ||
| 37 | if ((head - offset) > sz) | ||
| 38 | return false; | ||
| 39 | |||
| 40 | offset = (offset - tail) & mask; | ||
| 41 | head = (head - tail) & mask; | ||
| 42 | |||
| 43 | if ((int)(head - offset) < 0) | ||
| 44 | return false; | ||
| 45 | |||
| 46 | return true; | ||
| 47 | } | ||
| 48 | |||
| 49 | static void perf_output_wakeup(struct perf_output_handle *handle) | 19 | static void perf_output_wakeup(struct perf_output_handle *handle) |
| 50 | { | 20 | { |
| 51 | atomic_set(&handle->rb->poll, POLL_IN); | 21 | atomic_set(&handle->rb->poll, POLL_IN); |
| @@ -87,15 +57,36 @@ again: | |||
| 87 | goto out; | 57 | goto out; |
| 88 | 58 | ||
| 89 | /* | 59 | /* |
| 90 | * Publish the known good head. Rely on the full barrier implied | 60 | * Since the mmap() consumer (userspace) can run on a different CPU: |
| 91 | * by atomic_dec_and_test() order the rb->head read and this | 61 | * |
| 92 | * write. | 62 | * kernel user |
| 63 | * | ||
| 64 | * READ ->data_tail READ ->data_head | ||
| 65 | * smp_mb() (A) smp_rmb() (C) | ||
| 66 | * WRITE $data READ $data | ||
| 67 | * smp_wmb() (B) smp_mb() (D) | ||
| 68 | * STORE ->data_head WRITE ->data_tail | ||
| 69 | * | ||
| 70 | * Where A pairs with D, and B pairs with C. | ||
| 71 | * | ||
| 72 | * I don't think A needs to be a full barrier because we won't in fact | ||
| 73 | * write data until we see the store from userspace. So we simply don't | ||
| 74 | * issue the data WRITE until we observe it. Be conservative for now. | ||
| 75 | * | ||
| 76 | * OTOH, D needs to be a full barrier since it separates the data READ | ||
| 77 | * from the tail WRITE. | ||
| 78 | * | ||
| 79 | * For B a WMB is sufficient since it separates two WRITEs, and for C | ||
| 80 | * an RMB is sufficient since it separates two READs. | ||
| 81 | * | ||
| 82 | * See perf_output_begin(). | ||
| 93 | */ | 83 | */ |
| 84 | smp_wmb(); | ||
| 94 | rb->user_page->data_head = head; | 85 | rb->user_page->data_head = head; |
| 95 | 86 | ||
| 96 | /* | 87 | /* |
| 97 | * Now check if we missed an update, rely on the (compiler) | 88 | * Now check if we missed an update -- rely on previous implied |
| 98 | * barrier in atomic_dec_and_test() to re-read rb->head. | 89 | * compiler barriers to force a re-read. |
| 99 | */ | 90 | */ |
| 100 | if (unlikely(head != local_read(&rb->head))) { | 91 | if (unlikely(head != local_read(&rb->head))) { |
| 101 | local_inc(&rb->nest); | 92 | local_inc(&rb->nest); |
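The comment spells out what the mmap'ed consumer has to do on its side of the protocol: an rmb-style barrier between loading data_head and reading the records (C), and a full barrier before publishing the new data_tail (D). A minimal userspace reader loop under those assumptions (mapped_page, the data area and record parsing are placeholders):

    /* Sketch of the user half (C and D) of the pairing described above. */
    struct perf_event_mmap_page *pc = mapped_page; /* assumed mmap()ed control page */
    uint64_t tail = pc->data_tail;
    uint64_t head = pc->data_head;

    __sync_synchronize();          /* (C): don't read records before data_head */

    while (tail != head) {
            /* parse one record at data + (tail & (buffer_size - 1)) ... */
            tail += record_size;   /* size taken from the record header */
    }

    __sync_synchronize();          /* (D): finish all data reads before moving the tail */
    pc->data_tail = tail;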
| @@ -114,8 +105,7 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
| 114 | { | 105 | { |
| 115 | struct ring_buffer *rb; | 106 | struct ring_buffer *rb; |
| 116 | unsigned long tail, offset, head; | 107 | unsigned long tail, offset, head; |
| 117 | int have_lost; | 108 | int have_lost, page_shift; |
| 118 | struct perf_sample_data sample_data; | ||
| 119 | struct { | 109 | struct { |
| 120 | struct perf_event_header header; | 110 | struct perf_event_header header; |
| 121 | u64 id; | 111 | u64 id; |
| @@ -130,55 +120,63 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
| 130 | event = event->parent; | 120 | event = event->parent; |
| 131 | 121 | ||
| 132 | rb = rcu_dereference(event->rb); | 122 | rb = rcu_dereference(event->rb); |
| 133 | if (!rb) | 123 | if (unlikely(!rb)) |
| 134 | goto out; | 124 | goto out; |
| 135 | 125 | ||
| 136 | handle->rb = rb; | 126 | if (unlikely(!rb->nr_pages)) |
| 137 | handle->event = event; | ||
| 138 | |||
| 139 | if (!rb->nr_pages) | ||
| 140 | goto out; | 127 | goto out; |
| 141 | 128 | ||
| 129 | handle->rb = rb; | ||
| 130 | handle->event = event; | ||
| 131 | |||
| 142 | have_lost = local_read(&rb->lost); | 132 | have_lost = local_read(&rb->lost); |
| 143 | if (have_lost) { | 133 | if (unlikely(have_lost)) { |
| 144 | lost_event.header.size = sizeof(lost_event); | 134 | size += sizeof(lost_event); |
| 145 | perf_event_header__init_id(&lost_event.header, &sample_data, | 135 | if (event->attr.sample_id_all) |
| 146 | event); | 136 | size += event->id_header_size; |
| 147 | size += lost_event.header.size; | ||
| 148 | } | 137 | } |
| 149 | 138 | ||
| 150 | perf_output_get_handle(handle); | 139 | perf_output_get_handle(handle); |
| 151 | 140 | ||
| 152 | do { | 141 | do { |
| 153 | /* | ||
| 154 | * Userspace could choose to issue a mb() before updating the | ||
| 155 | * tail pointer. So that all reads will be completed before the | ||
| 156 | * write is issued. | ||
| 157 | */ | ||
| 158 | tail = ACCESS_ONCE(rb->user_page->data_tail); | 142 | tail = ACCESS_ONCE(rb->user_page->data_tail); |
| 159 | smp_rmb(); | ||
| 160 | offset = head = local_read(&rb->head); | 143 | offset = head = local_read(&rb->head); |
| 161 | head += size; | 144 | if (!rb->overwrite && |
| 162 | if (unlikely(!perf_output_space(rb, tail, offset, head))) | 145 | unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) |
| 163 | goto fail; | 146 | goto fail; |
| 147 | head += size; | ||
| 164 | } while (local_cmpxchg(&rb->head, offset, head) != offset); | 148 | } while (local_cmpxchg(&rb->head, offset, head) != offset); |
| 165 | 149 | ||
| 166 | if (head - local_read(&rb->wakeup) > rb->watermark) | 150 | /* |
| 151 | * Separate the userpage->tail read from the data stores below. | ||
| 152 | * Matches the MB userspace SHOULD issue after reading the data | ||
| 153 | * and before storing the new tail position. | ||
| 154 | * | ||
| 155 | * See perf_output_put_handle(). | ||
| 156 | */ | ||
| 157 | smp_mb(); | ||
| 158 | |||
| 159 | if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) | ||
| 167 | local_add(rb->watermark, &rb->wakeup); | 160 | local_add(rb->watermark, &rb->wakeup); |
| 168 | 161 | ||
| 169 | handle->page = offset >> (PAGE_SHIFT + page_order(rb)); | 162 | page_shift = PAGE_SHIFT + page_order(rb); |
| 170 | handle->page &= rb->nr_pages - 1; | ||
| 171 | handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1); | ||
| 172 | handle->addr = rb->data_pages[handle->page]; | ||
| 173 | handle->addr += handle->size; | ||
| 174 | handle->size = (PAGE_SIZE << page_order(rb)) - handle->size; | ||
| 175 | 163 | ||
| 176 | if (have_lost) { | 164 | handle->page = (offset >> page_shift) & (rb->nr_pages - 1); |
| 165 | offset &= (1UL << page_shift) - 1; | ||
| 166 | handle->addr = rb->data_pages[handle->page] + offset; | ||
| 167 | handle->size = (1UL << page_shift) - offset; | ||
| 168 | |||
| 169 | if (unlikely(have_lost)) { | ||
| 170 | struct perf_sample_data sample_data; | ||
| 171 | |||
| 172 | lost_event.header.size = sizeof(lost_event); | ||
| 177 | lost_event.header.type = PERF_RECORD_LOST; | 173 | lost_event.header.type = PERF_RECORD_LOST; |
| 178 | lost_event.header.misc = 0; | 174 | lost_event.header.misc = 0; |
| 179 | lost_event.id = event->id; | 175 | lost_event.id = event->id; |
| 180 | lost_event.lost = local_xchg(&rb->lost, 0); | 176 | lost_event.lost = local_xchg(&rb->lost, 0); |
| 181 | 177 | ||
| 178 | perf_event_header__init_id(&lost_event.header, | ||
| 179 | &sample_data, event); | ||
| 182 | perf_output_put(handle, lost_event); | 180 | perf_output_put(handle, lost_event); |
| 183 | perf_event__output_id_sample(event, handle, &sample_data); | 181 | perf_event__output_id_sample(event, handle, &sample_data); |
| 184 | } | 182 | } |
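
Editor's note: the perf_output_begin() hunk above drops the old perf_output_space() helper in favour of the generic CIRC_SPACE() macro and moves the barrier so the data_tail read is ordered before the data stores that follow. Below is a minimal sketch of the same reservation idea; the demo_* names are invented, and the real code keeps head in a local_t and publishes it with local_cmpxchg().

#include <linux/circ_buf.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/types.h>

/* Hypothetical mini ring buffer: a power-of-two data area with a
 * producer-owned head and a consumer-owned tail, both free-running
 * byte counters, as in the perf ring buffer. */
struct demo_rb {
        unsigned long head;       /* advanced by the producer */
        unsigned long tail;       /* advanced by the consumer */
        unsigned long size;       /* power of two, in bytes   */
        bool overwrite;           /* overwrite mode skips the space check */
};

/*
 * Reserve @len bytes, mirroring the check added above: in non-overwrite
 * mode the reservation fails unless CIRC_SPACE() says the consumer has
 * freed enough room.  Returns the start offset, or -ENOSPC.
 */
static long demo_reserve(struct demo_rb *rb, unsigned long len)
{
        unsigned long head = rb->head;
        unsigned long tail = ACCESS_ONCE(rb->tail);

        if (!rb->overwrite && CIRC_SPACE(head, tail, rb->size) < len)
                return -ENOSPC;

        rb->head = head + len;          /* the real code uses local_cmpxchg() */
        return head & (rb->size - 1);   /* offset into the data area */
}

The real function additionally issues smp_mb() after reading data_tail, which is the barrier comment introduced in this hunk: the reservation must not be reordered before the consumer's reads of the previous records.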
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f3569747d629..24b7d6ca871b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
| @@ -35,6 +35,7 @@ | |||
| 35 | #include <linux/kdebug.h> /* notifier mechanism */ | 35 | #include <linux/kdebug.h> /* notifier mechanism */ |
| 36 | #include "../../mm/internal.h" /* munlock_vma_page */ | 36 | #include "../../mm/internal.h" /* munlock_vma_page */ |
| 37 | #include <linux/percpu-rwsem.h> | 37 | #include <linux/percpu-rwsem.h> |
| 38 | #include <linux/task_work.h> | ||
| 38 | 39 | ||
| 39 | #include <linux/uprobes.h> | 40 | #include <linux/uprobes.h> |
| 40 | 41 | ||
| @@ -244,12 +245,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t | |||
| 244 | * the architecture. If an arch has variable length instruction and the | 245 | * the architecture. If an arch has variable length instruction and the |
| 245 | * breakpoint instruction is not of the smallest length instruction | 246 | * breakpoint instruction is not of the smallest length instruction |
| 246 | * supported by that architecture then we need to modify is_trap_at_addr and | 247 | * supported by that architecture then we need to modify is_trap_at_addr and |
| 247 | * write_opcode accordingly. This would never be a problem for archs that | 248 | * uprobe_write_opcode accordingly. This would never be a problem for archs |
| 248 | * have fixed length instructions. | 249 | * that have fixed length instructions. |
| 249 | */ | 250 | */ |
| 250 | 251 | ||
| 251 | /* | 252 | /* |
| 252 | * write_opcode - write the opcode at a given virtual address. | 253 | * uprobe_write_opcode - write the opcode at a given virtual address. |
| 253 | * @mm: the probed process address space. | 254 | * @mm: the probed process address space. |
| 254 | * @vaddr: the virtual address to store the opcode. | 255 | * @vaddr: the virtual address to store the opcode. |
| 255 | * @opcode: opcode to be written at @vaddr. | 256 | * @opcode: opcode to be written at @vaddr. |
| @@ -260,7 +261,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t | |||
| 260 | * For mm @mm, write the opcode at @vaddr. | 261 | * For mm @mm, write the opcode at @vaddr. |
| 261 | * Return 0 (success) or a negative errno. | 262 | * Return 0 (success) or a negative errno. |
| 262 | */ | 263 | */ |
| 263 | static int write_opcode(struct mm_struct *mm, unsigned long vaddr, | 264 | int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, |
| 264 | uprobe_opcode_t opcode) | 265 | uprobe_opcode_t opcode) |
| 265 | { | 266 | { |
| 266 | struct page *old_page, *new_page; | 267 | struct page *old_page, *new_page; |
| @@ -314,7 +315,7 @@ put_old: | |||
| 314 | */ | 315 | */ |
| 315 | int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) | 316 | int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) |
| 316 | { | 317 | { |
| 317 | return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); | 318 | return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN); |
| 318 | } | 319 | } |
| 319 | 320 | ||
| 320 | /** | 321 | /** |
| @@ -329,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned | |||
| 329 | int __weak | 330 | int __weak |
| 330 | set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) | 331 | set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) |
| 331 | { | 332 | { |
| 332 | return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); | 333 | return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); |
| 333 | } | 334 | } |
| 334 | 335 | ||
| 335 | static int match_uprobe(struct uprobe *l, struct uprobe *r) | 336 | static int match_uprobe(struct uprobe *l, struct uprobe *r) |
| @@ -503,9 +504,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) | |||
| 503 | return ret; | 504 | return ret; |
| 504 | } | 505 | } |
| 505 | 506 | ||
| 506 | static int | 507 | static int __copy_insn(struct address_space *mapping, struct file *filp, |
| 507 | __copy_insn(struct address_space *mapping, struct file *filp, char *insn, | 508 | void *insn, int nbytes, loff_t offset) |
| 508 | unsigned long nbytes, loff_t offset) | ||
| 509 | { | 509 | { |
| 510 | struct page *page; | 510 | struct page *page; |
| 511 | 511 | ||
| @@ -527,28 +527,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, | |||
| 527 | 527 | ||
| 528 | static int copy_insn(struct uprobe *uprobe, struct file *filp) | 528 | static int copy_insn(struct uprobe *uprobe, struct file *filp) |
| 529 | { | 529 | { |
| 530 | struct address_space *mapping; | 530 | struct address_space *mapping = uprobe->inode->i_mapping; |
| 531 | unsigned long nbytes; | 531 | loff_t offs = uprobe->offset; |
| 532 | int bytes; | 532 | void *insn = uprobe->arch.insn; |
| 533 | 533 | int size = MAX_UINSN_BYTES; | |
| 534 | nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); | 534 | int len, err = -EIO; |
| 535 | mapping = uprobe->inode->i_mapping; | ||
| 536 | 535 | ||
| 537 | /* Instruction at end of binary; copy only available bytes */ | 536 | /* Copy only available bytes, -EIO if nothing was read */ |
| 538 | if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) | 537 | do { |
| 539 | bytes = uprobe->inode->i_size - uprobe->offset; | 538 | if (offs >= i_size_read(uprobe->inode)) |
| 540 | else | 539 | break; |
| 541 | bytes = MAX_UINSN_BYTES; | ||
| 542 | 540 | ||
| 543 | /* Instruction at the page-boundary; copy bytes in second page */ | 541 | len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK)); |
| 544 | if (nbytes < bytes) { | 542 | err = __copy_insn(mapping, filp, insn, len, offs); |
| 545 | int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes, | ||
| 546 | bytes - nbytes, uprobe->offset + nbytes); | ||
| 547 | if (err) | 543 | if (err) |
| 548 | return err; | 544 | break; |
| 549 | bytes = nbytes; | 545 | |
| 550 | } | 546 | insn += len; |
| 551 | return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); | 547 | offs += len; |
| 548 | size -= len; | ||
| 549 | } while (size); | ||
| 550 | |||
| 551 | return err; | ||
| 552 | } | 552 | } |
| 553 | 553 | ||
| 554 | static int prepare_uprobe(struct uprobe *uprobe, struct file *file, | 554 | static int prepare_uprobe(struct uprobe *uprobe, struct file *file, |
| @@ -576,7 +576,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, | |||
| 576 | if (ret) | 576 | if (ret) |
| 577 | goto out; | 577 | goto out; |
| 578 | 578 | ||
| 579 | /* write_opcode() assumes we don't cross page boundary */ | 579 | /* uprobe_write_opcode() assumes we don't cross page boundary */ |
| 580 | BUG_ON((uprobe->offset & ~PAGE_MASK) + | 580 | BUG_ON((uprobe->offset & ~PAGE_MASK) + |
| 581 | UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); | 581 | UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); |
| 582 | 582 | ||
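
Editor's note: the rewritten copy_insn() above walks the probed instruction range one page-sized chunk at a time instead of special-casing the page boundary. The sketch below reproduces that loop shape over an arbitrary reader; demo_copy() and read_chunk() are hypothetical stand-ins for copy_insn() and __copy_insn().

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>

/* Hypothetical chunk reader standing in for __copy_insn(). */
static int read_chunk(void *dst, int len, loff_t off)
{
        memset(dst, 0, len);    /* pretend we read @len bytes at @off */
        return 0;
}

/*
 * Copy up to @size bytes starting at file offset @offs, never letting a
 * single chunk cross a page boundary.  Returns 0 on success and -EIO if
 * nothing at all could be read, matching the new copy_insn().
 */
static int demo_copy(void *insn, int size, loff_t offs, loff_t i_size)
{
        int len, err = -EIO;

        do {
                if (offs >= i_size)
                        break;

                len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
                err = read_chunk(insn, len, offs);
                if (err)
                        break;

                insn += len;
                offs += len;
                size -= len;
        } while (size);

        return err;
}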
| @@ -1096,21 +1096,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon | |||
| 1096 | } | 1096 | } |
| 1097 | 1097 | ||
| 1098 | /* Slot allocation for XOL */ | 1098 | /* Slot allocation for XOL */ |
| 1099 | static int xol_add_vma(struct xol_area *area) | 1099 | static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) |
| 1100 | { | 1100 | { |
| 1101 | struct mm_struct *mm = current->mm; | ||
| 1102 | int ret = -EALREADY; | 1101 | int ret = -EALREADY; |
| 1103 | 1102 | ||
| 1104 | down_write(&mm->mmap_sem); | 1103 | down_write(&mm->mmap_sem); |
| 1105 | if (mm->uprobes_state.xol_area) | 1104 | if (mm->uprobes_state.xol_area) |
| 1106 | goto fail; | 1105 | goto fail; |
| 1107 | 1106 | ||
| 1108 | ret = -ENOMEM; | 1107 | if (!area->vaddr) { |
| 1109 | /* Try to map as high as possible, this is only a hint. */ | 1108 | /* Try to map as high as possible, this is only a hint. */ |
| 1110 | area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); | 1109 | area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, |
| 1111 | if (area->vaddr & ~PAGE_MASK) { | 1110 | PAGE_SIZE, 0, 0); |
| 1112 | ret = area->vaddr; | 1111 | if (area->vaddr & ~PAGE_MASK) { |
| 1113 | goto fail; | 1112 | ret = area->vaddr; |
| 1113 | goto fail; | ||
| 1114 | } | ||
| 1114 | } | 1115 | } |
| 1115 | 1116 | ||
| 1116 | ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, | 1117 | ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, |
| @@ -1120,30 +1121,19 @@ static int xol_add_vma(struct xol_area *area) | |||
| 1120 | 1121 | ||
| 1121 | smp_wmb(); /* pairs with get_xol_area() */ | 1122 | smp_wmb(); /* pairs with get_xol_area() */ |
| 1122 | mm->uprobes_state.xol_area = area; | 1123 | mm->uprobes_state.xol_area = area; |
| 1123 | ret = 0; | ||
| 1124 | fail: | 1124 | fail: |
| 1125 | up_write(&mm->mmap_sem); | 1125 | up_write(&mm->mmap_sem); |
| 1126 | 1126 | ||
| 1127 | return ret; | 1127 | return ret; |
| 1128 | } | 1128 | } |
| 1129 | 1129 | ||
| 1130 | /* | 1130 | static struct xol_area *__create_xol_area(unsigned long vaddr) |
| 1131 | * get_xol_area - Allocate process's xol_area if necessary. | ||
| 1132 | * This area will be used for storing instructions for execution out of line. | ||
| 1133 | * | ||
| 1134 | * Returns the allocated area or NULL. | ||
| 1135 | */ | ||
| 1136 | static struct xol_area *get_xol_area(void) | ||
| 1137 | { | 1131 | { |
| 1138 | struct mm_struct *mm = current->mm; | 1132 | struct mm_struct *mm = current->mm; |
| 1139 | struct xol_area *area; | ||
| 1140 | uprobe_opcode_t insn = UPROBE_SWBP_INSN; | 1133 | uprobe_opcode_t insn = UPROBE_SWBP_INSN; |
| 1134 | struct xol_area *area; | ||
| 1141 | 1135 | ||
| 1142 | area = mm->uprobes_state.xol_area; | 1136 | area = kmalloc(sizeof(*area), GFP_KERNEL); |
| 1143 | if (area) | ||
| 1144 | goto ret; | ||
| 1145 | |||
| 1146 | area = kzalloc(sizeof(*area), GFP_KERNEL); | ||
| 1147 | if (unlikely(!area)) | 1137 | if (unlikely(!area)) |
| 1148 | goto out; | 1138 | goto out; |
| 1149 | 1139 | ||
| @@ -1155,13 +1145,14 @@ static struct xol_area *get_xol_area(void) | |||
| 1155 | if (!area->page) | 1145 | if (!area->page) |
| 1156 | goto free_bitmap; | 1146 | goto free_bitmap; |
| 1157 | 1147 | ||
| 1158 | /* allocate first slot of task's xol_area for the return probes */ | 1148 | area->vaddr = vaddr; |
| 1149 | init_waitqueue_head(&area->wq); | ||
| 1150 | /* Reserve the 1st slot for get_trampoline_vaddr() */ | ||
| 1159 | set_bit(0, area->bitmap); | 1151 | set_bit(0, area->bitmap); |
| 1160 | copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); | ||
| 1161 | atomic_set(&area->slot_count, 1); | 1152 | atomic_set(&area->slot_count, 1); |
| 1162 | init_waitqueue_head(&area->wq); | 1153 | copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); |
| 1163 | 1154 | ||
| 1164 | if (!xol_add_vma(area)) | 1155 | if (!xol_add_vma(mm, area)) |
| 1165 | return area; | 1156 | return area; |
| 1166 | 1157 | ||
| 1167 | __free_page(area->page); | 1158 | __free_page(area->page); |
| @@ -1170,9 +1161,25 @@ static struct xol_area *get_xol_area(void) | |||
| 1170 | free_area: | 1161 | free_area: |
| 1171 | kfree(area); | 1162 | kfree(area); |
| 1172 | out: | 1163 | out: |
| 1164 | return NULL; | ||
| 1165 | } | ||
| 1166 | |||
| 1167 | /* | ||
| 1168 | * get_xol_area - Allocate process's xol_area if necessary. | ||
| 1169 | * This area will be used for storing instructions for execution out of line. | ||
| 1170 | * | ||
| 1171 | * Returns the allocated area or NULL. | ||
| 1172 | */ | ||
| 1173 | static struct xol_area *get_xol_area(void) | ||
| 1174 | { | ||
| 1175 | struct mm_struct *mm = current->mm; | ||
| 1176 | struct xol_area *area; | ||
| 1177 | |||
| 1178 | if (!mm->uprobes_state.xol_area) | ||
| 1179 | __create_xol_area(0); | ||
| 1180 | |||
| 1173 | area = mm->uprobes_state.xol_area; | 1181 | area = mm->uprobes_state.xol_area; |
| 1174 | ret: | 1182 | smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ |
| 1175 | smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ | ||
| 1176 | return area; | 1183 | return area; |
| 1177 | } | 1184 | } |
| 1178 | 1185 | ||
| @@ -1256,7 +1263,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) | |||
| 1256 | return 0; | 1263 | return 0; |
| 1257 | 1264 | ||
| 1258 | /* Initialize the slot */ | 1265 | /* Initialize the slot */ |
| 1259 | copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); | 1266 | copy_to_page(area->page, xol_vaddr, |
| 1267 | uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); | ||
| 1260 | /* | 1268 | /* |
| 1261 | * We probably need flush_icache_user_range() but it needs vma. | 1269 | * We probably need flush_icache_user_range() but it needs vma. |
| 1262 | * This should work on supported architectures too. | 1270 | * This should work on supported architectures too. |
| @@ -1345,14 +1353,6 @@ void uprobe_free_utask(struct task_struct *t) | |||
| 1345 | } | 1353 | } |
| 1346 | 1354 | ||
| 1347 | /* | 1355 | /* |
| 1348 | * Called in context of a new clone/fork from copy_process. | ||
| 1349 | */ | ||
| 1350 | void uprobe_copy_process(struct task_struct *t) | ||
| 1351 | { | ||
| 1352 | t->utask = NULL; | ||
| 1353 | } | ||
| 1354 | |||
| 1355 | /* | ||
| 1356 | * Allocate a uprobe_task object for the task if if necessary. | 1356 | * Allocate a uprobe_task object for the task if if necessary. |
| 1357 | * Called when the thread hits a breakpoint. | 1357 | * Called when the thread hits a breakpoint. |
| 1358 | * | 1358 | * |
| @@ -1367,6 +1367,90 @@ static struct uprobe_task *get_utask(void) | |||
| 1367 | return current->utask; | 1367 | return current->utask; |
| 1368 | } | 1368 | } |
| 1369 | 1369 | ||
| 1370 | static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) | ||
| 1371 | { | ||
| 1372 | struct uprobe_task *n_utask; | ||
| 1373 | struct return_instance **p, *o, *n; | ||
| 1374 | |||
| 1375 | n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); | ||
| 1376 | if (!n_utask) | ||
| 1377 | return -ENOMEM; | ||
| 1378 | t->utask = n_utask; | ||
| 1379 | |||
| 1380 | p = &n_utask->return_instances; | ||
| 1381 | for (o = o_utask->return_instances; o; o = o->next) { | ||
| 1382 | n = kmalloc(sizeof(struct return_instance), GFP_KERNEL); | ||
| 1383 | if (!n) | ||
| 1384 | return -ENOMEM; | ||
| 1385 | |||
| 1386 | *n = *o; | ||
| 1387 | atomic_inc(&n->uprobe->ref); | ||
| 1388 | n->next = NULL; | ||
| 1389 | |||
| 1390 | *p = n; | ||
| 1391 | p = &n->next; | ||
| 1392 | n_utask->depth++; | ||
| 1393 | } | ||
| 1394 | |||
| 1395 | return 0; | ||
| 1396 | } | ||
| 1397 | |||
| 1398 | static void uprobe_warn(struct task_struct *t, const char *msg) | ||
| 1399 | { | ||
| 1400 | pr_warn("uprobe: %s:%d failed to %s\n", | ||
| 1401 | current->comm, current->pid, msg); | ||
| 1402 | } | ||
| 1403 | |||
| 1404 | static void dup_xol_work(struct callback_head *work) | ||
| 1405 | { | ||
| 1406 | kfree(work); | ||
| 1407 | |||
| 1408 | if (current->flags & PF_EXITING) | ||
| 1409 | return; | ||
| 1410 | |||
| 1411 | if (!__create_xol_area(current->utask->vaddr)) | ||
| 1412 | uprobe_warn(current, "dup xol area"); | ||
| 1413 | } | ||
| 1414 | |||
| 1415 | /* | ||
| 1416 | * Called in context of a new clone/fork from copy_process. | ||
| 1417 | */ | ||
| 1418 | void uprobe_copy_process(struct task_struct *t, unsigned long flags) | ||
| 1419 | { | ||
| 1420 | struct uprobe_task *utask = current->utask; | ||
| 1421 | struct mm_struct *mm = current->mm; | ||
| 1422 | struct callback_head *work; | ||
| 1423 | struct xol_area *area; | ||
| 1424 | |||
| 1425 | t->utask = NULL; | ||
| 1426 | |||
| 1427 | if (!utask || !utask->return_instances) | ||
| 1428 | return; | ||
| 1429 | |||
| 1430 | if (mm == t->mm && !(flags & CLONE_VFORK)) | ||
| 1431 | return; | ||
| 1432 | |||
| 1433 | if (dup_utask(t, utask)) | ||
| 1434 | return uprobe_warn(t, "dup ret instances"); | ||
| 1435 | |||
| 1436 | /* The task can fork() after dup_xol_work() fails */ | ||
| 1437 | area = mm->uprobes_state.xol_area; | ||
| 1438 | if (!area) | ||
| 1439 | return uprobe_warn(t, "dup xol area"); | ||
| 1440 | |||
| 1441 | if (mm == t->mm) | ||
| 1442 | return; | ||
| 1443 | |||
| 1444 | /* TODO: move it into the union in uprobe_task */ | ||
| 1445 | work = kmalloc(sizeof(*work), GFP_KERNEL); | ||
| 1446 | if (!work) | ||
| 1447 | return uprobe_warn(t, "dup xol area"); | ||
| 1448 | |||
| 1449 | t->utask->vaddr = area->vaddr; | ||
| 1450 | init_task_work(work, dup_xol_work); | ||
| 1451 | task_work_add(t, work, true); | ||
| 1452 | } | ||
| 1453 | |||
| 1370 | /* | 1454 | /* |
| 1371 | * Current area->vaddr notion assume the trampoline address is always | 1455 | * Current area->vaddr notion assume the trampoline address is always |
| 1372 | * equal area->vaddr. | 1456 | * equal area->vaddr. |
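
Editor's note: uprobe_copy_process(), added above, does not map the child's XOL area itself; the area is easiest to set up from the child's own context, so the parent queues a callback with task_work_add() and the child runs it on its first return to user space. A small sketch of that pattern follows, with hypothetical demo_* names; the real callback is dup_xol_work(), and the three-argument task_work_add() matches the call in the hunk above.

#include <linux/task_work.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/errno.h>

/* Runs in the target task's own context, on its way back to user mode. */
static void demo_cb(struct callback_head *work)
{
        kfree(work);

        if (current->flags & PF_EXITING)
                return;

        /* ... per-task setup goes here, e.g. __create_xol_area() ... */
}

/* Queue demo_cb() to run in @task's context, as uprobe_copy_process() does. */
static int demo_queue(struct task_struct *task)
{
        struct callback_head *work;

        work = kmalloc(sizeof(*work), GFP_KERNEL);
        if (!work)
                return -ENOMEM;

        init_task_work(work, demo_cb);
        return task_work_add(task, work, true);   /* true: notify the task */
}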
| @@ -1682,12 +1766,10 @@ static bool handle_trampoline(struct pt_regs *regs) | |||
| 1682 | tmp = ri; | 1766 | tmp = ri; |
| 1683 | ri = ri->next; | 1767 | ri = ri->next; |
| 1684 | kfree(tmp); | 1768 | kfree(tmp); |
| 1769 | utask->depth--; | ||
| 1685 | 1770 | ||
| 1686 | if (!chained) | 1771 | if (!chained) |
| 1687 | break; | 1772 | break; |
| 1688 | |||
| 1689 | utask->depth--; | ||
| 1690 | |||
| 1691 | BUG_ON(!ri); | 1773 | BUG_ON(!ri); |
| 1692 | } | 1774 | } |
| 1693 | 1775 | ||
| @@ -1859,9 +1941,4 @@ static int __init init_uprobes(void) | |||
| 1859 | 1941 | ||
| 1860 | return register_die_notifier(&uprobe_exception_nb); | 1942 | return register_die_notifier(&uprobe_exception_nb); |
| 1861 | } | 1943 | } |
| 1862 | module_init(init_uprobes); | 1944 | __initcall(init_uprobes); |
| 1863 | |||
| 1864 | static void __exit exit_uprobes(void) | ||
| 1865 | { | ||
| 1866 | } | ||
| 1867 | module_exit(exit_uprobes); | ||
diff --git a/kernel/extable.c b/kernel/extable.c index 67460b93b1a1..832cb28105bb 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
| @@ -41,7 +41,7 @@ u32 __initdata main_extable_sort_needed = 1; | |||
| 41 | /* Sort the kernel's built-in exception table */ | 41 | /* Sort the kernel's built-in exception table */ |
| 42 | void __init sort_main_extable(void) | 42 | void __init sort_main_extable(void) |
| 43 | { | 43 | { |
| 44 | if (main_extable_sort_needed) { | 44 | if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) { |
| 45 | pr_notice("Sorting __ex_table...\n"); | 45 | pr_notice("Sorting __ex_table...\n"); |
| 46 | sort_extable(__start___ex_table, __stop___ex_table); | 46 | sort_extable(__start___ex_table, __stop___ex_table); |
| 47 | } | 47 | } |
diff --git a/kernel/fork.c b/kernel/fork.c index 403d2bb8a968..f6d11fc67f72 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -351,7 +351,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 351 | struct rb_node **rb_link, *rb_parent; | 351 | struct rb_node **rb_link, *rb_parent; |
| 352 | int retval; | 352 | int retval; |
| 353 | unsigned long charge; | 353 | unsigned long charge; |
| 354 | struct mempolicy *pol; | ||
| 355 | 354 | ||
| 356 | uprobe_start_dup_mmap(); | 355 | uprobe_start_dup_mmap(); |
| 357 | down_write(&oldmm->mmap_sem); | 356 | down_write(&oldmm->mmap_sem); |
| @@ -400,11 +399,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 400 | goto fail_nomem; | 399 | goto fail_nomem; |
| 401 | *tmp = *mpnt; | 400 | *tmp = *mpnt; |
| 402 | INIT_LIST_HEAD(&tmp->anon_vma_chain); | 401 | INIT_LIST_HEAD(&tmp->anon_vma_chain); |
| 403 | pol = mpol_dup(vma_policy(mpnt)); | 402 | retval = vma_dup_policy(mpnt, tmp); |
| 404 | retval = PTR_ERR(pol); | 403 | if (retval) |
| 405 | if (IS_ERR(pol)) | ||
| 406 | goto fail_nomem_policy; | 404 | goto fail_nomem_policy; |
| 407 | vma_set_policy(tmp, pol); | ||
| 408 | tmp->vm_mm = mm; | 405 | tmp->vm_mm = mm; |
| 409 | if (anon_vma_fork(tmp, mpnt)) | 406 | if (anon_vma_fork(tmp, mpnt)) |
| 410 | goto fail_nomem_anon_vma_fork; | 407 | goto fail_nomem_anon_vma_fork; |
| @@ -472,7 +469,7 @@ out: | |||
| 472 | uprobe_end_dup_mmap(); | 469 | uprobe_end_dup_mmap(); |
| 473 | return retval; | 470 | return retval; |
| 474 | fail_nomem_anon_vma_fork: | 471 | fail_nomem_anon_vma_fork: |
| 475 | mpol_put(pol); | 472 | mpol_put(vma_policy(tmp)); |
| 476 | fail_nomem_policy: | 473 | fail_nomem_policy: |
| 477 | kmem_cache_free(vm_area_cachep, tmp); | 474 | kmem_cache_free(vm_area_cachep, tmp); |
| 478 | fail_nomem: | 475 | fail_nomem: |
| @@ -522,7 +519,7 @@ static void mm_init_aio(struct mm_struct *mm) | |||
| 522 | { | 519 | { |
| 523 | #ifdef CONFIG_AIO | 520 | #ifdef CONFIG_AIO |
| 524 | spin_lock_init(&mm->ioctx_lock); | 521 | spin_lock_init(&mm->ioctx_lock); |
| 525 | INIT_HLIST_HEAD(&mm->ioctx_list); | 522 | mm->ioctx_table = NULL; |
| 526 | #endif | 523 | #endif |
| 527 | } | 524 | } |
| 528 | 525 | ||
| @@ -820,9 +817,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk) | |||
| 820 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 817 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| 821 | mm->pmd_huge_pte = NULL; | 818 | mm->pmd_huge_pte = NULL; |
| 822 | #endif | 819 | #endif |
| 823 | #ifdef CONFIG_NUMA_BALANCING | ||
| 824 | mm->first_nid = NUMA_PTE_SCAN_INIT; | ||
| 825 | #endif | ||
| 826 | if (!mm_init(mm, tsk)) | 820 | if (!mm_init(mm, tsk)) |
| 827 | goto fail_nomem; | 821 | goto fail_nomem; |
| 828 | 822 | ||
| @@ -1173,12 +1167,16 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1173 | return ERR_PTR(-EINVAL); | 1167 | return ERR_PTR(-EINVAL); |
| 1174 | 1168 | ||
| 1175 | /* | 1169 | /* |
| 1176 | * If the new process will be in a different pid namespace | 1170 | * If the new process will be in a different pid or user namespace |
| 1177 | * don't allow the creation of threads. | 1171 | * do not allow it to share a thread group or signal handlers or |
| 1172 | * parent with the forking task. | ||
| 1178 | */ | 1173 | */ |
| 1179 | if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && | 1174 | if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) { |
| 1180 | (task_active_pid_ns(current) != current->nsproxy->pid_ns)) | 1175 | if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || |
| 1181 | return ERR_PTR(-EINVAL); | 1176 | (task_active_pid_ns(current) != |
| 1177 | current->nsproxy->pid_ns_for_children)) | ||
| 1178 | return ERR_PTR(-EINVAL); | ||
| 1179 | } | ||
| 1182 | 1180 | ||
| 1183 | retval = security_task_create(clone_flags); | 1181 | retval = security_task_create(clone_flags); |
| 1184 | if (retval) | 1182 | if (retval) |
| @@ -1312,7 +1310,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1312 | #endif | 1310 | #endif |
| 1313 | 1311 | ||
| 1314 | /* Perform scheduler related setup. Assign this task to a CPU. */ | 1312 | /* Perform scheduler related setup. Assign this task to a CPU. */ |
| 1315 | sched_fork(p); | 1313 | sched_fork(clone_flags, p); |
| 1316 | 1314 | ||
| 1317 | retval = perf_event_init_task(p); | 1315 | retval = perf_event_init_task(p); |
| 1318 | if (retval) | 1316 | if (retval) |
| @@ -1351,7 +1349,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1351 | 1349 | ||
| 1352 | if (pid != &init_struct_pid) { | 1350 | if (pid != &init_struct_pid) { |
| 1353 | retval = -ENOMEM; | 1351 | retval = -ENOMEM; |
| 1354 | pid = alloc_pid(p->nsproxy->pid_ns); | 1352 | pid = alloc_pid(p->nsproxy->pid_ns_for_children); |
| 1355 | if (!pid) | 1353 | if (!pid) |
| 1356 | goto bad_fork_cleanup_io; | 1354 | goto bad_fork_cleanup_io; |
| 1357 | } | 1355 | } |
| @@ -1372,7 +1370,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1372 | INIT_LIST_HEAD(&p->pi_state_list); | 1370 | INIT_LIST_HEAD(&p->pi_state_list); |
| 1373 | p->pi_state_cache = NULL; | 1371 | p->pi_state_cache = NULL; |
| 1374 | #endif | 1372 | #endif |
| 1375 | uprobe_copy_process(p); | ||
| 1376 | /* | 1373 | /* |
| 1377 | * sigaltstack should be cleared when sharing the same VM | 1374 | * sigaltstack should be cleared when sharing the same VM |
| 1378 | */ | 1375 | */ |
| @@ -1489,6 +1486,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1489 | perf_event_fork(p); | 1486 | perf_event_fork(p); |
| 1490 | 1487 | ||
| 1491 | trace_task_newtask(p, clone_flags); | 1488 | trace_task_newtask(p, clone_flags); |
| 1489 | uprobe_copy_process(p, clone_flags); | ||
| 1492 | 1490 | ||
| 1493 | return p; | 1491 | return p; |
| 1494 | 1492 | ||
| @@ -1575,15 +1573,6 @@ long do_fork(unsigned long clone_flags, | |||
| 1575 | long nr; | 1573 | long nr; |
| 1576 | 1574 | ||
| 1577 | /* | 1575 | /* |
| 1578 | * Do some preliminary argument and permissions checking before we | ||
| 1579 | * actually start allocating stuff | ||
| 1580 | */ | ||
| 1581 | if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { | ||
| 1582 | if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) | ||
| 1583 | return -EINVAL; | ||
| 1584 | } | ||
| 1585 | |||
| 1586 | /* | ||
| 1587 | * Determine whether and which event to report to ptracer. When | 1576 | * Determine whether and which event to report to ptracer. When |
| 1588 | * called from kernel_thread or CLONE_UNTRACED is explicitly | 1577 | * called from kernel_thread or CLONE_UNTRACED is explicitly |
| 1589 | * requested, no event is reported; otherwise, report if the event | 1578 | * requested, no event is reported; otherwise, report if the event |
| @@ -1679,6 +1668,12 @@ SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, | |||
| 1679 | int __user *, parent_tidptr, | 1668 | int __user *, parent_tidptr, |
| 1680 | int __user *, child_tidptr, | 1669 | int __user *, child_tidptr, |
| 1681 | int, tls_val) | 1670 | int, tls_val) |
| 1671 | #elif defined(CONFIG_CLONE_BACKWARDS3) | ||
| 1672 | SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp, | ||
| 1673 | int, stack_size, | ||
| 1674 | int __user *, parent_tidptr, | ||
| 1675 | int __user *, child_tidptr, | ||
| 1676 | int, tls_val) | ||
| 1682 | #else | 1677 | #else |
| 1683 | SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, | 1678 | SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, |
| 1684 | int __user *, parent_tidptr, | 1679 | int __user *, parent_tidptr, |
| @@ -1818,11 +1813,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
| 1818 | if (unshare_flags & CLONE_NEWUSER) | 1813 | if (unshare_flags & CLONE_NEWUSER) |
| 1819 | unshare_flags |= CLONE_THREAD | CLONE_FS; | 1814 | unshare_flags |= CLONE_THREAD | CLONE_FS; |
| 1820 | /* | 1815 | /* |
| 1821 | * If unsharing a pid namespace must also unshare the thread. | ||
| 1822 | */ | ||
| 1823 | if (unshare_flags & CLONE_NEWPID) | ||
| 1824 | unshare_flags |= CLONE_THREAD; | ||
| 1825 | /* | ||
| 1826 | * If unsharing a thread from a thread group, must also unshare vm. | 1816 | * If unsharing a thread from a thread group, must also unshare vm. |
| 1827 | */ | 1817 | */ |
| 1828 | if (unshare_flags & CLONE_THREAD) | 1818 | if (unshare_flags & CLONE_THREAD) |
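
Editor's note: the copy_process() change above replaces the old CLONE_NEWPID restrictions; creating a new pid or user namespace is now refused only when the child would share signal handlers or a parent with the caller (CLONE_SIGHAND, CLONE_PARENT, and by implication CLONE_THREAD). The following is a hypothetical user-space probe of the new rule, not part of the patch: build with gcc, run as root so CLONE_NEWPID itself is permitted, and note the downward-growing-stack assumption.

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>

static int child_fn(void *arg)
{
        return 0;
}

int main(void)
{
        static char stack[64 * 1024];
        void *sp = stack + sizeof(stack);

        /* Still allowed: an ordinary child in a fresh pid namespace. */
        if (clone(child_fn, sp, CLONE_NEWPID | SIGCHLD, NULL) < 0)
                perror("clone(CLONE_NEWPID)");

        /* Rejected with EINVAL by the check above: shared signal handlers
         * cannot span a new pid namespace.  CLONE_SIGHAND requires
         * CLONE_VM, so both flags are passed together. */
        if (clone(child_fn, sp,
                  CLONE_NEWPID | CLONE_VM | CLONE_SIGHAND | SIGCHLD, NULL) < 0)
                perror("clone(CLONE_NEWPID|CLONE_SIGHAND)");

        return 0;
}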
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 9bd0934f6c33..7a7d2ee96d42 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c | |||
| @@ -74,7 +74,7 @@ static int __init gcov_persist_setup(char *str) | |||
| 74 | { | 74 | { |
| 75 | unsigned long val; | 75 | unsigned long val; |
| 76 | 76 | ||
| 77 | if (strict_strtoul(str, 0, &val)) { | 77 | if (kstrtoul(str, 0, &val)) { |
| 78 | pr_warning("invalid gcov_persist parameter '%s'\n", str); | 78 | pr_warning("invalid gcov_persist parameter '%s'\n", str); |
| 79 | return 0; | 79 | return 0; |
| 80 | } | 80 | } |
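
Editor's note: the gcov change above is part of the tree-wide switch from strict_strtoul() to kstrtoul(); both are strict about trailing garbage, kstrtoul() is simply the surviving name and returns 0 or a negative errno. A hypothetical boot-parameter handler using the same call:

#include <linux/kernel.h>
#include <linux/init.h>

static unsigned long demo_threshold;    /* hypothetical tunable */

static int __init demo_threshold_setup(char *str)
{
        unsigned long val;

        if (kstrtoul(str, 0, &val)) {   /* base 0: accept 0x.., 0.., decimal */
                pr_warn("demo_threshold: invalid value '%s'\n", str);
                return 0;
        }

        demo_threshold = val;
        return 1;
}
__setup("demo_threshold=", demo_threshold_setup);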
diff --git a/kernel/groups.c b/kernel/groups.c index 6b2588dd04ff..90cf1c38c8ea 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
| @@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) | |||
| 233 | struct group_info *group_info; | 233 | struct group_info *group_info; |
| 234 | int retval; | 234 | int retval; |
| 235 | 235 | ||
| 236 | if (!nsown_capable(CAP_SETGID)) | 236 | if (!ns_capable(current_user_ns(), CAP_SETGID)) |
| 237 | return -EPERM; | 237 | return -EPERM; |
| 238 | if ((unsigned)gidsetsize > NGROUPS_MAX) | 238 | if ((unsigned)gidsetsize > NGROUPS_MAX) |
| 239 | return -EINVAL; | 239 | return -EINVAL; |
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 6df614912b9d..3e97fb126e6b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/lockdep.h> | 15 | #include <linux/lockdep.h> |
| 16 | #include <linux/export.h> | 16 | #include <linux/export.h> |
| 17 | #include <linux/sysctl.h> | 17 | #include <linux/sysctl.h> |
| 18 | #include <linux/utsname.h> | ||
| 18 | 19 | ||
| 19 | /* | 20 | /* |
| 20 | * The number of tasks checked: | 21 | * The number of tasks checked: |
| @@ -99,10 +100,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
| 99 | * Ok, the task did not get scheduled for more than 2 minutes, | 100 | * Ok, the task did not get scheduled for more than 2 minutes, |
| 100 | * complain: | 101 | * complain: |
| 101 | */ | 102 | */ |
| 102 | printk(KERN_ERR "INFO: task %s:%d blocked for more than " | 103 | pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", |
| 103 | "%ld seconds.\n", t->comm, t->pid, timeout); | 104 | t->comm, t->pid, timeout); |
| 104 | printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | 105 | pr_err(" %s %s %.*s\n", |
| 105 | " disables this message.\n"); | 106 | print_tainted(), init_utsname()->release, |
| 107 | (int)strcspn(init_utsname()->version, " "), | ||
| 108 | init_utsname()->version); | ||
| 109 | pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | ||
| 110 | " disables this message.\n"); | ||
| 106 | sched_show_task(t); | 111 | sched_show_task(t); |
| 107 | debug_show_held_locks(t); | 112 | debug_show_held_locks(t); |
| 108 | 113 | ||
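
Editor's note: the new hung-task message above also prints the taint flags, the kernel release, and only the first word of init_utsname()->version (the build number without the timestamp). The truncation relies on the printf "%.*s" precision paired with strcspn(); a tiny stand-alone illustration, with a hypothetical demo function:

#include <linux/utsname.h>
#include <linux/string.h>
#include <linux/printk.h>

static void demo_print_version(void)
{
        /* prints e.g. "3.12.0 #1" instead of the full "#1 SMP Mon ..." */
        pr_info("%s %.*s\n",
                init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
                init_utsname()->version);
}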
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index d1a758bc972a..4a1fef09f658 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
| @@ -1,15 +1,4 @@ | |||
| 1 | # Select this to activate the generic irq options below | ||
| 2 | config HAVE_GENERIC_HARDIRQS | ||
| 3 | bool | ||
| 4 | |||
| 5 | if HAVE_GENERIC_HARDIRQS | ||
| 6 | menu "IRQ subsystem" | 1 | menu "IRQ subsystem" |
| 7 | # | ||
| 8 | # Interrupt subsystem related configuration options | ||
| 9 | # | ||
| 10 | config GENERIC_HARDIRQS | ||
| 11 | def_bool y | ||
| 12 | |||
| 13 | # Options selectable by the architecture code | 2 | # Options selectable by the architecture code |
| 14 | 3 | ||
| 15 | # Make sparse irq Kconfig switch below available | 4 | # Make sparse irq Kconfig switch below available |
| @@ -84,4 +73,3 @@ config SPARSE_IRQ | |||
| 84 | If you don't know what to do here, say N. | 73 | If you don't know what to do here, say N. |
| 85 | 74 | ||
| 86 | endmenu | 75 | endmenu |
| 87 | endif | ||
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 706724e9835d..cf68bb36fe58 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
| @@ -465,27 +465,26 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, | |||
| 465 | } | 465 | } |
| 466 | EXPORT_SYMBOL_GPL(irq_create_strict_mappings); | 466 | EXPORT_SYMBOL_GPL(irq_create_strict_mappings); |
| 467 | 467 | ||
| 468 | unsigned int irq_create_of_mapping(struct device_node *controller, | 468 | unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) |
| 469 | const u32 *intspec, unsigned int intsize) | ||
| 470 | { | 469 | { |
| 471 | struct irq_domain *domain; | 470 | struct irq_domain *domain; |
| 472 | irq_hw_number_t hwirq; | 471 | irq_hw_number_t hwirq; |
| 473 | unsigned int type = IRQ_TYPE_NONE; | 472 | unsigned int type = IRQ_TYPE_NONE; |
| 474 | unsigned int virq; | 473 | unsigned int virq; |
| 475 | 474 | ||
| 476 | domain = controller ? irq_find_host(controller) : irq_default_domain; | 475 | domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; |
| 477 | if (!domain) { | 476 | if (!domain) { |
| 478 | pr_warn("no irq domain found for %s !\n", | 477 | pr_warn("no irq domain found for %s !\n", |
| 479 | of_node_full_name(controller)); | 478 | of_node_full_name(irq_data->np)); |
| 480 | return 0; | 479 | return 0; |
| 481 | } | 480 | } |
| 482 | 481 | ||
| 483 | /* If domain has no translation, then we assume interrupt line */ | 482 | /* If domain has no translation, then we assume interrupt line */ |
| 484 | if (domain->ops->xlate == NULL) | 483 | if (domain->ops->xlate == NULL) |
| 485 | hwirq = intspec[0]; | 484 | hwirq = irq_data->args[0]; |
| 486 | else { | 485 | else { |
| 487 | if (domain->ops->xlate(domain, controller, intspec, intsize, | 486 | if (domain->ops->xlate(domain, irq_data->np, irq_data->args, |
| 488 | &hwirq, &type)) | 487 | irq_data->args_count, &hwirq, &type)) |
| 489 | return 0; | 488 | return 0; |
| 490 | } | 489 | } |
| 491 | 490 | ||
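
Editor's note: irq_create_of_mapping() above now takes the controller node and the interrupt specifier bundled into one struct of_phandle_args, the structure the OF interrupt-parsing helpers already fill in. A hedged sketch of a caller using the new signature; the node pointer and the one-cell specifier value are made up for illustration.

#include <linux/of.h>
#include <linux/of_irq.h>
#include <linux/irqdomain.h>

/* Map hwirq 5 on @node through the reworked interface. */
static unsigned int demo_map_irq(struct device_node *node)
{
        struct of_phandle_args oirq = {
                .np         = node,
                .args_count = 1,        /* assumes #interrupt-cells = <1> */
                .args       = { 5 },
        };

        return irq_create_of_mapping(&oirq);    /* 0 on failure, else Linux irq */
}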
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 514bcfd855a8..3e59f951d42f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -956,7 +956,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 956 | goto out_mput; | 956 | goto out_mput; |
| 957 | } | 957 | } |
| 958 | 958 | ||
| 959 | sched_setscheduler(t, SCHED_FIFO, ¶m); | 959 | sched_setscheduler_nocheck(t, SCHED_FIFO, ¶m); |
| 960 | 960 | ||
| 961 | /* | 961 | /* |
| 962 | * We keep the reference to the task struct even if | 962 | * We keep the reference to the task struct even if |
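
Editor's note: __setup_irq() above switches the irq thread to sched_setscheduler_nocheck(), which sets the policy without the RLIMIT_RTPRIO and capability checks meant for user-initiated requests; for a kernel-created thread those checks are not meaningful. A hedged sketch of the same call on a hypothetical kernel thread (demo_* names invented):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/err.h>

static int demo_thread_fn(void *data)
{
        while (!kthread_should_stop()) {
                set_current_state(TASK_INTERRUPTIBLE);
                schedule();
        }
        __set_current_state(TASK_RUNNING);
        return 0;
}

/* Create a kernel thread and make it SCHED_FIFO without permission checks;
 * the caller still has to wake_up_process() it. */
static struct task_struct *demo_create_rt_thread(void)
{
        struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
        struct task_struct *t = kthread_create(demo_thread_fn, NULL, "demo_rt");

        if (!IS_ERR(t))
                sched_setscheduler_nocheck(t, SCHED_FIFO, &param);

        return t;
}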
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 60f48fa0fd0d..297a9247a3b3 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <linux/sort.h> | 13 | #include <linux/sort.h> |
| 14 | #include <linux/err.h> | 14 | #include <linux/err.h> |
| 15 | #include <linux/static_key.h> | 15 | #include <linux/static_key.h> |
| 16 | #include <linux/jump_label_ratelimit.h> | ||
| 16 | 17 | ||
| 17 | #ifdef HAVE_JUMP_LABEL | 18 | #ifdef HAVE_JUMP_LABEL |
| 18 | 19 | ||
diff --git a/kernel/kexec.c b/kernel/kexec.c index 59f7b55ba745..2a74f307c5ec 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -1474,11 +1474,8 @@ static int __init __parse_crashkernel(char *cmdline, | |||
| 1474 | if (first_colon && (!first_space || first_colon < first_space)) | 1474 | if (first_colon && (!first_space || first_colon < first_space)) |
| 1475 | return parse_crashkernel_mem(ck_cmdline, system_ram, | 1475 | return parse_crashkernel_mem(ck_cmdline, system_ram, |
| 1476 | crash_size, crash_base); | 1476 | crash_size, crash_base); |
| 1477 | else | ||
| 1478 | return parse_crashkernel_simple(ck_cmdline, crash_size, | ||
| 1479 | crash_base); | ||
| 1480 | 1477 | ||
| 1481 | return 0; | 1478 | return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); |
| 1482 | } | 1479 | } |
| 1483 | 1480 | ||
| 1484 | /* | 1481 | /* |
diff --git a/kernel/kmod.c b/kernel/kmod.c index fb326365b694..b086006c59e7 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
| @@ -571,6 +571,10 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) | |||
| 571 | DECLARE_COMPLETION_ONSTACK(done); | 571 | DECLARE_COMPLETION_ONSTACK(done); |
| 572 | int retval = 0; | 572 | int retval = 0; |
| 573 | 573 | ||
| 574 | if (!sub_info->path) { | ||
| 575 | call_usermodehelper_freeinfo(sub_info); | ||
| 576 | return -EINVAL; | ||
| 577 | } | ||
| 574 | helper_lock(); | 578 | helper_lock(); |
| 575 | if (!khelper_wq || usermodehelper_disabled) { | 579 | if (!khelper_wq || usermodehelper_disabled) { |
| 576 | retval = -EBUSY; | 580 | retval = -EBUSY; |
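
Editor's note: call_usermodehelper_exec() above now fails fast with -EINVAL when the subprocess_info carries a NULL path instead of trying to exec it. Callers normally never see this; a typical invocation goes through the call_usermodehelper() wrapper, sketched here with a hypothetical helper whose binary path and environment are purely illustrative.

#include <linux/kmod.h>

static int demo_run_helper(void)
{
        char *argv[] = { "/bin/true", NULL };
        static char *envp[] = {
                "HOME=/",
                "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
                NULL
        };

        /* Builds the subprocess_info and ends up in call_usermodehelper_exec(). */
        return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
}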
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6e33498d665c..a0d367a49122 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -112,6 +112,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { | |||
| 112 | struct kprobe_insn_page { | 112 | struct kprobe_insn_page { |
| 113 | struct list_head list; | 113 | struct list_head list; |
| 114 | kprobe_opcode_t *insns; /* Page of instruction slots */ | 114 | kprobe_opcode_t *insns; /* Page of instruction slots */ |
| 115 | struct kprobe_insn_cache *cache; | ||
| 115 | int nused; | 116 | int nused; |
| 116 | int ngarbage; | 117 | int ngarbage; |
| 117 | char slot_used[]; | 118 | char slot_used[]; |
| @@ -121,12 +122,6 @@ struct kprobe_insn_page { | |||
| 121 | (offsetof(struct kprobe_insn_page, slot_used) + \ | 122 | (offsetof(struct kprobe_insn_page, slot_used) + \ |
| 122 | (sizeof(char) * (slots))) | 123 | (sizeof(char) * (slots))) |
| 123 | 124 | ||
| 124 | struct kprobe_insn_cache { | ||
| 125 | struct list_head pages; /* list of kprobe_insn_page */ | ||
| 126 | size_t insn_size; /* size of instruction slot */ | ||
| 127 | int nr_garbage; | ||
| 128 | }; | ||
| 129 | |||
| 130 | static int slots_per_page(struct kprobe_insn_cache *c) | 125 | static int slots_per_page(struct kprobe_insn_cache *c) |
| 131 | { | 126 | { |
| 132 | return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); | 127 | return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); |
| @@ -138,8 +133,20 @@ enum kprobe_slot_state { | |||
| 138 | SLOT_USED = 2, | 133 | SLOT_USED = 2, |
| 139 | }; | 134 | }; |
| 140 | 135 | ||
| 141 | static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ | 136 | static void *alloc_insn_page(void) |
| 142 | static struct kprobe_insn_cache kprobe_insn_slots = { | 137 | { |
| 138 | return module_alloc(PAGE_SIZE); | ||
| 139 | } | ||
| 140 | |||
| 141 | static void free_insn_page(void *page) | ||
| 142 | { | ||
| 143 | module_free(NULL, page); | ||
| 144 | } | ||
| 145 | |||
| 146 | struct kprobe_insn_cache kprobe_insn_slots = { | ||
| 147 | .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), | ||
| 148 | .alloc = alloc_insn_page, | ||
| 149 | .free = free_insn_page, | ||
| 143 | .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), | 150 | .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), |
| 144 | .insn_size = MAX_INSN_SIZE, | 151 | .insn_size = MAX_INSN_SIZE, |
| 145 | .nr_garbage = 0, | 152 | .nr_garbage = 0, |
| @@ -150,10 +157,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); | |||
| 150 | * __get_insn_slot() - Find a slot on an executable page for an instruction. | 157 | * __get_insn_slot() - Find a slot on an executable page for an instruction. |
| 151 | * We allocate an executable page if there's no room on existing ones. | 158 | * We allocate an executable page if there's no room on existing ones. |
| 152 | */ | 159 | */ |
| 153 | static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) | 160 | kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) |
| 154 | { | 161 | { |
| 155 | struct kprobe_insn_page *kip; | 162 | struct kprobe_insn_page *kip; |
| 163 | kprobe_opcode_t *slot = NULL; | ||
| 156 | 164 | ||
| 165 | mutex_lock(&c->mutex); | ||
| 157 | retry: | 166 | retry: |
| 158 | list_for_each_entry(kip, &c->pages, list) { | 167 | list_for_each_entry(kip, &c->pages, list) { |
| 159 | if (kip->nused < slots_per_page(c)) { | 168 | if (kip->nused < slots_per_page(c)) { |
| @@ -162,7 +171,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) | |||
| 162 | if (kip->slot_used[i] == SLOT_CLEAN) { | 171 | if (kip->slot_used[i] == SLOT_CLEAN) { |
| 163 | kip->slot_used[i] = SLOT_USED; | 172 | kip->slot_used[i] = SLOT_USED; |
| 164 | kip->nused++; | 173 | kip->nused++; |
| 165 | return kip->insns + (i * c->insn_size); | 174 | slot = kip->insns + (i * c->insn_size); |
| 175 | goto out; | ||
| 166 | } | 176 | } |
| 167 | } | 177 | } |
| 168 | /* kip->nused is broken. Fix it. */ | 178 | /* kip->nused is broken. Fix it. */ |
| @@ -178,37 +188,29 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) | |||
| 178 | /* All out of space. Need to allocate a new page. */ | 188 | /* All out of space. Need to allocate a new page. */ |
| 179 | kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); | 189 | kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); |
| 180 | if (!kip) | 190 | if (!kip) |
| 181 | return NULL; | 191 | goto out; |
| 182 | 192 | ||
| 183 | /* | 193 | /* |
| 184 | * Use module_alloc so this page is within +/- 2GB of where the | 194 | * Use module_alloc so this page is within +/- 2GB of where the |
| 185 | * kernel image and loaded module images reside. This is required | 195 | * kernel image and loaded module images reside. This is required |
| 186 | * so x86_64 can correctly handle the %rip-relative fixups. | 196 | * so x86_64 can correctly handle the %rip-relative fixups. |
| 187 | */ | 197 | */ |
| 188 | kip->insns = module_alloc(PAGE_SIZE); | 198 | kip->insns = c->alloc(); |
| 189 | if (!kip->insns) { | 199 | if (!kip->insns) { |
| 190 | kfree(kip); | 200 | kfree(kip); |
| 191 | return NULL; | 201 | goto out; |
| 192 | } | 202 | } |
| 193 | INIT_LIST_HEAD(&kip->list); | 203 | INIT_LIST_HEAD(&kip->list); |
| 194 | memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); | 204 | memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); |
| 195 | kip->slot_used[0] = SLOT_USED; | 205 | kip->slot_used[0] = SLOT_USED; |
| 196 | kip->nused = 1; | 206 | kip->nused = 1; |
| 197 | kip->ngarbage = 0; | 207 | kip->ngarbage = 0; |
| 208 | kip->cache = c; | ||
| 198 | list_add(&kip->list, &c->pages); | 209 | list_add(&kip->list, &c->pages); |
| 199 | return kip->insns; | 210 | slot = kip->insns; |
| 200 | } | 211 | out: |
| 201 | 212 | mutex_unlock(&c->mutex); | |
| 202 | 213 | return slot; | |
| 203 | kprobe_opcode_t __kprobes *get_insn_slot(void) | ||
| 204 | { | ||
| 205 | kprobe_opcode_t *ret = NULL; | ||
| 206 | |||
| 207 | mutex_lock(&kprobe_insn_mutex); | ||
| 208 | ret = __get_insn_slot(&kprobe_insn_slots); | ||
| 209 | mutex_unlock(&kprobe_insn_mutex); | ||
| 210 | |||
| 211 | return ret; | ||
| 212 | } | 214 | } |
| 213 | 215 | ||
| 214 | /* Return 1 if all garbages are collected, otherwise 0. */ | 216 | /* Return 1 if all garbages are collected, otherwise 0. */ |
| @@ -225,7 +227,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) | |||
| 225 | */ | 227 | */ |
| 226 | if (!list_is_singular(&kip->list)) { | 228 | if (!list_is_singular(&kip->list)) { |
| 227 | list_del(&kip->list); | 229 | list_del(&kip->list); |
| 228 | module_free(NULL, kip->insns); | 230 | kip->cache->free(kip->insns); |
| 229 | kfree(kip); | 231 | kfree(kip); |
| 230 | } | 232 | } |
| 231 | return 1; | 233 | return 1; |
| @@ -255,11 +257,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) | |||
| 255 | return 0; | 257 | return 0; |
| 256 | } | 258 | } |
| 257 | 259 | ||
| 258 | static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, | 260 | void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, |
| 259 | kprobe_opcode_t *slot, int dirty) | 261 | kprobe_opcode_t *slot, int dirty) |
| 260 | { | 262 | { |
| 261 | struct kprobe_insn_page *kip; | 263 | struct kprobe_insn_page *kip; |
| 262 | 264 | ||
| 265 | mutex_lock(&c->mutex); | ||
| 263 | list_for_each_entry(kip, &c->pages, list) { | 266 | list_for_each_entry(kip, &c->pages, list) { |
| 264 | long idx = ((long)slot - (long)kip->insns) / | 267 | long idx = ((long)slot - (long)kip->insns) / |
| 265 | (c->insn_size * sizeof(kprobe_opcode_t)); | 268 | (c->insn_size * sizeof(kprobe_opcode_t)); |
| @@ -272,45 +275,25 @@ static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, | |||
| 272 | collect_garbage_slots(c); | 275 | collect_garbage_slots(c); |
| 273 | } else | 276 | } else |
| 274 | collect_one_slot(kip, idx); | 277 | collect_one_slot(kip, idx); |
| 275 | return; | 278 | goto out; |
| 276 | } | 279 | } |
| 277 | } | 280 | } |
| 278 | /* Could not free this slot. */ | 281 | /* Could not free this slot. */ |
| 279 | WARN_ON(1); | 282 | WARN_ON(1); |
| 283 | out: | ||
| 284 | mutex_unlock(&c->mutex); | ||
| 280 | } | 285 | } |
| 281 | 286 | ||
| 282 | void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) | ||
| 283 | { | ||
| 284 | mutex_lock(&kprobe_insn_mutex); | ||
| 285 | __free_insn_slot(&kprobe_insn_slots, slot, dirty); | ||
| 286 | mutex_unlock(&kprobe_insn_mutex); | ||
| 287 | } | ||
| 288 | #ifdef CONFIG_OPTPROBES | 287 | #ifdef CONFIG_OPTPROBES |
| 289 | /* For optimized_kprobe buffer */ | 288 | /* For optimized_kprobe buffer */ |
| 290 | static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ | 289 | struct kprobe_insn_cache kprobe_optinsn_slots = { |
| 291 | static struct kprobe_insn_cache kprobe_optinsn_slots = { | 290 | .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), |
| 291 | .alloc = alloc_insn_page, | ||
| 292 | .free = free_insn_page, | ||
| 292 | .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), | 293 | .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), |
| 293 | /* .insn_size is initialized later */ | 294 | /* .insn_size is initialized later */ |
| 294 | .nr_garbage = 0, | 295 | .nr_garbage = 0, |
| 295 | }; | 296 | }; |
| 296 | /* Get a slot for optimized_kprobe buffer */ | ||
| 297 | kprobe_opcode_t __kprobes *get_optinsn_slot(void) | ||
| 298 | { | ||
| 299 | kprobe_opcode_t *ret = NULL; | ||
| 300 | |||
| 301 | mutex_lock(&kprobe_optinsn_mutex); | ||
| 302 | ret = __get_insn_slot(&kprobe_optinsn_slots); | ||
| 303 | mutex_unlock(&kprobe_optinsn_mutex); | ||
| 304 | |||
| 305 | return ret; | ||
| 306 | } | ||
| 307 | |||
| 308 | void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) | ||
| 309 | { | ||
| 310 | mutex_lock(&kprobe_optinsn_mutex); | ||
| 311 | __free_insn_slot(&kprobe_optinsn_slots, slot, dirty); | ||
| 312 | mutex_unlock(&kprobe_optinsn_mutex); | ||
| 313 | } | ||
| 314 | #endif | 297 | #endif |
| 315 | #endif | 298 | #endif |
| 316 | 299 | ||
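
Editor's note: the kprobes rework above moves the per-cache mutex and the page allocator into struct kprobe_insn_cache, so every slot cache shares __get_insn_slot()/__free_insn_slot() and only supplies its own alloc/free hooks; the optprobe cache in the same hunk is the first user besides the default one. Below is a hedged sketch of defining such a cache; the demo_* names are invented and the struct layout assumed is the one introduced by this series.

#include <linux/kprobes.h>
#include <linux/moduleloader.h>
#include <linux/mm.h>

static void *demo_alloc_insn_page(void)
{
        return module_alloc(PAGE_SIZE);         /* keep slots near kernel text */
}

static void demo_free_insn_page(void *page)
{
        module_free(NULL, page);
}

static struct kprobe_insn_cache demo_insn_slots = {
        .mutex     = __MUTEX_INITIALIZER(demo_insn_slots.mutex),
        .alloc     = demo_alloc_insn_page,
        .free      = demo_free_insn_page,
        .pages     = LIST_HEAD_INIT(demo_insn_slots.pages),
        .insn_size = MAX_INSN_SIZE,
        .nr_garbage = 0,
};

static kprobe_opcode_t *demo_get_slot(void)
{
        return __get_insn_slot(&demo_insn_slots);
}

static void demo_put_slot(kprobe_opcode_t *slot)
{
        __free_insn_slot(&demo_insn_slots, slot, 0);    /* 0: slot not dirty */
}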
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 6ada93c23a9a..9659d38e008f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
| @@ -113,7 +113,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, | |||
| 113 | unsigned long cnt; | 113 | unsigned long cnt; |
| 114 | int ret; | 114 | int ret; |
| 115 | 115 | ||
| 116 | if (strict_strtoul(buf, 0, &cnt)) | 116 | if (kstrtoul(buf, 0, &cnt)) |
| 117 | return -EINVAL; | 117 | return -EINVAL; |
| 118 | 118 | ||
| 119 | ret = crash_shrink_memory(cnt); | 119 | ret = crash_shrink_memory(cnt); |
diff --git a/kernel/lglock.c b/kernel/lglock.c index 6535a667a5a7..86ae2aebf004 100644 --- a/kernel/lglock.c +++ b/kernel/lglock.c | |||
| @@ -21,7 +21,7 @@ void lg_local_lock(struct lglock *lg) | |||
| 21 | arch_spinlock_t *lock; | 21 | arch_spinlock_t *lock; |
| 22 | 22 | ||
| 23 | preempt_disable(); | 23 | preempt_disable(); |
| 24 | rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); | 24 | lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); |
| 25 | lock = this_cpu_ptr(lg->lock); | 25 | lock = this_cpu_ptr(lg->lock); |
| 26 | arch_spin_lock(lock); | 26 | arch_spin_lock(lock); |
| 27 | } | 27 | } |
| @@ -31,7 +31,7 @@ void lg_local_unlock(struct lglock *lg) | |||
| 31 | { | 31 | { |
| 32 | arch_spinlock_t *lock; | 32 | arch_spinlock_t *lock; |
| 33 | 33 | ||
| 34 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | 34 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); |
| 35 | lock = this_cpu_ptr(lg->lock); | 35 | lock = this_cpu_ptr(lg->lock); |
| 36 | arch_spin_unlock(lock); | 36 | arch_spin_unlock(lock); |
| 37 | preempt_enable(); | 37 | preempt_enable(); |
| @@ -43,7 +43,7 @@ void lg_local_lock_cpu(struct lglock *lg, int cpu) | |||
| 43 | arch_spinlock_t *lock; | 43 | arch_spinlock_t *lock; |
| 44 | 44 | ||
| 45 | preempt_disable(); | 45 | preempt_disable(); |
| 46 | rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); | 46 | lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); |
| 47 | lock = per_cpu_ptr(lg->lock, cpu); | 47 | lock = per_cpu_ptr(lg->lock, cpu); |
| 48 | arch_spin_lock(lock); | 48 | arch_spin_lock(lock); |
| 49 | } | 49 | } |
| @@ -53,7 +53,7 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu) | |||
| 53 | { | 53 | { |
| 54 | arch_spinlock_t *lock; | 54 | arch_spinlock_t *lock; |
| 55 | 55 | ||
| 56 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | 56 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); |
| 57 | lock = per_cpu_ptr(lg->lock, cpu); | 57 | lock = per_cpu_ptr(lg->lock, cpu); |
| 58 | arch_spin_unlock(lock); | 58 | arch_spin_unlock(lock); |
| 59 | preempt_enable(); | 59 | preempt_enable(); |
| @@ -65,7 +65,7 @@ void lg_global_lock(struct lglock *lg) | |||
| 65 | int i; | 65 | int i; |
| 66 | 66 | ||
| 67 | preempt_disable(); | 67 | preempt_disable(); |
| 68 | rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); | 68 | lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); |
| 69 | for_each_possible_cpu(i) { | 69 | for_each_possible_cpu(i) { |
| 70 | arch_spinlock_t *lock; | 70 | arch_spinlock_t *lock; |
| 71 | lock = per_cpu_ptr(lg->lock, i); | 71 | lock = per_cpu_ptr(lg->lock, i); |
| @@ -78,7 +78,7 @@ void lg_global_unlock(struct lglock *lg) | |||
| 78 | { | 78 | { |
| 79 | int i; | 79 | int i; |
| 80 | 80 | ||
| 81 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | 81 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); |
| 82 | for_each_possible_cpu(i) { | 82 | for_each_possible_cpu(i) { |
| 83 | arch_spinlock_t *lock; | 83 | arch_spinlock_t *lock; |
| 84 | lock = per_cpu_ptr(lg->lock, i); | 84 | lock = per_cpu_ptr(lg->lock, i); |
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index e16c45b9ee77..4e8e14c34e42 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
| @@ -4224,7 +4224,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) | |||
| 4224 | printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", | 4224 | printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", |
| 4225 | !rcu_lockdep_current_cpu_online() | 4225 | !rcu_lockdep_current_cpu_online() |
| 4226 | ? "RCU used illegally from offline CPU!\n" | 4226 | ? "RCU used illegally from offline CPU!\n" |
| 4227 | : rcu_is_cpu_idle() | 4227 | : !rcu_is_watching() |
| 4228 | ? "RCU used illegally from idle CPU!\n" | 4228 | ? "RCU used illegally from idle CPU!\n" |
| 4229 | : "", | 4229 | : "", |
| 4230 | rcu_scheduler_active, debug_locks); | 4230 | rcu_scheduler_active, debug_locks); |
| @@ -4247,7 +4247,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) | |||
| 4247 | * So complain bitterly if someone does call rcu_read_lock(), | 4247 | * So complain bitterly if someone does call rcu_read_lock(), |
| 4248 | * rcu_read_lock_bh() and so on from extended quiescent states. | 4248 | * rcu_read_lock_bh() and so on from extended quiescent states. |
| 4249 | */ | 4249 | */ |
| 4250 | if (rcu_is_cpu_idle()) | 4250 | if (!rcu_is_watching()) |
| 4251 | printk("RCU used illegally from extended quiescent state!\n"); | 4251 | printk("RCU used illegally from extended quiescent state!\n"); |
| 4252 | 4252 | ||
| 4253 | lockdep_print_held_locks(curr); | 4253 | lockdep_print_held_locks(curr); |
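
Editor's note: lockdep now asks !rcu_is_watching() instead of rcu_is_cpu_idle(); the new helper has the opposite polarity (true when RCU is watching this CPU), which is why both call sites flip the test. The usual debugging idiom looks like this hedged sketch, where demo_trace_hook() is hypothetical:

#include <linux/rcupdate.h>

static void demo_trace_hook(void)
{
        /* From idle or early CPU bring-up RCU may not be watching, and
         * rcu_read_lock() would be illegal there, which is exactly what
         * the lockdep splat above complains about. */
        if (!rcu_is_watching())
                return;

        rcu_read_lock();
        /* ... dereference RCU-protected data ... */
        rcu_read_unlock();
}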
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c index 2b6e69909c39..7cbd4507a7e6 100644 --- a/kernel/modsign_pubkey.c +++ b/kernel/modsign_pubkey.c | |||
| @@ -18,14 +18,14 @@ | |||
| 18 | 18 | ||
| 19 | struct key *modsign_keyring; | 19 | struct key *modsign_keyring; |
| 20 | 20 | ||
| 21 | extern __initdata const u8 modsign_certificate_list[]; | 21 | extern __initconst const u8 modsign_certificate_list[]; |
| 22 | extern __initdata const u8 modsign_certificate_list_end[]; | 22 | extern __initconst const u8 modsign_certificate_list_end[]; |
| 23 | 23 | ||
| 24 | /* | 24 | /* |
| 25 | * We need to make sure ccache doesn't cache the .o file as it doesn't notice | 25 | * We need to make sure ccache doesn't cache the .o file as it doesn't notice |
| 26 | * if modsign.pub changes. | 26 | * if modsign.pub changes. |
| 27 | */ | 27 | */ |
| 28 | static __initdata const char annoy_ccache[] = __TIME__ "foo"; | 28 | static __initconst const char annoy_ccache[] = __TIME__ "foo"; |
| 29 | 29 | ||
| 30 | /* | 30 | /* |
| 31 | * Load the compiled-in keys | 31 | * Load the compiled-in keys |
diff --git a/kernel/module.c b/kernel/module.c index 206915830d29..dc582749fa13 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -136,6 +136,7 @@ static int param_set_bool_enable_only(const char *val, | |||
| 136 | } | 136 | } |
| 137 | 137 | ||
| 138 | static const struct kernel_param_ops param_ops_bool_enable_only = { | 138 | static const struct kernel_param_ops param_ops_bool_enable_only = { |
| 139 | .flags = KERNEL_PARAM_FL_NOARG, | ||
| 139 | .set = param_set_bool_enable_only, | 140 | .set = param_set_bool_enable_only, |
| 140 | .get = param_get_bool, | 141 | .get = param_get_bool, |
| 141 | }; | 142 | }; |
| @@ -603,7 +604,7 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \ | |||
| 603 | static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ | 604 | static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ |
| 604 | struct module_kobject *mk, char *buffer) \ | 605 | struct module_kobject *mk, char *buffer) \ |
| 605 | { \ | 606 | { \ |
| 606 | return sprintf(buffer, "%s\n", mk->mod->field); \ | 607 | return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \ |
| 607 | } \ | 608 | } \ |
| 608 | static int modinfo_##field##_exists(struct module *mod) \ | 609 | static int modinfo_##field##_exists(struct module *mod) \ |
| 609 | { \ | 610 | { \ |
| @@ -1611,6 +1612,14 @@ static void module_remove_modinfo_attrs(struct module *mod) | |||
| 1611 | kfree(mod->modinfo_attrs); | 1612 | kfree(mod->modinfo_attrs); |
| 1612 | } | 1613 | } |
| 1613 | 1614 | ||
| 1615 | static void mod_kobject_put(struct module *mod) | ||
| 1616 | { | ||
| 1617 | DECLARE_COMPLETION_ONSTACK(c); | ||
| 1618 | mod->mkobj.kobj_completion = &c; | ||
| 1619 | kobject_put(&mod->mkobj.kobj); | ||
| 1620 | wait_for_completion(&c); | ||
| 1621 | } | ||
| 1622 | |||
| 1614 | static int mod_sysfs_init(struct module *mod) | 1623 | static int mod_sysfs_init(struct module *mod) |
| 1615 | { | 1624 | { |
| 1616 | int err; | 1625 | int err; |
| @@ -1638,7 +1647,7 @@ static int mod_sysfs_init(struct module *mod) | |||
| 1638 | err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, | 1647 | err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, |
| 1639 | "%s", mod->name); | 1648 | "%s", mod->name); |
| 1640 | if (err) | 1649 | if (err) |
| 1641 | kobject_put(&mod->mkobj.kobj); | 1650 | mod_kobject_put(mod); |
| 1642 | 1651 | ||
| 1643 | /* delay uevent until full sysfs population */ | 1652 | /* delay uevent until full sysfs population */ |
| 1644 | out: | 1653 | out: |
| @@ -1682,7 +1691,7 @@ out_unreg_param: | |||
| 1682 | out_unreg_holders: | 1691 | out_unreg_holders: |
| 1683 | kobject_put(mod->holders_dir); | 1692 | kobject_put(mod->holders_dir); |
| 1684 | out_unreg: | 1693 | out_unreg: |
| 1685 | kobject_put(&mod->mkobj.kobj); | 1694 | mod_kobject_put(mod); |
| 1686 | out: | 1695 | out: |
| 1687 | return err; | 1696 | return err; |
| 1688 | } | 1697 | } |
| @@ -1691,7 +1700,7 @@ static void mod_sysfs_fini(struct module *mod) | |||
| 1691 | { | 1700 | { |
| 1692 | remove_notes_attrs(mod); | 1701 | remove_notes_attrs(mod); |
| 1693 | remove_sect_attrs(mod); | 1702 | remove_sect_attrs(mod); |
| 1694 | kobject_put(&mod->mkobj.kobj); | 1703 | mod_kobject_put(mod); |
| 1695 | } | 1704 | } |
| 1696 | 1705 | ||
| 1697 | #else /* !CONFIG_SYSFS */ | 1706 | #else /* !CONFIG_SYSFS */ |
| @@ -2540,21 +2549,20 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, | |||
| 2540 | /* Sets info->hdr and info->len. */ | 2549 | /* Sets info->hdr and info->len. */ |
| 2541 | static int copy_module_from_fd(int fd, struct load_info *info) | 2550 | static int copy_module_from_fd(int fd, struct load_info *info) |
| 2542 | { | 2551 | { |
| 2543 | struct file *file; | 2552 | struct fd f = fdget(fd); |
| 2544 | int err; | 2553 | int err; |
| 2545 | struct kstat stat; | 2554 | struct kstat stat; |
| 2546 | loff_t pos; | 2555 | loff_t pos; |
| 2547 | ssize_t bytes = 0; | 2556 | ssize_t bytes = 0; |
| 2548 | 2557 | ||
| 2549 | file = fget(fd); | 2558 | if (!f.file) |
| 2550 | if (!file) | ||
| 2551 | return -ENOEXEC; | 2559 | return -ENOEXEC; |
| 2552 | 2560 | ||
| 2553 | err = security_kernel_module_from_file(file); | 2561 | err = security_kernel_module_from_file(f.file); |
| 2554 | if (err) | 2562 | if (err) |
| 2555 | goto out; | 2563 | goto out; |
| 2556 | 2564 | ||
| 2557 | err = vfs_getattr(&file->f_path, &stat); | 2565 | err = vfs_getattr(&f.file->f_path, &stat); |
| 2558 | if (err) | 2566 | if (err) |
| 2559 | goto out; | 2567 | goto out; |
| 2560 | 2568 | ||
| @@ -2577,7 +2585,7 @@ static int copy_module_from_fd(int fd, struct load_info *info) | |||
| 2577 | 2585 | ||
| 2578 | pos = 0; | 2586 | pos = 0; |
| 2579 | while (pos < stat.size) { | 2587 | while (pos < stat.size) { |
| 2580 | bytes = kernel_read(file, pos, (char *)(info->hdr) + pos, | 2588 | bytes = kernel_read(f.file, pos, (char *)(info->hdr) + pos, |
| 2581 | stat.size - pos); | 2589 | stat.size - pos); |
| 2582 | if (bytes < 0) { | 2590 | if (bytes < 0) { |
| 2583 | vfree(info->hdr); | 2591 | vfree(info->hdr); |
| @@ -2591,7 +2599,7 @@ static int copy_module_from_fd(int fd, struct load_info *info) | |||
| 2591 | info->len = pos; | 2599 | info->len = pos; |
| 2592 | 2600 | ||
| 2593 | out: | 2601 | out: |
| 2594 | fput(file); | 2602 | fdput(f); |
| 2595 | return err; | 2603 | return err; |
| 2596 | } | 2604 | } |
| 2597 | 2605 | ||
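
The new mod_kobject_put() pairs kobject_put() with an on-stack completion that the kobject's release callback signals (the matching module_kobj_release() hunk appears in kernel/params.c below), so the module error paths no longer proceed while sysfs may still hold a reference to the embedded kobject. A minimal sketch of the same wait-for-release pattern, using an invented demo_obj type — none of these names are part of the kernel change:

#include <linux/kobject.h>
#include <linux/completion.h>

struct demo_obj {
	struct kobject kobj;
	struct completion *done;	/* signalled from ->release */
};

static void demo_release(struct kobject *kobj)
{
	struct demo_obj *d = container_of(kobj, struct demo_obj, kobj);

	complete(d->done);		/* the last reference just went away */
}

static struct kobj_type demo_ktype = {
	.release = demo_release,
};

static int demo_obj_init(struct demo_obj *d, struct kobject *parent)
{
	return kobject_init_and_add(&d->kobj, &demo_ktype, parent, "demo");
}

static void demo_obj_put_and_wait(struct demo_obj *d)
{
	DECLARE_COMPLETION_ONSTACK(c);

	d->done = &c;
	kobject_put(&d->kobj);		/* drop our reference ... */
	wait_for_completion(&c);	/* ... and wait until release has run */
}
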
diff --git a/kernel/mutex.c b/kernel/mutex.c index ff05f4bd86eb..d24105b1b794 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
| @@ -209,11 +209,13 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) | |||
| 209 | */ | 209 | */ |
| 210 | static inline int mutex_can_spin_on_owner(struct mutex *lock) | 210 | static inline int mutex_can_spin_on_owner(struct mutex *lock) |
| 211 | { | 211 | { |
| 212 | struct task_struct *owner; | ||
| 212 | int retval = 1; | 213 | int retval = 1; |
| 213 | 214 | ||
| 214 | rcu_read_lock(); | 215 | rcu_read_lock(); |
| 215 | if (lock->owner) | 216 | owner = ACCESS_ONCE(lock->owner); |
| 216 | retval = lock->owner->on_cpu; | 217 | if (owner) |
| 218 | retval = owner->on_cpu; | ||
| 217 | rcu_read_unlock(); | 219 | rcu_read_unlock(); |
| 218 | /* | 220 | /* |
| 219 | * if lock->owner is not set, the mutex owner may have just acquired | 221 | * if lock->owner is not set, the mutex owner may have just acquired |
| @@ -408,7 +410,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, | |||
| 408 | static __always_inline int __sched | 410 | static __always_inline int __sched |
| 409 | __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | 411 | __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, |
| 410 | struct lockdep_map *nest_lock, unsigned long ip, | 412 | struct lockdep_map *nest_lock, unsigned long ip, |
| 411 | struct ww_acquire_ctx *ww_ctx) | 413 | struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) |
| 412 | { | 414 | { |
| 413 | struct task_struct *task = current; | 415 | struct task_struct *task = current; |
| 414 | struct mutex_waiter waiter; | 416 | struct mutex_waiter waiter; |
| @@ -448,7 +450,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 448 | struct task_struct *owner; | 450 | struct task_struct *owner; |
| 449 | struct mspin_node node; | 451 | struct mspin_node node; |
| 450 | 452 | ||
| 451 | if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { | 453 | if (use_ww_ctx && ww_ctx->acquired > 0) { |
| 452 | struct ww_mutex *ww; | 454 | struct ww_mutex *ww; |
| 453 | 455 | ||
| 454 | ww = container_of(lock, struct ww_mutex, base); | 456 | ww = container_of(lock, struct ww_mutex, base); |
| @@ -461,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 461 | * performed the optimistic spinning cannot be done. | 463 | * performed the optimistic spinning cannot be done. |
| 462 | */ | 464 | */ |
| 463 | if (ACCESS_ONCE(ww->ctx)) | 465 | if (ACCESS_ONCE(ww->ctx)) |
| 464 | break; | 466 | goto slowpath; |
| 465 | } | 467 | } |
| 466 | 468 | ||
| 467 | /* | 469 | /* |
| @@ -472,13 +474,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 472 | owner = ACCESS_ONCE(lock->owner); | 474 | owner = ACCESS_ONCE(lock->owner); |
| 473 | if (owner && !mutex_spin_on_owner(lock, owner)) { | 475 | if (owner && !mutex_spin_on_owner(lock, owner)) { |
| 474 | mspin_unlock(MLOCK(lock), &node); | 476 | mspin_unlock(MLOCK(lock), &node); |
| 475 | break; | 477 | goto slowpath; |
| 476 | } | 478 | } |
| 477 | 479 | ||
| 478 | if ((atomic_read(&lock->count) == 1) && | 480 | if ((atomic_read(&lock->count) == 1) && |
| 479 | (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { | 481 | (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { |
| 480 | lock_acquired(&lock->dep_map, ip); | 482 | lock_acquired(&lock->dep_map, ip); |
| 481 | if (!__builtin_constant_p(ww_ctx == NULL)) { | 483 | if (use_ww_ctx) { |
| 482 | struct ww_mutex *ww; | 484 | struct ww_mutex *ww; |
| 483 | ww = container_of(lock, struct ww_mutex, base); | 485 | ww = container_of(lock, struct ww_mutex, base); |
| 484 | 486 | ||
| @@ -499,7 +501,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 499 | * the owner complete. | 501 | * the owner complete. |
| 500 | */ | 502 | */ |
| 501 | if (!owner && (need_resched() || rt_task(task))) | 503 | if (!owner && (need_resched() || rt_task(task))) |
| 502 | break; | 504 | goto slowpath; |
| 503 | 505 | ||
| 504 | /* | 506 | /* |
| 505 | * The cpu_relax() call is a compiler barrier which forces | 507 | * The cpu_relax() call is a compiler barrier which forces |
| @@ -513,6 +515,10 @@ slowpath: | |||
| 513 | #endif | 515 | #endif |
| 514 | spin_lock_mutex(&lock->wait_lock, flags); | 516 | spin_lock_mutex(&lock->wait_lock, flags); |
| 515 | 517 | ||
| 518 | /* once more, can we acquire the lock? */ | ||
| 519 | if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1)) | ||
| 520 | goto skip_wait; | ||
| 521 | |||
| 516 | debug_mutex_lock_common(lock, &waiter); | 522 | debug_mutex_lock_common(lock, &waiter); |
| 517 | debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); | 523 | debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); |
| 518 | 524 | ||
| @@ -520,9 +526,6 @@ slowpath: | |||
| 520 | list_add_tail(&waiter.list, &lock->wait_list); | 526 | list_add_tail(&waiter.list, &lock->wait_list); |
| 521 | waiter.task = task; | 527 | waiter.task = task; |
| 522 | 528 | ||
| 523 | if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1)) | ||
| 524 | goto done; | ||
| 525 | |||
| 526 | lock_contended(&lock->dep_map, ip); | 529 | lock_contended(&lock->dep_map, ip); |
| 527 | 530 | ||
| 528 | for (;;) { | 531 | for (;;) { |
| @@ -536,7 +539,7 @@ slowpath: | |||
| 536 | * other waiters: | 539 | * other waiters: |
| 537 | */ | 540 | */ |
| 538 | if (MUTEX_SHOW_NO_WAITER(lock) && | 541 | if (MUTEX_SHOW_NO_WAITER(lock) && |
| 539 | (atomic_xchg(&lock->count, -1) == 1)) | 542 | (atomic_xchg(&lock->count, -1) == 1)) |
| 540 | break; | 543 | break; |
| 541 | 544 | ||
| 542 | /* | 545 | /* |
| @@ -548,7 +551,7 @@ slowpath: | |||
| 548 | goto err; | 551 | goto err; |
| 549 | } | 552 | } |
| 550 | 553 | ||
| 551 | if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { | 554 | if (use_ww_ctx && ww_ctx->acquired > 0) { |
| 552 | ret = __mutex_lock_check_stamp(lock, ww_ctx); | 555 | ret = __mutex_lock_check_stamp(lock, ww_ctx); |
| 553 | if (ret) | 556 | if (ret) |
| 554 | goto err; | 557 | goto err; |
| @@ -561,24 +564,25 @@ slowpath: | |||
| 561 | schedule_preempt_disabled(); | 564 | schedule_preempt_disabled(); |
| 562 | spin_lock_mutex(&lock->wait_lock, flags); | 565 | spin_lock_mutex(&lock->wait_lock, flags); |
| 563 | } | 566 | } |
| 567 | mutex_remove_waiter(lock, &waiter, current_thread_info()); | ||
| 568 | /* set it to 0 if there are no waiters left: */ | ||
| 569 | if (likely(list_empty(&lock->wait_list))) | ||
| 570 | atomic_set(&lock->count, 0); | ||
| 571 | debug_mutex_free_waiter(&waiter); | ||
| 564 | 572 | ||
| 565 | done: | 573 | skip_wait: |
| 574 | /* got the lock - cleanup and rejoice! */ | ||
| 566 | lock_acquired(&lock->dep_map, ip); | 575 | lock_acquired(&lock->dep_map, ip); |
| 567 | /* got the lock - rejoice! */ | ||
| 568 | mutex_remove_waiter(lock, &waiter, current_thread_info()); | ||
| 569 | mutex_set_owner(lock); | 576 | mutex_set_owner(lock); |
| 570 | 577 | ||
| 571 | if (!__builtin_constant_p(ww_ctx == NULL)) { | 578 | if (use_ww_ctx) { |
| 572 | struct ww_mutex *ww = container_of(lock, | 579 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); |
| 573 | struct ww_mutex, | ||
| 574 | base); | ||
| 575 | struct mutex_waiter *cur; | 580 | struct mutex_waiter *cur; |
| 576 | 581 | ||
| 577 | /* | 582 | /* |
| 578 | * This branch gets optimized out for the common case, | 583 | * This branch gets optimized out for the common case, |
| 579 | * and is only important for ww_mutex_lock. | 584 | * and is only important for ww_mutex_lock. |
| 580 | */ | 585 | */ |
| 581 | |||
| 582 | ww_mutex_lock_acquired(ww, ww_ctx); | 586 | ww_mutex_lock_acquired(ww, ww_ctx); |
| 583 | ww->ctx = ww_ctx; | 587 | ww->ctx = ww_ctx; |
| 584 | 588 | ||
| @@ -592,15 +596,8 @@ done: | |||
| 592 | } | 596 | } |
| 593 | } | 597 | } |
| 594 | 598 | ||
| 595 | /* set it to 0 if there are no waiters left: */ | ||
| 596 | if (likely(list_empty(&lock->wait_list))) | ||
| 597 | atomic_set(&lock->count, 0); | ||
| 598 | |||
| 599 | spin_unlock_mutex(&lock->wait_lock, flags); | 599 | spin_unlock_mutex(&lock->wait_lock, flags); |
| 600 | |||
| 601 | debug_mutex_free_waiter(&waiter); | ||
| 602 | preempt_enable(); | 600 | preempt_enable(); |
| 603 | |||
| 604 | return 0; | 601 | return 0; |
| 605 | 602 | ||
| 606 | err: | 603 | err: |
| @@ -618,7 +615,7 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass) | |||
| 618 | { | 615 | { |
| 619 | might_sleep(); | 616 | might_sleep(); |
| 620 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, | 617 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, |
| 621 | subclass, NULL, _RET_IP_, NULL); | 618 | subclass, NULL, _RET_IP_, NULL, 0); |
| 622 | } | 619 | } |
| 623 | 620 | ||
| 624 | EXPORT_SYMBOL_GPL(mutex_lock_nested); | 621 | EXPORT_SYMBOL_GPL(mutex_lock_nested); |
| @@ -628,7 +625,7 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) | |||
| 628 | { | 625 | { |
| 629 | might_sleep(); | 626 | might_sleep(); |
| 630 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, | 627 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, |
| 631 | 0, nest, _RET_IP_, NULL); | 628 | 0, nest, _RET_IP_, NULL, 0); |
| 632 | } | 629 | } |
| 633 | 630 | ||
| 634 | EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); | 631 | EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); |
| @@ -638,7 +635,7 @@ mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) | |||
| 638 | { | 635 | { |
| 639 | might_sleep(); | 636 | might_sleep(); |
| 640 | return __mutex_lock_common(lock, TASK_KILLABLE, | 637 | return __mutex_lock_common(lock, TASK_KILLABLE, |
| 641 | subclass, NULL, _RET_IP_, NULL); | 638 | subclass, NULL, _RET_IP_, NULL, 0); |
| 642 | } | 639 | } |
| 643 | EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); | 640 | EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); |
| 644 | 641 | ||
| @@ -647,7 +644,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) | |||
| 647 | { | 644 | { |
| 648 | might_sleep(); | 645 | might_sleep(); |
| 649 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, | 646 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, |
| 650 | subclass, NULL, _RET_IP_, NULL); | 647 | subclass, NULL, _RET_IP_, NULL, 0); |
| 651 | } | 648 | } |
| 652 | 649 | ||
| 653 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); | 650 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); |
| @@ -685,8 +682,8 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | |||
| 685 | 682 | ||
| 686 | might_sleep(); | 683 | might_sleep(); |
| 687 | ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, | 684 | ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, |
| 688 | 0, &ctx->dep_map, _RET_IP_, ctx); | 685 | 0, &ctx->dep_map, _RET_IP_, ctx, 1); |
| 689 | if (!ret && ctx->acquired > 0) | 686 | if (!ret && ctx->acquired > 1) |
| 690 | return ww_mutex_deadlock_injection(lock, ctx); | 687 | return ww_mutex_deadlock_injection(lock, ctx); |
| 691 | 688 | ||
| 692 | return ret; | 689 | return ret; |
| @@ -700,9 +697,9 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | |||
| 700 | 697 | ||
| 701 | might_sleep(); | 698 | might_sleep(); |
| 702 | ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, | 699 | ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, |
| 703 | 0, &ctx->dep_map, _RET_IP_, ctx); | 700 | 0, &ctx->dep_map, _RET_IP_, ctx, 1); |
| 704 | 701 | ||
| 705 | if (!ret && ctx->acquired > 0) | 702 | if (!ret && ctx->acquired > 1) |
| 706 | return ww_mutex_deadlock_injection(lock, ctx); | 703 | return ww_mutex_deadlock_injection(lock, ctx); |
| 707 | 704 | ||
| 708 | return ret; | 705 | return ret; |
| @@ -812,28 +809,28 @@ __mutex_lock_slowpath(atomic_t *lock_count) | |||
| 812 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 809 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
| 813 | 810 | ||
| 814 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, | 811 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, |
| 815 | NULL, _RET_IP_, NULL); | 812 | NULL, _RET_IP_, NULL, 0); |
| 816 | } | 813 | } |
| 817 | 814 | ||
| 818 | static noinline int __sched | 815 | static noinline int __sched |
| 819 | __mutex_lock_killable_slowpath(struct mutex *lock) | 816 | __mutex_lock_killable_slowpath(struct mutex *lock) |
| 820 | { | 817 | { |
| 821 | return __mutex_lock_common(lock, TASK_KILLABLE, 0, | 818 | return __mutex_lock_common(lock, TASK_KILLABLE, 0, |
| 822 | NULL, _RET_IP_, NULL); | 819 | NULL, _RET_IP_, NULL, 0); |
| 823 | } | 820 | } |
| 824 | 821 | ||
| 825 | static noinline int __sched | 822 | static noinline int __sched |
| 826 | __mutex_lock_interruptible_slowpath(struct mutex *lock) | 823 | __mutex_lock_interruptible_slowpath(struct mutex *lock) |
| 827 | { | 824 | { |
| 828 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, | 825 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, |
| 829 | NULL, _RET_IP_, NULL); | 826 | NULL, _RET_IP_, NULL, 0); |
| 830 | } | 827 | } |
| 831 | 828 | ||
| 832 | static noinline int __sched | 829 | static noinline int __sched |
| 833 | __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | 830 | __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) |
| 834 | { | 831 | { |
| 835 | return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, | 832 | return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, |
| 836 | NULL, _RET_IP_, ctx); | 833 | NULL, _RET_IP_, ctx, 1); |
| 837 | } | 834 | } |
| 838 | 835 | ||
| 839 | static noinline int __sched | 836 | static noinline int __sched |
| @@ -841,7 +838,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, | |||
| 841 | struct ww_acquire_ctx *ctx) | 838 | struct ww_acquire_ctx *ctx) |
| 842 | { | 839 | { |
| 843 | return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, | 840 | return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, |
| 844 | NULL, _RET_IP_, ctx); | 841 | NULL, _RET_IP_, ctx, 1); |
| 845 | } | 842 | } |
| 846 | 843 | ||
| 847 | #endif | 844 | #endif |
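
Replacing the __builtin_constant_p(ww_ctx == NULL) tests with an explicit use_ww_ctx parameter keeps the same optimization but makes it reliable: every caller passes a literal 0 or 1, and because __mutex_lock_common() is __always_inline the wait/wound branches are compiled out of the plain mutex paths. A stand-alone, hedged illustration of that technique (the function names are invented for the example):

#include <linux/compiler.h>
#include <linux/types.h>

static __always_inline int demo_lock_common(int *lock, void *ctx,
					    const bool use_ctx)
{
	if (use_ctx && ctx) {
		/* ww-style bookkeeping would live here; the whole branch
		 * is dead code in callers that pass use_ctx == false. */
	}
	/* ... common slow-path work on *lock ... */
	return 0;
}

int demo_lock(int *lock)
{
	return demo_lock_common(lock, NULL, false);	/* branch eliminated */
}

int demo_lock_ctx(int *lock, void *ctx)
{
	return demo_lock_common(lock, ctx, true);
}
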
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 364ceab15f0c..8e7811086b82 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
| @@ -29,15 +29,15 @@ | |||
| 29 | static struct kmem_cache *nsproxy_cachep; | 29 | static struct kmem_cache *nsproxy_cachep; |
| 30 | 30 | ||
| 31 | struct nsproxy init_nsproxy = { | 31 | struct nsproxy init_nsproxy = { |
| 32 | .count = ATOMIC_INIT(1), | 32 | .count = ATOMIC_INIT(1), |
| 33 | .uts_ns = &init_uts_ns, | 33 | .uts_ns = &init_uts_ns, |
| 34 | #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) | 34 | #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) |
| 35 | .ipc_ns = &init_ipc_ns, | 35 | .ipc_ns = &init_ipc_ns, |
| 36 | #endif | 36 | #endif |
| 37 | .mnt_ns = NULL, | 37 | .mnt_ns = NULL, |
| 38 | .pid_ns = &init_pid_ns, | 38 | .pid_ns_for_children = &init_pid_ns, |
| 39 | #ifdef CONFIG_NET | 39 | #ifdef CONFIG_NET |
| 40 | .net_ns = &init_net, | 40 | .net_ns = &init_net, |
| 41 | #endif | 41 | #endif |
| 42 | }; | 42 | }; |
| 43 | 43 | ||
| @@ -85,9 +85,10 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, | |||
| 85 | goto out_ipc; | 85 | goto out_ipc; |
| 86 | } | 86 | } |
| 87 | 87 | ||
| 88 | new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); | 88 | new_nsp->pid_ns_for_children = |
| 89 | if (IS_ERR(new_nsp->pid_ns)) { | 89 | copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children); |
| 90 | err = PTR_ERR(new_nsp->pid_ns); | 90 | if (IS_ERR(new_nsp->pid_ns_for_children)) { |
| 91 | err = PTR_ERR(new_nsp->pid_ns_for_children); | ||
| 91 | goto out_pid; | 92 | goto out_pid; |
| 92 | } | 93 | } |
| 93 | 94 | ||
| @@ -100,8 +101,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, | |||
| 100 | return new_nsp; | 101 | return new_nsp; |
| 101 | 102 | ||
| 102 | out_net: | 103 | out_net: |
| 103 | if (new_nsp->pid_ns) | 104 | if (new_nsp->pid_ns_for_children) |
| 104 | put_pid_ns(new_nsp->pid_ns); | 105 | put_pid_ns(new_nsp->pid_ns_for_children); |
| 105 | out_pid: | 106 | out_pid: |
| 106 | if (new_nsp->ipc_ns) | 107 | if (new_nsp->ipc_ns) |
| 107 | put_ipc_ns(new_nsp->ipc_ns); | 108 | put_ipc_ns(new_nsp->ipc_ns); |
| @@ -125,22 +126,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) | |||
| 125 | struct nsproxy *old_ns = tsk->nsproxy; | 126 | struct nsproxy *old_ns = tsk->nsproxy; |
| 126 | struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); | 127 | struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); |
| 127 | struct nsproxy *new_ns; | 128 | struct nsproxy *new_ns; |
| 128 | int err = 0; | ||
| 129 | 129 | ||
| 130 | if (!old_ns) | 130 | if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | |
| 131 | CLONE_NEWPID | CLONE_NEWNET)))) { | ||
| 132 | get_nsproxy(old_ns); | ||
| 131 | return 0; | 133 | return 0; |
| 132 | |||
| 133 | get_nsproxy(old_ns); | ||
| 134 | |||
| 135 | if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | | ||
| 136 | CLONE_NEWPID | CLONE_NEWNET))) | ||
| 137 | return 0; | ||
| 138 | |||
| 139 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) { | ||
| 140 | err = -EPERM; | ||
| 141 | goto out; | ||
| 142 | } | 134 | } |
| 143 | 135 | ||
| 136 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) | ||
| 137 | return -EPERM; | ||
| 138 | |||
| 144 | /* | 139 | /* |
| 145 | * CLONE_NEWIPC must detach from the undolist: after switching | 140 | * CLONE_NEWIPC must detach from the undolist: after switching |
| 146 | * to a new ipc namespace, the semaphore arrays from the old | 141 | * to a new ipc namespace, the semaphore arrays from the old |
| @@ -148,22 +143,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) | |||
| 148 | * means share undolist with parent, so we must forbid using | 143 | * means share undolist with parent, so we must forbid using |
| 149 | * it along with CLONE_NEWIPC. | 144 | * it along with CLONE_NEWIPC. |
| 150 | */ | 145 | */ |
| 151 | if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) { | 146 | if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) == |
| 152 | err = -EINVAL; | 147 | (CLONE_NEWIPC | CLONE_SYSVSEM)) |
| 153 | goto out; | 148 | return -EINVAL; |
| 154 | } | ||
| 155 | 149 | ||
| 156 | new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); | 150 | new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); |
| 157 | if (IS_ERR(new_ns)) { | 151 | if (IS_ERR(new_ns)) |
| 158 | err = PTR_ERR(new_ns); | 152 | return PTR_ERR(new_ns); |
| 159 | goto out; | ||
| 160 | } | ||
| 161 | 153 | ||
| 162 | tsk->nsproxy = new_ns; | 154 | tsk->nsproxy = new_ns; |
| 163 | 155 | return 0; | |
| 164 | out: | ||
| 165 | put_nsproxy(old_ns); | ||
| 166 | return err; | ||
| 167 | } | 156 | } |
| 168 | 157 | ||
| 169 | void free_nsproxy(struct nsproxy *ns) | 158 | void free_nsproxy(struct nsproxy *ns) |
| @@ -174,8 +163,8 @@ void free_nsproxy(struct nsproxy *ns) | |||
| 174 | put_uts_ns(ns->uts_ns); | 163 | put_uts_ns(ns->uts_ns); |
| 175 | if (ns->ipc_ns) | 164 | if (ns->ipc_ns) |
| 176 | put_ipc_ns(ns->ipc_ns); | 165 | put_ipc_ns(ns->ipc_ns); |
| 177 | if (ns->pid_ns) | 166 | if (ns->pid_ns_for_children) |
| 178 | put_pid_ns(ns->pid_ns); | 167 | put_pid_ns(ns->pid_ns_for_children); |
| 179 | put_net(ns->net_ns); | 168 | put_net(ns->net_ns); |
| 180 | kmem_cache_free(nsproxy_cachep, ns); | 169 | kmem_cache_free(nsproxy_cachep, ns); |
| 181 | } | 170 | } |
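
Besides the pid_ns to pid_ns_for_children rename, copy_namespaces() now takes its reference and returns in the common "no new namespaces" case before any state needs undoing, and the CLONE_NEWIPC/CLONE_SYSVSEM conflict is caught with a single mask comparison. That mask form is true only when both bits are present, which is exactly the old two-test check; a small hedged helper spelling out the equivalence:

#include <linux/types.h>

/* (flags & (a | b)) == (a | b)  is equivalent to  (flags & a) && (flags & b) */
static inline bool demo_both_flags_set(unsigned long flags,
				       unsigned long a, unsigned long b)
{
	return (flags & (a | b)) == (a | b);
}
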
diff --git a/kernel/padata.c b/kernel/padata.c index 072f4ee4eb89..07af2c95dcfe 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
| @@ -846,6 +846,8 @@ static int padata_cpu_callback(struct notifier_block *nfb, | |||
| 846 | switch (action) { | 846 | switch (action) { |
| 847 | case CPU_ONLINE: | 847 | case CPU_ONLINE: |
| 848 | case CPU_ONLINE_FROZEN: | 848 | case CPU_ONLINE_FROZEN: |
| 849 | case CPU_DOWN_FAILED: | ||
| 850 | case CPU_DOWN_FAILED_FROZEN: | ||
| 849 | if (!pinst_has_cpu(pinst, cpu)) | 851 | if (!pinst_has_cpu(pinst, cpu)) |
| 850 | break; | 852 | break; |
| 851 | mutex_lock(&pinst->lock); | 853 | mutex_lock(&pinst->lock); |
| @@ -857,6 +859,8 @@ static int padata_cpu_callback(struct notifier_block *nfb, | |||
| 857 | 859 | ||
| 858 | case CPU_DOWN_PREPARE: | 860 | case CPU_DOWN_PREPARE: |
| 859 | case CPU_DOWN_PREPARE_FROZEN: | 861 | case CPU_DOWN_PREPARE_FROZEN: |
| 862 | case CPU_UP_CANCELED: | ||
| 863 | case CPU_UP_CANCELED_FROZEN: | ||
| 860 | if (!pinst_has_cpu(pinst, cpu)) | 864 | if (!pinst_has_cpu(pinst, cpu)) |
| 861 | break; | 865 | break; |
| 862 | mutex_lock(&pinst->lock); | 866 | mutex_lock(&pinst->lock); |
| @@ -865,22 +869,6 @@ static int padata_cpu_callback(struct notifier_block *nfb, | |||
| 865 | if (err) | 869 | if (err) |
| 866 | return notifier_from_errno(err); | 870 | return notifier_from_errno(err); |
| 867 | break; | 871 | break; |
| 868 | |||
| 869 | case CPU_UP_CANCELED: | ||
| 870 | case CPU_UP_CANCELED_FROZEN: | ||
| 871 | if (!pinst_has_cpu(pinst, cpu)) | ||
| 872 | break; | ||
| 873 | mutex_lock(&pinst->lock); | ||
| 874 | __padata_remove_cpu(pinst, cpu); | ||
| 875 | mutex_unlock(&pinst->lock); | ||
| 876 | |||
| 877 | case CPU_DOWN_FAILED: | ||
| 878 | case CPU_DOWN_FAILED_FROZEN: | ||
| 879 | if (!pinst_has_cpu(pinst, cpu)) | ||
| 880 | break; | ||
| 881 | mutex_lock(&pinst->lock); | ||
| 882 | __padata_add_cpu(pinst, cpu); | ||
| 883 | mutex_unlock(&pinst->lock); | ||
| 884 | } | 872 | } |
| 885 | 873 | ||
| 886 | return NOTIFY_OK; | 874 | return NOTIFY_OK; |
| @@ -1086,18 +1074,18 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq, | |||
| 1086 | 1074 | ||
| 1087 | pinst->flags = 0; | 1075 | pinst->flags = 0; |
| 1088 | 1076 | ||
| 1089 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1090 | pinst->cpu_notifier.notifier_call = padata_cpu_callback; | ||
| 1091 | pinst->cpu_notifier.priority = 0; | ||
| 1092 | register_hotcpu_notifier(&pinst->cpu_notifier); | ||
| 1093 | #endif | ||
| 1094 | |||
| 1095 | put_online_cpus(); | 1077 | put_online_cpus(); |
| 1096 | 1078 | ||
| 1097 | BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); | 1079 | BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); |
| 1098 | kobject_init(&pinst->kobj, &padata_attr_type); | 1080 | kobject_init(&pinst->kobj, &padata_attr_type); |
| 1099 | mutex_init(&pinst->lock); | 1081 | mutex_init(&pinst->lock); |
| 1100 | 1082 | ||
| 1083 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1084 | pinst->cpu_notifier.notifier_call = padata_cpu_callback; | ||
| 1085 | pinst->cpu_notifier.priority = 0; | ||
| 1086 | register_hotcpu_notifier(&pinst->cpu_notifier); | ||
| 1087 | #endif | ||
| 1088 | |||
| 1101 | return pinst; | 1089 | return pinst; |
| 1102 | 1090 | ||
| 1103 | err_free_masks: | 1091 | err_free_masks: |
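
The hotplug callback now treats a failed offline (CPU_DOWN_FAILED*) like a fresh online and a failed online (CPU_UP_CANCELED*) like an offline by stacking the case labels, which also removes the old CPU_UP_CANCELED block whose missing break fell through into the DOWN_FAILED handler. A compact, hedged sketch of the grouped-label shape; demo_add_cpu() and demo_del_cpu() are placeholders, not kernel helpers:

#include <linux/cpu.h>
#include <linux/notifier.h>

static void demo_add_cpu(unsigned int cpu) { /* rebuild per-CPU state */ }
static void demo_del_cpu(unsigned int cpu) { /* tear down per-CPU state */ }

static int demo_cpu_callback(struct notifier_block *nb,
			     unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
	case CPU_DOWN_FAILED:		/* offline failed: the CPU is back */
	case CPU_DOWN_FAILED_FROZEN:
		demo_add_cpu(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
	case CPU_UP_CANCELED:		/* online failed: treat it as gone */
	case CPU_UP_CANCELED_FROZEN:
		demo_del_cpu(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block demo_cpu_nb = {
	.notifier_call = demo_cpu_callback,
};
/* register_hotcpu_notifier(&demo_cpu_nb) would be called from the init path */
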
diff --git a/kernel/panic.c b/kernel/panic.c index 801864600514..b6c482ccc5db 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -123,10 +123,14 @@ void panic(const char *fmt, ...) | |||
| 123 | */ | 123 | */ |
| 124 | smp_send_stop(); | 124 | smp_send_stop(); |
| 125 | 125 | ||
| 126 | kmsg_dump(KMSG_DUMP_PANIC); | 126 | /* |
| 127 | 127 | * Run any panic handlers, including those that might need to | |
| 128 | * add information to the kmsg dump output. | ||
| 129 | */ | ||
| 128 | atomic_notifier_call_chain(&panic_notifier_list, 0, buf); | 130 | atomic_notifier_call_chain(&panic_notifier_list, 0, buf); |
| 129 | 131 | ||
| 132 | kmsg_dump(KMSG_DUMP_PANIC); | ||
| 133 | |||
| 130 | bust_spinlocks(0); | 134 | bust_spinlocks(0); |
| 131 | 135 | ||
| 132 | if (!panic_blink) | 136 | if (!panic_blink) |
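
Calling the panic notifier chain before kmsg_dump() means anything a handler prints still lands in the dumped log. The chain is the usual hook for last-gasp work at panic time; a minimal, hedged example of attaching to it (the handler and init function names are invented):

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/notifier.h>

static int demo_panic_handler(struct notifier_block *nb,
			      unsigned long action, void *data)
{
	/* 'data' is the formatted panic message buffer */
	pr_emerg("demo: panicking: %s\n", (char *)data);
	return NOTIFY_DONE;
}

static struct notifier_block demo_panic_nb = {
	.notifier_call = demo_panic_handler,
};

static int __init demo_panic_init(void)
{
	atomic_notifier_chain_register(&panic_notifier_list, &demo_panic_nb);
	return 0;
}
late_initcall(demo_panic_init);
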
diff --git a/kernel/params.c b/kernel/params.c index 440e65d1a544..c00d5b502aa4 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
| @@ -103,8 +103,8 @@ static int parse_one(char *param, | |||
| 103 | || params[i].level > max_level) | 103 | || params[i].level > max_level) |
| 104 | return 0; | 104 | return 0; |
| 105 | /* No one handled NULL, so do it here. */ | 105 | /* No one handled NULL, so do it here. */ |
| 106 | if (!val && params[i].ops->set != param_set_bool | 106 | if (!val && |
| 107 | && params[i].ops->set != param_set_bint) | 107 | !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG)) |
| 108 | return -EINVAL; | 108 | return -EINVAL; |
| 109 | pr_debug("handling %s with %p\n", param, | 109 | pr_debug("handling %s with %p\n", param, |
| 110 | params[i].ops->set); | 110 | params[i].ops->set); |
| @@ -241,7 +241,8 @@ int parse_args(const char *doing, | |||
| 241 | } \ | 241 | } \ |
| 242 | int param_get_##name(char *buffer, const struct kernel_param *kp) \ | 242 | int param_get_##name(char *buffer, const struct kernel_param *kp) \ |
| 243 | { \ | 243 | { \ |
| 244 | return sprintf(buffer, format, *((type *)kp->arg)); \ | 244 | return scnprintf(buffer, PAGE_SIZE, format, \ |
| 245 | *((type *)kp->arg)); \ | ||
| 245 | } \ | 246 | } \ |
| 246 | struct kernel_param_ops param_ops_##name = { \ | 247 | struct kernel_param_ops param_ops_##name = { \ |
| 247 | .set = param_set_##name, \ | 248 | .set = param_set_##name, \ |
| @@ -252,13 +253,13 @@ int parse_args(const char *doing, | |||
| 252 | EXPORT_SYMBOL(param_ops_##name) | 253 | EXPORT_SYMBOL(param_ops_##name) |
| 253 | 254 | ||
| 254 | 255 | ||
| 255 | STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); | 256 | STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul); |
| 256 | STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); | 257 | STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol); |
| 257 | STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul); | 258 | STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul); |
| 258 | STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol); | 259 | STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol); |
| 259 | STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); | 260 | STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul); |
| 260 | STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); | 261 | STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol); |
| 261 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); | 262 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul); |
| 262 | 263 | ||
| 263 | int param_set_charp(const char *val, const struct kernel_param *kp) | 264 | int param_set_charp(const char *val, const struct kernel_param *kp) |
| 264 | { | 265 | { |
| @@ -285,7 +286,7 @@ EXPORT_SYMBOL(param_set_charp); | |||
| 285 | 286 | ||
| 286 | int param_get_charp(char *buffer, const struct kernel_param *kp) | 287 | int param_get_charp(char *buffer, const struct kernel_param *kp) |
| 287 | { | 288 | { |
| 288 | return sprintf(buffer, "%s", *((char **)kp->arg)); | 289 | return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg)); |
| 289 | } | 290 | } |
| 290 | EXPORT_SYMBOL(param_get_charp); | 291 | EXPORT_SYMBOL(param_get_charp); |
| 291 | 292 | ||
| @@ -320,6 +321,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp) | |||
| 320 | EXPORT_SYMBOL(param_get_bool); | 321 | EXPORT_SYMBOL(param_get_bool); |
| 321 | 322 | ||
| 322 | struct kernel_param_ops param_ops_bool = { | 323 | struct kernel_param_ops param_ops_bool = { |
| 324 | .flags = KERNEL_PARAM_FL_NOARG, | ||
| 323 | .set = param_set_bool, | 325 | .set = param_set_bool, |
| 324 | .get = param_get_bool, | 326 | .get = param_get_bool, |
| 325 | }; | 327 | }; |
| @@ -370,6 +372,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp) | |||
| 370 | EXPORT_SYMBOL(param_set_bint); | 372 | EXPORT_SYMBOL(param_set_bint); |
| 371 | 373 | ||
| 372 | struct kernel_param_ops param_ops_bint = { | 374 | struct kernel_param_ops param_ops_bint = { |
| 375 | .flags = KERNEL_PARAM_FL_NOARG, | ||
| 373 | .set = param_set_bint, | 376 | .set = param_set_bint, |
| 374 | .get = param_get_int, | 377 | .get = param_get_int, |
| 375 | }; | 378 | }; |
| @@ -827,7 +830,7 @@ ssize_t __modver_version_show(struct module_attribute *mattr, | |||
| 827 | struct module_version_attribute *vattr = | 830 | struct module_version_attribute *vattr = |
| 828 | container_of(mattr, struct module_version_attribute, mattr); | 831 | container_of(mattr, struct module_version_attribute, mattr); |
| 829 | 832 | ||
| 830 | return sprintf(buf, "%s\n", vattr->version); | 833 | return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version); |
| 831 | } | 834 | } |
| 832 | 835 | ||
| 833 | extern const struct module_version_attribute *__start___modver[]; | 836 | extern const struct module_version_attribute *__start___modver[]; |
| @@ -912,7 +915,14 @@ static const struct kset_uevent_ops module_uevent_ops = { | |||
| 912 | struct kset *module_kset; | 915 | struct kset *module_kset; |
| 913 | int module_sysfs_initialized; | 916 | int module_sysfs_initialized; |
| 914 | 917 | ||
| 918 | static void module_kobj_release(struct kobject *kobj) | ||
| 919 | { | ||
| 920 | struct module_kobject *mk = to_module_kobject(kobj); | ||
| 921 | complete(mk->kobj_completion); | ||
| 922 | } | ||
| 923 | |||
| 915 | struct kobj_type module_ktype = { | 924 | struct kobj_type module_ktype = { |
| 925 | .release = module_kobj_release, | ||
| 916 | .sysfs_ops = &module_sysfs_ops, | 926 | .sysfs_ops = &module_sysfs_ops, |
| 917 | }; | 927 | }; |
| 918 | 928 | ||
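
Three themes run through the params.c hunks: "get" handlers are bounded to PAGE_SIZE with scnprintf() instead of trusting sprintf(), the deprecated strict_strtoul()/strict_strtol() parsers give way to kstrto*(), and ops that accept a bare flag (bool/bint) advertise it with KERNEL_PARAM_FL_NOARG rather than being special-cased in parse_one(). A hedged sketch of a custom integer parameter following the bounded-print and kstrto* conventions (demo_threshold and the ops name are invented):

#include <linux/kernel.h>
#include <linux/moduleparam.h>

static unsigned int demo_threshold = 16;

static int demo_set(const char *val, const struct kernel_param *kp)
{
	unsigned int v;
	int err = kstrtouint(val, 0, &v);	/* replaces strict_strtoul() */

	if (err)
		return err;
	*(unsigned int *)kp->arg = v;
	return 0;
}

static int demo_get(char *buffer, const struct kernel_param *kp)
{
	/* sysfs hands us a PAGE_SIZE buffer; never overrun it */
	return scnprintf(buffer, PAGE_SIZE, "%u", *(unsigned int *)kp->arg);
}

static struct kernel_param_ops demo_ops = {
	.set = demo_set,
	.get = demo_get,
};
module_param_cb(demo_threshold, &demo_ops, &demo_threshold, 0644);
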
diff --git a/kernel/pid.c b/kernel/pid.c index 66505c1dfc51..9b9a26698144 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -265,6 +265,7 @@ void free_pid(struct pid *pid) | |||
| 265 | struct pid_namespace *ns = upid->ns; | 265 | struct pid_namespace *ns = upid->ns; |
| 266 | hlist_del_rcu(&upid->pid_chain); | 266 | hlist_del_rcu(&upid->pid_chain); |
| 267 | switch(--ns->nr_hashed) { | 267 | switch(--ns->nr_hashed) { |
| 268 | case 2: | ||
| 268 | case 1: | 269 | case 1: |
| 269 | /* When all that is left in the pid namespace | 270 | /* When all that is left in the pid namespace |
| 270 | * is the reaper wake up the reaper. The reaper | 271 | * is the reaper wake up the reaper. The reaper |
| @@ -272,6 +273,11 @@ void free_pid(struct pid *pid) | |||
| 272 | */ | 273 | */ |
| 273 | wake_up_process(ns->child_reaper); | 274 | wake_up_process(ns->child_reaper); |
| 274 | break; | 275 | break; |
| 276 | case PIDNS_HASH_ADDING: | ||
| 277 | /* Handle a fork failure of the first process */ | ||
| 278 | WARN_ON(ns->child_reaper); | ||
| 279 | ns->nr_hashed = 0; | ||
| 280 | /* fall through */ | ||
| 275 | case 0: | 281 | case 0: |
| 276 | schedule_work(&ns->proc_work); | 282 | schedule_work(&ns->proc_work); |
| 277 | break; | 283 | break; |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6917e8edb48e..42086551a24a 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
| @@ -329,7 +329,7 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) | |||
| 329 | struct pid_namespace *ancestor, *new = ns; | 329 | struct pid_namespace *ancestor, *new = ns; |
| 330 | 330 | ||
| 331 | if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || | 331 | if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || |
| 332 | !nsown_capable(CAP_SYS_ADMIN)) | 332 | !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) |
| 333 | return -EPERM; | 333 | return -EPERM; |
| 334 | 334 | ||
| 335 | /* | 335 | /* |
| @@ -349,8 +349,8 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) | |||
| 349 | if (ancestor != active) | 349 | if (ancestor != active) |
| 350 | return -EINVAL; | 350 | return -EINVAL; |
| 351 | 351 | ||
| 352 | put_pid_ns(nsproxy->pid_ns); | 352 | put_pid_ns(nsproxy->pid_ns_for_children); |
| 353 | nsproxy->pid_ns = get_pid_ns(new); | 353 | nsproxy->pid_ns_for_children = get_pid_ns(new); |
| 354 | return 0; | 354 | return 0; |
| 355 | } | 355 | } |
| 356 | 356 | ||
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b26f5f1e773e..0121dab83f43 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
| @@ -39,7 +39,7 @@ static int resume_delay; | |||
| 39 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; | 39 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; |
| 40 | dev_t swsusp_resume_device; | 40 | dev_t swsusp_resume_device; |
| 41 | sector_t swsusp_resume_block; | 41 | sector_t swsusp_resume_block; |
| 42 | int in_suspend __nosavedata; | 42 | __visible int in_suspend __nosavedata; |
| 43 | 43 | ||
| 44 | enum { | 44 | enum { |
| 45 | HIBERNATION_INVALID, | 45 | HIBERNATION_INVALID, |
| @@ -644,22 +644,23 @@ int hibernate(void) | |||
| 644 | if (error) | 644 | if (error) |
| 645 | goto Exit; | 645 | goto Exit; |
| 646 | 646 | ||
| 647 | /* Allocate memory management structures */ | ||
| 648 | error = create_basic_memory_bitmaps(); | ||
| 649 | if (error) | ||
| 650 | goto Exit; | ||
| 651 | |||
| 652 | printk(KERN_INFO "PM: Syncing filesystems ... "); | 647 | printk(KERN_INFO "PM: Syncing filesystems ... "); |
| 653 | sys_sync(); | 648 | sys_sync(); |
| 654 | printk("done.\n"); | 649 | printk("done.\n"); |
| 655 | 650 | ||
| 656 | error = freeze_processes(); | 651 | error = freeze_processes(); |
| 657 | if (error) | 652 | if (error) |
| 658 | goto Free_bitmaps; | 653 | goto Exit; |
| 654 | |||
| 655 | lock_device_hotplug(); | ||
| 656 | /* Allocate memory management structures */ | ||
| 657 | error = create_basic_memory_bitmaps(); | ||
| 658 | if (error) | ||
| 659 | goto Thaw; | ||
| 659 | 660 | ||
| 660 | error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); | 661 | error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); |
| 661 | if (error || freezer_test_done) | 662 | if (error || freezer_test_done) |
| 662 | goto Thaw; | 663 | goto Free_bitmaps; |
| 663 | 664 | ||
| 664 | if (in_suspend) { | 665 | if (in_suspend) { |
| 665 | unsigned int flags = 0; | 666 | unsigned int flags = 0; |
| @@ -682,14 +683,14 @@ int hibernate(void) | |||
| 682 | pr_debug("PM: Image restored successfully.\n"); | 683 | pr_debug("PM: Image restored successfully.\n"); |
| 683 | } | 684 | } |
| 684 | 685 | ||
| 686 | Free_bitmaps: | ||
| 687 | free_basic_memory_bitmaps(); | ||
| 685 | Thaw: | 688 | Thaw: |
| 689 | unlock_device_hotplug(); | ||
| 686 | thaw_processes(); | 690 | thaw_processes(); |
| 687 | 691 | ||
| 688 | /* Don't bother checking whether freezer_test_done is true */ | 692 | /* Don't bother checking whether freezer_test_done is true */ |
| 689 | freezer_test_done = false; | 693 | freezer_test_done = false; |
| 690 | |||
| 691 | Free_bitmaps: | ||
| 692 | free_basic_memory_bitmaps(); | ||
| 693 | Exit: | 694 | Exit: |
| 694 | pm_notifier_call_chain(PM_POST_HIBERNATION); | 695 | pm_notifier_call_chain(PM_POST_HIBERNATION); |
| 695 | pm_restore_console(); | 696 | pm_restore_console(); |
| @@ -806,21 +807,20 @@ static int software_resume(void) | |||
| 806 | pm_prepare_console(); | 807 | pm_prepare_console(); |
| 807 | error = pm_notifier_call_chain(PM_RESTORE_PREPARE); | 808 | error = pm_notifier_call_chain(PM_RESTORE_PREPARE); |
| 808 | if (error) | 809 | if (error) |
| 809 | goto close_finish; | 810 | goto Close_Finish; |
| 810 | |||
| 811 | error = create_basic_memory_bitmaps(); | ||
| 812 | if (error) | ||
| 813 | goto close_finish; | ||
| 814 | 811 | ||
| 815 | pr_debug("PM: Preparing processes for restore.\n"); | 812 | pr_debug("PM: Preparing processes for restore.\n"); |
| 816 | error = freeze_processes(); | 813 | error = freeze_processes(); |
| 817 | if (error) { | 814 | if (error) |
| 818 | swsusp_close(FMODE_READ); | 815 | goto Close_Finish; |
| 819 | goto Done; | ||
| 820 | } | ||
| 821 | 816 | ||
| 822 | pr_debug("PM: Loading hibernation image.\n"); | 817 | pr_debug("PM: Loading hibernation image.\n"); |
| 823 | 818 | ||
| 819 | lock_device_hotplug(); | ||
| 820 | error = create_basic_memory_bitmaps(); | ||
| 821 | if (error) | ||
| 822 | goto Thaw; | ||
| 823 | |||
| 824 | error = swsusp_read(&flags); | 824 | error = swsusp_read(&flags); |
| 825 | swsusp_close(FMODE_READ); | 825 | swsusp_close(FMODE_READ); |
| 826 | if (!error) | 826 | if (!error) |
| @@ -828,9 +828,10 @@ static int software_resume(void) | |||
| 828 | 828 | ||
| 829 | printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); | 829 | printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); |
| 830 | swsusp_free(); | 830 | swsusp_free(); |
| 831 | thaw_processes(); | ||
| 832 | Done: | ||
| 833 | free_basic_memory_bitmaps(); | 831 | free_basic_memory_bitmaps(); |
| 832 | Thaw: | ||
| 833 | unlock_device_hotplug(); | ||
| 834 | thaw_processes(); | ||
| 834 | Finish: | 835 | Finish: |
| 835 | pm_notifier_call_chain(PM_POST_RESTORE); | 836 | pm_notifier_call_chain(PM_POST_RESTORE); |
| 836 | pm_restore_console(); | 837 | pm_restore_console(); |
| @@ -840,12 +841,12 @@ static int software_resume(void) | |||
| 840 | mutex_unlock(&pm_mutex); | 841 | mutex_unlock(&pm_mutex); |
| 841 | pr_debug("PM: Hibernation image not present or could not be loaded.\n"); | 842 | pr_debug("PM: Hibernation image not present or could not be loaded.\n"); |
| 842 | return error; | 843 | return error; |
| 843 | close_finish: | 844 | Close_Finish: |
| 844 | swsusp_close(FMODE_READ); | 845 | swsusp_close(FMODE_READ); |
| 845 | goto Finish; | 846 | goto Finish; |
| 846 | } | 847 | } |
| 847 | 848 | ||
| 848 | late_initcall(software_resume); | 849 | late_initcall_sync(software_resume); |
| 849 | 850 | ||
| 850 | 851 | ||
| 851 | static const char * const hibernation_modes[] = { | 852 | static const char * const hibernation_modes[] = { |
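
The reordered hibernate() and software_resume() paths freeze processes first, then take the device hotplug lock and allocate the memory bitmaps, and unwind in the reverse order on error, with the labels renamed to match. A hedged sketch of that shape — the step_*() helpers are placeholders standing in for the real calls named in the comments:

#include <linux/device.h>	/* lock_device_hotplug() / unlock_device_hotplug() */

int step_freeze(void);		/* stands in for freeze_processes() */
int step_make_bitmaps(void);	/* stands in for create_basic_memory_bitmaps() */
int step_snapshot(void);	/* stands in for hibernation_snapshot() */
void step_free_bitmaps(void);	/* stands in for free_basic_memory_bitmaps() */
void step_thaw(void);		/* stands in for thaw_processes() */

static int demo_hibernate_core(void)
{
	int error;

	error = step_freeze();
	if (error)
		return error;

	lock_device_hotplug();
	error = step_make_bitmaps();
	if (error)
		goto thaw;

	error = step_snapshot();

	step_free_bitmaps();		/* undo in reverse order ... */
thaw:
	unlock_device_hotplug();
	step_thaw();			/* ... ending with the first step */
	return error;
}
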
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 06fe28589e9c..a394297f8b2f 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
| @@ -296,6 +296,17 @@ int pm_qos_request_active(struct pm_qos_request *req) | |||
| 296 | } | 296 | } |
| 297 | EXPORT_SYMBOL_GPL(pm_qos_request_active); | 297 | EXPORT_SYMBOL_GPL(pm_qos_request_active); |
| 298 | 298 | ||
| 299 | static void __pm_qos_update_request(struct pm_qos_request *req, | ||
| 300 | s32 new_value) | ||
| 301 | { | ||
| 302 | trace_pm_qos_update_request(req->pm_qos_class, new_value); | ||
| 303 | |||
| 304 | if (new_value != req->node.prio) | ||
| 305 | pm_qos_update_target( | ||
| 306 | pm_qos_array[req->pm_qos_class]->constraints, | ||
| 307 | &req->node, PM_QOS_UPDATE_REQ, new_value); | ||
| 308 | } | ||
| 309 | |||
| 299 | /** | 310 | /** |
| 300 | * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout | 311 | * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout |
| 301 | * @work: work struct for the delayed work (timeout) | 312 | * @work: work struct for the delayed work (timeout) |
| @@ -308,7 +319,7 @@ static void pm_qos_work_fn(struct work_struct *work) | |||
| 308 | struct pm_qos_request, | 319 | struct pm_qos_request, |
| 309 | work); | 320 | work); |
| 310 | 321 | ||
| 311 | pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); | 322 | __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); |
| 312 | } | 323 | } |
| 313 | 324 | ||
| 314 | /** | 325 | /** |
| @@ -364,12 +375,7 @@ void pm_qos_update_request(struct pm_qos_request *req, | |||
| 364 | } | 375 | } |
| 365 | 376 | ||
| 366 | cancel_delayed_work_sync(&req->work); | 377 | cancel_delayed_work_sync(&req->work); |
| 367 | 378 | __pm_qos_update_request(req, new_value); | |
| 368 | trace_pm_qos_update_request(req->pm_qos_class, new_value); | ||
| 369 | if (new_value != req->node.prio) | ||
| 370 | pm_qos_update_target( | ||
| 371 | pm_qos_array[req->pm_qos_class]->constraints, | ||
| 372 | &req->node, PM_QOS_UPDATE_REQ, new_value); | ||
| 373 | } | 379 | } |
| 374 | EXPORT_SYMBOL_GPL(pm_qos_update_request); | 380 | EXPORT_SYMBOL_GPL(pm_qos_update_request); |
| 375 | 381 | ||
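
pm_qos_update_request() starts with cancel_delayed_work_sync(&req->work), and the timeout handler pm_qos_work_fn() runs from that very work item, so calling the full update function there would wait on itself. Factoring out __pm_qos_update_request(), which only updates the constraint, lets the timeout path skip the cancel. For context, a hedged sketch of the consumer-side API these functions back, with illustrative values:

#include <linux/pm_qos.h>

static struct pm_qos_request demo_req;

static void demo_latency_start(void)
{
	/* cap CPU wakeup latency at 20 usec while we stream data */
	pm_qos_add_request(&demo_req, PM_QOS_CPU_DMA_LATENCY, 20);
}

static void demo_latency_renew(void)
{
	/* keep the 20 usec cap, but let it expire after 500 ms;
	 * the expiry is what ends up in pm_qos_work_fn() above */
	pm_qos_update_request_timeout(&demo_req, 20, 500000);
}

static void demo_latency_stop(void)
{
	pm_qos_remove_request(&demo_req);
}
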
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 349587bb03e1..98c3b34a4cff 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -352,7 +352,7 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) | |||
| 352 | struct mem_extent *ext, *cur, *aux; | 352 | struct mem_extent *ext, *cur, *aux; |
| 353 | 353 | ||
| 354 | zone_start = zone->zone_start_pfn; | 354 | zone_start = zone->zone_start_pfn; |
| 355 | zone_end = zone->zone_start_pfn + zone->spanned_pages; | 355 | zone_end = zone_end_pfn(zone); |
| 356 | 356 | ||
| 357 | list_for_each_entry(ext, list, hook) | 357 | list_for_each_entry(ext, list, hook) |
| 358 | if (zone_start <= ext->end) | 358 | if (zone_start <= ext->end) |
| @@ -743,7 +743,10 @@ int create_basic_memory_bitmaps(void) | |||
| 743 | struct memory_bitmap *bm1, *bm2; | 743 | struct memory_bitmap *bm1, *bm2; |
| 744 | int error = 0; | 744 | int error = 0; |
| 745 | 745 | ||
| 746 | BUG_ON(forbidden_pages_map || free_pages_map); | 746 | if (forbidden_pages_map && free_pages_map) |
| 747 | return 0; | ||
| 748 | else | ||
| 749 | BUG_ON(forbidden_pages_map || free_pages_map); | ||
| 747 | 750 | ||
| 748 | bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); | 751 | bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); |
| 749 | if (!bm1) | 752 | if (!bm1) |
| @@ -884,7 +887,7 @@ static unsigned int count_highmem_pages(void) | |||
| 884 | continue; | 887 | continue; |
| 885 | 888 | ||
| 886 | mark_free_pages(zone); | 889 | mark_free_pages(zone); |
| 887 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 890 | max_zone_pfn = zone_end_pfn(zone); |
| 888 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 891 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
| 889 | if (saveable_highmem_page(zone, pfn)) | 892 | if (saveable_highmem_page(zone, pfn)) |
| 890 | n++; | 893 | n++; |
| @@ -948,7 +951,7 @@ static unsigned int count_data_pages(void) | |||
| 948 | continue; | 951 | continue; |
| 949 | 952 | ||
| 950 | mark_free_pages(zone); | 953 | mark_free_pages(zone); |
| 951 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 954 | max_zone_pfn = zone_end_pfn(zone); |
| 952 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 955 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
| 953 | if (saveable_page(zone, pfn)) | 956 | if (saveable_page(zone, pfn)) |
| 954 | n++; | 957 | n++; |
| @@ -1041,7 +1044,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) | |||
| 1041 | unsigned long max_zone_pfn; | 1044 | unsigned long max_zone_pfn; |
| 1042 | 1045 | ||
| 1043 | mark_free_pages(zone); | 1046 | mark_free_pages(zone); |
| 1044 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 1047 | max_zone_pfn = zone_end_pfn(zone); |
| 1045 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 1048 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
| 1046 | if (page_is_saveable(zone, pfn)) | 1049 | if (page_is_saveable(zone, pfn)) |
| 1047 | memory_bm_set_bit(orig_bm, pfn); | 1050 | memory_bm_set_bit(orig_bm, pfn); |
| @@ -1093,7 +1096,7 @@ void swsusp_free(void) | |||
| 1093 | unsigned long pfn, max_zone_pfn; | 1096 | unsigned long pfn, max_zone_pfn; |
| 1094 | 1097 | ||
| 1095 | for_each_populated_zone(zone) { | 1098 | for_each_populated_zone(zone) { |
| 1096 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 1099 | max_zone_pfn = zone_end_pfn(zone); |
| 1097 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 1100 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
| 1098 | if (pfn_valid(pfn)) { | 1101 | if (pfn_valid(pfn)) { |
| 1099 | struct page *page = pfn_to_page(pfn); | 1102 | struct page *page = pfn_to_page(pfn); |
| @@ -1755,7 +1758,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) | |||
| 1755 | 1758 | ||
| 1756 | /* Clear page flags */ | 1759 | /* Clear page flags */ |
| 1757 | for_each_populated_zone(zone) { | 1760 | for_each_populated_zone(zone) { |
| 1758 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 1761 | max_zone_pfn = zone_end_pfn(zone); |
| 1759 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 1762 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
| 1760 | if (pfn_valid(pfn)) | 1763 | if (pfn_valid(pfn)) |
| 1761 | swsusp_unset_page_free(pfn_to_page(pfn)); | 1764 | swsusp_unset_page_free(pfn_to_page(pfn)); |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ece04223bb1e..62ee437b5c7e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -210,6 +210,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
| 210 | goto Platform_wake; | 210 | goto Platform_wake; |
| 211 | } | 211 | } |
| 212 | 212 | ||
| 213 | ftrace_stop(); | ||
| 213 | error = disable_nonboot_cpus(); | 214 | error = disable_nonboot_cpus(); |
| 214 | if (error || suspend_test(TEST_CPUS)) | 215 | if (error || suspend_test(TEST_CPUS)) |
| 215 | goto Enable_cpus; | 216 | goto Enable_cpus; |
| @@ -232,6 +233,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
| 232 | 233 | ||
| 233 | Enable_cpus: | 234 | Enable_cpus: |
| 234 | enable_nonboot_cpus(); | 235 | enable_nonboot_cpus(); |
| 236 | ftrace_start(); | ||
| 235 | 237 | ||
| 236 | Platform_wake: | 238 | Platform_wake: |
| 237 | if (need_suspend_ops(state) && suspend_ops->wake) | 239 | if (need_suspend_ops(state) && suspend_ops->wake) |
| @@ -265,7 +267,6 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
| 265 | goto Close; | 267 | goto Close; |
| 266 | } | 268 | } |
| 267 | suspend_console(); | 269 | suspend_console(); |
| 268 | ftrace_stop(); | ||
| 269 | suspend_test_start(); | 270 | suspend_test_start(); |
| 270 | error = dpm_suspend_start(PMSG_SUSPEND); | 271 | error = dpm_suspend_start(PMSG_SUSPEND); |
| 271 | if (error) { | 272 | if (error) { |
| @@ -285,7 +286,6 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
| 285 | suspend_test_start(); | 286 | suspend_test_start(); |
| 286 | dpm_resume_end(PMSG_RESUME); | 287 | dpm_resume_end(PMSG_RESUME); |
| 287 | suspend_test_finish("resume devices"); | 288 | suspend_test_finish("resume devices"); |
| 288 | ftrace_start(); | ||
| 289 | resume_console(); | 289 | resume_console(); |
| 290 | Close: | 290 | Close: |
| 291 | if (need_suspend_ops(state) && suspend_ops->end) | 291 | if (need_suspend_ops(state) && suspend_ops->end) |
diff --git a/kernel/power/user.c b/kernel/power/user.c index 4ed81e74f86f..957f06164ad1 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
| @@ -39,6 +39,7 @@ static struct snapshot_data { | |||
| 39 | char frozen; | 39 | char frozen; |
| 40 | char ready; | 40 | char ready; |
| 41 | char platform_support; | 41 | char platform_support; |
| 42 | bool free_bitmaps; | ||
| 42 | } snapshot_state; | 43 | } snapshot_state; |
| 43 | 44 | ||
| 44 | atomic_t snapshot_device_available = ATOMIC_INIT(1); | 45 | atomic_t snapshot_device_available = ATOMIC_INIT(1); |
| @@ -60,11 +61,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
| 60 | error = -ENOSYS; | 61 | error = -ENOSYS; |
| 61 | goto Unlock; | 62 | goto Unlock; |
| 62 | } | 63 | } |
| 63 | if(create_basic_memory_bitmaps()) { | ||
| 64 | atomic_inc(&snapshot_device_available); | ||
| 65 | error = -ENOMEM; | ||
| 66 | goto Unlock; | ||
| 67 | } | ||
| 68 | nonseekable_open(inode, filp); | 64 | nonseekable_open(inode, filp); |
| 69 | data = &snapshot_state; | 65 | data = &snapshot_state; |
| 70 | filp->private_data = data; | 66 | filp->private_data = data; |
| @@ -87,13 +83,16 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
| 87 | data->swap = -1; | 83 | data->swap = -1; |
| 88 | data->mode = O_WRONLY; | 84 | data->mode = O_WRONLY; |
| 89 | error = pm_notifier_call_chain(PM_RESTORE_PREPARE); | 85 | error = pm_notifier_call_chain(PM_RESTORE_PREPARE); |
| 86 | if (!error) { | ||
| 87 | error = create_basic_memory_bitmaps(); | ||
| 88 | data->free_bitmaps = !error; | ||
| 89 | } | ||
| 90 | if (error) | 90 | if (error) |
| 91 | pm_notifier_call_chain(PM_POST_RESTORE); | 91 | pm_notifier_call_chain(PM_POST_RESTORE); |
| 92 | } | 92 | } |
| 93 | if (error) { | 93 | if (error) |
| 94 | free_basic_memory_bitmaps(); | ||
| 95 | atomic_inc(&snapshot_device_available); | 94 | atomic_inc(&snapshot_device_available); |
| 96 | } | 95 | |
| 97 | data->frozen = 0; | 96 | data->frozen = 0; |
| 98 | data->ready = 0; | 97 | data->ready = 0; |
| 99 | data->platform_support = 0; | 98 | data->platform_support = 0; |
| @@ -111,12 +110,14 @@ static int snapshot_release(struct inode *inode, struct file *filp) | |||
| 111 | lock_system_sleep(); | 110 | lock_system_sleep(); |
| 112 | 111 | ||
| 113 | swsusp_free(); | 112 | swsusp_free(); |
| 114 | free_basic_memory_bitmaps(); | ||
| 115 | data = filp->private_data; | 113 | data = filp->private_data; |
| 116 | free_all_swap_pages(data->swap); | 114 | free_all_swap_pages(data->swap); |
| 117 | if (data->frozen) { | 115 | if (data->frozen) { |
| 118 | pm_restore_gfp_mask(); | 116 | pm_restore_gfp_mask(); |
| 117 | free_basic_memory_bitmaps(); | ||
| 119 | thaw_processes(); | 118 | thaw_processes(); |
| 119 | } else if (data->free_bitmaps) { | ||
| 120 | free_basic_memory_bitmaps(); | ||
| 120 | } | 121 | } |
| 121 | pm_notifier_call_chain(data->mode == O_RDONLY ? | 122 | pm_notifier_call_chain(data->mode == O_RDONLY ? |
| 122 | PM_POST_HIBERNATION : PM_POST_RESTORE); | 123 | PM_POST_HIBERNATION : PM_POST_RESTORE); |
| @@ -207,6 +208,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
| 207 | if (!mutex_trylock(&pm_mutex)) | 208 | if (!mutex_trylock(&pm_mutex)) |
| 208 | return -EBUSY; | 209 | return -EBUSY; |
| 209 | 210 | ||
| 211 | lock_device_hotplug(); | ||
| 210 | data = filp->private_data; | 212 | data = filp->private_data; |
| 211 | 213 | ||
| 212 | switch (cmd) { | 214 | switch (cmd) { |
| @@ -220,14 +222,23 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
| 220 | printk("done.\n"); | 222 | printk("done.\n"); |
| 221 | 223 | ||
| 222 | error = freeze_processes(); | 224 | error = freeze_processes(); |
| 223 | if (!error) | 225 | if (error) |
| 226 | break; | ||
| 227 | |||
| 228 | error = create_basic_memory_bitmaps(); | ||
| 229 | if (error) | ||
| 230 | thaw_processes(); | ||
| 231 | else | ||
| 224 | data->frozen = 1; | 232 | data->frozen = 1; |
| 233 | |||
| 225 | break; | 234 | break; |
| 226 | 235 | ||
| 227 | case SNAPSHOT_UNFREEZE: | 236 | case SNAPSHOT_UNFREEZE: |
| 228 | if (!data->frozen || data->ready) | 237 | if (!data->frozen || data->ready) |
| 229 | break; | 238 | break; |
| 230 | pm_restore_gfp_mask(); | 239 | pm_restore_gfp_mask(); |
| 240 | free_basic_memory_bitmaps(); | ||
| 241 | data->free_bitmaps = false; | ||
| 231 | thaw_processes(); | 242 | thaw_processes(); |
| 232 | data->frozen = 0; | 243 | data->frozen = 0; |
| 233 | break; | 244 | break; |
| @@ -371,6 +382,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
| 371 | 382 | ||
| 372 | } | 383 | } |
| 373 | 384 | ||
| 385 | unlock_device_hotplug(); | ||
| 374 | mutex_unlock(&pm_mutex); | 386 | mutex_unlock(&pm_mutex); |
| 375 | 387 | ||
| 376 | return error; | 388 | return error; |
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5b5a7080e2a5..b4e8500afdb3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
| @@ -2226,6 +2226,13 @@ void register_console(struct console *newcon) | |||
| 2226 | struct console *bcon = NULL; | 2226 | struct console *bcon = NULL; |
| 2227 | struct console_cmdline *c; | 2227 | struct console_cmdline *c; |
| 2228 | 2228 | ||
| 2229 | if (console_drivers) | ||
| 2230 | for_each_console(bcon) | ||
| 2231 | if (WARN(bcon == newcon, | ||
| 2232 | "console '%s%d' already registered\n", | ||
| 2233 | bcon->name, bcon->index)) | ||
| 2234 | return; | ||
| 2235 | |||
| 2229 | /* | 2236 | /* |
| 2230 | * before we register a new CON_BOOT console, make sure we don't | 2237 | * before we register a new CON_BOOT console, make sure we don't |
| 2231 | * already have a valid console | 2238 | * already have a valid console |
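
register_console() now scans the existing console list and refuses, with a warning, to register the same console twice, since re-adding it would corrupt the singly linked console_drivers chain. The hunk relies on WARN() evaluating to its condition; a small hedged sketch of the same guard for an invented list-based registry:

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/list.h>

struct demo_item {
	struct list_head node;
};

static int demo_register(struct demo_item *item, struct list_head *head)
{
	struct demo_item *it;

	list_for_each_entry(it, head, node)
		if (WARN(it == item, "demo item %p already registered\n", item))
			return -EEXIST;	/* warn and refuse the duplicate */

	list_add_tail(&item->node, head);
	return 0;
}
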
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a146ee327f6a..dd562e9aa2c8 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -236,7 +236,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) | |||
| 236 | */ | 236 | */ |
| 237 | int dumpable = 0; | 237 | int dumpable = 0; |
| 238 | /* Don't let security modules deny introspection */ | 238 | /* Don't let security modules deny introspection */ |
| 239 | if (task == current) | 239 | if (same_thread_group(task, current)) |
| 240 | return 0; | 240 | return 0; |
| 241 | rcu_read_lock(); | 241 | rcu_read_lock(); |
| 242 | tcred = __task_cred(task); | 242 | tcred = __task_cred(task); |
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile new file mode 100644 index 000000000000..01e9ec37a3e3 --- /dev/null +++ b/kernel/rcu/Makefile | |||
| @@ -0,0 +1,6 @@ | |||
| 1 | obj-y += update.o srcu.o | ||
| 2 | obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o | ||
| 3 | obj-$(CONFIG_TREE_RCU) += tree.o | ||
| 4 | obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o | ||
| 5 | obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o | ||
| 6 | obj-$(CONFIG_TINY_RCU) += tiny.o | ||
diff --git a/kernel/rcu.h b/kernel/rcu/rcu.h index 7f8e7590e3e5..7859a0a3951e 100644 --- a/kernel/rcu.h +++ b/kernel/rcu/rcu.h | |||
| @@ -67,12 +67,15 @@ | |||
| 67 | 67 | ||
| 68 | extern struct debug_obj_descr rcuhead_debug_descr; | 68 | extern struct debug_obj_descr rcuhead_debug_descr; |
| 69 | 69 | ||
| 70 | static inline void debug_rcu_head_queue(struct rcu_head *head) | 70 | static inline int debug_rcu_head_queue(struct rcu_head *head) |
| 71 | { | 71 | { |
| 72 | debug_object_activate(head, &rcuhead_debug_descr); | 72 | int r1; |
| 73 | |||
| 74 | r1 = debug_object_activate(head, &rcuhead_debug_descr); | ||
| 73 | debug_object_active_state(head, &rcuhead_debug_descr, | 75 | debug_object_active_state(head, &rcuhead_debug_descr, |
| 74 | STATE_RCU_HEAD_READY, | 76 | STATE_RCU_HEAD_READY, |
| 75 | STATE_RCU_HEAD_QUEUED); | 77 | STATE_RCU_HEAD_QUEUED); |
| 78 | return r1; | ||
| 76 | } | 79 | } |
| 77 | 80 | ||
| 78 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) | 81 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) |
| @@ -83,8 +86,9 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) | |||
| 83 | debug_object_deactivate(head, &rcuhead_debug_descr); | 86 | debug_object_deactivate(head, &rcuhead_debug_descr); |
| 84 | } | 87 | } |
| 85 | #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 88 | #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
| 86 | static inline void debug_rcu_head_queue(struct rcu_head *head) | 89 | static inline int debug_rcu_head_queue(struct rcu_head *head) |
| 87 | { | 90 | { |
| 91 | return 0; | ||
| 88 | } | 92 | } |
| 89 | 93 | ||
| 90 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) | 94 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) |
| @@ -94,7 +98,7 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) | |||
| 94 | 98 | ||
| 95 | extern void kfree(const void *); | 99 | extern void kfree(const void *); |
| 96 | 100 | ||
| 97 | static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) | 101 | static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) |
| 98 | { | 102 | { |
| 99 | unsigned long offset = (unsigned long)head->func; | 103 | unsigned long offset = (unsigned long)head->func; |
| 100 | 104 | ||
| @@ -118,4 +122,11 @@ int rcu_jiffies_till_stall_check(void); | |||
| 118 | 122 | ||
| 119 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ | 123 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ |
| 120 | 124 | ||
| 125 | /* | ||
| 126 | * Strings used in tracepoints need to be exported via the | ||
| 127 | * tracing system such that tools like perf and trace-cmd can | ||
| 128 | * translate the string address pointers to actual text. | ||
| 129 | */ | ||
| 130 | #define TPS(x) tracepoint_string(x) | ||
| 131 | |||
| 121 | #endif /* __LINUX_RCU_H */ | 132 | #endif /* __LINUX_RCU_H */ |
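The new TPS() wrapper routes constant strings handed to RCU tracepoints through tracepoint_string(), which places them in the __tracepoint_string section so that perf and trace-cmd can translate the recorded pointer back into text. A minimal usage sketch mirroring the call sites converted later in this series; the caller is hypothetical and assumes <trace/events/rcu.h> and this rcu.h are already included, as they are in tiny.c and tree.c:

static void example_report_idle_entry(long long oldval, long long newval)
{
	/*
	 * TPS() keeps the literal resolvable by userspace trace tools:
	 * the recorded address maps back to the text "Start".
	 */
	RCU_TRACE(trace_rcu_dyntick(TPS("Start"), oldval, newval));
}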
diff --git a/kernel/srcu.c b/kernel/rcu/srcu.c index 01d5ccb8bfe3..01d5ccb8bfe3 100644 --- a/kernel/srcu.c +++ b/kernel/rcu/srcu.c | |||
diff --git a/kernel/rcutiny.c b/kernel/rcu/tiny.c index aa344111de3e..0c9a934cfec1 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcu/tiny.c | |||
| @@ -35,6 +35,7 @@ | |||
| 35 | #include <linux/time.h> | 35 | #include <linux/time.h> |
| 36 | #include <linux/cpu.h> | 36 | #include <linux/cpu.h> |
| 37 | #include <linux/prefetch.h> | 37 | #include <linux/prefetch.h> |
| 38 | #include <linux/ftrace_event.h> | ||
| 38 | 39 | ||
| 39 | #ifdef CONFIG_RCU_TRACE | 40 | #ifdef CONFIG_RCU_TRACE |
| 40 | #include <trace/events/rcu.h> | 41 | #include <trace/events/rcu.h> |
| @@ -42,7 +43,7 @@ | |||
| 42 | 43 | ||
| 43 | #include "rcu.h" | 44 | #include "rcu.h" |
| 44 | 45 | ||
| 45 | /* Forward declarations for rcutiny_plugin.h. */ | 46 | /* Forward declarations for tiny_plugin.h. */ |
| 46 | struct rcu_ctrlblk; | 47 | struct rcu_ctrlblk; |
| 47 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); | 48 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); |
| 48 | static void rcu_process_callbacks(struct softirq_action *unused); | 49 | static void rcu_process_callbacks(struct softirq_action *unused); |
| @@ -52,22 +53,23 @@ static void __call_rcu(struct rcu_head *head, | |||
| 52 | 53 | ||
| 53 | static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 54 | static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
| 54 | 55 | ||
| 55 | #include "rcutiny_plugin.h" | 56 | #include "tiny_plugin.h" |
| 56 | 57 | ||
| 57 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ | 58 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ |
| 58 | static void rcu_idle_enter_common(long long newval) | 59 | static void rcu_idle_enter_common(long long newval) |
| 59 | { | 60 | { |
| 60 | if (newval) { | 61 | if (newval) { |
| 61 | RCU_TRACE(trace_rcu_dyntick("--=", | 62 | RCU_TRACE(trace_rcu_dyntick(TPS("--="), |
| 62 | rcu_dynticks_nesting, newval)); | 63 | rcu_dynticks_nesting, newval)); |
| 63 | rcu_dynticks_nesting = newval; | 64 | rcu_dynticks_nesting = newval; |
| 64 | return; | 65 | return; |
| 65 | } | 66 | } |
| 66 | RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); | 67 | RCU_TRACE(trace_rcu_dyntick(TPS("Start"), |
| 68 | rcu_dynticks_nesting, newval)); | ||
| 67 | if (!is_idle_task(current)) { | 69 | if (!is_idle_task(current)) { |
| 68 | struct task_struct *idle = idle_task(smp_processor_id()); | 70 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); |
| 69 | 71 | ||
| 70 | RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", | 72 | RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), |
| 71 | rcu_dynticks_nesting, newval)); | 73 | rcu_dynticks_nesting, newval)); |
| 72 | ftrace_dump(DUMP_ALL); | 74 | ftrace_dump(DUMP_ALL); |
| 73 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 75 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
| @@ -120,15 +122,15 @@ EXPORT_SYMBOL_GPL(rcu_irq_exit); | |||
| 120 | static void rcu_idle_exit_common(long long oldval) | 122 | static void rcu_idle_exit_common(long long oldval) |
| 121 | { | 123 | { |
| 122 | if (oldval) { | 124 | if (oldval) { |
| 123 | RCU_TRACE(trace_rcu_dyntick("++=", | 125 | RCU_TRACE(trace_rcu_dyntick(TPS("++="), |
| 124 | oldval, rcu_dynticks_nesting)); | 126 | oldval, rcu_dynticks_nesting)); |
| 125 | return; | 127 | return; |
| 126 | } | 128 | } |
| 127 | RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); | 129 | RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); |
| 128 | if (!is_idle_task(current)) { | 130 | if (!is_idle_task(current)) { |
| 129 | struct task_struct *idle = idle_task(smp_processor_id()); | 131 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); |
| 130 | 132 | ||
| 131 | RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", | 133 | RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), |
| 132 | oldval, rcu_dynticks_nesting)); | 134 | oldval, rcu_dynticks_nesting)); |
| 133 | ftrace_dump(DUMP_ALL); | 135 | ftrace_dump(DUMP_ALL); |
| 134 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 136 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
| @@ -174,18 +176,18 @@ void rcu_irq_enter(void) | |||
| 174 | } | 176 | } |
| 175 | EXPORT_SYMBOL_GPL(rcu_irq_enter); | 177 | EXPORT_SYMBOL_GPL(rcu_irq_enter); |
| 176 | 178 | ||
| 177 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 179 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) |
| 178 | 180 | ||
| 179 | /* | 181 | /* |
| 180 | * Test whether RCU thinks that the current CPU is idle. | 182 | * Test whether RCU thinks that the current CPU is idle. |
| 181 | */ | 183 | */ |
| 182 | int rcu_is_cpu_idle(void) | 184 | bool __rcu_is_watching(void) |
| 183 | { | 185 | { |
| 184 | return !rcu_dynticks_nesting; | 186 | return rcu_dynticks_nesting; |
| 185 | } | 187 | } |
| 186 | EXPORT_SYMBOL(rcu_is_cpu_idle); | 188 | EXPORT_SYMBOL(__rcu_is_watching); |
| 187 | 189 | ||
| 188 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 190 | #endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ |
| 189 | 191 | ||
| 190 | /* | 192 | /* |
| 191 | * Test whether the current CPU was interrupted from idle. Nested | 193 | * Test whether the current CPU was interrupted from idle. Nested |
| @@ -264,7 +266,7 @@ void rcu_check_callbacks(int cpu, int user) | |||
| 264 | */ | 266 | */ |
| 265 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | 267 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) |
| 266 | { | 268 | { |
| 267 | char *rn = NULL; | 269 | const char *rn = NULL; |
| 268 | struct rcu_head *next, *list; | 270 | struct rcu_head *next, *list; |
| 269 | unsigned long flags; | 271 | unsigned long flags; |
| 270 | RCU_TRACE(int cb_count = 0); | 272 | RCU_TRACE(int cb_count = 0); |
| @@ -273,7 +275,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
| 273 | if (&rcp->rcucblist == rcp->donetail) { | 275 | if (&rcp->rcucblist == rcp->donetail) { |
| 274 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); | 276 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); |
| 275 | RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, | 277 | RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, |
| 276 | ACCESS_ONCE(rcp->rcucblist), | 278 | !!ACCESS_ONCE(rcp->rcucblist), |
| 277 | need_resched(), | 279 | need_resched(), |
| 278 | is_idle_task(current), | 280 | is_idle_task(current), |
| 279 | false)); | 281 | false)); |
| @@ -304,7 +306,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
| 304 | RCU_TRACE(cb_count++); | 306 | RCU_TRACE(cb_count++); |
| 305 | } | 307 | } |
| 306 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); | 308 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); |
| 307 | RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), | 309 | RCU_TRACE(trace_rcu_batch_end(rcp->name, |
| 310 | cb_count, 0, need_resched(), | ||
| 308 | is_idle_task(current), | 311 | is_idle_task(current), |
| 309 | false)); | 312 | false)); |
| 310 | } | 313 | } |
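Note the polarity flip in tiny RCU's idle query: rcu_is_cpu_idle() returned true when rcu_dynticks_nesting was zero, while the replacement __rcu_is_watching() returns true when it is non-zero, i.e. when RCU is watching the CPU and read-side critical sections are safe. Callers therefore invert their tests; roughly, with the surrounding caller shown only for illustration:

	/* Old form: bail out while the CPU is idle from RCU's point of view. */
	if (rcu_is_cpu_idle())
		return;

	/* New form: bail out unless RCU is watching this CPU. */
	if (!__rcu_is_watching())
		return;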
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcu/tiny_plugin.h index 0cd385acccfa..280d06cae352 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h | |||
| @@ -36,7 +36,7 @@ struct rcu_ctrlblk { | |||
| 36 | RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ | 36 | RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ |
| 37 | RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ | 37 | RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ |
| 38 | RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ | 38 | RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ |
| 39 | RCU_TRACE(char *name); /* Name of RCU type. */ | 39 | RCU_TRACE(const char *name); /* Name of RCU type. */ |
| 40 | }; | 40 | }; |
| 41 | 41 | ||
| 42 | /* Definition for rcupdate control block. */ | 42 | /* Definition for rcupdate control block. */ |
diff --git a/kernel/rcutorture.c b/kernel/rcu/torture.c index f4871e52c546..3929cd451511 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcu/torture.c | |||
| @@ -52,72 +52,84 @@ | |||
| 52 | MODULE_LICENSE("GPL"); | 52 | MODULE_LICENSE("GPL"); |
| 53 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); | 53 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); |
| 54 | 54 | ||
| 55 | static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ | 55 | MODULE_ALIAS("rcutorture"); |
| 56 | static int nfakewriters = 4; /* # fake writer threads */ | 56 | #ifdef MODULE_PARAM_PREFIX |
| 57 | static int stat_interval = 60; /* Interval between stats, in seconds. */ | 57 | #undef MODULE_PARAM_PREFIX |
| 58 | /* Zero means "only at end of test". */ | 58 | #endif |
| 59 | static bool verbose; /* Print more debug info. */ | 59 | #define MODULE_PARAM_PREFIX "rcutorture." |
| 60 | static bool test_no_idle_hz = true; | ||
| 61 | /* Test RCU support for tickless idle CPUs. */ | ||
| 62 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ | ||
| 63 | static int stutter = 5; /* Start/stop testing interval (in sec) */ | ||
| 64 | static int irqreader = 1; /* RCU readers from irq (timers). */ | ||
| 65 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ | ||
| 66 | static int fqs_holdoff; /* Hold time within burst (us). */ | ||
| 67 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | ||
| 68 | static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ | ||
| 69 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ | ||
| 70 | static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ | ||
| 71 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ | ||
| 72 | static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */ | ||
| 73 | static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */ | ||
| 74 | static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ | ||
| 75 | static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ | ||
| 76 | static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ | ||
| 77 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ | ||
| 78 | 60 | ||
| 79 | module_param(nreaders, int, 0444); | 61 | static int fqs_duration; |
| 80 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | ||
| 81 | module_param(nfakewriters, int, 0444); | ||
| 82 | MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); | ||
| 83 | module_param(stat_interval, int, 0644); | ||
| 84 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | ||
| 85 | module_param(verbose, bool, 0444); | ||
| 86 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | ||
| 87 | module_param(test_no_idle_hz, bool, 0444); | ||
| 88 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | ||
| 89 | module_param(shuffle_interval, int, 0444); | ||
| 90 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | ||
| 91 | module_param(stutter, int, 0444); | ||
| 92 | MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); | ||
| 93 | module_param(irqreader, int, 0444); | ||
| 94 | MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); | ||
| 95 | module_param(fqs_duration, int, 0444); | 62 | module_param(fqs_duration, int, 0444); |
| 96 | MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); | 63 | MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); |
| 64 | static int fqs_holdoff; | ||
| 97 | module_param(fqs_holdoff, int, 0444); | 65 | module_param(fqs_holdoff, int, 0444); |
| 98 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | 66 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
| 67 | static int fqs_stutter = 3; | ||
| 99 | module_param(fqs_stutter, int, 0444); | 68 | module_param(fqs_stutter, int, 0444); |
| 100 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 69 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
| 70 | static bool gp_exp; | ||
| 71 | module_param(gp_exp, bool, 0444); | ||
| 72 | MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); | ||
| 73 | static bool gp_normal; | ||
| 74 | module_param(gp_normal, bool, 0444); | ||
| 75 | MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); | ||
| 76 | static int irqreader = 1; | ||
| 77 | module_param(irqreader, int, 0444); | ||
| 78 | MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); | ||
| 79 | static int n_barrier_cbs; | ||
| 101 | module_param(n_barrier_cbs, int, 0444); | 80 | module_param(n_barrier_cbs, int, 0444); |
| 102 | MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); | 81 | MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); |
| 103 | module_param(onoff_interval, int, 0444); | 82 | static int nfakewriters = 4; |
| 104 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | 83 | module_param(nfakewriters, int, 0444); |
| 84 | MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); | ||
| 85 | static int nreaders = -1; | ||
| 86 | module_param(nreaders, int, 0444); | ||
| 87 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | ||
| 88 | static int object_debug; | ||
| 89 | module_param(object_debug, int, 0444); | ||
| 90 | MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); | ||
| 91 | static int onoff_holdoff; | ||
| 105 | module_param(onoff_holdoff, int, 0444); | 92 | module_param(onoff_holdoff, int, 0444); |
| 106 | MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); | 93 | MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); |
| 94 | static int onoff_interval; | ||
| 95 | module_param(onoff_interval, int, 0444); | ||
| 96 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | ||
| 97 | static int shuffle_interval = 3; | ||
| 98 | module_param(shuffle_interval, int, 0444); | ||
| 99 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | ||
| 100 | static int shutdown_secs; | ||
| 107 | module_param(shutdown_secs, int, 0444); | 101 | module_param(shutdown_secs, int, 0444); |
| 108 | MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); | 102 | MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); |
| 103 | static int stall_cpu; | ||
| 109 | module_param(stall_cpu, int, 0444); | 104 | module_param(stall_cpu, int, 0444); |
| 110 | MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); | 105 | MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); |
| 106 | static int stall_cpu_holdoff = 10; | ||
| 111 | module_param(stall_cpu_holdoff, int, 0444); | 107 | module_param(stall_cpu_holdoff, int, 0444); |
| 112 | MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); | 108 | MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); |
| 109 | static int stat_interval = 60; | ||
| 110 | module_param(stat_interval, int, 0644); | ||
| 111 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | ||
| 112 | static int stutter = 5; | ||
| 113 | module_param(stutter, int, 0444); | ||
| 114 | MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); | ||
| 115 | static int test_boost = 1; | ||
| 113 | module_param(test_boost, int, 0444); | 116 | module_param(test_boost, int, 0444); |
| 114 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); | 117 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); |
| 115 | module_param(test_boost_interval, int, 0444); | 118 | static int test_boost_duration = 4; |
| 116 | MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); | ||
| 117 | module_param(test_boost_duration, int, 0444); | 119 | module_param(test_boost_duration, int, 0444); |
| 118 | MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); | 120 | MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); |
| 121 | static int test_boost_interval = 7; | ||
| 122 | module_param(test_boost_interval, int, 0444); | ||
| 123 | MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); | ||
| 124 | static bool test_no_idle_hz = true; | ||
| 125 | module_param(test_no_idle_hz, bool, 0444); | ||
| 126 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | ||
| 127 | static char *torture_type = "rcu"; | ||
| 119 | module_param(torture_type, charp, 0444); | 128 | module_param(torture_type, charp, 0444); |
| 120 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); | 129 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); |
| 130 | static bool verbose; | ||
| 131 | module_param(verbose, bool, 0444); | ||
| 132 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | ||
| 121 | 133 | ||
| 122 | #define TORTURE_FLAG "-torture:" | 134 | #define TORTURE_FLAG "-torture:" |
| 123 | #define PRINTK_STRING(s) \ | 135 | #define PRINTK_STRING(s) \ |
| @@ -267,7 +279,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, | |||
| 267 | * Absorb kthreads into a kernel function that won't return, so that | 279 | * Absorb kthreads into a kernel function that won't return, so that |
| 268 | * they won't ever access module text or data again. | 280 | * they won't ever access module text or data again. |
| 269 | */ | 281 | */ |
| 270 | static void rcutorture_shutdown_absorb(char *title) | 282 | static void rcutorture_shutdown_absorb(const char *title) |
| 271 | { | 283 | { |
| 272 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { | 284 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { |
| 273 | pr_notice( | 285 | pr_notice( |
| @@ -337,7 +349,7 @@ rcu_random(struct rcu_random_state *rrsp) | |||
| 337 | } | 349 | } |
| 338 | 350 | ||
| 339 | static void | 351 | static void |
| 340 | rcu_stutter_wait(char *title) | 352 | rcu_stutter_wait(const char *title) |
| 341 | { | 353 | { |
| 342 | while (stutter_pause_test || !rcutorture_runnable) { | 354 | while (stutter_pause_test || !rcutorture_runnable) { |
| 343 | if (rcutorture_runnable) | 355 | if (rcutorture_runnable) |
| @@ -360,13 +372,14 @@ struct rcu_torture_ops { | |||
| 360 | int (*completed)(void); | 372 | int (*completed)(void); |
| 361 | void (*deferred_free)(struct rcu_torture *p); | 373 | void (*deferred_free)(struct rcu_torture *p); |
| 362 | void (*sync)(void); | 374 | void (*sync)(void); |
| 375 | void (*exp_sync)(void); | ||
| 363 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 376 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
| 364 | void (*cb_barrier)(void); | 377 | void (*cb_barrier)(void); |
| 365 | void (*fqs)(void); | 378 | void (*fqs)(void); |
| 366 | int (*stats)(char *page); | 379 | int (*stats)(char *page); |
| 367 | int irq_capable; | 380 | int irq_capable; |
| 368 | int can_boost; | 381 | int can_boost; |
| 369 | char *name; | 382 | const char *name; |
| 370 | }; | 383 | }; |
| 371 | 384 | ||
| 372 | static struct rcu_torture_ops *cur_ops; | 385 | static struct rcu_torture_ops *cur_ops; |
| @@ -443,81 +456,27 @@ static void rcu_torture_deferred_free(struct rcu_torture *p) | |||
| 443 | call_rcu(&p->rtort_rcu, rcu_torture_cb); | 456 | call_rcu(&p->rtort_rcu, rcu_torture_cb); |
| 444 | } | 457 | } |
| 445 | 458 | ||
| 446 | static struct rcu_torture_ops rcu_ops = { | ||
| 447 | .init = NULL, | ||
| 448 | .readlock = rcu_torture_read_lock, | ||
| 449 | .read_delay = rcu_read_delay, | ||
| 450 | .readunlock = rcu_torture_read_unlock, | ||
| 451 | .completed = rcu_torture_completed, | ||
| 452 | .deferred_free = rcu_torture_deferred_free, | ||
| 453 | .sync = synchronize_rcu, | ||
| 454 | .call = call_rcu, | ||
| 455 | .cb_barrier = rcu_barrier, | ||
| 456 | .fqs = rcu_force_quiescent_state, | ||
| 457 | .stats = NULL, | ||
| 458 | .irq_capable = 1, | ||
| 459 | .can_boost = rcu_can_boost(), | ||
| 460 | .name = "rcu" | ||
| 461 | }; | ||
| 462 | |||
| 463 | static void rcu_sync_torture_deferred_free(struct rcu_torture *p) | ||
| 464 | { | ||
| 465 | int i; | ||
| 466 | struct rcu_torture *rp; | ||
| 467 | struct rcu_torture *rp1; | ||
| 468 | |||
| 469 | cur_ops->sync(); | ||
| 470 | list_add(&p->rtort_free, &rcu_torture_removed); | ||
| 471 | list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { | ||
| 472 | i = rp->rtort_pipe_count; | ||
| 473 | if (i > RCU_TORTURE_PIPE_LEN) | ||
| 474 | i = RCU_TORTURE_PIPE_LEN; | ||
| 475 | atomic_inc(&rcu_torture_wcount[i]); | ||
| 476 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
| 477 | rp->rtort_mbtest = 0; | ||
| 478 | list_del(&rp->rtort_free); | ||
| 479 | rcu_torture_free(rp); | ||
| 480 | } | ||
| 481 | } | ||
| 482 | } | ||
| 483 | |||
| 484 | static void rcu_sync_torture_init(void) | 459 | static void rcu_sync_torture_init(void) |
| 485 | { | 460 | { |
| 486 | INIT_LIST_HEAD(&rcu_torture_removed); | 461 | INIT_LIST_HEAD(&rcu_torture_removed); |
| 487 | } | 462 | } |
| 488 | 463 | ||
| 489 | static struct rcu_torture_ops rcu_sync_ops = { | 464 | static struct rcu_torture_ops rcu_ops = { |
| 490 | .init = rcu_sync_torture_init, | 465 | .init = rcu_sync_torture_init, |
| 491 | .readlock = rcu_torture_read_lock, | 466 | .readlock = rcu_torture_read_lock, |
| 492 | .read_delay = rcu_read_delay, | 467 | .read_delay = rcu_read_delay, |
| 493 | .readunlock = rcu_torture_read_unlock, | 468 | .readunlock = rcu_torture_read_unlock, |
| 494 | .completed = rcu_torture_completed, | 469 | .completed = rcu_torture_completed, |
| 495 | .deferred_free = rcu_sync_torture_deferred_free, | 470 | .deferred_free = rcu_torture_deferred_free, |
| 496 | .sync = synchronize_rcu, | 471 | .sync = synchronize_rcu, |
| 497 | .call = NULL, | 472 | .exp_sync = synchronize_rcu_expedited, |
| 498 | .cb_barrier = NULL, | 473 | .call = call_rcu, |
| 499 | .fqs = rcu_force_quiescent_state, | 474 | .cb_barrier = rcu_barrier, |
| 500 | .stats = NULL, | ||
| 501 | .irq_capable = 1, | ||
| 502 | .can_boost = rcu_can_boost(), | ||
| 503 | .name = "rcu_sync" | ||
| 504 | }; | ||
| 505 | |||
| 506 | static struct rcu_torture_ops rcu_expedited_ops = { | ||
| 507 | .init = rcu_sync_torture_init, | ||
| 508 | .readlock = rcu_torture_read_lock, | ||
| 509 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
| 510 | .readunlock = rcu_torture_read_unlock, | ||
| 511 | .completed = rcu_no_completed, | ||
| 512 | .deferred_free = rcu_sync_torture_deferred_free, | ||
| 513 | .sync = synchronize_rcu_expedited, | ||
| 514 | .call = NULL, | ||
| 515 | .cb_barrier = NULL, | ||
| 516 | .fqs = rcu_force_quiescent_state, | 475 | .fqs = rcu_force_quiescent_state, |
| 517 | .stats = NULL, | 476 | .stats = NULL, |
| 518 | .irq_capable = 1, | 477 | .irq_capable = 1, |
| 519 | .can_boost = rcu_can_boost(), | 478 | .can_boost = rcu_can_boost(), |
| 520 | .name = "rcu_expedited" | 479 | .name = "rcu" |
| 521 | }; | 480 | }; |
| 522 | 481 | ||
| 523 | /* | 482 | /* |
| @@ -546,13 +505,14 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | |||
| 546 | } | 505 | } |
| 547 | 506 | ||
| 548 | static struct rcu_torture_ops rcu_bh_ops = { | 507 | static struct rcu_torture_ops rcu_bh_ops = { |
| 549 | .init = NULL, | 508 | .init = rcu_sync_torture_init, |
| 550 | .readlock = rcu_bh_torture_read_lock, | 509 | .readlock = rcu_bh_torture_read_lock, |
| 551 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 510 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
| 552 | .readunlock = rcu_bh_torture_read_unlock, | 511 | .readunlock = rcu_bh_torture_read_unlock, |
| 553 | .completed = rcu_bh_torture_completed, | 512 | .completed = rcu_bh_torture_completed, |
| 554 | .deferred_free = rcu_bh_torture_deferred_free, | 513 | .deferred_free = rcu_bh_torture_deferred_free, |
| 555 | .sync = synchronize_rcu_bh, | 514 | .sync = synchronize_rcu_bh, |
| 515 | .exp_sync = synchronize_rcu_bh_expedited, | ||
| 556 | .call = call_rcu_bh, | 516 | .call = call_rcu_bh, |
| 557 | .cb_barrier = rcu_barrier_bh, | 517 | .cb_barrier = rcu_barrier_bh, |
| 558 | .fqs = rcu_bh_force_quiescent_state, | 518 | .fqs = rcu_bh_force_quiescent_state, |
| @@ -561,38 +521,6 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
| 561 | .name = "rcu_bh" | 521 | .name = "rcu_bh" |
| 562 | }; | 522 | }; |
| 563 | 523 | ||
| 564 | static struct rcu_torture_ops rcu_bh_sync_ops = { | ||
| 565 | .init = rcu_sync_torture_init, | ||
| 566 | .readlock = rcu_bh_torture_read_lock, | ||
| 567 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
| 568 | .readunlock = rcu_bh_torture_read_unlock, | ||
| 569 | .completed = rcu_bh_torture_completed, | ||
| 570 | .deferred_free = rcu_sync_torture_deferred_free, | ||
| 571 | .sync = synchronize_rcu_bh, | ||
| 572 | .call = NULL, | ||
| 573 | .cb_barrier = NULL, | ||
| 574 | .fqs = rcu_bh_force_quiescent_state, | ||
| 575 | .stats = NULL, | ||
| 576 | .irq_capable = 1, | ||
| 577 | .name = "rcu_bh_sync" | ||
| 578 | }; | ||
| 579 | |||
| 580 | static struct rcu_torture_ops rcu_bh_expedited_ops = { | ||
| 581 | .init = rcu_sync_torture_init, | ||
| 582 | .readlock = rcu_bh_torture_read_lock, | ||
| 583 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
| 584 | .readunlock = rcu_bh_torture_read_unlock, | ||
| 585 | .completed = rcu_bh_torture_completed, | ||
| 586 | .deferred_free = rcu_sync_torture_deferred_free, | ||
| 587 | .sync = synchronize_rcu_bh_expedited, | ||
| 588 | .call = NULL, | ||
| 589 | .cb_barrier = NULL, | ||
| 590 | .fqs = rcu_bh_force_quiescent_state, | ||
| 591 | .stats = NULL, | ||
| 592 | .irq_capable = 1, | ||
| 593 | .name = "rcu_bh_expedited" | ||
| 594 | }; | ||
| 595 | |||
| 596 | /* | 524 | /* |
| 597 | * Definitions for srcu torture testing. | 525 | * Definitions for srcu torture testing. |
| 598 | */ | 526 | */ |
| @@ -667,6 +595,11 @@ static int srcu_torture_stats(char *page) | |||
| 667 | return cnt; | 595 | return cnt; |
| 668 | } | 596 | } |
| 669 | 597 | ||
| 598 | static void srcu_torture_synchronize_expedited(void) | ||
| 599 | { | ||
| 600 | synchronize_srcu_expedited(&srcu_ctl); | ||
| 601 | } | ||
| 602 | |||
| 670 | static struct rcu_torture_ops srcu_ops = { | 603 | static struct rcu_torture_ops srcu_ops = { |
| 671 | .init = rcu_sync_torture_init, | 604 | .init = rcu_sync_torture_init, |
| 672 | .readlock = srcu_torture_read_lock, | 605 | .readlock = srcu_torture_read_lock, |
| @@ -675,45 +608,13 @@ static struct rcu_torture_ops srcu_ops = { | |||
| 675 | .completed = srcu_torture_completed, | 608 | .completed = srcu_torture_completed, |
| 676 | .deferred_free = srcu_torture_deferred_free, | 609 | .deferred_free = srcu_torture_deferred_free, |
| 677 | .sync = srcu_torture_synchronize, | 610 | .sync = srcu_torture_synchronize, |
| 611 | .exp_sync = srcu_torture_synchronize_expedited, | ||
| 678 | .call = srcu_torture_call, | 612 | .call = srcu_torture_call, |
| 679 | .cb_barrier = srcu_torture_barrier, | 613 | .cb_barrier = srcu_torture_barrier, |
| 680 | .stats = srcu_torture_stats, | 614 | .stats = srcu_torture_stats, |
| 681 | .name = "srcu" | 615 | .name = "srcu" |
| 682 | }; | 616 | }; |
| 683 | 617 | ||
| 684 | static struct rcu_torture_ops srcu_sync_ops = { | ||
| 685 | .init = rcu_sync_torture_init, | ||
| 686 | .readlock = srcu_torture_read_lock, | ||
| 687 | .read_delay = srcu_read_delay, | ||
| 688 | .readunlock = srcu_torture_read_unlock, | ||
| 689 | .completed = srcu_torture_completed, | ||
| 690 | .deferred_free = rcu_sync_torture_deferred_free, | ||
| 691 | .sync = srcu_torture_synchronize, | ||
| 692 | .call = NULL, | ||
| 693 | .cb_barrier = NULL, | ||
| 694 | .stats = srcu_torture_stats, | ||
| 695 | .name = "srcu_sync" | ||
| 696 | }; | ||
| 697 | |||
| 698 | static void srcu_torture_synchronize_expedited(void) | ||
| 699 | { | ||
| 700 | synchronize_srcu_expedited(&srcu_ctl); | ||
| 701 | } | ||
| 702 | |||
| 703 | static struct rcu_torture_ops srcu_expedited_ops = { | ||
| 704 | .init = rcu_sync_torture_init, | ||
| 705 | .readlock = srcu_torture_read_lock, | ||
| 706 | .read_delay = srcu_read_delay, | ||
| 707 | .readunlock = srcu_torture_read_unlock, | ||
| 708 | .completed = srcu_torture_completed, | ||
| 709 | .deferred_free = rcu_sync_torture_deferred_free, | ||
| 710 | .sync = srcu_torture_synchronize_expedited, | ||
| 711 | .call = NULL, | ||
| 712 | .cb_barrier = NULL, | ||
| 713 | .stats = srcu_torture_stats, | ||
| 714 | .name = "srcu_expedited" | ||
| 715 | }; | ||
| 716 | |||
| 717 | /* | 618 | /* |
| 718 | * Definitions for sched torture testing. | 619 | * Definitions for sched torture testing. |
| 719 | */ | 620 | */ |
| @@ -742,6 +643,8 @@ static struct rcu_torture_ops sched_ops = { | |||
| 742 | .completed = rcu_no_completed, | 643 | .completed = rcu_no_completed, |
| 743 | .deferred_free = rcu_sched_torture_deferred_free, | 644 | .deferred_free = rcu_sched_torture_deferred_free, |
| 744 | .sync = synchronize_sched, | 645 | .sync = synchronize_sched, |
| 646 | .exp_sync = synchronize_sched_expedited, | ||
| 647 | .call = call_rcu_sched, | ||
| 745 | .cb_barrier = rcu_barrier_sched, | 648 | .cb_barrier = rcu_barrier_sched, |
| 746 | .fqs = rcu_sched_force_quiescent_state, | 649 | .fqs = rcu_sched_force_quiescent_state, |
| 747 | .stats = NULL, | 650 | .stats = NULL, |
| @@ -749,35 +652,6 @@ static struct rcu_torture_ops sched_ops = { | |||
| 749 | .name = "sched" | 652 | .name = "sched" |
| 750 | }; | 653 | }; |
| 751 | 654 | ||
| 752 | static struct rcu_torture_ops sched_sync_ops = { | ||
| 753 | .init = rcu_sync_torture_init, | ||
| 754 | .readlock = sched_torture_read_lock, | ||
| 755 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
| 756 | .readunlock = sched_torture_read_unlock, | ||
| 757 | .completed = rcu_no_completed, | ||
| 758 | .deferred_free = rcu_sync_torture_deferred_free, | ||
| 759 | .sync = synchronize_sched, | ||
| 760 | .cb_barrier = NULL, | ||
| 761 | .fqs = rcu_sched_force_quiescent_state, | ||
| 762 | .stats = NULL, | ||
| 763 | .name = "sched_sync" | ||
| 764 | }; | ||
| 765 | |||
| 766 | static struct rcu_torture_ops sched_expedited_ops = { | ||
| 767 | .init = rcu_sync_torture_init, | ||
| 768 | .readlock = sched_torture_read_lock, | ||
| 769 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
| 770 | .readunlock = sched_torture_read_unlock, | ||
| 771 | .completed = rcu_no_completed, | ||
| 772 | .deferred_free = rcu_sync_torture_deferred_free, | ||
| 773 | .sync = synchronize_sched_expedited, | ||
| 774 | .cb_barrier = NULL, | ||
| 775 | .fqs = rcu_sched_force_quiescent_state, | ||
| 776 | .stats = NULL, | ||
| 777 | .irq_capable = 1, | ||
| 778 | .name = "sched_expedited" | ||
| 779 | }; | ||
| 780 | |||
| 781 | /* | 655 | /* |
| 782 | * RCU torture priority-boost testing. Runs one real-time thread per | 656 | * RCU torture priority-boost testing. Runs one real-time thread per |
| 783 | * CPU for moderate bursts, repeatedly registering RCU callbacks and | 657 | * CPU for moderate bursts, repeatedly registering RCU callbacks and |
| @@ -927,9 +801,10 @@ rcu_torture_fqs(void *arg) | |||
| 927 | static int | 801 | static int |
| 928 | rcu_torture_writer(void *arg) | 802 | rcu_torture_writer(void *arg) |
| 929 | { | 803 | { |
| 804 | bool exp; | ||
| 930 | int i; | 805 | int i; |
| 931 | long oldbatch = rcu_batches_completed(); | ||
| 932 | struct rcu_torture *rp; | 806 | struct rcu_torture *rp; |
| 807 | struct rcu_torture *rp1; | ||
| 933 | struct rcu_torture *old_rp; | 808 | struct rcu_torture *old_rp; |
| 934 | static DEFINE_RCU_RANDOM(rand); | 809 | static DEFINE_RCU_RANDOM(rand); |
| 935 | 810 | ||
| @@ -954,10 +829,33 @@ rcu_torture_writer(void *arg) | |||
| 954 | i = RCU_TORTURE_PIPE_LEN; | 829 | i = RCU_TORTURE_PIPE_LEN; |
| 955 | atomic_inc(&rcu_torture_wcount[i]); | 830 | atomic_inc(&rcu_torture_wcount[i]); |
| 956 | old_rp->rtort_pipe_count++; | 831 | old_rp->rtort_pipe_count++; |
| 957 | cur_ops->deferred_free(old_rp); | 832 | if (gp_normal == gp_exp) |
| 833 | exp = !!(rcu_random(&rand) & 0x80); | ||
| 834 | else | ||
| 835 | exp = gp_exp; | ||
| 836 | if (!exp) { | ||
| 837 | cur_ops->deferred_free(old_rp); | ||
| 838 | } else { | ||
| 839 | cur_ops->exp_sync(); | ||
| 840 | list_add(&old_rp->rtort_free, | ||
| 841 | &rcu_torture_removed); | ||
| 842 | list_for_each_entry_safe(rp, rp1, | ||
| 843 | &rcu_torture_removed, | ||
| 844 | rtort_free) { | ||
| 845 | i = rp->rtort_pipe_count; | ||
| 846 | if (i > RCU_TORTURE_PIPE_LEN) | ||
| 847 | i = RCU_TORTURE_PIPE_LEN; | ||
| 848 | atomic_inc(&rcu_torture_wcount[i]); | ||
| 849 | if (++rp->rtort_pipe_count >= | ||
| 850 | RCU_TORTURE_PIPE_LEN) { | ||
| 851 | rp->rtort_mbtest = 0; | ||
| 852 | list_del(&rp->rtort_free); | ||
| 853 | rcu_torture_free(rp); | ||
| 854 | } | ||
| 855 | } | ||
| 856 | } | ||
| 958 | } | 857 | } |
| 959 | rcutorture_record_progress(++rcu_torture_current_version); | 858 | rcutorture_record_progress(++rcu_torture_current_version); |
| 960 | oldbatch = cur_ops->completed(); | ||
| 961 | rcu_stutter_wait("rcu_torture_writer"); | 859 | rcu_stutter_wait("rcu_torture_writer"); |
| 962 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 860 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
| 963 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); | 861 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); |
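In the writer hunk above, each update now picks between the asynchronous callback path and an expedited grace period: when the gp_normal and gp_exp module parameters agree (both unset, or both set), the choice is made at random with roughly even odds; otherwise the explicitly requested kind wins. A condensed sketch of that selection, using the rcu_random() helper already present in this file:

	if (gp_normal == gp_exp)
		exp = !!(rcu_random(&rand) & 0x80);	/* ~50/50 random split */
	else
		exp = gp_exp;				/* honor the explicit request */

	if (!exp)
		cur_ops->deferred_free(old_rp);		/* async, e.g. via call_rcu() */
	else
		cur_ops->exp_sync();			/* synchronous expedited GP */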
| @@ -983,10 +881,18 @@ rcu_torture_fakewriter(void *arg) | |||
| 983 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); | 881 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); |
| 984 | udelay(rcu_random(&rand) & 0x3ff); | 882 | udelay(rcu_random(&rand) & 0x3ff); |
| 985 | if (cur_ops->cb_barrier != NULL && | 883 | if (cur_ops->cb_barrier != NULL && |
| 986 | rcu_random(&rand) % (nfakewriters * 8) == 0) | 884 | rcu_random(&rand) % (nfakewriters * 8) == 0) { |
| 987 | cur_ops->cb_barrier(); | 885 | cur_ops->cb_barrier(); |
| 988 | else | 886 | } else if (gp_normal == gp_exp) { |
| 887 | if (rcu_random(&rand) & 0x80) | ||
| 888 | cur_ops->sync(); | ||
| 889 | else | ||
| 890 | cur_ops->exp_sync(); | ||
| 891 | } else if (gp_normal) { | ||
| 989 | cur_ops->sync(); | 892 | cur_ops->sync(); |
| 893 | } else { | ||
| 894 | cur_ops->exp_sync(); | ||
| 895 | } | ||
| 990 | rcu_stutter_wait("rcu_torture_fakewriter"); | 896 | rcu_stutter_wait("rcu_torture_fakewriter"); |
| 991 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 897 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
| 992 | 898 | ||
| @@ -1364,7 +1270,7 @@ rcu_torture_stutter(void *arg) | |||
| 1364 | } | 1270 | } |
| 1365 | 1271 | ||
| 1366 | static inline void | 1272 | static inline void |
| 1367 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) | 1273 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) |
| 1368 | { | 1274 | { |
| 1369 | pr_alert("%s" TORTURE_FLAG | 1275 | pr_alert("%s" TORTURE_FLAG |
| 1370 | "--- %s: nreaders=%d nfakewriters=%d " | 1276 | "--- %s: nreaders=%d nfakewriters=%d " |
| @@ -1534,7 +1440,13 @@ rcu_torture_onoff(void *arg) | |||
| 1534 | torture_type, cpu); | 1440 | torture_type, cpu); |
| 1535 | starttime = jiffies; | 1441 | starttime = jiffies; |
| 1536 | n_online_attempts++; | 1442 | n_online_attempts++; |
| 1537 | if (cpu_up(cpu) == 0) { | 1443 | ret = cpu_up(cpu); |
| 1444 | if (ret) { | ||
| 1445 | if (verbose) | ||
| 1446 | pr_alert("%s" TORTURE_FLAG | ||
| 1447 | "rcu_torture_onoff task: online %d failed: errno %d\n", | ||
| 1448 | torture_type, cpu, ret); | ||
| 1449 | } else { | ||
| 1538 | if (verbose) | 1450 | if (verbose) |
| 1539 | pr_alert("%s" TORTURE_FLAG | 1451 | pr_alert("%s" TORTURE_FLAG |
| 1540 | "rcu_torture_onoff task: onlined %d\n", | 1452 | "rcu_torture_onoff task: onlined %d\n", |
| @@ -1934,6 +1846,62 @@ rcu_torture_cleanup(void) | |||
| 1934 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); | 1846 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); |
| 1935 | } | 1847 | } |
| 1936 | 1848 | ||
| 1849 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | ||
| 1850 | static void rcu_torture_leak_cb(struct rcu_head *rhp) | ||
| 1851 | { | ||
| 1852 | } | ||
| 1853 | |||
| 1854 | static void rcu_torture_err_cb(struct rcu_head *rhp) | ||
| 1855 | { | ||
| 1856 | /* | ||
| 1857 | * This -might- happen due to race conditions, but is unlikely. | ||
| 1858 | * The scenario that leads to this happening is that the | ||
| 1859 | * first of the pair of duplicate callbacks is queued, | ||
| 1860 | * someone else starts a grace period that includes that | ||
| 1861 | * callback, then the second of the pair must wait for the | ||
| 1862 | * next grace period. Unlikely, but can happen. If it | ||
| 1863 | * does happen, the debug-objects subsystem won't have splatted. | ||
| 1864 | */ | ||
| 1865 | pr_alert("rcutorture: duplicated callback was invoked.\n"); | ||
| 1866 | } | ||
| 1867 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
| 1868 | |||
| 1869 | /* | ||
| 1870 | * Verify that double-free causes debug-objects to complain, but only | ||
| 1871 | * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test | ||
| 1872 | * cannot be carried out. | ||
| 1873 | */ | ||
| 1874 | static void rcu_test_debug_objects(void) | ||
| 1875 | { | ||
| 1876 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | ||
| 1877 | struct rcu_head rh1; | ||
| 1878 | struct rcu_head rh2; | ||
| 1879 | |||
| 1880 | init_rcu_head_on_stack(&rh1); | ||
| 1881 | init_rcu_head_on_stack(&rh2); | ||
| 1882 | pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); | ||
| 1883 | |||
| 1884 | /* Try to queue the rh2 pair of callbacks for the same grace period. */ | ||
| 1885 | preempt_disable(); /* Prevent preemption from interrupting test. */ | ||
| 1886 | rcu_read_lock(); /* Make it impossible to finish a grace period. */ | ||
| 1887 | call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ | ||
| 1888 | local_irq_disable(); /* Make it harder to start a new grace period. */ | ||
| 1889 | call_rcu(&rh2, rcu_torture_leak_cb); | ||
| 1890 | call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ | ||
| 1891 | local_irq_enable(); | ||
| 1892 | rcu_read_unlock(); | ||
| 1893 | preempt_enable(); | ||
| 1894 | |||
| 1895 | /* Wait for them all to get done so we can safely return. */ | ||
| 1896 | rcu_barrier(); | ||
| 1897 | pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); | ||
| 1898 | destroy_rcu_head_on_stack(&rh1); | ||
| 1899 | destroy_rcu_head_on_stack(&rh2); | ||
| 1900 | #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
| 1901 | pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); | ||
| 1902 | #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
| 1903 | } | ||
| 1904 | |||
| 1937 | static int __init | 1905 | static int __init |
| 1938 | rcu_torture_init(void) | 1906 | rcu_torture_init(void) |
| 1939 | { | 1907 | { |
| @@ -1941,11 +1909,9 @@ rcu_torture_init(void) | |||
| 1941 | int cpu; | 1909 | int cpu; |
| 1942 | int firsterr = 0; | 1910 | int firsterr = 0; |
| 1943 | int retval; | 1911 | int retval; |
| 1944 | static struct rcu_torture_ops *torture_ops[] = | 1912 | static struct rcu_torture_ops *torture_ops[] = { |
| 1945 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, | 1913 | &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, |
| 1946 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, | 1914 | }; |
| 1947 | &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, | ||
| 1948 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; | ||
| 1949 | 1915 | ||
| 1950 | mutex_lock(&fullstop_mutex); | 1916 | mutex_lock(&fullstop_mutex); |
| 1951 | 1917 | ||
| @@ -2163,6 +2129,8 @@ rcu_torture_init(void) | |||
| 2163 | firsterr = retval; | 2129 | firsterr = retval; |
| 2164 | goto unwind; | 2130 | goto unwind; |
| 2165 | } | 2131 | } |
| 2132 | if (object_debug) | ||
| 2133 | rcu_test_debug_objects(); | ||
| 2166 | rcutorture_record_test_transition(); | 2134 | rcutorture_record_test_transition(); |
| 2167 | mutex_unlock(&fullstop_mutex); | 2135 | mutex_unlock(&fullstop_mutex); |
| 2168 | return 0; | 2136 | return 0; |
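With rcutorture.c moving to kernel/rcu/torture.c, the object file changes name, and with it the default prefix that built-in module parameters otherwise pick up on the kernel command line; the file therefore forces MODULE_PARAM_PREFIX to "rcutorture." before any module_param() and adds MODULE_ALIAS("rcutorture") so existing boot parameters and "modprobe rcutorture" keep working. The same idiom is repeated in tree.c below. A minimal sketch of the pattern, with a placeholder parameter name that is illustrative only:

MODULE_ALIAS("rcutorture");			/* old module name still resolves */
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "rcutorture."	/* built-in params stay rcutorture.* */

static int example_param;			/* hypothetical parameter */
module_param(example_param, int, 0444);
MODULE_PARM_DESC(example_param, "Set with rcutorture.example_param=N when built in");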
diff --git a/kernel/rcutree.c b/kernel/rcu/tree.c index 068de3a93606..4c06ddfea7cd 100644 --- a/kernel/rcutree.c +++ b/kernel/rcu/tree.c | |||
| @@ -41,6 +41,7 @@ | |||
| 41 | #include <linux/export.h> | 41 | #include <linux/export.h> |
| 42 | #include <linux/completion.h> | 42 | #include <linux/completion.h> |
| 43 | #include <linux/moduleparam.h> | 43 | #include <linux/moduleparam.h> |
| 44 | #include <linux/module.h> | ||
| 44 | #include <linux/percpu.h> | 45 | #include <linux/percpu.h> |
| 45 | #include <linux/notifier.h> | 46 | #include <linux/notifier.h> |
| 46 | #include <linux/cpu.h> | 47 | #include <linux/cpu.h> |
| @@ -53,18 +54,37 @@ | |||
| 53 | #include <linux/delay.h> | 54 | #include <linux/delay.h> |
| 54 | #include <linux/stop_machine.h> | 55 | #include <linux/stop_machine.h> |
| 55 | #include <linux/random.h> | 56 | #include <linux/random.h> |
| 57 | #include <linux/ftrace_event.h> | ||
| 58 | #include <linux/suspend.h> | ||
| 56 | 59 | ||
| 57 | #include "rcutree.h" | 60 | #include "tree.h" |
| 58 | #include <trace/events/rcu.h> | 61 | #include <trace/events/rcu.h> |
| 59 | 62 | ||
| 60 | #include "rcu.h" | 63 | #include "rcu.h" |
| 61 | 64 | ||
| 65 | MODULE_ALIAS("rcutree"); | ||
| 66 | #ifdef MODULE_PARAM_PREFIX | ||
| 67 | #undef MODULE_PARAM_PREFIX | ||
| 68 | #endif | ||
| 69 | #define MODULE_PARAM_PREFIX "rcutree." | ||
| 70 | |||
| 62 | /* Data structures. */ | 71 | /* Data structures. */ |
| 63 | 72 | ||
| 64 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; | 73 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; |
| 65 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | 74 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; |
| 66 | 75 | ||
| 67 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \ | 76 | /* |
| 77 | * In order to export the rcu_state name to the tracing tools, it | ||
| 78 | * needs to be added in the __tracepoint_string section. | ||
| 79 | * This requires defining a separate variable tp_<sname>_varname | ||
| 80 | * that points to the string being used, and this will allow | ||
| 81 | * the tracing userspace tools to be able to decipher the string | ||
| 82 | * address to the matching string. | ||
| 83 | */ | ||
| 84 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ | ||
| 85 | static char sname##_varname[] = #sname; \ | ||
| 86 | static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \ | ||
| 87 | struct rcu_state sname##_state = { \ | ||
| 68 | .level = { &sname##_state.node[0] }, \ | 88 | .level = { &sname##_state.node[0] }, \ |
| 69 | .call = cr, \ | 89 | .call = cr, \ |
| 70 | .fqs_state = RCU_GP_IDLE, \ | 90 | .fqs_state = RCU_GP_IDLE, \ |
| @@ -75,16 +95,13 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | |||
| 75 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 95 | .orphan_donetail = &sname##_state.orphan_donelist, \ |
| 76 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 96 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
| 77 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ | 97 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ |
| 78 | .name = #sname, \ | 98 | .name = sname##_varname, \ |
| 79 | .abbr = sabbr, \ | 99 | .abbr = sabbr, \ |
| 80 | } | 100 | }; \ |
| 81 | 101 | DEFINE_PER_CPU(struct rcu_data, sname##_data) | |
| 82 | struct rcu_state rcu_sched_state = | ||
| 83 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); | ||
| 84 | DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); | ||
| 85 | 102 | ||
| 86 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); | 103 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); |
| 87 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); | 104 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); |
| 88 | 105 | ||
| 89 | static struct rcu_state *rcu_state; | 106 | static struct rcu_state *rcu_state; |
| 90 | LIST_HEAD(rcu_struct_flavors); | 107 | LIST_HEAD(rcu_struct_flavors); |
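The reworked RCU_STATE_INITIALIZER above exports each flavor's name to the tracing tools by the same mechanism TPS() uses for literals: it defines a real char array holding the name plus a pointer to it marked __used __tracepoint_string, and points .name at that array instead of a bare string literal. Unrolled for a single hypothetical flavor, the expansion is roughly:

static char my_flavor_varname[] = "my_flavor";
static const char *tp_my_flavor_varname __used __tracepoint_string =
	my_flavor_varname;
struct rcu_state my_flavor_state = {
	/* ... remaining initializers as in the macro ... */
	.name = my_flavor_varname,	/* address resolvable in trace output */
};
DEFINE_PER_CPU(struct rcu_data, my_flavor_data);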
| @@ -178,7 +195,7 @@ void rcu_sched_qs(int cpu) | |||
| 178 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); | 195 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); |
| 179 | 196 | ||
| 180 | if (rdp->passed_quiesce == 0) | 197 | if (rdp->passed_quiesce == 0) |
| 181 | trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); | 198 | trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); |
| 182 | rdp->passed_quiesce = 1; | 199 | rdp->passed_quiesce = 1; |
| 183 | } | 200 | } |
| 184 | 201 | ||
| @@ -187,7 +204,7 @@ void rcu_bh_qs(int cpu) | |||
| 187 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); | 204 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); |
| 188 | 205 | ||
| 189 | if (rdp->passed_quiesce == 0) | 206 | if (rdp->passed_quiesce == 0) |
| 190 | trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); | 207 | trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); |
| 191 | rdp->passed_quiesce = 1; | 208 | rdp->passed_quiesce = 1; |
| 192 | } | 209 | } |
| 193 | 210 | ||
| @@ -198,16 +215,20 @@ void rcu_bh_qs(int cpu) | |||
| 198 | */ | 215 | */ |
| 199 | void rcu_note_context_switch(int cpu) | 216 | void rcu_note_context_switch(int cpu) |
| 200 | { | 217 | { |
| 201 | trace_rcu_utilization("Start context switch"); | 218 | trace_rcu_utilization(TPS("Start context switch")); |
| 202 | rcu_sched_qs(cpu); | 219 | rcu_sched_qs(cpu); |
| 203 | rcu_preempt_note_context_switch(cpu); | 220 | rcu_preempt_note_context_switch(cpu); |
| 204 | trace_rcu_utilization("End context switch"); | 221 | trace_rcu_utilization(TPS("End context switch")); |
| 205 | } | 222 | } |
| 206 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 223 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
| 207 | 224 | ||
| 208 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 225 | static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
| 209 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | 226 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, |
| 210 | .dynticks = ATOMIC_INIT(1), | 227 | .dynticks = ATOMIC_INIT(1), |
| 228 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
| 229 | .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, | ||
| 230 | .dynticks_idle = ATOMIC_INIT(1), | ||
| 231 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
| 211 | }; | 232 | }; |
| 212 | 233 | ||
| 213 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ | 234 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ |
| @@ -226,7 +247,10 @@ module_param(jiffies_till_next_fqs, ulong, 0644); | |||
| 226 | 247 | ||
| 227 | static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | 248 | static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, |
| 228 | struct rcu_data *rdp); | 249 | struct rcu_data *rdp); |
| 229 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); | 250 | static void force_qs_rnp(struct rcu_state *rsp, |
| 251 | int (*f)(struct rcu_data *rsp, bool *isidle, | ||
| 252 | unsigned long *maxj), | ||
| 253 | bool *isidle, unsigned long *maxj); | ||
| 230 | static void force_quiescent_state(struct rcu_state *rsp); | 254 | static void force_quiescent_state(struct rcu_state *rsp); |
| 231 | static int rcu_pending(int cpu); | 255 | static int rcu_pending(int cpu); |
| 232 | 256 | ||
| @@ -345,11 +369,12 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) | |||
| 345 | static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, | 369 | static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, |
| 346 | bool user) | 370 | bool user) |
| 347 | { | 371 | { |
| 348 | trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); | 372 | trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); |
| 349 | if (!user && !is_idle_task(current)) { | 373 | if (!user && !is_idle_task(current)) { |
| 350 | struct task_struct *idle = idle_task(smp_processor_id()); | 374 | struct task_struct *idle __maybe_unused = |
| 375 | idle_task(smp_processor_id()); | ||
| 351 | 376 | ||
| 352 | trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); | 377 | trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); |
| 353 | ftrace_dump(DUMP_ORIG); | 378 | ftrace_dump(DUMP_ORIG); |
| 354 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 379 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
| 355 | current->pid, current->comm, | 380 | current->pid, current->comm, |
| @@ -383,7 +408,7 @@ static void rcu_eqs_enter(bool user) | |||
| 383 | long long oldval; | 408 | long long oldval; |
| 384 | struct rcu_dynticks *rdtp; | 409 | struct rcu_dynticks *rdtp; |
| 385 | 410 | ||
| 386 | rdtp = &__get_cpu_var(rcu_dynticks); | 411 | rdtp = this_cpu_ptr(&rcu_dynticks); |
| 387 | oldval = rdtp->dynticks_nesting; | 412 | oldval = rdtp->dynticks_nesting; |
| 388 | WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); | 413 | WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); |
| 389 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) | 414 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) |
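Several hunks in this file also switch per-CPU pointer computation from &__get_cpu_var(rcu_dynticks) to this_cpu_ptr(&rcu_dynticks). Both forms yield a pointer to the running CPU's instance; this_cpu_ptr() is the form the kernel has been converging on as __get_cpu_var() is phased out. A minimal sketch with a hypothetical helper; like the converted call sites, it assumes the caller already has preemption or interrupts disabled:

static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks);

static struct rcu_dynticks *this_cpu_rdtp(void)
{
	/* Equivalent to the old &__get_cpu_var(rcu_dynticks). */
	return this_cpu_ptr(&rcu_dynticks);
}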
| @@ -411,6 +436,7 @@ void rcu_idle_enter(void) | |||
| 411 | 436 | ||
| 412 | local_irq_save(flags); | 437 | local_irq_save(flags); |
| 413 | rcu_eqs_enter(false); | 438 | rcu_eqs_enter(false); |
| 439 | rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); | ||
| 414 | local_irq_restore(flags); | 440 | local_irq_restore(flags); |
| 415 | } | 441 | } |
| 416 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | 442 | EXPORT_SYMBOL_GPL(rcu_idle_enter); |
| @@ -428,27 +454,6 @@ void rcu_user_enter(void) | |||
| 428 | { | 454 | { |
| 429 | rcu_eqs_enter(1); | 455 | rcu_eqs_enter(1); |
| 430 | } | 456 | } |
| 431 | |||
| 432 | /** | ||
| 433 | * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace | ||
| 434 | * after the current irq returns. | ||
| 435 | * | ||
| 436 | * This is similar to rcu_user_enter() but in the context of a non-nesting | ||
| 437 | * irq. After this call, RCU enters into idle mode when the interrupt | ||
| 438 | * returns. | ||
| 439 | */ | ||
| 440 | void rcu_user_enter_after_irq(void) | ||
| 441 | { | ||
| 442 | unsigned long flags; | ||
| 443 | struct rcu_dynticks *rdtp; | ||
| 444 | |||
| 445 | local_irq_save(flags); | ||
| 446 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
| 447 | /* Ensure this irq is interrupting a non-idle RCU state. */ | ||
| 448 | WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK)); | ||
| 449 | rdtp->dynticks_nesting = 1; | ||
| 450 | local_irq_restore(flags); | ||
| 451 | } | ||
| 452 | #endif /* CONFIG_RCU_USER_QS */ | 457 | #endif /* CONFIG_RCU_USER_QS */ |
| 453 | 458 | ||
| 454 | /** | 459 | /** |
| @@ -474,14 +479,15 @@ void rcu_irq_exit(void) | |||
| 474 | struct rcu_dynticks *rdtp; | 479 | struct rcu_dynticks *rdtp; |
| 475 | 480 | ||
| 476 | local_irq_save(flags); | 481 | local_irq_save(flags); |
| 477 | rdtp = &__get_cpu_var(rcu_dynticks); | 482 | rdtp = this_cpu_ptr(&rcu_dynticks); |
| 478 | oldval = rdtp->dynticks_nesting; | 483 | oldval = rdtp->dynticks_nesting; |
| 479 | rdtp->dynticks_nesting--; | 484 | rdtp->dynticks_nesting--; |
| 480 | WARN_ON_ONCE(rdtp->dynticks_nesting < 0); | 485 | WARN_ON_ONCE(rdtp->dynticks_nesting < 0); |
| 481 | if (rdtp->dynticks_nesting) | 486 | if (rdtp->dynticks_nesting) |
| 482 | trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); | 487 | trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); |
| 483 | else | 488 | else |
| 484 | rcu_eqs_enter_common(rdtp, oldval, true); | 489 | rcu_eqs_enter_common(rdtp, oldval, true); |
| 490 | rcu_sysidle_enter(rdtp, 1); | ||
| 485 | local_irq_restore(flags); | 491 | local_irq_restore(flags); |
| 486 | } | 492 | } |
| 487 | 493 | ||
| @@ -501,11 +507,12 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, | |||
| 501 | smp_mb__after_atomic_inc(); /* See above. */ | 507 | smp_mb__after_atomic_inc(); /* See above. */ |
| 502 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 508 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); |
| 503 | rcu_cleanup_after_idle(smp_processor_id()); | 509 | rcu_cleanup_after_idle(smp_processor_id()); |
| 504 | trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); | 510 | trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); |
| 505 | if (!user && !is_idle_task(current)) { | 511 | if (!user && !is_idle_task(current)) { |
| 506 | struct task_struct *idle = idle_task(smp_processor_id()); | 512 | struct task_struct *idle __maybe_unused = |
| 513 | idle_task(smp_processor_id()); | ||
| 507 | 514 | ||
| 508 | trace_rcu_dyntick("Error on exit: not idle task", | 515 | trace_rcu_dyntick(TPS("Error on exit: not idle task"), |
| 509 | oldval, rdtp->dynticks_nesting); | 516 | oldval, rdtp->dynticks_nesting); |
| 510 | ftrace_dump(DUMP_ORIG); | 517 | ftrace_dump(DUMP_ORIG); |
| 511 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 518 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
| @@ -523,7 +530,7 @@ static void rcu_eqs_exit(bool user) | |||
| 523 | struct rcu_dynticks *rdtp; | 530 | struct rcu_dynticks *rdtp; |
| 524 | long long oldval; | 531 | long long oldval; |
| 525 | 532 | ||
| 526 | rdtp = &__get_cpu_var(rcu_dynticks); | 533 | rdtp = this_cpu_ptr(&rcu_dynticks); |
| 527 | oldval = rdtp->dynticks_nesting; | 534 | oldval = rdtp->dynticks_nesting; |
| 528 | WARN_ON_ONCE(oldval < 0); | 535 | WARN_ON_ONCE(oldval < 0); |
| 529 | if (oldval & DYNTICK_TASK_NEST_MASK) | 536 | if (oldval & DYNTICK_TASK_NEST_MASK) |
| @@ -550,6 +557,7 @@ void rcu_idle_exit(void) | |||
| 550 | 557 | ||
| 551 | local_irq_save(flags); | 558 | local_irq_save(flags); |
| 552 | rcu_eqs_exit(false); | 559 | rcu_eqs_exit(false); |
| 560 | rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); | ||
| 553 | local_irq_restore(flags); | 561 | local_irq_restore(flags); |
| 554 | } | 562 | } |
| 555 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | 563 | EXPORT_SYMBOL_GPL(rcu_idle_exit); |
| @@ -565,28 +573,6 @@ void rcu_user_exit(void) | |||
| 565 | { | 573 | { |
| 566 | rcu_eqs_exit(1); | 574 | rcu_eqs_exit(1); |
| 567 | } | 575 | } |
| 568 | |||
| 569 | /** | ||
| 570 | * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace | ||
| 571 | * idle mode after the current non-nesting irq returns. | ||
| 572 | * | ||
| 573 | * This is similar to rcu_user_exit() but in the context of an irq. | ||
| 574 | * This is called when the irq has interrupted a userspace RCU idle mode | ||
| 575 | * context. When the current non-nesting interrupt returns after this call, | ||
| 576 | * the CPU won't restore the RCU idle mode. | ||
| 577 | */ | ||
| 578 | void rcu_user_exit_after_irq(void) | ||
| 579 | { | ||
| 580 | unsigned long flags; | ||
| 581 | struct rcu_dynticks *rdtp; | ||
| 582 | |||
| 583 | local_irq_save(flags); | ||
| 584 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
| 585 | /* Ensure we are interrupting an RCU idle mode. */ | ||
| 586 | WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK); | ||
| 587 | rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE; | ||
| 588 | local_irq_restore(flags); | ||
| 589 | } | ||
| 590 | #endif /* CONFIG_RCU_USER_QS */ | 576 | #endif /* CONFIG_RCU_USER_QS */ |
| 591 | 577 | ||
| 592 | /** | 578 | /** |
| @@ -615,14 +601,15 @@ void rcu_irq_enter(void) | |||
| 615 | long long oldval; | 601 | long long oldval; |
| 616 | 602 | ||
| 617 | local_irq_save(flags); | 603 | local_irq_save(flags); |
| 618 | rdtp = &__get_cpu_var(rcu_dynticks); | 604 | rdtp = this_cpu_ptr(&rcu_dynticks); |
| 619 | oldval = rdtp->dynticks_nesting; | 605 | oldval = rdtp->dynticks_nesting; |
| 620 | rdtp->dynticks_nesting++; | 606 | rdtp->dynticks_nesting++; |
| 621 | WARN_ON_ONCE(rdtp->dynticks_nesting == 0); | 607 | WARN_ON_ONCE(rdtp->dynticks_nesting == 0); |
| 622 | if (oldval) | 608 | if (oldval) |
| 623 | trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); | 609 | trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); |
| 624 | else | 610 | else |
| 625 | rcu_eqs_exit_common(rdtp, oldval, true); | 611 | rcu_eqs_exit_common(rdtp, oldval, true); |
| 612 | rcu_sysidle_exit(rdtp, 1); | ||
| 626 | local_irq_restore(flags); | 613 | local_irq_restore(flags); |
| 627 | } | 614 | } |
| 628 | 615 | ||
| @@ -635,7 +622,7 @@ void rcu_irq_enter(void) | |||
| 635 | */ | 622 | */ |
| 636 | void rcu_nmi_enter(void) | 623 | void rcu_nmi_enter(void) |
| 637 | { | 624 | { |
| 638 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); | 625 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
| 639 | 626 | ||
| 640 | if (rdtp->dynticks_nmi_nesting == 0 && | 627 | if (rdtp->dynticks_nmi_nesting == 0 && |
| 641 | (atomic_read(&rdtp->dynticks) & 0x1)) | 628 | (atomic_read(&rdtp->dynticks) & 0x1)) |
| @@ -657,7 +644,7 @@ void rcu_nmi_enter(void) | |||
| 657 | */ | 644 | */ |
| 658 | void rcu_nmi_exit(void) | 645 | void rcu_nmi_exit(void) |
| 659 | { | 646 | { |
| 660 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); | 647 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); |
| 661 | 648 | ||
| 662 | if (rdtp->dynticks_nmi_nesting == 0 || | 649 | if (rdtp->dynticks_nmi_nesting == 0 || |
| 663 | --rdtp->dynticks_nmi_nesting != 0) | 650 | --rdtp->dynticks_nmi_nesting != 0) |
| @@ -670,21 +657,34 @@ void rcu_nmi_exit(void) | |||
| 670 | } | 657 | } |
| 671 | 658 | ||
| 672 | /** | 659 | /** |
| 673 | * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle | 660 | * __rcu_is_watching - are RCU read-side critical sections safe? |
| 661 | * | ||
| 662 | * Return true if RCU is watching the running CPU, which means that | ||
| 663 | * this CPU can safely enter RCU read-side critical sections. Unlike | ||
| 664 | * rcu_is_watching(), the caller of __rcu_is_watching() must have at | ||
| 665 | * least disabled preemption. | ||
| 666 | */ | ||
| 667 | bool __rcu_is_watching(void) | ||
| 668 | { | ||
| 669 | return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1; | ||
| 670 | } | ||
| 671 | |||
| 672 | /** | ||
| 673 | * rcu_is_watching - see if RCU thinks that the current CPU is idle | ||
| 674 | * | 674 | * |
| 675 | * If the current CPU is in its idle loop and is neither in an interrupt | 675 | * If the current CPU is in its idle loop and is neither in an interrupt |
| 676 | * or NMI handler, return true. | 676 | * or NMI handler, return true. |
| 677 | */ | 677 | */ |
| 678 | int rcu_is_cpu_idle(void) | 678 | bool rcu_is_watching(void) |
| 679 | { | 679 | { |
| 680 | int ret; | 680 | int ret; |
| 681 | 681 | ||
| 682 | preempt_disable(); | 682 | preempt_disable(); |
| 683 | ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; | 683 | ret = __rcu_is_watching(); |
| 684 | preempt_enable(); | 684 | preempt_enable(); |
| 685 | return ret; | 685 | return ret; |
| 686 | } | 686 | } |
| 687 | EXPORT_SYMBOL(rcu_is_cpu_idle); | 687 | EXPORT_SYMBOL_GPL(rcu_is_watching); |
| 688 | 688 | ||
| 689 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) | 689 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) |
| 690 | 690 | ||
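Note that, despite the kerneldoc wording carried over from rcu_is_cpu_idle(), both new helpers return true when RCU *is* watching the CPU, that is, when entering an RCU read-side critical section is safe; __rcu_is_watching() merely omits the preempt_disable()/preempt_enable() pair and so requires the caller to have preemption already disabled. A hedged usage sketch, in which the table pointer and handler are invented for illustration:

	#include <linux/rcupdate.h>

	static void demo_lookup(void)
	{
		/* RCU is ignoring this CPU (e.g. idle loop): rcu_read_lock() would not help. */
		if (!rcu_is_watching())
			return;

		rcu_read_lock();
		demo_handle(rcu_dereference(demo_table));	/* hypothetical helpers */
		rcu_read_unlock();
	}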
| @@ -718,7 +718,7 @@ bool rcu_lockdep_current_cpu_online(void) | |||
| 718 | if (in_nmi()) | 718 | if (in_nmi()) |
| 719 | return 1; | 719 | return 1; |
| 720 | preempt_disable(); | 720 | preempt_disable(); |
| 721 | rdp = &__get_cpu_var(rcu_sched_data); | 721 | rdp = this_cpu_ptr(&rcu_sched_data); |
| 722 | rnp = rdp->mynode; | 722 | rnp = rdp->mynode; |
| 723 | ret = (rdp->grpmask & rnp->qsmaskinit) || | 723 | ret = (rdp->grpmask & rnp->qsmaskinit) || |
| 724 | !rcu_scheduler_fully_active; | 724 | !rcu_scheduler_fully_active; |
| @@ -738,7 +738,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); | |||
| 738 | */ | 738 | */ |
| 739 | static int rcu_is_cpu_rrupt_from_idle(void) | 739 | static int rcu_is_cpu_rrupt_from_idle(void) |
| 740 | { | 740 | { |
| 741 | return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; | 741 | return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1; |
| 742 | } | 742 | } |
| 743 | 743 | ||
| 744 | /* | 744 | /* |
| @@ -746,9 +746,11 @@ static int rcu_is_cpu_rrupt_from_idle(void) | |||
| 746 | * credit them with an implicit quiescent state. Return 1 if this CPU | 746 | * credit them with an implicit quiescent state. Return 1 if this CPU |
| 747 | * is in dynticks idle mode, which is an extended quiescent state. | 747 | * is in dynticks idle mode, which is an extended quiescent state. |
| 748 | */ | 748 | */ |
| 749 | static int dyntick_save_progress_counter(struct rcu_data *rdp) | 749 | static int dyntick_save_progress_counter(struct rcu_data *rdp, |
| 750 | bool *isidle, unsigned long *maxj) | ||
| 750 | { | 751 | { |
| 751 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); | 752 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); |
| 753 | rcu_sysidle_check_cpu(rdp, isidle, maxj); | ||
| 752 | return (rdp->dynticks_snap & 0x1) == 0; | 754 | return (rdp->dynticks_snap & 0x1) == 0; |
| 753 | } | 755 | } |
| 754 | 756 | ||
| @@ -758,7 +760,8 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) | |||
| 758 | * idle state since the last call to dyntick_save_progress_counter() | 760 | * idle state since the last call to dyntick_save_progress_counter() |
| 759 | * for this same CPU, or by virtue of having been offline. | 761 | * for this same CPU, or by virtue of having been offline. |
| 760 | */ | 762 | */ |
| 761 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | 763 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, |
| 764 | bool *isidle, unsigned long *maxj) | ||
| 762 | { | 765 | { |
| 763 | unsigned int curr; | 766 | unsigned int curr; |
| 764 | unsigned int snap; | 767 | unsigned int snap; |
| @@ -775,7 +778,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
| 775 | * of the current RCU grace period. | 778 | * of the current RCU grace period. |
| 776 | */ | 779 | */ |
| 777 | if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { | 780 | if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { |
| 778 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); | 781 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); |
| 779 | rdp->dynticks_fqs++; | 782 | rdp->dynticks_fqs++; |
| 780 | return 1; | 783 | return 1; |
| 781 | } | 784 | } |
| @@ -795,7 +798,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
| 795 | return 0; /* Grace period is not old enough. */ | 798 | return 0; /* Grace period is not old enough. */ |
| 796 | barrier(); | 799 | barrier(); |
| 797 | if (cpu_is_offline(rdp->cpu)) { | 800 | if (cpu_is_offline(rdp->cpu)) { |
| 798 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); | 801 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); |
| 799 | rdp->offline_fqs++; | 802 | rdp->offline_fqs++; |
| 800 | return 1; | 803 | return 1; |
| 801 | } | 804 | } |
| @@ -814,8 +817,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
| 814 | 817 | ||
| 815 | static void record_gp_stall_check_time(struct rcu_state *rsp) | 818 | static void record_gp_stall_check_time(struct rcu_state *rsp) |
| 816 | { | 819 | { |
| 817 | rsp->gp_start = jiffies; | 820 | unsigned long j = ACCESS_ONCE(jiffies); |
| 818 | rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); | 821 | |
| 822 | rsp->gp_start = j; | ||
| 823 | smp_wmb(); /* Record start time before stall time. */ | ||
| 824 | rsp->jiffies_stall = j + rcu_jiffies_till_stall_check(); | ||
| 819 | } | 825 | } |
| 820 | 826 | ||
| 821 | /* | 827 | /* |
| @@ -910,6 +916,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
| 910 | force_quiescent_state(rsp); /* Kick them all. */ | 916 | force_quiescent_state(rsp); /* Kick them all. */ |
| 911 | } | 917 | } |
| 912 | 918 | ||
| 919 | /* | ||
| 920 | * This function really isn't for public consumption, but RCU is special in | ||
| 921 | * that context switches can allow the state machine to make progress. | ||
| 922 | */ | ||
| 923 | extern void resched_cpu(int cpu); | ||
| 924 | |||
| 913 | static void print_cpu_stall(struct rcu_state *rsp) | 925 | static void print_cpu_stall(struct rcu_state *rsp) |
| 914 | { | 926 | { |
| 915 | int cpu; | 927 | int cpu; |
| @@ -939,22 +951,60 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
| 939 | 3 * rcu_jiffies_till_stall_check() + 3; | 951 | 3 * rcu_jiffies_till_stall_check() + 3; |
| 940 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 952 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 941 | 953 | ||
| 942 | set_need_resched(); /* kick ourselves to get things going. */ | 954 | /* |
| 955 | * Attempt to revive the RCU machinery by forcing a context switch. | ||
| 956 | * | ||
| 957 | * A context switch would normally allow the RCU state machine to make | ||
| 958 | * progress and it could be we're stuck in kernel space without context | ||
| 959 | * switches for an entirely unreasonable amount of time. | ||
| 960 | */ | ||
| 961 | resched_cpu(smp_processor_id()); | ||
| 943 | } | 962 | } |
| 944 | 963 | ||
| 945 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | 964 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) |
| 946 | { | 965 | { |
| 966 | unsigned long completed; | ||
| 967 | unsigned long gpnum; | ||
| 968 | unsigned long gps; | ||
| 947 | unsigned long j; | 969 | unsigned long j; |
| 948 | unsigned long js; | 970 | unsigned long js; |
| 949 | struct rcu_node *rnp; | 971 | struct rcu_node *rnp; |
| 950 | 972 | ||
| 951 | if (rcu_cpu_stall_suppress) | 973 | if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) |
| 952 | return; | 974 | return; |
| 953 | j = ACCESS_ONCE(jiffies); | 975 | j = ACCESS_ONCE(jiffies); |
| 976 | |||
| 977 | /* | ||
| 978 | * Lots of memory barriers to reject false positives. | ||
| 979 | * | ||
| 980 | * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall, | ||
| 981 | * then rsp->gp_start, and finally rsp->completed. These values | ||
| 982 | * are updated in the opposite order with memory barriers (or | ||
| 983 | * equivalent) during grace-period initialization and cleanup. | ||
| 984 | * Now, a false positive can occur if we get a new value of | ||
| 985 | * rsp->gp_start and an old value of rsp->jiffies_stall. But given | ||
| 986 | * the memory barriers, the only way that this can happen is if one | ||
| 987 | * grace period ends and another starts between these two fetches. | ||
| 988 | * Detect this by comparing rsp->completed with the previous fetch | ||
| 989 | * from rsp->gpnum. | ||
| 990 | * | ||
| 991 | * Given this check, comparisons of jiffies, rsp->jiffies_stall, | ||
| 992 | * and rsp->gp_start suffice to forestall false positives. | ||
| 993 | */ | ||
| 994 | gpnum = ACCESS_ONCE(rsp->gpnum); | ||
| 995 | smp_rmb(); /* Pick up ->gpnum first... */ | ||
| 954 | js = ACCESS_ONCE(rsp->jiffies_stall); | 996 | js = ACCESS_ONCE(rsp->jiffies_stall); |
| 997 | smp_rmb(); /* ...then ->jiffies_stall before the rest... */ | ||
| 998 | gps = ACCESS_ONCE(rsp->gp_start); | ||
| 999 | smp_rmb(); /* ...and finally ->gp_start before ->completed. */ | ||
| 1000 | completed = ACCESS_ONCE(rsp->completed); | ||
| 1001 | if (ULONG_CMP_GE(completed, gpnum) || | ||
| 1002 | ULONG_CMP_LT(j, js) || | ||
| 1003 | ULONG_CMP_GE(gps, js)) | ||
| 1004 | return; /* No stall or GP completed since entering function. */ | ||
| 955 | rnp = rdp->mynode; | 1005 | rnp = rdp->mynode; |
| 956 | if (rcu_gp_in_progress(rsp) && | 1006 | if (rcu_gp_in_progress(rsp) && |
| 957 | (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { | 1007 | (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) { |
| 958 | 1008 | ||
| 959 | /* We haven't checked in, so go dump stack. */ | 1009 | /* We haven't checked in, so go dump stack. */ |
| 960 | print_cpu_stall(rsp); | 1010 | print_cpu_stall(rsp); |
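The write side that the sampling order above pairs with is record_gp_stall_check_time() from an earlier hunk in this file: ->gp_start is stored, then smp_wmb(), then ->jiffies_stall. A condensed, stand-alone model of that pairing; the type and helper names are illustrative stand-ins, not the kernel's rcu_state:

	struct stall_times {			/* illustrative only */
		unsigned long gp_start;		/* when the current GP began */
		unsigned long jiffies_stall;	/* when to start complaining */
	};

	/* Grace-period start: publish gp_start strictly before jiffies_stall. */
	static void publish_stall_times(struct stall_times *t, unsigned long j,
					unsigned long delay)
	{
		t->gp_start = j;
		smp_wmb();			/* record start time before stall time */
		t->jiffies_stall = j + delay;
	}

	/* Stall check: sample in the opposite order and reject inconsistent pairs. */
	static bool stall_sample_plausible(struct stall_times *t, unsigned long j)
	{
		unsigned long js = ACCESS_ONCE(t->jiffies_stall);
		unsigned long gps;

		smp_rmb();			/* ->jiffies_stall before ->gp_start */
		gps = ACCESS_ONCE(t->gp_start);
		return ULONG_CMP_GE(j, js) && ULONG_CMP_LT(gps, js);
	}

In the real check_cpu_stall(), fetches of ->gpnum and ->completed bracket these two samples so that a grace period ending and a new one starting in between is also detected.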
| @@ -1032,7 +1082,7 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, | |||
| 1032 | * rcu_nocb_wait_gp(). | 1082 | * rcu_nocb_wait_gp(). |
| 1033 | */ | 1083 | */ |
| 1034 | static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | 1084 | static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, |
| 1035 | unsigned long c, char *s) | 1085 | unsigned long c, const char *s) |
| 1036 | { | 1086 | { |
| 1037 | trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, | 1087 | trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, |
| 1038 | rnp->completed, c, rnp->level, | 1088 | rnp->completed, c, rnp->level, |
| @@ -1058,9 +1108,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
| 1058 | * grace period is already marked as needed, return to the caller. | 1108 | * grace period is already marked as needed, return to the caller. |
| 1059 | */ | 1109 | */ |
| 1060 | c = rcu_cbs_completed(rdp->rsp, rnp); | 1110 | c = rcu_cbs_completed(rdp->rsp, rnp); |
| 1061 | trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); | 1111 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); |
| 1062 | if (rnp->need_future_gp[c & 0x1]) { | 1112 | if (rnp->need_future_gp[c & 0x1]) { |
| 1063 | trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); | 1113 | trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); |
| 1064 | return c; | 1114 | return c; |
| 1065 | } | 1115 | } |
| 1066 | 1116 | ||
| @@ -1074,7 +1124,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
| 1074 | if (rnp->gpnum != rnp->completed || | 1124 | if (rnp->gpnum != rnp->completed || |
| 1075 | ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { | 1125 | ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { |
| 1076 | rnp->need_future_gp[c & 0x1]++; | 1126 | rnp->need_future_gp[c & 0x1]++; |
| 1077 | trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); | 1127 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); |
| 1078 | return c; | 1128 | return c; |
| 1079 | } | 1129 | } |
| 1080 | 1130 | ||
| @@ -1102,7 +1152,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
| 1102 | * recorded, trace and leave. | 1152 | * recorded, trace and leave. |
| 1103 | */ | 1153 | */ |
| 1104 | if (rnp_root->need_future_gp[c & 0x1]) { | 1154 | if (rnp_root->need_future_gp[c & 0x1]) { |
| 1105 | trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); | 1155 | trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); |
| 1106 | goto unlock_out; | 1156 | goto unlock_out; |
| 1107 | } | 1157 | } |
| 1108 | 1158 | ||
| @@ -1111,9 +1161,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
| 1111 | 1161 | ||
| 1112 | /* If a grace period is not already in progress, start one. */ | 1162 | /* If a grace period is not already in progress, start one. */ |
| 1113 | if (rnp_root->gpnum != rnp_root->completed) { | 1163 | if (rnp_root->gpnum != rnp_root->completed) { |
| 1114 | trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); | 1164 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); |
| 1115 | } else { | 1165 | } else { |
| 1116 | trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); | 1166 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); |
| 1117 | rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); | 1167 | rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); |
| 1118 | } | 1168 | } |
| 1119 | unlock_out: | 1169 | unlock_out: |
| @@ -1137,7 +1187,8 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | |||
| 1137 | rcu_nocb_gp_cleanup(rsp, rnp); | 1187 | rcu_nocb_gp_cleanup(rsp, rnp); |
| 1138 | rnp->need_future_gp[c & 0x1] = 0; | 1188 | rnp->need_future_gp[c & 0x1] = 0; |
| 1139 | needmore = rnp->need_future_gp[(c + 1) & 0x1]; | 1189 | needmore = rnp->need_future_gp[(c + 1) & 0x1]; |
| 1140 | trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); | 1190 | trace_rcu_future_gp(rnp, rdp, c, |
| 1191 | needmore ? TPS("CleanupMore") : TPS("Cleanup")); | ||
| 1141 | return needmore; | 1192 | return needmore; |
| 1142 | } | 1193 | } |
| 1143 | 1194 | ||
| @@ -1205,9 +1256,9 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1205 | 1256 | ||
| 1206 | /* Trace depending on how much we were able to accelerate. */ | 1257 | /* Trace depending on how much we were able to accelerate. */ |
| 1207 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) | 1258 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) |
| 1208 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB"); | 1259 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); |
| 1209 | else | 1260 | else |
| 1210 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB"); | 1261 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); |
| 1211 | } | 1262 | } |
| 1212 | 1263 | ||
| 1213 | /* | 1264 | /* |
| @@ -1273,7 +1324,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc | |||
| 1273 | 1324 | ||
| 1274 | /* Remember that we saw this grace-period completion. */ | 1325 | /* Remember that we saw this grace-period completion. */ |
| 1275 | rdp->completed = rnp->completed; | 1326 | rdp->completed = rnp->completed; |
| 1276 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); | 1327 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); |
| 1277 | } | 1328 | } |
| 1278 | 1329 | ||
| 1279 | if (rdp->gpnum != rnp->gpnum) { | 1330 | if (rdp->gpnum != rnp->gpnum) { |
| @@ -1283,7 +1334,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc | |||
| 1283 | * go looking for one. | 1334 | * go looking for one. |
| 1284 | */ | 1335 | */ |
| 1285 | rdp->gpnum = rnp->gpnum; | 1336 | rdp->gpnum = rnp->gpnum; |
| 1286 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); | 1337 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); |
| 1287 | rdp->passed_quiesce = 0; | 1338 | rdp->passed_quiesce = 0; |
| 1288 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); | 1339 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); |
| 1289 | zero_cpu_stall_ticks(rdp); | 1340 | zero_cpu_stall_ticks(rdp); |
| @@ -1308,26 +1359,36 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1308 | } | 1359 | } |
| 1309 | 1360 | ||
| 1310 | /* | 1361 | /* |
| 1311 | * Initialize a new grace period. | 1362 | * Initialize a new grace period. Return 0 if no grace period required. |
| 1312 | */ | 1363 | */ |
| 1313 | static int rcu_gp_init(struct rcu_state *rsp) | 1364 | static int rcu_gp_init(struct rcu_state *rsp) |
| 1314 | { | 1365 | { |
| 1315 | struct rcu_data *rdp; | 1366 | struct rcu_data *rdp; |
| 1316 | struct rcu_node *rnp = rcu_get_root(rsp); | 1367 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1317 | 1368 | ||
| 1369 | rcu_bind_gp_kthread(); | ||
| 1318 | raw_spin_lock_irq(&rnp->lock); | 1370 | raw_spin_lock_irq(&rnp->lock); |
| 1371 | if (rsp->gp_flags == 0) { | ||
| 1372 | /* Spurious wakeup, tell caller to go back to sleep. */ | ||
| 1373 | raw_spin_unlock_irq(&rnp->lock); | ||
| 1374 | return 0; | ||
| 1375 | } | ||
| 1319 | rsp->gp_flags = 0; /* Clear all flags: New grace period. */ | 1376 | rsp->gp_flags = 0; /* Clear all flags: New grace period. */ |
| 1320 | 1377 | ||
| 1321 | if (rcu_gp_in_progress(rsp)) { | 1378 | if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { |
| 1322 | /* Grace period already in progress, don't start another. */ | 1379 | /* |
| 1380 | * Grace period already in progress, don't start another. | ||
| 1381 | * Not supposed to be able to happen. | ||
| 1382 | */ | ||
| 1323 | raw_spin_unlock_irq(&rnp->lock); | 1383 | raw_spin_unlock_irq(&rnp->lock); |
| 1324 | return 0; | 1384 | return 0; |
| 1325 | } | 1385 | } |
| 1326 | 1386 | ||
| 1327 | /* Advance to a new grace period and initialize state. */ | 1387 | /* Advance to a new grace period and initialize state. */ |
| 1328 | rsp->gpnum++; | ||
| 1329 | trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); | ||
| 1330 | record_gp_stall_check_time(rsp); | 1388 | record_gp_stall_check_time(rsp); |
| 1389 | smp_wmb(); /* Record GP times before starting GP. */ | ||
| 1390 | rsp->gpnum++; | ||
| 1391 | trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); | ||
| 1331 | raw_spin_unlock_irq(&rnp->lock); | 1392 | raw_spin_unlock_irq(&rnp->lock); |
| 1332 | 1393 | ||
| 1333 | /* Exclude any concurrent CPU-hotplug operations. */ | 1394 | /* Exclude any concurrent CPU-hotplug operations. */ |
| @@ -1376,19 +1437,28 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1376 | /* | 1437 | /* |
| 1377 | * Do one round of quiescent-state forcing. | 1438 | * Do one round of quiescent-state forcing. |
| 1378 | */ | 1439 | */ |
| 1379 | int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | 1440 | static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) |
| 1380 | { | 1441 | { |
| 1381 | int fqs_state = fqs_state_in; | 1442 | int fqs_state = fqs_state_in; |
| 1443 | bool isidle = false; | ||
| 1444 | unsigned long maxj; | ||
| 1382 | struct rcu_node *rnp = rcu_get_root(rsp); | 1445 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1383 | 1446 | ||
| 1384 | rsp->n_force_qs++; | 1447 | rsp->n_force_qs++; |
| 1385 | if (fqs_state == RCU_SAVE_DYNTICK) { | 1448 | if (fqs_state == RCU_SAVE_DYNTICK) { |
| 1386 | /* Collect dyntick-idle snapshots. */ | 1449 | /* Collect dyntick-idle snapshots. */ |
| 1387 | force_qs_rnp(rsp, dyntick_save_progress_counter); | 1450 | if (is_sysidle_rcu_state(rsp)) { |
| 1451 | isidle = 1; | ||
| 1452 | maxj = jiffies - ULONG_MAX / 4; | ||
| 1453 | } | ||
| 1454 | force_qs_rnp(rsp, dyntick_save_progress_counter, | ||
| 1455 | &isidle, &maxj); | ||
| 1456 | rcu_sysidle_report_gp(rsp, isidle, maxj); | ||
| 1388 | fqs_state = RCU_FORCE_QS; | 1457 | fqs_state = RCU_FORCE_QS; |
| 1389 | } else { | 1458 | } else { |
| 1390 | /* Handle dyntick-idle and offline CPUs. */ | 1459 | /* Handle dyntick-idle and offline CPUs. */ |
| 1391 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs); | 1460 | isidle = 0; |
| 1461 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); | ||
| 1392 | } | 1462 | } |
| 1393 | /* Clear flag to prevent immediate re-entry. */ | 1463 | /* Clear flag to prevent immediate re-entry. */ |
| 1394 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | 1464 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { |
| @@ -1448,12 +1518,16 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 1448 | rcu_nocb_gp_set(rnp, nocb); | 1518 | rcu_nocb_gp_set(rnp, nocb); |
| 1449 | 1519 | ||
| 1450 | rsp->completed = rsp->gpnum; /* Declare grace period done. */ | 1520 | rsp->completed = rsp->gpnum; /* Declare grace period done. */ |
| 1451 | trace_rcu_grace_period(rsp->name, rsp->completed, "end"); | 1521 | trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); |
| 1452 | rsp->fqs_state = RCU_GP_IDLE; | 1522 | rsp->fqs_state = RCU_GP_IDLE; |
| 1453 | rdp = this_cpu_ptr(rsp->rda); | 1523 | rdp = this_cpu_ptr(rsp->rda); |
| 1454 | rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ | 1524 | rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ |
| 1455 | if (cpu_needs_another_gp(rsp, rdp)) | 1525 | if (cpu_needs_another_gp(rsp, rdp)) { |
| 1456 | rsp->gp_flags = 1; | 1526 | rsp->gp_flags = RCU_GP_FLAG_INIT; |
| 1527 | trace_rcu_grace_period(rsp->name, | ||
| 1528 | ACCESS_ONCE(rsp->gpnum), | ||
| 1529 | TPS("newreq")); | ||
| 1530 | } | ||
| 1457 | raw_spin_unlock_irq(&rnp->lock); | 1531 | raw_spin_unlock_irq(&rnp->lock); |
| 1458 | } | 1532 | } |
| 1459 | 1533 | ||
| @@ -1463,6 +1537,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 1463 | static int __noreturn rcu_gp_kthread(void *arg) | 1537 | static int __noreturn rcu_gp_kthread(void *arg) |
| 1464 | { | 1538 | { |
| 1465 | int fqs_state; | 1539 | int fqs_state; |
| 1540 | int gf; | ||
| 1466 | unsigned long j; | 1541 | unsigned long j; |
| 1467 | int ret; | 1542 | int ret; |
| 1468 | struct rcu_state *rsp = arg; | 1543 | struct rcu_state *rsp = arg; |
| @@ -1472,14 +1547,19 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1472 | 1547 | ||
| 1473 | /* Handle grace-period start. */ | 1548 | /* Handle grace-period start. */ |
| 1474 | for (;;) { | 1549 | for (;;) { |
| 1550 | trace_rcu_grace_period(rsp->name, | ||
| 1551 | ACCESS_ONCE(rsp->gpnum), | ||
| 1552 | TPS("reqwait")); | ||
| 1475 | wait_event_interruptible(rsp->gp_wq, | 1553 | wait_event_interruptible(rsp->gp_wq, |
| 1476 | rsp->gp_flags & | 1554 | ACCESS_ONCE(rsp->gp_flags) & |
| 1477 | RCU_GP_FLAG_INIT); | 1555 | RCU_GP_FLAG_INIT); |
| 1478 | if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && | 1556 | if (rcu_gp_init(rsp)) |
| 1479 | rcu_gp_init(rsp)) | ||
| 1480 | break; | 1557 | break; |
| 1481 | cond_resched(); | 1558 | cond_resched(); |
| 1482 | flush_signals(current); | 1559 | flush_signals(current); |
| 1560 | trace_rcu_grace_period(rsp->name, | ||
| 1561 | ACCESS_ONCE(rsp->gpnum), | ||
| 1562 | TPS("reqwaitsig")); | ||
| 1483 | } | 1563 | } |
| 1484 | 1564 | ||
| 1485 | /* Handle quiescent-state forcing. */ | 1565 | /* Handle quiescent-state forcing. */ |
| @@ -1489,10 +1569,16 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1489 | j = HZ; | 1569 | j = HZ; |
| 1490 | jiffies_till_first_fqs = HZ; | 1570 | jiffies_till_first_fqs = HZ; |
| 1491 | } | 1571 | } |
| 1572 | ret = 0; | ||
| 1492 | for (;;) { | 1573 | for (;;) { |
| 1493 | rsp->jiffies_force_qs = jiffies + j; | 1574 | if (!ret) |
| 1575 | rsp->jiffies_force_qs = jiffies + j; | ||
| 1576 | trace_rcu_grace_period(rsp->name, | ||
| 1577 | ACCESS_ONCE(rsp->gpnum), | ||
| 1578 | TPS("fqswait")); | ||
| 1494 | ret = wait_event_interruptible_timeout(rsp->gp_wq, | 1579 | ret = wait_event_interruptible_timeout(rsp->gp_wq, |
| 1495 | (rsp->gp_flags & RCU_GP_FLAG_FQS) || | 1580 | ((gf = ACCESS_ONCE(rsp->gp_flags)) & |
| 1581 | RCU_GP_FLAG_FQS) || | ||
| 1496 | (!ACCESS_ONCE(rnp->qsmask) && | 1582 | (!ACCESS_ONCE(rnp->qsmask) && |
| 1497 | !rcu_preempt_blocked_readers_cgp(rnp)), | 1583 | !rcu_preempt_blocked_readers_cgp(rnp)), |
| 1498 | j); | 1584 | j); |
| @@ -1501,13 +1587,23 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1501 | !rcu_preempt_blocked_readers_cgp(rnp)) | 1587 | !rcu_preempt_blocked_readers_cgp(rnp)) |
| 1502 | break; | 1588 | break; |
| 1503 | /* If time for quiescent-state forcing, do it. */ | 1589 | /* If time for quiescent-state forcing, do it. */ |
| 1504 | if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { | 1590 | if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || |
| 1591 | (gf & RCU_GP_FLAG_FQS)) { | ||
| 1592 | trace_rcu_grace_period(rsp->name, | ||
| 1593 | ACCESS_ONCE(rsp->gpnum), | ||
| 1594 | TPS("fqsstart")); | ||
| 1505 | fqs_state = rcu_gp_fqs(rsp, fqs_state); | 1595 | fqs_state = rcu_gp_fqs(rsp, fqs_state); |
| 1596 | trace_rcu_grace_period(rsp->name, | ||
| 1597 | ACCESS_ONCE(rsp->gpnum), | ||
| 1598 | TPS("fqsend")); | ||
| 1506 | cond_resched(); | 1599 | cond_resched(); |
| 1507 | } else { | 1600 | } else { |
| 1508 | /* Deal with stray signal. */ | 1601 | /* Deal with stray signal. */ |
| 1509 | cond_resched(); | 1602 | cond_resched(); |
| 1510 | flush_signals(current); | 1603 | flush_signals(current); |
| 1604 | trace_rcu_grace_period(rsp->name, | ||
| 1605 | ACCESS_ONCE(rsp->gpnum), | ||
| 1606 | TPS("fqswaitsig")); | ||
| 1511 | } | 1607 | } |
| 1512 | j = jiffies_till_next_fqs; | 1608 | j = jiffies_till_next_fqs; |
| 1513 | if (j > HZ) { | 1609 | if (j > HZ) { |
| @@ -1555,13 +1651,17 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 1555 | return; | 1651 | return; |
| 1556 | } | 1652 | } |
| 1557 | rsp->gp_flags = RCU_GP_FLAG_INIT; | 1653 | rsp->gp_flags = RCU_GP_FLAG_INIT; |
| 1654 | trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), | ||
| 1655 | TPS("newreq")); | ||
| 1558 | 1656 | ||
| 1559 | /* | 1657 | /* |
| 1560 | * We can't do wakeups while holding the rnp->lock, as that | 1658 | * We can't do wakeups while holding the rnp->lock, as that |
| 1561 | * could cause possible deadlocks with the rq->lock. Deter | 1659 | * could cause possible deadlocks with the rq->lock. Defer |
| 1562 | * the wakeup to interrupt context. | 1660 | * the wakeup to interrupt context. And don't bother waking |
| 1661 | * up the running kthread. | ||
| 1563 | */ | 1662 | */ |
| 1564 | irq_work_queue(&rsp->wakeup_work); | 1663 | if (current != rsp->gp_kthread) |
| 1664 | irq_work_queue(&rsp->wakeup_work); | ||
| 1565 | } | 1665 | } |
| 1566 | 1666 | ||
| 1567 | /* | 1667 | /* |
| @@ -1857,7 +1957,7 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | |||
| 1857 | RCU_TRACE(mask = rdp->grpmask); | 1957 | RCU_TRACE(mask = rdp->grpmask); |
| 1858 | trace_rcu_grace_period(rsp->name, | 1958 | trace_rcu_grace_period(rsp->name, |
| 1859 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), | 1959 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), |
| 1860 | "cpuofl"); | 1960 | TPS("cpuofl")); |
| 1861 | } | 1961 | } |
| 1862 | 1962 | ||
| 1863 | /* | 1963 | /* |
| @@ -2044,7 +2144,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 2044 | */ | 2144 | */ |
| 2045 | void rcu_check_callbacks(int cpu, int user) | 2145 | void rcu_check_callbacks(int cpu, int user) |
| 2046 | { | 2146 | { |
| 2047 | trace_rcu_utilization("Start scheduler-tick"); | 2147 | trace_rcu_utilization(TPS("Start scheduler-tick")); |
| 2048 | increment_cpu_stall_ticks(); | 2148 | increment_cpu_stall_ticks(); |
| 2049 | if (user || rcu_is_cpu_rrupt_from_idle()) { | 2149 | if (user || rcu_is_cpu_rrupt_from_idle()) { |
| 2050 | 2150 | ||
| @@ -2077,7 +2177,7 @@ void rcu_check_callbacks(int cpu, int user) | |||
| 2077 | rcu_preempt_check_callbacks(cpu); | 2177 | rcu_preempt_check_callbacks(cpu); |
| 2078 | if (rcu_pending(cpu)) | 2178 | if (rcu_pending(cpu)) |
| 2079 | invoke_rcu_core(); | 2179 | invoke_rcu_core(); |
| 2080 | trace_rcu_utilization("End scheduler-tick"); | 2180 | trace_rcu_utilization(TPS("End scheduler-tick")); |
| 2081 | } | 2181 | } |
| 2082 | 2182 | ||
| 2083 | /* | 2183 | /* |
| @@ -2087,7 +2187,10 @@ void rcu_check_callbacks(int cpu, int user) | |||
| 2087 | * | 2187 | * |
| 2088 | * The caller must have suppressed start of new grace periods. | 2188 | * The caller must have suppressed start of new grace periods. |
| 2089 | */ | 2189 | */ |
| 2090 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | 2190 | static void force_qs_rnp(struct rcu_state *rsp, |
| 2191 | int (*f)(struct rcu_data *rsp, bool *isidle, | ||
| 2192 | unsigned long *maxj), | ||
| 2193 | bool *isidle, unsigned long *maxj) | ||
| 2091 | { | 2194 | { |
| 2092 | unsigned long bit; | 2195 | unsigned long bit; |
| 2093 | int cpu; | 2196 | int cpu; |
| @@ -2110,9 +2213,12 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
| 2110 | cpu = rnp->grplo; | 2213 | cpu = rnp->grplo; |
| 2111 | bit = 1; | 2214 | bit = 1; |
| 2112 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { | 2215 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { |
| 2113 | if ((rnp->qsmask & bit) != 0 && | 2216 | if ((rnp->qsmask & bit) != 0) { |
| 2114 | f(per_cpu_ptr(rsp->rda, cpu))) | 2217 | if ((rnp->qsmaskinit & bit) != 0) |
| 2115 | mask |= bit; | 2218 | *isidle = 0; |
| 2219 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) | ||
| 2220 | mask |= bit; | ||
| 2221 | } | ||
| 2116 | } | 2222 | } |
| 2117 | if (mask != 0) { | 2223 | if (mask != 0) { |
| 2118 | 2224 | ||
| @@ -2208,10 +2314,10 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
| 2208 | 2314 | ||
| 2209 | if (cpu_is_offline(smp_processor_id())) | 2315 | if (cpu_is_offline(smp_processor_id())) |
| 2210 | return; | 2316 | return; |
| 2211 | trace_rcu_utilization("Start RCU core"); | 2317 | trace_rcu_utilization(TPS("Start RCU core")); |
| 2212 | for_each_rcu_flavor(rsp) | 2318 | for_each_rcu_flavor(rsp) |
| 2213 | __rcu_process_callbacks(rsp); | 2319 | __rcu_process_callbacks(rsp); |
| 2214 | trace_rcu_utilization("End RCU core"); | 2320 | trace_rcu_utilization(TPS("End RCU core")); |
| 2215 | } | 2321 | } |
| 2216 | 2322 | ||
| 2217 | /* | 2323 | /* |
| @@ -2248,7 +2354,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
| 2248 | * If called from an extended quiescent state, invoke the RCU | 2354 | * If called from an extended quiescent state, invoke the RCU |
| 2249 | * core in order to force a re-evaluation of RCU's idleness. | 2355 | * core in order to force a re-evaluation of RCU's idleness. |
| 2250 | */ | 2356 | */ |
| 2251 | if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) | 2357 | if (!rcu_is_watching() && cpu_online(smp_processor_id())) |
| 2252 | invoke_rcu_core(); | 2358 | invoke_rcu_core(); |
| 2253 | 2359 | ||
| 2254 | /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ | 2360 | /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ |
| @@ -2287,6 +2393,13 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
| 2287 | } | 2393 | } |
| 2288 | 2394 | ||
| 2289 | /* | 2395 | /* |
| 2396 | * RCU callback function to leak a callback. | ||
| 2397 | */ | ||
| 2398 | static void rcu_leak_callback(struct rcu_head *rhp) | ||
| 2399 | { | ||
| 2400 | } | ||
| 2401 | |||
| 2402 | /* | ||
| 2290 | * Helper function for call_rcu() and friends. The cpu argument will | 2403 | * Helper function for call_rcu() and friends. The cpu argument will |
| 2291 | * normally be -1, indicating "currently running CPU". It may specify | 2404 | * normally be -1, indicating "currently running CPU". It may specify |
| 2292 | * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() | 2405 | * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() |
| @@ -2300,7 +2413,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
| 2300 | struct rcu_data *rdp; | 2413 | struct rcu_data *rdp; |
| 2301 | 2414 | ||
| 2302 | WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ | 2415 | WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ |
| 2303 | debug_rcu_head_queue(head); | 2416 | if (debug_rcu_head_queue(head)) { |
| 2417 | /* Probable double call_rcu(), so leak the callback. */ | ||
| 2418 | ACCESS_ONCE(head->func) = rcu_leak_callback; | ||
| 2419 | WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); | ||
| 2420 | return; | ||
| 2421 | } | ||
| 2304 | head->func = func; | 2422 | head->func = func; |
| 2305 | head->next = NULL; | 2423 | head->next = NULL; |
| 2306 | 2424 | ||
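The new branch above changes the failure mode of a classic bug: queuing the same rcu_head a second time before the first grace period has elapsed. With RCU-head debug objects configured in (as best understood here), debug_rcu_head_queue() now flags the duplicate, and the callback is deliberately leaked via rcu_leak_callback() and warned about instead of corrupting the callback list. A hedged illustration of that bug class; the structure and helpers are invented for the example:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct demo_node {			/* hypothetical object */
		struct rcu_head rh;
		int payload;
	};

	static void demo_free_rcu(struct rcu_head *rhp)
	{
		kfree(container_of(rhp, struct demo_node, rh));
	}

	static void demo_buggy_release(struct demo_node *p)
	{
		call_rcu(&p->rh, demo_free_rcu);
		call_rcu(&p->rh, demo_free_rcu);	/* duplicate: now warned about and leaked */
	}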
| @@ -2706,10 +2824,13 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) | |||
| 2706 | 2824 | ||
| 2707 | for_each_rcu_flavor(rsp) { | 2825 | for_each_rcu_flavor(rsp) { |
| 2708 | rdp = per_cpu_ptr(rsp->rda, cpu); | 2826 | rdp = per_cpu_ptr(rsp->rda, cpu); |
| 2709 | if (rdp->qlen != rdp->qlen_lazy) | 2827 | if (!rdp->nxtlist) |
| 2828 | continue; | ||
| 2829 | hc = true; | ||
| 2830 | if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { | ||
| 2710 | al = false; | 2831 | al = false; |
| 2711 | if (rdp->nxtlist) | 2832 | break; |
| 2712 | hc = true; | 2833 | } |
| 2713 | } | 2834 | } |
| 2714 | if (all_lazy) | 2835 | if (all_lazy) |
| 2715 | *all_lazy = al; | 2836 | *all_lazy = al; |
| @@ -2720,7 +2841,7 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) | |||
| 2720 | * Helper function for _rcu_barrier() tracing. If tracing is disabled, | 2841 | * Helper function for _rcu_barrier() tracing. If tracing is disabled, |
| 2721 | * the compiler is expected to optimize this away. | 2842 | * the compiler is expected to optimize this away. |
| 2722 | */ | 2843 | */ |
| 2723 | static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, | 2844 | static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s, |
| 2724 | int cpu, unsigned long done) | 2845 | int cpu, unsigned long done) |
| 2725 | { | 2846 | { |
| 2726 | trace_rcu_barrier(rsp->name, s, cpu, | 2847 | trace_rcu_barrier(rsp->name, s, cpu, |
| @@ -2785,9 +2906,20 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
| 2785 | * transition. The "if" expression below therefore rounds the old | 2906 | * transition. The "if" expression below therefore rounds the old |
| 2786 | * value up to the next even number and adds two before comparing. | 2907 | * value up to the next even number and adds two before comparing. |
| 2787 | */ | 2908 | */ |
| 2788 | snap_done = ACCESS_ONCE(rsp->n_barrier_done); | 2909 | snap_done = rsp->n_barrier_done; |
| 2789 | _rcu_barrier_trace(rsp, "Check", -1, snap_done); | 2910 | _rcu_barrier_trace(rsp, "Check", -1, snap_done); |
| 2790 | if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { | 2911 | |
| 2912 | /* | ||
| 2913 | * If the value in snap is odd, we needed to wait for the current | ||
| 2914 | * rcu_barrier() to complete, then wait for the next one, in other | ||
| 2915 | * words, we need the value of snap_done to be three larger than | ||
| 2916 | * the value of snap. On the other hand, if the value in snap is | ||
| 2917 | * even, we only had to wait for the next rcu_barrier() to complete, | ||
| 2918 | * in other words, we need the value of snap_done to be only two | ||
| 2919 | * greater than the value of snap. The "(snap + 3) & ~0x1" computes | ||
| 2920 | * this for us (thank you, Linus!). | ||
| 2921 | */ | ||
| 2922 | if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) { | ||
| 2791 | _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); | 2923 | _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); |
| 2792 | smp_mb(); /* caller's subsequent code after above check. */ | 2924 | smp_mb(); /* caller's subsequent code after above check. */ |
| 2793 | mutex_unlock(&rsp->barrier_mutex); | 2925 | mutex_unlock(&rsp->barrier_mutex); |
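The comment credits Linus with the "(snap + 3) & ~0x1" expression that folds the odd and even cases of snap into a single comparison. A quick stand-alone check of that arithmetic (ordinary user-space C, not part of the patch):

	#include <assert.h>

	int main(void)
	{
		unsigned long snap;

		for (snap = 0; snap < 16; snap++) {
			unsigned long need = (snap + 3) & ~0x1UL;

			if (snap & 0x1)		/* odd: wait out the current barrier plus the next */
				assert(need == snap + 3);
			else			/* even: only the next barrier must complete */
				assert(need == snap + 2);
		}
		return 0;
	}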
| @@ -2930,6 +3062,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
| 2930 | rdp->blimit = blimit; | 3062 | rdp->blimit = blimit; |
| 2931 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ | 3063 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ |
| 2932 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 3064 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
| 3065 | rcu_sysidle_init_percpu_data(rdp->dynticks); | ||
| 2933 | atomic_set(&rdp->dynticks->dynticks, | 3066 | atomic_set(&rdp->dynticks->dynticks, |
| 2934 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); | 3067 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); |
| 2935 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 3068 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| @@ -2952,7 +3085,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
| 2952 | rdp->completed = rnp->completed; | 3085 | rdp->completed = rnp->completed; |
| 2953 | rdp->passed_quiesce = 0; | 3086 | rdp->passed_quiesce = 0; |
| 2954 | rdp->qs_pending = 0; | 3087 | rdp->qs_pending = 0; |
| 2955 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); | 3088 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); |
| 2956 | } | 3089 | } |
| 2957 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ | 3090 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ |
| 2958 | rnp = rnp->parent; | 3091 | rnp = rnp->parent; |
| @@ -2982,7 +3115,7 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
| 2982 | struct rcu_node *rnp = rdp->mynode; | 3115 | struct rcu_node *rnp = rdp->mynode; |
| 2983 | struct rcu_state *rsp; | 3116 | struct rcu_state *rsp; |
| 2984 | 3117 | ||
| 2985 | trace_rcu_utilization("Start CPU hotplug"); | 3118 | trace_rcu_utilization(TPS("Start CPU hotplug")); |
| 2986 | switch (action) { | 3119 | switch (action) { |
| 2987 | case CPU_UP_PREPARE: | 3120 | case CPU_UP_PREPARE: |
| 2988 | case CPU_UP_PREPARE_FROZEN: | 3121 | case CPU_UP_PREPARE_FROZEN: |
| @@ -3011,7 +3144,26 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
| 3011 | default: | 3144 | default: |
| 3012 | break; | 3145 | break; |
| 3013 | } | 3146 | } |
| 3014 | trace_rcu_utilization("End CPU hotplug"); | 3147 | trace_rcu_utilization(TPS("End CPU hotplug")); |
| 3148 | return NOTIFY_OK; | ||
| 3149 | } | ||
| 3150 | |||
| 3151 | static int rcu_pm_notify(struct notifier_block *self, | ||
| 3152 | unsigned long action, void *hcpu) | ||
| 3153 | { | ||
| 3154 | switch (action) { | ||
| 3155 | case PM_HIBERNATION_PREPARE: | ||
| 3156 | case PM_SUSPEND_PREPARE: | ||
| 3157 | if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ | ||
| 3158 | rcu_expedited = 1; | ||
| 3159 | break; | ||
| 3160 | case PM_POST_HIBERNATION: | ||
| 3161 | case PM_POST_SUSPEND: | ||
| 3162 | rcu_expedited = 0; | ||
| 3163 | break; | ||
| 3164 | default: | ||
| 3165 | break; | ||
| 3166 | } | ||
| 3015 | return NOTIFY_OK; | 3167 | return NOTIFY_OK; |
| 3016 | } | 3168 | } |
| 3017 | 3169 | ||
| @@ -3166,7 +3318,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
| 3166 | 3318 | ||
| 3167 | /* | 3319 | /* |
| 3168 | * Compute the rcu_node tree geometry from kernel parameters. This cannot | 3320 | * Compute the rcu_node tree geometry from kernel parameters. This cannot |
| 3169 | * replace the definitions in rcutree.h because those are needed to size | 3321 | * replace the definitions in tree.h because those are needed to size |
| 3170 | * the ->node array in the rcu_state structure. | 3322 | * the ->node array in the rcu_state structure. |
| 3171 | */ | 3323 | */ |
| 3172 | static void __init rcu_init_geometry(void) | 3324 | static void __init rcu_init_geometry(void) |
| @@ -3245,8 +3397,8 @@ void __init rcu_init(void) | |||
| 3245 | 3397 | ||
| 3246 | rcu_bootup_announce(); | 3398 | rcu_bootup_announce(); |
| 3247 | rcu_init_geometry(); | 3399 | rcu_init_geometry(); |
| 3248 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); | ||
| 3249 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 3400 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); |
| 3401 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); | ||
| 3250 | __rcu_init_preempt(); | 3402 | __rcu_init_preempt(); |
| 3251 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 3403 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
| 3252 | 3404 | ||
| @@ -3256,8 +3408,9 @@ void __init rcu_init(void) | |||
| 3256 | * or the scheduler are operational. | 3408 | * or the scheduler are operational. |
| 3257 | */ | 3409 | */ |
| 3258 | cpu_notifier(rcu_cpu_notify, 0); | 3410 | cpu_notifier(rcu_cpu_notify, 0); |
| 3411 | pm_notifier(rcu_pm_notify, 0); | ||
| 3259 | for_each_online_cpu(cpu) | 3412 | for_each_online_cpu(cpu) |
| 3260 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 3413 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); |
| 3261 | } | 3414 | } |
| 3262 | 3415 | ||
| 3263 | #include "rcutree_plugin.h" | 3416 | #include "tree_plugin.h" |
diff --git a/kernel/rcutree.h b/kernel/rcu/tree.h index b3832581043c..52be957c9fe2 100644 --- a/kernel/rcutree.h +++ b/kernel/rcu/tree.h | |||
| @@ -88,6 +88,14 @@ struct rcu_dynticks { | |||
| 88 | /* Process level is worth LLONG_MAX/2. */ | 88 | /* Process level is worth LLONG_MAX/2. */ |
| 89 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ | 89 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
| 90 | atomic_t dynticks; /* Even value for idle, else odd. */ | 90 | atomic_t dynticks; /* Even value for idle, else odd. */ |
| 91 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
| 92 | long long dynticks_idle_nesting; | ||
| 93 | /* irq/process nesting level from idle. */ | ||
| 94 | atomic_t dynticks_idle; /* Even value for idle, else odd. */ | ||
| 95 | /* "Idle" excludes userspace execution. */ | ||
| 96 | unsigned long dynticks_idle_jiffies; | ||
| 97 | /* End of last non-NMI non-idle period. */ | ||
| 98 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
| 91 | #ifdef CONFIG_RCU_FAST_NO_HZ | 99 | #ifdef CONFIG_RCU_FAST_NO_HZ |
| 92 | bool all_lazy; /* Are all CPU's CBs lazy? */ | 100 | bool all_lazy; /* Are all CPU's CBs lazy? */ |
| 93 | unsigned long nonlazy_posted; | 101 | unsigned long nonlazy_posted; |
| @@ -96,6 +104,8 @@ struct rcu_dynticks { | |||
| 96 | /* idle-period nonlazy_posted snapshot. */ | 104 | /* idle-period nonlazy_posted snapshot. */ |
| 97 | unsigned long last_accelerate; | 105 | unsigned long last_accelerate; |
| 98 | /* Last jiffy CBs were accelerated. */ | 106 | /* Last jiffy CBs were accelerated. */ |
| 107 | unsigned long last_advance_all; | ||
| 108 | /* Last jiffy CBs were all advanced. */ | ||
| 99 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ | 109 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ |
| 100 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 110 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
| 101 | }; | 111 | }; |
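Like the existing ->dynticks counter, the new ->dynticks_idle counter follows the even-means-idle, odd-means-non-idle convention noted in the comments, which is why the snapshot tests elsewhere in this diff examine bit 0. A one-line helper expressing that convention (illustrative, not part of the patch):

	/* An even snapshot means the CPU was in an extended quiescent state when sampled. */
	static inline bool dynticks_snap_in_eqs(int snap)
	{
		return (snap & 0x1) == 0;
	}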
| @@ -445,7 +455,7 @@ struct rcu_state { | |||
| 445 | /* for CPU stalls. */ | 455 | /* for CPU stalls. */ |
| 446 | unsigned long gp_max; /* Maximum GP duration in */ | 456 | unsigned long gp_max; /* Maximum GP duration in */ |
| 447 | /* jiffies. */ | 457 | /* jiffies. */ |
| 448 | char *name; /* Name of structure. */ | 458 | const char *name; /* Name of structure. */ |
| 449 | char abbr; /* Abbreviated name. */ | 459 | char abbr; /* Abbreviated name. */ |
| 450 | struct list_head flavors; /* List of RCU flavors. */ | 460 | struct list_head flavors; /* List of RCU flavors. */ |
| 451 | struct irq_work wakeup_work; /* Postponed wakeups */ | 461 | struct irq_work wakeup_work; /* Postponed wakeups */ |
| @@ -545,6 +555,15 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); | |||
| 545 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); | 555 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); |
| 546 | static void rcu_kick_nohz_cpu(int cpu); | 556 | static void rcu_kick_nohz_cpu(int cpu); |
| 547 | static bool init_nocb_callback_list(struct rcu_data *rdp); | 557 | static bool init_nocb_callback_list(struct rcu_data *rdp); |
| 558 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); | ||
| 559 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); | ||
| 560 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
| 561 | unsigned long *maxj); | ||
| 562 | static bool is_sysidle_rcu_state(struct rcu_state *rsp); | ||
| 563 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
| 564 | unsigned long maxj); | ||
| 565 | static void rcu_bind_gp_kthread(void); | ||
| 566 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); | ||
| 548 | 567 | ||
| 549 | #endif /* #ifndef RCU_TREE_NONCORE */ | 568 | #endif /* #ifndef RCU_TREE_NONCORE */ |
| 550 | 569 | ||
diff --git a/kernel/rcutree_plugin.h b/kernel/rcu/tree_plugin.h index 769e12e3151b..3822ac0c4b27 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -28,7 +28,7 @@ | |||
| 28 | #include <linux/gfp.h> | 28 | #include <linux/gfp.h> |
| 29 | #include <linux/oom.h> | 29 | #include <linux/oom.h> |
| 30 | #include <linux/smpboot.h> | 30 | #include <linux/smpboot.h> |
| 31 | #include <linux/tick.h> | 31 | #include "../time/tick-internal.h" |
| 32 | 32 | ||
| 33 | #define RCU_KTHREAD_PRIO 1 | 33 | #define RCU_KTHREAD_PRIO 1 |
| 34 | 34 | ||
| @@ -96,10 +96,15 @@ static void __init rcu_bootup_announce_oddness(void) | |||
| 96 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ | 96 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ |
| 97 | #ifdef CONFIG_RCU_NOCB_CPU_ALL | 97 | #ifdef CONFIG_RCU_NOCB_CPU_ALL |
| 98 | pr_info("\tOffload RCU callbacks from all CPUs\n"); | 98 | pr_info("\tOffload RCU callbacks from all CPUs\n"); |
| 99 | cpumask_setall(rcu_nocb_mask); | 99 | cpumask_copy(rcu_nocb_mask, cpu_possible_mask); |
| 100 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ | 100 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ |
| 101 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ | 101 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ |
| 102 | if (have_rcu_nocb_mask) { | 102 | if (have_rcu_nocb_mask) { |
| 103 | if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { | ||
| 104 | pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); | ||
| 105 | cpumask_and(rcu_nocb_mask, cpu_possible_mask, | ||
| 106 | rcu_nocb_mask); | ||
| 107 | } | ||
| 103 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); | 108 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); |
| 104 | pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); | 109 | pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); |
| 105 | if (rcu_nocb_poll) | 110 | if (rcu_nocb_poll) |
| @@ -110,9 +115,7 @@ static void __init rcu_bootup_announce_oddness(void) | |||
| 110 | 115 | ||
| 111 | #ifdef CONFIG_TREE_PREEMPT_RCU | 116 | #ifdef CONFIG_TREE_PREEMPT_RCU |
| 112 | 117 | ||
| 113 | struct rcu_state rcu_preempt_state = | 118 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); |
| 114 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); | ||
| 115 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); | ||
| 116 | static struct rcu_state *rcu_state = &rcu_preempt_state; | 119 | static struct rcu_state *rcu_state = &rcu_preempt_state; |
| 117 | 120 | ||
| 118 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); | 121 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); |
| @@ -169,7 +172,7 @@ static void rcu_preempt_qs(int cpu) | |||
| 169 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); | 172 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); |
| 170 | 173 | ||
| 171 | if (rdp->passed_quiesce == 0) | 174 | if (rdp->passed_quiesce == 0) |
| 172 | trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); | 175 | trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); |
| 173 | rdp->passed_quiesce = 1; | 176 | rdp->passed_quiesce = 1; |
| 174 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | 177 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; |
| 175 | } | 178 | } |
| @@ -388,7 +391,7 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 388 | np = rcu_next_node_entry(t, rnp); | 391 | np = rcu_next_node_entry(t, rnp); |
| 389 | list_del_init(&t->rcu_node_entry); | 392 | list_del_init(&t->rcu_node_entry); |
| 390 | t->rcu_blocked_node = NULL; | 393 | t->rcu_blocked_node = NULL; |
| 391 | trace_rcu_unlock_preempted_task("rcu_preempt", | 394 | trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), |
| 392 | rnp->gpnum, t->pid); | 395 | rnp->gpnum, t->pid); |
| 393 | if (&t->rcu_node_entry == rnp->gp_tasks) | 396 | if (&t->rcu_node_entry == rnp->gp_tasks) |
| 394 | rnp->gp_tasks = np; | 397 | rnp->gp_tasks = np; |
| @@ -412,7 +415,7 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 412 | */ | 415 | */ |
| 413 | empty_exp_now = !rcu_preempted_readers_exp(rnp); | 416 | empty_exp_now = !rcu_preempted_readers_exp(rnp); |
| 414 | if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { | 417 | if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { |
| 415 | trace_rcu_quiescent_state_report("preempt_rcu", | 418 | trace_rcu_quiescent_state_report(TPS("preempt_rcu"), |
| 416 | rnp->gpnum, | 419 | rnp->gpnum, |
| 417 | 0, rnp->qsmask, | 420 | 0, rnp->qsmask, |
| 418 | rnp->level, | 421 | rnp->level, |
| @@ -662,7 +665,7 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
| 662 | 665 | ||
| 663 | static void rcu_preempt_do_callbacks(void) | 666 | static void rcu_preempt_do_callbacks(void) |
| 664 | { | 667 | { |
| 665 | rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); | 668 | rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); |
| 666 | } | 669 | } |
| 667 | 670 | ||
| 668 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 671 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| @@ -1130,7 +1133,7 @@ void exit_rcu(void) | |||
| 1130 | 1133 | ||
| 1131 | #ifdef CONFIG_RCU_BOOST | 1134 | #ifdef CONFIG_RCU_BOOST |
| 1132 | 1135 | ||
| 1133 | #include "rtmutex_common.h" | 1136 | #include "../rtmutex_common.h" |
| 1134 | 1137 | ||
| 1135 | #ifdef CONFIG_RCU_TRACE | 1138 | #ifdef CONFIG_RCU_TRACE |
| 1136 | 1139 | ||
| @@ -1250,12 +1253,12 @@ static int rcu_boost_kthread(void *arg) | |||
| 1250 | int spincnt = 0; | 1253 | int spincnt = 0; |
| 1251 | int more2boost; | 1254 | int more2boost; |
| 1252 | 1255 | ||
| 1253 | trace_rcu_utilization("Start boost kthread@init"); | 1256 | trace_rcu_utilization(TPS("Start boost kthread@init")); |
| 1254 | for (;;) { | 1257 | for (;;) { |
| 1255 | rnp->boost_kthread_status = RCU_KTHREAD_WAITING; | 1258 | rnp->boost_kthread_status = RCU_KTHREAD_WAITING; |
| 1256 | trace_rcu_utilization("End boost kthread@rcu_wait"); | 1259 | trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); |
| 1257 | rcu_wait(rnp->boost_tasks || rnp->exp_tasks); | 1260 | rcu_wait(rnp->boost_tasks || rnp->exp_tasks); |
| 1258 | trace_rcu_utilization("Start boost kthread@rcu_wait"); | 1261 | trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); |
| 1259 | rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; | 1262 | rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; |
| 1260 | more2boost = rcu_boost(rnp); | 1263 | more2boost = rcu_boost(rnp); |
| 1261 | if (more2boost) | 1264 | if (more2boost) |
| @@ -1264,14 +1267,14 @@ static int rcu_boost_kthread(void *arg) | |||
| 1264 | spincnt = 0; | 1267 | spincnt = 0; |
| 1265 | if (spincnt > 10) { | 1268 | if (spincnt > 10) { |
| 1266 | rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; | 1269 | rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; |
| 1267 | trace_rcu_utilization("End boost kthread@rcu_yield"); | 1270 | trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); |
| 1268 | schedule_timeout_interruptible(2); | 1271 | schedule_timeout_interruptible(2); |
| 1269 | trace_rcu_utilization("Start boost kthread@rcu_yield"); | 1272 | trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); |
| 1270 | spincnt = 0; | 1273 | spincnt = 0; |
| 1271 | } | 1274 | } |
| 1272 | } | 1275 | } |
| 1273 | /* NOTREACHED */ | 1276 | /* NOTREACHED */ |
| 1274 | trace_rcu_utilization("End boost kthread@notreached"); | 1277 | trace_rcu_utilization(TPS("End boost kthread@notreached")); |
| 1275 | return 0; | 1278 | return 0; |
| 1276 | } | 1279 | } |
| 1277 | 1280 | ||
| @@ -1334,7 +1337,7 @@ static void invoke_rcu_callbacks_kthread(void) | |||
| 1334 | */ | 1337 | */ |
| 1335 | static bool rcu_is_callbacks_kthread(void) | 1338 | static bool rcu_is_callbacks_kthread(void) |
| 1336 | { | 1339 | { |
| 1337 | return __get_cpu_var(rcu_cpu_kthread_task) == current; | 1340 | return __this_cpu_read(rcu_cpu_kthread_task) == current; |
| 1338 | } | 1341 | } |
| 1339 | 1342 | ||
| 1340 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) | 1343 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) |
| @@ -1384,8 +1387,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | |||
| 1384 | 1387 | ||
| 1385 | static void rcu_kthread_do_work(void) | 1388 | static void rcu_kthread_do_work(void) |
| 1386 | { | 1389 | { |
| 1387 | rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); | 1390 | rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); |
| 1388 | rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); | 1391 | rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); |
| 1389 | rcu_preempt_do_callbacks(); | 1392 | rcu_preempt_do_callbacks(); |
| 1390 | } | 1393 | } |
| 1391 | 1394 | ||
| @@ -1404,7 +1407,7 @@ static void rcu_cpu_kthread_park(unsigned int cpu) | |||
| 1404 | 1407 | ||
| 1405 | static int rcu_cpu_kthread_should_run(unsigned int cpu) | 1408 | static int rcu_cpu_kthread_should_run(unsigned int cpu) |
| 1406 | { | 1409 | { |
| 1407 | return __get_cpu_var(rcu_cpu_has_work); | 1410 | return __this_cpu_read(rcu_cpu_has_work); |
| 1408 | } | 1411 | } |
| 1409 | 1412 | ||
| 1410 | /* | 1413 | /* |
| @@ -1414,12 +1417,12 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu) | |||
| 1414 | */ | 1417 | */ |
| 1415 | static void rcu_cpu_kthread(unsigned int cpu) | 1418 | static void rcu_cpu_kthread(unsigned int cpu) |
| 1416 | { | 1419 | { |
| 1417 | unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); | 1420 | unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); |
| 1418 | char work, *workp = &__get_cpu_var(rcu_cpu_has_work); | 1421 | char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); |
| 1419 | int spincnt; | 1422 | int spincnt; |
| 1420 | 1423 | ||
| 1421 | for (spincnt = 0; spincnt < 10; spincnt++) { | 1424 | for (spincnt = 0; spincnt < 10; spincnt++) { |
| 1422 | trace_rcu_utilization("Start CPU kthread@rcu_wait"); | 1425 | trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); |
| 1423 | local_bh_disable(); | 1426 | local_bh_disable(); |
| 1424 | *statusp = RCU_KTHREAD_RUNNING; | 1427 | *statusp = RCU_KTHREAD_RUNNING; |
| 1425 | this_cpu_inc(rcu_cpu_kthread_loops); | 1428 | this_cpu_inc(rcu_cpu_kthread_loops); |
| @@ -1431,15 +1434,15 @@ static void rcu_cpu_kthread(unsigned int cpu) | |||
| 1431 | rcu_kthread_do_work(); | 1434 | rcu_kthread_do_work(); |
| 1432 | local_bh_enable(); | 1435 | local_bh_enable(); |
| 1433 | if (*workp == 0) { | 1436 | if (*workp == 0) { |
| 1434 | trace_rcu_utilization("End CPU kthread@rcu_wait"); | 1437 | trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); |
| 1435 | *statusp = RCU_KTHREAD_WAITING; | 1438 | *statusp = RCU_KTHREAD_WAITING; |
| 1436 | return; | 1439 | return; |
| 1437 | } | 1440 | } |
| 1438 | } | 1441 | } |
| 1439 | *statusp = RCU_KTHREAD_YIELDING; | 1442 | *statusp = RCU_KTHREAD_YIELDING; |
| 1440 | trace_rcu_utilization("Start CPU kthread@rcu_yield"); | 1443 | trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); |
| 1441 | schedule_timeout_interruptible(2); | 1444 | schedule_timeout_interruptible(2); |
| 1442 | trace_rcu_utilization("End CPU kthread@rcu_yield"); | 1445 | trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); |
| 1443 | *statusp = RCU_KTHREAD_WAITING; | 1446 | *statusp = RCU_KTHREAD_WAITING; |
| 1444 | } | 1447 | } |
| 1445 | 1448 | ||
| @@ -1632,17 +1635,23 @@ module_param(rcu_idle_lazy_gp_delay, int, 0644); | |||
| 1632 | extern int tick_nohz_enabled; | 1635 | extern int tick_nohz_enabled; |
| 1633 | 1636 | ||
| 1634 | /* | 1637 | /* |
| 1635 | * Try to advance callbacks for all flavors of RCU on the current CPU. | 1638 | * Try to advance callbacks for all flavors of RCU on the current CPU, but |
| 1636 | * Afterwards, if there are any callbacks ready for immediate invocation, | 1639 | * only if it has been awhile since the last time we did so. Afterwards, |
| 1637 | * return true. | 1640 | * if there are any callbacks ready for immediate invocation, return true. |
| 1638 | */ | 1641 | */ |
| 1639 | static bool rcu_try_advance_all_cbs(void) | 1642 | static bool rcu_try_advance_all_cbs(void) |
| 1640 | { | 1643 | { |
| 1641 | bool cbs_ready = false; | 1644 | bool cbs_ready = false; |
| 1642 | struct rcu_data *rdp; | 1645 | struct rcu_data *rdp; |
| 1646 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 1643 | struct rcu_node *rnp; | 1647 | struct rcu_node *rnp; |
| 1644 | struct rcu_state *rsp; | 1648 | struct rcu_state *rsp; |
| 1645 | 1649 | ||
| 1650 | /* Exit early if we advanced recently. */ | ||
| 1651 | if (jiffies == rdtp->last_advance_all) | ||
| 1652 | return 0; | ||
| 1653 | rdtp->last_advance_all = jiffies; | ||
| 1654 | |||
| 1646 | for_each_rcu_flavor(rsp) { | 1655 | for_each_rcu_flavor(rsp) { |
| 1647 | rdp = this_cpu_ptr(rsp->rda); | 1656 | rdp = this_cpu_ptr(rsp->rda); |
| 1648 | rnp = rdp->mynode; | 1657 | rnp = rdp->mynode; |
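The early exit added in the hunk above is a simple once-per-jiffy rate limit keyed on ->last_advance_all. A minimal standalone sketch of the same pattern (illustrative only; "last" is a hypothetical caller-owned timestamp, not a field from the patch):

	/* Gate an operation to at most once per jiffies tick. */
	static bool advance_due(unsigned long *last)
	{
		if (jiffies == *last)
			return false;	/* already advanced during this tick */
		*last = jiffies;
		return true;
	}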
| @@ -1741,6 +1750,8 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 1741 | */ | 1750 | */ |
| 1742 | if (rdtp->all_lazy && | 1751 | if (rdtp->all_lazy && |
| 1743 | rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { | 1752 | rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { |
| 1753 | rdtp->all_lazy = false; | ||
| 1754 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; | ||
| 1744 | invoke_rcu_core(); | 1755 | invoke_rcu_core(); |
| 1745 | return; | 1756 | return; |
| 1746 | } | 1757 | } |
| @@ -1770,17 +1781,11 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 1770 | */ | 1781 | */ |
| 1771 | static void rcu_cleanup_after_idle(int cpu) | 1782 | static void rcu_cleanup_after_idle(int cpu) |
| 1772 | { | 1783 | { |
| 1773 | struct rcu_data *rdp; | ||
| 1774 | struct rcu_state *rsp; | ||
| 1775 | 1784 | ||
| 1776 | if (rcu_is_nocb_cpu(cpu)) | 1785 | if (rcu_is_nocb_cpu(cpu)) |
| 1777 | return; | 1786 | return; |
| 1778 | rcu_try_advance_all_cbs(); | 1787 | if (rcu_try_advance_all_cbs()) |
| 1779 | for_each_rcu_flavor(rsp) { | 1788 | invoke_rcu_core(); |
| 1780 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 1781 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | ||
| 1782 | invoke_rcu_core(); | ||
| 1783 | } | ||
| 1784 | } | 1789 | } |
| 1785 | 1790 | ||
| 1786 | /* | 1791 | /* |
| @@ -2110,15 +2115,22 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | |||
| 2110 | 2115 | ||
| 2111 | /* If we are not being polled and there is a kthread, awaken it ... */ | 2116 | /* If we are not being polled and there is a kthread, awaken it ... */ |
| 2112 | t = ACCESS_ONCE(rdp->nocb_kthread); | 2117 | t = ACCESS_ONCE(rdp->nocb_kthread); |
| 2113 | if (rcu_nocb_poll | !t) | 2118 | if (rcu_nocb_poll || !t) { |
| 2119 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2120 | TPS("WakeNotPoll")); | ||
| 2114 | return; | 2121 | return; |
| 2122 | } | ||
| 2115 | len = atomic_long_read(&rdp->nocb_q_count); | 2123 | len = atomic_long_read(&rdp->nocb_q_count); |
| 2116 | if (old_rhpp == &rdp->nocb_head) { | 2124 | if (old_rhpp == &rdp->nocb_head) { |
| 2117 | wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ | 2125 | wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ |
| 2118 | rdp->qlen_last_fqs_check = 0; | 2126 | rdp->qlen_last_fqs_check = 0; |
| 2127 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); | ||
| 2119 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { | 2128 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { |
| 2120 | wake_up_process(t); /* ... or if many callbacks queued. */ | 2129 | wake_up_process(t); /* ... or if many callbacks queued. */ |
| 2121 | rdp->qlen_last_fqs_check = LONG_MAX / 2; | 2130 | rdp->qlen_last_fqs_check = LONG_MAX / 2; |
| 2131 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); | ||
| 2132 | } else { | ||
| 2133 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); | ||
| 2122 | } | 2134 | } |
| 2123 | return; | 2135 | return; |
| 2124 | } | 2136 | } |
| @@ -2142,10 +2154,12 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | |||
| 2142 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) | 2154 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) |
| 2143 | trace_rcu_kfree_callback(rdp->rsp->name, rhp, | 2155 | trace_rcu_kfree_callback(rdp->rsp->name, rhp, |
| 2144 | (unsigned long)rhp->func, | 2156 | (unsigned long)rhp->func, |
| 2145 | rdp->qlen_lazy, rdp->qlen); | 2157 | -atomic_long_read(&rdp->nocb_q_count_lazy), |
| 2158 | -atomic_long_read(&rdp->nocb_q_count)); | ||
| 2146 | else | 2159 | else |
| 2147 | trace_rcu_callback(rdp->rsp->name, rhp, | 2160 | trace_rcu_callback(rdp->rsp->name, rhp, |
| 2148 | rdp->qlen_lazy, rdp->qlen); | 2161 | -atomic_long_read(&rdp->nocb_q_count_lazy), |
| 2162 | -atomic_long_read(&rdp->nocb_q_count)); | ||
| 2149 | return 1; | 2163 | return 1; |
| 2150 | } | 2164 | } |
| 2151 | 2165 | ||
| @@ -2202,7 +2216,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
| 2202 | * Wait for the grace period. Do so interruptibly to avoid messing | 2216 | * Wait for the grace period. Do so interruptibly to avoid messing |
| 2203 | * up the load average. | 2217 | * up the load average. |
| 2204 | */ | 2218 | */ |
| 2205 | trace_rcu_future_gp(rnp, rdp, c, "StartWait"); | 2219 | trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); |
| 2206 | for (;;) { | 2220 | for (;;) { |
| 2207 | wait_event_interruptible( | 2221 | wait_event_interruptible( |
| 2208 | rnp->nocb_gp_wq[c & 0x1], | 2222 | rnp->nocb_gp_wq[c & 0x1], |
| @@ -2210,9 +2224,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
| 2210 | if (likely(d)) | 2224 | if (likely(d)) |
| 2211 | break; | 2225 | break; |
| 2212 | flush_signals(current); | 2226 | flush_signals(current); |
| 2213 | trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); | 2227 | trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); |
| 2214 | } | 2228 | } |
| 2215 | trace_rcu_future_gp(rnp, rdp, c, "EndWait"); | 2229 | trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); |
| 2216 | smp_mb(); /* Ensure that CB invocation happens after GP end. */ | 2230 | smp_mb(); /* Ensure that CB invocation happens after GP end. */ |
| 2217 | } | 2231 | } |
| 2218 | 2232 | ||
| @@ -2223,6 +2237,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
| 2223 | static int rcu_nocb_kthread(void *arg) | 2237 | static int rcu_nocb_kthread(void *arg) |
| 2224 | { | 2238 | { |
| 2225 | int c, cl; | 2239 | int c, cl; |
| 2240 | bool firsttime = 1; | ||
| 2226 | struct rcu_head *list; | 2241 | struct rcu_head *list; |
| 2227 | struct rcu_head *next; | 2242 | struct rcu_head *next; |
| 2228 | struct rcu_head **tail; | 2243 | struct rcu_head **tail; |
| @@ -2231,14 +2246,27 @@ static int rcu_nocb_kthread(void *arg) | |||
| 2231 | /* Each pass through this loop invokes one batch of callbacks */ | 2246 | /* Each pass through this loop invokes one batch of callbacks */ |
| 2232 | for (;;) { | 2247 | for (;;) { |
| 2233 | /* If not polling, wait for next batch of callbacks. */ | 2248 | /* If not polling, wait for next batch of callbacks. */ |
| 2234 | if (!rcu_nocb_poll) | 2249 | if (!rcu_nocb_poll) { |
| 2250 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2251 | TPS("Sleep")); | ||
| 2235 | wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); | 2252 | wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); |
| 2253 | } else if (firsttime) { | ||
| 2254 | firsttime = 0; | ||
| 2255 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2256 | TPS("Poll")); | ||
| 2257 | } | ||
| 2236 | list = ACCESS_ONCE(rdp->nocb_head); | 2258 | list = ACCESS_ONCE(rdp->nocb_head); |
| 2237 | if (!list) { | 2259 | if (!list) { |
| 2260 | if (!rcu_nocb_poll) | ||
| 2261 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2262 | TPS("WokeEmpty")); | ||
| 2238 | schedule_timeout_interruptible(1); | 2263 | schedule_timeout_interruptible(1); |
| 2239 | flush_signals(current); | 2264 | flush_signals(current); |
| 2240 | continue; | 2265 | continue; |
| 2241 | } | 2266 | } |
| 2267 | firsttime = 1; | ||
| 2268 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2269 | TPS("WokeNonEmpty")); | ||
| 2242 | 2270 | ||
| 2243 | /* | 2271 | /* |
| 2244 | * Extract queued callbacks, update counts, and wait | 2272 | * Extract queued callbacks, update counts, and wait |
| @@ -2259,7 +2287,11 @@ static int rcu_nocb_kthread(void *arg) | |||
| 2259 | next = list->next; | 2287 | next = list->next; |
| 2260 | /* Wait for enqueuing to complete, if needed. */ | 2288 | /* Wait for enqueuing to complete, if needed. */ |
| 2261 | while (next == NULL && &list->next != tail) { | 2289 | while (next == NULL && &list->next != tail) { |
| 2290 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2291 | TPS("WaitQueue")); | ||
| 2262 | schedule_timeout_interruptible(1); | 2292 | schedule_timeout_interruptible(1); |
| 2293 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2294 | TPS("WokeQueue")); | ||
| 2263 | next = list->next; | 2295 | next = list->next; |
| 2264 | } | 2296 | } |
| 2265 | debug_rcu_head_unqueue(list); | 2297 | debug_rcu_head_unqueue(list); |
| @@ -2375,3 +2407,425 @@ static void rcu_kick_nohz_cpu(int cpu) | |||
| 2375 | smp_send_reschedule(cpu); | 2407 | smp_send_reschedule(cpu); |
| 2376 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ | 2408 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ |
| 2377 | } | 2409 | } |
| 2410 | |||
| 2411 | |||
| 2412 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
| 2413 | |||
| 2414 | /* | ||
| 2415 | * Define RCU flavor that holds sysidle state. This needs to be the | ||
| 2416 | * most active flavor of RCU. | ||
| 2417 | */ | ||
| 2418 | #ifdef CONFIG_PREEMPT_RCU | ||
| 2419 | static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; | ||
| 2420 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | ||
| 2421 | static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; | ||
| 2422 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | ||
| 2423 | |||
| 2424 | static int full_sysidle_state; /* Current system-idle state. */ | ||
| 2425 | #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ | ||
| 2426 | #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ | ||
| 2427 | #define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ | ||
| 2428 | #define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ | ||
| 2429 | #define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ | ||
| 2430 | |||
| 2431 | /* | ||
| 2432 | * Invoked to note exit from irq or task transition to idle. Note that | ||
| 2433 | * usermode execution does -not- count as idle here! After all, we want | ||
| 2434 | * to detect full-system idle states, not RCU quiescent states and grace | ||
| 2435 | * periods. The caller must have disabled interrupts. | ||
| 2436 | */ | ||
| 2437 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | ||
| 2438 | { | ||
| 2439 | unsigned long j; | ||
| 2440 | |||
| 2441 | /* Adjust nesting, check for fully idle. */ | ||
| 2442 | if (irq) { | ||
| 2443 | rdtp->dynticks_idle_nesting--; | ||
| 2444 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); | ||
| 2445 | if (rdtp->dynticks_idle_nesting != 0) | ||
| 2446 | return; /* Still not fully idle. */ | ||
| 2447 | } else { | ||
| 2448 | if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == | ||
| 2449 | DYNTICK_TASK_NEST_VALUE) { | ||
| 2450 | rdtp->dynticks_idle_nesting = 0; | ||
| 2451 | } else { | ||
| 2452 | rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; | ||
| 2453 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); | ||
| 2454 | return; /* Still not fully idle. */ | ||
| 2455 | } | ||
| 2456 | } | ||
| 2457 | |||
| 2458 | /* Record start of fully idle period. */ | ||
| 2459 | j = jiffies; | ||
| 2460 | ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; | ||
| 2461 | smp_mb__before_atomic_inc(); | ||
| 2462 | atomic_inc(&rdtp->dynticks_idle); | ||
| 2463 | smp_mb__after_atomic_inc(); | ||
| 2464 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); | ||
| 2465 | } | ||
| 2466 | |||
| 2467 | /* | ||
| 2468 | * Unconditionally force exit from full system-idle state. This is | ||
| 2469 | * invoked when a normal CPU exits idle, but must be called separately | ||
| 2470 | * for the timekeeping CPU (tick_do_timer_cpu). The reason for this | ||
| 2471 | * is that the timekeeping CPU is permitted to take scheduling-clock | ||
| 2472 | * interrupts while the system is in system-idle state, and of course | ||
| 2473 | * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock | ||
| 2474 | * interrupt from any other type of interrupt. | ||
| 2475 | */ | ||
| 2476 | void rcu_sysidle_force_exit(void) | ||
| 2477 | { | ||
| 2478 | int oldstate = ACCESS_ONCE(full_sysidle_state); | ||
| 2479 | int newoldstate; | ||
| 2480 | |||
| 2481 | /* | ||
| 2482 | * Each pass through the following loop attempts to exit full | ||
| 2483 | * system-idle state. If contention proves to be a problem, | ||
| 2484 | * a trylock-based contention tree could be used here. | ||
| 2485 | */ | ||
| 2486 | while (oldstate > RCU_SYSIDLE_SHORT) { | ||
| 2487 | newoldstate = cmpxchg(&full_sysidle_state, | ||
| 2488 | oldstate, RCU_SYSIDLE_NOT); | ||
| 2489 | if (oldstate == newoldstate && | ||
| 2490 | oldstate == RCU_SYSIDLE_FULL_NOTED) { | ||
| 2491 | rcu_kick_nohz_cpu(tick_do_timer_cpu); | ||
| 2492 | return; /* We cleared it, done! */ | ||
| 2493 | } | ||
| 2494 | oldstate = newoldstate; | ||
| 2495 | } | ||
| 2496 | smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ | ||
| 2497 | } | ||
| 2498 | |||
| 2499 | /* | ||
| 2500 | * Invoked to note entry to irq or task transition from idle. Note that | ||
| 2501 | * usermode execution does -not- count as idle here! The caller must | ||
| 2502 | * have disabled interrupts. | ||
| 2503 | */ | ||
| 2504 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | ||
| 2505 | { | ||
| 2506 | /* Adjust nesting, check for already non-idle. */ | ||
| 2507 | if (irq) { | ||
| 2508 | rdtp->dynticks_idle_nesting++; | ||
| 2509 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); | ||
| 2510 | if (rdtp->dynticks_idle_nesting != 1) | ||
| 2511 | return; /* Already non-idle. */ | ||
| 2512 | } else { | ||
| 2513 | /* | ||
| 2514 | * Allow for irq misnesting. Yes, it really is possible | ||
| 2515 | * to enter an irq handler then never leave it, and maybe | ||
| 2516 | * also vice versa. Handle both possibilities. | ||
| 2517 | */ | ||
| 2518 | if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { | ||
| 2519 | rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; | ||
| 2520 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); | ||
| 2521 | return; /* Already non-idle. */ | ||
| 2522 | } else { | ||
| 2523 | rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
| 2524 | } | ||
| 2525 | } | ||
| 2526 | |||
| 2527 | /* Record end of idle period. */ | ||
| 2528 | smp_mb__before_atomic_inc(); | ||
| 2529 | atomic_inc(&rdtp->dynticks_idle); | ||
| 2530 | smp_mb__after_atomic_inc(); | ||
| 2531 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); | ||
| 2532 | |||
| 2533 | /* | ||
| 2534 | * If we are the timekeeping CPU, we are permitted to be non-idle | ||
| 2535 | * during a system-idle state. This must be the case, because | ||
| 2536 | * the timekeeping CPU has to take scheduling-clock interrupts | ||
| 2537 | * during the time that the system is transitioning to full | ||
| 2538 | * system-idle state. This means that the timekeeping CPU must | ||
| 2539 | * invoke rcu_sysidle_force_exit() directly if it does anything | ||
| 2540 | * more than take a scheduling-clock interrupt. | ||
| 2541 | */ | ||
| 2542 | if (smp_processor_id() == tick_do_timer_cpu) | ||
| 2543 | return; | ||
| 2544 | |||
| 2545 | /* Update system-idle state: We are clearly no longer fully idle! */ | ||
| 2546 | rcu_sysidle_force_exit(); | ||
| 2547 | } | ||
| 2548 | |||
| 2549 | /* | ||
| 2550 | * Check to see if the current CPU is idle. Note that usermode execution | ||
| 2551 | * does not count as idle. The caller must have disabled interrupts. | ||
| 2552 | */ | ||
| 2553 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
| 2554 | unsigned long *maxj) | ||
| 2555 | { | ||
| 2556 | int cur; | ||
| 2557 | unsigned long j; | ||
| 2558 | struct rcu_dynticks *rdtp = rdp->dynticks; | ||
| 2559 | |||
| 2560 | /* | ||
| 2561 | * If some other CPU has already reported non-idle, if this is | ||
| 2562 | * not the flavor of RCU that tracks sysidle state, or if this | ||
| 2563 | * is an offline or the timekeeping CPU, nothing to do. | ||
| 2564 | */ | ||
| 2565 | if (!*isidle || rdp->rsp != rcu_sysidle_state || | ||
| 2566 | cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) | ||
| 2567 | return; | ||
| 2568 | if (rcu_gp_in_progress(rdp->rsp)) | ||
| 2569 | WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); | ||
| 2570 | |||
| 2571 | /* Pick up current idle and NMI-nesting counter and check. */ | ||
| 2572 | cur = atomic_read(&rdtp->dynticks_idle); | ||
| 2573 | if (cur & 0x1) { | ||
| 2574 | *isidle = false; /* We are not idle! */ | ||
| 2575 | return; | ||
| 2576 | } | ||
| 2577 | smp_mb(); /* Read counters before timestamps. */ | ||
| 2578 | |||
| 2579 | /* Pick up timestamps. */ | ||
| 2580 | j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies); | ||
| 2581 | /* If this CPU entered idle more recently, update maxj timestamp. */ | ||
| 2582 | if (ULONG_CMP_LT(*maxj, j)) | ||
| 2583 | *maxj = j; | ||
| 2584 | } | ||
| 2585 | |||
| 2586 | /* | ||
| 2587 | * Is this the flavor of RCU that is handling full-system idle? | ||
| 2588 | */ | ||
| 2589 | static bool is_sysidle_rcu_state(struct rcu_state *rsp) | ||
| 2590 | { | ||
| 2591 | return rsp == rcu_sysidle_state; | ||
| 2592 | } | ||
| 2593 | |||
| 2594 | /* | ||
| 2595 | * Bind the grace-period kthread for the sysidle flavor of RCU to the | ||
| 2596 | * timekeeping CPU. | ||
| 2597 | */ | ||
| 2598 | static void rcu_bind_gp_kthread(void) | ||
| 2599 | { | ||
| 2600 | int cpu = ACCESS_ONCE(tick_do_timer_cpu); | ||
| 2601 | |||
| 2602 | if (cpu < 0 || cpu >= nr_cpu_ids) | ||
| 2603 | return; | ||
| 2604 | if (raw_smp_processor_id() != cpu) | ||
| 2605 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
| 2606 | } | ||
| 2607 | |||
| 2608 | /* | ||
| 2609 | * Return a delay in jiffies based on the number of CPUs, rcu_node | ||
| 2610 | * leaf fanout, and jiffies tick rate. The idea is to allow larger | ||
| 2611 | * systems more time to transition to full-idle state in order to | ||
| 2612 | * avoid the cache thrashing that otherwise occurs on the state variable. | ||
| 2613 | * Really small systems (less than a couple of tens of CPUs) should | ||
| 2614 | * instead use a single global atomically incremented counter, and later | ||
| 2615 | * versions of this will automatically reconfigure themselves accordingly. | ||
| 2616 | */ | ||
| 2617 | static unsigned long rcu_sysidle_delay(void) | ||
| 2618 | { | ||
| 2619 | if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) | ||
| 2620 | return 0; | ||
| 2621 | return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); | ||
| 2622 | } | ||
| 2623 | |||
| 2624 | /* | ||
| 2625 | * Advance the full-system-idle state. This is invoked when all of | ||
| 2626 | * the non-timekeeping CPUs are idle. | ||
| 2627 | */ | ||
| 2628 | static void rcu_sysidle(unsigned long j) | ||
| 2629 | { | ||
| 2630 | /* Check the current state. */ | ||
| 2631 | switch (ACCESS_ONCE(full_sysidle_state)) { | ||
| 2632 | case RCU_SYSIDLE_NOT: | ||
| 2633 | |||
| 2634 | /* First time all are idle, so note a short idle period. */ | ||
| 2635 | ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; | ||
| 2636 | break; | ||
| 2637 | |||
| 2638 | case RCU_SYSIDLE_SHORT: | ||
| 2639 | |||
| 2640 | /* | ||
| 2641 | * Idle for a bit, time to advance to next state? | ||
| 2642 | * cmpxchg failure means race with non-idle, let them win. | ||
| 2643 | */ | ||
| 2644 | if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) | ||
| 2645 | (void)cmpxchg(&full_sysidle_state, | ||
| 2646 | RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); | ||
| 2647 | break; | ||
| 2648 | |||
| 2649 | case RCU_SYSIDLE_LONG: | ||
| 2650 | |||
| 2651 | /* | ||
| 2652 | * Do an additional check pass before advancing to full. | ||
| 2653 | * cmpxchg failure means race with non-idle, let them win. | ||
| 2654 | */ | ||
| 2655 | if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) | ||
| 2656 | (void)cmpxchg(&full_sysidle_state, | ||
| 2657 | RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); | ||
| 2658 | break; | ||
| 2659 | |||
| 2660 | default: | ||
| 2661 | break; | ||
| 2662 | } | ||
| 2663 | } | ||
| 2664 | |||
| 2665 | /* | ||
| 2666 | * Found a non-idle non-timekeeping CPU, so kick the system-idle state | ||
| 2667 | * back to the beginning. | ||
| 2668 | */ | ||
| 2669 | static void rcu_sysidle_cancel(void) | ||
| 2670 | { | ||
| 2671 | smp_mb(); | ||
| 2672 | ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; | ||
| 2673 | } | ||
| 2674 | |||
| 2675 | /* | ||
| 2676 | * Update the sysidle state based on the results of a force-quiescent-state | ||
| 2677 | * scan of the CPUs' dyntick-idle state. | ||
| 2678 | */ | ||
| 2679 | static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, | ||
| 2680 | unsigned long maxj, bool gpkt) | ||
| 2681 | { | ||
| 2682 | if (rsp != rcu_sysidle_state) | ||
| 2683 | return; /* Wrong flavor, ignore. */ | ||
| 2684 | if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) | ||
| 2685 | return; /* Running state machine from timekeeping CPU. */ | ||
| 2686 | if (isidle) | ||
| 2687 | rcu_sysidle(maxj); /* More idle! */ | ||
| 2688 | else | ||
| 2689 | rcu_sysidle_cancel(); /* Idle is over. */ | ||
| 2690 | } | ||
| 2691 | |||
| 2692 | /* | ||
| 2693 | * Wrapper for rcu_sysidle_report() when called from the grace-period | ||
| 2694 | * kthread's context. | ||
| 2695 | */ | ||
| 2696 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
| 2697 | unsigned long maxj) | ||
| 2698 | { | ||
| 2699 | rcu_sysidle_report(rsp, isidle, maxj, true); | ||
| 2700 | } | ||
| 2701 | |||
| 2702 | /* Callback and function for forcing an RCU grace period. */ | ||
| 2703 | struct rcu_sysidle_head { | ||
| 2704 | struct rcu_head rh; | ||
| 2705 | int inuse; | ||
| 2706 | }; | ||
| 2707 | |||
| 2708 | static void rcu_sysidle_cb(struct rcu_head *rhp) | ||
| 2709 | { | ||
| 2710 | struct rcu_sysidle_head *rshp; | ||
| 2711 | |||
| 2712 | /* | ||
| 2713 | * The following memory barrier is needed to replace the | ||
| 2714 | * memory barriers that would normally be in the memory | ||
| 2715 | * allocator. | ||
| 2716 | */ | ||
| 2717 | smp_mb(); /* grace period precedes setting inuse. */ | ||
| 2718 | |||
| 2719 | rshp = container_of(rhp, struct rcu_sysidle_head, rh); | ||
| 2720 | ACCESS_ONCE(rshp->inuse) = 0; | ||
| 2721 | } | ||
| 2722 | |||
| 2723 | /* | ||
| 2724 | * Check to see if the system is fully idle, other than the timekeeping CPU. | ||
| 2725 | * The caller must have disabled interrupts. | ||
| 2726 | */ | ||
| 2727 | bool rcu_sys_is_idle(void) | ||
| 2728 | { | ||
| 2729 | static struct rcu_sysidle_head rsh; | ||
| 2730 | int rss = ACCESS_ONCE(full_sysidle_state); | ||
| 2731 | |||
| 2732 | if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) | ||
| 2733 | return false; | ||
| 2734 | |||
| 2735 | /* Handle small-system case by doing a full scan of CPUs. */ | ||
| 2736 | if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { | ||
| 2737 | int oldrss = rss - 1; | ||
| 2738 | |||
| 2739 | /* | ||
| 2740 | * One pass to advance to each state up to _FULL. | ||
| 2741 | * Give up if any pass fails to advance the state. | ||
| 2742 | */ | ||
| 2743 | while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { | ||
| 2744 | int cpu; | ||
| 2745 | bool isidle = true; | ||
| 2746 | unsigned long maxj = jiffies - ULONG_MAX / 4; | ||
| 2747 | struct rcu_data *rdp; | ||
| 2748 | |||
| 2749 | /* Scan all the CPUs looking for nonidle CPUs. */ | ||
| 2750 | for_each_possible_cpu(cpu) { | ||
| 2751 | rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); | ||
| 2752 | rcu_sysidle_check_cpu(rdp, &isidle, &maxj); | ||
| 2753 | if (!isidle) | ||
| 2754 | break; | ||
| 2755 | } | ||
| 2756 | rcu_sysidle_report(rcu_sysidle_state, | ||
| 2757 | isidle, maxj, false); | ||
| 2758 | oldrss = rss; | ||
| 2759 | rss = ACCESS_ONCE(full_sysidle_state); | ||
| 2760 | } | ||
| 2761 | } | ||
| 2762 | |||
| 2763 | /* If this is the first observation of an idle period, record it. */ | ||
| 2764 | if (rss == RCU_SYSIDLE_FULL) { | ||
| 2765 | rss = cmpxchg(&full_sysidle_state, | ||
| 2766 | RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); | ||
| 2767 | return rss == RCU_SYSIDLE_FULL; | ||
| 2768 | } | ||
| 2769 | |||
| 2770 | smp_mb(); /* ensure rss load happens before later caller actions. */ | ||
| 2771 | |||
| 2772 | /* If already fully idle, tell the caller (in case of races). */ | ||
| 2773 | if (rss == RCU_SYSIDLE_FULL_NOTED) | ||
| 2774 | return true; | ||
| 2775 | |||
| 2776 | /* | ||
| 2777 | * If we aren't there yet, and a grace period is not in flight, | ||
| 2778 | * initiate a grace period. Either way, tell the caller that | ||
| 2779 | * we are not there yet. We use an xchg() rather than an assignment | ||
| 2780 | * to make up for the memory barriers that would otherwise be | ||
| 2781 | * provided by the memory allocator. | ||
| 2782 | */ | ||
| 2783 | if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && | ||
| 2784 | !rcu_gp_in_progress(rcu_sysidle_state) && | ||
| 2785 | !rsh.inuse && xchg(&rsh.inuse, 1) == 0) | ||
| 2786 | call_rcu(&rsh.rh, rcu_sysidle_cb); | ||
| 2787 | return false; | ||
| 2788 | } | ||
| 2789 | |||
| 2790 | /* | ||
| 2791 | * Initialize dynticks sysidle state for CPUs coming online. | ||
| 2792 | */ | ||
| 2793 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) | ||
| 2794 | { | ||
| 2795 | rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; | ||
| 2796 | } | ||
| 2797 | |||
| 2798 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
| 2799 | |||
| 2800 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | ||
| 2801 | { | ||
| 2802 | } | ||
| 2803 | |||
| 2804 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | ||
| 2805 | { | ||
| 2806 | } | ||
| 2807 | |||
| 2808 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
| 2809 | unsigned long *maxj) | ||
| 2810 | { | ||
| 2811 | } | ||
| 2812 | |||
| 2813 | static bool is_sysidle_rcu_state(struct rcu_state *rsp) | ||
| 2814 | { | ||
| 2815 | return false; | ||
| 2816 | } | ||
| 2817 | |||
| 2818 | static void rcu_bind_gp_kthread(void) | ||
| 2819 | { | ||
| 2820 | } | ||
| 2821 | |||
| 2822 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
| 2823 | unsigned long maxj) | ||
| 2824 | { | ||
| 2825 | } | ||
| 2826 | |||
| 2827 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) | ||
| 2828 | { | ||
| 2829 | } | ||
| 2830 | |||
| 2831 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
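For orientation, a short sketch of the state progression the code above implements; the constants are the RCU_SYSIDLE_* values defined in this file, and the commentary is a paraphrase rather than patch text:

	/*
	 * RCU_SYSIDLE_NOT         some non-timekeeping CPU is non-idle
	 *        |  all non-timekeeping CPUs idle   (rcu_sysidle())
	 * RCU_SYSIDLE_SHORT       all idle, but not yet for rcu_sysidle_delay()
	 *        |  still idle after the delay      (cmpxchg; racing non-idle wins)
	 * RCU_SYSIDLE_LONG        one more confirmation pass
	 *        |  still idle after another delay
	 * RCU_SYSIDLE_FULL        ready; rcu_sys_is_idle() cmpxchg()es to ...
	 * RCU_SYSIDLE_FULL_NOTED  timekeeping CPU has observed full-system idle
	 *
	 * A non-idle non-timekeeping CPU either triggers rcu_sysidle_cancel()
	 * via the force-quiescent-state scan or, on idle exit, calls
	 * rcu_sysidle_force_exit(); both drop the state back to RCU_SYSIDLE_NOT.
	 */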
diff --git a/kernel/rcutree_trace.c b/kernel/rcu/tree_trace.c index cf6c17412932..3596797b7e46 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcu/tree_trace.c | |||
| @@ -44,7 +44,7 @@ | |||
| 44 | #include <linux/seq_file.h> | 44 | #include <linux/seq_file.h> |
| 45 | 45 | ||
| 46 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
| 47 | #include "rcutree.h" | 47 | #include "tree.h" |
| 48 | 48 | ||
| 49 | static int r_open(struct inode *inode, struct file *file, | 49 | static int r_open(struct inode *inode, struct file *file, |
| 50 | const struct seq_operations *op) | 50 | const struct seq_operations *op) |
diff --git a/kernel/rcupdate.c b/kernel/rcu/update.c index cce6ba8bbace..6cb3dff89e2b 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcu/update.c | |||
| @@ -53,6 +53,12 @@ | |||
| 53 | 53 | ||
| 54 | #include "rcu.h" | 54 | #include "rcu.h" |
| 55 | 55 | ||
| 56 | MODULE_ALIAS("rcupdate"); | ||
| 57 | #ifdef MODULE_PARAM_PREFIX | ||
| 58 | #undef MODULE_PARAM_PREFIX | ||
| 59 | #endif | ||
| 60 | #define MODULE_PARAM_PREFIX "rcupdate." | ||
| 61 | |||
| 56 | module_param(rcu_expedited, int, 0); | 62 | module_param(rcu_expedited, int, 0); |
| 57 | 63 | ||
| 58 | #ifdef CONFIG_PREEMPT_RCU | 64 | #ifdef CONFIG_PREEMPT_RCU |
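A hedged note on the effect of the MODULE_ALIAS/MODULE_PARAM_PREFIX additions above (the naming follows the kernel's usual module_param() conventions; the exact paths are my reading, not text from the patch):

	/*
	 * With MODULE_PARAM_PREFIX set to "rcupdate." before module_param(),
	 * the built-in parameter keeps its historical name even though the
	 * code has moved to kernel/rcu/update.c:
	 *
	 *     boot command line:  rcupdate.rcu_expedited=1
	 *     sysfs:              /sys/module/rcupdate/parameters/rcu_expedited
	 */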
| @@ -122,7 +128,7 @@ struct lockdep_map rcu_sched_lock_map = | |||
| 122 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); | 128 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); |
| 123 | EXPORT_SYMBOL_GPL(rcu_sched_lock_map); | 129 | EXPORT_SYMBOL_GPL(rcu_sched_lock_map); |
| 124 | 130 | ||
| 125 | int debug_lockdep_rcu_enabled(void) | 131 | int notrace debug_lockdep_rcu_enabled(void) |
| 126 | { | 132 | { |
| 127 | return rcu_scheduler_active && debug_locks && | 133 | return rcu_scheduler_active && debug_locks && |
| 128 | current->lockdep_recursion == 0; | 134 | current->lockdep_recursion == 0; |
| @@ -148,7 +154,7 @@ int rcu_read_lock_bh_held(void) | |||
| 148 | { | 154 | { |
| 149 | if (!debug_lockdep_rcu_enabled()) | 155 | if (!debug_lockdep_rcu_enabled()) |
| 150 | return 1; | 156 | return 1; |
| 151 | if (rcu_is_cpu_idle()) | 157 | if (!rcu_is_watching()) |
| 152 | return 0; | 158 | return 0; |
| 153 | if (!rcu_lockdep_current_cpu_online()) | 159 | if (!rcu_lockdep_current_cpu_online()) |
| 154 | return 0; | 160 | return 0; |
| @@ -212,43 +218,6 @@ static inline void debug_rcu_head_free(struct rcu_head *head) | |||
| 212 | } | 218 | } |
| 213 | 219 | ||
| 214 | /* | 220 | /* |
| 215 | * fixup_init is called when: | ||
| 216 | * - an active object is initialized | ||
| 217 | */ | ||
| 218 | static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) | ||
| 219 | { | ||
| 220 | struct rcu_head *head = addr; | ||
| 221 | |||
| 222 | switch (state) { | ||
| 223 | case ODEBUG_STATE_ACTIVE: | ||
| 224 | /* | ||
| 225 | * Ensure that queued callbacks are all executed. | ||
| 226 | * If we detect that we are nested in a RCU read-side critical | ||
| 227 | * section, we should simply fail, otherwise we would deadlock. | ||
| 228 | * In !PREEMPT configurations, there is no way to tell if we are | ||
| 229 | * in a RCU read-side critical section or not, so we never | ||
| 230 | * attempt any fixup and just print a warning. | ||
| 231 | */ | ||
| 232 | #ifndef CONFIG_PREEMPT | ||
| 233 | WARN_ON_ONCE(1); | ||
| 234 | return 0; | ||
| 235 | #endif | ||
| 236 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
| 237 | irqs_disabled()) { | ||
| 238 | WARN_ON_ONCE(1); | ||
| 239 | return 0; | ||
| 240 | } | ||
| 241 | rcu_barrier(); | ||
| 242 | rcu_barrier_sched(); | ||
| 243 | rcu_barrier_bh(); | ||
| 244 | debug_object_init(head, &rcuhead_debug_descr); | ||
| 245 | return 1; | ||
| 246 | default: | ||
| 247 | return 0; | ||
| 248 | } | ||
| 249 | } | ||
| 250 | |||
| 251 | /* | ||
| 252 | * fixup_activate is called when: | 221 | * fixup_activate is called when: |
| 253 | * - an active object is activated | 222 | * - an active object is activated |
| 254 | * - an unknown object is activated (might be a statically initialized object) | 223 | * - an unknown object is activated (might be a statically initialized object) |
| @@ -268,69 +237,8 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) | |||
| 268 | debug_object_init(head, &rcuhead_debug_descr); | 237 | debug_object_init(head, &rcuhead_debug_descr); |
| 269 | debug_object_activate(head, &rcuhead_debug_descr); | 238 | debug_object_activate(head, &rcuhead_debug_descr); |
| 270 | return 0; | 239 | return 0; |
| 271 | |||
| 272 | case ODEBUG_STATE_ACTIVE: | ||
| 273 | /* | ||
| 274 | * Ensure that queued callbacks are all executed. | ||
| 275 | * If we detect that we are nested in a RCU read-side critical | ||
| 276 | * section, we should simply fail, otherwise we would deadlock. | ||
| 277 | * In !PREEMPT configurations, there is no way to tell if we are | ||
| 278 | * in a RCU read-side critical section or not, so we never | ||
| 279 | * attempt any fixup and just print a warning. | ||
| 280 | */ | ||
| 281 | #ifndef CONFIG_PREEMPT | ||
| 282 | WARN_ON_ONCE(1); | ||
| 283 | return 0; | ||
| 284 | #endif | ||
| 285 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
| 286 | irqs_disabled()) { | ||
| 287 | WARN_ON_ONCE(1); | ||
| 288 | return 0; | ||
| 289 | } | ||
| 290 | rcu_barrier(); | ||
| 291 | rcu_barrier_sched(); | ||
| 292 | rcu_barrier_bh(); | ||
| 293 | debug_object_activate(head, &rcuhead_debug_descr); | ||
| 294 | return 1; | ||
| 295 | default: | 240 | default: |
| 296 | return 0; | ||
| 297 | } | ||
| 298 | } | ||
| 299 | |||
| 300 | /* | ||
| 301 | * fixup_free is called when: | ||
| 302 | * - an active object is freed | ||
| 303 | */ | ||
| 304 | static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) | ||
| 305 | { | ||
| 306 | struct rcu_head *head = addr; | ||
| 307 | |||
| 308 | switch (state) { | ||
| 309 | case ODEBUG_STATE_ACTIVE: | ||
| 310 | /* | ||
| 311 | * Ensure that queued callbacks are all executed. | ||
| 312 | * If we detect that we are nested in a RCU read-side critical | ||
| 313 | * section, we should simply fail, otherwise we would deadlock. | ||
| 314 | * In !PREEMPT configurations, there is no way to tell if we are | ||
| 315 | * in a RCU read-side critical section or not, so we never | ||
| 316 | * attempt any fixup and just print a warning. | ||
| 317 | */ | ||
| 318 | #ifndef CONFIG_PREEMPT | ||
| 319 | WARN_ON_ONCE(1); | ||
| 320 | return 0; | ||
| 321 | #endif | ||
| 322 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
| 323 | irqs_disabled()) { | ||
| 324 | WARN_ON_ONCE(1); | ||
| 325 | return 0; | ||
| 326 | } | ||
| 327 | rcu_barrier(); | ||
| 328 | rcu_barrier_sched(); | ||
| 329 | rcu_barrier_bh(); | ||
| 330 | debug_object_free(head, &rcuhead_debug_descr); | ||
| 331 | return 1; | 241 | return 1; |
| 332 | default: | ||
| 333 | return 0; | ||
| 334 | } | 242 | } |
| 335 | } | 243 | } |
| 336 | 244 | ||
| @@ -369,15 +277,13 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); | |||
| 369 | 277 | ||
| 370 | struct debug_obj_descr rcuhead_debug_descr = { | 278 | struct debug_obj_descr rcuhead_debug_descr = { |
| 371 | .name = "rcu_head", | 279 | .name = "rcu_head", |
| 372 | .fixup_init = rcuhead_fixup_init, | ||
| 373 | .fixup_activate = rcuhead_fixup_activate, | 280 | .fixup_activate = rcuhead_fixup_activate, |
| 374 | .fixup_free = rcuhead_fixup_free, | ||
| 375 | }; | 281 | }; |
| 376 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); | 282 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); |
| 377 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 283 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
| 378 | 284 | ||
| 379 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) | 285 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) |
| 380 | void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp, | 286 | void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, |
| 381 | unsigned long secs, | 287 | unsigned long secs, |
| 382 | unsigned long c_old, unsigned long c) | 288 | unsigned long c_old, unsigned long c) |
| 383 | { | 289 | { |
| @@ -398,7 +304,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); | |||
| 398 | #endif | 304 | #endif |
| 399 | 305 | ||
| 400 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ | 306 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ |
| 401 | int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; | 307 | static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; |
| 402 | 308 | ||
| 403 | module_param(rcu_cpu_stall_suppress, int, 0644); | 309 | module_param(rcu_cpu_stall_suppress, int, 0644); |
| 404 | module_param(rcu_cpu_stall_timeout, int, 0644); | 310 | module_param(rcu_cpu_stall_timeout, int, 0644); |
diff --git a/kernel/reboot.c b/kernel/reboot.c index 269ed9384cc4..f813b3474646 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c | |||
| @@ -32,7 +32,14 @@ EXPORT_SYMBOL(cad_pid); | |||
| 32 | #endif | 32 | #endif |
| 33 | enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; | 33 | enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; |
| 34 | 34 | ||
| 35 | int reboot_default; | 35 | /* |
| 36 | * This variable is used privately to keep track of whether or not | ||
| 37 | * reboot_type is still set to its default value (i.e., reboot= hasn't | ||
| 38 | * been set on the command line). This is needed so that we can | ||
| 39 | * suppress DMI scanning for reboot quirks. Without it, it's | ||
| 40 | * impossible to override a faulty reboot quirk without recompiling. | ||
| 41 | */ | ||
| 42 | int reboot_default = 1; | ||
| 36 | int reboot_cpu; | 43 | int reboot_cpu; |
| 37 | enum reboot_type reboot_type = BOOT_ACPI; | 44 | enum reboot_type reboot_type = BOOT_ACPI; |
| 38 | int reboot_force; | 45 | int reboot_force; |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index ff55247e7049..4aa8a305aede 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
| @@ -17,8 +17,8 @@ | |||
| 17 | void res_counter_init(struct res_counter *counter, struct res_counter *parent) | 17 | void res_counter_init(struct res_counter *counter, struct res_counter *parent) |
| 18 | { | 18 | { |
| 19 | spin_lock_init(&counter->lock); | 19 | spin_lock_init(&counter->lock); |
| 20 | counter->limit = RESOURCE_MAX; | 20 | counter->limit = RES_COUNTER_MAX; |
| 21 | counter->soft_limit = RESOURCE_MAX; | 21 | counter->soft_limit = RES_COUNTER_MAX; |
| 22 | counter->parent = parent; | 22 | counter->parent = parent; |
| 23 | } | 23 | } |
| 24 | 24 | ||
| @@ -178,23 +178,30 @@ u64 res_counter_read_u64(struct res_counter *counter, int member) | |||
| 178 | #endif | 178 | #endif |
| 179 | 179 | ||
| 180 | int res_counter_memparse_write_strategy(const char *buf, | 180 | int res_counter_memparse_write_strategy(const char *buf, |
| 181 | unsigned long long *res) | 181 | unsigned long long *resp) |
| 182 | { | 182 | { |
| 183 | char *end; | 183 | char *end; |
| 184 | unsigned long long res; | ||
| 184 | 185 | ||
| 185 | /* return RESOURCE_MAX(unlimited) if "-1" is specified */ | 186 | /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ |
| 186 | if (*buf == '-') { | 187 | if (*buf == '-') { |
| 187 | *res = simple_strtoull(buf + 1, &end, 10); | 188 | res = simple_strtoull(buf + 1, &end, 10); |
| 188 | if (*res != 1 || *end != '\0') | 189 | if (res != 1 || *end != '\0') |
| 189 | return -EINVAL; | 190 | return -EINVAL; |
| 190 | *res = RESOURCE_MAX; | 191 | *resp = RES_COUNTER_MAX; |
| 191 | return 0; | 192 | return 0; |
| 192 | } | 193 | } |
| 193 | 194 | ||
| 194 | *res = memparse(buf, &end); | 195 | res = memparse(buf, &end); |
| 195 | if (*end != '\0') | 196 | if (*end != '\0') |
| 196 | return -EINVAL; | 197 | return -EINVAL; |
| 197 | 198 | ||
| 198 | *res = PAGE_ALIGN(*res); | 199 | if (PAGE_ALIGN(res) >= res) |
| 200 | res = PAGE_ALIGN(res); | ||
| 201 | else | ||
| 202 | res = RES_COUNTER_MAX; | ||
| 203 | |||
| 204 | *resp = res; | ||
| 205 | |||
| 199 | return 0; | 206 | return 0; |
| 200 | } | 207 | } |
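The new PAGE_ALIGN() guard above exists because aligning a value within one page of ULLONG_MAX wraps around, which would silently shrink the requested limit; the helper clamps to RES_COUNTER_MAX instead. A minimal user-space illustration of the wrap (PAGE_SIZE and RES_COUNTER_MAX are stand-ins defined locally for the demo):

	#include <stdio.h>
	#include <limits.h>

	#define PAGE_SIZE	4096ULL
	#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
	#define RES_COUNTER_MAX	ULLONG_MAX

	int main(void)
	{
		unsigned long long res = ULLONG_MAX - 100;	/* near the top */
		unsigned long long aligned = PAGE_ALIGN(res);	/* wraps to 0 */

		printf("PAGE_ALIGN(0x%llx) = 0x%llx\n", res, aligned);

		/* Same decision as the kernel helper: keep the larger value. */
		printf("stored limit = 0x%llx\n",
		       aligned >= res ? aligned : RES_COUNTER_MAX);
		return 0;
	}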
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 54adcf35f495..7b621409cf15 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
| @@ -12,6 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | |||
| 12 | endif | 12 | endif |
| 13 | 13 | ||
| 14 | obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o | 14 | obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o |
| 15 | obj-y += wait.o completion.o | ||
| 15 | obj-$(CONFIG_SMP) += cpupri.o | 16 | obj-$(CONFIG_SMP) += cpupri.o |
| 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 17 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
| 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 18 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c new file mode 100644 index 000000000000..a63f4dc27909 --- /dev/null +++ b/kernel/sched/completion.c | |||
| @@ -0,0 +1,299 @@ | |||
| 1 | /* | ||
| 2 | * Generic wait-for-completion handler; | ||
| 3 | * | ||
| 4 | * It differs from semaphores in that the default case is the opposite: | ||
| 5 | * wait_for_completion() blocks by default, whereas a semaphore does not. The | ||
| 6 | * interface also makes it easy to 'complete' multiple waiting threads, | ||
| 7 | * something which isn't entirely natural for semaphores. | ||
| 8 | * | ||
| 9 | * But more importantly, the primitive documents the usage. Semaphores would | ||
| 10 | * typically be used for exclusion which gives rise to priority inversion. | ||
| 11 | * Waiting for completion is typically a sync point, but not an exclusion point. | ||
| 12 | */ | ||
| 13 | |||
| 14 | #include <linux/sched.h> | ||
| 15 | #include <linux/completion.h> | ||
| 16 | |||
| 17 | /** | ||
| 18 | * complete: - signals a single thread waiting on this completion | ||
| 19 | * @x: holds the state of this particular completion | ||
| 20 | * | ||
| 21 | * This will wake up a single thread waiting on this completion. Threads will be | ||
| 22 | * awakened in the same order in which they were queued. | ||
| 23 | * | ||
| 24 | * See also complete_all(), wait_for_completion() and related routines. | ||
| 25 | * | ||
| 26 | * It may be assumed that this function implies a write memory barrier before | ||
| 27 | * changing the task state if and only if any tasks are woken up. | ||
| 28 | */ | ||
| 29 | void complete(struct completion *x) | ||
| 30 | { | ||
| 31 | unsigned long flags; | ||
| 32 | |||
| 33 | spin_lock_irqsave(&x->wait.lock, flags); | ||
| 34 | x->done++; | ||
| 35 | __wake_up_locked(&x->wait, TASK_NORMAL, 1); | ||
| 36 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
| 37 | } | ||
| 38 | EXPORT_SYMBOL(complete); | ||
| 39 | |||
| 40 | /** | ||
| 41 | * complete_all: - signals all threads waiting on this completion | ||
| 42 | * @x: holds the state of this particular completion | ||
| 43 | * | ||
| 44 | * This will wake up all threads waiting on this particular completion event. | ||
| 45 | * | ||
| 46 | * It may be assumed that this function implies a write memory barrier before | ||
| 47 | * changing the task state if and only if any tasks are woken up. | ||
| 48 | */ | ||
| 49 | void complete_all(struct completion *x) | ||
| 50 | { | ||
| 51 | unsigned long flags; | ||
| 52 | |||
| 53 | spin_lock_irqsave(&x->wait.lock, flags); | ||
| 54 | x->done += UINT_MAX/2; | ||
| 55 | __wake_up_locked(&x->wait, TASK_NORMAL, 0); | ||
| 56 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
| 57 | } | ||
| 58 | EXPORT_SYMBOL(complete_all); | ||
| 59 | |||
| 60 | static inline long __sched | ||
| 61 | do_wait_for_common(struct completion *x, | ||
| 62 | long (*action)(long), long timeout, int state) | ||
| 63 | { | ||
| 64 | if (!x->done) { | ||
| 65 | DECLARE_WAITQUEUE(wait, current); | ||
| 66 | |||
| 67 | __add_wait_queue_tail_exclusive(&x->wait, &wait); | ||
| 68 | do { | ||
| 69 | if (signal_pending_state(state, current)) { | ||
| 70 | timeout = -ERESTARTSYS; | ||
| 71 | break; | ||
| 72 | } | ||
| 73 | __set_current_state(state); | ||
| 74 | spin_unlock_irq(&x->wait.lock); | ||
| 75 | timeout = action(timeout); | ||
| 76 | spin_lock_irq(&x->wait.lock); | ||
| 77 | } while (!x->done && timeout); | ||
| 78 | __remove_wait_queue(&x->wait, &wait); | ||
| 79 | if (!x->done) | ||
| 80 | return timeout; | ||
| 81 | } | ||
| 82 | x->done--; | ||
| 83 | return timeout ?: 1; | ||
| 84 | } | ||
| 85 | |||
| 86 | static inline long __sched | ||
| 87 | __wait_for_common(struct completion *x, | ||
| 88 | long (*action)(long), long timeout, int state) | ||
| 89 | { | ||
| 90 | might_sleep(); | ||
| 91 | |||
| 92 | spin_lock_irq(&x->wait.lock); | ||
| 93 | timeout = do_wait_for_common(x, action, timeout, state); | ||
| 94 | spin_unlock_irq(&x->wait.lock); | ||
| 95 | return timeout; | ||
| 96 | } | ||
| 97 | |||
| 98 | static long __sched | ||
| 99 | wait_for_common(struct completion *x, long timeout, int state) | ||
| 100 | { | ||
| 101 | return __wait_for_common(x, schedule_timeout, timeout, state); | ||
| 102 | } | ||
| 103 | |||
| 104 | static long __sched | ||
| 105 | wait_for_common_io(struct completion *x, long timeout, int state) | ||
| 106 | { | ||
| 107 | return __wait_for_common(x, io_schedule_timeout, timeout, state); | ||
| 108 | } | ||
| 109 | |||
| 110 | /** | ||
| 111 | * wait_for_completion: - waits for completion of a task | ||
| 112 | * @x: holds the state of this particular completion | ||
| 113 | * | ||
| 114 | * This waits to be signaled for completion of a specific task. It is NOT | ||
| 115 | * interruptible and there is no timeout. | ||
| 116 | * | ||
| 117 | * See also similar routines (i.e. wait_for_completion_timeout()) with timeout | ||
| 118 | * and interrupt capability. Also see complete(). | ||
| 119 | */ | ||
| 120 | void __sched wait_for_completion(struct completion *x) | ||
| 121 | { | ||
| 122 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | ||
| 123 | } | ||
| 124 | EXPORT_SYMBOL(wait_for_completion); | ||
| 125 | |||
| 126 | /** | ||
| 127 | * wait_for_completion_timeout: - waits for completion of a task (w/timeout) | ||
| 128 | * @x: holds the state of this particular completion | ||
| 129 | * @timeout: timeout value in jiffies | ||
| 130 | * | ||
| 131 | * This waits for either a completion of a specific task to be signaled or for a | ||
| 132 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
| 133 | * interruptible. | ||
| 134 | * | ||
| 135 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left | ||
| 136 | * till timeout) if completed. | ||
| 137 | */ | ||
| 138 | unsigned long __sched | ||
| 139 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | ||
| 140 | { | ||
| 141 | return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); | ||
| 142 | } | ||
| 143 | EXPORT_SYMBOL(wait_for_completion_timeout); | ||
| 144 | |||
| 145 | /** | ||
| 146 | * wait_for_completion_io: - waits for completion of a task | ||
| 147 | * @x: holds the state of this particular completion | ||
| 148 | * | ||
| 149 | * This waits to be signaled for completion of a specific task. It is NOT | ||
| 150 | * interruptible and there is no timeout. The caller is accounted as waiting | ||
| 151 | * for IO. | ||
| 152 | */ | ||
| 153 | void __sched wait_for_completion_io(struct completion *x) | ||
| 154 | { | ||
| 155 | wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | ||
| 156 | } | ||
| 157 | EXPORT_SYMBOL(wait_for_completion_io); | ||
| 158 | |||
| 159 | /** | ||
| 160 | * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) | ||
| 161 | * @x: holds the state of this particular completion | ||
| 162 | * @timeout: timeout value in jiffies | ||
| 163 | * | ||
| 164 | * This waits for either a completion of a specific task to be signaled or for a | ||
| 165 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
| 166 | * interruptible. The caller is accounted as waiting for IO. | ||
| 167 | * | ||
| 168 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left | ||
| 169 | * till timeout) if completed. | ||
| 170 | */ | ||
| 171 | unsigned long __sched | ||
| 172 | wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) | ||
| 173 | { | ||
| 174 | return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); | ||
| 175 | } | ||
| 176 | EXPORT_SYMBOL(wait_for_completion_io_timeout); | ||
| 177 | |||
| 178 | /** | ||
| 179 | * wait_for_completion_interruptible: - waits for completion of a task (w/intr) | ||
| 180 | * @x: holds the state of this particular completion | ||
| 181 | * | ||
| 182 | * This waits for completion of a specific task to be signaled. It is | ||
| 183 | * interruptible. | ||
| 184 | * | ||
| 185 | * Return: -ERESTARTSYS if interrupted, 0 if completed. | ||
| 186 | */ | ||
| 187 | int __sched wait_for_completion_interruptible(struct completion *x) | ||
| 188 | { | ||
| 189 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); | ||
| 190 | if (t == -ERESTARTSYS) | ||
| 191 | return t; | ||
| 192 | return 0; | ||
| 193 | } | ||
| 194 | EXPORT_SYMBOL(wait_for_completion_interruptible); | ||
| 195 | |||
| 196 | /** | ||
| 197 | * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) | ||
| 198 | * @x: holds the state of this particular completion | ||
| 199 | * @timeout: timeout value in jiffies | ||
| 200 | * | ||
| 201 | * This waits for either a completion of a specific task to be signaled or for a | ||
| 202 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | ||
| 203 | * | ||
| 204 | * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, | ||
| 205 | * or number of jiffies left till timeout) if completed. | ||
| 206 | */ | ||
| 207 | long __sched | ||
| 208 | wait_for_completion_interruptible_timeout(struct completion *x, | ||
| 209 | unsigned long timeout) | ||
| 210 | { | ||
| 211 | return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); | ||
| 212 | } | ||
| 213 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | ||
| 214 | |||
| 215 | /** | ||
| 216 | * wait_for_completion_killable: - waits for completion of a task (killable) | ||
| 217 | * @x: holds the state of this particular completion | ||
| 218 | * | ||
| 219 | * This waits to be signaled for completion of a specific task. It can be | ||
| 220 | * interrupted by a kill signal. | ||
| 221 | * | ||
| 222 | * Return: -ERESTARTSYS if interrupted, 0 if completed. | ||
| 223 | */ | ||
| 224 | int __sched wait_for_completion_killable(struct completion *x) | ||
| 225 | { | ||
| 226 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); | ||
| 227 | if (t == -ERESTARTSYS) | ||
| 228 | return t; | ||
| 229 | return 0; | ||
| 230 | } | ||
| 231 | EXPORT_SYMBOL(wait_for_completion_killable); | ||
| 232 | |||
| 233 | /** | ||
| 234 | * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) | ||
| 235 | * @x: holds the state of this particular completion | ||
| 236 | * @timeout: timeout value in jiffies | ||
| 237 | * | ||
| 238 | * This waits for either a completion of a specific task to be | ||
| 239 | * signaled or for a specified timeout to expire. It can be | ||
| 240 | * interrupted by a kill signal. The timeout is in jiffies. | ||
| 241 | * | ||
| 242 | * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, | ||
| 243 | * or number of jiffies left till timeout) if completed. | ||
| 244 | */ | ||
| 245 | long __sched | ||
| 246 | wait_for_completion_killable_timeout(struct completion *x, | ||
| 247 | unsigned long timeout) | ||
| 248 | { | ||
| 249 | return wait_for_common(x, timeout, TASK_KILLABLE); | ||
| 250 | } | ||
| 251 | EXPORT_SYMBOL(wait_for_completion_killable_timeout); | ||
| 252 | |||
| 253 | /** | ||
| 254 | * try_wait_for_completion - try to decrement a completion without blocking | ||
| 255 | * @x: completion structure | ||
| 256 | * | ||
| 257 | * Return: 0 if a decrement cannot be done without blocking | ||
| 258 | * 1 if a decrement succeeded. | ||
| 259 | * | ||
| 260 | * If a completion is being used as a counting completion, | ||
| 261 | * attempt to decrement the counter without blocking. This | ||
| 262 | * enables us to avoid waiting if the resource the completion | ||
| 263 | * is protecting is not available. | ||
| 264 | */ | ||
| 265 | bool try_wait_for_completion(struct completion *x) | ||
| 266 | { | ||
| 267 | unsigned long flags; | ||
| 268 | int ret = 1; | ||
| 269 | |||
| 270 | spin_lock_irqsave(&x->wait.lock, flags); | ||
| 271 | if (!x->done) | ||
| 272 | ret = 0; | ||
| 273 | else | ||
| 274 | x->done--; | ||
| 275 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
| 276 | return ret; | ||
| 277 | } | ||
| 278 | EXPORT_SYMBOL(try_wait_for_completion); | ||
| 279 | |||
| 280 | /** | ||
| 281 | * completion_done - Test to see if a completion has any waiters | ||
| 282 | * @x: completion structure | ||
| 283 | * | ||
| 284 | * Return: 0 if there are waiters (wait_for_completion() in progress) | ||
| 285 | * 1 if there are no waiters. | ||
| 286 | * | ||
| 287 | */ | ||
| 288 | bool completion_done(struct completion *x) | ||
| 289 | { | ||
| 290 | unsigned long flags; | ||
| 291 | int ret = 1; | ||
| 292 | |||
| 293 | spin_lock_irqsave(&x->wait.lock, flags); | ||
| 294 | if (!x->done) | ||
| 295 | ret = 0; | ||
| 296 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
| 297 | return ret; | ||
| 298 | } | ||
| 299 | EXPORT_SYMBOL(completion_done); | ||
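A minimal usage sketch of the API gathered in this new file, assuming a trivial kthread producer; demo_worker, demo_done and demo_wait are invented names for illustration, not part of the patch:

	#include <linux/completion.h>
	#include <linux/kthread.h>
	#include <linux/delay.h>
	#include <linux/jiffies.h>
	#include <linux/errno.h>

	static DECLARE_COMPLETION(demo_done);		/* hypothetical completion */

	static int demo_worker(void *unused)
	{
		msleep(10);				/* stand-in for real work */
		complete(&demo_done);			/* wake exactly one waiter */
		return 0;
	}

	static int demo_wait(void)
	{
		kthread_run(demo_worker, NULL, "demo_worker");

		/* Block until demo_worker() signals, or give up after one second. */
		if (!wait_for_completion_timeout(&demo_done, HZ))
			return -ETIMEDOUT;		/* zero return means timeout */
		return 0;
	}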
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7c32cb7bfeb..1deccd78be98 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -513,12 +513,11 @@ static inline void init_hrtick(void) | |||
| 513 | * might also involve a cross-CPU call to trigger the scheduler on | 513 | * might also involve a cross-CPU call to trigger the scheduler on |
| 514 | * the target CPU. | 514 | * the target CPU. |
| 515 | */ | 515 | */ |
| 516 | #ifdef CONFIG_SMP | ||
| 517 | void resched_task(struct task_struct *p) | 516 | void resched_task(struct task_struct *p) |
| 518 | { | 517 | { |
| 519 | int cpu; | 518 | int cpu; |
| 520 | 519 | ||
| 521 | assert_raw_spin_locked(&task_rq(p)->lock); | 520 | lockdep_assert_held(&task_rq(p)->lock); |
| 522 | 521 | ||
| 523 | if (test_tsk_need_resched(p)) | 522 | if (test_tsk_need_resched(p)) |
| 524 | return; | 523 | return; |
| @@ -526,8 +525,10 @@ void resched_task(struct task_struct *p) | |||
| 526 | set_tsk_need_resched(p); | 525 | set_tsk_need_resched(p); |
| 527 | 526 | ||
| 528 | cpu = task_cpu(p); | 527 | cpu = task_cpu(p); |
| 529 | if (cpu == smp_processor_id()) | 528 | if (cpu == smp_processor_id()) { |
| 529 | set_preempt_need_resched(); | ||
| 530 | return; | 530 | return; |
| 531 | } | ||
| 531 | 532 | ||
| 532 | /* NEED_RESCHED must be visible before we test polling */ | 533 | /* NEED_RESCHED must be visible before we test polling */ |
| 533 | smp_mb(); | 534 | smp_mb(); |
| @@ -546,6 +547,7 @@ void resched_cpu(int cpu) | |||
| 546 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 547 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
| 547 | } | 548 | } |
| 548 | 549 | ||
| 550 | #ifdef CONFIG_SMP | ||
| 549 | #ifdef CONFIG_NO_HZ_COMMON | 551 | #ifdef CONFIG_NO_HZ_COMMON |
| 550 | /* | 552 | /* |
| 551 | * In the semi idle case, use the nearest busy cpu for migrating timers | 553 | * In the semi idle case, use the nearest busy cpu for migrating timers |
| @@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq) | |||
| 693 | } | 695 | } |
| 694 | } | 696 | } |
| 695 | 697 | ||
| 696 | #else /* !CONFIG_SMP */ | ||
| 697 | void resched_task(struct task_struct *p) | ||
| 698 | { | ||
| 699 | assert_raw_spin_locked(&task_rq(p)->lock); | ||
| 700 | set_tsk_need_resched(p); | ||
| 701 | } | ||
| 702 | #endif /* CONFIG_SMP */ | 698 | #endif /* CONFIG_SMP */ |
| 703 | 699 | ||
| 704 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ | 700 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ |
| @@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p) | |||
| 767 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | 763 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
| 768 | { | 764 | { |
| 769 | update_rq_clock(rq); | 765 | update_rq_clock(rq); |
| 770 | sched_info_queued(p); | 766 | sched_info_queued(rq, p); |
| 771 | p->sched_class->enqueue_task(rq, p, flags); | 767 | p->sched_class->enqueue_task(rq, p, flags); |
| 772 | } | 768 | } |
| 773 | 769 | ||
| 774 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | 770 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) |
| 775 | { | 771 | { |
| 776 | update_rq_clock(rq); | 772 | update_rq_clock(rq); |
| 777 | sched_info_dequeued(p); | 773 | sched_info_dequeued(rq, p); |
| 778 | p->sched_class->dequeue_task(rq, p, flags); | 774 | p->sched_class->dequeue_task(rq, p, flags); |
| 779 | } | 775 | } |
| 780 | 776 | ||
| @@ -933,6 +929,8 @@ static int effective_prio(struct task_struct *p) | |||
| 933 | /** | 929 | /** |
| 934 | * task_curr - is this task currently executing on a CPU? | 930 | * task_curr - is this task currently executing on a CPU? |
| 935 | * @p: the task in question. | 931 | * @p: the task in question. |
| 932 | * | ||
| 933 | * Return: 1 if the task is currently executing. 0 otherwise. | ||
| 936 | */ | 934 | */ |
| 937 | inline int task_curr(const struct task_struct *p) | 935 | inline int task_curr(const struct task_struct *p) |
| 938 | { | 936 | { |
| @@ -976,13 +974,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
| 976 | rq->skip_clock_update = 1; | 974 | rq->skip_clock_update = 1; |
| 977 | } | 975 | } |
| 978 | 976 | ||
| 979 | static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); | ||
| 980 | |||
| 981 | void register_task_migration_notifier(struct notifier_block *n) | ||
| 982 | { | ||
| 983 | atomic_notifier_chain_register(&task_migration_notifier, n); | ||
| 984 | } | ||
| 985 | |||
| 986 | #ifdef CONFIG_SMP | 977 | #ifdef CONFIG_SMP |
| 987 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 978 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
| 988 | { | 979 | { |
| @@ -992,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 992 | * ttwu() will sort out the placement. | 983 | * ttwu() will sort out the placement. |
| 993 | */ | 984 | */ |
| 994 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && | 985 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && |
| 995 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); | 986 | !(task_preempt_count(p) & PREEMPT_ACTIVE)); |
| 996 | 987 | ||
| 997 | #ifdef CONFIG_LOCKDEP | 988 | #ifdef CONFIG_LOCKDEP |
| 998 | /* | 989 | /* |
| @@ -1013,21 +1004,114 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 1013 | trace_sched_migrate_task(p, new_cpu); | 1004 | trace_sched_migrate_task(p, new_cpu); |
| 1014 | 1005 | ||
| 1015 | if (task_cpu(p) != new_cpu) { | 1006 | if (task_cpu(p) != new_cpu) { |
| 1016 | struct task_migration_notifier tmn; | ||
| 1017 | |||
| 1018 | if (p->sched_class->migrate_task_rq) | 1007 | if (p->sched_class->migrate_task_rq) |
| 1019 | p->sched_class->migrate_task_rq(p, new_cpu); | 1008 | p->sched_class->migrate_task_rq(p, new_cpu); |
| 1020 | p->se.nr_migrations++; | 1009 | p->se.nr_migrations++; |
| 1021 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); | 1010 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); |
| 1011 | } | ||
| 1012 | |||
| 1013 | __set_task_cpu(p, new_cpu); | ||
| 1014 | } | ||
| 1015 | |||
| 1016 | static void __migrate_swap_task(struct task_struct *p, int cpu) | ||
| 1017 | { | ||
| 1018 | if (p->on_rq) { | ||
| 1019 | struct rq *src_rq, *dst_rq; | ||
| 1022 | 1020 | ||
| 1023 | tmn.task = p; | 1021 | src_rq = task_rq(p); |
| 1024 | tmn.from_cpu = task_cpu(p); | 1022 | dst_rq = cpu_rq(cpu); |
| 1025 | tmn.to_cpu = new_cpu; | ||
| 1026 | 1023 | ||
| 1027 | atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); | 1024 | deactivate_task(src_rq, p, 0); |
| 1025 | set_task_cpu(p, cpu); | ||
| 1026 | activate_task(dst_rq, p, 0); | ||
| 1027 | check_preempt_curr(dst_rq, p, 0); | ||
| 1028 | } else { | ||
| 1029 | /* | ||
| 1030 | * Task isn't running anymore; make it appear like we migrated | ||
| 1031 | * it before it went to sleep. This means on wakeup we make the | ||
| 1032 | * previous cpu our target instead of where it really is. | ||
| 1033 | */ | ||
| 1034 | p->wake_cpu = cpu; | ||
| 1028 | } | 1035 | } |
| 1036 | } | ||
| 1029 | 1037 | ||
| 1030 | __set_task_cpu(p, new_cpu); | 1038 | struct migration_swap_arg { |
| 1039 | struct task_struct *src_task, *dst_task; | ||
| 1040 | int src_cpu, dst_cpu; | ||
| 1041 | }; | ||
| 1042 | |||
| 1043 | static int migrate_swap_stop(void *data) | ||
| 1044 | { | ||
| 1045 | struct migration_swap_arg *arg = data; | ||
| 1046 | struct rq *src_rq, *dst_rq; | ||
| 1047 | int ret = -EAGAIN; | ||
| 1048 | |||
| 1049 | src_rq = cpu_rq(arg->src_cpu); | ||
| 1050 | dst_rq = cpu_rq(arg->dst_cpu); | ||
| 1051 | |||
| 1052 | double_raw_lock(&arg->src_task->pi_lock, | ||
| 1053 | &arg->dst_task->pi_lock); | ||
| 1054 | double_rq_lock(src_rq, dst_rq); | ||
| 1055 | if (task_cpu(arg->dst_task) != arg->dst_cpu) | ||
| 1056 | goto unlock; | ||
| 1057 | |||
| 1058 | if (task_cpu(arg->src_task) != arg->src_cpu) | ||
| 1059 | goto unlock; | ||
| 1060 | |||
| 1061 | if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) | ||
| 1062 | goto unlock; | ||
| 1063 | |||
| 1064 | if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) | ||
| 1065 | goto unlock; | ||
| 1066 | |||
| 1067 | __migrate_swap_task(arg->src_task, arg->dst_cpu); | ||
| 1068 | __migrate_swap_task(arg->dst_task, arg->src_cpu); | ||
| 1069 | |||
| 1070 | ret = 0; | ||
| 1071 | |||
| 1072 | unlock: | ||
| 1073 | double_rq_unlock(src_rq, dst_rq); | ||
| 1074 | raw_spin_unlock(&arg->dst_task->pi_lock); | ||
| 1075 | raw_spin_unlock(&arg->src_task->pi_lock); | ||
| 1076 | |||
| 1077 | return ret; | ||
| 1078 | } | ||
| 1079 | |||
| 1080 | /* | ||
| 1081 | * Cross migrate two tasks | ||
| 1082 | */ | ||
| 1083 | int migrate_swap(struct task_struct *cur, struct task_struct *p) | ||
| 1084 | { | ||
| 1085 | struct migration_swap_arg arg; | ||
| 1086 | int ret = -EINVAL; | ||
| 1087 | |||
| 1088 | arg = (struct migration_swap_arg){ | ||
| 1089 | .src_task = cur, | ||
| 1090 | .src_cpu = task_cpu(cur), | ||
| 1091 | .dst_task = p, | ||
| 1092 | .dst_cpu = task_cpu(p), | ||
| 1093 | }; | ||
| 1094 | |||
| 1095 | if (arg.src_cpu == arg.dst_cpu) | ||
| 1096 | goto out; | ||
| 1097 | |||
| 1098 | /* | ||
| 1099 | * These three tests are all lockless; this is OK since all of them | ||
| 1100 | * will be re-checked with proper locks held further down the line. | ||
| 1101 | */ | ||
| 1102 | if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) | ||
| 1103 | goto out; | ||
| 1104 | |||
| 1105 | if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) | ||
| 1106 | goto out; | ||
| 1107 | |||
| 1108 | if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) | ||
| 1109 | goto out; | ||
| 1110 | |||
| 1111 | ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); | ||
| 1112 | |||
| 1113 | out: | ||
| 1114 | return ret; | ||
| 1031 | } | 1115 | } |
| 1032 | 1116 | ||
| 1033 | struct migration_arg { | 1117 | struct migration_arg { |
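
The new migrate_swap() above appears to be aimed at the NUMA balancing work elsewhere in this series; as a hedged sketch of the calling convention only (the retry policy is an assumption, not taken from this diff):

	/* Try to trade CPUs with task p.  Both error codes (-EINVAL,
	 * -EAGAIN) just mean "placement changed under us, retry later". */
	if (migrate_swap(current, p))
		return;
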
| @@ -1249,9 +1333,9 @@ out: | |||
| 1249 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. | 1333 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. |
| 1250 | */ | 1334 | */ |
| 1251 | static inline | 1335 | static inline |
| 1252 | int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) | 1336 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) |
| 1253 | { | 1337 | { |
| 1254 | int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); | 1338 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); |
| 1255 | 1339 | ||
| 1256 | /* | 1340 | /* |
| 1257 | * In order not to call set_task_cpu() on a blocking task we need | 1341 | * In order not to call set_task_cpu() on a blocking task we need |
| @@ -1343,12 +1427,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | |||
| 1343 | 1427 | ||
| 1344 | if (rq->idle_stamp) { | 1428 | if (rq->idle_stamp) { |
| 1345 | u64 delta = rq_clock(rq) - rq->idle_stamp; | 1429 | u64 delta = rq_clock(rq) - rq->idle_stamp; |
| 1346 | u64 max = 2*sysctl_sched_migration_cost; | 1430 | u64 max = 2*rq->max_idle_balance_cost; |
| 1347 | 1431 | ||
| 1348 | if (delta > max) | 1432 | update_avg(&rq->avg_idle, delta); |
| 1433 | |||
| 1434 | if (rq->avg_idle > max) | ||
| 1349 | rq->avg_idle = max; | 1435 | rq->avg_idle = max; |
| 1350 | else | 1436 | |
| 1351 | update_avg(&rq->avg_idle, delta); | ||
| 1352 | rq->idle_stamp = 0; | 1437 | rq->idle_stamp = 0; |
| 1353 | } | 1438 | } |
| 1354 | #endif | 1439 | #endif |
| @@ -1409,6 +1494,14 @@ static void sched_ttwu_pending(void) | |||
| 1409 | 1494 | ||
| 1410 | void scheduler_ipi(void) | 1495 | void scheduler_ipi(void) |
| 1411 | { | 1496 | { |
| 1497 | /* | ||
| 1498 | * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting | ||
| 1499 | * TIF_NEED_RESCHED remotely (for the first time) will also send | ||
| 1500 | * this IPI. | ||
| 1501 | */ | ||
| 1502 | if (tif_need_resched()) | ||
| 1503 | set_preempt_need_resched(); | ||
| 1504 | |||
| 1412 | if (llist_empty(&this_rq()->wake_list) | 1505 | if (llist_empty(&this_rq()->wake_list) |
| 1413 | && !tick_nohz_full_cpu(smp_processor_id()) | 1506 | && !tick_nohz_full_cpu(smp_processor_id()) |
| 1414 | && !got_nohz_idle_kick()) | 1507 | && !got_nohz_idle_kick()) |
| @@ -1482,7 +1575,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) | |||
| 1482 | * the simpler "current->state = TASK_RUNNING" to mark yourself | 1575 | * the simpler "current->state = TASK_RUNNING" to mark yourself |
| 1483 | * runnable without the overhead of this. | 1576 | * runnable without the overhead of this. |
| 1484 | * | 1577 | * |
| 1485 | * Returns %true if @p was woken up, %false if it was already running | 1578 | * Return: %true if @p was woken up, %false if it was already running |
| 1486 | * or @state didn't match @p's state. | 1579 | * or @state didn't match @p's state. |
| 1487 | */ | 1580 | */ |
| 1488 | static int | 1581 | static int |
| @@ -1491,7 +1584,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
| 1491 | unsigned long flags; | 1584 | unsigned long flags; |
| 1492 | int cpu, success = 0; | 1585 | int cpu, success = 0; |
| 1493 | 1586 | ||
| 1494 | smp_wmb(); | 1587 | /* |
| 1588 | * If we are going to wake up a thread waiting for CONDITION we | ||
| 1589 | * need to ensure that CONDITION=1 done by the caller can not be | ||
| 1590 | * reordered with p->state check below. This pairs with mb() in | ||
| 1591 | * set_current_state() the waiting thread does. | ||
| 1592 | */ | ||
| 1593 | smp_mb__before_spinlock(); | ||
| 1495 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 1594 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
| 1496 | if (!(p->state & state)) | 1595 | if (!(p->state & state)) |
| 1497 | goto out; | 1596 | goto out; |
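
The barrier comment added above pairs with the standard sleep/wakeup idiom; a minimal sketch of that idiom for reference, where CONDITION and task are placeholders for whatever the real caller uses:

	/* Waiter side -- the mb() in set_current_state() orders the store
	 * of ->state against the CONDITION test: */
	set_current_state(TASK_UNINTERRUPTIBLE);
	if (!CONDITION)
		schedule();
	__set_current_state(TASK_RUNNING);

	/* Waker side -- smp_mb__before_spinlock() orders CONDITION = 1
	 * against the ->state check inside try_to_wake_up(): */
	CONDITION = 1;
	wake_up_process(task);
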
| @@ -1520,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
| 1520 | if (p->sched_class->task_waking) | 1619 | if (p->sched_class->task_waking) |
| 1521 | p->sched_class->task_waking(p); | 1620 | p->sched_class->task_waking(p); |
| 1522 | 1621 | ||
| 1523 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); | 1622 | cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); |
| 1524 | if (task_cpu(p) != cpu) { | 1623 | if (task_cpu(p) != cpu) { |
| 1525 | wake_flags |= WF_MIGRATED; | 1624 | wake_flags |= WF_MIGRATED; |
| 1526 | set_task_cpu(p, cpu); | 1625 | set_task_cpu(p, cpu); |
| @@ -1577,8 +1676,9 @@ out: | |||
| 1577 | * @p: The process to be woken up. | 1676 | * @p: The process to be woken up. |
| 1578 | * | 1677 | * |
| 1579 | * Attempt to wake up the nominated process and move it to the set of runnable | 1678 | * Attempt to wake up the nominated process and move it to the set of runnable |
| 1580 | * processes. Returns 1 if the process was woken up, 0 if it was already | 1679 | * processes. |
| 1581 | * running. | 1680 | * |
| 1681 | * Return: 1 if the process was woken up, 0 if it was already running. | ||
| 1582 | * | 1682 | * |
| 1583 | * It may be assumed that this function implies a write memory barrier before | 1683 | * It may be assumed that this function implies a write memory barrier before |
| 1584 | * changing the task state if and only if any tasks are woken up. | 1684 | * changing the task state if and only if any tasks are woken up. |
| @@ -1601,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) | |||
| 1601 | * | 1701 | * |
| 1602 | * __sched_fork() is basic setup used by init_idle() too: | 1702 | * __sched_fork() is basic setup used by init_idle() too: |
| 1603 | */ | 1703 | */ |
| 1604 | static void __sched_fork(struct task_struct *p) | 1704 | static void __sched_fork(unsigned long clone_flags, struct task_struct *p) |
| 1605 | { | 1705 | { |
| 1606 | p->on_rq = 0; | 1706 | p->on_rq = 0; |
| 1607 | 1707 | ||
| @@ -1625,16 +1725,24 @@ static void __sched_fork(struct task_struct *p) | |||
| 1625 | 1725 | ||
| 1626 | #ifdef CONFIG_NUMA_BALANCING | 1726 | #ifdef CONFIG_NUMA_BALANCING |
| 1627 | if (p->mm && atomic_read(&p->mm->mm_users) == 1) { | 1727 | if (p->mm && atomic_read(&p->mm->mm_users) == 1) { |
| 1628 | p->mm->numa_next_scan = jiffies; | 1728 | p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); |
| 1629 | p->mm->numa_next_reset = jiffies; | ||
| 1630 | p->mm->numa_scan_seq = 0; | 1729 | p->mm->numa_scan_seq = 0; |
| 1631 | } | 1730 | } |
| 1632 | 1731 | ||
| 1732 | if (clone_flags & CLONE_VM) | ||
| 1733 | p->numa_preferred_nid = current->numa_preferred_nid; | ||
| 1734 | else | ||
| 1735 | p->numa_preferred_nid = -1; | ||
| 1736 | |||
| 1633 | p->node_stamp = 0ULL; | 1737 | p->node_stamp = 0ULL; |
| 1634 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; | 1738 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; |
| 1635 | p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; | ||
| 1636 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; | 1739 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; |
| 1637 | p->numa_work.next = &p->numa_work; | 1740 | p->numa_work.next = &p->numa_work; |
| 1741 | p->numa_faults = NULL; | ||
| 1742 | p->numa_faults_buffer = NULL; | ||
| 1743 | |||
| 1744 | INIT_LIST_HEAD(&p->numa_entry); | ||
| 1745 | p->numa_group = NULL; | ||
| 1638 | #endif /* CONFIG_NUMA_BALANCING */ | 1746 | #endif /* CONFIG_NUMA_BALANCING */ |
| 1639 | } | 1747 | } |
| 1640 | 1748 | ||
| @@ -1660,12 +1768,12 @@ void set_numabalancing_state(bool enabled) | |||
| 1660 | /* | 1768 | /* |
| 1661 | * fork()/clone()-time setup: | 1769 | * fork()/clone()-time setup: |
| 1662 | */ | 1770 | */ |
| 1663 | void sched_fork(struct task_struct *p) | 1771 | void sched_fork(unsigned long clone_flags, struct task_struct *p) |
| 1664 | { | 1772 | { |
| 1665 | unsigned long flags; | 1773 | unsigned long flags; |
| 1666 | int cpu = get_cpu(); | 1774 | int cpu = get_cpu(); |
| 1667 | 1775 | ||
| 1668 | __sched_fork(p); | 1776 | __sched_fork(clone_flags, p); |
| 1669 | /* | 1777 | /* |
| 1670 | * We mark the process as running here. This guarantees that | 1778 | * We mark the process as running here. This guarantees that |
| 1671 | * nobody will actually run it, and a signal or other external | 1779 | * nobody will actually run it, and a signal or other external |
| @@ -1723,10 +1831,7 @@ void sched_fork(struct task_struct *p) | |||
| 1723 | #if defined(CONFIG_SMP) | 1831 | #if defined(CONFIG_SMP) |
| 1724 | p->on_cpu = 0; | 1832 | p->on_cpu = 0; |
| 1725 | #endif | 1833 | #endif |
| 1726 | #ifdef CONFIG_PREEMPT_COUNT | 1834 | init_task_preempt_count(p); |
| 1727 | /* Want to start with kernel preemption disabled. */ | ||
| 1728 | task_thread_info(p)->preempt_count = 1; | ||
| 1729 | #endif | ||
| 1730 | #ifdef CONFIG_SMP | 1835 | #ifdef CONFIG_SMP |
| 1731 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | 1836 | plist_node_init(&p->pushable_tasks, MAX_PRIO); |
| 1732 | #endif | 1837 | #endif |
| @@ -1753,7 +1858,7 @@ void wake_up_new_task(struct task_struct *p) | |||
| 1753 | * - cpus_allowed can change in the fork path | 1858 | * - cpus_allowed can change in the fork path |
| 1754 | * - any previously selected cpu might disappear through hotplug | 1859 | * - any previously selected cpu might disappear through hotplug |
| 1755 | */ | 1860 | */ |
| 1756 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); | 1861 | set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); |
| 1757 | #endif | 1862 | #endif |
| 1758 | 1863 | ||
| 1759 | /* Initialize new task's runnable average */ | 1864 | /* Initialize new task's runnable average */ |
| @@ -1844,7 +1949,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, | |||
| 1844 | struct task_struct *next) | 1949 | struct task_struct *next) |
| 1845 | { | 1950 | { |
| 1846 | trace_sched_switch(prev, next); | 1951 | trace_sched_switch(prev, next); |
| 1847 | sched_info_switch(prev, next); | 1952 | sched_info_switch(rq, prev, next); |
| 1848 | perf_event_task_sched_out(prev, next); | 1953 | perf_event_task_sched_out(prev, next); |
| 1849 | fire_sched_out_preempt_notifiers(prev, next); | 1954 | fire_sched_out_preempt_notifiers(prev, next); |
| 1850 | prepare_lock_switch(rq, next); | 1955 | prepare_lock_switch(rq, next); |
| @@ -1896,6 +2001,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 1896 | if (mm) | 2001 | if (mm) |
| 1897 | mmdrop(mm); | 2002 | mmdrop(mm); |
| 1898 | if (unlikely(prev_state == TASK_DEAD)) { | 2003 | if (unlikely(prev_state == TASK_DEAD)) { |
| 2004 | task_numa_free(prev); | ||
| 2005 | |||
| 1899 | /* | 2006 | /* |
| 1900 | * Remove function-return probe instances associated with this | 2007 | * Remove function-return probe instances associated with this |
| 1901 | * task and put them back on the free list. | 2008 | * task and put them back on the free list. |
| @@ -2079,7 +2186,7 @@ void sched_exec(void) | |||
| 2079 | int dest_cpu; | 2186 | int dest_cpu; |
| 2080 | 2187 | ||
| 2081 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 2188 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
| 2082 | dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); | 2189 | dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); |
| 2083 | if (dest_cpu == smp_processor_id()) | 2190 | if (dest_cpu == smp_processor_id()) |
| 2084 | goto unlock; | 2191 | goto unlock; |
| 2085 | 2192 | ||
| @@ -2191,6 +2298,8 @@ void scheduler_tick(void) | |||
| 2191 | * This makes sure that uptime, CFS vruntime, load | 2298 | * This makes sure that uptime, CFS vruntime, load |
| 2192 | * balancing, etc... continue to move forward, even | 2299 | * balancing, etc... continue to move forward, even |
| 2193 | * with a very low granularity. | 2300 | * with a very low granularity. |
| 2301 | * | ||
| 2302 | * Return: Maximum deferment in nanoseconds. | ||
| 2194 | */ | 2303 | */ |
| 2195 | u64 scheduler_tick_max_deferment(void) | 2304 | u64 scheduler_tick_max_deferment(void) |
| 2196 | { | 2305 | { |
| @@ -2219,7 +2328,7 @@ notrace unsigned long get_parent_ip(unsigned long addr) | |||
| 2219 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ | 2328 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
| 2220 | defined(CONFIG_PREEMPT_TRACER)) | 2329 | defined(CONFIG_PREEMPT_TRACER)) |
| 2221 | 2330 | ||
| 2222 | void __kprobes add_preempt_count(int val) | 2331 | void __kprobes preempt_count_add(int val) |
| 2223 | { | 2332 | { |
| 2224 | #ifdef CONFIG_DEBUG_PREEMPT | 2333 | #ifdef CONFIG_DEBUG_PREEMPT |
| 2225 | /* | 2334 | /* |
| @@ -2228,7 +2337,7 @@ void __kprobes add_preempt_count(int val) | |||
| 2228 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) | 2337 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) |
| 2229 | return; | 2338 | return; |
| 2230 | #endif | 2339 | #endif |
| 2231 | preempt_count() += val; | 2340 | __preempt_count_add(val); |
| 2232 | #ifdef CONFIG_DEBUG_PREEMPT | 2341 | #ifdef CONFIG_DEBUG_PREEMPT |
| 2233 | /* | 2342 | /* |
| 2234 | * Spinlock count overflowing soon? | 2343 | * Spinlock count overflowing soon? |
| @@ -2239,9 +2348,9 @@ void __kprobes add_preempt_count(int val) | |||
| 2239 | if (preempt_count() == val) | 2348 | if (preempt_count() == val) |
| 2240 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 2349 | trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
| 2241 | } | 2350 | } |
| 2242 | EXPORT_SYMBOL(add_preempt_count); | 2351 | EXPORT_SYMBOL(preempt_count_add); |
| 2243 | 2352 | ||
| 2244 | void __kprobes sub_preempt_count(int val) | 2353 | void __kprobes preempt_count_sub(int val) |
| 2245 | { | 2354 | { |
| 2246 | #ifdef CONFIG_DEBUG_PREEMPT | 2355 | #ifdef CONFIG_DEBUG_PREEMPT |
| 2247 | /* | 2356 | /* |
| @@ -2259,9 +2368,9 @@ void __kprobes sub_preempt_count(int val) | |||
| 2259 | 2368 | ||
| 2260 | if (preempt_count() == val) | 2369 | if (preempt_count() == val) |
| 2261 | trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); | 2370 | trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); |
| 2262 | preempt_count() -= val; | 2371 | __preempt_count_sub(val); |
| 2263 | } | 2372 | } |
| 2264 | EXPORT_SYMBOL(sub_preempt_count); | 2373 | EXPORT_SYMBOL(preempt_count_sub); |
| 2265 | 2374 | ||
| 2266 | #endif | 2375 | #endif |
| 2267 | 2376 | ||
| @@ -2394,6 +2503,12 @@ need_resched: | |||
| 2394 | if (sched_feat(HRTICK)) | 2503 | if (sched_feat(HRTICK)) |
| 2395 | hrtick_clear(rq); | 2504 | hrtick_clear(rq); |
| 2396 | 2505 | ||
| 2506 | /* | ||
| 2507 | * Make sure that signal_pending_state()->signal_pending() below | ||
| 2508 | * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) | ||
| 2509 | * done by the caller to avoid the race with signal_wake_up(). | ||
| 2510 | */ | ||
| 2511 | smp_mb__before_spinlock(); | ||
| 2397 | raw_spin_lock_irq(&rq->lock); | 2512 | raw_spin_lock_irq(&rq->lock); |
| 2398 | 2513 | ||
| 2399 | switch_count = &prev->nivcsw; | 2514 | switch_count = &prev->nivcsw; |
| @@ -2428,6 +2543,7 @@ need_resched: | |||
| 2428 | put_prev_task(rq, prev); | 2543 | put_prev_task(rq, prev); |
| 2429 | next = pick_next_task(rq); | 2544 | next = pick_next_task(rq); |
| 2430 | clear_tsk_need_resched(prev); | 2545 | clear_tsk_need_resched(prev); |
| 2546 | clear_preempt_need_resched(); | ||
| 2431 | rq->skip_clock_update = 0; | 2547 | rq->skip_clock_update = 0; |
| 2432 | 2548 | ||
| 2433 | if (likely(prev != next)) { | 2549 | if (likely(prev != next)) { |
| @@ -2510,19 +2626,17 @@ void __sched schedule_preempt_disabled(void) | |||
| 2510 | */ | 2626 | */ |
| 2511 | asmlinkage void __sched notrace preempt_schedule(void) | 2627 | asmlinkage void __sched notrace preempt_schedule(void) |
| 2512 | { | 2628 | { |
| 2513 | struct thread_info *ti = current_thread_info(); | ||
| 2514 | |||
| 2515 | /* | 2629 | /* |
| 2516 | * If there is a non-zero preempt_count or interrupts are disabled, | 2630 | * If there is a non-zero preempt_count or interrupts are disabled, |
| 2517 | * we do not want to preempt the current task. Just return.. | 2631 | * we do not want to preempt the current task. Just return.. |
| 2518 | */ | 2632 | */ |
| 2519 | if (likely(ti->preempt_count || irqs_disabled())) | 2633 | if (likely(!preemptible())) |
| 2520 | return; | 2634 | return; |
| 2521 | 2635 | ||
| 2522 | do { | 2636 | do { |
| 2523 | add_preempt_count_notrace(PREEMPT_ACTIVE); | 2637 | __preempt_count_add(PREEMPT_ACTIVE); |
| 2524 | __schedule(); | 2638 | __schedule(); |
| 2525 | sub_preempt_count_notrace(PREEMPT_ACTIVE); | 2639 | __preempt_count_sub(PREEMPT_ACTIVE); |
| 2526 | 2640 | ||
| 2527 | /* | 2641 | /* |
| 2528 | * Check again in case we missed a preemption opportunity | 2642 | * Check again in case we missed a preemption opportunity |
| @@ -2541,20 +2655,19 @@ EXPORT_SYMBOL(preempt_schedule); | |||
| 2541 | */ | 2655 | */ |
| 2542 | asmlinkage void __sched preempt_schedule_irq(void) | 2656 | asmlinkage void __sched preempt_schedule_irq(void) |
| 2543 | { | 2657 | { |
| 2544 | struct thread_info *ti = current_thread_info(); | ||
| 2545 | enum ctx_state prev_state; | 2658 | enum ctx_state prev_state; |
| 2546 | 2659 | ||
| 2547 | /* Catch callers which need to be fixed */ | 2660 | /* Catch callers which need to be fixed */ |
| 2548 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 2661 | BUG_ON(preempt_count() || !irqs_disabled()); |
| 2549 | 2662 | ||
| 2550 | prev_state = exception_enter(); | 2663 | prev_state = exception_enter(); |
| 2551 | 2664 | ||
| 2552 | do { | 2665 | do { |
| 2553 | add_preempt_count(PREEMPT_ACTIVE); | 2666 | __preempt_count_add(PREEMPT_ACTIVE); |
| 2554 | local_irq_enable(); | 2667 | local_irq_enable(); |
| 2555 | __schedule(); | 2668 | __schedule(); |
| 2556 | local_irq_disable(); | 2669 | local_irq_disable(); |
| 2557 | sub_preempt_count(PREEMPT_ACTIVE); | 2670 | __preempt_count_sub(PREEMPT_ACTIVE); |
| 2558 | 2671 | ||
| 2559 | /* | 2672 | /* |
| 2560 | * Check again in case we missed a preemption opportunity | 2673 | * Check again in case we missed a preemption opportunity |
| @@ -2575,393 +2688,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, | |||
| 2575 | } | 2688 | } |
| 2576 | EXPORT_SYMBOL(default_wake_function); | 2689 | EXPORT_SYMBOL(default_wake_function); |
| 2577 | 2690 | ||
| 2578 | /* | ||
| 2579 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just | ||
| 2580 | * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve | ||
| 2581 | * number) then we wake all the non-exclusive tasks and one exclusive task. | ||
| 2582 | * | ||
| 2583 | * There are circumstances in which we can try to wake a task which has already | ||
| 2584 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns | ||
| 2585 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | ||
| 2586 | */ | ||
| 2587 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | ||
| 2588 | int nr_exclusive, int wake_flags, void *key) | ||
| 2589 | { | ||
| 2590 | wait_queue_t *curr, *next; | ||
| 2591 | |||
| 2592 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { | ||
| 2593 | unsigned flags = curr->flags; | ||
| 2594 | |||
| 2595 | if (curr->func(curr, mode, wake_flags, key) && | ||
| 2596 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | ||
| 2597 | break; | ||
| 2598 | } | ||
| 2599 | } | ||
| 2600 | |||
| 2601 | /** | ||
| 2602 | * __wake_up - wake up threads blocked on a waitqueue. | ||
| 2603 | * @q: the waitqueue | ||
| 2604 | * @mode: which threads | ||
| 2605 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | ||
| 2606 | * @key: is directly passed to the wakeup function | ||
| 2607 | * | ||
| 2608 | * It may be assumed that this function implies a write memory barrier before | ||
| 2609 | * changing the task state if and only if any tasks are woken up. | ||
| 2610 | */ | ||
| 2611 | void __wake_up(wait_queue_head_t *q, unsigned int mode, | ||
| 2612 | int nr_exclusive, void *key) | ||
| 2613 | { | ||
| 2614 | unsigned long flags; | ||
| 2615 | |||
| 2616 | spin_lock_irqsave(&q->lock, flags); | ||
| 2617 | __wake_up_common(q, mode, nr_exclusive, 0, key); | ||
| 2618 | spin_unlock_irqrestore(&q->lock, flags); | ||
| 2619 | } | ||
| 2620 | EXPORT_SYMBOL(__wake_up); | ||
| 2621 | |||
| 2622 | /* | ||
| 2623 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. | ||
| 2624 | */ | ||
| 2625 | void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) | ||
| 2626 | { | ||
| 2627 | __wake_up_common(q, mode, nr, 0, NULL); | ||
| 2628 | } | ||
| 2629 | EXPORT_SYMBOL_GPL(__wake_up_locked); | ||
| 2630 | |||
| 2631 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) | ||
| 2632 | { | ||
| 2633 | __wake_up_common(q, mode, 1, 0, key); | ||
| 2634 | } | ||
| 2635 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); | ||
| 2636 | |||
| 2637 | /** | ||
| 2638 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. | ||
| 2639 | * @q: the waitqueue | ||
| 2640 | * @mode: which threads | ||
| 2641 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | ||
| 2642 | * @key: opaque value to be passed to wakeup targets | ||
| 2643 | * | ||
| 2644 | * The sync wakeup differs that the waker knows that it will schedule | ||
| 2645 | * away soon, so while the target thread will be woken up, it will not | ||
| 2646 | * be migrated to another CPU - ie. the two threads are 'synchronized' | ||
| 2647 | * with each other. This can prevent needless bouncing between CPUs. | ||
| 2648 | * | ||
| 2649 | * On UP it can prevent extra preemption. | ||
| 2650 | * | ||
| 2651 | * It may be assumed that this function implies a write memory barrier before | ||
| 2652 | * changing the task state if and only if any tasks are woken up. | ||
| 2653 | */ | ||
| 2654 | void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | ||
| 2655 | int nr_exclusive, void *key) | ||
| 2656 | { | ||
| 2657 | unsigned long flags; | ||
| 2658 | int wake_flags = WF_SYNC; | ||
| 2659 | |||
| 2660 | if (unlikely(!q)) | ||
| 2661 | return; | ||
| 2662 | |||
| 2663 | if (unlikely(!nr_exclusive)) | ||
| 2664 | wake_flags = 0; | ||
| 2665 | |||
| 2666 | spin_lock_irqsave(&q->lock, flags); | ||
| 2667 | __wake_up_common(q, mode, nr_exclusive, wake_flags, key); | ||
| 2668 | spin_unlock_irqrestore(&q->lock, flags); | ||
| 2669 | } | ||
| 2670 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); | ||
| 2671 | |||
| 2672 | /* | ||
| 2673 | * __wake_up_sync - see __wake_up_sync_key() | ||
| 2674 | */ | ||
| 2675 | void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | ||
| 2676 | { | ||
| 2677 | __wake_up_sync_key(q, mode, nr_exclusive, NULL); | ||
| 2678 | } | ||
| 2679 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | ||
| 2680 | |||
| 2681 | /** | ||
| 2682 | * complete: - signals a single thread waiting on this completion | ||
| 2683 | * @x: holds the state of this particular completion | ||
| 2684 | * | ||
| 2685 | * This will wake up a single thread waiting on this completion. Threads will be | ||
| 2686 | * awakened in the same order in which they were queued. | ||
| 2687 | * | ||
| 2688 | * See also complete_all(), wait_for_completion() and related routines. | ||
| 2689 | * | ||
| 2690 | * It may be assumed that this function implies a write memory barrier before | ||
| 2691 | * changing the task state if and only if any tasks are woken up. | ||
| 2692 | */ | ||
| 2693 | void complete(struct completion *x) | ||
| 2694 | { | ||
| 2695 | unsigned long flags; | ||
| 2696 | |||
| 2697 | spin_lock_irqsave(&x->wait.lock, flags); | ||
| 2698 | x->done++; | ||
| 2699 | __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); | ||
| 2700 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
| 2701 | } | ||
| 2702 | EXPORT_SYMBOL(complete); | ||
| 2703 | |||
| 2704 | /** | ||
| 2705 | * complete_all: - signals all threads waiting on this completion | ||
| 2706 | * @x: holds the state of this particular completion | ||
| 2707 | * | ||
| 2708 | * This will wake up all threads waiting on this particular completion event. | ||
| 2709 | * | ||
| 2710 | * It may be assumed that this function implies a write memory barrier before | ||
| 2711 | * changing the task state if and only if any tasks are woken up. | ||
| 2712 | */ | ||
| 2713 | void complete_all(struct completion *x) | ||
| 2714 | { | ||
| 2715 | unsigned long flags; | ||
| 2716 | |||
| 2717 | spin_lock_irqsave(&x->wait.lock, flags); | ||
| 2718 | x->done += UINT_MAX/2; | ||
| 2719 | __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); | ||
| 2720 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
| 2721 | } | ||
| 2722 | EXPORT_SYMBOL(complete_all); | ||
| 2723 | |||
| 2724 | static inline long __sched | ||
| 2725 | do_wait_for_common(struct completion *x, | ||
| 2726 | long (*action)(long), long timeout, int state) | ||
| 2727 | { | ||
| 2728 | if (!x->done) { | ||
| 2729 | DECLARE_WAITQUEUE(wait, current); | ||
| 2730 | |||
| 2731 | __add_wait_queue_tail_exclusive(&x->wait, &wait); | ||
| 2732 | do { | ||
| 2733 | if (signal_pending_state(state, current)) { | ||
| 2734 | timeout = -ERESTARTSYS; | ||
| 2735 | break; | ||
| 2736 | } | ||
| 2737 | __set_current_state(state); | ||
| 2738 | spin_unlock_irq(&x->wait.lock); | ||
| 2739 | timeout = action(timeout); | ||
| 2740 | spin_lock_irq(&x->wait.lock); | ||
| 2741 | } while (!x->done && timeout); | ||
| 2742 | __remove_wait_queue(&x->wait, &wait); | ||
| 2743 | if (!x->done) | ||
| 2744 | return timeout; | ||
| 2745 | } | ||
| 2746 | x->done--; | ||
| 2747 | return timeout ?: 1; | ||
| 2748 | } | ||
| 2749 | |||
| 2750 | static inline long __sched | ||
| 2751 | __wait_for_common(struct completion *x, | ||
| 2752 | long (*action)(long), long timeout, int state) | ||
| 2753 | { | ||
| 2754 | might_sleep(); | ||
| 2755 | |||
| 2756 | spin_lock_irq(&x->wait.lock); | ||
| 2757 | timeout = do_wait_for_common(x, action, timeout, state); | ||
| 2758 | spin_unlock_irq(&x->wait.lock); | ||
| 2759 | return timeout; | ||
| 2760 | } | ||
| 2761 | |||
| 2762 | static long __sched | ||
| 2763 | wait_for_common(struct completion *x, long timeout, int state) | ||
| 2764 | { | ||
| 2765 | return __wait_for_common(x, schedule_timeout, timeout, state); | ||
| 2766 | } | ||
| 2767 | |||
| 2768 | static long __sched | ||
| 2769 | wait_for_common_io(struct completion *x, long timeout, int state) | ||
| 2770 | { | ||
| 2771 | return __wait_for_common(x, io_schedule_timeout, timeout, state); | ||
| 2772 | } | ||
| 2773 | |||
| 2774 | /** | ||
| 2775 | * wait_for_completion: - waits for completion of a task | ||
| 2776 | * @x: holds the state of this particular completion | ||
| 2777 | * | ||
| 2778 | * This waits to be signaled for completion of a specific task. It is NOT | ||
| 2779 | * interruptible and there is no timeout. | ||
| 2780 | * | ||
| 2781 | * See also similar routines (i.e. wait_for_completion_timeout()) with timeout | ||
| 2782 | * and interrupt capability. Also see complete(). | ||
| 2783 | */ | ||
| 2784 | void __sched wait_for_completion(struct completion *x) | ||
| 2785 | { | ||
| 2786 | wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | ||
| 2787 | } | ||
| 2788 | EXPORT_SYMBOL(wait_for_completion); | ||
| 2789 | |||
| 2790 | /** | ||
| 2791 | * wait_for_completion_timeout: - waits for completion of a task (w/timeout) | ||
| 2792 | * @x: holds the state of this particular completion | ||
| 2793 | * @timeout: timeout value in jiffies | ||
| 2794 | * | ||
| 2795 | * This waits for either a completion of a specific task to be signaled or for a | ||
| 2796 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
| 2797 | * interruptible. | ||
| 2798 | * | ||
| 2799 | * The return value is 0 if timed out, and positive (at least 1, or number of | ||
| 2800 | * jiffies left till timeout) if completed. | ||
| 2801 | */ | ||
| 2802 | unsigned long __sched | ||
| 2803 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | ||
| 2804 | { | ||
| 2805 | return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); | ||
| 2806 | } | ||
| 2807 | EXPORT_SYMBOL(wait_for_completion_timeout); | ||
| 2808 | |||
| 2809 | /** | ||
| 2810 | * wait_for_completion_io: - waits for completion of a task | ||
| 2811 | * @x: holds the state of this particular completion | ||
| 2812 | * | ||
| 2813 | * This waits to be signaled for completion of a specific task. It is NOT | ||
| 2814 | * interruptible and there is no timeout. The caller is accounted as waiting | ||
| 2815 | * for IO. | ||
| 2816 | */ | ||
| 2817 | void __sched wait_for_completion_io(struct completion *x) | ||
| 2818 | { | ||
| 2819 | wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); | ||
| 2820 | } | ||
| 2821 | EXPORT_SYMBOL(wait_for_completion_io); | ||
| 2822 | |||
| 2823 | /** | ||
| 2824 | * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) | ||
| 2825 | * @x: holds the state of this particular completion | ||
| 2826 | * @timeout: timeout value in jiffies | ||
| 2827 | * | ||
| 2828 | * This waits for either a completion of a specific task to be signaled or for a | ||
| 2829 | * specified timeout to expire. The timeout is in jiffies. It is not | ||
| 2830 | * interruptible. The caller is accounted as waiting for IO. | ||
| 2831 | * | ||
| 2832 | * The return value is 0 if timed out, and positive (at least 1, or number of | ||
| 2833 | * jiffies left till timeout) if completed. | ||
| 2834 | */ | ||
| 2835 | unsigned long __sched | ||
| 2836 | wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) | ||
| 2837 | { | ||
| 2838 | return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); | ||
| 2839 | } | ||
| 2840 | EXPORT_SYMBOL(wait_for_completion_io_timeout); | ||
| 2841 | |||
| 2842 | /** | ||
| 2843 | * wait_for_completion_interruptible: - waits for completion of a task (w/intr) | ||
| 2844 | * @x: holds the state of this particular completion | ||
| 2845 | * | ||
| 2846 | * This waits for completion of a specific task to be signaled. It is | ||
| 2847 | * interruptible. | ||
| 2848 | * | ||
| 2849 | * The return value is -ERESTARTSYS if interrupted, 0 if completed. | ||
| 2850 | */ | ||
| 2851 | int __sched wait_for_completion_interruptible(struct completion *x) | ||
| 2852 | { | ||
| 2853 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); | ||
| 2854 | if (t == -ERESTARTSYS) | ||
| 2855 | return t; | ||
| 2856 | return 0; | ||
| 2857 | } | ||
| 2858 | EXPORT_SYMBOL(wait_for_completion_interruptible); | ||
| 2859 | |||
| 2860 | /** | ||
| 2861 | * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) | ||
| 2862 | * @x: holds the state of this particular completion | ||
| 2863 | * @timeout: timeout value in jiffies | ||
| 2864 | * | ||
| 2865 | * This waits for either a completion of a specific task to be signaled or for a | ||
| 2866 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | ||
| 2867 | * | ||
| 2868 | * The return value is -ERESTARTSYS if interrupted, 0 if timed out, | ||
| 2869 | * positive (at least 1, or number of jiffies left till timeout) if completed. | ||
| 2870 | */ | ||
| 2871 | long __sched | ||
| 2872 | wait_for_completion_interruptible_timeout(struct completion *x, | ||
| 2873 | unsigned long timeout) | ||
| 2874 | { | ||
| 2875 | return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); | ||
| 2876 | } | ||
| 2877 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | ||
| 2878 | |||
| 2879 | /** | ||
| 2880 | * wait_for_completion_killable: - waits for completion of a task (killable) | ||
| 2881 | * @x: holds the state of this particular completion | ||
| 2882 | * | ||
| 2883 | * This waits to be signaled for completion of a specific task. It can be | ||
| 2884 | * interrupted by a kill signal. | ||
| 2885 | * | ||
| 2886 | * The return value is -ERESTARTSYS if interrupted, 0 if completed. | ||
| 2887 | */ | ||
| 2888 | int __sched wait_for_completion_killable(struct completion *x) | ||
| 2889 | { | ||
| 2890 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); | ||
| 2891 | if (t == -ERESTARTSYS) | ||
| 2892 | return t; | ||
| 2893 | return 0; | ||
| 2894 | } | ||
| 2895 | EXPORT_SYMBOL(wait_for_completion_killable); | ||
| 2896 | |||
| 2897 | /** | ||
| 2898 | * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) | ||
| 2899 | * @x: holds the state of this particular completion | ||
| 2900 | * @timeout: timeout value in jiffies | ||
| 2901 | * | ||
| 2902 | * This waits for either a completion of a specific task to be | ||
| 2903 | * signaled or for a specified timeout to expire. It can be | ||
| 2904 | * interrupted by a kill signal. The timeout is in jiffies. | ||
| 2905 | * | ||
| 2906 | * The return value is -ERESTARTSYS if interrupted, 0 if timed out, | ||
| 2907 | * positive (at least 1, or number of jiffies left till timeout) if completed. | ||
| 2908 | */ | ||
| 2909 | long __sched | ||
| 2910 | wait_for_completion_killable_timeout(struct completion *x, | ||
| 2911 | unsigned long timeout) | ||
| 2912 | { | ||
| 2913 | return wait_for_common(x, timeout, TASK_KILLABLE); | ||
| 2914 | } | ||
| 2915 | EXPORT_SYMBOL(wait_for_completion_killable_timeout); | ||
| 2916 | |||
| 2917 | /** | ||
| 2918 | * try_wait_for_completion - try to decrement a completion without blocking | ||
| 2919 | * @x: completion structure | ||
| 2920 | * | ||
| 2921 | * Returns: 0 if a decrement cannot be done without blocking | ||
| 2922 | * 1 if a decrement succeeded. | ||
| 2923 | * | ||
| 2924 | * If a completion is being used as a counting completion, | ||
| 2925 | * attempt to decrement the counter without blocking. This | ||
| 2926 | * enables us to avoid waiting if the resource the completion | ||
| 2927 | * is protecting is not available. | ||
| 2928 | */ | ||
| 2929 | bool try_wait_for_completion(struct completion *x) | ||
| 2930 | { | ||
| 2931 | unsigned long flags; | ||
| 2932 | int ret = 1; | ||
| 2933 | |||
| 2934 | spin_lock_irqsave(&x->wait.lock, flags); | ||
| 2935 | if (!x->done) | ||
| 2936 | ret = 0; | ||
| 2937 | else | ||
| 2938 | x->done--; | ||
| 2939 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
| 2940 | return ret; | ||
| 2941 | } | ||
| 2942 | EXPORT_SYMBOL(try_wait_for_completion); | ||
| 2943 | |||
| 2944 | /** | ||
| 2945 | * completion_done - Test to see if a completion has any waiters | ||
| 2946 | * @x: completion structure | ||
| 2947 | * | ||
| 2948 | * Returns: 0 if there are waiters (wait_for_completion() in progress) | ||
| 2949 | * 1 if there are no waiters. | ||
| 2950 | * | ||
| 2951 | */ | ||
| 2952 | bool completion_done(struct completion *x) | ||
| 2953 | { | ||
| 2954 | unsigned long flags; | ||
| 2955 | int ret = 1; | ||
| 2956 | |||
| 2957 | spin_lock_irqsave(&x->wait.lock, flags); | ||
| 2958 | if (!x->done) | ||
| 2959 | ret = 0; | ||
| 2960 | spin_unlock_irqrestore(&x->wait.lock, flags); | ||
| 2961 | return ret; | ||
| 2962 | } | ||
| 2963 | EXPORT_SYMBOL(completion_done); | ||
| 2964 | |||
| 2965 | static long __sched | 2691 | static long __sched |
| 2966 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) | 2692 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) |
| 2967 | { | 2693 | { |
| @@ -3182,7 +2908,7 @@ SYSCALL_DEFINE1(nice, int, increment) | |||
| 3182 | * task_prio - return the priority value of a given task. | 2908 | * task_prio - return the priority value of a given task. |
| 3183 | * @p: the task in question. | 2909 | * @p: the task in question. |
| 3184 | * | 2910 | * |
| 3185 | * This is the priority value as seen by users in /proc. | 2911 | * Return: The priority value as seen by users in /proc. |
| 3186 | * RT tasks are offset by -200. Normal tasks are centered | 2912 | * RT tasks are offset by -200. Normal tasks are centered |
| 3187 | * around 0, value goes from -16 to +15. | 2913 | * around 0, value goes from -16 to +15. |
| 3188 | */ | 2914 | */ |
| @@ -3194,6 +2920,8 @@ int task_prio(const struct task_struct *p) | |||
| 3194 | /** | 2920 | /** |
| 3195 | * task_nice - return the nice value of a given task. | 2921 | * task_nice - return the nice value of a given task. |
| 3196 | * @p: the task in question. | 2922 | * @p: the task in question. |
| 2923 | * | ||
| 2924 | * Return: The nice value [ -20 ... 0 ... 19 ]. | ||
| 3197 | */ | 2925 | */ |
| 3198 | int task_nice(const struct task_struct *p) | 2926 | int task_nice(const struct task_struct *p) |
| 3199 | { | 2927 | { |
| @@ -3204,6 +2932,8 @@ EXPORT_SYMBOL(task_nice); | |||
| 3204 | /** | 2932 | /** |
| 3205 | * idle_cpu - is a given cpu idle currently? | 2933 | * idle_cpu - is a given cpu idle currently? |
| 3206 | * @cpu: the processor in question. | 2934 | * @cpu: the processor in question. |
| 2935 | * | ||
| 2936 | * Return: 1 if the CPU is currently idle. 0 otherwise. | ||
| 3207 | */ | 2937 | */ |
| 3208 | int idle_cpu(int cpu) | 2938 | int idle_cpu(int cpu) |
| 3209 | { | 2939 | { |
| @@ -3226,6 +2956,8 @@ int idle_cpu(int cpu) | |||
| 3226 | /** | 2956 | /** |
| 3227 | * idle_task - return the idle task for a given cpu. | 2957 | * idle_task - return the idle task for a given cpu. |
| 3228 | * @cpu: the processor in question. | 2958 | * @cpu: the processor in question. |
| 2959 | * | ||
| 2960 | * Return: The idle task for the cpu @cpu. | ||
| 3229 | */ | 2961 | */ |
| 3230 | struct task_struct *idle_task(int cpu) | 2962 | struct task_struct *idle_task(int cpu) |
| 3231 | { | 2963 | { |
| @@ -3235,6 +2967,8 @@ struct task_struct *idle_task(int cpu) | |||
| 3235 | /** | 2967 | /** |
| 3236 | * find_process_by_pid - find a process with a matching PID value. | 2968 | * find_process_by_pid - find a process with a matching PID value. |
| 3237 | * @pid: the pid in question. | 2969 | * @pid: the pid in question. |
| 2970 | * | ||
| 2971 | * Return: The task of @pid, if found. %NULL otherwise. | ||
| 3238 | */ | 2972 | */ |
| 3239 | static struct task_struct *find_process_by_pid(pid_t pid) | 2973 | static struct task_struct *find_process_by_pid(pid_t pid) |
| 3240 | { | 2974 | { |
| @@ -3432,6 +3166,8 @@ recheck: | |||
| 3432 | * @policy: new policy. | 3166 | * @policy: new policy. |
| 3433 | * @param: structure containing the new RT priority. | 3167 | * @param: structure containing the new RT priority. |
| 3434 | * | 3168 | * |
| 3169 | * Return: 0 on success. An error code otherwise. | ||
| 3170 | * | ||
| 3435 | * NOTE that the task may be already dead. | 3171 | * NOTE that the task may be already dead. |
| 3436 | */ | 3172 | */ |
| 3437 | int sched_setscheduler(struct task_struct *p, int policy, | 3173 | int sched_setscheduler(struct task_struct *p, int policy, |
| @@ -3451,6 +3187,8 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
| 3451 | * current context has permission. For example, this is needed in | 3187 | * current context has permission. For example, this is needed in |
| 3452 | * stop_machine(): we create temporary high priority worker threads, | 3188 | * stop_machine(): we create temporary high priority worker threads, |
| 3453 | * but our caller might not have that capability. | 3189 | * but our caller might not have that capability. |
| 3190 | * | ||
| 3191 | * Return: 0 on success. An error code otherwise. | ||
| 3454 | */ | 3192 | */ |
| 3455 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | 3193 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, |
| 3456 | const struct sched_param *param) | 3194 | const struct sched_param *param) |
| @@ -3485,6 +3223,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | |||
| 3485 | * @pid: the pid in question. | 3223 | * @pid: the pid in question. |
| 3486 | * @policy: new policy. | 3224 | * @policy: new policy. |
| 3487 | * @param: structure containing the new RT priority. | 3225 | * @param: structure containing the new RT priority. |
| 3226 | * | ||
| 3227 | * Return: 0 on success. An error code otherwise. | ||
| 3488 | */ | 3228 | */ |
| 3489 | SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, | 3229 | SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, |
| 3490 | struct sched_param __user *, param) | 3230 | struct sched_param __user *, param) |
| @@ -3500,6 +3240,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, | |||
| 3500 | * sys_sched_setparam - set/change the RT priority of a thread | 3240 | * sys_sched_setparam - set/change the RT priority of a thread |
| 3501 | * @pid: the pid in question. | 3241 | * @pid: the pid in question. |
| 3502 | * @param: structure containing the new RT priority. | 3242 | * @param: structure containing the new RT priority. |
| 3243 | * | ||
| 3244 | * Return: 0 on success. An error code otherwise. | ||
| 3503 | */ | 3245 | */ |
| 3504 | SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) | 3246 | SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) |
| 3505 | { | 3247 | { |
| @@ -3509,6 +3251,9 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) | |||
| 3509 | /** | 3251 | /** |
| 3510 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread | 3252 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread |
| 3511 | * @pid: the pid in question. | 3253 | * @pid: the pid in question. |
| 3254 | * | ||
| 3255 | * Return: On success, the policy of the thread. Otherwise, a negative error | ||
| 3256 | * code. | ||
| 3512 | */ | 3257 | */ |
| 3513 | SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | 3258 | SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) |
| 3514 | { | 3259 | { |
| @@ -3535,6 +3280,9 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | |||
| 3535 | * sys_sched_getparam - get the RT priority of a thread | 3280 | * sys_sched_getparam - get the RT priority of a thread |
| 3536 | * @pid: the pid in question. | 3281 | * @pid: the pid in question. |
| 3537 | * @param: structure containing the RT priority. | 3282 | * @param: structure containing the RT priority. |
| 3283 | * | ||
| 3284 | * Return: On success, 0 and the RT priority is in @param. Otherwise, an error | ||
| 3285 | * code. | ||
| 3538 | */ | 3286 | */ |
| 3539 | SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) | 3287 | SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) |
| 3540 | { | 3288 | { |
| @@ -3576,13 +3324,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
| 3576 | struct task_struct *p; | 3324 | struct task_struct *p; |
| 3577 | int retval; | 3325 | int retval; |
| 3578 | 3326 | ||
| 3579 | get_online_cpus(); | ||
| 3580 | rcu_read_lock(); | 3327 | rcu_read_lock(); |
| 3581 | 3328 | ||
| 3582 | p = find_process_by_pid(pid); | 3329 | p = find_process_by_pid(pid); |
| 3583 | if (!p) { | 3330 | if (!p) { |
| 3584 | rcu_read_unlock(); | 3331 | rcu_read_unlock(); |
| 3585 | put_online_cpus(); | ||
| 3586 | return -ESRCH; | 3332 | return -ESRCH; |
| 3587 | } | 3333 | } |
| 3588 | 3334 | ||
| @@ -3639,7 +3385,6 @@ out_free_cpus_allowed: | |||
| 3639 | free_cpumask_var(cpus_allowed); | 3385 | free_cpumask_var(cpus_allowed); |
| 3640 | out_put_task: | 3386 | out_put_task: |
| 3641 | put_task_struct(p); | 3387 | put_task_struct(p); |
| 3642 | put_online_cpus(); | ||
| 3643 | return retval; | 3388 | return retval; |
| 3644 | } | 3389 | } |
| 3645 | 3390 | ||
| @@ -3659,6 +3404,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, | |||
| 3659 | * @pid: pid of the process | 3404 | * @pid: pid of the process |
| 3660 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 3405 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
| 3661 | * @user_mask_ptr: user-space pointer to the new cpu mask | 3406 | * @user_mask_ptr: user-space pointer to the new cpu mask |
| 3407 | * | ||
| 3408 | * Return: 0 on success. An error code otherwise. | ||
| 3662 | */ | 3409 | */ |
| 3663 | SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, | 3410 | SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, |
| 3664 | unsigned long __user *, user_mask_ptr) | 3411 | unsigned long __user *, user_mask_ptr) |
| @@ -3682,7 +3429,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
| 3682 | unsigned long flags; | 3429 | unsigned long flags; |
| 3683 | int retval; | 3430 | int retval; |
| 3684 | 3431 | ||
| 3685 | get_online_cpus(); | ||
| 3686 | rcu_read_lock(); | 3432 | rcu_read_lock(); |
| 3687 | 3433 | ||
| 3688 | retval = -ESRCH; | 3434 | retval = -ESRCH; |
| @@ -3695,12 +3441,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
| 3695 | goto out_unlock; | 3441 | goto out_unlock; |
| 3696 | 3442 | ||
| 3697 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 3443 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
| 3698 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); | 3444 | cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); |
| 3699 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 3445 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
| 3700 | 3446 | ||
| 3701 | out_unlock: | 3447 | out_unlock: |
| 3702 | rcu_read_unlock(); | 3448 | rcu_read_unlock(); |
| 3703 | put_online_cpus(); | ||
| 3704 | 3449 | ||
| 3705 | return retval; | 3450 | return retval; |
| 3706 | } | 3451 | } |
| @@ -3710,6 +3455,8 @@ out_unlock: | |||
| 3710 | * @pid: pid of the process | 3455 | * @pid: pid of the process |
| 3711 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 3456 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
| 3712 | * @user_mask_ptr: user-space pointer to hold the current cpu mask | 3457 | * @user_mask_ptr: user-space pointer to hold the current cpu mask |
| 3458 | * | ||
| 3459 | * Return: 0 on success. An error code otherwise. | ||
| 3713 | */ | 3460 | */ |
| 3714 | SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, | 3461 | SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, |
| 3715 | unsigned long __user *, user_mask_ptr) | 3462 | unsigned long __user *, user_mask_ptr) |
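
From user space both affinity syscalls are normally reached through the glibc wrappers; a small illustrative program, not part of the kernel tree:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		cpu_set_t set;

		CPU_ZERO(&set);
		CPU_SET(0, &set);		/* pin ourselves to CPU 0 */
		if (sched_setaffinity(0, sizeof(set), &set))
			perror("sched_setaffinity");

		if (sched_getaffinity(0, sizeof(set), &set) == 0)
			printf("allowed CPUs: %d\n", CPU_COUNT(&set));
		return 0;
	}
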
| @@ -3744,6 +3491,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, | |||
| 3744 | * | 3491 | * |
| 3745 | * This function yields the current CPU to other tasks. If there are no | 3492 | * This function yields the current CPU to other tasks. If there are no |
| 3746 | * other threads running on this CPU then this function will return. | 3493 | * other threads running on this CPU then this function will return. |
| 3494 | * | ||
| 3495 | * Return: 0. | ||
| 3747 | */ | 3496 | */ |
| 3748 | SYSCALL_DEFINE0(sched_yield) | 3497 | SYSCALL_DEFINE0(sched_yield) |
| 3749 | { | 3498 | { |
| @@ -3766,16 +3515,11 @@ SYSCALL_DEFINE0(sched_yield) | |||
| 3766 | return 0; | 3515 | return 0; |
| 3767 | } | 3516 | } |
| 3768 | 3517 | ||
| 3769 | static inline int should_resched(void) | ||
| 3770 | { | ||
| 3771 | return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); | ||
| 3772 | } | ||
| 3773 | |||
| 3774 | static void __cond_resched(void) | 3518 | static void __cond_resched(void) |
| 3775 | { | 3519 | { |
| 3776 | add_preempt_count(PREEMPT_ACTIVE); | 3520 | __preempt_count_add(PREEMPT_ACTIVE); |
| 3777 | __schedule(); | 3521 | __schedule(); |
| 3778 | sub_preempt_count(PREEMPT_ACTIVE); | 3522 | __preempt_count_sub(PREEMPT_ACTIVE); |
| 3779 | } | 3523 | } |
| 3780 | 3524 | ||
| 3781 | int __sched _cond_resched(void) | 3525 | int __sched _cond_resched(void) |
| @@ -3869,7 +3613,7 @@ EXPORT_SYMBOL(yield); | |||
| 3869 | * It's the caller's job to ensure that the target task struct | 3613 | * It's the caller's job to ensure that the target task struct |
| 3870 | * can't go away on us before we can do any checks. | 3614 | * can't go away on us before we can do any checks. |
| 3871 | * | 3615 | * |
| 3872 | * Returns: | 3616 | * Return: |
| 3873 | * true (>0) if we indeed boosted the target task. | 3617 | * true (>0) if we indeed boosted the target task. |
| 3874 | * false (0) if we failed to boost the target. | 3618 | * false (0) if we failed to boost the target. |
| 3875 | * -ESRCH if there's no task to yield to. | 3619 | * -ESRCH if there's no task to yield to. |
| @@ -3972,8 +3716,9 @@ long __sched io_schedule_timeout(long timeout) | |||
| 3972 | * sys_sched_get_priority_max - return maximum RT priority. | 3716 | * sys_sched_get_priority_max - return maximum RT priority. |
| 3973 | * @policy: scheduling class. | 3717 | * @policy: scheduling class. |
| 3974 | * | 3718 | * |
| 3975 | * this syscall returns the maximum rt_priority that can be used | 3719 | * Return: On success, this syscall returns the maximum |
| 3976 | * by a given scheduling class. | 3720 | * rt_priority that can be used by a given scheduling class. |
| 3721 | * On failure, a negative error code is returned. | ||
| 3977 | */ | 3722 | */ |
| 3978 | SYSCALL_DEFINE1(sched_get_priority_max, int, policy) | 3723 | SYSCALL_DEFINE1(sched_get_priority_max, int, policy) |
| 3979 | { | 3724 | { |
| @@ -3997,8 +3742,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) | |||
| 3997 | * sys_sched_get_priority_min - return minimum RT priority. | 3742 | * sys_sched_get_priority_min - return minimum RT priority. |
| 3998 | * @policy: scheduling class. | 3743 | * @policy: scheduling class. |
| 3999 | * | 3744 | * |
| 4000 | * this syscall returns the minimum rt_priority that can be used | 3745 | * Return: On success, this syscall returns the minimum |
| 4001 | * by a given scheduling class. | 3746 | * rt_priority that can be used by a given scheduling class. |
| 3747 | * On failure, a negative error code is returned. | ||
| 4002 | */ | 3748 | */ |
| 4003 | SYSCALL_DEFINE1(sched_get_priority_min, int, policy) | 3749 | SYSCALL_DEFINE1(sched_get_priority_min, int, policy) |
| 4004 | { | 3750 | { |
| @@ -4024,6 +3770,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) | |||
| 4024 | * | 3770 | * |
| 4025 | * this syscall writes the default timeslice value of a given process | 3771 | * this syscall writes the default timeslice value of a given process |
| 4026 | * into the user-space timespec buffer. A value of '0' means infinity. | 3772 | * into the user-space timespec buffer. A value of '0' means infinity. |
| 3773 | * | ||
| 3774 | * Return: On success, 0 and the timeslice is in @interval. Otherwise, | ||
| 3775 | * an error code. | ||
| 4027 | */ | 3776 | */ |
| 4028 | SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | 3777 | SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, |
| 4029 | struct timespec __user *, interval) | 3778 | struct timespec __user *, interval) |
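The added "Return:" lines follow kernel-doc convention for the syscall wrappers. As an illustration of the documented sched_rr_get_interval() contract, a small userspace sketch (glibc wrapper; illustrative only):

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* 0 on success with the timeslice in ts; non-zero with errno set on failure */
	if (sched_rr_get_interval(0, &ts) != 0) {
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("RR timeslice: %ld.%09ld s (0 means infinity)\n",
	       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}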
| @@ -4153,7 +3902,7 @@ void init_idle(struct task_struct *idle, int cpu) | |||
| 4153 | 3902 | ||
| 4154 | raw_spin_lock_irqsave(&rq->lock, flags); | 3903 | raw_spin_lock_irqsave(&rq->lock, flags); |
| 4155 | 3904 | ||
| 4156 | __sched_fork(idle); | 3905 | __sched_fork(0, idle); |
| 4157 | idle->state = TASK_RUNNING; | 3906 | idle->state = TASK_RUNNING; |
| 4158 | idle->se.exec_start = sched_clock(); | 3907 | idle->se.exec_start = sched_clock(); |
| 4159 | 3908 | ||
| @@ -4179,7 +3928,7 @@ void init_idle(struct task_struct *idle, int cpu) | |||
| 4179 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 3928 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
| 4180 | 3929 | ||
| 4181 | /* Set the preempt count _outside_ the spinlocks! */ | 3930 | /* Set the preempt count _outside_ the spinlocks! */ |
| 4182 | task_thread_info(idle)->preempt_count = 0; | 3931 | init_idle_preempt_count(idle, cpu); |
| 4183 | 3932 | ||
| 4184 | /* | 3933 | /* |
| 4185 | * The idle tasks have their own, simple scheduling class: | 3934 | * The idle tasks have their own, simple scheduling class: |
| @@ -4313,6 +4062,53 @@ fail: | |||
| 4313 | return ret; | 4062 | return ret; |
| 4314 | } | 4063 | } |
| 4315 | 4064 | ||
| 4065 | #ifdef CONFIG_NUMA_BALANCING | ||
| 4066 | /* Migrate current task p to target_cpu */ | ||
| 4067 | int migrate_task_to(struct task_struct *p, int target_cpu) | ||
| 4068 | { | ||
| 4069 | struct migration_arg arg = { p, target_cpu }; | ||
| 4070 | int curr_cpu = task_cpu(p); | ||
| 4071 | |||
| 4072 | if (curr_cpu == target_cpu) | ||
| 4073 | return 0; | ||
| 4074 | |||
| 4075 | if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) | ||
| 4076 | return -EINVAL; | ||
| 4077 | |||
| 4078 | /* TODO: This is not properly updating schedstats */ | ||
| 4079 | |||
| 4080 | return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); | ||
| 4081 | } | ||
| 4082 | |||
| 4083 | /* | ||
| 4084 | * Requeue a task on a given node and accurately track the number of NUMA | ||
| 4085 | * tasks on the runqueues | ||
| 4086 | */ | ||
| 4087 | void sched_setnuma(struct task_struct *p, int nid) | ||
| 4088 | { | ||
| 4089 | struct rq *rq; | ||
| 4090 | unsigned long flags; | ||
| 4091 | bool on_rq, running; | ||
| 4092 | |||
| 4093 | rq = task_rq_lock(p, &flags); | ||
| 4094 | on_rq = p->on_rq; | ||
| 4095 | running = task_current(rq, p); | ||
| 4096 | |||
| 4097 | if (on_rq) | ||
| 4098 | dequeue_task(rq, p, 0); | ||
| 4099 | if (running) | ||
| 4100 | p->sched_class->put_prev_task(rq, p); | ||
| 4101 | |||
| 4102 | p->numa_preferred_nid = nid; | ||
| 4103 | |||
| 4104 | if (running) | ||
| 4105 | p->sched_class->set_curr_task(rq); | ||
| 4106 | if (on_rq) | ||
| 4107 | enqueue_task(rq, p, 0); | ||
| 4108 | task_rq_unlock(rq, p, &flags); | ||
| 4109 | } | ||
| 4110 | #endif | ||
| 4111 | |||
| 4316 | /* | 4112 | /* |
| 4317 | * migration_cpu_stop - this will be executed by a highprio stopper thread | 4113 | * migration_cpu_stop - this will be executed by a highprio stopper thread |
| 4318 | * and performs thread migration by bumping thread off CPU then | 4114 | * and performs thread migration by bumping thread off CPU then |
| @@ -4914,7 +4710,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
| 4914 | SD_BALANCE_FORK | | 4710 | SD_BALANCE_FORK | |
| 4915 | SD_BALANCE_EXEC | | 4711 | SD_BALANCE_EXEC | |
| 4916 | SD_SHARE_CPUPOWER | | 4712 | SD_SHARE_CPUPOWER | |
| 4917 | SD_SHARE_PKG_RESOURCES); | 4713 | SD_SHARE_PKG_RESOURCES | |
| 4714 | SD_PREFER_SIBLING); | ||
| 4918 | if (nr_node_ids == 1) | 4715 | if (nr_node_ids == 1) |
| 4919 | pflags &= ~SD_SERIALIZE; | 4716 | pflags &= ~SD_SERIALIZE; |
| 4920 | } | 4717 | } |
| @@ -5083,19 +4880,34 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
| 5083 | * two cpus are in the same cache domain, see cpus_share_cache(). | 4880 | * two cpus are in the same cache domain, see cpus_share_cache(). |
| 5084 | */ | 4881 | */ |
| 5085 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | 4882 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); |
| 4883 | DEFINE_PER_CPU(int, sd_llc_size); | ||
| 5086 | DEFINE_PER_CPU(int, sd_llc_id); | 4884 | DEFINE_PER_CPU(int, sd_llc_id); |
| 4885 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); | ||
| 4886 | DEFINE_PER_CPU(struct sched_domain *, sd_busy); | ||
| 4887 | DEFINE_PER_CPU(struct sched_domain *, sd_asym); | ||
| 5087 | 4888 | ||
| 5088 | static void update_top_cache_domain(int cpu) | 4889 | static void update_top_cache_domain(int cpu) |
| 5089 | { | 4890 | { |
| 5090 | struct sched_domain *sd; | 4891 | struct sched_domain *sd; |
| 5091 | int id = cpu; | 4892 | int id = cpu; |
| 4893 | int size = 1; | ||
| 5092 | 4894 | ||
| 5093 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); | 4895 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); |
| 5094 | if (sd) | 4896 | if (sd) { |
| 5095 | id = cpumask_first(sched_domain_span(sd)); | 4897 | id = cpumask_first(sched_domain_span(sd)); |
| 4898 | size = cpumask_weight(sched_domain_span(sd)); | ||
| 4899 | rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent); | ||
| 4900 | } | ||
| 5096 | 4901 | ||
| 5097 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | 4902 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); |
| 4903 | per_cpu(sd_llc_size, cpu) = size; | ||
| 5098 | per_cpu(sd_llc_id, cpu) = id; | 4904 | per_cpu(sd_llc_id, cpu) = id; |
| 4905 | |||
| 4906 | sd = lowest_flag_domain(cpu, SD_NUMA); | ||
| 4907 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); | ||
| 4908 | |||
| 4909 | sd = highest_flag_domain(cpu, SD_ASYM_PACKING); | ||
| 4910 | rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); | ||
| 5099 | } | 4911 | } |
| 5100 | 4912 | ||
| 5101 | /* | 4913 | /* |
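update_top_cache_domain() now caches sd_llc_size alongside sd_llc/sd_llc_id and publishes sd_numa, sd_busy and sd_asym, all via rcu_assign_pointer(). A hedged sketch of the reader side, in the spirit of the cpus_share_cache() comment above (the helper name is hypothetical):

/* Hypothetical reader: do two CPUs share a last-level-cache domain? */
static bool cpus_in_same_llc(int this_cpu, int that_cpu)
{
	bool ret;

	rcu_read_lock();
	/* sd_llc_id is published together with sd_llc in update_top_cache_domain() */
	ret = per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
	rcu_read_unlock();

	return ret;
}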
| @@ -5118,6 +4930,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
| 5118 | tmp->parent = parent->parent; | 4930 | tmp->parent = parent->parent; |
| 5119 | if (parent->parent) | 4931 | if (parent->parent) |
| 5120 | parent->parent->child = tmp; | 4932 | parent->parent->child = tmp; |
| 4933 | /* | ||
| 4934 | * Transfer SD_PREFER_SIBLING down in case of a | ||
| 4935 | * degenerate parent; the spans match for this | ||
| 4936 | * so the property transfers. | ||
| 4937 | */ | ||
| 4938 | if (parent->flags & SD_PREFER_SIBLING) | ||
| 4939 | tmp->flags |= SD_PREFER_SIBLING; | ||
| 5121 | destroy_sched_domain(parent, cpu); | 4940 | destroy_sched_domain(parent, cpu); |
| 5122 | } else | 4941 | } else |
| 5123 | tmp = tmp->parent; | 4942 | tmp = tmp->parent; |
| @@ -5608,6 +5427,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | |||
| 5608 | | 0*SD_SHARE_PKG_RESOURCES | 5427 | | 0*SD_SHARE_PKG_RESOURCES |
| 5609 | | 1*SD_SERIALIZE | 5428 | | 1*SD_SERIALIZE |
| 5610 | | 0*SD_PREFER_SIBLING | 5429 | | 0*SD_PREFER_SIBLING |
| 5430 | | 1*SD_NUMA | ||
| 5611 | | sd_local_flags(level) | 5431 | | sd_local_flags(level) |
| 5612 | , | 5432 | , |
| 5613 | .last_balance = jiffies, | 5433 | .last_balance = jiffies, |
| @@ -6184,8 +6004,9 @@ match1: | |||
| 6184 | ; | 6004 | ; |
| 6185 | } | 6005 | } |
| 6186 | 6006 | ||
| 6007 | n = ndoms_cur; | ||
| 6187 | if (doms_new == NULL) { | 6008 | if (doms_new == NULL) { |
| 6188 | ndoms_cur = 0; | 6009 | n = 0; |
| 6189 | doms_new = &fallback_doms; | 6010 | doms_new = &fallback_doms; |
| 6190 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); | 6011 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); |
| 6191 | WARN_ON_ONCE(dattr_new); | 6012 | WARN_ON_ONCE(dattr_new); |
| @@ -6193,7 +6014,7 @@ match1: | |||
| 6193 | 6014 | ||
| 6194 | /* Build new domains */ | 6015 | /* Build new domains */ |
| 6195 | for (i = 0; i < ndoms_new; i++) { | 6016 | for (i = 0; i < ndoms_new; i++) { |
| 6196 | for (j = 0; j < ndoms_cur && !new_topology; j++) { | 6017 | for (j = 0; j < n && !new_topology; j++) { |
| 6197 | if (cpumask_equal(doms_new[i], doms_cur[j]) | 6018 | if (cpumask_equal(doms_new[i], doms_cur[j]) |
| 6198 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | 6019 | && dattrs_equal(dattr_new, i, dattr_cur, j)) |
| 6199 | goto match2; | 6020 | goto match2; |
| @@ -6288,14 +6109,17 @@ void __init sched_init_smp(void) | |||
| 6288 | 6109 | ||
| 6289 | sched_init_numa(); | 6110 | sched_init_numa(); |
| 6290 | 6111 | ||
| 6291 | get_online_cpus(); | 6112 | /* |
| 6113 | * There's no userspace yet to cause hotplug operations; hence all the | ||
| 6114 | * cpu masks are stable and all blatant races in the below code cannot | ||
| 6115 | * happen. | ||
| 6116 | */ | ||
| 6292 | mutex_lock(&sched_domains_mutex); | 6117 | mutex_lock(&sched_domains_mutex); |
| 6293 | init_sched_domains(cpu_active_mask); | 6118 | init_sched_domains(cpu_active_mask); |
| 6294 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | 6119 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); |
| 6295 | if (cpumask_empty(non_isolated_cpus)) | 6120 | if (cpumask_empty(non_isolated_cpus)) |
| 6296 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); | 6121 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); |
| 6297 | mutex_unlock(&sched_domains_mutex); | 6122 | mutex_unlock(&sched_domains_mutex); |
| 6298 | put_online_cpus(); | ||
| 6299 | 6123 | ||
| 6300 | hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); | 6124 | hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); |
| 6301 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); | 6125 | hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); |
| @@ -6458,6 +6282,7 @@ void __init sched_init(void) | |||
| 6458 | rq->online = 0; | 6282 | rq->online = 0; |
| 6459 | rq->idle_stamp = 0; | 6283 | rq->idle_stamp = 0; |
| 6460 | rq->avg_idle = 2*sysctl_sched_migration_cost; | 6284 | rq->avg_idle = 2*sysctl_sched_migration_cost; |
| 6285 | rq->max_idle_balance_cost = sysctl_sched_migration_cost; | ||
| 6461 | 6286 | ||
| 6462 | INIT_LIST_HEAD(&rq->cfs_tasks); | 6287 | INIT_LIST_HEAD(&rq->cfs_tasks); |
| 6463 | 6288 | ||
| @@ -6632,6 +6457,8 @@ void normalize_rt_tasks(void) | |||
| 6632 | * @cpu: the processor in question. | 6457 | * @cpu: the processor in question. |
| 6633 | * | 6458 | * |
| 6634 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6459 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
| 6460 | * | ||
| 6461 | * Return: The current task for @cpu. | ||
| 6635 | */ | 6462 | */ |
| 6636 | struct task_struct *curr_task(int cpu) | 6463 | struct task_struct *curr_task(int cpu) |
| 6637 | { | 6464 | { |
| @@ -6763,7 +6590,7 @@ void sched_move_task(struct task_struct *tsk) | |||
| 6763 | if (unlikely(running)) | 6590 | if (unlikely(running)) |
| 6764 | tsk->sched_class->put_prev_task(rq, tsk); | 6591 | tsk->sched_class->put_prev_task(rq, tsk); |
| 6765 | 6592 | ||
| 6766 | tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, | 6593 | tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, |
| 6767 | lockdep_is_held(&tsk->sighand->siglock)), | 6594 | lockdep_is_held(&tsk->sighand->siglock)), |
| 6768 | struct task_group, css); | 6595 | struct task_group, css); |
| 6769 | tg = autogroup_task_group(tsk, tg); | 6596 | tg = autogroup_task_group(tsk, tg); |
| @@ -7085,23 +6912,22 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
| 7085 | 6912 | ||
| 7086 | #ifdef CONFIG_CGROUP_SCHED | 6913 | #ifdef CONFIG_CGROUP_SCHED |
| 7087 | 6914 | ||
| 7088 | /* return corresponding task_group object of a cgroup */ | 6915 | static inline struct task_group *css_tg(struct cgroup_subsys_state *css) |
| 7089 | static inline struct task_group *cgroup_tg(struct cgroup *cgrp) | ||
| 7090 | { | 6916 | { |
| 7091 | return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), | 6917 | return css ? container_of(css, struct task_group, css) : NULL; |
| 7092 | struct task_group, css); | ||
| 7093 | } | 6918 | } |
| 7094 | 6919 | ||
| 7095 | static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) | 6920 | static struct cgroup_subsys_state * |
| 6921 | cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | ||
| 7096 | { | 6922 | { |
| 7097 | struct task_group *tg, *parent; | 6923 | struct task_group *parent = css_tg(parent_css); |
| 6924 | struct task_group *tg; | ||
| 7098 | 6925 | ||
| 7099 | if (!cgrp->parent) { | 6926 | if (!parent) { |
| 7100 | /* This is early initialization for the top cgroup */ | 6927 | /* This is early initialization for the top cgroup */ |
| 7101 | return &root_task_group.css; | 6928 | return &root_task_group.css; |
| 7102 | } | 6929 | } |
| 7103 | 6930 | ||
| 7104 | parent = cgroup_tg(cgrp->parent); | ||
| 7105 | tg = sched_create_group(parent); | 6931 | tg = sched_create_group(parent); |
| 7106 | if (IS_ERR(tg)) | 6932 | if (IS_ERR(tg)) |
| 7107 | return ERR_PTR(-ENOMEM); | 6933 | return ERR_PTR(-ENOMEM); |
| @@ -7109,41 +6935,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) | |||
| 7109 | return &tg->css; | 6935 | return &tg->css; |
| 7110 | } | 6936 | } |
| 7111 | 6937 | ||
| 7112 | static int cpu_cgroup_css_online(struct cgroup *cgrp) | 6938 | static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) |
| 7113 | { | 6939 | { |
| 7114 | struct task_group *tg = cgroup_tg(cgrp); | 6940 | struct task_group *tg = css_tg(css); |
| 7115 | struct task_group *parent; | 6941 | struct task_group *parent = css_tg(css_parent(css)); |
| 7116 | |||
| 7117 | if (!cgrp->parent) | ||
| 7118 | return 0; | ||
| 7119 | 6942 | ||
| 7120 | parent = cgroup_tg(cgrp->parent); | 6943 | if (parent) |
| 7121 | sched_online_group(tg, parent); | 6944 | sched_online_group(tg, parent); |
| 7122 | return 0; | 6945 | return 0; |
| 7123 | } | 6946 | } |
| 7124 | 6947 | ||
| 7125 | static void cpu_cgroup_css_free(struct cgroup *cgrp) | 6948 | static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) |
| 7126 | { | 6949 | { |
| 7127 | struct task_group *tg = cgroup_tg(cgrp); | 6950 | struct task_group *tg = css_tg(css); |
| 7128 | 6951 | ||
| 7129 | sched_destroy_group(tg); | 6952 | sched_destroy_group(tg); |
| 7130 | } | 6953 | } |
| 7131 | 6954 | ||
| 7132 | static void cpu_cgroup_css_offline(struct cgroup *cgrp) | 6955 | static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) |
| 7133 | { | 6956 | { |
| 7134 | struct task_group *tg = cgroup_tg(cgrp); | 6957 | struct task_group *tg = css_tg(css); |
| 7135 | 6958 | ||
| 7136 | sched_offline_group(tg); | 6959 | sched_offline_group(tg); |
| 7137 | } | 6960 | } |
| 7138 | 6961 | ||
| 7139 | static int cpu_cgroup_can_attach(struct cgroup *cgrp, | 6962 | static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, |
| 7140 | struct cgroup_taskset *tset) | 6963 | struct cgroup_taskset *tset) |
| 7141 | { | 6964 | { |
| 7142 | struct task_struct *task; | 6965 | struct task_struct *task; |
| 7143 | 6966 | ||
| 7144 | cgroup_taskset_for_each(task, cgrp, tset) { | 6967 | cgroup_taskset_for_each(task, css, tset) { |
| 7145 | #ifdef CONFIG_RT_GROUP_SCHED | 6968 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7146 | if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) | 6969 | if (!sched_rt_can_attach(css_tg(css), task)) |
| 7147 | return -EINVAL; | 6970 | return -EINVAL; |
| 7148 | #else | 6971 | #else |
| 7149 | /* We don't support RT-tasks being in separate groups */ | 6972 | /* We don't support RT-tasks being in separate groups */ |
| @@ -7154,18 +6977,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp, | |||
| 7154 | return 0; | 6977 | return 0; |
| 7155 | } | 6978 | } |
| 7156 | 6979 | ||
| 7157 | static void cpu_cgroup_attach(struct cgroup *cgrp, | 6980 | static void cpu_cgroup_attach(struct cgroup_subsys_state *css, |
| 7158 | struct cgroup_taskset *tset) | 6981 | struct cgroup_taskset *tset) |
| 7159 | { | 6982 | { |
| 7160 | struct task_struct *task; | 6983 | struct task_struct *task; |
| 7161 | 6984 | ||
| 7162 | cgroup_taskset_for_each(task, cgrp, tset) | 6985 | cgroup_taskset_for_each(task, css, tset) |
| 7163 | sched_move_task(task); | 6986 | sched_move_task(task); |
| 7164 | } | 6987 | } |
| 7165 | 6988 | ||
| 7166 | static void | 6989 | static void cpu_cgroup_exit(struct cgroup_subsys_state *css, |
| 7167 | cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, | 6990 | struct cgroup_subsys_state *old_css, |
| 7168 | struct task_struct *task) | 6991 | struct task_struct *task) |
| 7169 | { | 6992 | { |
| 7170 | /* | 6993 | /* |
| 7171 | * cgroup_exit() is called in the copy_process() failure path. | 6994 | * cgroup_exit() is called in the copy_process() failure path. |
| @@ -7179,15 +7002,16 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, | |||
| 7179 | } | 7002 | } |
| 7180 | 7003 | ||
| 7181 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7004 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7182 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 7005 | static int cpu_shares_write_u64(struct cgroup_subsys_state *css, |
| 7183 | u64 shareval) | 7006 | struct cftype *cftype, u64 shareval) |
| 7184 | { | 7007 | { |
| 7185 | return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); | 7008 | return sched_group_set_shares(css_tg(css), scale_load(shareval)); |
| 7186 | } | 7009 | } |
| 7187 | 7010 | ||
| 7188 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | 7011 | static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, |
| 7012 | struct cftype *cft) | ||
| 7189 | { | 7013 | { |
| 7190 | struct task_group *tg = cgroup_tg(cgrp); | 7014 | struct task_group *tg = css_tg(css); |
| 7191 | 7015 | ||
| 7192 | return (u64) scale_load_down(tg->shares); | 7016 | return (u64) scale_load_down(tg->shares); |
| 7193 | } | 7017 | } |
| @@ -7231,7 +7055,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
| 7231 | 7055 | ||
| 7232 | runtime_enabled = quota != RUNTIME_INF; | 7056 | runtime_enabled = quota != RUNTIME_INF; |
| 7233 | runtime_was_enabled = cfs_b->quota != RUNTIME_INF; | 7057 | runtime_was_enabled = cfs_b->quota != RUNTIME_INF; |
| 7234 | account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); | 7058 | /* |
| 7059 | * If we need to toggle cfs_bandwidth_used, off->on must occur | ||
| 7060 | * before making related changes, and on->off must occur afterwards | ||
| 7061 | */ | ||
| 7062 | if (runtime_enabled && !runtime_was_enabled) | ||
| 7063 | cfs_bandwidth_usage_inc(); | ||
| 7235 | raw_spin_lock_irq(&cfs_b->lock); | 7064 | raw_spin_lock_irq(&cfs_b->lock); |
| 7236 | cfs_b->period = ns_to_ktime(period); | 7065 | cfs_b->period = ns_to_ktime(period); |
| 7237 | cfs_b->quota = quota; | 7066 | cfs_b->quota = quota; |
| @@ -7257,6 +7086,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
| 7257 | unthrottle_cfs_rq(cfs_rq); | 7086 | unthrottle_cfs_rq(cfs_rq); |
| 7258 | raw_spin_unlock_irq(&rq->lock); | 7087 | raw_spin_unlock_irq(&rq->lock); |
| 7259 | } | 7088 | } |
| 7089 | if (runtime_was_enabled && !runtime_enabled) | ||
| 7090 | cfs_bandwidth_usage_dec(); | ||
| 7260 | out_unlock: | 7091 | out_unlock: |
| 7261 | mutex_unlock(&cfs_constraints_mutex); | 7092 | mutex_unlock(&cfs_constraints_mutex); |
| 7262 | 7093 | ||
| @@ -7309,26 +7140,28 @@ long tg_get_cfs_period(struct task_group *tg) | |||
| 7309 | return cfs_period_us; | 7140 | return cfs_period_us; |
| 7310 | } | 7141 | } |
| 7311 | 7142 | ||
| 7312 | static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) | 7143 | static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, |
| 7144 | struct cftype *cft) | ||
| 7313 | { | 7145 | { |
| 7314 | return tg_get_cfs_quota(cgroup_tg(cgrp)); | 7146 | return tg_get_cfs_quota(css_tg(css)); |
| 7315 | } | 7147 | } |
| 7316 | 7148 | ||
| 7317 | static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, | 7149 | static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, |
| 7318 | s64 cfs_quota_us) | 7150 | struct cftype *cftype, s64 cfs_quota_us) |
| 7319 | { | 7151 | { |
| 7320 | return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); | 7152 | return tg_set_cfs_quota(css_tg(css), cfs_quota_us); |
| 7321 | } | 7153 | } |
| 7322 | 7154 | ||
| 7323 | static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) | 7155 | static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, |
| 7156 | struct cftype *cft) | ||
| 7324 | { | 7157 | { |
| 7325 | return tg_get_cfs_period(cgroup_tg(cgrp)); | 7158 | return tg_get_cfs_period(css_tg(css)); |
| 7326 | } | 7159 | } |
| 7327 | 7160 | ||
| 7328 | static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 7161 | static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, |
| 7329 | u64 cfs_period_us) | 7162 | struct cftype *cftype, u64 cfs_period_us) |
| 7330 | { | 7163 | { |
| 7331 | return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); | 7164 | return tg_set_cfs_period(css_tg(css), cfs_period_us); |
| 7332 | } | 7165 | } |
| 7333 | 7166 | ||
| 7334 | struct cfs_schedulable_data { | 7167 | struct cfs_schedulable_data { |
| @@ -7409,10 +7242,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) | |||
| 7409 | return ret; | 7242 | return ret; |
| 7410 | } | 7243 | } |
| 7411 | 7244 | ||
| 7412 | static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, | 7245 | static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, |
| 7413 | struct cgroup_map_cb *cb) | 7246 | struct cgroup_map_cb *cb) |
| 7414 | { | 7247 | { |
| 7415 | struct task_group *tg = cgroup_tg(cgrp); | 7248 | struct task_group *tg = css_tg(css); |
| 7416 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; | 7249 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; |
| 7417 | 7250 | ||
| 7418 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); | 7251 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); |
| @@ -7425,26 +7258,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, | |||
| 7425 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7258 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 7426 | 7259 | ||
| 7427 | #ifdef CONFIG_RT_GROUP_SCHED | 7260 | #ifdef CONFIG_RT_GROUP_SCHED |
| 7428 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, | 7261 | static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, |
| 7429 | s64 val) | 7262 | struct cftype *cft, s64 val) |
| 7430 | { | 7263 | { |
| 7431 | return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); | 7264 | return sched_group_set_rt_runtime(css_tg(css), val); |
| 7432 | } | 7265 | } |
| 7433 | 7266 | ||
| 7434 | static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) | 7267 | static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, |
| 7268 | struct cftype *cft) | ||
| 7435 | { | 7269 | { |
| 7436 | return sched_group_rt_runtime(cgroup_tg(cgrp)); | 7270 | return sched_group_rt_runtime(css_tg(css)); |
| 7437 | } | 7271 | } |
| 7438 | 7272 | ||
| 7439 | static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, | 7273 | static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, |
| 7440 | u64 rt_period_us) | 7274 | struct cftype *cftype, u64 rt_period_us) |
| 7441 | { | 7275 | { |
| 7442 | return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); | 7276 | return sched_group_set_rt_period(css_tg(css), rt_period_us); |
| 7443 | } | 7277 | } |
| 7444 | 7278 | ||
| 7445 | static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) | 7279 | static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, |
| 7280 | struct cftype *cft) | ||
| 7446 | { | 7281 | { |
| 7447 | return sched_group_rt_period(cgroup_tg(cgrp)); | 7282 | return sched_group_rt_period(css_tg(css)); |
| 7448 | } | 7283 | } |
| 7449 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7284 | #endif /* CONFIG_RT_GROUP_SCHED */ |
| 7450 | 7285 | ||
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index dbb7e2cd95eb..f64722ff0299 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c | |||
| @@ -33,30 +33,20 @@ struct cpuacct { | |||
| 33 | struct kernel_cpustat __percpu *cpustat; | 33 | struct kernel_cpustat __percpu *cpustat; |
| 34 | }; | 34 | }; |
| 35 | 35 | ||
| 36 | /* return cpu accounting group corresponding to this container */ | 36 | static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) |
| 37 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
| 38 | { | 37 | { |
| 39 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | 38 | return css ? container_of(css, struct cpuacct, css) : NULL; |
| 40 | struct cpuacct, css); | ||
| 41 | } | 39 | } |
| 42 | 40 | ||
| 43 | /* return cpu accounting group to which this task belongs */ | 41 | /* return cpu accounting group to which this task belongs */ |
| 44 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | 42 | static inline struct cpuacct *task_ca(struct task_struct *tsk) |
| 45 | { | 43 | { |
| 46 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | 44 | return css_ca(task_css(tsk, cpuacct_subsys_id)); |
| 47 | struct cpuacct, css); | ||
| 48 | } | ||
| 49 | |||
| 50 | static inline struct cpuacct *__parent_ca(struct cpuacct *ca) | ||
| 51 | { | ||
| 52 | return cgroup_ca(ca->css.cgroup->parent); | ||
| 53 | } | 45 | } |
| 54 | 46 | ||
| 55 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) | 47 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) |
| 56 | { | 48 | { |
| 57 | if (!ca->css.cgroup->parent) | 49 | return css_ca(css_parent(&ca->css)); |
| 58 | return NULL; | ||
| 59 | return cgroup_ca(ca->css.cgroup->parent); | ||
| 60 | } | 50 | } |
| 61 | 51 | ||
| 62 | static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); | 52 | static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); |
| @@ -66,11 +56,12 @@ static struct cpuacct root_cpuacct = { | |||
| 66 | }; | 56 | }; |
| 67 | 57 | ||
| 68 | /* create a new cpu accounting group */ | 58 | /* create a new cpu accounting group */ |
| 69 | static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) | 59 | static struct cgroup_subsys_state * |
| 60 | cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) | ||
| 70 | { | 61 | { |
| 71 | struct cpuacct *ca; | 62 | struct cpuacct *ca; |
| 72 | 63 | ||
| 73 | if (!cgrp->parent) | 64 | if (!parent_css) |
| 74 | return &root_cpuacct.css; | 65 | return &root_cpuacct.css; |
| 75 | 66 | ||
| 76 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); | 67 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); |
| @@ -96,9 +87,9 @@ out: | |||
| 96 | } | 87 | } |
| 97 | 88 | ||
| 98 | /* destroy an existing cpu accounting group */ | 89 | /* destroy an existing cpu accounting group */ |
| 99 | static void cpuacct_css_free(struct cgroup *cgrp) | 90 | static void cpuacct_css_free(struct cgroup_subsys_state *css) |
| 100 | { | 91 | { |
| 101 | struct cpuacct *ca = cgroup_ca(cgrp); | 92 | struct cpuacct *ca = css_ca(css); |
| 102 | 93 | ||
| 103 | free_percpu(ca->cpustat); | 94 | free_percpu(ca->cpustat); |
| 104 | free_percpu(ca->cpuusage); | 95 | free_percpu(ca->cpuusage); |
| @@ -141,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | |||
| 141 | } | 132 | } |
| 142 | 133 | ||
| 143 | /* return total cpu usage (in nanoseconds) of a group */ | 134 | /* return total cpu usage (in nanoseconds) of a group */ |
| 144 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | 135 | static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) |
| 145 | { | 136 | { |
| 146 | struct cpuacct *ca = cgroup_ca(cgrp); | 137 | struct cpuacct *ca = css_ca(css); |
| 147 | u64 totalcpuusage = 0; | 138 | u64 totalcpuusage = 0; |
| 148 | int i; | 139 | int i; |
| 149 | 140 | ||
| @@ -153,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | |||
| 153 | return totalcpuusage; | 144 | return totalcpuusage; |
| 154 | } | 145 | } |
| 155 | 146 | ||
| 156 | static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, | 147 | static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, |
| 157 | u64 reset) | 148 | u64 reset) |
| 158 | { | 149 | { |
| 159 | struct cpuacct *ca = cgroup_ca(cgrp); | 150 | struct cpuacct *ca = css_ca(css); |
| 160 | int err = 0; | 151 | int err = 0; |
| 161 | int i; | 152 | int i; |
| 162 | 153 | ||
| @@ -172,10 +163,10 @@ out: | |||
| 172 | return err; | 163 | return err; |
| 173 | } | 164 | } |
| 174 | 165 | ||
| 175 | static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, | 166 | static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, |
| 176 | struct seq_file *m) | 167 | struct cftype *cft, struct seq_file *m) |
| 177 | { | 168 | { |
| 178 | struct cpuacct *ca = cgroup_ca(cgroup); | 169 | struct cpuacct *ca = css_ca(css); |
| 179 | u64 percpu; | 170 | u64 percpu; |
| 180 | int i; | 171 | int i; |
| 181 | 172 | ||
| @@ -192,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = { | |||
| 192 | [CPUACCT_STAT_SYSTEM] = "system", | 183 | [CPUACCT_STAT_SYSTEM] = "system", |
| 193 | }; | 184 | }; |
| 194 | 185 | ||
| 195 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, | 186 | static int cpuacct_stats_show(struct cgroup_subsys_state *css, |
| 196 | struct cgroup_map_cb *cb) | 187 | struct cftype *cft, struct cgroup_map_cb *cb) |
| 197 | { | 188 | { |
| 198 | struct cpuacct *ca = cgroup_ca(cgrp); | 189 | struct cpuacct *ca = css_ca(css); |
| 199 | int cpu; | 190 | int cpu; |
| 200 | s64 val = 0; | 191 | s64 val = 0; |
| 201 | 192 | ||
| @@ -281,7 +272,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) | |||
| 281 | while (ca != &root_cpuacct) { | 272 | while (ca != &root_cpuacct) { |
| 282 | kcpustat = this_cpu_ptr(ca->cpustat); | 273 | kcpustat = this_cpu_ptr(ca->cpustat); |
| 283 | kcpustat->cpustat[index] += val; | 274 | kcpustat->cpustat[index] += val; |
| 284 | ca = __parent_ca(ca); | 275 | ca = parent_ca(ca); |
| 285 | } | 276 | } |
| 286 | rcu_read_unlock(); | 277 | rcu_read_unlock(); |
| 287 | } | 278 | } |
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 1095e878a46f..8b836b376d91 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c | |||
| @@ -62,7 +62,7 @@ static int convert_prio(int prio) | |||
| 62 | * any discrepancies created by racing against the uncertainty of the current | 62 | * any discrepancies created by racing against the uncertainty of the current |
| 63 | * priority configuration. | 63 | * priority configuration. |
| 64 | * | 64 | * |
| 65 | * Returns: (int)bool - CPUs were found | 65 | * Return: (int)bool - CPUs were found |
| 66 | */ | 66 | */ |
| 67 | int cpupri_find(struct cpupri *cp, struct task_struct *p, | 67 | int cpupri_find(struct cpupri *cp, struct task_struct *p, |
| 68 | struct cpumask *lowest_mask) | 68 | struct cpumask *lowest_mask) |
| @@ -203,7 +203,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
| 203 | * cpupri_init - initialize the cpupri structure | 203 | * cpupri_init - initialize the cpupri structure |
| 204 | * @cp: The cpupri context | 204 | * @cp: The cpupri context |
| 205 | * | 205 | * |
| 206 | * Returns: -ENOMEM if memory fails. | 206 | * Return: -ENOMEM on memory allocation failure. |
| 207 | */ | 207 | */ |
| 208 | int cpupri_init(struct cpupri *cp) | 208 | int cpupri_init(struct cpupri *cp) |
| 209 | { | 209 | { |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a7959e05a9d5..99947919e30b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
| @@ -121,7 +121,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, | |||
| 121 | * is the only cgroup, then nothing else should be necessary. | 121 | * is the only cgroup, then nothing else should be necessary. |
| 122 | * | 122 | * |
| 123 | */ | 123 | */ |
| 124 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | 124 | __this_cpu_add(kernel_cpustat.cpustat[index], tmp); |
| 125 | 125 | ||
| 126 | cpuacct_account_field(p, index, tmp); | 126 | cpuacct_account_field(p, index, tmp); |
| 127 | } | 127 | } |
| @@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ | |||
| 378 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | 378 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING |
| 379 | 379 | ||
| 380 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH | 380 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH |
| 381 | void vtime_task_switch(struct task_struct *prev) | 381 | void vtime_common_task_switch(struct task_struct *prev) |
| 382 | { | 382 | { |
| 383 | if (!vtime_accounting_enabled()) | ||
| 384 | return; | ||
| 385 | |||
| 386 | if (is_idle_task(prev)) | 383 | if (is_idle_task(prev)) |
| 387 | vtime_account_idle(prev); | 384 | vtime_account_idle(prev); |
| 388 | else | 385 | else |
| @@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev) | |||
| 404 | * vtime_account(). | 401 | * vtime_account(). |
| 405 | */ | 402 | */ |
| 406 | #ifndef __ARCH_HAS_VTIME_ACCOUNT | 403 | #ifndef __ARCH_HAS_VTIME_ACCOUNT |
| 407 | void vtime_account_irq_enter(struct task_struct *tsk) | 404 | void vtime_common_account_irq_enter(struct task_struct *tsk) |
| 408 | { | 405 | { |
| 409 | if (!vtime_accounting_enabled()) | ||
| 410 | return; | ||
| 411 | |||
| 412 | if (!in_interrupt()) { | 406 | if (!in_interrupt()) { |
| 413 | /* | 407 | /* |
| 414 | * If we interrupted user, context_tracking_in_user() | 408 | * If we interrupted user, context_tracking_in_user() |
| @@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk) | |||
| 428 | } | 422 | } |
| 429 | vtime_account_system(tsk); | 423 | vtime_account_system(tsk); |
| 430 | } | 424 | } |
| 431 | EXPORT_SYMBOL_GPL(vtime_account_irq_enter); | 425 | EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); |
| 432 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | 426 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ |
| 433 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ | 427 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ |
| 434 | 428 | ||
| @@ -557,16 +551,7 @@ static void cputime_adjust(struct task_cputime *curr, | |||
| 557 | struct cputime *prev, | 551 | struct cputime *prev, |
| 558 | cputime_t *ut, cputime_t *st) | 552 | cputime_t *ut, cputime_t *st) |
| 559 | { | 553 | { |
| 560 | cputime_t rtime, stime, utime, total; | 554 | cputime_t rtime, stime, utime; |
| 561 | |||
| 562 | if (vtime_accounting_enabled()) { | ||
| 563 | *ut = curr->utime; | ||
| 564 | *st = curr->stime; | ||
| 565 | return; | ||
| 566 | } | ||
| 567 | |||
| 568 | stime = curr->stime; | ||
| 569 | total = stime + curr->utime; | ||
| 570 | 555 | ||
| 571 | /* | 556 | /* |
| 572 | * Tick based cputime accounting depends on random scheduling | 557 | * Tick based cputime accounting depends on random scheduling |
| @@ -588,13 +573,19 @@ static void cputime_adjust(struct task_cputime *curr, | |||
| 588 | if (prev->stime + prev->utime >= rtime) | 573 | if (prev->stime + prev->utime >= rtime) |
| 589 | goto out; | 574 | goto out; |
| 590 | 575 | ||
| 591 | if (total) { | 576 | stime = curr->stime; |
| 577 | utime = curr->utime; | ||
| 578 | |||
| 579 | if (utime == 0) { | ||
| 580 | stime = rtime; | ||
| 581 | } else if (stime == 0) { | ||
| 582 | utime = rtime; | ||
| 583 | } else { | ||
| 584 | cputime_t total = stime + utime; | ||
| 585 | |||
| 592 | stime = scale_stime((__force u64)stime, | 586 | stime = scale_stime((__force u64)stime, |
| 593 | (__force u64)rtime, (__force u64)total); | 587 | (__force u64)rtime, (__force u64)total); |
| 594 | utime = rtime - stime; | 588 | utime = rtime - stime; |
| 595 | } else { | ||
| 596 | stime = rtime; | ||
| 597 | utime = 0; | ||
| 598 | } | 589 | } |
| 599 | 590 | ||
| 600 | /* | 591 | /* |
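The rewritten cputime_adjust() splits the monotonic rtime in proportion to the tick-sampled stime/utime unless one of them is zero: stime becomes scale_stime(stime, rtime, stime + utime), i.e. roughly stime * rtime / (stime + utime), and utime = rtime - stime. Worked example under that reading: with samples stime = 30, utime = 10 and rtime = 100, total = 40, so the adjusted stime = 30 * 100 / 40 = 75 and utime = 100 - 75 = 25, preserving the 3:1 ratio while summing to rtime.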
| @@ -664,23 +655,17 @@ static void __vtime_account_system(struct task_struct *tsk) | |||
| 664 | 655 | ||
| 665 | void vtime_account_system(struct task_struct *tsk) | 656 | void vtime_account_system(struct task_struct *tsk) |
| 666 | { | 657 | { |
| 667 | if (!vtime_accounting_enabled()) | ||
| 668 | return; | ||
| 669 | |||
| 670 | write_seqlock(&tsk->vtime_seqlock); | 658 | write_seqlock(&tsk->vtime_seqlock); |
| 671 | __vtime_account_system(tsk); | 659 | __vtime_account_system(tsk); |
| 672 | write_sequnlock(&tsk->vtime_seqlock); | 660 | write_sequnlock(&tsk->vtime_seqlock); |
| 673 | } | 661 | } |
| 674 | 662 | ||
| 675 | void vtime_account_irq_exit(struct task_struct *tsk) | 663 | void vtime_gen_account_irq_exit(struct task_struct *tsk) |
| 676 | { | 664 | { |
| 677 | if (!vtime_accounting_enabled()) | ||
| 678 | return; | ||
| 679 | |||
| 680 | write_seqlock(&tsk->vtime_seqlock); | 665 | write_seqlock(&tsk->vtime_seqlock); |
| 666 | __vtime_account_system(tsk); | ||
| 681 | if (context_tracking_in_user()) | 667 | if (context_tracking_in_user()) |
| 682 | tsk->vtime_snap_whence = VTIME_USER; | 668 | tsk->vtime_snap_whence = VTIME_USER; |
| 683 | __vtime_account_system(tsk); | ||
| 684 | write_sequnlock(&tsk->vtime_seqlock); | 669 | write_sequnlock(&tsk->vtime_seqlock); |
| 685 | } | 670 | } |
| 686 | 671 | ||
| @@ -688,12 +673,8 @@ void vtime_account_user(struct task_struct *tsk) | |||
| 688 | { | 673 | { |
| 689 | cputime_t delta_cpu; | 674 | cputime_t delta_cpu; |
| 690 | 675 | ||
| 691 | if (!vtime_accounting_enabled()) | ||
| 692 | return; | ||
| 693 | |||
| 694 | delta_cpu = get_vtime_delta(tsk); | ||
| 695 | |||
| 696 | write_seqlock(&tsk->vtime_seqlock); | 676 | write_seqlock(&tsk->vtime_seqlock); |
| 677 | delta_cpu = get_vtime_delta(tsk); | ||
| 697 | tsk->vtime_snap_whence = VTIME_SYS; | 678 | tsk->vtime_snap_whence = VTIME_SYS; |
| 698 | account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); | 679 | account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); |
| 699 | write_sequnlock(&tsk->vtime_seqlock); | 680 | write_sequnlock(&tsk->vtime_seqlock); |
| @@ -701,22 +682,27 @@ void vtime_account_user(struct task_struct *tsk) | |||
| 701 | 682 | ||
| 702 | void vtime_user_enter(struct task_struct *tsk) | 683 | void vtime_user_enter(struct task_struct *tsk) |
| 703 | { | 684 | { |
| 704 | if (!vtime_accounting_enabled()) | ||
| 705 | return; | ||
| 706 | |||
| 707 | write_seqlock(&tsk->vtime_seqlock); | 685 | write_seqlock(&tsk->vtime_seqlock); |
| 708 | tsk->vtime_snap_whence = VTIME_USER; | ||
| 709 | __vtime_account_system(tsk); | 686 | __vtime_account_system(tsk); |
| 687 | tsk->vtime_snap_whence = VTIME_USER; | ||
| 710 | write_sequnlock(&tsk->vtime_seqlock); | 688 | write_sequnlock(&tsk->vtime_seqlock); |
| 711 | } | 689 | } |
| 712 | 690 | ||
| 713 | void vtime_guest_enter(struct task_struct *tsk) | 691 | void vtime_guest_enter(struct task_struct *tsk) |
| 714 | { | 692 | { |
| 693 | /* | ||
| 694 | * The flags must be updated under the lock with | ||
| 695 | * the vtime_snap flush and update. | ||
| 696 | * That enforces the right ordering and update sequence | ||
| 697 | * synchronization against the reader (task_gtime()) | ||
| 698 | * that can thus safely catch up with a tickless delta. | ||
| 699 | */ | ||
| 715 | write_seqlock(&tsk->vtime_seqlock); | 700 | write_seqlock(&tsk->vtime_seqlock); |
| 716 | __vtime_account_system(tsk); | 701 | __vtime_account_system(tsk); |
| 717 | current->flags |= PF_VCPU; | 702 | current->flags |= PF_VCPU; |
| 718 | write_sequnlock(&tsk->vtime_seqlock); | 703 | write_sequnlock(&tsk->vtime_seqlock); |
| 719 | } | 704 | } |
| 705 | EXPORT_SYMBOL_GPL(vtime_guest_enter); | ||
| 720 | 706 | ||
| 721 | void vtime_guest_exit(struct task_struct *tsk) | 707 | void vtime_guest_exit(struct task_struct *tsk) |
| 722 | { | 708 | { |
| @@ -725,6 +711,7 @@ void vtime_guest_exit(struct task_struct *tsk) | |||
| 725 | current->flags &= ~PF_VCPU; | 711 | current->flags &= ~PF_VCPU; |
| 726 | write_sequnlock(&tsk->vtime_seqlock); | 712 | write_sequnlock(&tsk->vtime_seqlock); |
| 727 | } | 713 | } |
| 714 | EXPORT_SYMBOL_GPL(vtime_guest_exit); | ||
| 728 | 715 | ||
| 729 | void vtime_account_idle(struct task_struct *tsk) | 716 | void vtime_account_idle(struct task_struct *tsk) |
| 730 | { | 717 | { |
| @@ -733,11 +720,6 @@ void vtime_account_idle(struct task_struct *tsk) | |||
| 733 | account_idle_time(delta_cpu); | 720 | account_idle_time(delta_cpu); |
| 734 | } | 721 | } |
| 735 | 722 | ||
| 736 | bool vtime_accounting_enabled(void) | ||
| 737 | { | ||
| 738 | return context_tracking_active(); | ||
| 739 | } | ||
| 740 | |||
| 741 | void arch_vtime_task_switch(struct task_struct *prev) | 723 | void arch_vtime_task_switch(struct task_struct *prev) |
| 742 | { | 724 | { |
| 743 | write_seqlock(&prev->vtime_seqlock); | 725 | write_seqlock(&prev->vtime_seqlock); |
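The vtime_seqlock comments added above rely on readers retrying under the same seqlock that the writers take. A hedged sketch of that reader pattern, loosely modeled on what task_gtime() has to do (simplified; not the exact in-tree reader):

static cputime_t read_gtime_sketch(struct task_struct *t)
{
	unsigned int seq;
	cputime_t gtime;

	do {
		seq = read_seqbegin(&t->vtime_seqlock);
		gtime = t->gtime;
		/*
		 * A full reader would also add the pending vtime_snap delta
		 * when PF_VCPU is set; the seqlock retry makes that addition
		 * consistent with the writer side shown above.
		 */
	} while (read_seqretry(&t->vtime_seqlock, seq));

	return gtime;
}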
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e076bddd4c66..5c34d1817e8f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/seq_file.h> | 15 | #include <linux/seq_file.h> |
| 16 | #include <linux/kallsyms.h> | 16 | #include <linux/kallsyms.h> |
| 17 | #include <linux/utsname.h> | 17 | #include <linux/utsname.h> |
| 18 | #include <linux/mempolicy.h> | ||
| 18 | 19 | ||
| 19 | #include "sched.h" | 20 | #include "sched.h" |
| 20 | 21 | ||
| @@ -124,7 +125,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
| 124 | SEQ_printf(m, " "); | 125 | SEQ_printf(m, " "); |
| 125 | 126 | ||
| 126 | SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", | 127 | SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", |
| 127 | p->comm, p->pid, | 128 | p->comm, task_pid_nr(p), |
| 128 | SPLIT_NS(p->se.vruntime), | 129 | SPLIT_NS(p->se.vruntime), |
| 129 | (long long)(p->nvcsw + p->nivcsw), | 130 | (long long)(p->nvcsw + p->nivcsw), |
| 130 | p->prio); | 131 | p->prio); |
| @@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
| 137 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", | 138 | SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", |
| 138 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); | 139 | 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); |
| 139 | #endif | 140 | #endif |
| 141 | #ifdef CONFIG_NUMA_BALANCING | ||
| 142 | SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); | ||
| 143 | #endif | ||
| 140 | #ifdef CONFIG_CGROUP_SCHED | 144 | #ifdef CONFIG_CGROUP_SCHED |
| 141 | SEQ_printf(m, " %s", task_group_path(task_group(p))); | 145 | SEQ_printf(m, " %s", task_group_path(task_group(p))); |
| 142 | #endif | 146 | #endif |
| @@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
| 159 | read_lock_irqsave(&tasklist_lock, flags); | 163 | read_lock_irqsave(&tasklist_lock, flags); |
| 160 | 164 | ||
| 161 | do_each_thread(g, p) { | 165 | do_each_thread(g, p) { |
| 162 | if (!p->on_rq || task_cpu(p) != rq_cpu) | 166 | if (task_cpu(p) != rq_cpu) |
| 163 | continue; | 167 | continue; |
| 164 | 168 | ||
| 165 | print_task(m, rq, p); | 169 | print_task(m, rq, p); |
| @@ -225,6 +229,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 225 | atomic_read(&cfs_rq->tg->runnable_avg)); | 229 | atomic_read(&cfs_rq->tg->runnable_avg)); |
| 226 | #endif | 230 | #endif |
| 227 | #endif | 231 | #endif |
| 232 | #ifdef CONFIG_CFS_BANDWIDTH | ||
| 233 | SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active", | ||
| 234 | cfs_rq->tg->cfs_bandwidth.timer_active); | ||
| 235 | SEQ_printf(m, " .%-30s: %d\n", "throttled", | ||
| 236 | cfs_rq->throttled); | ||
| 237 | SEQ_printf(m, " .%-30s: %d\n", "throttle_count", | ||
| 238 | cfs_rq->throttle_count); | ||
| 239 | #endif | ||
| 228 | 240 | ||
| 229 | #ifdef CONFIG_FAIR_GROUP_SCHED | 241 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 230 | print_cfs_group_stats(m, cpu, cfs_rq->tg); | 242 | print_cfs_group_stats(m, cpu, cfs_rq->tg); |
| @@ -289,7 +301,7 @@ do { \ | |||
| 289 | P(nr_load_updates); | 301 | P(nr_load_updates); |
| 290 | P(nr_uninterruptible); | 302 | P(nr_uninterruptible); |
| 291 | PN(next_balance); | 303 | PN(next_balance); |
| 292 | P(curr->pid); | 304 | SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); |
| 293 | PN(clock); | 305 | PN(clock); |
| 294 | P(cpu_load[0]); | 306 | P(cpu_load[0]); |
| 295 | P(cpu_load[1]); | 307 | P(cpu_load[1]); |
| @@ -345,7 +357,7 @@ static void sched_debug_header(struct seq_file *m) | |||
| 345 | cpu_clk = local_clock(); | 357 | cpu_clk = local_clock(); |
| 346 | local_irq_restore(flags); | 358 | local_irq_restore(flags); |
| 347 | 359 | ||
| 348 | SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", | 360 | SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n", |
| 349 | init_utsname()->release, | 361 | init_utsname()->release, |
| 350 | (int)strcspn(init_utsname()->version, " "), | 362 | (int)strcspn(init_utsname()->version, " "), |
| 351 | init_utsname()->version); | 363 | init_utsname()->version); |
| @@ -488,11 +500,61 @@ static int __init init_sched_debug_procfs(void) | |||
| 488 | 500 | ||
| 489 | __initcall(init_sched_debug_procfs); | 501 | __initcall(init_sched_debug_procfs); |
| 490 | 502 | ||
| 503 | #define __P(F) \ | ||
| 504 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) | ||
| 505 | #define P(F) \ | ||
| 506 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) | ||
| 507 | #define __PN(F) \ | ||
| 508 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) | ||
| 509 | #define PN(F) \ | ||
| 510 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) | ||
| 511 | |||
| 512 | |||
| 513 | static void sched_show_numa(struct task_struct *p, struct seq_file *m) | ||
| 514 | { | ||
| 515 | #ifdef CONFIG_NUMA_BALANCING | ||
| 516 | struct mempolicy *pol; | ||
| 517 | int node, i; | ||
| 518 | |||
| 519 | if (p->mm) | ||
| 520 | P(mm->numa_scan_seq); | ||
| 521 | |||
| 522 | task_lock(p); | ||
| 523 | pol = p->mempolicy; | ||
| 524 | if (pol && !(pol->flags & MPOL_F_MORON)) | ||
| 525 | pol = NULL; | ||
| 526 | mpol_get(pol); | ||
| 527 | task_unlock(p); | ||
| 528 | |||
| 529 | SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0)); | ||
| 530 | |||
| 531 | for_each_online_node(node) { | ||
| 532 | for (i = 0; i < 2; i++) { | ||
| 533 | unsigned long nr_faults = -1; | ||
| 534 | int cpu_current, home_node; | ||
| 535 | |||
| 536 | if (p->numa_faults) | ||
| 537 | nr_faults = p->numa_faults[2*node + i]; | ||
| 538 | |||
| 539 | cpu_current = !i ? (task_node(p) == node) : | ||
| 540 | (pol && node_isset(node, pol->v.nodes)); | ||
| 541 | |||
| 542 | home_node = (p->numa_preferred_nid == node); | ||
| 543 | |||
| 544 | SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", | ||
| 545 | i, node, cpu_current, home_node, nr_faults); | ||
| 546 | } | ||
| 547 | } | ||
| 548 | |||
| 549 | mpol_put(pol); | ||
| 550 | #endif | ||
| 551 | } | ||
| 552 | |||
| 491 | void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | 553 | void proc_sched_show_task(struct task_struct *p, struct seq_file *m) |
| 492 | { | 554 | { |
| 493 | unsigned long nr_switches; | 555 | unsigned long nr_switches; |
| 494 | 556 | ||
| 495 | SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, | 557 | SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p), |
| 496 | get_nr_threads(p)); | 558 | get_nr_threads(p)); |
| 497 | SEQ_printf(m, | 559 | SEQ_printf(m, |
| 498 | "---------------------------------------------------------" | 560 | "---------------------------------------------------------" |
| @@ -591,6 +653,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 591 | SEQ_printf(m, "%-45s:%21Ld\n", | 653 | SEQ_printf(m, "%-45s:%21Ld\n", |
| 592 | "clock-delta", (long long)(t1-t0)); | 654 | "clock-delta", (long long)(t1-t0)); |
| 593 | } | 655 | } |
| 656 | |||
| 657 | sched_show_numa(p, m); | ||
| 594 | } | 658 | } |
| 595 | 659 | ||
| 596 | void proc_sched_set_task(struct task_struct *p) | 660 | void proc_sched_set_task(struct task_struct *p) |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9565645e3202..df77c605c7a6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 681 | } | 681 | } |
| 682 | 682 | ||
| 683 | #ifdef CONFIG_SMP | 683 | #ifdef CONFIG_SMP |
| 684 | static unsigned long task_h_load(struct task_struct *p); | ||
| 685 | |||
| 684 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 686 | static inline void __update_task_entity_contrib(struct sched_entity *se); |
| 685 | 687 | ||
| 686 | /* Give new task start runnable values to heavy its load in infant time */ | 688 | /* Give new task start runnable values to heavy its load in infant time */ |
| @@ -818,11 +820,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 818 | 820 | ||
| 819 | #ifdef CONFIG_NUMA_BALANCING | 821 | #ifdef CONFIG_NUMA_BALANCING |
| 820 | /* | 822 | /* |
| 821 | * numa task sample period in ms | 823 | * Approximate time to scan a full NUMA task in ms. The task scan period is |
| 824 | * calculated based on the tasks virtual memory size and | ||
| 825 | * numa_balancing_scan_size. | ||
| 822 | */ | 826 | */ |
| 823 | unsigned int sysctl_numa_balancing_scan_period_min = 100; | 827 | unsigned int sysctl_numa_balancing_scan_period_min = 1000; |
| 824 | unsigned int sysctl_numa_balancing_scan_period_max = 100*50; | 828 | unsigned int sysctl_numa_balancing_scan_period_max = 60000; |
| 825 | unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; | ||
| 826 | 829 | ||
| 827 | /* Portion of address space to scan in MB */ | 830 | /* Portion of address space to scan in MB */ |
| 828 | unsigned int sysctl_numa_balancing_scan_size = 256; | 831 | unsigned int sysctl_numa_balancing_scan_size = 256; |
| @@ -830,41 +833,810 @@ unsigned int sysctl_numa_balancing_scan_size = 256; | |||
| 830 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ | 833 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ |
| 831 | unsigned int sysctl_numa_balancing_scan_delay = 1000; | 834 | unsigned int sysctl_numa_balancing_scan_delay = 1000; |
| 832 | 835 | ||
| 833 | static void task_numa_placement(struct task_struct *p) | 836 | /* |
| 837 | * After skipping a page migration on a shared page, skip N more numa page | ||
| 838 | * migrations unconditionally. This reduces the number of NUMA migrations | ||
| 839 | * in shared memory workloads, and has the effect of pulling tasks towards | ||
| 840 | * where their memory lives, over pulling the memory towards the task. | ||
| 841 | */ | ||
| 842 | unsigned int sysctl_numa_balancing_migrate_deferred = 16; | ||
| 843 | |||
| 844 | static unsigned int task_nr_scan_windows(struct task_struct *p) | ||
| 845 | { | ||
| 846 | unsigned long rss = 0; | ||
| 847 | unsigned long nr_scan_pages; | ||
| 848 | |||
| 849 | /* | ||
| 850 | * Calculations based on RSS as non-present and empty pages are skipped | ||
| 851 | * by the PTE scanner and NUMA hinting faults should be trapped based | ||
| 852 | * on resident pages | ||
| 853 | */ | ||
| 854 | nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); | ||
| 855 | rss = get_mm_rss(p->mm); | ||
| 856 | if (!rss) | ||
| 857 | rss = nr_scan_pages; | ||
| 858 | |||
| 859 | rss = round_up(rss, nr_scan_pages); | ||
| 860 | return rss / nr_scan_pages; | ||
| 861 | } | ||
| 862 | |||
| 863 | /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ | ||
| 864 | #define MAX_SCAN_WINDOW 2560 | ||
| 865 | |||
| 866 | static unsigned int task_scan_min(struct task_struct *p) | ||
| 867 | { | ||
| 868 | unsigned int scan, floor; | ||
| 869 | unsigned int windows = 1; | ||
| 870 | |||
| 871 | if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) | ||
| 872 | windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; | ||
| 873 | floor = 1000 / windows; | ||
| 874 | |||
| 875 | scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); | ||
| 876 | return max_t(unsigned int, floor, scan); | ||
| 877 | } | ||
| 878 | |||
| 879 | static unsigned int task_scan_max(struct task_struct *p) | ||
| 880 | { | ||
| 881 | unsigned int smin = task_scan_min(p); | ||
| 882 | unsigned int smax; | ||
| 883 | |||
| 884 | /* Watch for min being lower than max due to floor calculations */ | ||
| 885 | smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); | ||
| 886 | return max(smin, smax); | ||
| 887 | } | ||
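Worked example of the scan-period math above, using the new defaults: sysctl_numa_balancing_scan_size = 256 MB and MAX_SCAN_WINDOW = 2560 MB/sec give windows = 2560 / 256 = 10 and floor = 1000 / 10 = 100 ms. A task with a 1 GB RSS spans 1024 / 256 = 4 scan windows, so task_scan_min() = max(100, 1000 / 4) = 250 ms and task_scan_max() = max(250, 60000 / 4) = 15000 ms; larger address spaces are therefore scanned proportionally less often.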
| 888 | |||
| 889 | /* | ||
| 890 | * Once a preferred node is selected the scheduler balancer will prefer moving | ||
| 891 | * a task to that node for sysctl_numa_balancing_settle_count number of PTE | ||
| 892 | * scans. This will give the process the chance to accumulate more faults on | ||
| 893 | * the preferred node but still allow the scheduler to move the task again if | ||
| 895 | * the node's CPUs are overloaded. | ||
| 895 | */ | ||
| 896 | unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; | ||
| 897 | |||
| 898 | static void account_numa_enqueue(struct rq *rq, struct task_struct *p) | ||
| 899 | { | ||
| 900 | rq->nr_numa_running += (p->numa_preferred_nid != -1); | ||
| 901 | rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); | ||
| 902 | } | ||
| 903 | |||
| 904 | static void account_numa_dequeue(struct rq *rq, struct task_struct *p) | ||
| 905 | { | ||
| 906 | rq->nr_numa_running -= (p->numa_preferred_nid != -1); | ||
| 907 | rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); | ||
| 908 | } | ||
| 909 | |||
| 910 | struct numa_group { | ||
| 911 | atomic_t refcount; | ||
| 912 | |||
| 913 | spinlock_t lock; /* nr_tasks, tasks */ | ||
| 914 | int nr_tasks; | ||
| 915 | pid_t gid; | ||
| 916 | struct list_head task_list; | ||
| 917 | |||
| 918 | struct rcu_head rcu; | ||
| 919 | unsigned long total_faults; | ||
| 920 | unsigned long faults[0]; | ||
| 921 | }; | ||
| 922 | |||
| 923 | pid_t task_numa_group_id(struct task_struct *p) | ||
| 924 | { | ||
| 925 | return p->numa_group ? p->numa_group->gid : 0; | ||
| 926 | } | ||
| 927 | |||
| 928 | static inline int task_faults_idx(int nid, int priv) | ||
| 929 | { | ||
| 930 | return 2 * nid + priv; | ||
| 931 | } | ||
| 932 | |||
| 933 | static inline unsigned long task_faults(struct task_struct *p, int nid) | ||
| 934 | { | ||
| 935 | if (!p->numa_faults) | ||
| 936 | return 0; | ||
| 937 | |||
| 938 | return p->numa_faults[task_faults_idx(nid, 0)] + | ||
| 939 | p->numa_faults[task_faults_idx(nid, 1)]; | ||
| 940 | } | ||
| 941 | |||
| 942 | static inline unsigned long group_faults(struct task_struct *p, int nid) | ||
| 943 | { | ||
| 944 | if (!p->numa_group) | ||
| 945 | return 0; | ||
| 946 | |||
| 947 | return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; | ||
| 948 | } | ||
| 949 | |||
| 950 | /* | ||
| 951 | * These return the fraction of accesses done by a particular task, or | ||
| 952 | * task group, on a particular numa node. The group weight is given a | ||
| 953 | * larger multiplier, in order to group tasks together that are almost | ||
| 954 | * evenly spread out between numa nodes. | ||
| 955 | */ | ||
| 956 | static inline unsigned long task_weight(struct task_struct *p, int nid) | ||
| 957 | { | ||
| 958 | unsigned long total_faults; | ||
| 959 | |||
| 960 | if (!p->numa_faults) | ||
| 961 | return 0; | ||
| 962 | |||
| 963 | total_faults = p->total_numa_faults; | ||
| 964 | |||
| 965 | if (!total_faults) | ||
| 966 | return 0; | ||
| 967 | |||
| 968 | return 1000 * task_faults(p, nid) / total_faults; | ||
| 969 | } | ||
| 970 | |||
| 971 | static inline unsigned long group_weight(struct task_struct *p, int nid) | ||
| 972 | { | ||
| 973 | if (!p->numa_group || !p->numa_group->total_faults) | ||
| 974 | return 0; | ||
| 975 | |||
| 976 | return 1000 * group_faults(p, nid) / p->numa_group->total_faults; | ||
| 977 | } | ||
| 978 | |||
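The helpers above lay per-task fault counts out as a flat array with two slots (private, shared) per node, and express weights as per-mille fractions of the total fault count. A small standalone illustration of that indexing and weighting, with made-up fault numbers:

#include <stdio.h>

#define NR_NODES 2

/* index into a [2 * nr_node_ids] fault array: slot 0 = private, slot 1 = shared */
static int faults_idx(int nid, int priv)
{
	return 2 * nid + priv;
}

int main(void)
{
	/* node 0: 300 private + 100 shared faults, node 1: 50 + 50 */
	unsigned long faults[2 * NR_NODES] = { 300, 100, 50, 50 };
	unsigned long total = 0;
	int nid;

	for (nid = 0; nid < NR_NODES; nid++)
		total += faults[faults_idx(nid, 0)] + faults[faults_idx(nid, 1)];

	for (nid = 0; nid < NR_NODES; nid++) {
		unsigned long f = faults[faults_idx(nid, 0)] +
				  faults[faults_idx(nid, 1)];

		/* per-mille weight, in the style of task_weight()/group_weight() */
		printf("node %d: weight = %lu/1000\n", nid, 1000 * f / total);
	}
	return 0;
}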
| 979 | static unsigned long weighted_cpuload(const int cpu); | ||
| 980 | static unsigned long source_load(int cpu, int type); | ||
| 981 | static unsigned long target_load(int cpu, int type); | ||
| 982 | static unsigned long power_of(int cpu); | ||
| 983 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg); | ||
| 984 | |||
| 985 | /* Cached statistics for all CPUs within a node */ | ||
| 986 | struct numa_stats { | ||
| 987 | unsigned long nr_running; | ||
| 988 | unsigned long load; | ||
| 989 | |||
| 990 | /* Total compute capacity of CPUs on a node */ | ||
| 991 | unsigned long power; | ||
| 992 | |||
| 993 | /* Approximate capacity in terms of runnable tasks on a node */ | ||
| 994 | unsigned long capacity; | ||
| 995 | int has_capacity; | ||
| 996 | }; | ||
| 997 | |||
| 998 | /* | ||
| 999 | * XXX borrowed from update_sg_lb_stats | ||
| 1000 | */ | ||
| 1001 | static void update_numa_stats(struct numa_stats *ns, int nid) | ||
| 1002 | { | ||
| 1003 | int cpu; | ||
| 1004 | |||
| 1005 | memset(ns, 0, sizeof(*ns)); | ||
| 1006 | for_each_cpu(cpu, cpumask_of_node(nid)) { | ||
| 1007 | struct rq *rq = cpu_rq(cpu); | ||
| 1008 | |||
| 1009 | ns->nr_running += rq->nr_running; | ||
| 1010 | ns->load += weighted_cpuload(cpu); | ||
| 1011 | ns->power += power_of(cpu); | ||
| 1012 | } | ||
| 1013 | |||
| 1014 | ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; | ||
| 1015 | ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); | ||
| 1016 | ns->has_capacity = (ns->nr_running < ns->capacity); | ||
| 1017 | } | ||
| 1018 | |||
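update_numa_stats() only sums the per-CPU runnable load and compute power of a node, normalises the load to SCHED_POWER_SCALE units and derives a rough task capacity from the power. The same arithmetic as a standalone sketch, with fixed per-CPU numbers and SCHED_POWER_SCALE assumed to be 1024:

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
	/* an imaginary 4-CPU node: per-CPU weighted load and compute power */
	unsigned long cpu_load[4]  = { 900, 1100, 0, 2048 };
	unsigned long cpu_power[4] = { 1024, 1024, 1024, 1024 };
	unsigned long nr_running = 3;
	unsigned long load = 0, power = 0, capacity;
	int cpu;

	for (cpu = 0; cpu < 4; cpu++) {
		load += cpu_load[cpu];
		power += cpu_power[cpu];
	}

	/* normalise the load to SCHED_POWER_SCALE units of compute */
	load = load * SCHED_POWER_SCALE / power;
	/* rough capacity: one runnable task per SCHED_POWER_SCALE of power */
	capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);

	printf("load=%lu capacity=%lu has_capacity=%d\n",
	       load, capacity, nr_running < capacity);
	return 0;
}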
| 1019 | struct task_numa_env { | ||
| 1020 | struct task_struct *p; | ||
| 1021 | |||
| 1022 | int src_cpu, src_nid; | ||
| 1023 | int dst_cpu, dst_nid; | ||
| 1024 | |||
| 1025 | struct numa_stats src_stats, dst_stats; | ||
| 1026 | |||
| 1027 | int imbalance_pct, idx; | ||
| 1028 | |||
| 1029 | struct task_struct *best_task; | ||
| 1030 | long best_imp; | ||
| 1031 | int best_cpu; | ||
| 1032 | }; | ||
| 1033 | |||
| 1034 | static void task_numa_assign(struct task_numa_env *env, | ||
| 1035 | struct task_struct *p, long imp) | ||
| 1036 | { | ||
| 1037 | if (env->best_task) | ||
| 1038 | put_task_struct(env->best_task); | ||
| 1039 | if (p) | ||
| 1040 | get_task_struct(p); | ||
| 1041 | |||
| 1042 | env->best_task = p; | ||
| 1043 | env->best_imp = imp; | ||
| 1044 | env->best_cpu = env->dst_cpu; | ||
| 1045 | } | ||
| 1046 | |||
| 1047 | /* | ||
| 1048 | * This checks if the overall compute and NUMA accesses of the system would | ||
| 1049 | * be improved if the source task were migrated to the target dst_cpu, taking | ||
| 1050 | * into account that it might be best if the task running on dst_cpu is | ||
| 1051 | * exchanged with the source task. | ||
| 1052 | */ | ||
| 1053 | static void task_numa_compare(struct task_numa_env *env, | ||
| 1054 | long taskimp, long groupimp) | ||
| 1055 | { | ||
| 1056 | struct rq *src_rq = cpu_rq(env->src_cpu); | ||
| 1057 | struct rq *dst_rq = cpu_rq(env->dst_cpu); | ||
| 1058 | struct task_struct *cur; | ||
| 1059 | long dst_load, src_load; | ||
| 1060 | long load; | ||
| 1061 | long imp = (groupimp > 0) ? groupimp : taskimp; | ||
| 1062 | |||
| 1063 | rcu_read_lock(); | ||
| 1064 | cur = ACCESS_ONCE(dst_rq->curr); | ||
| 1065 | if (cur->pid == 0) /* idle */ | ||
| 1066 | cur = NULL; | ||
| 1067 | |||
| 1068 | /* | ||
| 1069 | * "imp" is the fault differential for the source task between the | ||
| 1070 | * source and destination node. Calculate the total differential for | ||
| 1071 | * the source task and potential destination task. The more negative | ||
| 1072 | * the value is, the more remote accesses would be expected to | ||
| 1073 | * be incurred if the tasks were swapped. | ||
| 1074 | */ | ||
| 1075 | if (cur) { | ||
| 1076 | /* Skip this swap candidate if cannot move to the source cpu */ | ||
| 1077 | if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur))) | ||
| 1078 | goto unlock; | ||
| 1079 | |||
| 1080 | /* | ||
| 1081 | * If dst and source tasks are in the same NUMA group, or not | ||
| 1082 | * in any group then look only at task weights. | ||
| 1083 | */ | ||
| 1084 | if (cur->numa_group == env->p->numa_group) { | ||
| 1085 | imp = taskimp + task_weight(cur, env->src_nid) - | ||
| 1086 | task_weight(cur, env->dst_nid); | ||
| 1087 | /* | ||
| 1088 | * Add some hysteresis to prevent swapping the | ||
| 1089 | * tasks within a group over tiny differences. | ||
| 1090 | */ | ||
| 1091 | if (cur->numa_group) | ||
| 1092 | imp -= imp/16; | ||
| 1093 | } else { | ||
| 1094 | /* | ||
| 1095 | * Compare the group weights. If a task is all by | ||
| 1096 | * itself (not part of a group), use the task weight | ||
| 1097 | * instead. | ||
| 1098 | */ | ||
| 1099 | if (env->p->numa_group) | ||
| 1100 | imp = groupimp; | ||
| 1101 | else | ||
| 1102 | imp = taskimp; | ||
| 1103 | |||
| 1104 | if (cur->numa_group) | ||
| 1105 | imp += group_weight(cur, env->src_nid) - | ||
| 1106 | group_weight(cur, env->dst_nid); | ||
| 1107 | else | ||
| 1108 | imp += task_weight(cur, env->src_nid) - | ||
| 1109 | task_weight(cur, env->dst_nid); | ||
| 1110 | } | ||
| 1111 | } | ||
| 1112 | |||
| 1113 | if (imp < env->best_imp) | ||
| 1114 | goto unlock; | ||
| 1115 | |||
| 1116 | if (!cur) { | ||
| 1117 | /* Is there capacity at our destination? */ | ||
| 1118 | if (env->src_stats.has_capacity && | ||
| 1119 | !env->dst_stats.has_capacity) | ||
| 1120 | goto unlock; | ||
| 1121 | |||
| 1122 | goto balance; | ||
| 1123 | } | ||
| 1124 | |||
| 1125 | /* Balance doesn't matter much if we're running a task per cpu */ | ||
| 1126 | if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) | ||
| 1127 | goto assign; | ||
| 1128 | |||
| 1129 | /* | ||
| 1130 | * In the overloaded case, try and keep the load balanced. | ||
| 1131 | */ | ||
| 1132 | balance: | ||
| 1133 | dst_load = env->dst_stats.load; | ||
| 1134 | src_load = env->src_stats.load; | ||
| 1135 | |||
| 1136 | /* XXX missing power terms */ | ||
| 1137 | load = task_h_load(env->p); | ||
| 1138 | dst_load += load; | ||
| 1139 | src_load -= load; | ||
| 1140 | |||
| 1141 | if (cur) { | ||
| 1142 | load = task_h_load(cur); | ||
| 1143 | dst_load -= load; | ||
| 1144 | src_load += load; | ||
| 1145 | } | ||
| 1146 | |||
| 1147 | /* make src_load the smaller */ | ||
| 1148 | if (dst_load < src_load) | ||
| 1149 | swap(dst_load, src_load); | ||
| 1150 | |||
| 1151 | if (src_load * env->imbalance_pct < dst_load * 100) | ||
| 1152 | goto unlock; | ||
| 1153 | |||
| 1154 | assign: | ||
| 1155 | task_numa_assign(env, cur, imp); | ||
| 1156 | unlock: | ||
| 1157 | rcu_read_unlock(); | ||
| 1158 | } | ||
| 1159 | |||
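The tail of task_numa_compare() is effectively a load-balance guard: after accounting for the proposed move or swap, the lighter node scaled by imbalance_pct must still cover the heavier one. A hedged standalone rendering of that comparison, with invented load figures:

#include <stdio.h>
#include <stdbool.h>

/* would moving task_load from src to dst keep the nodes acceptably balanced? */
static bool move_keeps_balance(long src_load, long dst_load,
			       long task_load, int imbalance_pct)
{
	long tmp;

	dst_load += task_load;
	src_load -= task_load;

	/* make src_load the smaller of the two, as task_numa_compare() does */
	if (dst_load < src_load) {
		tmp = dst_load;
		dst_load = src_load;
		src_load = tmp;
	}
	/* allowed unless the lighter side falls too far behind the heavier one */
	return src_load * imbalance_pct >= dst_load * 100;
}

int main(void)
{
	/* 112 mirrors the halved sd->imbalance_pct set up in task_numa_migrate() */
	printf("%d\n", move_keeps_balance(2000, 1500, 300, 112));	/* 1: allowed */
	printf("%d\n", move_keeps_balance(1000, 1500, 600, 112));	/* 0: too skewed */
	return 0;
}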
| 1160 | static void task_numa_find_cpu(struct task_numa_env *env, | ||
| 1161 | long taskimp, long groupimp) | ||
| 1162 | { | ||
| 1163 | int cpu; | ||
| 1164 | |||
| 1165 | for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { | ||
| 1166 | /* Skip this CPU if the source task cannot migrate */ | ||
| 1167 | if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p))) | ||
| 1168 | continue; | ||
| 1169 | |||
| 1170 | env->dst_cpu = cpu; | ||
| 1171 | task_numa_compare(env, taskimp, groupimp); | ||
| 1172 | } | ||
| 1173 | } | ||
| 1174 | |||
| 1175 | static int task_numa_migrate(struct task_struct *p) | ||
| 1176 | { | ||
| 1177 | struct task_numa_env env = { | ||
| 1178 | .p = p, | ||
| 1179 | |||
| 1180 | .src_cpu = task_cpu(p), | ||
| 1181 | .src_nid = task_node(p), | ||
| 1182 | |||
| 1183 | .imbalance_pct = 112, | ||
| 1184 | |||
| 1185 | .best_task = NULL, | ||
| 1186 | .best_imp = 0, | ||
| 1187 | .best_cpu = -1 | ||
| 1188 | }; | ||
| 1189 | struct sched_domain *sd; | ||
| 1190 | unsigned long taskweight, groupweight; | ||
| 1191 | int nid, ret; | ||
| 1192 | long taskimp, groupimp; | ||
| 1193 | |||
| 1194 | /* | ||
| 1195 | * Pick the lowest SD_NUMA domain, as that would have the smallest | ||
| 1196 | * imbalance and would be the first to start moving tasks about. | ||
| 1197 | * | ||
| 1198 | * We also want to avoid any unnecessary moving of tasks about, as that | ||
| 1199 | * would create random task movement -- countering the NUMA conditions | ||
| 1200 | * we're trying to satisfy here. | ||
| 1201 | */ | ||
| 1202 | rcu_read_lock(); | ||
| 1203 | sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); | ||
| 1204 | env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; | ||
| 1205 | rcu_read_unlock(); | ||
| 1206 | |||
| 1207 | taskweight = task_weight(p, env.src_nid); | ||
| 1208 | groupweight = group_weight(p, env.src_nid); | ||
| 1209 | update_numa_stats(&env.src_stats, env.src_nid); | ||
| 1210 | env.dst_nid = p->numa_preferred_nid; | ||
| 1211 | taskimp = task_weight(p, env.dst_nid) - taskweight; | ||
| 1212 | groupimp = group_weight(p, env.dst_nid) - groupweight; | ||
| 1213 | update_numa_stats(&env.dst_stats, env.dst_nid); | ||
| 1214 | |||
| 1215 | /* If the preferred nid has capacity, try to use it. */ | ||
| 1216 | if (env.dst_stats.has_capacity) | ||
| 1217 | task_numa_find_cpu(&env, taskimp, groupimp); | ||
| 1218 | |||
| 1219 | /* No space available on the preferred nid. Look elsewhere. */ | ||
| 1220 | if (env.best_cpu == -1) { | ||
| 1221 | for_each_online_node(nid) { | ||
| 1222 | if (nid == env.src_nid || nid == p->numa_preferred_nid) | ||
| 1223 | continue; | ||
| 1224 | |||
| 1225 | /* Only consider nodes where both task and groups benefit */ | ||
| 1226 | taskimp = task_weight(p, nid) - taskweight; | ||
| 1227 | groupimp = group_weight(p, nid) - groupweight; | ||
| 1228 | if (taskimp < 0 && groupimp < 0) | ||
| 1229 | continue; | ||
| 1230 | |||
| 1231 | env.dst_nid = nid; | ||
| 1232 | update_numa_stats(&env.dst_stats, env.dst_nid); | ||
| 1233 | task_numa_find_cpu(&env, taskimp, groupimp); | ||
| 1234 | } | ||
| 1235 | } | ||
| 1236 | |||
| 1237 | /* No better CPU than the current one was found. */ | ||
| 1238 | if (env.best_cpu == -1) | ||
| 1239 | return -EAGAIN; | ||
| 1240 | |||
| 1241 | sched_setnuma(p, env.dst_nid); | ||
| 1242 | |||
| 1243 | /* | ||
| 1244 | * Reset the scan period if the task is being rescheduled on an | ||
| 1245 | * alternative node to recheck if the task is now properly placed. | ||
| 1246 | */ | ||
| 1247 | p->numa_scan_period = task_scan_min(p); | ||
| 1248 | |||
| 1249 | if (env.best_task == NULL) { | ||
| 1250 | int ret = migrate_task_to(p, env.best_cpu); | ||
| 1251 | return ret; | ||
| 1252 | } | ||
| 1253 | |||
| 1254 | ret = migrate_swap(p, env.best_task); | ||
| 1255 | put_task_struct(env.best_task); | ||
| 1256 | return ret; | ||
| 1257 | } | ||
| 1258 | |||
| 1259 | /* Attempt to migrate a task to a CPU on the preferred node. */ | ||
| 1260 | static void numa_migrate_preferred(struct task_struct *p) | ||
| 1261 | { | ||
| 1262 | /* This task has no NUMA fault statistics yet */ | ||
| 1263 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) | ||
| 1264 | return; | ||
| 1265 | |||
| 1266 | /* Periodically retry migrating the task to the preferred node */ | ||
| 1267 | p->numa_migrate_retry = jiffies + HZ; | ||
| 1268 | |||
| 1269 | /* Success if task is already running on preferred CPU */ | ||
| 1270 | if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) | ||
| 1271 | return; | ||
| 1272 | |||
| 1273 | /* Otherwise, try migrate to a CPU on the preferred node */ | ||
| 1274 | task_numa_migrate(p); | ||
| 1275 | } | ||
| 1276 | |||
| 1277 | /* | ||
| 1278 | * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS | ||
| 1279 | * increments. The more local the fault statistics are, the higher the scan | ||
| 1280 | * period will be for the next scan window. If the local/remote ratio is below | ||
| 1281 | * NUMA_PERIOD_THRESHOLD (where the ratio ranges over 1..NUMA_PERIOD_SLOTS) the | ||
| 1282 | * scan period will decrease. | ||
| 1283 | */ | ||
| 1284 | #define NUMA_PERIOD_SLOTS 10 | ||
| 1285 | #define NUMA_PERIOD_THRESHOLD 3 | ||
| 1286 | |||
| 1287 | /* | ||
| 1288 | * Increase the scan period (slow down scanning) if the majority of | ||
| 1289 | * our memory is already on our local node, or if the majority of | ||
| 1290 | * the page accesses are shared with other processes. | ||
| 1291 | * Otherwise, decrease the scan period. | ||
| 1292 | */ | ||
| 1293 | static void update_task_scan_period(struct task_struct *p, | ||
| 1294 | unsigned long shared, unsigned long private) | ||
| 834 | { | 1295 | { |
| 835 | int seq; | 1296 | unsigned int period_slot; |
| 1297 | int ratio; | ||
| 1298 | int diff; | ||
| 1299 | |||
| 1300 | unsigned long remote = p->numa_faults_locality[0]; | ||
| 1301 | unsigned long local = p->numa_faults_locality[1]; | ||
| 1302 | |||
| 1303 | /* | ||
| 1304 | * If there were no recorded hinting faults then either the task is | ||
| 1305 | * completely idle or all activity is in areas that are not of interest | ||
| 1306 | * to automatic numa balancing. Scan slower. | ||
| 1307 | */ | ||
| 1308 | if (local + shared == 0) { | ||
| 1309 | p->numa_scan_period = min(p->numa_scan_period_max, | ||
| 1310 | p->numa_scan_period << 1); | ||
| 1311 | |||
| 1312 | p->mm->numa_next_scan = jiffies + | ||
| 1313 | msecs_to_jiffies(p->numa_scan_period); | ||
| 836 | 1314 | ||
| 837 | if (!p->mm) /* for example, ksmd faulting in a user's mm */ | ||
| 838 | return; | 1315 | return; |
| 1316 | } | ||
| 1317 | |||
| 1318 | /* | ||
| 1319 | * Prepare to scale scan period relative to the current period. | ||
| 1320 | * == NUMA_PERIOD_THRESHOLD scan period stays the same | ||
| 1321 | * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) | ||
| 1322 | * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) | ||
| 1323 | */ | ||
| 1324 | period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); | ||
| 1325 | ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); | ||
| 1326 | if (ratio >= NUMA_PERIOD_THRESHOLD) { | ||
| 1327 | int slot = ratio - NUMA_PERIOD_THRESHOLD; | ||
| 1328 | if (!slot) | ||
| 1329 | slot = 1; | ||
| 1330 | diff = slot * period_slot; | ||
| 1331 | } else { | ||
| 1332 | diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; | ||
| 1333 | |||
| 1334 | /* | ||
| 1335 | * Scale scan rate increases based on sharing. There is an | ||
| 1336 | * inverse relationship between the degree of sharing and | ||
| 1337 | * the adjustment made to the scanning period. Broadly | ||
| 1338 | * speaking, the intent is that there is little point | ||
| 1339 | * scanning faster if shared accesses dominate, as it may | ||
| 1340 | * simply bounce migrations around uselessly. | ||
| 1341 | */ | ||
| 1342 | period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); | ||
| 1343 | ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); | ||
| 1344 | diff = (diff * ratio) / NUMA_PERIOD_SLOTS; | ||
| 1345 | } | ||
| 1346 | |||
| 1347 | p->numa_scan_period = clamp(p->numa_scan_period + diff, | ||
| 1348 | task_scan_min(p), task_scan_max(p)); | ||
| 1349 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); | ||
| 1350 | } | ||
| 1351 | |||
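update_task_scan_period() divides the current period into NUMA_PERIOD_SLOTS slots and shifts it by whole slots according to how local the recorded faults were; when remote faults dominate, the speed-up is further scaled by the private/shared ratio. A compilable sketch of that slot arithmetic with illustrative inputs (the intermediate period_slot recalculation in the original is dropped here, as it appears not to affect the result):

#include <stdio.h>

#define NUMA_PERIOD_SLOTS 10
#define NUMA_PERIOD_THRESHOLD 3
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static int clampi(int v, int lo, int hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* slot arithmetic in the style of update_task_scan_period() */
static unsigned int adjust_period(unsigned int period,
				  unsigned long local, unsigned long remote,
				  unsigned long priv, unsigned long shared,
				  int pmin, int pmax)
{
	unsigned int period_slot = DIV_ROUND_UP(period, NUMA_PERIOD_SLOTS);
	int ratio = local * NUMA_PERIOD_SLOTS / (local + remote);
	int diff;

	if (ratio >= NUMA_PERIOD_THRESHOLD) {
		/* mostly local: slow the scanner down by whole slots */
		int slot = ratio - NUMA_PERIOD_THRESHOLD;

		if (!slot)
			slot = 1;
		diff = slot * period_slot;
	} else {
		/* mostly remote: speed up, scaled by how private the faults were */
		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
		ratio = DIV_ROUND_UP(priv * NUMA_PERIOD_SLOTS, priv + shared);
		diff = diff * ratio / NUMA_PERIOD_SLOTS;
	}
	return clampi(period + diff, pmin, pmax);
}

int main(void)
{
	/* mostly-local faults lengthen the period, mostly-remote shorten it */
	printf("%u\n", adjust_period(1000, 900, 100, 800, 200, 1000, 60000));
	printf("%u\n", adjust_period(2000, 100, 900, 800, 200, 1000, 60000));
	return 0;
}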
| 1352 | static void task_numa_placement(struct task_struct *p) | ||
| 1353 | { | ||
| 1354 | int seq, nid, max_nid = -1, max_group_nid = -1; | ||
| 1355 | unsigned long max_faults = 0, max_group_faults = 0; | ||
| 1356 | unsigned long fault_types[2] = { 0, 0 }; | ||
| 1357 | spinlock_t *group_lock = NULL; | ||
| 1358 | |||
| 839 | seq = ACCESS_ONCE(p->mm->numa_scan_seq); | 1359 | seq = ACCESS_ONCE(p->mm->numa_scan_seq); |
| 840 | if (p->numa_scan_seq == seq) | 1360 | if (p->numa_scan_seq == seq) |
| 841 | return; | 1361 | return; |
| 842 | p->numa_scan_seq = seq; | 1362 | p->numa_scan_seq = seq; |
| 1363 | p->numa_scan_period_max = task_scan_max(p); | ||
| 1364 | |||
| 1365 | /* If the task is part of a group prevent parallel updates to group stats */ | ||
| 1366 | if (p->numa_group) { | ||
| 1367 | group_lock = &p->numa_group->lock; | ||
| 1368 | spin_lock(group_lock); | ||
| 1369 | } | ||
| 1370 | |||
| 1371 | /* Find the node with the highest number of faults */ | ||
| 1372 | for_each_online_node(nid) { | ||
| 1373 | unsigned long faults = 0, group_faults = 0; | ||
| 1374 | int priv, i; | ||
| 1375 | |||
| 1376 | for (priv = 0; priv < 2; priv++) { | ||
| 1377 | long diff; | ||
| 1378 | |||
| 1379 | i = task_faults_idx(nid, priv); | ||
| 1380 | diff = -p->numa_faults[i]; | ||
| 1381 | |||
| 1382 | /* Decay existing window, copy faults since last scan */ | ||
| 1383 | p->numa_faults[i] >>= 1; | ||
| 1384 | p->numa_faults[i] += p->numa_faults_buffer[i]; | ||
| 1385 | fault_types[priv] += p->numa_faults_buffer[i]; | ||
| 1386 | p->numa_faults_buffer[i] = 0; | ||
| 1387 | |||
| 1388 | faults += p->numa_faults[i]; | ||
| 1389 | diff += p->numa_faults[i]; | ||
| 1390 | p->total_numa_faults += diff; | ||
| 1391 | if (p->numa_group) { | ||
| 1392 | /* safe because we can only change our own group */ | ||
| 1393 | p->numa_group->faults[i] += diff; | ||
| 1394 | p->numa_group->total_faults += diff; | ||
| 1395 | group_faults += p->numa_group->faults[i]; | ||
| 1396 | } | ||
| 1397 | } | ||
| 1398 | |||
| 1399 | if (faults > max_faults) { | ||
| 1400 | max_faults = faults; | ||
| 1401 | max_nid = nid; | ||
| 1402 | } | ||
| 1403 | |||
| 1404 | if (group_faults > max_group_faults) { | ||
| 1405 | max_group_faults = group_faults; | ||
| 1406 | max_group_nid = nid; | ||
| 1407 | } | ||
| 1408 | } | ||
| 1409 | |||
| 1410 | update_task_scan_period(p, fault_types[0], fault_types[1]); | ||
| 1411 | |||
| 1412 | if (p->numa_group) { | ||
| 1413 | /* | ||
| 1414 | * If the preferred task and group nids are different, | ||
| 1415 | * iterate over the nodes again to find the best place. | ||
| 1416 | */ | ||
| 1417 | if (max_nid != max_group_nid) { | ||
| 1418 | unsigned long weight, max_weight = 0; | ||
| 1419 | |||
| 1420 | for_each_online_node(nid) { | ||
| 1421 | weight = task_weight(p, nid) + group_weight(p, nid); | ||
| 1422 | if (weight > max_weight) { | ||
| 1423 | max_weight = weight; | ||
| 1424 | max_nid = nid; | ||
| 1425 | } | ||
| 1426 | } | ||
| 1427 | } | ||
| 1428 | |||
| 1429 | spin_unlock(group_lock); | ||
| 1430 | } | ||
| 1431 | |||
| 1432 | /* Set the preferred node to the node with the most faults */ | ||
| 1433 | if (max_faults && max_nid != p->numa_preferred_nid) { | ||
| 1434 | /* Update the preferred nid and migrate task if possible */ | ||
| 1435 | sched_setnuma(p, max_nid); | ||
| 1436 | numa_migrate_preferred(p); | ||
| 1437 | } | ||
| 1438 | } | ||
| 1439 | |||
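Note that the per-node counters updated in task_numa_placement() are decaying windows rather than raw totals: each pass halves the previous value and adds the faults recorded since the last scan. A tiny sketch of how such a window behaves over a few scans, with arbitrary sample input:

#include <stdio.h>

int main(void)
{
	/* faults recorded by each successive scan window for one node */
	unsigned long new_faults[5] = { 400, 400, 0, 0, 0 };
	unsigned long decayed = 0;
	int i;

	for (i = 0; i < 5; i++) {
		/* halve the previous window, add what the latest scan saw */
		decayed = (decayed >> 1) + new_faults[i];
		printf("scan %d: decayed count = %lu\n", i, decayed);
	}
	return 0;
}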
| 1440 | static inline int get_numa_group(struct numa_group *grp) | ||
| 1441 | { | ||
| 1442 | return atomic_inc_not_zero(&grp->refcount); | ||
| 1443 | } | ||
| 1444 | |||
| 1445 | static inline void put_numa_group(struct numa_group *grp) | ||
| 1446 | { | ||
| 1447 | if (atomic_dec_and_test(&grp->refcount)) | ||
| 1448 | kfree_rcu(grp, rcu); | ||
| 1449 | } | ||
| 1450 | |||
| 1451 | static void task_numa_group(struct task_struct *p, int cpupid, int flags, | ||
| 1452 | int *priv) | ||
| 1453 | { | ||
| 1454 | struct numa_group *grp, *my_grp; | ||
| 1455 | struct task_struct *tsk; | ||
| 1456 | bool join = false; | ||
| 1457 | int cpu = cpupid_to_cpu(cpupid); | ||
| 1458 | int i; | ||
| 1459 | |||
| 1460 | if (unlikely(!p->numa_group)) { | ||
| 1461 | unsigned int size = sizeof(struct numa_group) + | ||
| 1462 | 2*nr_node_ids*sizeof(unsigned long); | ||
| 1463 | |||
| 1464 | grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); | ||
| 1465 | if (!grp) | ||
| 1466 | return; | ||
| 1467 | |||
| 1468 | atomic_set(&grp->refcount, 1); | ||
| 1469 | spin_lock_init(&grp->lock); | ||
| 1470 | INIT_LIST_HEAD(&grp->task_list); | ||
| 1471 | grp->gid = p->pid; | ||
| 1472 | |||
| 1473 | for (i = 0; i < 2*nr_node_ids; i++) | ||
| 1474 | grp->faults[i] = p->numa_faults[i]; | ||
| 1475 | |||
| 1476 | grp->total_faults = p->total_numa_faults; | ||
| 1477 | |||
| 1478 | list_add(&p->numa_entry, &grp->task_list); | ||
| 1479 | grp->nr_tasks++; | ||
| 1480 | rcu_assign_pointer(p->numa_group, grp); | ||
| 1481 | } | ||
| 1482 | |||
| 1483 | rcu_read_lock(); | ||
| 1484 | tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); | ||
| 843 | 1485 | ||
| 844 | /* FIXME: Scheduling placement policy hints go here */ | 1486 | if (!cpupid_match_pid(tsk, cpupid)) |
| 1487 | goto no_join; | ||
| 1488 | |||
| 1489 | grp = rcu_dereference(tsk->numa_group); | ||
| 1490 | if (!grp) | ||
| 1491 | goto no_join; | ||
| 1492 | |||
| 1493 | my_grp = p->numa_group; | ||
| 1494 | if (grp == my_grp) | ||
| 1495 | goto no_join; | ||
| 1496 | |||
| 1497 | /* | ||
| 1498 | * Only join the other group if it's bigger; if we're the bigger group, | ||
| 1499 | * the other task will join us. | ||
| 1500 | */ | ||
| 1501 | if (my_grp->nr_tasks > grp->nr_tasks) | ||
| 1502 | goto no_join; | ||
| 1503 | |||
| 1504 | /* | ||
| 1505 | * Tie-break on the grp address. | ||
| 1506 | */ | ||
| 1507 | if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) | ||
| 1508 | goto no_join; | ||
| 1509 | |||
| 1510 | /* Always join threads in the same process. */ | ||
| 1511 | if (tsk->mm == current->mm) | ||
| 1512 | join = true; | ||
| 1513 | |||
| 1514 | /* Simple filter to avoid false positives due to PID collisions */ | ||
| 1515 | if (flags & TNF_SHARED) | ||
| 1516 | join = true; | ||
| 1517 | |||
| 1518 | /* Update priv based on whether false sharing was detected */ | ||
| 1519 | *priv = !join; | ||
| 1520 | |||
| 1521 | if (join && !get_numa_group(grp)) | ||
| 1522 | goto no_join; | ||
| 1523 | |||
| 1524 | rcu_read_unlock(); | ||
| 1525 | |||
| 1526 | if (!join) | ||
| 1527 | return; | ||
| 1528 | |||
| 1529 | double_lock(&my_grp->lock, &grp->lock); | ||
| 1530 | |||
| 1531 | for (i = 0; i < 2*nr_node_ids; i++) { | ||
| 1532 | my_grp->faults[i] -= p->numa_faults[i]; | ||
| 1533 | grp->faults[i] += p->numa_faults[i]; | ||
| 1534 | } | ||
| 1535 | my_grp->total_faults -= p->total_numa_faults; | ||
| 1536 | grp->total_faults += p->total_numa_faults; | ||
| 1537 | |||
| 1538 | list_move(&p->numa_entry, &grp->task_list); | ||
| 1539 | my_grp->nr_tasks--; | ||
| 1540 | grp->nr_tasks++; | ||
| 1541 | |||
| 1542 | spin_unlock(&my_grp->lock); | ||
| 1543 | spin_unlock(&grp->lock); | ||
| 1544 | |||
| 1545 | rcu_assign_pointer(p->numa_group, grp); | ||
| 1546 | |||
| 1547 | put_numa_group(my_grp); | ||
| 1548 | return; | ||
| 1549 | |||
| 1550 | no_join: | ||
| 1551 | rcu_read_unlock(); | ||
| 1552 | return; | ||
| 1553 | } | ||
| 1554 | |||
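struct numa_group ends in a zero-length faults[] array, so the allocation in task_numa_group() sizes the header plus 2 * nr_node_ids counters in one block. A userspace approximation of that pattern, with a trimmed-down struct, plain calloc standing in for kzalloc, and an assumed node count:

#include <stdio.h>
#include <stdlib.h>

/* trimmed-down stand-in for the kernel's struct numa_group */
struct numa_group_sketch {
	int nr_tasks;
	unsigned long total_faults;
	unsigned long faults[];		/* two counters (priv/shared) per node */
};

int main(void)
{
	int nr_node_ids = 4;		/* assumption: a four-node machine */
	size_t size = sizeof(struct numa_group_sketch) +
		      2 * nr_node_ids * sizeof(unsigned long);
	struct numa_group_sketch *grp = calloc(1, size);	/* zeroed, like kzalloc */

	if (!grp)
		return 1;
	grp->faults[2 * 1 + 0] = 42;	/* node 1, private slot */
	printf("allocated %zu bytes for %d nodes\n", size, nr_node_ids);
	free(grp);
	return 0;
}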
| 1555 | void task_numa_free(struct task_struct *p) | ||
| 1556 | { | ||
| 1557 | struct numa_group *grp = p->numa_group; | ||
| 1558 | int i; | ||
| 1559 | void *numa_faults = p->numa_faults; | ||
| 1560 | |||
| 1561 | if (grp) { | ||
| 1562 | spin_lock(&grp->lock); | ||
| 1563 | for (i = 0; i < 2*nr_node_ids; i++) | ||
| 1564 | grp->faults[i] -= p->numa_faults[i]; | ||
| 1565 | grp->total_faults -= p->total_numa_faults; | ||
| 1566 | |||
| 1567 | list_del(&p->numa_entry); | ||
| 1568 | grp->nr_tasks--; | ||
| 1569 | spin_unlock(&grp->lock); | ||
| 1570 | rcu_assign_pointer(p->numa_group, NULL); | ||
| 1571 | put_numa_group(grp); | ||
| 1572 | } | ||
| 1573 | |||
| 1574 | p->numa_faults = NULL; | ||
| 1575 | p->numa_faults_buffer = NULL; | ||
| 1576 | kfree(numa_faults); | ||
| 845 | } | 1577 | } |
| 846 | 1578 | ||
| 847 | /* | 1579 | /* |
| 848 | * Got a PROT_NONE fault for a page on @node. | 1580 | * Got a PROT_NONE fault for a page on @node. |
| 849 | */ | 1581 | */ |
| 850 | void task_numa_fault(int node, int pages, bool migrated) | 1582 | void task_numa_fault(int last_cpupid, int node, int pages, int flags) |
| 851 | { | 1583 | { |
| 852 | struct task_struct *p = current; | 1584 | struct task_struct *p = current; |
| 1585 | bool migrated = flags & TNF_MIGRATED; | ||
| 1586 | int priv; | ||
| 853 | 1587 | ||
| 854 | if (!numabalancing_enabled) | 1588 | if (!numabalancing_enabled) |
| 855 | return; | 1589 | return; |
| 856 | 1590 | ||
| 857 | /* FIXME: Allocate task-specific structure for placement policy here */ | 1591 | /* for example, ksmd faulting in a user's mm */ |
| 1592 | if (!p->mm) | ||
| 1593 | return; | ||
| 1594 | |||
| 1595 | /* Do not worry about placement if exiting */ | ||
| 1596 | if (p->state == TASK_DEAD) | ||
| 1597 | return; | ||
| 1598 | |||
| 1599 | /* Allocate buffer to track faults on a per-node basis */ | ||
| 1600 | if (unlikely(!p->numa_faults)) { | ||
| 1601 | int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; | ||
| 1602 | |||
| 1603 | /* numa_faults and numa_faults_buffer share the allocation */ | ||
| 1604 | p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); | ||
| 1605 | if (!p->numa_faults) | ||
| 1606 | return; | ||
| 1607 | |||
| 1608 | BUG_ON(p->numa_faults_buffer); | ||
| 1609 | p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); | ||
| 1610 | p->total_numa_faults = 0; | ||
| 1611 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); | ||
| 1612 | } | ||
| 858 | 1613 | ||
| 859 | /* | 1614 | /* |
| 860 | * If pages are properly placed (did not migrate) then scan slower. | 1615 | * First accesses are treated as private, otherwise consider accesses |
| 861 | * This is reset periodically in case of phase changes | 1616 | * to be private if the accessing pid has not changed |
| 862 | */ | 1617 | */ |
| 863 | if (!migrated) | 1618 | if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { |
| 864 | p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, | 1619 | priv = 1; |
| 865 | p->numa_scan_period + jiffies_to_msecs(10)); | 1620 | } else { |
| 1621 | priv = cpupid_match_pid(p, last_cpupid); | ||
| 1622 | if (!priv && !(flags & TNF_NO_GROUP)) | ||
| 1623 | task_numa_group(p, last_cpupid, flags, &priv); | ||
| 1624 | } | ||
| 866 | 1625 | ||
| 867 | task_numa_placement(p); | 1626 | task_numa_placement(p); |
| 1627 | |||
| 1628 | /* | ||
| 1629 | * Periodically retry migrating the task to its preferred node, in case | ||
| 1630 | * it previously failed, or the scheduler moved us. | ||
| 1631 | */ | ||
| 1632 | if (time_after(jiffies, p->numa_migrate_retry)) | ||
| 1633 | numa_migrate_preferred(p); | ||
| 1634 | |||
| 1635 | if (migrated) | ||
| 1636 | p->numa_pages_migrated += pages; | ||
| 1637 | |||
| 1638 | p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; | ||
| 1639 | p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; | ||
| 868 | } | 1640 | } |
| 869 | 1641 | ||
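task_numa_fault() classifies each hinting fault as private or shared from the last accessor recorded in the page's cpupid field: a first touch, or a repeat access by the same pid, counts as private. A hedged sketch of that classification; the bit packing below is invented for illustration and is not the kernel's cpupid encoding:

#include <stdio.h>
#include <stdbool.h>

#define LAST_PID_BITS 8
#define LAST_PID_MASK ((1 << LAST_PID_BITS) - 1)
#define PID_UNSET     (-1 & LAST_PID_MASK)	/* "page never accessed" marker */

/* was this access private to the faulting task? */
static bool fault_is_private(int last_pid_bits, int current_pid)
{
	if (last_pid_bits == PID_UNSET)
		return true;			/* first touch: treat as private */
	return last_pid_bits == (current_pid & LAST_PID_MASK);
}

int main(void)
{
	int pid = 1234;

	printf("first touch: %d\n", fault_is_private(PID_UNSET, pid));
	printf("same pid:    %d\n", fault_is_private(pid & LAST_PID_MASK, pid));
	printf("other pid:   %d\n", fault_is_private((pid + 1) & LAST_PID_MASK, pid));
	return 0;
}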
| 870 | static void reset_ptenuma_scan(struct task_struct *p) | 1642 | static void reset_ptenuma_scan(struct task_struct *p) |
| @@ -884,6 +1656,7 @@ void task_numa_work(struct callback_head *work) | |||
| 884 | struct mm_struct *mm = p->mm; | 1656 | struct mm_struct *mm = p->mm; |
| 885 | struct vm_area_struct *vma; | 1657 | struct vm_area_struct *vma; |
| 886 | unsigned long start, end; | 1658 | unsigned long start, end; |
| 1659 | unsigned long nr_pte_updates = 0; | ||
| 887 | long pages; | 1660 | long pages; |
| 888 | 1661 | ||
| 889 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); | 1662 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); |
| @@ -900,35 +1673,9 @@ void task_numa_work(struct callback_head *work) | |||
| 900 | if (p->flags & PF_EXITING) | 1673 | if (p->flags & PF_EXITING) |
| 901 | return; | 1674 | return; |
| 902 | 1675 | ||
| 903 | /* | 1676 | if (!mm->numa_next_scan) { |
| 904 | * We do not care about task placement until a task runs on a node | 1677 | mm->numa_next_scan = now + |
| 905 | * other than the first one used by the address space. This is | 1678 | msecs_to_jiffies(sysctl_numa_balancing_scan_delay); |
| 906 | * largely because migrations are driven by what CPU the task | ||
| 907 | * is running on. If it's never scheduled on another node, it'll | ||
| 908 | * not migrate so why bother trapping the fault. | ||
| 909 | */ | ||
| 910 | if (mm->first_nid == NUMA_PTE_SCAN_INIT) | ||
| 911 | mm->first_nid = numa_node_id(); | ||
| 912 | if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { | ||
| 913 | /* Are we running on a new node yet? */ | ||
| 914 | if (numa_node_id() == mm->first_nid && | ||
| 915 | !sched_feat_numa(NUMA_FORCE)) | ||
| 916 | return; | ||
| 917 | |||
| 918 | mm->first_nid = NUMA_PTE_SCAN_ACTIVE; | ||
| 919 | } | ||
| 920 | |||
| 921 | /* | ||
| 922 | * Reset the scan period if enough time has gone by. Objective is that | ||
| 923 | * scanning will be reduced if pages are properly placed. As tasks | ||
| 924 | * can enter different phases this needs to be re-examined. Lacking | ||
| 925 | * proper tracking of reference behaviour, this blunt hammer is used. | ||
| 926 | */ | ||
| 927 | migrate = mm->numa_next_reset; | ||
| 928 | if (time_after(now, migrate)) { | ||
| 929 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
| 930 | next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); | ||
| 931 | xchg(&mm->numa_next_reset, next_scan); | ||
| 932 | } | 1679 | } |
| 933 | 1680 | ||
| 934 | /* | 1681 | /* |
| @@ -938,20 +1685,20 @@ void task_numa_work(struct callback_head *work) | |||
| 938 | if (time_before(now, migrate)) | 1685 | if (time_before(now, migrate)) |
| 939 | return; | 1686 | return; |
| 940 | 1687 | ||
| 941 | if (p->numa_scan_period == 0) | 1688 | if (p->numa_scan_period == 0) { |
| 942 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | 1689 | p->numa_scan_period_max = task_scan_max(p); |
| 1690 | p->numa_scan_period = task_scan_min(p); | ||
| 1691 | } | ||
| 943 | 1692 | ||
| 944 | next_scan = now + msecs_to_jiffies(p->numa_scan_period); | 1693 | next_scan = now + msecs_to_jiffies(p->numa_scan_period); |
| 945 | if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) | 1694 | if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) |
| 946 | return; | 1695 | return; |
| 947 | 1696 | ||
| 948 | /* | 1697 | /* |
| 949 | * Do not set pte_numa if the current running node is rate-limited. | 1698 | * Delay this task enough that another task of this mm will likely win |
| 950 | * This loses statistics on the fault but if we are unwilling to | 1699 | * the next time around. |
| 951 | * migrate to this node, it is less likely we can do useful work | ||
| 952 | */ | 1700 | */ |
| 953 | if (migrate_ratelimited(numa_node_id())) | 1701 | p->node_stamp += 2 * TICK_NSEC; |
| 954 | return; | ||
| 955 | 1702 | ||
| 956 | start = mm->numa_scan_offset; | 1703 | start = mm->numa_scan_offset; |
| 957 | pages = sysctl_numa_balancing_scan_size; | 1704 | pages = sysctl_numa_balancing_scan_size; |
| @@ -967,18 +1714,32 @@ void task_numa_work(struct callback_head *work) | |||
| 967 | vma = mm->mmap; | 1714 | vma = mm->mmap; |
| 968 | } | 1715 | } |
| 969 | for (; vma; vma = vma->vm_next) { | 1716 | for (; vma; vma = vma->vm_next) { |
| 970 | if (!vma_migratable(vma)) | 1717 | if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) |
| 971 | continue; | 1718 | continue; |
| 972 | 1719 | ||
| 973 | /* Skip small VMAs. They are not likely to be of relevance */ | 1720 | /* |
| 974 | if (vma->vm_end - vma->vm_start < HPAGE_SIZE) | 1721 | * Shared library pages mapped by multiple processes are not |
| 1722 | * migrated as it is expected they are cache replicated. Avoid | ||
| 1723 | * hinting faults in read-only file-backed mappings or the vdso | ||
| 1724 | * as migrating the pages will be of marginal benefit. | ||
| 1725 | */ | ||
| 1726 | if (!vma->vm_mm || | ||
| 1727 | (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) | ||
| 975 | continue; | 1728 | continue; |
| 976 | 1729 | ||
| 977 | do { | 1730 | do { |
| 978 | start = max(start, vma->vm_start); | 1731 | start = max(start, vma->vm_start); |
| 979 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); | 1732 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); |
| 980 | end = min(end, vma->vm_end); | 1733 | end = min(end, vma->vm_end); |
| 981 | pages -= change_prot_numa(vma, start, end); | 1734 | nr_pte_updates += change_prot_numa(vma, start, end); |
| 1735 | |||
| 1736 | /* | ||
| 1737 | * Scan sysctl_numa_balancing_scan_size but ensure that | ||
| 1738 | * at least one PTE is updated so that unused virtual | ||
| 1739 | * address space is quickly skipped. | ||
| 1740 | */ | ||
| 1741 | if (nr_pte_updates) | ||
| 1742 | pages -= (end - start) >> PAGE_SHIFT; | ||
| 982 | 1743 | ||
| 983 | start = end; | 1744 | start = end; |
| 984 | if (pages <= 0) | 1745 | if (pages <= 0) |
| @@ -988,10 +1749,10 @@ void task_numa_work(struct callback_head *work) | |||
| 988 | 1749 | ||
| 989 | out: | 1750 | out: |
| 990 | /* | 1751 | /* |
| 991 | * It is possible to reach the end of the VMA list but the last few VMAs are | 1752 | * It is possible to reach the end of the VMA list but the last few |
| 992 | * not guaranteed to be vma_migratable. If they are not, we would find the | 1753 | * VMAs are not guaranteed to be vma_migratable. If they are not, we
| 993 | * !migratable VMA on the next scan but not reset the scanner to the start | 1754 | * would find the !migratable VMA on the next scan but not reset the |
| 994 | * so check it now. | 1755 | * scanner to the start so check it now. |
| 995 | */ | 1756 | */ |
| 996 | if (vma) | 1757 | if (vma) |
| 997 | mm->numa_scan_offset = start; | 1758 | mm->numa_scan_offset = start; |
| @@ -1025,8 +1786,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) | |||
| 1025 | 1786 | ||
| 1026 | if (now - curr->node_stamp > period) { | 1787 | if (now - curr->node_stamp > period) { |
| 1027 | if (!curr->node_stamp) | 1788 | if (!curr->node_stamp) |
| 1028 | curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; | 1789 | curr->numa_scan_period = task_scan_min(curr); |
| 1029 | curr->node_stamp = now; | 1790 | curr->node_stamp += period; |
| 1030 | 1791 | ||
| 1031 | if (!time_before(jiffies, curr->mm->numa_next_scan)) { | 1792 | if (!time_before(jiffies, curr->mm->numa_next_scan)) { |
| 1032 | init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ | 1793 | init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ |
| @@ -1038,6 +1799,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) | |||
| 1038 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) | 1799 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) |
| 1039 | { | 1800 | { |
| 1040 | } | 1801 | } |
| 1802 | |||
| 1803 | static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) | ||
| 1804 | { | ||
| 1805 | } | ||
| 1806 | |||
| 1807 | static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) | ||
| 1808 | { | ||
| 1809 | } | ||
| 1041 | #endif /* CONFIG_NUMA_BALANCING */ | 1810 | #endif /* CONFIG_NUMA_BALANCING */ |
| 1042 | 1811 | ||
| 1043 | static void | 1812 | static void |
| @@ -1047,8 +1816,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 1047 | if (!parent_entity(se)) | 1816 | if (!parent_entity(se)) |
| 1048 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); | 1817 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); |
| 1049 | #ifdef CONFIG_SMP | 1818 | #ifdef CONFIG_SMP |
| 1050 | if (entity_is_task(se)) | 1819 | if (entity_is_task(se)) { |
| 1051 | list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); | 1820 | struct rq *rq = rq_of(cfs_rq); |
| 1821 | |||
| 1822 | account_numa_enqueue(rq, task_of(se)); | ||
| 1823 | list_add(&se->group_node, &rq->cfs_tasks); | ||
| 1824 | } | ||
| 1052 | #endif | 1825 | #endif |
| 1053 | cfs_rq->nr_running++; | 1826 | cfs_rq->nr_running++; |
| 1054 | } | 1827 | } |
| @@ -1059,8 +1832,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 1059 | update_load_sub(&cfs_rq->load, se->load.weight); | 1832 | update_load_sub(&cfs_rq->load, se->load.weight); |
| 1060 | if (!parent_entity(se)) | 1833 | if (!parent_entity(se)) |
| 1061 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); | 1834 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); |
| 1062 | if (entity_is_task(se)) | 1835 | if (entity_is_task(se)) { |
| 1836 | account_numa_dequeue(rq_of(cfs_rq), task_of(se)); | ||
| 1063 | list_del_init(&se->group_node); | 1837 | list_del_init(&se->group_node); |
| 1838 | } | ||
| 1064 | cfs_rq->nr_running--; | 1839 | cfs_rq->nr_running--; |
| 1065 | } | 1840 | } |
| 1066 | 1841 | ||
| @@ -2032,6 +2807,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
| 2032 | */ | 2807 | */ |
| 2033 | update_entity_load_avg(curr, 1); | 2808 | update_entity_load_avg(curr, 1); |
| 2034 | update_cfs_rq_blocked_load(cfs_rq, 1); | 2809 | update_cfs_rq_blocked_load(cfs_rq, 1); |
| 2810 | update_cfs_shares(cfs_rq); | ||
| 2035 | 2811 | ||
| 2036 | #ifdef CONFIG_SCHED_HRTICK | 2812 | #ifdef CONFIG_SCHED_HRTICK |
| 2037 | /* | 2813 | /* |
| @@ -2069,13 +2845,14 @@ static inline bool cfs_bandwidth_used(void) | |||
| 2069 | return static_key_false(&__cfs_bandwidth_used); | 2845 | return static_key_false(&__cfs_bandwidth_used); |
| 2070 | } | 2846 | } |
| 2071 | 2847 | ||
| 2072 | void account_cfs_bandwidth_used(int enabled, int was_enabled) | 2848 | void cfs_bandwidth_usage_inc(void) |
| 2073 | { | 2849 | { |
| 2074 | /* only need to count groups transitioning between enabled/!enabled */ | 2850 | static_key_slow_inc(&__cfs_bandwidth_used); |
| 2075 | if (enabled && !was_enabled) | 2851 | } |
| 2076 | static_key_slow_inc(&__cfs_bandwidth_used); | 2852 | |
| 2077 | else if (!enabled && was_enabled) | 2853 | void cfs_bandwidth_usage_dec(void) |
| 2078 | static_key_slow_dec(&__cfs_bandwidth_used); | 2854 | { |
| 2855 | static_key_slow_dec(&__cfs_bandwidth_used); | ||
| 2079 | } | 2856 | } |
| 2080 | #else /* HAVE_JUMP_LABEL */ | 2857 | #else /* HAVE_JUMP_LABEL */ |
| 2081 | static bool cfs_bandwidth_used(void) | 2858 | static bool cfs_bandwidth_used(void) |
| @@ -2083,7 +2860,8 @@ static bool cfs_bandwidth_used(void) | |||
| 2083 | return true; | 2860 | return true; |
| 2084 | } | 2861 | } |
| 2085 | 2862 | ||
| 2086 | void account_cfs_bandwidth_used(int enabled, int was_enabled) {} | 2863 | void cfs_bandwidth_usage_inc(void) {} |
| 2864 | void cfs_bandwidth_usage_dec(void) {} | ||
| 2087 | #endif /* HAVE_JUMP_LABEL */ | 2865 | #endif /* HAVE_JUMP_LABEL */ |
| 2088 | 2866 | ||
| 2089 | /* | 2867 | /* |
| @@ -2334,6 +3112,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | |||
| 2334 | cfs_rq->throttled_clock = rq_clock(rq); | 3112 | cfs_rq->throttled_clock = rq_clock(rq); |
| 2335 | raw_spin_lock(&cfs_b->lock); | 3113 | raw_spin_lock(&cfs_b->lock); |
| 2336 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | 3114 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); |
| 3115 | if (!cfs_b->timer_active) | ||
| 3116 | __start_cfs_bandwidth(cfs_b); | ||
| 2337 | raw_spin_unlock(&cfs_b->lock); | 3117 | raw_spin_unlock(&cfs_b->lock); |
| 2338 | } | 3118 | } |
| 2339 | 3119 | ||
| @@ -2447,6 +3227,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | |||
| 2447 | if (idle) | 3227 | if (idle) |
| 2448 | goto out_unlock; | 3228 | goto out_unlock; |
| 2449 | 3229 | ||
| 3230 | /* | ||
| 3231 | * if we have relooped after returning idle once, we need to update our | ||
| 3232 | * status as actually running, so that other cpus doing | ||
| 3233 | * __start_cfs_bandwidth will stop trying to cancel us. | ||
| 3234 | */ | ||
| 3235 | cfs_b->timer_active = 1; | ||
| 3236 | |||
| 2450 | __refill_cfs_bandwidth_runtime(cfs_b); | 3237 | __refill_cfs_bandwidth_runtime(cfs_b); |
| 2451 | 3238 | ||
| 2452 | if (!throttled) { | 3239 | if (!throttled) { |
| @@ -2507,7 +3294,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; | |||
| 2507 | /* how long we wait to gather additional slack before distributing */ | 3294 | /* how long we wait to gather additional slack before distributing */ |
| 2508 | static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; | 3295 | static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; |
| 2509 | 3296 | ||
| 2510 | /* are we near the end of the current quota period? */ | 3297 | /* |
| 3298 | * Are we near the end of the current quota period? | ||
| 3299 | * | ||
| 3300 | * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the | ||
| 3301 | * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of | ||
| 3302 | * migrate_hrtimers, base is never cleared, so we are fine. | ||
| 3303 | */ | ||
| 2511 | static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) | 3304 | static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) |
| 2512 | { | 3305 | { |
| 2513 | struct hrtimer *refresh_timer = &cfs_b->period_timer; | 3306 | struct hrtimer *refresh_timer = &cfs_b->period_timer; |
| @@ -2583,10 +3376,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | |||
| 2583 | u64 expires; | 3376 | u64 expires; |
| 2584 | 3377 | ||
| 2585 | /* confirm we're still not at a refresh boundary */ | 3378 | /* confirm we're still not at a refresh boundary */ |
| 2586 | if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) | 3379 | raw_spin_lock(&cfs_b->lock); |
| 3380 | if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { | ||
| 3381 | raw_spin_unlock(&cfs_b->lock); | ||
| 2587 | return; | 3382 | return; |
| 3383 | } | ||
| 2588 | 3384 | ||
| 2589 | raw_spin_lock(&cfs_b->lock); | ||
| 2590 | if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { | 3385 | if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { |
| 2591 | runtime = cfs_b->runtime; | 3386 | runtime = cfs_b->runtime; |
| 2592 | cfs_b->runtime = 0; | 3387 | cfs_b->runtime = 0; |
| @@ -2707,11 +3502,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | |||
| 2707 | * (timer_active==0 becomes visible before the hrtimer call-back | 3502 | * (timer_active==0 becomes visible before the hrtimer call-back |
| 2708 | * terminates). In either case we ensure that it's re-programmed | 3503 | * terminates). In either case we ensure that it's re-programmed |
| 2709 | */ | 3504 | */ |
| 2710 | while (unlikely(hrtimer_active(&cfs_b->period_timer))) { | 3505 | while (unlikely(hrtimer_active(&cfs_b->period_timer)) && |
| 3506 | hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { | ||
| 3507 | /* bounce the lock to allow do_sched_cfs_period_timer to run */ | ||
| 2711 | raw_spin_unlock(&cfs_b->lock); | 3508 | raw_spin_unlock(&cfs_b->lock); |
| 2712 | /* ensure cfs_b->lock is available while we wait */ | 3509 | cpu_relax(); |
| 2713 | hrtimer_cancel(&cfs_b->period_timer); | ||
| 2714 | |||
| 2715 | raw_spin_lock(&cfs_b->lock); | 3510 | raw_spin_lock(&cfs_b->lock); |
| 2716 | /* if someone else restarted the timer then we're done */ | 3511 | /* if someone else restarted the timer then we're done */ |
| 2717 | if (cfs_b->timer_active) | 3512 | if (cfs_b->timer_active) |
| @@ -3017,6 +3812,23 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
| 3017 | return 0; | 3812 | return 0; |
| 3018 | } | 3813 | } |
| 3019 | 3814 | ||
| 3815 | static void record_wakee(struct task_struct *p) | ||
| 3816 | { | ||
| 3817 | /* | ||
| 3818 | * Rough decay (wiping) for cost saving; don't worry | ||
| 3819 | * about the boundary, a really active task won't care | ||
| 3820 | * about the loss. | ||
| 3821 | */ | ||
| 3822 | if (jiffies > current->wakee_flip_decay_ts + HZ) { | ||
| 3823 | current->wakee_flips = 0; | ||
| 3824 | current->wakee_flip_decay_ts = jiffies; | ||
| 3825 | } | ||
| 3826 | |||
| 3827 | if (current->last_wakee != p) { | ||
| 3828 | current->last_wakee = p; | ||
| 3829 | current->wakee_flips++; | ||
| 3830 | } | ||
| 3831 | } | ||
| 3020 | 3832 | ||
| 3021 | static void task_waking_fair(struct task_struct *p) | 3833 | static void task_waking_fair(struct task_struct *p) |
| 3022 | { | 3834 | { |
| @@ -3037,6 +3849,7 @@ static void task_waking_fair(struct task_struct *p) | |||
| 3037 | #endif | 3849 | #endif |
| 3038 | 3850 | ||
| 3039 | se->vruntime -= min_vruntime; | 3851 | se->vruntime -= min_vruntime; |
| 3852 | record_wakee(p); | ||
| 3040 | } | 3853 | } |
| 3041 | 3854 | ||
| 3042 | #ifdef CONFIG_FAIR_GROUP_SCHED | 3855 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| @@ -3094,7 +3907,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
| 3094 | { | 3907 | { |
| 3095 | struct sched_entity *se = tg->se[cpu]; | 3908 | struct sched_entity *se = tg->se[cpu]; |
| 3096 | 3909 | ||
| 3097 | if (!tg->parent) /* the trivial, non-cgroup case */ | 3910 | if (!tg->parent || !wl) /* the trivial, non-cgroup case */ |
| 3098 | return wl; | 3911 | return wl; |
| 3099 | 3912 | ||
| 3100 | for_each_sched_entity(se) { | 3913 | for_each_sched_entity(se) { |
| @@ -3147,14 +3960,35 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | |||
| 3147 | } | 3960 | } |
| 3148 | #else | 3961 | #else |
| 3149 | 3962 | ||
| 3150 | static inline unsigned long effective_load(struct task_group *tg, int cpu, | 3963 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg) |
| 3151 | unsigned long wl, unsigned long wg) | ||
| 3152 | { | 3964 | { |
| 3153 | return wl; | 3965 | return wl; |
| 3154 | } | 3966 | } |
| 3155 | 3967 | ||
| 3156 | #endif | 3968 | #endif |
| 3157 | 3969 | ||
| 3970 | static int wake_wide(struct task_struct *p) | ||
| 3971 | { | ||
| 3972 | int factor = this_cpu_read(sd_llc_size); | ||
| 3973 | |||
| 3974 | /* | ||
| 3975 | * wakee_flips is the wakee switching frequency; a high value can mean | ||
| 3976 | * many wakees or rapid switching. Using the LLC size as the factor | ||
| 3977 | * automatically adjusts the threshold, so a bigger node leads to more pull. | ||
| 3978 | */ | ||
| 3979 | if (p->wakee_flips > factor) { | ||
| 3980 | /* | ||
| 3981 | * The wakee is somewhat hot and needs a certain amount of cpu | ||
| 3982 | * resources, so if the waker is far hotter, prefer to leave | ||
| 3983 | * it alone. | ||
| 3984 | */ | ||
| 3985 | if (current->wakee_flips > (factor * p->wakee_flips)) | ||
| 3986 | return 1; | ||
| 3987 | } | ||
| 3988 | |||
| 3989 | return 0; | ||
| 3990 | } | ||
| 3991 | |||
| 3158 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 3992 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
| 3159 | { | 3993 | { |
| 3160 | s64 this_load, load; | 3994 | s64 this_load, load; |
| @@ -3164,6 +3998,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
| 3164 | unsigned long weight; | 3998 | unsigned long weight; |
| 3165 | int balanced; | 3999 | int balanced; |
| 3166 | 4000 | ||
| 4001 | /* | ||
| 4002 | * If we wake multiple tasks be careful to not bounce | ||
| 4003 | * ourselves around too much. | ||
| 4004 | */ | ||
| 4005 | if (wake_wide(p)) | ||
| 4006 | return 0; | ||
| 4007 | |||
| 3167 | idx = sd->wake_idx; | 4008 | idx = sd->wake_idx; |
| 3168 | this_cpu = smp_processor_id(); | 4009 | this_cpu = smp_processor_id(); |
| 3169 | prev_cpu = task_cpu(p); | 4010 | prev_cpu = task_cpu(p); |
| @@ -3372,11 +4213,10 @@ done: | |||
| 3372 | * preempt must be disabled. | 4213 | * preempt must be disabled. |
| 3373 | */ | 4214 | */ |
| 3374 | static int | 4215 | static int |
| 3375 | select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | 4216 | select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) |
| 3376 | { | 4217 | { |
| 3377 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; | 4218 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; |
| 3378 | int cpu = smp_processor_id(); | 4219 | int cpu = smp_processor_id(); |
| 3379 | int prev_cpu = task_cpu(p); | ||
| 3380 | int new_cpu = cpu; | 4220 | int new_cpu = cpu; |
| 3381 | int want_affine = 0; | 4221 | int want_affine = 0; |
| 3382 | int sync = wake_flags & WF_SYNC; | 4222 | int sync = wake_flags & WF_SYNC; |
| @@ -3856,9 +4696,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
| 3856 | 4696 | ||
| 3857 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | 4697 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; |
| 3858 | 4698 | ||
| 4699 | enum fbq_type { regular, remote, all }; | ||
| 4700 | |||
| 3859 | #define LBF_ALL_PINNED 0x01 | 4701 | #define LBF_ALL_PINNED 0x01 |
| 3860 | #define LBF_NEED_BREAK 0x02 | 4702 | #define LBF_NEED_BREAK 0x02 |
| 3861 | #define LBF_SOME_PINNED 0x04 | 4703 | #define LBF_DST_PINNED 0x04 |
| 4704 | #define LBF_SOME_PINNED 0x08 | ||
| 3862 | 4705 | ||
| 3863 | struct lb_env { | 4706 | struct lb_env { |
| 3864 | struct sched_domain *sd; | 4707 | struct sched_domain *sd; |
| @@ -3881,6 +4724,8 @@ struct lb_env { | |||
| 3881 | unsigned int loop; | 4724 | unsigned int loop; |
| 3882 | unsigned int loop_break; | 4725 | unsigned int loop_break; |
| 3883 | unsigned int loop_max; | 4726 | unsigned int loop_max; |
| 4727 | |||
| 4728 | enum fbq_type fbq_type; | ||
| 3884 | }; | 4729 | }; |
| 3885 | 4730 | ||
| 3886 | /* | 4731 | /* |
| @@ -3927,6 +4772,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
| 3927 | return delta < (s64)sysctl_sched_migration_cost; | 4772 | return delta < (s64)sysctl_sched_migration_cost; |
| 3928 | } | 4773 | } |
| 3929 | 4774 | ||
| 4775 | #ifdef CONFIG_NUMA_BALANCING | ||
| 4776 | /* Returns true if the destination node has incurred more faults */ | ||
| 4777 | static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) | ||
| 4778 | { | ||
| 4779 | int src_nid, dst_nid; | ||
| 4780 | |||
| 4781 | if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || | ||
| 4782 | !(env->sd->flags & SD_NUMA)) { | ||
| 4783 | return false; | ||
| 4784 | } | ||
| 4785 | |||
| 4786 | src_nid = cpu_to_node(env->src_cpu); | ||
| 4787 | dst_nid = cpu_to_node(env->dst_cpu); | ||
| 4788 | |||
| 4789 | if (src_nid == dst_nid) | ||
| 4790 | return false; | ||
| 4791 | |||
| 4792 | /* Always encourage migration to the preferred node. */ | ||
| 4793 | if (dst_nid == p->numa_preferred_nid) | ||
| 4794 | return true; | ||
| 4795 | |||
| 4796 | /* If both task and group weight improve, this move is a winner. */ | ||
| 4797 | if (task_weight(p, dst_nid) > task_weight(p, src_nid) && | ||
| 4798 | group_weight(p, dst_nid) > group_weight(p, src_nid)) | ||
| 4799 | return true; | ||
| 4800 | |||
| 4801 | return false; | ||
| 4802 | } | ||
| 4803 | |||
| 4804 | |||
| 4805 | static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | ||
| 4806 | { | ||
| 4807 | int src_nid, dst_nid; | ||
| 4808 | |||
| 4809 | if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) | ||
| 4810 | return false; | ||
| 4811 | |||
| 4812 | if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) | ||
| 4813 | return false; | ||
| 4814 | |||
| 4815 | src_nid = cpu_to_node(env->src_cpu); | ||
| 4816 | dst_nid = cpu_to_node(env->dst_cpu); | ||
| 4817 | |||
| 4818 | if (src_nid == dst_nid) | ||
| 4819 | return false; | ||
| 4820 | |||
| 4821 | /* Migrating away from the preferred node is always bad. */ | ||
| 4822 | if (src_nid == p->numa_preferred_nid) | ||
| 4823 | return true; | ||
| 4824 | |||
| 4825 | /* If either task or group weight get worse, don't do it. */ | ||
| 4826 | if (task_weight(p, dst_nid) < task_weight(p, src_nid) || | ||
| 4827 | group_weight(p, dst_nid) < group_weight(p, src_nid)) | ||
| 4828 | return true; | ||
| 4829 | |||
| 4830 | return false; | ||
| 4831 | } | ||
| 4832 | |||
| 4833 | #else | ||
| 4834 | static inline bool migrate_improves_locality(struct task_struct *p, | ||
| 4835 | struct lb_env *env) | ||
| 4836 | { | ||
| 4837 | return false; | ||
| 4838 | } | ||
| 4839 | |||
| 4840 | static inline bool migrate_degrades_locality(struct task_struct *p, | ||
| 4841 | struct lb_env *env) | ||
| 4842 | { | ||
| 4843 | return false; | ||
| 4844 | } | ||
| 4845 | #endif | ||
| 4846 | |||
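migrate_improves_locality() and migrate_degrades_locality() boil down to comparing the task and group weights of the source and destination nodes, with the preferred node acting as a trump card in both directions. A compact illustration of that decision logic, feeding in weights directly instead of deriving them from fault arrays:

#include <stdio.h>
#include <stdbool.h>

struct locality {
	int preferred_nid;
	unsigned long task_w[2];	/* per-node task weight, 0..1000 */
	unsigned long group_w[2];	/* per-node group weight, 0..1000 */
};

/* destination has seen more faults: encourage the move */
static bool improves_locality(const struct locality *l, int src, int dst)
{
	if (src == dst)
		return false;
	if (dst == l->preferred_nid)
		return true;
	return l->task_w[dst] > l->task_w[src] &&
	       l->group_w[dst] > l->group_w[src];
}

/* moving would hurt either the task or its group: resist it */
static bool degrades_locality(const struct locality *l, int src, int dst)
{
	if (src == dst)
		return false;
	if (src == l->preferred_nid)
		return true;
	return l->task_w[dst] < l->task_w[src] ||
	       l->group_w[dst] < l->group_w[src];
}

int main(void)
{
	struct locality l = {
		.preferred_nid = 1,
		.task_w  = { 300, 700 },
		.group_w = { 400, 600 },
	};

	printf("0->1 improves=%d degrades=%d\n",
	       improves_locality(&l, 0, 1), degrades_locality(&l, 0, 1));
	printf("1->0 improves=%d degrades=%d\n",
	       improves_locality(&l, 1, 0), degrades_locality(&l, 1, 0));
	return 0;
}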
| 3930 | /* | 4847 | /* |
| 3931 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 4848 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
| 3932 | */ | 4849 | */ |
| @@ -3949,6 +4866,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 3949 | 4866 | ||
| 3950 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 4867 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
| 3951 | 4868 | ||
| 4869 | env->flags |= LBF_SOME_PINNED; | ||
| 4870 | |||
| 3952 | /* | 4871 | /* |
| 3953 | * Remember if this task can be migrated to any other cpu in | 4872 | * Remember if this task can be migrated to any other cpu in |
| 3954 | * our sched_group. We may want to revisit it if we couldn't | 4873 | * our sched_group. We may want to revisit it if we couldn't |
| @@ -3957,13 +4876,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 3957 | * Also avoid computing new_dst_cpu if we have already computed | 4876 | * Also avoid computing new_dst_cpu if we have already computed |
| 3958 | * one in current iteration. | 4877 | * one in current iteration. |
| 3959 | */ | 4878 | */ |
| 3960 | if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) | 4879 | if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) |
| 3961 | return 0; | 4880 | return 0; |
| 3962 | 4881 | ||
| 3963 | /* Prevent to re-select dst_cpu via env's cpus */ | 4882 | /* Prevent to re-select dst_cpu via env's cpus */ |
| 3964 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { | 4883 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { |
| 3965 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { | 4884 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { |
| 3966 | env->flags |= LBF_SOME_PINNED; | 4885 | env->flags |= LBF_DST_PINNED; |
| 3967 | env->new_dst_cpu = cpu; | 4886 | env->new_dst_cpu = cpu; |
| 3968 | break; | 4887 | break; |
| 3969 | } | 4888 | } |
| @@ -3982,11 +4901,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 3982 | 4901 | ||
| 3983 | /* | 4902 | /* |
| 3984 | * Aggressive migration if: | 4903 | * Aggressive migration if: |
| 3985 | * 1) task is cache cold, or | 4904 | * 1) destination numa is preferred |
| 3986 | * 2) too many balance attempts have failed. | 4905 | * 2) task is cache cold, or |
| 4906 | * 3) too many balance attempts have failed. | ||
| 3987 | */ | 4907 | */ |
| 3988 | |||
| 3989 | tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); | 4908 | tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); |
| 4909 | if (!tsk_cache_hot) | ||
| 4910 | tsk_cache_hot = migrate_degrades_locality(p, env); | ||
| 4911 | |||
| 4912 | if (migrate_improves_locality(p, env)) { | ||
| 4913 | #ifdef CONFIG_SCHEDSTATS | ||
| 4914 | if (tsk_cache_hot) { | ||
| 4915 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | ||
| 4916 | schedstat_inc(p, se.statistics.nr_forced_migrations); | ||
| 4917 | } | ||
| 4918 | #endif | ||
| 4919 | return 1; | ||
| 4920 | } | ||
| 4921 | |||
| 3990 | if (!tsk_cache_hot || | 4922 | if (!tsk_cache_hot || |
| 3991 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | 4923 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
| 3992 | 4924 | ||
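The hunk above reorders the migration decision in can_migrate_task(): a task whose move would improve its NUMA locality is migrated outright, a task whose move would degrade locality is treated as cache-hot, and a cache-hot task otherwise moves only after enough failed balance attempts. A minimal userspace sketch of that ordering, with the kernel helpers replaced by plain boolean and counter inputs (all values hypothetical):

#include <stdbool.h>
#include <stdio.h>

/* Sketch only: the inputs stand in for migrate_improves_locality(),
 * task_hot()/migrate_degrades_locality() and the sched_domain counters. */
static bool should_migrate(bool improves_locality, bool cache_hot,
                           unsigned int nr_balance_failed,
                           unsigned int cache_nice_tries)
{
        if (improves_locality)          /* preferred NUMA node wins outright */
                return true;
        if (!cache_hot)                 /* cache-cold tasks are cheap to move */
                return true;
        /* cache-hot: only force it after repeated failed balance attempts */
        return nr_balance_failed > cache_nice_tries;
}

int main(void)
{
        printf("%d\n", should_migrate(false, true, 1, 1)); /* 0: hot, too few failures */
        printf("%d\n", should_migrate(false, true, 2, 1)); /* 1: forced after failures */
        printf("%d\n", should_migrate(true,  true, 0, 1)); /* 1: locality improves */
        return 0;
}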
| @@ -4029,8 +4961,6 @@ static int move_one_task(struct lb_env *env) | |||
| 4029 | return 0; | 4961 | return 0; |
| 4030 | } | 4962 | } |
| 4031 | 4963 | ||
| 4032 | static unsigned long task_h_load(struct task_struct *p); | ||
| 4033 | |||
| 4034 | static const unsigned int sched_nr_migrate_break = 32; | 4964 | static const unsigned int sched_nr_migrate_break = 32; |
| 4035 | 4965 | ||
| 4036 | /* | 4966 | /* |
| @@ -4171,47 +5101,48 @@ static void update_blocked_averages(int cpu) | |||
| 4171 | } | 5101 | } |
| 4172 | 5102 | ||
| 4173 | /* | 5103 | /* |
| 4174 | * Compute the cpu's hierarchical load factor for each task group. | 5104 | * Compute the hierarchical load factor for cfs_rq and all its ascendants. |
| 4175 | * This needs to be done in a top-down fashion because the load of a child | 5105 | * This needs to be done in a top-down fashion because the load of a child |
| 4176 | * group is a fraction of its parent's load. | 5106 | * group is a fraction of its parent's load. |
| 4177 | */ | 5107 | */ |
| 4178 | static int tg_load_down(struct task_group *tg, void *data) | 5108 | static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) |
| 4179 | { | ||
| 4180 | unsigned long load; | ||
| 4181 | long cpu = (long)data; | ||
| 4182 | |||
| 4183 | if (!tg->parent) { | ||
| 4184 | load = cpu_rq(cpu)->avg.load_avg_contrib; | ||
| 4185 | } else { | ||
| 4186 | load = tg->parent->cfs_rq[cpu]->h_load; | ||
| 4187 | load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib, | ||
| 4188 | tg->parent->cfs_rq[cpu]->runnable_load_avg + 1); | ||
| 4189 | } | ||
| 4190 | |||
| 4191 | tg->cfs_rq[cpu]->h_load = load; | ||
| 4192 | |||
| 4193 | return 0; | ||
| 4194 | } | ||
| 4195 | |||
| 4196 | static void update_h_load(long cpu) | ||
| 4197 | { | 5109 | { |
| 4198 | struct rq *rq = cpu_rq(cpu); | 5110 | struct rq *rq = rq_of(cfs_rq); |
| 5111 | struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; | ||
| 4199 | unsigned long now = jiffies; | 5112 | unsigned long now = jiffies; |
| 5113 | unsigned long load; | ||
| 4200 | 5114 | ||
| 4201 | if (rq->h_load_throttle == now) | 5115 | if (cfs_rq->last_h_load_update == now) |
| 4202 | return; | 5116 | return; |
| 4203 | 5117 | ||
| 4204 | rq->h_load_throttle = now; | 5118 | cfs_rq->h_load_next = NULL; |
| 5119 | for_each_sched_entity(se) { | ||
| 5120 | cfs_rq = cfs_rq_of(se); | ||
| 5121 | cfs_rq->h_load_next = se; | ||
| 5122 | if (cfs_rq->last_h_load_update == now) | ||
| 5123 | break; | ||
| 5124 | } | ||
| 5125 | |||
| 5126 | if (!se) { | ||
| 5127 | cfs_rq->h_load = cfs_rq->runnable_load_avg; | ||
| 5128 | cfs_rq->last_h_load_update = now; | ||
| 5129 | } | ||
| 4205 | 5130 | ||
| 4206 | rcu_read_lock(); | 5131 | while ((se = cfs_rq->h_load_next) != NULL) { |
| 4207 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 5132 | load = cfs_rq->h_load; |
| 4208 | rcu_read_unlock(); | 5133 | load = div64_ul(load * se->avg.load_avg_contrib, |
| 5134 | cfs_rq->runnable_load_avg + 1); | ||
| 5135 | cfs_rq = group_cfs_rq(se); | ||
| 5136 | cfs_rq->h_load = load; | ||
| 5137 | cfs_rq->last_h_load_update = now; | ||
| 5138 | } | ||
| 4209 | } | 5139 | } |
| 4210 | 5140 | ||
| 4211 | static unsigned long task_h_load(struct task_struct *p) | 5141 | static unsigned long task_h_load(struct task_struct *p) |
| 4212 | { | 5142 | { |
| 4213 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 5143 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
| 4214 | 5144 | ||
| 5145 | update_cfs_rq_h_load(cfs_rq); | ||
| 4215 | return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, | 5146 | return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, |
| 4216 | cfs_rq->runnable_load_avg + 1); | 5147 | cfs_rq->runnable_load_avg + 1); |
| 4217 | } | 5148 | } |
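The rewritten helper walks only the path from the task's cfs_rq to the root, caching per-level results via h_load_next/last_h_load_update instead of recomputing every task group with walk_tg_tree(). The per-level arithmetic is unchanged: each level's h_load is the parent's h_load scaled by the entity's share of the parent's runnable load. A standalone sketch of that scaling over a two-level hierarchy, with invented load numbers:

#include <stdint.h>
#include <stdio.h>

/* h_load(child) = h_load(parent) * se_contrib / (parent_runnable_load + 1) */
static uint64_t scale_h_load(uint64_t parent_h_load, uint64_t se_contrib,
                             uint64_t parent_runnable_load)
{
        return parent_h_load * se_contrib / (parent_runnable_load + 1);
}

int main(void)
{
        /* root cfs_rq: its h_load is simply its runnable load */
        uint64_t root_h_load = 2048;

        /* a group entity contributing 512 to the root's runnable load of 2048 */
        uint64_t tg_h_load = scale_h_load(root_h_load, 512, 2048);

        /* a task contributing 256 to that group's runnable load of 512 */
        uint64_t task_h_load = scale_h_load(tg_h_load, 256, 512);

        printf("group h_load = %llu, task h_load = %llu\n",
               (unsigned long long)tg_h_load, (unsigned long long)task_h_load);
        return 0;
}

With those numbers a task contributing half of its group's runnable load ends up with roughly half of the group's h_load, as expected.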
| @@ -4220,10 +5151,6 @@ static inline void update_blocked_averages(int cpu) | |||
| 4220 | { | 5151 | { |
| 4221 | } | 5152 | } |
| 4222 | 5153 | ||
| 4223 | static inline void update_h_load(long cpu) | ||
| 4224 | { | ||
| 4225 | } | ||
| 4226 | |||
| 4227 | static unsigned long task_h_load(struct task_struct *p) | 5154 | static unsigned long task_h_load(struct task_struct *p) |
| 4228 | { | 5155 | { |
| 4229 | return p->se.avg.load_avg_contrib; | 5156 | return p->se.avg.load_avg_contrib; |
| @@ -4232,54 +5159,66 @@ static unsigned long task_h_load(struct task_struct *p) | |||
| 4232 | 5159 | ||
| 4233 | /********** Helpers for find_busiest_group ************************/ | 5160 | /********** Helpers for find_busiest_group ************************/ |
| 4234 | /* | 5161 | /* |
| 4235 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
| 4236 | * during load balancing. | ||
| 4237 | */ | ||
| 4238 | struct sd_lb_stats { | ||
| 4239 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
| 4240 | struct sched_group *this; /* Local group in this sd */ | ||
| 4241 | unsigned long total_load; /* Total load of all groups in sd */ | ||
| 4242 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
| 4243 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
| 4244 | |||
| 4245 | /** Statistics of this group */ | ||
| 4246 | unsigned long this_load; | ||
| 4247 | unsigned long this_load_per_task; | ||
| 4248 | unsigned long this_nr_running; | ||
| 4249 | unsigned long this_has_capacity; | ||
| 4250 | unsigned int this_idle_cpus; | ||
| 4251 | |||
| 4252 | /* Statistics of the busiest group */ | ||
| 4253 | unsigned int busiest_idle_cpus; | ||
| 4254 | unsigned long max_load; | ||
| 4255 | unsigned long busiest_load_per_task; | ||
| 4256 | unsigned long busiest_nr_running; | ||
| 4257 | unsigned long busiest_group_capacity; | ||
| 4258 | unsigned long busiest_has_capacity; | ||
| 4259 | unsigned int busiest_group_weight; | ||
| 4260 | |||
| 4261 | int group_imb; /* Is there imbalance in this sd */ | ||
| 4262 | }; | ||
| 4263 | |||
| 4264 | /* | ||
| 4265 | * sg_lb_stats - stats of a sched_group required for load_balancing | 5162 | * sg_lb_stats - stats of a sched_group required for load_balancing |
| 4266 | */ | 5163 | */ |
| 4267 | struct sg_lb_stats { | 5164 | struct sg_lb_stats { |
| 4268 | unsigned long avg_load; /*Avg load across the CPUs of the group */ | 5165 | unsigned long avg_load; /*Avg load across the CPUs of the group */ |
| 4269 | unsigned long group_load; /* Total load over the CPUs of the group */ | 5166 | unsigned long group_load; /* Total load over the CPUs of the group */ |
| 4270 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | ||
| 4271 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 5167 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
| 4272 | unsigned long group_capacity; | 5168 | unsigned long load_per_task; |
| 4273 | unsigned long idle_cpus; | 5169 | unsigned long group_power; |
| 4274 | unsigned long group_weight; | 5170 | unsigned int sum_nr_running; /* Nr tasks running in the group */ |
| 5171 | unsigned int group_capacity; | ||
| 5172 | unsigned int idle_cpus; | ||
| 5173 | unsigned int group_weight; | ||
| 4275 | int group_imb; /* Is there an imbalance in the group ? */ | 5174 | int group_imb; /* Is there an imbalance in the group ? */ |
| 4276 | int group_has_capacity; /* Is there extra capacity in the group? */ | 5175 | int group_has_capacity; /* Is there extra capacity in the group? */ |
| 5176 | #ifdef CONFIG_NUMA_BALANCING | ||
| 5177 | unsigned int nr_numa_running; | ||
| 5178 | unsigned int nr_preferred_running; | ||
| 5179 | #endif | ||
| 5180 | }; | ||
| 5181 | |||
| 5182 | /* | ||
| 5183 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
| 5184 | * during load balancing. | ||
| 5185 | */ | ||
| 5186 | struct sd_lb_stats { | ||
| 5187 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
| 5188 | struct sched_group *local; /* Local group in this sd */ | ||
| 5189 | unsigned long total_load; /* Total load of all groups in sd */ | ||
| 5190 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
| 5191 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
| 5192 | |||
| 5193 | struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ | ||
| 5194 | struct sg_lb_stats local_stat; /* Statistics of the local group */ | ||
| 4277 | }; | 5195 | }; |
| 4278 | 5196 | ||
| 5197 | static inline void init_sd_lb_stats(struct sd_lb_stats *sds) | ||
| 5198 | { | ||
| 5199 | /* | ||
| 5200 | * Skimp on the clearing to avoid duplicate work. We can avoid clearing | ||
| 5201 | * local_stat because update_sg_lb_stats() does a full clear/assignment. | ||
| 5202 | * We must however clear busiest_stat::avg_load because | ||
| 5203 | * update_sd_pick_busiest() reads this before assignment. | ||
| 5204 | */ | ||
| 5205 | *sds = (struct sd_lb_stats){ | ||
| 5206 | .busiest = NULL, | ||
| 5207 | .local = NULL, | ||
| 5208 | .total_load = 0UL, | ||
| 5209 | .total_pwr = 0UL, | ||
| 5210 | .busiest_stat = { | ||
| 5211 | .avg_load = 0UL, | ||
| 5212 | }, | ||
| 5213 | }; | ||
| 5214 | } | ||
| 5215 | |||
| 4279 | /** | 5216 | /** |
| 4280 | * get_sd_load_idx - Obtain the load index for a given sched domain. | 5217 | * get_sd_load_idx - Obtain the load index for a given sched domain. |
| 4281 | * @sd: The sched_domain whose load_idx is to be obtained. | 5218 | * @sd: The sched_domain whose load_idx is to be obtained. |
| 4282 | * @idle: The Idle status of the CPU for whose sd load_icx is obtained. | 5219 | * @idle: The idle status of the CPU for whose sd load_idx is obtained. |
| 5220 | * | ||
| 5221 | * Return: The load index. | ||
| 4283 | */ | 5222 | */ |
| 4284 | static inline int get_sd_load_idx(struct sched_domain *sd, | 5223 | static inline int get_sd_load_idx(struct sched_domain *sd, |
| 4285 | enum cpu_idle_type idle) | 5224 | enum cpu_idle_type idle) |
| @@ -4394,7 +5333,7 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
| 4394 | { | 5333 | { |
| 4395 | struct sched_domain *child = sd->child; | 5334 | struct sched_domain *child = sd->child; |
| 4396 | struct sched_group *group, *sdg = sd->groups; | 5335 | struct sched_group *group, *sdg = sd->groups; |
| 4397 | unsigned long power; | 5336 | unsigned long power, power_orig; |
| 4398 | unsigned long interval; | 5337 | unsigned long interval; |
| 4399 | 5338 | ||
| 4400 | interval = msecs_to_jiffies(sd->balance_interval); | 5339 | interval = msecs_to_jiffies(sd->balance_interval); |
| @@ -4406,7 +5345,7 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
| 4406 | return; | 5345 | return; |
| 4407 | } | 5346 | } |
| 4408 | 5347 | ||
| 4409 | power = 0; | 5348 | power_orig = power = 0; |
| 4410 | 5349 | ||
| 4411 | if (child->flags & SD_OVERLAP) { | 5350 | if (child->flags & SD_OVERLAP) { |
| 4412 | /* | 5351 | /* |
| @@ -4414,8 +5353,12 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
| 4414 | * span the current group. | 5353 | * span the current group. |
| 4415 | */ | 5354 | */ |
| 4416 | 5355 | ||
| 4417 | for_each_cpu(cpu, sched_group_cpus(sdg)) | 5356 | for_each_cpu(cpu, sched_group_cpus(sdg)) { |
| 4418 | power += power_of(cpu); | 5357 | struct sched_group *sg = cpu_rq(cpu)->sd->groups; |
| 5358 | |||
| 5359 | power_orig += sg->sgp->power_orig; | ||
| 5360 | power += sg->sgp->power; | ||
| 5361 | } | ||
| 4419 | } else { | 5362 | } else { |
| 4420 | /* | 5363 | /* |
| 4421 | * !SD_OVERLAP domains can assume that child groups | 5364 | * !SD_OVERLAP domains can assume that child groups |
| @@ -4424,12 +5367,14 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
| 4424 | 5367 | ||
| 4425 | group = child->groups; | 5368 | group = child->groups; |
| 4426 | do { | 5369 | do { |
| 5370 | power_orig += group->sgp->power_orig; | ||
| 4427 | power += group->sgp->power; | 5371 | power += group->sgp->power; |
| 4428 | group = group->next; | 5372 | group = group->next; |
| 4429 | } while (group != child->groups); | 5373 | } while (group != child->groups); |
| 4430 | } | 5374 | } |
| 4431 | 5375 | ||
| 4432 | sdg->sgp->power_orig = sdg->sgp->power = power; | 5376 | sdg->sgp->power_orig = power_orig; |
| 5377 | sdg->sgp->power = power; | ||
| 4433 | } | 5378 | } |
| 4434 | 5379 | ||
| 4435 | /* | 5380 | /* |
| @@ -4457,33 +5402,84 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
| 4457 | return 0; | 5402 | return 0; |
| 4458 | } | 5403 | } |
| 4459 | 5404 | ||
| 5405 | /* | ||
| 5406 | * Group imbalance indicates (and tries to solve) the problem where balancing | ||
| 5407 | * groups is inadequate due to tsk_cpus_allowed() constraints. | ||
| 5408 | * | ||
| 5409 | * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a | ||
| 5410 | * cpumask covering 1 cpu of the first group and 3 cpus of the second group. | ||
| 5411 | * Something like: | ||
| 5412 | * | ||
| 5413 | * { 0 1 2 3 } { 4 5 6 7 } | ||
| 5414 | * * * * * | ||
| 5415 | * | ||
| 5416 | * If we were to balance group-wise we'd place two tasks in the first group and | ||
| 5417 | * two tasks in the second group. Clearly this is undesired as it will overload | ||
| 5418 | * cpu 3 and leave one of the cpus in the second group unused. | ||
| 5419 | * | ||
| 5420 | * The current solution to this issue is detecting the skew in the first group | ||
| 5421 | * by noticing the lower domain failed to reach balance and had difficulty | ||
| 5422 | * moving tasks due to affinity constraints. | ||
| 5423 | * | ||
| 5424 | * When this is so detected; this group becomes a candidate for busiest; see | ||
| 5425 | * update_sd_pick_busiest(). And calculate_imbalance() and | ||
| 5426 | * find_busiest_group() avoid some of the usual balance conditions to allow it | ||
| 5427 | * to create an effective group imbalance. | ||
| 5428 | * | ||
| 5429 | * This is a somewhat tricky proposition since the next run might not find the | ||
| 5430 | * group imbalance and decide the groups need to be balanced again. A most | ||
| 5431 | * subtle and fragile situation. | ||
| 5432 | */ | ||
| 5433 | |||
| 5434 | static inline int sg_imbalanced(struct sched_group *group) | ||
| 5435 | { | ||
| 5436 | return group->sgp->imbalance; | ||
| 5437 | } | ||
| 5438 | |||
| 5439 | /* | ||
| 5440 | * Compute the group capacity. | ||
| 5441 | * | ||
| 5442 | * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by | ||
| 5443 | * first dividing out the smt factor and computing the actual number of cores | ||
| 5444 | * and limit power unit capacity with that. | ||
| 5445 | */ | ||
| 5446 | static inline int sg_capacity(struct lb_env *env, struct sched_group *group) | ||
| 5447 | { | ||
| 5448 | unsigned int capacity, smt, cpus; | ||
| 5449 | unsigned int power, power_orig; | ||
| 5450 | |||
| 5451 | power = group->sgp->power; | ||
| 5452 | power_orig = group->sgp->power_orig; | ||
| 5453 | cpus = group->group_weight; | ||
| 5454 | |||
| 5455 | /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ | ||
| 5456 | smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); | ||
| 5457 | capacity = cpus / smt; /* cores */ | ||
| 5458 | |||
| 5459 | capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); | ||
| 5460 | if (!capacity) | ||
| 5461 | capacity = fix_small_capacity(env->sd, group); | ||
| 5462 | |||
| 5463 | return capacity; | ||
| 5464 | } | ||
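The 'phantom core' issue: with SMT each hardware thread typically reports a power a bit above half of SCHED_POWER_SCALE (1024), so summing thread powers and rounding can yield more capacity units than there are physical cores. sg_capacity() therefore recovers the core count from power_orig first and uses it as a ceiling. A userspace re-derivation of the same arithmetic (the fix_small_capacity() fallback is omitted, and the per-thread power of 589 is only an illustrative value):

#include <stdio.h>

#define SCHED_POWER_SCALE 1024u
#define DIV_ROUND_UP(n, d)       (((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d)  (((n) + (d) / 2) / (d))

/* Same arithmetic as sg_capacity(), with the group fields passed in. */
static unsigned int sg_capacity(unsigned int cpus, unsigned int power,
                                unsigned int power_orig)
{
        unsigned int smt, capacity;

        /* smt = threads per core, recovered from the unscaled group power */
        smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
        capacity = cpus / smt;                  /* physical cores */

        if (capacity > DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE))
                capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
        return capacity;
}

int main(void)
{
        /* 8 SMT threads at ~589 power each */
        unsigned int cpus = 8, power = 8 * 589, power_orig = 8 * 589;

        printf("old capacity: %u\n", DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
        printf("new capacity: %u\n", sg_capacity(cpus, power, power_orig));
        return 0;
}

For 8 SMT threads the old DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE) computation reports 5 capacity units, while the new path caps it at the 4 physical cores.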
| 5465 | |||
| 4460 | /** | 5466 | /** |
| 4461 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 5467 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
| 4462 | * @env: The load balancing environment. | 5468 | * @env: The load balancing environment. |
| 4463 | * @group: sched_group whose statistics are to be updated. | 5469 | * @group: sched_group whose statistics are to be updated. |
| 4464 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 5470 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
| 4465 | * @local_group: Does group contain this_cpu. | 5471 | * @local_group: Does group contain this_cpu. |
| 4466 | * @balance: Should we balance. | ||
| 4467 | * @sgs: variable to hold the statistics for this group. | 5472 | * @sgs: variable to hold the statistics for this group. |
| 4468 | */ | 5473 | */ |
| 4469 | static inline void update_sg_lb_stats(struct lb_env *env, | 5474 | static inline void update_sg_lb_stats(struct lb_env *env, |
| 4470 | struct sched_group *group, int load_idx, | 5475 | struct sched_group *group, int load_idx, |
| 4471 | int local_group, int *balance, struct sg_lb_stats *sgs) | 5476 | int local_group, struct sg_lb_stats *sgs) |
| 4472 | { | 5477 | { |
| 4473 | unsigned long nr_running, max_nr_running, min_nr_running; | 5478 | unsigned long nr_running; |
| 4474 | unsigned long load, max_cpu_load, min_cpu_load; | 5479 | unsigned long load; |
| 4475 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | ||
| 4476 | unsigned long avg_load_per_task = 0; | ||
| 4477 | int i; | 5480 | int i; |
| 4478 | 5481 | ||
| 4479 | if (local_group) | 5482 | memset(sgs, 0, sizeof(*sgs)); |
| 4480 | balance_cpu = group_balance_cpu(group); | ||
| 4481 | |||
| 4482 | /* Tally up the load of all CPUs in the group */ | ||
| 4483 | max_cpu_load = 0; | ||
| 4484 | min_cpu_load = ~0UL; | ||
| 4485 | max_nr_running = 0; | ||
| 4486 | min_nr_running = ~0UL; | ||
| 4487 | 5483 | ||
| 4488 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 5484 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
| 4489 | struct rq *rq = cpu_rq(i); | 5485 | struct rq *rq = cpu_rq(i); |
| @@ -4491,76 +5487,34 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 4491 | nr_running = rq->nr_running; | 5487 | nr_running = rq->nr_running; |
| 4492 | 5488 | ||
| 4493 | /* Bias balancing toward cpus of our domain */ | 5489 | /* Bias balancing toward cpus of our domain */ |
| 4494 | if (local_group) { | 5490 | if (local_group) |
| 4495 | if (idle_cpu(i) && !first_idle_cpu && | ||
| 4496 | cpumask_test_cpu(i, sched_group_mask(group))) { | ||
| 4497 | first_idle_cpu = 1; | ||
| 4498 | balance_cpu = i; | ||
| 4499 | } | ||
| 4500 | |||
| 4501 | load = target_load(i, load_idx); | 5491 | load = target_load(i, load_idx); |
| 4502 | } else { | 5492 | else |
| 4503 | load = source_load(i, load_idx); | 5493 | load = source_load(i, load_idx); |
| 4504 | if (load > max_cpu_load) | ||
| 4505 | max_cpu_load = load; | ||
| 4506 | if (min_cpu_load > load) | ||
| 4507 | min_cpu_load = load; | ||
| 4508 | |||
| 4509 | if (nr_running > max_nr_running) | ||
| 4510 | max_nr_running = nr_running; | ||
| 4511 | if (min_nr_running > nr_running) | ||
| 4512 | min_nr_running = nr_running; | ||
| 4513 | } | ||
| 4514 | 5494 | ||
| 4515 | sgs->group_load += load; | 5495 | sgs->group_load += load; |
| 4516 | sgs->sum_nr_running += nr_running; | 5496 | sgs->sum_nr_running += nr_running; |
| 5497 | #ifdef CONFIG_NUMA_BALANCING | ||
| 5498 | sgs->nr_numa_running += rq->nr_numa_running; | ||
| 5499 | sgs->nr_preferred_running += rq->nr_preferred_running; | ||
| 5500 | #endif | ||
| 4517 | sgs->sum_weighted_load += weighted_cpuload(i); | 5501 | sgs->sum_weighted_load += weighted_cpuload(i); |
| 4518 | if (idle_cpu(i)) | 5502 | if (idle_cpu(i)) |
| 4519 | sgs->idle_cpus++; | 5503 | sgs->idle_cpus++; |
| 4520 | } | 5504 | } |
| 4521 | 5505 | ||
| 4522 | /* | ||
| 4523 | * First idle cpu or the first cpu(busiest) in this sched group | ||
| 4524 | * is eligible for doing load balancing at this and above | ||
| 4525 | * domains. In the newly idle case, we will allow all the cpu's | ||
| 4526 | * to do the newly idle load balance. | ||
| 4527 | */ | ||
| 4528 | if (local_group) { | ||
| 4529 | if (env->idle != CPU_NEWLY_IDLE) { | ||
| 4530 | if (balance_cpu != env->dst_cpu) { | ||
| 4531 | *balance = 0; | ||
| 4532 | return; | ||
| 4533 | } | ||
| 4534 | update_group_power(env->sd, env->dst_cpu); | ||
| 4535 | } else if (time_after_eq(jiffies, group->sgp->next_update)) | ||
| 4536 | update_group_power(env->sd, env->dst_cpu); | ||
| 4537 | } | ||
| 4538 | |||
| 4539 | /* Adjust by relative CPU power of the group */ | 5506 | /* Adjust by relative CPU power of the group */ |
| 4540 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; | 5507 | sgs->group_power = group->sgp->power; |
| 5508 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; | ||
| 4541 | 5509 | ||
| 4542 | /* | ||
| 4543 | * Consider the group unbalanced when the imbalance is larger | ||
| 4544 | * than the average weight of a task. | ||
| 4545 | * | ||
| 4546 | * APZ: with cgroup the avg task weight can vary wildly and | ||
| 4547 | * might not be a suitable number - should we keep a | ||
| 4548 | * normalized nr_running number somewhere that negates | ||
| 4549 | * the hierarchy? | ||
| 4550 | */ | ||
| 4551 | if (sgs->sum_nr_running) | 5510 | if (sgs->sum_nr_running) |
| 4552 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 5511 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
| 4553 | 5512 | ||
| 4554 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && | ||
| 4555 | (max_nr_running - min_nr_running) > 1) | ||
| 4556 | sgs->group_imb = 1; | ||
| 4557 | |||
| 4558 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, | ||
| 4559 | SCHED_POWER_SCALE); | ||
| 4560 | if (!sgs->group_capacity) | ||
| 4561 | sgs->group_capacity = fix_small_capacity(env->sd, group); | ||
| 4562 | sgs->group_weight = group->group_weight; | 5513 | sgs->group_weight = group->group_weight; |
| 4563 | 5514 | ||
| 5515 | sgs->group_imb = sg_imbalanced(group); | ||
| 5516 | sgs->group_capacity = sg_capacity(env, group); | ||
| 5517 | |||
| 4564 | if (sgs->group_capacity > sgs->sum_nr_running) | 5518 | if (sgs->group_capacity > sgs->sum_nr_running) |
| 4565 | sgs->group_has_capacity = 1; | 5519 | sgs->group_has_capacity = 1; |
| 4566 | } | 5520 | } |
| @@ -4574,13 +5528,16 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 4574 | * | 5528 | * |
| 4575 | * Determine if @sg is a busier group than the previously selected | 5529 | * Determine if @sg is a busier group than the previously selected |
| 4576 | * busiest group. | 5530 | * busiest group. |
| 5531 | * | ||
| 5532 | * Return: %true if @sg is a busier group than the previously selected | ||
| 5533 | * busiest group. %false otherwise. | ||
| 4577 | */ | 5534 | */ |
| 4578 | static bool update_sd_pick_busiest(struct lb_env *env, | 5535 | static bool update_sd_pick_busiest(struct lb_env *env, |
| 4579 | struct sd_lb_stats *sds, | 5536 | struct sd_lb_stats *sds, |
| 4580 | struct sched_group *sg, | 5537 | struct sched_group *sg, |
| 4581 | struct sg_lb_stats *sgs) | 5538 | struct sg_lb_stats *sgs) |
| 4582 | { | 5539 | { |
| 4583 | if (sgs->avg_load <= sds->max_load) | 5540 | if (sgs->avg_load <= sds->busiest_stat.avg_load) |
| 4584 | return false; | 5541 | return false; |
| 4585 | 5542 | ||
| 4586 | if (sgs->sum_nr_running > sgs->group_capacity) | 5543 | if (sgs->sum_nr_running > sgs->group_capacity) |
| @@ -4606,18 +5563,46 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
| 4606 | return false; | 5563 | return false; |
| 4607 | } | 5564 | } |
| 4608 | 5565 | ||
| 5566 | #ifdef CONFIG_NUMA_BALANCING | ||
| 5567 | static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) | ||
| 5568 | { | ||
| 5569 | if (sgs->sum_nr_running > sgs->nr_numa_running) | ||
| 5570 | return regular; | ||
| 5571 | if (sgs->sum_nr_running > sgs->nr_preferred_running) | ||
| 5572 | return remote; | ||
| 5573 | return all; | ||
| 5574 | } | ||
| 5575 | |||
| 5576 | static inline enum fbq_type fbq_classify_rq(struct rq *rq) | ||
| 5577 | { | ||
| 5578 | if (rq->nr_running > rq->nr_numa_running) | ||
| 5579 | return regular; | ||
| 5580 | if (rq->nr_running > rq->nr_preferred_running) | ||
| 5581 | return remote; | ||
| 5582 | return all; | ||
| 5583 | } | ||
| 5584 | #else | ||
| 5585 | static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) | ||
| 5586 | { | ||
| 5587 | return all; | ||
| 5588 | } | ||
| 5589 | |||
| 5590 | static inline enum fbq_type fbq_classify_rq(struct rq *rq) | ||
| 5591 | { | ||
| 5592 | return regular; | ||
| 5593 | } | ||
| 5594 | #endif /* CONFIG_NUMA_BALANCING */ | ||
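The classification only works because of the ordering assumed below (regular < remote < all); find_busiest_queue() later skips a runqueue whose class exceeds the busiest group's class (rt > env->fbq_type), so well-placed NUMA tasks are disturbed only when nothing better is available. A small sketch of that eligibility rule, assuming the enum is declared in that order:

#include <stdio.h>

enum fbq_type { regular, remote, all };         /* must stay in this order */

static const char *name[] = { "regular", "remote", "all" };

int main(void)
{
        enum fbq_type group, rq;

        /* a runqueue is eligible iff its class does not exceed the group's */
        for (group = regular; group <= all; group++)
                for (rq = regular; rq <= all; rq++)
                        printf("group=%-7s rq=%-7s -> %s\n",
                               name[group], name[rq],
                               rq > group ? "skip" : "consider");
        return 0;
}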
| 5595 | |||
| 4609 | /** | 5596 | /** |
| 4610 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. | 5597 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. |
| 4611 | * @env: The load balancing environment. | 5598 | * @env: The load balancing environment. |
| 4612 | * @balance: Should we balance. | ||
| 4613 | * @sds: variable to hold the statistics for this sched_domain. | 5599 | * @sds: variable to hold the statistics for this sched_domain. |
| 4614 | */ | 5600 | */ |
| 4615 | static inline void update_sd_lb_stats(struct lb_env *env, | 5601 | static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) |
| 4616 | int *balance, struct sd_lb_stats *sds) | ||
| 4617 | { | 5602 | { |
| 4618 | struct sched_domain *child = env->sd->child; | 5603 | struct sched_domain *child = env->sd->child; |
| 4619 | struct sched_group *sg = env->sd->groups; | 5604 | struct sched_group *sg = env->sd->groups; |
| 4620 | struct sg_lb_stats sgs; | 5605 | struct sg_lb_stats tmp_sgs; |
| 4621 | int load_idx, prefer_sibling = 0; | 5606 | int load_idx, prefer_sibling = 0; |
| 4622 | 5607 | ||
| 4623 | if (child && child->flags & SD_PREFER_SIBLING) | 5608 | if (child && child->flags & SD_PREFER_SIBLING) |
| @@ -4626,17 +5611,23 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
| 4626 | load_idx = get_sd_load_idx(env->sd, env->idle); | 5611 | load_idx = get_sd_load_idx(env->sd, env->idle); |
| 4627 | 5612 | ||
| 4628 | do { | 5613 | do { |
| 5614 | struct sg_lb_stats *sgs = &tmp_sgs; | ||
| 4629 | int local_group; | 5615 | int local_group; |
| 4630 | 5616 | ||
| 4631 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); | 5617 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); |
| 4632 | memset(&sgs, 0, sizeof(sgs)); | 5618 | if (local_group) { |
| 4633 | update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); | 5619 | sds->local = sg; |
| 5620 | sgs = &sds->local_stat; | ||
| 4634 | 5621 | ||
| 4635 | if (local_group && !(*balance)) | 5622 | if (env->idle != CPU_NEWLY_IDLE || |
| 4636 | return; | 5623 | time_after_eq(jiffies, sg->sgp->next_update)) |
| 5624 | update_group_power(env->sd, env->dst_cpu); | ||
| 5625 | } | ||
| 4637 | 5626 | ||
| 4638 | sds->total_load += sgs.group_load; | 5627 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); |
| 4639 | sds->total_pwr += sg->sgp->power; | 5628 | |
| 5629 | if (local_group) | ||
| 5630 | goto next_group; | ||
| 4640 | 5631 | ||
| 4641 | /* | 5632 | /* |
| 4642 | * In case the child domain prefers tasks go to siblings | 5633 | * In case the child domain prefers tasks go to siblings |
| @@ -4648,30 +5639,25 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
| 4648 | * heaviest group when it is already under-utilized (possible | 5639 | * heaviest group when it is already under-utilized (possible |
| 4649 | * with a large weight task outweighs the tasks on the system). | 5640 | * with a large weight task outweighs the tasks on the system). |
| 4650 | */ | 5641 | */ |
| 4651 | if (prefer_sibling && !local_group && sds->this_has_capacity) | 5642 | if (prefer_sibling && sds->local && |
| 4652 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | 5643 | sds->local_stat.group_has_capacity) |
| 5644 | sgs->group_capacity = min(sgs->group_capacity, 1U); | ||
| 4653 | 5645 | ||
| 4654 | if (local_group) { | 5646 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
| 4655 | sds->this_load = sgs.avg_load; | ||
| 4656 | sds->this = sg; | ||
| 4657 | sds->this_nr_running = sgs.sum_nr_running; | ||
| 4658 | sds->this_load_per_task = sgs.sum_weighted_load; | ||
| 4659 | sds->this_has_capacity = sgs.group_has_capacity; | ||
| 4660 | sds->this_idle_cpus = sgs.idle_cpus; | ||
| 4661 | } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { | ||
| 4662 | sds->max_load = sgs.avg_load; | ||
| 4663 | sds->busiest = sg; | 5647 | sds->busiest = sg; |
| 4664 | sds->busiest_nr_running = sgs.sum_nr_running; | 5648 | sds->busiest_stat = *sgs; |
| 4665 | sds->busiest_idle_cpus = sgs.idle_cpus; | ||
| 4666 | sds->busiest_group_capacity = sgs.group_capacity; | ||
| 4667 | sds->busiest_load_per_task = sgs.sum_weighted_load; | ||
| 4668 | sds->busiest_has_capacity = sgs.group_has_capacity; | ||
| 4669 | sds->busiest_group_weight = sgs.group_weight; | ||
| 4670 | sds->group_imb = sgs.group_imb; | ||
| 4671 | } | 5649 | } |
| 4672 | 5650 | ||
| 5651 | next_group: | ||
| 5652 | /* Now, start updating sd_lb_stats */ | ||
| 5653 | sds->total_load += sgs->group_load; | ||
| 5654 | sds->total_pwr += sgs->group_power; | ||
| 5655 | |||
| 4673 | sg = sg->next; | 5656 | sg = sg->next; |
| 4674 | } while (sg != env->sd->groups); | 5657 | } while (sg != env->sd->groups); |
| 5658 | |||
| 5659 | if (env->sd->flags & SD_NUMA) | ||
| 5660 | env->fbq_type = fbq_classify_group(&sds->busiest_stat); | ||
| 4675 | } | 5661 | } |
| 4676 | 5662 | ||
| 4677 | /** | 5663 | /** |
| @@ -4691,7 +5677,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
| 4691 | * assuming lower CPU number will be equivalent to lower a SMT thread | 5677 | * assuming lower CPU number will be equivalent to lower a SMT thread |
| 4692 | * number. | 5678 | * number. |
| 4693 | * | 5679 | * |
| 4694 | * Returns 1 when packing is required and a task should be moved to | 5680 | * Return: 1 when packing is required and a task should be moved to |
| 4695 | * this CPU. The amount of the imbalance is returned in *imbalance. | 5681 | * this CPU. The amount of the imbalance is returned in *imbalance. |
| 4696 | * | 5682 | * |
| 4697 | * @env: The load balancing environment. | 5683 | * @env: The load balancing environment. |
| @@ -4712,7 +5698,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | |||
| 4712 | return 0; | 5698 | return 0; |
| 4713 | 5699 | ||
| 4714 | env->imbalance = DIV_ROUND_CLOSEST( | 5700 | env->imbalance = DIV_ROUND_CLOSEST( |
| 4715 | sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); | 5701 | sds->busiest_stat.avg_load * sds->busiest_stat.group_power, |
| 5702 | SCHED_POWER_SCALE); | ||
| 4716 | 5703 | ||
| 4717 | return 1; | 5704 | return 1; |
| 4718 | } | 5705 | } |
| @@ -4730,24 +5717,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
| 4730 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | 5717 | unsigned long tmp, pwr_now = 0, pwr_move = 0; |
| 4731 | unsigned int imbn = 2; | 5718 | unsigned int imbn = 2; |
| 4732 | unsigned long scaled_busy_load_per_task; | 5719 | unsigned long scaled_busy_load_per_task; |
| 5720 | struct sg_lb_stats *local, *busiest; | ||
| 4733 | 5721 | ||
| 4734 | if (sds->this_nr_running) { | 5722 | local = &sds->local_stat; |
| 4735 | sds->this_load_per_task /= sds->this_nr_running; | 5723 | busiest = &sds->busiest_stat; |
| 4736 | if (sds->busiest_load_per_task > | 5724 | |
| 4737 | sds->this_load_per_task) | 5725 | if (!local->sum_nr_running) |
| 4738 | imbn = 1; | 5726 | local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); |
| 4739 | } else { | 5727 | else if (busiest->load_per_task > local->load_per_task) |
| 4740 | sds->this_load_per_task = | 5728 | imbn = 1; |
| 4741 | cpu_avg_load_per_task(env->dst_cpu); | ||
| 4742 | } | ||
| 4743 | 5729 | ||
| 4744 | scaled_busy_load_per_task = sds->busiest_load_per_task | 5730 | scaled_busy_load_per_task = |
| 4745 | * SCHED_POWER_SCALE; | 5731 | (busiest->load_per_task * SCHED_POWER_SCALE) / |
| 4746 | scaled_busy_load_per_task /= sds->busiest->sgp->power; | 5732 | busiest->group_power; |
| 4747 | 5733 | ||
| 4748 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | 5734 | if (busiest->avg_load + scaled_busy_load_per_task >= |
| 4749 | (scaled_busy_load_per_task * imbn)) { | 5735 | local->avg_load + (scaled_busy_load_per_task * imbn)) { |
| 4750 | env->imbalance = sds->busiest_load_per_task; | 5736 | env->imbalance = busiest->load_per_task; |
| 4751 | return; | 5737 | return; |
| 4752 | } | 5738 | } |
| 4753 | 5739 | ||
| @@ -4757,34 +5743,37 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
| 4757 | * moving them. | 5743 | * moving them. |
| 4758 | */ | 5744 | */ |
| 4759 | 5745 | ||
| 4760 | pwr_now += sds->busiest->sgp->power * | 5746 | pwr_now += busiest->group_power * |
| 4761 | min(sds->busiest_load_per_task, sds->max_load); | 5747 | min(busiest->load_per_task, busiest->avg_load); |
| 4762 | pwr_now += sds->this->sgp->power * | 5748 | pwr_now += local->group_power * |
| 4763 | min(sds->this_load_per_task, sds->this_load); | 5749 | min(local->load_per_task, local->avg_load); |
| 4764 | pwr_now /= SCHED_POWER_SCALE; | 5750 | pwr_now /= SCHED_POWER_SCALE; |
| 4765 | 5751 | ||
| 4766 | /* Amount of load we'd subtract */ | 5752 | /* Amount of load we'd subtract */ |
| 4767 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / | 5753 | tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / |
| 4768 | sds->busiest->sgp->power; | 5754 | busiest->group_power; |
| 4769 | if (sds->max_load > tmp) | 5755 | if (busiest->avg_load > tmp) { |
| 4770 | pwr_move += sds->busiest->sgp->power * | 5756 | pwr_move += busiest->group_power * |
| 4771 | min(sds->busiest_load_per_task, sds->max_load - tmp); | 5757 | min(busiest->load_per_task, |
| 5758 | busiest->avg_load - tmp); | ||
| 5759 | } | ||
| 4772 | 5760 | ||
| 4773 | /* Amount of load we'd add */ | 5761 | /* Amount of load we'd add */ |
| 4774 | if (sds->max_load * sds->busiest->sgp->power < | 5762 | if (busiest->avg_load * busiest->group_power < |
| 4775 | sds->busiest_load_per_task * SCHED_POWER_SCALE) | 5763 | busiest->load_per_task * SCHED_POWER_SCALE) { |
| 4776 | tmp = (sds->max_load * sds->busiest->sgp->power) / | 5764 | tmp = (busiest->avg_load * busiest->group_power) / |
| 4777 | sds->this->sgp->power; | 5765 | local->group_power; |
| 4778 | else | 5766 | } else { |
| 4779 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / | 5767 | tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / |
| 4780 | sds->this->sgp->power; | 5768 | local->group_power; |
| 4781 | pwr_move += sds->this->sgp->power * | 5769 | } |
| 4782 | min(sds->this_load_per_task, sds->this_load + tmp); | 5770 | pwr_move += local->group_power * |
| 5771 | min(local->load_per_task, local->avg_load + tmp); | ||
| 4783 | pwr_move /= SCHED_POWER_SCALE; | 5772 | pwr_move /= SCHED_POWER_SCALE; |
| 4784 | 5773 | ||
| 4785 | /* Move if we gain throughput */ | 5774 | /* Move if we gain throughput */ |
| 4786 | if (pwr_move > pwr_now) | 5775 | if (pwr_move > pwr_now) |
| 4787 | env->imbalance = sds->busiest_load_per_task; | 5776 | env->imbalance = busiest->load_per_task; |
| 4788 | } | 5777 | } |
| 4789 | 5778 | ||
| 4790 | /** | 5779 | /** |
| @@ -4796,11 +5785,18 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
| 4796 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | 5785 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
| 4797 | { | 5786 | { |
| 4798 | unsigned long max_pull, load_above_capacity = ~0UL; | 5787 | unsigned long max_pull, load_above_capacity = ~0UL; |
| 5788 | struct sg_lb_stats *local, *busiest; | ||
| 5789 | |||
| 5790 | local = &sds->local_stat; | ||
| 5791 | busiest = &sds->busiest_stat; | ||
| 4799 | 5792 | ||
| 4800 | sds->busiest_load_per_task /= sds->busiest_nr_running; | 5793 | if (busiest->group_imb) { |
| 4801 | if (sds->group_imb) { | 5794 | /* |
| 4802 | sds->busiest_load_per_task = | 5795 | * In the group_imb case we cannot rely on group-wide averages |
| 4803 | min(sds->busiest_load_per_task, sds->avg_load); | 5796 | * to ensure cpu-load equilibrium, look at wider averages. XXX |
| 5797 | */ | ||
| 5798 | busiest->load_per_task = | ||
| 5799 | min(busiest->load_per_task, sds->avg_load); | ||
| 4804 | } | 5800 | } |
| 4805 | 5801 | ||
| 4806 | /* | 5802 | /* |
| @@ -4808,21 +5804,23 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 4808 | * max load less than avg load(as we skip the groups at or below | 5804 | * max load less than avg load(as we skip the groups at or below |
| 4809 | * its cpu_power, while calculating max_load..) | 5805 | * its cpu_power, while calculating max_load..) |
| 4810 | */ | 5806 | */ |
| 4811 | if (sds->max_load < sds->avg_load) { | 5807 | if (busiest->avg_load <= sds->avg_load || |
| 5808 | local->avg_load >= sds->avg_load) { | ||
| 4812 | env->imbalance = 0; | 5809 | env->imbalance = 0; |
| 4813 | return fix_small_imbalance(env, sds); | 5810 | return fix_small_imbalance(env, sds); |
| 4814 | } | 5811 | } |
| 4815 | 5812 | ||
| 4816 | if (!sds->group_imb) { | 5813 | if (!busiest->group_imb) { |
| 4817 | /* | 5814 | /* |
| 4818 | * Don't want to pull so many tasks that a group would go idle. | 5815 | * Don't want to pull so many tasks that a group would go idle. |
| 5816 | * Except of course for the group_imb case, since then we might | ||
| 5817 | * have to drop below capacity to reach cpu-load equilibrium. | ||
| 4819 | */ | 5818 | */ |
| 4820 | load_above_capacity = (sds->busiest_nr_running - | 5819 | load_above_capacity = |
| 4821 | sds->busiest_group_capacity); | 5820 | (busiest->sum_nr_running - busiest->group_capacity); |
| 4822 | 5821 | ||
| 4823 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); | 5822 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); |
| 4824 | 5823 | load_above_capacity /= busiest->group_power; | |
| 4825 | load_above_capacity /= sds->busiest->sgp->power; | ||
| 4826 | } | 5824 | } |
| 4827 | 5825 | ||
| 4828 | /* | 5826 | /* |
| @@ -4832,15 +5830,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 4832 | * we also don't want to reduce the group load below the group capacity | 5830 | * we also don't want to reduce the group load below the group capacity |
| 4833 | * (so that we can implement power-savings policies etc). Thus we look | 5831 | * (so that we can implement power-savings policies etc). Thus we look |
| 4834 | * for the minimum possible imbalance. | 5832 | * for the minimum possible imbalance. |
| 4835 | * Be careful of negative numbers as they'll appear as very large values | ||
| 4836 | * with unsigned longs. | ||
| 4837 | */ | 5833 | */ |
| 4838 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | 5834 | max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); |
| 4839 | 5835 | ||
| 4840 | /* How much load to actually move to equalise the imbalance */ | 5836 | /* How much load to actually move to equalise the imbalance */ |
| 4841 | env->imbalance = min(max_pull * sds->busiest->sgp->power, | 5837 | env->imbalance = min( |
| 4842 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) | 5838 | max_pull * busiest->group_power, |
| 4843 | / SCHED_POWER_SCALE; | 5839 | (sds->avg_load - local->avg_load) * local->group_power |
| 5840 | ) / SCHED_POWER_SCALE; | ||
| 4844 | 5841 | ||
| 4845 | /* | 5842 | /* |
| 4846 | * if *imbalance is less than the average load per runnable task | 5843 | * if *imbalance is less than the average load per runnable task |
| @@ -4848,9 +5845,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 4848 | * a think about bumping its value to force at least one task to be | 5845 | * a think about bumping its value to force at least one task to be |
| 4849 | * moved | 5846 | * moved |
| 4850 | */ | 5847 | */ |
| 4851 | if (env->imbalance < sds->busiest_load_per_task) | 5848 | if (env->imbalance < busiest->load_per_task) |
| 4852 | return fix_small_imbalance(env, sds); | 5849 | return fix_small_imbalance(env, sds); |
| 4853 | |||
| 4854 | } | 5850 | } |
| 4855 | 5851 | ||
| 4856 | /******* find_busiest_group() helpers end here *********************/ | 5852 | /******* find_busiest_group() helpers end here *********************/ |
| @@ -4866,69 +5862,62 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 4866 | * to restore balance. | 5862 | * to restore balance. |
| 4867 | * | 5863 | * |
| 4868 | * @env: The load balancing environment. | 5864 | * @env: The load balancing environment. |
| 4869 | * @balance: Pointer to a variable indicating if this_cpu | ||
| 4870 | * is the appropriate cpu to perform load balancing at this_level. | ||
| 4871 | * | 5865 | * |
| 4872 | * Returns: - the busiest group if imbalance exists. | 5866 | * Return: - The busiest group if imbalance exists. |
| 4873 | * - If no imbalance and user has opted for power-savings balance, | 5867 | * - If no imbalance and user has opted for power-savings balance, |
| 4874 | * return the least loaded group whose CPUs can be | 5868 | * return the least loaded group whose CPUs can be |
| 4875 | * put to idle by rebalancing its tasks onto our group. | 5869 | * put to idle by rebalancing its tasks onto our group. |
| 4876 | */ | 5870 | */ |
| 4877 | static struct sched_group * | 5871 | static struct sched_group *find_busiest_group(struct lb_env *env) |
| 4878 | find_busiest_group(struct lb_env *env, int *balance) | ||
| 4879 | { | 5872 | { |
| 5873 | struct sg_lb_stats *local, *busiest; | ||
| 4880 | struct sd_lb_stats sds; | 5874 | struct sd_lb_stats sds; |
| 4881 | 5875 | ||
| 4882 | memset(&sds, 0, sizeof(sds)); | 5876 | init_sd_lb_stats(&sds); |
| 4883 | 5877 | ||
| 4884 | /* | 5878 | /* |
| 4885 | * Compute the various statistics relevant for load balancing at | 5879 | * Compute the various statistics relevant for load balancing at |
| 4886 | * this level. | 5880 | * this level. |
| 4887 | */ | 5881 | */ |
| 4888 | update_sd_lb_stats(env, balance, &sds); | 5882 | update_sd_lb_stats(env, &sds); |
| 4889 | 5883 | local = &sds.local_stat; | |
| 4890 | /* | 5884 | busiest = &sds.busiest_stat; |
| 4891 | * this_cpu is not the appropriate cpu to perform load balancing at | ||
| 4892 | * this level. | ||
| 4893 | */ | ||
| 4894 | if (!(*balance)) | ||
| 4895 | goto ret; | ||
| 4896 | 5885 | ||
| 4897 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && | 5886 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && |
| 4898 | check_asym_packing(env, &sds)) | 5887 | check_asym_packing(env, &sds)) |
| 4899 | return sds.busiest; | 5888 | return sds.busiest; |
| 4900 | 5889 | ||
| 4901 | /* There is no busy sibling group to pull tasks from */ | 5890 | /* There is no busy sibling group to pull tasks from */ |
| 4902 | if (!sds.busiest || sds.busiest_nr_running == 0) | 5891 | if (!sds.busiest || busiest->sum_nr_running == 0) |
| 4903 | goto out_balanced; | 5892 | goto out_balanced; |
| 4904 | 5893 | ||
| 4905 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; | 5894 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; |
| 4906 | 5895 | ||
| 4907 | /* | 5896 | /* |
| 4908 | * If the busiest group is imbalanced the below checks don't | 5897 | * If the busiest group is imbalanced the below checks don't |
| 4909 | * work because they assumes all things are equal, which typically | 5898 | * work because they assume all things are equal, which typically |
| 4910 | * isn't true due to cpus_allowed constraints and the like. | 5899 | * isn't true due to cpus_allowed constraints and the like. |
| 4911 | */ | 5900 | */ |
| 4912 | if (sds.group_imb) | 5901 | if (busiest->group_imb) |
| 4913 | goto force_balance; | 5902 | goto force_balance; |
| 4914 | 5903 | ||
| 4915 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 5904 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
| 4916 | if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | 5905 | if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && |
| 4917 | !sds.busiest_has_capacity) | 5906 | !busiest->group_has_capacity) |
| 4918 | goto force_balance; | 5907 | goto force_balance; |
| 4919 | 5908 | ||
| 4920 | /* | 5909 | /* |
| 4921 | * If the local group is more busy than the selected busiest group | 5910 | * If the local group is more busy than the selected busiest group |
| 4922 | * don't try and pull any tasks. | 5911 | * don't try and pull any tasks. |
| 4923 | */ | 5912 | */ |
| 4924 | if (sds.this_load >= sds.max_load) | 5913 | if (local->avg_load >= busiest->avg_load) |
| 4925 | goto out_balanced; | 5914 | goto out_balanced; |
| 4926 | 5915 | ||
| 4927 | /* | 5916 | /* |
| 4928 | * Don't pull any tasks if this group is already above the domain | 5917 | * Don't pull any tasks if this group is already above the domain |
| 4929 | * average load. | 5918 | * average load. |
| 4930 | */ | 5919 | */ |
| 4931 | if (sds.this_load >= sds.avg_load) | 5920 | if (local->avg_load >= sds.avg_load) |
| 4932 | goto out_balanced; | 5921 | goto out_balanced; |
| 4933 | 5922 | ||
| 4934 | if (env->idle == CPU_IDLE) { | 5923 | if (env->idle == CPU_IDLE) { |
| @@ -4938,15 +5927,16 @@ find_busiest_group(struct lb_env *env, int *balance) | |||
| 4938 | * there is no imbalance between this and busiest group | 5927 | * there is no imbalance between this and busiest group |
| 4939 | * wrt to idle cpu's, it is balanced. | 5928 | * wrt to idle cpu's, it is balanced. |
| 4940 | */ | 5929 | */ |
| 4941 | if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && | 5930 | if ((local->idle_cpus < busiest->idle_cpus) && |
| 4942 | sds.busiest_nr_running <= sds.busiest_group_weight) | 5931 | busiest->sum_nr_running <= busiest->group_weight) |
| 4943 | goto out_balanced; | 5932 | goto out_balanced; |
| 4944 | } else { | 5933 | } else { |
| 4945 | /* | 5934 | /* |
| 4946 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use | 5935 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use |
| 4947 | * imbalance_pct to be conservative. | 5936 | * imbalance_pct to be conservative. |
| 4948 | */ | 5937 | */ |
| 4949 | if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) | 5938 | if (100 * busiest->avg_load <= |
| 5939 | env->sd->imbalance_pct * local->avg_load) | ||
| 4950 | goto out_balanced; | 5940 | goto out_balanced; |
| 4951 | } | 5941 | } |
| 4952 | 5942 | ||
| @@ -4956,7 +5946,6 @@ force_balance: | |||
| 4956 | return sds.busiest; | 5946 | return sds.busiest; |
| 4957 | 5947 | ||
| 4958 | out_balanced: | 5948 | out_balanced: |
| 4959 | ret: | ||
| 4960 | env->imbalance = 0; | 5949 | env->imbalance = 0; |
| 4961 | return NULL; | 5950 | return NULL; |
| 4962 | } | 5951 | } |
| @@ -4968,22 +5957,43 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 4968 | struct sched_group *group) | 5957 | struct sched_group *group) |
| 4969 | { | 5958 | { |
| 4970 | struct rq *busiest = NULL, *rq; | 5959 | struct rq *busiest = NULL, *rq; |
| 4971 | unsigned long max_load = 0; | 5960 | unsigned long busiest_load = 0, busiest_power = 1; |
| 4972 | int i; | 5961 | int i; |
| 4973 | 5962 | ||
| 4974 | for_each_cpu(i, sched_group_cpus(group)) { | 5963 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
| 4975 | unsigned long power = power_of(i); | 5964 | unsigned long power, capacity, wl; |
| 4976 | unsigned long capacity = DIV_ROUND_CLOSEST(power, | 5965 | enum fbq_type rt; |
| 4977 | SCHED_POWER_SCALE); | ||
| 4978 | unsigned long wl; | ||
| 4979 | 5966 | ||
| 4980 | if (!capacity) | 5967 | rq = cpu_rq(i); |
| 4981 | capacity = fix_small_capacity(env->sd, group); | 5968 | rt = fbq_classify_rq(rq); |
| 4982 | 5969 | ||
| 4983 | if (!cpumask_test_cpu(i, env->cpus)) | 5970 | /* |
| 5971 | * We classify groups/runqueues into three groups: | ||
| 5972 | * - regular: there are !numa tasks | ||
| 5973 | * - remote: there are numa tasks that run on the 'wrong' node | ||
| 5974 | * - all: there is no distinction | ||
| 5975 | * | ||
| 5976 | * In order to avoid migrating ideally placed numa tasks, | ||
| 5977 | * ignore those when there are better options. | ||
| 5978 | * | ||
| 5979 | * If we ignore the actual busiest queue to migrate another | ||
| 5980 | * task, the next balance pass can still reduce the busiest | ||
| 5981 | * queue by moving tasks around inside the node. | ||
| 5982 | * | ||
| 5983 | * If we cannot move enough load due to this classification | ||
| 5984 | * the next pass will adjust the group classification and | ||
| 5985 | * allow migration of more tasks. | ||
| 5986 | * | ||
| 5987 | * Both cases only affect the total convergence complexity. | ||
| 5988 | */ | ||
| 5989 | if (rt > env->fbq_type) | ||
| 4984 | continue; | 5990 | continue; |
| 4985 | 5991 | ||
| 4986 | rq = cpu_rq(i); | 5992 | power = power_of(i); |
| 5993 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); | ||
| 5994 | if (!capacity) | ||
| 5995 | capacity = fix_small_capacity(env->sd, group); | ||
| 5996 | |||
| 4987 | wl = weighted_cpuload(i); | 5997 | wl = weighted_cpuload(i); |
| 4988 | 5998 | ||
| 4989 | /* | 5999 | /* |
| @@ -4998,11 +6008,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 4998 | * the weighted_cpuload() scaled with the cpu power, so that | 6008 | * the weighted_cpuload() scaled with the cpu power, so that |
| 4999 | * the load can be moved away from the cpu that is potentially | 6009 | * the load can be moved away from the cpu that is potentially |
| 5000 | * running at a lower capacity. | 6010 | * running at a lower capacity. |
| 6011 | * | ||
| 6012 | * Thus we're looking for max(wl_i / power_i), crosswise | ||
| 6013 | * multiplication to rid ourselves of the division works out | ||
| 6014 | * to: wl_i * power_j > wl_j * power_i; where j is our | ||
| 6015 | * previous maximum. | ||
| 5001 | */ | 6016 | */ |
| 5002 | wl = (wl * SCHED_POWER_SCALE) / power; | 6017 | if (wl * busiest_power > busiest_load * power) { |
| 5003 | 6018 | busiest_load = wl; | |
| 5004 | if (wl > max_load) { | 6019 | busiest_power = power; |
| 5005 | max_load = wl; | ||
| 5006 | busiest = rq; | 6020 | busiest = rq; |
| 5007 | } | 6021 | } |
| 5008 | } | 6022 | } |
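The old per-cpu step scaled the load as wl = wl * SCHED_POWER_SCALE / power before comparing; the new code cross-multiplies instead, so no per-iteration division (and no truncation) is needed: wl_i / power_i > wl_j / power_j becomes wl_i * power_j > wl_j * power_i. A tiny standalone check with made-up loads and powers:

#include <stdio.h>

int main(void)
{
        /* candidate i vs current busiest j (arbitrary example values) */
        unsigned long wl_i = 900,  power_i = 700;    /* 900/700   ~ 1.29 */
        unsigned long wl_j = 1000, power_j = 1024;   /* 1000/1024 ~ 0.98 */

        /* old style: scale then compare (division truncates) */
        unsigned long scaled_i = wl_i * 1024 / power_i;
        unsigned long scaled_j = wl_j * 1024 / power_j;

        /* new style: cross-multiply, no division at all */
        int crosswise = wl_i * power_j > wl_j * power_i;

        printf("scaled: %lu vs %lu, crosswise says i busier: %d\n",
               scaled_i, scaled_j, crosswise);
        return 0;
}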
| @@ -5039,15 +6053,50 @@ static int need_active_balance(struct lb_env *env) | |||
| 5039 | 6053 | ||
| 5040 | static int active_load_balance_cpu_stop(void *data); | 6054 | static int active_load_balance_cpu_stop(void *data); |
| 5041 | 6055 | ||
| 6056 | static int should_we_balance(struct lb_env *env) | ||
| 6057 | { | ||
| 6058 | struct sched_group *sg = env->sd->groups; | ||
| 6059 | struct cpumask *sg_cpus, *sg_mask; | ||
| 6060 | int cpu, balance_cpu = -1; | ||
| 6061 | |||
| 6062 | /* | ||
| 6063 | * In the newly idle case, we will allow all the cpu's | ||
| 6064 | * to do the newly idle load balance. | ||
| 6065 | */ | ||
| 6066 | if (env->idle == CPU_NEWLY_IDLE) | ||
| 6067 | return 1; | ||
| 6068 | |||
| 6069 | sg_cpus = sched_group_cpus(sg); | ||
| 6070 | sg_mask = sched_group_mask(sg); | ||
| 6071 | /* Try to find first idle cpu */ | ||
| 6072 | for_each_cpu_and(cpu, sg_cpus, env->cpus) { | ||
| 6073 | if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) | ||
| 6074 | continue; | ||
| 6075 | |||
| 6076 | balance_cpu = cpu; | ||
| 6077 | break; | ||
| 6078 | } | ||
| 6079 | |||
| 6080 | if (balance_cpu == -1) | ||
| 6081 | balance_cpu = group_balance_cpu(sg); | ||
| 6082 | |||
| 6083 | /* | ||
| 6084 | * First idle cpu or the first cpu(busiest) in this sched group | ||
| 6085 | * is eligible for doing load balancing at this and above domains. | ||
| 6086 | */ | ||
| 6087 | return balance_cpu == env->dst_cpu; | ||
| 6088 | } | ||
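should_we_balance() keeps the old eligibility rule (the first idle cpu in the group, falling back to group_balance_cpu()) but applies it before any statistics are gathered, so ineligible cpus bail out of load_balance() immediately. A minimal sketch of the cpu-selection rule, with idle state supplied as a plain array and the fallback reduced to "the group's first cpu" (both simplifications are illustrative only):

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

/* Pick the cpu allowed to balance this group: the first idle cpu in the
 * group, or the group's first cpu when none are idle. */
static int pick_balance_cpu(const bool idle[NR_CPUS])
{
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                if (idle[cpu])
                        return cpu;
        return 0;                       /* stand-in for group_balance_cpu() */
}

int main(void)
{
        bool idle[NR_CPUS] = { false, false, true, true };
        int dst_cpu = 2;

        /* only the chosen cpu proceeds; all others return early, which is
         * exactly what should_we_balance() gives load_balance() */
        printf("cpu %d balances: %s\n", dst_cpu,
               pick_balance_cpu(idle) == dst_cpu ? "yes" : "no");
        return 0;
}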
| 6089 | |||
| 5042 | /* | 6090 | /* |
| 5043 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 6091 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
| 5044 | * tasks if there is an imbalance. | 6092 | * tasks if there is an imbalance. |
| 5045 | */ | 6093 | */ |
| 5046 | static int load_balance(int this_cpu, struct rq *this_rq, | 6094 | static int load_balance(int this_cpu, struct rq *this_rq, |
| 5047 | struct sched_domain *sd, enum cpu_idle_type idle, | 6095 | struct sched_domain *sd, enum cpu_idle_type idle, |
| 5048 | int *balance) | 6096 | int *continue_balancing) |
| 5049 | { | 6097 | { |
| 5050 | int ld_moved, cur_ld_moved, active_balance = 0; | 6098 | int ld_moved, cur_ld_moved, active_balance = 0; |
| 6099 | struct sched_domain *sd_parent = sd->parent; | ||
| 5051 | struct sched_group *group; | 6100 | struct sched_group *group; |
| 5052 | struct rq *busiest; | 6101 | struct rq *busiest; |
| 5053 | unsigned long flags; | 6102 | unsigned long flags; |
| @@ -5061,6 +6110,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 5061 | .idle = idle, | 6110 | .idle = idle, |
| 5062 | .loop_break = sched_nr_migrate_break, | 6111 | .loop_break = sched_nr_migrate_break, |
| 5063 | .cpus = cpus, | 6112 | .cpus = cpus, |
| 6113 | .fbq_type = all, | ||
| 5064 | }; | 6114 | }; |
| 5065 | 6115 | ||
| 5066 | /* | 6116 | /* |
| @@ -5075,11 +6125,12 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 5075 | schedstat_inc(sd, lb_count[idle]); | 6125 | schedstat_inc(sd, lb_count[idle]); |
| 5076 | 6126 | ||
| 5077 | redo: | 6127 | redo: |
| 5078 | group = find_busiest_group(&env, balance); | 6128 | if (!should_we_balance(&env)) { |
| 5079 | 6129 | *continue_balancing = 0; | |
| 5080 | if (*balance == 0) | ||
| 5081 | goto out_balanced; | 6130 | goto out_balanced; |
| 6131 | } | ||
| 5082 | 6132 | ||
| 6133 | group = find_busiest_group(&env); | ||
| 5083 | if (!group) { | 6134 | if (!group) { |
| 5084 | schedstat_inc(sd, lb_nobusyg[idle]); | 6135 | schedstat_inc(sd, lb_nobusyg[idle]); |
| 5085 | goto out_balanced; | 6136 | goto out_balanced; |
| @@ -5108,7 +6159,6 @@ redo: | |||
| 5108 | env.src_rq = busiest; | 6159 | env.src_rq = busiest; |
| 5109 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 6160 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
| 5110 | 6161 | ||
| 5111 | update_h_load(env.src_cpu); | ||
| 5112 | more_balance: | 6162 | more_balance: |
| 5113 | local_irq_save(flags); | 6163 | local_irq_save(flags); |
| 5114 | double_rq_lock(env.dst_rq, busiest); | 6164 | double_rq_lock(env.dst_rq, busiest); |
| @@ -5152,17 +6202,17 @@ more_balance: | |||
| 5152 | * moreover subsequent load balance cycles should correct the | 6202 | * moreover subsequent load balance cycles should correct the |
| 5153 | * excess load moved. | 6203 | * excess load moved. |
| 5154 | */ | 6204 | */ |
| 5155 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { | 6205 | if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { |
| 6206 | |||
| 6207 | /* Prevent re-selecting dst_cpu via env's cpus */ | ||
| 6208 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | ||
| 5156 | 6209 | ||
| 5157 | env.dst_rq = cpu_rq(env.new_dst_cpu); | 6210 | env.dst_rq = cpu_rq(env.new_dst_cpu); |
| 5158 | env.dst_cpu = env.new_dst_cpu; | 6211 | env.dst_cpu = env.new_dst_cpu; |
| 5159 | env.flags &= ~LBF_SOME_PINNED; | 6212 | env.flags &= ~LBF_DST_PINNED; |
| 5160 | env.loop = 0; | 6213 | env.loop = 0; |
| 5161 | env.loop_break = sched_nr_migrate_break; | 6214 | env.loop_break = sched_nr_migrate_break; |
| 5162 | 6215 | ||
| 5163 | /* Prevent to re-select dst_cpu via env's cpus */ | ||
| 5164 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | ||
| 5165 | |||
| 5166 | /* | 6216 | /* |
| 5167 | * Go back to "more_balance" rather than "redo" since we | 6217 | * Go back to "more_balance" rather than "redo" since we |
| 5168 | * need to continue with same src_cpu. | 6218 | * need to continue with same src_cpu. |
| @@ -5170,6 +6220,18 @@ more_balance: | |||
| 5170 | goto more_balance; | 6220 | goto more_balance; |
| 5171 | } | 6221 | } |
| 5172 | 6222 | ||
| 6223 | /* | ||
| 6224 | * We failed to reach balance because of affinity. | ||
| 6225 | */ | ||
| 6226 | if (sd_parent) { | ||
| 6227 | int *group_imbalance = &sd_parent->groups->sgp->imbalance; | ||
| 6228 | |||
| 6229 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { | ||
| 6230 | *group_imbalance = 1; | ||
| 6231 | } else if (*group_imbalance) | ||
| 6232 | *group_imbalance = 0; | ||
| 6233 | } | ||
| 6234 | |||
| 5173 | /* All tasks on this runqueue were pinned by CPU affinity */ | 6235 | /* All tasks on this runqueue were pinned by CPU affinity */ |
| 5174 | if (unlikely(env.flags & LBF_ALL_PINNED)) { | 6236 | if (unlikely(env.flags & LBF_ALL_PINNED)) { |
| 5175 | cpumask_clear_cpu(cpu_of(busiest), cpus); | 6237 | cpumask_clear_cpu(cpu_of(busiest), cpus); |
| @@ -5277,6 +6339,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 5277 | struct sched_domain *sd; | 6339 | struct sched_domain *sd; |
| 5278 | int pulled_task = 0; | 6340 | int pulled_task = 0; |
| 5279 | unsigned long next_balance = jiffies + HZ; | 6341 | unsigned long next_balance = jiffies + HZ; |
| 6342 | u64 curr_cost = 0; | ||
| 5280 | 6343 | ||
| 5281 | this_rq->idle_stamp = rq_clock(this_rq); | 6344 | this_rq->idle_stamp = rq_clock(this_rq); |
| 5282 | 6345 | ||
| @@ -5292,15 +6355,28 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 5292 | rcu_read_lock(); | 6355 | rcu_read_lock(); |
| 5293 | for_each_domain(this_cpu, sd) { | 6356 | for_each_domain(this_cpu, sd) { |
| 5294 | unsigned long interval; | 6357 | unsigned long interval; |
| 5295 | int balance = 1; | 6358 | int continue_balancing = 1; |
| 6359 | u64 t0, domain_cost; | ||
| 5296 | 6360 | ||
| 5297 | if (!(sd->flags & SD_LOAD_BALANCE)) | 6361 | if (!(sd->flags & SD_LOAD_BALANCE)) |
| 5298 | continue; | 6362 | continue; |
| 5299 | 6363 | ||
| 6364 | if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) | ||
| 6365 | break; | ||
| 6366 | |||
| 5300 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 6367 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
| 6368 | t0 = sched_clock_cpu(this_cpu); | ||
| 6369 | |||
| 5301 | /* If we've pulled tasks over stop searching: */ | 6370 | /* If we've pulled tasks over stop searching: */ |
| 5302 | pulled_task = load_balance(this_cpu, this_rq, | 6371 | pulled_task = load_balance(this_cpu, this_rq, |
| 5303 | sd, CPU_NEWLY_IDLE, &balance); | 6372 | sd, CPU_NEWLY_IDLE, |
| 6373 | &continue_balancing); | ||
| 6374 | |||
| 6375 | domain_cost = sched_clock_cpu(this_cpu) - t0; | ||
| 6376 | if (domain_cost > sd->max_newidle_lb_cost) | ||
| 6377 | sd->max_newidle_lb_cost = domain_cost; | ||
| 6378 | |||
| 6379 | curr_cost += domain_cost; | ||
| 5304 | } | 6380 | } |
| 5305 | 6381 | ||
| 5306 | interval = msecs_to_jiffies(sd->balance_interval); | 6382 | interval = msecs_to_jiffies(sd->balance_interval); |
| @@ -5322,6 +6398,9 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 5322 | */ | 6398 | */ |
| 5323 | this_rq->next_balance = next_balance; | 6399 | this_rq->next_balance = next_balance; |
| 5324 | } | 6400 | } |
| 6401 | |||
| 6402 | if (curr_cost > this_rq->max_idle_balance_cost) | ||
| 6403 | this_rq->max_idle_balance_cost = curr_cost; | ||
| 5325 | } | 6404 | } |
| 5326 | 6405 | ||
| 5327 | /* | 6406 | /* |
| @@ -5455,16 +6534,16 @@ static inline void nohz_balance_exit_idle(int cpu) | |||
| 5455 | static inline void set_cpu_sd_state_busy(void) | 6534 | static inline void set_cpu_sd_state_busy(void) |
| 5456 | { | 6535 | { |
| 5457 | struct sched_domain *sd; | 6536 | struct sched_domain *sd; |
| 6537 | int cpu = smp_processor_id(); | ||
| 5458 | 6538 | ||
| 5459 | rcu_read_lock(); | 6539 | rcu_read_lock(); |
| 5460 | sd = rcu_dereference_check_sched_domain(this_rq()->sd); | 6540 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
| 5461 | 6541 | ||
| 5462 | if (!sd || !sd->nohz_idle) | 6542 | if (!sd || !sd->nohz_idle) |
| 5463 | goto unlock; | 6543 | goto unlock; |
| 5464 | sd->nohz_idle = 0; | 6544 | sd->nohz_idle = 0; |
| 5465 | 6545 | ||
| 5466 | for (; sd; sd = sd->parent) | 6546 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); |
| 5467 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); | ||
| 5468 | unlock: | 6547 | unlock: |
| 5469 | rcu_read_unlock(); | 6548 | rcu_read_unlock(); |
| 5470 | } | 6549 | } |
| @@ -5472,16 +6551,16 @@ unlock: | |||
| 5472 | void set_cpu_sd_state_idle(void) | 6551 | void set_cpu_sd_state_idle(void) |
| 5473 | { | 6552 | { |
| 5474 | struct sched_domain *sd; | 6553 | struct sched_domain *sd; |
| 6554 | int cpu = smp_processor_id(); | ||
| 5475 | 6555 | ||
| 5476 | rcu_read_lock(); | 6556 | rcu_read_lock(); |
| 5477 | sd = rcu_dereference_check_sched_domain(this_rq()->sd); | 6557 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
| 5478 | 6558 | ||
| 5479 | if (!sd || sd->nohz_idle) | 6559 | if (!sd || sd->nohz_idle) |
| 5480 | goto unlock; | 6560 | goto unlock; |
| 5481 | sd->nohz_idle = 1; | 6561 | sd->nohz_idle = 1; |
| 5482 | 6562 | ||
| 5483 | for (; sd; sd = sd->parent) | 6563 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); |
| 5484 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); | ||
| 5485 | unlock: | 6564 | unlock: |
| 5486 | rcu_read_unlock(); | 6565 | rcu_read_unlock(); |
| 5487 | } | 6566 | } |
| @@ -5538,22 +6617,46 @@ void update_max_interval(void) | |||
| 5538 | */ | 6617 | */ |
| 5539 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | 6618 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) |
| 5540 | { | 6619 | { |
| 5541 | int balance = 1; | 6620 | int continue_balancing = 1; |
| 5542 | struct rq *rq = cpu_rq(cpu); | 6621 | struct rq *rq = cpu_rq(cpu); |
| 5543 | unsigned long interval; | 6622 | unsigned long interval; |
| 5544 | struct sched_domain *sd; | 6623 | struct sched_domain *sd; |
| 5545 | /* Earliest time when we have to do rebalance again */ | 6624 | /* Earliest time when we have to do rebalance again */ |
| 5546 | unsigned long next_balance = jiffies + 60*HZ; | 6625 | unsigned long next_balance = jiffies + 60*HZ; |
| 5547 | int update_next_balance = 0; | 6626 | int update_next_balance = 0; |
| 5548 | int need_serialize; | 6627 | int need_serialize, need_decay = 0; |
| 6628 | u64 max_cost = 0; | ||
| 5549 | 6629 | ||
| 5550 | update_blocked_averages(cpu); | 6630 | update_blocked_averages(cpu); |
| 5551 | 6631 | ||
| 5552 | rcu_read_lock(); | 6632 | rcu_read_lock(); |
| 5553 | for_each_domain(cpu, sd) { | 6633 | for_each_domain(cpu, sd) { |
| 6634 | /* | ||
| 6635 | * Decay the newidle max times here because this is a regular | ||
| 6636 | * visit to all the domains. Decay ~1% per second. | ||
| 6637 | */ | ||
| 6638 | if (time_after(jiffies, sd->next_decay_max_lb_cost)) { | ||
| 6639 | sd->max_newidle_lb_cost = | ||
| 6640 | (sd->max_newidle_lb_cost * 253) / 256; | ||
| 6641 | sd->next_decay_max_lb_cost = jiffies + HZ; | ||
| 6642 | need_decay = 1; | ||
| 6643 | } | ||
| 6644 | max_cost += sd->max_newidle_lb_cost; | ||
| 6645 | |||
| 5554 | if (!(sd->flags & SD_LOAD_BALANCE)) | 6646 | if (!(sd->flags & SD_LOAD_BALANCE)) |
| 5555 | continue; | 6647 | continue; |
| 5556 | 6648 | ||
| 6649 | /* | ||
| 6650 | * Stop the load balance at this level. There is another | ||
| 6651 | * CPU in our sched group which is doing load balancing more | ||
| 6652 | * actively. | ||
| 6653 | */ | ||
| 6654 | if (!continue_balancing) { | ||
| 6655 | if (need_decay) | ||
| 6656 | continue; | ||
| 6657 | break; | ||
| 6658 | } | ||
| 6659 | |||
| 5557 | interval = sd->balance_interval; | 6660 | interval = sd->balance_interval; |
| 5558 | if (idle != CPU_IDLE) | 6661 | if (idle != CPU_IDLE) |
| 5559 | interval *= sd->busy_factor; | 6662 | interval *= sd->busy_factor; |
| @@ -5570,9 +6673,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
| 5570 | } | 6673 | } |
| 5571 | 6674 | ||
| 5572 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 6675 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
| 5573 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 6676 | if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { |
| 5574 | /* | 6677 | /* |
| 5575 | * The LBF_SOME_PINNED logic could have changed | 6678 | * The LBF_DST_PINNED logic could have changed |
| 5576 | * env->dst_cpu, so we can't know our idle | 6679 | * env->dst_cpu, so we can't know our idle |
| 5577 | * state even if we migrated tasks. Update it. | 6680 | * state even if we migrated tasks. Update it. |
| 5578 | */ | 6681 | */ |
| @@ -5587,14 +6690,14 @@ out: | |||
| 5587 | next_balance = sd->last_balance + interval; | 6690 | next_balance = sd->last_balance + interval; |
| 5588 | update_next_balance = 1; | 6691 | update_next_balance = 1; |
| 5589 | } | 6692 | } |
| 5590 | 6693 | } | |
| 6694 | if (need_decay) { | ||
| 5591 | /* | 6695 | /* |
| 5592 | * Stop the load balance at this level. There is another | 6696 | * Ensure the rq-wide value also decays but keep it at a |
| 5593 | * CPU in our sched group which is doing load balancing more | 6697 | * reasonable floor to avoid funnies with rq->avg_idle. |
| 5594 | * actively. | ||
| 5595 | */ | 6698 | */ |
| 5596 | if (!balance) | 6699 | rq->max_idle_balance_cost = |
| 5597 | break; | 6700 | max((u64)sysctl_sched_migration_cost, max_cost); |
| 5598 | } | 6701 | } |
| 5599 | rcu_read_unlock(); | 6702 | rcu_read_unlock(); |
| 5600 | 6703 | ||
| @@ -5664,6 +6767,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) | |||
| 5664 | { | 6767 | { |
| 5665 | unsigned long now = jiffies; | 6768 | unsigned long now = jiffies; |
| 5666 | struct sched_domain *sd; | 6769 | struct sched_domain *sd; |
| 6770 | struct sched_group_power *sgp; | ||
| 6771 | int nr_busy; | ||
| 5667 | 6772 | ||
| 5668 | if (unlikely(idle_cpu(cpu))) | 6773 | if (unlikely(idle_cpu(cpu))) |
| 5669 | return 0; | 6774 | return 0; |
| @@ -5689,22 +6794,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) | |||
| 5689 | goto need_kick; | 6794 | goto need_kick; |
| 5690 | 6795 | ||
| 5691 | rcu_read_lock(); | 6796 | rcu_read_lock(); |
| 5692 | for_each_domain(cpu, sd) { | 6797 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); |
| 5693 | struct sched_group *sg = sd->groups; | ||
| 5694 | struct sched_group_power *sgp = sg->sgp; | ||
| 5695 | int nr_busy = atomic_read(&sgp->nr_busy_cpus); | ||
| 5696 | 6798 | ||
| 5697 | if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) | 6799 | if (sd) { |
| 5698 | goto need_kick_unlock; | 6800 | sgp = sd->groups->sgp; |
| 6801 | nr_busy = atomic_read(&sgp->nr_busy_cpus); | ||
| 5699 | 6802 | ||
| 5700 | if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight | 6803 | if (nr_busy > 1) |
| 5701 | && (cpumask_first_and(nohz.idle_cpus_mask, | ||
| 5702 | sched_domain_span(sd)) < cpu)) | ||
| 5703 | goto need_kick_unlock; | 6804 | goto need_kick_unlock; |
| 5704 | |||
| 5705 | if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) | ||
| 5706 | break; | ||
| 5707 | } | 6805 | } |
| 6806 | |||
| 6807 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | ||
| 6808 | |||
| 6809 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, | ||
| 6810 | sched_domain_span(sd)) < cpu)) | ||
| 6811 | goto need_kick_unlock; | ||
| 6812 | |||
| 5708 | rcu_read_unlock(); | 6813 | rcu_read_unlock(); |
| 5709 | return 0; | 6814 | return 0; |
| 5710 | 6815 | ||
| @@ -5812,11 +6917,15 @@ static void task_fork_fair(struct task_struct *p) | |||
| 5812 | cfs_rq = task_cfs_rq(current); | 6917 | cfs_rq = task_cfs_rq(current); |
| 5813 | curr = cfs_rq->curr; | 6918 | curr = cfs_rq->curr; |
| 5814 | 6919 | ||
| 5815 | if (unlikely(task_cpu(p) != this_cpu)) { | 6920 | /* |
| 5816 | rcu_read_lock(); | 6921 | * Not only the cpu but also the task_group of the parent might have |
| 5817 | __set_task_cpu(p, this_cpu); | 6922 | * been changed after parent->se.parent,cfs_rq were copied to |
| 5818 | rcu_read_unlock(); | 6923 | * child->se.parent,cfs_rq. So call __set_task_cpu() to make those |
| 5819 | } | 6924 | * of child point to valid ones. |
| 6925 | */ | ||
| 6926 | rcu_read_lock(); | ||
| 6927 | __set_task_cpu(p, this_cpu); | ||
| 6928 | rcu_read_unlock(); | ||
| 5820 | 6929 | ||
| 5821 | update_curr(cfs_rq); | 6930 | update_curr(cfs_rq); |
| 5822 | 6931 | ||
| @@ -5889,11 +6998,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
| 5889 | * and ensure we don't carry in an old decay_count if we | 6998 | * and ensure we don't carry in an old decay_count if we |
| 5890 | * switch back. | 6999 | * switch back. |
| 5891 | */ | 7000 | */ |
| 5892 | if (p->se.avg.decay_count) { | 7001 | if (se->avg.decay_count) { |
| 5893 | struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); | 7002 | __synchronize_entity_decay(se); |
| 5894 | __synchronize_entity_decay(&p->se); | 7003 | subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); |
| 5895 | subtract_blocked_load_contrib(cfs_rq, | ||
| 5896 | p->se.avg.load_avg_contrib); | ||
| 5897 | } | 7004 | } |
| 5898 | #endif | 7005 | #endif |
| 5899 | } | 7006 | } |
| @@ -6095,7 +7202,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
| 6095 | se->cfs_rq = parent->my_q; | 7202 | se->cfs_rq = parent->my_q; |
| 6096 | 7203 | ||
| 6097 | se->my_q = cfs_rq; | 7204 | se->my_q = cfs_rq; |
| 6098 | update_load_set(&se->load, 0); | 7205 | /* guarantee group entities always have weight */ |
| 7206 | update_load_set(&se->load, NICE_0_LOAD); | ||
| 6099 | se->parent = parent; | 7207 | se->parent = parent; |
| 6100 | } | 7208 | } |
| 6101 | 7209 | ||
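A quick model of the newidle-balance cost accounting introduced above in kernel/sched/fair.c: idle_balance() now bails out of a domain once the cost already spent plus that domain's worst observed newidle cost exceeds the runqueue's average idle time, and rebalance_domains() decays the recorded worst case by 253/256 roughly once per second (about a 1.2% reduction per step). The sketch below is a standalone userspace model of that arithmetic only; every name in it is invented for the example and none of it is kernel code.

/* Standalone model of the decay and cut-off arithmetic (invented names). */
#include <stdio.h>
#include <stdint.h>

static uint64_t decay_cost(uint64_t cost_ns)
{
	/* one decay step per second: multiply by 253/256, ~1.2% shaved off */
	return (cost_ns * 253) / 256;
}

static int worth_balancing(uint64_t avg_idle_ns, uint64_t curr_cost_ns,
			   uint64_t max_lb_cost_ns)
{
	/* mirrors: if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) break; */
	return avg_idle_ns >= curr_cost_ns + max_lb_cost_ns;
}

int main(void)
{
	uint64_t cost = 500000;			/* 500us recorded for one domain */
	int sec;

	for (sec = 1; sec <= 5; sec++) {
		cost = decay_cost(cost);
		printf("after %ds: %llu ns\n", sec, (unsigned long long)cost);
	}
	printf("still worth balancing: %d\n",
	       worth_balancing(1000000, 200000, cost));
	return 0;
}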
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 99399f8e4799..5716929a2e3a 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
| @@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false) | |||
| 63 | /* | 63 | /* |
| 64 | * Apply the automatic NUMA scheduling policy. Enabled automatically | 64 | * Apply the automatic NUMA scheduling policy. Enabled automatically |
| 65 | * at runtime if running on a NUMA machine. Can be controlled via | 65 | * at runtime if running on a NUMA machine. Can be controlled via |
| 66 | * numa_balancing=. Allow PTE scanning to be forced on UMA machines | 66 | * numa_balancing= |
| 67 | * for debugging the core machinery. | ||
| 68 | */ | 67 | */ |
| 69 | #ifdef CONFIG_NUMA_BALANCING | 68 | #ifdef CONFIG_NUMA_BALANCING |
| 70 | SCHED_FEAT(NUMA, false) | 69 | SCHED_FEAT(NUMA, false) |
| 71 | SCHED_FEAT(NUMA_FORCE, false) | 70 | |
| 71 | /* | ||
| 72 | * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a | ||
| 73 | * higher number of hinting faults are recorded during active load | ||
| 74 | * balancing. | ||
| 75 | */ | ||
| 76 | SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) | ||
| 77 | |||
| 78 | /* | ||
| 79 | * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a | ||
| 80 | * lower number of hinting faults have been recorded. As this has | ||
| 81 | * the potential to prevent a task ever migrating to a new node | ||
| 82 | * due to CPU overload it is disabled by default. | ||
| 83 | */ | ||
| 84 | SCHED_FEAT(NUMA_RESIST_LOWER, false) | ||
| 72 | #endif | 85 | #endif |
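For context, SCHED_FEAT() entries like the two added above become boolean bits that scheduler code tests through the sched_feat() helper. The snippet below only illustrates that consumption pattern; the function and its arguments are hypothetical and are not call sites taken from this diff.

/* Hypothetical consumer of the new feature bits; only sched_feat() is real. */
static bool numa_migration_ok(unsigned long src_faults, unsigned long dst_faults)
{
	/* favour destinations that recorded more NUMA hinting faults */
	if (sched_feat(NUMA_FAVOUR_HIGHER) && dst_faults > src_faults)
		return true;
	/* optionally veto moves towards nodes with fewer recorded faults */
	if (sched_feat(NUMA_RESIST_LOWER) && dst_faults < src_faults)
		return false;
	return true;	/* otherwise leave the decision to the load balancer */
}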
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index d8da01008d39..516c3d9ceea1 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
| @@ -9,7 +9,7 @@ | |||
| 9 | 9 | ||
| 10 | #ifdef CONFIG_SMP | 10 | #ifdef CONFIG_SMP |
| 11 | static int | 11 | static int |
| 12 | select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) | 12 | select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) |
| 13 | { | 13 | { |
| 14 | return task_cpu(p); /* IDLE tasks are never migrated */ | 14 | return task_cpu(p); /* IDLE tasks are never migrated */ |
| 15 | } | 15 | } |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 01970c8e64df..7d57275fc396 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -246,8 +246,10 @@ static inline void rt_set_overload(struct rq *rq) | |||
| 246 | * if we should look at the mask. It would be a shame | 246 | * if we should look at the mask. It would be a shame |
| 247 | * if we looked at the mask, but the mask was not | 247 | * if we looked at the mask, but the mask was not |
| 248 | * updated yet. | 248 | * updated yet. |
| 249 | * | ||
| 250 | * Matched by the barrier in pull_rt_task(). | ||
| 249 | */ | 251 | */ |
| 250 | wmb(); | 252 | smp_wmb(); |
| 251 | atomic_inc(&rq->rd->rto_count); | 253 | atomic_inc(&rq->rd->rto_count); |
| 252 | } | 254 | } |
| 253 | 255 | ||
| @@ -1169,13 +1171,10 @@ static void yield_task_rt(struct rq *rq) | |||
| 1169 | static int find_lowest_rq(struct task_struct *task); | 1171 | static int find_lowest_rq(struct task_struct *task); |
| 1170 | 1172 | ||
| 1171 | static int | 1173 | static int |
| 1172 | select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | 1174 | select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) |
| 1173 | { | 1175 | { |
| 1174 | struct task_struct *curr; | 1176 | struct task_struct *curr; |
| 1175 | struct rq *rq; | 1177 | struct rq *rq; |
| 1176 | int cpu; | ||
| 1177 | |||
| 1178 | cpu = task_cpu(p); | ||
| 1179 | 1178 | ||
| 1180 | if (p->nr_cpus_allowed == 1) | 1179 | if (p->nr_cpus_allowed == 1) |
| 1181 | goto out; | 1180 | goto out; |
| @@ -1213,8 +1212,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
| 1213 | */ | 1212 | */ |
| 1214 | if (curr && unlikely(rt_task(curr)) && | 1213 | if (curr && unlikely(rt_task(curr)) && |
| 1215 | (curr->nr_cpus_allowed < 2 || | 1214 | (curr->nr_cpus_allowed < 2 || |
| 1216 | curr->prio <= p->prio) && | 1215 | curr->prio <= p->prio)) { |
| 1217 | (p->nr_cpus_allowed > 1)) { | ||
| 1218 | int target = find_lowest_rq(p); | 1216 | int target = find_lowest_rq(p); |
| 1219 | 1217 | ||
| 1220 | if (target != -1) | 1218 | if (target != -1) |
| @@ -1630,6 +1628,12 @@ static int pull_rt_task(struct rq *this_rq) | |||
| 1630 | if (likely(!rt_overloaded(this_rq))) | 1628 | if (likely(!rt_overloaded(this_rq))) |
| 1631 | return 0; | 1629 | return 0; |
| 1632 | 1630 | ||
| 1631 | /* | ||
| 1632 | * Match the barrier from rt_set_overload(); this guarantees that if we | ||
| 1633 | * see overloaded we must also see the rto_mask bit. | ||
| 1634 | */ | ||
| 1635 | smp_rmb(); | ||
| 1636 | |||
| 1633 | for_each_cpu(cpu, this_rq->rd->rto_mask) { | 1637 | for_each_cpu(cpu, this_rq->rd->rto_mask) { |
| 1634 | if (this_cpu == cpu) | 1638 | if (this_cpu == cpu) |
| 1635 | continue; | 1639 | continue; |
| @@ -1931,8 +1935,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
| 1931 | p->rt.time_slice = sched_rr_timeslice; | 1935 | p->rt.time_slice = sched_rr_timeslice; |
| 1932 | 1936 | ||
| 1933 | /* | 1937 | /* |
| 1934 | * Requeue to the end of queue if we (and all of our ancestors) are the | 1938 | * Requeue to the end of queue if we (and all of our ancestors) are not |
| 1935 | * only element on the queue | 1939 | * the only element on the queue |
| 1936 | */ | 1940 | */ |
| 1937 | for_each_sched_rt_entity(rt_se) { | 1941 | for_each_sched_rt_entity(rt_se) { |
| 1938 | if (rt_se->run_list.prev != rt_se->run_list.next) { | 1942 | if (rt_se->run_list.prev != rt_se->run_list.next) { |
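The smp_wmb()/smp_rmb() pair added to rt.c above follows the classic publish/consume ordering: the writer updates rto_mask, issues a write barrier, then bumps rto_count; the reader checks rto_count, issues a read barrier, then walks rto_mask, so a non-zero count guarantees the mask bit is visible. The sketch below models the same pairing with C11 atomics in userspace; it is an analogy, not the kernel primitives, and all names are invented for the example.

/* Userspace analogy of the rt_set_overload()/pull_rt_task() barrier pair,
 * written with C11 atomics. Names are invented for the example. */
#include <stdatomic.h>
#include <stdbool.h>

static _Atomic unsigned long rto_mask;   /* stands in for rd->rto_mask */
static atomic_int rto_count;             /* stands in for rd->rto_count */

static void publish_overload(int cpu)
{
	atomic_fetch_or_explicit(&rto_mask, 1UL << cpu, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);	/* smp_wmb() */
	atomic_fetch_add_explicit(&rto_count, 1, memory_order_relaxed);
}

static bool consume_overload(int cpu)
{
	if (!atomic_load_explicit(&rto_count, memory_order_relaxed))
		return false;
	atomic_thread_fence(memory_order_acquire);	/* smp_rmb() */
	return atomic_load_explicit(&rto_mask, memory_order_relaxed) & (1UL << cpu);
}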
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ef0a7b2439dd..88c85b21d633 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <linux/spinlock.h> | 6 | #include <linux/spinlock.h> |
| 7 | #include <linux/stop_machine.h> | 7 | #include <linux/stop_machine.h> |
| 8 | #include <linux/tick.h> | 8 | #include <linux/tick.h> |
| 9 | #include <linux/slab.h> | ||
| 9 | 10 | ||
| 10 | #include "cpupri.h" | 11 | #include "cpupri.h" |
| 11 | #include "cpuacct.h" | 12 | #include "cpuacct.h" |
| @@ -285,7 +286,6 @@ struct cfs_rq { | |||
| 285 | /* Required to track per-cpu representation of a task_group */ | 286 | /* Required to track per-cpu representation of a task_group */ |
| 286 | u32 tg_runnable_contrib; | 287 | u32 tg_runnable_contrib; |
| 287 | unsigned long tg_load_contrib; | 288 | unsigned long tg_load_contrib; |
| 288 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 289 | 289 | ||
| 290 | /* | 290 | /* |
| 291 | * h_load = weight * f(tg) | 291 | * h_load = weight * f(tg) |
| @@ -294,6 +294,9 @@ struct cfs_rq { | |||
| 294 | * this group. | 294 | * this group. |
| 295 | */ | 295 | */ |
| 296 | unsigned long h_load; | 296 | unsigned long h_load; |
| 297 | u64 last_h_load_update; | ||
| 298 | struct sched_entity *h_load_next; | ||
| 299 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
| 297 | #endif /* CONFIG_SMP */ | 300 | #endif /* CONFIG_SMP */ |
| 298 | 301 | ||
| 299 | #ifdef CONFIG_FAIR_GROUP_SCHED | 302 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| @@ -406,6 +409,10 @@ struct rq { | |||
| 406 | * remote CPUs use both these fields when doing load calculation. | 409 | * remote CPUs use both these fields when doing load calculation. |
| 407 | */ | 410 | */ |
| 408 | unsigned int nr_running; | 411 | unsigned int nr_running; |
| 412 | #ifdef CONFIG_NUMA_BALANCING | ||
| 413 | unsigned int nr_numa_running; | ||
| 414 | unsigned int nr_preferred_running; | ||
| 415 | #endif | ||
| 409 | #define CPU_LOAD_IDX_MAX 5 | 416 | #define CPU_LOAD_IDX_MAX 5 |
| 410 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 417 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
| 411 | unsigned long last_load_update_tick; | 418 | unsigned long last_load_update_tick; |
| @@ -429,9 +436,6 @@ struct rq { | |||
| 429 | #ifdef CONFIG_FAIR_GROUP_SCHED | 436 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 430 | /* list of leaf cfs_rq on this cpu: */ | 437 | /* list of leaf cfs_rq on this cpu: */ |
| 431 | struct list_head leaf_cfs_rq_list; | 438 | struct list_head leaf_cfs_rq_list; |
| 432 | #ifdef CONFIG_SMP | ||
| 433 | unsigned long h_load_throttle; | ||
| 434 | #endif /* CONFIG_SMP */ | ||
| 435 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 439 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 436 | 440 | ||
| 437 | #ifdef CONFIG_RT_GROUP_SCHED | 441 | #ifdef CONFIG_RT_GROUP_SCHED |
| @@ -477,6 +481,9 @@ struct rq { | |||
| 477 | u64 age_stamp; | 481 | u64 age_stamp; |
| 478 | u64 idle_stamp; | 482 | u64 idle_stamp; |
| 479 | u64 avg_idle; | 483 | u64 avg_idle; |
| 484 | |||
| 485 | /* This is used to determine avg_idle's max value */ | ||
| 486 | u64 max_idle_balance_cost; | ||
| 480 | #endif | 487 | #endif |
| 481 | 488 | ||
| 482 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 489 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
| @@ -553,6 +560,12 @@ static inline u64 rq_clock_task(struct rq *rq) | |||
| 553 | return rq->clock_task; | 560 | return rq->clock_task; |
| 554 | } | 561 | } |
| 555 | 562 | ||
| 563 | #ifdef CONFIG_NUMA_BALANCING | ||
| 564 | extern void sched_setnuma(struct task_struct *p, int node); | ||
| 565 | extern int migrate_task_to(struct task_struct *p, int cpu); | ||
| 566 | extern int migrate_swap(struct task_struct *, struct task_struct *); | ||
| 567 | #endif /* CONFIG_NUMA_BALANCING */ | ||
| 568 | |||
| 556 | #ifdef CONFIG_SMP | 569 | #ifdef CONFIG_SMP |
| 557 | 570 | ||
| 558 | #define rcu_dereference_check_sched_domain(p) \ | 571 | #define rcu_dereference_check_sched_domain(p) \ |
| @@ -594,8 +607,24 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | |||
| 594 | return hsd; | 607 | return hsd; |
| 595 | } | 608 | } |
| 596 | 609 | ||
| 610 | static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | ||
| 611 | { | ||
| 612 | struct sched_domain *sd; | ||
| 613 | |||
| 614 | for_each_domain(cpu, sd) { | ||
| 615 | if (sd->flags & flag) | ||
| 616 | break; | ||
| 617 | } | ||
| 618 | |||
| 619 | return sd; | ||
| 620 | } | ||
| 621 | |||
| 597 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 622 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
| 623 | DECLARE_PER_CPU(int, sd_llc_size); | ||
| 598 | DECLARE_PER_CPU(int, sd_llc_id); | 624 | DECLARE_PER_CPU(int, sd_llc_id); |
| 625 | DECLARE_PER_CPU(struct sched_domain *, sd_numa); | ||
| 626 | DECLARE_PER_CPU(struct sched_domain *, sd_busy); | ||
| 627 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); | ||
| 599 | 628 | ||
| 600 | struct sched_group_power { | 629 | struct sched_group_power { |
| 601 | atomic_t ref; | 630 | atomic_t ref; |
| @@ -605,6 +634,7 @@ struct sched_group_power { | |||
| 605 | */ | 634 | */ |
| 606 | unsigned int power, power_orig; | 635 | unsigned int power, power_orig; |
| 607 | unsigned long next_update; | 636 | unsigned long next_update; |
| 637 | int imbalance; /* XXX unrelated to power but shared group state */ | ||
| 608 | /* | 638 | /* |
| 609 | * Number of busy cpus in this group. | 639 | * Number of busy cpus in this group. |
| 610 | */ | 640 | */ |
| @@ -665,9 +695,9 @@ extern int group_balance_cpu(struct sched_group *sg); | |||
| 665 | /* | 695 | /* |
| 666 | * Return the group to which this task belongs. | 696 | * Return the group to which this task belongs. |
| 667 | * | 697 | * |
| 668 | * We cannot use task_subsys_state() and friends because the cgroup | 698 | * We cannot use task_css() and friends because the cgroup subsystem |
| 669 | * subsystem changes that value before the cgroup_subsys::attach() method | 699 | * changes that value before the cgroup_subsys::attach() method is called, |
| 670 | * is called, therefore we cannot pin it and might observe the wrong value. | 700 | * therefore we cannot pin it and might observe the wrong value. |
| 671 | * | 701 | * |
| 672 | * The same is true for autogroup's p->signal->autogroup->tg, the autogroup | 702 | * The same is true for autogroup's p->signal->autogroup->tg, the autogroup |
| 673 | * core changes this before calling sched_move_task(). | 703 | * core changes this before calling sched_move_task(). |
| @@ -719,6 +749,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
| 719 | */ | 749 | */ |
| 720 | smp_wmb(); | 750 | smp_wmb(); |
| 721 | task_thread_info(p)->cpu = cpu; | 751 | task_thread_info(p)->cpu = cpu; |
| 752 | p->wake_cpu = cpu; | ||
| 722 | #endif | 753 | #endif |
| 723 | } | 754 | } |
| 724 | 755 | ||
| @@ -974,7 +1005,7 @@ struct sched_class { | |||
| 974 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); | 1005 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); |
| 975 | 1006 | ||
| 976 | #ifdef CONFIG_SMP | 1007 | #ifdef CONFIG_SMP |
| 977 | int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); | 1008 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); |
| 978 | void (*migrate_task_rq)(struct task_struct *p, int next_cpu); | 1009 | void (*migrate_task_rq)(struct task_struct *p, int next_cpu); |
| 979 | 1010 | ||
| 980 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); | 1011 | void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); |
| @@ -1220,6 +1251,24 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | |||
| 1220 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | 1251 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); |
| 1221 | } | 1252 | } |
| 1222 | 1253 | ||
| 1254 | static inline void double_lock(spinlock_t *l1, spinlock_t *l2) | ||
| 1255 | { | ||
| 1256 | if (l1 > l2) | ||
| 1257 | swap(l1, l2); | ||
| 1258 | |||
| 1259 | spin_lock(l1); | ||
| 1260 | spin_lock_nested(l2, SINGLE_DEPTH_NESTING); | ||
| 1261 | } | ||
| 1262 | |||
| 1263 | static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) | ||
| 1264 | { | ||
| 1265 | if (l1 > l2) | ||
| 1266 | swap(l1, l2); | ||
| 1267 | |||
| 1268 | raw_spin_lock(l1); | ||
| 1269 | raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING); | ||
| 1270 | } | ||
| 1271 | |||
| 1223 | /* | 1272 | /* |
| 1224 | * double_rq_lock - safely lock two runqueues | 1273 | * double_rq_lock - safely lock two runqueues |
| 1225 | * | 1274 | * |
| @@ -1305,7 +1354,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu); | |||
| 1305 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1354 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); |
| 1306 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1355 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); |
| 1307 | 1356 | ||
| 1308 | extern void account_cfs_bandwidth_used(int enabled, int was_enabled); | 1357 | extern void cfs_bandwidth_usage_inc(void); |
| 1358 | extern void cfs_bandwidth_usage_dec(void); | ||
| 1309 | 1359 | ||
| 1310 | #ifdef CONFIG_NO_HZ_COMMON | 1360 | #ifdef CONFIG_NO_HZ_COMMON |
| 1311 | enum rq_nohz_flag_bits { | 1361 | enum rq_nohz_flag_bits { |
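The double_lock()/double_raw_lock() helpers added to sched.h above avoid ABBA deadlock the same way double_rq_lock() does for runqueues: every caller takes the lower-addressed lock first, so two tasks can never end up waiting on each other's second lock. A minimal userspace sketch of that ordering rule, using pthread mutexes purely as an analogy, not kernel code:

/* Address-ordered lock acquisition (pthread analogy, not kernel code). */
#include <pthread.h>

static void lock_pair(pthread_mutex_t *l1, pthread_mutex_t *l2)
{
	if (l1 > l2) {				/* normalise: lower address first */
		pthread_mutex_t *tmp = l1;
		l1 = l2;
		l2 = tmp;
	}
	pthread_mutex_lock(l1);
	pthread_mutex_lock(l2);
}

static void unlock_pair(pthread_mutex_t *l1, pthread_mutex_t *l2)
{
	pthread_mutex_unlock(l2);		/* release order is irrelevant */
	pthread_mutex_unlock(l1);
}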
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 5aef494fc8b4..4ab704339656 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
| @@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) | |||
| 59 | * from dequeue_task() to account for possible rq->clock skew across cpus. The | 59 | * from dequeue_task() to account for possible rq->clock skew across cpus. The |
| 60 | * delta taken on each cpu would annul the skew. | 60 | * delta taken on each cpu would annul the skew. |
| 61 | */ | 61 | */ |
| 62 | static inline void sched_info_dequeued(struct task_struct *t) | 62 | static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) |
| 63 | { | 63 | { |
| 64 | unsigned long long now = rq_clock(task_rq(t)), delta = 0; | 64 | unsigned long long now = rq_clock(rq), delta = 0; |
| 65 | 65 | ||
| 66 | if (unlikely(sched_info_on())) | 66 | if (unlikely(sched_info_on())) |
| 67 | if (t->sched_info.last_queued) | 67 | if (t->sched_info.last_queued) |
| @@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t) | |||
| 69 | sched_info_reset_dequeued(t); | 69 | sched_info_reset_dequeued(t); |
| 70 | t->sched_info.run_delay += delta; | 70 | t->sched_info.run_delay += delta; |
| 71 | 71 | ||
| 72 | rq_sched_info_dequeued(task_rq(t), delta); | 72 | rq_sched_info_dequeued(rq, delta); |
| 73 | } | 73 | } |
| 74 | 74 | ||
| 75 | /* | 75 | /* |
| @@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t) | |||
| 77 | * long it was waiting to run. We also note when it began so that we | 77 | * long it was waiting to run. We also note when it began so that we |
| 78 | * can keep stats on how long its timeslice is. | 78 | * can keep stats on how long its timeslice is. |
| 79 | */ | 79 | */ |
| 80 | static void sched_info_arrive(struct task_struct *t) | 80 | static void sched_info_arrive(struct rq *rq, struct task_struct *t) |
| 81 | { | 81 | { |
| 82 | unsigned long long now = rq_clock(task_rq(t)), delta = 0; | 82 | unsigned long long now = rq_clock(rq), delta = 0; |
| 83 | 83 | ||
| 84 | if (t->sched_info.last_queued) | 84 | if (t->sched_info.last_queued) |
| 85 | delta = now - t->sched_info.last_queued; | 85 | delta = now - t->sched_info.last_queued; |
| @@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t) | |||
| 88 | t->sched_info.last_arrival = now; | 88 | t->sched_info.last_arrival = now; |
| 89 | t->sched_info.pcount++; | 89 | t->sched_info.pcount++; |
| 90 | 90 | ||
| 91 | rq_sched_info_arrive(task_rq(t), delta); | 91 | rq_sched_info_arrive(rq, delta); |
| 92 | } | 92 | } |
| 93 | 93 | ||
| 94 | /* | 94 | /* |
| @@ -96,29 +96,30 @@ static void sched_info_arrive(struct task_struct *t) | |||
| 96 | * the timestamp if it is already not set. It's assumed that | 96 | * the timestamp if it is already not set. It's assumed that |
| 97 | * sched_info_dequeued() will clear that stamp when appropriate. | 97 | * sched_info_dequeued() will clear that stamp when appropriate. |
| 98 | */ | 98 | */ |
| 99 | static inline void sched_info_queued(struct task_struct *t) | 99 | static inline void sched_info_queued(struct rq *rq, struct task_struct *t) |
| 100 | { | 100 | { |
| 101 | if (unlikely(sched_info_on())) | 101 | if (unlikely(sched_info_on())) |
| 102 | if (!t->sched_info.last_queued) | 102 | if (!t->sched_info.last_queued) |
| 103 | t->sched_info.last_queued = rq_clock(task_rq(t)); | 103 | t->sched_info.last_queued = rq_clock(rq); |
| 104 | } | 104 | } |
| 105 | 105 | ||
| 106 | /* | 106 | /* |
| 107 | * Called when a process ceases being the active-running process, either | 107 | * Called when a process ceases being the active-running process involuntarily |
| 108 | * voluntarily or involuntarily. Now we can calculate how long we ran. | 108 | * due, typically, to expiring its time slice (this may also be called when |
| 109 | * switching to the idle task). Now we can calculate how long we ran. | ||
| 109 | * Also, if the process is still in the TASK_RUNNING state, call | 110 | * Also, if the process is still in the TASK_RUNNING state, call |
| 110 | * sched_info_queued() to mark that it has now again started waiting on | 111 | * sched_info_queued() to mark that it has now again started waiting on |
| 111 | * the runqueue. | 112 | * the runqueue. |
| 112 | */ | 113 | */ |
| 113 | static inline void sched_info_depart(struct task_struct *t) | 114 | static inline void sched_info_depart(struct rq *rq, struct task_struct *t) |
| 114 | { | 115 | { |
| 115 | unsigned long long delta = rq_clock(task_rq(t)) - | 116 | unsigned long long delta = rq_clock(rq) - |
| 116 | t->sched_info.last_arrival; | 117 | t->sched_info.last_arrival; |
| 117 | 118 | ||
| 118 | rq_sched_info_depart(task_rq(t), delta); | 119 | rq_sched_info_depart(rq, delta); |
| 119 | 120 | ||
| 120 | if (t->state == TASK_RUNNING) | 121 | if (t->state == TASK_RUNNING) |
| 121 | sched_info_queued(t); | 122 | sched_info_queued(rq, t); |
| 122 | } | 123 | } |
| 123 | 124 | ||
| 124 | /* | 125 | /* |
| @@ -127,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t) | |||
| 127 | * the idle task.) We are only called when prev != next. | 128 | * the idle task.) We are only called when prev != next. |
| 128 | */ | 129 | */ |
| 129 | static inline void | 130 | static inline void |
| 130 | __sched_info_switch(struct task_struct *prev, struct task_struct *next) | 131 | __sched_info_switch(struct rq *rq, |
| 132 | struct task_struct *prev, struct task_struct *next) | ||
| 131 | { | 133 | { |
| 132 | struct rq *rq = task_rq(prev); | ||
| 133 | |||
| 134 | /* | 134 | /* |
| 135 | * prev now departs the cpu. It's not interesting to record | 135 | * prev now departs the cpu. It's not interesting to record |
| 136 | * stats about how efficient we were at scheduling the idle | 136 | * stats about how efficient we were at scheduling the idle |
| 137 | * process, however. | 137 | * process, however. |
| 138 | */ | 138 | */ |
| 139 | if (prev != rq->idle) | 139 | if (prev != rq->idle) |
| 140 | sched_info_depart(prev); | 140 | sched_info_depart(rq, prev); |
| 141 | 141 | ||
| 142 | if (next != rq->idle) | 142 | if (next != rq->idle) |
| 143 | sched_info_arrive(next); | 143 | sched_info_arrive(rq, next); |
| 144 | } | 144 | } |
| 145 | static inline void | 145 | static inline void |
| 146 | sched_info_switch(struct task_struct *prev, struct task_struct *next) | 146 | sched_info_switch(struct rq *rq, |
| 147 | struct task_struct *prev, struct task_struct *next) | ||
| 147 | { | 148 | { |
| 148 | if (unlikely(sched_info_on())) | 149 | if (unlikely(sched_info_on())) |
| 149 | __sched_info_switch(prev, next); | 150 | __sched_info_switch(rq, prev, next); |
| 150 | } | 151 | } |
| 151 | #else | 152 | #else |
| 152 | #define sched_info_queued(t) do { } while (0) | 153 | #define sched_info_queued(rq, t) do { } while (0) |
| 153 | #define sched_info_reset_dequeued(t) do { } while (0) | 154 | #define sched_info_reset_dequeued(t) do { } while (0) |
| 154 | #define sched_info_dequeued(t) do { } while (0) | 155 | #define sched_info_dequeued(rq, t) do { } while (0) |
| 155 | #define sched_info_switch(t, next) do { } while (0) | 156 | #define sched_info_depart(rq, t) do { } while (0) |
| 157 | #define sched_info_arrive(rq, next) do { } while (0) | ||
| 158 | #define sched_info_switch(rq, t, next) do { } while (0) | ||
| 156 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ | 159 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ |
| 157 | 160 | ||
| 158 | /* | 161 | /* |
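All the sched_info helpers reshuffled above compute the same kind of delta: the current rq clock minus a stamp taken when the task last became runnable or last got the CPU, folded into run_delay or handed to the rq statistics. A standalone model of that bookkeeping, with every name invented for the example:

/* Standalone model of the queue-delay bookkeeping (invented names). */
#include <stdint.h>

struct demo_sched_info {
	uint64_t last_queued;	/* ns stamp: task became runnable */
	uint64_t last_arrival;	/* ns stamp: task last got a CPU */
	uint64_t run_delay;	/* total ns spent waiting on a runqueue */
};

static void demo_arrive(struct demo_sched_info *si, uint64_t now_ns)
{
	if (si->last_queued) {
		si->run_delay += now_ns - si->last_queued;
		si->last_queued = 0;
	}
	si->last_arrival = now_ns;
}

static uint64_t demo_depart(struct demo_sched_info *si, uint64_t now_ns,
			    int still_runnable)
{
	uint64_t ran_ns = now_ns - si->last_arrival;	/* time spent on CPU */

	if (still_runnable && !si->last_queued)
		si->last_queued = now_ns;		/* waiting again */
	return ran_ns;
}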
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index e08fbeeb54b9..47197de8abd9 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
| @@ -11,7 +11,7 @@ | |||
| 11 | 11 | ||
| 12 | #ifdef CONFIG_SMP | 12 | #ifdef CONFIG_SMP |
| 13 | static int | 13 | static int |
| 14 | select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) | 14 | select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) |
| 15 | { | 15 | { |
| 16 | return task_cpu(p); /* stop tasks never migrate */ | 16 | return task_cpu(p); /* stop tasks never migrate */ |
| 17 | } | 17 | } |
diff --git a/kernel/wait.c b/kernel/sched/wait.c index dec68bd4e9d8..7d50f794e248 100644 --- a/kernel/wait.c +++ b/kernel/sched/wait.c | |||
| @@ -53,6 +53,109 @@ EXPORT_SYMBOL(remove_wait_queue); | |||
| 53 | 53 | ||
| 54 | 54 | ||
| 55 | /* | 55 | /* |
| 56 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just | ||
| 57 | * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve | ||
| 58 | * number) then we wake all the non-exclusive tasks and one exclusive task. | ||
| 59 | * | ||
| 60 | * There are circumstances in which we can try to wake a task which has already | ||
| 61 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns | ||
| 62 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | ||
| 63 | */ | ||
| 64 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | ||
| 65 | int nr_exclusive, int wake_flags, void *key) | ||
| 66 | { | ||
| 67 | wait_queue_t *curr, *next; | ||
| 68 | |||
| 69 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { | ||
| 70 | unsigned flags = curr->flags; | ||
| 71 | |||
| 72 | if (curr->func(curr, mode, wake_flags, key) && | ||
| 73 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | ||
| 74 | break; | ||
| 75 | } | ||
| 76 | } | ||
| 77 | |||
| 78 | /** | ||
| 79 | * __wake_up - wake up threads blocked on a waitqueue. | ||
| 80 | * @q: the waitqueue | ||
| 81 | * @mode: which threads | ||
| 82 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | ||
| 83 | * @key: is directly passed to the wakeup function | ||
| 84 | * | ||
| 85 | * It may be assumed that this function implies a write memory barrier before | ||
| 86 | * changing the task state if and only if any tasks are woken up. | ||
| 87 | */ | ||
| 88 | void __wake_up(wait_queue_head_t *q, unsigned int mode, | ||
| 89 | int nr_exclusive, void *key) | ||
| 90 | { | ||
| 91 | unsigned long flags; | ||
| 92 | |||
| 93 | spin_lock_irqsave(&q->lock, flags); | ||
| 94 | __wake_up_common(q, mode, nr_exclusive, 0, key); | ||
| 95 | spin_unlock_irqrestore(&q->lock, flags); | ||
| 96 | } | ||
| 97 | EXPORT_SYMBOL(__wake_up); | ||
| 98 | |||
| 99 | /* | ||
| 100 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. | ||
| 101 | */ | ||
| 102 | void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) | ||
| 103 | { | ||
| 104 | __wake_up_common(q, mode, nr, 0, NULL); | ||
| 105 | } | ||
| 106 | EXPORT_SYMBOL_GPL(__wake_up_locked); | ||
| 107 | |||
| 108 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) | ||
| 109 | { | ||
| 110 | __wake_up_common(q, mode, 1, 0, key); | ||
| 111 | } | ||
| 112 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); | ||
| 113 | |||
| 114 | /** | ||
| 115 | * __wake_up_sync_key - wake up threads blocked on a waitqueue. | ||
| 116 | * @q: the waitqueue | ||
| 117 | * @mode: which threads | ||
| 118 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | ||
| 119 | * @key: opaque value to be passed to wakeup targets | ||
| 120 | * | ||
| 121 | * The sync wakeup differs in that the waker knows that it will schedule | ||
| 122 | * away soon, so while the target thread will be woken up, it will not | ||
| 123 | * be migrated to another CPU - ie. the two threads are 'synchronized' | ||
| 124 | * with each other. This can prevent needless bouncing between CPUs. | ||
| 125 | * | ||
| 126 | * On UP it can prevent extra preemption. | ||
| 127 | * | ||
| 128 | * It may be assumed that this function implies a write memory barrier before | ||
| 129 | * changing the task state if and only if any tasks are woken up. | ||
| 130 | */ | ||
| 131 | void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | ||
| 132 | int nr_exclusive, void *key) | ||
| 133 | { | ||
| 134 | unsigned long flags; | ||
| 135 | int wake_flags = 1; /* XXX WF_SYNC */ | ||
| 136 | |||
| 137 | if (unlikely(!q)) | ||
| 138 | return; | ||
| 139 | |||
| 140 | if (unlikely(nr_exclusive != 1)) | ||
| 141 | wake_flags = 0; | ||
| 142 | |||
| 143 | spin_lock_irqsave(&q->lock, flags); | ||
| 144 | __wake_up_common(q, mode, nr_exclusive, wake_flags, key); | ||
| 145 | spin_unlock_irqrestore(&q->lock, flags); | ||
| 146 | } | ||
| 147 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); | ||
| 148 | |||
| 149 | /* | ||
| 150 | * __wake_up_sync - see __wake_up_sync_key() | ||
| 151 | */ | ||
| 152 | void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | ||
| 153 | { | ||
| 154 | __wake_up_sync_key(q, mode, nr_exclusive, NULL); | ||
| 155 | } | ||
| 156 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | ||
| 157 | |||
| 158 | /* | ||
| 56 | * Note: we use "set_current_state()" _after_ the wait-queue add, | 159 | * Note: we use "set_current_state()" _after_ the wait-queue add, |
| 57 | * because we need a memory barrier there on SMP, so that any | 160 | * because we need a memory barrier there on SMP, so that any |
| 58 | * wake-function that tests for the wait-queue being active | 161 | * wake-function that tests for the wait-queue being active |
| @@ -92,6 +195,30 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) | |||
| 92 | } | 195 | } |
| 93 | EXPORT_SYMBOL(prepare_to_wait_exclusive); | 196 | EXPORT_SYMBOL(prepare_to_wait_exclusive); |
| 94 | 197 | ||
| 198 | long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) | ||
| 199 | { | ||
| 200 | unsigned long flags; | ||
| 201 | |||
| 202 | if (signal_pending_state(state, current)) | ||
| 203 | return -ERESTARTSYS; | ||
| 204 | |||
| 205 | wait->private = current; | ||
| 206 | wait->func = autoremove_wake_function; | ||
| 207 | |||
| 208 | spin_lock_irqsave(&q->lock, flags); | ||
| 209 | if (list_empty(&wait->task_list)) { | ||
| 210 | if (wait->flags & WQ_FLAG_EXCLUSIVE) | ||
| 211 | __add_wait_queue_tail(q, wait); | ||
| 212 | else | ||
| 213 | __add_wait_queue(q, wait); | ||
| 214 | } | ||
| 215 | set_current_state(state); | ||
| 216 | spin_unlock_irqrestore(&q->lock, flags); | ||
| 217 | |||
| 218 | return 0; | ||
| 219 | } | ||
| 220 | EXPORT_SYMBOL(prepare_to_wait_event); | ||
| 221 | |||
| 95 | /** | 222 | /** |
| 96 | * finish_wait - clean up after waiting in a queue | 223 | * finish_wait - clean up after waiting in a queue |
| 97 | * @q: waitqueue waited on | 224 | * @q: waitqueue waited on |
| @@ -363,8 +490,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); | |||
| 363 | 490 | ||
| 364 | /** | 491 | /** |
| 365 | * wake_up_atomic_t - Wake up a waiter on an atomic_t | 492 | * wake_up_atomic_t - Wake up a waiter on an atomic_t |
| 366 | * @word: The word being waited on, a kernel virtual address | 493 | * @p: The atomic_t being waited on, a kernel virtual address |
| 367 | * @bit: The bit of the word being waited on | ||
| 368 | * | 494 | * |
| 369 | * Wake up anyone waiting for the atomic_t to go to zero. | 495 | * Wake up anyone waiting for the atomic_t to go to zero. |
| 370 | * | 496 | * |
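prepare_to_wait_event(), added above, is the piece the wait_event*() machinery loops around: queue ourselves, set the task state, re-check the condition, sleep, and let __wake_up() on the other side stop after one exclusive waiter. The sketch below shows the caller-side loop in simplified form; it is an approximation of what the wait_event macros expand to, not code from this file.

/* Simplified caller-side loop around prepare_to_wait_event(). */
static int wait_for_condition(wait_queue_head_t *wq, bool (*cond)(void))
{
	DEFINE_WAIT(wait);
	long ret = 0;

	for (;;) {
		ret = prepare_to_wait_event(wq, &wait, TASK_INTERRUPTIBLE);
		if (cond())
			break;		/* condition satisfied, stop waiting */
		if (ret)
			break;		/* -ERESTARTSYS: a signal is pending */
		schedule();		/* sleep until __wake_up() runs our entry */
	}
	finish_wait(wq, &wait);		/* dequeue and restore TASK_RUNNING */
	return cond() ? 0 : ret;
}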
diff --git a/kernel/signal.c b/kernel/signal.c index 50e41075ac77..ded28b91fa53 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -3394,7 +3394,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, | |||
| 3394 | new_ka.sa.sa_restorer = compat_ptr(restorer); | 3394 | new_ka.sa.sa_restorer = compat_ptr(restorer); |
| 3395 | #endif | 3395 | #endif |
| 3396 | ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); | 3396 | ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); |
| 3397 | ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); | 3397 | ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); |
| 3398 | if (ret) | 3398 | if (ret) |
| 3399 | return -EFAULT; | 3399 | return -EFAULT; |
| 3400 | sigset_from_compat(&new_ka.sa.sa_mask, &mask); | 3400 | sigset_from_compat(&new_ka.sa.sa_mask, &mask); |
| @@ -3406,7 +3406,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, | |||
| 3406 | ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), | 3406 | ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), |
| 3407 | &oact->sa_handler); | 3407 | &oact->sa_handler); |
| 3408 | ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); | 3408 | ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); |
| 3409 | ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); | 3409 | ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); |
| 3410 | #ifdef __ARCH_HAS_SA_RESTORER | 3410 | #ifdef __ARCH_HAS_SA_RESTORER |
| 3411 | ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), | 3411 | ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), |
| 3412 | &oact->sa_restorer); | 3412 | &oact->sa_restorer); |
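The switch from __get_user()/__put_user() to get_user()/put_user() above matters because the plain accessors validate the user address range themselves, while the double-underscore variants assume an earlier access_ok() already did. An illustrative contrast, with both functions invented for the example:

/* Illustrative only: checked vs. unchecked user accessors (invented names). */
static int read_word_checked(const unsigned long __user *uptr, unsigned long *out)
{
	return get_user(*out, uptr);		/* does its own range check */
}

static int read_word_unchecked(const unsigned long __user *uptr, unsigned long *out)
{
	if (!access_ok(VERIFY_READ, uptr, sizeof(*uptr)))
		return -EFAULT;			/* caller supplies the range check */
	return __get_user(*out, uptr);
}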
diff --git a/kernel/smp.c b/kernel/smp.c index fe9f773d7114..f5768b0c816a 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -48,10 +48,13 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 48 | cpu_to_node(cpu))) | 48 | cpu_to_node(cpu))) |
| 49 | return notifier_from_errno(-ENOMEM); | 49 | return notifier_from_errno(-ENOMEM); |
| 50 | if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, | 50 | if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, |
| 51 | cpu_to_node(cpu))) | 51 | cpu_to_node(cpu))) { |
| 52 | free_cpumask_var(cfd->cpumask); | ||
| 52 | return notifier_from_errno(-ENOMEM); | 53 | return notifier_from_errno(-ENOMEM); |
| 54 | } | ||
| 53 | cfd->csd = alloc_percpu(struct call_single_data); | 55 | cfd->csd = alloc_percpu(struct call_single_data); |
| 54 | if (!cfd->csd) { | 56 | if (!cfd->csd) { |
| 57 | free_cpumask_var(cfd->cpumask_ipi); | ||
| 55 | free_cpumask_var(cfd->cpumask); | 58 | free_cpumask_var(cfd->cpumask); |
| 56 | return notifier_from_errno(-ENOMEM); | 59 | return notifier_from_errno(-ENOMEM); |
| 57 | } | 60 | } |
| @@ -186,25 +189,13 @@ void generic_smp_call_function_single_interrupt(void) | |||
| 186 | 189 | ||
| 187 | while (!list_empty(&list)) { | 190 | while (!list_empty(&list)) { |
| 188 | struct call_single_data *csd; | 191 | struct call_single_data *csd; |
| 189 | unsigned int csd_flags; | ||
| 190 | 192 | ||
| 191 | csd = list_entry(list.next, struct call_single_data, list); | 193 | csd = list_entry(list.next, struct call_single_data, list); |
| 192 | list_del(&csd->list); | 194 | list_del(&csd->list); |
| 193 | 195 | ||
| 194 | /* | ||
| 195 | * 'csd' can be invalid after this call if flags == 0 | ||
| 196 | * (when called through generic_exec_single()), | ||
| 197 | * so save them away before making the call: | ||
| 198 | */ | ||
| 199 | csd_flags = csd->flags; | ||
| 200 | |||
| 201 | csd->func(csd->info); | 196 | csd->func(csd->info); |
| 202 | 197 | ||
| 203 | /* | 198 | csd_unlock(csd); |
| 204 | * Unlocked CSDs are valid through generic_exec_single(): | ||
| 205 | */ | ||
| 206 | if (csd_flags & CSD_FLAG_LOCK) | ||
| 207 | csd_unlock(csd); | ||
| 208 | } | 199 | } |
| 209 | } | 200 | } |
| 210 | 201 | ||
| @@ -278,8 +269,6 @@ EXPORT_SYMBOL(smp_call_function_single); | |||
| 278 | * @wait: If true, wait until function has completed. | 269 | * @wait: If true, wait until function has completed. |
| 279 | * | 270 | * |
| 280 | * Returns 0 on success, else a negative status code (if no cpus were online). | 271 | * Returns 0 on success, else a negative status code (if no cpus were online). |
| 281 | * Note that @wait will be implicitly turned on in case of allocation failures, | ||
| 282 | * since we fall back to on-stack allocation. | ||
| 283 | * | 272 | * |
| 284 | * Selection preference: | 273 | * Selection preference: |
| 285 | * 1) current cpu if in @mask | 274 | * 1) current cpu if in @mask |
| @@ -535,6 +524,11 @@ void __init setup_nr_cpu_ids(void) | |||
| 535 | nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; | 524 | nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; |
| 536 | } | 525 | } |
| 537 | 526 | ||
| 527 | void __weak smp_announce(void) | ||
| 528 | { | ||
| 529 | printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus()); | ||
| 530 | } | ||
| 531 | |||
| 538 | /* Called by boot processor to activate the rest. */ | 532 | /* Called by boot processor to activate the rest. */ |
| 539 | void __init smp_init(void) | 533 | void __init smp_init(void) |
| 540 | { | 534 | { |
| @@ -551,7 +545,7 @@ void __init smp_init(void) | |||
| 551 | } | 545 | } |
| 552 | 546 | ||
| 553 | /* Any cleanup work */ | 547 | /* Any cleanup work */ |
| 554 | printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); | 548 | smp_announce(); |
| 555 | smp_cpus_done(setup_max_cpus); | 549 | smp_cpus_done(setup_max_cpus); |
| 556 | } | 550 | } |
| 557 | 551 | ||
| @@ -586,8 +580,10 @@ EXPORT_SYMBOL(on_each_cpu); | |||
| 586 | * | 580 | * |
| 587 | * If @wait is true, then returns once @func has returned. | 581 | * If @wait is true, then returns once @func has returned. |
| 588 | * | 582 | * |
| 589 | * You must not call this function with disabled interrupts or | 583 | * You must not call this function with disabled interrupts or from a |
| 590 | * from a hardware interrupt handler or from a bottom half handler. | 584 | * hardware interrupt handler or from a bottom half handler. The |
| 585 | * exception is that it may be used during early boot while | ||
| 586 | * early_boot_irqs_disabled is set. | ||
| 591 | */ | 587 | */ |
| 592 | void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, | 588 | void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, |
| 593 | void *info, bool wait) | 589 | void *info, bool wait) |
| @@ -596,9 +592,10 @@ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, | |||
| 596 | 592 | ||
| 597 | smp_call_function_many(mask, func, info, wait); | 593 | smp_call_function_many(mask, func, info, wait); |
| 598 | if (cpumask_test_cpu(cpu, mask)) { | 594 | if (cpumask_test_cpu(cpu, mask)) { |
| 599 | local_irq_disable(); | 595 | unsigned long flags; |
| 596 | local_irq_save(flags); | ||
| 600 | func(info); | 597 | func(info); |
| 601 | local_irq_enable(); | 598 | local_irq_restore(flags); |
| 602 | } | 599 | } |
| 603 | put_cpu(); | 600 | put_cpu(); |
| 604 | } | 601 | } |
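The on_each_cpu_mask() fix above replaces local_irq_disable()/enable() with local_irq_save()/restore() so the local invocation of the callback no longer turns interrupts back on behind a caller that entered with them off (for example during early boot while early_boot_irqs_disabled is set). For reference, a typical call looks like the sketch below; the callback and per-cpu counter are invented for the example.

/* Assumed usage pattern for on_each_cpu_mask() (invented callback/counter). */
static DEFINE_PER_CPU(unsigned long, demo_hits);

static void bump_counter(void *info)
{
	this_cpu_inc(demo_hits);	/* runs with IRQs off on each target CPU */
}

static void poke_cpus(const struct cpumask *mask)
{
	/* wait=true: return only once every CPU in @mask has run the callback */
	on_each_cpu_mask(mask, bump_counter, NULL, true);
}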
diff --git a/kernel/softirq.c b/kernel/softirq.c index be3d3514c325..b24988353458 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -29,7 +29,6 @@ | |||
| 29 | #define CREATE_TRACE_POINTS | 29 | #define CREATE_TRACE_POINTS |
| 30 | #include <trace/events/irq.h> | 30 | #include <trace/events/irq.h> |
| 31 | 31 | ||
| 32 | #include <asm/irq.h> | ||
| 33 | /* | 32 | /* |
| 34 | - No shared variables, all the data are CPU local. | 33 | - No shared variables, all the data are CPU local. |
| 35 | - If a softirq needs serialization, let it serialize itself | 34 | - If a softirq needs serialization, let it serialize itself |
| @@ -100,13 +99,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) | |||
| 100 | 99 | ||
| 101 | raw_local_irq_save(flags); | 100 | raw_local_irq_save(flags); |
| 102 | /* | 101 | /* |
| 103 | * The preempt tracer hooks into add_preempt_count and will break | 102 | * The preempt tracer hooks into preempt_count_add and will break |
| 104 | * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET | 103 | * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET |
| 105 | * is set and before current->softirq_enabled is cleared. | 104 | * is set and before current->softirq_enabled is cleared. |
| 106 | * We must manually increment preempt_count here and manually | 105 | * We must manually increment preempt_count here and manually |
| 107 | * call the trace_preempt_off later. | 106 | * call the trace_preempt_off later. |
| 108 | */ | 107 | */ |
| 109 | preempt_count() += cnt; | 108 | __preempt_count_add(cnt); |
| 110 | /* | 109 | /* |
| 111 | * Were softirqs turned off above: | 110 | * Were softirqs turned off above: |
| 112 | */ | 111 | */ |
| @@ -120,7 +119,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) | |||
| 120 | #else /* !CONFIG_TRACE_IRQFLAGS */ | 119 | #else /* !CONFIG_TRACE_IRQFLAGS */ |
| 121 | static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) | 120 | static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) |
| 122 | { | 121 | { |
| 123 | add_preempt_count(cnt); | 122 | preempt_count_add(cnt); |
| 124 | barrier(); | 123 | barrier(); |
| 125 | } | 124 | } |
| 126 | #endif /* CONFIG_TRACE_IRQFLAGS */ | 125 | #endif /* CONFIG_TRACE_IRQFLAGS */ |
| @@ -134,12 +133,11 @@ EXPORT_SYMBOL(local_bh_disable); | |||
| 134 | 133 | ||
| 135 | static void __local_bh_enable(unsigned int cnt) | 134 | static void __local_bh_enable(unsigned int cnt) |
| 136 | { | 135 | { |
| 137 | WARN_ON_ONCE(in_irq()); | ||
| 138 | WARN_ON_ONCE(!irqs_disabled()); | 136 | WARN_ON_ONCE(!irqs_disabled()); |
| 139 | 137 | ||
| 140 | if (softirq_count() == cnt) | 138 | if (softirq_count() == cnt) |
| 141 | trace_softirqs_on(_RET_IP_); | 139 | trace_softirqs_on(_RET_IP_); |
| 142 | sub_preempt_count(cnt); | 140 | preempt_count_sub(cnt); |
| 143 | } | 141 | } |
| 144 | 142 | ||
| 145 | /* | 143 | /* |
| @@ -149,6 +147,7 @@ static void __local_bh_enable(unsigned int cnt) | |||
| 149 | */ | 147 | */ |
| 150 | void _local_bh_enable(void) | 148 | void _local_bh_enable(void) |
| 151 | { | 149 | { |
| 150 | WARN_ON_ONCE(in_irq()); | ||
| 152 | __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); | 151 | __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); |
| 153 | } | 152 | } |
| 154 | 153 | ||
| @@ -169,12 +168,17 @@ static inline void _local_bh_enable_ip(unsigned long ip) | |||
| 169 | * Keep preemption disabled until we are done with | 168 | * Keep preemption disabled until we are done with |
| 170 | * softirq processing: | 169 | * softirq processing: |
| 171 | */ | 170 | */ |
| 172 | sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); | 171 | preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); |
| 173 | 172 | ||
| 174 | if (unlikely(!in_interrupt() && local_softirq_pending())) | 173 | if (unlikely(!in_interrupt() && local_softirq_pending())) { |
| 174 | /* | ||
| 175 | * Run softirq if any pending. And do it in its own stack | ||
| 176 | * as we may be calling this deep in a task call stack already. | ||
| 177 | */ | ||
| 175 | do_softirq(); | 178 | do_softirq(); |
| 179 | } | ||
| 176 | 180 | ||
| 177 | dec_preempt_count(); | 181 | preempt_count_dec(); |
| 178 | #ifdef CONFIG_TRACE_IRQFLAGS | 182 | #ifdef CONFIG_TRACE_IRQFLAGS |
| 179 | local_irq_enable(); | 183 | local_irq_enable(); |
| 180 | #endif | 184 | #endif |
| @@ -256,7 +260,7 @@ restart: | |||
| 256 | " exited with %08x?\n", vec_nr, | 260 | " exited with %08x?\n", vec_nr, |
| 257 | softirq_to_name[vec_nr], h->action, | 261 | softirq_to_name[vec_nr], h->action, |
| 258 | prev_count, preempt_count()); | 262 | prev_count, preempt_count()); |
| 259 | preempt_count() = prev_count; | 263 | preempt_count_set(prev_count); |
| 260 | } | 264 | } |
| 261 | 265 | ||
| 262 | rcu_bh_qs(cpu); | 266 | rcu_bh_qs(cpu); |
| @@ -280,10 +284,11 @@ restart: | |||
| 280 | 284 | ||
| 281 | account_irq_exit_time(current); | 285 | account_irq_exit_time(current); |
| 282 | __local_bh_enable(SOFTIRQ_OFFSET); | 286 | __local_bh_enable(SOFTIRQ_OFFSET); |
| 287 | WARN_ON_ONCE(in_interrupt()); | ||
| 283 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); | 288 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); |
| 284 | } | 289 | } |
| 285 | 290 | ||
| 286 | #ifndef __ARCH_HAS_DO_SOFTIRQ | 291 | |
| 287 | 292 | ||
| 288 | asmlinkage void do_softirq(void) | 293 | asmlinkage void do_softirq(void) |
| 289 | { | 294 | { |
| @@ -298,13 +303,11 @@ asmlinkage void do_softirq(void) | |||
| 298 | pending = local_softirq_pending(); | 303 | pending = local_softirq_pending(); |
| 299 | 304 | ||
| 300 | if (pending) | 305 | if (pending) |
| 301 | __do_softirq(); | 306 | do_softirq_own_stack(); |
| 302 | 307 | ||
| 303 | local_irq_restore(flags); | 308 | local_irq_restore(flags); |
| 304 | } | 309 | } |
| 305 | 310 | ||
| 306 | #endif | ||
| 307 | |||
| 308 | /* | 311 | /* |
| 309 | * Enter an interrupt context. | 312 | * Enter an interrupt context. |
| 310 | */ | 313 | */ |
| @@ -328,10 +331,25 @@ void irq_enter(void) | |||
| 328 | 331 | ||
| 329 | static inline void invoke_softirq(void) | 332 | static inline void invoke_softirq(void) |
| 330 | { | 333 | { |
| 331 | if (!force_irqthreads) | 334 | if (!force_irqthreads) { |
| 335 | #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK | ||
| 336 | /* | ||
| 337 | * We can safely execute softirq on the current stack if | ||
| 338 | * it is the irq stack, because it should be near empty | ||
| 339 | * at this stage. | ||
| 340 | */ | ||
| 332 | __do_softirq(); | 341 | __do_softirq(); |
| 333 | else | 342 | #else |
| 343 | /* | ||
| 344 | * Otherwise, irq_exit() is called on the task stack that can | ||
| 345 | * be potentially deep already. So call softirq in its own stack | ||
| 346 | * to prevent from any overrun. | ||
| 347 | */ | ||
| 348 | do_softirq_own_stack(); | ||
| 349 | #endif | ||
| 350 | } else { | ||
| 334 | wakeup_softirqd(); | 351 | wakeup_softirqd(); |
| 352 | } | ||
| 335 | } | 353 | } |
| 336 | 354 | ||
| 337 | static inline void tick_irq_exit(void) | 355 | static inline void tick_irq_exit(void) |
| @@ -360,7 +378,7 @@ void irq_exit(void) | |||
| 360 | 378 | ||
| 361 | account_irq_exit_time(current); | 379 | account_irq_exit_time(current); |
| 362 | trace_hardirq_exit(); | 380 | trace_hardirq_exit(); |
| 363 | sub_preempt_count(HARDIRQ_OFFSET); | 381 | preempt_count_sub(HARDIRQ_OFFSET); |
| 364 | if (!in_interrupt() && local_softirq_pending()) | 382 | if (!in_interrupt() && local_softirq_pending()) |
| 365 | invoke_softirq(); | 383 | invoke_softirq(); |
| 366 | 384 | ||
| @@ -762,6 +780,10 @@ static void run_ksoftirqd(unsigned int cpu) | |||
| 762 | { | 780 | { |
| 763 | local_irq_disable(); | 781 | local_irq_disable(); |
| 764 | if (local_softirq_pending()) { | 782 | if (local_softirq_pending()) { |
| 783 | /* | ||
| 784 | * We can safely run softirqs inline on this stack, as we are | ||
| 785 | * not deep in the task stack here. | ||
| 786 | */ | ||
| 765 | __do_softirq(); | 787 | __do_softirq(); |
| 766 | rcu_note_context_switch(cpu); | 788 | rcu_note_context_switch(cpu); |
| 767 | local_irq_enable(); | 789 | local_irq_enable(); |
| @@ -876,7 +898,6 @@ int __init __weak early_irq_init(void) | |||
| 876 | return 0; | 898 | return 0; |
| 877 | } | 899 | } |
| 878 | 900 | ||
| 879 | #ifdef CONFIG_GENERIC_HARDIRQS | ||
| 880 | int __init __weak arch_probe_nr_irqs(void) | 901 | int __init __weak arch_probe_nr_irqs(void) |
| 881 | { | 902 | { |
| 882 | return NR_IRQS_LEGACY; | 903 | return NR_IRQS_LEGACY; |
| @@ -886,4 +907,3 @@ int __init __weak arch_early_irq_init(void) | |||
| 886 | { | 907 | { |
| 887 | return 0; | 908 | return 0; |
| 888 | } | 909 | } |
| 889 | #endif | ||
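
The softirq.c hunks above drop the __ARCH_HAS_DO_SOFTIRQ special case and route pending softirqs through do_softirq_own_stack(), so they can run on a dedicated stack instead of deep in a task's call chain, while the preempt_count accessors move to the new preempt_count_sub()/preempt_count_dec()/preempt_count_set() helpers. For architectures without a dedicated stack the expected fallback is a trivial inline wrapper, roughly as sketched below; this is an assumption based on the hunks above, not something shown in this diff.

    /*
     * Assumed generic fallback for architectures that provide no
     * dedicated softirq stack: process softirqs inline on the current
     * stack. The real definition is expected to live in
     * <linux/interrupt.h> behind __ARCH_HAS_DO_SOFTIRQ.
     */
    #ifndef __ARCH_HAS_DO_SOFTIRQ
    static inline void do_softirq_own_stack(void)
    {
            __do_softirq();
    }
    #endif
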
diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 5cdd8065a3ce..4b082b5cac9e 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c | |||
| @@ -34,6 +34,20 @@ | |||
| 34 | #else | 34 | #else |
| 35 | #define raw_read_can_lock(l) read_can_lock(l) | 35 | #define raw_read_can_lock(l) read_can_lock(l) |
| 36 | #define raw_write_can_lock(l) write_can_lock(l) | 36 | #define raw_write_can_lock(l) write_can_lock(l) |
| 37 | |||
| 38 | /* | ||
| 39 | * Some architectures can relax in favour of the CPU owning the lock. | ||
| 40 | */ | ||
| 41 | #ifndef arch_read_relax | ||
| 42 | # define arch_read_relax(l) cpu_relax() | ||
| 43 | #endif | ||
| 44 | #ifndef arch_write_relax | ||
| 45 | # define arch_write_relax(l) cpu_relax() | ||
| 46 | #endif | ||
| 47 | #ifndef arch_spin_relax | ||
| 48 | # define arch_spin_relax(l) cpu_relax() | ||
| 49 | #endif | ||
| 50 | |||
| 37 | /* | 51 | /* |
| 38 | * We build the __lock_function inlines here. They are too large for | 52 | * We build the __lock_function inlines here. They are too large for |
| 39 | * inlining all over the place, but here is only one user per function | 53 | * inlining all over the place, but here is only one user per function |
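
The arch_read/write/spin_relax() fallbacks added above are consumed by the out-of-line lock loops that kernel/spinlock.c generates with its BUILD_LOCK_OPS macro. A hand-expanded, simplified view of one generated function is sketched below (illustrative only: the real body comes from the macro, covers the irq/irqsave variants too, and includes lock-break bookkeeping omitted here).

    void __lockfunc __raw_spin_lock(raw_spinlock_t *lock)
    {
            for (;;) {
                    preempt_disable();
                    if (likely(do_raw_spin_trylock(lock)))
                            break;
                    preempt_enable();

                    /* arch_spin_relax() now defaults to cpu_relax() */
                    while (!raw_spin_can_lock(lock))
                            arch_spin_relax(&lock->raw_lock);
            }
    }
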
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index c09f2955ae30..84571e09c907 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/kallsyms.h> | 20 | #include <linux/kallsyms.h> |
| 21 | #include <linux/smpboot.h> | 21 | #include <linux/smpboot.h> |
| 22 | #include <linux/atomic.h> | 22 | #include <linux/atomic.h> |
| 23 | #include <linux/lglock.h> | ||
| 23 | 24 | ||
| 24 | /* | 25 | /* |
| 25 | * Structure to determine completion condition and record errors. May | 26 | * Structure to determine completion condition and record errors. May |
| @@ -43,6 +44,14 @@ static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); | |||
| 43 | static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); | 44 | static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); |
| 44 | static bool stop_machine_initialized = false; | 45 | static bool stop_machine_initialized = false; |
| 45 | 46 | ||
| 47 | /* | ||
| 48 | * Avoids a race between stop_two_cpus and global stop_cpus, where | ||
| 49 | * the stoppers could get queued up in reverse order, leading to | ||
| 50 | * system deadlock. Using an lglock means stop_two_cpus remains | ||
| 51 | * relatively cheap. | ||
| 52 | */ | ||
| 53 | DEFINE_STATIC_LGLOCK(stop_cpus_lock); | ||
| 54 | |||
| 46 | static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) | 55 | static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) |
| 47 | { | 56 | { |
| 48 | memset(done, 0, sizeof(*done)); | 57 | memset(done, 0, sizeof(*done)); |
| @@ -115,6 +124,184 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) | |||
| 115 | return done.executed ? done.ret : -ENOENT; | 124 | return done.executed ? done.ret : -ENOENT; |
| 116 | } | 125 | } |
| 117 | 126 | ||
| 127 | /* This controls the threads on each CPU. */ | ||
| 128 | enum multi_stop_state { | ||
| 129 | /* Dummy starting state for thread. */ | ||
| 130 | MULTI_STOP_NONE, | ||
| 131 | /* Awaiting everyone to be scheduled. */ | ||
| 132 | MULTI_STOP_PREPARE, | ||
| 133 | /* Disable interrupts. */ | ||
| 134 | MULTI_STOP_DISABLE_IRQ, | ||
| 135 | /* Run the function */ | ||
| 136 | MULTI_STOP_RUN, | ||
| 137 | /* Exit */ | ||
| 138 | MULTI_STOP_EXIT, | ||
| 139 | }; | ||
| 140 | |||
| 141 | struct multi_stop_data { | ||
| 142 | int (*fn)(void *); | ||
| 143 | void *data; | ||
| 144 | /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ | ||
| 145 | unsigned int num_threads; | ||
| 146 | const struct cpumask *active_cpus; | ||
| 147 | |||
| 148 | enum multi_stop_state state; | ||
| 149 | atomic_t thread_ack; | ||
| 150 | }; | ||
| 151 | |||
| 152 | static void set_state(struct multi_stop_data *msdata, | ||
| 153 | enum multi_stop_state newstate) | ||
| 154 | { | ||
| 155 | /* Reset ack counter. */ | ||
| 156 | atomic_set(&msdata->thread_ack, msdata->num_threads); | ||
| 157 | smp_wmb(); | ||
| 158 | msdata->state = newstate; | ||
| 159 | } | ||
| 160 | |||
| 161 | /* Last one to ack a state moves to the next state. */ | ||
| 162 | static void ack_state(struct multi_stop_data *msdata) | ||
| 163 | { | ||
| 164 | if (atomic_dec_and_test(&msdata->thread_ack)) | ||
| 165 | set_state(msdata, msdata->state + 1); | ||
| 166 | } | ||
| 167 | |||
| 168 | /* This is the cpu_stop function which stops the CPU. */ | ||
| 169 | static int multi_cpu_stop(void *data) | ||
| 170 | { | ||
| 171 | struct multi_stop_data *msdata = data; | ||
| 172 | enum multi_stop_state curstate = MULTI_STOP_NONE; | ||
| 173 | int cpu = smp_processor_id(), err = 0; | ||
| 174 | unsigned long flags; | ||
| 175 | bool is_active; | ||
| 176 | |||
| 177 | /* | ||
| 178 | * When called from stop_machine_from_inactive_cpu(), irq might | ||
| 179 | * already be disabled. Save the state and restore it on exit. | ||
| 180 | */ | ||
| 181 | local_save_flags(flags); | ||
| 182 | |||
| 183 | if (!msdata->active_cpus) | ||
| 184 | is_active = cpu == cpumask_first(cpu_online_mask); | ||
| 185 | else | ||
| 186 | is_active = cpumask_test_cpu(cpu, msdata->active_cpus); | ||
| 187 | |||
| 188 | /* Simple state machine */ | ||
| 189 | do { | ||
| 190 | /* Chill out and ensure we re-read multi_stop_state. */ | ||
| 191 | cpu_relax(); | ||
| 192 | if (msdata->state != curstate) { | ||
| 193 | curstate = msdata->state; | ||
| 194 | switch (curstate) { | ||
| 195 | case MULTI_STOP_DISABLE_IRQ: | ||
| 196 | local_irq_disable(); | ||
| 197 | hard_irq_disable(); | ||
| 198 | break; | ||
| 199 | case MULTI_STOP_RUN: | ||
| 200 | if (is_active) | ||
| 201 | err = msdata->fn(msdata->data); | ||
| 202 | break; | ||
| 203 | default: | ||
| 204 | break; | ||
| 205 | } | ||
| 206 | ack_state(msdata); | ||
| 207 | } | ||
| 208 | } while (curstate != MULTI_STOP_EXIT); | ||
| 209 | |||
| 210 | local_irq_restore(flags); | ||
| 211 | return err; | ||
| 212 | } | ||
| 213 | |||
| 214 | struct irq_cpu_stop_queue_work_info { | ||
| 215 | int cpu1; | ||
| 216 | int cpu2; | ||
| 217 | struct cpu_stop_work *work1; | ||
| 218 | struct cpu_stop_work *work2; | ||
| 219 | }; | ||
| 220 | |||
| 221 | /* | ||
| 222 | * This function is always run with irqs and preemption disabled. | ||
| 223 | * This guarantees that both work1 and work2 get queued, before | ||
| 224 | * our local migrate thread gets the chance to preempt us. | ||
| 225 | */ | ||
| 226 | static void irq_cpu_stop_queue_work(void *arg) | ||
| 227 | { | ||
| 228 | struct irq_cpu_stop_queue_work_info *info = arg; | ||
| 229 | cpu_stop_queue_work(info->cpu1, info->work1); | ||
| 230 | cpu_stop_queue_work(info->cpu2, info->work2); | ||
| 231 | } | ||
| 232 | |||
| 233 | /** | ||
| 234 | * stop_two_cpus - stops two cpus | ||
| 235 | * @cpu1: the cpu to stop | ||
| 236 | * @cpu2: the other cpu to stop | ||
| 237 | * @fn: function to execute | ||
| 238 | * @arg: argument to @fn | ||
| 239 | * | ||
| 240 | * Stops both the current and specified CPU and runs @fn on one of them. | ||
| 241 | * | ||
| 242 | * returns when both are completed. | ||
| 243 | */ | ||
| 244 | int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg) | ||
| 245 | { | ||
| 246 | struct cpu_stop_done done; | ||
| 247 | struct cpu_stop_work work1, work2; | ||
| 248 | struct irq_cpu_stop_queue_work_info call_args; | ||
| 249 | struct multi_stop_data msdata; | ||
| 250 | |||
| 251 | preempt_disable(); | ||
| 252 | msdata = (struct multi_stop_data){ | ||
| 253 | .fn = fn, | ||
| 254 | .data = arg, | ||
| 255 | .num_threads = 2, | ||
| 256 | .active_cpus = cpumask_of(cpu1), | ||
| 257 | }; | ||
| 258 | |||
| 259 | work1 = work2 = (struct cpu_stop_work){ | ||
| 260 | .fn = multi_cpu_stop, | ||
| 261 | .arg = &msdata, | ||
| 262 | .done = &done | ||
| 263 | }; | ||
| 264 | |||
| 265 | call_args = (struct irq_cpu_stop_queue_work_info){ | ||
| 266 | .cpu1 = cpu1, | ||
| 267 | .cpu2 = cpu2, | ||
| 268 | .work1 = &work1, | ||
| 269 | .work2 = &work2, | ||
| 270 | }; | ||
| 271 | |||
| 272 | cpu_stop_init_done(&done, 2); | ||
| 273 | set_state(&msdata, MULTI_STOP_PREPARE); | ||
| 274 | |||
| 275 | /* | ||
| 276 | * If we observe both CPUs active we know _cpu_down() cannot yet have | ||
| 277 | * queued its stop_machine works and therefore ours will get executed | ||
| 278 | * first. Or it's not either one of our CPUs that's getting unplugged, | ||
| 279 | * in which case we don't care. | ||
| 280 | * | ||
| 281 | * This relies on the stopper workqueues to be FIFO. | ||
| 282 | */ | ||
| 283 | if (!cpu_active(cpu1) || !cpu_active(cpu2)) { | ||
| 284 | preempt_enable(); | ||
| 285 | return -ENOENT; | ||
| 286 | } | ||
| 287 | |||
| 288 | lg_local_lock(&stop_cpus_lock); | ||
| 289 | /* | ||
| 290 | * Queuing needs to be done by the lowest numbered CPU, to ensure | ||
| 291 | * that works are always queued in the same order on every CPU. | ||
| 292 | * This prevents deadlocks. | ||
| 293 | */ | ||
| 294 | smp_call_function_single(min(cpu1, cpu2), | ||
| 295 | &irq_cpu_stop_queue_work, | ||
| 296 | &call_args, 0); | ||
| 297 | lg_local_unlock(&stop_cpus_lock); | ||
| 298 | preempt_enable(); | ||
| 299 | |||
| 300 | wait_for_completion(&done.completion); | ||
| 301 | |||
| 302 | return done.executed ? done.ret : -ENOENT; | ||
| 303 | } | ||
| 304 | |||
| 118 | /** | 305 | /** |
| 119 | * stop_one_cpu_nowait - stop a cpu but don't wait for completion | 306 | * stop_one_cpu_nowait - stop a cpu but don't wait for completion |
| 120 | * @cpu: cpu to stop | 307 | * @cpu: cpu to stop |
| @@ -159,10 +346,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, | |||
| 159 | * preempted by a stopper which might wait for other stoppers | 346 | * preempted by a stopper which might wait for other stoppers |
| 160 | * to enter @fn which can lead to deadlock. | 347 | * to enter @fn which can lead to deadlock. |
| 161 | */ | 348 | */ |
| 162 | preempt_disable(); | 349 | lg_global_lock(&stop_cpus_lock); |
| 163 | for_each_cpu(cpu, cpumask) | 350 | for_each_cpu(cpu, cpumask) |
| 164 | cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); | 351 | cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); |
| 165 | preempt_enable(); | 352 | lg_global_unlock(&stop_cpus_lock); |
| 166 | } | 353 | } |
| 167 | 354 | ||
| 168 | static int __stop_cpus(const struct cpumask *cpumask, | 355 | static int __stop_cpus(const struct cpumask *cpumask, |
| @@ -359,98 +546,14 @@ early_initcall(cpu_stop_init); | |||
| 359 | 546 | ||
| 360 | #ifdef CONFIG_STOP_MACHINE | 547 | #ifdef CONFIG_STOP_MACHINE |
| 361 | 548 | ||
| 362 | /* This controls the threads on each CPU. */ | ||
| 363 | enum stopmachine_state { | ||
| 364 | /* Dummy starting state for thread. */ | ||
| 365 | STOPMACHINE_NONE, | ||
| 366 | /* Awaiting everyone to be scheduled. */ | ||
| 367 | STOPMACHINE_PREPARE, | ||
| 368 | /* Disable interrupts. */ | ||
| 369 | STOPMACHINE_DISABLE_IRQ, | ||
| 370 | /* Run the function */ | ||
| 371 | STOPMACHINE_RUN, | ||
| 372 | /* Exit */ | ||
| 373 | STOPMACHINE_EXIT, | ||
| 374 | }; | ||
| 375 | |||
| 376 | struct stop_machine_data { | ||
| 377 | int (*fn)(void *); | ||
| 378 | void *data; | ||
| 379 | /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ | ||
| 380 | unsigned int num_threads; | ||
| 381 | const struct cpumask *active_cpus; | ||
| 382 | |||
| 383 | enum stopmachine_state state; | ||
| 384 | atomic_t thread_ack; | ||
| 385 | }; | ||
| 386 | |||
| 387 | static void set_state(struct stop_machine_data *smdata, | ||
| 388 | enum stopmachine_state newstate) | ||
| 389 | { | ||
| 390 | /* Reset ack counter. */ | ||
| 391 | atomic_set(&smdata->thread_ack, smdata->num_threads); | ||
| 392 | smp_wmb(); | ||
| 393 | smdata->state = newstate; | ||
| 394 | } | ||
| 395 | |||
| 396 | /* Last one to ack a state moves to the next state. */ | ||
| 397 | static void ack_state(struct stop_machine_data *smdata) | ||
| 398 | { | ||
| 399 | if (atomic_dec_and_test(&smdata->thread_ack)) | ||
| 400 | set_state(smdata, smdata->state + 1); | ||
| 401 | } | ||
| 402 | |||
| 403 | /* This is the cpu_stop function which stops the CPU. */ | ||
| 404 | static int stop_machine_cpu_stop(void *data) | ||
| 405 | { | ||
| 406 | struct stop_machine_data *smdata = data; | ||
| 407 | enum stopmachine_state curstate = STOPMACHINE_NONE; | ||
| 408 | int cpu = smp_processor_id(), err = 0; | ||
| 409 | unsigned long flags; | ||
| 410 | bool is_active; | ||
| 411 | |||
| 412 | /* | ||
| 413 | * When called from stop_machine_from_inactive_cpu(), irq might | ||
| 414 | * already be disabled. Save the state and restore it on exit. | ||
| 415 | */ | ||
| 416 | local_save_flags(flags); | ||
| 417 | |||
| 418 | if (!smdata->active_cpus) | ||
| 419 | is_active = cpu == cpumask_first(cpu_online_mask); | ||
| 420 | else | ||
| 421 | is_active = cpumask_test_cpu(cpu, smdata->active_cpus); | ||
| 422 | |||
| 423 | /* Simple state machine */ | ||
| 424 | do { | ||
| 425 | /* Chill out and ensure we re-read stopmachine_state. */ | ||
| 426 | cpu_relax(); | ||
| 427 | if (smdata->state != curstate) { | ||
| 428 | curstate = smdata->state; | ||
| 429 | switch (curstate) { | ||
| 430 | case STOPMACHINE_DISABLE_IRQ: | ||
| 431 | local_irq_disable(); | ||
| 432 | hard_irq_disable(); | ||
| 433 | break; | ||
| 434 | case STOPMACHINE_RUN: | ||
| 435 | if (is_active) | ||
| 436 | err = smdata->fn(smdata->data); | ||
| 437 | break; | ||
| 438 | default: | ||
| 439 | break; | ||
| 440 | } | ||
| 441 | ack_state(smdata); | ||
| 442 | } | ||
| 443 | } while (curstate != STOPMACHINE_EXIT); | ||
| 444 | |||
| 445 | local_irq_restore(flags); | ||
| 446 | return err; | ||
| 447 | } | ||
| 448 | |||
| 449 | int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | 549 | int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) |
| 450 | { | 550 | { |
| 451 | struct stop_machine_data smdata = { .fn = fn, .data = data, | 551 | struct multi_stop_data msdata = { |
| 452 | .num_threads = num_online_cpus(), | 552 | .fn = fn, |
| 453 | .active_cpus = cpus }; | 553 | .data = data, |
| 554 | .num_threads = num_online_cpus(), | ||
| 555 | .active_cpus = cpus, | ||
| 556 | }; | ||
| 454 | 557 | ||
| 455 | if (!stop_machine_initialized) { | 558 | if (!stop_machine_initialized) { |
| 456 | /* | 559 | /* |
| @@ -461,7 +564,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | |||
| 461 | unsigned long flags; | 564 | unsigned long flags; |
| 462 | int ret; | 565 | int ret; |
| 463 | 566 | ||
| 464 | WARN_ON_ONCE(smdata.num_threads != 1); | 567 | WARN_ON_ONCE(msdata.num_threads != 1); |
| 465 | 568 | ||
| 466 | local_irq_save(flags); | 569 | local_irq_save(flags); |
| 467 | hard_irq_disable(); | 570 | hard_irq_disable(); |
| @@ -472,8 +575,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | |||
| 472 | } | 575 | } |
| 473 | 576 | ||
| 474 | /* Set the initial state and stop all online cpus. */ | 577 | /* Set the initial state and stop all online cpus. */ |
| 475 | set_state(&smdata, STOPMACHINE_PREPARE); | 578 | set_state(&msdata, MULTI_STOP_PREPARE); |
| 476 | return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); | 579 | return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata); |
| 477 | } | 580 | } |
| 478 | 581 | ||
| 479 | int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | 582 | int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) |
| @@ -513,25 +616,25 @@ EXPORT_SYMBOL_GPL(stop_machine); | |||
| 513 | int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, | 616 | int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, |
| 514 | const struct cpumask *cpus) | 617 | const struct cpumask *cpus) |
| 515 | { | 618 | { |
| 516 | struct stop_machine_data smdata = { .fn = fn, .data = data, | 619 | struct multi_stop_data msdata = { .fn = fn, .data = data, |
| 517 | .active_cpus = cpus }; | 620 | .active_cpus = cpus }; |
| 518 | struct cpu_stop_done done; | 621 | struct cpu_stop_done done; |
| 519 | int ret; | 622 | int ret; |
| 520 | 623 | ||
| 521 | /* Local CPU must be inactive and CPU hotplug in progress. */ | 624 | /* Local CPU must be inactive and CPU hotplug in progress. */ |
| 522 | BUG_ON(cpu_active(raw_smp_processor_id())); | 625 | BUG_ON(cpu_active(raw_smp_processor_id())); |
| 523 | smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ | 626 | msdata.num_threads = num_active_cpus() + 1; /* +1 for local */ |
| 524 | 627 | ||
| 525 | /* No proper task established and can't sleep - busy wait for lock. */ | 628 | /* No proper task established and can't sleep - busy wait for lock. */ |
| 526 | while (!mutex_trylock(&stop_cpus_mutex)) | 629 | while (!mutex_trylock(&stop_cpus_mutex)) |
| 527 | cpu_relax(); | 630 | cpu_relax(); |
| 528 | 631 | ||
| 529 | /* Schedule work on other CPUs and execute directly for local CPU */ | 632 | /* Schedule work on other CPUs and execute directly for local CPU */ |
| 530 | set_state(&smdata, STOPMACHINE_PREPARE); | 633 | set_state(&msdata, MULTI_STOP_PREPARE); |
| 531 | cpu_stop_init_done(&done, num_active_cpus()); | 634 | cpu_stop_init_done(&done, num_active_cpus()); |
| 532 | queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, | 635 | queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata, |
| 533 | &done); | 636 | &done); |
| 534 | ret = stop_machine_cpu_stop(&smdata); | 637 | ret = multi_cpu_stop(&msdata); |
| 535 | 638 | ||
| 536 | /* Busy wait for completion. */ | 639 | /* Busy wait for completion. */ |
| 537 | while (!completion_done(&done.completion)) | 640 | while (!completion_done(&done.completion)) |
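
For context on the new stop_two_cpus() entry point above: a caller packages its state, and the callback runs on cpu1 (the CPU placed in .active_cpus) while both CPUs spin in multi_cpu_stop() with interrupts disabled. A hypothetical caller is sketched below; every name in it is invented for illustration.

    struct pair_arg {
            int src_cpu;
            int dst_cpu;
    };

    static int pair_fn(void *data)
    {
            struct pair_arg *arg = data;

            /*
             * Both CPUs are held in multi_cpu_stop() with irqs off here;
             * this callback runs on arg->src_cpu only.
             */
            pr_info("quiesced CPUs %d and %d\n", arg->src_cpu, arg->dst_cpu);
            return 0;
    }

    static int quiesce_pair(int src_cpu, int dst_cpu)
    {
            struct pair_arg arg = { .src_cpu = src_cpu, .dst_cpu = dst_cpu };

            /* -ENOENT if either CPU is inactive (i.e. being unplugged). */
            return stop_two_cpus(src_cpu, dst_cpu, pair_fn, &arg);
    }
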
diff --git a/kernel/sys.c b/kernel/sys.c index 771129b299f8..c18ecca575b4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -337,7 +337,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | |||
| 337 | if (rgid != (gid_t) -1) { | 337 | if (rgid != (gid_t) -1) { |
| 338 | if (gid_eq(old->gid, krgid) || | 338 | if (gid_eq(old->gid, krgid) || |
| 339 | gid_eq(old->egid, krgid) || | 339 | gid_eq(old->egid, krgid) || |
| 340 | nsown_capable(CAP_SETGID)) | 340 | ns_capable(old->user_ns, CAP_SETGID)) |
| 341 | new->gid = krgid; | 341 | new->gid = krgid; |
| 342 | else | 342 | else |
| 343 | goto error; | 343 | goto error; |
| @@ -346,7 +346,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | |||
| 346 | if (gid_eq(old->gid, kegid) || | 346 | if (gid_eq(old->gid, kegid) || |
| 347 | gid_eq(old->egid, kegid) || | 347 | gid_eq(old->egid, kegid) || |
| 348 | gid_eq(old->sgid, kegid) || | 348 | gid_eq(old->sgid, kegid) || |
| 349 | nsown_capable(CAP_SETGID)) | 349 | ns_capable(old->user_ns, CAP_SETGID)) |
| 350 | new->egid = kegid; | 350 | new->egid = kegid; |
| 351 | else | 351 | else |
| 352 | goto error; | 352 | goto error; |
| @@ -387,7 +387,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) | |||
| 387 | old = current_cred(); | 387 | old = current_cred(); |
| 388 | 388 | ||
| 389 | retval = -EPERM; | 389 | retval = -EPERM; |
| 390 | if (nsown_capable(CAP_SETGID)) | 390 | if (ns_capable(old->user_ns, CAP_SETGID)) |
| 391 | new->gid = new->egid = new->sgid = new->fsgid = kgid; | 391 | new->gid = new->egid = new->sgid = new->fsgid = kgid; |
| 392 | else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) | 392 | else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) |
| 393 | new->egid = new->fsgid = kgid; | 393 | new->egid = new->fsgid = kgid; |
| @@ -471,7 +471,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | |||
| 471 | new->uid = kruid; | 471 | new->uid = kruid; |
| 472 | if (!uid_eq(old->uid, kruid) && | 472 | if (!uid_eq(old->uid, kruid) && |
| 473 | !uid_eq(old->euid, kruid) && | 473 | !uid_eq(old->euid, kruid) && |
| 474 | !nsown_capable(CAP_SETUID)) | 474 | !ns_capable(old->user_ns, CAP_SETUID)) |
| 475 | goto error; | 475 | goto error; |
| 476 | } | 476 | } |
| 477 | 477 | ||
| @@ -480,7 +480,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | |||
| 480 | if (!uid_eq(old->uid, keuid) && | 480 | if (!uid_eq(old->uid, keuid) && |
| 481 | !uid_eq(old->euid, keuid) && | 481 | !uid_eq(old->euid, keuid) && |
| 482 | !uid_eq(old->suid, keuid) && | 482 | !uid_eq(old->suid, keuid) && |
| 483 | !nsown_capable(CAP_SETUID)) | 483 | !ns_capable(old->user_ns, CAP_SETUID)) |
| 484 | goto error; | 484 | goto error; |
| 485 | } | 485 | } |
| 486 | 486 | ||
| @@ -534,7 +534,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) | |||
| 534 | old = current_cred(); | 534 | old = current_cred(); |
| 535 | 535 | ||
| 536 | retval = -EPERM; | 536 | retval = -EPERM; |
| 537 | if (nsown_capable(CAP_SETUID)) { | 537 | if (ns_capable(old->user_ns, CAP_SETUID)) { |
| 538 | new->suid = new->uid = kuid; | 538 | new->suid = new->uid = kuid; |
| 539 | if (!uid_eq(kuid, old->uid)) { | 539 | if (!uid_eq(kuid, old->uid)) { |
| 540 | retval = set_user(new); | 540 | retval = set_user(new); |
| @@ -591,7 +591,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) | |||
| 591 | old = current_cred(); | 591 | old = current_cred(); |
| 592 | 592 | ||
| 593 | retval = -EPERM; | 593 | retval = -EPERM; |
| 594 | if (!nsown_capable(CAP_SETUID)) { | 594 | if (!ns_capable(old->user_ns, CAP_SETUID)) { |
| 595 | if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && | 595 | if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && |
| 596 | !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) | 596 | !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) |
| 597 | goto error; | 597 | goto error; |
| @@ -673,7 +673,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) | |||
| 673 | old = current_cred(); | 673 | old = current_cred(); |
| 674 | 674 | ||
| 675 | retval = -EPERM; | 675 | retval = -EPERM; |
| 676 | if (!nsown_capable(CAP_SETGID)) { | 676 | if (!ns_capable(old->user_ns, CAP_SETGID)) { |
| 677 | if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && | 677 | if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && |
| 678 | !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) | 678 | !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) |
| 679 | goto error; | 679 | goto error; |
| @@ -744,7 +744,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) | |||
| 744 | 744 | ||
| 745 | if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || | 745 | if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || |
| 746 | uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || | 746 | uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || |
| 747 | nsown_capable(CAP_SETUID)) { | 747 | ns_capable(old->user_ns, CAP_SETUID)) { |
| 748 | if (!uid_eq(kuid, old->fsuid)) { | 748 | if (!uid_eq(kuid, old->fsuid)) { |
| 749 | new->fsuid = kuid; | 749 | new->fsuid = kuid; |
| 750 | if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) | 750 | if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) |
| @@ -783,7 +783,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) | |||
| 783 | 783 | ||
| 784 | if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || | 784 | if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || |
| 785 | gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || | 785 | gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || |
| 786 | nsown_capable(CAP_SETGID)) { | 786 | ns_capable(old->user_ns, CAP_SETGID)) { |
| 787 | if (!gid_eq(kgid, old->fsgid)) { | 787 | if (!gid_eq(kgid, old->fsgid)) { |
| 788 | new->fsgid = kgid; | 788 | new->fsgid = kgid; |
| 789 | goto change_okay; | 789 | goto change_okay; |
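
The sys.c conversions above make the checked namespace explicit: every call site already holds old = current_cred(), so old->user_ns is the caller's own user namespace and ns_capable(old->user_ns, ...) is equivalent to the nsown_capable() call it replaces. For reference, the helper being open-coded away was, to the best of recollection, a thin wrapper along these lines (reconstructed from memory, not part of this diff):

    bool nsown_capable(int cap)
    {
            return ns_capable(current_user_ns(), cap);
    }
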
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 07f6fc468e17..36547dddcdb8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -190,7 +190,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, | |||
| 190 | 190 | ||
| 191 | #ifdef CONFIG_MAGIC_SYSRQ | 191 | #ifdef CONFIG_MAGIC_SYSRQ |
| 192 | /* Note: sysrq code uses it's own private copy */ | 192 | /* Note: sysrq code uses it's own private copy */ |
| 193 | static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; | 193 | static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; |
| 194 | 194 | ||
| 195 | static int sysrq_sysctl_handler(ctl_table *table, int write, | 195 | static int sysrq_sysctl_handler(ctl_table *table, int write, |
| 196 | void __user *buffer, size_t *lenp, | 196 | void __user *buffer, size_t *lenp, |
| @@ -371,13 +371,6 @@ static struct ctl_table kern_table[] = { | |||
| 371 | .proc_handler = proc_dointvec, | 371 | .proc_handler = proc_dointvec, |
| 372 | }, | 372 | }, |
| 373 | { | 373 | { |
| 374 | .procname = "numa_balancing_scan_period_reset", | ||
| 375 | .data = &sysctl_numa_balancing_scan_period_reset, | ||
| 376 | .maxlen = sizeof(unsigned int), | ||
| 377 | .mode = 0644, | ||
| 378 | .proc_handler = proc_dointvec, | ||
| 379 | }, | ||
| 380 | { | ||
| 381 | .procname = "numa_balancing_scan_period_max_ms", | 374 | .procname = "numa_balancing_scan_period_max_ms", |
| 382 | .data = &sysctl_numa_balancing_scan_period_max, | 375 | .data = &sysctl_numa_balancing_scan_period_max, |
| 383 | .maxlen = sizeof(unsigned int), | 376 | .maxlen = sizeof(unsigned int), |
| @@ -391,6 +384,20 @@ static struct ctl_table kern_table[] = { | |||
| 391 | .mode = 0644, | 384 | .mode = 0644, |
| 392 | .proc_handler = proc_dointvec, | 385 | .proc_handler = proc_dointvec, |
| 393 | }, | 386 | }, |
| 387 | { | ||
| 388 | .procname = "numa_balancing_settle_count", | ||
| 389 | .data = &sysctl_numa_balancing_settle_count, | ||
| 390 | .maxlen = sizeof(unsigned int), | ||
| 391 | .mode = 0644, | ||
| 392 | .proc_handler = proc_dointvec, | ||
| 393 | }, | ||
| 394 | { | ||
| 395 | .procname = "numa_balancing_migrate_deferred", | ||
| 396 | .data = &sysctl_numa_balancing_migrate_deferred, | ||
| 397 | .maxlen = sizeof(unsigned int), | ||
| 398 | .mode = 0644, | ||
| 399 | .proc_handler = proc_dointvec, | ||
| 400 | }, | ||
| 394 | #endif /* CONFIG_NUMA_BALANCING */ | 401 | #endif /* CONFIG_NUMA_BALANCING */ |
| 395 | #endif /* CONFIG_SCHED_DEBUG */ | 402 | #endif /* CONFIG_SCHED_DEBUG */ |
| 396 | { | 403 | { |
| @@ -1049,6 +1056,7 @@ static struct ctl_table kern_table[] = { | |||
| 1049 | .maxlen = sizeof(sysctl_perf_event_sample_rate), | 1056 | .maxlen = sizeof(sysctl_perf_event_sample_rate), |
| 1050 | .mode = 0644, | 1057 | .mode = 0644, |
| 1051 | .proc_handler = perf_proc_update_handler, | 1058 | .proc_handler = perf_proc_update_handler, |
| 1059 | .extra1 = &one, | ||
| 1052 | }, | 1060 | }, |
| 1053 | { | 1061 | { |
| 1054 | .procname = "perf_cpu_time_max_percent", | 1062 | .procname = "perf_cpu_time_max_percent", |
| @@ -1225,7 +1233,7 @@ static struct ctl_table vm_table[] = { | |||
| 1225 | .data = &hugepages_treat_as_movable, | 1233 | .data = &hugepages_treat_as_movable, |
| 1226 | .maxlen = sizeof(int), | 1234 | .maxlen = sizeof(int), |
| 1227 | .mode = 0644, | 1235 | .mode = 0644, |
| 1228 | .proc_handler = hugetlb_treat_movable_handler, | 1236 | .proc_handler = proc_dointvec, |
| 1229 | }, | 1237 | }, |
| 1230 | { | 1238 | { |
| 1231 | .procname = "nr_overcommit_hugepages", | 1239 | .procname = "nr_overcommit_hugepages", |
| @@ -1471,14 +1479,14 @@ static struct ctl_table fs_table[] = { | |||
| 1471 | { | 1479 | { |
| 1472 | .procname = "inode-nr", | 1480 | .procname = "inode-nr", |
| 1473 | .data = &inodes_stat, | 1481 | .data = &inodes_stat, |
| 1474 | .maxlen = 2*sizeof(int), | 1482 | .maxlen = 2*sizeof(long), |
| 1475 | .mode = 0444, | 1483 | .mode = 0444, |
| 1476 | .proc_handler = proc_nr_inodes, | 1484 | .proc_handler = proc_nr_inodes, |
| 1477 | }, | 1485 | }, |
| 1478 | { | 1486 | { |
| 1479 | .procname = "inode-state", | 1487 | .procname = "inode-state", |
| 1480 | .data = &inodes_stat, | 1488 | .data = &inodes_stat, |
| 1481 | .maxlen = 7*sizeof(int), | 1489 | .maxlen = 7*sizeof(long), |
| 1482 | .mode = 0444, | 1490 | .mode = 0444, |
| 1483 | .proc_handler = proc_nr_inodes, | 1491 | .proc_handler = proc_nr_inodes, |
| 1484 | }, | 1492 | }, |
| @@ -1508,7 +1516,7 @@ static struct ctl_table fs_table[] = { | |||
| 1508 | { | 1516 | { |
| 1509 | .procname = "dentry-state", | 1517 | .procname = "dentry-state", |
| 1510 | .data = &dentry_stat, | 1518 | .data = &dentry_stat, |
| 1511 | .maxlen = 6*sizeof(int), | 1519 | .maxlen = 6*sizeof(long), |
| 1512 | .mode = 0444, | 1520 | .mode = 0444, |
| 1513 | .proc_handler = proc_nr_dentry, | 1521 | .proc_handler = proc_nr_dentry, |
| 1514 | }, | 1522 | }, |
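
On the .extra1 = &one added to the perf_event_sample_rate entry above: extra1/extra2 are the lower/upper bounds conventionally consumed by proc_dointvec_minmax(), so this rejects 0 and negative sample rates. perf_proc_update_handler() itself is outside this section; a handler that honours such a bound typically delegates as in the sketch below (the handler name and the recomputation step are assumptions for illustration).

    static int sample_rate_handler(struct ctl_table *table, int write,
                                   void __user *buffer, size_t *lenp,
                                   loff_t *ppos)
    {
            /* Clamp the written value against table->extra1/extra2. */
            int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

            if (ret || !write)
                    return ret;

            /* Recompute any limits derived from the new, bounded value. */
            return 0;
    }
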
diff --git a/kernel/task_work.c b/kernel/task_work.c index 65bd3c92d6f3..8727032e3a6f 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c | |||
| @@ -4,6 +4,23 @@ | |||
| 4 | 4 | ||
| 5 | static struct callback_head work_exited; /* all we need is ->next == NULL */ | 5 | static struct callback_head work_exited; /* all we need is ->next == NULL */ |
| 6 | 6 | ||
| 7 | /** | ||
| 8 | * task_work_add - ask the @task to execute @work->func() | ||
| 9 | * @task: the task which should run the callback | ||
| 10 | * @work: the callback to run | ||
| 11 | * @notify: send the notification if true | ||
| 12 | * | ||
| 13 | * Queue @work for task_work_run() below and notify the @task if @notify. | ||
| 14 | * Fails if the @task is exiting/exited and thus it can't process this @work. | ||
| 15 | * Otherwise @work->func() will be called when the @task returns from kernel | ||
| 16 | * mode or exits. | ||
| 17 | * | ||
| 18 | * This is like a signal handler that runs in kernel mode, except it | ||
| 19 | * doesn't try to wake up the @task. | ||
| 20 | * | ||
| 21 | * RETURNS: | ||
| 22 | * 0 on success or -ESRCH. | ||
| 23 | */ | ||
| 7 | int | 24 | int |
| 8 | task_work_add(struct task_struct *task, struct callback_head *work, bool notify) | 25 | task_work_add(struct task_struct *task, struct callback_head *work, bool notify) |
| 9 | { | 26 | { |
| @@ -21,11 +38,22 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify) | |||
| 21 | return 0; | 38 | return 0; |
| 22 | } | 39 | } |
| 23 | 40 | ||
| 41 | /** | ||
| 42 | * task_work_cancel - cancel a pending work added by task_work_add() | ||
| 43 | * @task: the task which should execute the work | ||
| 44 | * @func: identifies the work to remove | ||
| 45 | * | ||
| 46 | * Find the last queued pending work with ->func == @func and remove | ||
| 47 | * it from queue. | ||
| 48 | * | ||
| 49 | * RETURNS: | ||
| 50 | * The found work or NULL if not found. | ||
| 51 | */ | ||
| 24 | struct callback_head * | 52 | struct callback_head * |
| 25 | task_work_cancel(struct task_struct *task, task_work_func_t func) | 53 | task_work_cancel(struct task_struct *task, task_work_func_t func) |
| 26 | { | 54 | { |
| 27 | struct callback_head **pprev = &task->task_works; | 55 | struct callback_head **pprev = &task->task_works; |
| 28 | struct callback_head *work = NULL; | 56 | struct callback_head *work; |
| 29 | unsigned long flags; | 57 | unsigned long flags; |
| 30 | /* | 58 | /* |
| 31 | * If cmpxchg() fails we continue without updating pprev. | 59 | * If cmpxchg() fails we continue without updating pprev. |
| @@ -35,7 +63,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) | |||
| 35 | */ | 63 | */ |
| 36 | raw_spin_lock_irqsave(&task->pi_lock, flags); | 64 | raw_spin_lock_irqsave(&task->pi_lock, flags); |
| 37 | while ((work = ACCESS_ONCE(*pprev))) { | 65 | while ((work = ACCESS_ONCE(*pprev))) { |
| 38 | read_barrier_depends(); | 66 | smp_read_barrier_depends(); |
| 39 | if (work->func != func) | 67 | if (work->func != func) |
| 40 | pprev = &work->next; | 68 | pprev = &work->next; |
| 41 | else if (cmpxchg(pprev, work, work->next) == work) | 69 | else if (cmpxchg(pprev, work, work->next) == work) |
| @@ -46,6 +74,14 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) | |||
| 46 | return work; | 74 | return work; |
| 47 | } | 75 | } |
| 48 | 76 | ||
| 77 | /** | ||
| 78 | * task_work_run - execute the works added by task_work_add() | ||
| 79 | * | ||
| 80 | * Flush the pending work items. Should be used by core kernel code. | ||
| 81 | * Called before the task returns to user mode or stops, or when | ||
| 82 | * it exits. In the latter case task_work_add() can no longer add | ||
| 83 | * new work after task_work_run() returns. | ||
| 84 | */ | ||
| 49 | void task_work_run(void) | 85 | void task_work_run(void) |
| 50 | { | 86 | { |
| 51 | struct task_struct *task = current; | 87 | struct task_struct *task = current; |
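
A minimal usage sketch for the API documented above; the callback and its trigger are hypothetical, and note that in this kernel version the third argument of task_work_add() is a plain bool notify.

    static void flush_my_state(struct callback_head *head)
    {
            /*
             * Runs in process context before current returns to user
             * mode, or from the exit path.
             */
    }

    static struct callback_head my_twork;

    static int defer_to_return_path(void)
    {
            init_task_work(&my_twork, flush_my_state);
            /* Fails with -ESRCH once the task has passed through exit. */
            return task_work_add(current, &my_twork, true);
    }
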
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 70f27e89012b..3ce6e8c5f3fc 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
| @@ -100,12 +100,11 @@ config NO_HZ_FULL | |||
| 100 | # RCU_USER_QS dependency | 100 | # RCU_USER_QS dependency |
| 101 | depends on HAVE_CONTEXT_TRACKING | 101 | depends on HAVE_CONTEXT_TRACKING |
| 102 | # VIRT_CPU_ACCOUNTING_GEN dependency | 102 | # VIRT_CPU_ACCOUNTING_GEN dependency |
| 103 | depends on 64BIT | 103 | depends on HAVE_VIRT_CPU_ACCOUNTING_GEN |
| 104 | select NO_HZ_COMMON | 104 | select NO_HZ_COMMON |
| 105 | select RCU_USER_QS | 105 | select RCU_USER_QS |
| 106 | select RCU_NOCB_CPU | 106 | select RCU_NOCB_CPU |
| 107 | select VIRT_CPU_ACCOUNTING_GEN | 107 | select VIRT_CPU_ACCOUNTING_GEN |
| 108 | select CONTEXT_TRACKING_FORCE | ||
| 109 | select IRQ_WORK | 108 | select IRQ_WORK |
| 110 | help | 109 | help |
| 111 | Adaptively try to shutdown the tick whenever possible, even when | 110 | Adaptively try to shutdown the tick whenever possible, even when |
| @@ -134,6 +133,56 @@ config NO_HZ_FULL_ALL | |||
| 134 | Note the boot CPU will still be kept outside the range to | 133 | Note the boot CPU will still be kept outside the range to |
| 135 | handle the timekeeping duty. | 134 | handle the timekeeping duty. |
| 136 | 135 | ||
| 136 | config NO_HZ_FULL_SYSIDLE | ||
| 137 | bool "Detect full-system idle state for full dynticks system" | ||
| 138 | depends on NO_HZ_FULL | ||
| 139 | default n | ||
| 140 | help | ||
| 141 | At least one CPU must keep the scheduling-clock tick running for | ||
| 142 | timekeeping purposes whenever there is a non-idle CPU, where | ||
| 143 | "non-idle" also includes dynticks CPUs as long as they are | ||
| 144 | running non-idle tasks. Because the underlying adaptive-tick | ||
| 145 | support cannot distinguish between all CPUs being idle and | ||
| 146 | all CPUs each running a single task in dynticks mode, the | ||
| 147 | underlying support simply ensures that there is always a CPU | ||
| 148 | handling the scheduling-clock tick, whether or not all CPUs | ||
| 149 | are idle. This Kconfig option enables scalable detection of | ||
| 150 | the all-CPUs-idle state, thus allowing the scheduling-clock | ||
| 151 | tick to be disabled when all CPUs are idle. Note that scalable | ||
| 152 | detection of the all-CPUs-idle state means that larger systems | ||
| 153 | will be slower to declare the all-CPUs-idle state. | ||
| 154 | |||
| 155 | Say Y if you would like to help debug all-CPUs-idle detection. | ||
| 156 | |||
| 157 | Say N if you are unsure. | ||
| 158 | |||
| 159 | config NO_HZ_FULL_SYSIDLE_SMALL | ||
| 160 | int "Number of CPUs above which large-system approach is used" | ||
| 161 | depends on NO_HZ_FULL_SYSIDLE | ||
| 162 | range 1 NR_CPUS | ||
| 163 | default 8 | ||
| 164 | help | ||
| 165 | The full-system idle detection mechanism takes a lazy approach | ||
| 166 | on large systems, as is required to attain decent scalability. | ||
| 167 | However, on smaller systems, scalability is not anywhere near as | ||
| 168 | large a concern as is energy efficiency. The sysidle subsystem | ||
| 169 | therefore uses a fast but non-scalable algorithm for small | ||
| 170 | systems and a lazier but scalable algorithm for large systems. | ||
| 171 | This Kconfig parameter defines the number of CPUs in the largest | ||
| 172 | system that will be considered to be "small". | ||
| 173 | |||
| 174 | The default value will be fine in most cases. Battery-powered | ||
| 175 | systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger | ||
| 176 | numbers of CPUs, and (3) are suffering from battery-lifetime | ||
| 177 | problems due to long sysidle latencies might wish to experiment | ||
| 178 | with larger values for this Kconfig parameter. On the other | ||
| 179 | hand, they might be even better served by disabling NO_HZ_FULL | ||
| 180 | entirely, given that NO_HZ_FULL is intended for HPC and | ||
| 181 | real-time workloads that at present do not tend to be run on | ||
| 182 | battery-powered systems. | ||
| 183 | |||
| 184 | Take the default if you are unsure. | ||
| 185 | |||
| 137 | config NO_HZ | 186 | config NO_HZ |
| 138 | bool "Old Idle dynticks config" | 187 | bool "Old Idle dynticks config" |
| 139 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS | 188 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index eec50fcef9e4..88c9c65a430d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
| @@ -490,7 +490,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) | |||
| 490 | clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; | 490 | clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; |
| 491 | 491 | ||
| 492 | if (!alarmtimer_get_rtcdev()) | 492 | if (!alarmtimer_get_rtcdev()) |
| 493 | return -ENOTSUPP; | 493 | return -EINVAL; |
| 494 | 494 | ||
| 495 | return hrtimer_get_res(baseid, tp); | 495 | return hrtimer_get_res(baseid, tp); |
| 496 | } | 496 | } |
| @@ -507,7 +507,7 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) | |||
| 507 | struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; | 507 | struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; |
| 508 | 508 | ||
| 509 | if (!alarmtimer_get_rtcdev()) | 509 | if (!alarmtimer_get_rtcdev()) |
| 510 | return -ENOTSUPP; | 510 | return -EINVAL; |
| 511 | 511 | ||
| 512 | *tp = ktime_to_timespec(base->gettime()); | 512 | *tp = ktime_to_timespec(base->gettime()); |
| 513 | return 0; | 513 | return 0; |
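
From userspace, the alarmtimer change above is visible as a standard errno: when no RTC backs the alarm clock bases, the clock calls now fail with EINVAL instead of leaking the kernel-internal ENOTSUPP value. A small illustration, assuming a libc that exposes CLOCK_BOOTTIME_ALARM:

    #include <errno.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct timespec res;

            if (clock_getres(CLOCK_BOOTTIME_ALARM, &res) == -1 &&
                errno == EINVAL)
                    fprintf(stderr, "no alarm clocks, fall back to CLOCK_BOOTTIME\n");
            return 0;
    }
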
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 38959c866789..086ad6043bcb 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
| @@ -33,29 +33,64 @@ struct ce_unbind { | |||
| 33 | int res; | 33 | int res; |
| 34 | }; | 34 | }; |
| 35 | 35 | ||
| 36 | /** | 36 | static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, |
| 37 | * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds | 37 | bool ismax) |
| 38 | * @latch: value to convert | ||
| 39 | * @evt: pointer to clock event device descriptor | ||
| 40 | * | ||
| 41 | * Math helper, returns latch value converted to nanoseconds (bound checked) | ||
| 42 | */ | ||
| 43 | u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) | ||
| 44 | { | 38 | { |
| 45 | u64 clc = (u64) latch << evt->shift; | 39 | u64 clc = (u64) latch << evt->shift; |
| 40 | u64 rnd; | ||
| 46 | 41 | ||
| 47 | if (unlikely(!evt->mult)) { | 42 | if (unlikely(!evt->mult)) { |
| 48 | evt->mult = 1; | 43 | evt->mult = 1; |
| 49 | WARN_ON(1); | 44 | WARN_ON(1); |
| 50 | } | 45 | } |
| 46 | rnd = (u64) evt->mult - 1; | ||
| 47 | |||
| 48 | /* | ||
| 49 | * Upper bound sanity check. If the backwards conversion is | ||
| 50 | * not equal latch, we know that the above shift overflowed. | ||
| 51 | */ | ||
| 52 | if ((clc >> evt->shift) != (u64)latch) | ||
| 53 | clc = ~0ULL; | ||
| 54 | |||
| 55 | /* | ||
| 56 | * Scaled math oddities: | ||
| 57 | * | ||
| 58 | * For mult <= (1 << shift) we can safely add mult - 1 to | ||
| 59 | * prevent integer rounding loss. So the backwards conversion | ||
| 60 | * from nsec to device ticks will be correct. | ||
| 61 | * | ||
| 62 | * For mult > (1 << shift), i.e. device frequency is > 1GHz we | ||
| 63 | * need to be careful. Adding mult - 1 will result in a value | ||
| 64 | * which when converted back to device ticks can be larger | ||
| 65 | * than latch by up to (mult - 1) >> shift. For the min_delta | ||
| 66 | * calculation we still want to apply this in order to stay | ||
| 67 | * above the minimum device ticks limit. For the upper limit | ||
| 68 | * we would end up with a latch value larger than the upper | ||
| 69 | * limit of the device, so we omit the add to stay below the | ||
| 70 | * device upper boundary. | ||
| 71 | * | ||
| 72 | * Also omit the add if it would overflow the u64 boundary. | ||
| 73 | */ | ||
| 74 | if ((~0ULL - clc > rnd) && | ||
| 75 | (!ismax || evt->mult <= (1U << evt->shift))) | ||
| 76 | clc += rnd; | ||
| 51 | 77 | ||
| 52 | do_div(clc, evt->mult); | 78 | do_div(clc, evt->mult); |
| 53 | if (clc < 1000) | ||
| 54 | clc = 1000; | ||
| 55 | if (clc > KTIME_MAX) | ||
| 56 | clc = KTIME_MAX; | ||
| 57 | 79 | ||
| 58 | return clc; | 80 | /* Deltas less than 1usec are pointless noise */ |
| 81 | return clc > 1000 ? clc : 1000; | ||
| 82 | } | ||
| 83 | |||
| 84 | /** | ||
| 85 | * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds | ||
| 86 | * @latch: value to convert | ||
| 87 | * @evt: pointer to clock event device descriptor | ||
| 88 | * | ||
| 89 | * Math helper, returns latch value converted to nanoseconds (bound checked) | ||
| 90 | */ | ||
| 91 | u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) | ||
| 92 | { | ||
| 93 | return cev_delta2ns(latch, evt, false); | ||
| 59 | } | 94 | } |
| 60 | EXPORT_SYMBOL_GPL(clockevent_delta2ns); | 95 | EXPORT_SYMBOL_GPL(clockevent_delta2ns); |
| 61 | 96 | ||
| @@ -380,8 +415,8 @@ void clockevents_config(struct clock_event_device *dev, u32 freq) | |||
| 380 | sec = 600; | 415 | sec = 600; |
| 381 | 416 | ||
| 382 | clockevents_calc_mult_shift(dev, freq, sec); | 417 | clockevents_calc_mult_shift(dev, freq, sec); |
| 383 | dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); | 418 | dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); |
| 384 | dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); | 419 | dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); |
| 385 | } | 420 | } |
| 386 | 421 | ||
| 387 | /** | 422 | /** |
| @@ -584,7 +619,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev, | |||
| 584 | const char *buf, size_t count) | 619 | const char *buf, size_t count) |
| 585 | { | 620 | { |
| 586 | char name[CS_NAME_LEN]; | 621 | char name[CS_NAME_LEN]; |
| 587 | size_t ret = sysfs_get_uname(buf, name, count); | 622 | ssize_t ret = sysfs_get_uname(buf, name, count); |
| 588 | struct clock_event_device *ce; | 623 | struct clock_event_device *ce; |
| 589 | 624 | ||
| 590 | if (ret < 0) | 625 | if (ret < 0) |
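
The rounding comment in cev_delta2ns() above is easiest to follow with numbers. The toy program below replays the conversion for two made-up devices, one below and one above 1 GHz; the overflow check and the 1 usec floor from the hunk are left out for brevity, and none of the mult/shift values come from real hardware.

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t delta2ns(uint64_t latch, uint32_t mult, uint32_t shift,
                             int ismax)
    {
            uint64_t clc = latch << shift;
            uint64_t rnd = (uint64_t)mult - 1;

            if ((~0ULL - clc > rnd) && (!ismax || mult <= (1ULL << shift)))
                    clc += rnd;
            return clc / mult;
    }

    int main(void)
    {
            /*
             * ~300 MHz device (mult < 1 << shift): without the +rnd the
             * 2-tick minimum would round down to 6 ns, i.e. only 1 tick.
             */
            uint32_t mult = 1288490189, shift = 32;
            uint64_t ns = delta2ns(2, mult, shift, 0);

            printf("min: %llu ns -> %llu ticks\n", (unsigned long long)ns,
                   (unsigned long long)((ns * mult) >> shift));

            /*
             * ~2 GHz device (mult > 1 << shift): adding rnd on the max
             * path would overshoot the 32-bit counter by one tick.
             */
            mult = 1U << 31;
            shift = 30;
            ns = delta2ns(0xffffffffULL, mult, shift, 1);

            printf("max: %llu ns -> %llu ticks\n", (unsigned long long)ns,
                   (unsigned long long)((ns * mult) >> shift));
            return 0;
    }
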
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 50a8736757f3..ba3e502c955a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -479,6 +479,7 @@ static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } | |||
| 479 | static inline void clocksource_resume_watchdog(void) { } | 479 | static inline void clocksource_resume_watchdog(void) { } |
| 480 | static inline int __clocksource_watchdog_kthread(void) { return 0; } | 480 | static inline int __clocksource_watchdog_kthread(void) { return 0; } |
| 481 | static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } | 481 | static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } |
| 482 | void clocksource_mark_unstable(struct clocksource *cs) { } | ||
| 482 | 483 | ||
| 483 | #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ | 484 | #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ |
| 484 | 485 | ||
| @@ -537,40 +538,55 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) | |||
| 537 | } | 538 | } |
| 538 | 539 | ||
| 539 | /** | 540 | /** |
| 540 | * clocksource_max_deferment - Returns max time the clocksource can be deferred | 541 | * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted |
| 541 | * @cs: Pointer to clocksource | 542 | * @mult: cycle to nanosecond multiplier |
| 542 | * | 543 | * @shift: cycle to nanosecond divisor (power of two) |
| 544 | * @maxadj: maximum adjustment value to mult (~11%) | ||
| 545 | * @mask: bitmask for two's complement subtraction of non 64 bit counters | ||
| 543 | */ | 546 | */ |
| 544 | static u64 clocksource_max_deferment(struct clocksource *cs) | 547 | u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) |
| 545 | { | 548 | { |
| 546 | u64 max_nsecs, max_cycles; | 549 | u64 max_nsecs, max_cycles; |
| 547 | 550 | ||
| 548 | /* | 551 | /* |
| 549 | * Calculate the maximum number of cycles that we can pass to the | 552 | * Calculate the maximum number of cycles that we can pass to the |
| 550 | * cyc2ns function without overflowing a 64-bit signed result. The | 553 | * cyc2ns function without overflowing a 64-bit signed result. The |
| 551 | * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) | 554 | * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj) |
| 552 | * which is equivalent to the below. | 555 | * which is equivalent to the below. |
| 553 | * max_cycles < (2^63)/(cs->mult + cs->maxadj) | 556 | * max_cycles < (2^63)/(mult + maxadj) |
| 554 | * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) | 557 | * max_cycles < 2^(log2((2^63)/(mult + maxadj))) |
| 555 | * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) | 558 | * max_cycles < 2^(log2(2^63) - log2(mult + maxadj)) |
| 556 | * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) | 559 | * max_cycles < 2^(63 - log2(mult + maxadj)) |
| 557 | * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) | 560 | * max_cycles < 1 << (63 - log2(mult + maxadj)) |
| 558 | * Please note that we add 1 to the result of the log2 to account for | 561 | * Please note that we add 1 to the result of the log2 to account for |
| 559 | * any rounding errors, ensure the above inequality is satisfied and | 562 | * any rounding errors, ensure the above inequality is satisfied and |
| 560 | * no overflow will occur. | 563 | * no overflow will occur. |
| 561 | */ | 564 | */ |
| 562 | max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); | 565 | max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1)); |
| 563 | 566 | ||
| 564 | /* | 567 | /* |
| 565 | * The actual maximum number of cycles we can defer the clocksource is | 568 | * The actual maximum number of cycles we can defer the clocksource is |
| 566 | * determined by the minimum of max_cycles and cs->mask. | 569 | * determined by the minimum of max_cycles and mask. |
| 567 | * Note: Here we subtract the maxadj to make sure we don't sleep for | 570 | * Note: Here we subtract the maxadj to make sure we don't sleep for |
| 568 | * too long if there's a large negative adjustment. | 571 | * too long if there's a large negative adjustment. |
| 569 | */ | 572 | */ |
| 570 | max_cycles = min_t(u64, max_cycles, (u64) cs->mask); | 573 | max_cycles = min(max_cycles, mask); |
| 571 | max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, | 574 | max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); |
| 572 | cs->shift); | 575 | |
| 576 | return max_nsecs; | ||
| 577 | } | ||
| 578 | |||
| 579 | /** | ||
| 580 | * clocksource_max_deferment - Returns max time the clocksource can be deferred | ||
| 581 | * @cs: Pointer to clocksource | ||
| 582 | * | ||
| 583 | */ | ||
| 584 | static u64 clocksource_max_deferment(struct clocksource *cs) | ||
| 585 | { | ||
| 586 | u64 max_nsecs; | ||
| 573 | 587 | ||
| 588 | max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, | ||
| 589 | cs->mask); | ||
| 574 | /* | 590 | /* |
| 575 | * To ensure that the clocksource does not wrap whilst we are idle, | 591 | * To ensure that the clocksource does not wrap whilst we are idle, |
| 576 | * limit the time the clocksource can be deferred by 12.5%. Please | 592 | * limit the time the clocksource can be deferred by 12.5%. Please |
| @@ -893,7 +909,7 @@ sysfs_show_current_clocksources(struct device *dev, | |||
| 893 | return count; | 909 | return count; |
| 894 | } | 910 | } |
| 895 | 911 | ||
| 896 | size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) | 912 | ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) |
| 897 | { | 913 | { |
| 898 | size_t ret = cnt; | 914 | size_t ret = cnt; |
| 899 | 915 | ||
| @@ -924,7 +940,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev, | |||
| 924 | struct device_attribute *attr, | 940 | struct device_attribute *attr, |
| 925 | const char *buf, size_t count) | 941 | const char *buf, size_t count) |
| 926 | { | 942 | { |
| 927 | size_t ret; | 943 | ssize_t ret; |
| 928 | 944 | ||
| 929 | mutex_lock(&clocksource_mutex); | 945 | mutex_lock(&clocksource_mutex); |
| 930 | 946 | ||
| @@ -952,7 +968,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev, | |||
| 952 | { | 968 | { |
| 953 | struct clocksource *cs; | 969 | struct clocksource *cs; |
| 954 | char name[CS_NAME_LEN]; | 970 | char name[CS_NAME_LEN]; |
| 955 | size_t ret; | 971 | ssize_t ret; |
| 956 | 972 | ||
| 957 | ret = sysfs_get_uname(buf, name, count); | 973 | ret = sysfs_get_uname(buf, name, count); |
| 958 | if (ret < 0) | 974 | if (ret < 0) |
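
The inequality chain in the clocks_calc_max_nsecs() comment above can be exercised directly. The toy program below re-derives the bound for an invented 24-bit, roughly 24 MHz counter, approximating ilog2() with a GCC/Clang builtin; none of the numbers come from a real clocksource.

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t calc_max_nsecs(uint32_t mult, uint32_t shift,
                                   uint32_t maxadj, uint64_t mask)
    {
            /*
             * max_cycles < 2^(63 - log2(mult + maxadj)), with one extra
             * bit dropped to absorb rounding, as in the comment above.
             */
            unsigned int lg = 63 - __builtin_clzll((uint64_t)mult + maxadj);
            uint64_t max_cycles = 1ULL << (63 - (lg + 1));

            if (max_cycles > mask)
                    max_cycles = mask;

            /* cyc2ns with the adjustment subtracted from mult */
            return (max_cycles * (mult - maxadj)) >> shift;
    }

    int main(void)
    {
            /*
             * Hypothetical counter: ns = cycles * mult >> shift, about
             * 41.7 ns per cycle (~24 MHz), 24 valid bits, ~11% margin.
             */
            uint32_t mult = 699050667, shift = 24, maxadj = mult / 9;
            uint64_t mask = (1ULL << 24) - 1;

            printf("max deferment ~= %llu ns\n",
                   (unsigned long long)calc_max_nsecs(mult, shift, maxadj, mask));
            return 0;
    }
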
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8f5b3b98577b..af8d1d4f3d55 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
| @@ -475,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work) | |||
| 475 | * called as close as possible to 500 ms before the new second starts. | 475 | * called as close as possible to 500 ms before the new second starts. |
| 476 | * This code is run on a timer. If the clock is set, that timer | 476 | * This code is run on a timer. If the clock is set, that timer |
| 477 | * may not expire at the correct time. Thus, we adjust... | 477 | * may not expire at the correct time. Thus, we adjust... |
| 478 | * We want the clock to be within a couple of ticks from the target. | ||
| 478 | */ | 479 | */ |
| 479 | if (!ntp_synced()) { | 480 | if (!ntp_synced()) { |
| 480 | /* | 481 | /* |
| @@ -485,7 +486,7 @@ static void sync_cmos_clock(struct work_struct *work) | |||
| 485 | } | 486 | } |
| 486 | 487 | ||
| 487 | getnstimeofday(&now); | 488 | getnstimeofday(&now); |
| 488 | if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) { | 489 | if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { |
| 489 | struct timespec adjust = now; | 490 | struct timespec adjust = now; |
| 490 | 491 | ||
| 491 | fail = -ENODEV; | 492 | fail = -ENODEV; |
| @@ -516,13 +517,13 @@ static void sync_cmos_clock(struct work_struct *work) | |||
| 516 | schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); | 517 | schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); |
| 517 | } | 518 | } |
| 518 | 519 | ||
| 519 | static void notify_cmos_timer(void) | 520 | void ntp_notify_cmos_timer(void) |
| 520 | { | 521 | { |
| 521 | schedule_delayed_work(&sync_cmos_work, 0); | 522 | schedule_delayed_work(&sync_cmos_work, 0); |
| 522 | } | 523 | } |
| 523 | 524 | ||
| 524 | #else | 525 | #else |
| 525 | static inline void notify_cmos_timer(void) { } | 526 | void ntp_notify_cmos_timer(void) { } |
| 526 | #endif | 527 | #endif |
| 527 | 528 | ||
| 528 | 529 | ||
| @@ -687,8 +688,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) | |||
| 687 | if (!(time_status & STA_NANO)) | 688 | if (!(time_status & STA_NANO)) |
| 688 | txc->time.tv_usec /= NSEC_PER_USEC; | 689 | txc->time.tv_usec /= NSEC_PER_USEC; |
| 689 | 690 | ||
| 690 | notify_cmos_timer(); | ||
| 691 | |||
| 692 | return result; | 691 | return result; |
| 693 | } | 692 | } |
| 694 | 693 | ||
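
Two things change in ntp.c above: the mid-second window in which sync_cmos_clock() attempts the RTC update widens from half a tick to five ticks, and notify_cmos_timer() becomes the externally visible ntp_notify_cmos_timer(), with the call removed from __do_adjtimex(), presumably so that a caller outside this section can schedule the delayed work only after its own locks are dropped. The widened window is easiest to see numerically; tick_nsec is roughly NSEC_PER_SEC/HZ.

    #include <stdio.h>

    #define NSEC_PER_SEC 1000000000L

    int main(void)
    {
            /*
             * Tolerance around the 500 ms point at which the RTC write
             * is attempted: formerly +/- tick_nsec/2, now +/- tick_nsec*5.
             */
            long hz[] = { 100, 250, 300, 1000 };
            unsigned int i;

            for (i = 0; i < sizeof(hz) / sizeof(hz[0]); i++) {
                    long tick_nsec = NSEC_PER_SEC / hz[i];

                    printf("HZ=%-4ld old +/- %8ld ns, new +/- %8ld ns\n",
                           hz[i], tick_nsec / 2, tick_nsec * 5);
            }
            return 0;
    }
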
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index a326f27d7f09..68b799375981 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c | |||
| @@ -8,25 +8,28 @@ | |||
| 8 | #include <linux/clocksource.h> | 8 | #include <linux/clocksource.h> |
| 9 | #include <linux/init.h> | 9 | #include <linux/init.h> |
| 10 | #include <linux/jiffies.h> | 10 | #include <linux/jiffies.h> |
| 11 | #include <linux/ktime.h> | ||
| 11 | #include <linux/kernel.h> | 12 | #include <linux/kernel.h> |
| 12 | #include <linux/moduleparam.h> | 13 | #include <linux/moduleparam.h> |
| 13 | #include <linux/sched.h> | 14 | #include <linux/sched.h> |
| 14 | #include <linux/syscore_ops.h> | 15 | #include <linux/syscore_ops.h> |
| 15 | #include <linux/timer.h> | 16 | #include <linux/hrtimer.h> |
| 16 | #include <linux/sched_clock.h> | 17 | #include <linux/sched_clock.h> |
| 18 | #include <linux/seqlock.h> | ||
| 19 | #include <linux/bitops.h> | ||
| 17 | 20 | ||
| 18 | struct clock_data { | 21 | struct clock_data { |
| 22 | ktime_t wrap_kt; | ||
| 19 | u64 epoch_ns; | 23 | u64 epoch_ns; |
| 20 | u32 epoch_cyc; | 24 | u64 epoch_cyc; |
| 21 | u32 epoch_cyc_copy; | 25 | seqcount_t seq; |
| 22 | unsigned long rate; | 26 | unsigned long rate; |
| 23 | u32 mult; | 27 | u32 mult; |
| 24 | u32 shift; | 28 | u32 shift; |
| 25 | bool suspended; | 29 | bool suspended; |
| 26 | }; | 30 | }; |
| 27 | 31 | ||
| 28 | static void sched_clock_poll(unsigned long wrap_ticks); | 32 | static struct hrtimer sched_clock_timer; |
| 29 | static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0); | ||
| 30 | static int irqtime = -1; | 33 | static int irqtime = -1; |
| 31 | 34 | ||
| 32 | core_param(irqtime, irqtime, int, 0400); | 35 | core_param(irqtime, irqtime, int, 0400); |
| @@ -35,42 +38,46 @@ static struct clock_data cd = { | |||
| 35 | .mult = NSEC_PER_SEC / HZ, | 38 | .mult = NSEC_PER_SEC / HZ, |
| 36 | }; | 39 | }; |
| 37 | 40 | ||
| 38 | static u32 __read_mostly sched_clock_mask = 0xffffffff; | 41 | static u64 __read_mostly sched_clock_mask; |
| 39 | 42 | ||
| 40 | static u32 notrace jiffy_sched_clock_read(void) | 43 | static u64 notrace jiffy_sched_clock_read(void) |
| 41 | { | 44 | { |
| 42 | return (u32)(jiffies - INITIAL_JIFFIES); | 45 | /* |
| 46 | * We don't need to use get_jiffies_64 on 32-bit arches here | ||
| 47 | * because we register with BITS_PER_LONG | ||
| 48 | */ | ||
| 49 | return (u64)(jiffies - INITIAL_JIFFIES); | ||
| 43 | } | 50 | } |
| 44 | 51 | ||
| 45 | static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; | 52 | static u32 __read_mostly (*read_sched_clock_32)(void); |
| 53 | |||
| 54 | static u64 notrace read_sched_clock_32_wrapper(void) | ||
| 55 | { | ||
| 56 | return read_sched_clock_32(); | ||
| 57 | } | ||
| 58 | |||
| 59 | static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; | ||
| 46 | 60 | ||
| 47 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | 61 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) |
| 48 | { | 62 | { |
| 49 | return (cyc * mult) >> shift; | 63 | return (cyc * mult) >> shift; |
| 50 | } | 64 | } |
| 51 | 65 | ||
| 52 | static unsigned long long notrace sched_clock_32(void) | 66 | unsigned long long notrace sched_clock(void) |
| 53 | { | 67 | { |
| 54 | u64 epoch_ns; | 68 | u64 epoch_ns; |
| 55 | u32 epoch_cyc; | 69 | u64 epoch_cyc; |
| 56 | u32 cyc; | 70 | u64 cyc; |
| 71 | unsigned long seq; | ||
| 57 | 72 | ||
| 58 | if (cd.suspended) | 73 | if (cd.suspended) |
| 59 | return cd.epoch_ns; | 74 | return cd.epoch_ns; |
| 60 | 75 | ||
| 61 | /* | ||
| 62 | * Load the epoch_cyc and epoch_ns atomically. We do this by | ||
| 63 | * ensuring that we always write epoch_cyc, epoch_ns and | ||
| 64 | * epoch_cyc_copy in strict order, and read them in strict order. | ||
| 65 | * If epoch_cyc and epoch_cyc_copy are not equal, then we're in | ||
| 66 | * the middle of an update, and we should repeat the load. | ||
| 67 | */ | ||
| 68 | do { | 76 | do { |
| 77 | seq = read_seqcount_begin(&cd.seq); | ||
| 69 | epoch_cyc = cd.epoch_cyc; | 78 | epoch_cyc = cd.epoch_cyc; |
| 70 | smp_rmb(); | ||
| 71 | epoch_ns = cd.epoch_ns; | 79 | epoch_ns = cd.epoch_ns; |
| 72 | smp_rmb(); | 80 | } while (read_seqcount_retry(&cd.seq, seq)); |
| 73 | } while (epoch_cyc != cd.epoch_cyc_copy); | ||
| 74 | 81 | ||
| 75 | cyc = read_sched_clock(); | 82 | cyc = read_sched_clock(); |
| 76 | cyc = (cyc - epoch_cyc) & sched_clock_mask; | 83 | cyc = (cyc - epoch_cyc) & sched_clock_mask; |
| @@ -83,49 +90,46 @@ static unsigned long long notrace sched_clock_32(void) | |||
| 83 | static void notrace update_sched_clock(void) | 90 | static void notrace update_sched_clock(void) |
| 84 | { | 91 | { |
| 85 | unsigned long flags; | 92 | unsigned long flags; |
| 86 | u32 cyc; | 93 | u64 cyc; |
| 87 | u64 ns; | 94 | u64 ns; |
| 88 | 95 | ||
| 89 | cyc = read_sched_clock(); | 96 | cyc = read_sched_clock(); |
| 90 | ns = cd.epoch_ns + | 97 | ns = cd.epoch_ns + |
| 91 | cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, | 98 | cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, |
| 92 | cd.mult, cd.shift); | 99 | cd.mult, cd.shift); |
| 93 | /* | 100 | |
| 94 | * Write epoch_cyc and epoch_ns in a way that the update is | ||
| 95 | * detectable in cyc_to_fixed_sched_clock(). | ||
| 96 | */ | ||
| 97 | raw_local_irq_save(flags); | 101 | raw_local_irq_save(flags); |
| 98 | cd.epoch_cyc_copy = cyc; | 102 | write_seqcount_begin(&cd.seq); |
| 99 | smp_wmb(); | ||
| 100 | cd.epoch_ns = ns; | 103 | cd.epoch_ns = ns; |
| 101 | smp_wmb(); | ||
| 102 | cd.epoch_cyc = cyc; | 104 | cd.epoch_cyc = cyc; |
| 105 | write_seqcount_end(&cd.seq); | ||
| 103 | raw_local_irq_restore(flags); | 106 | raw_local_irq_restore(flags); |
| 104 | } | 107 | } |
| 105 | 108 | ||
| 106 | static void sched_clock_poll(unsigned long wrap_ticks) | 109 | static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) |
| 107 | { | 110 | { |
| 108 | mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks)); | ||
| 109 | update_sched_clock(); | 111 | update_sched_clock(); |
| 112 | hrtimer_forward_now(hrt, cd.wrap_kt); | ||
| 113 | return HRTIMER_RESTART; | ||
| 110 | } | 114 | } |
| 111 | 115 | ||
| 112 | void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) | 116 | void __init sched_clock_register(u64 (*read)(void), int bits, |
| 117 | unsigned long rate) | ||
| 113 | { | 118 | { |
| 114 | unsigned long r, w; | 119 | unsigned long r; |
| 115 | u64 res, wrap; | 120 | u64 res, wrap; |
| 116 | char r_unit; | 121 | char r_unit; |
| 117 | 122 | ||
| 118 | if (cd.rate > rate) | 123 | if (cd.rate > rate) |
| 119 | return; | 124 | return; |
| 120 | 125 | ||
| 121 | BUG_ON(bits > 32); | ||
| 122 | WARN_ON(!irqs_disabled()); | 126 | WARN_ON(!irqs_disabled()); |
| 123 | read_sched_clock = read; | 127 | read_sched_clock = read; |
| 124 | sched_clock_mask = (1 << bits) - 1; | 128 | sched_clock_mask = CLOCKSOURCE_MASK(bits); |
| 125 | cd.rate = rate; | 129 | cd.rate = rate; |
| 126 | 130 | ||
| 127 | /* calculate the mult/shift to convert counter ticks to ns. */ | 131 | /* calculate the mult/shift to convert counter ticks to ns. */ |
| 128 | clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); | 132 | clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600); |
| 129 | 133 | ||
| 130 | r = rate; | 134 | r = rate; |
| 131 | if (r >= 4000000) { | 135 | if (r >= 4000000) { |
| @@ -138,20 +142,14 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) | |||
| 138 | r_unit = ' '; | 142 | r_unit = ' '; |
| 139 | 143 | ||
| 140 | /* calculate how many ns until we wrap */ | 144 | /* calculate how many ns until we wrap */ |
| 141 | wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); | 145 | wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask); |
| 142 | do_div(wrap, NSEC_PER_MSEC); | 146 | cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); |
| 143 | w = wrap; | ||
| 144 | 147 | ||
| 145 | /* calculate the ns resolution of this counter */ | 148 | /* calculate the ns resolution of this counter */ |
| 146 | res = cyc_to_ns(1ULL, cd.mult, cd.shift); | 149 | res = cyc_to_ns(1ULL, cd.mult, cd.shift); |
| 147 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", | 150 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", |
| 148 | bits, r, r_unit, res, w); | 151 | bits, r, r_unit, res, wrap); |
| 149 | 152 | ||
| 150 | /* | ||
| 151 | * Start the timer to keep sched_clock() properly updated and | ||
| 152 | * sets the initial epoch. | ||
| 153 | */ | ||
| 154 | sched_clock_timer.data = msecs_to_jiffies(w - (w / 10)); | ||
| 155 | update_sched_clock(); | 153 | update_sched_clock(); |
| 156 | 154 | ||
| 157 | /* | 155 | /* |
| @@ -166,11 +164,10 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) | |||
| 166 | pr_debug("Registered %pF as sched_clock source\n", read); | 164 | pr_debug("Registered %pF as sched_clock source\n", read); |
| 167 | } | 165 | } |
| 168 | 166 | ||
| 169 | unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; | 167 | void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) |
| 170 | |||
| 171 | unsigned long long notrace sched_clock(void) | ||
| 172 | { | 168 | { |
| 173 | return sched_clock_func(); | 169 | read_sched_clock_32 = read; |
| 170 | sched_clock_register(read_sched_clock_32_wrapper, bits, rate); | ||
| 174 | } | 171 | } |
| 175 | 172 | ||
| 176 | void __init sched_clock_postinit(void) | 173 | void __init sched_clock_postinit(void) |
| @@ -180,14 +177,22 @@ void __init sched_clock_postinit(void) | |||
| 180 | * make it the final one. | 177 | * make it the final one. |
| 181 | */ | 178 | */ |
| 182 | if (read_sched_clock == jiffy_sched_clock_read) | 179 | if (read_sched_clock == jiffy_sched_clock_read) |
| 183 | setup_sched_clock(jiffy_sched_clock_read, 32, HZ); | 180 | sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); |
| 184 | 181 | ||
| 185 | sched_clock_poll(sched_clock_timer.data); | 182 | update_sched_clock(); |
| 183 | |||
| 184 | /* | ||
| 185 | * Start the timer to keep sched_clock() properly updated and | ||
| 186 | * sets the initial epoch. | ||
| 187 | */ | ||
| 188 | hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 189 | sched_clock_timer.function = sched_clock_poll; | ||
| 190 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | ||
| 186 | } | 191 | } |
| 187 | 192 | ||
| 188 | static int sched_clock_suspend(void) | 193 | static int sched_clock_suspend(void) |
| 189 | { | 194 | { |
| 190 | sched_clock_poll(sched_clock_timer.data); | 195 | sched_clock_poll(&sched_clock_timer); |
| 191 | cd.suspended = true; | 196 | cd.suspended = true; |
| 192 | return 0; | 197 | return 0; |
| 193 | } | 198 | } |
| @@ -195,7 +200,6 @@ static int sched_clock_suspend(void) | |||
| 195 | static void sched_clock_resume(void) | 200 | static void sched_clock_resume(void) |
| 196 | { | 201 | { |
| 197 | cd.epoch_cyc = read_sched_clock(); | 202 | cd.epoch_cyc = read_sched_clock(); |
| 198 | cd.epoch_cyc_copy = cd.epoch_cyc; | ||
| 199 | cd.suspended = false; | 203 | cd.suspended = false; |
| 200 | } | 204 | } |
| 201 | 205 | ||
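
The sched_clock.c rewrite above replaces the hand-rolled epoch_cyc/epoch_cyc_copy double-write protocol (paired smp_wmb()s on the writer side, retry-on-mismatch on the reader side) with a seqcount. Below is a minimal sketch of that reader/writer idiom, reusing the field names from this hunk purely as an illustration of the locking pattern, not as a verbatim copy of the file:

#include <linux/seqlock.h>
#include <linux/types.h>

static struct {
	seqcount_t seq;
	u64 epoch_ns;
	u64 epoch_cyc;
} cd;

/* Writer: called with interrupts disabled, so readers retry only briefly. */
static void set_epoch(u64 cyc, u64 ns)
{
	write_seqcount_begin(&cd.seq);	/* sequence goes odd: update in flight */
	cd.epoch_ns = ns;
	cd.epoch_cyc = cyc;
	write_seqcount_end(&cd.seq);	/* sequence goes even again */
}

/* Reader: loops if a writer overlapped the two loads. */
static u64 get_epoch_ns(u64 *cyc)
{
	u64 ns;
	unsigned long seq;

	do {
		seq = read_seqcount_begin(&cd.seq);
		*cyc = cd.epoch_cyc;
		ns = cd.epoch_ns;
	} while (read_seqcount_retry(&cd.seq, seq));

	return ns;
}

The same retry loop is what makes the now 64-bit epoch_cyc safe to read on 32-bit machines: a torn or half-updated pair is detected and simply re-read.
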
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 218bcb565fed..9532690daaa9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
| @@ -70,6 +70,7 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev, | |||
| 70 | struct clock_event_device *newdev) | 70 | struct clock_event_device *newdev) |
| 71 | { | 71 | { |
| 72 | if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || | 72 | if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || |
| 73 | (newdev->features & CLOCK_EVT_FEAT_PERCPU) || | ||
| 73 | (newdev->features & CLOCK_EVT_FEAT_C3STOP)) | 74 | (newdev->features & CLOCK_EVT_FEAT_C3STOP)) |
| 74 | return false; | 75 | return false; |
| 75 | 76 | ||
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index bc906cad709b..18e71f7fbc2a 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
| @@ -31,7 +31,7 @@ extern void tick_install_replacement(struct clock_event_device *dev); | |||
| 31 | 31 | ||
| 32 | extern void clockevents_shutdown(struct clock_event_device *dev); | 32 | extern void clockevents_shutdown(struct clock_event_device *dev); |
| 33 | 33 | ||
| 34 | extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); | 34 | extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); |
| 35 | 35 | ||
| 36 | /* | 36 | /* |
| 37 | * NO_HZ / high resolution timer shared code | 37 | * NO_HZ / high resolution timer shared code |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e77edc97e036..3612fc77f834 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/irq_work.h> | 23 | #include <linux/irq_work.h> |
| 24 | #include <linux/posix-timers.h> | 24 | #include <linux/posix-timers.h> |
| 25 | #include <linux/perf_event.h> | 25 | #include <linux/perf_event.h> |
| 26 | #include <linux/context_tracking.h> | ||
| 26 | 27 | ||
| 27 | #include <asm/irq_regs.h> | 28 | #include <asm/irq_regs.h> |
| 28 | 29 | ||
| @@ -148,8 +149,8 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) | |||
| 148 | } | 149 | } |
| 149 | 150 | ||
| 150 | #ifdef CONFIG_NO_HZ_FULL | 151 | #ifdef CONFIG_NO_HZ_FULL |
| 151 | static cpumask_var_t nohz_full_mask; | 152 | cpumask_var_t tick_nohz_full_mask; |
| 152 | bool have_nohz_full_mask; | 153 | bool tick_nohz_full_running; |
| 153 | 154 | ||
| 154 | static bool can_stop_full_tick(void) | 155 | static bool can_stop_full_tick(void) |
| 155 | { | 156 | { |
| @@ -182,7 +183,8 @@ static bool can_stop_full_tick(void) | |||
| 182 | * Don't allow the user to think they can get | 183 | * Don't allow the user to think they can get |
| 183 | * full NO_HZ with this machine. | 184 | * full NO_HZ with this machine. |
| 184 | */ | 185 | */ |
| 185 | WARN_ONCE(1, "NO_HZ FULL will not work with unstable sched clock"); | 186 | WARN_ONCE(tick_nohz_full_running, |
| 187 | "NO_HZ FULL will not work with unstable sched clock"); | ||
| 186 | return false; | 188 | return false; |
| 187 | } | 189 | } |
| 188 | #endif | 190 | #endif |
| @@ -196,7 +198,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); | |||
| 196 | * Re-evaluate the need for the tick on the current CPU | 198 | * Re-evaluate the need for the tick on the current CPU |
| 197 | * and restart it if necessary. | 199 | * and restart it if necessary. |
| 198 | */ | 200 | */ |
| 199 | void tick_nohz_full_check(void) | 201 | void __tick_nohz_full_check(void) |
| 200 | { | 202 | { |
| 201 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 203 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
| 202 | 204 | ||
| @@ -210,7 +212,7 @@ void tick_nohz_full_check(void) | |||
| 210 | 212 | ||
| 211 | static void nohz_full_kick_work_func(struct irq_work *work) | 213 | static void nohz_full_kick_work_func(struct irq_work *work) |
| 212 | { | 214 | { |
| 213 | tick_nohz_full_check(); | 215 | __tick_nohz_full_check(); |
| 214 | } | 216 | } |
| 215 | 217 | ||
| 216 | static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { | 218 | static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { |
| @@ -229,7 +231,7 @@ void tick_nohz_full_kick(void) | |||
| 229 | 231 | ||
| 230 | static void nohz_full_kick_ipi(void *info) | 232 | static void nohz_full_kick_ipi(void *info) |
| 231 | { | 233 | { |
| 232 | tick_nohz_full_check(); | 234 | __tick_nohz_full_check(); |
| 233 | } | 235 | } |
| 234 | 236 | ||
| 235 | /* | 237 | /* |
| @@ -238,12 +240,13 @@ static void nohz_full_kick_ipi(void *info) | |||
| 238 | */ | 240 | */ |
| 239 | void tick_nohz_full_kick_all(void) | 241 | void tick_nohz_full_kick_all(void) |
| 240 | { | 242 | { |
| 241 | if (!have_nohz_full_mask) | 243 | if (!tick_nohz_full_running) |
| 242 | return; | 244 | return; |
| 243 | 245 | ||
| 244 | preempt_disable(); | 246 | preempt_disable(); |
| 245 | smp_call_function_many(nohz_full_mask, | 247 | smp_call_function_many(tick_nohz_full_mask, |
| 246 | nohz_full_kick_ipi, NULL, false); | 248 | nohz_full_kick_ipi, NULL, false); |
| 249 | tick_nohz_full_kick(); | ||
| 247 | preempt_enable(); | 250 | preempt_enable(); |
| 248 | } | 251 | } |
| 249 | 252 | ||
| @@ -252,7 +255,7 @@ void tick_nohz_full_kick_all(void) | |||
| 252 | * It might need the tick due to per task/process properties: | 255 | * It might need the tick due to per task/process properties: |
| 253 | * perf events, posix cpu timers, ... | 256 | * perf events, posix cpu timers, ... |
| 254 | */ | 257 | */ |
| 255 | void tick_nohz_task_switch(struct task_struct *tsk) | 258 | void __tick_nohz_task_switch(struct task_struct *tsk) |
| 256 | { | 259 | { |
| 257 | unsigned long flags; | 260 | unsigned long flags; |
| 258 | 261 | ||
| @@ -268,31 +271,23 @@ out: | |||
| 268 | local_irq_restore(flags); | 271 | local_irq_restore(flags); |
| 269 | } | 272 | } |
| 270 | 273 | ||
| 271 | int tick_nohz_full_cpu(int cpu) | ||
| 272 | { | ||
| 273 | if (!have_nohz_full_mask) | ||
| 274 | return 0; | ||
| 275 | |||
| 276 | return cpumask_test_cpu(cpu, nohz_full_mask); | ||
| 277 | } | ||
| 278 | |||
| 279 | /* Parse the boot-time nohz CPU list from the kernel parameters. */ | 274 | /* Parse the boot-time nohz CPU list from the kernel parameters. */ |
| 280 | static int __init tick_nohz_full_setup(char *str) | 275 | static int __init tick_nohz_full_setup(char *str) |
| 281 | { | 276 | { |
| 282 | int cpu; | 277 | int cpu; |
| 283 | 278 | ||
| 284 | alloc_bootmem_cpumask_var(&nohz_full_mask); | 279 | alloc_bootmem_cpumask_var(&tick_nohz_full_mask); |
| 285 | if (cpulist_parse(str, nohz_full_mask) < 0) { | 280 | if (cpulist_parse(str, tick_nohz_full_mask) < 0) { |
| 286 | pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); | 281 | pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); |
| 287 | return 1; | 282 | return 1; |
| 288 | } | 283 | } |
| 289 | 284 | ||
| 290 | cpu = smp_processor_id(); | 285 | cpu = smp_processor_id(); |
| 291 | if (cpumask_test_cpu(cpu, nohz_full_mask)) { | 286 | if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { |
| 292 | pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); | 287 | pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); |
| 293 | cpumask_clear_cpu(cpu, nohz_full_mask); | 288 | cpumask_clear_cpu(cpu, tick_nohz_full_mask); |
| 294 | } | 289 | } |
| 295 | have_nohz_full_mask = true; | 290 | tick_nohz_full_running = true; |
| 296 | 291 | ||
| 297 | return 1; | 292 | return 1; |
| 298 | } | 293 | } |
| @@ -310,7 +305,7 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, | |||
| 310 | * If we handle the timekeeping duty for full dynticks CPUs, | 305 | * If we handle the timekeeping duty for full dynticks CPUs, |
| 311 | * we can't safely shutdown that CPU. | 306 | * we can't safely shutdown that CPU. |
| 312 | */ | 307 | */ |
| 313 | if (have_nohz_full_mask && tick_do_timer_cpu == cpu) | 308 | if (tick_nohz_full_running && tick_do_timer_cpu == cpu) |
| 314 | return NOTIFY_BAD; | 309 | return NOTIFY_BAD; |
| 315 | break; | 310 | break; |
| 316 | } | 311 | } |
| @@ -329,14 +324,14 @@ static int tick_nohz_init_all(void) | |||
| 329 | int err = -1; | 324 | int err = -1; |
| 330 | 325 | ||
| 331 | #ifdef CONFIG_NO_HZ_FULL_ALL | 326 | #ifdef CONFIG_NO_HZ_FULL_ALL |
| 332 | if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { | 327 | if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { |
| 333 | pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); | 328 | pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); |
| 334 | return err; | 329 | return err; |
| 335 | } | 330 | } |
| 336 | err = 0; | 331 | err = 0; |
| 337 | cpumask_setall(nohz_full_mask); | 332 | cpumask_setall(tick_nohz_full_mask); |
| 338 | cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); | 333 | cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); |
| 339 | have_nohz_full_mask = true; | 334 | tick_nohz_full_running = true; |
| 340 | #endif | 335 | #endif |
| 341 | return err; | 336 | return err; |
| 342 | } | 337 | } |
| @@ -345,17 +340,18 @@ void __init tick_nohz_init(void) | |||
| 345 | { | 340 | { |
| 346 | int cpu; | 341 | int cpu; |
| 347 | 342 | ||
| 348 | if (!have_nohz_full_mask) { | 343 | if (!tick_nohz_full_running) { |
| 349 | if (tick_nohz_init_all() < 0) | 344 | if (tick_nohz_init_all() < 0) |
| 350 | return; | 345 | return; |
| 351 | } | 346 | } |
| 352 | 347 | ||
| 348 | for_each_cpu(cpu, tick_nohz_full_mask) | ||
| 349 | context_tracking_cpu_set(cpu); | ||
| 350 | |||
| 353 | cpu_notifier(tick_nohz_cpu_down_callback, 0); | 351 | cpu_notifier(tick_nohz_cpu_down_callback, 0); |
| 354 | cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); | 352 | cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); |
| 355 | pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); | 353 | pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); |
| 356 | } | 354 | } |
| 357 | #else | ||
| 358 | #define have_nohz_full_mask (0) | ||
| 359 | #endif | 355 | #endif |
| 360 | 356 | ||
| 361 | /* | 357 | /* |
| @@ -733,7 +729,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | |||
| 733 | return false; | 729 | return false; |
| 734 | } | 730 | } |
| 735 | 731 | ||
| 736 | if (have_nohz_full_mask) { | 732 | if (tick_nohz_full_enabled()) { |
| 737 | /* | 733 | /* |
| 738 | * Keep the tick alive to guarantee timekeeping progression | 734 | * Keep the tick alive to guarantee timekeeping progression |
| 739 | * if there are full dynticks CPUs around | 735 | * if there are full dynticks CPUs around |
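
With the rename above, the full-dynticks cpumask and its enable flag become globally visible symbols (tick_nohz_full_mask, tick_nohz_full_running), and the out-of-line tick_nohz_full_cpu() disappears from this file. A hedged sketch of how callers presumably consume them as inline helpers in a header; the helper bodies below are an assumption, only the symbol names are taken from this hunk:

#ifdef CONFIG_NO_HZ_FULL
extern cpumask_var_t tick_nohz_full_mask;
extern bool tick_nohz_full_running;

static inline bool tick_nohz_full_enabled(void)
{
	return tick_nohz_full_running;
}

static inline bool tick_nohz_full_cpu(int cpu)
{
	if (!tick_nohz_full_enabled())
		return false;
	return cpumask_test_cpu(cpu, tick_nohz_full_mask);
}
#else
static inline bool tick_nohz_full_enabled(void) { return false; }
static inline bool tick_nohz_full_cpu(int cpu) { return false; }
#endif

can_stop_idle_tick() above already switches to tick_nohz_full_enabled(), so non-NO_HZ_FULL builds reduce the check to a constant false.
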
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 48b9fffabdc2..3abf53418b67 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -1613,9 +1613,10 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | |||
| 1613 | * ktime_get_update_offsets - hrtimer helper | 1613 | * ktime_get_update_offsets - hrtimer helper |
| 1614 | * @offs_real: pointer to storage for monotonic -> realtime offset | 1614 | * @offs_real: pointer to storage for monotonic -> realtime offset |
| 1615 | * @offs_boot: pointer to storage for monotonic -> boottime offset | 1615 | * @offs_boot: pointer to storage for monotonic -> boottime offset |
| 1616 | * @offs_tai: pointer to storage for monotonic -> clock tai offset | ||
| 1616 | * | 1617 | * |
| 1617 | * Returns current monotonic time and updates the offsets | 1618 | * Returns current monotonic time and updates the offsets |
| 1618 | * Called from hrtimer_interupt() or retrigger_next_event() | 1619 | * Called from hrtimer_interrupt() or retrigger_next_event() |
| 1619 | */ | 1620 | */ |
| 1620 | ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, | 1621 | ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, |
| 1621 | ktime_t *offs_tai) | 1622 | ktime_t *offs_tai) |
| @@ -1703,6 +1704,8 @@ int do_adjtimex(struct timex *txc) | |||
| 1703 | write_seqcount_end(&timekeeper_seq); | 1704 | write_seqcount_end(&timekeeper_seq); |
| 1704 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1705 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
| 1705 | 1706 | ||
| 1707 | ntp_notify_cmos_timer(); | ||
| 1708 | |||
| 1706 | return ret; | 1709 | return ret; |
| 1707 | } | 1710 | } |
| 1708 | 1711 | ||
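
Paired with the ntp.c hunk at the top of this section, the change above moves the CMOS-clock kick out of __do_adjtimex(), which runs under timekeeper_seq and timekeeper_lock, and into do_adjtimex() after both are released, presumably so the deferred work scheduled by the notifier cannot tangle with the timekeeping locks. A condensed ordering sketch follows; it is not the full function body, and the locals are shown only to keep the fragment well-formed:

int do_adjtimex_sketch(struct timex *txc)
{
	struct timespec ts;
	unsigned long flags;
	s32 tai;
	int ret;

	raw_spin_lock_irqsave(&timekeeper_lock, flags);
	write_seqcount_begin(&timekeeper_seq);

	ret = __do_adjtimex(txc, &ts, &tai);	/* no longer kicks the CMOS timer */

	write_seqcount_end(&timekeeper_seq);
	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

	ntp_notify_cmos_timer();	/* deferred until the locks are dropped */

	return ret;
}
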
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 3bdf28323012..61ed862cdd37 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
| @@ -265,10 +265,9 @@ static inline void timer_list_header(struct seq_file *m, u64 now) | |||
| 265 | static int timer_list_show(struct seq_file *m, void *v) | 265 | static int timer_list_show(struct seq_file *m, void *v) |
| 266 | { | 266 | { |
| 267 | struct timer_list_iter *iter = v; | 267 | struct timer_list_iter *iter = v; |
| 268 | u64 now = ktime_to_ns(ktime_get()); | ||
| 269 | 268 | ||
| 270 | if (iter->cpu == -1 && !iter->second_pass) | 269 | if (iter->cpu == -1 && !iter->second_pass) |
| 271 | timer_list_header(m, now); | 270 | timer_list_header(m, iter->now); |
| 272 | else if (!iter->second_pass) | 271 | else if (!iter->second_pass) |
| 273 | print_cpu(m, iter->cpu, iter->now); | 272 | print_cpu(m, iter->cpu, iter->now); |
| 274 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | 273 | #ifdef CONFIG_GENERIC_CLOCKEVENTS |
| @@ -298,33 +297,41 @@ void sysrq_timer_list_show(void) | |||
| 298 | return; | 297 | return; |
| 299 | } | 298 | } |
| 300 | 299 | ||
| 301 | static void *timer_list_start(struct seq_file *file, loff_t *offset) | 300 | static void *move_iter(struct timer_list_iter *iter, loff_t offset) |
| 302 | { | 301 | { |
| 303 | struct timer_list_iter *iter = file->private; | 302 | for (; offset; offset--) { |
| 304 | 303 | iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); | |
| 305 | if (!*offset) { | 304 | if (iter->cpu >= nr_cpu_ids) { |
| 306 | iter->cpu = -1; | ||
| 307 | iter->now = ktime_to_ns(ktime_get()); | ||
| 308 | } else if (iter->cpu >= nr_cpu_ids) { | ||
| 309 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | 305 | #ifdef CONFIG_GENERIC_CLOCKEVENTS |
| 310 | if (!iter->second_pass) { | 306 | if (!iter->second_pass) { |
| 311 | iter->cpu = -1; | 307 | iter->cpu = -1; |
| 312 | iter->second_pass = true; | 308 | iter->second_pass = true; |
| 313 | } else | 309 | } else |
| 314 | return NULL; | 310 | return NULL; |
| 315 | #else | 311 | #else |
| 316 | return NULL; | 312 | return NULL; |
| 317 | #endif | 313 | #endif |
| 314 | } | ||
| 318 | } | 315 | } |
| 319 | return iter; | 316 | return iter; |
| 320 | } | 317 | } |
| 321 | 318 | ||
| 319 | static void *timer_list_start(struct seq_file *file, loff_t *offset) | ||
| 320 | { | ||
| 321 | struct timer_list_iter *iter = file->private; | ||
| 322 | |||
| 323 | if (!*offset) | ||
| 324 | iter->now = ktime_to_ns(ktime_get()); | ||
| 325 | iter->cpu = -1; | ||
| 326 | iter->second_pass = false; | ||
| 327 | return move_iter(iter, *offset); | ||
| 328 | } | ||
| 329 | |||
| 322 | static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) | 330 | static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) |
| 323 | { | 331 | { |
| 324 | struct timer_list_iter *iter = file->private; | 332 | struct timer_list_iter *iter = file->private; |
| 325 | iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); | ||
| 326 | ++*offset; | 333 | ++*offset; |
| 327 | return timer_list_start(file, offset); | 334 | return move_iter(iter, 1); |
| 328 | } | 335 | } |
| 329 | 336 | ||
| 330 | static void timer_list_stop(struct seq_file *seq, void *v) | 337 | static void timer_list_stop(struct seq_file *seq, void *v) |
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 0b537f27b559..1fb08f21302e 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
| @@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v) | |||
| 298 | period = ktime_to_timespec(time); | 298 | period = ktime_to_timespec(time); |
| 299 | ms = period.tv_nsec / 1000000; | 299 | ms = period.tv_nsec / 1000000; |
| 300 | 300 | ||
| 301 | seq_puts(m, "Timer Stats Version: v0.2\n"); | 301 | seq_puts(m, "Timer Stats Version: v0.3\n"); |
| 302 | seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); | 302 | seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); |
| 303 | if (atomic_read(&overflow_count)) | 303 | if (atomic_read(&overflow_count)) |
| 304 | seq_printf(m, "Overflow: %d entries\n", | 304 | seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); |
| 305 | atomic_read(&overflow_count)); | 305 | seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); |
| 306 | 306 | ||
| 307 | for (i = 0; i < nr_entries; i++) { | 307 | for (i = 0; i < nr_entries; i++) { |
| 308 | entry = entries + i; | 308 | entry = entries + i; |
| 309 | if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { | 309 | if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { |
| 310 | seq_printf(m, "%4luD, %5d %-16s ", | 310 | seq_printf(m, "%4luD, %5d %-16s ", |
| 311 | entry->count, entry->pid, entry->comm); | 311 | entry->count, entry->pid, entry->comm); |
| 312 | } else { | 312 | } else { |
diff --git a/kernel/timer.c b/kernel/timer.c index 4296d13db3d1..6582b82fa966 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) | |||
| 1092 | static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), | 1092 | static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), |
| 1093 | unsigned long data) | 1093 | unsigned long data) |
| 1094 | { | 1094 | { |
| 1095 | int preempt_count = preempt_count(); | 1095 | int count = preempt_count(); |
| 1096 | 1096 | ||
| 1097 | #ifdef CONFIG_LOCKDEP | 1097 | #ifdef CONFIG_LOCKDEP |
| 1098 | /* | 1098 | /* |
| @@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), | |||
| 1119 | 1119 | ||
| 1120 | lock_map_release(&lockdep_map); | 1120 | lock_map_release(&lockdep_map); |
| 1121 | 1121 | ||
| 1122 | if (preempt_count != preempt_count()) { | 1122 | if (count != preempt_count()) { |
| 1123 | WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", | 1123 | WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", |
| 1124 | fn, preempt_count, preempt_count()); | 1124 | fn, count, preempt_count()); |
| 1125 | /* | 1125 | /* |
| 1126 | * Restore the preempt count. That gives us a decent | 1126 | * Restore the preempt count. That gives us a decent |
| 1127 | * chance to survive and extract information. If the | 1127 | * chance to survive and extract information. If the |
| 1128 | * callback kept a lock held, bad luck, but not worse | 1128 | * callback kept a lock held, bad luck, but not worse |
| 1129 | * than the BUG() we had. | 1129 | * than the BUG() we had. |
| 1130 | */ | 1130 | */ |
| 1131 | preempt_count() = preempt_count; | 1131 | preempt_count_set(count); |
| 1132 | } | 1132 | } |
| 1133 | } | 1133 | } |
| 1134 | 1134 | ||
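
The timer.c hunk above stops using preempt_count() as an assignable lvalue, renames the shadowing local, and repairs any imbalance through the preempt_count_set() accessor instead. The detect-and-repair pattern in isolation, as a minimal sketch built only from the calls visible in this hunk:

static void call_timer_fn_sketch(void (*fn)(unsigned long), unsigned long data)
{
	int count = preempt_count();	/* snapshot before the callback */

	fn(data);

	if (count != preempt_count()) {
		WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
			  fn, count, preempt_count());
		preempt_count_set(count);	/* restore so the system limps on */
	}
}
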
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a6d098c6df3f..03cf44ac54d3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -1978,12 +1978,27 @@ int __weak ftrace_arch_code_modify_post_process(void) | |||
| 1978 | 1978 | ||
| 1979 | void ftrace_modify_all_code(int command) | 1979 | void ftrace_modify_all_code(int command) |
| 1980 | { | 1980 | { |
| 1981 | int update = command & FTRACE_UPDATE_TRACE_FUNC; | ||
| 1982 | |||
| 1983 | /* | ||
| 1984 | * If the ftrace_caller calls a ftrace_ops func directly, | ||
| 1985 | * we need to make sure that it only traces functions it | ||
| 1986 | * expects to trace. When doing the switch of functions, | ||
| 1987 | * we need to update to the ftrace_ops_list_func first | ||
| 1988 | * before the transition between old and new calls are set, | ||
| 1989 | * as the ftrace_ops_list_func will check the ops hashes | ||
| 1990 | * to make sure the ops are having the right functions | ||
| 1991 | * traced. | ||
| 1992 | */ | ||
| 1993 | if (update) | ||
| 1994 | ftrace_update_ftrace_func(ftrace_ops_list_func); | ||
| 1995 | |||
| 1981 | if (command & FTRACE_UPDATE_CALLS) | 1996 | if (command & FTRACE_UPDATE_CALLS) |
| 1982 | ftrace_replace_code(1); | 1997 | ftrace_replace_code(1); |
| 1983 | else if (command & FTRACE_DISABLE_CALLS) | 1998 | else if (command & FTRACE_DISABLE_CALLS) |
| 1984 | ftrace_replace_code(0); | 1999 | ftrace_replace_code(0); |
| 1985 | 2000 | ||
| 1986 | if (command & FTRACE_UPDATE_TRACE_FUNC) | 2001 | if (update && ftrace_trace_function != ftrace_ops_list_func) |
| 1987 | ftrace_update_ftrace_func(ftrace_trace_function); | 2002 | ftrace_update_ftrace_func(ftrace_trace_function); |
| 1988 | 2003 | ||
| 1989 | if (command & FTRACE_START_FUNC_RET) | 2004 | if (command & FTRACE_START_FUNC_RET) |
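
The comment added to ftrace_modify_all_code() carries the essential ordering argument: while call sites are being patched, every traced function must funnel through ftrace_ops_list_func(), which re-checks each ops' hash, and only once the transition is complete may the final (possibly direct-call) function be installed. The same logic, restated as a numbered three-step sketch with the names from this hunk:

void ftrace_modify_all_code_sketch(int command)
{
	int update = command & FTRACE_UPDATE_TRACE_FUNC;

	/* 1. Route all calls through the hash-checking list function. */
	if (update)
		ftrace_update_ftrace_func(ftrace_ops_list_func);

	/* 2. Patch the call sites while that safe function is in place. */
	if (command & FTRACE_UPDATE_CALLS)
		ftrace_replace_code(1);
	else if (command & FTRACE_DISABLE_CALLS)
		ftrace_replace_code(0);

	/* 3. Only now install the final trace function, if it differs. */
	if (update && ftrace_trace_function != ftrace_ops_list_func)
		ftrace_update_ftrace_func(ftrace_trace_function);
}
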
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 496f94d57698..d9fea7dfd5d3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -1509,7 +1509,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, | |||
| 1509 | #endif | 1509 | #endif |
| 1510 | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | | 1510 | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | |
| 1511 | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | | 1511 | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | |
| 1512 | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); | 1512 | (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | |
| 1513 | (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0); | ||
| 1513 | } | 1514 | } |
| 1514 | EXPORT_SYMBOL_GPL(tracing_generic_entry_update); | 1515 | EXPORT_SYMBOL_GPL(tracing_generic_entry_update); |
| 1515 | 1516 | ||
| @@ -3166,11 +3167,6 @@ static const struct file_operations show_traces_fops = { | |||
| 3166 | }; | 3167 | }; |
| 3167 | 3168 | ||
| 3168 | /* | 3169 | /* |
| 3169 | * Only trace on a CPU if the bitmask is set: | ||
| 3170 | */ | ||
| 3171 | static cpumask_var_t tracing_cpumask; | ||
| 3172 | |||
| 3173 | /* | ||
| 3174 | * The tracer itself will not take this lock, but still we want | 3170 | * The tracer itself will not take this lock, but still we want |
| 3175 | * to provide a consistent cpumask to user-space: | 3171 | * to provide a consistent cpumask to user-space: |
| 3176 | */ | 3172 | */ |
| @@ -3186,11 +3182,12 @@ static ssize_t | |||
| 3186 | tracing_cpumask_read(struct file *filp, char __user *ubuf, | 3182 | tracing_cpumask_read(struct file *filp, char __user *ubuf, |
| 3187 | size_t count, loff_t *ppos) | 3183 | size_t count, loff_t *ppos) |
| 3188 | { | 3184 | { |
| 3185 | struct trace_array *tr = file_inode(filp)->i_private; | ||
| 3189 | int len; | 3186 | int len; |
| 3190 | 3187 | ||
| 3191 | mutex_lock(&tracing_cpumask_update_lock); | 3188 | mutex_lock(&tracing_cpumask_update_lock); |
| 3192 | 3189 | ||
| 3193 | len = cpumask_scnprintf(mask_str, count, tracing_cpumask); | 3190 | len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask); |
| 3194 | if (count - len < 2) { | 3191 | if (count - len < 2) { |
| 3195 | count = -EINVAL; | 3192 | count = -EINVAL; |
| 3196 | goto out_err; | 3193 | goto out_err; |
| @@ -3208,7 +3205,7 @@ static ssize_t | |||
| 3208 | tracing_cpumask_write(struct file *filp, const char __user *ubuf, | 3205 | tracing_cpumask_write(struct file *filp, const char __user *ubuf, |
| 3209 | size_t count, loff_t *ppos) | 3206 | size_t count, loff_t *ppos) |
| 3210 | { | 3207 | { |
| 3211 | struct trace_array *tr = filp->private_data; | 3208 | struct trace_array *tr = file_inode(filp)->i_private; |
| 3212 | cpumask_var_t tracing_cpumask_new; | 3209 | cpumask_var_t tracing_cpumask_new; |
| 3213 | int err, cpu; | 3210 | int err, cpu; |
| 3214 | 3211 | ||
| @@ -3228,12 +3225,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, | |||
| 3228 | * Increase/decrease the disabled counter if we are | 3225 | * Increase/decrease the disabled counter if we are |
| 3229 | * about to flip a bit in the cpumask: | 3226 | * about to flip a bit in the cpumask: |
| 3230 | */ | 3227 | */ |
| 3231 | if (cpumask_test_cpu(cpu, tracing_cpumask) && | 3228 | if (cpumask_test_cpu(cpu, tr->tracing_cpumask) && |
| 3232 | !cpumask_test_cpu(cpu, tracing_cpumask_new)) { | 3229 | !cpumask_test_cpu(cpu, tracing_cpumask_new)) { |
| 3233 | atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); | 3230 | atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); |
| 3234 | ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); | 3231 | ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); |
| 3235 | } | 3232 | } |
| 3236 | if (!cpumask_test_cpu(cpu, tracing_cpumask) && | 3233 | if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && |
| 3237 | cpumask_test_cpu(cpu, tracing_cpumask_new)) { | 3234 | cpumask_test_cpu(cpu, tracing_cpumask_new)) { |
| 3238 | atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); | 3235 | atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); |
| 3239 | ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); | 3236 | ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); |
| @@ -3242,7 +3239,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, | |||
| 3242 | arch_spin_unlock(&ftrace_max_lock); | 3239 | arch_spin_unlock(&ftrace_max_lock); |
| 3243 | local_irq_enable(); | 3240 | local_irq_enable(); |
| 3244 | 3241 | ||
| 3245 | cpumask_copy(tracing_cpumask, tracing_cpumask_new); | 3242 | cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); |
| 3246 | 3243 | ||
| 3247 | mutex_unlock(&tracing_cpumask_update_lock); | 3244 | mutex_unlock(&tracing_cpumask_update_lock); |
| 3248 | free_cpumask_var(tracing_cpumask_new); | 3245 | free_cpumask_var(tracing_cpumask_new); |
| @@ -3256,9 +3253,10 @@ err_unlock: | |||
| 3256 | } | 3253 | } |
| 3257 | 3254 | ||
| 3258 | static const struct file_operations tracing_cpumask_fops = { | 3255 | static const struct file_operations tracing_cpumask_fops = { |
| 3259 | .open = tracing_open_generic, | 3256 | .open = tracing_open_generic_tr, |
| 3260 | .read = tracing_cpumask_read, | 3257 | .read = tracing_cpumask_read, |
| 3261 | .write = tracing_cpumask_write, | 3258 | .write = tracing_cpumask_write, |
| 3259 | .release = tracing_release_generic_tr, | ||
| 3262 | .llseek = generic_file_llseek, | 3260 | .llseek = generic_file_llseek, |
| 3263 | }; | 3261 | }; |
| 3264 | 3262 | ||
| @@ -5938,6 +5936,11 @@ static int new_instance_create(const char *name) | |||
| 5938 | if (!tr->name) | 5936 | if (!tr->name) |
| 5939 | goto out_free_tr; | 5937 | goto out_free_tr; |
| 5940 | 5938 | ||
| 5939 | if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) | ||
| 5940 | goto out_free_tr; | ||
| 5941 | |||
| 5942 | cpumask_copy(tr->tracing_cpumask, cpu_all_mask); | ||
| 5943 | |||
| 5941 | raw_spin_lock_init(&tr->start_lock); | 5944 | raw_spin_lock_init(&tr->start_lock); |
| 5942 | 5945 | ||
| 5943 | tr->current_trace = &nop_trace; | 5946 | tr->current_trace = &nop_trace; |
| @@ -5969,6 +5972,7 @@ static int new_instance_create(const char *name) | |||
| 5969 | out_free_tr: | 5972 | out_free_tr: |
| 5970 | if (tr->trace_buffer.buffer) | 5973 | if (tr->trace_buffer.buffer) |
| 5971 | ring_buffer_free(tr->trace_buffer.buffer); | 5974 | ring_buffer_free(tr->trace_buffer.buffer); |
| 5975 | free_cpumask_var(tr->tracing_cpumask); | ||
| 5972 | kfree(tr->name); | 5976 | kfree(tr->name); |
| 5973 | kfree(tr); | 5977 | kfree(tr); |
| 5974 | 5978 | ||
| @@ -6098,6 +6102,9 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | |||
| 6098 | { | 6102 | { |
| 6099 | int cpu; | 6103 | int cpu; |
| 6100 | 6104 | ||
| 6105 | trace_create_file("tracing_cpumask", 0644, d_tracer, | ||
| 6106 | tr, &tracing_cpumask_fops); | ||
| 6107 | |||
| 6101 | trace_create_file("trace_options", 0644, d_tracer, | 6108 | trace_create_file("trace_options", 0644, d_tracer, |
| 6102 | tr, &tracing_iter_fops); | 6109 | tr, &tracing_iter_fops); |
| 6103 | 6110 | ||
| @@ -6147,9 +6154,6 @@ static __init int tracer_init_debugfs(void) | |||
| 6147 | 6154 | ||
| 6148 | init_tracer_debugfs(&global_trace, d_tracer); | 6155 | init_tracer_debugfs(&global_trace, d_tracer); |
| 6149 | 6156 | ||
| 6150 | trace_create_file("tracing_cpumask", 0644, d_tracer, | ||
| 6151 | &global_trace, &tracing_cpumask_fops); | ||
| 6152 | |||
| 6153 | trace_create_file("available_tracers", 0444, d_tracer, | 6157 | trace_create_file("available_tracers", 0444, d_tracer, |
| 6154 | &global_trace, &show_traces_fops); | 6158 | &global_trace, &show_traces_fops); |
| 6155 | 6159 | ||
| @@ -6371,7 +6375,7 @@ __init static int tracer_alloc_buffers(void) | |||
| 6371 | if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) | 6375 | if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) |
| 6372 | goto out; | 6376 | goto out; |
| 6373 | 6377 | ||
| 6374 | if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) | 6378 | if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL)) |
| 6375 | goto out_free_buffer_mask; | 6379 | goto out_free_buffer_mask; |
| 6376 | 6380 | ||
| 6377 | /* Only allocate trace_printk buffers if a trace_printk exists */ | 6381 | /* Only allocate trace_printk buffers if a trace_printk exists */ |
| @@ -6386,7 +6390,7 @@ __init static int tracer_alloc_buffers(void) | |||
| 6386 | ring_buf_size = 1; | 6390 | ring_buf_size = 1; |
| 6387 | 6391 | ||
| 6388 | cpumask_copy(tracing_buffer_mask, cpu_possible_mask); | 6392 | cpumask_copy(tracing_buffer_mask, cpu_possible_mask); |
| 6389 | cpumask_copy(tracing_cpumask, cpu_all_mask); | 6393 | cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask); |
| 6390 | 6394 | ||
| 6391 | raw_spin_lock_init(&global_trace.start_lock); | 6395 | raw_spin_lock_init(&global_trace.start_lock); |
| 6392 | 6396 | ||
| @@ -6441,7 +6445,7 @@ out_free_cpumask: | |||
| 6441 | #ifdef CONFIG_TRACER_MAX_TRACE | 6445 | #ifdef CONFIG_TRACER_MAX_TRACE |
| 6442 | free_percpu(global_trace.max_buffer.data); | 6446 | free_percpu(global_trace.max_buffer.data); |
| 6443 | #endif | 6447 | #endif |
| 6444 | free_cpumask_var(tracing_cpumask); | 6448 | free_cpumask_var(global_trace.tracing_cpumask); |
| 6445 | out_free_buffer_mask: | 6449 | out_free_buffer_mask: |
| 6446 | free_cpumask_var(tracing_buffer_mask); | 6450 | free_cpumask_var(tracing_buffer_mask); |
| 6447 | out: | 6451 | out: |
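
Beyond moving the cpumask into struct trace_array, the hunk above switches tracing_cpumask_fops to the _tr open/release helpers so each open pins its tracing instance. A hedged sketch of what such an open helper presumably does; the body below is an assumption, only the helper names appear in this diff:

static int tracing_open_generic_tr_sketch(struct inode *inode, struct file *filp)
{
	struct trace_array *tr = inode->i_private;

	if (trace_array_get(tr) < 0)	/* hold a reference on the instance */
		return -ENODEV;		/* instance is being torn down */

	filp->private_data = inode->i_private;
	return 0;
}

The matching release helper would drop that reference, which is why .open and .release change together in the file_operations above.
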
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index afaae41b0a02..73d08aa25b55 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -124,6 +124,7 @@ enum trace_flag_type { | |||
| 124 | TRACE_FLAG_NEED_RESCHED = 0x04, | 124 | TRACE_FLAG_NEED_RESCHED = 0x04, |
| 125 | TRACE_FLAG_HARDIRQ = 0x08, | 125 | TRACE_FLAG_HARDIRQ = 0x08, |
| 126 | TRACE_FLAG_SOFTIRQ = 0x10, | 126 | TRACE_FLAG_SOFTIRQ = 0x10, |
| 127 | TRACE_FLAG_PREEMPT_RESCHED = 0x20, | ||
| 127 | }; | 128 | }; |
| 128 | 129 | ||
| 129 | #define TRACE_BUF_SIZE 1024 | 130 | #define TRACE_BUF_SIZE 1024 |
| @@ -206,6 +207,7 @@ struct trace_array { | |||
| 206 | struct dentry *event_dir; | 207 | struct dentry *event_dir; |
| 207 | struct list_head systems; | 208 | struct list_head systems; |
| 208 | struct list_head events; | 209 | struct list_head events; |
| 210 | cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ | ||
| 209 | int ref; | 211 | int ref; |
| 210 | }; | 212 | }; |
| 211 | 213 | ||
| @@ -1022,6 +1024,9 @@ extern struct list_head ftrace_events; | |||
| 1022 | extern const char *__start___trace_bprintk_fmt[]; | 1024 | extern const char *__start___trace_bprintk_fmt[]; |
| 1023 | extern const char *__stop___trace_bprintk_fmt[]; | 1025 | extern const char *__stop___trace_bprintk_fmt[]; |
| 1024 | 1026 | ||
| 1027 | extern const char *__start___tracepoint_str[]; | ||
| 1028 | extern const char *__stop___tracepoint_str[]; | ||
| 1029 | |||
| 1025 | void trace_printk_init_buffers(void); | 1030 | void trace_printk_init_buffers(void); |
| 1026 | void trace_printk_start_comm(void); | 1031 | void trace_printk_start_comm(void); |
| 1027 | int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); | 1032 | int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); |
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 80c36bcf66e8..78e27e3b52ac 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
| @@ -26,7 +26,7 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, | |||
| 26 | { | 26 | { |
| 27 | /* The ftrace function trace is allowed only for root. */ | 27 | /* The ftrace function trace is allowed only for root. */ |
| 28 | if (ftrace_event_is_function(tp_event) && | 28 | if (ftrace_event_is_function(tp_event) && |
| 29 | perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) | 29 | perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) |
| 30 | return -EPERM; | 30 | return -EPERM; |
| 31 | 31 | ||
| 32 | /* No tracing, just counting, so no obvious leak */ | 32 | /* No tracing, just counting, so no obvious leak */ |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 29a7ebcfb426..368a4d50cc30 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -1489,12 +1489,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, | |||
| 1489 | } | 1489 | } |
| 1490 | 1490 | ||
| 1491 | static int | 1491 | static int |
| 1492 | event_create_dir(struct dentry *parent, | 1492 | event_create_dir(struct dentry *parent, struct ftrace_event_file *file) |
| 1493 | struct ftrace_event_file *file, | ||
| 1494 | const struct file_operations *id, | ||
| 1495 | const struct file_operations *enable, | ||
| 1496 | const struct file_operations *filter, | ||
| 1497 | const struct file_operations *format) | ||
| 1498 | { | 1493 | { |
| 1499 | struct ftrace_event_call *call = file->event_call; | 1494 | struct ftrace_event_call *call = file->event_call; |
| 1500 | struct trace_array *tr = file->tr; | 1495 | struct trace_array *tr = file->tr; |
| @@ -1522,12 +1517,13 @@ event_create_dir(struct dentry *parent, | |||
| 1522 | 1517 | ||
| 1523 | if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) | 1518 | if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) |
| 1524 | trace_create_file("enable", 0644, file->dir, file, | 1519 | trace_create_file("enable", 0644, file->dir, file, |
| 1525 | enable); | 1520 | &ftrace_enable_fops); |
| 1526 | 1521 | ||
| 1527 | #ifdef CONFIG_PERF_EVENTS | 1522 | #ifdef CONFIG_PERF_EVENTS |
| 1528 | if (call->event.type && call->class->reg) | 1523 | if (call->event.type && call->class->reg) |
| 1529 | trace_create_file("id", 0444, file->dir, | 1524 | trace_create_file("id", 0444, file->dir, |
| 1530 | (void *)(long)call->event.type, id); | 1525 | (void *)(long)call->event.type, |
| 1526 | &ftrace_event_id_fops); | ||
| 1531 | #endif | 1527 | #endif |
| 1532 | 1528 | ||
| 1533 | /* | 1529 | /* |
| @@ -1544,10 +1540,10 @@ event_create_dir(struct dentry *parent, | |||
| 1544 | } | 1540 | } |
| 1545 | } | 1541 | } |
| 1546 | trace_create_file("filter", 0644, file->dir, call, | 1542 | trace_create_file("filter", 0644, file->dir, call, |
| 1547 | filter); | 1543 | &ftrace_event_filter_fops); |
| 1548 | 1544 | ||
| 1549 | trace_create_file("format", 0444, file->dir, call, | 1545 | trace_create_file("format", 0444, file->dir, call, |
| 1550 | format); | 1546 | &ftrace_event_format_fops); |
| 1551 | 1547 | ||
| 1552 | return 0; | 1548 | return 0; |
| 1553 | } | 1549 | } |
| @@ -1648,12 +1644,7 @@ trace_create_new_event(struct ftrace_event_call *call, | |||
| 1648 | 1644 | ||
| 1649 | /* Add an event to a trace directory */ | 1645 | /* Add an event to a trace directory */ |
| 1650 | static int | 1646 | static int |
| 1651 | __trace_add_new_event(struct ftrace_event_call *call, | 1647 | __trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr) |
| 1652 | struct trace_array *tr, | ||
| 1653 | const struct file_operations *id, | ||
| 1654 | const struct file_operations *enable, | ||
| 1655 | const struct file_operations *filter, | ||
| 1656 | const struct file_operations *format) | ||
| 1657 | { | 1648 | { |
| 1658 | struct ftrace_event_file *file; | 1649 | struct ftrace_event_file *file; |
| 1659 | 1650 | ||
| @@ -1661,7 +1652,7 @@ __trace_add_new_event(struct ftrace_event_call *call, | |||
| 1661 | if (!file) | 1652 | if (!file) |
| 1662 | return -ENOMEM; | 1653 | return -ENOMEM; |
| 1663 | 1654 | ||
| 1664 | return event_create_dir(tr->event_dir, file, id, enable, filter, format); | 1655 | return event_create_dir(tr->event_dir, file); |
| 1665 | } | 1656 | } |
| 1666 | 1657 | ||
| 1667 | /* | 1658 | /* |
| @@ -1683,8 +1674,7 @@ __trace_early_add_new_event(struct ftrace_event_call *call, | |||
| 1683 | } | 1674 | } |
| 1684 | 1675 | ||
| 1685 | struct ftrace_module_file_ops; | 1676 | struct ftrace_module_file_ops; |
| 1686 | static void __add_event_to_tracers(struct ftrace_event_call *call, | 1677 | static void __add_event_to_tracers(struct ftrace_event_call *call); |
| 1687 | struct ftrace_module_file_ops *file_ops); | ||
| 1688 | 1678 | ||
| 1689 | /* Add an additional event_call dynamically */ | 1679 | /* Add an additional event_call dynamically */ |
| 1690 | int trace_add_event_call(struct ftrace_event_call *call) | 1680 | int trace_add_event_call(struct ftrace_event_call *call) |
| @@ -1695,7 +1685,7 @@ int trace_add_event_call(struct ftrace_event_call *call) | |||
| 1695 | 1685 | ||
| 1696 | ret = __register_event(call, NULL); | 1686 | ret = __register_event(call, NULL); |
| 1697 | if (ret >= 0) | 1687 | if (ret >= 0) |
| 1698 | __add_event_to_tracers(call, NULL); | 1688 | __add_event_to_tracers(call); |
| 1699 | 1689 | ||
| 1700 | mutex_unlock(&event_mutex); | 1690 | mutex_unlock(&event_mutex); |
| 1701 | mutex_unlock(&trace_types_lock); | 1691 | mutex_unlock(&trace_types_lock); |
| @@ -1769,100 +1759,21 @@ int trace_remove_event_call(struct ftrace_event_call *call) | |||
| 1769 | 1759 | ||
| 1770 | #ifdef CONFIG_MODULES | 1760 | #ifdef CONFIG_MODULES |
| 1771 | 1761 | ||
| 1772 | static LIST_HEAD(ftrace_module_file_list); | ||
| 1773 | |||
| 1774 | /* | ||
| 1775 | * Modules must own their file_operations to keep up with | ||
| 1776 | * reference counting. | ||
| 1777 | */ | ||
| 1778 | struct ftrace_module_file_ops { | ||
| 1779 | struct list_head list; | ||
| 1780 | struct module *mod; | ||
| 1781 | struct file_operations id; | ||
| 1782 | struct file_operations enable; | ||
| 1783 | struct file_operations format; | ||
| 1784 | struct file_operations filter; | ||
| 1785 | }; | ||
| 1786 | |||
| 1787 | static struct ftrace_module_file_ops * | ||
| 1788 | find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) | ||
| 1789 | { | ||
| 1790 | /* | ||
| 1791 | * As event_calls are added in groups by module, | ||
| 1792 | * when we find one file_ops, we don't need to search for | ||
| 1793 | * each call in that module, as the rest should be the | ||
| 1794 | * same. Only search for a new one if the last one did | ||
| 1795 | * not match. | ||
| 1796 | */ | ||
| 1797 | if (file_ops && mod == file_ops->mod) | ||
| 1798 | return file_ops; | ||
| 1799 | |||
| 1800 | list_for_each_entry(file_ops, &ftrace_module_file_list, list) { | ||
| 1801 | if (file_ops->mod == mod) | ||
| 1802 | return file_ops; | ||
| 1803 | } | ||
| 1804 | return NULL; | ||
| 1805 | } | ||
| 1806 | |||
| 1807 | static struct ftrace_module_file_ops * | ||
| 1808 | trace_create_file_ops(struct module *mod) | ||
| 1809 | { | ||
| 1810 | struct ftrace_module_file_ops *file_ops; | ||
| 1811 | |||
| 1812 | /* | ||
| 1813 | * This is a bit of a PITA. To allow for correct reference | ||
| 1814 | * counting, modules must "own" their file_operations. | ||
| 1815 | * To do this, we allocate the file operations that will be | ||
| 1816 | * used in the event directory. | ||
| 1817 | */ | ||
| 1818 | |||
| 1819 | file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL); | ||
| 1820 | if (!file_ops) | ||
| 1821 | return NULL; | ||
| 1822 | |||
| 1823 | file_ops->mod = mod; | ||
| 1824 | |||
| 1825 | file_ops->id = ftrace_event_id_fops; | ||
| 1826 | file_ops->id.owner = mod; | ||
| 1827 | |||
| 1828 | file_ops->enable = ftrace_enable_fops; | ||
| 1829 | file_ops->enable.owner = mod; | ||
| 1830 | |||
| 1831 | file_ops->filter = ftrace_event_filter_fops; | ||
| 1832 | file_ops->filter.owner = mod; | ||
| 1833 | |||
| 1834 | file_ops->format = ftrace_event_format_fops; | ||
| 1835 | file_ops->format.owner = mod; | ||
| 1836 | |||
| 1837 | list_add(&file_ops->list, &ftrace_module_file_list); | ||
| 1838 | |||
| 1839 | return file_ops; | ||
| 1840 | } | ||
| 1841 | |||
| 1842 | static void trace_module_add_events(struct module *mod) | 1762 | static void trace_module_add_events(struct module *mod) |
| 1843 | { | 1763 | { |
| 1844 | struct ftrace_module_file_ops *file_ops = NULL; | ||
| 1845 | struct ftrace_event_call **call, **start, **end; | 1764 | struct ftrace_event_call **call, **start, **end; |
| 1846 | 1765 | ||
| 1847 | start = mod->trace_events; | 1766 | start = mod->trace_events; |
| 1848 | end = mod->trace_events + mod->num_trace_events; | 1767 | end = mod->trace_events + mod->num_trace_events; |
| 1849 | 1768 | ||
| 1850 | if (start == end) | ||
| 1851 | return; | ||
| 1852 | |||
| 1853 | file_ops = trace_create_file_ops(mod); | ||
| 1854 | if (!file_ops) | ||
| 1855 | return; | ||
| 1856 | |||
| 1857 | for_each_event(call, start, end) { | 1769 | for_each_event(call, start, end) { |
| 1858 | __register_event(*call, mod); | 1770 | __register_event(*call, mod); |
| 1859 | __add_event_to_tracers(*call, file_ops); | 1771 | __add_event_to_tracers(*call); |
| 1860 | } | 1772 | } |
| 1861 | } | 1773 | } |
| 1862 | 1774 | ||
| 1863 | static void trace_module_remove_events(struct module *mod) | 1775 | static void trace_module_remove_events(struct module *mod) |
| 1864 | { | 1776 | { |
| 1865 | struct ftrace_module_file_ops *file_ops; | ||
| 1866 | struct ftrace_event_call *call, *p; | 1777 | struct ftrace_event_call *call, *p; |
| 1867 | bool clear_trace = false; | 1778 | bool clear_trace = false; |
| 1868 | 1779 | ||
| @@ -1874,16 +1785,6 @@ static void trace_module_remove_events(struct module *mod) | |||
| 1874 | __trace_remove_event_call(call); | 1785 | __trace_remove_event_call(call); |
| 1875 | } | 1786 | } |
| 1876 | } | 1787 | } |
| 1877 | |||
| 1878 | /* Now free the file_operations */ | ||
| 1879 | list_for_each_entry(file_ops, &ftrace_module_file_list, list) { | ||
| 1880 | if (file_ops->mod == mod) | ||
| 1881 | break; | ||
| 1882 | } | ||
| 1883 | if (&file_ops->list != &ftrace_module_file_list) { | ||
| 1884 | list_del(&file_ops->list); | ||
| 1885 | kfree(file_ops); | ||
| 1886 | } | ||
| 1887 | up_write(&trace_event_sem); | 1788 | up_write(&trace_event_sem); |
| 1888 | 1789 | ||
| 1889 | /* | 1790 | /* |
| @@ -1919,67 +1820,21 @@ static int trace_module_notify(struct notifier_block *self, | |||
| 1919 | return 0; | 1820 | return 0; |
| 1920 | } | 1821 | } |
| 1921 | 1822 | ||
| 1922 | static int | 1823 | static struct notifier_block trace_module_nb = { |
| 1923 | __trace_add_new_mod_event(struct ftrace_event_call *call, | 1824 | .notifier_call = trace_module_notify, |
| 1924 | struct trace_array *tr, | 1825 | .priority = 0, |
| 1925 | struct ftrace_module_file_ops *file_ops) | 1826 | }; |
| 1926 | { | ||
| 1927 | return __trace_add_new_event(call, tr, | ||
| 1928 | &file_ops->id, &file_ops->enable, | ||
| 1929 | &file_ops->filter, &file_ops->format); | ||
| 1930 | } | ||
| 1931 | |||
| 1932 | #else | ||
| 1933 | static inline struct ftrace_module_file_ops * | ||
| 1934 | find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) | ||
| 1935 | { | ||
| 1936 | return NULL; | ||
| 1937 | } | ||
| 1938 | static inline int trace_module_notify(struct notifier_block *self, | ||
| 1939 | unsigned long val, void *data) | ||
| 1940 | { | ||
| 1941 | return 0; | ||
| 1942 | } | ||
| 1943 | static inline int | ||
| 1944 | __trace_add_new_mod_event(struct ftrace_event_call *call, | ||
| 1945 | struct trace_array *tr, | ||
| 1946 | struct ftrace_module_file_ops *file_ops) | ||
| 1947 | { | ||
| 1948 | return -ENODEV; | ||
| 1949 | } | ||
| 1950 | #endif /* CONFIG_MODULES */ | 1827 | #endif /* CONFIG_MODULES */ |
| 1951 | 1828 | ||
| 1952 | /* Create a new event directory structure for a trace directory. */ | 1829 | /* Create a new event directory structure for a trace directory. */ |
| 1953 | static void | 1830 | static void |
| 1954 | __trace_add_event_dirs(struct trace_array *tr) | 1831 | __trace_add_event_dirs(struct trace_array *tr) |
| 1955 | { | 1832 | { |
| 1956 | struct ftrace_module_file_ops *file_ops = NULL; | ||
| 1957 | struct ftrace_event_call *call; | 1833 | struct ftrace_event_call *call; |
| 1958 | int ret; | 1834 | int ret; |
| 1959 | 1835 | ||
| 1960 | list_for_each_entry(call, &ftrace_events, list) { | 1836 | list_for_each_entry(call, &ftrace_events, list) { |
| 1961 | if (call->mod) { | 1837 | ret = __trace_add_new_event(call, tr); |
| 1962 | /* | ||
| 1963 | * Directories for events by modules need to | ||
| 1964 | * keep module ref counts when opened (as we don't | ||
| 1965 | * want the module to disappear when reading one | ||
| 1966 | * of these files). The file_ops keep account of | ||
| 1967 | * the module ref count. | ||
| 1968 | */ | ||
| 1969 | file_ops = find_ftrace_file_ops(file_ops, call->mod); | ||
| 1970 | if (!file_ops) | ||
| 1971 | continue; /* Warn? */ | ||
| 1972 | ret = __trace_add_new_mod_event(call, tr, file_ops); | ||
| 1973 | if (ret < 0) | ||
| 1974 | pr_warning("Could not create directory for event %s\n", | ||
| 1975 | call->name); | ||
| 1976 | continue; | ||
| 1977 | } | ||
| 1978 | ret = __trace_add_new_event(call, tr, | ||
| 1979 | &ftrace_event_id_fops, | ||
| 1980 | &ftrace_enable_fops, | ||
| 1981 | &ftrace_event_filter_fops, | ||
| 1982 | &ftrace_event_format_fops); | ||
| 1983 | if (ret < 0) | 1838 | if (ret < 0) |
| 1984 | pr_warning("Could not create directory for event %s\n", | 1839 | pr_warning("Could not create directory for event %s\n", |
| 1985 | call->name); | 1840 | call->name); |
| @@ -2287,11 +2142,7 @@ __trace_early_add_event_dirs(struct trace_array *tr) | |||
| 2287 | 2142 | ||
| 2288 | 2143 | ||
| 2289 | list_for_each_entry(file, &tr->events, list) { | 2144 | list_for_each_entry(file, &tr->events, list) { |
| 2290 | ret = event_create_dir(tr->event_dir, file, | 2145 | ret = event_create_dir(tr->event_dir, file); |
| 2291 | &ftrace_event_id_fops, | ||
| 2292 | &ftrace_enable_fops, | ||
| 2293 | &ftrace_event_filter_fops, | ||
| 2294 | &ftrace_event_format_fops); | ||
| 2295 | if (ret < 0) | 2146 | if (ret < 0) |
| 2296 | pr_warning("Could not create directory for event %s\n", | 2147 | pr_warning("Could not create directory for event %s\n", |
| 2297 | file->event_call->name); | 2148 | file->event_call->name); |
| @@ -2332,29 +2183,14 @@ __trace_remove_event_dirs(struct trace_array *tr) | |||
| 2332 | remove_event_file_dir(file); | 2183 | remove_event_file_dir(file); |
| 2333 | } | 2184 | } |
| 2334 | 2185 | ||
| 2335 | static void | 2186 | static void __add_event_to_tracers(struct ftrace_event_call *call) |
| 2336 | __add_event_to_tracers(struct ftrace_event_call *call, | ||
| 2337 | struct ftrace_module_file_ops *file_ops) | ||
| 2338 | { | 2187 | { |
| 2339 | struct trace_array *tr; | 2188 | struct trace_array *tr; |
| 2340 | 2189 | ||
| 2341 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { | 2190 | list_for_each_entry(tr, &ftrace_trace_arrays, list) |
| 2342 | if (file_ops) | 2191 | __trace_add_new_event(call, tr); |
| 2343 | __trace_add_new_mod_event(call, tr, file_ops); | ||
| 2344 | else | ||
| 2345 | __trace_add_new_event(call, tr, | ||
| 2346 | &ftrace_event_id_fops, | ||
| 2347 | &ftrace_enable_fops, | ||
| 2348 | &ftrace_event_filter_fops, | ||
| 2349 | &ftrace_event_format_fops); | ||
| 2350 | } | ||
| 2351 | } | 2192 | } |
| 2352 | 2193 | ||
| 2353 | static struct notifier_block trace_module_nb = { | ||
| 2354 | .notifier_call = trace_module_notify, | ||
| 2355 | .priority = 0, | ||
| 2356 | }; | ||
| 2357 | |||
| 2358 | extern struct ftrace_event_call *__start_ftrace_events[]; | 2194 | extern struct ftrace_event_call *__start_ftrace_events[]; |
| 2359 | extern struct ftrace_event_call *__stop_ftrace_events[]; | 2195 | extern struct ftrace_event_call *__stop_ftrace_events[]; |
| 2360 | 2196 | ||
| @@ -2559,10 +2395,11 @@ static __init int event_trace_init(void) | |||
| 2559 | if (ret) | 2395 | if (ret) |
| 2560 | return ret; | 2396 | return ret; |
| 2561 | 2397 | ||
| 2398 | #ifdef CONFIG_MODULES | ||
| 2562 | ret = register_module_notifier(&trace_module_nb); | 2399 | ret = register_module_notifier(&trace_module_nb); |
| 2563 | if (ret) | 2400 | if (ret) |
| 2564 | pr_warning("Failed to register trace events module notifier\n"); | 2401 | pr_warning("Failed to register trace events module notifier\n"); |
| 2565 | 2402 | #endif | |
| 2566 | return 0; | 2403 | return 0; |
| 2567 | } | 2404 | } |
| 2568 | early_initcall(event_trace_memsetup); | 2405 | early_initcall(event_trace_memsetup); |
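The hunk above only registers the module notifier when CONFIG_MODULES is set, matching the removal of the unconditional trace_module_nb definition earlier in this file. A minimal sketch of that pattern, assuming hypothetical example_* names in place of the real trace_events symbols:

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/printk.h>

#ifdef CONFIG_MODULES
/* Hypothetical callback; the real one in this diff is trace_module_notify(). */
static int example_module_notify(struct notifier_block *nb,
				 unsigned long action, void *data)
{
	switch (action) {
	case MODULE_STATE_COMING:
		/* set up per-module state (e.g. event directories) */
		break;
	case MODULE_STATE_GOING:
		/* tear that state down again */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_module_nb = {
	.notifier_call	= example_module_notify,
	.priority	= 0,
};
#endif /* CONFIG_MODULES */

static __init int example_init(void)
{
#ifdef CONFIG_MODULES
	if (register_module_notifier(&example_module_nb))
		pr_warning("Failed to register example module notifier\n");
#endif
	return 0;
}

Keeping the notifier block itself under the same #ifdef avoids a defined-but-unused warning on !CONFIG_MODULES builds.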
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 34e7cbac0c9c..ed32284fbe32 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
| @@ -618,8 +618,23 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) | |||
| 618 | (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : | 618 | (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : |
| 619 | (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : | 619 | (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : |
| 620 | '.'; | 620 | '.'; |
| 621 | need_resched = | 621 | |
| 622 | (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; | 622 | switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | |
| 623 | TRACE_FLAG_PREEMPT_RESCHED)) { | ||
| 624 | case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: | ||
| 625 | need_resched = 'N'; | ||
| 626 | break; | ||
| 627 | case TRACE_FLAG_NEED_RESCHED: | ||
| 628 | need_resched = 'n'; | ||
| 629 | break; | ||
| 630 | case TRACE_FLAG_PREEMPT_RESCHED: | ||
| 631 | need_resched = 'p'; | ||
| 632 | break; | ||
| 633 | default: | ||
| 634 | need_resched = '.'; | ||
| 635 | break; | ||
| 636 | } | ||
| 637 | |||
| 623 | hardsoft_irq = | 638 | hardsoft_irq = |
| 624 | (hardirq && softirq) ? 'H' : | 639 | (hardirq && softirq) ? 'H' : |
| 625 | hardirq ? 'h' : | 640 | hardirq ? 'h' : |
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index a9077c1b4ad3..2900817ba65c 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
| @@ -244,12 +244,31 @@ static const char **find_next(void *v, loff_t *pos) | |||
| 244 | { | 244 | { |
| 245 | const char **fmt = v; | 245 | const char **fmt = v; |
| 246 | int start_index; | 246 | int start_index; |
| 247 | int last_index; | ||
| 247 | 248 | ||
| 248 | start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; | 249 | start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; |
| 249 | 250 | ||
| 250 | if (*pos < start_index) | 251 | if (*pos < start_index) |
| 251 | return __start___trace_bprintk_fmt + *pos; | 252 | return __start___trace_bprintk_fmt + *pos; |
| 252 | 253 | ||
| 254 | /* | ||
| 255 | * The __tracepoint_str section is treated the same as the | ||
| 256 | * __trace_printk_fmt section. The difference is that the | ||
| 257 | * __trace_printk_fmt section should only be used by trace_printk() | ||
| 258 | * in a debugging environment, as if anything exists in that section | ||
| 259 | * the trace_printk() helper buffers are allocated, which would just | ||
| 260 | * waste space in a production environment. | ||
| 261 | * | ||
| 262 | * The __tracepoint_str sections on the other hand are used by | ||
| 263 | * tracepoints which need to map pointers to their strings to | ||
| 264 | * the ASCII text for userspace. | ||
| 265 | */ | ||
| 266 | last_index = start_index; | ||
| 267 | start_index = __stop___tracepoint_str - __start___tracepoint_str; | ||
| 268 | |||
| 269 | if (*pos < last_index + start_index) | ||
| 270 | return __start___tracepoint_str + (*pos - last_index); | ||
| 271 | |||
| 253 | return find_next_mod_format(start_index, v, fmt, pos); | 272 | return find_next_mod_format(start_index, v, fmt, pos); |
| 254 | } | 273 | } |
| 255 | 274 | ||
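The new comment and index math above splice the __tracepoint_str entries in right after the trace_bprintk formats, so a single seq_file position walks both linker sections before falling through to the module formats. A small self-contained sketch of the same arithmetic, using made-up static tables instead of the __start_*/__stop_* section symbols:

#include <stddef.h>

static const char *table_a[] = { "fmt0", "fmt1", "fmt2" };	/* like __trace_bprintk_fmt */
static const char *table_b[] = { "str0", "str1" };		/* like __tracepoint_str */

#define NENT(x) (sizeof(x) / sizeof((x)[0]))

/* Serve position pos from table_a first, then continue into table_b. */
static const char *find_entry(long pos)
{
	long a_len = NENT(table_a);
	long b_len = NENT(table_b);

	if (pos < a_len)
		return table_a[pos];
	if (pos < a_len + b_len)
		return table_b[pos - a_len];	/* rebase into the second table */
	return NULL;				/* past both; modules would come next */
}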
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8fd03657bc7d..559329d9bd2f 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
| @@ -200,8 +200,8 @@ extern char *__bad_type_size(void); | |||
| 200 | #type, #name, offsetof(typeof(trace), name), \ | 200 | #type, #name, offsetof(typeof(trace), name), \ |
| 201 | sizeof(trace.name), is_signed_type(type) | 201 | sizeof(trace.name), is_signed_type(type) |
| 202 | 202 | ||
| 203 | static | 203 | static int __init |
| 204 | int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) | 204 | __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) |
| 205 | { | 205 | { |
| 206 | int i; | 206 | int i; |
| 207 | int pos = 0; | 207 | int pos = 0; |
| @@ -228,7 +228,7 @@ int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) | |||
| 228 | return pos; | 228 | return pos; |
| 229 | } | 229 | } |
| 230 | 230 | ||
| 231 | static int set_syscall_print_fmt(struct ftrace_event_call *call) | 231 | static int __init set_syscall_print_fmt(struct ftrace_event_call *call) |
| 232 | { | 232 | { |
| 233 | char *print_fmt; | 233 | char *print_fmt; |
| 234 | int len; | 234 | int len; |
| @@ -253,7 +253,7 @@ static int set_syscall_print_fmt(struct ftrace_event_call *call) | |||
| 253 | return 0; | 253 | return 0; |
| 254 | } | 254 | } |
| 255 | 255 | ||
| 256 | static void free_syscall_print_fmt(struct ftrace_event_call *call) | 256 | static void __init free_syscall_print_fmt(struct ftrace_event_call *call) |
| 257 | { | 257 | { |
| 258 | struct syscall_metadata *entry = call->data; | 258 | struct syscall_metadata *entry = call->data; |
| 259 | 259 | ||
| @@ -459,7 +459,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, | |||
| 459 | mutex_unlock(&syscall_trace_lock); | 459 | mutex_unlock(&syscall_trace_lock); |
| 460 | } | 460 | } |
| 461 | 461 | ||
| 462 | static int init_syscall_trace(struct ftrace_event_call *call) | 462 | static int __init init_syscall_trace(struct ftrace_event_call *call) |
| 463 | { | 463 | { |
| 464 | int id; | 464 | int id; |
| 465 | int num; | 465 | int num; |
diff --git a/kernel/uid16.c b/kernel/uid16.c index f6c83d7ef000..602e5bbbceff 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
| @@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) | |||
| 176 | struct group_info *group_info; | 176 | struct group_info *group_info; |
| 177 | int retval; | 177 | int retval; |
| 178 | 178 | ||
| 179 | if (!nsown_capable(CAP_SETGID)) | 179 | if (!ns_capable(current_user_ns(), CAP_SETGID)) |
| 180 | return -EPERM; | 180 | return -EPERM; |
| 181 | if ((unsigned)gidsetsize > NGROUPS_MAX) | 181 | if ((unsigned)gidsetsize > NGROUPS_MAX) |
| 182 | return -EINVAL; | 182 | return -EINVAL; |
diff --git a/kernel/up.c b/kernel/up.c index c54c75e9faf7..630d72bf7e41 100644 --- a/kernel/up.c +++ b/kernel/up.c | |||
| @@ -10,12 +10,64 @@ | |||
| 10 | int smp_call_function_single(int cpu, void (*func) (void *info), void *info, | 10 | int smp_call_function_single(int cpu, void (*func) (void *info), void *info, |
| 11 | int wait) | 11 | int wait) |
| 12 | { | 12 | { |
| 13 | unsigned long flags; | ||
| 14 | |||
| 13 | WARN_ON(cpu != 0); | 15 | WARN_ON(cpu != 0); |
| 14 | 16 | ||
| 15 | local_irq_disable(); | 17 | local_irq_save(flags); |
| 16 | (func)(info); | 18 | func(info); |
| 17 | local_irq_enable(); | 19 | local_irq_restore(flags); |
| 18 | 20 | ||
| 19 | return 0; | 21 | return 0; |
| 20 | } | 22 | } |
| 21 | EXPORT_SYMBOL(smp_call_function_single); | 23 | EXPORT_SYMBOL(smp_call_function_single); |
| 24 | |||
| 25 | int on_each_cpu(smp_call_func_t func, void *info, int wait) | ||
| 26 | { | ||
| 27 | unsigned long flags; | ||
| 28 | |||
| 29 | local_irq_save(flags); | ||
| 30 | func(info); | ||
| 31 | local_irq_restore(flags); | ||
| 32 | return 0; | ||
| 33 | } | ||
| 34 | EXPORT_SYMBOL(on_each_cpu); | ||
| 35 | |||
| 36 | /* | ||
| 37 | * Note we still need to test the mask even for UP | ||
| 38 | * because we actually can get an empty mask from | ||
| 39 | * code that on SMP might call us without the local | ||
| 40 | * CPU in the mask. | ||
| 41 | */ | ||
| 42 | void on_each_cpu_mask(const struct cpumask *mask, | ||
| 43 | smp_call_func_t func, void *info, bool wait) | ||
| 44 | { | ||
| 45 | unsigned long flags; | ||
| 46 | |||
| 47 | if (cpumask_test_cpu(0, mask)) { | ||
| 48 | local_irq_save(flags); | ||
| 49 | func(info); | ||
| 50 | local_irq_restore(flags); | ||
| 51 | } | ||
| 52 | } | ||
| 53 | EXPORT_SYMBOL(on_each_cpu_mask); | ||
| 54 | |||
| 55 | /* | ||
| 56 | * Preemption is disabled here to make sure the cond_func is called under the | ||
| 57 | * same conditions in UP and SMP. | ||
| 58 | */ | ||
| 59 | void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | ||
| 60 | smp_call_func_t func, void *info, bool wait, | ||
| 61 | gfp_t gfp_flags) | ||
| 62 | { | ||
| 63 | unsigned long flags; | ||
| 64 | |||
| 65 | preempt_disable(); | ||
| 66 | if (cond_func(0, info)) { | ||
| 67 | local_irq_save(flags); | ||
| 68 | func(info); | ||
| 69 | local_irq_restore(flags); | ||
| 70 | } | ||
| 71 | preempt_enable(); | ||
| 72 | } | ||
| 73 | EXPORT_SYMBOL(on_each_cpu_cond); | ||
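With these UP stubs, callers can use the on_each_cpu*() family unconditionally; on uniprocessor builds the callback simply runs on CPU 0 with interrupt flags saved and restored rather than force-enabled. A hedged usage sketch with made-up names (example_counter, bump_counter):

#include <linux/smp.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, example_counter);

/* Runs on every online CPU with interrupts disabled (just CPU 0 on UP). */
static void bump_counter(void *info)
{
	unsigned long delta = *(unsigned long *)info;

	__this_cpu_add(example_counter, delta);
}

static void bump_all_cpus(unsigned long delta)
{
	/*
	 * Identical call on SMP and UP; the UP stub uses
	 * local_irq_save()/restore() so it does not unconditionally
	 * re-enable interrupts on return.
	 */
	on_each_cpu(bump_counter, &delta, 1);
}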
diff --git a/kernel/user.c b/kernel/user.c index 69b4c3d48cde..5bbb91988e69 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
| @@ -51,8 +51,6 @@ struct user_namespace init_user_ns = { | |||
| 51 | .owner = GLOBAL_ROOT_UID, | 51 | .owner = GLOBAL_ROOT_UID, |
| 52 | .group = GLOBAL_ROOT_GID, | 52 | .group = GLOBAL_ROOT_GID, |
| 53 | .proc_inum = PROC_USER_INIT_INO, | 53 | .proc_inum = PROC_USER_INIT_INO, |
| 54 | .may_mount_sysfs = true, | ||
| 55 | .may_mount_proc = true, | ||
| 56 | }; | 54 | }; |
| 57 | EXPORT_SYMBOL_GPL(init_user_ns); | 55 | EXPORT_SYMBOL_GPL(init_user_ns); |
| 58 | 56 | ||
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9064b919a406..13fb1134ba58 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
| @@ -101,8 +101,6 @@ int create_user_ns(struct cred *new) | |||
| 101 | 101 | ||
| 102 | set_cred_user_ns(new, ns); | 102 | set_cred_user_ns(new, ns); |
| 103 | 103 | ||
| 104 | update_mnt_policy(ns); | ||
| 105 | |||
| 106 | return 0; | 104 | return 0; |
| 107 | } | 105 | } |
| 108 | 106 | ||
diff --git a/kernel/utsname.c b/kernel/utsname.c index 2fc8576efaa8..fd393124e507 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
| @@ -114,7 +114,7 @@ static int utsns_install(struct nsproxy *nsproxy, void *new) | |||
| 114 | struct uts_namespace *ns = new; | 114 | struct uts_namespace *ns = new; |
| 115 | 115 | ||
| 116 | if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || | 116 | if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || |
| 117 | !nsown_capable(CAP_SYS_ADMIN)) | 117 | !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) |
| 118 | return -EPERM; | 118 | return -EPERM; |
| 119 | 119 | ||
| 120 | get_uts_ns(ns); | 120 | get_uts_ns(ns); |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1241d8c91d5e..4431610f049a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
| @@ -486,7 +486,52 @@ static struct smp_hotplug_thread watchdog_threads = { | |||
| 486 | .unpark = watchdog_enable, | 486 | .unpark = watchdog_enable, |
| 487 | }; | 487 | }; |
| 488 | 488 | ||
| 489 | static int watchdog_enable_all_cpus(void) | 489 | static void restart_watchdog_hrtimer(void *info) |
| 490 | { | ||
| 491 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | ||
| 492 | int ret; | ||
| 493 | |||
| 494 | /* | ||
| 495 | * No need to cancel and restart hrtimer if it is currently executing | ||
| 496 | * because it will reprogram itself with the new period now. | ||
| 497 | * We should never see it unqueued here because we are running per-cpu | ||
| 498 | * with interrupts disabled. | ||
| 499 | */ | ||
| 500 | ret = hrtimer_try_to_cancel(hrtimer); | ||
| 501 | if (ret == 1) | ||
| 502 | hrtimer_start(hrtimer, ns_to_ktime(sample_period), | ||
| 503 | HRTIMER_MODE_REL_PINNED); | ||
| 504 | } | ||
| 505 | |||
| 506 | static void update_timers(int cpu) | ||
| 507 | { | ||
| 508 | struct call_single_data data = {.func = restart_watchdog_hrtimer}; | ||
| 509 | /* | ||
| 510 | * Make sure that the perf event counter will adapt to a new | ||
| 511 | * sampling period. Updating the sampling period directly would | ||
| 512 | * be much nicer but we do not have an API for that now so | ||
| 513 | * let's use a big hammer. | ||
| 514 | * Hrtimer will adopt the new period on the next tick but this | ||
| 515 | * might be late already so we have to restart the timer as well. | ||
| 516 | */ | ||
| 517 | watchdog_nmi_disable(cpu); | ||
| 518 | __smp_call_function_single(cpu, &data, 1); | ||
| 519 | watchdog_nmi_enable(cpu); | ||
| 520 | } | ||
| 521 | |||
| 522 | static void update_timers_all_cpus(void) | ||
| 523 | { | ||
| 524 | int cpu; | ||
| 525 | |||
| 526 | get_online_cpus(); | ||
| 527 | preempt_disable(); | ||
| 528 | for_each_online_cpu(cpu) | ||
| 529 | update_timers(cpu); | ||
| 530 | preempt_enable(); | ||
| 531 | put_online_cpus(); | ||
| 532 | } | ||
| 533 | |||
| 534 | static int watchdog_enable_all_cpus(bool sample_period_changed) | ||
| 490 | { | 535 | { |
| 491 | int err = 0; | 536 | int err = 0; |
| 492 | 537 | ||
| @@ -496,6 +541,8 @@ static int watchdog_enable_all_cpus(void) | |||
| 496 | pr_err("Failed to create watchdog threads, disabled\n"); | 541 | pr_err("Failed to create watchdog threads, disabled\n"); |
| 497 | else | 542 | else |
| 498 | watchdog_running = 1; | 543 | watchdog_running = 1; |
| 544 | } else if (sample_period_changed) { | ||
| 545 | update_timers_all_cpus(); | ||
| 499 | } | 546 | } |
| 500 | 547 | ||
| 501 | return err; | 548 | return err; |
| @@ -520,13 +567,15 @@ int proc_dowatchdog(struct ctl_table *table, int write, | |||
| 520 | void __user *buffer, size_t *lenp, loff_t *ppos) | 567 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 521 | { | 568 | { |
| 522 | int err, old_thresh, old_enabled; | 569 | int err, old_thresh, old_enabled; |
| 570 | static DEFINE_MUTEX(watchdog_proc_mutex); | ||
| 523 | 571 | ||
| 572 | mutex_lock(&watchdog_proc_mutex); | ||
| 524 | old_thresh = ACCESS_ONCE(watchdog_thresh); | 573 | old_thresh = ACCESS_ONCE(watchdog_thresh); |
| 525 | old_enabled = ACCESS_ONCE(watchdog_user_enabled); | 574 | old_enabled = ACCESS_ONCE(watchdog_user_enabled); |
| 526 | 575 | ||
| 527 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 576 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
| 528 | if (err || !write) | 577 | if (err || !write) |
| 529 | return err; | 578 | goto out; |
| 530 | 579 | ||
| 531 | set_sample_period(); | 580 | set_sample_period(); |
| 532 | /* | 581 | /* |
| @@ -535,7 +584,7 @@ int proc_dowatchdog(struct ctl_table *table, int write, | |||
| 535 | * watchdog_*_all_cpus() function takes care of this. | 584 | * watchdog_*_all_cpus() function takes care of this. |
| 536 | */ | 585 | */ |
| 537 | if (watchdog_user_enabled && watchdog_thresh) | 586 | if (watchdog_user_enabled && watchdog_thresh) |
| 538 | err = watchdog_enable_all_cpus(); | 587 | err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); |
| 539 | else | 588 | else |
| 540 | watchdog_disable_all_cpus(); | 589 | watchdog_disable_all_cpus(); |
| 541 | 590 | ||
| @@ -544,7 +593,8 @@ int proc_dowatchdog(struct ctl_table *table, int write, | |||
| 544 | watchdog_thresh = old_thresh; | 593 | watchdog_thresh = old_thresh; |
| 545 | watchdog_user_enabled = old_enabled; | 594 | watchdog_user_enabled = old_enabled; |
| 546 | } | 595 | } |
| 547 | 596 | out: | |
| 597 | mutex_unlock(&watchdog_proc_mutex); | ||
| 548 | return err; | 598 | return err; |
| 549 | } | 599 | } |
| 550 | #endif /* CONFIG_SYSCTL */ | 600 | #endif /* CONFIG_SYSCTL */ |
| @@ -553,14 +603,6 @@ void __init lockup_detector_init(void) | |||
| 553 | { | 603 | { |
| 554 | set_sample_period(); | 604 | set_sample_period(); |
| 555 | 605 | ||
| 556 | #ifdef CONFIG_NO_HZ_FULL | ||
| 557 | if (watchdog_user_enabled) { | ||
| 558 | watchdog_user_enabled = 0; | ||
| 559 | pr_warning("Disabled lockup detectors by default for full dynticks\n"); | ||
| 560 | pr_warning("You can reactivate it with 'sysctl -w kernel.watchdog=1'\n"); | ||
| 561 | } | ||
| 562 | #endif | ||
| 563 | |||
| 564 | if (watchdog_user_enabled) | 606 | if (watchdog_user_enabled) |
| 565 | watchdog_enable_all_cpus(); | 607 | watchdog_enable_all_cpus(false); |
| 566 | } | 608 | } |
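The update_timers() path above uses a cross-call so the pinned per-CPU hrtimer is re-armed on its own CPU with the new sample_period. A minimal sketch of the cancel-and-restart step in isolation; restart_with_new_period() is a made-up helper name:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

/*
 * Re-arm an hrtimer with a new period. As in restart_watchdog_hrtimer()
 * above, a timer that is currently firing will pick up the new period by
 * itself, so only restart it if hrtimer_try_to_cancel() really dequeued it
 * (return value 1).
 */
static void restart_with_new_period(struct hrtimer *timer, u64 period_ns)
{
	if (hrtimer_try_to_cancel(timer) == 1)
		hrtimer_start(timer, ns_to_ktime(period_ns),
			      HRTIMER_MODE_REL_PINNED);
}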
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7f5d4be22034..987293d03ebc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -16,9 +16,10 @@ | |||
| 16 | * | 16 | * |
| 17 | * This is the generic async execution mechanism. Work items are | 17 | * This is the generic async execution mechanism. Work items are |
| 18 | * executed in process context. The worker pool is shared and | 18 | * executed in process context. The worker pool is shared and |
| 19 | * automatically managed. There is one worker pool for each CPU and | 19 | * automatically managed. There are two worker pools for each CPU (one for |
| 20 | * one extra for works which are better served by workers which are | 20 | * normal work items and the other for high priority ones) and some extra |
| 21 | * not bound to any specific CPU. | 21 | * pools for workqueues which are not bound to any specific CPU - the |
| 22 | * number of these backing pools is dynamic. | ||
| 22 | * | 23 | * |
| 23 | * Please read Documentation/workqueue.txt for details. | 24 | * Please read Documentation/workqueue.txt for details. |
| 24 | */ | 25 | */ |
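The reworded header comment describes per-CPU worker pools (normal and high priority) plus dynamically created pools backing unbound workqueues. A hedged sketch of how a caller would target each kind; the example_* names are hypothetical:

#include <linux/workqueue.h>
#include <linux/errno.h>

static struct workqueue_struct *example_percpu_wq;
static struct workqueue_struct *example_unbound_wq;

static int __init example_wq_init(void)
{
	/* Served by the per-CPU pools of whichever CPU queues the work. */
	example_percpu_wq = alloc_workqueue("example_percpu", 0, 0);
	/* Served by one of the dynamically managed unbound pools. */
	example_unbound_wq = alloc_workqueue("example_unbound", WQ_UNBOUND, 0);

	if (!example_percpu_wq || !example_unbound_wq)
		goto err;
	return 0;
err:
	if (example_percpu_wq)
		destroy_workqueue(example_percpu_wq);
	if (example_unbound_wq)
		destroy_workqueue(example_unbound_wq);
	return -ENOMEM;
}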
| @@ -540,6 +541,8 @@ static int worker_pool_assign_id(struct worker_pool *pool) | |||
| 540 | * This must be called either with pwq_lock held or sched RCU read locked. | 541 | * This must be called either with pwq_lock held or sched RCU read locked. |
| 541 | * If the pwq needs to be used beyond the locking in effect, the caller is | 542 | * If the pwq needs to be used beyond the locking in effect, the caller is |
| 542 | * responsible for guaranteeing that the pwq stays online. | 543 | * responsible for guaranteeing that the pwq stays online. |
| 544 | * | ||
| 545 | * Return: The unbound pool_workqueue for @node. | ||
| 543 | */ | 546 | */ |
| 544 | static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, | 547 | static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, |
| 545 | int node) | 548 | int node) |
| @@ -638,8 +641,6 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) | |||
| 638 | * get_work_pool - return the worker_pool a given work was associated with | 641 | * get_work_pool - return the worker_pool a given work was associated with |
| 639 | * @work: the work item of interest | 642 | * @work: the work item of interest |
| 640 | * | 643 | * |
| 641 | * Return the worker_pool @work was last associated with. %NULL if none. | ||
| 642 | * | ||
| 643 | * Pools are created and destroyed under wq_pool_mutex, and allow read | 644 | * Pools are created and destroyed under wq_pool_mutex, and allow read |
| 644 | * access under sched-RCU read lock. As such, this function should be | 645 | * access under sched-RCU read lock. As such, this function should be |
| 645 | * called under wq_pool_mutex or with preemption disabled. | 646 | * called under wq_pool_mutex or with preemption disabled. |
| @@ -648,6 +649,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) | |||
| 648 | * mentioned locking is in effect. If the returned pool needs to be used | 649 | * mentioned locking is in effect. If the returned pool needs to be used |
| 649 | * beyond the critical section, the caller is responsible for ensuring the | 650 | * beyond the critical section, the caller is responsible for ensuring the |
| 650 | * returned pool is and stays online. | 651 | * returned pool is and stays online. |
| 652 | * | ||
| 653 | * Return: The worker_pool @work was last associated with. %NULL if none. | ||
| 651 | */ | 654 | */ |
| 652 | static struct worker_pool *get_work_pool(struct work_struct *work) | 655 | static struct worker_pool *get_work_pool(struct work_struct *work) |
| 653 | { | 656 | { |
| @@ -671,7 +674,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) | |||
| 671 | * get_work_pool_id - return the worker pool ID a given work is associated with | 674 | * get_work_pool_id - return the worker pool ID a given work is associated with |
| 672 | * @work: the work item of interest | 675 | * @work: the work item of interest |
| 673 | * | 676 | * |
| 674 | * Return the worker_pool ID @work was last associated with. | 677 | * Return: The worker_pool ID @work was last associated with. |
| 675 | * %WORK_OFFQ_POOL_NONE if none. | 678 | * %WORK_OFFQ_POOL_NONE if none. |
| 676 | */ | 679 | */ |
| 677 | static int get_work_pool_id(struct work_struct *work) | 680 | static int get_work_pool_id(struct work_struct *work) |
| @@ -830,7 +833,7 @@ void wq_worker_waking_up(struct task_struct *task, int cpu) | |||
| 830 | * CONTEXT: | 833 | * CONTEXT: |
| 831 | * spin_lock_irq(rq->lock) | 834 | * spin_lock_irq(rq->lock) |
| 832 | * | 835 | * |
| 833 | * RETURNS: | 836 | * Return: |
| 834 | * Worker task on @cpu to wake up, %NULL if none. | 837 | * Worker task on @cpu to wake up, %NULL if none. |
| 835 | */ | 838 | */ |
| 836 | struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) | 839 | struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) |
| @@ -965,8 +968,8 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) | |||
| 965 | * CONTEXT: | 968 | * CONTEXT: |
| 966 | * spin_lock_irq(pool->lock). | 969 | * spin_lock_irq(pool->lock). |
| 967 | * | 970 | * |
| 968 | * RETURNS: | 971 | * Return: |
| 969 | * Pointer to worker which is executing @work if found, NULL | 972 | * Pointer to worker which is executing @work if found, %NULL |
| 970 | * otherwise. | 973 | * otherwise. |
| 971 | */ | 974 | */ |
| 972 | static struct worker *find_worker_executing_work(struct worker_pool *pool, | 975 | static struct worker *find_worker_executing_work(struct worker_pool *pool, |
| @@ -1154,14 +1157,16 @@ out_put: | |||
| 1154 | * @flags: place to store irq state | 1157 | * @flags: place to store irq state |
| 1155 | * | 1158 | * |
| 1156 | * Try to grab PENDING bit of @work. This function can handle @work in any | 1159 | * Try to grab PENDING bit of @work. This function can handle @work in any |
| 1157 | * stable state - idle, on timer or on worklist. Return values are | 1160 | * stable state - idle, on timer or on worklist. |
| 1158 | * | 1161 | * |
| 1162 | * Return: | ||
| 1159 | * 1 if @work was pending and we successfully stole PENDING | 1163 | * 1 if @work was pending and we successfully stole PENDING |
| 1160 | * 0 if @work was idle and we claimed PENDING | 1164 | * 0 if @work was idle and we claimed PENDING |
| 1161 | * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry | 1165 | * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry |
| 1162 | * -ENOENT if someone else is canceling @work, this state may persist | 1166 | * -ENOENT if someone else is canceling @work, this state may persist |
| 1163 | * for arbitrarily long | 1167 | * for arbitrarily long |
| 1164 | * | 1168 | * |
| 1169 | * Note: | ||
| 1165 | * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting | 1170 | * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting |
| 1166 | * interrupted while holding PENDING and @work off queue, irq must be | 1171 | * interrupted while holding PENDING and @work off queue, irq must be |
| 1167 | * disabled on entry. This, combined with delayed_work->timer being | 1172 | * disabled on entry. This, combined with delayed_work->timer being |
| @@ -1403,10 +1408,10 @@ retry: | |||
| 1403 | * @wq: workqueue to use | 1408 | * @wq: workqueue to use |
| 1404 | * @work: work to queue | 1409 | * @work: work to queue |
| 1405 | * | 1410 | * |
| 1406 | * Returns %false if @work was already on a queue, %true otherwise. | ||
| 1407 | * | ||
| 1408 | * We queue the work to a specific CPU, the caller must ensure it | 1411 | * We queue the work to a specific CPU, the caller must ensure it |
| 1409 | * can't go away. | 1412 | * can't go away. |
| 1413 | * | ||
| 1414 | * Return: %false if @work was already on a queue, %true otherwise. | ||
| 1410 | */ | 1415 | */ |
| 1411 | bool queue_work_on(int cpu, struct workqueue_struct *wq, | 1416 | bool queue_work_on(int cpu, struct workqueue_struct *wq, |
| 1412 | struct work_struct *work) | 1417 | struct work_struct *work) |
| @@ -1476,7 +1481,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, | |||
| 1476 | * @dwork: work to queue | 1481 | * @dwork: work to queue |
| 1477 | * @delay: number of jiffies to wait before queueing | 1482 | * @delay: number of jiffies to wait before queueing |
| 1478 | * | 1483 | * |
| 1479 | * Returns %false if @work was already on a queue, %true otherwise. If | 1484 | * Return: %false if @work was already on a queue, %true otherwise. If |
| 1480 | * @delay is zero and @dwork is idle, it will be scheduled for immediate | 1485 | * @delay is zero and @dwork is idle, it will be scheduled for immediate |
| 1481 | * execution. | 1486 | * execution. |
| 1482 | */ | 1487 | */ |
| @@ -1512,7 +1517,7 @@ EXPORT_SYMBOL(queue_delayed_work_on); | |||
| 1512 | * zero, @work is guaranteed to be scheduled immediately regardless of its | 1517 | * zero, @work is guaranteed to be scheduled immediately regardless of its |
| 1513 | * current state. | 1518 | * current state. |
| 1514 | * | 1519 | * |
| 1515 | * Returns %false if @dwork was idle and queued, %true if @dwork was | 1520 | * Return: %false if @dwork was idle and queued, %true if @dwork was |
| 1516 | * pending and its timer was modified. | 1521 | * pending and its timer was modified. |
| 1517 | * | 1522 | * |
| 1518 | * This function is safe to call from any context including IRQ handler. | 1523 | * This function is safe to call from any context including IRQ handler. |
| @@ -1627,7 +1632,7 @@ static void worker_leave_idle(struct worker *worker) | |||
| 1627 | * Might sleep. Called without any lock but returns with pool->lock | 1632 | * Might sleep. Called without any lock but returns with pool->lock |
| 1628 | * held. | 1633 | * held. |
| 1629 | * | 1634 | * |
| 1630 | * RETURNS: | 1635 | * Return: |
| 1631 | * %true if the associated pool is online (@worker is successfully | 1636 | * %true if the associated pool is online (@worker is successfully |
| 1632 | * bound), %false if offline. | 1637 | * bound), %false if offline. |
| 1633 | */ | 1638 | */ |
| @@ -1688,7 +1693,7 @@ static struct worker *alloc_worker(void) | |||
| 1688 | * CONTEXT: | 1693 | * CONTEXT: |
| 1689 | * Might sleep. Does GFP_KERNEL allocations. | 1694 | * Might sleep. Does GFP_KERNEL allocations. |
| 1690 | * | 1695 | * |
| 1691 | * RETURNS: | 1696 | * Return: |
| 1692 | * Pointer to the newly created worker. | 1697 | * Pointer to the newly created worker. |
| 1693 | */ | 1698 | */ |
| 1694 | static struct worker *create_worker(struct worker_pool *pool) | 1699 | static struct worker *create_worker(struct worker_pool *pool) |
| @@ -1788,6 +1793,8 @@ static void start_worker(struct worker *worker) | |||
| 1788 | * @pool: the target pool | 1793 | * @pool: the target pool |
| 1789 | * | 1794 | * |
| 1790 | * Grab the managership of @pool and create and start a new worker for it. | 1795 | * Grab the managership of @pool and create and start a new worker for it. |
| 1796 | * | ||
| 1797 | * Return: 0 on success. A negative error code otherwise. | ||
| 1791 | */ | 1798 | */ |
| 1792 | static int create_and_start_worker(struct worker_pool *pool) | 1799 | static int create_and_start_worker(struct worker_pool *pool) |
| 1793 | { | 1800 | { |
| @@ -1932,7 +1939,7 @@ static void pool_mayday_timeout(unsigned long __pool) | |||
| 1932 | * multiple times. Does GFP_KERNEL allocations. Called only from | 1939 | * multiple times. Does GFP_KERNEL allocations. Called only from |
| 1933 | * manager. | 1940 | * manager. |
| 1934 | * | 1941 | * |
| 1935 | * RETURNS: | 1942 | * Return: |
| 1936 | * %false if no action was taken and pool->lock stayed locked, %true | 1943 | * %false if no action was taken and pool->lock stayed locked, %true |
| 1937 | * otherwise. | 1944 | * otherwise. |
| 1938 | */ | 1945 | */ |
| @@ -1989,7 +1996,7 @@ restart: | |||
| 1989 | * spin_lock_irq(pool->lock) which may be released and regrabbed | 1996 | * spin_lock_irq(pool->lock) which may be released and regrabbed |
| 1990 | * multiple times. Called only from manager. | 1997 | * multiple times. Called only from manager. |
| 1991 | * | 1998 | * |
| 1992 | * RETURNS: | 1999 | * Return: |
| 1993 | * %false if no action was taken and pool->lock stayed locked, %true | 2000 | * %false if no action was taken and pool->lock stayed locked, %true |
| 1994 | * otherwise. | 2001 | * otherwise. |
| 1995 | */ | 2002 | */ |
| @@ -2032,9 +2039,12 @@ static bool maybe_destroy_workers(struct worker_pool *pool) | |||
| 2032 | * spin_lock_irq(pool->lock) which may be released and regrabbed | 2039 | * spin_lock_irq(pool->lock) which may be released and regrabbed |
| 2033 | * multiple times. Does GFP_KERNEL allocations. | 2040 | * multiple times. Does GFP_KERNEL allocations. |
| 2034 | * | 2041 | * |
| 2035 | * RETURNS: | 2042 | * Return: |
| 2036 | * spin_lock_irq(pool->lock) which may be released and regrabbed | 2043 | * %false if the pool doesn't need management and the caller can safely start |
| 2037 | * multiple times. Does GFP_KERNEL allocations. | 2044 | * processing works, %true indicates that the function released pool->lock |
| 2045 | * and reacquired it to perform some management function and that the | ||
| 2046 | * conditions that the caller verified while holding the lock before | ||
| 2047 | * calling the function might no longer be true. | ||
| 2038 | */ | 2048 | */ |
| 2039 | static bool manage_workers(struct worker *worker) | 2049 | static bool manage_workers(struct worker *worker) |
| 2040 | { | 2050 | { |
| @@ -2201,6 +2211,15 @@ __acquires(&pool->lock) | |||
| 2201 | dump_stack(); | 2211 | dump_stack(); |
| 2202 | } | 2212 | } |
| 2203 | 2213 | ||
| 2214 | /* | ||
| 2215 | * The following prevents a kworker from hogging CPU on !PREEMPT | ||
| 2216 | * kernels, where a requeueing work item waiting for something to | ||
| 2217 | * happen could deadlock with stop_machine as such work item could | ||
| 2218 | * indefinitely requeue itself while all other CPUs are trapped in | ||
| 2219 | * stop_machine. | ||
| 2220 | */ | ||
| 2221 | cond_resched(); | ||
| 2222 | |||
| 2204 | spin_lock_irq(&pool->lock); | 2223 | spin_lock_irq(&pool->lock); |
| 2205 | 2224 | ||
| 2206 | /* clear cpu intensive status */ | 2225 | /* clear cpu intensive status */ |
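The cond_resched() added above targets work items like the hypothetical one below, which requeues itself until some condition holds; on a !PREEMPT kernel such an item could otherwise keep its kworker on the CPU indefinitely. Names (poll_work, device_ready) are made up for illustration:

#include <linux/workqueue.h>
#include <linux/compiler.h>

static void poll_for_ready(struct work_struct *work);
static DECLARE_WORK(poll_work, poll_for_ready);

static bool device_ready;	/* assumed to be set from an interrupt handler */

static void poll_for_ready(struct work_struct *work)
{
	/* Requeue ourselves and check again on the next pass. */
	if (!ACCESS_ONCE(device_ready))
		schedule_work(work);
}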
| @@ -2246,6 +2265,8 @@ static void process_scheduled_works(struct worker *worker) | |||
| 2246 | * work items regardless of their specific target workqueue. The only | 2265 | * work items regardless of their specific target workqueue. The only |
| 2247 | * exception is work items which belong to workqueues with a rescuer which | 2266 | * exception is work items which belong to workqueues with a rescuer which |
| 2248 | * will be explained in rescuer_thread(). | 2267 | * will be explained in rescuer_thread(). |
| 2268 | * | ||
| 2269 | * Return: 0 | ||
| 2249 | */ | 2270 | */ |
| 2250 | static int worker_thread(void *__worker) | 2271 | static int worker_thread(void *__worker) |
| 2251 | { | 2272 | { |
| @@ -2344,6 +2365,8 @@ sleep: | |||
| 2344 | * those works so that forward progress can be guaranteed. | 2365 | * those works so that forward progress can be guaranteed. |
| 2345 | * | 2366 | * |
| 2346 | * This should happen rarely. | 2367 | * This should happen rarely. |
| 2368 | * | ||
| 2369 | * Return: 0 | ||
| 2347 | */ | 2370 | */ |
| 2348 | static int rescuer_thread(void *__rescuer) | 2371 | static int rescuer_thread(void *__rescuer) |
| 2349 | { | 2372 | { |
| @@ -2516,7 +2539,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, | |||
| 2516 | * CONTEXT: | 2539 | * CONTEXT: |
| 2517 | * mutex_lock(wq->mutex). | 2540 | * mutex_lock(wq->mutex). |
| 2518 | * | 2541 | * |
| 2519 | * RETURNS: | 2542 | * Return: |
| 2520 | * %true if @flush_color >= 0 and there's something to flush. %false | 2543 | * %true if @flush_color >= 0 and there's something to flush. %false |
| 2521 | * otherwise. | 2544 | * otherwise. |
| 2522 | */ | 2545 | */ |
| @@ -2837,7 +2860,7 @@ static bool __flush_work(struct work_struct *work) | |||
| 2837 | * Wait until @work has finished execution. @work is guaranteed to be idle | 2860 | * Wait until @work has finished execution. @work is guaranteed to be idle |
| 2838 | * on return if it hasn't been requeued since flush started. | 2861 | * on return if it hasn't been requeued since flush started. |
| 2839 | * | 2862 | * |
| 2840 | * RETURNS: | 2863 | * Return: |
| 2841 | * %true if flush_work() waited for the work to finish execution, | 2864 | * %true if flush_work() waited for the work to finish execution, |
| 2842 | * %false if it was already idle. | 2865 | * %false if it was already idle. |
| 2843 | */ | 2866 | */ |
| @@ -2889,7 +2912,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) | |||
| 2889 | * The caller must ensure that the workqueue on which @work was last | 2912 | * The caller must ensure that the workqueue on which @work was last |
| 2890 | * queued can't be destroyed before this function returns. | 2913 | * queued can't be destroyed before this function returns. |
| 2891 | * | 2914 | * |
| 2892 | * RETURNS: | 2915 | * Return: |
| 2893 | * %true if @work was pending, %false otherwise. | 2916 | * %true if @work was pending, %false otherwise. |
| 2894 | */ | 2917 | */ |
| 2895 | bool cancel_work_sync(struct work_struct *work) | 2918 | bool cancel_work_sync(struct work_struct *work) |
| @@ -2906,7 +2929,7 @@ EXPORT_SYMBOL_GPL(cancel_work_sync); | |||
| 2906 | * immediate execution. Like flush_work(), this function only | 2929 | * immediate execution. Like flush_work(), this function only |
| 2907 | * considers the last queueing instance of @dwork. | 2930 | * considers the last queueing instance of @dwork. |
| 2908 | * | 2931 | * |
| 2909 | * RETURNS: | 2932 | * Return: |
| 2910 | * %true if flush_work() waited for the work to finish execution, | 2933 | * %true if flush_work() waited for the work to finish execution, |
| 2911 | * %false if it was already idle. | 2934 | * %false if it was already idle. |
| 2912 | */ | 2935 | */ |
| @@ -2924,11 +2947,15 @@ EXPORT_SYMBOL(flush_delayed_work); | |||
| 2924 | * cancel_delayed_work - cancel a delayed work | 2947 | * cancel_delayed_work - cancel a delayed work |
| 2925 | * @dwork: delayed_work to cancel | 2948 | * @dwork: delayed_work to cancel |
| 2926 | * | 2949 | * |
| 2927 | * Kill off a pending delayed_work. Returns %true if @dwork was pending | 2950 | * Kill off a pending delayed_work. |
| 2928 | * and canceled; %false if wasn't pending. Note that the work callback | 2951 | * |
| 2929 | * function may still be running on return, unless it returns %true and the | 2952 | * Return: %true if @dwork was pending and canceled; %false if it wasn't |
| 2930 | * work doesn't re-arm itself. Explicitly flush or use | 2953 | * pending. |
| 2931 | * cancel_delayed_work_sync() to wait on it. | 2954 | * |
| 2955 | * Note: | ||
| 2956 | * The work callback function may still be running on return, unless | ||
| 2957 | * it returns %true and the work doesn't re-arm itself. Explicitly flush or | ||
| 2958 | * use cancel_delayed_work_sync() to wait on it. | ||
| 2932 | * | 2959 | * |
| 2933 | * This function is safe to call from any context including IRQ handler. | 2960 | * This function is safe to call from any context including IRQ handler. |
| 2934 | */ | 2961 | */ |
| @@ -2957,7 +2984,7 @@ EXPORT_SYMBOL(cancel_delayed_work); | |||
| 2957 | * | 2984 | * |
| 2958 | * This is cancel_work_sync() for delayed works. | 2985 | * This is cancel_work_sync() for delayed works. |
| 2959 | * | 2986 | * |
| 2960 | * RETURNS: | 2987 | * Return: |
| 2961 | * %true if @dwork was pending, %false otherwise. | 2988 | * %true if @dwork was pending, %false otherwise. |
| 2962 | */ | 2989 | */ |
| 2963 | bool cancel_delayed_work_sync(struct delayed_work *dwork) | 2990 | bool cancel_delayed_work_sync(struct delayed_work *dwork) |
| @@ -2974,7 +3001,7 @@ EXPORT_SYMBOL(cancel_delayed_work_sync); | |||
| 2974 | * system workqueue and blocks until all CPUs have completed. | 3001 | * system workqueue and blocks until all CPUs have completed. |
| 2975 | * schedule_on_each_cpu() is very slow. | 3002 | * schedule_on_each_cpu() is very slow. |
| 2976 | * | 3003 | * |
| 2977 | * RETURNS: | 3004 | * Return: |
| 2978 | * 0 on success, -errno on failure. | 3005 | * 0 on success, -errno on failure. |
| 2979 | */ | 3006 | */ |
| 2980 | int schedule_on_each_cpu(work_func_t func) | 3007 | int schedule_on_each_cpu(work_func_t func) |
| @@ -3042,7 +3069,7 @@ EXPORT_SYMBOL(flush_scheduled_work); | |||
| 3042 | * Executes the function immediately if process context is available, | 3069 | * Executes the function immediately if process context is available, |
| 3043 | * otherwise schedules the function for delayed execution. | 3070 | * otherwise schedules the function for delayed execution. |
| 3044 | * | 3071 | * |
| 3045 | * Returns: 0 - function was executed | 3072 | * Return: 0 - function was executed |
| 3046 | * 1 - function was scheduled for execution | 3073 | * 1 - function was scheduled for execution |
| 3047 | */ | 3074 | */ |
| 3048 | int execute_in_process_context(work_func_t fn, struct execute_work *ew) | 3075 | int execute_in_process_context(work_func_t fn, struct execute_work *ew) |
| @@ -3086,25 +3113,26 @@ static struct workqueue_struct *dev_to_wq(struct device *dev) | |||
| 3086 | return wq_dev->wq; | 3113 | return wq_dev->wq; |
| 3087 | } | 3114 | } |
| 3088 | 3115 | ||
| 3089 | static ssize_t wq_per_cpu_show(struct device *dev, | 3116 | static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, |
| 3090 | struct device_attribute *attr, char *buf) | 3117 | char *buf) |
| 3091 | { | 3118 | { |
| 3092 | struct workqueue_struct *wq = dev_to_wq(dev); | 3119 | struct workqueue_struct *wq = dev_to_wq(dev); |
| 3093 | 3120 | ||
| 3094 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); | 3121 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); |
| 3095 | } | 3122 | } |
| 3123 | static DEVICE_ATTR_RO(per_cpu); | ||
| 3096 | 3124 | ||
| 3097 | static ssize_t wq_max_active_show(struct device *dev, | 3125 | static ssize_t max_active_show(struct device *dev, |
| 3098 | struct device_attribute *attr, char *buf) | 3126 | struct device_attribute *attr, char *buf) |
| 3099 | { | 3127 | { |
| 3100 | struct workqueue_struct *wq = dev_to_wq(dev); | 3128 | struct workqueue_struct *wq = dev_to_wq(dev); |
| 3101 | 3129 | ||
| 3102 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); | 3130 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); |
| 3103 | } | 3131 | } |
| 3104 | 3132 | ||
| 3105 | static ssize_t wq_max_active_store(struct device *dev, | 3133 | static ssize_t max_active_store(struct device *dev, |
| 3106 | struct device_attribute *attr, | 3134 | struct device_attribute *attr, const char *buf, |
| 3107 | const char *buf, size_t count) | 3135 | size_t count) |
| 3108 | { | 3136 | { |
| 3109 | struct workqueue_struct *wq = dev_to_wq(dev); | 3137 | struct workqueue_struct *wq = dev_to_wq(dev); |
| 3110 | int val; | 3138 | int val; |
| @@ -3115,12 +3143,14 @@ static ssize_t wq_max_active_store(struct device *dev, | |||
| 3115 | workqueue_set_max_active(wq, val); | 3143 | workqueue_set_max_active(wq, val); |
| 3116 | return count; | 3144 | return count; |
| 3117 | } | 3145 | } |
| 3146 | static DEVICE_ATTR_RW(max_active); | ||
| 3118 | 3147 | ||
| 3119 | static struct device_attribute wq_sysfs_attrs[] = { | 3148 | static struct attribute *wq_sysfs_attrs[] = { |
| 3120 | __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), | 3149 | &dev_attr_per_cpu.attr, |
| 3121 | __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), | 3150 | &dev_attr_max_active.attr, |
| 3122 | __ATTR_NULL, | 3151 | NULL, |
| 3123 | }; | 3152 | }; |
| 3153 | ATTRIBUTE_GROUPS(wq_sysfs); | ||
| 3124 | 3154 | ||
| 3125 | static ssize_t wq_pool_ids_show(struct device *dev, | 3155 | static ssize_t wq_pool_ids_show(struct device *dev, |
| 3126 | struct device_attribute *attr, char *buf) | 3156 | struct device_attribute *attr, char *buf) |
| @@ -3270,7 +3300,7 @@ static struct device_attribute wq_sysfs_unbound_attrs[] = { | |||
| 3270 | 3300 | ||
| 3271 | static struct bus_type wq_subsys = { | 3301 | static struct bus_type wq_subsys = { |
| 3272 | .name = "workqueue", | 3302 | .name = "workqueue", |
| 3273 | .dev_attrs = wq_sysfs_attrs, | 3303 | .dev_groups = wq_sysfs_groups, |
| 3274 | }; | 3304 | }; |
| 3275 | 3305 | ||
| 3276 | static int __init wq_sysfs_init(void) | 3306 | static int __init wq_sysfs_init(void) |
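The conversion above replaces the open-coded wq_sysfs_attrs[] array with DEVICE_ATTR_RO()/DEVICE_ATTR_RW() plus ATTRIBUTE_GROUPS(), which generates the *_groups array that bus_type.dev_groups expects. A minimal sketch of the same pattern for a hypothetical "example" subsystem:

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/sysfs.h>

/* DEVICE_ATTR_RO(mode) expects a handler named mode_show(). */
static ssize_t mode_show(struct device *dev, struct device_attribute *attr,
			 char *buf)
{
	return scnprintf(buf, PAGE_SIZE, "%d\n", 0);	/* placeholder value */
}
static DEVICE_ATTR_RO(mode);

static struct attribute *example_attrs[] = {
	&dev_attr_mode.attr,
	NULL,
};
ATTRIBUTE_GROUPS(example);		/* generates example_groups[] */

static struct bus_type example_subsys = {
	.name		= "example",
	.dev_groups	= example_groups,
};

The same naming convention is why the handlers in this hunk were renamed from wq_max_active_show()/wq_max_active_store() to max_active_show()/max_active_store(): DEVICE_ATTR_RW(max_active) picks them up automatically.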
| @@ -3299,7 +3329,7 @@ static void wq_device_release(struct device *dev) | |||
| 3299 | * apply_workqueue_attrs() may race against userland updating the | 3329 | * apply_workqueue_attrs() may race against userland updating the |
| 3300 | * attributes. | 3330 | * attributes. |
| 3301 | * | 3331 | * |
| 3302 | * Returns 0 on success, -errno on failure. | 3332 | * Return: 0 on success, -errno on failure. |
| 3303 | */ | 3333 | */ |
| 3304 | int workqueue_sysfs_register(struct workqueue_struct *wq) | 3334 | int workqueue_sysfs_register(struct workqueue_struct *wq) |
| 3305 | { | 3335 | { |
| @@ -3392,7 +3422,9 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs) | |||
| 3392 | * @gfp_mask: allocation mask to use | 3422 | * @gfp_mask: allocation mask to use |
| 3393 | * | 3423 | * |
| 3394 | * Allocate a new workqueue_attrs, initialize with default settings and | 3424 | * Allocate a new workqueue_attrs, initialize with default settings and |
| 3395 | * return it. Returns NULL on failure. | 3425 | * return it. |
| 3426 | * | ||
| 3427 | * Return: The allocated new workqueue_attrs on success. %NULL on failure. | ||
| 3396 | */ | 3428 | */ |
| 3397 | struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) | 3429 | struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) |
| 3398 | { | 3430 | { |
| @@ -3451,7 +3483,8 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, | |||
| 3451 | * @pool: worker_pool to initialize | 3483 | * @pool: worker_pool to initialize |
| 3452 | * | 3484 | * |
| 3453 | * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. | 3485 | * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. |
| 3454 | * Returns 0 on success, -errno on failure. Even on failure, all fields | 3486 | * |
| 3487 | * Return: 0 on success, -errno on failure. Even on failure, all fields | ||
| 3455 | * inside @pool proper are initialized and put_unbound_pool() can be called | 3488 | * inside @pool proper are initialized and put_unbound_pool() can be called |
| 3456 | * on @pool safely to release it. | 3489 | * on @pool safely to release it. |
| 3457 | */ | 3490 | */ |
| @@ -3558,9 +3591,12 @@ static void put_unbound_pool(struct worker_pool *pool) | |||
| 3558 | * Obtain a worker_pool which has the same attributes as @attrs, bump the | 3591 | * Obtain a worker_pool which has the same attributes as @attrs, bump the |
| 3559 | * reference count and return it. If there already is a matching | 3592 | * reference count and return it. If there already is a matching |
| 3560 | * worker_pool, it will be used; otherwise, this function attempts to | 3593 | * worker_pool, it will be used; otherwise, this function attempts to |
| 3561 | * create a new one. On failure, returns NULL. | 3594 | * create a new one. |
| 3562 | * | 3595 | * |
| 3563 | * Should be called with wq_pool_mutex held. | 3596 | * Should be called with wq_pool_mutex held. |
| 3597 | * | ||
| 3598 | * Return: On success, a worker_pool with the same attributes as @attrs. | ||
| 3599 | * On failure, %NULL. | ||
| 3564 | */ | 3600 | */ |
| 3565 | static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) | 3601 | static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) |
| 3566 | { | 3602 | { |
| @@ -3796,9 +3832,7 @@ static void free_unbound_pwq(struct pool_workqueue *pwq) | |||
| 3796 | * | 3832 | * |
| 3797 | * Calculate the cpumask a workqueue with @attrs should use on @node. If | 3833 | * Calculate the cpumask a workqueue with @attrs should use on @node. If |
| 3798 | * @cpu_going_down is >= 0, that cpu is considered offline during | 3834 | * @cpu_going_down is >= 0, that cpu is considered offline during |
| 3799 | * calculation. The result is stored in @cpumask. This function returns | 3835 | * calculation. The result is stored in @cpumask. |
| 3800 | * %true if the resulting @cpumask is different from @attrs->cpumask, | ||
| 3801 | * %false if equal. | ||
| 3802 | * | 3836 | * |
| 3803 | * If NUMA affinity is not enabled, @attrs->cpumask is always used. If | 3837 | * If NUMA affinity is not enabled, @attrs->cpumask is always used. If |
| 3804 | * enabled and @node has online CPUs requested by @attrs, the returned | 3838 | * enabled and @node has online CPUs requested by @attrs, the returned |
| @@ -3807,6 +3841,9 @@ static void free_unbound_pwq(struct pool_workqueue *pwq) | |||
| 3807 | * | 3841 | * |
| 3808 | * The caller is responsible for ensuring that the cpumask of @node stays | 3842 | * The caller is responsible for ensuring that the cpumask of @node stays |
| 3809 | * stable. | 3843 | * stable. |
| 3844 | * | ||
| 3845 | * Return: %true if the resulting @cpumask is different from @attrs->cpumask, | ||
| 3846 | * %false if equal. | ||
| 3810 | */ | 3847 | */ |
| 3811 | static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, | 3848 | static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, |
| 3812 | int cpu_going_down, cpumask_t *cpumask) | 3849 | int cpu_going_down, cpumask_t *cpumask) |
| @@ -3860,8 +3897,9 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, | |||
| 3860 | * items finish. Note that a work item which repeatedly requeues itself | 3897 | * items finish. Note that a work item which repeatedly requeues itself |
| 3861 | * back-to-back will stay on its current pwq. | 3898 | * back-to-back will stay on its current pwq. |
| 3862 | * | 3899 | * |
| 3863 | * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on | 3900 | * Performs GFP_KERNEL allocations. |
| 3864 | * failure. | 3901 | * |
| 3902 | * Return: 0 on success and -errno on failure. | ||
| 3865 | */ | 3903 | */ |
| 3866 | int apply_workqueue_attrs(struct workqueue_struct *wq, | 3904 | int apply_workqueue_attrs(struct workqueue_struct *wq, |
| 3867 | const struct workqueue_attrs *attrs) | 3905 | const struct workqueue_attrs *attrs) |
| @@ -4329,6 +4367,8 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active); | |||
| 4329 | * | 4367 | * |
| 4330 | * Determine whether %current is a workqueue rescuer. Can be used from | 4368 | * Determine whether %current is a workqueue rescuer. Can be used from |
| 4331 | * work functions to determine whether it's being run off the rescuer task. | 4369 | * work functions to determine whether it's being run off the rescuer task. |
| 4370 | * | ||
| 4371 | * Return: %true if %current is a workqueue rescuer. %false otherwise. | ||
| 4332 | */ | 4372 | */ |
| 4333 | bool current_is_workqueue_rescuer(void) | 4373 | bool current_is_workqueue_rescuer(void) |
| 4334 | { | 4374 | { |
| @@ -4352,7 +4392,7 @@ bool current_is_workqueue_rescuer(void) | |||
| 4352 | * workqueue being congested on one CPU doesn't mean the workqueue is also | 4392 | * workqueue being congested on one CPU doesn't mean the workqueue is also |
| 4353 | * congested on other CPUs / NUMA nodes. | 4393 | * congested on other CPUs / NUMA nodes. |
| 4354 | * | 4394 | * |
| 4355 | * RETURNS: | 4395 | * Return: |
| 4356 | * %true if congested, %false otherwise. | 4396 | * %true if congested, %false otherwise. |
| 4357 | */ | 4397 | */ |
| 4358 | bool workqueue_congested(int cpu, struct workqueue_struct *wq) | 4398 | bool workqueue_congested(int cpu, struct workqueue_struct *wq) |
| @@ -4385,7 +4425,7 @@ EXPORT_SYMBOL_GPL(workqueue_congested); | |||
| 4385 | * synchronization around this function and the test result is | 4425 | * synchronization around this function and the test result is |
| 4386 | * unreliable and only useful as advisory hints or for debugging. | 4426 | * unreliable and only useful as advisory hints or for debugging. |
| 4387 | * | 4427 | * |
| 4388 | * RETURNS: | 4428 | * Return: |
| 4389 | * OR'd bitmask of WORK_BUSY_* bits. | 4429 | * OR'd bitmask of WORK_BUSY_* bits. |
| 4390 | */ | 4430 | */ |
| 4391 | unsigned int work_busy(struct work_struct *work) | 4431 | unsigned int work_busy(struct work_struct *work) |
| @@ -4763,9 +4803,10 @@ static void work_for_cpu_fn(struct work_struct *work) | |||
| 4763 | * @fn: the function to run | 4803 | * @fn: the function to run |
| 4764 | * @arg: the function arg | 4804 | * @arg: the function arg |
| 4765 | * | 4805 | * |
| 4766 | * This will return the value @fn returns. | ||
| 4767 | * It is up to the caller to ensure that the cpu doesn't go offline. | 4806 | * It is up to the caller to ensure that the cpu doesn't go offline. |
| 4768 | * The caller must not hold any locks which would prevent @fn from completing. | 4807 | * The caller must not hold any locks which would prevent @fn from completing. |
| 4808 | * | ||
| 4809 | * Return: The value @fn returns. | ||
| 4769 | */ | 4810 | */ |
| 4770 | long work_on_cpu(int cpu, long (*fn)(void *), void *arg) | 4811 | long work_on_cpu(int cpu, long (*fn)(void *), void *arg) |
| 4771 | { | 4812 | { |
| @@ -4837,7 +4878,7 @@ void freeze_workqueues_begin(void) | |||
| 4837 | * CONTEXT: | 4878 | * CONTEXT: |
| 4838 | * Grabs and releases wq_pool_mutex. | 4879 | * Grabs and releases wq_pool_mutex. |
| 4839 | * | 4880 | * |
| 4840 | * RETURNS: | 4881 | * Return: |
| 4841 | * %true if some freezable workqueues are still busy. %false if freezing | 4882 | * %true if some freezable workqueues are still busy. %false if freezing |
| 4842 | * is complete. | 4883 | * is complete. |
| 4843 | */ | 4884 | */ |
