author     Eric Paris <eparis@redhat.com>   2013-11-22 18:57:08 -0500
committer  Eric Paris <eparis@redhat.com>   2013-11-22 18:57:54 -0500
commit     fc582aef7dcc27a7120cf232c1e76c569c7b6eab
tree       7d275dd4ceab6067b91e9a25a5f6338b425fbccd /kernel
parent     9175c9d2aed528800175ef81c90569d00d23f9be
parent     5e01dc7b26d9f24f39abace5da98ccbd6a5ceb52
Merge tag 'v3.12'
Linux 3.12
Conflicts:
fs/exec.c
Diffstat (limited to 'kernel')
81 files changed, 3684 insertions, 2757 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 35ef1185e359..1ce47553fb02 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -26,6 +26,7 @@ obj-y += sched/ | |||
26 | obj-y += power/ | 26 | obj-y += power/ |
27 | obj-y += printk/ | 27 | obj-y += printk/ |
28 | obj-y += cpu/ | 28 | obj-y += cpu/ |
29 | obj-y += irq/ | ||
29 | 30 | ||
30 | obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o | 31 | obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o |
31 | obj-$(CONFIG_FREEZER) += freezer.o | 32 | obj-$(CONFIG_FREEZER) += freezer.o |
@@ -79,7 +80,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o | |||
79 | obj-$(CONFIG_KGDB) += debug/ | 80 | obj-$(CONFIG_KGDB) += debug/ |
80 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o | 81 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o |
81 | obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o | 82 | obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o |
82 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | ||
83 | obj-$(CONFIG_SECCOMP) += seccomp.o | 83 | obj-$(CONFIG_SECCOMP) += seccomp.o |
84 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 84 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
85 | obj-$(CONFIG_TREE_RCU) += rcutree.o | 85 | obj-$(CONFIG_TREE_RCU) += rcutree.o |
diff --git a/kernel/audit.c b/kernel/audit.c
index b8831ac25b70..906ae5a0233a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1222,9 +1222,10 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, | |||
1222 | 1222 | ||
1223 | sleep_time = timeout_start + audit_backlog_wait_time - | 1223 | sleep_time = timeout_start + audit_backlog_wait_time - |
1224 | jiffies; | 1224 | jiffies; |
1225 | if ((long)sleep_time > 0) | 1225 | if ((long)sleep_time > 0) { |
1226 | wait_for_auditd(sleep_time); | 1226 | wait_for_auditd(sleep_time); |
1227 | continue; | 1227 | continue; |
1228 | } | ||
1228 | } | 1229 | } |
1229 | if (audit_rate_check() && printk_ratelimit()) | 1230 | if (audit_rate_check() && printk_ratelimit()) |
1230 | printk(KERN_WARNING | 1231 | printk(KERN_WARNING |
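The audit.c hunk above only adds braces, but that changes what the continue guards: before, the retry ran unconditionally, so once the computed sleep_time was no longer positive the loop simply restarted instead of falling through to the rate-limited warning shown at the end of the hunk; after, an expired sleep_time reaches that path. A minimal user-space sketch of the corrected control flow (backlog_full(), remaining_wait() and the fixed budget are stand-in names for illustration, not kernel code):

#include <stdio.h>

static int  pending = 3;                        /* pretend backlog     */
static long budget  = 2;                        /* pretend wait budget */

static int  backlog_full(void)      { return pending > 0; }
static long remaining_wait(void)    { return budget--; }
static void wait_for_auditd(long t) { printf("wait %ld\n", t); pending--; }

int main(void)
{
    while (backlog_full()) {
        long sleep_time = remaining_wait();

        if (sleep_time > 0) {          /* new form: retry only */
            wait_for_auditd(sleep_time);
            continue;                  /* while time remains   */
        }
        /*
         * The old form ran "continue" unconditionally here, so this
         * fallback was unreachable on this path once time ran out.
         */
        printf("backlog wait exhausted, giving up\n");
        break;
    }
    return 0;
}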
diff --git a/kernel/capability.c b/kernel/capability.c
index f6c2ce5701e1..4e66bf9275b0 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -433,18 +433,6 @@ bool capable(int cap) | |||
433 | EXPORT_SYMBOL(capable); | 433 | EXPORT_SYMBOL(capable); |
434 | 434 | ||
435 | /** | 435 | /** |
436 | * nsown_capable - Check superior capability to one's own user_ns | ||
437 | * @cap: The capability in question | ||
438 | * | ||
439 | * Return true if the current task has the given superior capability | ||
440 | * targeted at its own user namespace. | ||
441 | */ | ||
442 | bool nsown_capable(int cap) | ||
443 | { | ||
444 | return ns_capable(current_user_ns(), cap); | ||
445 | } | ||
446 | |||
447 | /** | ||
448 | * inode_capable - Check superior capability over inode | 436 | * inode_capable - Check superior capability over inode |
449 | * @inode: The inode in question | 437 | * @inode: The inode in question |
450 | * @cap: The capability in question | 438 | * @cap: The capability in question |
@@ -464,3 +452,4 @@ bool inode_capable(const struct inode *inode, int cap) | |||
464 | 452 | ||
465 | return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); | 453 | return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); |
466 | } | 454 | } |
455 | EXPORT_SYMBOL(inode_capable); | ||
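The capability.c hunk above removes nsown_capable(); its entire body, visible in the deleted lines, was a single call to ns_capable(current_user_ns(), cap), so call sites convert mechanically. A sketch of that conversion at a hypothetical call site (example_ns_op() is a made-up name; kernel build context assumed):

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/errno.h>

/* Hypothetical caller, shown only to illustrate the conversion. */
static int example_ns_op(void)
{
    /* before: if (!nsown_capable(CAP_SYS_ADMIN)) */
    if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
        return -EPERM;
    return 0;
}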
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e91963302c0d..8bd9cfdc70d7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,6 +60,7 @@ | |||
60 | #include <linux/poll.h> | 60 | #include <linux/poll.h> |
61 | #include <linux/flex_array.h> /* used in cgroup_attach_task */ | 61 | #include <linux/flex_array.h> /* used in cgroup_attach_task */ |
62 | #include <linux/kthread.h> | 62 | #include <linux/kthread.h> |
63 | #include <linux/file.h> | ||
63 | 64 | ||
64 | #include <linux/atomic.h> | 65 | #include <linux/atomic.h> |
65 | 66 | ||
@@ -81,7 +82,7 @@ | |||
81 | */ | 82 | */ |
82 | #ifdef CONFIG_PROVE_RCU | 83 | #ifdef CONFIG_PROVE_RCU |
83 | DEFINE_MUTEX(cgroup_mutex); | 84 | DEFINE_MUTEX(cgroup_mutex); |
84 | EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ | 85 | EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ |
85 | #else | 86 | #else |
86 | static DEFINE_MUTEX(cgroup_mutex); | 87 | static DEFINE_MUTEX(cgroup_mutex); |
87 | #endif | 88 | #endif |
@@ -117,6 +118,7 @@ struct cfent { | |||
117 | struct list_head node; | 118 | struct list_head node; |
118 | struct dentry *dentry; | 119 | struct dentry *dentry; |
119 | struct cftype *type; | 120 | struct cftype *type; |
121 | struct cgroup_subsys_state *css; | ||
120 | 122 | ||
121 | /* file xattrs */ | 123 | /* file xattrs */ |
122 | struct simple_xattrs xattrs; | 124 | struct simple_xattrs xattrs; |
@@ -159,9 +161,9 @@ struct css_id { | |||
159 | */ | 161 | */ |
160 | struct cgroup_event { | 162 | struct cgroup_event { |
161 | /* | 163 | /* |
162 | * Cgroup which the event belongs to. | 164 | * css which the event belongs to. |
163 | */ | 165 | */ |
164 | struct cgroup *cgrp; | 166 | struct cgroup_subsys_state *css; |
165 | /* | 167 | /* |
166 | * Control file which the event associated. | 168 | * Control file which the event associated. |
167 | */ | 169 | */ |
@@ -215,10 +217,33 @@ static u64 cgroup_serial_nr_next = 1; | |||
215 | */ | 217 | */ |
216 | static int need_forkexit_callback __read_mostly; | 218 | static int need_forkexit_callback __read_mostly; |
217 | 219 | ||
218 | static void cgroup_offline_fn(struct work_struct *work); | 220 | static struct cftype cgroup_base_files[]; |
221 | |||
222 | static void cgroup_destroy_css_killed(struct cgroup *cgrp); | ||
219 | static int cgroup_destroy_locked(struct cgroup *cgrp); | 223 | static int cgroup_destroy_locked(struct cgroup *cgrp); |
220 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | 224 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], |
221 | struct cftype cfts[], bool is_add); | 225 | bool is_add); |
226 | |||
227 | /** | ||
228 | * cgroup_css - obtain a cgroup's css for the specified subsystem | ||
229 | * @cgrp: the cgroup of interest | ||
230 | * @ss: the subsystem of interest (%NULL returns the dummy_css) | ||
231 | * | ||
232 | * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This | ||
233 | * function must be called either under cgroup_mutex or rcu_read_lock() and | ||
234 | * the caller is responsible for pinning the returned css if it wants to | ||
235 | * keep accessing it outside the said locks. This function may return | ||
236 | * %NULL if @cgrp doesn't have @subsys_id enabled. | ||
237 | */ | ||
238 | static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, | ||
239 | struct cgroup_subsys *ss) | ||
240 | { | ||
241 | if (ss) | ||
242 | return rcu_dereference_check(cgrp->subsys[ss->subsys_id], | ||
243 | lockdep_is_held(&cgroup_mutex)); | ||
244 | else | ||
245 | return &cgrp->dummy_css; | ||
246 | } | ||
222 | 247 | ||
223 | /* convenient tests for these bits */ | 248 | /* convenient tests for these bits */ |
224 | static inline bool cgroup_is_dead(const struct cgroup *cgrp) | 249 | static inline bool cgroup_is_dead(const struct cgroup *cgrp) |
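The cgroup_css() accessor added in the hunk above is what the rest of this diff substitutes for direct cgrp->subsys[i] dereferences. A sketch of the intended call pattern inside kernel/cgroup.c (report_css() and its pr_info() output are illustrative only; the locking rule comes from the helper's own comment):

/*
 * Illustrative only: hold rcu_read_lock() or cgroup_mutex, as the
 * helper's comment requires.
 */
static void report_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
    struct cgroup_subsys_state *css;

    rcu_read_lock();
    css = cgroup_css(cgrp, ss);    /* NULL if @ss is not enabled on @cgrp */
    if (css)
        pr_info("%s is enabled on cgroup %d\n", ss->name, cgrp->id);
    rcu_read_unlock();
}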
@@ -365,9 +390,11 @@ static struct cgrp_cset_link init_cgrp_cset_link; | |||
365 | static int cgroup_init_idr(struct cgroup_subsys *ss, | 390 | static int cgroup_init_idr(struct cgroup_subsys *ss, |
366 | struct cgroup_subsys_state *css); | 391 | struct cgroup_subsys_state *css); |
367 | 392 | ||
368 | /* css_set_lock protects the list of css_set objects, and the | 393 | /* |
369 | * chain of tasks off each css_set. Nests outside task->alloc_lock | 394 | * css_set_lock protects the list of css_set objects, and the chain of |
370 | * due to cgroup_iter_start() */ | 395 | * tasks off each css_set. Nests outside task->alloc_lock due to |
396 | * css_task_iter_start(). | ||
397 | */ | ||
371 | static DEFINE_RWLOCK(css_set_lock); | 398 | static DEFINE_RWLOCK(css_set_lock); |
372 | static int css_set_count; | 399 | static int css_set_count; |
373 | 400 | ||
@@ -392,10 +419,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) | |||
392 | return key; | 419 | return key; |
393 | } | 420 | } |
394 | 421 | ||
395 | /* We don't maintain the lists running through each css_set to its | 422 | /* |
396 | * task until after the first call to cgroup_iter_start(). This | 423 | * We don't maintain the lists running through each css_set to its task |
397 | * reduces the fork()/exit() overhead for people who have cgroups | 424 | * until after the first call to css_task_iter_start(). This reduces the |
398 | * compiled into their kernel but not actually in use */ | 425 | * fork()/exit() overhead for people who have cgroups compiled into their |
426 | * kernel but not actually in use. | ||
427 | */ | ||
399 | static int use_task_css_set_links __read_mostly; | 428 | static int use_task_css_set_links __read_mostly; |
400 | 429 | ||
401 | static void __put_css_set(struct css_set *cset, int taskexit) | 430 | static void __put_css_set(struct css_set *cset, int taskexit) |
@@ -464,7 +493,7 @@ static inline void put_css_set_taskexit(struct css_set *cset) | |||
464 | * @new_cgrp: cgroup that's being entered by the task | 493 | * @new_cgrp: cgroup that's being entered by the task |
465 | * @template: desired set of css pointers in css_set (pre-calculated) | 494 | * @template: desired set of css pointers in css_set (pre-calculated) |
466 | * | 495 | * |
467 | * Returns true if "cg" matches "old_cg" except for the hierarchy | 496 | * Returns true if "cset" matches "old_cset" except for the hierarchy |
468 | * which "new_cgrp" belongs to, for which it should match "new_cgrp". | 497 | * which "new_cgrp" belongs to, for which it should match "new_cgrp". |
469 | */ | 498 | */ |
470 | static bool compare_css_sets(struct css_set *cset, | 499 | static bool compare_css_sets(struct css_set *cset, |
@@ -555,7 +584,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, | |||
555 | /* Subsystem is in this hierarchy. So we want | 584 | /* Subsystem is in this hierarchy. So we want |
556 | * the subsystem state from the new | 585 | * the subsystem state from the new |
557 | * cgroup */ | 586 | * cgroup */ |
558 | template[i] = cgrp->subsys[i]; | 587 | template[i] = cgroup_css(cgrp, ss); |
559 | } else { | 588 | } else { |
560 | /* Subsystem is not in this hierarchy, so we | 589 | /* Subsystem is not in this hierarchy, so we |
561 | * don't want to change the subsystem state */ | 590 | * don't want to change the subsystem state */ |
@@ -803,8 +832,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, | |||
803 | 832 | ||
804 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); | 833 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); |
805 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); | 834 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); |
806 | static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, | 835 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); |
807 | unsigned long subsys_mask); | ||
808 | static const struct inode_operations cgroup_dir_inode_operations; | 836 | static const struct inode_operations cgroup_dir_inode_operations; |
809 | static const struct file_operations proc_cgroupstats_operations; | 837 | static const struct file_operations proc_cgroupstats_operations; |
810 | 838 | ||
@@ -813,8 +841,7 @@ static struct backing_dev_info cgroup_backing_dev_info = { | |||
813 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 841 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, |
814 | }; | 842 | }; |
815 | 843 | ||
816 | static int alloc_css_id(struct cgroup_subsys *ss, | 844 | static int alloc_css_id(struct cgroup_subsys_state *child_css); |
817 | struct cgroup *parent, struct cgroup *child); | ||
818 | 845 | ||
819 | static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) | 846 | static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) |
820 | { | 847 | { |
@@ -845,15 +872,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) | |||
845 | static void cgroup_free_fn(struct work_struct *work) | 872 | static void cgroup_free_fn(struct work_struct *work) |
846 | { | 873 | { |
847 | struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); | 874 | struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); |
848 | struct cgroup_subsys *ss; | ||
849 | 875 | ||
850 | mutex_lock(&cgroup_mutex); | 876 | mutex_lock(&cgroup_mutex); |
851 | /* | ||
852 | * Release the subsystem state objects. | ||
853 | */ | ||
854 | for_each_root_subsys(cgrp->root, ss) | ||
855 | ss->css_free(cgrp); | ||
856 | |||
857 | cgrp->root->number_of_cgroups--; | 877 | cgrp->root->number_of_cgroups--; |
858 | mutex_unlock(&cgroup_mutex); | 878 | mutex_unlock(&cgroup_mutex); |
859 | 879 | ||
@@ -864,8 +884,6 @@ static void cgroup_free_fn(struct work_struct *work) | |||
864 | */ | 884 | */ |
865 | dput(cgrp->parent->dentry); | 885 | dput(cgrp->parent->dentry); |
866 | 886 | ||
867 | ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | ||
868 | |||
869 | /* | 887 | /* |
870 | * Drop the active superblock reference that we took when we | 888 | * Drop the active superblock reference that we took when we |
871 | * created the cgroup. This will free cgrp->root, if we are | 889 | * created the cgroup. This will free cgrp->root, if we are |
@@ -956,27 +974,22 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | |||
956 | } | 974 | } |
957 | 975 | ||
958 | /** | 976 | /** |
959 | * cgroup_clear_directory - selective removal of base and subsystem files | 977 | * cgroup_clear_dir - remove subsys files in a cgroup directory |
960 | * @dir: directory containing the files | 978 | * @cgrp: target cgroup |
961 | * @base_files: true if the base files should be removed | ||
962 | * @subsys_mask: mask of the subsystem ids whose files should be removed | 979 | * @subsys_mask: mask of the subsystem ids whose files should be removed |
963 | */ | 980 | */ |
964 | static void cgroup_clear_directory(struct dentry *dir, bool base_files, | 981 | static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) |
965 | unsigned long subsys_mask) | ||
966 | { | 982 | { |
967 | struct cgroup *cgrp = __d_cgrp(dir); | ||
968 | struct cgroup_subsys *ss; | 983 | struct cgroup_subsys *ss; |
984 | int i; | ||
969 | 985 | ||
970 | for_each_root_subsys(cgrp->root, ss) { | 986 | for_each_subsys(ss, i) { |
971 | struct cftype_set *set; | 987 | struct cftype_set *set; |
972 | if (!test_bit(ss->subsys_id, &subsys_mask)) | 988 | |
989 | if (!test_bit(i, &subsys_mask)) | ||
973 | continue; | 990 | continue; |
974 | list_for_each_entry(set, &ss->cftsets, node) | 991 | list_for_each_entry(set, &ss->cftsets, node) |
975 | cgroup_addrm_files(cgrp, NULL, set->cfts, false); | 992 | cgroup_addrm_files(cgrp, set->cfts, false); |
976 | } | ||
977 | if (base_files) { | ||
978 | while (!list_empty(&cgrp->files)) | ||
979 | cgroup_rm_file(cgrp, NULL); | ||
980 | } | 993 | } |
981 | } | 994 | } |
982 | 995 | ||
@@ -986,9 +999,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files, | |||
986 | static void cgroup_d_remove_dir(struct dentry *dentry) | 999 | static void cgroup_d_remove_dir(struct dentry *dentry) |
987 | { | 1000 | { |
988 | struct dentry *parent; | 1001 | struct dentry *parent; |
989 | struct cgroupfs_root *root = dentry->d_sb->s_fs_info; | ||
990 | |||
991 | cgroup_clear_directory(dentry, true, root->subsys_mask); | ||
992 | 1002 | ||
993 | parent = dentry->d_parent; | 1003 | parent = dentry->d_parent; |
994 | spin_lock(&parent->d_lock); | 1004 | spin_lock(&parent->d_lock); |
@@ -1009,79 +1019,84 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
1009 | { | 1019 | { |
1010 | struct cgroup *cgrp = &root->top_cgroup; | 1020 | struct cgroup *cgrp = &root->top_cgroup; |
1011 | struct cgroup_subsys *ss; | 1021 | struct cgroup_subsys *ss; |
1012 | int i; | 1022 | unsigned long pinned = 0; |
1023 | int i, ret; | ||
1013 | 1024 | ||
1014 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); | 1025 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); |
1015 | BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); | 1026 | BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); |
1016 | 1027 | ||
1017 | /* Check that any added subsystems are currently free */ | 1028 | /* Check that any added subsystems are currently free */ |
1018 | for_each_subsys(ss, i) { | 1029 | for_each_subsys(ss, i) { |
1019 | unsigned long bit = 1UL << i; | 1030 | if (!(added_mask & (1 << i))) |
1020 | |||
1021 | if (!(bit & added_mask)) | ||
1022 | continue; | 1031 | continue; |
1023 | 1032 | ||
1033 | /* is the subsystem mounted elsewhere? */ | ||
1024 | if (ss->root != &cgroup_dummy_root) { | 1034 | if (ss->root != &cgroup_dummy_root) { |
1025 | /* Subsystem isn't free */ | 1035 | ret = -EBUSY; |
1026 | return -EBUSY; | 1036 | goto out_put; |
1037 | } | ||
1038 | |||
1039 | /* pin the module */ | ||
1040 | if (!try_module_get(ss->module)) { | ||
1041 | ret = -ENOENT; | ||
1042 | goto out_put; | ||
1027 | } | 1043 | } |
1044 | pinned |= 1 << i; | ||
1028 | } | 1045 | } |
1029 | 1046 | ||
1030 | /* Currently we don't handle adding/removing subsystems when | 1047 | /* subsys could be missing if unloaded between parsing and here */ |
1031 | * any child cgroups exist. This is theoretically supportable | 1048 | if (added_mask != pinned) { |
1032 | * but involves complex error handling, so it's being left until | 1049 | ret = -ENOENT; |
1033 | * later */ | 1050 | goto out_put; |
1034 | if (root->number_of_cgroups > 1) | 1051 | } |
1035 | return -EBUSY; | 1052 | |
1053 | ret = cgroup_populate_dir(cgrp, added_mask); | ||
1054 | if (ret) | ||
1055 | goto out_put; | ||
1056 | |||
1057 | /* | ||
1058 | * Nothing can fail from this point on. Remove files for the | ||
1059 | * removed subsystems and rebind each subsystem. | ||
1060 | */ | ||
1061 | cgroup_clear_dir(cgrp, removed_mask); | ||
1036 | 1062 | ||
1037 | /* Process each subsystem */ | ||
1038 | for_each_subsys(ss, i) { | 1063 | for_each_subsys(ss, i) { |
1039 | unsigned long bit = 1UL << i; | 1064 | unsigned long bit = 1UL << i; |
1040 | 1065 | ||
1041 | if (bit & added_mask) { | 1066 | if (bit & added_mask) { |
1042 | /* We're binding this subsystem to this hierarchy */ | 1067 | /* We're binding this subsystem to this hierarchy */ |
1043 | BUG_ON(cgrp->subsys[i]); | 1068 | BUG_ON(cgroup_css(cgrp, ss)); |
1044 | BUG_ON(!cgroup_dummy_top->subsys[i]); | 1069 | BUG_ON(!cgroup_css(cgroup_dummy_top, ss)); |
1045 | BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); | 1070 | BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top); |
1071 | |||
1072 | rcu_assign_pointer(cgrp->subsys[i], | ||
1073 | cgroup_css(cgroup_dummy_top, ss)); | ||
1074 | cgroup_css(cgrp, ss)->cgroup = cgrp; | ||
1046 | 1075 | ||
1047 | cgrp->subsys[i] = cgroup_dummy_top->subsys[i]; | ||
1048 | cgrp->subsys[i]->cgroup = cgrp; | ||
1049 | list_move(&ss->sibling, &root->subsys_list); | 1076 | list_move(&ss->sibling, &root->subsys_list); |
1050 | ss->root = root; | 1077 | ss->root = root; |
1051 | if (ss->bind) | 1078 | if (ss->bind) |
1052 | ss->bind(cgrp); | 1079 | ss->bind(cgroup_css(cgrp, ss)); |
1053 | 1080 | ||
1054 | /* refcount was already taken, and we're keeping it */ | 1081 | /* refcount was already taken, and we're keeping it */ |
1055 | root->subsys_mask |= bit; | 1082 | root->subsys_mask |= bit; |
1056 | } else if (bit & removed_mask) { | 1083 | } else if (bit & removed_mask) { |
1057 | /* We're removing this subsystem */ | 1084 | /* We're removing this subsystem */ |
1058 | BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); | 1085 | BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss)); |
1059 | BUG_ON(cgrp->subsys[i]->cgroup != cgrp); | 1086 | BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp); |
1060 | 1087 | ||
1061 | if (ss->bind) | 1088 | if (ss->bind) |
1062 | ss->bind(cgroup_dummy_top); | 1089 | ss->bind(cgroup_css(cgroup_dummy_top, ss)); |
1063 | cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; | 1090 | |
1064 | cgrp->subsys[i] = NULL; | 1091 | cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; |
1092 | RCU_INIT_POINTER(cgrp->subsys[i], NULL); | ||
1093 | |||
1065 | cgroup_subsys[i]->root = &cgroup_dummy_root; | 1094 | cgroup_subsys[i]->root = &cgroup_dummy_root; |
1066 | list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); | 1095 | list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); |
1067 | 1096 | ||
1068 | /* subsystem is now free - drop reference on module */ | 1097 | /* subsystem is now free - drop reference on module */ |
1069 | module_put(ss->module); | 1098 | module_put(ss->module); |
1070 | root->subsys_mask &= ~bit; | 1099 | root->subsys_mask &= ~bit; |
1071 | } else if (bit & root->subsys_mask) { | ||
1072 | /* Subsystem state should already exist */ | ||
1073 | BUG_ON(!cgrp->subsys[i]); | ||
1074 | /* | ||
1075 | * a refcount was taken, but we already had one, so | ||
1076 | * drop the extra reference. | ||
1077 | */ | ||
1078 | module_put(ss->module); | ||
1079 | #ifdef CONFIG_MODULE_UNLOAD | ||
1080 | BUG_ON(ss->module && !module_refcount(ss->module)); | ||
1081 | #endif | ||
1082 | } else { | ||
1083 | /* Subsystem state shouldn't exist */ | ||
1084 | BUG_ON(cgrp->subsys[i]); | ||
1085 | } | 1100 | } |
1086 | } | 1101 | } |
1087 | 1102 | ||
@@ -1092,6 +1107,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
1092 | root->flags |= CGRP_ROOT_SUBSYS_BOUND; | 1107 | root->flags |= CGRP_ROOT_SUBSYS_BOUND; |
1093 | 1108 | ||
1094 | return 0; | 1109 | return 0; |
1110 | |||
1111 | out_put: | ||
1112 | for_each_subsys(ss, i) | ||
1113 | if (pinned & (1 << i)) | ||
1114 | module_put(ss->module); | ||
1115 | return ret; | ||
1095 | } | 1116 | } |
1096 | 1117 | ||
1097 | static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) | 1118 | static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) |
@@ -1142,7 +1163,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1142 | char *token, *o = data; | 1163 | char *token, *o = data; |
1143 | bool all_ss = false, one_ss = false; | 1164 | bool all_ss = false, one_ss = false; |
1144 | unsigned long mask = (unsigned long)-1; | 1165 | unsigned long mask = (unsigned long)-1; |
1145 | bool module_pin_failed = false; | ||
1146 | struct cgroup_subsys *ss; | 1166 | struct cgroup_subsys *ss; |
1147 | int i; | 1167 | int i; |
1148 | 1168 | ||
@@ -1285,52 +1305,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1285 | if (!opts->subsys_mask && !opts->name) | 1305 | if (!opts->subsys_mask && !opts->name) |
1286 | return -EINVAL; | 1306 | return -EINVAL; |
1287 | 1307 | ||
1288 | /* | ||
1289 | * Grab references on all the modules we'll need, so the subsystems | ||
1290 | * don't dance around before rebind_subsystems attaches them. This may | ||
1291 | * take duplicate reference counts on a subsystem that's already used, | ||
1292 | * but rebind_subsystems handles this case. | ||
1293 | */ | ||
1294 | for_each_subsys(ss, i) { | ||
1295 | if (!(opts->subsys_mask & (1UL << i))) | ||
1296 | continue; | ||
1297 | if (!try_module_get(cgroup_subsys[i]->module)) { | ||
1298 | module_pin_failed = true; | ||
1299 | break; | ||
1300 | } | ||
1301 | } | ||
1302 | if (module_pin_failed) { | ||
1303 | /* | ||
1304 | * oops, one of the modules was going away. this means that we | ||
1305 | * raced with a module_delete call, and to the user this is | ||
1306 | * essentially a "subsystem doesn't exist" case. | ||
1307 | */ | ||
1308 | for (i--; i >= 0; i--) { | ||
1309 | /* drop refcounts only on the ones we took */ | ||
1310 | unsigned long bit = 1UL << i; | ||
1311 | |||
1312 | if (!(bit & opts->subsys_mask)) | ||
1313 | continue; | ||
1314 | module_put(cgroup_subsys[i]->module); | ||
1315 | } | ||
1316 | return -ENOENT; | ||
1317 | } | ||
1318 | |||
1319 | return 0; | 1308 | return 0; |
1320 | } | 1309 | } |
1321 | 1310 | ||
1322 | static void drop_parsed_module_refcounts(unsigned long subsys_mask) | ||
1323 | { | ||
1324 | struct cgroup_subsys *ss; | ||
1325 | int i; | ||
1326 | |||
1327 | mutex_lock(&cgroup_mutex); | ||
1328 | for_each_subsys(ss, i) | ||
1329 | if (subsys_mask & (1UL << i)) | ||
1330 | module_put(cgroup_subsys[i]->module); | ||
1331 | mutex_unlock(&cgroup_mutex); | ||
1332 | } | ||
1333 | |||
1334 | static int cgroup_remount(struct super_block *sb, int *flags, char *data) | 1311 | static int cgroup_remount(struct super_block *sb, int *flags, char *data) |
1335 | { | 1312 | { |
1336 | int ret = 0; | 1313 | int ret = 0; |
@@ -1370,22 +1347,15 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1370 | goto out_unlock; | 1347 | goto out_unlock; |
1371 | } | 1348 | } |
1372 | 1349 | ||
1373 | /* | 1350 | /* remounting is not allowed for populated hierarchies */ |
1374 | * Clear out the files of subsystems that should be removed, do | 1351 | if (root->number_of_cgroups > 1) { |
1375 | * this before rebind_subsystems, since rebind_subsystems may | 1352 | ret = -EBUSY; |
1376 | * change this hierarchy's subsys_list. | ||
1377 | */ | ||
1378 | cgroup_clear_directory(cgrp->dentry, false, removed_mask); | ||
1379 | |||
1380 | ret = rebind_subsystems(root, added_mask, removed_mask); | ||
1381 | if (ret) { | ||
1382 | /* rebind_subsystems failed, re-populate the removed files */ | ||
1383 | cgroup_populate_dir(cgrp, false, removed_mask); | ||
1384 | goto out_unlock; | 1353 | goto out_unlock; |
1385 | } | 1354 | } |
1386 | 1355 | ||
1387 | /* re-populate subsystem files */ | 1356 | ret = rebind_subsystems(root, added_mask, removed_mask); |
1388 | cgroup_populate_dir(cgrp, false, added_mask); | 1357 | if (ret) |
1358 | goto out_unlock; | ||
1389 | 1359 | ||
1390 | if (opts.release_agent) | 1360 | if (opts.release_agent) |
1391 | strcpy(root->release_agent_path, opts.release_agent); | 1361 | strcpy(root->release_agent_path, opts.release_agent); |
@@ -1395,8 +1365,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1395 | mutex_unlock(&cgroup_root_mutex); | 1365 | mutex_unlock(&cgroup_root_mutex); |
1396 | mutex_unlock(&cgroup_mutex); | 1366 | mutex_unlock(&cgroup_mutex); |
1397 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | 1367 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); |
1398 | if (ret) | ||
1399 | drop_parsed_module_refcounts(opts.subsys_mask); | ||
1400 | return ret; | 1368 | return ret; |
1401 | } | 1369 | } |
1402 | 1370 | ||
@@ -1416,6 +1384,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1416 | INIT_LIST_HEAD(&cgrp->release_list); | 1384 | INIT_LIST_HEAD(&cgrp->release_list); |
1417 | INIT_LIST_HEAD(&cgrp->pidlists); | 1385 | INIT_LIST_HEAD(&cgrp->pidlists); |
1418 | mutex_init(&cgrp->pidlist_mutex); | 1386 | mutex_init(&cgrp->pidlist_mutex); |
1387 | cgrp->dummy_css.cgroup = cgrp; | ||
1419 | INIT_LIST_HEAD(&cgrp->event_list); | 1388 | INIT_LIST_HEAD(&cgrp->event_list); |
1420 | spin_lock_init(&cgrp->event_list_lock); | 1389 | spin_lock_init(&cgrp->event_list_lock); |
1421 | simple_xattrs_init(&cgrp->xattrs); | 1390 | simple_xattrs_init(&cgrp->xattrs); |
@@ -1431,6 +1400,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) | |||
1431 | cgrp->root = root; | 1400 | cgrp->root = root; |
1432 | RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); | 1401 | RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); |
1433 | init_cgroup_housekeeping(cgrp); | 1402 | init_cgroup_housekeeping(cgrp); |
1403 | idr_init(&root->cgroup_idr); | ||
1434 | } | 1404 | } |
1435 | 1405 | ||
1436 | static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) | 1406 | static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) |
@@ -1503,7 +1473,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) | |||
1503 | */ | 1473 | */ |
1504 | root->subsys_mask = opts->subsys_mask; | 1474 | root->subsys_mask = opts->subsys_mask; |
1505 | root->flags = opts->flags; | 1475 | root->flags = opts->flags; |
1506 | ida_init(&root->cgroup_ida); | ||
1507 | if (opts->release_agent) | 1476 | if (opts->release_agent) |
1508 | strcpy(root->release_agent_path, opts->release_agent); | 1477 | strcpy(root->release_agent_path, opts->release_agent); |
1509 | if (opts->name) | 1478 | if (opts->name) |
@@ -1519,7 +1488,7 @@ static void cgroup_free_root(struct cgroupfs_root *root) | |||
1519 | /* hierarhcy ID shoulid already have been released */ | 1488 | /* hierarhcy ID shoulid already have been released */ |
1520 | WARN_ON_ONCE(root->hierarchy_id); | 1489 | WARN_ON_ONCE(root->hierarchy_id); |
1521 | 1490 | ||
1522 | ida_destroy(&root->cgroup_ida); | 1491 | idr_destroy(&root->cgroup_idr); |
1523 | kfree(root); | 1492 | kfree(root); |
1524 | } | 1493 | } |
1525 | } | 1494 | } |
@@ -1584,7 +1553,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1584 | int ret = 0; | 1553 | int ret = 0; |
1585 | struct super_block *sb; | 1554 | struct super_block *sb; |
1586 | struct cgroupfs_root *new_root; | 1555 | struct cgroupfs_root *new_root; |
1556 | struct list_head tmp_links; | ||
1587 | struct inode *inode; | 1557 | struct inode *inode; |
1558 | const struct cred *cred; | ||
1588 | 1559 | ||
1589 | /* First find the desired set of subsystems */ | 1560 | /* First find the desired set of subsystems */ |
1590 | mutex_lock(&cgroup_mutex); | 1561 | mutex_lock(&cgroup_mutex); |
@@ -1600,7 +1571,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1600 | new_root = cgroup_root_from_opts(&opts); | 1571 | new_root = cgroup_root_from_opts(&opts); |
1601 | if (IS_ERR(new_root)) { | 1572 | if (IS_ERR(new_root)) { |
1602 | ret = PTR_ERR(new_root); | 1573 | ret = PTR_ERR(new_root); |
1603 | goto drop_modules; | 1574 | goto out_err; |
1604 | } | 1575 | } |
1605 | opts.new_root = new_root; | 1576 | opts.new_root = new_root; |
1606 | 1577 | ||
@@ -1609,17 +1580,15 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1609 | if (IS_ERR(sb)) { | 1580 | if (IS_ERR(sb)) { |
1610 | ret = PTR_ERR(sb); | 1581 | ret = PTR_ERR(sb); |
1611 | cgroup_free_root(opts.new_root); | 1582 | cgroup_free_root(opts.new_root); |
1612 | goto drop_modules; | 1583 | goto out_err; |
1613 | } | 1584 | } |
1614 | 1585 | ||
1615 | root = sb->s_fs_info; | 1586 | root = sb->s_fs_info; |
1616 | BUG_ON(!root); | 1587 | BUG_ON(!root); |
1617 | if (root == opts.new_root) { | 1588 | if (root == opts.new_root) { |
1618 | /* We used the new root structure, so this is a new hierarchy */ | 1589 | /* We used the new root structure, so this is a new hierarchy */ |
1619 | struct list_head tmp_links; | ||
1620 | struct cgroup *root_cgrp = &root->top_cgroup; | 1590 | struct cgroup *root_cgrp = &root->top_cgroup; |
1621 | struct cgroupfs_root *existing_root; | 1591 | struct cgroupfs_root *existing_root; |
1622 | const struct cred *cred; | ||
1623 | int i; | 1592 | int i; |
1624 | struct css_set *cset; | 1593 | struct css_set *cset; |
1625 | 1594 | ||
@@ -1634,6 +1603,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1634 | mutex_lock(&cgroup_mutex); | 1603 | mutex_lock(&cgroup_mutex); |
1635 | mutex_lock(&cgroup_root_mutex); | 1604 | mutex_lock(&cgroup_root_mutex); |
1636 | 1605 | ||
1606 | root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, | ||
1607 | 0, 1, GFP_KERNEL); | ||
1608 | if (root_cgrp->id < 0) | ||
1609 | goto unlock_drop; | ||
1610 | |||
1637 | /* Check for name clashes with existing mounts */ | 1611 | /* Check for name clashes with existing mounts */ |
1638 | ret = -EBUSY; | 1612 | ret = -EBUSY; |
1639 | if (strlen(root->name)) | 1613 | if (strlen(root->name)) |
@@ -1657,26 +1631,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1657 | if (ret) | 1631 | if (ret) |
1658 | goto unlock_drop; | 1632 | goto unlock_drop; |
1659 | 1633 | ||
1634 | sb->s_root->d_fsdata = root_cgrp; | ||
1635 | root_cgrp->dentry = sb->s_root; | ||
1636 | |||
1637 | /* | ||
1638 | * We're inside get_sb() and will call lookup_one_len() to | ||
1639 | * create the root files, which doesn't work if SELinux is | ||
1640 | * in use. The following cred dancing somehow works around | ||
1641 | * it. See 2ce9738ba ("cgroupfs: use init_cred when | ||
1642 | * populating new cgroupfs mount") for more details. | ||
1643 | */ | ||
1644 | cred = override_creds(&init_cred); | ||
1645 | |||
1646 | ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); | ||
1647 | if (ret) | ||
1648 | goto rm_base_files; | ||
1649 | |||
1660 | ret = rebind_subsystems(root, root->subsys_mask, 0); | 1650 | ret = rebind_subsystems(root, root->subsys_mask, 0); |
1661 | if (ret == -EBUSY) { | 1651 | if (ret) |
1662 | free_cgrp_cset_links(&tmp_links); | 1652 | goto rm_base_files; |
1663 | goto unlock_drop; | 1653 | |
1664 | } | 1654 | revert_creds(cred); |
1655 | |||
1665 | /* | 1656 | /* |
1666 | * There must be no failure case after here, since rebinding | 1657 | * There must be no failure case after here, since rebinding |
1667 | * takes care of subsystems' refcounts, which are explicitly | 1658 | * takes care of subsystems' refcounts, which are explicitly |
1668 | * dropped in the failure exit path. | 1659 | * dropped in the failure exit path. |
1669 | */ | 1660 | */ |
1670 | 1661 | ||
1671 | /* EBUSY should be the only error here */ | ||
1672 | BUG_ON(ret); | ||
1673 | |||
1674 | list_add(&root->root_list, &cgroup_roots); | 1662 | list_add(&root->root_list, &cgroup_roots); |
1675 | cgroup_root_count++; | 1663 | cgroup_root_count++; |
1676 | 1664 | ||
1677 | sb->s_root->d_fsdata = root_cgrp; | ||
1678 | root->top_cgroup.dentry = sb->s_root; | ||
1679 | |||
1680 | /* Link the top cgroup in this hierarchy into all | 1665 | /* Link the top cgroup in this hierarchy into all |
1681 | * the css_set objects */ | 1666 | * the css_set objects */ |
1682 | write_lock(&css_set_lock); | 1667 | write_lock(&css_set_lock); |
@@ -1689,9 +1674,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1689 | BUG_ON(!list_empty(&root_cgrp->children)); | 1674 | BUG_ON(!list_empty(&root_cgrp->children)); |
1690 | BUG_ON(root->number_of_cgroups != 1); | 1675 | BUG_ON(root->number_of_cgroups != 1); |
1691 | 1676 | ||
1692 | cred = override_creds(&init_cred); | ||
1693 | cgroup_populate_dir(root_cgrp, true, root->subsys_mask); | ||
1694 | revert_creds(cred); | ||
1695 | mutex_unlock(&cgroup_root_mutex); | 1677 | mutex_unlock(&cgroup_root_mutex); |
1696 | mutex_unlock(&cgroup_mutex); | 1678 | mutex_unlock(&cgroup_mutex); |
1697 | mutex_unlock(&inode->i_mutex); | 1679 | mutex_unlock(&inode->i_mutex); |
@@ -1711,15 +1693,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1711 | pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); | 1693 | pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); |
1712 | } | 1694 | } |
1713 | } | 1695 | } |
1714 | |||
1715 | /* no subsys rebinding, so refcounts don't change */ | ||
1716 | drop_parsed_module_refcounts(opts.subsys_mask); | ||
1717 | } | 1696 | } |
1718 | 1697 | ||
1719 | kfree(opts.release_agent); | 1698 | kfree(opts.release_agent); |
1720 | kfree(opts.name); | 1699 | kfree(opts.name); |
1721 | return dget(sb->s_root); | 1700 | return dget(sb->s_root); |
1722 | 1701 | ||
1702 | rm_base_files: | ||
1703 | free_cgrp_cset_links(&tmp_links); | ||
1704 | cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); | ||
1705 | revert_creds(cred); | ||
1723 | unlock_drop: | 1706 | unlock_drop: |
1724 | cgroup_exit_root_id(root); | 1707 | cgroup_exit_root_id(root); |
1725 | mutex_unlock(&cgroup_root_mutex); | 1708 | mutex_unlock(&cgroup_root_mutex); |
@@ -1727,8 +1710,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1727 | mutex_unlock(&inode->i_mutex); | 1710 | mutex_unlock(&inode->i_mutex); |
1728 | drop_new_super: | 1711 | drop_new_super: |
1729 | deactivate_locked_super(sb); | 1712 | deactivate_locked_super(sb); |
1730 | drop_modules: | ||
1731 | drop_parsed_module_refcounts(opts.subsys_mask); | ||
1732 | out_err: | 1713 | out_err: |
1733 | kfree(opts.release_agent); | 1714 | kfree(opts.release_agent); |
1734 | kfree(opts.name); | 1715 | kfree(opts.name); |
@@ -1746,6 +1727,7 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
1746 | BUG_ON(root->number_of_cgroups != 1); | 1727 | BUG_ON(root->number_of_cgroups != 1); |
1747 | BUG_ON(!list_empty(&cgrp->children)); | 1728 | BUG_ON(!list_empty(&cgrp->children)); |
1748 | 1729 | ||
1730 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); | ||
1749 | mutex_lock(&cgroup_mutex); | 1731 | mutex_lock(&cgroup_mutex); |
1750 | mutex_lock(&cgroup_root_mutex); | 1732 | mutex_lock(&cgroup_root_mutex); |
1751 | 1733 | ||
@@ -1778,6 +1760,7 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
1778 | 1760 | ||
1779 | mutex_unlock(&cgroup_root_mutex); | 1761 | mutex_unlock(&cgroup_root_mutex); |
1780 | mutex_unlock(&cgroup_mutex); | 1762 | mutex_unlock(&cgroup_mutex); |
1763 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | ||
1781 | 1764 | ||
1782 | simple_xattrs_free(&cgrp->xattrs); | 1765 | simple_xattrs_free(&cgrp->xattrs); |
1783 | 1766 | ||
@@ -1889,7 +1872,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path); | |||
1889 | struct task_and_cgroup { | 1872 | struct task_and_cgroup { |
1890 | struct task_struct *task; | 1873 | struct task_struct *task; |
1891 | struct cgroup *cgrp; | 1874 | struct cgroup *cgrp; |
1892 | struct css_set *cg; | 1875 | struct css_set *cset; |
1893 | }; | 1876 | }; |
1894 | 1877 | ||
1895 | struct cgroup_taskset { | 1878 | struct cgroup_taskset { |
@@ -1939,18 +1922,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) | |||
1939 | EXPORT_SYMBOL_GPL(cgroup_taskset_next); | 1922 | EXPORT_SYMBOL_GPL(cgroup_taskset_next); |
1940 | 1923 | ||
1941 | /** | 1924 | /** |
1942 | * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task | 1925 | * cgroup_taskset_cur_css - return the matching css for the current task |
1943 | * @tset: taskset of interest | 1926 | * @tset: taskset of interest |
1927 | * @subsys_id: the ID of the target subsystem | ||
1944 | * | 1928 | * |
1945 | * Return the cgroup for the current (last returned) task of @tset. This | 1929 | * Return the css for the current (last returned) task of @tset for |
1946 | * function must be preceded by either cgroup_taskset_first() or | 1930 | * subsystem specified by @subsys_id. This function must be preceded by |
1947 | * cgroup_taskset_next(). | 1931 | * either cgroup_taskset_first() or cgroup_taskset_next(). |
1948 | */ | 1932 | */ |
1949 | struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) | 1933 | struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, |
1934 | int subsys_id) | ||
1950 | { | 1935 | { |
1951 | return tset->cur_cgrp; | 1936 | return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]); |
1952 | } | 1937 | } |
1953 | EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); | 1938 | EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); |
1954 | 1939 | ||
1955 | /** | 1940 | /** |
1956 | * cgroup_taskset_size - return the number of tasks in taskset | 1941 | * cgroup_taskset_size - return the number of tasks in taskset |
@@ -2054,7 +2039,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2054 | 2039 | ||
2055 | /* @tsk either already exited or can't exit until the end */ | 2040 | /* @tsk either already exited or can't exit until the end */ |
2056 | if (tsk->flags & PF_EXITING) | 2041 | if (tsk->flags & PF_EXITING) |
2057 | continue; | 2042 | goto next; |
2058 | 2043 | ||
2059 | /* as per above, nr_threads may decrease, but not increase. */ | 2044 | /* as per above, nr_threads may decrease, but not increase. */ |
2060 | BUG_ON(i >= group_size); | 2045 | BUG_ON(i >= group_size); |
@@ -2062,7 +2047,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2062 | ent.cgrp = task_cgroup_from_root(tsk, root); | 2047 | ent.cgrp = task_cgroup_from_root(tsk, root); |
2063 | /* nothing to do if this task is already in the cgroup */ | 2048 | /* nothing to do if this task is already in the cgroup */ |
2064 | if (ent.cgrp == cgrp) | 2049 | if (ent.cgrp == cgrp) |
2065 | continue; | 2050 | goto next; |
2066 | /* | 2051 | /* |
2067 | * saying GFP_ATOMIC has no effect here because we did prealloc | 2052 | * saying GFP_ATOMIC has no effect here because we did prealloc |
2068 | * earlier, but it's good form to communicate our expectations. | 2053 | * earlier, but it's good form to communicate our expectations. |
@@ -2070,7 +2055,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2070 | retval = flex_array_put(group, i, &ent, GFP_ATOMIC); | 2055 | retval = flex_array_put(group, i, &ent, GFP_ATOMIC); |
2071 | BUG_ON(retval != 0); | 2056 | BUG_ON(retval != 0); |
2072 | i++; | 2057 | i++; |
2073 | 2058 | next: | |
2074 | if (!threadgroup) | 2059 | if (!threadgroup) |
2075 | break; | 2060 | break; |
2076 | } while_each_thread(leader, tsk); | 2061 | } while_each_thread(leader, tsk); |
@@ -2089,8 +2074,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2089 | * step 1: check that we can legitimately attach to the cgroup. | 2074 | * step 1: check that we can legitimately attach to the cgroup. |
2090 | */ | 2075 | */ |
2091 | for_each_root_subsys(root, ss) { | 2076 | for_each_root_subsys(root, ss) { |
2077 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); | ||
2078 | |||
2092 | if (ss->can_attach) { | 2079 | if (ss->can_attach) { |
2093 | retval = ss->can_attach(cgrp, &tset); | 2080 | retval = ss->can_attach(css, &tset); |
2094 | if (retval) { | 2081 | if (retval) { |
2095 | failed_ss = ss; | 2082 | failed_ss = ss; |
2096 | goto out_cancel_attach; | 2083 | goto out_cancel_attach; |
@@ -2107,8 +2094,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2107 | 2094 | ||
2108 | tc = flex_array_get(group, i); | 2095 | tc = flex_array_get(group, i); |
2109 | old_cset = task_css_set(tc->task); | 2096 | old_cset = task_css_set(tc->task); |
2110 | tc->cg = find_css_set(old_cset, cgrp); | 2097 | tc->cset = find_css_set(old_cset, cgrp); |
2111 | if (!tc->cg) { | 2098 | if (!tc->cset) { |
2112 | retval = -ENOMEM; | 2099 | retval = -ENOMEM; |
2113 | goto out_put_css_set_refs; | 2100 | goto out_put_css_set_refs; |
2114 | } | 2101 | } |
@@ -2121,7 +2108,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2121 | */ | 2108 | */ |
2122 | for (i = 0; i < group_size; i++) { | 2109 | for (i = 0; i < group_size; i++) { |
2123 | tc = flex_array_get(group, i); | 2110 | tc = flex_array_get(group, i); |
2124 | cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); | 2111 | cgroup_task_migrate(tc->cgrp, tc->task, tc->cset); |
2125 | } | 2112 | } |
2126 | /* nothing is sensitive to fork() after this point. */ | 2113 | /* nothing is sensitive to fork() after this point. */ |
2127 | 2114 | ||
@@ -2129,8 +2116,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2129 | * step 4: do subsystem attach callbacks. | 2116 | * step 4: do subsystem attach callbacks. |
2130 | */ | 2117 | */ |
2131 | for_each_root_subsys(root, ss) { | 2118 | for_each_root_subsys(root, ss) { |
2119 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); | ||
2120 | |||
2132 | if (ss->attach) | 2121 | if (ss->attach) |
2133 | ss->attach(cgrp, &tset); | 2122 | ss->attach(css, &tset); |
2134 | } | 2123 | } |
2135 | 2124 | ||
2136 | /* | 2125 | /* |
@@ -2141,18 +2130,20 @@ out_put_css_set_refs: | |||
2141 | if (retval) { | 2130 | if (retval) { |
2142 | for (i = 0; i < group_size; i++) { | 2131 | for (i = 0; i < group_size; i++) { |
2143 | tc = flex_array_get(group, i); | 2132 | tc = flex_array_get(group, i); |
2144 | if (!tc->cg) | 2133 | if (!tc->cset) |
2145 | break; | 2134 | break; |
2146 | put_css_set(tc->cg); | 2135 | put_css_set(tc->cset); |
2147 | } | 2136 | } |
2148 | } | 2137 | } |
2149 | out_cancel_attach: | 2138 | out_cancel_attach: |
2150 | if (retval) { | 2139 | if (retval) { |
2151 | for_each_root_subsys(root, ss) { | 2140 | for_each_root_subsys(root, ss) { |
2141 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); | ||
2142 | |||
2152 | if (ss == failed_ss) | 2143 | if (ss == failed_ss) |
2153 | break; | 2144 | break; |
2154 | if (ss->cancel_attach) | 2145 | if (ss->cancel_attach) |
2155 | ss->cancel_attach(cgrp, &tset); | 2146 | ss->cancel_attach(css, &tset); |
2156 | } | 2147 | } |
2157 | } | 2148 | } |
2158 | out_free_group_list: | 2149 | out_free_group_list: |
@@ -2253,9 +2244,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | |||
2253 | 2244 | ||
2254 | mutex_lock(&cgroup_mutex); | 2245 | mutex_lock(&cgroup_mutex); |
2255 | for_each_active_root(root) { | 2246 | for_each_active_root(root) { |
2256 | struct cgroup *from_cg = task_cgroup_from_root(from, root); | 2247 | struct cgroup *from_cgrp = task_cgroup_from_root(from, root); |
2257 | 2248 | ||
2258 | retval = cgroup_attach_task(from_cg, tsk, false); | 2249 | retval = cgroup_attach_task(from_cgrp, tsk, false); |
2259 | if (retval) | 2250 | if (retval) |
2260 | break; | 2251 | break; |
2261 | } | 2252 | } |
@@ -2265,34 +2256,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | |||
2265 | } | 2256 | } |
2266 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); | 2257 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); |
2267 | 2258 | ||
2268 | static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) | 2259 | static int cgroup_tasks_write(struct cgroup_subsys_state *css, |
2260 | struct cftype *cft, u64 pid) | ||
2269 | { | 2261 | { |
2270 | return attach_task_by_pid(cgrp, pid, false); | 2262 | return attach_task_by_pid(css->cgroup, pid, false); |
2271 | } | 2263 | } |
2272 | 2264 | ||
2273 | static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) | 2265 | static int cgroup_procs_write(struct cgroup_subsys_state *css, |
2266 | struct cftype *cft, u64 tgid) | ||
2274 | { | 2267 | { |
2275 | return attach_task_by_pid(cgrp, tgid, true); | 2268 | return attach_task_by_pid(css->cgroup, tgid, true); |
2276 | } | 2269 | } |
2277 | 2270 | ||
2278 | static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, | 2271 | static int cgroup_release_agent_write(struct cgroup_subsys_state *css, |
2279 | const char *buffer) | 2272 | struct cftype *cft, const char *buffer) |
2280 | { | 2273 | { |
2281 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); | 2274 | BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); |
2282 | if (strlen(buffer) >= PATH_MAX) | 2275 | if (strlen(buffer) >= PATH_MAX) |
2283 | return -EINVAL; | 2276 | return -EINVAL; |
2284 | if (!cgroup_lock_live_group(cgrp)) | 2277 | if (!cgroup_lock_live_group(css->cgroup)) |
2285 | return -ENODEV; | 2278 | return -ENODEV; |
2286 | mutex_lock(&cgroup_root_mutex); | 2279 | mutex_lock(&cgroup_root_mutex); |
2287 | strcpy(cgrp->root->release_agent_path, buffer); | 2280 | strcpy(css->cgroup->root->release_agent_path, buffer); |
2288 | mutex_unlock(&cgroup_root_mutex); | 2281 | mutex_unlock(&cgroup_root_mutex); |
2289 | mutex_unlock(&cgroup_mutex); | 2282 | mutex_unlock(&cgroup_mutex); |
2290 | return 0; | 2283 | return 0; |
2291 | } | 2284 | } |
2292 | 2285 | ||
2293 | static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, | 2286 | static int cgroup_release_agent_show(struct cgroup_subsys_state *css, |
2294 | struct seq_file *seq) | 2287 | struct cftype *cft, struct seq_file *seq) |
2295 | { | 2288 | { |
2289 | struct cgroup *cgrp = css->cgroup; | ||
2290 | |||
2296 | if (!cgroup_lock_live_group(cgrp)) | 2291 | if (!cgroup_lock_live_group(cgrp)) |
2297 | return -ENODEV; | 2292 | return -ENODEV; |
2298 | seq_puts(seq, cgrp->root->release_agent_path); | 2293 | seq_puts(seq, cgrp->root->release_agent_path); |
@@ -2301,20 +2296,20 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, | |||
2301 | return 0; | 2296 | return 0; |
2302 | } | 2297 | } |
2303 | 2298 | ||
2304 | static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, | 2299 | static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, |
2305 | struct seq_file *seq) | 2300 | struct cftype *cft, struct seq_file *seq) |
2306 | { | 2301 | { |
2307 | seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); | 2302 | seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); |
2308 | return 0; | 2303 | return 0; |
2309 | } | 2304 | } |
2310 | 2305 | ||
2311 | /* A buffer size big enough for numbers or short strings */ | 2306 | /* A buffer size big enough for numbers or short strings */ |
2312 | #define CGROUP_LOCAL_BUFFER_SIZE 64 | 2307 | #define CGROUP_LOCAL_BUFFER_SIZE 64 |
2313 | 2308 | ||
2314 | static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, | 2309 | static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, |
2315 | struct file *file, | 2310 | struct cftype *cft, struct file *file, |
2316 | const char __user *userbuf, | 2311 | const char __user *userbuf, size_t nbytes, |
2317 | size_t nbytes, loff_t *unused_ppos) | 2312 | loff_t *unused_ppos) |
2318 | { | 2313 | { |
2319 | char buffer[CGROUP_LOCAL_BUFFER_SIZE]; | 2314 | char buffer[CGROUP_LOCAL_BUFFER_SIZE]; |
2320 | int retval = 0; | 2315 | int retval = 0; |
@@ -2332,22 +2327,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, | |||
2332 | u64 val = simple_strtoull(strstrip(buffer), &end, 0); | 2327 | u64 val = simple_strtoull(strstrip(buffer), &end, 0); |
2333 | if (*end) | 2328 | if (*end) |
2334 | return -EINVAL; | 2329 | return -EINVAL; |
2335 | retval = cft->write_u64(cgrp, cft, val); | 2330 | retval = cft->write_u64(css, cft, val); |
2336 | } else { | 2331 | } else { |
2337 | s64 val = simple_strtoll(strstrip(buffer), &end, 0); | 2332 | s64 val = simple_strtoll(strstrip(buffer), &end, 0); |
2338 | if (*end) | 2333 | if (*end) |
2339 | return -EINVAL; | 2334 | return -EINVAL; |
2340 | retval = cft->write_s64(cgrp, cft, val); | 2335 | retval = cft->write_s64(css, cft, val); |
2341 | } | 2336 | } |
2342 | if (!retval) | 2337 | if (!retval) |
2343 | retval = nbytes; | 2338 | retval = nbytes; |
2344 | return retval; | 2339 | return retval; |
2345 | } | 2340 | } |
2346 | 2341 | ||
2347 | static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, | 2342 | static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, |
2348 | struct file *file, | 2343 | struct cftype *cft, struct file *file, |
2349 | const char __user *userbuf, | 2344 | const char __user *userbuf, size_t nbytes, |
2350 | size_t nbytes, loff_t *unused_ppos) | 2345 | loff_t *unused_ppos) |
2351 | { | 2346 | { |
2352 | char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; | 2347 | char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; |
2353 | int retval = 0; | 2348 | int retval = 0; |
@@ -2370,7 +2365,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, | |||
2370 | } | 2365 | } |
2371 | 2366 | ||
2372 | buffer[nbytes] = 0; /* nul-terminate */ | 2367 | buffer[nbytes] = 0; /* nul-terminate */ |
2373 | retval = cft->write_string(cgrp, cft, strstrip(buffer)); | 2368 | retval = cft->write_string(css, cft, strstrip(buffer)); |
2374 | if (!retval) | 2369 | if (!retval) |
2375 | retval = nbytes; | 2370 | retval = nbytes; |
2376 | out: | 2371 | out: |
@@ -2380,65 +2375,60 @@ out: | |||
2380 | } | 2375 | } |
2381 | 2376 | ||
2382 | static ssize_t cgroup_file_write(struct file *file, const char __user *buf, | 2377 | static ssize_t cgroup_file_write(struct file *file, const char __user *buf, |
2383 | size_t nbytes, loff_t *ppos) | 2378 | size_t nbytes, loff_t *ppos) |
2384 | { | 2379 | { |
2380 | struct cfent *cfe = __d_cfe(file->f_dentry); | ||
2385 | struct cftype *cft = __d_cft(file->f_dentry); | 2381 | struct cftype *cft = __d_cft(file->f_dentry); |
2386 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2382 | struct cgroup_subsys_state *css = cfe->css; |
2387 | 2383 | ||
2388 | if (cgroup_is_dead(cgrp)) | ||
2389 | return -ENODEV; | ||
2390 | if (cft->write) | 2384 | if (cft->write) |
2391 | return cft->write(cgrp, cft, file, buf, nbytes, ppos); | 2385 | return cft->write(css, cft, file, buf, nbytes, ppos); |
2392 | if (cft->write_u64 || cft->write_s64) | 2386 | if (cft->write_u64 || cft->write_s64) |
2393 | return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); | 2387 | return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); |
2394 | if (cft->write_string) | 2388 | if (cft->write_string) |
2395 | return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); | 2389 | return cgroup_write_string(css, cft, file, buf, nbytes, ppos); |
2396 | if (cft->trigger) { | 2390 | if (cft->trigger) { |
2397 | int ret = cft->trigger(cgrp, (unsigned int)cft->private); | 2391 | int ret = cft->trigger(css, (unsigned int)cft->private); |
2398 | return ret ? ret : nbytes; | 2392 | return ret ? ret : nbytes; |
2399 | } | 2393 | } |
2400 | return -EINVAL; | 2394 | return -EINVAL; |
2401 | } | 2395 | } |
2402 | 2396 | ||
2403 | static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, | 2397 | static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, |
2404 | struct file *file, | 2398 | struct cftype *cft, struct file *file, |
2405 | char __user *buf, size_t nbytes, | 2399 | char __user *buf, size_t nbytes, loff_t *ppos) |
2406 | loff_t *ppos) | ||
2407 | { | 2400 | { |
2408 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; | 2401 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; |
2409 | u64 val = cft->read_u64(cgrp, cft); | 2402 | u64 val = cft->read_u64(css, cft); |
2410 | int len = sprintf(tmp, "%llu\n", (unsigned long long) val); | 2403 | int len = sprintf(tmp, "%llu\n", (unsigned long long) val); |
2411 | 2404 | ||
2412 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | 2405 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); |
2413 | } | 2406 | } |
2414 | 2407 | ||
2415 | static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, | 2408 | static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, |
2416 | struct file *file, | 2409 | struct cftype *cft, struct file *file, |
2417 | char __user *buf, size_t nbytes, | 2410 | char __user *buf, size_t nbytes, loff_t *ppos) |
2418 | loff_t *ppos) | ||
2419 | { | 2411 | { |
2420 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; | 2412 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; |
2421 | s64 val = cft->read_s64(cgrp, cft); | 2413 | s64 val = cft->read_s64(css, cft); |
2422 | int len = sprintf(tmp, "%lld\n", (long long) val); | 2414 | int len = sprintf(tmp, "%lld\n", (long long) val); |
2423 | 2415 | ||
2424 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | 2416 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); |
2425 | } | 2417 | } |
2426 | 2418 | ||
2427 | static ssize_t cgroup_file_read(struct file *file, char __user *buf, | 2419 | static ssize_t cgroup_file_read(struct file *file, char __user *buf, |
2428 | size_t nbytes, loff_t *ppos) | 2420 | size_t nbytes, loff_t *ppos) |
2429 | { | 2421 | { |
2422 | struct cfent *cfe = __d_cfe(file->f_dentry); | ||
2430 | struct cftype *cft = __d_cft(file->f_dentry); | 2423 | struct cftype *cft = __d_cft(file->f_dentry); |
2431 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2424 | struct cgroup_subsys_state *css = cfe->css; |
2432 | |||
2433 | if (cgroup_is_dead(cgrp)) | ||
2434 | return -ENODEV; | ||
2435 | 2425 | ||
2436 | if (cft->read) | 2426 | if (cft->read) |
2437 | return cft->read(cgrp, cft, file, buf, nbytes, ppos); | 2427 | return cft->read(css, cft, file, buf, nbytes, ppos); |
2438 | if (cft->read_u64) | 2428 | if (cft->read_u64) |
2439 | return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); | 2429 | return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); |
2440 | if (cft->read_s64) | 2430 | if (cft->read_s64) |
2441 | return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); | 2431 | return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); |
2442 | return -EINVAL; | 2432 | return -EINVAL; |
2443 | } | 2433 | } |
2444 | 2434 | ||
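The hunks above switch every per-file handler (write, write_u64/s64, write_string, trigger, read, read_u64/s64) from taking a struct cgroup to taking a struct cgroup_subsys_state, resolved through cfe->css. A minimal sketch of a controller-side handler under the new signature; the "foo" controller, its state struct and the css_foo() helper are illustrative, not part of this patch:

#include <linux/kernel.h>
#include <linux/cgroup.h>

/* Illustrative controller state; only the embedded css is required. */
struct foo_state {
        struct cgroup_subsys_state css;
        u64 shares;
};

static inline struct foo_state *css_foo(struct cgroup_subsys_state *css)
{
        return container_of(css, struct foo_state, css);
}

/* New-style read_u64 handler: the css is passed in directly. */
static u64 foo_shares_read_u64(struct cgroup_subsys_state *css,
                               struct cftype *cft)
{
        return css_foo(css)->shares;
}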
@@ -2447,11 +2437,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, | |||
2447 | * supports string->u64 maps, but can be extended in future. | 2437 | * supports string->u64 maps, but can be extended in future. |
2448 | */ | 2438 | */ |
2449 | 2439 | ||
2450 | struct cgroup_seqfile_state { | ||
2451 | struct cftype *cft; | ||
2452 | struct cgroup *cgroup; | ||
2453 | }; | ||
2454 | |||
2455 | static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) | 2440 | static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) |
2456 | { | 2441 | { |
2457 | struct seq_file *sf = cb->state; | 2442 | struct seq_file *sf = cb->state; |
@@ -2460,69 +2445,86 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) | |||
2460 | 2445 | ||
2461 | static int cgroup_seqfile_show(struct seq_file *m, void *arg) | 2446 | static int cgroup_seqfile_show(struct seq_file *m, void *arg) |
2462 | { | 2447 | { |
2463 | struct cgroup_seqfile_state *state = m->private; | 2448 | struct cfent *cfe = m->private; |
2464 | struct cftype *cft = state->cft; | 2449 | struct cftype *cft = cfe->type; |
2450 | struct cgroup_subsys_state *css = cfe->css; | ||
2451 | |||
2465 | if (cft->read_map) { | 2452 | if (cft->read_map) { |
2466 | struct cgroup_map_cb cb = { | 2453 | struct cgroup_map_cb cb = { |
2467 | .fill = cgroup_map_add, | 2454 | .fill = cgroup_map_add, |
2468 | .state = m, | 2455 | .state = m, |
2469 | }; | 2456 | }; |
2470 | return cft->read_map(state->cgroup, cft, &cb); | 2457 | return cft->read_map(css, cft, &cb); |
2471 | } | 2458 | } |
2472 | return cft->read_seq_string(state->cgroup, cft, m); | 2459 | return cft->read_seq_string(css, cft, m); |
2473 | } | ||
2474 | |||
2475 | static int cgroup_seqfile_release(struct inode *inode, struct file *file) | ||
2476 | { | ||
2477 | struct seq_file *seq = file->private_data; | ||
2478 | kfree(seq->private); | ||
2479 | return single_release(inode, file); | ||
2480 | } | 2460 | } |
2481 | 2461 | ||
2482 | static const struct file_operations cgroup_seqfile_operations = { | 2462 | static const struct file_operations cgroup_seqfile_operations = { |
2483 | .read = seq_read, | 2463 | .read = seq_read, |
2484 | .write = cgroup_file_write, | 2464 | .write = cgroup_file_write, |
2485 | .llseek = seq_lseek, | 2465 | .llseek = seq_lseek, |
2486 | .release = cgroup_seqfile_release, | 2466 | .release = single_release, |
2487 | }; | 2467 | }; |
2488 | 2468 | ||
2489 | static int cgroup_file_open(struct inode *inode, struct file *file) | 2469 | static int cgroup_file_open(struct inode *inode, struct file *file) |
2490 | { | 2470 | { |
2471 | struct cfent *cfe = __d_cfe(file->f_dentry); | ||
2472 | struct cftype *cft = __d_cft(file->f_dentry); | ||
2473 | struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); | ||
2474 | struct cgroup_subsys_state *css; | ||
2491 | int err; | 2475 | int err; |
2492 | struct cftype *cft; | ||
2493 | 2476 | ||
2494 | err = generic_file_open(inode, file); | 2477 | err = generic_file_open(inode, file); |
2495 | if (err) | 2478 | if (err) |
2496 | return err; | 2479 | return err; |
2497 | cft = __d_cft(file->f_dentry); | ||
2498 | 2480 | ||
2499 | if (cft->read_map || cft->read_seq_string) { | 2481 | /* |
2500 | struct cgroup_seqfile_state *state; | 2482 | * If the file belongs to a subsystem, pin the css. Will be |
2483 | * unpinned either on open failure or release. This ensures that | ||
2484 | * @css stays alive for all file operations. | ||
2485 | */ | ||
2486 | rcu_read_lock(); | ||
2487 | css = cgroup_css(cgrp, cft->ss); | ||
2488 | if (cft->ss && !css_tryget(css)) | ||
2489 | css = NULL; | ||
2490 | rcu_read_unlock(); | ||
2501 | 2491 | ||
2502 | state = kzalloc(sizeof(*state), GFP_USER); | 2492 | if (!css) |
2503 | if (!state) | 2493 | return -ENODEV; |
2504 | return -ENOMEM; | ||
2505 | 2494 | ||
2506 | state->cft = cft; | 2495 | /* |
2507 | state->cgroup = __d_cgrp(file->f_dentry->d_parent); | 2496 | * @cfe->css is used by read/write/close to determine the |
2497 | * associated css. @file->private_data would be a better place but | ||
2498 | * that's already used by seqfile. Multiple accessors may use it | ||
2499 | * simultaneously which is okay as the association never changes. | ||
2500 | */ | ||
2501 | WARN_ON_ONCE(cfe->css && cfe->css != css); | ||
2502 | cfe->css = css; | ||
2503 | |||
2504 | if (cft->read_map || cft->read_seq_string) { | ||
2508 | file->f_op = &cgroup_seqfile_operations; | 2505 | file->f_op = &cgroup_seqfile_operations; |
2509 | err = single_open(file, cgroup_seqfile_show, state); | 2506 | err = single_open(file, cgroup_seqfile_show, cfe); |
2510 | if (err < 0) | 2507 | } else if (cft->open) { |
2511 | kfree(state); | ||
2512 | } else if (cft->open) | ||
2513 | err = cft->open(inode, file); | 2508 | err = cft->open(inode, file); |
2514 | else | 2509 | } |
2515 | err = 0; | ||
2516 | 2510 | ||
2511 | if (css->ss && err) | ||
2512 | css_put(css); | ||
2517 | return err; | 2513 | return err; |
2518 | } | 2514 | } |
2519 | 2515 | ||
2520 | static int cgroup_file_release(struct inode *inode, struct file *file) | 2516 | static int cgroup_file_release(struct inode *inode, struct file *file) |
2521 | { | 2517 | { |
2518 | struct cfent *cfe = __d_cfe(file->f_dentry); | ||
2522 | struct cftype *cft = __d_cft(file->f_dentry); | 2519 | struct cftype *cft = __d_cft(file->f_dentry); |
2520 | struct cgroup_subsys_state *css = cfe->css; | ||
2521 | int ret = 0; | ||
2522 | |||
2523 | if (cft->release) | 2523 | if (cft->release) |
2524 | return cft->release(inode, file); | 2524 | ret = cft->release(inode, file); |
2525 | return 0; | 2525 | if (css->ss) |
2526 | css_put(css); | ||
2527 | return ret; | ||
2526 | } | 2528 | } |
2527 | 2529 | ||
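cgroup_file_open() now pins the file's css with css_tryget() under rcu_read_lock(), and cgroup_file_release() drops the reference, so every read/write/trigger handler can dereference cfe->css without taking its own reference. The same try-get-under-RCU idiom in isolation, as a hedged sketch: every "obj", lookup_obj(), obj_tryget(), obj_put() and do_open() name below is hypothetical, standing in for cgroup_css()/css_tryget()/css_put():

static int example_open(struct inode *inode, struct file *file)
{
        struct obj *o;
        int err;

        rcu_read_lock();
        o = lookup_obj(inode);          /* RCU-protected lookup (hypothetical) */
        if (o && !obj_tryget(o))        /* refuse an object that is already dying */
                o = NULL;
        rcu_read_unlock();

        if (!o)
                return -ENODEV;

        err = do_open(file, o);         /* hypothetical per-type open work */
        if (err)
                obj_put(o);             /* unpin on the failure path ... */
        else
                file->private_data = o;
        return err;
}

static int example_release(struct inode *inode, struct file *file)
{
        obj_put(file->private_data);    /* ... or when the file goes away */
        return 0;
}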
2528 | /* | 2530 | /* |
@@ -2736,8 +2738,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) | |||
2736 | return mode; | 2738 | return mode; |
2737 | } | 2739 | } |
2738 | 2740 | ||
2739 | static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, | 2741 | static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) |
2740 | struct cftype *cft) | ||
2741 | { | 2742 | { |
2742 | struct dentry *dir = cgrp->dentry; | 2743 | struct dentry *dir = cgrp->dentry; |
2743 | struct cgroup *parent = __d_cgrp(dir); | 2744 | struct cgroup *parent = __d_cgrp(dir); |
@@ -2747,8 +2748,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2747 | umode_t mode; | 2748 | umode_t mode; |
2748 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; | 2749 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; |
2749 | 2750 | ||
2750 | if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { | 2751 | if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && |
2751 | strcpy(name, subsys->name); | 2752 | !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { |
2753 | strcpy(name, cft->ss->name); | ||
2752 | strcat(name, "."); | 2754 | strcat(name, "."); |
2753 | } | 2755 | } |
2754 | strcat(name, cft->name); | 2756 | strcat(name, cft->name); |
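cgroup_add_file() now derives the subsystem prefix from cft->ss and honours the per-cftype CFTYPE_NO_PREFIX flag in addition to the per-hierarchy CGRP_ROOT_NOPREFIX mount option. A standalone model of the resulting naming rule (the buffer size and helper name are illustrative):

#include <stdio.h>
#include <string.h>

#define MAX_NAME 64     /* illustrative; the kernel sizes this from cftype limits */

/* Mirrors the rule above: "<subsys>.<file>" unless prefixing is suppressed. */
static void cgroup_file_name(char *buf, const char *ss_name,
                             const char *cft_name, int no_prefix)
{
        buf[0] = '\0';
        if (ss_name && !no_prefix) {
                strcat(buf, ss_name);
                strcat(buf, ".");
        }
        strcat(buf, cft_name);
}

int main(void)
{
        char name[MAX_NAME];

        cgroup_file_name(name, "memory", "usage_in_bytes", 0);
        printf("%s\n", name);           /* memory.usage_in_bytes */
        cgroup_file_name(name, NULL, "tasks", 0);
        printf("%s\n", name);           /* tasks: core file, no subsystem */
        cgroup_file_name(name, "foo", "stat", 1);
        printf("%s\n", name);           /* stat: prefix suppressed by the flag */
        return 0;
}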
@@ -2782,11 +2784,25 @@ out: | |||
2782 | return error; | 2784 | return error; |
2783 | } | 2785 | } |
2784 | 2786 | ||
2785 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | 2787 | /** |
2786 | struct cftype cfts[], bool is_add) | 2788 | * cgroup_addrm_files - add or remove files to a cgroup directory |
2789 | * @cgrp: the target cgroup | ||
2790 | * @cfts: array of cftypes to be added | ||
2791 | * @is_add: whether to add or remove | ||
2792 | * | ||
2793 | * Depending on @is_add, add or remove files defined by @cfts on @cgrp. | ||
2794 | * For removals, this function never fails. If addition fails, this | ||
2795 | * function doesn't remove files already added. The caller is responsible | ||
2796 | * for cleaning up. | ||
2797 | */ | ||
2798 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | ||
2799 | bool is_add) | ||
2787 | { | 2800 | { |
2788 | struct cftype *cft; | 2801 | struct cftype *cft; |
2789 | int err, ret = 0; | 2802 | int ret; |
2803 | |||
2804 | lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); | ||
2805 | lockdep_assert_held(&cgroup_mutex); | ||
2790 | 2806 | ||
2791 | for (cft = cfts; cft->name[0] != '\0'; cft++) { | 2807 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
2792 | /* does cft->flags tell us to skip this file on @cgrp? */ | 2808 | /* does cft->flags tell us to skip this file on @cgrp? */ |
@@ -2798,16 +2814,17 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2798 | continue; | 2814 | continue; |
2799 | 2815 | ||
2800 | if (is_add) { | 2816 | if (is_add) { |
2801 | err = cgroup_add_file(cgrp, subsys, cft); | 2817 | ret = cgroup_add_file(cgrp, cft); |
2802 | if (err) | 2818 | if (ret) { |
2803 | pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", | 2819 | pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", |
2804 | cft->name, err); | 2820 | cft->name, ret); |
2805 | ret = err; | 2821 | return ret; |
2822 | } | ||
2806 | } else { | 2823 | } else { |
2807 | cgroup_rm_file(cgrp, cft); | 2824 | cgroup_rm_file(cgrp, cft); |
2808 | } | 2825 | } |
2809 | } | 2826 | } |
2810 | return ret; | 2827 | return 0; |
2811 | } | 2828 | } |
2812 | 2829 | ||
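The new kernel-doc makes the error contract explicit: removal never fails, and a failed add leaves the files created so far in place for the caller to tear down. A hedged sketch of the resulting caller pattern (the wrapper itself is illustrative; this is the shape in-file callers are expected to use for rollback):

static int example_populate(struct cgroup *cgrp, struct cftype cfts[])
{
        int ret;

        ret = cgroup_addrm_files(cgrp, cfts, true);
        if (ret)
                /* partial additions are not undone internally */
                cgroup_addrm_files(cgrp, cfts, false);
        return ret;
}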
2813 | static void cgroup_cfts_prepare(void) | 2830 | static void cgroup_cfts_prepare(void) |
@@ -2816,28 +2833,30 @@ static void cgroup_cfts_prepare(void) | |||
2816 | /* | 2833 | /* |
2817 | * Thanks to the entanglement with vfs inode locking, we can't walk | 2834 | * Thanks to the entanglement with vfs inode locking, we can't walk |
2818 | * the existing cgroups under cgroup_mutex and create files. | 2835 | * the existing cgroups under cgroup_mutex and create files. |
2819 | * Instead, we use cgroup_for_each_descendant_pre() and drop RCU | 2836 | * Instead, we use css_for_each_descendant_pre() and drop RCU read |
2820 | * read lock before calling cgroup_addrm_files(). | 2837 | * lock before calling cgroup_addrm_files(). |
2821 | */ | 2838 | */ |
2822 | mutex_lock(&cgroup_mutex); | 2839 | mutex_lock(&cgroup_mutex); |
2823 | } | 2840 | } |
2824 | 2841 | ||
2825 | static void cgroup_cfts_commit(struct cgroup_subsys *ss, | 2842 | static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) |
2826 | struct cftype *cfts, bool is_add) | ||
2827 | __releases(&cgroup_mutex) | 2843 | __releases(&cgroup_mutex) |
2828 | { | 2844 | { |
2829 | LIST_HEAD(pending); | 2845 | LIST_HEAD(pending); |
2830 | struct cgroup *cgrp, *root = &ss->root->top_cgroup; | 2846 | struct cgroup_subsys *ss = cfts[0].ss; |
2847 | struct cgroup *root = &ss->root->top_cgroup; | ||
2831 | struct super_block *sb = ss->root->sb; | 2848 | struct super_block *sb = ss->root->sb; |
2832 | struct dentry *prev = NULL; | 2849 | struct dentry *prev = NULL; |
2833 | struct inode *inode; | 2850 | struct inode *inode; |
2851 | struct cgroup_subsys_state *css; | ||
2834 | u64 update_before; | 2852 | u64 update_before; |
2853 | int ret = 0; | ||
2835 | 2854 | ||
2836 | /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ | 2855 | /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ |
2837 | if (!cfts || ss->root == &cgroup_dummy_root || | 2856 | if (!cfts || ss->root == &cgroup_dummy_root || |
2838 | !atomic_inc_not_zero(&sb->s_active)) { | 2857 | !atomic_inc_not_zero(&sb->s_active)) { |
2839 | mutex_unlock(&cgroup_mutex); | 2858 | mutex_unlock(&cgroup_mutex); |
2840 | return; | 2859 | return 0; |
2841 | } | 2860 | } |
2842 | 2861 | ||
2843 | /* | 2862 | /* |
@@ -2849,17 +2868,11 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, | |||
2849 | 2868 | ||
2850 | mutex_unlock(&cgroup_mutex); | 2869 | mutex_unlock(&cgroup_mutex); |
2851 | 2870 | ||
2852 | /* @root always needs to be updated */ | ||
2853 | inode = root->dentry->d_inode; | ||
2854 | mutex_lock(&inode->i_mutex); | ||
2855 | mutex_lock(&cgroup_mutex); | ||
2856 | cgroup_addrm_files(root, ss, cfts, is_add); | ||
2857 | mutex_unlock(&cgroup_mutex); | ||
2858 | mutex_unlock(&inode->i_mutex); | ||
2859 | |||
2860 | /* add/rm files for all cgroups created before */ | 2871 | /* add/rm files for all cgroups created before */ |
2861 | rcu_read_lock(); | 2872 | rcu_read_lock(); |
2862 | cgroup_for_each_descendant_pre(cgrp, root) { | 2873 | css_for_each_descendant_pre(css, cgroup_css(root, ss)) { |
2874 | struct cgroup *cgrp = css->cgroup; | ||
2875 | |||
2863 | if (cgroup_is_dead(cgrp)) | 2876 | if (cgroup_is_dead(cgrp)) |
2864 | continue; | 2877 | continue; |
2865 | 2878 | ||
@@ -2873,15 +2886,18 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, | |||
2873 | mutex_lock(&inode->i_mutex); | 2886 | mutex_lock(&inode->i_mutex); |
2874 | mutex_lock(&cgroup_mutex); | 2887 | mutex_lock(&cgroup_mutex); |
2875 | if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) | 2888 | if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) |
2876 | cgroup_addrm_files(cgrp, ss, cfts, is_add); | 2889 | ret = cgroup_addrm_files(cgrp, cfts, is_add); |
2877 | mutex_unlock(&cgroup_mutex); | 2890 | mutex_unlock(&cgroup_mutex); |
2878 | mutex_unlock(&inode->i_mutex); | 2891 | mutex_unlock(&inode->i_mutex); |
2879 | 2892 | ||
2880 | rcu_read_lock(); | 2893 | rcu_read_lock(); |
2894 | if (ret) | ||
2895 | break; | ||
2881 | } | 2896 | } |
2882 | rcu_read_unlock(); | 2897 | rcu_read_unlock(); |
2883 | dput(prev); | 2898 | dput(prev); |
2884 | deactivate_super(sb); | 2899 | deactivate_super(sb); |
2900 | return ret; | ||
2885 | } | 2901 | } |
2886 | 2902 | ||
2887 | /** | 2903 | /** |
@@ -2901,49 +2917,56 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, | |||
2901 | int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | 2917 | int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) |
2902 | { | 2918 | { |
2903 | struct cftype_set *set; | 2919 | struct cftype_set *set; |
2920 | struct cftype *cft; | ||
2921 | int ret; | ||
2904 | 2922 | ||
2905 | set = kzalloc(sizeof(*set), GFP_KERNEL); | 2923 | set = kzalloc(sizeof(*set), GFP_KERNEL); |
2906 | if (!set) | 2924 | if (!set) |
2907 | return -ENOMEM; | 2925 | return -ENOMEM; |
2908 | 2926 | ||
2927 | for (cft = cfts; cft->name[0] != '\0'; cft++) | ||
2928 | cft->ss = ss; | ||
2929 | |||
2909 | cgroup_cfts_prepare(); | 2930 | cgroup_cfts_prepare(); |
2910 | set->cfts = cfts; | 2931 | set->cfts = cfts; |
2911 | list_add_tail(&set->node, &ss->cftsets); | 2932 | list_add_tail(&set->node, &ss->cftsets); |
2912 | cgroup_cfts_commit(ss, cfts, true); | 2933 | ret = cgroup_cfts_commit(cfts, true); |
2913 | 2934 | if (ret) | |
2914 | return 0; | 2935 | cgroup_rm_cftypes(cfts); |
2936 | return ret; | ||
2915 | } | 2937 | } |
2916 | EXPORT_SYMBOL_GPL(cgroup_add_cftypes); | 2938 | EXPORT_SYMBOL_GPL(cgroup_add_cftypes); |
2917 | 2939 | ||
2918 | /** | 2940 | /** |
2919 | * cgroup_rm_cftypes - remove an array of cftypes from a subsystem | 2941 | * cgroup_rm_cftypes - remove an array of cftypes from a subsystem |
2920 | * @ss: target cgroup subsystem | ||
2921 | * @cfts: zero-length name terminated array of cftypes | 2942 | * @cfts: zero-length name terminated array of cftypes |
2922 | * | 2943 | * |
2923 | * Unregister @cfts from @ss. Files described by @cfts are removed from | 2944 | * Unregister @cfts. Files described by @cfts are removed from all |
2924 | * all existing cgroups to which @ss is attached and all future cgroups | 2945 | * existing cgroups and all future cgroups won't have them either. This |
2925 | * won't have them either. This function can be called anytime whether @ss | 2946 | * function can be called anytime whether @cfts' subsys is attached or not. |
2926 | * is attached or not. | ||
2927 | * | 2947 | * |
2928 | * Returns 0 on successful unregistration, -ENOENT if @cfts is not | 2948 | * Returns 0 on successful unregistration, -ENOENT if @cfts is not |
2929 | * registered with @ss. | 2949 | * registered. |
2930 | */ | 2950 | */ |
2931 | int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | 2951 | int cgroup_rm_cftypes(struct cftype *cfts) |
2932 | { | 2952 | { |
2933 | struct cftype_set *set; | 2953 | struct cftype_set *set; |
2934 | 2954 | ||
2955 | if (!cfts || !cfts[0].ss) | ||
2956 | return -ENOENT; | ||
2957 | |||
2935 | cgroup_cfts_prepare(); | 2958 | cgroup_cfts_prepare(); |
2936 | 2959 | ||
2937 | list_for_each_entry(set, &ss->cftsets, node) { | 2960 | list_for_each_entry(set, &cfts[0].ss->cftsets, node) { |
2938 | if (set->cfts == cfts) { | 2961 | if (set->cfts == cfts) { |
2939 | list_del(&set->node); | 2962 | list_del(&set->node); |
2940 | kfree(set); | 2963 | kfree(set); |
2941 | cgroup_cfts_commit(ss, cfts, false); | 2964 | cgroup_cfts_commit(cfts, false); |
2942 | return 0; | 2965 | return 0; |
2943 | } | 2966 | } |
2944 | } | 2967 | } |
2945 | 2968 | ||
2946 | cgroup_cfts_commit(ss, NULL, false); | 2969 | cgroup_cfts_commit(NULL, false); |
2947 | return -ENOENT; | 2970 | return -ENOENT; |
2948 | } | 2971 | } |
2949 | 2972 | ||
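Since cgroup_add_cftypes() now stamps cft->ss into every entry, cgroup_rm_cftypes() can find the owning subsystem from the array alone, and a registration that fails part-way is rolled back automatically. A sketch of a controller using the pair; foo_subsys and foo_files are illustrative, and foo_shares_read_u64 refers to the handler sketch earlier:

static struct cftype foo_files[] = {
        {
                .name = "shares",
                .read_u64 = foo_shares_read_u64,
        },
        { }     /* zero-length name terminates the array */
};

static int __init foo_init(void)
{
        /* records cft->ss and creates the files in all existing cgroups */
        return cgroup_add_cftypes(&foo_subsys, foo_files);
}

static void foo_exit(void)
{
        /* the subsystem argument is gone; it is found via cfts[0].ss */
        cgroup_rm_cftypes(foo_files);
}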
@@ -2966,34 +2989,10 @@ int cgroup_task_count(const struct cgroup *cgrp) | |||
2966 | } | 2989 | } |
2967 | 2990 | ||
2968 | /* | 2991 | /* |
2969 | * Advance a list_head iterator. The iterator should be positioned at | 2992 | * To reduce the fork() overhead for systems that are not actually using |
2970 | * the start of a css_set | 2993 | * their cgroups capability, we don't maintain the lists running through |
2971 | */ | 2994 | * each css_set to its tasks until we see the list actually used - in other |
2972 | static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) | 2995 | * words after the first call to css_task_iter_start(). |
2973 | { | ||
2974 | struct list_head *l = it->cset_link; | ||
2975 | struct cgrp_cset_link *link; | ||
2976 | struct css_set *cset; | ||
2977 | |||
2978 | /* Advance to the next non-empty css_set */ | ||
2979 | do { | ||
2980 | l = l->next; | ||
2981 | if (l == &cgrp->cset_links) { | ||
2982 | it->cset_link = NULL; | ||
2983 | return; | ||
2984 | } | ||
2985 | link = list_entry(l, struct cgrp_cset_link, cset_link); | ||
2986 | cset = link->cset; | ||
2987 | } while (list_empty(&cset->tasks)); | ||
2988 | it->cset_link = l; | ||
2989 | it->task = cset->tasks.next; | ||
2990 | } | ||
2991 | |||
2992 | /* | ||
2993 | * To reduce the fork() overhead for systems that are not actually | ||
2994 | * using their cgroups capability, we don't maintain the lists running | ||
2995 | * through each css_set to its tasks until we see the list actually | ||
2996 | * used - in other words after the first call to cgroup_iter_start(). | ||
2997 | */ | 2996 | */ |
2998 | static void cgroup_enable_task_cg_lists(void) | 2997 | static void cgroup_enable_task_cg_lists(void) |
2999 | { | 2998 | { |
@@ -3024,16 +3023,21 @@ static void cgroup_enable_task_cg_lists(void) | |||
3024 | } | 3023 | } |
3025 | 3024 | ||
3026 | /** | 3025 | /** |
3027 | * cgroup_next_sibling - find the next sibling of a given cgroup | 3026 | * css_next_child - find the next child of a given css |
3028 | * @pos: the current cgroup | 3027 | * @pos_css: the current position (%NULL to initiate traversal) |
3028 | * @parent_css: css whose children to walk | ||
3029 | * | 3029 | * |
3030 | * This function returns the next sibling of @pos and should be called | 3030 | * This function returns the next child of @parent_css and should be called |
3031 | * under RCU read lock. The only requirement is that @pos is accessible. | 3031 | * under RCU read lock. The only requirement is that @parent_css and |
3032 | * The next sibling is guaranteed to be returned regardless of @pos's | 3032 | * @pos_css are accessible. The next sibling is guaranteed to be returned |
3033 | * state. | 3033 | * regardless of their states. |
3034 | */ | 3034 | */ |
3035 | struct cgroup *cgroup_next_sibling(struct cgroup *pos) | 3035 | struct cgroup_subsys_state * |
3036 | css_next_child(struct cgroup_subsys_state *pos_css, | ||
3037 | struct cgroup_subsys_state *parent_css) | ||
3036 | { | 3038 | { |
3039 | struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; | ||
3040 | struct cgroup *cgrp = parent_css->cgroup; | ||
3037 | struct cgroup *next; | 3041 | struct cgroup *next; |
3038 | 3042 | ||
3039 | WARN_ON_ONCE(!rcu_read_lock_held()); | 3043 | WARN_ON_ONCE(!rcu_read_lock_held()); |
@@ -3048,78 +3052,81 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos) | |||
3048 | * safe to dereference from this RCU critical section. If | 3052 | * safe to dereference from this RCU critical section. If |
3049 | * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed | 3053 | * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed |
3050 | * to be visible as %true here. | 3054 | * to be visible as %true here. |
3055 | * | ||
3056 | * If @pos is dead, its next pointer can't be dereferenced; | ||
3057 | * however, as each cgroup is given a monotonically increasing | ||
3058 | * unique serial number and always appended to the sibling list, | ||
3059 | * the next one can be found by walking the parent's children until | ||
3060 | * we see a cgroup with higher serial number than @pos's. While | ||
3061 | * this path can be slower, it's taken only when either the current | ||
3062 | * cgroup is removed or iteration and removal race. | ||
3051 | */ | 3063 | */ |
3052 | if (likely(!cgroup_is_dead(pos))) { | 3064 | if (!pos) { |
3065 | next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); | ||
3066 | } else if (likely(!cgroup_is_dead(pos))) { | ||
3053 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); | 3067 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); |
3054 | if (&next->sibling != &pos->parent->children) | 3068 | } else { |
3055 | return next; | 3069 | list_for_each_entry_rcu(next, &cgrp->children, sibling) |
3056 | return NULL; | 3070 | if (next->serial_nr > pos->serial_nr) |
3071 | break; | ||
3057 | } | 3072 | } |
3058 | 3073 | ||
3059 | /* | 3074 | if (&next->sibling == &cgrp->children) |
3060 | * Can't dereference the next pointer. Each cgroup is given a | 3075 | return NULL; |
3061 | * monotonically increasing unique serial number and always | 3076 | |
3062 | * appended to the sibling list, so the next one can be found by | 3077 | return cgroup_css(next, parent_css->ss); |
3063 | * walking the parent's children until we see a cgroup with higher | ||
3064 | * serial number than @pos's. | ||
3065 | * | ||
3066 | * While this path can be slow, it's taken only when either the | ||
3067 | * current cgroup is removed or iteration and removal race. | ||
3068 | */ | ||
3069 | list_for_each_entry_rcu(next, &pos->parent->children, sibling) | ||
3070 | if (next->serial_nr > pos->serial_nr) | ||
3071 | return next; | ||
3072 | return NULL; | ||
3073 | } | 3078 | } |
3074 | EXPORT_SYMBOL_GPL(cgroup_next_sibling); | 3079 | EXPORT_SYMBOL_GPL(css_next_child); |
3075 | 3080 | ||
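The reason css_next_child() can restart from a dead position is purely structural: siblings carry monotonically increasing serial numbers and are only ever appended, so the successor of a removed node is simply the first sibling with a larger serial number. A standalone model of that fallback path (plain C, illustrative node layout, no kernel lists):

#include <stdio.h>

struct node {
        unsigned long long serial_nr;   /* assigned in creation order */
        struct node *next;              /* sibling list, append-only */
};

/*
 * Return the first sibling created after @pos.  Works even if @pos itself
 * has already been unlinked, mirroring the dead-css branch in
 * css_next_child().
 */
static struct node *next_sibling_after(struct node *children,
                                       const struct node *pos)
{
        struct node *n;

        for (n = children; n; n = n->next)
                if (n->serial_nr > pos->serial_nr)
                        return n;
        return NULL;
}

int main(void)
{
        struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
        struct node removed = { 2, NULL };      /* stale copy of b after removal */

        /* prints 3: the first sibling with a serial number above 2 */
        printf("%llu\n", next_sibling_after(&a, &removed)->serial_nr);
        return 0;
}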
3076 | /** | 3081 | /** |
3077 | * cgroup_next_descendant_pre - find the next descendant for pre-order walk | 3082 | * css_next_descendant_pre - find the next descendant for pre-order walk |
3078 | * @pos: the current position (%NULL to initiate traversal) | 3083 | * @pos: the current position (%NULL to initiate traversal) |
3079 | * @cgroup: cgroup whose descendants to walk | 3084 | * @root: css whose descendants to walk |
3080 | * | 3085 | * |
3081 | * To be used by cgroup_for_each_descendant_pre(). Find the next | 3086 | * To be used by css_for_each_descendant_pre(). Find the next descendant |
3082 | * descendant to visit for pre-order traversal of @cgroup's descendants. | 3087 | * to visit for pre-order traversal of @root's descendants. @root is |
3088 | * included in the iteration and the first node to be visited. | ||
3083 | * | 3089 | * |
3084 | * While this function requires RCU read locking, it doesn't require the | 3090 | * While this function requires RCU read locking, it doesn't require the |
3085 | * whole traversal to be contained in a single RCU critical section. This | 3091 | * whole traversal to be contained in a single RCU critical section. This |
3086 | * function will return the correct next descendant as long as both @pos | 3092 | * function will return the correct next descendant as long as both @pos |
3087 | * and @cgroup are accessible and @pos is a descendant of @cgroup. | 3093 | * and @root are accessible and @pos is a descendant of @root. |
3088 | */ | 3094 | */ |
3089 | struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, | 3095 | struct cgroup_subsys_state * |
3090 | struct cgroup *cgroup) | 3096 | css_next_descendant_pre(struct cgroup_subsys_state *pos, |
3097 | struct cgroup_subsys_state *root) | ||
3091 | { | 3098 | { |
3092 | struct cgroup *next; | 3099 | struct cgroup_subsys_state *next; |
3093 | 3100 | ||
3094 | WARN_ON_ONCE(!rcu_read_lock_held()); | 3101 | WARN_ON_ONCE(!rcu_read_lock_held()); |
3095 | 3102 | ||
3096 | /* if first iteration, pretend we just visited @cgroup */ | 3103 | /* if first iteration, visit @root */ |
3097 | if (!pos) | 3104 | if (!pos) |
3098 | pos = cgroup; | 3105 | return root; |
3099 | 3106 | ||
3100 | /* visit the first child if exists */ | 3107 | /* visit the first child if exists */ |
3101 | next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); | 3108 | next = css_next_child(NULL, pos); |
3102 | if (next) | 3109 | if (next) |
3103 | return next; | 3110 | return next; |
3104 | 3111 | ||
3105 | /* no child, visit my or the closest ancestor's next sibling */ | 3112 | /* no child, visit my or the closest ancestor's next sibling */ |
3106 | while (pos != cgroup) { | 3113 | while (pos != root) { |
3107 | next = cgroup_next_sibling(pos); | 3114 | next = css_next_child(pos, css_parent(pos)); |
3108 | if (next) | 3115 | if (next) |
3109 | return next; | 3116 | return next; |
3110 | pos = pos->parent; | 3117 | pos = css_parent(pos); |
3111 | } | 3118 | } |
3112 | 3119 | ||
3113 | return NULL; | 3120 | return NULL; |
3114 | } | 3121 | } |
3115 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); | 3122 | EXPORT_SYMBOL_GPL(css_next_descendant_pre); |
3116 | 3123 | ||
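css_next_descendant_pre() needs only two primitives, "first child" and "next sibling of the closest ancestor", which is why each step is safe under a plain RCU read lock and the walk never keeps a stack. A standalone model with the same shape (the node layout is illustrative):

#include <stdio.h>

struct node {
        const char *name;
        struct node *parent;
        struct node *first_child;
        struct node *next_sibling;
};

/* Same shape as css_next_descendant_pre(): @root first, then its subtree. */
static struct node *next_pre(struct node *pos, struct node *root)
{
        if (!pos)
                return root;                    /* first iteration visits @root */
        if (pos->first_child)
                return pos->first_child;        /* descend */
        while (pos != root) {                   /* climb until a sibling exists */
                if (pos->next_sibling)
                        return pos->next_sibling;
                pos = pos->parent;
        }
        return NULL;
}

int main(void)
{
        struct node r = { "r", NULL, NULL, NULL };
        struct node a = { "a", &r, NULL, NULL };
        struct node b = { "b", &r, NULL, NULL };
        struct node a1 = { "a1", &a, NULL, NULL };

        r.first_child = &a; a.next_sibling = &b; a.first_child = &a1;

        for (struct node *pos = NULL; (pos = next_pre(pos, &r)); )
                printf("%s ", pos->name);       /* r a a1 b */
        printf("\n");
        return 0;
}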
3117 | /** | 3124 | /** |
3118 | * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup | 3125 | * css_rightmost_descendant - return the rightmost descendant of a css |
3119 | * @pos: cgroup of interest | 3126 | * @pos: css of interest |
3120 | * | 3127 | * |
3121 | * Return the rightmost descendant of @pos. If there's no descendant, | 3128 | * Return the rightmost descendant of @pos. If there's no descendant, @pos |
3122 | * @pos is returned. This can be used during pre-order traversal to skip | 3129 | * is returned. This can be used during pre-order traversal to skip |
3123 | * subtree of @pos. | 3130 | * subtree of @pos. |
3124 | * | 3131 | * |
3125 | * While this function requires RCU read locking, it doesn't require the | 3132 | * While this function requires RCU read locking, it doesn't require the |
@@ -3127,9 +3134,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); | |||
3127 | * function will return the correct rightmost descendant as long as @pos is | 3134 | * function will return the correct rightmost descendant as long as @pos is |
3128 | * accessible. | 3135 | * accessible. |
3129 | */ | 3136 | */ |
3130 | struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) | 3137 | struct cgroup_subsys_state * |
3138 | css_rightmost_descendant(struct cgroup_subsys_state *pos) | ||
3131 | { | 3139 | { |
3132 | struct cgroup *last, *tmp; | 3140 | struct cgroup_subsys_state *last, *tmp; |
3133 | 3141 | ||
3134 | WARN_ON_ONCE(!rcu_read_lock_held()); | 3142 | WARN_ON_ONCE(!rcu_read_lock_held()); |
3135 | 3143 | ||
@@ -3137,82 +3145,136 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) | |||
3137 | last = pos; | 3145 | last = pos; |
3138 | /* ->prev isn't RCU safe, walk ->next till the end */ | 3146 | /* ->prev isn't RCU safe, walk ->next till the end */ |
3139 | pos = NULL; | 3147 | pos = NULL; |
3140 | list_for_each_entry_rcu(tmp, &last->children, sibling) | 3148 | css_for_each_child(tmp, last) |
3141 | pos = tmp; | 3149 | pos = tmp; |
3142 | } while (pos); | 3150 | } while (pos); |
3143 | 3151 | ||
3144 | return last; | 3152 | return last; |
3145 | } | 3153 | } |
3146 | EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); | 3154 | EXPORT_SYMBOL_GPL(css_rightmost_descendant); |
3147 | 3155 | ||
3148 | static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) | 3156 | static struct cgroup_subsys_state * |
3157 | css_leftmost_descendant(struct cgroup_subsys_state *pos) | ||
3149 | { | 3158 | { |
3150 | struct cgroup *last; | 3159 | struct cgroup_subsys_state *last; |
3151 | 3160 | ||
3152 | do { | 3161 | do { |
3153 | last = pos; | 3162 | last = pos; |
3154 | pos = list_first_or_null_rcu(&pos->children, struct cgroup, | 3163 | pos = css_next_child(NULL, pos); |
3155 | sibling); | ||
3156 | } while (pos); | 3164 | } while (pos); |
3157 | 3165 | ||
3158 | return last; | 3166 | return last; |
3159 | } | 3167 | } |
3160 | 3168 | ||
3161 | /** | 3169 | /** |
3162 | * cgroup_next_descendant_post - find the next descendant for post-order walk | 3170 | * css_next_descendant_post - find the next descendant for post-order walk |
3163 | * @pos: the current position (%NULL to initiate traversal) | 3171 | * @pos: the current position (%NULL to initiate traversal) |
3164 | * @cgroup: cgroup whose descendants to walk | 3172 | * @root: css whose descendants to walk |
3165 | * | 3173 | * |
3166 | * To be used by cgroup_for_each_descendant_post(). Find the next | 3174 | * To be used by css_for_each_descendant_post(). Find the next descendant |
3167 | * descendant to visit for post-order traversal of @cgroup's descendants. | 3175 | * to visit for post-order traversal of @root's descendants. @root is |
3176 | * included in the iteration and the last node to be visited. | ||
3168 | * | 3177 | * |
3169 | * While this function requires RCU read locking, it doesn't require the | 3178 | * While this function requires RCU read locking, it doesn't require the |
3170 | * whole traversal to be contained in a single RCU critical section. This | 3179 | * whole traversal to be contained in a single RCU critical section. This |
3171 | * function will return the correct next descendant as long as both @pos | 3180 | * function will return the correct next descendant as long as both @pos |
3172 | * and @cgroup are accessible and @pos is a descendant of @cgroup. | 3181 | * and @root are accessible and @pos is a descendant of @root. |
3173 | */ | 3182 | */ |
3174 | struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, | 3183 | struct cgroup_subsys_state * |
3175 | struct cgroup *cgroup) | 3184 | css_next_descendant_post(struct cgroup_subsys_state *pos, |
3185 | struct cgroup_subsys_state *root) | ||
3176 | { | 3186 | { |
3177 | struct cgroup *next; | 3187 | struct cgroup_subsys_state *next; |
3178 | 3188 | ||
3179 | WARN_ON_ONCE(!rcu_read_lock_held()); | 3189 | WARN_ON_ONCE(!rcu_read_lock_held()); |
3180 | 3190 | ||
3181 | /* if first iteration, visit the leftmost descendant */ | 3191 | /* if first iteration, visit leftmost descendant which may be @root */ |
3182 | if (!pos) { | 3192 | if (!pos) |
3183 | next = cgroup_leftmost_descendant(cgroup); | 3193 | return css_leftmost_descendant(root); |
3184 | return next != cgroup ? next : NULL; | 3194 | |
3185 | } | 3195 | /* if we visited @root, we're done */ |
3196 | if (pos == root) | ||
3197 | return NULL; | ||
3186 | 3198 | ||
3187 | /* if there's an unvisited sibling, visit its leftmost descendant */ | 3199 | /* if there's an unvisited sibling, visit its leftmost descendant */ |
3188 | next = cgroup_next_sibling(pos); | 3200 | next = css_next_child(pos, css_parent(pos)); |
3189 | if (next) | 3201 | if (next) |
3190 | return cgroup_leftmost_descendant(next); | 3202 | return css_leftmost_descendant(next); |
3191 | 3203 | ||
3192 | /* no sibling left, visit parent */ | 3204 | /* no sibling left, visit parent */ |
3193 | next = pos->parent; | 3205 | return css_parent(pos); |
3194 | return next != cgroup ? next : NULL; | 3206 | } |
3207 | EXPORT_SYMBOL_GPL(css_next_descendant_post); | ||
3208 | |||
3209 | /** | ||
3210 | * css_advance_task_iter - advance a task itererator to the next css_set | ||
3211 | * @it: the iterator to advance | ||
3212 | * | ||
3213 | * Advance @it to the next css_set to walk. | ||
3214 | */ | ||
3215 | static void css_advance_task_iter(struct css_task_iter *it) | ||
3216 | { | ||
3217 | struct list_head *l = it->cset_link; | ||
3218 | struct cgrp_cset_link *link; | ||
3219 | struct css_set *cset; | ||
3220 | |||
3221 | /* Advance to the next non-empty css_set */ | ||
3222 | do { | ||
3223 | l = l->next; | ||
3224 | if (l == &it->origin_css->cgroup->cset_links) { | ||
3225 | it->cset_link = NULL; | ||
3226 | return; | ||
3227 | } | ||
3228 | link = list_entry(l, struct cgrp_cset_link, cset_link); | ||
3229 | cset = link->cset; | ||
3230 | } while (list_empty(&cset->tasks)); | ||
3231 | it->cset_link = l; | ||
3232 | it->task = cset->tasks.next; | ||
3195 | } | 3233 | } |
3196 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); | ||
3197 | 3234 | ||
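Post-order reuses the same building blocks plus css_leftmost_descendant(): start at the leftmost leaf, and after finishing a node either dive into the next sibling's leftmost leaf or bubble up to the parent, so @root comes out last. Continuing the standalone model above (same illustrative struct node as in the pre-order sketch):

static struct node *leftmost(struct node *pos)
{
        while (pos->first_child)
                pos = pos->first_child;
        return pos;
}

static struct node *next_post(struct node *pos, struct node *root)
{
        if (!pos)
                return leftmost(root);          /* may be @root itself */
        if (pos == root)
                return NULL;                    /* @root is visited last */
        if (pos->next_sibling)
                return leftmost(pos->next_sibling);
        return pos->parent;                     /* all children done */
}

/* On the tree from the previous sketch this yields: a1 a b r. */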
3198 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) | 3235 | /** |
3236 | * css_task_iter_start - initiate task iteration | ||
3237 | * @css: the css to walk tasks of | ||
3238 | * @it: the task iterator to use | ||
3239 | * | ||
3240 | * Initiate iteration through the tasks of @css. The caller can call | ||
3241 | * css_task_iter_next() to walk through the tasks until the function | ||
3242 | * returns NULL. On completion of iteration, css_task_iter_end() must be | ||
3243 | * called. | ||
3244 | * | ||
3245 | * Note that this function acquires a lock which is released when the | ||
3246 | * iteration finishes. The caller can't sleep while iteration is in | ||
3247 | * progress. | ||
3248 | */ | ||
3249 | void css_task_iter_start(struct cgroup_subsys_state *css, | ||
3250 | struct css_task_iter *it) | ||
3199 | __acquires(css_set_lock) | 3251 | __acquires(css_set_lock) |
3200 | { | 3252 | { |
3201 | /* | 3253 | /* |
3202 | * The first time anyone tries to iterate across a cgroup, | 3254 | * The first time anyone tries to iterate across a css, we need to |
3203 | * we need to enable the list linking each css_set to its | 3255 | * enable the list linking each css_set to its tasks, and fix up |
3204 | * tasks, and fix up all existing tasks. | 3256 | * all existing tasks. |
3205 | */ | 3257 | */ |
3206 | if (!use_task_css_set_links) | 3258 | if (!use_task_css_set_links) |
3207 | cgroup_enable_task_cg_lists(); | 3259 | cgroup_enable_task_cg_lists(); |
3208 | 3260 | ||
3209 | read_lock(&css_set_lock); | 3261 | read_lock(&css_set_lock); |
3210 | it->cset_link = &cgrp->cset_links; | 3262 | |
3211 | cgroup_advance_iter(cgrp, it); | 3263 | it->origin_css = css; |
3264 | it->cset_link = &css->cgroup->cset_links; | ||
3265 | |||
3266 | css_advance_task_iter(it); | ||
3212 | } | 3267 | } |
3213 | 3268 | ||
3214 | struct task_struct *cgroup_iter_next(struct cgroup *cgrp, | 3269 | /** |
3215 | struct cgroup_iter *it) | 3270 | * css_task_iter_next - return the next task for the iterator |
3271 | * @it: the task iterator being iterated | ||
3272 | * | ||
3273 | * The "next" function for task iteration. @it should have been | ||
3274 | * initialized via css_task_iter_start(). Returns NULL when the iteration | ||
3275 | * reaches the end. | ||
3276 | */ | ||
3277 | struct task_struct *css_task_iter_next(struct css_task_iter *it) | ||
3216 | { | 3278 | { |
3217 | struct task_struct *res; | 3279 | struct task_struct *res; |
3218 | struct list_head *l = it->task; | 3280 | struct list_head *l = it->task; |
@@ -3226,16 +3288,24 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, | |||
3226 | l = l->next; | 3288 | l = l->next; |
3227 | link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); | 3289 | link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); |
3228 | if (l == &link->cset->tasks) { | 3290 | if (l == &link->cset->tasks) { |
3229 | /* We reached the end of this task list - move on to | 3291 | /* |
3230 | * the next cg_cgroup_link */ | 3292 | * We reached the end of this task list - move on to the |
3231 | cgroup_advance_iter(cgrp, it); | 3293 | * next cgrp_cset_link. |
3294 | */ | ||
3295 | css_advance_task_iter(it); | ||
3232 | } else { | 3296 | } else { |
3233 | it->task = l; | 3297 | it->task = l; |
3234 | } | 3298 | } |
3235 | return res; | 3299 | return res; |
3236 | } | 3300 | } |
3237 | 3301 | ||
3238 | void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) | 3302 | /** |
3303 | * css_task_iter_end - finish task iteration | ||
3304 | * @it: the task iterator to finish | ||
3305 | * | ||
3306 | * Finish task iteration started by css_task_iter_start(). | ||
3307 | */ | ||
3308 | void css_task_iter_end(struct css_task_iter *it) | ||
3239 | __releases(css_set_lock) | 3309 | __releases(css_set_lock) |
3240 | { | 3310 | { |
3241 | read_unlock(&css_set_lock); | 3311 | read_unlock(&css_set_lock); |
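The css_task_iter API replaces the old cgroup_iter: the css (or a cgroup's dummy_css) is captured at start time, and css_set_lock stays read-held until the matching end call, so the loop body must not sleep. A minimal usage sketch, assuming a css obtained elsewhere; the dump function and its message are illustrative:

#include <linux/cgroup.h>
#include <linux/sched.h>

static void dump_css_tasks(struct cgroup_subsys_state *css)
{
        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(css, &it);
        while ((task = css_task_iter_next(&it)))
                pr_info("attached pid %d\n", task_pid_nr(task)); /* no sleeping here */
        css_task_iter_end(&it);
}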
@@ -3276,46 +3346,49 @@ static inline int started_after(void *p1, void *p2) | |||
3276 | } | 3346 | } |
3277 | 3347 | ||
3278 | /** | 3348 | /** |
3279 | * cgroup_scan_tasks - iterate through all the tasks in a cgroup | 3349 | * css_scan_tasks - iterate through all the tasks in a css |
3280 | * @scan: struct cgroup_scanner containing arguments for the scan | 3350 | * @css: the css to iterate tasks of |
3351 | * @test: optional test callback | ||
3352 | * @process: process callback | ||
3353 | * @data: data passed to @test and @process | ||
3354 | * @heap: optional pre-allocated heap used for task iteration | ||
3355 | * | ||
3356 | * Iterate through all the tasks in @css, calling @test for each, and if it | ||
3357 | * returns %true, call @process for it also. | ||
3281 | * | 3358 | * |
3282 | * Arguments include pointers to callback functions test_task() and | 3359 | * @test may be NULL, meaning always true (select all tasks), which |
3283 | * process_task(). | 3360 | * effectively duplicates css_task_iter_{start,next,end}() but does not |
3284 | * Iterate through all the tasks in a cgroup, calling test_task() for each, | 3361 | * lock css_set_lock for the call to @process. |
3285 | * and if it returns true, call process_task() for it also. | ||
3286 | * The test_task pointer may be NULL, meaning always true (select all tasks). | ||
3287 | * Effectively duplicates cgroup_iter_{start,next,end}() | ||
3288 | * but does not lock css_set_lock for the call to process_task(). | ||
3289 | * The struct cgroup_scanner may be embedded in any structure of the caller's | ||
3290 | * creation. | ||
3291 | * It is guaranteed that process_task() will act on every task that | ||
3292 | * is a member of the cgroup for the duration of this call. This | ||
3293 | * function may or may not call process_task() for tasks that exit | ||
3294 | * or move to a different cgroup during the call, or are forked or | ||
3295 | * move into the cgroup during the call. | ||
3296 | * | 3362 | * |
3297 | * Note that test_task() may be called with locks held, and may in some | 3363 | * It is guaranteed that @process will act on every task that is a member |
3298 | * situations be called multiple times for the same task, so it should | 3364 | * of @css for the duration of this call. This function may or may not |
3299 | * be cheap. | 3365 | * call @process for tasks that exit or move to a different css during the |
3300 | * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been | 3366 | * call, or are forked or move into the css during the call. |
3301 | * pre-allocated and will be used for heap operations (and its "gt" member will | 3367 | * |
3302 | * be overwritten), else a temporary heap will be used (allocation of which | 3368 | * Note that @test may be called with locks held, and may in some |
3303 | * may cause this function to fail). | 3369 | * situations be called multiple times for the same task, so it should be |
3370 | * cheap. | ||
3371 | * | ||
3372 | * If @heap is non-NULL, a heap has been pre-allocated and will be used for | ||
3373 | * heap operations (and its "gt" member will be overwritten), else a | ||
3374 | * temporary heap will be used (allocation of which may cause this function | ||
3375 | * to fail). | ||
3304 | */ | 3376 | */ |
3305 | int cgroup_scan_tasks(struct cgroup_scanner *scan) | 3377 | int css_scan_tasks(struct cgroup_subsys_state *css, |
3378 | bool (*test)(struct task_struct *, void *), | ||
3379 | void (*process)(struct task_struct *, void *), | ||
3380 | void *data, struct ptr_heap *heap) | ||
3306 | { | 3381 | { |
3307 | int retval, i; | 3382 | int retval, i; |
3308 | struct cgroup_iter it; | 3383 | struct css_task_iter it; |
3309 | struct task_struct *p, *dropped; | 3384 | struct task_struct *p, *dropped; |
3310 | /* Never dereference latest_task, since it's not refcounted */ | 3385 | /* Never dereference latest_task, since it's not refcounted */ |
3311 | struct task_struct *latest_task = NULL; | 3386 | struct task_struct *latest_task = NULL; |
3312 | struct ptr_heap tmp_heap; | 3387 | struct ptr_heap tmp_heap; |
3313 | struct ptr_heap *heap; | ||
3314 | struct timespec latest_time = { 0, 0 }; | 3388 | struct timespec latest_time = { 0, 0 }; |
3315 | 3389 | ||
3316 | if (scan->heap) { | 3390 | if (heap) { |
3317 | /* The caller supplied our heap and pre-allocated its memory */ | 3391 | /* The caller supplied our heap and pre-allocated its memory */ |
3318 | heap = scan->heap; | ||
3319 | heap->gt = &started_after; | 3392 | heap->gt = &started_after; |
3320 | } else { | 3393 | } else { |
3321 | /* We need to allocate our own heap memory */ | 3394 | /* We need to allocate our own heap memory */ |
@@ -3328,25 +3401,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
3328 | 3401 | ||
3329 | again: | 3402 | again: |
3330 | /* | 3403 | /* |
3331 | * Scan tasks in the cgroup, using the scanner's "test_task" callback | 3404 | * Scan tasks in the css, using the @test callback to determine |
3332 | * to determine which are of interest, and using the scanner's | 3405 | * which are of interest, and invoking @process callback on the |
3333 | * "process_task" callback to process any of them that need an update. | 3406 | * ones which need an update. Since we don't want to hold any |
3334 | * Since we don't want to hold any locks during the task updates, | 3407 | * locks during the task updates, gather tasks to be processed in a |
3335 | * gather tasks to be processed in a heap structure. | 3408 | * heap structure. The heap is sorted by descending task start |
3336 | * The heap is sorted by descending task start time. | 3409 | * time. If the statically-sized heap fills up, we overflow tasks |
3337 | * If the statically-sized heap fills up, we overflow tasks that | 3410 | * that started later, and in future iterations only consider tasks |
3338 | * started later, and in future iterations only consider tasks that | 3411 | * that started after the latest task in the previous pass. This |
3339 | * started after the latest task in the previous pass. This | ||
3340 | * guarantees forward progress and that we don't miss any tasks. | 3412 | * guarantees forward progress and that we don't miss any tasks. |
3341 | */ | 3413 | */ |
3342 | heap->size = 0; | 3414 | heap->size = 0; |
3343 | cgroup_iter_start(scan->cg, &it); | 3415 | css_task_iter_start(css, &it); |
3344 | while ((p = cgroup_iter_next(scan->cg, &it))) { | 3416 | while ((p = css_task_iter_next(&it))) { |
3345 | /* | 3417 | /* |
3346 | * Only affect tasks that qualify per the caller's callback, | 3418 | * Only affect tasks that qualify per the caller's callback, |
3347 | * if he provided one | 3419 | * if he provided one |
3348 | */ | 3420 | */ |
3349 | if (scan->test_task && !scan->test_task(p, scan)) | 3421 | if (test && !test(p, data)) |
3350 | continue; | 3422 | continue; |
3351 | /* | 3423 | /* |
3352 | * Only process tasks that started after the last task | 3424 | * Only process tasks that started after the last task |
@@ -3374,7 +3446,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
3374 | * the heap and wasn't inserted | 3446 | * the heap and wasn't inserted |
3375 | */ | 3447 | */ |
3376 | } | 3448 | } |
3377 | cgroup_iter_end(scan->cg, &it); | 3449 | css_task_iter_end(&it); |
3378 | 3450 | ||
3379 | if (heap->size) { | 3451 | if (heap->size) { |
3380 | for (i = 0; i < heap->size; i++) { | 3452 | for (i = 0; i < heap->size; i++) { |
@@ -3384,7 +3456,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
3384 | latest_task = q; | 3456 | latest_task = q; |
3385 | } | 3457 | } |
3386 | /* Process the task per the caller's callback */ | 3458 | /* Process the task per the caller's callback */ |
3387 | scan->process_task(q, scan); | 3459 | process(q, data); |
3388 | put_task_struct(q); | 3460 | put_task_struct(q); |
3389 | } | 3461 | } |
3390 | /* | 3462 | /* |
@@ -3401,10 +3473,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
3401 | return 0; | 3473 | return 0; |
3402 | } | 3474 | } |
3403 | 3475 | ||
3404 | static void cgroup_transfer_one_task(struct task_struct *task, | 3476 | static void cgroup_transfer_one_task(struct task_struct *task, void *data) |
3405 | struct cgroup_scanner *scan) | ||
3406 | { | 3477 | { |
3407 | struct cgroup *new_cgroup = scan->data; | 3478 | struct cgroup *new_cgroup = data; |
3408 | 3479 | ||
3409 | mutex_lock(&cgroup_mutex); | 3480 | mutex_lock(&cgroup_mutex); |
3410 | cgroup_attach_task(new_cgroup, task, false); | 3481 | cgroup_attach_task(new_cgroup, task, false); |
@@ -3418,15 +3489,8 @@ static void cgroup_transfer_one_task(struct task_struct *task, | |||
3418 | */ | 3489 | */ |
3419 | int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | 3490 | int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) |
3420 | { | 3491 | { |
3421 | struct cgroup_scanner scan; | 3492 | return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task, |
3422 | 3493 | to, NULL); | |
3423 | scan.cg = from; | ||
3424 | scan.test_task = NULL; /* select all tasks in cgroup */ | ||
3425 | scan.process_task = cgroup_transfer_one_task; | ||
3426 | scan.heap = NULL; | ||
3427 | scan.data = to; | ||
3428 | |||
3429 | return cgroup_scan_tasks(&scan); | ||
3430 | } | 3494 | } |
3431 | 3495 | ||
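cgroup_transfer_tasks() above is the simplest possible css_scan_tasks() caller: no @test filter and no pre-allocated heap. A hedged sketch of a caller that does filter; the kthread-counting example and its names are illustrative, not taken from this patch:

static bool test_is_kthread(struct task_struct *task, void *data)
{
        /* cheap and lock-friendly, as the kernel-doc above requires */
        return task->flags & PF_KTHREAD;
}

static void process_count(struct task_struct *task, void *data)
{
        atomic_inc(data);               /* @data is an atomic_t counter */
}

static int count_kthreads(struct cgroup_subsys_state *css, atomic_t *n)
{
        /* NULL @heap: a temporary heap is allocated inside css_scan_tasks() */
        return css_scan_tasks(css, test_is_kthread, process_count, n, NULL);
}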
3432 | /* | 3496 | /* |
@@ -3468,7 +3532,7 @@ struct cgroup_pidlist { | |||
3468 | /* pointer to the cgroup we belong to, for list removal purposes */ | 3532 | /* pointer to the cgroup we belong to, for list removal purposes */ |
3469 | struct cgroup *owner; | 3533 | struct cgroup *owner; |
3470 | /* protects the other fields */ | 3534 | /* protects the other fields */ |
3471 | struct rw_semaphore mutex; | 3535 | struct rw_semaphore rwsem; |
3472 | }; | 3536 | }; |
3473 | 3537 | ||
3474 | /* | 3538 | /* |
@@ -3541,7 +3605,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
3541 | struct pid_namespace *ns = task_active_pid_ns(current); | 3605 | struct pid_namespace *ns = task_active_pid_ns(current); |
3542 | 3606 | ||
3543 | /* | 3607 | /* |
3544 | * We can't drop the pidlist_mutex before taking the l->mutex in case | 3608 | * We can't drop the pidlist_mutex before taking the l->rwsem in case |
3545 | * the last ref-holder is trying to remove l from the list at the same | 3609 | * the last ref-holder is trying to remove l from the list at the same |
3546 | * time. Holding the pidlist_mutex precludes somebody taking whichever | 3610 | * time. Holding the pidlist_mutex precludes somebody taking whichever |
3547 | * list we find out from under us - compare release_pid_array(). | 3611 | * list we find out from under us - compare release_pid_array(). |
@@ -3550,7 +3614,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
3550 | list_for_each_entry(l, &cgrp->pidlists, links) { | 3614 | list_for_each_entry(l, &cgrp->pidlists, links) { |
3551 | if (l->key.type == type && l->key.ns == ns) { | 3615 | if (l->key.type == type && l->key.ns == ns) { |
3552 | /* make sure l doesn't vanish out from under us */ | 3616 | /* make sure l doesn't vanish out from under us */ |
3553 | down_write(&l->mutex); | 3617 | down_write(&l->rwsem); |
3554 | mutex_unlock(&cgrp->pidlist_mutex); | 3618 | mutex_unlock(&cgrp->pidlist_mutex); |
3555 | return l; | 3619 | return l; |
3556 | } | 3620 | } |
@@ -3561,8 +3625,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
3561 | mutex_unlock(&cgrp->pidlist_mutex); | 3625 | mutex_unlock(&cgrp->pidlist_mutex); |
3562 | return l; | 3626 | return l; |
3563 | } | 3627 | } |
3564 | init_rwsem(&l->mutex); | 3628 | init_rwsem(&l->rwsem); |
3565 | down_write(&l->mutex); | 3629 | down_write(&l->rwsem); |
3566 | l->key.type = type; | 3630 | l->key.type = type; |
3567 | l->key.ns = get_pid_ns(ns); | 3631 | l->key.ns = get_pid_ns(ns); |
3568 | l->owner = cgrp; | 3632 | l->owner = cgrp; |
@@ -3580,7 +3644,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
3580 | pid_t *array; | 3644 | pid_t *array; |
3581 | int length; | 3645 | int length; |
3582 | int pid, n = 0; /* used for populating the array */ | 3646 | int pid, n = 0; /* used for populating the array */ |
3583 | struct cgroup_iter it; | 3647 | struct css_task_iter it; |
3584 | struct task_struct *tsk; | 3648 | struct task_struct *tsk; |
3585 | struct cgroup_pidlist *l; | 3649 | struct cgroup_pidlist *l; |
3586 | 3650 | ||
@@ -3595,8 +3659,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
3595 | if (!array) | 3659 | if (!array) |
3596 | return -ENOMEM; | 3660 | return -ENOMEM; |
3597 | /* now, populate the array */ | 3661 | /* now, populate the array */ |
3598 | cgroup_iter_start(cgrp, &it); | 3662 | css_task_iter_start(&cgrp->dummy_css, &it); |
3599 | while ((tsk = cgroup_iter_next(cgrp, &it))) { | 3663 | while ((tsk = css_task_iter_next(&it))) { |
3600 | if (unlikely(n == length)) | 3664 | if (unlikely(n == length)) |
3601 | break; | 3665 | break; |
3602 | /* get tgid or pid for procs or tasks file respectively */ | 3666 | /* get tgid or pid for procs or tasks file respectively */ |
@@ -3607,7 +3671,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
3607 | if (pid > 0) /* make sure to only use valid results */ | 3671 | if (pid > 0) /* make sure to only use valid results */ |
3608 | array[n++] = pid; | 3672 | array[n++] = pid; |
3609 | } | 3673 | } |
3610 | cgroup_iter_end(cgrp, &it); | 3674 | css_task_iter_end(&it); |
3611 | length = n; | 3675 | length = n; |
3612 | /* now sort & (if procs) strip out duplicates */ | 3676 | /* now sort & (if procs) strip out duplicates */ |
3613 | sort(array, length, sizeof(pid_t), cmppid, NULL); | 3677 | sort(array, length, sizeof(pid_t), cmppid, NULL); |
@@ -3623,7 +3687,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
3623 | l->list = array; | 3687 | l->list = array; |
3624 | l->length = length; | 3688 | l->length = length; |
3625 | l->use_count++; | 3689 | l->use_count++; |
3626 | up_write(&l->mutex); | 3690 | up_write(&l->rwsem); |
3627 | *lp = l; | 3691 | *lp = l; |
3628 | return 0; | 3692 | return 0; |
3629 | } | 3693 | } |
@@ -3641,7 +3705,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
3641 | { | 3705 | { |
3642 | int ret = -EINVAL; | 3706 | int ret = -EINVAL; |
3643 | struct cgroup *cgrp; | 3707 | struct cgroup *cgrp; |
3644 | struct cgroup_iter it; | 3708 | struct css_task_iter it; |
3645 | struct task_struct *tsk; | 3709 | struct task_struct *tsk; |
3646 | 3710 | ||
3647 | /* | 3711 | /* |
@@ -3655,8 +3719,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
3655 | ret = 0; | 3719 | ret = 0; |
3656 | cgrp = dentry->d_fsdata; | 3720 | cgrp = dentry->d_fsdata; |
3657 | 3721 | ||
3658 | cgroup_iter_start(cgrp, &it); | 3722 | css_task_iter_start(&cgrp->dummy_css, &it); |
3659 | while ((tsk = cgroup_iter_next(cgrp, &it))) { | 3723 | while ((tsk = css_task_iter_next(&it))) { |
3660 | switch (tsk->state) { | 3724 | switch (tsk->state) { |
3661 | case TASK_RUNNING: | 3725 | case TASK_RUNNING: |
3662 | stats->nr_running++; | 3726 | stats->nr_running++; |
@@ -3676,7 +3740,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
3676 | break; | 3740 | break; |
3677 | } | 3741 | } |
3678 | } | 3742 | } |
3679 | cgroup_iter_end(cgrp, &it); | 3743 | css_task_iter_end(&it); |
3680 | 3744 | ||
3681 | err: | 3745 | err: |
3682 | return ret; | 3746 | return ret; |
@@ -3701,7 +3765,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) | |||
3701 | int index = 0, pid = *pos; | 3765 | int index = 0, pid = *pos; |
3702 | int *iter; | 3766 | int *iter; |
3703 | 3767 | ||
3704 | down_read(&l->mutex); | 3768 | down_read(&l->rwsem); |
3705 | if (pid) { | 3769 | if (pid) { |
3706 | int end = l->length; | 3770 | int end = l->length; |
3707 | 3771 | ||
@@ -3728,7 +3792,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) | |||
3728 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) | 3792 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) |
3729 | { | 3793 | { |
3730 | struct cgroup_pidlist *l = s->private; | 3794 | struct cgroup_pidlist *l = s->private; |
3731 | up_read(&l->mutex); | 3795 | up_read(&l->rwsem); |
3732 | } | 3796 | } |
3733 | 3797 | ||
3734 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) | 3798 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) |
@@ -3774,7 +3838,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) | |||
3774 | * pidlist_mutex, we have to take pidlist_mutex first. | 3838 | * pidlist_mutex, we have to take pidlist_mutex first. |
3775 | */ | 3839 | */ |
3776 | mutex_lock(&l->owner->pidlist_mutex); | 3840 | mutex_lock(&l->owner->pidlist_mutex); |
3777 | down_write(&l->mutex); | 3841 | down_write(&l->rwsem); |
3778 | BUG_ON(!l->use_count); | 3842 | BUG_ON(!l->use_count); |
3779 | if (!--l->use_count) { | 3843 | if (!--l->use_count) { |
3780 | /* we're the last user if refcount is 0; remove and free */ | 3844 | /* we're the last user if refcount is 0; remove and free */ |
@@ -3782,12 +3846,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) | |||
3782 | mutex_unlock(&l->owner->pidlist_mutex); | 3846 | mutex_unlock(&l->owner->pidlist_mutex); |
3783 | pidlist_free(l->list); | 3847 | pidlist_free(l->list); |
3784 | put_pid_ns(l->key.ns); | 3848 | put_pid_ns(l->key.ns); |
3785 | up_write(&l->mutex); | 3849 | up_write(&l->rwsem); |
3786 | kfree(l); | 3850 | kfree(l); |
3787 | return; | 3851 | return; |
3788 | } | 3852 | } |
3789 | mutex_unlock(&l->owner->pidlist_mutex); | 3853 | mutex_unlock(&l->owner->pidlist_mutex); |
3790 | up_write(&l->mutex); | 3854 | up_write(&l->rwsem); |
3791 | } | 3855 | } |
3792 | 3856 | ||
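The ->mutex member was already an rw_semaphore, so this is a rename for clarity rather than a behavioural change, and the lock ordering described in the comments above still holds. Condensed, the pidlist lookup hands the list back write-locked so it cannot be freed once the outer mutex is dropped; match() below stands in for the type/namespace key check:

        /* lock order: cgrp->pidlist_mutex, then l->rwsem */
        mutex_lock(&cgrp->pidlist_mutex);
        list_for_each_entry(l, &cgrp->pidlists, links) {
                if (match(l)) {
                        down_write(&l->rwsem);  /* pin before dropping the mutex */
                        mutex_unlock(&cgrp->pidlist_mutex);
                        return l;               /* caller releases l->rwsem */
                }
        }
        mutex_unlock(&cgrp->pidlist_mutex);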
3793 | static int cgroup_pidlist_release(struct inode *inode, struct file *file) | 3857 | static int cgroup_pidlist_release(struct inode *inode, struct file *file) |
@@ -3851,21 +3915,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file) | |||
3851 | return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); | 3915 | return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); |
3852 | } | 3916 | } |
3853 | 3917 | ||
3854 | static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, | 3918 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, |
3855 | struct cftype *cft) | 3919 | struct cftype *cft) |
3856 | { | 3920 | { |
3857 | return notify_on_release(cgrp); | 3921 | return notify_on_release(css->cgroup); |
3858 | } | 3922 | } |
3859 | 3923 | ||
3860 | static int cgroup_write_notify_on_release(struct cgroup *cgrp, | 3924 | static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, |
3861 | struct cftype *cft, | 3925 | struct cftype *cft, u64 val) |
3862 | u64 val) | ||
3863 | { | 3926 | { |
3864 | clear_bit(CGRP_RELEASABLE, &cgrp->flags); | 3927 | clear_bit(CGRP_RELEASABLE, &css->cgroup->flags); |
3865 | if (val) | 3928 | if (val) |
3866 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 3929 | set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); |
3867 | else | 3930 | else |
3868 | clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 3931 | clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); |
3869 | return 0; | 3932 | return 0; |
3870 | } | 3933 | } |
3871 | 3934 | ||
@@ -3895,18 +3958,18 @@ static void cgroup_event_remove(struct work_struct *work) | |||
3895 | { | 3958 | { |
3896 | struct cgroup_event *event = container_of(work, struct cgroup_event, | 3959 | struct cgroup_event *event = container_of(work, struct cgroup_event, |
3897 | remove); | 3960 | remove); |
3898 | struct cgroup *cgrp = event->cgrp; | 3961 | struct cgroup_subsys_state *css = event->css; |
3899 | 3962 | ||
3900 | remove_wait_queue(event->wqh, &event->wait); | 3963 | remove_wait_queue(event->wqh, &event->wait); |
3901 | 3964 | ||
3902 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); | 3965 | event->cft->unregister_event(css, event->cft, event->eventfd); |
3903 | 3966 | ||
3904 | /* Notify userspace the event is going away. */ | 3967 | /* Notify userspace the event is going away. */ |
3905 | eventfd_signal(event->eventfd, 1); | 3968 | eventfd_signal(event->eventfd, 1); |
3906 | 3969 | ||
3907 | eventfd_ctx_put(event->eventfd); | 3970 | eventfd_ctx_put(event->eventfd); |
3908 | kfree(event); | 3971 | kfree(event); |
3909 | cgroup_dput(cgrp); | 3972 | css_put(css); |
3910 | } | 3973 | } |
3911 | 3974 | ||
3912 | /* | 3975 | /* |
@@ -3919,7 +3982,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | |||
3919 | { | 3982 | { |
3920 | struct cgroup_event *event = container_of(wait, | 3983 | struct cgroup_event *event = container_of(wait, |
3921 | struct cgroup_event, wait); | 3984 | struct cgroup_event, wait); |
3922 | struct cgroup *cgrp = event->cgrp; | 3985 | struct cgroup *cgrp = event->css->cgroup; |
3923 | unsigned long flags = (unsigned long)key; | 3986 | unsigned long flags = (unsigned long)key; |
3924 | 3987 | ||
3925 | if (flags & POLLHUP) { | 3988 | if (flags & POLLHUP) { |
@@ -3963,14 +4026,15 @@ static void cgroup_event_ptable_queue_proc(struct file *file, | |||
3963 | * Input must be in format '<event_fd> <control_fd> <args>'. | 4026 | * Input must be in format '<event_fd> <control_fd> <args>'. |
3964 | * Interpretation of args is defined by control file implementation. | 4027 | * Interpretation of args is defined by control file implementation. |
3965 | */ | 4028 | */ |
3966 | static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | 4029 | static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, |
3967 | const char *buffer) | 4030 | struct cftype *cft, const char *buffer) |
3968 | { | 4031 | { |
3969 | struct cgroup_event *event = NULL; | 4032 | struct cgroup *cgrp = dummy_css->cgroup; |
3970 | struct cgroup *cgrp_cfile; | 4033 | struct cgroup_event *event; |
4034 | struct cgroup_subsys_state *cfile_css; | ||
3971 | unsigned int efd, cfd; | 4035 | unsigned int efd, cfd; |
3972 | struct file *efile = NULL; | 4036 | struct fd efile; |
3973 | struct file *cfile = NULL; | 4037 | struct fd cfile; |
3974 | char *endp; | 4038 | char *endp; |
3975 | int ret; | 4039 | int ret; |
3976 | 4040 | ||
@@ -3987,109 +4051,113 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
3987 | event = kzalloc(sizeof(*event), GFP_KERNEL); | 4051 | event = kzalloc(sizeof(*event), GFP_KERNEL); |
3988 | if (!event) | 4052 | if (!event) |
3989 | return -ENOMEM; | 4053 | return -ENOMEM; |
3990 | event->cgrp = cgrp; | 4054 | |
3991 | INIT_LIST_HEAD(&event->list); | 4055 | INIT_LIST_HEAD(&event->list); |
3992 | init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); | 4056 | init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); |
3993 | init_waitqueue_func_entry(&event->wait, cgroup_event_wake); | 4057 | init_waitqueue_func_entry(&event->wait, cgroup_event_wake); |
3994 | INIT_WORK(&event->remove, cgroup_event_remove); | 4058 | INIT_WORK(&event->remove, cgroup_event_remove); |
3995 | 4059 | ||
3996 | efile = eventfd_fget(efd); | 4060 | efile = fdget(efd); |
3997 | if (IS_ERR(efile)) { | 4061 | if (!efile.file) { |
3998 | ret = PTR_ERR(efile); | 4062 | ret = -EBADF; |
3999 | goto fail; | 4063 | goto out_kfree; |
4000 | } | 4064 | } |
4001 | 4065 | ||
4002 | event->eventfd = eventfd_ctx_fileget(efile); | 4066 | event->eventfd = eventfd_ctx_fileget(efile.file); |
4003 | if (IS_ERR(event->eventfd)) { | 4067 | if (IS_ERR(event->eventfd)) { |
4004 | ret = PTR_ERR(event->eventfd); | 4068 | ret = PTR_ERR(event->eventfd); |
4005 | goto fail; | 4069 | goto out_put_efile; |
4006 | } | 4070 | } |
4007 | 4071 | ||
4008 | cfile = fget(cfd); | 4072 | cfile = fdget(cfd); |
4009 | if (!cfile) { | 4073 | if (!cfile.file) { |
4010 | ret = -EBADF; | 4074 | ret = -EBADF; |
4011 | goto fail; | 4075 | goto out_put_eventfd; |
4012 | } | 4076 | } |
4013 | 4077 | ||
4014 | /* the process needs read permission on the control file */ | 4078 | /* the process needs read permission on the control file */ |
4015 | /* AV: shouldn't we check that it's been opened for read instead? */ | 4079 | /* AV: shouldn't we check that it's been opened for read instead? */ |
4016 | ret = inode_permission(file_inode(cfile), MAY_READ); | 4080 | ret = inode_permission(file_inode(cfile.file), MAY_READ); |
4017 | if (ret < 0) | 4081 | if (ret < 0) |
4018 | goto fail; | 4082 | goto out_put_cfile; |
4019 | 4083 | ||
4020 | event->cft = __file_cft(cfile); | 4084 | event->cft = __file_cft(cfile.file); |
4021 | if (IS_ERR(event->cft)) { | 4085 | if (IS_ERR(event->cft)) { |
4022 | ret = PTR_ERR(event->cft); | 4086 | ret = PTR_ERR(event->cft); |
4023 | goto fail; | 4087 | goto out_put_cfile; |
4088 | } | ||
4089 | |||
4090 | if (!event->cft->ss) { | ||
4091 | ret = -EBADF; | ||
4092 | goto out_put_cfile; | ||
4024 | } | 4093 | } |
4025 | 4094 | ||
4026 | /* | 4095 | /* |
4027 | * The file to be monitored must be in the same cgroup as | 4096 | * Determine the css of @cfile, verify it belongs to the same |
4028 | * cgroup.event_control is. | 4097 | * cgroup as cgroup.event_control, and associate @event with it. |
4098 | * Remaining events are automatically removed on cgroup destruction | ||
4099 | * but the removal is asynchronous, so take an extra ref. | ||
4029 | */ | 4100 | */ |
4030 | cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); | 4101 | rcu_read_lock(); |
4031 | if (cgrp_cfile != cgrp) { | 4102 | |
4032 | ret = -EINVAL; | 4103 | ret = -EINVAL; |
4033 | goto fail; | 4104 | event->css = cgroup_css(cgrp, event->cft->ss); |
4034 | } | 4105 | cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); |
4106 | if (event->css && event->css == cfile_css && css_tryget(event->css)) | ||
4107 | ret = 0; | ||
4108 | |||
4109 | rcu_read_unlock(); | ||
4110 | if (ret) | ||
4111 | goto out_put_cfile; | ||
4035 | 4112 | ||
4036 | if (!event->cft->register_event || !event->cft->unregister_event) { | 4113 | if (!event->cft->register_event || !event->cft->unregister_event) { |
4037 | ret = -EINVAL; | 4114 | ret = -EINVAL; |
4038 | goto fail; | 4115 | goto out_put_css; |
4039 | } | 4116 | } |
4040 | 4117 | ||
4041 | ret = event->cft->register_event(cgrp, event->cft, | 4118 | ret = event->cft->register_event(event->css, event->cft, |
4042 | event->eventfd, buffer); | 4119 | event->eventfd, buffer); |
4043 | if (ret) | 4120 | if (ret) |
4044 | goto fail; | 4121 | goto out_put_css; |
4045 | 4122 | ||
4046 | efile->f_op->poll(efile, &event->pt); | 4123 | efile.file->f_op->poll(efile.file, &event->pt); |
4047 | |||
4048 | /* | ||
4049 | * Events should be removed after rmdir of cgroup directory, but before | ||
4050 | * destroying subsystem state objects. Let's take reference to cgroup | ||
4051 | * directory dentry to do that. | ||
4052 | */ | ||
4053 | dget(cgrp->dentry); | ||
4054 | 4124 | ||
4055 | spin_lock(&cgrp->event_list_lock); | 4125 | spin_lock(&cgrp->event_list_lock); |
4056 | list_add(&event->list, &cgrp->event_list); | 4126 | list_add(&event->list, &cgrp->event_list); |
4057 | spin_unlock(&cgrp->event_list_lock); | 4127 | spin_unlock(&cgrp->event_list_lock); |
4058 | 4128 | ||
4059 | fput(cfile); | 4129 | fdput(cfile); |
4060 | fput(efile); | 4130 | fdput(efile); |
4061 | 4131 | ||
4062 | return 0; | 4132 | return 0; |
4063 | 4133 | ||
4064 | fail: | 4134 | out_put_css: |
4065 | if (cfile) | 4135 | css_put(event->css); |
4066 | fput(cfile); | 4136 | out_put_cfile: |
4067 | 4137 | fdput(cfile); | |
4068 | if (event && event->eventfd && !IS_ERR(event->eventfd)) | 4138 | out_put_eventfd: |
4069 | eventfd_ctx_put(event->eventfd); | 4139 | eventfd_ctx_put(event->eventfd); |
4070 | 4140 | out_put_efile: | |
4071 | if (!IS_ERR_OR_NULL(efile)) | 4141 | fdput(efile); |
4072 | fput(efile); | 4142 | out_kfree: |
4073 | |||
4074 | kfree(event); | 4143 | kfree(event); |
4075 | 4144 | ||
4076 | return ret; | 4145 | return ret; |
4077 | } | 4146 | } |
4078 | 4147 | ||
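From userspace, registration follows the '<event_fd> <control_fd> <args>' line parsed above. A hedged sketch of a client; the mount point, cgroup name, and the choice of memory.usage_in_bytes with a byte threshold as the monitored control file are assumptions for illustration:

/* Arm a cgroup event via cgroup.event_control and wait for it to fire. */
#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	uint64_t count;
	char buf[64];
	int efd = eventfd(0, 0);				/* <event_fd> */
	int cfd = open("/sys/fs/cgroup/memory/mygrp/memory.usage_in_bytes",
		       O_RDONLY);				/* <control_fd> */
	int ecfd = open("/sys/fs/cgroup/memory/mygrp/cgroup.event_control",
			O_WRONLY);

	if (efd < 0 || cfd < 0 || ecfd < 0)
		return 1;

	/* "<event_fd> <control_fd> <args>"; here <args> is a byte threshold */
	snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd, 104857600ULL);
	if (write(ecfd, buf, strlen(buf)) < 0)
		return 1;

	read(efd, &count, sizeof(count));	/* blocks until the event is signalled */
	return 0;
}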
4079 | static u64 cgroup_clone_children_read(struct cgroup *cgrp, | 4148 | static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, |
4080 | struct cftype *cft) | 4149 | struct cftype *cft) |
4081 | { | 4150 | { |
4082 | return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); | 4151 | return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); |
4083 | } | 4152 | } |
4084 | 4153 | ||
4085 | static int cgroup_clone_children_write(struct cgroup *cgrp, | 4154 | static int cgroup_clone_children_write(struct cgroup_subsys_state *css, |
4086 | struct cftype *cft, | 4155 | struct cftype *cft, u64 val) |
4087 | u64 val) | ||
4088 | { | 4156 | { |
4089 | if (val) | 4157 | if (val) |
4090 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); | 4158 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); |
4091 | else | 4159 | else |
4092 | clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); | 4160 | clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); |
4093 | return 0; | 4161 | return 0; |
4094 | } | 4162 | } |
4095 | 4163 | ||
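The two handlers above show the converted cftype callback signatures, which now receive a cgroup_subsys_state instead of a cgroup. A sketch of a controller-private boolean file written against the same read_u64/write_u64 interface; struct my_state, css_my(), MY_FLAG and the file name are illustrative, not existing kernel symbols:

#include <linux/cgroup.h>

struct my_state {
	struct cgroup_subsys_state css;
	unsigned long flags;
};
#define MY_FLAG 0

static inline struct my_state *css_my(struct cgroup_subsys_state *css)
{
	return container_of(css, struct my_state, css);
}

static u64 my_flag_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return test_bit(MY_FLAG, &css_my(css)->flags);
}

static int my_flag_write(struct cgroup_subsys_state *css, struct cftype *cft,
			 u64 val)
{
	if (val)
		set_bit(MY_FLAG, &css_my(css)->flags);
	else
		clear_bit(MY_FLAG, &css_my(css)->flags);
	return 0;
}

static struct cftype my_files[] = {
	{
		.name = "my.flag",
		.read_u64 = my_flag_read,
		.write_u64 = my_flag_write,
	},
	{ }	/* sentinel: cftype arrays end with an empty name */
};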
@@ -4148,36 +4216,34 @@ static struct cftype cgroup_base_files[] = { | |||
4148 | }; | 4216 | }; |
4149 | 4217 | ||
4150 | /** | 4218 | /** |
4151 | * cgroup_populate_dir - selectively creation of files in a directory | 4219 | * cgroup_populate_dir - create subsys files in a cgroup directory |
4152 | * @cgrp: target cgroup | 4220 | * @cgrp: target cgroup |
4153 | * @base_files: true if the base files should be added | ||
4154 | * @subsys_mask: mask of the subsystem ids whose files should be added | 4221 | * @subsys_mask: mask of the subsystem ids whose files should be added |
4222 | * | ||
4223 | * On failure, no file is added. | ||
4155 | */ | 4224 | */ |
4156 | static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, | 4225 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) |
4157 | unsigned long subsys_mask) | ||
4158 | { | 4226 | { |
4159 | int err; | ||
4160 | struct cgroup_subsys *ss; | 4227 | struct cgroup_subsys *ss; |
4161 | 4228 | int i, ret = 0; | |
4162 | if (base_files) { | ||
4163 | err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); | ||
4164 | if (err < 0) | ||
4165 | return err; | ||
4166 | } | ||
4167 | 4229 | ||
4168 | /* process cftsets of each subsystem */ | 4230 | /* process cftsets of each subsystem */ |
4169 | for_each_root_subsys(cgrp->root, ss) { | 4231 | for_each_subsys(ss, i) { |
4170 | struct cftype_set *set; | 4232 | struct cftype_set *set; |
4171 | if (!test_bit(ss->subsys_id, &subsys_mask)) | 4233 | |
4234 | if (!test_bit(i, &subsys_mask)) | ||
4172 | continue; | 4235 | continue; |
4173 | 4236 | ||
4174 | list_for_each_entry(set, &ss->cftsets, node) | 4237 | list_for_each_entry(set, &ss->cftsets, node) { |
4175 | cgroup_addrm_files(cgrp, ss, set->cfts, true); | 4238 | ret = cgroup_addrm_files(cgrp, set->cfts, true); |
4239 | if (ret < 0) | ||
4240 | goto err; | ||
4241 | } | ||
4176 | } | 4242 | } |
4177 | 4243 | ||
4178 | /* This cgroup is ready now */ | 4244 | /* This cgroup is ready now */ |
4179 | for_each_root_subsys(cgrp->root, ss) { | 4245 | for_each_root_subsys(cgrp->root, ss) { |
4180 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4246 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); |
4181 | struct css_id *id = rcu_dereference_protected(css->id, true); | 4247 | struct css_id *id = rcu_dereference_protected(css->id, true); |
4182 | 4248 | ||
4183 | /* | 4249 | /* |
@@ -4190,14 +4256,57 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, | |||
4190 | } | 4256 | } |
4191 | 4257 | ||
4192 | return 0; | 4258 | return 0; |
4259 | err: | ||
4260 | cgroup_clear_dir(cgrp, subsys_mask); | ||
4261 | return ret; | ||
4193 | } | 4262 | } |
4194 | 4263 | ||
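cgroup_populate_dir() above is all-or-nothing: files are added per enabled subsystem and any failure rolls the whole mask back via cgroup_clear_dir(). A generic sketch of that shape, with illustrative names throughout:

#include <linux/bitops.h>

#define MY_NFILES 8

struct my_dir;					/* opaque, illustrative */
int my_add_file(struct my_dir *d, int idx);	/* assumed helpers */
void my_clear_files(struct my_dir *d, unsigned long mask);

static int my_populate(struct my_dir *d, unsigned long mask)
{
	int i, ret;

	for (i = 0; i < MY_NFILES; i++) {
		if (!test_bit(i, &mask))
			continue;

		ret = my_add_file(d, i);
		if (ret < 0)
			goto err;
	}
	return 0;

err:
	/* the clear helper must tolerate entries that were never added */
	my_clear_files(d, mask);
	return ret;
}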
4195 | static void css_dput_fn(struct work_struct *work) | 4264 | /* |
4265 | * css destruction is a four-stage process. ||
4266 | * | ||
4267 | * 1. Destruction starts. Killing of the percpu_ref is initiated. | ||
4268 | * Implemented in kill_css(). | ||
4269 | * | ||
4270 | * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs | ||
4271 | * and thus css_tryget() is guaranteed to fail, the css can be offlined | ||
4272 | * by invoking offline_css(). After offlining, the base ref is put. | ||
4273 | * Implemented in css_killed_work_fn(). | ||
4274 | * | ||
4275 | * 3. When the percpu_ref reaches zero, the only possible remaining | ||
4276 | * accessors are inside RCU read sections. css_release() schedules the | ||
4277 | * RCU callback. | ||
4278 | * | ||
4279 | * 4. After the grace period, the css can be freed. Implemented in | ||
4280 | * css_free_work_fn(). | ||
4281 | * | ||
4282 | * It is actually hairier because both steps 2 and 4 require process context ||
4283 | * and thus involve punting to css->destroy_work, adding two additional ||
4284 | * steps to the already complex sequence. | ||
4285 | */ | ||
4286 | static void css_free_work_fn(struct work_struct *work) | ||
4196 | { | 4287 | { |
4197 | struct cgroup_subsys_state *css = | 4288 | struct cgroup_subsys_state *css = |
4198 | container_of(work, struct cgroup_subsys_state, dput_work); | 4289 | container_of(work, struct cgroup_subsys_state, destroy_work); |
4290 | struct cgroup *cgrp = css->cgroup; | ||
4291 | |||
4292 | if (css->parent) | ||
4293 | css_put(css->parent); | ||
4199 | 4294 | ||
4200 | cgroup_dput(css->cgroup); | 4295 | css->ss->css_free(css); |
4296 | cgroup_dput(cgrp); | ||
4297 | } | ||
4298 | |||
4299 | static void css_free_rcu_fn(struct rcu_head *rcu_head) | ||
4300 | { | ||
4301 | struct cgroup_subsys_state *css = | ||
4302 | container_of(rcu_head, struct cgroup_subsys_state, rcu_head); | ||
4303 | |||
4304 | /* | ||
4305 | * css holds an extra ref to @cgrp->dentry which is put on the last | ||
4306 | * css_put(). dput() requires process context which we don't have. | ||
4307 | */ | ||
4308 | INIT_WORK(&css->destroy_work, css_free_work_fn); | ||
4309 | schedule_work(&css->destroy_work); | ||
4201 | } | 4310 | } |
4202 | 4311 | ||
4203 | static void css_release(struct percpu_ref *ref) | 4312 | static void css_release(struct percpu_ref *ref) |
@@ -4205,49 +4314,47 @@ static void css_release(struct percpu_ref *ref) | |||
4205 | struct cgroup_subsys_state *css = | 4314 | struct cgroup_subsys_state *css = |
4206 | container_of(ref, struct cgroup_subsys_state, refcnt); | 4315 | container_of(ref, struct cgroup_subsys_state, refcnt); |
4207 | 4316 | ||
4208 | schedule_work(&css->dput_work); | 4317 | call_rcu(&css->rcu_head, css_free_rcu_fn); |
4209 | } | 4318 | } |
4210 | 4319 | ||
4211 | static void init_cgroup_css(struct cgroup_subsys_state *css, | 4320 | static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, |
4212 | struct cgroup_subsys *ss, | 4321 | struct cgroup *cgrp) |
4213 | struct cgroup *cgrp) | ||
4214 | { | 4322 | { |
4215 | css->cgroup = cgrp; | 4323 | css->cgroup = cgrp; |
4324 | css->ss = ss; | ||
4216 | css->flags = 0; | 4325 | css->flags = 0; |
4217 | css->id = NULL; | 4326 | css->id = NULL; |
4218 | if (cgrp == cgroup_dummy_top) | 4327 | |
4328 | if (cgrp->parent) | ||
4329 | css->parent = cgroup_css(cgrp->parent, ss); | ||
4330 | else | ||
4219 | css->flags |= CSS_ROOT; | 4331 | css->flags |= CSS_ROOT; |
4220 | BUG_ON(cgrp->subsys[ss->subsys_id]); | ||
4221 | cgrp->subsys[ss->subsys_id] = css; | ||
4222 | 4332 | ||
4223 | /* | 4333 | BUG_ON(cgroup_css(cgrp, ss)); |
4224 | * css holds an extra ref to @cgrp->dentry which is put on the last | ||
4225 | * css_put(). dput() requires process context, which css_put() may | ||
4226 | * be called without. @css->dput_work will be used to invoke | ||
4227 | * dput() asynchronously from css_put(). | ||
4228 | */ | ||
4229 | INIT_WORK(&css->dput_work, css_dput_fn); | ||
4230 | } | 4334 | } |
4231 | 4335 | ||
4232 | /* invoke ->post_create() on a new CSS and mark it online if successful */ | 4336 | /* invoke ->css_online() on a new CSS and mark it online if successful */ |
4233 | static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | 4337 | static int online_css(struct cgroup_subsys_state *css) |
4234 | { | 4338 | { |
4339 | struct cgroup_subsys *ss = css->ss; | ||
4235 | int ret = 0; | 4340 | int ret = 0; |
4236 | 4341 | ||
4237 | lockdep_assert_held(&cgroup_mutex); | 4342 | lockdep_assert_held(&cgroup_mutex); |
4238 | 4343 | ||
4239 | if (ss->css_online) | 4344 | if (ss->css_online) |
4240 | ret = ss->css_online(cgrp); | 4345 | ret = ss->css_online(css); |
4241 | if (!ret) | 4346 | if (!ret) { |
4242 | cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; | 4347 | css->flags |= CSS_ONLINE; |
4348 | css->cgroup->nr_css++; | ||
4349 | rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); | ||
4350 | } | ||
4243 | return ret; | 4351 | return ret; |
4244 | } | 4352 | } |
4245 | 4353 | ||
4246 | /* if the CSS is online, invoke ->pre_destroy() on it and mark it offline */ | 4354 | /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ |
4247 | static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | 4355 | static void offline_css(struct cgroup_subsys_state *css) |
4248 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | ||
4249 | { | 4356 | { |
4250 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4357 | struct cgroup_subsys *ss = css->ss; |
4251 | 4358 | ||
4252 | lockdep_assert_held(&cgroup_mutex); | 4359 | lockdep_assert_held(&cgroup_mutex); |
4253 | 4360 | ||
@@ -4255,9 +4362,11 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
4255 | return; | 4362 | return; |
4256 | 4363 | ||
4257 | if (ss->css_offline) | 4364 | if (ss->css_offline) |
4258 | ss->css_offline(cgrp); | 4365 | ss->css_offline(css); |
4259 | 4366 | ||
4260 | cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; | 4367 | css->flags &= ~CSS_ONLINE; |
4368 | css->cgroup->nr_css--; | ||
4369 | RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); | ||
4261 | } | 4370 | } |
4262 | 4371 | ||
4263 | /* | 4372 | /* |
@@ -4271,6 +4380,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
4271 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | 4380 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, |
4272 | umode_t mode) | 4381 | umode_t mode) |
4273 | { | 4382 | { |
4383 | struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { }; | ||
4274 | struct cgroup *cgrp; | 4384 | struct cgroup *cgrp; |
4275 | struct cgroup_name *name; | 4385 | struct cgroup_name *name; |
4276 | struct cgroupfs_root *root = parent->root; | 4386 | struct cgroupfs_root *root = parent->root; |
@@ -4288,7 +4398,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4288 | goto err_free_cgrp; | 4398 | goto err_free_cgrp; |
4289 | rcu_assign_pointer(cgrp->name, name); | 4399 | rcu_assign_pointer(cgrp->name, name); |
4290 | 4400 | ||
4291 | cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); | 4401 | /* |
4402 | * Temporarily set the pointer to NULL, so idr_find() won't return | ||
4403 | * a half-baked cgroup. | ||
4404 | */ | ||
4405 | cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); | ||
4292 | if (cgrp->id < 0) | 4406 | if (cgrp->id < 0) |
4293 | goto err_free_name; | 4407 | goto err_free_name; |
4294 | 4408 | ||
@@ -4317,6 +4431,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4317 | cgrp->dentry = dentry; | 4431 | cgrp->dentry = dentry; |
4318 | 4432 | ||
4319 | cgrp->parent = parent; | 4433 | cgrp->parent = parent; |
4434 | cgrp->dummy_css.parent = &parent->dummy_css; | ||
4320 | cgrp->root = parent->root; | 4435 | cgrp->root = parent->root; |
4321 | 4436 | ||
4322 | if (notify_on_release(parent)) | 4437 | if (notify_on_release(parent)) |
@@ -4328,22 +4443,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4328 | for_each_root_subsys(root, ss) { | 4443 | for_each_root_subsys(root, ss) { |
4329 | struct cgroup_subsys_state *css; | 4444 | struct cgroup_subsys_state *css; |
4330 | 4445 | ||
4331 | css = ss->css_alloc(cgrp); | 4446 | css = ss->css_alloc(cgroup_css(parent, ss)); |
4332 | if (IS_ERR(css)) { | 4447 | if (IS_ERR(css)) { |
4333 | err = PTR_ERR(css); | 4448 | err = PTR_ERR(css); |
4334 | goto err_free_all; | 4449 | goto err_free_all; |
4335 | } | 4450 | } |
4451 | css_ar[ss->subsys_id] = css; | ||
4336 | 4452 | ||
4337 | err = percpu_ref_init(&css->refcnt, css_release); | 4453 | err = percpu_ref_init(&css->refcnt, css_release); |
4338 | if (err) { | 4454 | if (err) |
4339 | ss->css_free(cgrp); | ||
4340 | goto err_free_all; | 4455 | goto err_free_all; |
4341 | } | ||
4342 | 4456 | ||
4343 | init_cgroup_css(css, ss, cgrp); | 4457 | init_css(css, ss, cgrp); |
4344 | 4458 | ||
4345 | if (ss->use_id) { | 4459 | if (ss->use_id) { |
4346 | err = alloc_css_id(ss, parent, cgrp); | 4460 | err = alloc_css_id(css); |
4347 | if (err) | 4461 | if (err) |
4348 | goto err_free_all; | 4462 | goto err_free_all; |
4349 | } | 4463 | } |
@@ -4365,16 +4479,22 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4365 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); | 4479 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); |
4366 | root->number_of_cgroups++; | 4480 | root->number_of_cgroups++; |
4367 | 4481 | ||
4368 | /* each css holds a ref to the cgroup's dentry */ | 4482 | /* each css holds a ref to the cgroup's dentry and the parent css */ |
4369 | for_each_root_subsys(root, ss) | 4483 | for_each_root_subsys(root, ss) { |
4484 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; | ||
4485 | |||
4370 | dget(dentry); | 4486 | dget(dentry); |
4487 | css_get(css->parent); | ||
4488 | } | ||
4371 | 4489 | ||
4372 | /* hold a ref to the parent's dentry */ | 4490 | /* hold a ref to the parent's dentry */ |
4373 | dget(parent->dentry); | 4491 | dget(parent->dentry); |
4374 | 4492 | ||
4375 | /* creation succeeded, notify subsystems */ | 4493 | /* creation succeeded, notify subsystems */ |
4376 | for_each_root_subsys(root, ss) { | 4494 | for_each_root_subsys(root, ss) { |
4377 | err = online_css(ss, cgrp); | 4495 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; |
4496 | |||
4497 | err = online_css(css); | ||
4378 | if (err) | 4498 | if (err) |
4379 | goto err_destroy; | 4499 | goto err_destroy; |
4380 | 4500 | ||
@@ -4388,7 +4508,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4388 | } | 4508 | } |
4389 | } | 4509 | } |
4390 | 4510 | ||
4391 | err = cgroup_populate_dir(cgrp, true, root->subsys_mask); | 4511 | idr_replace(&root->cgroup_idr, cgrp, cgrp->id); |
4512 | |||
4513 | err = cgroup_addrm_files(cgrp, cgroup_base_files, true); | ||
4514 | if (err) | ||
4515 | goto err_destroy; | ||
4516 | |||
4517 | err = cgroup_populate_dir(cgrp, root->subsys_mask); | ||
4392 | if (err) | 4518 | if (err) |
4393 | goto err_destroy; | 4519 | goto err_destroy; |
4394 | 4520 | ||
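The id handling above is a two-phase publication: idr_alloc() reserves the id with a NULL pointer so concurrent idr_find() callers can never see a half-initialized cgroup, and idr_replace() installs the real pointer only once setup is complete (teardown removes the id before the RCU-deferred free, as cgroup_destroy_css_killed() does below). A sketch of the pattern with a hypothetical object type:

#include <linux/gfp.h>
#include <linux/idr.h>

static DEFINE_IDR(my_idr);

struct my_obj {					/* illustrative */
	int id;
};

static int my_obj_publish(struct my_obj *obj)
{
	/* reserve an id but publish NULL so lookups can't return obj yet */
	int id = idr_alloc(&my_idr, NULL, 1, 0, GFP_KERNEL);

	if (id < 0)
		return id;
	obj->id = id;

	/* ... finish initializing obj ... */

	idr_replace(&my_idr, obj, obj->id);	/* now visible to idr_find() */
	return 0;
}

static void my_obj_unpublish(struct my_obj *obj)
{
	/* drop the id before any grace-period-deferred free of obj */
	idr_remove(&my_idr, obj->id);
	obj->id = -1;
}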
@@ -4399,18 +4525,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4399 | 4525 | ||
4400 | err_free_all: | 4526 | err_free_all: |
4401 | for_each_root_subsys(root, ss) { | 4527 | for_each_root_subsys(root, ss) { |
4402 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4528 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; |
4403 | 4529 | ||
4404 | if (css) { | 4530 | if (css) { |
4405 | percpu_ref_cancel_init(&css->refcnt); | 4531 | percpu_ref_cancel_init(&css->refcnt); |
4406 | ss->css_free(cgrp); | 4532 | ss->css_free(css); |
4407 | } | 4533 | } |
4408 | } | 4534 | } |
4409 | mutex_unlock(&cgroup_mutex); | 4535 | mutex_unlock(&cgroup_mutex); |
4410 | /* Release the reference count that we took on the superblock */ | 4536 | /* Release the reference count that we took on the superblock */ |
4411 | deactivate_super(sb); | 4537 | deactivate_super(sb); |
4412 | err_free_id: | 4538 | err_free_id: |
4413 | ida_simple_remove(&root->cgroup_ida, cgrp->id); | 4539 | idr_remove(&root->cgroup_idr, cgrp->id); |
4414 | err_free_name: | 4540 | err_free_name: |
4415 | kfree(rcu_dereference_raw(cgrp->name)); | 4541 | kfree(rcu_dereference_raw(cgrp->name)); |
4416 | err_free_cgrp: | 4542 | err_free_cgrp: |
@@ -4432,22 +4558,84 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
4432 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); | 4558 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); |
4433 | } | 4559 | } |
4434 | 4560 | ||
4435 | static void cgroup_css_killed(struct cgroup *cgrp) | 4561 | /* |
4562 | * This is called when the refcnt of a css is confirmed to be killed. | ||
4563 | * css_tryget() is now guaranteed to fail. | ||
4564 | */ | ||
4565 | static void css_killed_work_fn(struct work_struct *work) | ||
4436 | { | 4566 | { |
4437 | if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) | 4567 | struct cgroup_subsys_state *css = |
4438 | return; | 4568 | container_of(work, struct cgroup_subsys_state, destroy_work); |
4569 | struct cgroup *cgrp = css->cgroup; | ||
4439 | 4570 | ||
4440 | /* percpu ref's of all css's are killed, kick off the next step */ | 4571 | mutex_lock(&cgroup_mutex); |
4441 | INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); | 4572 | |
4442 | schedule_work(&cgrp->destroy_work); | 4573 | /* |
4574 | * css_tryget() is guaranteed to fail now. Tell subsystems to | ||
4575 | * initiate destruction. ||
4576 | */ | ||
4577 | offline_css(css); | ||
4578 | |||
4579 | /* | ||
4580 | * If @cgrp is marked dead, it's waiting for refs of all css's to | ||
4581 | * be disabled before proceeding to the second phase of cgroup | ||
4582 | * destruction. If we are the last one, kick it off. | ||
4583 | */ | ||
4584 | if (!cgrp->nr_css && cgroup_is_dead(cgrp)) | ||
4585 | cgroup_destroy_css_killed(cgrp); | ||
4586 | |||
4587 | mutex_unlock(&cgroup_mutex); | ||
4588 | |||
4589 | /* | ||
4590 | * Put the css refs from kill_css(). Each css holds an extra | ||
4591 | * reference to the cgroup's dentry and cgroup removal proceeds | ||
4592 | * regardless of css refs. On the last put of each css, whenever | ||
4593 | * that may be, the extra dentry ref is put so that dentry | ||
4594 | * destruction happens only after all css's are released. | ||
4595 | */ | ||
4596 | css_put(css); | ||
4443 | } | 4597 | } |
4444 | 4598 | ||
4445 | static void css_ref_killed_fn(struct percpu_ref *ref) | 4599 | /* css kill confirmation processing requires process context, bounce */ |
4600 | static void css_killed_ref_fn(struct percpu_ref *ref) | ||
4446 | { | 4601 | { |
4447 | struct cgroup_subsys_state *css = | 4602 | struct cgroup_subsys_state *css = |
4448 | container_of(ref, struct cgroup_subsys_state, refcnt); | 4603 | container_of(ref, struct cgroup_subsys_state, refcnt); |
4449 | 4604 | ||
4450 | cgroup_css_killed(css->cgroup); | 4605 | INIT_WORK(&css->destroy_work, css_killed_work_fn); |
4606 | schedule_work(&css->destroy_work); | ||
4607 | } | ||
4608 | |||
4609 | /** | ||
4610 | * kill_css - destroy a css | ||
4611 | * @css: css to destroy | ||
4612 | * | ||
4613 | * This function initiates destruction of @css by removing cgroup interface | ||
4614 | * files and putting its base reference. ->css_offline() will be invoked | ||
4615 | * asynchronously once css_tryget() is guaranteed to fail and when the | ||
4616 | * reference count reaches zero, @css will be released. | ||
4617 | */ | ||
4618 | static void kill_css(struct cgroup_subsys_state *css) | ||
4619 | { | ||
4620 | cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); | ||
4621 | |||
4622 | /* | ||
4623 | * Killing would put the base ref, but we need to keep it alive | ||
4624 | * until after ->css_offline(). | ||
4625 | */ | ||
4626 | css_get(css); | ||
4627 | |||
4628 | /* | ||
4629 | * cgroup core guarantees that, by the time ->css_offline() is | ||
4630 | * invoked, no new css reference will be given out via | ||
4631 | * css_tryget(). We can't simply call percpu_ref_kill() and | ||
4632 | * proceed to offlining css's because percpu_ref_kill() doesn't | ||
4633 | * guarantee that the ref is seen as killed on all CPUs on return. | ||
4634 | * | ||
4635 | * Use percpu_ref_kill_and_confirm() to get notifications as each | ||
4636 | * css is confirmed to be seen as killed on all CPUs. | ||
4637 | */ | ||
4638 | percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); | ||
4451 | } | 4639 | } |
4452 | 4640 | ||
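kill_css() above relies on percpu_ref_kill_and_confirm() because a plain kill does not guarantee the ref is seen as dead on every CPU before returning. A condensed sketch of the same pattern on a hypothetical refcounted object (struct widget and its helpers are illustrative):

#include <linux/percpu-refcount.h>
#include <linux/workqueue.h>

struct widget {
	struct percpu_ref ref;
	struct work_struct destroy_work;
};

static void widget_killed_work(struct work_struct *work)
{
	struct widget *w = container_of(work, struct widget, destroy_work);

	/* process context: tryget on w->ref is guaranteed to fail here,
	 * so offline-style teardown that may sleep can run */

	percpu_ref_put(&w->ref);	/* drop the ref taken in widget_destroy() */
}

static void widget_killed_ref(struct percpu_ref *ref)
{
	struct widget *w = container_of(ref, struct widget, ref);

	/* called in atomic context once the kill is visible on all CPUs;
	 * bounce to a workqueue, as css_killed_ref_fn() does */
	INIT_WORK(&w->destroy_work, widget_killed_work);
	schedule_work(&w->destroy_work);
}

static void widget_release(struct percpu_ref *ref)
{
	/* last reference dropped; free (via RCU in the cgroup code) */
}

static int widget_init(struct widget *w)
{
	return percpu_ref_init(&w->ref, widget_release);
}

static void widget_destroy(struct widget *w)
{
	/* keep @w alive until widget_killed_work() has run */
	percpu_ref_get(&w->ref);

	/* plain percpu_ref_kill() would not confirm visibility; the
	 * _and_confirm variant invokes widget_killed_ref() once no CPU
	 * can succeed a tryget anymore */
	percpu_ref_kill_and_confirm(&w->ref, widget_killed_ref);
}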
4453 | /** | 4641 | /** |
@@ -4513,41 +4701,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4513 | return -EBUSY; | 4701 | return -EBUSY; |
4514 | 4702 | ||
4515 | /* | 4703 | /* |
4516 | * Block new css_tryget() by killing css refcnts. cgroup core | 4704 | * Initiate massacre of all css's. cgroup_destroy_css_killed() |
4517 | * guarantees that, by the time ->css_offline() is invoked, no new | 4705 | * will be invoked to perform the rest of destruction once the |
4518 | * css reference will be given out via css_tryget(). We can't | 4706 | * percpu refs of all css's are confirmed to be killed. |
4519 | * simply call percpu_ref_kill() and proceed to offlining css's | ||
4520 | * because percpu_ref_kill() doesn't guarantee that the ref is seen | ||
4521 | * as killed on all CPUs on return. | ||
4522 | * | ||
4523 | * Use percpu_ref_kill_and_confirm() to get notifications as each | ||
4524 | * css is confirmed to be seen as killed on all CPUs. The | ||
4525 | * notification callback keeps track of the number of css's to be | ||
4526 | * killed and schedules cgroup_offline_fn() to perform the rest of | ||
4527 | * destruction once the percpu refs of all css's are confirmed to | ||
4528 | * be killed. | ||
4529 | */ | 4707 | */ |
4530 | atomic_set(&cgrp->css_kill_cnt, 1); | 4708 | for_each_root_subsys(cgrp->root, ss) |
4531 | for_each_root_subsys(cgrp->root, ss) { | 4709 | kill_css(cgroup_css(cgrp, ss)); |
4532 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | ||
4533 | |||
4534 | /* | ||
4535 | * Killing would put the base ref, but we need to keep it | ||
4536 | * alive until after ->css_offline. | ||
4537 | */ | ||
4538 | percpu_ref_get(&css->refcnt); | ||
4539 | |||
4540 | atomic_inc(&cgrp->css_kill_cnt); | ||
4541 | percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn); | ||
4542 | } | ||
4543 | cgroup_css_killed(cgrp); | ||
4544 | 4710 | ||
4545 | /* | 4711 | /* |
4546 | * Mark @cgrp dead. This prevents further task migration and child | 4712 | * Mark @cgrp dead. This prevents further task migration and child |
4547 | * creation by disabling cgroup_lock_live_group(). Note that | 4713 | * creation by disabling cgroup_lock_live_group(). Note that |
4548 | * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to | 4714 | * CGRP_DEAD assertion is depended upon by css_next_child() to |
4549 | * resume iteration after dropping RCU read lock. See | 4715 | * resume iteration after dropping RCU read lock. See |
4550 | * cgroup_next_sibling() for details. | 4716 | * css_next_child() for details. |
4551 | */ | 4717 | */ |
4552 | set_bit(CGRP_DEAD, &cgrp->flags); | 4718 | set_bit(CGRP_DEAD, &cgrp->flags); |
4553 | 4719 | ||
@@ -4558,9 +4724,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4558 | raw_spin_unlock(&release_list_lock); | 4724 | raw_spin_unlock(&release_list_lock); |
4559 | 4725 | ||
4560 | /* | 4726 | /* |
4561 | * Remove @cgrp directory. The removal puts the base ref but we | 4727 | * If @cgrp has css's attached, the second stage of cgroup |
4562 | * aren't quite done with @cgrp yet, so hold onto it. | 4728 | * destruction is kicked off from css_killed_work_fn() after the |
4729 | * refs of all attached css's are killed. If @cgrp doesn't have | ||
4730 | * any css, we kick it off here. | ||
4563 | */ | 4731 | */ |
4732 | if (!cgrp->nr_css) | ||
4733 | cgroup_destroy_css_killed(cgrp); | ||
4734 | |||
4735 | /* | ||
4736 | * Clear the base files and remove @cgrp directory. The removal | ||
4737 | * puts the base ref but we aren't quite done with @cgrp yet, so | ||
4738 | * hold onto it. | ||
4739 | */ | ||
4740 | cgroup_addrm_files(cgrp, cgroup_base_files, false); | ||
4564 | dget(d); | 4741 | dget(d); |
4565 | cgroup_d_remove_dir(d); | 4742 | cgroup_d_remove_dir(d); |
4566 | 4743 | ||
@@ -4580,50 +4757,36 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4580 | }; | 4757 | }; |
4581 | 4758 | ||
4582 | /** | 4759 | /** |
4583 | * cgroup_offline_fn - the second step of cgroup destruction | 4760 | * cgroup_destroy_css_killed - the second step of cgroup destruction |
4584 | * @work: cgroup->destroy_free_work | 4761 | * @cgrp: cgroup all of whose css's have been killed |
4585 | * | 4762 | * |
4586 | * This function is invoked from a work item for a cgroup which is being | 4763 | * This function is invoked from a work item for a cgroup which is being |
4587 | * destroyed after the percpu refcnts of all css's are guaranteed to be | 4764 | * destroyed after all css's are offlined and performs the rest of |
4588 | * seen as killed on all CPUs, and performs the rest of destruction. This | 4765 | * destruction. This is the second step of destruction described in the |
4589 | * is the second step of destruction described in the comment above | 4766 | * comment above cgroup_destroy_locked(). |
4590 | * cgroup_destroy_locked(). | ||
4591 | */ | 4767 | */ |
4592 | static void cgroup_offline_fn(struct work_struct *work) | 4768 | static void cgroup_destroy_css_killed(struct cgroup *cgrp) |
4593 | { | 4769 | { |
4594 | struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); | ||
4595 | struct cgroup *parent = cgrp->parent; | 4770 | struct cgroup *parent = cgrp->parent; |
4596 | struct dentry *d = cgrp->dentry; | 4771 | struct dentry *d = cgrp->dentry; |
4597 | struct cgroup_subsys *ss; | ||
4598 | 4772 | ||
4599 | mutex_lock(&cgroup_mutex); | 4773 | lockdep_assert_held(&cgroup_mutex); |
4600 | 4774 | ||
4601 | /* | 4775 | /* delete this cgroup from parent->children */ |
4602 | * css_tryget() is guaranteed to fail now. Tell subsystems to | 4776 | list_del_rcu(&cgrp->sibling); |
4603 | * initiate destruction. ||
4604 | */ | ||
4605 | for_each_root_subsys(cgrp->root, ss) | ||
4606 | offline_css(ss, cgrp); | ||
4607 | 4777 | ||
4608 | /* | 4778 | /* |
4609 | * Put the css refs from cgroup_destroy_locked(). Each css holds | 4779 | * We should remove the cgroup object from idr before its grace |
4610 | * an extra reference to the cgroup's dentry and cgroup removal | 4780 | * period starts, so we won't be looking up a cgroup while the |
4611 | * proceeds regardless of css refs. On the last put of each css, | 4781 | * cgroup is being freed. |
4612 | * whenever that may be, the extra dentry ref is put so that dentry | ||
4613 | * destruction happens only after all css's are released. | ||
4614 | */ | 4782 | */ |
4615 | for_each_root_subsys(cgrp->root, ss) | 4783 | idr_remove(&cgrp->root->cgroup_idr, cgrp->id); |
4616 | css_put(cgrp->subsys[ss->subsys_id]); | 4784 | cgrp->id = -1; |
4617 | |||
4618 | /* delete this cgroup from parent->children */ | ||
4619 | list_del_rcu(&cgrp->sibling); | ||
4620 | 4785 | ||
4621 | dput(d); | 4786 | dput(d); |
4622 | 4787 | ||
4623 | set_bit(CGRP_RELEASABLE, &parent->flags); | 4788 | set_bit(CGRP_RELEASABLE, &parent->flags); |
4624 | check_for_release(parent); | 4789 | check_for_release(parent); |
4625 | |||
4626 | mutex_unlock(&cgroup_mutex); | ||
4627 | } | 4790 | } |
4628 | 4791 | ||
4629 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | 4792 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) |
@@ -4646,6 +4809,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) | |||
4646 | * deregistration. | 4809 | * deregistration. |
4647 | */ | 4810 | */ |
4648 | if (ss->base_cftypes) { | 4811 | if (ss->base_cftypes) { |
4812 | struct cftype *cft; | ||
4813 | |||
4814 | for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) | ||
4815 | cft->ss = ss; | ||
4816 | |||
4649 | ss->base_cftset.cfts = ss->base_cftypes; | 4817 | ss->base_cftset.cfts = ss->base_cftypes; |
4650 | list_add_tail(&ss->base_cftset.node, &ss->cftsets); | 4818 | list_add_tail(&ss->base_cftset.node, &ss->cftsets); |
4651 | } | 4819 | } |
@@ -4665,10 +4833,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4665 | /* Create the top cgroup state for this subsystem */ | 4833 | /* Create the top cgroup state for this subsystem */ |
4666 | list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); | 4834 | list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); |
4667 | ss->root = &cgroup_dummy_root; | 4835 | ss->root = &cgroup_dummy_root; |
4668 | css = ss->css_alloc(cgroup_dummy_top); | 4836 | css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); |
4669 | /* We don't handle early failures gracefully */ | 4837 | /* We don't handle early failures gracefully */ |
4670 | BUG_ON(IS_ERR(css)); | 4838 | BUG_ON(IS_ERR(css)); |
4671 | init_cgroup_css(css, ss, cgroup_dummy_top); | 4839 | init_css(css, ss, cgroup_dummy_top); |
4672 | 4840 | ||
4673 | /* Update the init_css_set to contain a subsys | 4841 | /* Update the init_css_set to contain a subsys |
4674 | * pointer to this state - since the subsystem is | 4842 | * pointer to this state - since the subsystem is |
@@ -4683,7 +4851,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4683 | * need to invoke fork callbacks here. */ | 4851 | * need to invoke fork callbacks here. */ |
4684 | BUG_ON(!list_empty(&init_task.tasks)); | 4852 | BUG_ON(!list_empty(&init_task.tasks)); |
4685 | 4853 | ||
4686 | BUG_ON(online_css(ss, cgroup_dummy_top)); | 4854 | BUG_ON(online_css(css)); |
4687 | 4855 | ||
4688 | mutex_unlock(&cgroup_mutex); | 4856 | mutex_unlock(&cgroup_mutex); |
4689 | 4857 | ||
@@ -4744,7 +4912,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4744 | * struct, so this can happen first (i.e. before the dummy root | 4912 | * struct, so this can happen first (i.e. before the dummy root |
4745 | * attachment). | 4913 | * attachment). |
4746 | */ | 4914 | */ |
4747 | css = ss->css_alloc(cgroup_dummy_top); | 4915 | css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); |
4748 | if (IS_ERR(css)) { | 4916 | if (IS_ERR(css)) { |
4749 | /* failure case - need to deassign the cgroup_subsys[] slot. */ | 4917 | /* failure case - need to deassign the cgroup_subsys[] slot. */ |
4750 | cgroup_subsys[ss->subsys_id] = NULL; | 4918 | cgroup_subsys[ss->subsys_id] = NULL; |
@@ -4756,8 +4924,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4756 | ss->root = &cgroup_dummy_root; | 4924 | ss->root = &cgroup_dummy_root; |
4757 | 4925 | ||
4758 | /* our new subsystem will be attached to the dummy hierarchy. */ | 4926 | /* our new subsystem will be attached to the dummy hierarchy. */ |
4759 | init_cgroup_css(css, ss, cgroup_dummy_top); | 4927 | init_css(css, ss, cgroup_dummy_top); |
4760 | /* init_idr must be after init_cgroup_css because it sets css->id. */ | 4928 | /* init_idr must be after init_css() because it sets css->id. */ |
4761 | if (ss->use_id) { | 4929 | if (ss->use_id) { |
4762 | ret = cgroup_init_idr(ss, css); | 4930 | ret = cgroup_init_idr(ss, css); |
4763 | if (ret) | 4931 | if (ret) |
@@ -4787,7 +4955,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4787 | } | 4955 | } |
4788 | write_unlock(&css_set_lock); | 4956 | write_unlock(&css_set_lock); |
4789 | 4957 | ||
4790 | ret = online_css(ss, cgroup_dummy_top); | 4958 | ret = online_css(css); |
4791 | if (ret) | 4959 | if (ret) |
4792 | goto err_unload; | 4960 | goto err_unload; |
4793 | 4961 | ||
@@ -4819,14 +4987,14 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4819 | 4987 | ||
4820 | /* | 4988 | /* |
4821 | * we shouldn't be called if the subsystem is in use, and the use of | 4989 | * we shouldn't be called if the subsystem is in use, and the use of |
4822 | * try_module_get in parse_cgroupfs_options should ensure that it | 4990 | * try_module_get() in rebind_subsystems() should ensure that it |
4823 | * doesn't start being used while we're killing it off. | 4991 | * doesn't start being used while we're killing it off. |
4824 | */ | 4992 | */ |
4825 | BUG_ON(ss->root != &cgroup_dummy_root); | 4993 | BUG_ON(ss->root != &cgroup_dummy_root); |
4826 | 4994 | ||
4827 | mutex_lock(&cgroup_mutex); | 4995 | mutex_lock(&cgroup_mutex); |
4828 | 4996 | ||
4829 | offline_css(ss, cgroup_dummy_top); | 4997 | offline_css(cgroup_css(cgroup_dummy_top, ss)); |
4830 | 4998 | ||
4831 | if (ss->use_id) | 4999 | if (ss->use_id) |
4832 | idr_destroy(&ss->idr); | 5000 | idr_destroy(&ss->idr); |
@@ -4860,8 +5028,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4860 | * the cgrp->subsys pointer to find their state. note that this | 5028 | * the cgrp->subsys pointer to find their state. note that this |
4861 | * also takes care of freeing the css_id. | 5029 | * also takes care of freeing the css_id. |
4862 | */ | 5030 | */ |
4863 | ss->css_free(cgroup_dummy_top); | 5031 | ss->css_free(cgroup_css(cgroup_dummy_top, ss)); |
4864 | cgroup_dummy_top->subsys[ss->subsys_id] = NULL; | 5032 | RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); |
4865 | 5033 | ||
4866 | mutex_unlock(&cgroup_mutex); | 5034 | mutex_unlock(&cgroup_mutex); |
4867 | } | 5035 | } |
@@ -4943,6 +5111,10 @@ int __init cgroup_init(void) | |||
4943 | 5111 | ||
4944 | BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); | 5112 | BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); |
4945 | 5113 | ||
5114 | err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top, | ||
5115 | 0, 1, GFP_KERNEL); | ||
5116 | BUG_ON(err < 0); | ||
5117 | |||
4946 | mutex_unlock(&cgroup_root_mutex); | 5118 | mutex_unlock(&cgroup_root_mutex); |
4947 | mutex_unlock(&cgroup_mutex); | 5119 | mutex_unlock(&cgroup_mutex); |
4948 | 5120 | ||
@@ -5099,7 +5271,7 @@ void cgroup_fork(struct task_struct *child) | |||
5099 | * Adds the task to the list running through its css_set if necessary and | 5271 | * Adds the task to the list running through its css_set if necessary and |
5100 | * call the subsystem fork() callbacks. Has to be after the task is | 5272 | * call the subsystem fork() callbacks. Has to be after the task is |
5101 | * visible on the task list in case we race with the first call to | 5273 | * visible on the task list in case we race with the first call to |
5102 | * cgroup_iter_start() - to guarantee that the new task ends up on its | 5274 | * cgroup_task_iter_start() - to guarantee that the new task ends up on its |
5103 | * list. | 5275 | * list. |
5104 | */ | 5276 | */ |
5105 | void cgroup_post_fork(struct task_struct *child) | 5277 | void cgroup_post_fork(struct task_struct *child) |
@@ -5212,10 +5384,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
5212 | */ | 5384 | */ |
5213 | for_each_builtin_subsys(ss, i) { | 5385 | for_each_builtin_subsys(ss, i) { |
5214 | if (ss->exit) { | 5386 | if (ss->exit) { |
5215 | struct cgroup *old_cgrp = cset->subsys[i]->cgroup; | 5387 | struct cgroup_subsys_state *old_css = cset->subsys[i]; |
5216 | struct cgroup *cgrp = task_cgroup(tsk, i); | 5388 | struct cgroup_subsys_state *css = task_css(tsk, i); |
5217 | 5389 | ||
5218 | ss->exit(cgrp, old_cgrp, tsk); | 5390 | ss->exit(css, old_css, tsk); |
5219 | } | 5391 | } |
5220 | } | 5392 | } |
5221 | } | 5393 | } |
@@ -5474,20 +5646,16 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, | |||
5474 | return 0; | 5646 | return 0; |
5475 | } | 5647 | } |
5476 | 5648 | ||
5477 | static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, | 5649 | static int alloc_css_id(struct cgroup_subsys_state *child_css) |
5478 | struct cgroup *child) | ||
5479 | { | 5650 | { |
5480 | int subsys_id, i, depth = 0; | 5651 | struct cgroup_subsys_state *parent_css = css_parent(child_css); |
5481 | struct cgroup_subsys_state *parent_css, *child_css; | ||
5482 | struct css_id *child_id, *parent_id; | 5652 | struct css_id *child_id, *parent_id; |
5653 | int i, depth; | ||
5483 | 5654 | ||
5484 | subsys_id = ss->subsys_id; | ||
5485 | parent_css = parent->subsys[subsys_id]; | ||
5486 | child_css = child->subsys[subsys_id]; | ||
5487 | parent_id = rcu_dereference_protected(parent_css->id, true); | 5655 | parent_id = rcu_dereference_protected(parent_css->id, true); |
5488 | depth = parent_id->depth + 1; | 5656 | depth = parent_id->depth + 1; |
5489 | 5657 | ||
5490 | child_id = get_new_cssid(ss, depth); | 5658 | child_id = get_new_cssid(child_css->ss, depth); |
5491 | if (IS_ERR(child_id)) | 5659 | if (IS_ERR(child_id)) |
5492 | return PTR_ERR(child_id); | 5660 | return PTR_ERR(child_id); |
5493 | 5661 | ||
@@ -5525,31 +5693,56 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) | |||
5525 | } | 5693 | } |
5526 | EXPORT_SYMBOL_GPL(css_lookup); | 5694 | EXPORT_SYMBOL_GPL(css_lookup); |
5527 | 5695 | ||
5528 | /* | 5696 | /** |
5529 | * get corresponding css from file open on cgroupfs directory | 5697 | * css_from_dir - get corresponding css from the dentry of a cgroup dir |
5698 | * @dentry: directory dentry of interest | ||
5699 | * @ss: subsystem of interest | ||
5700 | * | ||
5701 | * Must be called under RCU read lock. The caller is responsible for | ||
5702 | * pinning the returned css if it needs to be accessed outside the RCU | ||
5703 | * critical section. | ||
5530 | */ | 5704 | */ |
5531 | struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | 5705 | struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, |
5706 | struct cgroup_subsys *ss) | ||
5532 | { | 5707 | { |
5533 | struct cgroup *cgrp; | 5708 | struct cgroup *cgrp; |
5534 | struct inode *inode; | ||
5535 | struct cgroup_subsys_state *css; | ||
5536 | 5709 | ||
5537 | inode = file_inode(f); | 5710 | WARN_ON_ONCE(!rcu_read_lock_held()); |
5538 | /* check in cgroup filesystem dir */ | 5711 | |
5539 | if (inode->i_op != &cgroup_dir_inode_operations) | 5712 | /* is @dentry a cgroup dir? */ |
5713 | if (!dentry->d_inode || | ||
5714 | dentry->d_inode->i_op != &cgroup_dir_inode_operations) | ||
5540 | return ERR_PTR(-EBADF); | 5715 | return ERR_PTR(-EBADF); |
5541 | 5716 | ||
5542 | if (id < 0 || id >= CGROUP_SUBSYS_COUNT) | 5717 | cgrp = __d_cgrp(dentry); |
5543 | return ERR_PTR(-EINVAL); | 5718 | return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); |
5719 | } | ||
5544 | 5720 | ||
5545 | /* get cgroup */ | 5721 | /** |
5546 | cgrp = __d_cgrp(f->f_dentry); | 5722 | * css_from_id - lookup css by id |
5547 | css = cgrp->subsys[id]; | 5723 | * @id: the cgroup id |
5548 | return css ? css : ERR_PTR(-ENOENT); | 5724 | * @ss: cgroup subsys to be looked into |
5725 | * | ||
5726 | * Returns the css if there's valid one with @id, otherwise returns NULL. | ||
5727 | * Should be called under rcu_read_lock(). | ||
5728 | */ | ||
5729 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) | ||
5730 | { | ||
5731 | struct cgroup *cgrp; | ||
5732 | |||
5733 | rcu_lockdep_assert(rcu_read_lock_held() || | ||
5734 | lockdep_is_held(&cgroup_mutex), | ||
5735 | "css_from_id() needs proper protection"); | ||
5736 | |||
5737 | cgrp = idr_find(&ss->root->cgroup_idr, id); | ||
5738 | if (cgrp) | ||
5739 | return cgroup_css(cgrp, ss); | ||
5740 | return NULL; | ||
5549 | } | 5741 | } |
5550 | 5742 | ||
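Both lookup helpers above only promise that the returned css exists for the duration of the RCU read section; a caller that wants to use it afterwards has to pin it with css_tryget() before unlocking. A small usage sketch (the wrapper name is illustrative):

/* Look up a css by id and pin it for use outside the RCU section. */
static struct cgroup_subsys_state *pin_css_by_id(int id,
						 struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = css_from_id(id, ss);
	if (css && !css_tryget(css))	/* the css may already be dying */
		css = NULL;
	rcu_read_unlock();

	return css;	/* a non-NULL result must be released with css_put() */
}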
5551 | #ifdef CONFIG_CGROUP_DEBUG | 5743 | #ifdef CONFIG_CGROUP_DEBUG |
5552 | static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) | 5744 | static struct cgroup_subsys_state * |
5745 | debug_css_alloc(struct cgroup_subsys_state *parent_css) | ||
5553 | { | 5746 | { |
5554 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | 5747 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); |
5555 | 5748 | ||
@@ -5559,22 +5752,24 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) | |||
5559 | return css; | 5752 | return css; |
5560 | } | 5753 | } |
5561 | 5754 | ||
5562 | static void debug_css_free(struct cgroup *cgrp) | 5755 | static void debug_css_free(struct cgroup_subsys_state *css) |
5563 | { | 5756 | { |
5564 | kfree(cgrp->subsys[debug_subsys_id]); | 5757 | kfree(css); |
5565 | } | 5758 | } |
5566 | 5759 | ||
5567 | static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) | 5760 | static u64 debug_taskcount_read(struct cgroup_subsys_state *css, |
5761 | struct cftype *cft) | ||
5568 | { | 5762 | { |
5569 | return cgroup_task_count(cgrp); | 5763 | return cgroup_task_count(css->cgroup); |
5570 | } | 5764 | } |
5571 | 5765 | ||
5572 | static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) | 5766 | static u64 current_css_set_read(struct cgroup_subsys_state *css, |
5767 | struct cftype *cft) | ||
5573 | { | 5768 | { |
5574 | return (u64)(unsigned long)current->cgroups; | 5769 | return (u64)(unsigned long)current->cgroups; |
5575 | } | 5770 | } |
5576 | 5771 | ||
5577 | static u64 current_css_set_refcount_read(struct cgroup *cgrp, | 5772 | static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, |
5578 | struct cftype *cft) | 5773 | struct cftype *cft) |
5579 | { | 5774 | { |
5580 | u64 count; | 5775 | u64 count; |
@@ -5585,7 +5780,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp, | |||
5585 | return count; | 5780 | return count; |
5586 | } | 5781 | } |
5587 | 5782 | ||
5588 | static int current_css_set_cg_links_read(struct cgroup *cgrp, | 5783 | static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, |
5589 | struct cftype *cft, | 5784 | struct cftype *cft, |
5590 | struct seq_file *seq) | 5785 | struct seq_file *seq) |
5591 | { | 5786 | { |
@@ -5612,14 +5807,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp, | |||
5612 | } | 5807 | } |
5613 | 5808 | ||
5614 | #define MAX_TASKS_SHOWN_PER_CSS 25 | 5809 | #define MAX_TASKS_SHOWN_PER_CSS 25 |
5615 | static int cgroup_css_links_read(struct cgroup *cgrp, | 5810 | static int cgroup_css_links_read(struct cgroup_subsys_state *css, |
5616 | struct cftype *cft, | 5811 | struct cftype *cft, struct seq_file *seq) |
5617 | struct seq_file *seq) | ||
5618 | { | 5812 | { |
5619 | struct cgrp_cset_link *link; | 5813 | struct cgrp_cset_link *link; |
5620 | 5814 | ||
5621 | read_lock(&css_set_lock); | 5815 | read_lock(&css_set_lock); |
5622 | list_for_each_entry(link, &cgrp->cset_links, cset_link) { | 5816 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { |
5623 | struct css_set *cset = link->cset; | 5817 | struct css_set *cset = link->cset; |
5624 | struct task_struct *task; | 5818 | struct task_struct *task; |
5625 | int count = 0; | 5819 | int count = 0; |
@@ -5638,9 +5832,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp, | |||
5638 | return 0; | 5832 | return 0; |
5639 | } | 5833 | } |
5640 | 5834 | ||
5641 | static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) | 5835 | static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) |
5642 | { | 5836 | { |
5643 | return test_bit(CGRP_RELEASABLE, &cgrp->flags); | 5837 | return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); |
5644 | } | 5838 | } |
5645 | 5839 | ||
5646 | static struct cftype debug_files[] = { | 5840 | static struct cftype debug_files[] = { |
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 75dda1ea5026..f0ff64d0ebaa 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -45,25 +45,19 @@ struct freezer { | |||
45 | spinlock_t lock; | 45 | spinlock_t lock; |
46 | }; | 46 | }; |
47 | 47 | ||
48 | static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) | 48 | static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) |
49 | { | 49 | { |
50 | return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), | 50 | return css ? container_of(css, struct freezer, css) : NULL; |
51 | struct freezer, css); | ||
52 | } | 51 | } |
53 | 52 | ||
54 | static inline struct freezer *task_freezer(struct task_struct *task) | 53 | static inline struct freezer *task_freezer(struct task_struct *task) |
55 | { | 54 | { |
56 | return container_of(task_subsys_state(task, freezer_subsys_id), | 55 | return css_freezer(task_css(task, freezer_subsys_id)); |
57 | struct freezer, css); | ||
58 | } | 56 | } |
59 | 57 | ||
60 | static struct freezer *parent_freezer(struct freezer *freezer) | 58 | static struct freezer *parent_freezer(struct freezer *freezer) |
61 | { | 59 | { |
62 | struct cgroup *pcg = freezer->css.cgroup->parent; | 60 | return css_freezer(css_parent(&freezer->css)); |
63 | |||
64 | if (pcg) | ||
65 | return cgroup_freezer(pcg); | ||
66 | return NULL; | ||
67 | } | 61 | } |
68 | 62 | ||
69 | bool cgroup_freezing(struct task_struct *task) | 63 | bool cgroup_freezing(struct task_struct *task) |
@@ -92,7 +86,8 @@ static const char *freezer_state_strs(unsigned int state) | |||
92 | 86 | ||
93 | struct cgroup_subsys freezer_subsys; | 87 | struct cgroup_subsys freezer_subsys; |
94 | 88 | ||
95 | static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) | 89 | static struct cgroup_subsys_state * |
90 | freezer_css_alloc(struct cgroup_subsys_state *parent_css) | ||
96 | { | 91 | { |
97 | struct freezer *freezer; | 92 | struct freezer *freezer; |
98 | 93 | ||
@@ -105,22 +100,22 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) | |||
105 | } | 100 | } |
106 | 101 | ||
107 | /** | 102 | /** |
108 | * freezer_css_online - commit creation of a freezer cgroup | 103 | * freezer_css_online - commit creation of a freezer css |
109 | * @cgroup: cgroup being created | 104 | * @css: css being created |
110 | * | 105 | * |
111 | * We're committing to creation of @cgroup. Mark it online and inherit | 106 | * We're committing to creation of @css. Mark it online and inherit |
112 | * parent's freezing state while holding both parent's and our | 107 | * parent's freezing state while holding both parent's and our |
113 | * freezer->lock. | 108 | * freezer->lock. |
114 | */ | 109 | */ |
115 | static int freezer_css_online(struct cgroup *cgroup) | 110 | static int freezer_css_online(struct cgroup_subsys_state *css) |
116 | { | 111 | { |
117 | struct freezer *freezer = cgroup_freezer(cgroup); | 112 | struct freezer *freezer = css_freezer(css); |
118 | struct freezer *parent = parent_freezer(freezer); | 113 | struct freezer *parent = parent_freezer(freezer); |
119 | 114 | ||
120 | /* | 115 | /* |
121 | * The following double locking and freezing state inheritance | 116 | * The following double locking and freezing state inheritance |
122 | * guarantee that @cgroup can never escape ancestors' freezing | 117 | * guarantee that @cgroup can never escape ancestors' freezing |
123 | * states. See cgroup_for_each_descendant_pre() for details. | 118 | * states. See css_for_each_descendant_pre() for details. |
124 | */ | 119 | */ |
125 | if (parent) | 120 | if (parent) |
126 | spin_lock_irq(&parent->lock); | 121 | spin_lock_irq(&parent->lock); |
@@ -141,15 +136,15 @@ static int freezer_css_online(struct cgroup *cgroup) | |||
141 | } | 136 | } |
142 | 137 | ||
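The inheritance above works because the parent's lock is taken around the child's, so the child's initial state is copied atomically with respect to any concurrent freeze or thaw of the parent. A sketch of that locking shape with illustrative names:

#include <linux/spinlock.h>

struct node {				/* illustrative stand-in */
	struct node *parent;
	spinlock_t lock;
	bool online;
	bool freezing;
};

static void node_online_and_inherit(struct node *n)
{
	struct node *parent = n->parent;

	if (parent) {
		/* outer lock first; its _irq form covers the nested lock too */
		spin_lock_irq(&parent->lock);
		spin_lock_nested(&n->lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock_irq(&n->lock);
	}

	n->online = true;
	if (parent && parent->freezing)
		n->freezing = true;	/* inherited while both locks are held */

	if (parent) {
		spin_unlock(&n->lock);
		spin_unlock_irq(&parent->lock);
	} else {
		spin_unlock_irq(&n->lock);
	}
}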
143 | /** | 138 | /** |
144 | * freezer_css_offline - initiate destruction of @cgroup | 139 | * freezer_css_offline - initiate destruction of a freezer css |
145 | * @cgroup: cgroup being destroyed | 140 | * @css: css being destroyed |
146 | * | 141 | * |
147 | * @cgroup is going away. Mark it dead and decrement system_freezing_count | 142 | * @css is going away. Mark it dead and decrement system_freezing_count if |
148 | * if it was holding one. | 143 | * it was holding one. |
149 | */ | 144 | */ |
150 | static void freezer_css_offline(struct cgroup *cgroup) | 145 | static void freezer_css_offline(struct cgroup_subsys_state *css) |
151 | { | 146 | { |
152 | struct freezer *freezer = cgroup_freezer(cgroup); | 147 | struct freezer *freezer = css_freezer(css); |
153 | 148 | ||
154 | spin_lock_irq(&freezer->lock); | 149 | spin_lock_irq(&freezer->lock); |
155 | 150 | ||
@@ -161,9 +156,9 @@ static void freezer_css_offline(struct cgroup *cgroup) | |||
161 | spin_unlock_irq(&freezer->lock); | 156 | spin_unlock_irq(&freezer->lock); |
162 | } | 157 | } |
163 | 158 | ||
164 | static void freezer_css_free(struct cgroup *cgroup) | 159 | static void freezer_css_free(struct cgroup_subsys_state *css) |
165 | { | 160 | { |
166 | kfree(cgroup_freezer(cgroup)); | 161 | kfree(css_freezer(css)); |
167 | } | 162 | } |
168 | 163 | ||
169 | /* | 164 | /* |
@@ -175,25 +170,26 @@ static void freezer_css_free(struct cgroup *cgroup) | |||
175 | * @freezer->lock. freezer_attach() makes the new tasks conform to the | 170 | * @freezer->lock. freezer_attach() makes the new tasks conform to the |
176 | * current state and all following state changes can see the new tasks. | 171 | * current state and all following state changes can see the new tasks. |
177 | */ | 172 | */ |
178 | static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) | 173 | static void freezer_attach(struct cgroup_subsys_state *new_css, |
174 | struct cgroup_taskset *tset) | ||
179 | { | 175 | { |
180 | struct freezer *freezer = cgroup_freezer(new_cgrp); | 176 | struct freezer *freezer = css_freezer(new_css); |
181 | struct task_struct *task; | 177 | struct task_struct *task; |
182 | bool clear_frozen = false; | 178 | bool clear_frozen = false; |
183 | 179 | ||
184 | spin_lock_irq(&freezer->lock); | 180 | spin_lock_irq(&freezer->lock); |
185 | 181 | ||
186 | /* | 182 | /* |
187 | * Make the new tasks conform to the current state of @new_cgrp. | 183 | * Make the new tasks conform to the current state of @new_css. |
188 | * For simplicity, when migrating any task to a FROZEN cgroup, we | 184 | * For simplicity, when migrating any task to a FROZEN cgroup, we |
189 | * revert it to FREEZING and let update_if_frozen() determine the | 185 | * revert it to FREEZING and let update_if_frozen() determine the |
190 | * correct state later. | 186 | * correct state later. |
191 | * | 187 | * |
192 | * Tasks in @tset are on @new_cgrp but may not conform to its | 188 | * Tasks in @tset are on @new_css but may not conform to its |
193 | * current state before executing the following - !frozen tasks may | 189 | * current state before executing the following - !frozen tasks may |
194 | * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. | 190 | * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. |
195 | */ | 191 | */ |
196 | cgroup_taskset_for_each(task, new_cgrp, tset) { | 192 | cgroup_taskset_for_each(task, new_css, tset) { |
197 | if (!(freezer->state & CGROUP_FREEZING)) { | 193 | if (!(freezer->state & CGROUP_FREEZING)) { |
198 | __thaw_task(task); | 194 | __thaw_task(task); |
199 | } else { | 195 | } else { |
@@ -231,7 +227,7 @@ static void freezer_fork(struct task_struct *task) | |||
231 | * The root cgroup is non-freezable, so we can skip the | 227 | * The root cgroup is non-freezable, so we can skip the |
232 | * following check. | 228 | * following check. |
233 | */ | 229 | */ |
234 | if (!freezer->css.cgroup->parent) | 230 | if (!parent_freezer(freezer)) |
235 | goto out; | 231 | goto out; |
236 | 232 | ||
237 | spin_lock_irq(&freezer->lock); | 233 | spin_lock_irq(&freezer->lock); |
@@ -244,7 +240,7 @@ out: | |||
244 | 240 | ||
245 | /** | 241 | /** |
246 | * update_if_frozen - update whether a cgroup finished freezing | 242 | * update_if_frozen - update whether a cgroup finished freezing |
247 | * @cgroup: cgroup of interest | 243 | * @css: css of interest |
248 | * | 244 | * |
249 | * Once FREEZING is initiated, transition to FROZEN is lazily updated by | 245 | * Once FREEZING is initiated, transition to FROZEN is lazily updated by |
250 | * calling this function. If the current state is FREEZING but not FROZEN, | 246 | * calling this function. If the current state is FREEZING but not FROZEN, |
@@ -255,14 +251,14 @@ out: | |||
255 | * update_if_frozen() on all descendants prior to invoking this function. | 251 | * update_if_frozen() on all descendants prior to invoking this function. |
256 | * | 252 | * |
257 | * Task states and freezer state might disagree while tasks are being | 253 | * Task states and freezer state might disagree while tasks are being |
258 | * migrated into or out of @cgroup, so we can't verify task states against | 254 | * migrated into or out of @css, so we can't verify task states against |
259 | * @freezer state here. See freezer_attach() for details. | 255 | * @freezer state here. See freezer_attach() for details. |
260 | */ | 256 | */ |
261 | static void update_if_frozen(struct cgroup *cgroup) | 257 | static void update_if_frozen(struct cgroup_subsys_state *css) |
262 | { | 258 | { |
263 | struct freezer *freezer = cgroup_freezer(cgroup); | 259 | struct freezer *freezer = css_freezer(css); |
264 | struct cgroup *pos; | 260 | struct cgroup_subsys_state *pos; |
265 | struct cgroup_iter it; | 261 | struct css_task_iter it; |
266 | struct task_struct *task; | 262 | struct task_struct *task; |
267 | 263 | ||
268 | WARN_ON_ONCE(!rcu_read_lock_held()); | 264 | WARN_ON_ONCE(!rcu_read_lock_held()); |
@@ -274,8 +270,8 @@ static void update_if_frozen(struct cgroup *cgroup) | |||
274 | goto out_unlock; | 270 | goto out_unlock; |
275 | 271 | ||
276 | /* are all (live) children frozen? */ | 272 | /* are all (live) children frozen? */ |
277 | cgroup_for_each_child(pos, cgroup) { | 273 | css_for_each_child(pos, css) { |
278 | struct freezer *child = cgroup_freezer(pos); | 274 | struct freezer *child = css_freezer(pos); |
279 | 275 | ||
280 | if ((child->state & CGROUP_FREEZER_ONLINE) && | 276 | if ((child->state & CGROUP_FREEZER_ONLINE) && |
281 | !(child->state & CGROUP_FROZEN)) | 277 | !(child->state & CGROUP_FROZEN)) |
@@ -283,9 +279,9 @@ static void update_if_frozen(struct cgroup *cgroup) | |||
283 | } | 279 | } |
284 | 280 | ||
285 | /* are all tasks frozen? */ | 281 | /* are all tasks frozen? */ |
286 | cgroup_iter_start(cgroup, &it); | 282 | css_task_iter_start(css, &it); |
287 | 283 | ||
288 | while ((task = cgroup_iter_next(cgroup, &it))) { | 284 | while ((task = css_task_iter_next(&it))) { |
289 | if (freezing(task)) { | 285 | if (freezing(task)) { |
290 | /* | 286 | /* |
291 | * freezer_should_skip() indicates that the task | 287 | * freezer_should_skip() indicates that the task |
@@ -300,52 +296,49 @@ static void update_if_frozen(struct cgroup *cgroup) | |||
300 | 296 | ||
301 | freezer->state |= CGROUP_FROZEN; | 297 | freezer->state |= CGROUP_FROZEN; |
302 | out_iter_end: | 298 | out_iter_end: |
303 | cgroup_iter_end(cgroup, &it); | 299 | css_task_iter_end(&it); |
304 | out_unlock: | 300 | out_unlock: |
305 | spin_unlock_irq(&freezer->lock); | 301 | spin_unlock_irq(&freezer->lock); |
306 | } | 302 | } |
307 | 303 | ||
308 | static int freezer_read(struct cgroup *cgroup, struct cftype *cft, | 304 | static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, |
309 | struct seq_file *m) | 305 | struct seq_file *m) |
310 | { | 306 | { |
311 | struct cgroup *pos; | 307 | struct cgroup_subsys_state *pos; |
312 | 308 | ||
313 | rcu_read_lock(); | 309 | rcu_read_lock(); |
314 | 310 | ||
315 | /* update states bottom-up */ | 311 | /* update states bottom-up */ |
316 | cgroup_for_each_descendant_post(pos, cgroup) | 312 | css_for_each_descendant_post(pos, css) |
317 | update_if_frozen(pos); | 313 | update_if_frozen(pos); |
318 | update_if_frozen(cgroup); | ||
319 | 314 | ||
320 | rcu_read_unlock(); | 315 | rcu_read_unlock(); |
321 | 316 | ||
322 | seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); | 317 | seq_puts(m, freezer_state_strs(css_freezer(css)->state)); |
323 | seq_putc(m, '\n'); | 318 | seq_putc(m, '\n'); |
324 | return 0; | 319 | return 0; |
325 | } | 320 | } |
326 | 321 | ||
327 | static void freeze_cgroup(struct freezer *freezer) | 322 | static void freeze_cgroup(struct freezer *freezer) |
328 | { | 323 | { |
329 | struct cgroup *cgroup = freezer->css.cgroup; | 324 | struct css_task_iter it; |
330 | struct cgroup_iter it; | ||
331 | struct task_struct *task; | 325 | struct task_struct *task; |
332 | 326 | ||
333 | cgroup_iter_start(cgroup, &it); | 327 | css_task_iter_start(&freezer->css, &it); |
334 | while ((task = cgroup_iter_next(cgroup, &it))) | 328 | while ((task = css_task_iter_next(&it))) |
335 | freeze_task(task); | 329 | freeze_task(task); |
336 | cgroup_iter_end(cgroup, &it); | 330 | css_task_iter_end(&it); |
337 | } | 331 | } |
338 | 332 | ||
339 | static void unfreeze_cgroup(struct freezer *freezer) | 333 | static void unfreeze_cgroup(struct freezer *freezer) |
340 | { | 334 | { |
341 | struct cgroup *cgroup = freezer->css.cgroup; | 335 | struct css_task_iter it; |
342 | struct cgroup_iter it; | ||
343 | struct task_struct *task; | 336 | struct task_struct *task; |
344 | 337 | ||
345 | cgroup_iter_start(cgroup, &it); | 338 | css_task_iter_start(&freezer->css, &it); |
346 | while ((task = cgroup_iter_next(cgroup, &it))) | 339 | while ((task = css_task_iter_next(&it))) |
347 | __thaw_task(task); | 340 | __thaw_task(task); |
348 | cgroup_iter_end(cgroup, &it); | 341 | css_task_iter_end(&it); |
349 | } | 342 | } |
350 | 343 | ||
351 | /** | 344 | /** |
@@ -395,12 +388,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, | |||
395 | */ | 388 | */ |
396 | static void freezer_change_state(struct freezer *freezer, bool freeze) | 389 | static void freezer_change_state(struct freezer *freezer, bool freeze) |
397 | { | 390 | { |
398 | struct cgroup *pos; | 391 | struct cgroup_subsys_state *pos; |
399 | |||
400 | /* update @freezer */ | ||
401 | spin_lock_irq(&freezer->lock); | ||
402 | freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); | ||
403 | spin_unlock_irq(&freezer->lock); | ||
404 | 392 | ||
405 | /* | 393 | /* |
406 | * Update all its descendants in pre-order traversal. Each | 394 | * Update all its descendants in pre-order traversal. Each |
@@ -408,24 +396,33 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) | |||
408 | * CGROUP_FREEZING_PARENT. | 396 | * CGROUP_FREEZING_PARENT. |
409 | */ | 397 | */ |
410 | rcu_read_lock(); | 398 | rcu_read_lock(); |
411 | cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { | 399 | css_for_each_descendant_pre(pos, &freezer->css) { |
412 | struct freezer *pos_f = cgroup_freezer(pos); | 400 | struct freezer *pos_f = css_freezer(pos); |
413 | struct freezer *parent = parent_freezer(pos_f); | 401 | struct freezer *parent = parent_freezer(pos_f); |
414 | 402 | ||
415 | /* | ||
416 | * Our update to @parent->state is already visible which is | ||
417 | * all we need. No need to lock @parent. For more info on | ||
418 | * synchronization, see freezer_post_create(). | ||
419 | */ | ||
420 | spin_lock_irq(&pos_f->lock); | 403 | spin_lock_irq(&pos_f->lock); |
421 | freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, | 404 | |
422 | CGROUP_FREEZING_PARENT); | 405 | if (pos_f == freezer) { |
406 | freezer_apply_state(pos_f, freeze, | ||
407 | CGROUP_FREEZING_SELF); | ||
408 | } else { | ||
409 | /* | ||
410 | * Our update to @parent->state is already visible | ||
411 | * which is all we need. No need to lock @parent. | ||
412 | * For more info on synchronization, see | ||
413 | * freezer_post_create(). | ||
414 | */ | ||
415 | freezer_apply_state(pos_f, | ||
416 | parent->state & CGROUP_FREEZING, | ||
417 | CGROUP_FREEZING_PARENT); | ||
418 | } | ||
419 | |||
423 | spin_unlock_irq(&pos_f->lock); | 420 | spin_unlock_irq(&pos_f->lock); |
424 | } | 421 | } |
425 | rcu_read_unlock(); | 422 | rcu_read_unlock(); |
426 | } | 423 | } |
427 | 424 | ||
428 | static int freezer_write(struct cgroup *cgroup, struct cftype *cft, | 425 | static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, |
429 | const char *buffer) | 426 | const char *buffer) |
430 | { | 427 | { |
431 | bool freeze; | 428 | bool freeze; |
@@ -437,20 +434,22 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft, | |||
437 | else | 434 | else |
438 | return -EINVAL; | 435 | return -EINVAL; |
439 | 436 | ||
440 | freezer_change_state(cgroup_freezer(cgroup), freeze); | 437 | freezer_change_state(css_freezer(css), freeze); |
441 | return 0; | 438 | return 0; |
442 | } | 439 | } |
443 | 440 | ||
444 | static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) | 441 | static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, |
442 | struct cftype *cft) | ||
445 | { | 443 | { |
446 | struct freezer *freezer = cgroup_freezer(cgroup); | 444 | struct freezer *freezer = css_freezer(css); |
447 | 445 | ||
448 | return (bool)(freezer->state & CGROUP_FREEZING_SELF); | 446 | return (bool)(freezer->state & CGROUP_FREEZING_SELF); |
449 | } | 447 | } |
450 | 448 | ||
451 | static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) | 449 | static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, |
450 | struct cftype *cft) | ||
452 | { | 451 | { |
453 | struct freezer *freezer = cgroup_freezer(cgroup); | 452 | struct freezer *freezer = css_freezer(css); |
454 | 453 | ||
455 | return (bool)(freezer->state & CGROUP_FREEZING_PARENT); | 454 | return (bool)(freezer->state & CGROUP_FREEZING_PARENT); |
456 | } | 455 | } |
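The freezer hunks above replace cgroup-based accessors with css-based ones: css_freezer() is a container_of()-style lookup, and the new one-line parent_freezer() chains it with css_parent(), relying on a NULL css mapping to a NULL freezer at the root. The following is only a rough illustration of that embedding/accessor idiom in ordinary userspace C, not kernel code; my_css, my_freezer and the other identifiers are invented for the sketch.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-in for struct cgroup_subsys_state: just a parent pointer. */
struct my_css {
	struct my_css *parent;		/* NULL at the root */
};

/* Stand-in for struct freezer: embeds its css. */
struct my_freezer {
	struct my_css css;
	int state;
};

/* Recover the embedding structure from a pointer to its member. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Shape of css_freezer(): a NULL css yields a NULL freezer. */
static struct my_freezer *css_to_freezer(struct my_css *css)
{
	return css ? container_of(css, struct my_freezer, css) : NULL;
}

/* Shape of the new parent_freezer(): no explicit root check needed. */
static struct my_freezer *parent_of(struct my_freezer *f)
{
	return css_to_freezer(f->css.parent);
}

int main(void)
{
	struct my_freezer root  = { .css = { .parent = NULL }, .state = 1 };
	struct my_freezer child = { .css = { .parent = &root.css }, .state = 0 };

	assert(parent_of(&child) == &root);	/* child resolves to its parent */
	assert(parent_of(&root) == NULL);	/* the root has no parent */
	printf("parent state = %d\n", parent_of(&child)->state);
	return 0;
}

Because css_to_freezer() propagates NULL, parent_of() needs no branch of its own, which is why the old four-line parent_freezer() collapses to a single return in the first hunk of this section.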
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 383f8231e436..859c8dfd78a1 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c | |||
@@ -20,26 +20,46 @@ | |||
20 | #include <linux/hardirq.h> | 20 | #include <linux/hardirq.h> |
21 | #include <linux/export.h> | 21 | #include <linux/export.h> |
22 | 22 | ||
23 | DEFINE_PER_CPU(struct context_tracking, context_tracking) = { | 23 | #define CREATE_TRACE_POINTS |
24 | #ifdef CONFIG_CONTEXT_TRACKING_FORCE | 24 | #include <trace/events/context_tracking.h> |
25 | .active = true, | 25 | |
26 | #endif | 26 | struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE; |
27 | }; | 27 | EXPORT_SYMBOL_GPL(context_tracking_enabled); |
28 | |||
29 | DEFINE_PER_CPU(struct context_tracking, context_tracking); | ||
30 | EXPORT_SYMBOL_GPL(context_tracking); | ||
31 | |||
32 | void context_tracking_cpu_set(int cpu) | ||
33 | { | ||
34 | if (!per_cpu(context_tracking.active, cpu)) { | ||
35 | per_cpu(context_tracking.active, cpu) = true; | ||
36 | static_key_slow_inc(&context_tracking_enabled); | ||
37 | } | ||
38 | } | ||
28 | 39 | ||
29 | /** | 40 | /** |
30 | * user_enter - Inform the context tracking that the CPU is going to | 41 | * context_tracking_user_enter - Inform the context tracking that the CPU is going to |
31 | * enter userspace mode. | 42 | * enter userspace mode. |
32 | * | 43 | * |
33 | * This function must be called right before we switch from the kernel | 44 | * This function must be called right before we switch from the kernel |
34 | * to userspace, when it's guaranteed the remaining kernel instructions | 45 | * to userspace, when it's guaranteed the remaining kernel instructions |
35 | * to execute won't use any RCU read side critical section because this | 46 | * to execute won't use any RCU read side critical section because this |
36 | * function sets RCU in extended quiescent state. | 47 | * function sets RCU in extended quiescent state. |
37 | */ | 48 | */ |
38 | void user_enter(void) | 49 | void context_tracking_user_enter(void) |
39 | { | 50 | { |
40 | unsigned long flags; | 51 | unsigned long flags; |
41 | 52 | ||
42 | /* | 53 | /* |
54 | * Repeat the user_enter() check here because some archs may be calling | ||
55 | * this from asm and if no CPU needs context tracking, they shouldn't | ||
56 | * go further. Repeat the check here until they support the static key | ||
57 | * check. | ||
58 | */ | ||
59 | if (!static_key_false(&context_tracking_enabled)) | ||
60 | return; | ||
61 | |||
62 | /* | ||
43 | * Some contexts may involve an exception occurring in an irq, | 63 | * Some contexts may involve an exception occurring in an irq, |
44 | * leading to that nesting: | 64 | * leading to that nesting: |
45 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() | 65 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() |
@@ -54,17 +74,32 @@ void user_enter(void) | |||
54 | WARN_ON_ONCE(!current->mm); | 74 | WARN_ON_ONCE(!current->mm); |
55 | 75 | ||
56 | local_irq_save(flags); | 76 | local_irq_save(flags); |
57 | if (__this_cpu_read(context_tracking.active) && | 77 | if ( __this_cpu_read(context_tracking.state) != IN_USER) { |
58 | __this_cpu_read(context_tracking.state) != IN_USER) { | 78 | if (__this_cpu_read(context_tracking.active)) { |
79 | trace_user_enter(0); | ||
80 | /* | ||
81 | * At this stage, only low level arch entry code remains and | ||
82 | * then we'll run in userspace. We can assume there won't be | ||
83 | * any RCU read-side critical section until the next call to | ||
84 | * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency | ||
85 | * on the tick. | ||
86 | */ | ||
87 | vtime_user_enter(current); | ||
88 | rcu_user_enter(); | ||
89 | } | ||
59 | /* | 90 | /* |
60 | * At this stage, only low level arch entry code remains and | 91 | * Even if context tracking is disabled on this CPU, because it's outside |
61 | * then we'll run in userspace. We can assume there won't be | 92 | * the full dynticks mask for example, we still have to keep track of the |
62 | * any RCU read-side critical section until the next call to | 93 | * context transitions and states to prevent inconsistency on those of |
63 | * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency | 94 | * other CPUs. |
64 | * on the tick. | 95 | * If a task triggers an exception in userspace, sleep on the exception |
96 | * handler and then migrate to another CPU, that new CPU must know where | ||
97 | * the exception returns by the time we call exception_exit(). | ||
98 | * This information can only be provided by the previous CPU when it called | ||
99 | * exception_enter(). | ||
100 | * OTOH we can spare the calls to vtime and RCU when context_tracking.active | ||
101 | * is false because we know that CPU is not tickless. | ||
65 | */ | 102 | */ |
66 | vtime_user_enter(current); | ||
67 | rcu_user_enter(); | ||
68 | __this_cpu_write(context_tracking.state, IN_USER); | 103 | __this_cpu_write(context_tracking.state, IN_USER); |
69 | } | 104 | } |
70 | local_irq_restore(flags); | 105 | local_irq_restore(flags); |
@@ -87,10 +122,9 @@ void user_enter(void) | |||
87 | */ | 122 | */ |
88 | void __sched notrace preempt_schedule_context(void) | 123 | void __sched notrace preempt_schedule_context(void) |
89 | { | 124 | { |
90 | struct thread_info *ti = current_thread_info(); | ||
91 | enum ctx_state prev_ctx; | 125 | enum ctx_state prev_ctx; |
92 | 126 | ||
93 | if (likely(ti->preempt_count || irqs_disabled())) | 127 | if (likely(!preemptible())) |
94 | return; | 128 | return; |
95 | 129 | ||
96 | /* | 130 | /* |
@@ -112,8 +146,8 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context); | |||
112 | #endif /* CONFIG_PREEMPT */ | 146 | #endif /* CONFIG_PREEMPT */ |
113 | 147 | ||
114 | /** | 148 | /** |
115 | * user_exit - Inform the context tracking that the CPU is | 149 | * context_tracking_user_exit - Inform the context tracking that the CPU is |
116 | * exiting userspace mode and entering the kernel. | 150 | * exiting userspace mode and entering the kernel. |
117 | * | 151 | * |
118 | * This function must be called after we entered the kernel from userspace | 152 | * This function must be called after we entered the kernel from userspace |
119 | * before any use of RCU read side critical section. This potentially includes | 153 | * before any use of RCU read side critical section. This potentially includes |
@@ -122,47 +156,34 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context); | |||
122 | * This call supports re-entrancy. This way it can be called from any exception | 156 | * This call supports re-entrancy. This way it can be called from any exception |
123 | * handler without needing to know if we came from userspace or not. | 157 | * handler without needing to know if we came from userspace or not. |
124 | */ | 158 | */ |
125 | void user_exit(void) | 159 | void context_tracking_user_exit(void) |
126 | { | 160 | { |
127 | unsigned long flags; | 161 | unsigned long flags; |
128 | 162 | ||
163 | if (!static_key_false(&context_tracking_enabled)) | ||
164 | return; | ||
165 | |||
129 | if (in_interrupt()) | 166 | if (in_interrupt()) |
130 | return; | 167 | return; |
131 | 168 | ||
132 | local_irq_save(flags); | 169 | local_irq_save(flags); |
133 | if (__this_cpu_read(context_tracking.state) == IN_USER) { | 170 | if (__this_cpu_read(context_tracking.state) == IN_USER) { |
134 | /* | 171 | if (__this_cpu_read(context_tracking.active)) { |
135 | * We are going to run code that may use RCU. Inform | 172 | /* |
136 | * RCU core about that (ie: we may need the tick again). | 173 | * We are going to run code that may use RCU. Inform |
137 | */ | 174 | * RCU core about that (ie: we may need the tick again). |
138 | rcu_user_exit(); | 175 | */ |
139 | vtime_user_exit(current); | 176 | rcu_user_exit(); |
177 | vtime_user_exit(current); | ||
178 | trace_user_exit(0); | ||
179 | } | ||
140 | __this_cpu_write(context_tracking.state, IN_KERNEL); | 180 | __this_cpu_write(context_tracking.state, IN_KERNEL); |
141 | } | 181 | } |
142 | local_irq_restore(flags); | 182 | local_irq_restore(flags); |
143 | } | 183 | } |
144 | 184 | ||
145 | void guest_enter(void) | ||
146 | { | ||
147 | if (vtime_accounting_enabled()) | ||
148 | vtime_guest_enter(current); | ||
149 | else | ||
150 | __guest_enter(); | ||
151 | } | ||
152 | EXPORT_SYMBOL_GPL(guest_enter); | ||
153 | |||
154 | void guest_exit(void) | ||
155 | { | ||
156 | if (vtime_accounting_enabled()) | ||
157 | vtime_guest_exit(current); | ||
158 | else | ||
159 | __guest_exit(); | ||
160 | } | ||
161 | EXPORT_SYMBOL_GPL(guest_exit); | ||
162 | |||
163 | |||
164 | /** | 185 | /** |
165 | * context_tracking_task_switch - context switch the syscall callbacks | 186 | * __context_tracking_task_switch - context switch the syscall callbacks |
166 | * @prev: the task that is being switched out | 187 | * @prev: the task that is being switched out |
167 | * @next: the task that is being switched in | 188 | * @next: the task that is being switched in |
168 | * | 189 | * |
@@ -174,11 +195,19 @@ EXPORT_SYMBOL_GPL(guest_exit); | |||
174 | * migrate to some CPU that doesn't do the context tracking. As such the TIF | 195 | * migrate to some CPU that doesn't do the context tracking. As such the TIF |
175 | * flag may not be desired there. | 196 | * flag may not be desired there. |
176 | */ | 197 | */ |
177 | void context_tracking_task_switch(struct task_struct *prev, | 198 | void __context_tracking_task_switch(struct task_struct *prev, |
178 | struct task_struct *next) | 199 | struct task_struct *next) |
179 | { | 200 | { |
180 | if (__this_cpu_read(context_tracking.active)) { | 201 | clear_tsk_thread_flag(prev, TIF_NOHZ); |
181 | clear_tsk_thread_flag(prev, TIF_NOHZ); | 202 | set_tsk_thread_flag(next, TIF_NOHZ); |
182 | set_tsk_thread_flag(next, TIF_NOHZ); | ||
183 | } | ||
184 | } | 203 | } |
204 | |||
205 | #ifdef CONFIG_CONTEXT_TRACKING_FORCE | ||
206 | void __init context_tracking_init(void) | ||
207 | { | ||
208 | int cpu; | ||
209 | |||
210 | for_each_possible_cpu(cpu) | ||
211 | context_tracking_cpu_set(cpu); | ||
212 | } | ||
213 | #endif | ||
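The context_tracking.c rework above puts a global static key (context_tracking_enabled) in front of the per-CPU state, so kernels that never enable tracking pay only for the early-return check, while CPUs outside the tracked set still record IN_USER/IN_KERNEL transitions for consistency. The sketch below is only a loose userspace analogy of that "cheap global guard, then per-CPU bookkeeping" shape; a plain bool stands in for the jump-label static key and every name is invented.

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

enum ctx_state { IN_KERNEL, IN_USER };

struct ctx_tracking {
	bool active;			/* does this CPU want the hooks? */
	enum ctx_state state;		/* last recorded context */
};

static bool tracking_enabled;			/* stand-in for the static key */
static struct ctx_tracking per_cpu[NR_CPUS];

/* Shape of context_tracking_cpu_set(): enabling one CPU flips the guard. */
static void tracking_cpu_set(int cpu)
{
	if (!per_cpu[cpu].active) {
		per_cpu[cpu].active = true;
		tracking_enabled = true;	/* kernel: static_key_slow_inc() */
	}
}

/* Shape of context_tracking_user_enter(). */
static void user_enter(int cpu)
{
	if (!tracking_enabled)		/* fast path: tracking off everywhere */
		return;

	if (per_cpu[cpu].state != IN_USER) {
		if (per_cpu[cpu].active)
			printf("cpu%d: run expensive enter hooks\n", cpu);
		/* The transition is recorded even on inactive CPUs, mirroring
		 * the comment added to context_tracking_user_enter() above. */
		per_cpu[cpu].state = IN_USER;
	}
}

int main(void)
{
	user_enter(0);			/* guard closed: nothing happens */
	tracking_cpu_set(1);
	user_enter(0);			/* state recorded, hooks skipped */
	user_enter(1);			/* hooks run on the active CPU */
	printf("cpu0=%d cpu1=%d\n", per_cpu[0].state, per_cpu[1].state);
	return 0;
}

In the real code the guard is patched into the instruction stream, so the disabled case costs roughly a no-op rather than a load and branch; the bool here only mimics the control flow.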
diff --git a/kernel/cpu.c b/kernel/cpu.c index b2b227b82123..d7f07a2da5a6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus); | |||
113 | * get_online_cpus() not an api which is called all that often. | 113 | * get_online_cpus() not an api which is called all that often. |
114 | * | 114 | * |
115 | */ | 115 | */ |
116 | static void cpu_hotplug_begin(void) | 116 | void cpu_hotplug_begin(void) |
117 | { | 117 | { |
118 | cpu_hotplug.active_writer = current; | 118 | cpu_hotplug.active_writer = current; |
119 | 119 | ||
@@ -127,7 +127,7 @@ static void cpu_hotplug_begin(void) | |||
127 | } | 127 | } |
128 | } | 128 | } |
129 | 129 | ||
130 | static void cpu_hotplug_done(void) | 130 | void cpu_hotplug_done(void) |
131 | { | 131 | { |
132 | cpu_hotplug.active_writer = NULL; | 132 | cpu_hotplug.active_writer = NULL; |
133 | mutex_unlock(&cpu_hotplug.lock); | 133 | mutex_unlock(&cpu_hotplug.lock); |
@@ -154,10 +154,7 @@ void cpu_hotplug_enable(void) | |||
154 | cpu_maps_update_done(); | 154 | cpu_maps_update_done(); |
155 | } | 155 | } |
156 | 156 | ||
157 | #else /* #if CONFIG_HOTPLUG_CPU */ | 157 | #endif /* CONFIG_HOTPLUG_CPU */ |
158 | static void cpu_hotplug_begin(void) {} | ||
159 | static void cpu_hotplug_done(void) {} | ||
160 | #endif /* #else #if CONFIG_HOTPLUG_CPU */ | ||
161 | 158 | ||
162 | /* Need to know about CPUs going up/down? */ | 159 | /* Need to know about CPUs going up/down? */ |
163 | int __ref register_cpu_notifier(struct notifier_block *nb) | 160 | int __ref register_cpu_notifier(struct notifier_block *nb) |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ea1966db34f2..6bf981e13c43 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -68,10 +68,6 @@ | |||
68 | */ | 68 | */ |
69 | int number_of_cpusets __read_mostly; | 69 | int number_of_cpusets __read_mostly; |
70 | 70 | ||
71 | /* Forward declare cgroup structures */ | ||
72 | struct cgroup_subsys cpuset_subsys; | ||
73 | struct cpuset; | ||
74 | |||
75 | /* See "Frequency meter" comments, below. */ | 71 | /* See "Frequency meter" comments, below. */ |
76 | 72 | ||
77 | struct fmeter { | 73 | struct fmeter { |
@@ -115,27 +111,20 @@ struct cpuset { | |||
115 | int relax_domain_level; | 111 | int relax_domain_level; |
116 | }; | 112 | }; |
117 | 113 | ||
118 | /* Retrieve the cpuset for a cgroup */ | 114 | static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) |
119 | static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) | ||
120 | { | 115 | { |
121 | return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), | 116 | return css ? container_of(css, struct cpuset, css) : NULL; |
122 | struct cpuset, css); | ||
123 | } | 117 | } |
124 | 118 | ||
125 | /* Retrieve the cpuset for a task */ | 119 | /* Retrieve the cpuset for a task */ |
126 | static inline struct cpuset *task_cs(struct task_struct *task) | 120 | static inline struct cpuset *task_cs(struct task_struct *task) |
127 | { | 121 | { |
128 | return container_of(task_subsys_state(task, cpuset_subsys_id), | 122 | return css_cs(task_css(task, cpuset_subsys_id)); |
129 | struct cpuset, css); | ||
130 | } | 123 | } |
131 | 124 | ||
132 | static inline struct cpuset *parent_cs(const struct cpuset *cs) | 125 | static inline struct cpuset *parent_cs(struct cpuset *cs) |
133 | { | 126 | { |
134 | struct cgroup *pcgrp = cs->css.cgroup->parent; | 127 | return css_cs(css_parent(&cs->css)); |
135 | |||
136 | if (pcgrp) | ||
137 | return cgroup_cs(pcgrp); | ||
138 | return NULL; | ||
139 | } | 128 | } |
140 | 129 | ||
141 | #ifdef CONFIG_NUMA | 130 | #ifdef CONFIG_NUMA |
@@ -212,29 +201,30 @@ static struct cpuset top_cpuset = { | |||
212 | /** | 201 | /** |
213 | * cpuset_for_each_child - traverse online children of a cpuset | 202 | * cpuset_for_each_child - traverse online children of a cpuset |
214 | * @child_cs: loop cursor pointing to the current child | 203 | * @child_cs: loop cursor pointing to the current child |
215 | * @pos_cgrp: used for iteration | 204 | * @pos_css: used for iteration |
216 | * @parent_cs: target cpuset to walk children of | 205 | * @parent_cs: target cpuset to walk children of |
217 | * | 206 | * |
218 | * Walk @child_cs through the online children of @parent_cs. Must be used | 207 | * Walk @child_cs through the online children of @parent_cs. Must be used |
219 | * with RCU read locked. | 208 | * with RCU read locked. |
220 | */ | 209 | */ |
221 | #define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ | 210 | #define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ |
222 | cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ | 211 | css_for_each_child((pos_css), &(parent_cs)->css) \ |
223 | if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) | 212 | if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) |
224 | 213 | ||
225 | /** | 214 | /** |
226 | * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants | 215 | * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants |
227 | * @des_cs: loop cursor pointing to the current descendant | 216 | * @des_cs: loop cursor pointing to the current descendant |
228 | * @pos_cgrp: used for iteration | 217 | * @pos_css: used for iteration |
229 | * @root_cs: target cpuset to walk ancestor of | 218 | * @root_cs: target cpuset to walk ancestor of |
230 | * | 219 | * |
231 | * Walk @des_cs through the online descendants of @root_cs. Must be used | 220 | * Walk @des_cs through the online descendants of @root_cs. Must be used |
232 | * with RCU read locked. The caller may modify @pos_cgrp by calling | 221 | * with RCU read locked. The caller may modify @pos_css by calling |
233 | * cgroup_rightmost_descendant() to skip subtree. | 222 | * css_rightmost_descendant() to skip subtree. @root_cs is included in the |
223 | * iteration and the first node to be visited. | ||
234 | */ | 224 | */ |
235 | #define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ | 225 | #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ |
236 | cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ | 226 | css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ |
237 | if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) | 227 | if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) |
238 | 228 | ||
239 | /* | 229 | /* |
240 | * There are two global mutexes guarding cpuset structures - cpuset_mutex | 230 | * There are two global mutexes guarding cpuset structures - cpuset_mutex |
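The rewritten iteration macros above sit on css_for_each_child()/css_for_each_descendant_pre(), and the updated kernel-doc spells out the behavioural change: @root_cs is now part of the pre-order walk and is the first node visited. That is why several hunks below add an explicit "if (cp == root_cs) continue;" (or "cp == &top_cpuset") when only the descendants should be touched. The following is a small userspace sketch of that visiting order, not the kernel iterator; the tree layout and names are invented.

#include <stdio.h>

struct node {
	const char *name;
	struct node *child;		/* first child */
	struct node *sibling;		/* next sibling */
};

/* Pre-order walk: the root itself is handed out before any descendant. */
static void walk_pre(struct node *pos, struct node *root)
{
	if (pos == root)
		printf("root %s visited first (callers may skip it)\n", pos->name);
	else
		printf("descendant %s\n", pos->name);

	for (struct node *it = pos->child; it; it = it->sibling)
		walk_pre(it, root);
}

int main(void)
{
	struct node b = { "B", NULL, NULL };
	struct node c = { "C", NULL, NULL };
	struct node a = { "A", &b, NULL };

	b.sibling = &c;			/* A has two children: B and C */
	walk_pre(&a, &a);		/* prints A, then B, then C */
	return 0;
}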
@@ -320,8 +310,7 @@ static struct file_system_type cpuset_fs_type = { | |||
320 | * | 310 | * |
321 | * Call with callback_mutex held. | 311 | * Call with callback_mutex held. |
322 | */ | 312 | */ |
323 | static void guarantee_online_cpus(const struct cpuset *cs, | 313 | static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) |
324 | struct cpumask *pmask) | ||
325 | { | 314 | { |
326 | while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) | 315 | while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) |
327 | cs = parent_cs(cs); | 316 | cs = parent_cs(cs); |
@@ -339,7 +328,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, | |||
339 | * | 328 | * |
340 | * Call with callback_mutex held. | 329 | * Call with callback_mutex held. |
341 | */ | 330 | */ |
342 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 331 | static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) |
343 | { | 332 | { |
344 | while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) | 333 | while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) |
345 | cs = parent_cs(cs); | 334 | cs = parent_cs(cs); |
@@ -384,7 +373,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | |||
384 | * alloc_trial_cpuset - allocate a trial cpuset | 373 | * alloc_trial_cpuset - allocate a trial cpuset |
385 | * @cs: the cpuset that the trial cpuset duplicates | 374 | * @cs: the cpuset that the trial cpuset duplicates |
386 | */ | 375 | */ |
387 | static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) | 376 | static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) |
388 | { | 377 | { |
389 | struct cpuset *trial; | 378 | struct cpuset *trial; |
390 | 379 | ||
@@ -431,9 +420,9 @@ static void free_trial_cpuset(struct cpuset *trial) | |||
431 | * Return 0 if valid, -errno if not. | 420 | * Return 0 if valid, -errno if not. |
432 | */ | 421 | */ |
433 | 422 | ||
434 | static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | 423 | static int validate_change(struct cpuset *cur, struct cpuset *trial) |
435 | { | 424 | { |
436 | struct cgroup *cgrp; | 425 | struct cgroup_subsys_state *css; |
437 | struct cpuset *c, *par; | 426 | struct cpuset *c, *par; |
438 | int ret; | 427 | int ret; |
439 | 428 | ||
@@ -441,7 +430,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
441 | 430 | ||
442 | /* Each of our child cpusets must be a subset of us */ | 431 | /* Each of our child cpusets must be a subset of us */ |
443 | ret = -EBUSY; | 432 | ret = -EBUSY; |
444 | cpuset_for_each_child(c, cgrp, cur) | 433 | cpuset_for_each_child(c, css, cur) |
445 | if (!is_cpuset_subset(c, trial)) | 434 | if (!is_cpuset_subset(c, trial)) |
446 | goto out; | 435 | goto out; |
447 | 436 | ||
@@ -462,7 +451,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
462 | * overlap | 451 | * overlap |
463 | */ | 452 | */ |
464 | ret = -EINVAL; | 453 | ret = -EINVAL; |
465 | cpuset_for_each_child(c, cgrp, par) { | 454 | cpuset_for_each_child(c, css, par) { |
466 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && | 455 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && |
467 | c != cur && | 456 | c != cur && |
468 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) | 457 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) |
@@ -515,13 +504,16 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, | |||
515 | struct cpuset *root_cs) | 504 | struct cpuset *root_cs) |
516 | { | 505 | { |
517 | struct cpuset *cp; | 506 | struct cpuset *cp; |
518 | struct cgroup *pos_cgrp; | 507 | struct cgroup_subsys_state *pos_css; |
519 | 508 | ||
520 | rcu_read_lock(); | 509 | rcu_read_lock(); |
521 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | 510 | cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { |
511 | if (cp == root_cs) | ||
512 | continue; | ||
513 | |||
522 | /* skip the whole subtree if @cp doesn't have any CPU */ | 514 | /* skip the whole subtree if @cp doesn't have any CPU */ |
523 | if (cpumask_empty(cp->cpus_allowed)) { | 515 | if (cpumask_empty(cp->cpus_allowed)) { |
524 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | 516 | pos_css = css_rightmost_descendant(pos_css); |
525 | continue; | 517 | continue; |
526 | } | 518 | } |
527 | 519 | ||
@@ -596,7 +588,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
596 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | 588 | struct sched_domain_attr *dattr; /* attributes for custom domains */ |
597 | int ndoms = 0; /* number of sched domains in result */ | 589 | int ndoms = 0; /* number of sched domains in result */ |
598 | int nslot; /* next empty doms[] struct cpumask slot */ | 590 | int nslot; /* next empty doms[] struct cpumask slot */ |
599 | struct cgroup *pos_cgrp; | 591 | struct cgroup_subsys_state *pos_css; |
600 | 592 | ||
601 | doms = NULL; | 593 | doms = NULL; |
602 | dattr = NULL; | 594 | dattr = NULL; |
@@ -625,7 +617,9 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
625 | csn = 0; | 617 | csn = 0; |
626 | 618 | ||
627 | rcu_read_lock(); | 619 | rcu_read_lock(); |
628 | cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { | 620 | cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { |
621 | if (cp == &top_cpuset) | ||
622 | continue; | ||
629 | /* | 623 | /* |
630 | * Continue traversing beyond @cp iff @cp has some CPUs and | 624 | * Continue traversing beyond @cp iff @cp has some CPUs and |
631 | * isn't load balancing. The former is obvious. The | 625 | * isn't load balancing. The former is obvious. The |
@@ -642,7 +636,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
642 | csa[csn++] = cp; | 636 | csa[csn++] = cp; |
643 | 637 | ||
644 | /* skip @cp's subtree */ | 638 | /* skip @cp's subtree */ |
645 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | 639 | pos_css = css_rightmost_descendant(pos_css); |
646 | } | 640 | } |
647 | rcu_read_unlock(); | 641 | rcu_read_unlock(); |
648 | 642 | ||
@@ -837,52 +831,45 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) | |||
837 | /** | 831 | /** |
838 | * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's | 832 | * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's |
839 | * @tsk: task to test | 833 | * @tsk: task to test |
840 | * @scan: struct cgroup_scanner containing the cgroup of the task | 834 | * @data: cpuset to @tsk belongs to |
841 | * | 835 | * |
842 | * Called by cgroup_scan_tasks() for each task in a cgroup whose | 836 | * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed |
843 | * cpus_allowed mask needs to be changed. | 837 | * mask needs to be changed. |
844 | * | 838 | * |
845 | * We don't need to re-check for the cgroup/cpuset membership, since we're | 839 | * We don't need to re-check for the cgroup/cpuset membership, since we're |
846 | * holding cpuset_mutex at this point. | 840 | * holding cpuset_mutex at this point. |
847 | */ | 841 | */ |
848 | static void cpuset_change_cpumask(struct task_struct *tsk, | 842 | static void cpuset_change_cpumask(struct task_struct *tsk, void *data) |
849 | struct cgroup_scanner *scan) | ||
850 | { | 843 | { |
851 | struct cpuset *cpus_cs; | 844 | struct cpuset *cs = data; |
845 | struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); | ||
852 | 846 | ||
853 | cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg)); | ||
854 | set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); | 847 | set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); |
855 | } | 848 | } |
856 | 849 | ||
857 | /** | 850 | /** |
858 | * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. | 851 | * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. |
859 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed | 852 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed |
860 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 853 | * @heap: if NULL, defer allocating heap memory to css_scan_tasks() |
861 | * | 854 | * |
862 | * Called with cpuset_mutex held | 855 | * Called with cpuset_mutex held |
863 | * | 856 | * |
864 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 857 | * The css_scan_tasks() function will scan all the tasks in a cgroup, |
865 | * calling callback functions for each. | 858 | * calling callback functions for each. |
866 | * | 859 | * |
867 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | 860 | * No return value. It's guaranteed that css_scan_tasks() always returns 0 |
868 | * if @heap != NULL. | 861 | * if @heap != NULL. |
869 | */ | 862 | */ |
870 | static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) | 863 | static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) |
871 | { | 864 | { |
872 | struct cgroup_scanner scan; | 865 | css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); |
873 | |||
874 | scan.cg = cs->css.cgroup; | ||
875 | scan.test_task = NULL; | ||
876 | scan.process_task = cpuset_change_cpumask; | ||
877 | scan.heap = heap; | ||
878 | cgroup_scan_tasks(&scan); | ||
879 | } | 866 | } |
880 | 867 | ||
881 | /* | 868 | /* |
882 | * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. | 869 | * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. |
883 | * @root_cs: the root cpuset of the hierarchy | 870 | * @root_cs: the root cpuset of the hierarchy |
884 | * @update_root: update root cpuset or not? | 871 | * @update_root: update root cpuset or not? |
885 | * @heap: the heap used by cgroup_scan_tasks() | 872 | * @heap: the heap used by css_scan_tasks() |
886 | * | 873 | * |
887 | * This will update cpumasks of tasks in @root_cs and all other empty cpusets | 874 | * This will update cpumasks of tasks in @root_cs and all other empty cpusets |
888 | * which take on cpumask of @root_cs. | 875 | * which take on cpumask of @root_cs. |
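The hunk above removes the struct cgroup_scanner that update_tasks_cpumask() used to fill in field by field: css_scan_tasks() takes the per-task callback directly plus an opaque data pointer, and the cpuset itself is threaded through as that pointer into cpuset_change_cpumask(). A rough userspace sketch of the same refactor follows (not the cgroup API; the task array, types and names are invented).

#include <stdio.h>

struct task { const char *name; int cpu; };
struct cpuset_like { int allowed_cpu; };

/* New-style scan: apply fn(task, data) to every task. */
static void scan_tasks(struct task *tasks, int n,
		       void (*fn)(struct task *, void *), void *data)
{
	for (int i = 0; i < n; i++)
		fn(&tasks[i], data);
}

/* Callback in the shape of cpuset_change_cpumask(): data is the cpuset. */
static void change_cpu(struct task *t, void *data)
{
	struct cpuset_like *cs = data;

	t->cpu = cs->allowed_cpu;
	printf("%s -> cpu %d\n", t->name, t->cpu);
}

int main(void)
{
	struct task tasks[] = { { "a", 0 }, { "b", 3 } };
	struct cpuset_like cs = { .allowed_cpu = 1 };

	/* Equivalent of update_tasks_cpumask(): one call, cpuset as data. */
	scan_tasks(tasks, 2, change_cpu, &cs);
	return 0;
}

The same callback-plus-opaque-pointer shape recurs in the nodemask and spread-flag updates later in this file.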
@@ -893,17 +880,19 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, | |||
893 | bool update_root, struct ptr_heap *heap) | 880 | bool update_root, struct ptr_heap *heap) |
894 | { | 881 | { |
895 | struct cpuset *cp; | 882 | struct cpuset *cp; |
896 | struct cgroup *pos_cgrp; | 883 | struct cgroup_subsys_state *pos_css; |
897 | |||
898 | if (update_root) | ||
899 | update_tasks_cpumask(root_cs, heap); | ||
900 | 884 | ||
901 | rcu_read_lock(); | 885 | rcu_read_lock(); |
902 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | 886 | cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { |
903 | /* skip the whole subtree if @cp have some CPU */ | 887 | if (cp == root_cs) { |
904 | if (!cpumask_empty(cp->cpus_allowed)) { | 888 | if (!update_root) |
905 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | 889 | continue; |
906 | continue; | 890 | } else { |
891 | /* skip the whole subtree if @cp have some CPU */ | ||
892 | if (!cpumask_empty(cp->cpus_allowed)) { | ||
893 | pos_css = css_rightmost_descendant(pos_css); | ||
894 | continue; | ||
895 | } | ||
907 | } | 896 | } |
908 | if (!css_tryget(&cp->css)) | 897 | if (!css_tryget(&cp->css)) |
909 | continue; | 898 | continue; |
@@ -1059,20 +1048,24 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, | |||
1059 | task_unlock(tsk); | 1048 | task_unlock(tsk); |
1060 | } | 1049 | } |
1061 | 1050 | ||
1051 | struct cpuset_change_nodemask_arg { | ||
1052 | struct cpuset *cs; | ||
1053 | nodemask_t *newmems; | ||
1054 | }; | ||
1055 | |||
1062 | /* | 1056 | /* |
1063 | * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy | 1057 | * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy |
1064 | * of it to cpuset's new mems_allowed, and migrate pages to new nodes if | 1058 | * of it to cpuset's new mems_allowed, and migrate pages to new nodes if |
1065 | * memory_migrate flag is set. Called with cpuset_mutex held. | 1059 | * memory_migrate flag is set. Called with cpuset_mutex held. |
1066 | */ | 1060 | */ |
1067 | static void cpuset_change_nodemask(struct task_struct *p, | 1061 | static void cpuset_change_nodemask(struct task_struct *p, void *data) |
1068 | struct cgroup_scanner *scan) | ||
1069 | { | 1062 | { |
1070 | struct cpuset *cs = cgroup_cs(scan->cg); | 1063 | struct cpuset_change_nodemask_arg *arg = data; |
1064 | struct cpuset *cs = arg->cs; | ||
1071 | struct mm_struct *mm; | 1065 | struct mm_struct *mm; |
1072 | int migrate; | 1066 | int migrate; |
1073 | nodemask_t *newmems = scan->data; | ||
1074 | 1067 | ||
1075 | cpuset_change_task_nodemask(p, newmems); | 1068 | cpuset_change_task_nodemask(p, arg->newmems); |
1076 | 1069 | ||
1077 | mm = get_task_mm(p); | 1070 | mm = get_task_mm(p); |
1078 | if (!mm) | 1071 | if (!mm) |
@@ -1082,7 +1075,7 @@ static void cpuset_change_nodemask(struct task_struct *p, | |||
1082 | 1075 | ||
1083 | mpol_rebind_mm(mm, &cs->mems_allowed); | 1076 | mpol_rebind_mm(mm, &cs->mems_allowed); |
1084 | if (migrate) | 1077 | if (migrate) |
1085 | cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); | 1078 | cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems); |
1086 | mmput(mm); | 1079 | mmput(mm); |
1087 | } | 1080 | } |
1088 | 1081 | ||
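When the callback needs more than one value, the conversion bundles them into a small on-stack struct and hands its address through the opaque pointer: cpuset_change_nodemask() above now unpacks struct cpuset_change_nodemask_arg instead of reading scan->cg and scan->data separately. A minimal userspace sketch of that bundling, with invented names and without the kernel types:

#include <stdio.h>

struct nodemask_arg {
	const char *cs_name;		/* stands in for the cpuset pointer */
	int new_node;			/* stands in for the new nodemask */
};

/* One opaque parameter, like cpuset_change_nodemask() above. */
static void change_nodemask(const char *task, void *data)
{
	struct nodemask_arg *arg = data;

	printf("%s: move task %s to node %d\n", arg->cs_name, task, arg->new_node);
}

int main(void)
{
	/* The caller builds the argument bundle on its stack. */
	struct nodemask_arg arg = { .cs_name = "cs0", .new_node = 2 };

	change_nodemask("init", &arg);
	return 0;
}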
@@ -1091,28 +1084,22 @@ static void *cpuset_being_rebound; | |||
1091 | /** | 1084 | /** |
1092 | * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. | 1085 | * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. |
1093 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed | 1086 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed |
1094 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 1087 | * @heap: if NULL, defer allocating heap memory to css_scan_tasks() |
1095 | * | 1088 | * |
1096 | * Called with cpuset_mutex held | 1089 | * Called with cpuset_mutex held. No return value. It's guaranteed that |
1097 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | 1090 | * css_scan_tasks() always returns 0 if @heap != NULL. |
1098 | * if @heap != NULL. | ||
1099 | */ | 1091 | */ |
1100 | static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) | 1092 | static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) |
1101 | { | 1093 | { |
1102 | static nodemask_t newmems; /* protected by cpuset_mutex */ | 1094 | static nodemask_t newmems; /* protected by cpuset_mutex */ |
1103 | struct cgroup_scanner scan; | ||
1104 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); | 1095 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); |
1096 | struct cpuset_change_nodemask_arg arg = { .cs = cs, | ||
1097 | .newmems = &newmems }; | ||
1105 | 1098 | ||
1106 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ | 1099 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ |
1107 | 1100 | ||
1108 | guarantee_online_mems(mems_cs, &newmems); | 1101 | guarantee_online_mems(mems_cs, &newmems); |
1109 | 1102 | ||
1110 | scan.cg = cs->css.cgroup; | ||
1111 | scan.test_task = NULL; | ||
1112 | scan.process_task = cpuset_change_nodemask; | ||
1113 | scan.heap = heap; | ||
1114 | scan.data = &newmems; | ||
1115 | |||
1116 | /* | 1103 | /* |
1117 | * The mpol_rebind_mm() call takes mmap_sem, which we couldn't | 1104 | * The mpol_rebind_mm() call takes mmap_sem, which we couldn't |
1118 | * take while holding tasklist_lock. Forks can happen - the | 1105 | * take while holding tasklist_lock. Forks can happen - the |
@@ -1123,7 +1110,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) | |||
1123 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() | 1110 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() |
1124 | * is idempotent. Also migrate pages in each mm to new nodes. | 1111 | * is idempotent. Also migrate pages in each mm to new nodes. |
1125 | */ | 1112 | */ |
1126 | cgroup_scan_tasks(&scan); | 1113 | css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); |
1127 | 1114 | ||
1128 | /* | 1115 | /* |
1129 | * All the tasks' nodemasks have been updated, update | 1116 | * All the tasks' nodemasks have been updated, update |
@@ -1139,7 +1126,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) | |||
1139 | * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. | 1126 | * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. |
1140 | * @cs: the root cpuset of the hierarchy | 1127 | * @cs: the root cpuset of the hierarchy |
1141 | * @update_root: update the root cpuset or not? | 1128 | * @update_root: update the root cpuset or not? |
1142 | * @heap: the heap used by cgroup_scan_tasks() | 1129 | * @heap: the heap used by css_scan_tasks() |
1143 | * | 1130 | * |
1144 | * This will update nodemasks of tasks in @root_cs and all other empty cpusets | 1131 | * This will update nodemasks of tasks in @root_cs and all other empty cpusets |
1145 | * which take on nodemask of @root_cs. | 1132 | * which take on nodemask of @root_cs. |
@@ -1150,17 +1137,19 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, | |||
1150 | bool update_root, struct ptr_heap *heap) | 1137 | bool update_root, struct ptr_heap *heap) |
1151 | { | 1138 | { |
1152 | struct cpuset *cp; | 1139 | struct cpuset *cp; |
1153 | struct cgroup *pos_cgrp; | 1140 | struct cgroup_subsys_state *pos_css; |
1154 | |||
1155 | if (update_root) | ||
1156 | update_tasks_nodemask(root_cs, heap); | ||
1157 | 1141 | ||
1158 | rcu_read_lock(); | 1142 | rcu_read_lock(); |
1159 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | 1143 | cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { |
1160 | /* skip the whole subtree if @cp have some CPU */ | 1144 | if (cp == root_cs) { |
1161 | if (!nodes_empty(cp->mems_allowed)) { | 1145 | if (!update_root) |
1162 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | 1146 | continue; |
1163 | continue; | 1147 | } else { |
1148 | /* skip the whole subtree if @cp have some CPU */ | ||
1149 | if (!nodes_empty(cp->mems_allowed)) { | ||
1150 | pos_css = css_rightmost_descendant(pos_css); | ||
1151 | continue; | ||
1152 | } | ||
1164 | } | 1153 | } |
1165 | if (!css_tryget(&cp->css)) | 1154 | if (!css_tryget(&cp->css)) |
1166 | continue; | 1155 | continue; |
@@ -1267,44 +1256,39 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
1267 | return 0; | 1256 | return 0; |
1268 | } | 1257 | } |
1269 | 1258 | ||
1270 | /* | 1259 | /** |
1271 | * cpuset_change_flag - make a task's spread flags the same as its cpuset's | 1260 | * cpuset_change_flag - make a task's spread flags the same as its cpuset's |
1272 | * @tsk: task to be updated | 1261 | * @tsk: task to be updated |
1273 | * @scan: struct cgroup_scanner containing the cgroup of the task | 1262 | * @data: cpuset to @tsk belongs to |
1274 | * | 1263 | * |
1275 | * Called by cgroup_scan_tasks() for each task in a cgroup. | 1264 | * Called by css_scan_tasks() for each task in a cgroup. |
1276 | * | 1265 | * |
1277 | * We don't need to re-check for the cgroup/cpuset membership, since we're | 1266 | * We don't need to re-check for the cgroup/cpuset membership, since we're |
1278 | * holding cpuset_mutex at this point. | 1267 | * holding cpuset_mutex at this point. |
1279 | */ | 1268 | */ |
1280 | static void cpuset_change_flag(struct task_struct *tsk, | 1269 | static void cpuset_change_flag(struct task_struct *tsk, void *data) |
1281 | struct cgroup_scanner *scan) | ||
1282 | { | 1270 | { |
1283 | cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); | 1271 | struct cpuset *cs = data; |
1272 | |||
1273 | cpuset_update_task_spread_flag(cs, tsk); | ||
1284 | } | 1274 | } |
1285 | 1275 | ||
1286 | /* | 1276 | /** |
1287 | * update_tasks_flags - update the spread flags of tasks in the cpuset. | 1277 | * update_tasks_flags - update the spread flags of tasks in the cpuset. |
1288 | * @cs: the cpuset in which each task's spread flags needs to be changed | 1278 | * @cs: the cpuset in which each task's spread flags needs to be changed |
1289 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 1279 | * @heap: if NULL, defer allocating heap memory to css_scan_tasks() |
1290 | * | 1280 | * |
1291 | * Called with cpuset_mutex held | 1281 | * Called with cpuset_mutex held |
1292 | * | 1282 | * |
1293 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 1283 | * The css_scan_tasks() function will scan all the tasks in a cgroup, |
1294 | * calling callback functions for each. | 1284 | * calling callback functions for each. |
1295 | * | 1285 | * |
1296 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | 1286 | * No return value. It's guaranteed that css_scan_tasks() always returns 0 |
1297 | * if @heap != NULL. | 1287 | * if @heap != NULL. |
1298 | */ | 1288 | */ |
1299 | static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) | 1289 | static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) |
1300 | { | 1290 | { |
1301 | struct cgroup_scanner scan; | 1291 | css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); |
1302 | |||
1303 | scan.cg = cs->css.cgroup; | ||
1304 | scan.test_task = NULL; | ||
1305 | scan.process_task = cpuset_change_flag; | ||
1306 | scan.heap = heap; | ||
1307 | cgroup_scan_tasks(&scan); | ||
1308 | } | 1292 | } |
1309 | 1293 | ||
1310 | /* | 1294 | /* |
@@ -1462,9 +1446,10 @@ static int fmeter_getrate(struct fmeter *fmp) | |||
1462 | } | 1446 | } |
1463 | 1447 | ||
1464 | /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ | 1448 | /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ |
1465 | static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 1449 | static int cpuset_can_attach(struct cgroup_subsys_state *css, |
1450 | struct cgroup_taskset *tset) | ||
1466 | { | 1451 | { |
1467 | struct cpuset *cs = cgroup_cs(cgrp); | 1452 | struct cpuset *cs = css_cs(css); |
1468 | struct task_struct *task; | 1453 | struct task_struct *task; |
1469 | int ret; | 1454 | int ret; |
1470 | 1455 | ||
@@ -1475,11 +1460,11 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
1475 | * flag is set. | 1460 | * flag is set. |
1476 | */ | 1461 | */ |
1477 | ret = -ENOSPC; | 1462 | ret = -ENOSPC; |
1478 | if (!cgroup_sane_behavior(cgrp) && | 1463 | if (!cgroup_sane_behavior(css->cgroup) && |
1479 | (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) | 1464 | (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) |
1480 | goto out_unlock; | 1465 | goto out_unlock; |
1481 | 1466 | ||
1482 | cgroup_taskset_for_each(task, cgrp, tset) { | 1467 | cgroup_taskset_for_each(task, css, tset) { |
1483 | /* | 1468 | /* |
1484 | * Kthreads which disallow setaffinity shouldn't be moved | 1469 | * Kthreads which disallow setaffinity shouldn't be moved |
1485 | * to a new cpuset; we don't want to change their cpu | 1470 | * to a new cpuset; we don't want to change their cpu |
@@ -1508,11 +1493,11 @@ out_unlock: | |||
1508 | return ret; | 1493 | return ret; |
1509 | } | 1494 | } |
1510 | 1495 | ||
1511 | static void cpuset_cancel_attach(struct cgroup *cgrp, | 1496 | static void cpuset_cancel_attach(struct cgroup_subsys_state *css, |
1512 | struct cgroup_taskset *tset) | 1497 | struct cgroup_taskset *tset) |
1513 | { | 1498 | { |
1514 | mutex_lock(&cpuset_mutex); | 1499 | mutex_lock(&cpuset_mutex); |
1515 | cgroup_cs(cgrp)->attach_in_progress--; | 1500 | css_cs(css)->attach_in_progress--; |
1516 | mutex_unlock(&cpuset_mutex); | 1501 | mutex_unlock(&cpuset_mutex); |
1517 | } | 1502 | } |
1518 | 1503 | ||
@@ -1523,16 +1508,18 @@ static void cpuset_cancel_attach(struct cgroup *cgrp, | |||
1523 | */ | 1508 | */ |
1524 | static cpumask_var_t cpus_attach; | 1509 | static cpumask_var_t cpus_attach; |
1525 | 1510 | ||
1526 | static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 1511 | static void cpuset_attach(struct cgroup_subsys_state *css, |
1512 | struct cgroup_taskset *tset) | ||
1527 | { | 1513 | { |
1528 | /* static buf protected by cpuset_mutex */ | 1514 | /* static buf protected by cpuset_mutex */ |
1529 | static nodemask_t cpuset_attach_nodemask_to; | 1515 | static nodemask_t cpuset_attach_nodemask_to; |
1530 | struct mm_struct *mm; | 1516 | struct mm_struct *mm; |
1531 | struct task_struct *task; | 1517 | struct task_struct *task; |
1532 | struct task_struct *leader = cgroup_taskset_first(tset); | 1518 | struct task_struct *leader = cgroup_taskset_first(tset); |
1533 | struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); | 1519 | struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset, |
1534 | struct cpuset *cs = cgroup_cs(cgrp); | 1520 | cpuset_subsys_id); |
1535 | struct cpuset *oldcs = cgroup_cs(oldcgrp); | 1521 | struct cpuset *cs = css_cs(css); |
1522 | struct cpuset *oldcs = css_cs(oldcss); | ||
1536 | struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); | 1523 | struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); |
1537 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); | 1524 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); |
1538 | 1525 | ||
@@ -1546,7 +1533,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
1546 | 1533 | ||
1547 | guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); | 1534 | guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); |
1548 | 1535 | ||
1549 | cgroup_taskset_for_each(task, cgrp, tset) { | 1536 | cgroup_taskset_for_each(task, css, tset) { |
1550 | /* | 1537 | /* |
1551 | * can_attach beforehand should guarantee that this doesn't | 1538 | * can_attach beforehand should guarantee that this doesn't |
1552 | * fail. TODO: have a better way to handle failure here | 1539 | * fail. TODO: have a better way to handle failure here |
@@ -1608,9 +1595,10 @@ typedef enum { | |||
1608 | FILE_SPREAD_SLAB, | 1595 | FILE_SPREAD_SLAB, |
1609 | } cpuset_filetype_t; | 1596 | } cpuset_filetype_t; |
1610 | 1597 | ||
1611 | static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | 1598 | static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, |
1599 | u64 val) | ||
1612 | { | 1600 | { |
1613 | struct cpuset *cs = cgroup_cs(cgrp); | 1601 | struct cpuset *cs = css_cs(css); |
1614 | cpuset_filetype_t type = cft->private; | 1602 | cpuset_filetype_t type = cft->private; |
1615 | int retval = 0; | 1603 | int retval = 0; |
1616 | 1604 | ||
@@ -1657,9 +1645,10 @@ out_unlock: | |||
1657 | return retval; | 1645 | return retval; |
1658 | } | 1646 | } |
1659 | 1647 | ||
1660 | static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) | 1648 | static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, |
1649 | s64 val) | ||
1661 | { | 1650 | { |
1662 | struct cpuset *cs = cgroup_cs(cgrp); | 1651 | struct cpuset *cs = css_cs(css); |
1663 | cpuset_filetype_t type = cft->private; | 1652 | cpuset_filetype_t type = cft->private; |
1664 | int retval = -ENODEV; | 1653 | int retval = -ENODEV; |
1665 | 1654 | ||
@@ -1683,10 +1672,10 @@ out_unlock: | |||
1683 | /* | 1672 | /* |
1684 | * Common handling for a write to a "cpus" or "mems" file. | 1673 | * Common handling for a write to a "cpus" or "mems" file. |
1685 | */ | 1674 | */ |
1686 | static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | 1675 | static int cpuset_write_resmask(struct cgroup_subsys_state *css, |
1687 | const char *buf) | 1676 | struct cftype *cft, const char *buf) |
1688 | { | 1677 | { |
1689 | struct cpuset *cs = cgroup_cs(cgrp); | 1678 | struct cpuset *cs = css_cs(css); |
1690 | struct cpuset *trialcs; | 1679 | struct cpuset *trialcs; |
1691 | int retval = -ENODEV; | 1680 | int retval = -ENODEV; |
1692 | 1681 | ||
@@ -1765,13 +1754,12 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
1765 | return count; | 1754 | return count; |
1766 | } | 1755 | } |
1767 | 1756 | ||
1768 | static ssize_t cpuset_common_file_read(struct cgroup *cgrp, | 1757 | static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css, |
1769 | struct cftype *cft, | 1758 | struct cftype *cft, struct file *file, |
1770 | struct file *file, | 1759 | char __user *buf, size_t nbytes, |
1771 | char __user *buf, | 1760 | loff_t *ppos) |
1772 | size_t nbytes, loff_t *ppos) | ||
1773 | { | 1761 | { |
1774 | struct cpuset *cs = cgroup_cs(cgrp); | 1762 | struct cpuset *cs = css_cs(css); |
1775 | cpuset_filetype_t type = cft->private; | 1763 | cpuset_filetype_t type = cft->private; |
1776 | char *page; | 1764 | char *page; |
1777 | ssize_t retval = 0; | 1765 | ssize_t retval = 0; |
@@ -1801,9 +1789,9 @@ out: | |||
1801 | return retval; | 1789 | return retval; |
1802 | } | 1790 | } |
1803 | 1791 | ||
1804 | static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) | 1792 | static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) |
1805 | { | 1793 | { |
1806 | struct cpuset *cs = cgroup_cs(cgrp); | 1794 | struct cpuset *cs = css_cs(css); |
1807 | cpuset_filetype_t type = cft->private; | 1795 | cpuset_filetype_t type = cft->private; |
1808 | switch (type) { | 1796 | switch (type) { |
1809 | case FILE_CPU_EXCLUSIVE: | 1797 | case FILE_CPU_EXCLUSIVE: |
@@ -1832,9 +1820,9 @@ static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) | |||
1832 | return 0; | 1820 | return 0; |
1833 | } | 1821 | } |
1834 | 1822 | ||
1835 | static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) | 1823 | static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) |
1836 | { | 1824 | { |
1837 | struct cpuset *cs = cgroup_cs(cgrp); | 1825 | struct cpuset *cs = css_cs(css); |
1838 | cpuset_filetype_t type = cft->private; | 1826 | cpuset_filetype_t type = cft->private; |
1839 | switch (type) { | 1827 | switch (type) { |
1840 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | 1828 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: |
@@ -1949,11 +1937,12 @@ static struct cftype files[] = { | |||
1949 | * cgrp: control group that the new cpuset will be part of | 1937 | * cgrp: control group that the new cpuset will be part of |
1950 | */ | 1938 | */ |
1951 | 1939 | ||
1952 | static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) | 1940 | static struct cgroup_subsys_state * |
1941 | cpuset_css_alloc(struct cgroup_subsys_state *parent_css) | ||
1953 | { | 1942 | { |
1954 | struct cpuset *cs; | 1943 | struct cpuset *cs; |
1955 | 1944 | ||
1956 | if (!cgrp->parent) | 1945 | if (!parent_css) |
1957 | return &top_cpuset.css; | 1946 | return &top_cpuset.css; |
1958 | 1947 | ||
1959 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); | 1948 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); |
@@ -1973,12 +1962,12 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) | |||
1973 | return &cs->css; | 1962 | return &cs->css; |
1974 | } | 1963 | } |
1975 | 1964 | ||
1976 | static int cpuset_css_online(struct cgroup *cgrp) | 1965 | static int cpuset_css_online(struct cgroup_subsys_state *css) |
1977 | { | 1966 | { |
1978 | struct cpuset *cs = cgroup_cs(cgrp); | 1967 | struct cpuset *cs = css_cs(css); |
1979 | struct cpuset *parent = parent_cs(cs); | 1968 | struct cpuset *parent = parent_cs(cs); |
1980 | struct cpuset *tmp_cs; | 1969 | struct cpuset *tmp_cs; |
1981 | struct cgroup *pos_cg; | 1970 | struct cgroup_subsys_state *pos_css; |
1982 | 1971 | ||
1983 | if (!parent) | 1972 | if (!parent) |
1984 | return 0; | 1973 | return 0; |
@@ -1993,7 +1982,7 @@ static int cpuset_css_online(struct cgroup *cgrp) | |||
1993 | 1982 | ||
1994 | number_of_cpusets++; | 1983 | number_of_cpusets++; |
1995 | 1984 | ||
1996 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) | 1985 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) |
1997 | goto out_unlock; | 1986 | goto out_unlock; |
1998 | 1987 | ||
1999 | /* | 1988 | /* |
@@ -2010,7 +1999,7 @@ static int cpuset_css_online(struct cgroup *cgrp) | |||
2010 | * (and likewise for mems) to the new cgroup. | 1999 | * (and likewise for mems) to the new cgroup. |
2011 | */ | 2000 | */ |
2012 | rcu_read_lock(); | 2001 | rcu_read_lock(); |
2013 | cpuset_for_each_child(tmp_cs, pos_cg, parent) { | 2002 | cpuset_for_each_child(tmp_cs, pos_css, parent) { |
2014 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { | 2003 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { |
2015 | rcu_read_unlock(); | 2004 | rcu_read_unlock(); |
2016 | goto out_unlock; | 2005 | goto out_unlock; |
@@ -2027,9 +2016,15 @@ out_unlock: | |||
2027 | return 0; | 2016 | return 0; |
2028 | } | 2017 | } |
2029 | 2018 | ||
2030 | static void cpuset_css_offline(struct cgroup *cgrp) | 2019 | /* |
2020 | * If the cpuset being removed has its flag 'sched_load_balance' | ||
2021 | * enabled, then simulate turning sched_load_balance off, which | ||
2022 | * will call rebuild_sched_domains_locked(). | ||
2023 | */ | ||
2024 | |||
2025 | static void cpuset_css_offline(struct cgroup_subsys_state *css) | ||
2031 | { | 2026 | { |
2032 | struct cpuset *cs = cgroup_cs(cgrp); | 2027 | struct cpuset *cs = css_cs(css); |
2033 | 2028 | ||
2034 | mutex_lock(&cpuset_mutex); | 2029 | mutex_lock(&cpuset_mutex); |
2035 | 2030 | ||
@@ -2042,15 +2037,9 @@ static void cpuset_css_offline(struct cgroup *cgrp) | |||
2042 | mutex_unlock(&cpuset_mutex); | 2037 | mutex_unlock(&cpuset_mutex); |
2043 | } | 2038 | } |
2044 | 2039 | ||
2045 | /* | 2040 | static void cpuset_css_free(struct cgroup_subsys_state *css) |
2046 | * If the cpuset being removed has its flag 'sched_load_balance' | ||
2047 | * enabled, then simulate turning sched_load_balance off, which | ||
2048 | * will call rebuild_sched_domains_locked(). | ||
2049 | */ | ||
2050 | |||
2051 | static void cpuset_css_free(struct cgroup *cgrp) | ||
2052 | { | 2041 | { |
2053 | struct cpuset *cs = cgroup_cs(cgrp); | 2042 | struct cpuset *cs = css_cs(css); |
2054 | 2043 | ||
2055 | free_cpumask_var(cs->cpus_allowed); | 2044 | free_cpumask_var(cs->cpus_allowed); |
2056 | kfree(cs); | 2045 | kfree(cs); |
@@ -2257,11 +2246,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
2257 | /* if cpus or mems changed, we need to propagate to descendants */ | 2246 | /* if cpus or mems changed, we need to propagate to descendants */ |
2258 | if (cpus_updated || mems_updated) { | 2247 | if (cpus_updated || mems_updated) { |
2259 | struct cpuset *cs; | 2248 | struct cpuset *cs; |
2260 | struct cgroup *pos_cgrp; | 2249 | struct cgroup_subsys_state *pos_css; |
2261 | 2250 | ||
2262 | rcu_read_lock(); | 2251 | rcu_read_lock(); |
2263 | cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { | 2252 | cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { |
2264 | if (!css_tryget(&cs->css)) | 2253 | if (cs == &top_cpuset || !css_tryget(&cs->css)) |
2265 | continue; | 2254 | continue; |
2266 | rcu_read_unlock(); | 2255 | rcu_read_unlock(); |
2267 | 2256 | ||
@@ -2350,7 +2339,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) | |||
2350 | 2339 | ||
2351 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) | 2340 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) |
2352 | { | 2341 | { |
2353 | const struct cpuset *cpus_cs; | 2342 | struct cpuset *cpus_cs; |
2354 | 2343 | ||
2355 | rcu_read_lock(); | 2344 | rcu_read_lock(); |
2356 | cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); | 2345 | cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); |
@@ -2423,7 +2412,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) | |||
2423 | * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall | 2412 | * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall |
2424 | * (an unusual configuration), then returns the root cpuset. | 2413 | * (an unusual configuration), then returns the root cpuset. |
2425 | */ | 2414 | */ |
2426 | static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) | 2415 | static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) |
2427 | { | 2416 | { |
2428 | while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) | 2417 | while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) |
2429 | cs = parent_cs(cs); | 2418 | cs = parent_cs(cs); |
@@ -2493,7 +2482,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) | |||
2493 | */ | 2482 | */ |
2494 | int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) | 2483 | int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) |
2495 | { | 2484 | { |
2496 | const struct cpuset *cs; /* current cpuset ancestors */ | 2485 | struct cpuset *cs; /* current cpuset ancestors */ |
2497 | int allowed; /* is allocation in zone z allowed? */ | 2486 | int allowed; /* is allocation in zone z allowed? */ |
2498 | 2487 | ||
2499 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) | 2488 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) |
@@ -2731,7 +2720,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) | |||
2731 | goto out_free; | 2720 | goto out_free; |
2732 | 2721 | ||
2733 | rcu_read_lock(); | 2722 | rcu_read_lock(); |
2734 | css = task_subsys_state(tsk, cpuset_subsys_id); | 2723 | css = task_css(tsk, cpuset_subsys_id); |
2735 | retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); | 2724 | retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); |
2736 | rcu_read_unlock(); | 2725 | rcu_read_unlock(); |
2737 | if (retval < 0) | 2726 | if (retval < 0) |
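The cpuset hunks above repeatedly swap cgroup_cs(cgrp) for css_cs(css); both are assumed to be thin container_of() wrappers around the embedded cgroup_subsys_state, so the conversion is mechanical. A minimal, compilable userspace sketch of that pattern (illustrative names only, not the kernel's actual definitions):

#include <stddef.h>
#include <stdio.h>

struct css { int id; };                         /* stands in for cgroup_subsys_state */
struct cpuset { long flags; struct css css; };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* analogous to css_cs(): map the embedded css back to its containing cpuset */
static struct cpuset *css_cs(struct css *css)
{
        return css ? container_of(css, struct cpuset, css) : NULL;
}

int main(void)
{
        struct cpuset cs = { .flags = 0x1, .css = { .id = 7 } };
        printf("flags=%ld\n", css_cs(&cs.css)->flags);   /* prints flags=1 */
        return 0;
}

Because the lookup is just pointer arithmetic, passing the css directly (as the new callbacks do) costs nothing compared to going through the cgroup first.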
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c77206184b8b..97b67df8fbfe 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c | |||
@@ -116,6 +116,9 @@ int get_callchain_buffers(void) | |||
116 | 116 | ||
117 | err = alloc_callchain_buffers(); | 117 | err = alloc_callchain_buffers(); |
118 | exit: | 118 | exit: |
119 | if (err) | ||
120 | atomic_dec(&nr_callchain_events); | ||
121 | |||
119 | mutex_unlock(&callchain_mutex); | 122 | mutex_unlock(&callchain_mutex); |
120 | 123 | ||
121 | return err; | 124 | return err; |
diff --git a/kernel/events/core.c b/kernel/events/core.c index f86599e8c123..953c14348375 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -145,6 +145,7 @@ static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); | |||
145 | static atomic_t nr_mmap_events __read_mostly; | 145 | static atomic_t nr_mmap_events __read_mostly; |
146 | static atomic_t nr_comm_events __read_mostly; | 146 | static atomic_t nr_comm_events __read_mostly; |
147 | static atomic_t nr_task_events __read_mostly; | 147 | static atomic_t nr_task_events __read_mostly; |
148 | static atomic_t nr_freq_events __read_mostly; | ||
148 | 149 | ||
149 | static LIST_HEAD(pmus); | 150 | static LIST_HEAD(pmus); |
150 | static DEFINE_MUTEX(pmus_lock); | 151 | static DEFINE_MUTEX(pmus_lock); |
@@ -340,8 +341,8 @@ struct perf_cgroup { | |||
340 | static inline struct perf_cgroup * | 341 | static inline struct perf_cgroup * |
341 | perf_cgroup_from_task(struct task_struct *task) | 342 | perf_cgroup_from_task(struct task_struct *task) |
342 | { | 343 | { |
343 | return container_of(task_subsys_state(task, perf_subsys_id), | 344 | return container_of(task_css(task, perf_subsys_id), |
344 | struct perf_cgroup, css); | 345 | struct perf_cgroup, css); |
345 | } | 346 | } |
346 | 347 | ||
347 | static inline bool | 348 | static inline bool |
@@ -591,7 +592,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
591 | if (!f.file) | 592 | if (!f.file) |
592 | return -EBADF; | 593 | return -EBADF; |
593 | 594 | ||
594 | css = cgroup_css_from_dir(f.file, perf_subsys_id); | 595 | rcu_read_lock(); |
596 | |||
597 | css = css_from_dir(f.file->f_dentry, &perf_subsys); | ||
595 | if (IS_ERR(css)) { | 598 | if (IS_ERR(css)) { |
596 | ret = PTR_ERR(css); | 599 | ret = PTR_ERR(css); |
597 | goto out; | 600 | goto out; |
@@ -617,6 +620,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
617 | ret = -EINVAL; | 620 | ret = -EINVAL; |
618 | } | 621 | } |
619 | out: | 622 | out: |
623 | rcu_read_unlock(); | ||
620 | fdput(f); | 624 | fdput(f); |
621 | return ret; | 625 | return ret; |
622 | } | 626 | } |
@@ -869,12 +873,8 @@ static void perf_pmu_rotate_start(struct pmu *pmu) | |||
869 | 873 | ||
870 | WARN_ON(!irqs_disabled()); | 874 | WARN_ON(!irqs_disabled()); |
871 | 875 | ||
872 | if (list_empty(&cpuctx->rotation_list)) { | 876 | if (list_empty(&cpuctx->rotation_list)) |
873 | int was_empty = list_empty(head); | ||
874 | list_add(&cpuctx->rotation_list, head); | 877 | list_add(&cpuctx->rotation_list, head); |
875 | if (was_empty) | ||
876 | tick_nohz_full_kick(); | ||
877 | } | ||
878 | } | 878 | } |
879 | 879 | ||
880 | static void get_ctx(struct perf_event_context *ctx) | 880 | static void get_ctx(struct perf_event_context *ctx) |
@@ -1216,6 +1216,9 @@ static void perf_event__id_header_size(struct perf_event *event) | |||
1216 | if (sample_type & PERF_SAMPLE_TIME) | 1216 | if (sample_type & PERF_SAMPLE_TIME) |
1217 | size += sizeof(data->time); | 1217 | size += sizeof(data->time); |
1218 | 1218 | ||
1219 | if (sample_type & PERF_SAMPLE_IDENTIFIER) | ||
1220 | size += sizeof(data->id); | ||
1221 | |||
1219 | if (sample_type & PERF_SAMPLE_ID) | 1222 | if (sample_type & PERF_SAMPLE_ID) |
1220 | size += sizeof(data->id); | 1223 | size += sizeof(data->id); |
1221 | 1224 | ||
@@ -2712,7 +2715,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, | |||
2712 | 2715 | ||
2713 | hwc = &event->hw; | 2716 | hwc = &event->hw; |
2714 | 2717 | ||
2715 | if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { | 2718 | if (hwc->interrupts == MAX_INTERRUPTS) { |
2716 | hwc->interrupts = 0; | 2719 | hwc->interrupts = 0; |
2717 | perf_log_throttle(event, 1); | 2720 | perf_log_throttle(event, 1); |
2718 | event->pmu->start(event, 0); | 2721 | event->pmu->start(event, 0); |
@@ -2811,10 +2814,11 @@ done: | |||
2811 | #ifdef CONFIG_NO_HZ_FULL | 2814 | #ifdef CONFIG_NO_HZ_FULL |
2812 | bool perf_event_can_stop_tick(void) | 2815 | bool perf_event_can_stop_tick(void) |
2813 | { | 2816 | { |
2814 | if (list_empty(&__get_cpu_var(rotation_list))) | 2817 | if (atomic_read(&nr_freq_events) || |
2815 | return true; | 2818 | __this_cpu_read(perf_throttled_count)) |
2816 | else | ||
2817 | return false; | 2819 | return false; |
2820 | else | ||
2821 | return true; | ||
2818 | } | 2822 | } |
2819 | #endif | 2823 | #endif |
2820 | 2824 | ||
@@ -3128,36 +3132,63 @@ static void free_event_rcu(struct rcu_head *head) | |||
3128 | static void ring_buffer_put(struct ring_buffer *rb); | 3132 | static void ring_buffer_put(struct ring_buffer *rb); |
3129 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); | 3133 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); |
3130 | 3134 | ||
3131 | static void free_event(struct perf_event *event) | 3135 | static void unaccount_event_cpu(struct perf_event *event, int cpu) |
3132 | { | 3136 | { |
3133 | irq_work_sync(&event->pending); | 3137 | if (event->parent) |
3138 | return; | ||
3139 | |||
3140 | if (has_branch_stack(event)) { | ||
3141 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
3142 | atomic_dec(&per_cpu(perf_branch_stack_events, cpu)); | ||
3143 | } | ||
3144 | if (is_cgroup_event(event)) | ||
3145 | atomic_dec(&per_cpu(perf_cgroup_events, cpu)); | ||
3146 | } | ||
3147 | |||
3148 | static void unaccount_event(struct perf_event *event) | ||
3149 | { | ||
3150 | if (event->parent) | ||
3151 | return; | ||
3152 | |||
3153 | if (event->attach_state & PERF_ATTACH_TASK) | ||
3154 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3155 | if (event->attr.mmap || event->attr.mmap_data) | ||
3156 | atomic_dec(&nr_mmap_events); | ||
3157 | if (event->attr.comm) | ||
3158 | atomic_dec(&nr_comm_events); | ||
3159 | if (event->attr.task) | ||
3160 | atomic_dec(&nr_task_events); | ||
3161 | if (event->attr.freq) | ||
3162 | atomic_dec(&nr_freq_events); | ||
3163 | if (is_cgroup_event(event)) | ||
3164 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3165 | if (has_branch_stack(event)) | ||
3166 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3167 | |||
3168 | unaccount_event_cpu(event, event->cpu); | ||
3169 | } | ||
3134 | 3170 | ||
3171 | static void __free_event(struct perf_event *event) | ||
3172 | { | ||
3135 | if (!event->parent) { | 3173 | if (!event->parent) { |
3136 | if (event->attach_state & PERF_ATTACH_TASK) | ||
3137 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3138 | if (event->attr.mmap || event->attr.mmap_data) | ||
3139 | atomic_dec(&nr_mmap_events); | ||
3140 | if (event->attr.comm) | ||
3141 | atomic_dec(&nr_comm_events); | ||
3142 | if (event->attr.task) | ||
3143 | atomic_dec(&nr_task_events); | ||
3144 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) | 3174 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) |
3145 | put_callchain_buffers(); | 3175 | put_callchain_buffers(); |
3146 | if (is_cgroup_event(event)) { | ||
3147 | atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); | ||
3148 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3149 | } | ||
3150 | |||
3151 | if (has_branch_stack(event)) { | ||
3152 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3153 | /* is system-wide event */ | ||
3154 | if (!(event->attach_state & PERF_ATTACH_TASK)) { | ||
3155 | atomic_dec(&per_cpu(perf_branch_stack_events, | ||
3156 | event->cpu)); | ||
3157 | } | ||
3158 | } | ||
3159 | } | 3176 | } |
3160 | 3177 | ||
3178 | if (event->destroy) | ||
3179 | event->destroy(event); | ||
3180 | |||
3181 | if (event->ctx) | ||
3182 | put_ctx(event->ctx); | ||
3183 | |||
3184 | call_rcu(&event->rcu_head, free_event_rcu); | ||
3185 | } | ||
3186 | static void free_event(struct perf_event *event) | ||
3187 | { | ||
3188 | irq_work_sync(&event->pending); | ||
3189 | |||
3190 | unaccount_event(event); | ||
3191 | |||
3161 | if (event->rb) { | 3192 | if (event->rb) { |
3162 | struct ring_buffer *rb; | 3193 | struct ring_buffer *rb; |
3163 | 3194 | ||
@@ -3180,13 +3211,8 @@ static void free_event(struct perf_event *event) | |||
3180 | if (is_cgroup_event(event)) | 3211 | if (is_cgroup_event(event)) |
3181 | perf_detach_cgroup(event); | 3212 | perf_detach_cgroup(event); |
3182 | 3213 | ||
3183 | if (event->destroy) | ||
3184 | event->destroy(event); | ||
3185 | 3214 | ||
3186 | if (event->ctx) | 3215 | __free_event(event); |
3187 | put_ctx(event->ctx); | ||
3188 | |||
3189 | call_rcu(&event->rcu_head, free_event_rcu); | ||
3190 | } | 3216 | } |
3191 | 3217 | ||
3192 | int perf_event_release_kernel(struct perf_event *event) | 3218 | int perf_event_release_kernel(struct perf_event *event) |
@@ -3544,6 +3570,15 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
3544 | case PERF_EVENT_IOC_PERIOD: | 3570 | case PERF_EVENT_IOC_PERIOD: |
3545 | return perf_event_period(event, (u64 __user *)arg); | 3571 | return perf_event_period(event, (u64 __user *)arg); |
3546 | 3572 | ||
3573 | case PERF_EVENT_IOC_ID: | ||
3574 | { | ||
3575 | u64 id = primary_event_id(event); | ||
3576 | |||
3577 | if (copy_to_user((void __user *)arg, &id, sizeof(id))) | ||
3578 | return -EFAULT; | ||
3579 | return 0; | ||
3580 | } | ||
3581 | |||
3547 | case PERF_EVENT_IOC_SET_OUTPUT: | 3582 | case PERF_EVENT_IOC_SET_OUTPUT: |
3548 | { | 3583 | { |
3549 | int ret; | 3584 | int ret; |
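The new PERF_EVENT_IOC_ID ioctl lets userspace ask an event fd for the same primary id that appears in its samples. A rough usage sketch, assuming headers and a kernel new enough to define the ioctl; perf_event_open() has no libc wrapper, so it goes through syscall(2), and error handling is kept minimal:

#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        struct perf_event_attr attr;
        unsigned long long id = 0;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size   = sizeof(attr);
        attr.type   = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_CPU_CLOCK;

        /* count CPU clock for the calling thread on any CPU */
        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        /* PERF_EVENT_IOC_ID copies the primary event id out through the pointer */
        if (ioctl(fd, PERF_EVENT_IOC_ID, &id) == 0)
                printf("event id: %llu\n", id);

        close(fd);
        return 0;
}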
@@ -3625,6 +3660,26 @@ static void calc_timer_values(struct perf_event *event, | |||
3625 | *running = ctx_time - event->tstamp_running; | 3660 | *running = ctx_time - event->tstamp_running; |
3626 | } | 3661 | } |
3627 | 3662 | ||
3663 | static void perf_event_init_userpage(struct perf_event *event) | ||
3664 | { | ||
3665 | struct perf_event_mmap_page *userpg; | ||
3666 | struct ring_buffer *rb; | ||
3667 | |||
3668 | rcu_read_lock(); | ||
3669 | rb = rcu_dereference(event->rb); | ||
3670 | if (!rb) | ||
3671 | goto unlock; | ||
3672 | |||
3673 | userpg = rb->user_page; | ||
3674 | |||
3675 | /* Allow new userspace to detect that bit 0 is deprecated */ | ||
3676 | userpg->cap_bit0_is_deprecated = 1; | ||
3677 | userpg->size = offsetof(struct perf_event_mmap_page, __reserved); | ||
3678 | |||
3679 | unlock: | ||
3680 | rcu_read_unlock(); | ||
3681 | } | ||
3682 | |||
3628 | void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) | 3683 | void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) |
3629 | { | 3684 | { |
3630 | } | 3685 | } |
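perf_event_init_userpage() stamps the mmap'd control page so readers can distinguish the old single "bit 0" capability layout from the split per-capability bits. A hedged sketch of the check a userspace reader might perform; the field names are assumed from the perf_event_mmap_page capability union as of this series, so uapi headers of at least this vintage are required:

#include <linux/perf_event.h>
#include <stdbool.h>

/* decide whether self-monitoring via rdpmc is actually advertised */
static bool can_use_rdpmc(const struct perf_event_mmap_page *pg)
{
        if (pg->cap_bit0_is_deprecated)   /* new layout: capability bits are individual */
                return pg->cap_user_rdpmc;
        return pg->cap_bit0;              /* old layout: one ambiguous bit */
}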
@@ -3641,6 +3696,10 @@ void perf_event_update_userpage(struct perf_event *event) | |||
3641 | u64 enabled, running, now; | 3696 | u64 enabled, running, now; |
3642 | 3697 | ||
3643 | rcu_read_lock(); | 3698 | rcu_read_lock(); |
3699 | rb = rcu_dereference(event->rb); | ||
3700 | if (!rb) | ||
3701 | goto unlock; | ||
3702 | |||
3644 | /* | 3703 | /* |
3645 | * compute total_time_enabled, total_time_running | 3704 | * compute total_time_enabled, total_time_running |
3646 | * based on snapshot values taken when the event | 3705 | * based on snapshot values taken when the event |
@@ -3651,12 +3710,8 @@ void perf_event_update_userpage(struct perf_event *event) | |||
3651 | * NMI context | 3710 | * NMI context |
3652 | */ | 3711 | */ |
3653 | calc_timer_values(event, &now, &enabled, &running); | 3712 | calc_timer_values(event, &now, &enabled, &running); |
3654 | rb = rcu_dereference(event->rb); | ||
3655 | if (!rb) | ||
3656 | goto unlock; | ||
3657 | 3713 | ||
3658 | userpg = rb->user_page; | 3714 | userpg = rb->user_page; |
3659 | |||
3660 | /* | 3715 | /* |
3661 | * Disable preemption so as to not let the corresponding user-space | 3716 | * Disable preemption so as to not let the corresponding user-space |
3662 | * spin too long if we get preempted. | 3717 | * spin too long if we get preempted. |
@@ -4009,6 +4064,7 @@ again: | |||
4009 | ring_buffer_attach(event, rb); | 4064 | ring_buffer_attach(event, rb); |
4010 | rcu_assign_pointer(event->rb, rb); | 4065 | rcu_assign_pointer(event->rb, rb); |
4011 | 4066 | ||
4067 | perf_event_init_userpage(event); | ||
4012 | perf_event_update_userpage(event); | 4068 | perf_event_update_userpage(event); |
4013 | 4069 | ||
4014 | unlock: | 4070 | unlock: |
@@ -4251,7 +4307,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header, | |||
4251 | if (sample_type & PERF_SAMPLE_TIME) | 4307 | if (sample_type & PERF_SAMPLE_TIME) |
4252 | data->time = perf_clock(); | 4308 | data->time = perf_clock(); |
4253 | 4309 | ||
4254 | if (sample_type & PERF_SAMPLE_ID) | 4310 | if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) |
4255 | data->id = primary_event_id(event); | 4311 | data->id = primary_event_id(event); |
4256 | 4312 | ||
4257 | if (sample_type & PERF_SAMPLE_STREAM_ID) | 4313 | if (sample_type & PERF_SAMPLE_STREAM_ID) |
@@ -4290,6 +4346,9 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle, | |||
4290 | 4346 | ||
4291 | if (sample_type & PERF_SAMPLE_CPU) | 4347 | if (sample_type & PERF_SAMPLE_CPU) |
4292 | perf_output_put(handle, data->cpu_entry); | 4348 | perf_output_put(handle, data->cpu_entry); |
4349 | |||
4350 | if (sample_type & PERF_SAMPLE_IDENTIFIER) | ||
4351 | perf_output_put(handle, data->id); | ||
4293 | } | 4352 | } |
4294 | 4353 | ||
4295 | void perf_event__output_id_sample(struct perf_event *event, | 4354 | void perf_event__output_id_sample(struct perf_event *event, |
@@ -4355,7 +4414,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
4355 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { | 4414 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { |
4356 | n = 0; | 4415 | n = 0; |
4357 | 4416 | ||
4358 | if (sub != event) | 4417 | if ((sub != event) && |
4418 | (sub->state == PERF_EVENT_STATE_ACTIVE)) | ||
4359 | sub->pmu->read(sub); | 4419 | sub->pmu->read(sub); |
4360 | 4420 | ||
4361 | values[n++] = perf_event_count(sub); | 4421 | values[n++] = perf_event_count(sub); |
@@ -4402,6 +4462,9 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
4402 | 4462 | ||
4403 | perf_output_put(handle, *header); | 4463 | perf_output_put(handle, *header); |
4404 | 4464 | ||
4465 | if (sample_type & PERF_SAMPLE_IDENTIFIER) | ||
4466 | perf_output_put(handle, data->id); | ||
4467 | |||
4405 | if (sample_type & PERF_SAMPLE_IP) | 4468 | if (sample_type & PERF_SAMPLE_IP) |
4406 | perf_output_put(handle, data->ip); | 4469 | perf_output_put(handle, data->ip); |
4407 | 4470 | ||
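The point of PERF_SAMPLE_IDENTIFIER is a fixed position for the id: it is emitted first in a PERF_RECORD_SAMPLE body and last in the sample_id trailer appended to every other record, so a consumer can demultiplex a shared ring without first knowing each event's full sample_type. A hedged parsing sketch, assuming every event feeding the ring was opened with PERF_SAMPLE_IDENTIFIER set:

#include <linux/perf_event.h>
#include <stdint.h>
#include <stddef.h>

static uint64_t record_id(const struct perf_event_header *hdr)
{
        const uint64_t *body = (const uint64_t *)(hdr + 1);
        size_t n = (hdr->size - sizeof(*hdr)) / sizeof(uint64_t);

        if (hdr->type == PERF_RECORD_SAMPLE)
                return body[0];        /* identifier is written first */
        return body[n - 1];            /* sample_id trailer ends with the identifier */
}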
@@ -4462,20 +4525,6 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
4462 | } | 4525 | } |
4463 | } | 4526 | } |
4464 | 4527 | ||
4465 | if (!event->attr.watermark) { | ||
4466 | int wakeup_events = event->attr.wakeup_events; | ||
4467 | |||
4468 | if (wakeup_events) { | ||
4469 | struct ring_buffer *rb = handle->rb; | ||
4470 | int events = local_inc_return(&rb->events); | ||
4471 | |||
4472 | if (events >= wakeup_events) { | ||
4473 | local_sub(wakeup_events, &rb->events); | ||
4474 | local_inc(&rb->wakeup); | ||
4475 | } | ||
4476 | } | ||
4477 | } | ||
4478 | |||
4479 | if (sample_type & PERF_SAMPLE_BRANCH_STACK) { | 4528 | if (sample_type & PERF_SAMPLE_BRANCH_STACK) { |
4480 | if (data->br_stack) { | 4529 | if (data->br_stack) { |
4481 | size_t size; | 4530 | size_t size; |
@@ -4511,16 +4560,31 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
4511 | } | 4560 | } |
4512 | } | 4561 | } |
4513 | 4562 | ||
4514 | if (sample_type & PERF_SAMPLE_STACK_USER) | 4563 | if (sample_type & PERF_SAMPLE_STACK_USER) { |
4515 | perf_output_sample_ustack(handle, | 4564 | perf_output_sample_ustack(handle, |
4516 | data->stack_user_size, | 4565 | data->stack_user_size, |
4517 | data->regs_user.regs); | 4566 | data->regs_user.regs); |
4567 | } | ||
4518 | 4568 | ||
4519 | if (sample_type & PERF_SAMPLE_WEIGHT) | 4569 | if (sample_type & PERF_SAMPLE_WEIGHT) |
4520 | perf_output_put(handle, data->weight); | 4570 | perf_output_put(handle, data->weight); |
4521 | 4571 | ||
4522 | if (sample_type & PERF_SAMPLE_DATA_SRC) | 4572 | if (sample_type & PERF_SAMPLE_DATA_SRC) |
4523 | perf_output_put(handle, data->data_src.val); | 4573 | perf_output_put(handle, data->data_src.val); |
4574 | |||
4575 | if (!event->attr.watermark) { | ||
4576 | int wakeup_events = event->attr.wakeup_events; | ||
4577 | |||
4578 | if (wakeup_events) { | ||
4579 | struct ring_buffer *rb = handle->rb; | ||
4580 | int events = local_inc_return(&rb->events); | ||
4581 | |||
4582 | if (events >= wakeup_events) { | ||
4583 | local_sub(wakeup_events, &rb->events); | ||
4584 | local_inc(&rb->wakeup); | ||
4585 | } | ||
4586 | } | ||
4587 | } | ||
4524 | } | 4588 | } |
4525 | 4589 | ||
4526 | void perf_prepare_sample(struct perf_event_header *header, | 4590 | void perf_prepare_sample(struct perf_event_header *header, |
@@ -4680,12 +4744,10 @@ perf_event_read_event(struct perf_event *event, | |||
4680 | perf_output_end(&handle); | 4744 | perf_output_end(&handle); |
4681 | } | 4745 | } |
4682 | 4746 | ||
4683 | typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data); | ||
4684 | typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); | 4747 | typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); |
4685 | 4748 | ||
4686 | static void | 4749 | static void |
4687 | perf_event_aux_ctx(struct perf_event_context *ctx, | 4750 | perf_event_aux_ctx(struct perf_event_context *ctx, |
4688 | perf_event_aux_match_cb match, | ||
4689 | perf_event_aux_output_cb output, | 4751 | perf_event_aux_output_cb output, |
4690 | void *data) | 4752 | void *data) |
4691 | { | 4753 | { |
@@ -4696,15 +4758,12 @@ perf_event_aux_ctx(struct perf_event_context *ctx, | |||
4696 | continue; | 4758 | continue; |
4697 | if (!event_filter_match(event)) | 4759 | if (!event_filter_match(event)) |
4698 | continue; | 4760 | continue; |
4699 | if (match(event, data)) | 4761 | output(event, data); |
4700 | output(event, data); | ||
4701 | } | 4762 | } |
4702 | } | 4763 | } |
4703 | 4764 | ||
4704 | static void | 4765 | static void |
4705 | perf_event_aux(perf_event_aux_match_cb match, | 4766 | perf_event_aux(perf_event_aux_output_cb output, void *data, |
4706 | perf_event_aux_output_cb output, | ||
4707 | void *data, | ||
4708 | struct perf_event_context *task_ctx) | 4767 | struct perf_event_context *task_ctx) |
4709 | { | 4768 | { |
4710 | struct perf_cpu_context *cpuctx; | 4769 | struct perf_cpu_context *cpuctx; |
@@ -4717,7 +4776,7 @@ perf_event_aux(perf_event_aux_match_cb match, | |||
4717 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 4776 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
4718 | if (cpuctx->unique_pmu != pmu) | 4777 | if (cpuctx->unique_pmu != pmu) |
4719 | goto next; | 4778 | goto next; |
4720 | perf_event_aux_ctx(&cpuctx->ctx, match, output, data); | 4779 | perf_event_aux_ctx(&cpuctx->ctx, output, data); |
4721 | if (task_ctx) | 4780 | if (task_ctx) |
4722 | goto next; | 4781 | goto next; |
4723 | ctxn = pmu->task_ctx_nr; | 4782 | ctxn = pmu->task_ctx_nr; |
@@ -4725,14 +4784,14 @@ perf_event_aux(perf_event_aux_match_cb match, | |||
4725 | goto next; | 4784 | goto next; |
4726 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | 4785 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); |
4727 | if (ctx) | 4786 | if (ctx) |
4728 | perf_event_aux_ctx(ctx, match, output, data); | 4787 | perf_event_aux_ctx(ctx, output, data); |
4729 | next: | 4788 | next: |
4730 | put_cpu_ptr(pmu->pmu_cpu_context); | 4789 | put_cpu_ptr(pmu->pmu_cpu_context); |
4731 | } | 4790 | } |
4732 | 4791 | ||
4733 | if (task_ctx) { | 4792 | if (task_ctx) { |
4734 | preempt_disable(); | 4793 | preempt_disable(); |
4735 | perf_event_aux_ctx(task_ctx, match, output, data); | 4794 | perf_event_aux_ctx(task_ctx, output, data); |
4736 | preempt_enable(); | 4795 | preempt_enable(); |
4737 | } | 4796 | } |
4738 | rcu_read_unlock(); | 4797 | rcu_read_unlock(); |
@@ -4741,7 +4800,7 @@ next: | |||
4741 | /* | 4800 | /* |
4742 | * task tracking -- fork/exit | 4801 | * task tracking -- fork/exit |
4743 | * | 4802 | * |
4744 | * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task | 4803 | * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task |
4745 | */ | 4804 | */ |
4746 | 4805 | ||
4747 | struct perf_task_event { | 4806 | struct perf_task_event { |
@@ -4759,6 +4818,13 @@ struct perf_task_event { | |||
4759 | } event_id; | 4818 | } event_id; |
4760 | }; | 4819 | }; |
4761 | 4820 | ||
4821 | static int perf_event_task_match(struct perf_event *event) | ||
4822 | { | ||
4823 | return event->attr.comm || event->attr.mmap || | ||
4824 | event->attr.mmap2 || event->attr.mmap_data || | ||
4825 | event->attr.task; | ||
4826 | } | ||
4827 | |||
4762 | static void perf_event_task_output(struct perf_event *event, | 4828 | static void perf_event_task_output(struct perf_event *event, |
4763 | void *data) | 4829 | void *data) |
4764 | { | 4830 | { |
@@ -4768,6 +4834,9 @@ static void perf_event_task_output(struct perf_event *event, | |||
4768 | struct task_struct *task = task_event->task; | 4834 | struct task_struct *task = task_event->task; |
4769 | int ret, size = task_event->event_id.header.size; | 4835 | int ret, size = task_event->event_id.header.size; |
4770 | 4836 | ||
4837 | if (!perf_event_task_match(event)) | ||
4838 | return; | ||
4839 | |||
4771 | perf_event_header__init_id(&task_event->event_id.header, &sample, event); | 4840 | perf_event_header__init_id(&task_event->event_id.header, &sample, event); |
4772 | 4841 | ||
4773 | ret = perf_output_begin(&handle, event, | 4842 | ret = perf_output_begin(&handle, event, |
@@ -4790,13 +4859,6 @@ out: | |||
4790 | task_event->event_id.header.size = size; | 4859 | task_event->event_id.header.size = size; |
4791 | } | 4860 | } |
4792 | 4861 | ||
4793 | static int perf_event_task_match(struct perf_event *event, | ||
4794 | void *data __maybe_unused) | ||
4795 | { | ||
4796 | return event->attr.comm || event->attr.mmap || | ||
4797 | event->attr.mmap_data || event->attr.task; | ||
4798 | } | ||
4799 | |||
4800 | static void perf_event_task(struct task_struct *task, | 4862 | static void perf_event_task(struct task_struct *task, |
4801 | struct perf_event_context *task_ctx, | 4863 | struct perf_event_context *task_ctx, |
4802 | int new) | 4864 | int new) |
@@ -4825,8 +4887,7 @@ static void perf_event_task(struct task_struct *task, | |||
4825 | }, | 4887 | }, |
4826 | }; | 4888 | }; |
4827 | 4889 | ||
4828 | perf_event_aux(perf_event_task_match, | 4890 | perf_event_aux(perf_event_task_output, |
4829 | perf_event_task_output, | ||
4830 | &task_event, | 4891 | &task_event, |
4831 | task_ctx); | 4892 | task_ctx); |
4832 | } | 4893 | } |
@@ -4853,6 +4914,11 @@ struct perf_comm_event { | |||
4853 | } event_id; | 4914 | } event_id; |
4854 | }; | 4915 | }; |
4855 | 4916 | ||
4917 | static int perf_event_comm_match(struct perf_event *event) | ||
4918 | { | ||
4919 | return event->attr.comm; | ||
4920 | } | ||
4921 | |||
4856 | static void perf_event_comm_output(struct perf_event *event, | 4922 | static void perf_event_comm_output(struct perf_event *event, |
4857 | void *data) | 4923 | void *data) |
4858 | { | 4924 | { |
@@ -4862,6 +4928,9 @@ static void perf_event_comm_output(struct perf_event *event, | |||
4862 | int size = comm_event->event_id.header.size; | 4928 | int size = comm_event->event_id.header.size; |
4863 | int ret; | 4929 | int ret; |
4864 | 4930 | ||
4931 | if (!perf_event_comm_match(event)) | ||
4932 | return; | ||
4933 | |||
4865 | perf_event_header__init_id(&comm_event->event_id.header, &sample, event); | 4934 | perf_event_header__init_id(&comm_event->event_id.header, &sample, event); |
4866 | ret = perf_output_begin(&handle, event, | 4935 | ret = perf_output_begin(&handle, event, |
4867 | comm_event->event_id.header.size); | 4936 | comm_event->event_id.header.size); |
@@ -4883,12 +4952,6 @@ out: | |||
4883 | comm_event->event_id.header.size = size; | 4952 | comm_event->event_id.header.size = size; |
4884 | } | 4953 | } |
4885 | 4954 | ||
4886 | static int perf_event_comm_match(struct perf_event *event, | ||
4887 | void *data __maybe_unused) | ||
4888 | { | ||
4889 | return event->attr.comm; | ||
4890 | } | ||
4891 | |||
4892 | static void perf_event_comm_event(struct perf_comm_event *comm_event) | 4955 | static void perf_event_comm_event(struct perf_comm_event *comm_event) |
4893 | { | 4956 | { |
4894 | char comm[TASK_COMM_LEN]; | 4957 | char comm[TASK_COMM_LEN]; |
@@ -4903,8 +4966,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
4903 | 4966 | ||
4904 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; | 4967 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; |
4905 | 4968 | ||
4906 | perf_event_aux(perf_event_comm_match, | 4969 | perf_event_aux(perf_event_comm_output, |
4907 | perf_event_comm_output, | ||
4908 | comm_event, | 4970 | comm_event, |
4909 | NULL); | 4971 | NULL); |
4910 | } | 4972 | } |
@@ -4955,6 +5017,9 @@ struct perf_mmap_event { | |||
4955 | 5017 | ||
4956 | const char *file_name; | 5018 | const char *file_name; |
4957 | int file_size; | 5019 | int file_size; |
5020 | int maj, min; | ||
5021 | u64 ino; | ||
5022 | u64 ino_generation; | ||
4958 | 5023 | ||
4959 | struct { | 5024 | struct { |
4960 | struct perf_event_header header; | 5025 | struct perf_event_header header; |
@@ -4967,6 +5032,17 @@ struct perf_mmap_event { | |||
4967 | } event_id; | 5032 | } event_id; |
4968 | }; | 5033 | }; |
4969 | 5034 | ||
5035 | static int perf_event_mmap_match(struct perf_event *event, | ||
5036 | void *data) | ||
5037 | { | ||
5038 | struct perf_mmap_event *mmap_event = data; | ||
5039 | struct vm_area_struct *vma = mmap_event->vma; | ||
5040 | int executable = vma->vm_flags & VM_EXEC; | ||
5041 | |||
5042 | return (!executable && event->attr.mmap_data) || | ||
5043 | (executable && (event->attr.mmap || event->attr.mmap2)); | ||
5044 | } | ||
5045 | |||
4970 | static void perf_event_mmap_output(struct perf_event *event, | 5046 | static void perf_event_mmap_output(struct perf_event *event, |
4971 | void *data) | 5047 | void *data) |
4972 | { | 5048 | { |
@@ -4976,6 +5052,17 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
4976 | int size = mmap_event->event_id.header.size; | 5052 | int size = mmap_event->event_id.header.size; |
4977 | int ret; | 5053 | int ret; |
4978 | 5054 | ||
5055 | if (!perf_event_mmap_match(event, data)) | ||
5056 | return; | ||
5057 | |||
5058 | if (event->attr.mmap2) { | ||
5059 | mmap_event->event_id.header.type = PERF_RECORD_MMAP2; | ||
5060 | mmap_event->event_id.header.size += sizeof(mmap_event->maj); | ||
5061 | mmap_event->event_id.header.size += sizeof(mmap_event->min); | ||
5062 | mmap_event->event_id.header.size += sizeof(mmap_event->ino); | ||
5063 | mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); | ||
5064 | } | ||
5065 | |||
4979 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); | 5066 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); |
4980 | ret = perf_output_begin(&handle, event, | 5067 | ret = perf_output_begin(&handle, event, |
4981 | mmap_event->event_id.header.size); | 5068 | mmap_event->event_id.header.size); |
@@ -4986,6 +5073,14 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
4986 | mmap_event->event_id.tid = perf_event_tid(event, current); | 5073 | mmap_event->event_id.tid = perf_event_tid(event, current); |
4987 | 5074 | ||
4988 | perf_output_put(&handle, mmap_event->event_id); | 5075 | perf_output_put(&handle, mmap_event->event_id); |
5076 | |||
5077 | if (event->attr.mmap2) { | ||
5078 | perf_output_put(&handle, mmap_event->maj); | ||
5079 | perf_output_put(&handle, mmap_event->min); | ||
5080 | perf_output_put(&handle, mmap_event->ino); | ||
5081 | perf_output_put(&handle, mmap_event->ino_generation); | ||
5082 | } | ||
5083 | |||
4989 | __output_copy(&handle, mmap_event->file_name, | 5084 | __output_copy(&handle, mmap_event->file_name, |
4990 | mmap_event->file_size); | 5085 | mmap_event->file_size); |
4991 | 5086 | ||
@@ -4996,21 +5091,12 @@ out: | |||
4996 | mmap_event->event_id.header.size = size; | 5091 | mmap_event->event_id.header.size = size; |
4997 | } | 5092 | } |
4998 | 5093 | ||
4999 | static int perf_event_mmap_match(struct perf_event *event, | ||
5000 | void *data) | ||
5001 | { | ||
5002 | struct perf_mmap_event *mmap_event = data; | ||
5003 | struct vm_area_struct *vma = mmap_event->vma; | ||
5004 | int executable = vma->vm_flags & VM_EXEC; | ||
5005 | |||
5006 | return (!executable && event->attr.mmap_data) || | ||
5007 | (executable && event->attr.mmap); | ||
5008 | } | ||
5009 | |||
5010 | static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | 5094 | static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) |
5011 | { | 5095 | { |
5012 | struct vm_area_struct *vma = mmap_event->vma; | 5096 | struct vm_area_struct *vma = mmap_event->vma; |
5013 | struct file *file = vma->vm_file; | 5097 | struct file *file = vma->vm_file; |
5098 | int maj = 0, min = 0; | ||
5099 | u64 ino = 0, gen = 0; | ||
5014 | unsigned int size; | 5100 | unsigned int size; |
5015 | char tmp[16]; | 5101 | char tmp[16]; |
5016 | char *buf = NULL; | 5102 | char *buf = NULL; |
@@ -5019,6 +5105,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
5019 | memset(tmp, 0, sizeof(tmp)); | 5105 | memset(tmp, 0, sizeof(tmp)); |
5020 | 5106 | ||
5021 | if (file) { | 5107 | if (file) { |
5108 | struct inode *inode; | ||
5109 | dev_t dev; | ||
5022 | /* | 5110 | /* |
5023 | * d_path works from the end of the rb backwards, so we | 5111 | * d_path works from the end of the rb backwards, so we |
5024 | * need to add enough zero bytes after the string to handle | 5112 | * need to add enough zero bytes after the string to handle |
@@ -5034,6 +5122,13 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
5034 | name = strncpy(tmp, "//toolong", sizeof(tmp)); | 5122 | name = strncpy(tmp, "//toolong", sizeof(tmp)); |
5035 | goto got_name; | 5123 | goto got_name; |
5036 | } | 5124 | } |
5125 | inode = file_inode(vma->vm_file); | ||
5126 | dev = inode->i_sb->s_dev; | ||
5127 | ino = inode->i_ino; | ||
5128 | gen = inode->i_generation; | ||
5129 | maj = MAJOR(dev); | ||
5130 | min = MINOR(dev); | ||
5131 | |||
5037 | } else { | 5132 | } else { |
5038 | if (arch_vma_name(mmap_event->vma)) { | 5133 | if (arch_vma_name(mmap_event->vma)) { |
5039 | name = strncpy(tmp, arch_vma_name(mmap_event->vma), | 5134 | name = strncpy(tmp, arch_vma_name(mmap_event->vma), |
@@ -5064,14 +5159,17 @@ got_name: | |||
5064 | 5159 | ||
5065 | mmap_event->file_name = name; | 5160 | mmap_event->file_name = name; |
5066 | mmap_event->file_size = size; | 5161 | mmap_event->file_size = size; |
5162 | mmap_event->maj = maj; | ||
5163 | mmap_event->min = min; | ||
5164 | mmap_event->ino = ino; | ||
5165 | mmap_event->ino_generation = gen; | ||
5067 | 5166 | ||
5068 | if (!(vma->vm_flags & VM_EXEC)) | 5167 | if (!(vma->vm_flags & VM_EXEC)) |
5069 | mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; | 5168 | mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; |
5070 | 5169 | ||
5071 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; | 5170 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; |
5072 | 5171 | ||
5073 | perf_event_aux(perf_event_mmap_match, | 5172 | perf_event_aux(perf_event_mmap_output, |
5074 | perf_event_mmap_output, | ||
5075 | mmap_event, | 5173 | mmap_event, |
5076 | NULL); | 5174 | NULL); |
5077 | 5175 | ||
@@ -5101,6 +5199,10 @@ void perf_event_mmap(struct vm_area_struct *vma) | |||
5101 | .len = vma->vm_end - vma->vm_start, | 5199 | .len = vma->vm_end - vma->vm_start, |
5102 | .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, | 5200 | .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, |
5103 | }, | 5201 | }, |
5202 | /* .maj (attr_mmap2 only) */ | ||
5203 | /* .min (attr_mmap2 only) */ | ||
5204 | /* .ino (attr_mmap2 only) */ | ||
5205 | /* .ino_generation (attr_mmap2 only) */ | ||
5104 | }; | 5206 | }; |
5105 | 5207 | ||
5106 | perf_event_mmap_event(&mmap_event); | 5208 | perf_event_mmap_event(&mmap_event); |
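Pieced together from the perf_output_put() calls above, the PERF_RECORD_MMAP2 body a consumer would see looks roughly like the hypothetical struct below (sample_id trailer omitted). Note that later in this diff perf_copy_attr() still rejects attr.mmap2, so the record is wired up here but not yet exposed:

#include <stdint.h>

/* consumer-side view of PERF_RECORD_MMAP2 as emitted by the hunks above */
struct mmap2_record_body {
        uint32_t pid, tid;
        uint64_t addr, len, pgoff;
        uint32_t maj, min;              /* device of the backing file */
        uint64_t ino, ino_generation;   /* inode identity of the mapping */
        char     filename[];            /* NUL-padded to a u64 boundary */
};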
@@ -5178,6 +5280,7 @@ static int __perf_event_overflow(struct perf_event *event, | |||
5178 | __this_cpu_inc(perf_throttled_count); | 5280 | __this_cpu_inc(perf_throttled_count); |
5179 | hwc->interrupts = MAX_INTERRUPTS; | 5281 | hwc->interrupts = MAX_INTERRUPTS; |
5180 | perf_log_throttle(event, 0); | 5282 | perf_log_throttle(event, 0); |
5283 | tick_nohz_full_kick(); | ||
5181 | ret = 1; | 5284 | ret = 1; |
5182 | } | 5285 | } |
5183 | } | 5286 | } |
@@ -6443,6 +6546,44 @@ unlock: | |||
6443 | return pmu; | 6546 | return pmu; |
6444 | } | 6547 | } |
6445 | 6548 | ||
6549 | static void account_event_cpu(struct perf_event *event, int cpu) | ||
6550 | { | ||
6551 | if (event->parent) | ||
6552 | return; | ||
6553 | |||
6554 | if (has_branch_stack(event)) { | ||
6555 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
6556 | atomic_inc(&per_cpu(perf_branch_stack_events, cpu)); | ||
6557 | } | ||
6558 | if (is_cgroup_event(event)) | ||
6559 | atomic_inc(&per_cpu(perf_cgroup_events, cpu)); | ||
6560 | } | ||
6561 | |||
6562 | static void account_event(struct perf_event *event) | ||
6563 | { | ||
6564 | if (event->parent) | ||
6565 | return; | ||
6566 | |||
6567 | if (event->attach_state & PERF_ATTACH_TASK) | ||
6568 | static_key_slow_inc(&perf_sched_events.key); | ||
6569 | if (event->attr.mmap || event->attr.mmap_data) | ||
6570 | atomic_inc(&nr_mmap_events); | ||
6571 | if (event->attr.comm) | ||
6572 | atomic_inc(&nr_comm_events); | ||
6573 | if (event->attr.task) | ||
6574 | atomic_inc(&nr_task_events); | ||
6575 | if (event->attr.freq) { | ||
6576 | if (atomic_inc_return(&nr_freq_events) == 1) | ||
6577 | tick_nohz_full_kick_all(); | ||
6578 | } | ||
6579 | if (has_branch_stack(event)) | ||
6580 | static_key_slow_inc(&perf_sched_events.key); | ||
6581 | if (is_cgroup_event(event)) | ||
6582 | static_key_slow_inc(&perf_sched_events.key); | ||
6583 | |||
6584 | account_event_cpu(event, event->cpu); | ||
6585 | } | ||
6586 | |||
6446 | /* | 6587 | /* |
6447 | * Allocate and initialize a event structure | 6588 | * Allocate and initialize a event structure |
6448 | */ | 6589 | */ |
@@ -6457,7 +6598,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
6457 | struct pmu *pmu; | 6598 | struct pmu *pmu; |
6458 | struct perf_event *event; | 6599 | struct perf_event *event; |
6459 | struct hw_perf_event *hwc; | 6600 | struct hw_perf_event *hwc; |
6460 | long err; | 6601 | long err = -EINVAL; |
6461 | 6602 | ||
6462 | if ((unsigned)cpu >= nr_cpu_ids) { | 6603 | if ((unsigned)cpu >= nr_cpu_ids) { |
6463 | if (!task || cpu != -1) | 6604 | if (!task || cpu != -1) |
@@ -6540,49 +6681,35 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
6540 | * we currently do not support PERF_FORMAT_GROUP on inherited events | 6681 | * we currently do not support PERF_FORMAT_GROUP on inherited events |
6541 | */ | 6682 | */ |
6542 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) | 6683 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) |
6543 | goto done; | 6684 | goto err_ns; |
6544 | 6685 | ||
6545 | pmu = perf_init_event(event); | 6686 | pmu = perf_init_event(event); |
6546 | |||
6547 | done: | ||
6548 | err = 0; | ||
6549 | if (!pmu) | 6687 | if (!pmu) |
6550 | err = -EINVAL; | 6688 | goto err_ns; |
6551 | else if (IS_ERR(pmu)) | 6689 | else if (IS_ERR(pmu)) { |
6552 | err = PTR_ERR(pmu); | 6690 | err = PTR_ERR(pmu); |
6553 | 6691 | goto err_ns; | |
6554 | if (err) { | ||
6555 | if (event->ns) | ||
6556 | put_pid_ns(event->ns); | ||
6557 | kfree(event); | ||
6558 | return ERR_PTR(err); | ||
6559 | } | 6692 | } |
6560 | 6693 | ||
6561 | if (!event->parent) { | 6694 | if (!event->parent) { |
6562 | if (event->attach_state & PERF_ATTACH_TASK) | ||
6563 | static_key_slow_inc(&perf_sched_events.key); | ||
6564 | if (event->attr.mmap || event->attr.mmap_data) | ||
6565 | atomic_inc(&nr_mmap_events); | ||
6566 | if (event->attr.comm) | ||
6567 | atomic_inc(&nr_comm_events); | ||
6568 | if (event->attr.task) | ||
6569 | atomic_inc(&nr_task_events); | ||
6570 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { | 6695 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { |
6571 | err = get_callchain_buffers(); | 6696 | err = get_callchain_buffers(); |
6572 | if (err) { | 6697 | if (err) |
6573 | free_event(event); | 6698 | goto err_pmu; |
6574 | return ERR_PTR(err); | ||
6575 | } | ||
6576 | } | ||
6577 | if (has_branch_stack(event)) { | ||
6578 | static_key_slow_inc(&perf_sched_events.key); | ||
6579 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
6580 | atomic_inc(&per_cpu(perf_branch_stack_events, | ||
6581 | event->cpu)); | ||
6582 | } | 6699 | } |
6583 | } | 6700 | } |
6584 | 6701 | ||
6585 | return event; | 6702 | return event; |
6703 | |||
6704 | err_pmu: | ||
6705 | if (event->destroy) | ||
6706 | event->destroy(event); | ||
6707 | err_ns: | ||
6708 | if (event->ns) | ||
6709 | put_pid_ns(event->ns); | ||
6710 | kfree(event); | ||
6711 | |||
6712 | return ERR_PTR(err); | ||
6586 | } | 6713 | } |
6587 | 6714 | ||
6588 | static int perf_copy_attr(struct perf_event_attr __user *uattr, | 6715 | static int perf_copy_attr(struct perf_event_attr __user *uattr, |
@@ -6640,6 +6767,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
6640 | if (ret) | 6767 | if (ret) |
6641 | return -EFAULT; | 6768 | return -EFAULT; |
6642 | 6769 | ||
6770 | /* disabled for now */ | ||
6771 | if (attr->mmap2) | ||
6772 | return -EINVAL; | ||
6773 | |||
6643 | if (attr->__reserved_1) | 6774 | if (attr->__reserved_1) |
6644 | return -EINVAL; | 6775 | return -EINVAL; |
6645 | 6776 | ||
@@ -6864,17 +6995,14 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6864 | 6995 | ||
6865 | if (flags & PERF_FLAG_PID_CGROUP) { | 6996 | if (flags & PERF_FLAG_PID_CGROUP) { |
6866 | err = perf_cgroup_connect(pid, event, &attr, group_leader); | 6997 | err = perf_cgroup_connect(pid, event, &attr, group_leader); |
6867 | if (err) | 6998 | if (err) { |
6868 | goto err_alloc; | 6999 | __free_event(event); |
6869 | /* | 7000 | goto err_task; |
6870 | * one more event: | 7001 | } |
6871 | * - that has cgroup constraint on event->cpu | ||
6872 | * - that may need work on context switch | ||
6873 | */ | ||
6874 | atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); | ||
6875 | static_key_slow_inc(&perf_sched_events.key); | ||
6876 | } | 7002 | } |
6877 | 7003 | ||
7004 | account_event(event); | ||
7005 | |||
6878 | /* | 7006 | /* |
6879 | * Special case software events and allow them to be part of | 7007 | * Special case software events and allow them to be part of |
6880 | * any hardware group. | 7008 | * any hardware group. |
@@ -7070,6 +7198,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
7070 | goto err; | 7198 | goto err; |
7071 | } | 7199 | } |
7072 | 7200 | ||
7201 | account_event(event); | ||
7202 | |||
7073 | ctx = find_get_context(event->pmu, task, cpu); | 7203 | ctx = find_get_context(event->pmu, task, cpu); |
7074 | if (IS_ERR(ctx)) { | 7204 | if (IS_ERR(ctx)) { |
7075 | err = PTR_ERR(ctx); | 7205 | err = PTR_ERR(ctx); |
@@ -7106,18 +7236,20 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) | |||
7106 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, | 7236 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, |
7107 | event_entry) { | 7237 | event_entry) { |
7108 | perf_remove_from_context(event); | 7238 | perf_remove_from_context(event); |
7239 | unaccount_event_cpu(event, src_cpu); | ||
7109 | put_ctx(src_ctx); | 7240 | put_ctx(src_ctx); |
7110 | list_add(&event->event_entry, &events); | 7241 | list_add(&event->migrate_entry, &events); |
7111 | } | 7242 | } |
7112 | mutex_unlock(&src_ctx->mutex); | 7243 | mutex_unlock(&src_ctx->mutex); |
7113 | 7244 | ||
7114 | synchronize_rcu(); | 7245 | synchronize_rcu(); |
7115 | 7246 | ||
7116 | mutex_lock(&dst_ctx->mutex); | 7247 | mutex_lock(&dst_ctx->mutex); |
7117 | list_for_each_entry_safe(event, tmp, &events, event_entry) { | 7248 | list_for_each_entry_safe(event, tmp, &events, migrate_entry) { |
7118 | list_del(&event->event_entry); | 7249 | list_del(&event->migrate_entry); |
7119 | if (event->state >= PERF_EVENT_STATE_OFF) | 7250 | if (event->state >= PERF_EVENT_STATE_OFF) |
7120 | event->state = PERF_EVENT_STATE_INACTIVE; | 7251 | event->state = PERF_EVENT_STATE_INACTIVE; |
7252 | account_event_cpu(event, dst_cpu); | ||
7121 | perf_install_in_context(dst_ctx, event, dst_cpu); | 7253 | perf_install_in_context(dst_ctx, event, dst_cpu); |
7122 | get_ctx(dst_ctx); | 7254 | get_ctx(dst_ctx); |
7123 | } | 7255 | } |
@@ -7798,7 +7930,8 @@ unlock: | |||
7798 | device_initcall(perf_event_sysfs_init); | 7930 | device_initcall(perf_event_sysfs_init); |
7799 | 7931 | ||
7800 | #ifdef CONFIG_CGROUP_PERF | 7932 | #ifdef CONFIG_CGROUP_PERF |
7801 | static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) | 7933 | static struct cgroup_subsys_state * |
7934 | perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | ||
7802 | { | 7935 | { |
7803 | struct perf_cgroup *jc; | 7936 | struct perf_cgroup *jc; |
7804 | 7937 | ||
@@ -7815,11 +7948,10 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) | |||
7815 | return &jc->css; | 7948 | return &jc->css; |
7816 | } | 7949 | } |
7817 | 7950 | ||
7818 | static void perf_cgroup_css_free(struct cgroup *cont) | 7951 | static void perf_cgroup_css_free(struct cgroup_subsys_state *css) |
7819 | { | 7952 | { |
7820 | struct perf_cgroup *jc; | 7953 | struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css); |
7821 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), | 7954 | |
7822 | struct perf_cgroup, css); | ||
7823 | free_percpu(jc->info); | 7955 | free_percpu(jc->info); |
7824 | kfree(jc); | 7956 | kfree(jc); |
7825 | } | 7957 | } |
@@ -7831,15 +7963,17 @@ static int __perf_cgroup_move(void *info) | |||
7831 | return 0; | 7963 | return 0; |
7832 | } | 7964 | } |
7833 | 7965 | ||
7834 | static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 7966 | static void perf_cgroup_attach(struct cgroup_subsys_state *css, |
7967 | struct cgroup_taskset *tset) | ||
7835 | { | 7968 | { |
7836 | struct task_struct *task; | 7969 | struct task_struct *task; |
7837 | 7970 | ||
7838 | cgroup_taskset_for_each(task, cgrp, tset) | 7971 | cgroup_taskset_for_each(task, css, tset) |
7839 | task_function_call(task, __perf_cgroup_move, task); | 7972 | task_function_call(task, __perf_cgroup_move, task); |
7840 | } | 7973 | } |
7841 | 7974 | ||
7842 | static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, | 7975 | static void perf_cgroup_exit(struct cgroup_subsys_state *css, |
7976 | struct cgroup_subsys_state *old_css, | ||
7843 | struct task_struct *task) | 7977 | struct task_struct *task) |
7844 | { | 7978 | { |
7845 | /* | 7979 | /* |
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index cd55144270b5..9c2ddfbf4525 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
@@ -87,10 +87,31 @@ again: | |||
87 | goto out; | 87 | goto out; |
88 | 88 | ||
89 | /* | 89 | /* |
90 | * Publish the known good head. Rely on the full barrier implied | 90 | * Since the mmap() consumer (userspace) can run on a different CPU: |
91 | * by atomic_dec_and_test() order the rb->head read and this | 91 | * |
92 | * write. | 92 | * kernel user |
93 | * | ||
94 | * READ ->data_tail READ ->data_head | ||
95 | * smp_mb() (A) smp_rmb() (C) | ||
96 | * WRITE $data READ $data | ||
97 | * smp_wmb() (B) smp_mb() (D) | ||
98 | * STORE ->data_head WRITE ->data_tail | ||
99 | * | ||
100 | * Where A pairs with D, and B pairs with C. | ||
101 | * | ||
102 | * I don't think A needs to be a full barrier because we won't in fact | ||
103 | * write data until we see the store from userspace. So we simply don't | ||
104 | * issue the data WRITE until we observe it. Be conservative for now. | ||
105 | * | ||
106 | * OTOH, D needs to be a full barrier since it separates the data READ | ||
107 | * from the tail WRITE. | ||
108 | * | ||
109 | * For B a WMB is sufficient since it separates two WRITEs, and for C | ||
110 | * an RMB is sufficient since it separates two READs. | ||
111 | * | ||
112 | * See perf_output_begin(). | ||
93 | */ | 113 | */ |
114 | smp_wmb(); | ||
94 | rb->user_page->data_head = head; | 115 | rb->user_page->data_head = head; |
95 | 116 | ||
96 | /* | 117 | /* |
@@ -154,9 +175,11 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
154 | * Userspace could choose to issue a mb() before updating the | 175 | * Userspace could choose to issue a mb() before updating the |
155 | * tail pointer. So that all reads will be completed before the | 176 | * tail pointer. So that all reads will be completed before the |
156 | * write is issued. | 177 | * write is issued. |
178 | * | ||
179 | * See perf_output_put_handle(). | ||
157 | */ | 180 | */ |
158 | tail = ACCESS_ONCE(rb->user_page->data_tail); | 181 | tail = ACCESS_ONCE(rb->user_page->data_tail); |
159 | smp_rmb(); | 182 | smp_mb(); |
160 | offset = head = local_read(&rb->head); | 183 | offset = head = local_read(&rb->head); |
161 | head += size; | 184 | head += size; |
162 | if (unlikely(!perf_output_space(rb, tail, offset, head))) | 185 | if (unlikely(!perf_output_space(rb, tail, offset, head))) |
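For reference, the userspace consumer side of the ordering documented in the new comment might look roughly like the sketch below (not part of the patch; it assumes a record never wraps past the end of the mmap'ed data area, and the record handling is purely illustrative):

#include <linux/perf_event.h>   /* struct perf_event_mmap_page, perf_event_header */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define consumer_rmb() __atomic_thread_fence(__ATOMIC_ACQUIRE)  /* (C), pairs with B */
#define consumer_mb()  __atomic_thread_fence(__ATOMIC_SEQ_CST)  /* (D), pairs with A */

static void drain_ring(struct perf_event_mmap_page *pg,
                       unsigned char *data, uint64_t mask)
{
        uint64_t head = pg->data_head;          /* READ ->data_head */
        consumer_rmb();                         /* (C): head read before any data reads */

        uint64_t tail = pg->data_tail;
        while (tail < head) {
                struct perf_event_header hdr;

                memcpy(&hdr, &data[tail & mask], sizeof(hdr));  /* READ $data */
                printf("record type %u, %u bytes\n", hdr.type, hdr.size);
                tail += hdr.size;
        }

        consumer_mb();                          /* (D): data reads complete before... */
        pg->data_tail = tail;                   /* ...WRITE ->data_tail frees the space */
}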
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f3569747d629..ad8e1bdca70e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -1682,12 +1682,10 @@ static bool handle_trampoline(struct pt_regs *regs) | |||
1682 | tmp = ri; | 1682 | tmp = ri; |
1683 | ri = ri->next; | 1683 | ri = ri->next; |
1684 | kfree(tmp); | 1684 | kfree(tmp); |
1685 | utask->depth--; | ||
1685 | 1686 | ||
1686 | if (!chained) | 1687 | if (!chained) |
1687 | break; | 1688 | break; |
1688 | |||
1689 | utask->depth--; | ||
1690 | |||
1691 | BUG_ON(!ri); | 1689 | BUG_ON(!ri); |
1692 | } | 1690 | } |
1693 | 1691 | ||
diff --git a/kernel/extable.c b/kernel/extable.c index 67460b93b1a1..832cb28105bb 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
@@ -41,7 +41,7 @@ u32 __initdata main_extable_sort_needed = 1; | |||
41 | /* Sort the kernel's built-in exception table */ | 41 | /* Sort the kernel's built-in exception table */ |
42 | void __init sort_main_extable(void) | 42 | void __init sort_main_extable(void) |
43 | { | 43 | { |
44 | if (main_extable_sort_needed) { | 44 | if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) { |
45 | pr_notice("Sorting __ex_table...\n"); | 45 | pr_notice("Sorting __ex_table...\n"); |
46 | sort_extable(__start___ex_table, __stop___ex_table); | 46 | sort_extable(__start___ex_table, __stop___ex_table); |
47 | } | 47 | } |
diff --git a/kernel/fork.c b/kernel/fork.c index bf46287c91a4..086fe73ad6bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -351,7 +351,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
351 | struct rb_node **rb_link, *rb_parent; | 351 | struct rb_node **rb_link, *rb_parent; |
352 | int retval; | 352 | int retval; |
353 | unsigned long charge; | 353 | unsigned long charge; |
354 | struct mempolicy *pol; | ||
355 | 354 | ||
356 | uprobe_start_dup_mmap(); | 355 | uprobe_start_dup_mmap(); |
357 | down_write(&oldmm->mmap_sem); | 356 | down_write(&oldmm->mmap_sem); |
@@ -400,11 +399,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
400 | goto fail_nomem; | 399 | goto fail_nomem; |
401 | *tmp = *mpnt; | 400 | *tmp = *mpnt; |
402 | INIT_LIST_HEAD(&tmp->anon_vma_chain); | 401 | INIT_LIST_HEAD(&tmp->anon_vma_chain); |
403 | pol = mpol_dup(vma_policy(mpnt)); | 402 | retval = vma_dup_policy(mpnt, tmp); |
404 | retval = PTR_ERR(pol); | 403 | if (retval) |
405 | if (IS_ERR(pol)) | ||
406 | goto fail_nomem_policy; | 404 | goto fail_nomem_policy; |
407 | vma_set_policy(tmp, pol); | ||
408 | tmp->vm_mm = mm; | 405 | tmp->vm_mm = mm; |
409 | if (anon_vma_fork(tmp, mpnt)) | 406 | if (anon_vma_fork(tmp, mpnt)) |
410 | goto fail_nomem_anon_vma_fork; | 407 | goto fail_nomem_anon_vma_fork; |
@@ -472,7 +469,7 @@ out: | |||
472 | uprobe_end_dup_mmap(); | 469 | uprobe_end_dup_mmap(); |
473 | return retval; | 470 | return retval; |
474 | fail_nomem_anon_vma_fork: | 471 | fail_nomem_anon_vma_fork: |
475 | mpol_put(pol); | 472 | mpol_put(vma_policy(tmp)); |
476 | fail_nomem_policy: | 473 | fail_nomem_policy: |
477 | kmem_cache_free(vm_area_cachep, tmp); | 474 | kmem_cache_free(vm_area_cachep, tmp); |
478 | fail_nomem: | 475 | fail_nomem: |
@@ -522,7 +519,7 @@ static void mm_init_aio(struct mm_struct *mm) | |||
522 | { | 519 | { |
523 | #ifdef CONFIG_AIO | 520 | #ifdef CONFIG_AIO |
524 | spin_lock_init(&mm->ioctx_lock); | 521 | spin_lock_init(&mm->ioctx_lock); |
525 | INIT_HLIST_HEAD(&mm->ioctx_list); | 522 | mm->ioctx_table = NULL; |
526 | #endif | 523 | #endif |
527 | } | 524 | } |
528 | 525 | ||
@@ -1173,13 +1170,16 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1173 | return ERR_PTR(-EINVAL); | 1170 | return ERR_PTR(-EINVAL); |
1174 | 1171 | ||
1175 | /* | 1172 | /* |
1176 | * If the new process will be in a different pid namespace | 1173 | * If the new process will be in a different pid or user namespace |
1177 | * don't allow the creation of threads. | 1174 | * do not allow it to share a thread group or signal handlers or |
1175 | * parent with the forking task. | ||
1178 | */ | 1176 | */ |
1179 | if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && | 1177 | if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) { |
1180 | (task_active_pid_ns(current) != | 1178 | if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || |
1181 | current->nsproxy->pid_ns_for_children)) | 1179 | (task_active_pid_ns(current) != |
1182 | return ERR_PTR(-EINVAL); | 1180 | current->nsproxy->pid_ns_for_children)) |
1181 | return ERR_PTR(-EINVAL); | ||
1182 | } | ||
1183 | 1183 | ||
1184 | retval = security_task_create(clone_flags); | 1184 | retval = security_task_create(clone_flags); |
1185 | if (retval) | 1185 | if (retval) |
@@ -1576,15 +1576,6 @@ long do_fork(unsigned long clone_flags, | |||
1576 | long nr; | 1576 | long nr; |
1577 | 1577 | ||
1578 | /* | 1578 | /* |
1579 | * Do some preliminary argument and permissions checking before we | ||
1580 | * actually start allocating stuff | ||
1581 | */ | ||
1582 | if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { | ||
1583 | if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) | ||
1584 | return -EINVAL; | ||
1585 | } | ||
1586 | |||
1587 | /* | ||
1588 | * Determine whether and which event to report to ptracer. When | 1579 | * Determine whether and which event to report to ptracer. When |
1589 | * called from kernel_thread or CLONE_UNTRACED is explicitly | 1580 | * called from kernel_thread or CLONE_UNTRACED is explicitly |
1590 | * requested, no event is reported; otherwise, report if the event | 1581 | * requested, no event is reported; otherwise, report if the event |
@@ -1825,11 +1816,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
1825 | if (unshare_flags & CLONE_NEWUSER) | 1816 | if (unshare_flags & CLONE_NEWUSER) |
1826 | unshare_flags |= CLONE_THREAD | CLONE_FS; | 1817 | unshare_flags |= CLONE_THREAD | CLONE_FS; |
1827 | /* | 1818 | /* |
1828 | * If unsharing a pid namespace must also unshare the thread. | ||
1829 | */ | ||
1830 | if (unshare_flags & CLONE_NEWPID) | ||
1831 | unshare_flags |= CLONE_THREAD; | ||
1832 | /* | ||
1833 | * If unsharing a thread from a thread group, must also unshare vm. | 1819 | * If unsharing a thread from a thread group, must also unshare vm. |
1834 | */ | 1820 | */ |
1835 | if (unshare_flags & CLONE_THREAD) | 1821 | if (unshare_flags & CLONE_THREAD) |
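Taken together, the fork.c hunks move the namespace sanity check from do_fork() into copy_process() and extend it: a new user or pid namespace may no longer be combined with CLONE_SIGHAND or CLONE_PARENT (CLONE_THREAD implies CLONE_SIGHAND, so the old rule remains covered). A userspace sketch of a now-rejected combination (hypothetical example, not from the patch):

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int child_fn(void *arg) { (void)arg; return 0; }

int main(void)
{
        const size_t stack_sz = 64 * 1024;
        char *stack = malloc(stack_sz);
        int flags = CLONE_NEWUSER | CLONE_SIGHAND | CLONE_VM | SIGCHLD;

        /* sharing signal handlers across a user-namespace boundary is refused;
         * stack + stack_sz because the stack grows down on common arches */
        if (clone(child_fn, stack + stack_sz, flags, NULL) == -1 && errno == EINVAL)
                puts("rejected: CLONE_NEWUSER with CLONE_SIGHAND");

        free(stack);
        return 0;
}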
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 9bd0934f6c33..7a7d2ee96d42 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c | |||
@@ -74,7 +74,7 @@ static int __init gcov_persist_setup(char *str) | |||
74 | { | 74 | { |
75 | unsigned long val; | 75 | unsigned long val; |
76 | 76 | ||
77 | if (strict_strtoul(str, 0, &val)) { | 77 | if (kstrtoul(str, 0, &val)) { |
78 | pr_warning("invalid gcov_persist parameter '%s'\n", str); | 78 | pr_warning("invalid gcov_persist parameter '%s'\n", str); |
79 | return 0; | 79 | return 0; |
80 | } | 80 | } |
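kstrtoul() is the maintained replacement for the deprecated strict_strtoul() and keeps the same 0-or-negative-errno contract, so only the function name changes here. A minimal sketch of the calling convention (illustrative only):

#include <linux/kernel.h>       /* kstrtoul() */

static int parse_threshold(const char *str, unsigned long *out)
{
        /* base 0 accepts decimal, 0x-prefixed hex and 0-prefixed octal */
        return kstrtoul(str, 0, out);   /* 0 on success, -EINVAL/-ERANGE on error */
}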
diff --git a/kernel/groups.c b/kernel/groups.c index 6b2588dd04ff..90cf1c38c8ea 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) | |||
233 | struct group_info *group_info; | 233 | struct group_info *group_info; |
234 | int retval; | 234 | int retval; |
235 | 235 | ||
236 | if (!nsown_capable(CAP_SETGID)) | 236 | if (!ns_capable(current_user_ns(), CAP_SETGID)) |
237 | return -EPERM; | 237 | return -EPERM; |
238 | if ((unsigned)gidsetsize > NGROUPS_MAX) | 238 | if ((unsigned)gidsetsize > NGROUPS_MAX) |
239 | return -EINVAL; | 239 | return -EINVAL; |
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 6df614912b9d..3e97fb126e6b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/lockdep.h> | 15 | #include <linux/lockdep.h> |
16 | #include <linux/export.h> | 16 | #include <linux/export.h> |
17 | #include <linux/sysctl.h> | 17 | #include <linux/sysctl.h> |
18 | #include <linux/utsname.h> | ||
18 | 19 | ||
19 | /* | 20 | /* |
20 | * The number of tasks checked: | 21 | * The number of tasks checked: |
@@ -99,10 +100,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
99 | * Ok, the task did not get scheduled for more than 2 minutes, | 100 | * Ok, the task did not get scheduled for more than 2 minutes, |
100 | * complain: | 101 | * complain: |
101 | */ | 102 | */ |
102 | printk(KERN_ERR "INFO: task %s:%d blocked for more than " | 103 | pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", |
103 | "%ld seconds.\n", t->comm, t->pid, timeout); | 104 | t->comm, t->pid, timeout); |
104 | printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | 105 | pr_err(" %s %s %.*s\n", |
105 | " disables this message.\n"); | 106 | print_tainted(), init_utsname()->release, |
107 | (int)strcspn(init_utsname()->version, " "), | ||
108 | init_utsname()->version); | ||
109 | pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | ||
110 | " disables this message.\n"); | ||
106 | sched_show_task(t); | 111 | sched_show_task(t); |
107 | debug_show_held_locks(t); | 112 | debug_show_held_locks(t); |
108 | 113 | ||
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index d1a758bc972a..4a1fef09f658 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
@@ -1,15 +1,4 @@ | |||
1 | # Select this to activate the generic irq options below | ||
2 | config HAVE_GENERIC_HARDIRQS | ||
3 | bool | ||
4 | |||
5 | if HAVE_GENERIC_HARDIRQS | ||
6 | menu "IRQ subsystem" | 1 | menu "IRQ subsystem" |
7 | # | ||
8 | # Interrupt subsystem related configuration options | ||
9 | # | ||
10 | config GENERIC_HARDIRQS | ||
11 | def_bool y | ||
12 | |||
13 | # Options selectable by the architecture code | 2 | # Options selectable by the architecture code |
14 | 3 | ||
15 | # Make sparse irq Kconfig switch below available | 4 | # Make sparse irq Kconfig switch below available |
@@ -84,4 +73,3 @@ config SPARSE_IRQ | |||
84 | If you don't know what to do here, say N. | 73 | If you don't know what to do here, say N. |
85 | 74 | ||
86 | endmenu | 75 | endmenu |
87 | endif | ||
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 60f48fa0fd0d..297a9247a3b3 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/sort.h> | 13 | #include <linux/sort.h> |
14 | #include <linux/err.h> | 14 | #include <linux/err.h> |
15 | #include <linux/static_key.h> | 15 | #include <linux/static_key.h> |
16 | #include <linux/jump_label_ratelimit.h> | ||
16 | 17 | ||
17 | #ifdef HAVE_JUMP_LABEL | 18 | #ifdef HAVE_JUMP_LABEL |
18 | 19 | ||
diff --git a/kernel/kexec.c b/kernel/kexec.c index 59f7b55ba745..2a74f307c5ec 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -1474,11 +1474,8 @@ static int __init __parse_crashkernel(char *cmdline, | |||
1474 | if (first_colon && (!first_space || first_colon < first_space)) | 1474 | if (first_colon && (!first_space || first_colon < first_space)) |
1475 | return parse_crashkernel_mem(ck_cmdline, system_ram, | 1475 | return parse_crashkernel_mem(ck_cmdline, system_ram, |
1476 | crash_size, crash_base); | 1476 | crash_size, crash_base); |
1477 | else | ||
1478 | return parse_crashkernel_simple(ck_cmdline, crash_size, | ||
1479 | crash_base); | ||
1480 | 1477 | ||
1481 | return 0; | 1478 | return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); |
1482 | } | 1479 | } |
1483 | 1480 | ||
1484 | /* | 1481 | /* |
diff --git a/kernel/kmod.c b/kernel/kmod.c index fb326365b694..b086006c59e7 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -571,6 +571,10 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) | |||
571 | DECLARE_COMPLETION_ONSTACK(done); | 571 | DECLARE_COMPLETION_ONSTACK(done); |
572 | int retval = 0; | 572 | int retval = 0; |
573 | 573 | ||
574 | if (!sub_info->path) { | ||
575 | call_usermodehelper_freeinfo(sub_info); | ||
576 | return -EINVAL; | ||
577 | } | ||
574 | helper_lock(); | 578 | helper_lock(); |
575 | if (!khelper_wq || usermodehelper_disabled) { | 579 | if (!khelper_wq || usermodehelper_disabled) { |
576 | retval = -EBUSY; | 580 | retval = -EBUSY; |
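For context, callers typically build the request with call_usermodehelper_setup() and hand it to call_usermodehelper_exec(); with the added check, a NULL path now fails fast with -EINVAL (the info is freed for the caller) instead of being queued to khelper. A hedged kernel-side sketch (hypothetical helper, not from the patch):

#include <linux/kmod.h>
#include <linux/errno.h>
#include <linux/gfp.h>

static int run_helper(char *path, char **argv, char **envp)
{
        struct subprocess_info *info;

        info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL,
                                         NULL, NULL, NULL);
        if (!info)
                return -ENOMEM;

        /* returns -EINVAL straight away if path was NULL */
        return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}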
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6e33498d665c..a0d367a49122 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -112,6 +112,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { | |||
112 | struct kprobe_insn_page { | 112 | struct kprobe_insn_page { |
113 | struct list_head list; | 113 | struct list_head list; |
114 | kprobe_opcode_t *insns; /* Page of instruction slots */ | 114 | kprobe_opcode_t *insns; /* Page of instruction slots */ |
115 | struct kprobe_insn_cache *cache; | ||
115 | int nused; | 116 | int nused; |
116 | int ngarbage; | 117 | int ngarbage; |
117 | char slot_used[]; | 118 | char slot_used[]; |
@@ -121,12 +122,6 @@ struct kprobe_insn_page { | |||
121 | (offsetof(struct kprobe_insn_page, slot_used) + \ | 122 | (offsetof(struct kprobe_insn_page, slot_used) + \ |
122 | (sizeof(char) * (slots))) | 123 | (sizeof(char) * (slots))) |
123 | 124 | ||
124 | struct kprobe_insn_cache { | ||
125 | struct list_head pages; /* list of kprobe_insn_page */ | ||
126 | size_t insn_size; /* size of instruction slot */ | ||
127 | int nr_garbage; | ||
128 | }; | ||
129 | |||
130 | static int slots_per_page(struct kprobe_insn_cache *c) | 125 | static int slots_per_page(struct kprobe_insn_cache *c) |
131 | { | 126 | { |
132 | return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); | 127 | return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); |
@@ -138,8 +133,20 @@ enum kprobe_slot_state { | |||
138 | SLOT_USED = 2, | 133 | SLOT_USED = 2, |
139 | }; | 134 | }; |
140 | 135 | ||
141 | static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ | 136 | static void *alloc_insn_page(void) |
142 | static struct kprobe_insn_cache kprobe_insn_slots = { | 137 | { |
138 | return module_alloc(PAGE_SIZE); | ||
139 | } | ||
140 | |||
141 | static void free_insn_page(void *page) | ||
142 | { | ||
143 | module_free(NULL, page); | ||
144 | } | ||
145 | |||
146 | struct kprobe_insn_cache kprobe_insn_slots = { | ||
147 | .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), | ||
148 | .alloc = alloc_insn_page, | ||
149 | .free = free_insn_page, | ||
143 | .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), | 150 | .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), |
144 | .insn_size = MAX_INSN_SIZE, | 151 | .insn_size = MAX_INSN_SIZE, |
145 | .nr_garbage = 0, | 152 | .nr_garbage = 0, |
@@ -150,10 +157,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); | |||
150 | * __get_insn_slot() - Find a slot on an executable page for an instruction. | 157 | * __get_insn_slot() - Find a slot on an executable page for an instruction. |
151 | * We allocate an executable page if there's no room on existing ones. | 158 | * We allocate an executable page if there's no room on existing ones. |
152 | */ | 159 | */ |
153 | static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) | 160 | kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) |
154 | { | 161 | { |
155 | struct kprobe_insn_page *kip; | 162 | struct kprobe_insn_page *kip; |
163 | kprobe_opcode_t *slot = NULL; | ||
156 | 164 | ||
165 | mutex_lock(&c->mutex); | ||
157 | retry: | 166 | retry: |
158 | list_for_each_entry(kip, &c->pages, list) { | 167 | list_for_each_entry(kip, &c->pages, list) { |
159 | if (kip->nused < slots_per_page(c)) { | 168 | if (kip->nused < slots_per_page(c)) { |
@@ -162,7 +171,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) | |||
162 | if (kip->slot_used[i] == SLOT_CLEAN) { | 171 | if (kip->slot_used[i] == SLOT_CLEAN) { |
163 | kip->slot_used[i] = SLOT_USED; | 172 | kip->slot_used[i] = SLOT_USED; |
164 | kip->nused++; | 173 | kip->nused++; |
165 | return kip->insns + (i * c->insn_size); | 174 | slot = kip->insns + (i * c->insn_size); |
175 | goto out; | ||
166 | } | 176 | } |
167 | } | 177 | } |
168 | /* kip->nused is broken. Fix it. */ | 178 | /* kip->nused is broken. Fix it. */ |
@@ -178,37 +188,29 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) | |||
178 | /* All out of space. Need to allocate a new page. */ | 188 | /* All out of space. Need to allocate a new page. */ |
179 | kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); | 189 | kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); |
180 | if (!kip) | 190 | if (!kip) |
181 | return NULL; | 191 | goto out; |
182 | 192 | ||
183 | /* | 193 | /* |
184 | * Use module_alloc so this page is within +/- 2GB of where the | 194 | * Use module_alloc so this page is within +/- 2GB of where the |
185 | * kernel image and loaded module images reside. This is required | 195 | * kernel image and loaded module images reside. This is required |
186 | * so x86_64 can correctly handle the %rip-relative fixups. | 196 | * so x86_64 can correctly handle the %rip-relative fixups. |
187 | */ | 197 | */ |
188 | kip->insns = module_alloc(PAGE_SIZE); | 198 | kip->insns = c->alloc(); |
189 | if (!kip->insns) { | 199 | if (!kip->insns) { |
190 | kfree(kip); | 200 | kfree(kip); |
191 | return NULL; | 201 | goto out; |
192 | } | 202 | } |
193 | INIT_LIST_HEAD(&kip->list); | 203 | INIT_LIST_HEAD(&kip->list); |
194 | memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); | 204 | memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); |
195 | kip->slot_used[0] = SLOT_USED; | 205 | kip->slot_used[0] = SLOT_USED; |
196 | kip->nused = 1; | 206 | kip->nused = 1; |
197 | kip->ngarbage = 0; | 207 | kip->ngarbage = 0; |
208 | kip->cache = c; | ||
198 | list_add(&kip->list, &c->pages); | 209 | list_add(&kip->list, &c->pages); |
199 | return kip->insns; | 210 | slot = kip->insns; |
200 | } | 211 | out: |
201 | 212 | mutex_unlock(&c->mutex); | |
202 | 213 | return slot; | |
203 | kprobe_opcode_t __kprobes *get_insn_slot(void) | ||
204 | { | ||
205 | kprobe_opcode_t *ret = NULL; | ||
206 | |||
207 | mutex_lock(&kprobe_insn_mutex); | ||
208 | ret = __get_insn_slot(&kprobe_insn_slots); | ||
209 | mutex_unlock(&kprobe_insn_mutex); | ||
210 | |||
211 | return ret; | ||
212 | } | 214 | } |
213 | 215 | ||
214 | /* Return 1 if all garbages are collected, otherwise 0. */ | 216 | /* Return 1 if all garbages are collected, otherwise 0. */ |
@@ -225,7 +227,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) | |||
225 | */ | 227 | */ |
226 | if (!list_is_singular(&kip->list)) { | 228 | if (!list_is_singular(&kip->list)) { |
227 | list_del(&kip->list); | 229 | list_del(&kip->list); |
228 | module_free(NULL, kip->insns); | 230 | kip->cache->free(kip->insns); |
229 | kfree(kip); | 231 | kfree(kip); |
230 | } | 232 | } |
231 | return 1; | 233 | return 1; |
@@ -255,11 +257,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) | |||
255 | return 0; | 257 | return 0; |
256 | } | 258 | } |
257 | 259 | ||
258 | static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, | 260 | void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, |
259 | kprobe_opcode_t *slot, int dirty) | 261 | kprobe_opcode_t *slot, int dirty) |
260 | { | 262 | { |
261 | struct kprobe_insn_page *kip; | 263 | struct kprobe_insn_page *kip; |
262 | 264 | ||
265 | mutex_lock(&c->mutex); | ||
263 | list_for_each_entry(kip, &c->pages, list) { | 266 | list_for_each_entry(kip, &c->pages, list) { |
264 | long idx = ((long)slot - (long)kip->insns) / | 267 | long idx = ((long)slot - (long)kip->insns) / |
265 | (c->insn_size * sizeof(kprobe_opcode_t)); | 268 | (c->insn_size * sizeof(kprobe_opcode_t)); |
@@ -272,45 +275,25 @@ static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, | |||
272 | collect_garbage_slots(c); | 275 | collect_garbage_slots(c); |
273 | } else | 276 | } else |
274 | collect_one_slot(kip, idx); | 277 | collect_one_slot(kip, idx); |
275 | return; | 278 | goto out; |
276 | } | 279 | } |
277 | } | 280 | } |
278 | /* Could not free this slot. */ | 281 | /* Could not free this slot. */ |
279 | WARN_ON(1); | 282 | WARN_ON(1); |
283 | out: | ||
284 | mutex_unlock(&c->mutex); | ||
280 | } | 285 | } |
281 | 286 | ||
282 | void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) | ||
283 | { | ||
284 | mutex_lock(&kprobe_insn_mutex); | ||
285 | __free_insn_slot(&kprobe_insn_slots, slot, dirty); | ||
286 | mutex_unlock(&kprobe_insn_mutex); | ||
287 | } | ||
288 | #ifdef CONFIG_OPTPROBES | 287 | #ifdef CONFIG_OPTPROBES |
289 | /* For optimized_kprobe buffer */ | 288 | /* For optimized_kprobe buffer */ |
290 | static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ | 289 | struct kprobe_insn_cache kprobe_optinsn_slots = { |
291 | static struct kprobe_insn_cache kprobe_optinsn_slots = { | 290 | .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), |
291 | .alloc = alloc_insn_page, | ||
292 | .free = free_insn_page, | ||
292 | .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), | 293 | .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), |
293 | /* .insn_size is initialized later */ | 294 | /* .insn_size is initialized later */ |
294 | .nr_garbage = 0, | 295 | .nr_garbage = 0, |
295 | }; | 296 | }; |
296 | /* Get a slot for optimized_kprobe buffer */ | ||
297 | kprobe_opcode_t __kprobes *get_optinsn_slot(void) | ||
298 | { | ||
299 | kprobe_opcode_t *ret = NULL; | ||
300 | |||
301 | mutex_lock(&kprobe_optinsn_mutex); | ||
302 | ret = __get_insn_slot(&kprobe_optinsn_slots); | ||
303 | mutex_unlock(&kprobe_optinsn_mutex); | ||
304 | |||
305 | return ret; | ||
306 | } | ||
307 | |||
308 | void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) | ||
309 | { | ||
310 | mutex_lock(&kprobe_optinsn_mutex); | ||
311 | __free_insn_slot(&kprobe_optinsn_slots, slot, dirty); | ||
312 | mutex_unlock(&kprobe_optinsn_mutex); | ||
313 | } | ||
314 | #endif | 297 | #endif |
315 | #endif | 298 | #endif |
316 | 299 | ||
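The cache descriptor now shared by kprobe_insn_slots and kprobe_optinsn_slots is no longer private to this file; judging from the fields initialized above, its definition (expected to live in include/linux/kprobes.h) is roughly:

#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/types.h>

struct kprobe_insn_cache {
        struct mutex mutex;             /* replaces the per-cache static mutexes */
        void *(*alloc)(void);           /* allocate one executable page */
        void (*free)(void *page);       /* release it again */
        struct list_head pages;         /* list of kprobe_insn_page */
        size_t insn_size;               /* size of one instruction slot */
        int nr_garbage;
};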
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 6ada93c23a9a..9659d38e008f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -113,7 +113,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, | |||
113 | unsigned long cnt; | 113 | unsigned long cnt; |
114 | int ret; | 114 | int ret; |
115 | 115 | ||
116 | if (strict_strtoul(buf, 0, &cnt)) | 116 | if (kstrtoul(buf, 0, &cnt)) |
117 | return -EINVAL; | 117 | return -EINVAL; |
118 | 118 | ||
119 | ret = crash_shrink_memory(cnt); | 119 | ret = crash_shrink_memory(cnt); |
diff --git a/kernel/lglock.c b/kernel/lglock.c index 6535a667a5a7..86ae2aebf004 100644 --- a/kernel/lglock.c +++ b/kernel/lglock.c | |||
@@ -21,7 +21,7 @@ void lg_local_lock(struct lglock *lg) | |||
21 | arch_spinlock_t *lock; | 21 | arch_spinlock_t *lock; |
22 | 22 | ||
23 | preempt_disable(); | 23 | preempt_disable(); |
24 | rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); | 24 | lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); |
25 | lock = this_cpu_ptr(lg->lock); | 25 | lock = this_cpu_ptr(lg->lock); |
26 | arch_spin_lock(lock); | 26 | arch_spin_lock(lock); |
27 | } | 27 | } |
@@ -31,7 +31,7 @@ void lg_local_unlock(struct lglock *lg) | |||
31 | { | 31 | { |
32 | arch_spinlock_t *lock; | 32 | arch_spinlock_t *lock; |
33 | 33 | ||
34 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | 34 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); |
35 | lock = this_cpu_ptr(lg->lock); | 35 | lock = this_cpu_ptr(lg->lock); |
36 | arch_spin_unlock(lock); | 36 | arch_spin_unlock(lock); |
37 | preempt_enable(); | 37 | preempt_enable(); |
@@ -43,7 +43,7 @@ void lg_local_lock_cpu(struct lglock *lg, int cpu) | |||
43 | arch_spinlock_t *lock; | 43 | arch_spinlock_t *lock; |
44 | 44 | ||
45 | preempt_disable(); | 45 | preempt_disable(); |
46 | rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); | 46 | lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); |
47 | lock = per_cpu_ptr(lg->lock, cpu); | 47 | lock = per_cpu_ptr(lg->lock, cpu); |
48 | arch_spin_lock(lock); | 48 | arch_spin_lock(lock); |
49 | } | 49 | } |
@@ -53,7 +53,7 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu) | |||
53 | { | 53 | { |
54 | arch_spinlock_t *lock; | 54 | arch_spinlock_t *lock; |
55 | 55 | ||
56 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | 56 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); |
57 | lock = per_cpu_ptr(lg->lock, cpu); | 57 | lock = per_cpu_ptr(lg->lock, cpu); |
58 | arch_spin_unlock(lock); | 58 | arch_spin_unlock(lock); |
59 | preempt_enable(); | 59 | preempt_enable(); |
@@ -65,7 +65,7 @@ void lg_global_lock(struct lglock *lg) | |||
65 | int i; | 65 | int i; |
66 | 66 | ||
67 | preempt_disable(); | 67 | preempt_disable(); |
68 | rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); | 68 | lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); |
69 | for_each_possible_cpu(i) { | 69 | for_each_possible_cpu(i) { |
70 | arch_spinlock_t *lock; | 70 | arch_spinlock_t *lock; |
71 | lock = per_cpu_ptr(lg->lock, i); | 71 | lock = per_cpu_ptr(lg->lock, i); |
@@ -78,7 +78,7 @@ void lg_global_unlock(struct lglock *lg) | |||
78 | { | 78 | { |
79 | int i; | 79 | int i; |
80 | 80 | ||
81 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | 81 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); |
82 | for_each_possible_cpu(i) { | 82 | for_each_possible_cpu(i) { |
83 | arch_spinlock_t *lock; | 83 | arch_spinlock_t *lock; |
84 | lock = per_cpu_ptr(lg->lock, i); | 84 | lock = per_cpu_ptr(lg->lock, i); |
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c index 2b6e69909c39..7cbd4507a7e6 100644 --- a/kernel/modsign_pubkey.c +++ b/kernel/modsign_pubkey.c | |||
@@ -18,14 +18,14 @@ | |||
18 | 18 | ||
19 | struct key *modsign_keyring; | 19 | struct key *modsign_keyring; |
20 | 20 | ||
21 | extern __initdata const u8 modsign_certificate_list[]; | 21 | extern __initconst const u8 modsign_certificate_list[]; |
22 | extern __initdata const u8 modsign_certificate_list_end[]; | 22 | extern __initconst const u8 modsign_certificate_list_end[]; |
23 | 23 | ||
24 | /* | 24 | /* |
25 | * We need to make sure ccache doesn't cache the .o file as it doesn't notice | 25 | * We need to make sure ccache doesn't cache the .o file as it doesn't notice |
26 | * if modsign.pub changes. | 26 | * if modsign.pub changes. |
27 | */ | 27 | */ |
28 | static __initdata const char annoy_ccache[] = __TIME__ "foo"; | 28 | static __initconst const char annoy_ccache[] = __TIME__ "foo"; |
29 | 29 | ||
30 | /* | 30 | /* |
31 | * Load the compiled-in keys | 31 | * Load the compiled-in keys |
diff --git a/kernel/module.c b/kernel/module.c index 206915830d29..dc582749fa13 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -136,6 +136,7 @@ static int param_set_bool_enable_only(const char *val, | |||
136 | } | 136 | } |
137 | 137 | ||
138 | static const struct kernel_param_ops param_ops_bool_enable_only = { | 138 | static const struct kernel_param_ops param_ops_bool_enable_only = { |
139 | .flags = KERNEL_PARAM_FL_NOARG, | ||
139 | .set = param_set_bool_enable_only, | 140 | .set = param_set_bool_enable_only, |
140 | .get = param_get_bool, | 141 | .get = param_get_bool, |
141 | }; | 142 | }; |
@@ -603,7 +604,7 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \ | |||
603 | static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ | 604 | static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ |
604 | struct module_kobject *mk, char *buffer) \ | 605 | struct module_kobject *mk, char *buffer) \ |
605 | { \ | 606 | { \ |
606 | return sprintf(buffer, "%s\n", mk->mod->field); \ | 607 | return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \ |
607 | } \ | 608 | } \ |
608 | static int modinfo_##field##_exists(struct module *mod) \ | 609 | static int modinfo_##field##_exists(struct module *mod) \ |
609 | { \ | 610 | { \ |
@@ -1611,6 +1612,14 @@ static void module_remove_modinfo_attrs(struct module *mod) | |||
1611 | kfree(mod->modinfo_attrs); | 1612 | kfree(mod->modinfo_attrs); |
1612 | } | 1613 | } |
1613 | 1614 | ||
1615 | static void mod_kobject_put(struct module *mod) | ||
1616 | { | ||
1617 | DECLARE_COMPLETION_ONSTACK(c); | ||
1618 | mod->mkobj.kobj_completion = &c; | ||
1619 | kobject_put(&mod->mkobj.kobj); | ||
1620 | wait_for_completion(&c); | ||
1621 | } | ||
1622 | |||
1614 | static int mod_sysfs_init(struct module *mod) | 1623 | static int mod_sysfs_init(struct module *mod) |
1615 | { | 1624 | { |
1616 | int err; | 1625 | int err; |
@@ -1638,7 +1647,7 @@ static int mod_sysfs_init(struct module *mod) | |||
1638 | err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, | 1647 | err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, |
1639 | "%s", mod->name); | 1648 | "%s", mod->name); |
1640 | if (err) | 1649 | if (err) |
1641 | kobject_put(&mod->mkobj.kobj); | 1650 | mod_kobject_put(mod); |
1642 | 1651 | ||
1643 | /* delay uevent until full sysfs population */ | 1652 | /* delay uevent until full sysfs population */ |
1644 | out: | 1653 | out: |
@@ -1682,7 +1691,7 @@ out_unreg_param: | |||
1682 | out_unreg_holders: | 1691 | out_unreg_holders: |
1683 | kobject_put(mod->holders_dir); | 1692 | kobject_put(mod->holders_dir); |
1684 | out_unreg: | 1693 | out_unreg: |
1685 | kobject_put(&mod->mkobj.kobj); | 1694 | mod_kobject_put(mod); |
1686 | out: | 1695 | out: |
1687 | return err; | 1696 | return err; |
1688 | } | 1697 | } |
@@ -1691,7 +1700,7 @@ static void mod_sysfs_fini(struct module *mod) | |||
1691 | { | 1700 | { |
1692 | remove_notes_attrs(mod); | 1701 | remove_notes_attrs(mod); |
1693 | remove_sect_attrs(mod); | 1702 | remove_sect_attrs(mod); |
1694 | kobject_put(&mod->mkobj.kobj); | 1703 | mod_kobject_put(mod); |
1695 | } | 1704 | } |
1696 | 1705 | ||
1697 | #else /* !CONFIG_SYSFS */ | 1706 | #else /* !CONFIG_SYSFS */ |
@@ -2540,21 +2549,20 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, | |||
2540 | /* Sets info->hdr and info->len. */ | 2549 | /* Sets info->hdr and info->len. */ |
2541 | static int copy_module_from_fd(int fd, struct load_info *info) | 2550 | static int copy_module_from_fd(int fd, struct load_info *info) |
2542 | { | 2551 | { |
2543 | struct file *file; | 2552 | struct fd f = fdget(fd); |
2544 | int err; | 2553 | int err; |
2545 | struct kstat stat; | 2554 | struct kstat stat; |
2546 | loff_t pos; | 2555 | loff_t pos; |
2547 | ssize_t bytes = 0; | 2556 | ssize_t bytes = 0; |
2548 | 2557 | ||
2549 | file = fget(fd); | 2558 | if (!f.file) |
2550 | if (!file) | ||
2551 | return -ENOEXEC; | 2559 | return -ENOEXEC; |
2552 | 2560 | ||
2553 | err = security_kernel_module_from_file(file); | 2561 | err = security_kernel_module_from_file(f.file); |
2554 | if (err) | 2562 | if (err) |
2555 | goto out; | 2563 | goto out; |
2556 | 2564 | ||
2557 | err = vfs_getattr(&file->f_path, &stat); | 2565 | err = vfs_getattr(&f.file->f_path, &stat); |
2558 | if (err) | 2566 | if (err) |
2559 | goto out; | 2567 | goto out; |
2560 | 2568 | ||
@@ -2577,7 +2585,7 @@ static int copy_module_from_fd(int fd, struct load_info *info) | |||
2577 | 2585 | ||
2578 | pos = 0; | 2586 | pos = 0; |
2579 | while (pos < stat.size) { | 2587 | while (pos < stat.size) { |
2580 | bytes = kernel_read(file, pos, (char *)(info->hdr) + pos, | 2588 | bytes = kernel_read(f.file, pos, (char *)(info->hdr) + pos, |
2581 | stat.size - pos); | 2589 | stat.size - pos); |
2582 | if (bytes < 0) { | 2590 | if (bytes < 0) { |
2583 | vfree(info->hdr); | 2591 | vfree(info->hdr); |
@@ -2591,7 +2599,7 @@ static int copy_module_from_fd(int fd, struct load_info *info) | |||
2591 | info->len = pos; | 2599 | info->len = pos; |
2592 | 2600 | ||
2593 | out: | 2601 | out: |
2594 | fput(file); | 2602 | fdput(f); |
2595 | return err; | 2603 | return err; |
2596 | } | 2604 | } |
2597 | 2605 | ||
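fdget() returns a struct fd rather than a bare struct file * and only takes a file reference when the task's descriptor table is shared; fdput() then drops exactly what fdget() took, which is why it replaces the unconditional fget()/fput() pair above. A small kernel-side sketch of the pattern (hypothetical helper, not from the patch):

#include <linux/file.h>
#include <linux/fs.h>

static loff_t fd_size(int fd)
{
        struct fd f = fdget(fd);
        loff_t size;

        if (!f.file)
                return -EBADF;

        size = i_size_read(file_inode(f.file));
        fdput(f);                       /* drops the reference only if fdget() took one */
        return size;
}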
diff --git a/kernel/mutex.c b/kernel/mutex.c index a52ee7bb830d..d24105b1b794 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -209,11 +209,13 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) | |||
209 | */ | 209 | */ |
210 | static inline int mutex_can_spin_on_owner(struct mutex *lock) | 210 | static inline int mutex_can_spin_on_owner(struct mutex *lock) |
211 | { | 211 | { |
212 | struct task_struct *owner; | ||
212 | int retval = 1; | 213 | int retval = 1; |
213 | 214 | ||
214 | rcu_read_lock(); | 215 | rcu_read_lock(); |
215 | if (lock->owner) | 216 | owner = ACCESS_ONCE(lock->owner); |
216 | retval = lock->owner->on_cpu; | 217 | if (owner) |
218 | retval = owner->on_cpu; | ||
217 | rcu_read_unlock(); | 219 | rcu_read_unlock(); |
218 | /* | 220 | /* |
219 | * if lock->owner is not set, the mutex owner may have just acquired | 221 | * if lock->owner is not set, the mutex owner may have just acquired |
@@ -408,7 +410,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, | |||
408 | static __always_inline int __sched | 410 | static __always_inline int __sched |
409 | __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | 411 | __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, |
410 | struct lockdep_map *nest_lock, unsigned long ip, | 412 | struct lockdep_map *nest_lock, unsigned long ip, |
411 | struct ww_acquire_ctx *ww_ctx) | 413 | struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) |
412 | { | 414 | { |
413 | struct task_struct *task = current; | 415 | struct task_struct *task = current; |
414 | struct mutex_waiter waiter; | 416 | struct mutex_waiter waiter; |
@@ -448,7 +450,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
448 | struct task_struct *owner; | 450 | struct task_struct *owner; |
449 | struct mspin_node node; | 451 | struct mspin_node node; |
450 | 452 | ||
451 | if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { | 453 | if (use_ww_ctx && ww_ctx->acquired > 0) { |
452 | struct ww_mutex *ww; | 454 | struct ww_mutex *ww; |
453 | 455 | ||
454 | ww = container_of(lock, struct ww_mutex, base); | 456 | ww = container_of(lock, struct ww_mutex, base); |
@@ -461,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
461 | * performed the optimistic spinning cannot be done. | 463 | * performed the optimistic spinning cannot be done. |
462 | */ | 464 | */ |
463 | if (ACCESS_ONCE(ww->ctx)) | 465 | if (ACCESS_ONCE(ww->ctx)) |
464 | break; | 466 | goto slowpath; |
465 | } | 467 | } |
466 | 468 | ||
467 | /* | 469 | /* |
@@ -472,13 +474,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
472 | owner = ACCESS_ONCE(lock->owner); | 474 | owner = ACCESS_ONCE(lock->owner); |
473 | if (owner && !mutex_spin_on_owner(lock, owner)) { | 475 | if (owner && !mutex_spin_on_owner(lock, owner)) { |
474 | mspin_unlock(MLOCK(lock), &node); | 476 | mspin_unlock(MLOCK(lock), &node); |
475 | break; | 477 | goto slowpath; |
476 | } | 478 | } |
477 | 479 | ||
478 | if ((atomic_read(&lock->count) == 1) && | 480 | if ((atomic_read(&lock->count) == 1) && |
479 | (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { | 481 | (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { |
480 | lock_acquired(&lock->dep_map, ip); | 482 | lock_acquired(&lock->dep_map, ip); |
481 | if (!__builtin_constant_p(ww_ctx == NULL)) { | 483 | if (use_ww_ctx) { |
482 | struct ww_mutex *ww; | 484 | struct ww_mutex *ww; |
483 | ww = container_of(lock, struct ww_mutex, base); | 485 | ww = container_of(lock, struct ww_mutex, base); |
484 | 486 | ||
@@ -499,7 +501,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
499 | * the owner complete. | 501 | * the owner complete. |
500 | */ | 502 | */ |
501 | if (!owner && (need_resched() || rt_task(task))) | 503 | if (!owner && (need_resched() || rt_task(task))) |
502 | break; | 504 | goto slowpath; |
503 | 505 | ||
504 | /* | 506 | /* |
505 | * The cpu_relax() call is a compiler barrier which forces | 507 | * The cpu_relax() call is a compiler barrier which forces |
@@ -513,6 +515,10 @@ slowpath: | |||
513 | #endif | 515 | #endif |
514 | spin_lock_mutex(&lock->wait_lock, flags); | 516 | spin_lock_mutex(&lock->wait_lock, flags); |
515 | 517 | ||
518 | /* once more, can we acquire the lock? */ | ||
519 | if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1)) | ||
520 | goto skip_wait; | ||
521 | |||
516 | debug_mutex_lock_common(lock, &waiter); | 522 | debug_mutex_lock_common(lock, &waiter); |
517 | debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); | 523 | debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); |
518 | 524 | ||
@@ -520,9 +526,6 @@ slowpath: | |||
520 | list_add_tail(&waiter.list, &lock->wait_list); | 526 | list_add_tail(&waiter.list, &lock->wait_list); |
521 | waiter.task = task; | 527 | waiter.task = task; |
522 | 528 | ||
523 | if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1)) | ||
524 | goto done; | ||
525 | |||
526 | lock_contended(&lock->dep_map, ip); | 529 | lock_contended(&lock->dep_map, ip); |
527 | 530 | ||
528 | for (;;) { | 531 | for (;;) { |
@@ -536,7 +539,7 @@ slowpath: | |||
536 | * other waiters: | 539 | * other waiters: |
537 | */ | 540 | */ |
538 | if (MUTEX_SHOW_NO_WAITER(lock) && | 541 | if (MUTEX_SHOW_NO_WAITER(lock) && |
539 | (atomic_xchg(&lock->count, -1) == 1)) | 542 | (atomic_xchg(&lock->count, -1) == 1)) |
540 | break; | 543 | break; |
541 | 544 | ||
542 | /* | 545 | /* |
@@ -548,7 +551,7 @@ slowpath: | |||
548 | goto err; | 551 | goto err; |
549 | } | 552 | } |
550 | 553 | ||
551 | if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { | 554 | if (use_ww_ctx && ww_ctx->acquired > 0) { |
552 | ret = __mutex_lock_check_stamp(lock, ww_ctx); | 555 | ret = __mutex_lock_check_stamp(lock, ww_ctx); |
553 | if (ret) | 556 | if (ret) |
554 | goto err; | 557 | goto err; |
@@ -561,24 +564,25 @@ slowpath: | |||
561 | schedule_preempt_disabled(); | 564 | schedule_preempt_disabled(); |
562 | spin_lock_mutex(&lock->wait_lock, flags); | 565 | spin_lock_mutex(&lock->wait_lock, flags); |
563 | } | 566 | } |
567 | mutex_remove_waiter(lock, &waiter, current_thread_info()); | ||
568 | /* set it to 0 if there are no waiters left: */ | ||
569 | if (likely(list_empty(&lock->wait_list))) | ||
570 | atomic_set(&lock->count, 0); | ||
571 | debug_mutex_free_waiter(&waiter); | ||
564 | 572 | ||
565 | done: | 573 | skip_wait: |
574 | /* got the lock - cleanup and rejoice! */ | ||
566 | lock_acquired(&lock->dep_map, ip); | 575 | lock_acquired(&lock->dep_map, ip); |
567 | /* got the lock - rejoice! */ | ||
568 | mutex_remove_waiter(lock, &waiter, current_thread_info()); | ||
569 | mutex_set_owner(lock); | 576 | mutex_set_owner(lock); |
570 | 577 | ||
571 | if (!__builtin_constant_p(ww_ctx == NULL)) { | 578 | if (use_ww_ctx) { |
572 | struct ww_mutex *ww = container_of(lock, | 579 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); |
573 | struct ww_mutex, | ||
574 | base); | ||
575 | struct mutex_waiter *cur; | 580 | struct mutex_waiter *cur; |
576 | 581 | ||
577 | /* | 582 | /* |
578 | * This branch gets optimized out for the common case, | 583 | * This branch gets optimized out for the common case, |
579 | * and is only important for ww_mutex_lock. | 584 | * and is only important for ww_mutex_lock. |
580 | */ | 585 | */ |
581 | |||
582 | ww_mutex_lock_acquired(ww, ww_ctx); | 586 | ww_mutex_lock_acquired(ww, ww_ctx); |
583 | ww->ctx = ww_ctx; | 587 | ww->ctx = ww_ctx; |
584 | 588 | ||
@@ -592,15 +596,8 @@ done: | |||
592 | } | 596 | } |
593 | } | 597 | } |
594 | 598 | ||
595 | /* set it to 0 if there are no waiters left: */ | ||
596 | if (likely(list_empty(&lock->wait_list))) | ||
597 | atomic_set(&lock->count, 0); | ||
598 | |||
599 | spin_unlock_mutex(&lock->wait_lock, flags); | 599 | spin_unlock_mutex(&lock->wait_lock, flags); |
600 | |||
601 | debug_mutex_free_waiter(&waiter); | ||
602 | preempt_enable(); | 600 | preempt_enable(); |
603 | |||
604 | return 0; | 601 | return 0; |
605 | 602 | ||
606 | err: | 603 | err: |
@@ -618,7 +615,7 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass) | |||
618 | { | 615 | { |
619 | might_sleep(); | 616 | might_sleep(); |
620 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, | 617 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, |
621 | subclass, NULL, _RET_IP_, NULL); | 618 | subclass, NULL, _RET_IP_, NULL, 0); |
622 | } | 619 | } |
623 | 620 | ||
624 | EXPORT_SYMBOL_GPL(mutex_lock_nested); | 621 | EXPORT_SYMBOL_GPL(mutex_lock_nested); |
@@ -628,7 +625,7 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) | |||
628 | { | 625 | { |
629 | might_sleep(); | 626 | might_sleep(); |
630 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, | 627 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, |
631 | 0, nest, _RET_IP_, NULL); | 628 | 0, nest, _RET_IP_, NULL, 0); |
632 | } | 629 | } |
633 | 630 | ||
634 | EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); | 631 | EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); |
@@ -638,7 +635,7 @@ mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) | |||
638 | { | 635 | { |
639 | might_sleep(); | 636 | might_sleep(); |
640 | return __mutex_lock_common(lock, TASK_KILLABLE, | 637 | return __mutex_lock_common(lock, TASK_KILLABLE, |
641 | subclass, NULL, _RET_IP_, NULL); | 638 | subclass, NULL, _RET_IP_, NULL, 0); |
642 | } | 639 | } |
643 | EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); | 640 | EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); |
644 | 641 | ||
@@ -647,7 +644,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) | |||
647 | { | 644 | { |
648 | might_sleep(); | 645 | might_sleep(); |
649 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, | 646 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, |
650 | subclass, NULL, _RET_IP_, NULL); | 647 | subclass, NULL, _RET_IP_, NULL, 0); |
651 | } | 648 | } |
652 | 649 | ||
653 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); | 650 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); |
@@ -685,7 +682,7 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | |||
685 | 682 | ||
686 | might_sleep(); | 683 | might_sleep(); |
687 | ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, | 684 | ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, |
688 | 0, &ctx->dep_map, _RET_IP_, ctx); | 685 | 0, &ctx->dep_map, _RET_IP_, ctx, 1); |
689 | if (!ret && ctx->acquired > 1) | 686 | if (!ret && ctx->acquired > 1) |
690 | return ww_mutex_deadlock_injection(lock, ctx); | 687 | return ww_mutex_deadlock_injection(lock, ctx); |
691 | 688 | ||
@@ -700,7 +697,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | |||
700 | 697 | ||
701 | might_sleep(); | 698 | might_sleep(); |
702 | ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, | 699 | ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, |
703 | 0, &ctx->dep_map, _RET_IP_, ctx); | 700 | 0, &ctx->dep_map, _RET_IP_, ctx, 1); |
704 | 701 | ||
705 | if (!ret && ctx->acquired > 1) | 702 | if (!ret && ctx->acquired > 1) |
706 | return ww_mutex_deadlock_injection(lock, ctx); | 703 | return ww_mutex_deadlock_injection(lock, ctx); |
@@ -812,28 +809,28 @@ __mutex_lock_slowpath(atomic_t *lock_count) | |||
812 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 809 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
813 | 810 | ||
814 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, | 811 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, |
815 | NULL, _RET_IP_, NULL); | 812 | NULL, _RET_IP_, NULL, 0); |
816 | } | 813 | } |
817 | 814 | ||
818 | static noinline int __sched | 815 | static noinline int __sched |
819 | __mutex_lock_killable_slowpath(struct mutex *lock) | 816 | __mutex_lock_killable_slowpath(struct mutex *lock) |
820 | { | 817 | { |
821 | return __mutex_lock_common(lock, TASK_KILLABLE, 0, | 818 | return __mutex_lock_common(lock, TASK_KILLABLE, 0, |
822 | NULL, _RET_IP_, NULL); | 819 | NULL, _RET_IP_, NULL, 0); |
823 | } | 820 | } |
824 | 821 | ||
825 | static noinline int __sched | 822 | static noinline int __sched |
826 | __mutex_lock_interruptible_slowpath(struct mutex *lock) | 823 | __mutex_lock_interruptible_slowpath(struct mutex *lock) |
827 | { | 824 | { |
828 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, | 825 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, |
829 | NULL, _RET_IP_, NULL); | 826 | NULL, _RET_IP_, NULL, 0); |
830 | } | 827 | } |
831 | 828 | ||
832 | static noinline int __sched | 829 | static noinline int __sched |
833 | __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | 830 | __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) |
834 | { | 831 | { |
835 | return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, | 832 | return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, |
836 | NULL, _RET_IP_, ctx); | 833 | NULL, _RET_IP_, ctx, 1); |
837 | } | 834 | } |
838 | 835 | ||
839 | static noinline int __sched | 836 | static noinline int __sched |
@@ -841,7 +838,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, | |||
841 | struct ww_acquire_ctx *ctx) | 838 | struct ww_acquire_ctx *ctx) |
842 | { | 839 | { |
843 | return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, | 840 | return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, |
844 | NULL, _RET_IP_, ctx); | 841 | NULL, _RET_IP_, ctx, 1); |
845 | } | 842 | } |
846 | 843 | ||
847 | #endif | 844 | #endif |
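The __builtin_constant_p(ww_ctx == NULL) tests become an explicit const bool parameter: __mutex_lock_common() is always inlined, each call site passes a literal 0 or 1, and the compiler can drop the ww-mutex branches where they can never be taken. A standalone sketch of that idiom (hypothetical names, compiles in userspace):

#include <stdbool.h>
#include <stddef.h>

#define __always_inline inline __attribute__((__always_inline__))

struct ctx { int acquired; };                     /* stand-in for ww_acquire_ctx */
static void check_stamp(struct ctx *c) { (void)c; }

static __always_inline int do_lock(struct ctx *ctx, const bool use_ctx)
{
        if (use_ctx && ctx->acquired > 0)         /* folded away when use_ctx is 0 */
                check_stamp(ctx);
        return 0;
}

static int lock_plain(void)               { return do_lock(NULL, 0); }
static int lock_with_ctx(struct ctx *c)   { return do_lock(c, 1); }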
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 997cbb951a3b..8e7811086b82 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
@@ -126,22 +126,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) | |||
126 | struct nsproxy *old_ns = tsk->nsproxy; | 126 | struct nsproxy *old_ns = tsk->nsproxy; |
127 | struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); | 127 | struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); |
128 | struct nsproxy *new_ns; | 128 | struct nsproxy *new_ns; |
129 | int err = 0; | ||
130 | 129 | ||
131 | if (!old_ns) | 130 | if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | |
131 | CLONE_NEWPID | CLONE_NEWNET)))) { | ||
132 | get_nsproxy(old_ns); | ||
132 | return 0; | 133 | return 0; |
133 | |||
134 | get_nsproxy(old_ns); | ||
135 | |||
136 | if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | | ||
137 | CLONE_NEWPID | CLONE_NEWNET))) | ||
138 | return 0; | ||
139 | |||
140 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) { | ||
141 | err = -EPERM; | ||
142 | goto out; | ||
143 | } | 134 | } |
144 | 135 | ||
136 | if (!ns_capable(user_ns, CAP_SYS_ADMIN)) | ||
137 | return -EPERM; | ||
138 | |||
145 | /* | 139 | /* |
146 | * CLONE_NEWIPC must detach from the undolist: after switching | 140 | * CLONE_NEWIPC must detach from the undolist: after switching |
147 | * to a new ipc namespace, the semaphore arrays from the old | 141 | * to a new ipc namespace, the semaphore arrays from the old |
@@ -149,22 +143,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) | |||
149 | * means share undolist with parent, so we must forbid using | 143 | * means share undolist with parent, so we must forbid using |
150 | * it along with CLONE_NEWIPC. | 144 | * it along with CLONE_NEWIPC. |
151 | */ | 145 | */ |
152 | if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) { | 146 | if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) == |
153 | err = -EINVAL; | 147 | (CLONE_NEWIPC | CLONE_SYSVSEM)) |
154 | goto out; | 148 | return -EINVAL; |
155 | } | ||
156 | 149 | ||
157 | new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); | 150 | new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); |
158 | if (IS_ERR(new_ns)) { | 151 | if (IS_ERR(new_ns)) |
159 | err = PTR_ERR(new_ns); | 152 | return PTR_ERR(new_ns); |
160 | goto out; | ||
161 | } | ||
162 | 153 | ||
163 | tsk->nsproxy = new_ns; | 154 | tsk->nsproxy = new_ns; |
164 | 155 | return 0; | |
165 | out: | ||
166 | put_nsproxy(old_ns); | ||
167 | return err; | ||
168 | } | 156 | } |
169 | 157 | ||
170 | void free_nsproxy(struct nsproxy *ns) | 158 | void free_nsproxy(struct nsproxy *ns) |
diff --git a/kernel/padata.c b/kernel/padata.c index 072f4ee4eb89..07af2c95dcfe 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
@@ -846,6 +846,8 @@ static int padata_cpu_callback(struct notifier_block *nfb, | |||
846 | switch (action) { | 846 | switch (action) { |
847 | case CPU_ONLINE: | 847 | case CPU_ONLINE: |
848 | case CPU_ONLINE_FROZEN: | 848 | case CPU_ONLINE_FROZEN: |
849 | case CPU_DOWN_FAILED: | ||
850 | case CPU_DOWN_FAILED_FROZEN: | ||
849 | if (!pinst_has_cpu(pinst, cpu)) | 851 | if (!pinst_has_cpu(pinst, cpu)) |
850 | break; | 852 | break; |
851 | mutex_lock(&pinst->lock); | 853 | mutex_lock(&pinst->lock); |
@@ -857,6 +859,8 @@ static int padata_cpu_callback(struct notifier_block *nfb, | |||
857 | 859 | ||
858 | case CPU_DOWN_PREPARE: | 860 | case CPU_DOWN_PREPARE: |
859 | case CPU_DOWN_PREPARE_FROZEN: | 861 | case CPU_DOWN_PREPARE_FROZEN: |
862 | case CPU_UP_CANCELED: | ||
863 | case CPU_UP_CANCELED_FROZEN: | ||
860 | if (!pinst_has_cpu(pinst, cpu)) | 864 | if (!pinst_has_cpu(pinst, cpu)) |
861 | break; | 865 | break; |
862 | mutex_lock(&pinst->lock); | 866 | mutex_lock(&pinst->lock); |
@@ -865,22 +869,6 @@ static int padata_cpu_callback(struct notifier_block *nfb, | |||
865 | if (err) | 869 | if (err) |
866 | return notifier_from_errno(err); | 870 | return notifier_from_errno(err); |
867 | break; | 871 | break; |
868 | |||
869 | case CPU_UP_CANCELED: | ||
870 | case CPU_UP_CANCELED_FROZEN: | ||
871 | if (!pinst_has_cpu(pinst, cpu)) | ||
872 | break; | ||
873 | mutex_lock(&pinst->lock); | ||
874 | __padata_remove_cpu(pinst, cpu); | ||
875 | mutex_unlock(&pinst->lock); | ||
876 | |||
877 | case CPU_DOWN_FAILED: | ||
878 | case CPU_DOWN_FAILED_FROZEN: | ||
879 | if (!pinst_has_cpu(pinst, cpu)) | ||
880 | break; | ||
881 | mutex_lock(&pinst->lock); | ||
882 | __padata_add_cpu(pinst, cpu); | ||
883 | mutex_unlock(&pinst->lock); | ||
884 | } | 872 | } |
885 | 873 | ||
886 | return NOTIFY_OK; | 874 | return NOTIFY_OK; |
@@ -1086,18 +1074,18 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq, | |||
1086 | 1074 | ||
1087 | pinst->flags = 0; | 1075 | pinst->flags = 0; |
1088 | 1076 | ||
1089 | #ifdef CONFIG_HOTPLUG_CPU | ||
1090 | pinst->cpu_notifier.notifier_call = padata_cpu_callback; | ||
1091 | pinst->cpu_notifier.priority = 0; | ||
1092 | register_hotcpu_notifier(&pinst->cpu_notifier); | ||
1093 | #endif | ||
1094 | |||
1095 | put_online_cpus(); | 1077 | put_online_cpus(); |
1096 | 1078 | ||
1097 | BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); | 1079 | BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); |
1098 | kobject_init(&pinst->kobj, &padata_attr_type); | 1080 | kobject_init(&pinst->kobj, &padata_attr_type); |
1099 | mutex_init(&pinst->lock); | 1081 | mutex_init(&pinst->lock); |
1100 | 1082 | ||
1083 | #ifdef CONFIG_HOTPLUG_CPU | ||
1084 | pinst->cpu_notifier.notifier_call = padata_cpu_callback; | ||
1085 | pinst->cpu_notifier.priority = 0; | ||
1086 | register_hotcpu_notifier(&pinst->cpu_notifier); | ||
1087 | #endif | ||
1088 | |||
1101 | return pinst; | 1089 | return pinst; |
1102 | 1090 | ||
1103 | err_free_masks: | 1091 | err_free_masks: |
diff --git a/kernel/panic.c b/kernel/panic.c index 801864600514..b6c482ccc5db 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -123,10 +123,14 @@ void panic(const char *fmt, ...) | |||
123 | */ | 123 | */ |
124 | smp_send_stop(); | 124 | smp_send_stop(); |
125 | 125 | ||
126 | kmsg_dump(KMSG_DUMP_PANIC); | 126 | /* |
127 | 127 | * Run any panic handlers, including those that might need to | |
128 | * add information to the kmsg dump output. | ||
129 | */ | ||
128 | atomic_notifier_call_chain(&panic_notifier_list, 0, buf); | 130 | atomic_notifier_call_chain(&panic_notifier_list, 0, buf); |
129 | 131 | ||
132 | kmsg_dump(KMSG_DUMP_PANIC); | ||
133 | |||
130 | bust_spinlocks(0); | 134 | bust_spinlocks(0); |
131 | 135 | ||
132 | if (!panic_blink) | 136 | if (!panic_blink) |
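Because the notifier chain now runs before kmsg_dump(), a panic handler can add information that the registered dumpers will capture. A minimal kernel-module sketch of such a handler (hypothetical module, not from the patch):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>

static int note_panic(struct notifier_block *nb, unsigned long event, void *msg)
{
        /* runs before kmsg_dump(KMSG_DUMP_PANIC), so this line reaches the dump */
        pr_emerg("panic context: %s\n", (char *)msg);
        return NOTIFY_DONE;
}

static struct notifier_block panic_nb = { .notifier_call = note_panic };

static int __init note_init(void)
{
        atomic_notifier_chain_register(&panic_notifier_list, &panic_nb);
        return 0;
}

static void __exit note_exit(void)
{
        atomic_notifier_chain_unregister(&panic_notifier_list, &panic_nb);
}

module_init(note_init);
module_exit(note_exit);
MODULE_LICENSE("GPL");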
diff --git a/kernel/params.c b/kernel/params.c index 440e65d1a544..c00d5b502aa4 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -103,8 +103,8 @@ static int parse_one(char *param, | |||
103 | || params[i].level > max_level) | 103 | || params[i].level > max_level) |
104 | return 0; | 104 | return 0; |
105 | /* No one handled NULL, so do it here. */ | 105 | /* No one handled NULL, so do it here. */ |
106 | if (!val && params[i].ops->set != param_set_bool | 106 | if (!val && |
107 | && params[i].ops->set != param_set_bint) | 107 | !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG)) |
108 | return -EINVAL; | 108 | return -EINVAL; |
109 | pr_debug("handling %s with %p\n", param, | 109 | pr_debug("handling %s with %p\n", param, |
110 | params[i].ops->set); | 110 | params[i].ops->set); |
@@ -241,7 +241,8 @@ int parse_args(const char *doing, | |||
241 | } \ | 241 | } \ |
242 | int param_get_##name(char *buffer, const struct kernel_param *kp) \ | 242 | int param_get_##name(char *buffer, const struct kernel_param *kp) \ |
243 | { \ | 243 | { \ |
244 | return sprintf(buffer, format, *((type *)kp->arg)); \ | 244 | return scnprintf(buffer, PAGE_SIZE, format, \ |
245 | *((type *)kp->arg)); \ | ||
245 | } \ | 246 | } \ |
246 | struct kernel_param_ops param_ops_##name = { \ | 247 | struct kernel_param_ops param_ops_##name = { \ |
247 | .set = param_set_##name, \ | 248 | .set = param_set_##name, \ |
@@ -252,13 +253,13 @@ int parse_args(const char *doing, | |||
252 | EXPORT_SYMBOL(param_ops_##name) | 253 | EXPORT_SYMBOL(param_ops_##name) |
253 | 254 | ||
254 | 255 | ||
255 | STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); | 256 | STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul); |
256 | STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); | 257 | STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol); |
257 | STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul); | 258 | STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul); |
258 | STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol); | 259 | STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol); |
259 | STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); | 260 | STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul); |
260 | STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); | 261 | STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol); |
261 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); | 262 | STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul); |
262 | 263 | ||
263 | int param_set_charp(const char *val, const struct kernel_param *kp) | 264 | int param_set_charp(const char *val, const struct kernel_param *kp) |
264 | { | 265 | { |
@@ -285,7 +286,7 @@ EXPORT_SYMBOL(param_set_charp); | |||
285 | 286 | ||
286 | int param_get_charp(char *buffer, const struct kernel_param *kp) | 287 | int param_get_charp(char *buffer, const struct kernel_param *kp) |
287 | { | 288 | { |
288 | return sprintf(buffer, "%s", *((char **)kp->arg)); | 289 | return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg)); |
289 | } | 290 | } |
290 | EXPORT_SYMBOL(param_get_charp); | 291 | EXPORT_SYMBOL(param_get_charp); |
291 | 292 | ||
@@ -320,6 +321,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp) | |||
320 | EXPORT_SYMBOL(param_get_bool); | 321 | EXPORT_SYMBOL(param_get_bool); |
321 | 322 | ||
322 | struct kernel_param_ops param_ops_bool = { | 323 | struct kernel_param_ops param_ops_bool = { |
324 | .flags = KERNEL_PARAM_FL_NOARG, | ||
323 | .set = param_set_bool, | 325 | .set = param_set_bool, |
324 | .get = param_get_bool, | 326 | .get = param_get_bool, |
325 | }; | 327 | }; |
@@ -370,6 +372,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp) | |||
370 | EXPORT_SYMBOL(param_set_bint); | 372 | EXPORT_SYMBOL(param_set_bint); |
371 | 373 | ||
372 | struct kernel_param_ops param_ops_bint = { | 374 | struct kernel_param_ops param_ops_bint = { |
375 | .flags = KERNEL_PARAM_FL_NOARG, | ||
373 | .set = param_set_bint, | 376 | .set = param_set_bint, |
374 | .get = param_get_int, | 377 | .get = param_get_int, |
375 | }; | 378 | }; |
@@ -827,7 +830,7 @@ ssize_t __modver_version_show(struct module_attribute *mattr, | |||
827 | struct module_version_attribute *vattr = | 830 | struct module_version_attribute *vattr = |
828 | container_of(mattr, struct module_version_attribute, mattr); | 831 | container_of(mattr, struct module_version_attribute, mattr); |
829 | 832 | ||
830 | return sprintf(buf, "%s\n", vattr->version); | 833 | return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version); |
831 | } | 834 | } |
832 | 835 | ||
833 | extern const struct module_version_attribute *__start___modver[]; | 836 | extern const struct module_version_attribute *__start___modver[]; |
@@ -912,7 +915,14 @@ static const struct kset_uevent_ops module_uevent_ops = { | |||
912 | struct kset *module_kset; | 915 | struct kset *module_kset; |
913 | int module_sysfs_initialized; | 916 | int module_sysfs_initialized; |
914 | 917 | ||
918 | static void module_kobj_release(struct kobject *kobj) | ||
919 | { | ||
920 | struct module_kobject *mk = to_module_kobject(kobj); | ||
921 | complete(mk->kobj_completion); | ||
922 | } | ||
923 | |||
915 | struct kobj_type module_ktype = { | 924 | struct kobj_type module_ktype = { |
925 | .release = module_kobj_release, | ||
916 | .sysfs_ops = &module_sysfs_ops, | 926 | .sysfs_ops = &module_sysfs_ops, |
917 | }; | 927 | }; |
918 | 928 | ||
diff --git a/kernel/pid.c b/kernel/pid.c index 66505c1dfc51..9b9a26698144 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -265,6 +265,7 @@ void free_pid(struct pid *pid) | |||
265 | struct pid_namespace *ns = upid->ns; | 265 | struct pid_namespace *ns = upid->ns; |
266 | hlist_del_rcu(&upid->pid_chain); | 266 | hlist_del_rcu(&upid->pid_chain); |
267 | switch(--ns->nr_hashed) { | 267 | switch(--ns->nr_hashed) { |
268 | case 2: | ||
268 | case 1: | 269 | case 1: |
269 | /* When all that is left in the pid namespace | 270 | /* When all that is left in the pid namespace |
270 | * is the reaper wake up the reaper. The reaper | 271 | * is the reaper wake up the reaper. The reaper |
@@ -272,6 +273,11 @@ void free_pid(struct pid *pid) | |||
272 | */ | 273 | */ |
273 | wake_up_process(ns->child_reaper); | 274 | wake_up_process(ns->child_reaper); |
274 | break; | 275 | break; |
276 | case PIDNS_HASH_ADDING: | ||
277 | /* Handle a fork failure of the first process */ | ||
278 | WARN_ON(ns->child_reaper); | ||
279 | ns->nr_hashed = 0; | ||
280 | /* fall through */ | ||
275 | case 0: | 281 | case 0: |
276 | schedule_work(&ns->proc_work); | 282 | schedule_work(&ns->proc_work); |
277 | break; | 283 | break; |
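
The free_pid() change adds two cases to the nr_hashed countdown: the reaper is now woken when the count drops to 2 as well as 1, and a failed first fork, where the count falls back to the PIDNS_HASH_ADDING bias, is handled by zeroing the count and falling through to the existing case 0 cleanup. Below is a toy userspace model of that switch; the bias constant is a placeholder and the printouts stand in for wake_up_process() and schedule_work().

#include <stdio.h>

#define PIDNS_HASH_ADDING (1U << 31)    /* bias placeholder */

/*
 * Toy model of the free_pid() bookkeeping: nr_hashed counts hashed pids
 * and stays biased by PIDNS_HASH_ADDING until the first fork succeeds.
 */
static void toy_free_pid(unsigned int *nr_hashed)
{
        switch (--(*nr_hashed)) {
        case 2:
        case 1:
                /* Effectively only the reaper is left: wake it up. */
                printf("wake_up_process(child_reaper)\n");
                break;
        case PIDNS_HASH_ADDING:
                /* The first fork in the namespace failed: drop the
                 * bias and fall through to the final cleanup. */
                *nr_hashed = 0;
                /* fall through */
        case 0:
                printf("schedule_work(proc_cleanup)\n");
                break;
        }
}

int main(void)
{
        unsigned int nr_hashed = PIDNS_HASH_ADDING + 1;

        toy_free_pid(&nr_hashed);       /* models the failed first fork */
        return 0;
}
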
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 601bb361c235..42086551a24a 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -329,7 +329,7 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) | |||
329 | struct pid_namespace *ancestor, *new = ns; | 329 | struct pid_namespace *ancestor, *new = ns; |
330 | 330 | ||
331 | if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || | 331 | if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || |
332 | !nsown_capable(CAP_SYS_ADMIN)) | 332 | !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) |
333 | return -EPERM; | 333 | return -EPERM; |
334 | 334 | ||
335 | /* | 335 | /* |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b26f5f1e773e..0121dab83f43 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -39,7 +39,7 @@ static int resume_delay; | |||
39 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; | 39 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; |
40 | dev_t swsusp_resume_device; | 40 | dev_t swsusp_resume_device; |
41 | sector_t swsusp_resume_block; | 41 | sector_t swsusp_resume_block; |
42 | int in_suspend __nosavedata; | 42 | __visible int in_suspend __nosavedata; |
43 | 43 | ||
44 | enum { | 44 | enum { |
45 | HIBERNATION_INVALID, | 45 | HIBERNATION_INVALID, |
@@ -644,22 +644,23 @@ int hibernate(void) | |||
644 | if (error) | 644 | if (error) |
645 | goto Exit; | 645 | goto Exit; |
646 | 646 | ||
647 | /* Allocate memory management structures */ | ||
648 | error = create_basic_memory_bitmaps(); | ||
649 | if (error) | ||
650 | goto Exit; | ||
651 | |||
652 | printk(KERN_INFO "PM: Syncing filesystems ... "); | 647 | printk(KERN_INFO "PM: Syncing filesystems ... "); |
653 | sys_sync(); | 648 | sys_sync(); |
654 | printk("done.\n"); | 649 | printk("done.\n"); |
655 | 650 | ||
656 | error = freeze_processes(); | 651 | error = freeze_processes(); |
657 | if (error) | 652 | if (error) |
658 | goto Free_bitmaps; | 653 | goto Exit; |
654 | |||
655 | lock_device_hotplug(); | ||
656 | /* Allocate memory management structures */ | ||
657 | error = create_basic_memory_bitmaps(); | ||
658 | if (error) | ||
659 | goto Thaw; | ||
659 | 660 | ||
660 | error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); | 661 | error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); |
661 | if (error || freezer_test_done) | 662 | if (error || freezer_test_done) |
662 | goto Thaw; | 663 | goto Free_bitmaps; |
663 | 664 | ||
664 | if (in_suspend) { | 665 | if (in_suspend) { |
665 | unsigned int flags = 0; | 666 | unsigned int flags = 0; |
@@ -682,14 +683,14 @@ int hibernate(void) | |||
682 | pr_debug("PM: Image restored successfully.\n"); | 683 | pr_debug("PM: Image restored successfully.\n"); |
683 | } | 684 | } |
684 | 685 | ||
686 | Free_bitmaps: | ||
687 | free_basic_memory_bitmaps(); | ||
685 | Thaw: | 688 | Thaw: |
689 | unlock_device_hotplug(); | ||
686 | thaw_processes(); | 690 | thaw_processes(); |
687 | 691 | ||
688 | /* Don't bother checking whether freezer_test_done is true */ | 692 | /* Don't bother checking whether freezer_test_done is true */ |
689 | freezer_test_done = false; | 693 | freezer_test_done = false; |
690 | |||
691 | Free_bitmaps: | ||
692 | free_basic_memory_bitmaps(); | ||
693 | Exit: | 694 | Exit: |
694 | pm_notifier_call_chain(PM_POST_HIBERNATION); | 695 | pm_notifier_call_chain(PM_POST_HIBERNATION); |
695 | pm_restore_console(); | 696 | pm_restore_console(); |
@@ -806,21 +807,20 @@ static int software_resume(void) | |||
806 | pm_prepare_console(); | 807 | pm_prepare_console(); |
807 | error = pm_notifier_call_chain(PM_RESTORE_PREPARE); | 808 | error = pm_notifier_call_chain(PM_RESTORE_PREPARE); |
808 | if (error) | 809 | if (error) |
809 | goto close_finish; | 810 | goto Close_Finish; |
810 | |||
811 | error = create_basic_memory_bitmaps(); | ||
812 | if (error) | ||
813 | goto close_finish; | ||
814 | 811 | ||
815 | pr_debug("PM: Preparing processes for restore.\n"); | 812 | pr_debug("PM: Preparing processes for restore.\n"); |
816 | error = freeze_processes(); | 813 | error = freeze_processes(); |
817 | if (error) { | 814 | if (error) |
818 | swsusp_close(FMODE_READ); | 815 | goto Close_Finish; |
819 | goto Done; | ||
820 | } | ||
821 | 816 | ||
822 | pr_debug("PM: Loading hibernation image.\n"); | 817 | pr_debug("PM: Loading hibernation image.\n"); |
823 | 818 | ||
819 | lock_device_hotplug(); | ||
820 | error = create_basic_memory_bitmaps(); | ||
821 | if (error) | ||
822 | goto Thaw; | ||
823 | |||
824 | error = swsusp_read(&flags); | 824 | error = swsusp_read(&flags); |
825 | swsusp_close(FMODE_READ); | 825 | swsusp_close(FMODE_READ); |
826 | if (!error) | 826 | if (!error) |
@@ -828,9 +828,10 @@ static int software_resume(void) | |||
828 | 828 | ||
829 | printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); | 829 | printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); |
830 | swsusp_free(); | 830 | swsusp_free(); |
831 | thaw_processes(); | ||
832 | Done: | ||
833 | free_basic_memory_bitmaps(); | 831 | free_basic_memory_bitmaps(); |
832 | Thaw: | ||
833 | unlock_device_hotplug(); | ||
834 | thaw_processes(); | ||
834 | Finish: | 835 | Finish: |
835 | pm_notifier_call_chain(PM_POST_RESTORE); | 836 | pm_notifier_call_chain(PM_POST_RESTORE); |
836 | pm_restore_console(); | 837 | pm_restore_console(); |
@@ -840,12 +841,12 @@ static int software_resume(void) | |||
840 | mutex_unlock(&pm_mutex); | 841 | mutex_unlock(&pm_mutex); |
841 | pr_debug("PM: Hibernation image not present or could not be loaded.\n"); | 842 | pr_debug("PM: Hibernation image not present or could not be loaded.\n"); |
842 | return error; | 843 | return error; |
843 | close_finish: | 844 | Close_Finish: |
844 | swsusp_close(FMODE_READ); | 845 | swsusp_close(FMODE_READ); |
845 | goto Finish; | 846 | goto Finish; |
846 | } | 847 | } |
847 | 848 | ||
848 | late_initcall(software_resume); | 849 | late_initcall_sync(software_resume); |
849 | 850 | ||
850 | 851 | ||
851 | static const char * const hibernation_modes[] = { | 852 | static const char * const hibernation_modes[] = { |
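
Both the hibernate() and software_resume() reorderings above follow the same rule: freeze processes first, then take the device-hotplug lock, then allocate the memory bitmaps, and unwind in exactly the reverse order on error (bitmaps freed before hotplug is unlocked, hotplug unlocked before processes are thawed). A generic, self-contained sketch of that acquire-in-order, release-in-reverse goto idiom follows; the step and label names are placeholders, not the kernel's.

#include <stdio.h>

static int step_a(void) { return 0; }   /* e.g. freeze_processes()      */
static int step_b(void) { return 0; }   /* e.g. lock_device_hotplug()   */
static int step_c(void) { return 0; }   /* e.g. create bitmaps          */

static void undo_c(void) { puts("undo c (free bitmaps)"); }
static void undo_b(void) { puts("undo b (unlock hotplug)"); }
static void undo_a(void) { puts("undo a (thaw processes)"); }

/*
 * Kernel-style error unwinding: each failure jumps to the label that
 * undoes everything acquired so far, strictly in reverse order.  Flip
 * any step_*() to return -1 to see the partial unwind.
 */
static int do_transition(void)
{
        int error;

        error = step_a();
        if (error)
                goto Exit;

        error = step_b();
        if (error)
                goto Undo_a;

        error = step_c();
        if (error)
                goto Undo_b;

        /* ... the real work would run here ... */

        undo_c();
 Undo_b:
        undo_b();
 Undo_a:
        undo_a();
 Exit:
        return error;
}

int main(void)
{
        return do_transition() ? 1 : 0;
}
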
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 349587bb03e1..98c3b34a4cff 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -352,7 +352,7 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) | |||
352 | struct mem_extent *ext, *cur, *aux; | 352 | struct mem_extent *ext, *cur, *aux; |
353 | 353 | ||
354 | zone_start = zone->zone_start_pfn; | 354 | zone_start = zone->zone_start_pfn; |
355 | zone_end = zone->zone_start_pfn + zone->spanned_pages; | 355 | zone_end = zone_end_pfn(zone); |
356 | 356 | ||
357 | list_for_each_entry(ext, list, hook) | 357 | list_for_each_entry(ext, list, hook) |
358 | if (zone_start <= ext->end) | 358 | if (zone_start <= ext->end) |
@@ -743,7 +743,10 @@ int create_basic_memory_bitmaps(void) | |||
743 | struct memory_bitmap *bm1, *bm2; | 743 | struct memory_bitmap *bm1, *bm2; |
744 | int error = 0; | 744 | int error = 0; |
745 | 745 | ||
746 | BUG_ON(forbidden_pages_map || free_pages_map); | 746 | if (forbidden_pages_map && free_pages_map) |
747 | return 0; | ||
748 | else | ||
749 | BUG_ON(forbidden_pages_map || free_pages_map); | ||
747 | 750 | ||
748 | bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); | 751 | bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); |
749 | if (!bm1) | 752 | if (!bm1) |
@@ -884,7 +887,7 @@ static unsigned int count_highmem_pages(void) | |||
884 | continue; | 887 | continue; |
885 | 888 | ||
886 | mark_free_pages(zone); | 889 | mark_free_pages(zone); |
887 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 890 | max_zone_pfn = zone_end_pfn(zone); |
888 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 891 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
889 | if (saveable_highmem_page(zone, pfn)) | 892 | if (saveable_highmem_page(zone, pfn)) |
890 | n++; | 893 | n++; |
@@ -948,7 +951,7 @@ static unsigned int count_data_pages(void) | |||
948 | continue; | 951 | continue; |
949 | 952 | ||
950 | mark_free_pages(zone); | 953 | mark_free_pages(zone); |
951 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 954 | max_zone_pfn = zone_end_pfn(zone); |
952 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 955 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
953 | if (saveable_page(zone, pfn)) | 956 | if (saveable_page(zone, pfn)) |
954 | n++; | 957 | n++; |
@@ -1041,7 +1044,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) | |||
1041 | unsigned long max_zone_pfn; | 1044 | unsigned long max_zone_pfn; |
1042 | 1045 | ||
1043 | mark_free_pages(zone); | 1046 | mark_free_pages(zone); |
1044 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 1047 | max_zone_pfn = zone_end_pfn(zone); |
1045 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 1048 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
1046 | if (page_is_saveable(zone, pfn)) | 1049 | if (page_is_saveable(zone, pfn)) |
1047 | memory_bm_set_bit(orig_bm, pfn); | 1050 | memory_bm_set_bit(orig_bm, pfn); |
@@ -1093,7 +1096,7 @@ void swsusp_free(void) | |||
1093 | unsigned long pfn, max_zone_pfn; | 1096 | unsigned long pfn, max_zone_pfn; |
1094 | 1097 | ||
1095 | for_each_populated_zone(zone) { | 1098 | for_each_populated_zone(zone) { |
1096 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 1099 | max_zone_pfn = zone_end_pfn(zone); |
1097 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 1100 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
1098 | if (pfn_valid(pfn)) { | 1101 | if (pfn_valid(pfn)) { |
1099 | struct page *page = pfn_to_page(pfn); | 1102 | struct page *page = pfn_to_page(pfn); |
@@ -1755,7 +1758,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) | |||
1755 | 1758 | ||
1756 | /* Clear page flags */ | 1759 | /* Clear page flags */ |
1757 | for_each_populated_zone(zone) { | 1760 | for_each_populated_zone(zone) { |
1758 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 1761 | max_zone_pfn = zone_end_pfn(zone); |
1759 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 1762 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
1760 | if (pfn_valid(pfn)) | 1763 | if (pfn_valid(pfn)) |
1761 | swsusp_unset_page_free(pfn_to_page(pfn)); | 1764 | swsusp_unset_page_free(pfn_to_page(pfn)); |
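
Two independent changes run through snapshot.c: the open-coded zone->zone_start_pfn + zone->spanned_pages expressions become zone_end_pfn(zone), and create_basic_memory_bitmaps() now returns 0 instead of tripping BUG_ON() when both bitmaps already exist, which the relaxed allocation ordering in hibernate.c and user.c relies on. For orientation, zone_end_pfn() is presumably just a thin wrapper over the same arithmetic, roughly as sketched here.

/*
 * Reference sketch only: the real helper lives in include/linux/mmzone.h
 * and is expected to be equivalent to the expression it replaces above.
 */
static inline unsigned long zone_end_pfn(const struct zone *zone)
{
        return zone->zone_start_pfn + zone->spanned_pages;
}
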
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ece04223bb1e..62ee437b5c7e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -210,6 +210,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
210 | goto Platform_wake; | 210 | goto Platform_wake; |
211 | } | 211 | } |
212 | 212 | ||
213 | ftrace_stop(); | ||
213 | error = disable_nonboot_cpus(); | 214 | error = disable_nonboot_cpus(); |
214 | if (error || suspend_test(TEST_CPUS)) | 215 | if (error || suspend_test(TEST_CPUS)) |
215 | goto Enable_cpus; | 216 | goto Enable_cpus; |
@@ -232,6 +233,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
232 | 233 | ||
233 | Enable_cpus: | 234 | Enable_cpus: |
234 | enable_nonboot_cpus(); | 235 | enable_nonboot_cpus(); |
236 | ftrace_start(); | ||
235 | 237 | ||
236 | Platform_wake: | 238 | Platform_wake: |
237 | if (need_suspend_ops(state) && suspend_ops->wake) | 239 | if (need_suspend_ops(state) && suspend_ops->wake) |
@@ -265,7 +267,6 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
265 | goto Close; | 267 | goto Close; |
266 | } | 268 | } |
267 | suspend_console(); | 269 | suspend_console(); |
268 | ftrace_stop(); | ||
269 | suspend_test_start(); | 270 | suspend_test_start(); |
270 | error = dpm_suspend_start(PMSG_SUSPEND); | 271 | error = dpm_suspend_start(PMSG_SUSPEND); |
271 | if (error) { | 272 | if (error) { |
@@ -285,7 +286,6 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
285 | suspend_test_start(); | 286 | suspend_test_start(); |
286 | dpm_resume_end(PMSG_RESUME); | 287 | dpm_resume_end(PMSG_RESUME); |
287 | suspend_test_finish("resume devices"); | 288 | suspend_test_finish("resume devices"); |
288 | ftrace_start(); | ||
289 | resume_console(); | 289 | resume_console(); |
290 | Close: | 290 | Close: |
291 | if (need_suspend_ops(state) && suspend_ops->end) | 291 | if (need_suspend_ops(state) && suspend_ops->end) |
diff --git a/kernel/power/user.c b/kernel/power/user.c index 4ed81e74f86f..957f06164ad1 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -39,6 +39,7 @@ static struct snapshot_data { | |||
39 | char frozen; | 39 | char frozen; |
40 | char ready; | 40 | char ready; |
41 | char platform_support; | 41 | char platform_support; |
42 | bool free_bitmaps; | ||
42 | } snapshot_state; | 43 | } snapshot_state; |
43 | 44 | ||
44 | atomic_t snapshot_device_available = ATOMIC_INIT(1); | 45 | atomic_t snapshot_device_available = ATOMIC_INIT(1); |
@@ -60,11 +61,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
60 | error = -ENOSYS; | 61 | error = -ENOSYS; |
61 | goto Unlock; | 62 | goto Unlock; |
62 | } | 63 | } |
63 | if(create_basic_memory_bitmaps()) { | ||
64 | atomic_inc(&snapshot_device_available); | ||
65 | error = -ENOMEM; | ||
66 | goto Unlock; | ||
67 | } | ||
68 | nonseekable_open(inode, filp); | 64 | nonseekable_open(inode, filp); |
69 | data = &snapshot_state; | 65 | data = &snapshot_state; |
70 | filp->private_data = data; | 66 | filp->private_data = data; |
@@ -87,13 +83,16 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
87 | data->swap = -1; | 83 | data->swap = -1; |
88 | data->mode = O_WRONLY; | 84 | data->mode = O_WRONLY; |
89 | error = pm_notifier_call_chain(PM_RESTORE_PREPARE); | 85 | error = pm_notifier_call_chain(PM_RESTORE_PREPARE); |
86 | if (!error) { | ||
87 | error = create_basic_memory_bitmaps(); | ||
88 | data->free_bitmaps = !error; | ||
89 | } | ||
90 | if (error) | 90 | if (error) |
91 | pm_notifier_call_chain(PM_POST_RESTORE); | 91 | pm_notifier_call_chain(PM_POST_RESTORE); |
92 | } | 92 | } |
93 | if (error) { | 93 | if (error) |
94 | free_basic_memory_bitmaps(); | ||
95 | atomic_inc(&snapshot_device_available); | 94 | atomic_inc(&snapshot_device_available); |
96 | } | 95 | |
97 | data->frozen = 0; | 96 | data->frozen = 0; |
98 | data->ready = 0; | 97 | data->ready = 0; |
99 | data->platform_support = 0; | 98 | data->platform_support = 0; |
@@ -111,12 +110,14 @@ static int snapshot_release(struct inode *inode, struct file *filp) | |||
111 | lock_system_sleep(); | 110 | lock_system_sleep(); |
112 | 111 | ||
113 | swsusp_free(); | 112 | swsusp_free(); |
114 | free_basic_memory_bitmaps(); | ||
115 | data = filp->private_data; | 113 | data = filp->private_data; |
116 | free_all_swap_pages(data->swap); | 114 | free_all_swap_pages(data->swap); |
117 | if (data->frozen) { | 115 | if (data->frozen) { |
118 | pm_restore_gfp_mask(); | 116 | pm_restore_gfp_mask(); |
117 | free_basic_memory_bitmaps(); | ||
119 | thaw_processes(); | 118 | thaw_processes(); |
119 | } else if (data->free_bitmaps) { | ||
120 | free_basic_memory_bitmaps(); | ||
120 | } | 121 | } |
121 | pm_notifier_call_chain(data->mode == O_RDONLY ? | 122 | pm_notifier_call_chain(data->mode == O_RDONLY ? |
122 | PM_POST_HIBERNATION : PM_POST_RESTORE); | 123 | PM_POST_HIBERNATION : PM_POST_RESTORE); |
@@ -207,6 +208,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
207 | if (!mutex_trylock(&pm_mutex)) | 208 | if (!mutex_trylock(&pm_mutex)) |
208 | return -EBUSY; | 209 | return -EBUSY; |
209 | 210 | ||
211 | lock_device_hotplug(); | ||
210 | data = filp->private_data; | 212 | data = filp->private_data; |
211 | 213 | ||
212 | switch (cmd) { | 214 | switch (cmd) { |
@@ -220,14 +222,23 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
220 | printk("done.\n"); | 222 | printk("done.\n"); |
221 | 223 | ||
222 | error = freeze_processes(); | 224 | error = freeze_processes(); |
223 | if (!error) | 225 | if (error) |
226 | break; | ||
227 | |||
228 | error = create_basic_memory_bitmaps(); | ||
229 | if (error) | ||
230 | thaw_processes(); | ||
231 | else | ||
224 | data->frozen = 1; | 232 | data->frozen = 1; |
233 | |||
225 | break; | 234 | break; |
226 | 235 | ||
227 | case SNAPSHOT_UNFREEZE: | 236 | case SNAPSHOT_UNFREEZE: |
228 | if (!data->frozen || data->ready) | 237 | if (!data->frozen || data->ready) |
229 | break; | 238 | break; |
230 | pm_restore_gfp_mask(); | 239 | pm_restore_gfp_mask(); |
240 | free_basic_memory_bitmaps(); | ||
241 | data->free_bitmaps = false; | ||
231 | thaw_processes(); | 242 | thaw_processes(); |
232 | data->frozen = 0; | 243 | data->frozen = 0; |
233 | break; | 244 | break; |
@@ -371,6 +382,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
371 | 382 | ||
372 | } | 383 | } |
373 | 384 | ||
385 | unlock_device_hotplug(); | ||
374 | mutex_unlock(&pm_mutex); | 386 | mutex_unlock(&pm_mutex); |
375 | 387 | ||
376 | return error; | 388 | return error; |
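
In the /dev/snapshot path the bitmaps are no longer allocated unconditionally in snapshot_open(): the SNAPSHOT_FREEZE ioctl allocates them after freeze_processes() succeeds, the restore-prepare branch allocates them and records that in the new data->free_bitmaps flag, and snapshot_release() frees them exactly once depending on which flag is set. A condensed userspace model of that ownership logic, with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

struct snap_state {
        bool frozen;            /* set by the FREEZE ioctl path    */
        bool free_bitmaps;      /* set by the restore-prepare path */
};

static void free_bitmaps(void) { puts("free_basic_memory_bitmaps()"); }
static void thaw(void)         { puts("thaw_processes()"); }

/*
 * Release-time rule modelled on snapshot_release(): whichever path
 * allocated the bitmaps frees them, and never twice.
 */
static void snap_release(struct snap_state *s)
{
        if (s->frozen) {
                free_bitmaps();
                thaw();
        } else if (s->free_bitmaps) {
                free_bitmaps();
        }
}

int main(void)
{
        struct snap_state restore = { .frozen = false, .free_bitmaps = true };

        snap_release(&restore);
        return 0;
}
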
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5b5a7080e2a5..b4e8500afdb3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -2226,6 +2226,13 @@ void register_console(struct console *newcon) | |||
2226 | struct console *bcon = NULL; | 2226 | struct console *bcon = NULL; |
2227 | struct console_cmdline *c; | 2227 | struct console_cmdline *c; |
2228 | 2228 | ||
2229 | if (console_drivers) | ||
2230 | for_each_console(bcon) | ||
2231 | if (WARN(bcon == newcon, | ||
2232 | "console '%s%d' already registered\n", | ||
2233 | bcon->name, bcon->index)) | ||
2234 | return; | ||
2235 | |||
2229 | /* | 2236 | /* |
2230 | * before we register a new CON_BOOT console, make sure we don't | 2237 | * before we register a new CON_BOOT console, make sure we don't |
2231 | * already have a valid console | 2238 | * already have a valid console |
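
register_console() gains an up-front scan of the existing console list and WARNs, then bails out, if the same struct console is registered twice. A self-contained userspace sketch of the same duplicate-registration guard; the list handling and return type here are illustrative, not the kernel's console code.

#include <stdbool.h>
#include <stdio.h>

struct console {
        const char *name;
        int index;
        struct console *next;
};

static struct console *console_drivers;

/*
 * Reject a second registration of the exact same object, mirroring the
 * WARN-and-return that register_console() gains above.
 */
static bool console_register_once(struct console *newcon)
{
        struct console *con;

        for (con = console_drivers; con; con = con->next) {
                if (con == newcon) {
                        fprintf(stderr, "console '%s%d' already registered\n",
                                con->name, con->index);
                        return false;
                }
        }
        newcon->next = console_drivers;
        console_drivers = newcon;
        return true;
}

int main(void)
{
        struct console ttyS0 = { .name = "ttyS", .index = 0 };

        console_register_once(&ttyS0);
        console_register_once(&ttyS0);  /* rejected */
        return 0;
}
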
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a146ee327f6a..dd562e9aa2c8 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -236,7 +236,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) | |||
236 | */ | 236 | */ |
237 | int dumpable = 0; | 237 | int dumpable = 0; |
238 | /* Don't let security modules deny introspection */ | 238 | /* Don't let security modules deny introspection */ |
239 | if (task == current) | 239 | if (same_thread_group(task, current)) |
240 | return 0; | 240 | return 0; |
241 | rcu_read_lock(); | 241 | rcu_read_lock(); |
242 | tcred = __task_cred(task); | 242 | tcred = __task_cred(task); |
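
__ptrace_may_access() now waives the credential and LSM checks for any task in the caller's own thread group rather than only for current, since sibling threads already share everything introspection could reveal. Conceptually, same_thread_group() tests whether two tasks are threads of the same process; one plausible way to express that is sketched below, but the kernel's actual helper in include/linux/sched.h may be implemented differently.

/*
 * Illustrative only: "same thread group" means the two tasks are
 * threads of one process, e.g. they share the signal descriptor.
 */
static inline bool same_thread_group_sketch(struct task_struct *p1,
                                            struct task_struct *p2)
{
        return p1->signal == p2->signal;
}
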
diff --git a/kernel/rcu.h b/kernel/rcu.h index 7f8e7590e3e5..77131966c4ad 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h | |||
@@ -67,12 +67,15 @@ | |||
67 | 67 | ||
68 | extern struct debug_obj_descr rcuhead_debug_descr; | 68 | extern struct debug_obj_descr rcuhead_debug_descr; |
69 | 69 | ||
70 | static inline void debug_rcu_head_queue(struct rcu_head *head) | 70 | static inline int debug_rcu_head_queue(struct rcu_head *head) |
71 | { | 71 | { |
72 | debug_object_activate(head, &rcuhead_debug_descr); | 72 | int r1; |
73 | |||
74 | r1 = debug_object_activate(head, &rcuhead_debug_descr); | ||
73 | debug_object_active_state(head, &rcuhead_debug_descr, | 75 | debug_object_active_state(head, &rcuhead_debug_descr, |
74 | STATE_RCU_HEAD_READY, | 76 | STATE_RCU_HEAD_READY, |
75 | STATE_RCU_HEAD_QUEUED); | 77 | STATE_RCU_HEAD_QUEUED); |
78 | return r1; | ||
76 | } | 79 | } |
77 | 80 | ||
78 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) | 81 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) |
@@ -83,8 +86,9 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) | |||
83 | debug_object_deactivate(head, &rcuhead_debug_descr); | 86 | debug_object_deactivate(head, &rcuhead_debug_descr); |
84 | } | 87 | } |
85 | #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 88 | #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
86 | static inline void debug_rcu_head_queue(struct rcu_head *head) | 89 | static inline int debug_rcu_head_queue(struct rcu_head *head) |
87 | { | 90 | { |
91 | return 0; | ||
88 | } | 92 | } |
89 | 93 | ||
90 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) | 94 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) |
@@ -94,7 +98,7 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) | |||
94 | 98 | ||
95 | extern void kfree(const void *); | 99 | extern void kfree(const void *); |
96 | 100 | ||
97 | static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) | 101 | static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) |
98 | { | 102 | { |
99 | unsigned long offset = (unsigned long)head->func; | 103 | unsigned long offset = (unsigned long)head->func; |
100 | 104 | ||
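
debug_rcu_head_queue() changes from void to int so it can report what debug_object_activate() saw; a caller can then treat an already-active rcu_head as a probable double call_rcu() and drop it instead of corrupting the callback list, which is exactly the condition the new rcutorture object_debug test at the end of this patch tries to trigger. A hedged sketch of such a caller; my_call_rcu() and its comments are illustrative, not the tree.c implementation.

/*
 * Sketch, not the tree.c code: if debug-objects reports the rcu_head as
 * already queued, treat it as a probable double call_rcu() and leak the
 * callback rather than corrupting the callback list.
 */
static void my_call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *))
{
        if (debug_rcu_head_queue(head)) {
                WARN_ONCE(1, "probable double call_rcu(), leaking callback\n");
                return;
        }
        head->func = func;
        /* ... link head onto the per-CPU callback list ... */
}
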
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index cce6ba8bbace..b02a339836b4 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -122,7 +122,7 @@ struct lockdep_map rcu_sched_lock_map = | |||
122 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); | 122 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); |
123 | EXPORT_SYMBOL_GPL(rcu_sched_lock_map); | 123 | EXPORT_SYMBOL_GPL(rcu_sched_lock_map); |
124 | 124 | ||
125 | int debug_lockdep_rcu_enabled(void) | 125 | int notrace debug_lockdep_rcu_enabled(void) |
126 | { | 126 | { |
127 | return rcu_scheduler_active && debug_locks && | 127 | return rcu_scheduler_active && debug_locks && |
128 | current->lockdep_recursion == 0; | 128 | current->lockdep_recursion == 0; |
@@ -212,43 +212,6 @@ static inline void debug_rcu_head_free(struct rcu_head *head) | |||
212 | } | 212 | } |
213 | 213 | ||
214 | /* | 214 | /* |
215 | * fixup_init is called when: | ||
216 | * - an active object is initialized | ||
217 | */ | ||
218 | static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) | ||
219 | { | ||
220 | struct rcu_head *head = addr; | ||
221 | |||
222 | switch (state) { | ||
223 | case ODEBUG_STATE_ACTIVE: | ||
224 | /* | ||
225 | * Ensure that queued callbacks are all executed. | ||
226 | * If we detect that we are nested in a RCU read-side critical | ||
227 | * section, we should simply fail, otherwise we would deadlock. | ||
228 | * In !PREEMPT configurations, there is no way to tell if we are | ||
229 | * in a RCU read-side critical section or not, so we never | ||
230 | * attempt any fixup and just print a warning. | ||
231 | */ | ||
232 | #ifndef CONFIG_PREEMPT | ||
233 | WARN_ON_ONCE(1); | ||
234 | return 0; | ||
235 | #endif | ||
236 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
237 | irqs_disabled()) { | ||
238 | WARN_ON_ONCE(1); | ||
239 | return 0; | ||
240 | } | ||
241 | rcu_barrier(); | ||
242 | rcu_barrier_sched(); | ||
243 | rcu_barrier_bh(); | ||
244 | debug_object_init(head, &rcuhead_debug_descr); | ||
245 | return 1; | ||
246 | default: | ||
247 | return 0; | ||
248 | } | ||
249 | } | ||
250 | |||
251 | /* | ||
252 | * fixup_activate is called when: | 215 | * fixup_activate is called when: |
253 | * - an active object is activated | 216 | * - an active object is activated |
254 | * - an unknown object is activated (might be a statically initialized object) | 217 | * - an unknown object is activated (might be a statically initialized object) |
@@ -268,69 +231,8 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) | |||
268 | debug_object_init(head, &rcuhead_debug_descr); | 231 | debug_object_init(head, &rcuhead_debug_descr); |
269 | debug_object_activate(head, &rcuhead_debug_descr); | 232 | debug_object_activate(head, &rcuhead_debug_descr); |
270 | return 0; | 233 | return 0; |
271 | |||
272 | case ODEBUG_STATE_ACTIVE: | ||
273 | /* | ||
274 | * Ensure that queued callbacks are all executed. | ||
275 | * If we detect that we are nested in a RCU read-side critical | ||
276 | * section, we should simply fail, otherwise we would deadlock. | ||
277 | * In !PREEMPT configurations, there is no way to tell if we are | ||
278 | * in a RCU read-side critical section or not, so we never | ||
279 | * attempt any fixup and just print a warning. | ||
280 | */ | ||
281 | #ifndef CONFIG_PREEMPT | ||
282 | WARN_ON_ONCE(1); | ||
283 | return 0; | ||
284 | #endif | ||
285 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
286 | irqs_disabled()) { | ||
287 | WARN_ON_ONCE(1); | ||
288 | return 0; | ||
289 | } | ||
290 | rcu_barrier(); | ||
291 | rcu_barrier_sched(); | ||
292 | rcu_barrier_bh(); | ||
293 | debug_object_activate(head, &rcuhead_debug_descr); | ||
294 | return 1; | ||
295 | default: | 234 | default: |
296 | return 0; | ||
297 | } | ||
298 | } | ||
299 | |||
300 | /* | ||
301 | * fixup_free is called when: | ||
302 | * - an active object is freed | ||
303 | */ | ||
304 | static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) | ||
305 | { | ||
306 | struct rcu_head *head = addr; | ||
307 | |||
308 | switch (state) { | ||
309 | case ODEBUG_STATE_ACTIVE: | ||
310 | /* | ||
311 | * Ensure that queued callbacks are all executed. | ||
312 | * If we detect that we are nested in a RCU read-side critical | ||
313 | * section, we should simply fail, otherwise we would deadlock. | ||
314 | * In !PREEMPT configurations, there is no way to tell if we are | ||
315 | * in a RCU read-side critical section or not, so we never | ||
316 | * attempt any fixup and just print a warning. | ||
317 | */ | ||
318 | #ifndef CONFIG_PREEMPT | ||
319 | WARN_ON_ONCE(1); | ||
320 | return 0; | ||
321 | #endif | ||
322 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
323 | irqs_disabled()) { | ||
324 | WARN_ON_ONCE(1); | ||
325 | return 0; | ||
326 | } | ||
327 | rcu_barrier(); | ||
328 | rcu_barrier_sched(); | ||
329 | rcu_barrier_bh(); | ||
330 | debug_object_free(head, &rcuhead_debug_descr); | ||
331 | return 1; | 235 | return 1; |
332 | default: | ||
333 | return 0; | ||
334 | } | 236 | } |
335 | } | 237 | } |
336 | 238 | ||
@@ -369,15 +271,13 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); | |||
369 | 271 | ||
370 | struct debug_obj_descr rcuhead_debug_descr = { | 272 | struct debug_obj_descr rcuhead_debug_descr = { |
371 | .name = "rcu_head", | 273 | .name = "rcu_head", |
372 | .fixup_init = rcuhead_fixup_init, | ||
373 | .fixup_activate = rcuhead_fixup_activate, | 274 | .fixup_activate = rcuhead_fixup_activate, |
374 | .fixup_free = rcuhead_fixup_free, | ||
375 | }; | 275 | }; |
376 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); | 276 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); |
377 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 277 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
378 | 278 | ||
379 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) | 279 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) |
380 | void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp, | 280 | void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, |
381 | unsigned long secs, | 281 | unsigned long secs, |
382 | unsigned long c_old, unsigned long c) | 282 | unsigned long c_old, unsigned long c) |
383 | { | 283 | { |
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index aa344111de3e..9ed6075dc562 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -264,7 +264,7 @@ void rcu_check_callbacks(int cpu, int user) | |||
264 | */ | 264 | */ |
265 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | 265 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) |
266 | { | 266 | { |
267 | char *rn = NULL; | 267 | const char *rn = NULL; |
268 | struct rcu_head *next, *list; | 268 | struct rcu_head *next, *list; |
269 | unsigned long flags; | 269 | unsigned long flags; |
270 | RCU_TRACE(int cb_count = 0); | 270 | RCU_TRACE(int cb_count = 0); |
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 0cd385acccfa..280d06cae352 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -36,7 +36,7 @@ struct rcu_ctrlblk { | |||
36 | RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ | 36 | RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ |
37 | RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ | 37 | RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ |
38 | RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ | 38 | RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ |
39 | RCU_TRACE(char *name); /* Name of RCU type. */ | 39 | RCU_TRACE(const char *name); /* Name of RCU type. */ |
40 | }; | 40 | }; |
41 | 41 | ||
42 | /* Definition for rcupdate control block. */ | 42 | /* Definition for rcupdate control block. */ |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index f4871e52c546..be63101c6175 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -52,72 +52,78 @@ | |||
52 | MODULE_LICENSE("GPL"); | 52 | MODULE_LICENSE("GPL"); |
53 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); | 53 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); |
54 | 54 | ||
55 | static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ | 55 | static int fqs_duration; |
56 | static int nfakewriters = 4; /* # fake writer threads */ | ||
57 | static int stat_interval = 60; /* Interval between stats, in seconds. */ | ||
58 | /* Zero means "only at end of test". */ | ||
59 | static bool verbose; /* Print more debug info. */ | ||
60 | static bool test_no_idle_hz = true; | ||
61 | /* Test RCU support for tickless idle CPUs. */ | ||
62 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ | ||
63 | static int stutter = 5; /* Start/stop testing interval (in sec) */ | ||
64 | static int irqreader = 1; /* RCU readers from irq (timers). */ | ||
65 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ | ||
66 | static int fqs_holdoff; /* Hold time within burst (us). */ | ||
67 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | ||
68 | static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ | ||
69 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ | ||
70 | static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ | ||
71 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ | ||
72 | static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */ | ||
73 | static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */ | ||
74 | static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ | ||
75 | static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ | ||
76 | static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ | ||
77 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ | ||
78 | |||
79 | module_param(nreaders, int, 0444); | ||
80 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | ||
81 | module_param(nfakewriters, int, 0444); | ||
82 | MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); | ||
83 | module_param(stat_interval, int, 0644); | ||
84 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | ||
85 | module_param(verbose, bool, 0444); | ||
86 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | ||
87 | module_param(test_no_idle_hz, bool, 0444); | ||
88 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | ||
89 | module_param(shuffle_interval, int, 0444); | ||
90 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | ||
91 | module_param(stutter, int, 0444); | ||
92 | MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); | ||
93 | module_param(irqreader, int, 0444); | ||
94 | MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); | ||
95 | module_param(fqs_duration, int, 0444); | 56 | module_param(fqs_duration, int, 0444); |
96 | MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); | 57 | MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); |
58 | static int fqs_holdoff; | ||
97 | module_param(fqs_holdoff, int, 0444); | 59 | module_param(fqs_holdoff, int, 0444); |
98 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | 60 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
61 | static int fqs_stutter = 3; | ||
99 | module_param(fqs_stutter, int, 0444); | 62 | module_param(fqs_stutter, int, 0444); |
100 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 63 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
64 | static bool gp_exp; | ||
65 | module_param(gp_exp, bool, 0444); | ||
66 | MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); | ||
67 | static bool gp_normal; | ||
68 | module_param(gp_normal, bool, 0444); | ||
69 | MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); | ||
70 | static int irqreader = 1; | ||
71 | module_param(irqreader, int, 0444); | ||
72 | MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); | ||
73 | static int n_barrier_cbs; | ||
101 | module_param(n_barrier_cbs, int, 0444); | 74 | module_param(n_barrier_cbs, int, 0444); |
102 | MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); | 75 | MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); |
103 | module_param(onoff_interval, int, 0444); | 76 | static int nfakewriters = 4; |
104 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | 77 | module_param(nfakewriters, int, 0444); |
78 | MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); | ||
79 | static int nreaders = -1; | ||
80 | module_param(nreaders, int, 0444); | ||
81 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | ||
82 | static int object_debug; | ||
83 | module_param(object_debug, int, 0444); | ||
84 | MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); | ||
85 | static int onoff_holdoff; | ||
105 | module_param(onoff_holdoff, int, 0444); | 86 | module_param(onoff_holdoff, int, 0444); |
106 | MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); | 87 | MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); |
88 | static int onoff_interval; | ||
89 | module_param(onoff_interval, int, 0444); | ||
90 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | ||
91 | static int shuffle_interval = 3; | ||
92 | module_param(shuffle_interval, int, 0444); | ||
93 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | ||
94 | static int shutdown_secs; | ||
107 | module_param(shutdown_secs, int, 0444); | 95 | module_param(shutdown_secs, int, 0444); |
108 | MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); | 96 | MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); |
97 | static int stall_cpu; | ||
109 | module_param(stall_cpu, int, 0444); | 98 | module_param(stall_cpu, int, 0444); |
110 | MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); | 99 | MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); |
100 | static int stall_cpu_holdoff = 10; | ||
111 | module_param(stall_cpu_holdoff, int, 0444); | 101 | module_param(stall_cpu_holdoff, int, 0444); |
112 | MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); | 102 | MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); |
103 | static int stat_interval = 60; | ||
104 | module_param(stat_interval, int, 0644); | ||
105 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | ||
106 | static int stutter = 5; | ||
107 | module_param(stutter, int, 0444); | ||
108 | MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); | ||
109 | static int test_boost = 1; | ||
113 | module_param(test_boost, int, 0444); | 110 | module_param(test_boost, int, 0444); |
114 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); | 111 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); |
115 | module_param(test_boost_interval, int, 0444); | 112 | static int test_boost_duration = 4; |
116 | MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); | ||
117 | module_param(test_boost_duration, int, 0444); | 113 | module_param(test_boost_duration, int, 0444); |
118 | MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); | 114 | MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); |
115 | static int test_boost_interval = 7; | ||
116 | module_param(test_boost_interval, int, 0444); | ||
117 | MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); | ||
118 | static bool test_no_idle_hz = true; | ||
119 | module_param(test_no_idle_hz, bool, 0444); | ||
120 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | ||
121 | static char *torture_type = "rcu"; | ||
119 | module_param(torture_type, charp, 0444); | 122 | module_param(torture_type, charp, 0444); |
120 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); | 123 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); |
124 | static bool verbose; | ||
125 | module_param(verbose, bool, 0444); | ||
126 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | ||
121 | 127 | ||
122 | #define TORTURE_FLAG "-torture:" | 128 | #define TORTURE_FLAG "-torture:" |
123 | #define PRINTK_STRING(s) \ | 129 | #define PRINTK_STRING(s) \ |
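
The rcutorture parameter block above is reorganized so each variable sits next to its module_param()/MODULE_PARM_DESC() lines, sorted alphabetically, and gains the new gp_exp, gp_normal and object_debug knobs. The declaration pattern itself is the ordinary one for any loadable module; a minimal, self-contained example with hypothetical module and parameter names:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>

static int demo_interval = 3;
module_param(demo_interval, int, 0444);
MODULE_PARM_DESC(demo_interval, "Seconds between demo events");

static bool demo_verbose;
module_param(demo_verbose, bool, 0444);
MODULE_PARM_DESC(demo_verbose, "Enable verbose demo output");

static int __init demo_init(void)
{
        pr_info("demo: interval=%d verbose=%d\n",
                demo_interval, demo_verbose);
        return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
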
@@ -267,7 +273,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, | |||
267 | * Absorb kthreads into a kernel function that won't return, so that | 273 | * Absorb kthreads into a kernel function that won't return, so that |
268 | * they won't ever access module text or data again. | 274 | * they won't ever access module text or data again. |
269 | */ | 275 | */ |
270 | static void rcutorture_shutdown_absorb(char *title) | 276 | static void rcutorture_shutdown_absorb(const char *title) |
271 | { | 277 | { |
272 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { | 278 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { |
273 | pr_notice( | 279 | pr_notice( |
@@ -337,7 +343,7 @@ rcu_random(struct rcu_random_state *rrsp) | |||
337 | } | 343 | } |
338 | 344 | ||
339 | static void | 345 | static void |
340 | rcu_stutter_wait(char *title) | 346 | rcu_stutter_wait(const char *title) |
341 | { | 347 | { |
342 | while (stutter_pause_test || !rcutorture_runnable) { | 348 | while (stutter_pause_test || !rcutorture_runnable) { |
343 | if (rcutorture_runnable) | 349 | if (rcutorture_runnable) |
@@ -360,13 +366,14 @@ struct rcu_torture_ops { | |||
360 | int (*completed)(void); | 366 | int (*completed)(void); |
361 | void (*deferred_free)(struct rcu_torture *p); | 367 | void (*deferred_free)(struct rcu_torture *p); |
362 | void (*sync)(void); | 368 | void (*sync)(void); |
369 | void (*exp_sync)(void); | ||
363 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 370 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
364 | void (*cb_barrier)(void); | 371 | void (*cb_barrier)(void); |
365 | void (*fqs)(void); | 372 | void (*fqs)(void); |
366 | int (*stats)(char *page); | 373 | int (*stats)(char *page); |
367 | int irq_capable; | 374 | int irq_capable; |
368 | int can_boost; | 375 | int can_boost; |
369 | char *name; | 376 | const char *name; |
370 | }; | 377 | }; |
371 | 378 | ||
372 | static struct rcu_torture_ops *cur_ops; | 379 | static struct rcu_torture_ops *cur_ops; |
@@ -443,81 +450,27 @@ static void rcu_torture_deferred_free(struct rcu_torture *p) | |||
443 | call_rcu(&p->rtort_rcu, rcu_torture_cb); | 450 | call_rcu(&p->rtort_rcu, rcu_torture_cb); |
444 | } | 451 | } |
445 | 452 | ||
446 | static struct rcu_torture_ops rcu_ops = { | ||
447 | .init = NULL, | ||
448 | .readlock = rcu_torture_read_lock, | ||
449 | .read_delay = rcu_read_delay, | ||
450 | .readunlock = rcu_torture_read_unlock, | ||
451 | .completed = rcu_torture_completed, | ||
452 | .deferred_free = rcu_torture_deferred_free, | ||
453 | .sync = synchronize_rcu, | ||
454 | .call = call_rcu, | ||
455 | .cb_barrier = rcu_barrier, | ||
456 | .fqs = rcu_force_quiescent_state, | ||
457 | .stats = NULL, | ||
458 | .irq_capable = 1, | ||
459 | .can_boost = rcu_can_boost(), | ||
460 | .name = "rcu" | ||
461 | }; | ||
462 | |||
463 | static void rcu_sync_torture_deferred_free(struct rcu_torture *p) | ||
464 | { | ||
465 | int i; | ||
466 | struct rcu_torture *rp; | ||
467 | struct rcu_torture *rp1; | ||
468 | |||
469 | cur_ops->sync(); | ||
470 | list_add(&p->rtort_free, &rcu_torture_removed); | ||
471 | list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { | ||
472 | i = rp->rtort_pipe_count; | ||
473 | if (i > RCU_TORTURE_PIPE_LEN) | ||
474 | i = RCU_TORTURE_PIPE_LEN; | ||
475 | atomic_inc(&rcu_torture_wcount[i]); | ||
476 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
477 | rp->rtort_mbtest = 0; | ||
478 | list_del(&rp->rtort_free); | ||
479 | rcu_torture_free(rp); | ||
480 | } | ||
481 | } | ||
482 | } | ||
483 | |||
484 | static void rcu_sync_torture_init(void) | 453 | static void rcu_sync_torture_init(void) |
485 | { | 454 | { |
486 | INIT_LIST_HEAD(&rcu_torture_removed); | 455 | INIT_LIST_HEAD(&rcu_torture_removed); |
487 | } | 456 | } |
488 | 457 | ||
489 | static struct rcu_torture_ops rcu_sync_ops = { | 458 | static struct rcu_torture_ops rcu_ops = { |
490 | .init = rcu_sync_torture_init, | 459 | .init = rcu_sync_torture_init, |
491 | .readlock = rcu_torture_read_lock, | 460 | .readlock = rcu_torture_read_lock, |
492 | .read_delay = rcu_read_delay, | 461 | .read_delay = rcu_read_delay, |
493 | .readunlock = rcu_torture_read_unlock, | 462 | .readunlock = rcu_torture_read_unlock, |
494 | .completed = rcu_torture_completed, | 463 | .completed = rcu_torture_completed, |
495 | .deferred_free = rcu_sync_torture_deferred_free, | 464 | .deferred_free = rcu_torture_deferred_free, |
496 | .sync = synchronize_rcu, | 465 | .sync = synchronize_rcu, |
497 | .call = NULL, | 466 | .exp_sync = synchronize_rcu_expedited, |
498 | .cb_barrier = NULL, | 467 | .call = call_rcu, |
499 | .fqs = rcu_force_quiescent_state, | 468 | .cb_barrier = rcu_barrier, |
500 | .stats = NULL, | ||
501 | .irq_capable = 1, | ||
502 | .can_boost = rcu_can_boost(), | ||
503 | .name = "rcu_sync" | ||
504 | }; | ||
505 | |||
506 | static struct rcu_torture_ops rcu_expedited_ops = { | ||
507 | .init = rcu_sync_torture_init, | ||
508 | .readlock = rcu_torture_read_lock, | ||
509 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
510 | .readunlock = rcu_torture_read_unlock, | ||
511 | .completed = rcu_no_completed, | ||
512 | .deferred_free = rcu_sync_torture_deferred_free, | ||
513 | .sync = synchronize_rcu_expedited, | ||
514 | .call = NULL, | ||
515 | .cb_barrier = NULL, | ||
516 | .fqs = rcu_force_quiescent_state, | 469 | .fqs = rcu_force_quiescent_state, |
517 | .stats = NULL, | 470 | .stats = NULL, |
518 | .irq_capable = 1, | 471 | .irq_capable = 1, |
519 | .can_boost = rcu_can_boost(), | 472 | .can_boost = rcu_can_boost(), |
520 | .name = "rcu_expedited" | 473 | .name = "rcu" |
521 | }; | 474 | }; |
522 | 475 | ||
523 | /* | 476 | /* |
@@ -546,13 +499,14 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | |||
546 | } | 499 | } |
547 | 500 | ||
548 | static struct rcu_torture_ops rcu_bh_ops = { | 501 | static struct rcu_torture_ops rcu_bh_ops = { |
549 | .init = NULL, | 502 | .init = rcu_sync_torture_init, |
550 | .readlock = rcu_bh_torture_read_lock, | 503 | .readlock = rcu_bh_torture_read_lock, |
551 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 504 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
552 | .readunlock = rcu_bh_torture_read_unlock, | 505 | .readunlock = rcu_bh_torture_read_unlock, |
553 | .completed = rcu_bh_torture_completed, | 506 | .completed = rcu_bh_torture_completed, |
554 | .deferred_free = rcu_bh_torture_deferred_free, | 507 | .deferred_free = rcu_bh_torture_deferred_free, |
555 | .sync = synchronize_rcu_bh, | 508 | .sync = synchronize_rcu_bh, |
509 | .exp_sync = synchronize_rcu_bh_expedited, | ||
556 | .call = call_rcu_bh, | 510 | .call = call_rcu_bh, |
557 | .cb_barrier = rcu_barrier_bh, | 511 | .cb_barrier = rcu_barrier_bh, |
558 | .fqs = rcu_bh_force_quiescent_state, | 512 | .fqs = rcu_bh_force_quiescent_state, |
@@ -561,38 +515,6 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
561 | .name = "rcu_bh" | 515 | .name = "rcu_bh" |
562 | }; | 516 | }; |
563 | 517 | ||
564 | static struct rcu_torture_ops rcu_bh_sync_ops = { | ||
565 | .init = rcu_sync_torture_init, | ||
566 | .readlock = rcu_bh_torture_read_lock, | ||
567 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
568 | .readunlock = rcu_bh_torture_read_unlock, | ||
569 | .completed = rcu_bh_torture_completed, | ||
570 | .deferred_free = rcu_sync_torture_deferred_free, | ||
571 | .sync = synchronize_rcu_bh, | ||
572 | .call = NULL, | ||
573 | .cb_barrier = NULL, | ||
574 | .fqs = rcu_bh_force_quiescent_state, | ||
575 | .stats = NULL, | ||
576 | .irq_capable = 1, | ||
577 | .name = "rcu_bh_sync" | ||
578 | }; | ||
579 | |||
580 | static struct rcu_torture_ops rcu_bh_expedited_ops = { | ||
581 | .init = rcu_sync_torture_init, | ||
582 | .readlock = rcu_bh_torture_read_lock, | ||
583 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
584 | .readunlock = rcu_bh_torture_read_unlock, | ||
585 | .completed = rcu_bh_torture_completed, | ||
586 | .deferred_free = rcu_sync_torture_deferred_free, | ||
587 | .sync = synchronize_rcu_bh_expedited, | ||
588 | .call = NULL, | ||
589 | .cb_barrier = NULL, | ||
590 | .fqs = rcu_bh_force_quiescent_state, | ||
591 | .stats = NULL, | ||
592 | .irq_capable = 1, | ||
593 | .name = "rcu_bh_expedited" | ||
594 | }; | ||
595 | |||
596 | /* | 518 | /* |
597 | * Definitions for srcu torture testing. | 519 | * Definitions for srcu torture testing. |
598 | */ | 520 | */ |
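
The per-flavor *_sync and *_expedited rcu_torture_ops tables are deleted here and below (rcu, rcu_bh, and further down srcu and sched); instead each base ops table carries both a .sync and a new .exp_sync callback, and the torture kthreads choose between them at run time. A small runnable sketch of that "one ops table, runtime selection" shape, with made-up names:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* One ops table carrying both grace-period flavors. */
struct gp_ops {
        const char *name;
        void (*sync)(void);             /* normal grace-period wait    */
        void (*exp_sync)(void);         /* expedited grace-period wait */
};

static void normal_gp(void)    { puts("synchronize()"); }
static void expedited_gp(void) { puts("synchronize_expedited()"); }

static const struct gp_ops demo_ops = {
        .name     = "demo",
        .sync     = normal_gp,
        .exp_sync = expedited_gp,
};

/* The runtime choice replaces the old per-flavor ops tables. */
static void wait_for_gp(const struct gp_ops *ops, bool want_expedited)
{
        if (want_expedited)
                ops->exp_sync();
        else
                ops->sync();
}

int main(void)
{
        wait_for_gp(&demo_ops, rand() & 1);
        return 0;
}
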
@@ -667,6 +589,11 @@ static int srcu_torture_stats(char *page) | |||
667 | return cnt; | 589 | return cnt; |
668 | } | 590 | } |
669 | 591 | ||
592 | static void srcu_torture_synchronize_expedited(void) | ||
593 | { | ||
594 | synchronize_srcu_expedited(&srcu_ctl); | ||
595 | } | ||
596 | |||
670 | static struct rcu_torture_ops srcu_ops = { | 597 | static struct rcu_torture_ops srcu_ops = { |
671 | .init = rcu_sync_torture_init, | 598 | .init = rcu_sync_torture_init, |
672 | .readlock = srcu_torture_read_lock, | 599 | .readlock = srcu_torture_read_lock, |
@@ -675,45 +602,13 @@ static struct rcu_torture_ops srcu_ops = { | |||
675 | .completed = srcu_torture_completed, | 602 | .completed = srcu_torture_completed, |
676 | .deferred_free = srcu_torture_deferred_free, | 603 | .deferred_free = srcu_torture_deferred_free, |
677 | .sync = srcu_torture_synchronize, | 604 | .sync = srcu_torture_synchronize, |
605 | .exp_sync = srcu_torture_synchronize_expedited, | ||
678 | .call = srcu_torture_call, | 606 | .call = srcu_torture_call, |
679 | .cb_barrier = srcu_torture_barrier, | 607 | .cb_barrier = srcu_torture_barrier, |
680 | .stats = srcu_torture_stats, | 608 | .stats = srcu_torture_stats, |
681 | .name = "srcu" | 609 | .name = "srcu" |
682 | }; | 610 | }; |
683 | 611 | ||
684 | static struct rcu_torture_ops srcu_sync_ops = { | ||
685 | .init = rcu_sync_torture_init, | ||
686 | .readlock = srcu_torture_read_lock, | ||
687 | .read_delay = srcu_read_delay, | ||
688 | .readunlock = srcu_torture_read_unlock, | ||
689 | .completed = srcu_torture_completed, | ||
690 | .deferred_free = rcu_sync_torture_deferred_free, | ||
691 | .sync = srcu_torture_synchronize, | ||
692 | .call = NULL, | ||
693 | .cb_barrier = NULL, | ||
694 | .stats = srcu_torture_stats, | ||
695 | .name = "srcu_sync" | ||
696 | }; | ||
697 | |||
698 | static void srcu_torture_synchronize_expedited(void) | ||
699 | { | ||
700 | synchronize_srcu_expedited(&srcu_ctl); | ||
701 | } | ||
702 | |||
703 | static struct rcu_torture_ops srcu_expedited_ops = { | ||
704 | .init = rcu_sync_torture_init, | ||
705 | .readlock = srcu_torture_read_lock, | ||
706 | .read_delay = srcu_read_delay, | ||
707 | .readunlock = srcu_torture_read_unlock, | ||
708 | .completed = srcu_torture_completed, | ||
709 | .deferred_free = rcu_sync_torture_deferred_free, | ||
710 | .sync = srcu_torture_synchronize_expedited, | ||
711 | .call = NULL, | ||
712 | .cb_barrier = NULL, | ||
713 | .stats = srcu_torture_stats, | ||
714 | .name = "srcu_expedited" | ||
715 | }; | ||
716 | |||
717 | /* | 612 | /* |
718 | * Definitions for sched torture testing. | 613 | * Definitions for sched torture testing. |
719 | */ | 614 | */ |
@@ -742,6 +637,8 @@ static struct rcu_torture_ops sched_ops = { | |||
742 | .completed = rcu_no_completed, | 637 | .completed = rcu_no_completed, |
743 | .deferred_free = rcu_sched_torture_deferred_free, | 638 | .deferred_free = rcu_sched_torture_deferred_free, |
744 | .sync = synchronize_sched, | 639 | .sync = synchronize_sched, |
640 | .exp_sync = synchronize_sched_expedited, | ||
641 | .call = call_rcu_sched, | ||
745 | .cb_barrier = rcu_barrier_sched, | 642 | .cb_barrier = rcu_barrier_sched, |
746 | .fqs = rcu_sched_force_quiescent_state, | 643 | .fqs = rcu_sched_force_quiescent_state, |
747 | .stats = NULL, | 644 | .stats = NULL, |
@@ -749,35 +646,6 @@ static struct rcu_torture_ops sched_ops = { | |||
749 | .name = "sched" | 646 | .name = "sched" |
750 | }; | 647 | }; |
751 | 648 | ||
752 | static struct rcu_torture_ops sched_sync_ops = { | ||
753 | .init = rcu_sync_torture_init, | ||
754 | .readlock = sched_torture_read_lock, | ||
755 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
756 | .readunlock = sched_torture_read_unlock, | ||
757 | .completed = rcu_no_completed, | ||
758 | .deferred_free = rcu_sync_torture_deferred_free, | ||
759 | .sync = synchronize_sched, | ||
760 | .cb_barrier = NULL, | ||
761 | .fqs = rcu_sched_force_quiescent_state, | ||
762 | .stats = NULL, | ||
763 | .name = "sched_sync" | ||
764 | }; | ||
765 | |||
766 | static struct rcu_torture_ops sched_expedited_ops = { | ||
767 | .init = rcu_sync_torture_init, | ||
768 | .readlock = sched_torture_read_lock, | ||
769 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
770 | .readunlock = sched_torture_read_unlock, | ||
771 | .completed = rcu_no_completed, | ||
772 | .deferred_free = rcu_sync_torture_deferred_free, | ||
773 | .sync = synchronize_sched_expedited, | ||
774 | .cb_barrier = NULL, | ||
775 | .fqs = rcu_sched_force_quiescent_state, | ||
776 | .stats = NULL, | ||
777 | .irq_capable = 1, | ||
778 | .name = "sched_expedited" | ||
779 | }; | ||
780 | |||
781 | /* | 649 | /* |
782 | * RCU torture priority-boost testing. Runs one real-time thread per | 650 | * RCU torture priority-boost testing. Runs one real-time thread per |
783 | * CPU for moderate bursts, repeatedly registering RCU callbacks and | 651 | * CPU for moderate bursts, repeatedly registering RCU callbacks and |
@@ -927,9 +795,10 @@ rcu_torture_fqs(void *arg) | |||
927 | static int | 795 | static int |
928 | rcu_torture_writer(void *arg) | 796 | rcu_torture_writer(void *arg) |
929 | { | 797 | { |
798 | bool exp; | ||
930 | int i; | 799 | int i; |
931 | long oldbatch = rcu_batches_completed(); | ||
932 | struct rcu_torture *rp; | 800 | struct rcu_torture *rp; |
801 | struct rcu_torture *rp1; | ||
933 | struct rcu_torture *old_rp; | 802 | struct rcu_torture *old_rp; |
934 | static DEFINE_RCU_RANDOM(rand); | 803 | static DEFINE_RCU_RANDOM(rand); |
935 | 804 | ||
@@ -954,10 +823,33 @@ rcu_torture_writer(void *arg) | |||
954 | i = RCU_TORTURE_PIPE_LEN; | 823 | i = RCU_TORTURE_PIPE_LEN; |
955 | atomic_inc(&rcu_torture_wcount[i]); | 824 | atomic_inc(&rcu_torture_wcount[i]); |
956 | old_rp->rtort_pipe_count++; | 825 | old_rp->rtort_pipe_count++; |
957 | cur_ops->deferred_free(old_rp); | 826 | if (gp_normal == gp_exp) |
827 | exp = !!(rcu_random(&rand) & 0x80); | ||
828 | else | ||
829 | exp = gp_exp; | ||
830 | if (!exp) { | ||
831 | cur_ops->deferred_free(old_rp); | ||
832 | } else { | ||
833 | cur_ops->exp_sync(); | ||
834 | list_add(&old_rp->rtort_free, | ||
835 | &rcu_torture_removed); | ||
836 | list_for_each_entry_safe(rp, rp1, | ||
837 | &rcu_torture_removed, | ||
838 | rtort_free) { | ||
839 | i = rp->rtort_pipe_count; | ||
840 | if (i > RCU_TORTURE_PIPE_LEN) | ||
841 | i = RCU_TORTURE_PIPE_LEN; | ||
842 | atomic_inc(&rcu_torture_wcount[i]); | ||
843 | if (++rp->rtort_pipe_count >= | ||
844 | RCU_TORTURE_PIPE_LEN) { | ||
845 | rp->rtort_mbtest = 0; | ||
846 | list_del(&rp->rtort_free); | ||
847 | rcu_torture_free(rp); | ||
848 | } | ||
849 | } | ||
850 | } | ||
958 | } | 851 | } |
959 | rcutorture_record_progress(++rcu_torture_current_version); | 852 | rcutorture_record_progress(++rcu_torture_current_version); |
960 | oldbatch = cur_ops->completed(); | ||
961 | rcu_stutter_wait("rcu_torture_writer"); | 853 | rcu_stutter_wait("rcu_torture_writer"); |
962 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 854 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
963 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); | 855 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); |
@@ -983,10 +875,18 @@ rcu_torture_fakewriter(void *arg) | |||
983 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); | 875 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); |
984 | udelay(rcu_random(&rand) & 0x3ff); | 876 | udelay(rcu_random(&rand) & 0x3ff); |
985 | if (cur_ops->cb_barrier != NULL && | 877 | if (cur_ops->cb_barrier != NULL && |
986 | rcu_random(&rand) % (nfakewriters * 8) == 0) | 878 | rcu_random(&rand) % (nfakewriters * 8) == 0) { |
987 | cur_ops->cb_barrier(); | 879 | cur_ops->cb_barrier(); |
988 | else | 880 | } else if (gp_normal == gp_exp) { |
881 | if (rcu_random(&rand) & 0x80) | ||
882 | cur_ops->sync(); | ||
883 | else | ||
884 | cur_ops->exp_sync(); | ||
885 | } else if (gp_normal) { | ||
989 | cur_ops->sync(); | 886 | cur_ops->sync(); |
887 | } else { | ||
888 | cur_ops->exp_sync(); | ||
889 | } | ||
990 | rcu_stutter_wait("rcu_torture_fakewriter"); | 890 | rcu_stutter_wait("rcu_torture_fakewriter"); |
991 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 891 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
992 | 892 | ||
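rcu_torture_fakewriter() gets the matching treatment: the occasional cb_barrier() call is kept, and every other pass now waits for a grace period whose flavor follows the same gp_normal/gp_exp convention. A compact restatement of the three-way branch as a sketch (the enum and helper names are invented for illustration):

#include <stdbool.h>
#include <stdlib.h>

enum fake_op { DO_CB_BARRIER, DO_SYNC, DO_EXP_SYNC };

/* Roughly one pass in nfakewriters * 8 issues a callback barrier (when
 * the flavor provides one); the remaining passes pick sync() versus
 * exp_sync() the same way the real writer does. */
static enum fake_op pick_fake_op(bool have_cb_barrier, int nfakewriters,
				 bool gp_normal, bool gp_exp)
{
	if (have_cb_barrier && rand() % (nfakewriters * 8) == 0)
		return DO_CB_BARRIER;
	if (gp_normal == gp_exp)
		return (rand() & 0x80) ? DO_SYNC : DO_EXP_SYNC;
	return gp_normal ? DO_SYNC : DO_EXP_SYNC;
}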
@@ -1364,7 +1264,7 @@ rcu_torture_stutter(void *arg) | |||
1364 | } | 1264 | } |
1365 | 1265 | ||
1366 | static inline void | 1266 | static inline void |
1367 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) | 1267 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) |
1368 | { | 1268 | { |
1369 | pr_alert("%s" TORTURE_FLAG | 1269 | pr_alert("%s" TORTURE_FLAG |
1370 | "--- %s: nreaders=%d nfakewriters=%d " | 1270 | "--- %s: nreaders=%d nfakewriters=%d " |
@@ -1534,7 +1434,13 @@ rcu_torture_onoff(void *arg) | |||
1534 | torture_type, cpu); | 1434 | torture_type, cpu); |
1535 | starttime = jiffies; | 1435 | starttime = jiffies; |
1536 | n_online_attempts++; | 1436 | n_online_attempts++; |
1537 | if (cpu_up(cpu) == 0) { | 1437 | ret = cpu_up(cpu); |
1438 | if (ret) { | ||
1439 | if (verbose) | ||
1440 | pr_alert("%s" TORTURE_FLAG | ||
1441 | "rcu_torture_onoff task: online %d failed: errno %d\n", | ||
1442 | torture_type, cpu, ret); | ||
1443 | } else { | ||
1538 | if (verbose) | 1444 | if (verbose) |
1539 | pr_alert("%s" TORTURE_FLAG | 1445 | pr_alert("%s" TORTURE_FLAG |
1540 | "rcu_torture_onoff task: onlined %d\n", | 1446 | "rcu_torture_onoff task: onlined %d\n", |
@@ -1934,6 +1840,62 @@ rcu_torture_cleanup(void) | |||
1934 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); | 1840 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); |
1935 | } | 1841 | } |
1936 | 1842 | ||
1843 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | ||
1844 | static void rcu_torture_leak_cb(struct rcu_head *rhp) | ||
1845 | { | ||
1846 | } | ||
1847 | |||
1848 | static void rcu_torture_err_cb(struct rcu_head *rhp) | ||
1849 | { | ||
1850 | /* | ||
1851 | * This -might- happen due to race conditions, but is unlikely. | ||
1852 | * The scenario that leads to this happening is that the | ||
1853 | * first of the pair of duplicate callbacks is queued, | ||
1854 | * someone else starts a grace period that includes that | ||
1855 | * callback, then the second of the pair must wait for the | ||
1856 | * next grace period. Unlikely, but can happen. If it | ||
1857 | * does happen, the debug-objects subsystem won't have splatted. | ||
1858 | */ | ||
1859 | pr_alert("rcutorture: duplicated callback was invoked.\n"); | ||
1860 | } | ||
1861 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
1862 | |||
1863 | /* | ||
1864 | * Verify that double-free causes debug-objects to complain, but only | ||
1865 | * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test | ||
1866 | * cannot be carried out. | ||
1867 | */ | ||
1868 | static void rcu_test_debug_objects(void) | ||
1869 | { | ||
1870 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | ||
1871 | struct rcu_head rh1; | ||
1872 | struct rcu_head rh2; | ||
1873 | |||
1874 | init_rcu_head_on_stack(&rh1); | ||
1875 | init_rcu_head_on_stack(&rh2); | ||
1876 | pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); | ||
1877 | |||
1878 | /* Try to queue the rh2 pair of callbacks for the same grace period. */ | ||
1879 | preempt_disable(); /* Prevent preemption from interrupting test. */ | ||
1880 | rcu_read_lock(); /* Make it impossible to finish a grace period. */ | ||
1881 | call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ | ||
1882 | local_irq_disable(); /* Make it harder to start a new grace period. */ | ||
1883 | call_rcu(&rh2, rcu_torture_leak_cb); | ||
1884 | call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ | ||
1885 | local_irq_enable(); | ||
1886 | rcu_read_unlock(); | ||
1887 | preempt_enable(); | ||
1888 | |||
1889 | /* Wait for them all to get done so we can safely return. */ | ||
1890 | rcu_barrier(); | ||
1891 | pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); | ||
1892 | destroy_rcu_head_on_stack(&rh1); | ||
1893 | destroy_rcu_head_on_stack(&rh2); | ||
1894 | #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
1895 | pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); | ||
1896 | #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
1897 | } | ||
1898 | |||
1937 | static int __init | 1899 | static int __init |
1938 | rcu_torture_init(void) | 1900 | rcu_torture_init(void) |
1939 | { | 1901 | { |
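rcu_test_debug_objects() above deliberately queues the same rcu_head twice so that a CONFIG_DEBUG_OBJECTS_RCU_HEAD=y kernel can demonstrate that it catches the bug; the preempt_disable()/rcu_read_lock()/local_irq_disable() nesting exists only to keep both call_rcu() invocations inside one grace-period window, so the duplicate cannot be absorbed quietly. For comparison, a stripped-down throwaway module that pokes at the same check is sketched below. It omits that window-pinning, so any splat is only probabilistic, and the module name and messages are invented for the sketch; on a kernel with the __call_rcu() change later in this diff, the duplicate is instead caught there and leaked with a one-time WARN.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>

MODULE_LICENSE("GPL");

static void demo_cb(struct rcu_head *rhp)
{
	/* Empty: we only care whether debug-objects complains. */
}

static int __init dup_callrcu_init(void)
{
	struct rcu_head rh;

	init_rcu_head_on_stack(&rh);
	pr_info("dup_callrcu: queuing the same rcu_head twice\n");
	call_rcu(&rh, demo_cb);
	call_rcu(&rh, demo_cb);	/* duplicate; debug-objects should object */
	rcu_barrier();		/* let callbacks finish before &rh goes away */
	destroy_rcu_head_on_stack(&rh);
	return 0;
}
module_init(dup_callrcu_init);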
@@ -1941,11 +1903,9 @@ rcu_torture_init(void) | |||
1941 | int cpu; | 1903 | int cpu; |
1942 | int firsterr = 0; | 1904 | int firsterr = 0; |
1943 | int retval; | 1905 | int retval; |
1944 | static struct rcu_torture_ops *torture_ops[] = | 1906 | static struct rcu_torture_ops *torture_ops[] = { |
1945 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, | 1907 | &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, |
1946 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, | 1908 | }; |
1947 | &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, | ||
1948 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; | ||
1949 | 1909 | ||
1950 | mutex_lock(&fullstop_mutex); | 1910 | mutex_lock(&fullstop_mutex); |
1951 | 1911 | ||
@@ -2163,6 +2123,8 @@ rcu_torture_init(void) | |||
2163 | firsterr = retval; | 2123 | firsterr = retval; |
2164 | goto unwind; | 2124 | goto unwind; |
2165 | } | 2125 | } |
2126 | if (object_debug) | ||
2127 | rcu_test_debug_objects(); | ||
2166 | rcutorture_record_test_transition(); | 2128 | rcutorture_record_test_transition(); |
2167 | mutex_unlock(&fullstop_mutex); | 2129 | mutex_unlock(&fullstop_mutex); |
2168 | return 0; | 2130 | return 0; |
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 068de3a93606..32618b3fe4e6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -53,18 +53,38 @@ | |||
53 | #include <linux/delay.h> | 53 | #include <linux/delay.h> |
54 | #include <linux/stop_machine.h> | 54 | #include <linux/stop_machine.h> |
55 | #include <linux/random.h> | 55 | #include <linux/random.h> |
56 | #include <linux/ftrace_event.h> | ||
57 | #include <linux/suspend.h> | ||
56 | 58 | ||
57 | #include "rcutree.h" | 59 | #include "rcutree.h" |
58 | #include <trace/events/rcu.h> | 60 | #include <trace/events/rcu.h> |
59 | 61 | ||
60 | #include "rcu.h" | 62 | #include "rcu.h" |
61 | 63 | ||
64 | /* | ||
65 | * Strings used in tracepoints need to be exported via the | ||
66 | * tracing system such that tools like perf and trace-cmd can | ||
67 | * translate the string address pointers to actual text. | ||
68 | */ | ||
69 | #define TPS(x) tracepoint_string(x) | ||
70 | |||
62 | /* Data structures. */ | 71 | /* Data structures. */ |
63 | 72 | ||
64 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; | 73 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; |
65 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | 74 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; |
66 | 75 | ||
67 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \ | 76 | /* |
77 | * In order to export the rcu_state name to the tracing tools, it | ||
78 | * needs to be added in the __tracepoint_string section. | ||
79 | * This requires defining a separate variable tp_<sname>_varname | ||
80 | * that points to the string being used, and this will allow | ||
81 | * the tracing userspace tools to be able to decipher the string | ||
82 | * address to the matching string. | ||
83 | */ | ||
84 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ | ||
85 | static char sname##_varname[] = #sname; \ | ||
86 | static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \ | ||
87 | struct rcu_state sname##_state = { \ | ||
68 | .level = { &sname##_state.node[0] }, \ | 88 | .level = { &sname##_state.node[0] }, \ |
69 | .call = cr, \ | 89 | .call = cr, \ |
70 | .fqs_state = RCU_GP_IDLE, \ | 90 | .fqs_state = RCU_GP_IDLE, \ |
@@ -75,16 +95,13 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | |||
75 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 95 | .orphan_donetail = &sname##_state.orphan_donelist, \ |
76 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 96 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
77 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ | 97 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ |
78 | .name = #sname, \ | 98 | .name = sname##_varname, \ |
79 | .abbr = sabbr, \ | 99 | .abbr = sabbr, \ |
80 | } | 100 | }; \ |
81 | 101 | DEFINE_PER_CPU(struct rcu_data, sname##_data) | |
82 | struct rcu_state rcu_sched_state = | ||
83 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); | ||
84 | DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); | ||
85 | 102 | ||
86 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); | 103 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); |
87 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); | 104 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); |
88 | 105 | ||
89 | static struct rcu_state *rcu_state; | 106 | static struct rcu_state *rcu_state; |
90 | LIST_HEAD(rcu_struct_flavors); | 107 | LIST_HEAD(rcu_struct_flavors); |
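Both additions above serve the tracing tools: a trace event records only the address of a string, so every string that can land in the ring buffer must be registered in the __tracepoint_string section for perf and trace-cmd to translate the address back into text. TPS() handles string literals at the call site, and the sname##_varname pair handles the rcu_state name that is referenced through a pointer. Reduced to its essentials (the demo names are illustrative):

#include <linux/ftrace_event.h>	/* tracepoint_string(), __tracepoint_string */

/* Literal case: wrap the string where it is passed to the tracepoint. */
#define TPS(x) tracepoint_string(x)

/* Pointer case: keep the array, and export a pointer to it in the
 * __tracepoint_string section so tools can resolve the address. */
static char demo_varname[] = "demo_flavor";
static const char *tp_demo_varname __used __tracepoint_string = demo_varname;

The trace_rcu_utilization(TPS("Start context switch")) calls in the hunks below are the literal case; .name = sname##_varname above is the pointer case.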
@@ -178,7 +195,7 @@ void rcu_sched_qs(int cpu) | |||
178 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); | 195 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); |
179 | 196 | ||
180 | if (rdp->passed_quiesce == 0) | 197 | if (rdp->passed_quiesce == 0) |
181 | trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); | 198 | trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); |
182 | rdp->passed_quiesce = 1; | 199 | rdp->passed_quiesce = 1; |
183 | } | 200 | } |
184 | 201 | ||
@@ -187,7 +204,7 @@ void rcu_bh_qs(int cpu) | |||
187 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); | 204 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); |
188 | 205 | ||
189 | if (rdp->passed_quiesce == 0) | 206 | if (rdp->passed_quiesce == 0) |
190 | trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); | 207 | trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); |
191 | rdp->passed_quiesce = 1; | 208 | rdp->passed_quiesce = 1; |
192 | } | 209 | } |
193 | 210 | ||
@@ -198,16 +215,20 @@ void rcu_bh_qs(int cpu) | |||
198 | */ | 215 | */ |
199 | void rcu_note_context_switch(int cpu) | 216 | void rcu_note_context_switch(int cpu) |
200 | { | 217 | { |
201 | trace_rcu_utilization("Start context switch"); | 218 | trace_rcu_utilization(TPS("Start context switch")); |
202 | rcu_sched_qs(cpu); | 219 | rcu_sched_qs(cpu); |
203 | rcu_preempt_note_context_switch(cpu); | 220 | rcu_preempt_note_context_switch(cpu); |
204 | trace_rcu_utilization("End context switch"); | 221 | trace_rcu_utilization(TPS("End context switch")); |
205 | } | 222 | } |
206 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 223 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
207 | 224 | ||
208 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 225 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
209 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | 226 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, |
210 | .dynticks = ATOMIC_INIT(1), | 227 | .dynticks = ATOMIC_INIT(1), |
228 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
229 | .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, | ||
230 | .dynticks_idle = ATOMIC_INIT(1), | ||
231 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
211 | }; | 232 | }; |
212 | 233 | ||
213 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ | 234 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ |
@@ -226,7 +247,10 @@ module_param(jiffies_till_next_fqs, ulong, 0644); | |||
226 | 247 | ||
227 | static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | 248 | static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, |
228 | struct rcu_data *rdp); | 249 | struct rcu_data *rdp); |
229 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); | 250 | static void force_qs_rnp(struct rcu_state *rsp, |
251 | int (*f)(struct rcu_data *rsp, bool *isidle, | ||
252 | unsigned long *maxj), | ||
253 | bool *isidle, unsigned long *maxj); | ||
230 | static void force_quiescent_state(struct rcu_state *rsp); | 254 | static void force_quiescent_state(struct rcu_state *rsp); |
231 | static int rcu_pending(int cpu); | 255 | static int rcu_pending(int cpu); |
232 | 256 | ||
@@ -345,11 +369,11 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) | |||
345 | static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, | 369 | static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, |
346 | bool user) | 370 | bool user) |
347 | { | 371 | { |
348 | trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); | 372 | trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); |
349 | if (!user && !is_idle_task(current)) { | 373 | if (!user && !is_idle_task(current)) { |
350 | struct task_struct *idle = idle_task(smp_processor_id()); | 374 | struct task_struct *idle = idle_task(smp_processor_id()); |
351 | 375 | ||
352 | trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); | 376 | trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); |
353 | ftrace_dump(DUMP_ORIG); | 377 | ftrace_dump(DUMP_ORIG); |
354 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 378 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
355 | current->pid, current->comm, | 379 | current->pid, current->comm, |
@@ -411,6 +435,7 @@ void rcu_idle_enter(void) | |||
411 | 435 | ||
412 | local_irq_save(flags); | 436 | local_irq_save(flags); |
413 | rcu_eqs_enter(false); | 437 | rcu_eqs_enter(false); |
438 | rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); | ||
414 | local_irq_restore(flags); | 439 | local_irq_restore(flags); |
415 | } | 440 | } |
416 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | 441 | EXPORT_SYMBOL_GPL(rcu_idle_enter); |
@@ -428,27 +453,6 @@ void rcu_user_enter(void) | |||
428 | { | 453 | { |
429 | rcu_eqs_enter(1); | 454 | rcu_eqs_enter(1); |
430 | } | 455 | } |
431 | |||
432 | /** | ||
433 | * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace | ||
434 | * after the current irq returns. | ||
435 | * | ||
436 | * This is similar to rcu_user_enter() but in the context of a non-nesting | ||
437 | * irq. After this call, RCU enters into idle mode when the interrupt | ||
438 | * returns. | ||
439 | */ | ||
440 | void rcu_user_enter_after_irq(void) | ||
441 | { | ||
442 | unsigned long flags; | ||
443 | struct rcu_dynticks *rdtp; | ||
444 | |||
445 | local_irq_save(flags); | ||
446 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
447 | /* Ensure this irq is interrupting a non-idle RCU state. */ | ||
448 | WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK)); | ||
449 | rdtp->dynticks_nesting = 1; | ||
450 | local_irq_restore(flags); | ||
451 | } | ||
452 | #endif /* CONFIG_RCU_USER_QS */ | 456 | #endif /* CONFIG_RCU_USER_QS */ |
453 | 457 | ||
454 | /** | 458 | /** |
@@ -479,9 +483,10 @@ void rcu_irq_exit(void) | |||
479 | rdtp->dynticks_nesting--; | 483 | rdtp->dynticks_nesting--; |
480 | WARN_ON_ONCE(rdtp->dynticks_nesting < 0); | 484 | WARN_ON_ONCE(rdtp->dynticks_nesting < 0); |
481 | if (rdtp->dynticks_nesting) | 485 | if (rdtp->dynticks_nesting) |
482 | trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); | 486 | trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); |
483 | else | 487 | else |
484 | rcu_eqs_enter_common(rdtp, oldval, true); | 488 | rcu_eqs_enter_common(rdtp, oldval, true); |
489 | rcu_sysidle_enter(rdtp, 1); | ||
485 | local_irq_restore(flags); | 490 | local_irq_restore(flags); |
486 | } | 491 | } |
487 | 492 | ||
@@ -501,11 +506,11 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, | |||
501 | smp_mb__after_atomic_inc(); /* See above. */ | 506 | smp_mb__after_atomic_inc(); /* See above. */ |
502 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 507 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); |
503 | rcu_cleanup_after_idle(smp_processor_id()); | 508 | rcu_cleanup_after_idle(smp_processor_id()); |
504 | trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); | 509 | trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); |
505 | if (!user && !is_idle_task(current)) { | 510 | if (!user && !is_idle_task(current)) { |
506 | struct task_struct *idle = idle_task(smp_processor_id()); | 511 | struct task_struct *idle = idle_task(smp_processor_id()); |
507 | 512 | ||
508 | trace_rcu_dyntick("Error on exit: not idle task", | 513 | trace_rcu_dyntick(TPS("Error on exit: not idle task"), |
509 | oldval, rdtp->dynticks_nesting); | 514 | oldval, rdtp->dynticks_nesting); |
510 | ftrace_dump(DUMP_ORIG); | 515 | ftrace_dump(DUMP_ORIG); |
511 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 516 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
@@ -550,6 +555,7 @@ void rcu_idle_exit(void) | |||
550 | 555 | ||
551 | local_irq_save(flags); | 556 | local_irq_save(flags); |
552 | rcu_eqs_exit(false); | 557 | rcu_eqs_exit(false); |
558 | rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); | ||
553 | local_irq_restore(flags); | 559 | local_irq_restore(flags); |
554 | } | 560 | } |
555 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | 561 | EXPORT_SYMBOL_GPL(rcu_idle_exit); |
@@ -565,28 +571,6 @@ void rcu_user_exit(void) | |||
565 | { | 571 | { |
566 | rcu_eqs_exit(1); | 572 | rcu_eqs_exit(1); |
567 | } | 573 | } |
568 | |||
569 | /** | ||
570 | * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace | ||
571 | * idle mode after the current non-nesting irq returns. | ||
572 | * | ||
573 | * This is similar to rcu_user_exit() but in the context of an irq. | ||
574 | * This is called when the irq has interrupted a userspace RCU idle mode | ||
575 | * context. When the current non-nesting interrupt returns after this call, | ||
576 | * the CPU won't restore the RCU idle mode. | ||
577 | */ | ||
578 | void rcu_user_exit_after_irq(void) | ||
579 | { | ||
580 | unsigned long flags; | ||
581 | struct rcu_dynticks *rdtp; | ||
582 | |||
583 | local_irq_save(flags); | ||
584 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
585 | /* Ensure we are interrupting an RCU idle mode. */ | ||
586 | WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK); | ||
587 | rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE; | ||
588 | local_irq_restore(flags); | ||
589 | } | ||
590 | #endif /* CONFIG_RCU_USER_QS */ | 574 | #endif /* CONFIG_RCU_USER_QS */ |
591 | 575 | ||
592 | /** | 576 | /** |
@@ -620,9 +604,10 @@ void rcu_irq_enter(void) | |||
620 | rdtp->dynticks_nesting++; | 604 | rdtp->dynticks_nesting++; |
621 | WARN_ON_ONCE(rdtp->dynticks_nesting == 0); | 605 | WARN_ON_ONCE(rdtp->dynticks_nesting == 0); |
622 | if (oldval) | 606 | if (oldval) |
623 | trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); | 607 | trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); |
624 | else | 608 | else |
625 | rcu_eqs_exit_common(rdtp, oldval, true); | 609 | rcu_eqs_exit_common(rdtp, oldval, true); |
610 | rcu_sysidle_exit(rdtp, 1); | ||
626 | local_irq_restore(flags); | 611 | local_irq_restore(flags); |
627 | } | 612 | } |
628 | 613 | ||
@@ -746,9 +731,11 @@ static int rcu_is_cpu_rrupt_from_idle(void) | |||
746 | * credit them with an implicit quiescent state. Return 1 if this CPU | 731 | * credit them with an implicit quiescent state. Return 1 if this CPU |
747 | * is in dynticks idle mode, which is an extended quiescent state. | 732 | * is in dynticks idle mode, which is an extended quiescent state. |
748 | */ | 733 | */ |
749 | static int dyntick_save_progress_counter(struct rcu_data *rdp) | 734 | static int dyntick_save_progress_counter(struct rcu_data *rdp, |
735 | bool *isidle, unsigned long *maxj) | ||
750 | { | 736 | { |
751 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); | 737 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); |
738 | rcu_sysidle_check_cpu(rdp, isidle, maxj); | ||
752 | return (rdp->dynticks_snap & 0x1) == 0; | 739 | return (rdp->dynticks_snap & 0x1) == 0; |
753 | } | 740 | } |
754 | 741 | ||
@@ -758,7 +745,8 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) | |||
758 | * idle state since the last call to dyntick_save_progress_counter() | 745 | * idle state since the last call to dyntick_save_progress_counter() |
759 | * for this same CPU, or by virtue of having been offline. | 746 | * for this same CPU, or by virtue of having been offline. |
760 | */ | 747 | */ |
761 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | 748 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, |
749 | bool *isidle, unsigned long *maxj) | ||
762 | { | 750 | { |
763 | unsigned int curr; | 751 | unsigned int curr; |
764 | unsigned int snap; | 752 | unsigned int snap; |
@@ -775,7 +763,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
775 | * of the current RCU grace period. | 763 | * of the current RCU grace period. |
776 | */ | 764 | */ |
777 | if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { | 765 | if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { |
778 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); | 766 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); |
779 | rdp->dynticks_fqs++; | 767 | rdp->dynticks_fqs++; |
780 | return 1; | 768 | return 1; |
781 | } | 769 | } |
@@ -795,7 +783,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
795 | return 0; /* Grace period is not old enough. */ | 783 | return 0; /* Grace period is not old enough. */ |
796 | barrier(); | 784 | barrier(); |
797 | if (cpu_is_offline(rdp->cpu)) { | 785 | if (cpu_is_offline(rdp->cpu)) { |
798 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); | 786 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); |
799 | rdp->offline_fqs++; | 787 | rdp->offline_fqs++; |
800 | return 1; | 788 | return 1; |
801 | } | 789 | } |
@@ -1032,7 +1020,7 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, | |||
1032 | * rcu_nocb_wait_gp(). | 1020 | * rcu_nocb_wait_gp(). |
1033 | */ | 1021 | */ |
1034 | static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | 1022 | static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, |
1035 | unsigned long c, char *s) | 1023 | unsigned long c, const char *s) |
1036 | { | 1024 | { |
1037 | trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, | 1025 | trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, |
1038 | rnp->completed, c, rnp->level, | 1026 | rnp->completed, c, rnp->level, |
@@ -1058,9 +1046,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1058 | * grace period is already marked as needed, return to the caller. | 1046 | * grace period is already marked as needed, return to the caller. |
1059 | */ | 1047 | */ |
1060 | c = rcu_cbs_completed(rdp->rsp, rnp); | 1048 | c = rcu_cbs_completed(rdp->rsp, rnp); |
1061 | trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); | 1049 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); |
1062 | if (rnp->need_future_gp[c & 0x1]) { | 1050 | if (rnp->need_future_gp[c & 0x1]) { |
1063 | trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); | 1051 | trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); |
1064 | return c; | 1052 | return c; |
1065 | } | 1053 | } |
1066 | 1054 | ||
@@ -1074,7 +1062,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1074 | if (rnp->gpnum != rnp->completed || | 1062 | if (rnp->gpnum != rnp->completed || |
1075 | ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { | 1063 | ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { |
1076 | rnp->need_future_gp[c & 0x1]++; | 1064 | rnp->need_future_gp[c & 0x1]++; |
1077 | trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); | 1065 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); |
1078 | return c; | 1066 | return c; |
1079 | } | 1067 | } |
1080 | 1068 | ||
@@ -1102,7 +1090,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1102 | * recorded, trace and leave. | 1090 | * recorded, trace and leave. |
1103 | */ | 1091 | */ |
1104 | if (rnp_root->need_future_gp[c & 0x1]) { | 1092 | if (rnp_root->need_future_gp[c & 0x1]) { |
1105 | trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); | 1093 | trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); |
1106 | goto unlock_out; | 1094 | goto unlock_out; |
1107 | } | 1095 | } |
1108 | 1096 | ||
@@ -1111,9 +1099,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1111 | 1099 | ||
1112 | /* If a grace period is not already in progress, start one. */ | 1100 | /* If a grace period is not already in progress, start one. */ |
1113 | if (rnp_root->gpnum != rnp_root->completed) { | 1101 | if (rnp_root->gpnum != rnp_root->completed) { |
1114 | trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); | 1102 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); |
1115 | } else { | 1103 | } else { |
1116 | trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); | 1104 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); |
1117 | rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); | 1105 | rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); |
1118 | } | 1106 | } |
1119 | unlock_out: | 1107 | unlock_out: |
@@ -1137,7 +1125,8 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | |||
1137 | rcu_nocb_gp_cleanup(rsp, rnp); | 1125 | rcu_nocb_gp_cleanup(rsp, rnp); |
1138 | rnp->need_future_gp[c & 0x1] = 0; | 1126 | rnp->need_future_gp[c & 0x1] = 0; |
1139 | needmore = rnp->need_future_gp[(c + 1) & 0x1]; | 1127 | needmore = rnp->need_future_gp[(c + 1) & 0x1]; |
1140 | trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); | 1128 | trace_rcu_future_gp(rnp, rdp, c, |
1129 | needmore ? TPS("CleanupMore") : TPS("Cleanup")); | ||
1141 | return needmore; | 1130 | return needmore; |
1142 | } | 1131 | } |
1143 | 1132 | ||
@@ -1205,9 +1194,9 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1205 | 1194 | ||
1206 | /* Trace depending on how much we were able to accelerate. */ | 1195 | /* Trace depending on how much we were able to accelerate. */ |
1207 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) | 1196 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) |
1208 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB"); | 1197 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); |
1209 | else | 1198 | else |
1210 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB"); | 1199 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); |
1211 | } | 1200 | } |
1212 | 1201 | ||
1213 | /* | 1202 | /* |
@@ -1273,7 +1262,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc | |||
1273 | 1262 | ||
1274 | /* Remember that we saw this grace-period completion. */ | 1263 | /* Remember that we saw this grace-period completion. */ |
1275 | rdp->completed = rnp->completed; | 1264 | rdp->completed = rnp->completed; |
1276 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); | 1265 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); |
1277 | } | 1266 | } |
1278 | 1267 | ||
1279 | if (rdp->gpnum != rnp->gpnum) { | 1268 | if (rdp->gpnum != rnp->gpnum) { |
@@ -1283,7 +1272,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc | |||
1283 | * go looking for one. | 1272 | * go looking for one. |
1284 | */ | 1273 | */ |
1285 | rdp->gpnum = rnp->gpnum; | 1274 | rdp->gpnum = rnp->gpnum; |
1286 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); | 1275 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); |
1287 | rdp->passed_quiesce = 0; | 1276 | rdp->passed_quiesce = 0; |
1288 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); | 1277 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); |
1289 | zero_cpu_stall_ticks(rdp); | 1278 | zero_cpu_stall_ticks(rdp); |
@@ -1315,6 +1304,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1315 | struct rcu_data *rdp; | 1304 | struct rcu_data *rdp; |
1316 | struct rcu_node *rnp = rcu_get_root(rsp); | 1305 | struct rcu_node *rnp = rcu_get_root(rsp); |
1317 | 1306 | ||
1307 | rcu_bind_gp_kthread(); | ||
1318 | raw_spin_lock_irq(&rnp->lock); | 1308 | raw_spin_lock_irq(&rnp->lock); |
1319 | rsp->gp_flags = 0; /* Clear all flags: New grace period. */ | 1309 | rsp->gp_flags = 0; /* Clear all flags: New grace period. */ |
1320 | 1310 | ||
@@ -1326,7 +1316,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1326 | 1316 | ||
1327 | /* Advance to a new grace period and initialize state. */ | 1317 | /* Advance to a new grace period and initialize state. */ |
1328 | rsp->gpnum++; | 1318 | rsp->gpnum++; |
1329 | trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); | 1319 | trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); |
1330 | record_gp_stall_check_time(rsp); | 1320 | record_gp_stall_check_time(rsp); |
1331 | raw_spin_unlock_irq(&rnp->lock); | 1321 | raw_spin_unlock_irq(&rnp->lock); |
1332 | 1322 | ||
@@ -1379,16 +1369,25 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1379 | int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | 1369 | int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) |
1380 | { | 1370 | { |
1381 | int fqs_state = fqs_state_in; | 1371 | int fqs_state = fqs_state_in; |
1372 | bool isidle = false; | ||
1373 | unsigned long maxj; | ||
1382 | struct rcu_node *rnp = rcu_get_root(rsp); | 1374 | struct rcu_node *rnp = rcu_get_root(rsp); |
1383 | 1375 | ||
1384 | rsp->n_force_qs++; | 1376 | rsp->n_force_qs++; |
1385 | if (fqs_state == RCU_SAVE_DYNTICK) { | 1377 | if (fqs_state == RCU_SAVE_DYNTICK) { |
1386 | /* Collect dyntick-idle snapshots. */ | 1378 | /* Collect dyntick-idle snapshots. */ |
1387 | force_qs_rnp(rsp, dyntick_save_progress_counter); | 1379 | if (is_sysidle_rcu_state(rsp)) { |
1380 | isidle = 1; | ||
1381 | maxj = jiffies - ULONG_MAX / 4; | ||
1382 | } | ||
1383 | force_qs_rnp(rsp, dyntick_save_progress_counter, | ||
1384 | &isidle, &maxj); | ||
1385 | rcu_sysidle_report_gp(rsp, isidle, maxj); | ||
1388 | fqs_state = RCU_FORCE_QS; | 1386 | fqs_state = RCU_FORCE_QS; |
1389 | } else { | 1387 | } else { |
1390 | /* Handle dyntick-idle and offline CPUs. */ | 1388 | /* Handle dyntick-idle and offline CPUs. */ |
1391 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs); | 1389 | isidle = 0; |
1390 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); | ||
1392 | } | 1391 | } |
1393 | /* Clear flag to prevent immediate re-entry. */ | 1392 | /* Clear flag to prevent immediate re-entry. */ |
1394 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | 1393 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { |
@@ -1448,7 +1447,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
1448 | rcu_nocb_gp_set(rnp, nocb); | 1447 | rcu_nocb_gp_set(rnp, nocb); |
1449 | 1448 | ||
1450 | rsp->completed = rsp->gpnum; /* Declare grace period done. */ | 1449 | rsp->completed = rsp->gpnum; /* Declare grace period done. */ |
1451 | trace_rcu_grace_period(rsp->name, rsp->completed, "end"); | 1450 | trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); |
1452 | rsp->fqs_state = RCU_GP_IDLE; | 1451 | rsp->fqs_state = RCU_GP_IDLE; |
1453 | rdp = this_cpu_ptr(rsp->rda); | 1452 | rdp = this_cpu_ptr(rsp->rda); |
1454 | rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ | 1453 | rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ |
@@ -1558,10 +1557,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1558 | 1557 | ||
1559 | /* | 1558 | /* |
1560 | * We can't do wakeups while holding the rnp->lock, as that | 1559 | * We can't do wakeups while holding the rnp->lock, as that |
1561 | * could cause possible deadlocks with the rq->lock. Deter | 1560 | * could cause possible deadlocks with the rq->lock. Defer |
1562 | * the wakeup to interrupt context. | 1561 | * the wakeup to interrupt context. And don't bother waking |
1562 | * up the running kthread. | ||
1563 | */ | 1563 | */ |
1564 | irq_work_queue(&rsp->wakeup_work); | 1564 | if (current != rsp->gp_kthread) |
1565 | irq_work_queue(&rsp->wakeup_work); | ||
1565 | } | 1566 | } |
1566 | 1567 | ||
1567 | /* | 1568 | /* |
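The wakeup in the hunk above cannot be issued directly: the comment notes that rnp->lock is held and waking a task takes rq->lock, so the wakeup is bounced through irq_work, and the new check skips even that when the grace-period kthread is the caller, since it will notice the flag itself. The deferral pattern in isolation, with names invented for the sketch, might look like:

#include <linux/irq_work.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static struct irq_work demo_irq_work;

/* Runs later from hard-irq context, where taking rq->lock is fine. */
static void demo_wakeup_fn(struct irq_work *work)
{
	wake_up(&demo_wq);
}

static void demo_setup(void)
{
	init_irq_work(&demo_irq_work, demo_wakeup_fn);
}

/* Called with a raw spinlock held: queue the wakeup instead of doing it. */
static void demo_request_wakeup(void)
{
	irq_work_queue(&demo_irq_work);
}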
@@ -1857,7 +1858,7 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | |||
1857 | RCU_TRACE(mask = rdp->grpmask); | 1858 | RCU_TRACE(mask = rdp->grpmask); |
1858 | trace_rcu_grace_period(rsp->name, | 1859 | trace_rcu_grace_period(rsp->name, |
1859 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), | 1860 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), |
1860 | "cpuofl"); | 1861 | TPS("cpuofl")); |
1861 | } | 1862 | } |
1862 | 1863 | ||
1863 | /* | 1864 | /* |
@@ -2044,7 +2045,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
2044 | */ | 2045 | */ |
2045 | void rcu_check_callbacks(int cpu, int user) | 2046 | void rcu_check_callbacks(int cpu, int user) |
2046 | { | 2047 | { |
2047 | trace_rcu_utilization("Start scheduler-tick"); | 2048 | trace_rcu_utilization(TPS("Start scheduler-tick")); |
2048 | increment_cpu_stall_ticks(); | 2049 | increment_cpu_stall_ticks(); |
2049 | if (user || rcu_is_cpu_rrupt_from_idle()) { | 2050 | if (user || rcu_is_cpu_rrupt_from_idle()) { |
2050 | 2051 | ||
@@ -2077,7 +2078,7 @@ void rcu_check_callbacks(int cpu, int user) | |||
2077 | rcu_preempt_check_callbacks(cpu); | 2078 | rcu_preempt_check_callbacks(cpu); |
2078 | if (rcu_pending(cpu)) | 2079 | if (rcu_pending(cpu)) |
2079 | invoke_rcu_core(); | 2080 | invoke_rcu_core(); |
2080 | trace_rcu_utilization("End scheduler-tick"); | 2081 | trace_rcu_utilization(TPS("End scheduler-tick")); |
2081 | } | 2082 | } |
2082 | 2083 | ||
2083 | /* | 2084 | /* |
@@ -2087,7 +2088,10 @@ void rcu_check_callbacks(int cpu, int user) | |||
2087 | * | 2088 | * |
2088 | * The caller must have suppressed start of new grace periods. | 2089 | * The caller must have suppressed start of new grace periods. |
2089 | */ | 2090 | */ |
2090 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | 2091 | static void force_qs_rnp(struct rcu_state *rsp, |
2092 | int (*f)(struct rcu_data *rsp, bool *isidle, | ||
2093 | unsigned long *maxj), | ||
2094 | bool *isidle, unsigned long *maxj) | ||
2091 | { | 2095 | { |
2092 | unsigned long bit; | 2096 | unsigned long bit; |
2093 | int cpu; | 2097 | int cpu; |
@@ -2110,9 +2114,12 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
2110 | cpu = rnp->grplo; | 2114 | cpu = rnp->grplo; |
2111 | bit = 1; | 2115 | bit = 1; |
2112 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { | 2116 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { |
2113 | if ((rnp->qsmask & bit) != 0 && | 2117 | if ((rnp->qsmask & bit) != 0) { |
2114 | f(per_cpu_ptr(rsp->rda, cpu))) | 2118 | if ((rnp->qsmaskinit & bit) != 0) |
2115 | mask |= bit; | 2119 | *isidle = 0; |
2120 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) | ||
2121 | mask |= bit; | ||
2122 | } | ||
2116 | } | 2123 | } |
2117 | if (mask != 0) { | 2124 | if (mask != 0) { |
2118 | 2125 | ||
@@ -2208,10 +2215,10 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
2208 | 2215 | ||
2209 | if (cpu_is_offline(smp_processor_id())) | 2216 | if (cpu_is_offline(smp_processor_id())) |
2210 | return; | 2217 | return; |
2211 | trace_rcu_utilization("Start RCU core"); | 2218 | trace_rcu_utilization(TPS("Start RCU core")); |
2212 | for_each_rcu_flavor(rsp) | 2219 | for_each_rcu_flavor(rsp) |
2213 | __rcu_process_callbacks(rsp); | 2220 | __rcu_process_callbacks(rsp); |
2214 | trace_rcu_utilization("End RCU core"); | 2221 | trace_rcu_utilization(TPS("End RCU core")); |
2215 | } | 2222 | } |
2216 | 2223 | ||
2217 | /* | 2224 | /* |
@@ -2287,6 +2294,13 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
2287 | } | 2294 | } |
2288 | 2295 | ||
2289 | /* | 2296 | /* |
2297 | * RCU callback function to leak a callback. | ||
2298 | */ | ||
2299 | static void rcu_leak_callback(struct rcu_head *rhp) | ||
2300 | { | ||
2301 | } | ||
2302 | |||
2303 | /* | ||
2290 | * Helper function for call_rcu() and friends. The cpu argument will | 2304 | * Helper function for call_rcu() and friends. The cpu argument will |
2291 | * normally be -1, indicating "currently running CPU". It may specify | 2305 | * normally be -1, indicating "currently running CPU". It may specify |
2292 | * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() | 2306 | * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() |
@@ -2300,7 +2314,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
2300 | struct rcu_data *rdp; | 2314 | struct rcu_data *rdp; |
2301 | 2315 | ||
2302 | WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ | 2316 | WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ |
2303 | debug_rcu_head_queue(head); | 2317 | if (debug_rcu_head_queue(head)) { |
2318 | /* Probable double call_rcu(), so leak the callback. */ | ||
2319 | ACCESS_ONCE(head->func) = rcu_leak_callback; | ||
2320 | WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); | ||
2321 | return; | ||
2322 | } | ||
2304 | head->func = func; | 2323 | head->func = func; |
2305 | head->next = NULL; | 2324 | head->next = NULL; |
2306 | 2325 | ||
@@ -2720,7 +2739,7 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) | |||
2720 | * Helper function for _rcu_barrier() tracing. If tracing is disabled, | 2739 | * Helper function for _rcu_barrier() tracing. If tracing is disabled, |
2721 | * the compiler is expected to optimize this away. | 2740 | * the compiler is expected to optimize this away. |
2722 | */ | 2741 | */ |
2723 | static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, | 2742 | static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s, |
2724 | int cpu, unsigned long done) | 2743 | int cpu, unsigned long done) |
2725 | { | 2744 | { |
2726 | trace_rcu_barrier(rsp->name, s, cpu, | 2745 | trace_rcu_barrier(rsp->name, s, cpu, |
@@ -2785,9 +2804,20 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
2785 | * transition. The "if" expression below therefore rounds the old | 2804 | * transition. The "if" expression below therefore rounds the old |
2786 | * value up to the next even number and adds two before comparing. | 2805 | * value up to the next even number and adds two before comparing. |
2787 | */ | 2806 | */ |
2788 | snap_done = ACCESS_ONCE(rsp->n_barrier_done); | 2807 | snap_done = rsp->n_barrier_done; |
2789 | _rcu_barrier_trace(rsp, "Check", -1, snap_done); | 2808 | _rcu_barrier_trace(rsp, "Check", -1, snap_done); |
2790 | if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { | 2809 | |
2810 | /* | ||
2811 | * If the value in snap is odd, we needed to wait for the current | ||
2812 | * rcu_barrier() to complete, then wait for the next one, in other | ||
2813 | * words, we need the value of snap_done to be three larger than | ||
2814 | * the value of snap. On the other hand, if the value in snap is | ||
2815 | * even, we only had to wait for the next rcu_barrier() to complete, | ||
2816 | * in other words, we need the value of snap_done to be only two | ||
2817 | * greater than the value of snap. The "(snap + 3) & ~0x1" computes | ||
2818 | * this for us (thank you, Linus!). | ||
2819 | */ | ||
2820 | if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) { | ||
2791 | _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); | 2821 | _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); |
2792 | smp_mb(); /* caller's subsequent code after above check. */ | 2822 | smp_mb(); /* caller's subsequent code after above check. */ |
2793 | mutex_unlock(&rsp->barrier_mutex); | 2823 | mutex_unlock(&rsp->barrier_mutex); |
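The reworked early-exit test above is easiest to verify with concrete numbers. rsp->n_barrier_done is bumped as an rcu_barrier() starts (making it odd) and again as it completes (making it even), so an even snapshot only needs the next completion (snap + 2), while an odd snapshot needs the in-flight barrier plus one more full one (snap + 3); "(snap + 3) & ~0x1" yields exactly those two targets. A tiny user-space check, purely illustrative:

#include <assert.h>

#define BARRIER_DONE_TARGET(snap)	(((snap) + 3) & ~0x1UL)

int main(void)
{
	assert(BARRIER_DONE_TARGET(4UL) == 6UL);	/* even snap: snap + 2 */
	assert(BARRIER_DONE_TARGET(5UL) == 8UL);	/* odd snap:  snap + 3 */
	return 0;
}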
@@ -2930,6 +2960,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
2930 | rdp->blimit = blimit; | 2960 | rdp->blimit = blimit; |
2931 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ | 2961 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ |
2932 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 2962 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
2963 | rcu_sysidle_init_percpu_data(rdp->dynticks); | ||
2933 | atomic_set(&rdp->dynticks->dynticks, | 2964 | atomic_set(&rdp->dynticks->dynticks, |
2934 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); | 2965 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); |
2935 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 2966 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
@@ -2952,7 +2983,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
2952 | rdp->completed = rnp->completed; | 2983 | rdp->completed = rnp->completed; |
2953 | rdp->passed_quiesce = 0; | 2984 | rdp->passed_quiesce = 0; |
2954 | rdp->qs_pending = 0; | 2985 | rdp->qs_pending = 0; |
2955 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); | 2986 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); |
2956 | } | 2987 | } |
2957 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ | 2988 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ |
2958 | rnp = rnp->parent; | 2989 | rnp = rnp->parent; |
@@ -2982,7 +3013,7 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
2982 | struct rcu_node *rnp = rdp->mynode; | 3013 | struct rcu_node *rnp = rdp->mynode; |
2983 | struct rcu_state *rsp; | 3014 | struct rcu_state *rsp; |
2984 | 3015 | ||
2985 | trace_rcu_utilization("Start CPU hotplug"); | 3016 | trace_rcu_utilization(TPS("Start CPU hotplug")); |
2986 | switch (action) { | 3017 | switch (action) { |
2987 | case CPU_UP_PREPARE: | 3018 | case CPU_UP_PREPARE: |
2988 | case CPU_UP_PREPARE_FROZEN: | 3019 | case CPU_UP_PREPARE_FROZEN: |
@@ -3011,7 +3042,26 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
3011 | default: | 3042 | default: |
3012 | break; | 3043 | break; |
3013 | } | 3044 | } |
3014 | trace_rcu_utilization("End CPU hotplug"); | 3045 | trace_rcu_utilization(TPS("End CPU hotplug")); |
3046 | return NOTIFY_OK; | ||
3047 | } | ||
3048 | |||
3049 | static int rcu_pm_notify(struct notifier_block *self, | ||
3050 | unsigned long action, void *hcpu) | ||
3051 | { | ||
3052 | switch (action) { | ||
3053 | case PM_HIBERNATION_PREPARE: | ||
3054 | case PM_SUSPEND_PREPARE: | ||
3055 | if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ | ||
3056 | rcu_expedited = 1; | ||
3057 | break; | ||
3058 | case PM_POST_HIBERNATION: | ||
3059 | case PM_POST_SUSPEND: | ||
3060 | rcu_expedited = 0; | ||
3061 | break; | ||
3062 | default: | ||
3063 | break; | ||
3064 | } | ||
3015 | return NOTIFY_OK; | 3065 | return NOTIFY_OK; |
3016 | } | 3066 | } |
3017 | 3067 | ||
@@ -3256,6 +3306,7 @@ void __init rcu_init(void) | |||
3256 | * or the scheduler are operational. | 3306 | * or the scheduler are operational. |
3257 | */ | 3307 | */ |
3258 | cpu_notifier(rcu_cpu_notify, 0); | 3308 | cpu_notifier(rcu_cpu_notify, 0); |
3309 | pm_notifier(rcu_pm_notify, 0); | ||
3259 | for_each_online_cpu(cpu) | 3310 | for_each_online_cpu(cpu) |
3260 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 3311 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); |
3261 | } | 3312 | } |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index b3832581043c..5f97eab602cd 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -88,6 +88,14 @@ struct rcu_dynticks { | |||
88 | /* Process level is worth LLONG_MAX/2. */ | 88 | /* Process level is worth LLONG_MAX/2. */ |
89 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ | 89 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
90 | atomic_t dynticks; /* Even value for idle, else odd. */ | 90 | atomic_t dynticks; /* Even value for idle, else odd. */ |
91 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
92 | long long dynticks_idle_nesting; | ||
93 | /* irq/process nesting level from idle. */ | ||
94 | atomic_t dynticks_idle; /* Even value for idle, else odd. */ | ||
95 | /* "Idle" excludes userspace execution. */ | ||
96 | unsigned long dynticks_idle_jiffies; | ||
97 | /* End of last non-NMI non-idle period. */ | ||
98 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
91 | #ifdef CONFIG_RCU_FAST_NO_HZ | 99 | #ifdef CONFIG_RCU_FAST_NO_HZ |
92 | bool all_lazy; /* Are all CPU's CBs lazy? */ | 100 | bool all_lazy; /* Are all CPU's CBs lazy? */ |
93 | unsigned long nonlazy_posted; | 101 | unsigned long nonlazy_posted; |
@@ -445,7 +453,7 @@ struct rcu_state { | |||
445 | /* for CPU stalls. */ | 453 | /* for CPU stalls. */ |
446 | unsigned long gp_max; /* Maximum GP duration in */ | 454 | unsigned long gp_max; /* Maximum GP duration in */ |
447 | /* jiffies. */ | 455 | /* jiffies. */ |
448 | char *name; /* Name of structure. */ | 456 | const char *name; /* Name of structure. */ |
449 | char abbr; /* Abbreviated name. */ | 457 | char abbr; /* Abbreviated name. */ |
450 | struct list_head flavors; /* List of RCU flavors. */ | 458 | struct list_head flavors; /* List of RCU flavors. */ |
451 | struct irq_work wakeup_work; /* Postponed wakeups */ | 459 | struct irq_work wakeup_work; /* Postponed wakeups */ |
@@ -545,6 +553,15 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); | |||
545 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); | 553 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); |
546 | static void rcu_kick_nohz_cpu(int cpu); | 554 | static void rcu_kick_nohz_cpu(int cpu); |
547 | static bool init_nocb_callback_list(struct rcu_data *rdp); | 555 | static bool init_nocb_callback_list(struct rcu_data *rdp); |
556 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); | ||
557 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); | ||
558 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
559 | unsigned long *maxj); | ||
560 | static bool is_sysidle_rcu_state(struct rcu_state *rsp); | ||
561 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
562 | unsigned long maxj); | ||
563 | static void rcu_bind_gp_kthread(void); | ||
564 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); | ||
548 | 565 | ||
549 | #endif /* #ifndef RCU_TREE_NONCORE */ | 566 | #endif /* #ifndef RCU_TREE_NONCORE */ |
550 | 567 | ||
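The dynticks_idle counter declared above follows the same discipline as the existing dynticks counter: even while the CPU is fully (system-)idle, odd otherwise, with full memory barriers around each increment so a remote observer that samples an even value, and later samples the same value again, knows the CPU stayed idle across the whole interval. A user-space model of that convention (names invented; C11 seq_cst atomics stand in for atomic_inc() bracketed by smp_mb__before/after_atomic_inc()):

#include <stdatomic.h>
#include <stdbool.h>

/* Even value: CPU fully idle.  Odd value: CPU busy.  Starts busy. */
static _Atomic unsigned long demo_dynticks_idle = 1;

static void demo_idle_enter(void)
{
	atomic_fetch_add(&demo_dynticks_idle, 1);	/* odd -> even */
}

static void demo_idle_exit(void)
{
	atomic_fetch_add(&demo_dynticks_idle, 1);	/* even -> odd */
}

/* Remote sampling rule used by the sysidle scan: an even snapshot means
 * "idle now"; the same even value seen twice brackets an idle interval. */
static bool snapshot_is_idle(unsigned long snap)
{
	return (snap & 0x1) == 0;
}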
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 769e12e3151b..130c97b027f2 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -28,7 +28,7 @@ | |||
28 | #include <linux/gfp.h> | 28 | #include <linux/gfp.h> |
29 | #include <linux/oom.h> | 29 | #include <linux/oom.h> |
30 | #include <linux/smpboot.h> | 30 | #include <linux/smpboot.h> |
31 | #include <linux/tick.h> | 31 | #include "time/tick-internal.h" |
32 | 32 | ||
33 | #define RCU_KTHREAD_PRIO 1 | 33 | #define RCU_KTHREAD_PRIO 1 |
34 | 34 | ||
@@ -110,9 +110,7 @@ static void __init rcu_bootup_announce_oddness(void) | |||
110 | 110 | ||
111 | #ifdef CONFIG_TREE_PREEMPT_RCU | 111 | #ifdef CONFIG_TREE_PREEMPT_RCU |
112 | 112 | ||
113 | struct rcu_state rcu_preempt_state = | 113 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); |
114 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); | ||
115 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); | ||
116 | static struct rcu_state *rcu_state = &rcu_preempt_state; | 114 | static struct rcu_state *rcu_state = &rcu_preempt_state; |
117 | 115 | ||
118 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); | 116 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); |
@@ -169,7 +167,7 @@ static void rcu_preempt_qs(int cpu) | |||
169 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); | 167 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); |
170 | 168 | ||
171 | if (rdp->passed_quiesce == 0) | 169 | if (rdp->passed_quiesce == 0) |
172 | trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); | 170 | trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); |
173 | rdp->passed_quiesce = 1; | 171 | rdp->passed_quiesce = 1; |
174 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | 172 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; |
175 | } | 173 | } |
@@ -388,7 +386,7 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
388 | np = rcu_next_node_entry(t, rnp); | 386 | np = rcu_next_node_entry(t, rnp); |
389 | list_del_init(&t->rcu_node_entry); | 387 | list_del_init(&t->rcu_node_entry); |
390 | t->rcu_blocked_node = NULL; | 388 | t->rcu_blocked_node = NULL; |
391 | trace_rcu_unlock_preempted_task("rcu_preempt", | 389 | trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), |
392 | rnp->gpnum, t->pid); | 390 | rnp->gpnum, t->pid); |
393 | if (&t->rcu_node_entry == rnp->gp_tasks) | 391 | if (&t->rcu_node_entry == rnp->gp_tasks) |
394 | rnp->gp_tasks = np; | 392 | rnp->gp_tasks = np; |
@@ -412,7 +410,7 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
412 | */ | 410 | */ |
413 | empty_exp_now = !rcu_preempted_readers_exp(rnp); | 411 | empty_exp_now = !rcu_preempted_readers_exp(rnp); |
414 | if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { | 412 | if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { |
415 | trace_rcu_quiescent_state_report("preempt_rcu", | 413 | trace_rcu_quiescent_state_report(TPS("preempt_rcu"), |
416 | rnp->gpnum, | 414 | rnp->gpnum, |
417 | 0, rnp->qsmask, | 415 | 0, rnp->qsmask, |
418 | rnp->level, | 416 | rnp->level, |
@@ -1250,12 +1248,12 @@ static int rcu_boost_kthread(void *arg) | |||
1250 | int spincnt = 0; | 1248 | int spincnt = 0; |
1251 | int more2boost; | 1249 | int more2boost; |
1252 | 1250 | ||
1253 | trace_rcu_utilization("Start boost kthread@init"); | 1251 | trace_rcu_utilization(TPS("Start boost kthread@init")); |
1254 | for (;;) { | 1252 | for (;;) { |
1255 | rnp->boost_kthread_status = RCU_KTHREAD_WAITING; | 1253 | rnp->boost_kthread_status = RCU_KTHREAD_WAITING; |
1256 | trace_rcu_utilization("End boost kthread@rcu_wait"); | 1254 | trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); |
1257 | rcu_wait(rnp->boost_tasks || rnp->exp_tasks); | 1255 | rcu_wait(rnp->boost_tasks || rnp->exp_tasks); |
1258 | trace_rcu_utilization("Start boost kthread@rcu_wait"); | 1256 | trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); |
1259 | rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; | 1257 | rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; |
1260 | more2boost = rcu_boost(rnp); | 1258 | more2boost = rcu_boost(rnp); |
1261 | if (more2boost) | 1259 | if (more2boost) |
@@ -1264,14 +1262,14 @@ static int rcu_boost_kthread(void *arg) | |||
1264 | spincnt = 0; | 1262 | spincnt = 0; |
1265 | if (spincnt > 10) { | 1263 | if (spincnt > 10) { |
1266 | rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; | 1264 | rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; |
1267 | trace_rcu_utilization("End boost kthread@rcu_yield"); | 1265 | trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); |
1268 | schedule_timeout_interruptible(2); | 1266 | schedule_timeout_interruptible(2); |
1269 | trace_rcu_utilization("Start boost kthread@rcu_yield"); | 1267 | trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); |
1270 | spincnt = 0; | 1268 | spincnt = 0; |
1271 | } | 1269 | } |
1272 | } | 1270 | } |
1273 | /* NOTREACHED */ | 1271 | /* NOTREACHED */ |
1274 | trace_rcu_utilization("End boost kthread@notreached"); | 1272 | trace_rcu_utilization(TPS("End boost kthread@notreached")); |
1275 | return 0; | 1273 | return 0; |
1276 | } | 1274 | } |
1277 | 1275 | ||
@@ -1419,7 +1417,7 @@ static void rcu_cpu_kthread(unsigned int cpu) | |||
1419 | int spincnt; | 1417 | int spincnt; |
1420 | 1418 | ||
1421 | for (spincnt = 0; spincnt < 10; spincnt++) { | 1419 | for (spincnt = 0; spincnt < 10; spincnt++) { |
1422 | trace_rcu_utilization("Start CPU kthread@rcu_wait"); | 1420 | trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); |
1423 | local_bh_disable(); | 1421 | local_bh_disable(); |
1424 | *statusp = RCU_KTHREAD_RUNNING; | 1422 | *statusp = RCU_KTHREAD_RUNNING; |
1425 | this_cpu_inc(rcu_cpu_kthread_loops); | 1423 | this_cpu_inc(rcu_cpu_kthread_loops); |
@@ -1431,15 +1429,15 @@ static void rcu_cpu_kthread(unsigned int cpu) | |||
1431 | rcu_kthread_do_work(); | 1429 | rcu_kthread_do_work(); |
1432 | local_bh_enable(); | 1430 | local_bh_enable(); |
1433 | if (*workp == 0) { | 1431 | if (*workp == 0) { |
1434 | trace_rcu_utilization("End CPU kthread@rcu_wait"); | 1432 | trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); |
1435 | *statusp = RCU_KTHREAD_WAITING; | 1433 | *statusp = RCU_KTHREAD_WAITING; |
1436 | return; | 1434 | return; |
1437 | } | 1435 | } |
1438 | } | 1436 | } |
1439 | *statusp = RCU_KTHREAD_YIELDING; | 1437 | *statusp = RCU_KTHREAD_YIELDING; |
1440 | trace_rcu_utilization("Start CPU kthread@rcu_yield"); | 1438 | trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); |
1441 | schedule_timeout_interruptible(2); | 1439 | schedule_timeout_interruptible(2); |
1442 | trace_rcu_utilization("End CPU kthread@rcu_yield"); | 1440 | trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); |
1443 | *statusp = RCU_KTHREAD_WAITING; | 1441 | *statusp = RCU_KTHREAD_WAITING; |
1444 | } | 1442 | } |
1445 | 1443 | ||
@@ -2202,7 +2200,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
2202 | * Wait for the grace period. Do so interruptibly to avoid messing | 2200 | * Wait for the grace period. Do so interruptibly to avoid messing |
2203 | * up the load average. | 2201 | * up the load average. |
2204 | */ | 2202 | */ |
2205 | trace_rcu_future_gp(rnp, rdp, c, "StartWait"); | 2203 | trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); |
2206 | for (;;) { | 2204 | for (;;) { |
2207 | wait_event_interruptible( | 2205 | wait_event_interruptible( |
2208 | rnp->nocb_gp_wq[c & 0x1], | 2206 | rnp->nocb_gp_wq[c & 0x1], |
@@ -2210,9 +2208,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
2210 | if (likely(d)) | 2208 | if (likely(d)) |
2211 | break; | 2209 | break; |
2212 | flush_signals(current); | 2210 | flush_signals(current); |
2213 | trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); | 2211 | trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); |
2214 | } | 2212 | } |
2215 | trace_rcu_future_gp(rnp, rdp, c, "EndWait"); | 2213 | trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); |
2216 | smp_mb(); /* Ensure that CB invocation happens after GP end. */ | 2214 | smp_mb(); /* Ensure that CB invocation happens after GP end. */ |
2217 | } | 2215 | } |
2218 | 2216 | ||
@@ -2375,3 +2373,425 @@ static void rcu_kick_nohz_cpu(int cpu) | |||
2375 | smp_send_reschedule(cpu); | 2373 | smp_send_reschedule(cpu); |
2376 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ | 2374 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ |
2377 | } | 2375 | } |
2376 | |||
2377 | |||
2378 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
2379 | |||
2380 | /* | ||
2381 | * Define RCU flavor that holds sysidle state. This needs to be the | ||
2382 | * most active flavor of RCU. | ||
2383 | */ | ||
2384 | #ifdef CONFIG_PREEMPT_RCU | ||
2385 | static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; | ||
2386 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | ||
2387 | static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; | ||
2388 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | ||
2389 | |||
2390 | static int full_sysidle_state; /* Current system-idle state. */ | ||
2391 | #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ | ||
2392 | #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ | ||
2393 | #define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ | ||
2394 | #define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ | ||
2395 | #define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ | ||
2396 | |||
2397 | /* | ||
2398 | * Invoked to note exit from irq or task transition to idle. Note that | ||
2399 | * usermode execution does -not- count as idle here! After all, we want | ||
2400 | * to detect full-system idle states, not RCU quiescent states and grace | ||
2401 | * periods. The caller must have disabled interrupts. | ||
2402 | */ | ||
2403 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | ||
2404 | { | ||
2405 | unsigned long j; | ||
2406 | |||
2407 | /* Adjust nesting, check for fully idle. */ | ||
2408 | if (irq) { | ||
2409 | rdtp->dynticks_idle_nesting--; | ||
2410 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); | ||
2411 | if (rdtp->dynticks_idle_nesting != 0) | ||
2412 | return; /* Still not fully idle. */ | ||
2413 | } else { | ||
2414 | if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == | ||
2415 | DYNTICK_TASK_NEST_VALUE) { | ||
2416 | rdtp->dynticks_idle_nesting = 0; | ||
2417 | } else { | ||
2418 | rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; | ||
2419 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); | ||
2420 | return; /* Still not fully idle. */ | ||
2421 | } | ||
2422 | } | ||
2423 | |||
2424 | /* Record start of fully idle period. */ | ||
2425 | j = jiffies; | ||
2426 | ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; | ||
2427 | smp_mb__before_atomic_inc(); | ||
2428 | atomic_inc(&rdtp->dynticks_idle); | ||
2429 | smp_mb__after_atomic_inc(); | ||
2430 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); | ||
2431 | } | ||
2432 | |||
2433 | /* | ||
2434 | * Unconditionally force exit from full system-idle state. This is | ||
2435 | * invoked when a normal CPU exits idle, but must be called separately | ||
2436 | * for the timekeeping CPU (tick_do_timer_cpu). The reason for this | ||
2437 | * is that the timekeeping CPU is permitted to take scheduling-clock | ||
2438 | * interrupts while the system is in system-idle state, and of course | ||
2439 | * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock | ||
2440 | * interrupt from any other type of interrupt. | ||
2441 | */ | ||
2442 | void rcu_sysidle_force_exit(void) | ||
2443 | { | ||
2444 | int oldstate = ACCESS_ONCE(full_sysidle_state); | ||
2445 | int newoldstate; | ||
2446 | |||
2447 | /* | ||
2448 | * Each pass through the following loop attempts to exit full | ||
2449 | * system-idle state. If contention proves to be a problem, | ||
2450 | * a trylock-based contention tree could be used here. | ||
2451 | */ | ||
2452 | while (oldstate > RCU_SYSIDLE_SHORT) { | ||
2453 | newoldstate = cmpxchg(&full_sysidle_state, | ||
2454 | oldstate, RCU_SYSIDLE_NOT); | ||
2455 | if (oldstate == newoldstate && | ||
2456 | oldstate == RCU_SYSIDLE_FULL_NOTED) { | ||
2457 | rcu_kick_nohz_cpu(tick_do_timer_cpu); | ||
2458 | return; /* We cleared it, done! */ | ||
2459 | } | ||
2460 | oldstate = newoldstate; | ||
2461 | } | ||
2462 | smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ | ||
2463 | } | ||
2464 | |||
2465 | /* | ||
2466 | * Invoked to note entry to irq or task transition from idle. Note that | ||
2467 | * usermode execution does -not- count as idle here! The caller must | ||
2468 | * have disabled interrupts. | ||
2469 | */ | ||
2470 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | ||
2471 | { | ||
2472 | /* Adjust nesting, check for already non-idle. */ | ||
2473 | if (irq) { | ||
2474 | rdtp->dynticks_idle_nesting++; | ||
2475 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); | ||
2476 | if (rdtp->dynticks_idle_nesting != 1) | ||
2477 | return; /* Already non-idle. */ | ||
2478 | } else { | ||
2479 | /* | ||
2480 | * Allow for irq misnesting. Yes, it really is possible | ||
2481 | * to enter an irq handler then never leave it, and maybe | ||
2482 | * also vice versa. Handle both possibilities. | ||
2483 | */ | ||
2484 | if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { | ||
2485 | rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; | ||
2486 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); | ||
2487 | return; /* Already non-idle. */ | ||
2488 | } else { | ||
2489 | rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
2490 | } | ||
2491 | } | ||
2492 | |||
2493 | /* Record end of idle period. */ | ||
2494 | smp_mb__before_atomic_inc(); | ||
2495 | atomic_inc(&rdtp->dynticks_idle); | ||
2496 | smp_mb__after_atomic_inc(); | ||
2497 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); | ||
2498 | |||
2499 | /* | ||
2500 | * If we are the timekeeping CPU, we are permitted to be non-idle | ||
2501 | * during a system-idle state. This must be the case, because | ||
2502 | * the timekeeping CPU has to take scheduling-clock interrupts | ||
2503 | * during the time that the system is transitioning to full | ||
2504 | * system-idle state. This means that the timekeeping CPU must | ||
2505 | * invoke rcu_sysidle_force_exit() directly if it does anything | ||
2506 | * more than take a scheduling-clock interrupt. | ||
2507 | */ | ||
2508 | if (smp_processor_id() == tick_do_timer_cpu) | ||
2509 | return; | ||
2510 | |||
2511 | /* Update system-idle state: We are clearly no longer fully idle! */ | ||
2512 | rcu_sysidle_force_exit(); | ||
2513 | } | ||
2514 | |||
2515 | /* | ||
2516 | * Check to see if the current CPU is idle. Note that usermode execution | ||
2517 | * does not count as idle. The caller must have disabled interrupts. | ||
2518 | */ | ||
2519 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
2520 | unsigned long *maxj) | ||
2521 | { | ||
2522 | int cur; | ||
2523 | unsigned long j; | ||
2524 | struct rcu_dynticks *rdtp = rdp->dynticks; | ||
2525 | |||
2526 | /* | ||
2527 | * If some other CPU has already reported non-idle, if this is | ||
2528 | * not the flavor of RCU that tracks sysidle state, or if this | ||
2529 | * is an offline or the timekeeping CPU, nothing to do. | ||
2530 | */ | ||
2531 | if (!*isidle || rdp->rsp != rcu_sysidle_state || | ||
2532 | cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) | ||
2533 | return; | ||
2534 | if (rcu_gp_in_progress(rdp->rsp)) | ||
2535 | WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); | ||
2536 | |||
2537 | /* Pick up current idle and NMI-nesting counter and check. */ | ||
2538 | cur = atomic_read(&rdtp->dynticks_idle); | ||
2539 | if (cur & 0x1) { | ||
2540 | *isidle = false; /* We are not idle! */ | ||
2541 | return; | ||
2542 | } | ||
2543 | smp_mb(); /* Read counters before timestamps. */ | ||
2544 | |||
2545 | /* Pick up timestamps. */ | ||
2546 | j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies); | ||
2547 | /* If this CPU entered idle more recently, update maxj timestamp. */ | ||
2548 | if (ULONG_CMP_LT(*maxj, j)) | ||
2549 | *maxj = j; | ||
2550 | } | ||
2551 | |||
2552 | /* | ||
2553 | * Is this the flavor of RCU that is handling full-system idle? | ||
2554 | */ | ||
2555 | static bool is_sysidle_rcu_state(struct rcu_state *rsp) | ||
2556 | { | ||
2557 | return rsp == rcu_sysidle_state; | ||
2558 | } | ||
2559 | |||
2560 | /* | ||
2561 | * Bind the grace-period kthread for the sysidle flavor of RCU to the | ||
2562 | * timekeeping CPU. | ||
2563 | */ | ||
2564 | static void rcu_bind_gp_kthread(void) | ||
2565 | { | ||
2566 | int cpu = ACCESS_ONCE(tick_do_timer_cpu); | ||
2567 | |||
2568 | if (cpu < 0 || cpu >= nr_cpu_ids) | ||
2569 | return; | ||
2570 | if (raw_smp_processor_id() != cpu) | ||
2571 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
2572 | } | ||
2573 | |||
2574 | /* | ||
2575 | * Return a delay in jiffies based on the number of CPUs, rcu_node | ||
2576 | * leaf fanout, and jiffies tick rate. The idea is to allow larger | ||
2577 | * systems more time to transition to full-idle state in order to | ||
2578 | * avoid the cache thrashing that otherwise occurs on the state variable. | ||
2579 | * Really small systems (less than a couple of tens of CPUs) should | ||
2580 | * instead use a single global atomically incremented counter, and later | ||
2581 | * versions of this will automatically reconfigure themselves accordingly. | ||
2582 | */ | ||
2583 | static unsigned long rcu_sysidle_delay(void) | ||
2584 | { | ||
2585 | if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) | ||
2586 | return 0; | ||
2587 | return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); | ||
2588 | } | ||
2589 | |||
2590 | /* | ||
2591 | * Advance the full-system-idle state. This is invoked when all of | ||
2592 | * the non-timekeeping CPUs are idle. | ||
2593 | */ | ||
2594 | static void rcu_sysidle(unsigned long j) | ||
2595 | { | ||
2596 | /* Check the current state. */ | ||
2597 | switch (ACCESS_ONCE(full_sysidle_state)) { | ||
2598 | case RCU_SYSIDLE_NOT: | ||
2599 | |||
2600 | /* First time all are idle, so note a short idle period. */ | ||
2601 | ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; | ||
2602 | break; | ||
2603 | |||
2604 | case RCU_SYSIDLE_SHORT: | ||
2605 | |||
2606 | /* | ||
2607 | * Idle for a bit, time to advance to next state? | ||
2608 | * cmpxchg failure means race with non-idle, let them win. | ||
2609 | */ | ||
2610 | if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) | ||
2611 | (void)cmpxchg(&full_sysidle_state, | ||
2612 | RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); | ||
2613 | break; | ||
2614 | |||
2615 | case RCU_SYSIDLE_LONG: | ||
2616 | |||
2617 | /* | ||
2618 | * Do an additional check pass before advancing to full. | ||
2619 | * cmpxchg failure means race with non-idle, let them win. | ||
2620 | */ | ||
2621 | if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) | ||
2622 | (void)cmpxchg(&full_sysidle_state, | ||
2623 | RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); | ||
2624 | break; | ||
2625 | |||
2626 | default: | ||
2627 | break; | ||
2628 | } | ||
2629 | } | ||
2630 | |||
2631 | /* | ||
2632 | * Found a non-idle non-timekeeping CPU, so kick the system-idle state | ||
2633 | * back to the beginning. | ||
2634 | */ | ||
2635 | static void rcu_sysidle_cancel(void) | ||
2636 | { | ||
2637 | smp_mb(); | ||
2638 | ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; | ||
2639 | } | ||
2640 | |||
2641 | /* | ||
2642 | * Update the sysidle state based on the results of a force-quiescent-state | ||
2643 | * scan of the CPUs' dyntick-idle state. | ||
2644 | */ | ||
2645 | static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, | ||
2646 | unsigned long maxj, bool gpkt) | ||
2647 | { | ||
2648 | if (rsp != rcu_sysidle_state) | ||
2649 | return; /* Wrong flavor, ignore. */ | ||
2650 | if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) | ||
2651 | return; /* Running state machine from timekeeping CPU. */ | ||
2652 | if (isidle) | ||
2653 | rcu_sysidle(maxj); /* More idle! */ | ||
2654 | else | ||
2655 | rcu_sysidle_cancel(); /* Idle is over. */ | ||
2656 | } | ||
2657 | |||
2658 | /* | ||
2659 | * Wrapper for rcu_sysidle_report() when called from the grace-period | ||
2660 | * kthread's context. | ||
2661 | */ | ||
2662 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
2663 | unsigned long maxj) | ||
2664 | { | ||
2665 | rcu_sysidle_report(rsp, isidle, maxj, true); | ||
2666 | } | ||
2667 | |||
2668 | /* Callback and function for forcing an RCU grace period. */ | ||
2669 | struct rcu_sysidle_head { | ||
2670 | struct rcu_head rh; | ||
2671 | int inuse; | ||
2672 | }; | ||
2673 | |||
2674 | static void rcu_sysidle_cb(struct rcu_head *rhp) | ||
2675 | { | ||
2676 | struct rcu_sysidle_head *rshp; | ||
2677 | |||
2678 | /* | ||
2679 | * The following memory barrier is needed to replace the | ||
2680 | * memory barriers that would normally be in the memory | ||
2681 | * allocator. | ||
2682 | */ | ||
2683 | smp_mb(); /* grace period precedes setting inuse. */ | ||
2684 | |||
2685 | rshp = container_of(rhp, struct rcu_sysidle_head, rh); | ||
2686 | ACCESS_ONCE(rshp->inuse) = 0; | ||
2687 | } | ||
2688 | |||
2689 | /* | ||
2690 | * Check to see if the system is fully idle, other than the timekeeping CPU. | ||
2691 | * The caller must have disabled interrupts. | ||
2692 | */ | ||
2693 | bool rcu_sys_is_idle(void) | ||
2694 | { | ||
2695 | static struct rcu_sysidle_head rsh; | ||
2696 | int rss = ACCESS_ONCE(full_sysidle_state); | ||
2697 | |||
2698 | if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) | ||
2699 | return false; | ||
2700 | |||
2701 | /* Handle small-system case by doing a full scan of CPUs. */ | ||
2702 | if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { | ||
2703 | int oldrss = rss - 1; | ||
2704 | |||
2705 | /* | ||
2706 | * One pass to advance to each state up to _FULL. | ||
2707 | * Give up if any pass fails to advance the state. | ||
2708 | */ | ||
2709 | while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { | ||
2710 | int cpu; | ||
2711 | bool isidle = true; | ||
2712 | unsigned long maxj = jiffies - ULONG_MAX / 4; | ||
2713 | struct rcu_data *rdp; | ||
2714 | |||
2715 | /* Scan all the CPUs looking for nonidle CPUs. */ | ||
2716 | for_each_possible_cpu(cpu) { | ||
2717 | rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); | ||
2718 | rcu_sysidle_check_cpu(rdp, &isidle, &maxj); | ||
2719 | if (!isidle) | ||
2720 | break; | ||
2721 | } | ||
2722 | rcu_sysidle_report(rcu_sysidle_state, | ||
2723 | isidle, maxj, false); | ||
2724 | oldrss = rss; | ||
2725 | rss = ACCESS_ONCE(full_sysidle_state); | ||
2726 | } | ||
2727 | } | ||
2728 | |||
2729 | /* If this is the first observation of an idle period, record it. */ | ||
2730 | if (rss == RCU_SYSIDLE_FULL) { | ||
2731 | rss = cmpxchg(&full_sysidle_state, | ||
2732 | RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); | ||
2733 | return rss == RCU_SYSIDLE_FULL; | ||
2734 | } | ||
2735 | |||
2736 | smp_mb(); /* ensure rss load happens before later caller actions. */ | ||
2737 | |||
2738 | /* If already fully idle, tell the caller (in case of races). */ | ||
2739 | if (rss == RCU_SYSIDLE_FULL_NOTED) | ||
2740 | return true; | ||
2741 | |||
2742 | /* | ||
2743 | * If we aren't there yet, and a grace period is not in flight, | ||
2744 | * initiate a grace period. Either way, tell the caller that | ||
2745 | * we are not there yet. We use an xchg() rather than an assignment | ||
2746 | * to make up for the memory barriers that would otherwise be | ||
2747 | * provided by the memory allocator. | ||
2748 | */ | ||
2749 | if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && | ||
2750 | !rcu_gp_in_progress(rcu_sysidle_state) && | ||
2751 | !rsh.inuse && xchg(&rsh.inuse, 1) == 0) | ||
2752 | call_rcu(&rsh.rh, rcu_sysidle_cb); | ||
2753 | return false; | ||
2754 | } | ||
2755 | |||
2756 | /* | ||
2757 | * Initialize dynticks sysidle state for CPUs coming online. | ||
2758 | */ | ||
2759 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) | ||
2760 | { | ||
2761 | rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; | ||
2762 | } | ||
2763 | |||
2764 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
2765 | |||
2766 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | ||
2767 | { | ||
2768 | } | ||
2769 | |||
2770 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | ||
2771 | { | ||
2772 | } | ||
2773 | |||
2774 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
2775 | unsigned long *maxj) | ||
2776 | { | ||
2777 | } | ||
2778 | |||
2779 | static bool is_sysidle_rcu_state(struct rcu_state *rsp) | ||
2780 | { | ||
2781 | return false; | ||
2782 | } | ||
2783 | |||
2784 | static void rcu_bind_gp_kthread(void) | ||
2785 | { | ||
2786 | } | ||
2787 | |||
2788 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
2789 | unsigned long maxj) | ||
2790 | { | ||
2791 | } | ||
2792 | |||
2793 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) | ||
2794 | { | ||
2795 | } | ||
2796 | |||
2797 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
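
The new CONFIG_NO_HZ_FULL_SYSIDLE code above is built around a small state machine over full_sysidle_state (NOT → SHORT → LONG → FULL → FULL_NOTED), advanced with cmpxchg() so that a racing non-idle CPU always wins and can yank the state back to RCU_SYSIDLE_NOT. The following user-space sketch is an illustration only, not the kernel code: it models cmpxchg() with C11 atomic_compare_exchange_strong() and leaves out the rcu_sysidle_delay() timing, the timekeeping-CPU special case and the explicit memory barriers.

```c
#include <stdatomic.h>
#include <stdio.h>

enum { SYSIDLE_NOT, SYSIDLE_SHORT, SYSIDLE_LONG, SYSIDLE_FULL, SYSIDLE_FULL_NOTED };

static atomic_int sysidle_state = SYSIDLE_NOT;

/* Advance one step per all-idle scan; a failed CAS means a non-idle CPU won. */
static void sysidle_advance(void)
{
	int cur = atomic_load(&sysidle_state);

	if (cur == SYSIDLE_NOT) {
		atomic_store(&sysidle_state, SYSIDLE_SHORT);
	} else if (cur == SYSIDLE_SHORT || cur == SYSIDLE_LONG) {
		int next = cur + 1;	/* SHORT -> LONG or LONG -> FULL */

		atomic_compare_exchange_strong(&sysidle_state, &cur, next);
	}
}

/* A CPU went non-idle: force the state back to NOT, whatever it was. */
static void sysidle_force_exit(void)
{
	int cur = atomic_load(&sysidle_state);

	while (cur > SYSIDLE_SHORT) {
		if (atomic_compare_exchange_strong(&sysidle_state, &cur,
						   SYSIDLE_NOT))
			return;		/* we cleared it, done */
		/* cur was reloaded by the failed CAS; retry. */
	}
	/* Already NOT or SHORT: the next scan simply starts over. */
}

int main(void)
{
	sysidle_advance();	/* NOT   -> SHORT */
	sysidle_advance();	/* SHORT -> LONG  */
	sysidle_advance();	/* LONG  -> FULL  */
	printf("after three all-idle scans: %d\n", atomic_load(&sysidle_state));

	sysidle_force_exit();	/* some CPU woke up */
	printf("after force exit: %d\n", atomic_load(&sysidle_state));
	return 0;
}
```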
diff --git a/kernel/reboot.c b/kernel/reboot.c index 269ed9384cc4..f813b3474646 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c | |||
@@ -32,7 +32,14 @@ EXPORT_SYMBOL(cad_pid); | |||
32 | #endif | 32 | #endif |
33 | enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; | 33 | enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; |
34 | 34 | ||
35 | int reboot_default; | 35 | /* |
36 | * This variable is used privately to keep track of whether or not | ||
37 | * reboot_type is still set to its default value (i.e., reboot= hasn't | ||
38 | * been set on the command line). This is needed so that we can | ||
39 | * suppress DMI scanning for reboot quirks. Without it, it's | ||
40 | * impossible to override a faulty reboot quirk without recompiling. | ||
41 | */ | ||
42 | int reboot_default = 1; | ||
36 | int reboot_cpu; | 43 | int reboot_cpu; |
37 | enum reboot_type reboot_type = BOOT_ACPI; | 44 | enum reboot_type reboot_type = BOOT_ACPI; |
38 | int reboot_force; | 45 | int reboot_force; |
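
The comment added to reboot_default documents why it now starts out as 1: DMI reboot quirks may only override reboot_type while the user has not passed reboot= on the command line. A minimal sketch of that gating (illustration only; parse_reboot_cmdline() and apply_dmi_reboot_quirk() are hypothetical names standing in for the real command-line and DMI paths):

```c
#include <stdio.h>

static int reboot_default = 1;	/* cleared once "reboot=" is parsed */
static int reboot_type = 'a';	/* stands in for BOOT_ACPI */

/* Hypothetical command-line hook: an explicit reboot= wins over quirks. */
static void parse_reboot_cmdline(int type)
{
	reboot_type = type;
	reboot_default = 0;
}

/* Hypothetical DMI-quirk hook: only applied while the default is in force. */
static void apply_dmi_reboot_quirk(int quirk_type)
{
	if (!reboot_default)
		return;		/* user override: leave reboot_type alone */
	reboot_type = quirk_type;
}

int main(void)
{
	parse_reboot_cmdline('b');	/* user asked for a specific method */
	apply_dmi_reboot_quirk('k');	/* faulty quirk is now ignored */
	printf("reboot_type = %c\n", reboot_type);
	return 0;
}
```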
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index ff55247e7049..4aa8a305aede 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
@@ -17,8 +17,8 @@ | |||
17 | void res_counter_init(struct res_counter *counter, struct res_counter *parent) | 17 | void res_counter_init(struct res_counter *counter, struct res_counter *parent) |
18 | { | 18 | { |
19 | spin_lock_init(&counter->lock); | 19 | spin_lock_init(&counter->lock); |
20 | counter->limit = RESOURCE_MAX; | 20 | counter->limit = RES_COUNTER_MAX; |
21 | counter->soft_limit = RESOURCE_MAX; | 21 | counter->soft_limit = RES_COUNTER_MAX; |
22 | counter->parent = parent; | 22 | counter->parent = parent; |
23 | } | 23 | } |
24 | 24 | ||
@@ -178,23 +178,30 @@ u64 res_counter_read_u64(struct res_counter *counter, int member) | |||
178 | #endif | 178 | #endif |
179 | 179 | ||
180 | int res_counter_memparse_write_strategy(const char *buf, | 180 | int res_counter_memparse_write_strategy(const char *buf, |
181 | unsigned long long *res) | 181 | unsigned long long *resp) |
182 | { | 182 | { |
183 | char *end; | 183 | char *end; |
184 | unsigned long long res; | ||
184 | 185 | ||
185 | /* return RESOURCE_MAX(unlimited) if "-1" is specified */ | 186 | /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ |
186 | if (*buf == '-') { | 187 | if (*buf == '-') { |
187 | *res = simple_strtoull(buf + 1, &end, 10); | 188 | res = simple_strtoull(buf + 1, &end, 10); |
188 | if (*res != 1 || *end != '\0') | 189 | if (res != 1 || *end != '\0') |
189 | return -EINVAL; | 190 | return -EINVAL; |
190 | *res = RESOURCE_MAX; | 191 | *resp = RES_COUNTER_MAX; |
191 | return 0; | 192 | return 0; |
192 | } | 193 | } |
193 | 194 | ||
194 | *res = memparse(buf, &end); | 195 | res = memparse(buf, &end); |
195 | if (*end != '\0') | 196 | if (*end != '\0') |
196 | return -EINVAL; | 197 | return -EINVAL; |
197 | 198 | ||
198 | *res = PAGE_ALIGN(*res); | 199 | if (PAGE_ALIGN(res) >= res) |
200 | res = PAGE_ALIGN(res); | ||
201 | else | ||
202 | res = RES_COUNTER_MAX; | ||
203 | |||
204 | *resp = res; | ||
205 | |||
199 | return 0; | 206 | return 0; |
200 | } | 207 | } |
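
The res_counter_memparse_write_strategy() change works on a local value and only writes *resp on success, and it clamps values so close to the maximum that PAGE_ALIGN() would wrap them to zero. A user-space sketch of the same parse-and-clamp logic (illustration only: strtoull() stands in for memparse(), so K/M/G suffixes are not handled, and PAGE_SIZE is assumed to be 4096):

```c
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE	4096ULL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define RES_COUNTER_MAX	(~0ULL)

/* Parse a memory limit: "-1" means unlimited, anything else is page-aligned,
 * and a value so large that aligning it would wrap is clamped instead. */
static int parse_limit(const char *buf, unsigned long long *resp)
{
	char *end;
	unsigned long long res;

	if (*buf == '-') {
		res = strtoull(buf + 1, &end, 10);
		if (res != 1 || *end != '\0')
			return -EINVAL;
		*resp = RES_COUNTER_MAX;
		return 0;
	}

	res = strtoull(buf, &end, 10);
	if (*end != '\0')
		return -EINVAL;

	/* PAGE_ALIGN() wraps to 0 near the top of the range; clamp instead. */
	if (PAGE_ALIGN(res) >= res)
		res = PAGE_ALIGN(res);
	else
		res = RES_COUNTER_MAX;

	*resp = res;
	return 0;
}

int main(void)
{
	unsigned long long v;

	parse_limit("-1", &v);
	printf("unlimited: %llu\n", v);
	parse_limit("5000", &v);
	printf("5000 -> %llu\n", v);		/* rounded up to 8192 */
	parse_limit("18446744073709551615", &v);
	printf("near-max -> %llu\n", v);	/* clamped, not wrapped to 0 */
	return 0;
}
```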
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 05c39f030314..5ac63c9a995a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -978,13 +978,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
978 | rq->skip_clock_update = 1; | 978 | rq->skip_clock_update = 1; |
979 | } | 979 | } |
980 | 980 | ||
981 | static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); | ||
982 | |||
983 | void register_task_migration_notifier(struct notifier_block *n) | ||
984 | { | ||
985 | atomic_notifier_chain_register(&task_migration_notifier, n); | ||
986 | } | ||
987 | |||
988 | #ifdef CONFIG_SMP | 981 | #ifdef CONFIG_SMP |
989 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 982 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
990 | { | 983 | { |
@@ -1015,18 +1008,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1015 | trace_sched_migrate_task(p, new_cpu); | 1008 | trace_sched_migrate_task(p, new_cpu); |
1016 | 1009 | ||
1017 | if (task_cpu(p) != new_cpu) { | 1010 | if (task_cpu(p) != new_cpu) { |
1018 | struct task_migration_notifier tmn; | ||
1019 | |||
1020 | if (p->sched_class->migrate_task_rq) | 1011 | if (p->sched_class->migrate_task_rq) |
1021 | p->sched_class->migrate_task_rq(p, new_cpu); | 1012 | p->sched_class->migrate_task_rq(p, new_cpu); |
1022 | p->se.nr_migrations++; | 1013 | p->se.nr_migrations++; |
1023 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); | 1014 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); |
1024 | |||
1025 | tmn.task = p; | ||
1026 | tmn.from_cpu = task_cpu(p); | ||
1027 | tmn.to_cpu = new_cpu; | ||
1028 | |||
1029 | atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); | ||
1030 | } | 1015 | } |
1031 | 1016 | ||
1032 | __set_task_cpu(p, new_cpu); | 1017 | __set_task_cpu(p, new_cpu); |
@@ -2527,13 +2512,11 @@ void __sched schedule_preempt_disabled(void) | |||
2527 | */ | 2512 | */ |
2528 | asmlinkage void __sched notrace preempt_schedule(void) | 2513 | asmlinkage void __sched notrace preempt_schedule(void) |
2529 | { | 2514 | { |
2530 | struct thread_info *ti = current_thread_info(); | ||
2531 | |||
2532 | /* | 2515 | /* |
2533 | * If there is a non-zero preempt_count or interrupts are disabled, | 2516 | * If there is a non-zero preempt_count or interrupts are disabled, |
2534 | * we do not want to preempt the current task. Just return.. | 2517 | * we do not want to preempt the current task. Just return.. |
2535 | */ | 2518 | */ |
2536 | if (likely(ti->preempt_count || irqs_disabled())) | 2519 | if (likely(!preemptible())) |
2537 | return; | 2520 | return; |
2538 | 2521 | ||
2539 | do { | 2522 | do { |
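
preempt_schedule() now asks !preemptible() instead of open-coding the preempt_count/irqs_disabled test. A tiny user-space model of that early return (illustration only; the two inputs are faked variables, and the preemptible() shown here is just the rough condition the kernel macro expresses, not its definition):

```c
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the preempt count and the IRQ-disable state. */
static int fake_preempt_count;
static bool fake_irqs_disabled;

/* Roughly what preemptible() expresses: preemption enabled and IRQs on. */
static bool preemptible(void)
{
	return fake_preempt_count == 0 && !fake_irqs_disabled;
}

static void maybe_preempt_schedule(void)
{
	/* Mirrors the early return in preempt_schedule(). */
	if (!preemptible()) {
		puts("not preemptible: returning immediately");
		return;
	}
	puts("preemptible: would reschedule here");
}

int main(void)
{
	maybe_preempt_schedule();	/* preemptible */
	fake_preempt_count = 1;
	maybe_preempt_schedule();	/* blocked by preempt_count */
	fake_preempt_count = 0;
	fake_irqs_disabled = true;
	maybe_preempt_schedule();	/* blocked by disabled IRQs */
	return 0;
}
```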
@@ -2677,7 +2660,7 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | |||
2677 | if (unlikely(!q)) | 2660 | if (unlikely(!q)) |
2678 | return; | 2661 | return; |
2679 | 2662 | ||
2680 | if (unlikely(!nr_exclusive)) | 2663 | if (unlikely(nr_exclusive != 1)) |
2681 | wake_flags = 0; | 2664 | wake_flags = 0; |
2682 | 2665 | ||
2683 | spin_lock_irqsave(&q->lock, flags); | 2666 | spin_lock_irqsave(&q->lock, flags); |
@@ -4964,7 +4947,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
4964 | SD_BALANCE_FORK | | 4947 | SD_BALANCE_FORK | |
4965 | SD_BALANCE_EXEC | | 4948 | SD_BALANCE_EXEC | |
4966 | SD_SHARE_CPUPOWER | | 4949 | SD_SHARE_CPUPOWER | |
4967 | SD_SHARE_PKG_RESOURCES); | 4950 | SD_SHARE_PKG_RESOURCES | |
4951 | SD_PREFER_SIBLING); | ||
4968 | if (nr_node_ids == 1) | 4952 | if (nr_node_ids == 1) |
4969 | pflags &= ~SD_SERIALIZE; | 4953 | pflags &= ~SD_SERIALIZE; |
4970 | } | 4954 | } |
@@ -5133,18 +5117,23 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
5133 | * two cpus are in the same cache domain, see cpus_share_cache(). | 5117 | * two cpus are in the same cache domain, see cpus_share_cache(). |
5134 | */ | 5118 | */ |
5135 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | 5119 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); |
5120 | DEFINE_PER_CPU(int, sd_llc_size); | ||
5136 | DEFINE_PER_CPU(int, sd_llc_id); | 5121 | DEFINE_PER_CPU(int, sd_llc_id); |
5137 | 5122 | ||
5138 | static void update_top_cache_domain(int cpu) | 5123 | static void update_top_cache_domain(int cpu) |
5139 | { | 5124 | { |
5140 | struct sched_domain *sd; | 5125 | struct sched_domain *sd; |
5141 | int id = cpu; | 5126 | int id = cpu; |
5127 | int size = 1; | ||
5142 | 5128 | ||
5143 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); | 5129 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); |
5144 | if (sd) | 5130 | if (sd) { |
5145 | id = cpumask_first(sched_domain_span(sd)); | 5131 | id = cpumask_first(sched_domain_span(sd)); |
5132 | size = cpumask_weight(sched_domain_span(sd)); | ||
5133 | } | ||
5146 | 5134 | ||
5147 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | 5135 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); |
5136 | per_cpu(sd_llc_size, cpu) = size; | ||
5148 | per_cpu(sd_llc_id, cpu) = id; | 5137 | per_cpu(sd_llc_id, cpu) = id; |
5149 | } | 5138 | } |
5150 | 5139 | ||
@@ -5168,6 +5157,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
5168 | tmp->parent = parent->parent; | 5157 | tmp->parent = parent->parent; |
5169 | if (parent->parent) | 5158 | if (parent->parent) |
5170 | parent->parent->child = tmp; | 5159 | parent->parent->child = tmp; |
5160 | /* | ||
5161 | * Transfer SD_PREFER_SIBLING down in case of a | ||
5162 | * degenerate parent; the spans match for this | ||
5163 | * so the property transfers. | ||
5164 | */ | ||
5165 | if (parent->flags & SD_PREFER_SIBLING) | ||
5166 | tmp->flags |= SD_PREFER_SIBLING; | ||
5171 | destroy_sched_domain(parent, cpu); | 5167 | destroy_sched_domain(parent, cpu); |
5172 | } else | 5168 | } else |
5173 | tmp = tmp->parent; | 5169 | tmp = tmp->parent; |
@@ -6234,8 +6230,9 @@ match1: | |||
6234 | ; | 6230 | ; |
6235 | } | 6231 | } |
6236 | 6232 | ||
6233 | n = ndoms_cur; | ||
6237 | if (doms_new == NULL) { | 6234 | if (doms_new == NULL) { |
6238 | ndoms_cur = 0; | 6235 | n = 0; |
6239 | doms_new = &fallback_doms; | 6236 | doms_new = &fallback_doms; |
6240 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); | 6237 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); |
6241 | WARN_ON_ONCE(dattr_new); | 6238 | WARN_ON_ONCE(dattr_new); |
@@ -6243,7 +6240,7 @@ match1: | |||
6243 | 6240 | ||
6244 | /* Build new domains */ | 6241 | /* Build new domains */ |
6245 | for (i = 0; i < ndoms_new; i++) { | 6242 | for (i = 0; i < ndoms_new; i++) { |
6246 | for (j = 0; j < ndoms_cur && !new_topology; j++) { | 6243 | for (j = 0; j < n && !new_topology; j++) { |
6247 | if (cpumask_equal(doms_new[i], doms_cur[j]) | 6244 | if (cpumask_equal(doms_new[i], doms_cur[j]) |
6248 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | 6245 | && dattrs_equal(dattr_new, i, dattr_cur, j)) |
6249 | goto match2; | 6246 | goto match2; |
@@ -6815,7 +6812,7 @@ void sched_move_task(struct task_struct *tsk) | |||
6815 | if (unlikely(running)) | 6812 | if (unlikely(running)) |
6816 | tsk->sched_class->put_prev_task(rq, tsk); | 6813 | tsk->sched_class->put_prev_task(rq, tsk); |
6817 | 6814 | ||
6818 | tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, | 6815 | tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, |
6819 | lockdep_is_held(&tsk->sighand->siglock)), | 6816 | lockdep_is_held(&tsk->sighand->siglock)), |
6820 | struct task_group, css); | 6817 | struct task_group, css); |
6821 | tg = autogroup_task_group(tsk, tg); | 6818 | tg = autogroup_task_group(tsk, tg); |
@@ -7137,23 +7134,22 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
7137 | 7134 | ||
7138 | #ifdef CONFIG_CGROUP_SCHED | 7135 | #ifdef CONFIG_CGROUP_SCHED |
7139 | 7136 | ||
7140 | /* return corresponding task_group object of a cgroup */ | 7137 | static inline struct task_group *css_tg(struct cgroup_subsys_state *css) |
7141 | static inline struct task_group *cgroup_tg(struct cgroup *cgrp) | ||
7142 | { | 7138 | { |
7143 | return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), | 7139 | return css ? container_of(css, struct task_group, css) : NULL; |
7144 | struct task_group, css); | ||
7145 | } | 7140 | } |
7146 | 7141 | ||
7147 | static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) | 7142 | static struct cgroup_subsys_state * |
7143 | cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | ||
7148 | { | 7144 | { |
7149 | struct task_group *tg, *parent; | 7145 | struct task_group *parent = css_tg(parent_css); |
7146 | struct task_group *tg; | ||
7150 | 7147 | ||
7151 | if (!cgrp->parent) { | 7148 | if (!parent) { |
7152 | /* This is early initialization for the top cgroup */ | 7149 | /* This is early initialization for the top cgroup */ |
7153 | return &root_task_group.css; | 7150 | return &root_task_group.css; |
7154 | } | 7151 | } |
7155 | 7152 | ||
7156 | parent = cgroup_tg(cgrp->parent); | ||
7157 | tg = sched_create_group(parent); | 7153 | tg = sched_create_group(parent); |
7158 | if (IS_ERR(tg)) | 7154 | if (IS_ERR(tg)) |
7159 | return ERR_PTR(-ENOMEM); | 7155 | return ERR_PTR(-ENOMEM); |
@@ -7161,41 +7157,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) | |||
7161 | return &tg->css; | 7157 | return &tg->css; |
7162 | } | 7158 | } |
7163 | 7159 | ||
7164 | static int cpu_cgroup_css_online(struct cgroup *cgrp) | 7160 | static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) |
7165 | { | 7161 | { |
7166 | struct task_group *tg = cgroup_tg(cgrp); | 7162 | struct task_group *tg = css_tg(css); |
7167 | struct task_group *parent; | 7163 | struct task_group *parent = css_tg(css_parent(css)); |
7168 | 7164 | ||
7169 | if (!cgrp->parent) | 7165 | if (parent) |
7170 | return 0; | 7166 | sched_online_group(tg, parent); |
7171 | |||
7172 | parent = cgroup_tg(cgrp->parent); | ||
7173 | sched_online_group(tg, parent); | ||
7174 | return 0; | 7167 | return 0; |
7175 | } | 7168 | } |
7176 | 7169 | ||
7177 | static void cpu_cgroup_css_free(struct cgroup *cgrp) | 7170 | static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) |
7178 | { | 7171 | { |
7179 | struct task_group *tg = cgroup_tg(cgrp); | 7172 | struct task_group *tg = css_tg(css); |
7180 | 7173 | ||
7181 | sched_destroy_group(tg); | 7174 | sched_destroy_group(tg); |
7182 | } | 7175 | } |
7183 | 7176 | ||
7184 | static void cpu_cgroup_css_offline(struct cgroup *cgrp) | 7177 | static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) |
7185 | { | 7178 | { |
7186 | struct task_group *tg = cgroup_tg(cgrp); | 7179 | struct task_group *tg = css_tg(css); |
7187 | 7180 | ||
7188 | sched_offline_group(tg); | 7181 | sched_offline_group(tg); |
7189 | } | 7182 | } |
7190 | 7183 | ||
7191 | static int cpu_cgroup_can_attach(struct cgroup *cgrp, | 7184 | static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, |
7192 | struct cgroup_taskset *tset) | 7185 | struct cgroup_taskset *tset) |
7193 | { | 7186 | { |
7194 | struct task_struct *task; | 7187 | struct task_struct *task; |
7195 | 7188 | ||
7196 | cgroup_taskset_for_each(task, cgrp, tset) { | 7189 | cgroup_taskset_for_each(task, css, tset) { |
7197 | #ifdef CONFIG_RT_GROUP_SCHED | 7190 | #ifdef CONFIG_RT_GROUP_SCHED |
7198 | if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) | 7191 | if (!sched_rt_can_attach(css_tg(css), task)) |
7199 | return -EINVAL; | 7192 | return -EINVAL; |
7200 | #else | 7193 | #else |
7201 | /* We don't support RT-tasks being in separate groups */ | 7194 | /* We don't support RT-tasks being in separate groups */ |
@@ -7206,18 +7199,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp, | |||
7206 | return 0; | 7199 | return 0; |
7207 | } | 7200 | } |
7208 | 7201 | ||
7209 | static void cpu_cgroup_attach(struct cgroup *cgrp, | 7202 | static void cpu_cgroup_attach(struct cgroup_subsys_state *css, |
7210 | struct cgroup_taskset *tset) | 7203 | struct cgroup_taskset *tset) |
7211 | { | 7204 | { |
7212 | struct task_struct *task; | 7205 | struct task_struct *task; |
7213 | 7206 | ||
7214 | cgroup_taskset_for_each(task, cgrp, tset) | 7207 | cgroup_taskset_for_each(task, css, tset) |
7215 | sched_move_task(task); | 7208 | sched_move_task(task); |
7216 | } | 7209 | } |
7217 | 7210 | ||
7218 | static void | 7211 | static void cpu_cgroup_exit(struct cgroup_subsys_state *css, |
7219 | cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, | 7212 | struct cgroup_subsys_state *old_css, |
7220 | struct task_struct *task) | 7213 | struct task_struct *task) |
7221 | { | 7214 | { |
7222 | /* | 7215 | /* |
7223 | * cgroup_exit() is called in the copy_process() failure path. | 7216 | * cgroup_exit() is called in the copy_process() failure path. |
@@ -7231,15 +7224,16 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, | |||
7231 | } | 7224 | } |
7232 | 7225 | ||
7233 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7226 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7234 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 7227 | static int cpu_shares_write_u64(struct cgroup_subsys_state *css, |
7235 | u64 shareval) | 7228 | struct cftype *cftype, u64 shareval) |
7236 | { | 7229 | { |
7237 | return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); | 7230 | return sched_group_set_shares(css_tg(css), scale_load(shareval)); |
7238 | } | 7231 | } |
7239 | 7232 | ||
7240 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | 7233 | static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, |
7234 | struct cftype *cft) | ||
7241 | { | 7235 | { |
7242 | struct task_group *tg = cgroup_tg(cgrp); | 7236 | struct task_group *tg = css_tg(css); |
7243 | 7237 | ||
7244 | return (u64) scale_load_down(tg->shares); | 7238 | return (u64) scale_load_down(tg->shares); |
7245 | } | 7239 | } |
@@ -7361,26 +7355,28 @@ long tg_get_cfs_period(struct task_group *tg) | |||
7361 | return cfs_period_us; | 7355 | return cfs_period_us; |
7362 | } | 7356 | } |
7363 | 7357 | ||
7364 | static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) | 7358 | static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, |
7359 | struct cftype *cft) | ||
7365 | { | 7360 | { |
7366 | return tg_get_cfs_quota(cgroup_tg(cgrp)); | 7361 | return tg_get_cfs_quota(css_tg(css)); |
7367 | } | 7362 | } |
7368 | 7363 | ||
7369 | static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, | 7364 | static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, |
7370 | s64 cfs_quota_us) | 7365 | struct cftype *cftype, s64 cfs_quota_us) |
7371 | { | 7366 | { |
7372 | return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); | 7367 | return tg_set_cfs_quota(css_tg(css), cfs_quota_us); |
7373 | } | 7368 | } |
7374 | 7369 | ||
7375 | static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) | 7370 | static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, |
7371 | struct cftype *cft) | ||
7376 | { | 7372 | { |
7377 | return tg_get_cfs_period(cgroup_tg(cgrp)); | 7373 | return tg_get_cfs_period(css_tg(css)); |
7378 | } | 7374 | } |
7379 | 7375 | ||
7380 | static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 7376 | static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, |
7381 | u64 cfs_period_us) | 7377 | struct cftype *cftype, u64 cfs_period_us) |
7382 | { | 7378 | { |
7383 | return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); | 7379 | return tg_set_cfs_period(css_tg(css), cfs_period_us); |
7384 | } | 7380 | } |
7385 | 7381 | ||
7386 | struct cfs_schedulable_data { | 7382 | struct cfs_schedulable_data { |
@@ -7461,10 +7457,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) | |||
7461 | return ret; | 7457 | return ret; |
7462 | } | 7458 | } |
7463 | 7459 | ||
7464 | static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, | 7460 | static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, |
7465 | struct cgroup_map_cb *cb) | 7461 | struct cgroup_map_cb *cb) |
7466 | { | 7462 | { |
7467 | struct task_group *tg = cgroup_tg(cgrp); | 7463 | struct task_group *tg = css_tg(css); |
7468 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; | 7464 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; |
7469 | 7465 | ||
7470 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); | 7466 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); |
@@ -7477,26 +7473,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, | |||
7477 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7473 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7478 | 7474 | ||
7479 | #ifdef CONFIG_RT_GROUP_SCHED | 7475 | #ifdef CONFIG_RT_GROUP_SCHED |
7480 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, | 7476 | static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, |
7481 | s64 val) | 7477 | struct cftype *cft, s64 val) |
7482 | { | 7478 | { |
7483 | return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); | 7479 | return sched_group_set_rt_runtime(css_tg(css), val); |
7484 | } | 7480 | } |
7485 | 7481 | ||
7486 | static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) | 7482 | static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, |
7483 | struct cftype *cft) | ||
7487 | { | 7484 | { |
7488 | return sched_group_rt_runtime(cgroup_tg(cgrp)); | 7485 | return sched_group_rt_runtime(css_tg(css)); |
7489 | } | 7486 | } |
7490 | 7487 | ||
7491 | static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, | 7488 | static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, |
7492 | u64 rt_period_us) | 7489 | struct cftype *cftype, u64 rt_period_us) |
7493 | { | 7490 | { |
7494 | return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); | 7491 | return sched_group_set_rt_period(css_tg(css), rt_period_us); |
7495 | } | 7492 | } |
7496 | 7493 | ||
7497 | static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) | 7494 | static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, |
7495 | struct cftype *cft) | ||
7498 | { | 7496 | { |
7499 | return sched_group_rt_period(cgroup_tg(cgrp)); | 7497 | return sched_group_rt_period(css_tg(css)); |
7500 | } | 7498 | } |
7501 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7499 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7502 | 7500 | ||
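
Most of the sched/core.c churn above is mechanical: the cgroup callbacks now take a struct cgroup_subsys_state * directly, and css_tg() recovers the enclosing task_group with container_of(), mapping a NULL css (the root's missing parent) to NULL. The pattern itself is plain C; a self-contained sketch with toy struct names (illustration only):

```c
#include <stddef.h>
#include <stdio.h>

/* Toy stand-ins for cgroup_subsys_state and task_group. */
struct css {
	struct css *parent;
};

struct task_group {
	int shares;
	struct css css;		/* embedded state, as in the kernel */
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Mirrors css_tg(): a NULL css (no parent) maps to a NULL task_group. */
static struct task_group *css_tg(struct css *css)
{
	return css ? container_of(css, struct task_group, css) : NULL;
}

int main(void)
{
	struct task_group root = { .shares = 1024 };
	struct task_group child = { .shares = 512, .css = { .parent = &root.css } };

	struct task_group *parent = css_tg(child.css.parent);
	struct task_group *orphan = css_tg(root.css.parent);

	printf("child shares=%d, parent shares=%d, root parent=%p\n",
	       child.shares, parent->shares, (void *)orphan);
	return 0;
}
```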
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index dbb7e2cd95eb..f64722ff0299 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c | |||
@@ -33,30 +33,20 @@ struct cpuacct { | |||
33 | struct kernel_cpustat __percpu *cpustat; | 33 | struct kernel_cpustat __percpu *cpustat; |
34 | }; | 34 | }; |
35 | 35 | ||
36 | /* return cpu accounting group corresponding to this container */ | 36 | static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) |
37 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
38 | { | 37 | { |
39 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | 38 | return css ? container_of(css, struct cpuacct, css) : NULL; |
40 | struct cpuacct, css); | ||
41 | } | 39 | } |
42 | 40 | ||
43 | /* return cpu accounting group to which this task belongs */ | 41 | /* return cpu accounting group to which this task belongs */ |
44 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | 42 | static inline struct cpuacct *task_ca(struct task_struct *tsk) |
45 | { | 43 | { |
46 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | 44 | return css_ca(task_css(tsk, cpuacct_subsys_id)); |
47 | struct cpuacct, css); | ||
48 | } | ||
49 | |||
50 | static inline struct cpuacct *__parent_ca(struct cpuacct *ca) | ||
51 | { | ||
52 | return cgroup_ca(ca->css.cgroup->parent); | ||
53 | } | 45 | } |
54 | 46 | ||
55 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) | 47 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) |
56 | { | 48 | { |
57 | if (!ca->css.cgroup->parent) | 49 | return css_ca(css_parent(&ca->css)); |
58 | return NULL; | ||
59 | return cgroup_ca(ca->css.cgroup->parent); | ||
60 | } | 50 | } |
61 | 51 | ||
62 | static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); | 52 | static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); |
@@ -66,11 +56,12 @@ static struct cpuacct root_cpuacct = { | |||
66 | }; | 56 | }; |
67 | 57 | ||
68 | /* create a new cpu accounting group */ | 58 | /* create a new cpu accounting group */ |
69 | static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) | 59 | static struct cgroup_subsys_state * |
60 | cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) | ||
70 | { | 61 | { |
71 | struct cpuacct *ca; | 62 | struct cpuacct *ca; |
72 | 63 | ||
73 | if (!cgrp->parent) | 64 | if (!parent_css) |
74 | return &root_cpuacct.css; | 65 | return &root_cpuacct.css; |
75 | 66 | ||
76 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); | 67 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); |
@@ -96,9 +87,9 @@ out: | |||
96 | } | 87 | } |
97 | 88 | ||
98 | /* destroy an existing cpu accounting group */ | 89 | /* destroy an existing cpu accounting group */ |
99 | static void cpuacct_css_free(struct cgroup *cgrp) | 90 | static void cpuacct_css_free(struct cgroup_subsys_state *css) |
100 | { | 91 | { |
101 | struct cpuacct *ca = cgroup_ca(cgrp); | 92 | struct cpuacct *ca = css_ca(css); |
102 | 93 | ||
103 | free_percpu(ca->cpustat); | 94 | free_percpu(ca->cpustat); |
104 | free_percpu(ca->cpuusage); | 95 | free_percpu(ca->cpuusage); |
@@ -141,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | |||
141 | } | 132 | } |
142 | 133 | ||
143 | /* return total cpu usage (in nanoseconds) of a group */ | 134 | /* return total cpu usage (in nanoseconds) of a group */ |
144 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | 135 | static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) |
145 | { | 136 | { |
146 | struct cpuacct *ca = cgroup_ca(cgrp); | 137 | struct cpuacct *ca = css_ca(css); |
147 | u64 totalcpuusage = 0; | 138 | u64 totalcpuusage = 0; |
148 | int i; | 139 | int i; |
149 | 140 | ||
@@ -153,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | |||
153 | return totalcpuusage; | 144 | return totalcpuusage; |
154 | } | 145 | } |
155 | 146 | ||
156 | static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, | 147 | static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, |
157 | u64 reset) | 148 | u64 reset) |
158 | { | 149 | { |
159 | struct cpuacct *ca = cgroup_ca(cgrp); | 150 | struct cpuacct *ca = css_ca(css); |
160 | int err = 0; | 151 | int err = 0; |
161 | int i; | 152 | int i; |
162 | 153 | ||
@@ -172,10 +163,10 @@ out: | |||
172 | return err; | 163 | return err; |
173 | } | 164 | } |
174 | 165 | ||
175 | static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, | 166 | static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, |
176 | struct seq_file *m) | 167 | struct cftype *cft, struct seq_file *m) |
177 | { | 168 | { |
178 | struct cpuacct *ca = cgroup_ca(cgroup); | 169 | struct cpuacct *ca = css_ca(css); |
179 | u64 percpu; | 170 | u64 percpu; |
180 | int i; | 171 | int i; |
181 | 172 | ||
@@ -192,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = { | |||
192 | [CPUACCT_STAT_SYSTEM] = "system", | 183 | [CPUACCT_STAT_SYSTEM] = "system", |
193 | }; | 184 | }; |
194 | 185 | ||
195 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, | 186 | static int cpuacct_stats_show(struct cgroup_subsys_state *css, |
196 | struct cgroup_map_cb *cb) | 187 | struct cftype *cft, struct cgroup_map_cb *cb) |
197 | { | 188 | { |
198 | struct cpuacct *ca = cgroup_ca(cgrp); | 189 | struct cpuacct *ca = css_ca(css); |
199 | int cpu; | 190 | int cpu; |
200 | s64 val = 0; | 191 | s64 val = 0; |
201 | 192 | ||
@@ -281,7 +272,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) | |||
281 | while (ca != &root_cpuacct) { | 272 | while (ca != &root_cpuacct) { |
282 | kcpustat = this_cpu_ptr(ca->cpustat); | 273 | kcpustat = this_cpu_ptr(ca->cpustat); |
283 | kcpustat->cpustat[index] += val; | 274 | kcpustat->cpustat[index] += val; |
284 | ca = __parent_ca(ca); | 275 | ca = parent_ca(ca); |
285 | } | 276 | } |
286 | rcu_read_unlock(); | 277 | rcu_read_unlock(); |
287 | } | 278 | } |
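
cpuacct_account_field() still charges the statistic to the task's group and every ancestor, but the loop now ends at root_cpuacct via the simplified parent_ca(). A small sketch of that upward accumulation (illustration only: toy structs, a single counter instead of the per-CPU kernel_cpustat):

```c
#include <stdio.h>

struct cpuacct {
	const char *name;
	struct cpuacct *parent;
	unsigned long long cpustat;	/* stands in for the per-CPU stats */
};

/* Charge `val` to `ca` and every ancestor up to (but excluding) root,
 * mirroring the while (ca != &root_cpuacct) loop. */
static void account_field(struct cpuacct *root, struct cpuacct *ca,
			  unsigned long long val)
{
	while (ca != root) {
		ca->cpustat += val;
		ca = ca->parent;
	}
}

int main(void)
{
	struct cpuacct root = { "root" };
	struct cpuacct mid  = { "mid", &root };
	struct cpuacct leaf = { "leaf", &mid };

	account_field(&root, &leaf, 100);
	printf("leaf=%llu mid=%llu root=%llu\n",
	       leaf.cpustat, mid.cpustat, root.cpustat);
	return 0;
}
```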
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a7959e05a9d5..99947919e30b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -121,7 +121,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, | |||
121 | * is the only cgroup, then nothing else should be necessary. | 121 | * is the only cgroup, then nothing else should be necessary. |
122 | * | 122 | * |
123 | */ | 123 | */ |
124 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | 124 | __this_cpu_add(kernel_cpustat.cpustat[index], tmp); |
125 | 125 | ||
126 | cpuacct_account_field(p, index, tmp); | 126 | cpuacct_account_field(p, index, tmp); |
127 | } | 127 | } |
@@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ | |||
378 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | 378 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING |
379 | 379 | ||
380 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH | 380 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH |
381 | void vtime_task_switch(struct task_struct *prev) | 381 | void vtime_common_task_switch(struct task_struct *prev) |
382 | { | 382 | { |
383 | if (!vtime_accounting_enabled()) | ||
384 | return; | ||
385 | |||
386 | if (is_idle_task(prev)) | 383 | if (is_idle_task(prev)) |
387 | vtime_account_idle(prev); | 384 | vtime_account_idle(prev); |
388 | else | 385 | else |
@@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev) | |||
404 | * vtime_account(). | 401 | * vtime_account(). |
405 | */ | 402 | */ |
406 | #ifndef __ARCH_HAS_VTIME_ACCOUNT | 403 | #ifndef __ARCH_HAS_VTIME_ACCOUNT |
407 | void vtime_account_irq_enter(struct task_struct *tsk) | 404 | void vtime_common_account_irq_enter(struct task_struct *tsk) |
408 | { | 405 | { |
409 | if (!vtime_accounting_enabled()) | ||
410 | return; | ||
411 | |||
412 | if (!in_interrupt()) { | 406 | if (!in_interrupt()) { |
413 | /* | 407 | /* |
414 | * If we interrupted user, context_tracking_in_user() | 408 | * If we interrupted user, context_tracking_in_user() |
@@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk) | |||
428 | } | 422 | } |
429 | vtime_account_system(tsk); | 423 | vtime_account_system(tsk); |
430 | } | 424 | } |
431 | EXPORT_SYMBOL_GPL(vtime_account_irq_enter); | 425 | EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); |
432 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | 426 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ |
433 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ | 427 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ |
434 | 428 | ||
@@ -557,16 +551,7 @@ static void cputime_adjust(struct task_cputime *curr, | |||
557 | struct cputime *prev, | 551 | struct cputime *prev, |
558 | cputime_t *ut, cputime_t *st) | 552 | cputime_t *ut, cputime_t *st) |
559 | { | 553 | { |
560 | cputime_t rtime, stime, utime, total; | 554 | cputime_t rtime, stime, utime; |
561 | |||
562 | if (vtime_accounting_enabled()) { | ||
563 | *ut = curr->utime; | ||
564 | *st = curr->stime; | ||
565 | return; | ||
566 | } | ||
567 | |||
568 | stime = curr->stime; | ||
569 | total = stime + curr->utime; | ||
570 | 555 | ||
571 | /* | 556 | /* |
572 | * Tick-based cputime accounting depends on random scheduling | 557 | * Tick-based cputime accounting depends on random scheduling |
@@ -588,13 +573,19 @@ static void cputime_adjust(struct task_cputime *curr, | |||
588 | if (prev->stime + prev->utime >= rtime) | 573 | if (prev->stime + prev->utime >= rtime) |
589 | goto out; | 574 | goto out; |
590 | 575 | ||
591 | if (total) { | 576 | stime = curr->stime; |
577 | utime = curr->utime; | ||
578 | |||
579 | if (utime == 0) { | ||
580 | stime = rtime; | ||
581 | } else if (stime == 0) { | ||
582 | utime = rtime; | ||
583 | } else { | ||
584 | cputime_t total = stime + utime; | ||
585 | |||
592 | stime = scale_stime((__force u64)stime, | 586 | stime = scale_stime((__force u64)stime, |
593 | (__force u64)rtime, (__force u64)total); | 587 | (__force u64)rtime, (__force u64)total); |
594 | utime = rtime - stime; | 588 | utime = rtime - stime; |
595 | } else { | ||
596 | stime = rtime; | ||
597 | utime = 0; | ||
598 | } | 589 | } |
599 | 590 | ||
600 | /* | 591 | /* |
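
cputime_adjust() now splits the precise rtime between system and user time in proportion to the tick-sampled stime/utime, with explicit short-cuts when either sample is zero so the scaling division never sees a zero total. A sketch of that split (illustration only; scale_stime() is written here as an exact 128-bit expression, which assumes a compiler with __int128, and the monotonicity guard against prev is omitted):

```c
#include <stdint.h>
#include <stdio.h>

/* Give stime the fraction stime/(stime+utime) of the precise rtime. */
static uint64_t scale_stime(uint64_t stime, uint64_t rtime, uint64_t total)
{
	return (unsigned __int128)stime * rtime / total;
}

/* Split the precise runtime rtime between system and user time, using the
 * tick-sampled stime/utime only as a ratio. */
static void adjust(uint64_t sampled_stime, uint64_t sampled_utime,
		   uint64_t rtime, uint64_t *st, uint64_t *ut)
{
	if (sampled_utime == 0) {		/* all samples hit system time */
		*st = rtime;
		*ut = 0;
	} else if (sampled_stime == 0) {	/* all samples hit user time */
		*st = 0;
		*ut = rtime;
	} else {
		uint64_t total = sampled_stime + sampled_utime;

		*st = scale_stime(sampled_stime, rtime, total);
		*ut = rtime - *st;
	}
}

int main(void)
{
	uint64_t st, ut;

	adjust(30, 70, 1000, &st, &ut);		/* 30% system, 70% user */
	printf("stime=%llu utime=%llu\n",
	       (unsigned long long)st, (unsigned long long)ut);
	return 0;
}
```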
@@ -664,23 +655,17 @@ static void __vtime_account_system(struct task_struct *tsk) | |||
664 | 655 | ||
665 | void vtime_account_system(struct task_struct *tsk) | 656 | void vtime_account_system(struct task_struct *tsk) |
666 | { | 657 | { |
667 | if (!vtime_accounting_enabled()) | ||
668 | return; | ||
669 | |||
670 | write_seqlock(&tsk->vtime_seqlock); | 658 | write_seqlock(&tsk->vtime_seqlock); |
671 | __vtime_account_system(tsk); | 659 | __vtime_account_system(tsk); |
672 | write_sequnlock(&tsk->vtime_seqlock); | 660 | write_sequnlock(&tsk->vtime_seqlock); |
673 | } | 661 | } |
674 | 662 | ||
675 | void vtime_account_irq_exit(struct task_struct *tsk) | 663 | void vtime_gen_account_irq_exit(struct task_struct *tsk) |
676 | { | 664 | { |
677 | if (!vtime_accounting_enabled()) | ||
678 | return; | ||
679 | |||
680 | write_seqlock(&tsk->vtime_seqlock); | 665 | write_seqlock(&tsk->vtime_seqlock); |
666 | __vtime_account_system(tsk); | ||
681 | if (context_tracking_in_user()) | 667 | if (context_tracking_in_user()) |
682 | tsk->vtime_snap_whence = VTIME_USER; | 668 | tsk->vtime_snap_whence = VTIME_USER; |
683 | __vtime_account_system(tsk); | ||
684 | write_sequnlock(&tsk->vtime_seqlock); | 669 | write_sequnlock(&tsk->vtime_seqlock); |
685 | } | 670 | } |
686 | 671 | ||
@@ -688,12 +673,8 @@ void vtime_account_user(struct task_struct *tsk) | |||
688 | { | 673 | { |
689 | cputime_t delta_cpu; | 674 | cputime_t delta_cpu; |
690 | 675 | ||
691 | if (!vtime_accounting_enabled()) | ||
692 | return; | ||
693 | |||
694 | delta_cpu = get_vtime_delta(tsk); | ||
695 | |||
696 | write_seqlock(&tsk->vtime_seqlock); | 676 | write_seqlock(&tsk->vtime_seqlock); |
677 | delta_cpu = get_vtime_delta(tsk); | ||
697 | tsk->vtime_snap_whence = VTIME_SYS; | 678 | tsk->vtime_snap_whence = VTIME_SYS; |
698 | account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); | 679 | account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); |
699 | write_sequnlock(&tsk->vtime_seqlock); | 680 | write_sequnlock(&tsk->vtime_seqlock); |
@@ -701,22 +682,27 @@ void vtime_account_user(struct task_struct *tsk) | |||
701 | 682 | ||
702 | void vtime_user_enter(struct task_struct *tsk) | 683 | void vtime_user_enter(struct task_struct *tsk) |
703 | { | 684 | { |
704 | if (!vtime_accounting_enabled()) | ||
705 | return; | ||
706 | |||
707 | write_seqlock(&tsk->vtime_seqlock); | 685 | write_seqlock(&tsk->vtime_seqlock); |
708 | tsk->vtime_snap_whence = VTIME_USER; | ||
709 | __vtime_account_system(tsk); | 686 | __vtime_account_system(tsk); |
687 | tsk->vtime_snap_whence = VTIME_USER; | ||
710 | write_sequnlock(&tsk->vtime_seqlock); | 688 | write_sequnlock(&tsk->vtime_seqlock); |
711 | } | 689 | } |
712 | 690 | ||
713 | void vtime_guest_enter(struct task_struct *tsk) | 691 | void vtime_guest_enter(struct task_struct *tsk) |
714 | { | 692 | { |
693 | /* | ||
694 | * The flags must be updated under the lock with | ||
695 | * the vtime_snap flush and update. | ||
696 | * That enforces a right ordering and update sequence | ||
697 | * synchronization against the reader (task_gtime()) | ||
698 | * that can thus safely catch up with a tickless delta. | ||
699 | */ | ||
715 | write_seqlock(&tsk->vtime_seqlock); | 700 | write_seqlock(&tsk->vtime_seqlock); |
716 | __vtime_account_system(tsk); | 701 | __vtime_account_system(tsk); |
717 | current->flags |= PF_VCPU; | 702 | current->flags |= PF_VCPU; |
718 | write_sequnlock(&tsk->vtime_seqlock); | 703 | write_sequnlock(&tsk->vtime_seqlock); |
719 | } | 704 | } |
705 | EXPORT_SYMBOL_GPL(vtime_guest_enter); | ||
720 | 706 | ||
721 | void vtime_guest_exit(struct task_struct *tsk) | 707 | void vtime_guest_exit(struct task_struct *tsk) |
722 | { | 708 | { |
@@ -725,6 +711,7 @@ void vtime_guest_exit(struct task_struct *tsk) | |||
725 | current->flags &= ~PF_VCPU; | 711 | current->flags &= ~PF_VCPU; |
726 | write_sequnlock(&tsk->vtime_seqlock); | 712 | write_sequnlock(&tsk->vtime_seqlock); |
727 | } | 713 | } |
714 | EXPORT_SYMBOL_GPL(vtime_guest_exit); | ||
728 | 715 | ||
729 | void vtime_account_idle(struct task_struct *tsk) | 716 | void vtime_account_idle(struct task_struct *tsk) |
730 | { | 717 | { |
@@ -733,11 +720,6 @@ void vtime_account_idle(struct task_struct *tsk) | |||
733 | account_idle_time(delta_cpu); | 720 | account_idle_time(delta_cpu); |
734 | } | 721 | } |
735 | 722 | ||
736 | bool vtime_accounting_enabled(void) | ||
737 | { | ||
738 | return context_tracking_active(); | ||
739 | } | ||
740 | |||
741 | void arch_vtime_task_switch(struct task_struct *prev) | 723 | void arch_vtime_task_switch(struct task_struct *prev) |
742 | { | 724 | { |
743 | write_seqlock(&prev->vtime_seqlock); | 725 | write_seqlock(&prev->vtime_seqlock); |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e076bddd4c66..196559994f7c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -124,7 +124,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
124 | SEQ_printf(m, " "); | 124 | SEQ_printf(m, " "); |
125 | 125 | ||
126 | SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", | 126 | SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", |
127 | p->comm, p->pid, | 127 | p->comm, task_pid_nr(p), |
128 | SPLIT_NS(p->se.vruntime), | 128 | SPLIT_NS(p->se.vruntime), |
129 | (long long)(p->nvcsw + p->nivcsw), | 129 | (long long)(p->nvcsw + p->nivcsw), |
130 | p->prio); | 130 | p->prio); |
@@ -289,7 +289,7 @@ do { \ | |||
289 | P(nr_load_updates); | 289 | P(nr_load_updates); |
290 | P(nr_uninterruptible); | 290 | P(nr_uninterruptible); |
291 | PN(next_balance); | 291 | PN(next_balance); |
292 | P(curr->pid); | 292 | SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); |
293 | PN(clock); | 293 | PN(clock); |
294 | P(cpu_load[0]); | 294 | P(cpu_load[0]); |
295 | P(cpu_load[1]); | 295 | P(cpu_load[1]); |
@@ -492,7 +492,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
492 | { | 492 | { |
493 | unsigned long nr_switches; | 493 | unsigned long nr_switches; |
494 | 494 | ||
495 | SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, | 495 | SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p), |
496 | get_nr_threads(p)); | 496 | get_nr_threads(p)); |
497 | SEQ_printf(m, | 497 | SEQ_printf(m, |
498 | "---------------------------------------------------------" | 498 | "---------------------------------------------------------" |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 68f1609ca149..7c70201fbc61 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -3018,6 +3018,23 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
3018 | return 0; | 3018 | return 0; |
3019 | } | 3019 | } |
3020 | 3020 | ||
3021 | static void record_wakee(struct task_struct *p) | ||
3022 | { | ||
3023 | /* | ||
3024 | * Rough decay (reset) of the flip count to keep the cost low; | ||
3025 | * don't worry about the window boundary, a really active | ||
3026 | * task won't care about the loss. | ||
3027 | */ | ||
3028 | if (jiffies > current->wakee_flip_decay_ts + HZ) { | ||
3029 | current->wakee_flips = 0; | ||
3030 | current->wakee_flip_decay_ts = jiffies; | ||
3031 | } | ||
3032 | |||
3033 | if (current->last_wakee != p) { | ||
3034 | current->last_wakee = p; | ||
3035 | current->wakee_flips++; | ||
3036 | } | ||
3037 | } | ||
3021 | 3038 | ||
3022 | static void task_waking_fair(struct task_struct *p) | 3039 | static void task_waking_fair(struct task_struct *p) |
3023 | { | 3040 | { |
@@ -3038,6 +3055,7 @@ static void task_waking_fair(struct task_struct *p) | |||
3038 | #endif | 3055 | #endif |
3039 | 3056 | ||
3040 | se->vruntime -= min_vruntime; | 3057 | se->vruntime -= min_vruntime; |
3058 | record_wakee(p); | ||
3041 | } | 3059 | } |
3042 | 3060 | ||
3043 | #ifdef CONFIG_FAIR_GROUP_SCHED | 3061 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -3156,6 +3174,28 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, | |||
3156 | 3174 | ||
3157 | #endif | 3175 | #endif |
3158 | 3176 | ||
3177 | static int wake_wide(struct task_struct *p) | ||
3178 | { | ||
3179 | int factor = this_cpu_read(sd_llc_size); | ||
3180 | |||
3181 | /* | ||
3182 | * wakee_flips is a switching frequency: a high value can mean many | ||
3183 | * wakees or rapid switching. Using the LLC size as the factor | ||
3184 | * scales the threshold, so a bigger node will lead to more pulling. | ||
3185 | */ | ||
3186 | if (p->wakee_flips > factor) { | ||
3187 | /* | ||
3188 | * The wakee is somewhat hot and needs a certain amount of CPU | ||
3189 | * resource, so if the waker is far hotter, prefer to leave | ||
3190 | * the wakee alone. | ||
3191 | */ | ||
3192 | if (current->wakee_flips > (factor * p->wakee_flips)) | ||
3193 | return 1; | ||
3194 | } | ||
3195 | |||
3196 | return 0; | ||
3197 | } | ||
3198 | |||
3159 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 3199 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
3160 | { | 3200 | { |
3161 | s64 this_load, load; | 3201 | s64 this_load, load; |
@@ -3165,6 +3205,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
3165 | unsigned long weight; | 3205 | unsigned long weight; |
3166 | int balanced; | 3206 | int balanced; |
3167 | 3207 | ||
3208 | /* | ||
3209 | * If we wake multiple tasks be careful to not bounce | ||
3210 | * ourselves around too much. | ||
3211 | */ | ||
3212 | if (wake_wide(p)) | ||
3213 | return 0; | ||
3214 | |||
3168 | idx = sd->wake_idx; | 3215 | idx = sd->wake_idx; |
3169 | this_cpu = smp_processor_id(); | 3216 | this_cpu = smp_processor_id(); |
3170 | prev_cpu = task_cpu(p); | 3217 | prev_cpu = task_cpu(p); |
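
wake_wide() then turns those flip counts into a decision: if the wakee flips more often than the LLC has CPUs, and the waker flips more than that factor times the wakee's count again, the affine wakeup is skipped so the pair spreads out. A hedged sketch of just that comparison, with a hard-coded factor of 4 assumed in place of this_cpu_read(sd_llc_size):

    #include <stdio.h>

    /* Return 1 when the waker/wakee pair is "wide": the wakee is hot and
     * the waker is far hotter, so pulling the wakee next to the waker
     * would only bounce work around.  Mirrors the wake_wide() test above. */
    static int wake_wide(unsigned int waker_flips, unsigned int wakee_flips,
                         unsigned int llc_size)
    {
        unsigned int factor = llc_size;

        if (wakee_flips > factor && waker_flips > factor * wakee_flips)
            return 1;
        return 0;
    }

    int main(void)
    {
        /* 1:N server pattern: one waker fanning out to many wakees. */
        printf("server/worker: %d\n", wake_wide(400, 10, 4)); /* 1: go wide */
        /* 1:1 ping-pong pair: keep them close for cache sharing. */
        printf("ping/pong:     %d\n", wake_wide(12, 12, 4));  /* 0: affine  */
        return 0;
    }
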
@@ -4172,47 +4219,48 @@ static void update_blocked_averages(int cpu) | |||
4172 | } | 4219 | } |
4173 | 4220 | ||
4174 | /* | 4221 | /* |
4175 | * Compute the cpu's hierarchical load factor for each task group. | 4222 | * Compute the hierarchical load factor for cfs_rq and all its ascendants. |
4176 | * This needs to be done in a top-down fashion because the load of a child | 4223 | * This needs to be done in a top-down fashion because the load of a child |
4177 | * group is a fraction of its parent's load. | 4224 | * group is a fraction of its parent's load. |
4178 | */ | 4225 | */ |
4179 | static int tg_load_down(struct task_group *tg, void *data) | 4226 | static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) |
4180 | { | 4227 | { |
4181 | unsigned long load; | 4228 | struct rq *rq = rq_of(cfs_rq); |
4182 | long cpu = (long)data; | 4229 | struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; |
4183 | |||
4184 | if (!tg->parent) { | ||
4185 | load = cpu_rq(cpu)->avg.load_avg_contrib; | ||
4186 | } else { | ||
4187 | load = tg->parent->cfs_rq[cpu]->h_load; | ||
4188 | load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib, | ||
4189 | tg->parent->cfs_rq[cpu]->runnable_load_avg + 1); | ||
4190 | } | ||
4191 | |||
4192 | tg->cfs_rq[cpu]->h_load = load; | ||
4193 | |||
4194 | return 0; | ||
4195 | } | ||
4196 | |||
4197 | static void update_h_load(long cpu) | ||
4198 | { | ||
4199 | struct rq *rq = cpu_rq(cpu); | ||
4200 | unsigned long now = jiffies; | 4230 | unsigned long now = jiffies; |
4231 | unsigned long load; | ||
4201 | 4232 | ||
4202 | if (rq->h_load_throttle == now) | 4233 | if (cfs_rq->last_h_load_update == now) |
4203 | return; | 4234 | return; |
4204 | 4235 | ||
4205 | rq->h_load_throttle = now; | 4236 | cfs_rq->h_load_next = NULL; |
4237 | for_each_sched_entity(se) { | ||
4238 | cfs_rq = cfs_rq_of(se); | ||
4239 | cfs_rq->h_load_next = se; | ||
4240 | if (cfs_rq->last_h_load_update == now) | ||
4241 | break; | ||
4242 | } | ||
4206 | 4243 | ||
4207 | rcu_read_lock(); | 4244 | if (!se) { |
4208 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 4245 | cfs_rq->h_load = cfs_rq->runnable_load_avg; |
4209 | rcu_read_unlock(); | 4246 | cfs_rq->last_h_load_update = now; |
4247 | } | ||
4248 | |||
4249 | while ((se = cfs_rq->h_load_next) != NULL) { | ||
4250 | load = cfs_rq->h_load; | ||
4251 | load = div64_ul(load * se->avg.load_avg_contrib, | ||
4252 | cfs_rq->runnable_load_avg + 1); | ||
4253 | cfs_rq = group_cfs_rq(se); | ||
4254 | cfs_rq->h_load = load; | ||
4255 | cfs_rq->last_h_load_update = now; | ||
4256 | } | ||
4210 | } | 4257 | } |
4211 | 4258 | ||
4212 | static unsigned long task_h_load(struct task_struct *p) | 4259 | static unsigned long task_h_load(struct task_struct *p) |
4213 | { | 4260 | { |
4214 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 4261 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
4215 | 4262 | ||
4263 | update_cfs_rq_h_load(cfs_rq); | ||
4216 | return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, | 4264 | return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, |
4217 | cfs_rq->runnable_load_avg + 1); | 4265 | cfs_rq->runnable_load_avg + 1); |
4218 | } | 4266 | } |
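
The rewritten h_load path above drops the walk_tg_tree() pass that update_h_load() used to do for every balance and instead computes the hierarchical load lazily: climb from the queried cfs_rq to the root recording the path, then walk back down scaling the load contribution at each level. A self-contained sketch of that two-pass idea over a simple parent-linked hierarchy (struct and field names are illustrative):

    #include <stdio.h>

    struct grp {
        struct grp   *parent;     /* NULL for the root group          */
        struct grp   *path_next;  /* filled by the upward pass        */
        unsigned long runnable;   /* this group's runnable load       */
        unsigned long contrib;    /* this group's own load contrib    */
        unsigned long h_load;     /* hierarchical load, the result    */
    };

    /* Two passes, like update_cfs_rq_h_load(): climb to the root recording
     * the path, then walk back down scaling by contrib / runnable. */
    static void update_h_load(struct grp *g)
    {
        struct grp *cur;

        /* Upward pass: remember how to get back down. */
        g->path_next = NULL;
        for (cur = g; cur->parent; cur = cur->parent)
            cur->parent->path_next = cur;

        /* Root: hierarchical load is just its runnable load. */
        cur->h_load = cur->runnable;

        /* Downward pass: a child's h_load is the parent's, scaled by the
         * child's share of the parent's runnable load (+1 avoids /0). */
        while ((cur = cur->path_next) != NULL)
            cur->h_load = cur->parent->h_load * cur->contrib /
                          (cur->parent->runnable + 1);
    }

    int main(void)
    {
        struct grp root = { .runnable = 2048 };
        struct grp mid  = { .parent = &root, .runnable = 1024, .contrib = 1024 };
        struct grp leaf = { .parent = &mid,  .runnable = 256,  .contrib = 256  };

        update_h_load(&leaf);
        printf("leaf h_load = %lu\n", leaf.h_load); /* ~256: 2048 * 1/2 * 1/4 */
        return 0;
    }
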
@@ -4221,10 +4269,6 @@ static inline void update_blocked_averages(int cpu) | |||
4221 | { | 4269 | { |
4222 | } | 4270 | } |
4223 | 4271 | ||
4224 | static inline void update_h_load(long cpu) | ||
4225 | { | ||
4226 | } | ||
4227 | |||
4228 | static unsigned long task_h_load(struct task_struct *p) | 4272 | static unsigned long task_h_load(struct task_struct *p) |
4229 | { | 4273 | { |
4230 | return p->se.avg.load_avg_contrib; | 4274 | return p->se.avg.load_avg_contrib; |
@@ -4233,50 +4277,56 @@ static unsigned long task_h_load(struct task_struct *p) | |||
4233 | 4277 | ||
4234 | /********** Helpers for find_busiest_group ************************/ | 4278 | /********** Helpers for find_busiest_group ************************/ |
4235 | /* | 4279 | /* |
4236 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
4237 | * during load balancing. | ||
4238 | */ | ||
4239 | struct sd_lb_stats { | ||
4240 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
4241 | struct sched_group *this; /* Local group in this sd */ | ||
4242 | unsigned long total_load; /* Total load of all groups in sd */ | ||
4243 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
4244 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
4245 | |||
4246 | /** Statistics of this group */ | ||
4247 | unsigned long this_load; | ||
4248 | unsigned long this_load_per_task; | ||
4249 | unsigned long this_nr_running; | ||
4250 | unsigned long this_has_capacity; | ||
4251 | unsigned int this_idle_cpus; | ||
4252 | |||
4253 | /* Statistics of the busiest group */ | ||
4254 | unsigned int busiest_idle_cpus; | ||
4255 | unsigned long max_load; | ||
4256 | unsigned long busiest_load_per_task; | ||
4257 | unsigned long busiest_nr_running; | ||
4258 | unsigned long busiest_group_capacity; | ||
4259 | unsigned long busiest_has_capacity; | ||
4260 | unsigned int busiest_group_weight; | ||
4261 | |||
4262 | int group_imb; /* Is there imbalance in this sd */ | ||
4263 | }; | ||
4264 | |||
4265 | /* | ||
4266 | * sg_lb_stats - stats of a sched_group required for load_balancing | 4280 | * sg_lb_stats - stats of a sched_group required for load_balancing |
4267 | */ | 4281 | */ |
4268 | struct sg_lb_stats { | 4282 | struct sg_lb_stats { |
4269 | unsigned long avg_load; /*Avg load across the CPUs of the group */ | 4283 | unsigned long avg_load; /*Avg load across the CPUs of the group */ |
4270 | unsigned long group_load; /* Total load over the CPUs of the group */ | 4284 | unsigned long group_load; /* Total load over the CPUs of the group */ |
4271 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | ||
4272 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 4285 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
4273 | unsigned long group_capacity; | 4286 | unsigned long load_per_task; |
4274 | unsigned long idle_cpus; | 4287 | unsigned long group_power; |
4275 | unsigned long group_weight; | 4288 | unsigned int sum_nr_running; /* Nr tasks running in the group */ |
4289 | unsigned int group_capacity; | ||
4290 | unsigned int idle_cpus; | ||
4291 | unsigned int group_weight; | ||
4276 | int group_imb; /* Is there an imbalance in the group ? */ | 4292 | int group_imb; /* Is there an imbalance in the group ? */ |
4277 | int group_has_capacity; /* Is there extra capacity in the group? */ | 4293 | int group_has_capacity; /* Is there extra capacity in the group? */ |
4278 | }; | 4294 | }; |
4279 | 4295 | ||
4296 | /* | ||
4297 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
4298 | * during load balancing. | ||
4299 | */ | ||
4300 | struct sd_lb_stats { | ||
4301 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
4302 | struct sched_group *local; /* Local group in this sd */ | ||
4303 | unsigned long total_load; /* Total load of all groups in sd */ | ||
4304 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
4305 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
4306 | |||
4307 | struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ | ||
4308 | struct sg_lb_stats local_stat; /* Statistics of the local group */ | ||
4309 | }; | ||
4310 | |||
4311 | static inline void init_sd_lb_stats(struct sd_lb_stats *sds) | ||
4312 | { | ||
4313 | /* | ||
4314 | * Skimp on the clearing to avoid duplicate work. We can avoid clearing | ||
4315 | * local_stat because update_sg_lb_stats() does a full clear/assignment. | ||
4316 | * We must however clear busiest_stat::avg_load because | ||
4317 | * update_sd_pick_busiest() reads this before assignment. | ||
4318 | */ | ||
4319 | *sds = (struct sd_lb_stats){ | ||
4320 | .busiest = NULL, | ||
4321 | .local = NULL, | ||
4322 | .total_load = 0UL, | ||
4323 | .total_pwr = 0UL, | ||
4324 | .busiest_stat = { | ||
4325 | .avg_load = 0UL, | ||
4326 | }, | ||
4327 | }; | ||
4328 | } | ||
4329 | |||
4280 | /** | 4330 | /** |
4281 | * get_sd_load_idx - Obtain the load index for a given sched domain. | 4331 | * get_sd_load_idx - Obtain the load index for a given sched domain. |
4282 | * @sd: The sched_domain whose load_idx is to be obtained. | 4332 | * @sd: The sched_domain whose load_idx is to be obtained. |
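
init_sd_lb_stats() above resets the domain-wide stats with one compound-literal assignment rather than a memset plus field stores; in C, members omitted from a designated initializer are zero-initialized, and busiest_stat.avg_load is spelled out explicitly because update_sd_pick_busiest() reads it before it is otherwise assigned. A small standalone illustration of the pattern (struct names here are invented for the example):

    #include <stdio.h>

    struct group_stats {
        unsigned long avg_load;
        unsigned long group_load;
        unsigned int  nr_running;
    };

    struct domain_stats {
        void *busiest;
        void *local;
        unsigned long total_load;
        unsigned long total_pwr;
        struct group_stats busiest_stat;
        struct group_stats local_stat;
    };

    int main(void)
    {
        struct domain_stats sds;

        /* Compound-literal assignment: every member not named in the
         * initializer is zero-initialized, so this single statement both
         * sets the fields we care about and clears the rest. */
        sds = (struct domain_stats){
            .busiest = NULL,
            .local   = NULL,
            .busiest_stat = { .avg_load = 0UL },
        };

        printf("total_load=%lu busiest avg_load=%lu local nr=%u\n",
               sds.total_load, sds.busiest_stat.avg_load,
               sds.local_stat.nr_running);
        return 0;
    }
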
@@ -4460,33 +4510,99 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
4460 | return 0; | 4510 | return 0; |
4461 | } | 4511 | } |
4462 | 4512 | ||
4513 | /* | ||
4514 | * Group imbalance indicates (and tries to solve) the problem where balancing | ||
4515 | * groups is inadequate due to tsk_cpus_allowed() constraints. | ||
4516 | * | ||
4517 | * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a | ||
4518 | * cpumask covering 1 cpu of the first group and 3 cpus of the second group. | ||
4519 | * Something like: | ||
4520 | * | ||
4521 | * { 0 1 2 3 } { 4 5 6 7 } | ||
4522 | * * * * * | ||
4523 | * | ||
4524 | * If we were to balance group-wise we'd place two tasks in the first group and | ||
4525 | * two tasks in the second group. Clearly this is undesired as it will overload | ||
4526 | * cpu 3 and leave one of the cpus in the second group unused. | ||
4527 | * | ||
4528 | * The current solution to this issue is detecting the skew in the first group | ||
4529 | * by noticing it has a cpu that is overloaded while the remaining cpus are | ||
4530 | * idle -- or rather, there's a distinct imbalance in the cpus; see | ||
4531 | * sg_imbalanced(). | ||
4532 | * | ||
4533 | * When this is so detected, this group becomes a candidate for busiest; see | ||
4534 | * update_sd_pick_busiest(). And calculate_imbalance() and | ||
4535 | * find_busiest_group() avoid some of the usual balance conditions to allow it | ||
4536 | * to create an effective group imbalance. | ||
4537 | * | ||
4538 | * This is a somewhat tricky proposition since the next run might not find the | ||
4539 | * group imbalance and decide the groups need to be balanced again. A most | ||
4540 | * subtle and fragile situation. | ||
4541 | */ | ||
4542 | |||
4543 | struct sg_imb_stats { | ||
4544 | unsigned long max_nr_running, min_nr_running; | ||
4545 | unsigned long max_cpu_load, min_cpu_load; | ||
4546 | }; | ||
4547 | |||
4548 | static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) | ||
4549 | { | ||
4550 | sgi->max_cpu_load = sgi->max_nr_running = 0UL; | ||
4551 | sgi->min_cpu_load = sgi->min_nr_running = ~0UL; | ||
4552 | } | ||
4553 | |||
4554 | static inline void | ||
4555 | update_sg_imb_stats(struct sg_imb_stats *sgi, | ||
4556 | unsigned long load, unsigned long nr_running) | ||
4557 | { | ||
4558 | if (load > sgi->max_cpu_load) | ||
4559 | sgi->max_cpu_load = load; | ||
4560 | if (sgi->min_cpu_load > load) | ||
4561 | sgi->min_cpu_load = load; | ||
4562 | |||
4563 | if (nr_running > sgi->max_nr_running) | ||
4564 | sgi->max_nr_running = nr_running; | ||
4565 | if (sgi->min_nr_running > nr_running) | ||
4566 | sgi->min_nr_running = nr_running; | ||
4567 | } | ||
4568 | |||
4569 | static inline int | ||
4570 | sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) | ||
4571 | { | ||
4572 | /* | ||
4573 | * Consider the group unbalanced when the imbalance is larger | ||
4574 | * than the average weight of a task. | ||
4575 | * | ||
4576 | * APZ: with cgroup the avg task weight can vary wildly and | ||
4577 | * might not be a suitable number - should we keep a | ||
4578 | * normalized nr_running number somewhere that negates | ||
4579 | * the hierarchy? | ||
4580 | */ | ||
4581 | if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && | ||
4582 | (sgi->max_nr_running - sgi->min_nr_running) > 1) | ||
4583 | return 1; | ||
4584 | |||
4585 | return 0; | ||
4586 | } | ||
4587 | |||
4463 | /** | 4588 | /** |
4464 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 4589 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
4465 | * @env: The load balancing environment. | 4590 | * @env: The load balancing environment. |
4466 | * @group: sched_group whose statistics are to be updated. | 4591 | * @group: sched_group whose statistics are to be updated. |
4467 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 4592 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
4468 | * @local_group: Does group contain this_cpu. | 4593 | * @local_group: Does group contain this_cpu. |
4469 | * @balance: Should we balance. | ||
4470 | * @sgs: variable to hold the statistics for this group. | 4594 | * @sgs: variable to hold the statistics for this group. |
4471 | */ | 4595 | */ |
4472 | static inline void update_sg_lb_stats(struct lb_env *env, | 4596 | static inline void update_sg_lb_stats(struct lb_env *env, |
4473 | struct sched_group *group, int load_idx, | 4597 | struct sched_group *group, int load_idx, |
4474 | int local_group, int *balance, struct sg_lb_stats *sgs) | 4598 | int local_group, struct sg_lb_stats *sgs) |
4475 | { | 4599 | { |
4476 | unsigned long nr_running, max_nr_running, min_nr_running; | 4600 | struct sg_imb_stats sgi; |
4477 | unsigned long load, max_cpu_load, min_cpu_load; | 4601 | unsigned long nr_running; |
4478 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 4602 | unsigned long load; |
4479 | unsigned long avg_load_per_task = 0; | ||
4480 | int i; | 4603 | int i; |
4481 | 4604 | ||
4482 | if (local_group) | 4605 | init_sg_imb_stats(&sgi); |
4483 | balance_cpu = group_balance_cpu(group); | ||
4484 | |||
4485 | /* Tally up the load of all CPUs in the group */ | ||
4486 | max_cpu_load = 0; | ||
4487 | min_cpu_load = ~0UL; | ||
4488 | max_nr_running = 0; | ||
4489 | min_nr_running = ~0UL; | ||
4490 | 4606 | ||
4491 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 4607 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
4492 | struct rq *rq = cpu_rq(i); | 4608 | struct rq *rq = cpu_rq(i); |
@@ -4495,24 +4611,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
4495 | 4611 | ||
4496 | /* Bias balancing toward cpus of our domain */ | 4612 | /* Bias balancing toward cpus of our domain */ |
4497 | if (local_group) { | 4613 | if (local_group) { |
4498 | if (idle_cpu(i) && !first_idle_cpu && | ||
4499 | cpumask_test_cpu(i, sched_group_mask(group))) { | ||
4500 | first_idle_cpu = 1; | ||
4501 | balance_cpu = i; | ||
4502 | } | ||
4503 | |||
4504 | load = target_load(i, load_idx); | 4614 | load = target_load(i, load_idx); |
4505 | } else { | 4615 | } else { |
4506 | load = source_load(i, load_idx); | 4616 | load = source_load(i, load_idx); |
4507 | if (load > max_cpu_load) | 4617 | update_sg_imb_stats(&sgi, load, nr_running); |
4508 | max_cpu_load = load; | ||
4509 | if (min_cpu_load > load) | ||
4510 | min_cpu_load = load; | ||
4511 | |||
4512 | if (nr_running > max_nr_running) | ||
4513 | max_nr_running = nr_running; | ||
4514 | if (min_nr_running > nr_running) | ||
4515 | min_nr_running = nr_running; | ||
4516 | } | 4618 | } |
4517 | 4619 | ||
4518 | sgs->group_load += load; | 4620 | sgs->group_load += load; |
@@ -4522,46 +4624,25 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
4522 | sgs->idle_cpus++; | 4624 | sgs->idle_cpus++; |
4523 | } | 4625 | } |
4524 | 4626 | ||
4525 | /* | 4627 | if (local_group && (env->idle != CPU_NEWLY_IDLE || |
4526 | * First idle cpu or the first cpu(busiest) in this sched group | 4628 | time_after_eq(jiffies, group->sgp->next_update))) |
4527 | * is eligible for doing load balancing at this and above | 4629 | update_group_power(env->sd, env->dst_cpu); |
4528 | * domains. In the newly idle case, we will allow all the cpu's | ||
4529 | * to do the newly idle load balance. | ||
4530 | */ | ||
4531 | if (local_group) { | ||
4532 | if (env->idle != CPU_NEWLY_IDLE) { | ||
4533 | if (balance_cpu != env->dst_cpu) { | ||
4534 | *balance = 0; | ||
4535 | return; | ||
4536 | } | ||
4537 | update_group_power(env->sd, env->dst_cpu); | ||
4538 | } else if (time_after_eq(jiffies, group->sgp->next_update)) | ||
4539 | update_group_power(env->sd, env->dst_cpu); | ||
4540 | } | ||
4541 | 4630 | ||
4542 | /* Adjust by relative CPU power of the group */ | 4631 | /* Adjust by relative CPU power of the group */ |
4543 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; | 4632 | sgs->group_power = group->sgp->power; |
4633 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; | ||
4544 | 4634 | ||
4545 | /* | ||
4546 | * Consider the group unbalanced when the imbalance is larger | ||
4547 | * than the average weight of a task. | ||
4548 | * | ||
4549 | * APZ: with cgroup the avg task weight can vary wildly and | ||
4550 | * might not be a suitable number - should we keep a | ||
4551 | * normalized nr_running number somewhere that negates | ||
4552 | * the hierarchy? | ||
4553 | */ | ||
4554 | if (sgs->sum_nr_running) | 4635 | if (sgs->sum_nr_running) |
4555 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 4636 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
4556 | 4637 | ||
4557 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && | 4638 | sgs->group_imb = sg_imbalanced(sgs, &sgi); |
4558 | (max_nr_running - min_nr_running) > 1) | 4639 | |
4559 | sgs->group_imb = 1; | 4640 | sgs->group_capacity = |
4641 | DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); | ||
4560 | 4642 | ||
4561 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, | ||
4562 | SCHED_POWER_SCALE); | ||
4563 | if (!sgs->group_capacity) | 4643 | if (!sgs->group_capacity) |
4564 | sgs->group_capacity = fix_small_capacity(env->sd, group); | 4644 | sgs->group_capacity = fix_small_capacity(env->sd, group); |
4645 | |||
4565 | sgs->group_weight = group->group_weight; | 4646 | sgs->group_weight = group->group_weight; |
4566 | 4647 | ||
4567 | if (sgs->group_capacity > sgs->sum_nr_running) | 4648 | if (sgs->group_capacity > sgs->sum_nr_running) |
@@ -4586,7 +4667,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
4586 | struct sched_group *sg, | 4667 | struct sched_group *sg, |
4587 | struct sg_lb_stats *sgs) | 4668 | struct sg_lb_stats *sgs) |
4588 | { | 4669 | { |
4589 | if (sgs->avg_load <= sds->max_load) | 4670 | if (sgs->avg_load <= sds->busiest_stat.avg_load) |
4590 | return false; | 4671 | return false; |
4591 | 4672 | ||
4592 | if (sgs->sum_nr_running > sgs->group_capacity) | 4673 | if (sgs->sum_nr_running > sgs->group_capacity) |
@@ -4619,11 +4700,11 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
4619 | * @sds: variable to hold the statistics for this sched_domain. | 4700 | * @sds: variable to hold the statistics for this sched_domain. |
4620 | */ | 4701 | */ |
4621 | static inline void update_sd_lb_stats(struct lb_env *env, | 4702 | static inline void update_sd_lb_stats(struct lb_env *env, |
4622 | int *balance, struct sd_lb_stats *sds) | 4703 | struct sd_lb_stats *sds) |
4623 | { | 4704 | { |
4624 | struct sched_domain *child = env->sd->child; | 4705 | struct sched_domain *child = env->sd->child; |
4625 | struct sched_group *sg = env->sd->groups; | 4706 | struct sched_group *sg = env->sd->groups; |
4626 | struct sg_lb_stats sgs; | 4707 | struct sg_lb_stats tmp_sgs; |
4627 | int load_idx, prefer_sibling = 0; | 4708 | int load_idx, prefer_sibling = 0; |
4628 | 4709 | ||
4629 | if (child && child->flags & SD_PREFER_SIBLING) | 4710 | if (child && child->flags & SD_PREFER_SIBLING) |
@@ -4632,17 +4713,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
4632 | load_idx = get_sd_load_idx(env->sd, env->idle); | 4713 | load_idx = get_sd_load_idx(env->sd, env->idle); |
4633 | 4714 | ||
4634 | do { | 4715 | do { |
4716 | struct sg_lb_stats *sgs = &tmp_sgs; | ||
4635 | int local_group; | 4717 | int local_group; |
4636 | 4718 | ||
4637 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); | 4719 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); |
4638 | memset(&sgs, 0, sizeof(sgs)); | 4720 | if (local_group) { |
4639 | update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); | 4721 | sds->local = sg; |
4640 | 4722 | sgs = &sds->local_stat; | |
4641 | if (local_group && !(*balance)) | 4723 | } |
4642 | return; | ||
4643 | 4724 | ||
4644 | sds->total_load += sgs.group_load; | 4725 | memset(sgs, 0, sizeof(*sgs)); |
4645 | sds->total_pwr += sg->sgp->power; | 4726 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); |
4646 | 4727 | ||
4647 | /* | 4728 | /* |
4648 | * In case the child domain prefers tasks go to siblings | 4729 | * In case the child domain prefers tasks go to siblings |
@@ -4654,26 +4735,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
4654 | * heaviest group when it is already under-utilized (possible | 4735 | * heaviest group when it is already under-utilized (possible |
4655 | * with a large weight task outweighs the tasks on the system). | 4736 | * with a large weight task outweighs the tasks on the system). |
4656 | */ | 4737 | */ |
4657 | if (prefer_sibling && !local_group && sds->this_has_capacity) | 4738 | if (prefer_sibling && !local_group && |
4658 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | 4739 | sds->local && sds->local_stat.group_has_capacity) |
4740 | sgs->group_capacity = min(sgs->group_capacity, 1U); | ||
4659 | 4741 | ||
4660 | if (local_group) { | 4742 | /* Now, start updating sd_lb_stats */ |
4661 | sds->this_load = sgs.avg_load; | 4743 | sds->total_load += sgs->group_load; |
4662 | sds->this = sg; | 4744 | sds->total_pwr += sgs->group_power; |
4663 | sds->this_nr_running = sgs.sum_nr_running; | 4745 | |
4664 | sds->this_load_per_task = sgs.sum_weighted_load; | 4746 | if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { |
4665 | sds->this_has_capacity = sgs.group_has_capacity; | ||
4666 | sds->this_idle_cpus = sgs.idle_cpus; | ||
4667 | } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { | ||
4668 | sds->max_load = sgs.avg_load; | ||
4669 | sds->busiest = sg; | 4747 | sds->busiest = sg; |
4670 | sds->busiest_nr_running = sgs.sum_nr_running; | 4748 | sds->busiest_stat = *sgs; |
4671 | sds->busiest_idle_cpus = sgs.idle_cpus; | ||
4672 | sds->busiest_group_capacity = sgs.group_capacity; | ||
4673 | sds->busiest_load_per_task = sgs.sum_weighted_load; | ||
4674 | sds->busiest_has_capacity = sgs.group_has_capacity; | ||
4675 | sds->busiest_group_weight = sgs.group_weight; | ||
4676 | sds->group_imb = sgs.group_imb; | ||
4677 | } | 4749 | } |
4678 | 4750 | ||
4679 | sg = sg->next; | 4751 | sg = sg->next; |
@@ -4718,7 +4790,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | |||
4718 | return 0; | 4790 | return 0; |
4719 | 4791 | ||
4720 | env->imbalance = DIV_ROUND_CLOSEST( | 4792 | env->imbalance = DIV_ROUND_CLOSEST( |
4721 | sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); | 4793 | sds->busiest_stat.avg_load * sds->busiest_stat.group_power, |
4794 | SCHED_POWER_SCALE); | ||
4722 | 4795 | ||
4723 | return 1; | 4796 | return 1; |
4724 | } | 4797 | } |
@@ -4736,24 +4809,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
4736 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | 4809 | unsigned long tmp, pwr_now = 0, pwr_move = 0; |
4737 | unsigned int imbn = 2; | 4810 | unsigned int imbn = 2; |
4738 | unsigned long scaled_busy_load_per_task; | 4811 | unsigned long scaled_busy_load_per_task; |
4812 | struct sg_lb_stats *local, *busiest; | ||
4739 | 4813 | ||
4740 | if (sds->this_nr_running) { | 4814 | local = &sds->local_stat; |
4741 | sds->this_load_per_task /= sds->this_nr_running; | 4815 | busiest = &sds->busiest_stat; |
4742 | if (sds->busiest_load_per_task > | 4816 | |
4743 | sds->this_load_per_task) | 4817 | if (!local->sum_nr_running) |
4744 | imbn = 1; | 4818 | local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); |
4745 | } else { | 4819 | else if (busiest->load_per_task > local->load_per_task) |
4746 | sds->this_load_per_task = | 4820 | imbn = 1; |
4747 | cpu_avg_load_per_task(env->dst_cpu); | ||
4748 | } | ||
4749 | 4821 | ||
4750 | scaled_busy_load_per_task = sds->busiest_load_per_task | 4822 | scaled_busy_load_per_task = |
4751 | * SCHED_POWER_SCALE; | 4823 | (busiest->load_per_task * SCHED_POWER_SCALE) / |
4752 | scaled_busy_load_per_task /= sds->busiest->sgp->power; | 4824 | busiest->group_power; |
4753 | 4825 | ||
4754 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | 4826 | if (busiest->avg_load + scaled_busy_load_per_task >= |
4755 | (scaled_busy_load_per_task * imbn)) { | 4827 | local->avg_load + (scaled_busy_load_per_task * imbn)) { |
4756 | env->imbalance = sds->busiest_load_per_task; | 4828 | env->imbalance = busiest->load_per_task; |
4757 | return; | 4829 | return; |
4758 | } | 4830 | } |
4759 | 4831 | ||
@@ -4763,34 +4835,37 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
4763 | * moving them. | 4835 | * moving them. |
4764 | */ | 4836 | */ |
4765 | 4837 | ||
4766 | pwr_now += sds->busiest->sgp->power * | 4838 | pwr_now += busiest->group_power * |
4767 | min(sds->busiest_load_per_task, sds->max_load); | 4839 | min(busiest->load_per_task, busiest->avg_load); |
4768 | pwr_now += sds->this->sgp->power * | 4840 | pwr_now += local->group_power * |
4769 | min(sds->this_load_per_task, sds->this_load); | 4841 | min(local->load_per_task, local->avg_load); |
4770 | pwr_now /= SCHED_POWER_SCALE; | 4842 | pwr_now /= SCHED_POWER_SCALE; |
4771 | 4843 | ||
4772 | /* Amount of load we'd subtract */ | 4844 | /* Amount of load we'd subtract */ |
4773 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / | 4845 | tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / |
4774 | sds->busiest->sgp->power; | 4846 | busiest->group_power; |
4775 | if (sds->max_load > tmp) | 4847 | if (busiest->avg_load > tmp) { |
4776 | pwr_move += sds->busiest->sgp->power * | 4848 | pwr_move += busiest->group_power * |
4777 | min(sds->busiest_load_per_task, sds->max_load - tmp); | 4849 | min(busiest->load_per_task, |
4850 | busiest->avg_load - tmp); | ||
4851 | } | ||
4778 | 4852 | ||
4779 | /* Amount of load we'd add */ | 4853 | /* Amount of load we'd add */ |
4780 | if (sds->max_load * sds->busiest->sgp->power < | 4854 | if (busiest->avg_load * busiest->group_power < |
4781 | sds->busiest_load_per_task * SCHED_POWER_SCALE) | 4855 | busiest->load_per_task * SCHED_POWER_SCALE) { |
4782 | tmp = (sds->max_load * sds->busiest->sgp->power) / | 4856 | tmp = (busiest->avg_load * busiest->group_power) / |
4783 | sds->this->sgp->power; | 4857 | local->group_power; |
4784 | else | 4858 | } else { |
4785 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / | 4859 | tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / |
4786 | sds->this->sgp->power; | 4860 | local->group_power; |
4787 | pwr_move += sds->this->sgp->power * | 4861 | } |
4788 | min(sds->this_load_per_task, sds->this_load + tmp); | 4862 | pwr_move += local->group_power * |
4863 | min(local->load_per_task, local->avg_load + tmp); | ||
4789 | pwr_move /= SCHED_POWER_SCALE; | 4864 | pwr_move /= SCHED_POWER_SCALE; |
4790 | 4865 | ||
4791 | /* Move if we gain throughput */ | 4866 | /* Move if we gain throughput */ |
4792 | if (pwr_move > pwr_now) | 4867 | if (pwr_move > pwr_now) |
4793 | env->imbalance = sds->busiest_load_per_task; | 4868 | env->imbalance = busiest->load_per_task; |
4794 | } | 4869 | } |
4795 | 4870 | ||
4796 | /** | 4871 | /** |
@@ -4802,11 +4877,18 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
4802 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | 4877 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
4803 | { | 4878 | { |
4804 | unsigned long max_pull, load_above_capacity = ~0UL; | 4879 | unsigned long max_pull, load_above_capacity = ~0UL; |
4880 | struct sg_lb_stats *local, *busiest; | ||
4881 | |||
4882 | local = &sds->local_stat; | ||
4883 | busiest = &sds->busiest_stat; | ||
4805 | 4884 | ||
4806 | sds->busiest_load_per_task /= sds->busiest_nr_running; | 4885 | if (busiest->group_imb) { |
4807 | if (sds->group_imb) { | 4886 | /* |
4808 | sds->busiest_load_per_task = | 4887 | * In the group_imb case we cannot rely on group-wide averages |
4809 | min(sds->busiest_load_per_task, sds->avg_load); | 4888 | * to ensure cpu-load equilibrium, look at wider averages. XXX |
4889 | */ | ||
4890 | busiest->load_per_task = | ||
4891 | min(busiest->load_per_task, sds->avg_load); | ||
4810 | } | 4892 | } |
4811 | 4893 | ||
4812 | /* | 4894 | /* |
@@ -4814,21 +4896,23 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4814 | * max load less than avg load (as we skip the groups at or below | 4896 | * max load less than avg load (as we skip the groups at or below |
4815 | * its cpu_power while calculating max_load). | 4897 | * its cpu_power while calculating max_load). |
4816 | */ | 4898 | */ |
4817 | if (sds->max_load < sds->avg_load) { | 4899 | if (busiest->avg_load <= sds->avg_load || |
4900 | local->avg_load >= sds->avg_load) { | ||
4818 | env->imbalance = 0; | 4901 | env->imbalance = 0; |
4819 | return fix_small_imbalance(env, sds); | 4902 | return fix_small_imbalance(env, sds); |
4820 | } | 4903 | } |
4821 | 4904 | ||
4822 | if (!sds->group_imb) { | 4905 | if (!busiest->group_imb) { |
4823 | /* | 4906 | /* |
4824 | * Don't want to pull so many tasks that a group would go idle. | 4907 | * Don't want to pull so many tasks that a group would go idle. |
4908 | * Except of course for the group_imb case, since then we might | ||
4909 | * have to drop below capacity to reach cpu-load equilibrium. | ||
4825 | */ | 4910 | */ |
4826 | load_above_capacity = (sds->busiest_nr_running - | 4911 | load_above_capacity = |
4827 | sds->busiest_group_capacity); | 4912 | (busiest->sum_nr_running - busiest->group_capacity); |
4828 | 4913 | ||
4829 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); | 4914 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); |
4830 | 4915 | load_above_capacity /= busiest->group_power; | |
4831 | load_above_capacity /= sds->busiest->sgp->power; | ||
4832 | } | 4916 | } |
4833 | 4917 | ||
4834 | /* | 4918 | /* |
@@ -4838,15 +4922,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4838 | * we also don't want to reduce the group load below the group capacity | 4922 | * we also don't want to reduce the group load below the group capacity |
4839 | * (so that we can implement power-savings policies etc). Thus we look | 4923 | * (so that we can implement power-savings policies etc). Thus we look |
4840 | * for the minimum possible imbalance. | 4924 | * for the minimum possible imbalance. |
4841 | * Be careful of negative numbers as they'll appear as very large values | ||
4842 | * with unsigned longs. | ||
4843 | */ | 4925 | */ |
4844 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | 4926 | max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); |
4845 | 4927 | ||
4846 | /* How much load to actually move to equalise the imbalance */ | 4928 | /* How much load to actually move to equalise the imbalance */ |
4847 | env->imbalance = min(max_pull * sds->busiest->sgp->power, | 4929 | env->imbalance = min( |
4848 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) | 4930 | max_pull * busiest->group_power, |
4849 | / SCHED_POWER_SCALE; | 4931 | (sds->avg_load - local->avg_load) * local->group_power |
4932 | ) / SCHED_POWER_SCALE; | ||
4850 | 4933 | ||
4851 | /* | 4934 | /* |
4852 | * if *imbalance is less than the average load per runnable task | 4935 | * if *imbalance is less than the average load per runnable task |
@@ -4854,9 +4937,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4854 | * a think about bumping its value to force at least one task to be | 4937 | * a think about bumping its value to force at least one task to be |
4855 | * moved | 4938 | * moved |
4856 | */ | 4939 | */ |
4857 | if (env->imbalance < sds->busiest_load_per_task) | 4940 | if (env->imbalance < busiest->load_per_task) |
4858 | return fix_small_imbalance(env, sds); | 4941 | return fix_small_imbalance(env, sds); |
4859 | |||
4860 | } | 4942 | } |
4861 | 4943 | ||
4862 | /******* find_busiest_group() helpers end here *********************/ | 4944 | /******* find_busiest_group() helpers end here *********************/ |
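
With the per-group stats now carried in sg_lb_stats, calculate_imbalance() above picks the load to move as the smaller of what the busiest group holds above the domain average (capped by its load above capacity) and what the local group can absorb below that average, each scaled by group power. A small worked sketch of that min(), assuming SCHED_POWER_SCALE is 1024 as in the kernel:

    #include <stdio.h>

    #define POWER_SCALE 1024UL

    /* env->imbalance = min(max_pull * busiest_power,
     *                      (avg - local_avg) * local_power) / POWER_SCALE */
    static unsigned long calc_imbalance(unsigned long busiest_avg,
                                        unsigned long local_avg,
                                        unsigned long domain_avg,
                                        unsigned long load_above_capacity,
                                        unsigned long busiest_power,
                                        unsigned long local_power)
    {
        unsigned long max_pull = busiest_avg - domain_avg;
        unsigned long a, b;

        if (load_above_capacity < max_pull)
            max_pull = load_above_capacity;

        a = max_pull * busiest_power;
        b = (domain_avg - local_avg) * local_power;

        return (a < b ? a : b) / POWER_SCALE;
    }

    int main(void)
    {
        /* Busiest group 50% above the domain average, local group below it,
         * both at nominal power: move no more than the local group can
         * absorb before it reaches the average (308 here). */
        unsigned long imb = calc_imbalance(1536, 716, 1024, ~0UL,
                                           POWER_SCALE, POWER_SCALE);
        printf("imbalance = %lu\n", imb);
        return 0;
    }
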
@@ -4872,69 +4954,62 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4872 | * to restore balance. | 4954 | * to restore balance. |
4873 | * | 4955 | * |
4874 | * @env: The load balancing environment. | 4956 | * @env: The load balancing environment. |
4875 | * @balance: Pointer to a variable indicating if this_cpu | ||
4876 | * is the appropriate cpu to perform load balancing at this_level. | ||
4877 | * | 4957 | * |
4878 | * Return: - The busiest group if imbalance exists. | 4958 | * Return: - The busiest group if imbalance exists. |
4879 | * - If no imbalance and user has opted for power-savings balance, | 4959 | * - If no imbalance and user has opted for power-savings balance, |
4880 | * return the least loaded group whose CPUs can be | 4960 | * return the least loaded group whose CPUs can be |
4881 | * put to idle by rebalancing its tasks onto our group. | 4961 | * put to idle by rebalancing its tasks onto our group. |
4882 | */ | 4962 | */ |
4883 | static struct sched_group * | 4963 | static struct sched_group *find_busiest_group(struct lb_env *env) |
4884 | find_busiest_group(struct lb_env *env, int *balance) | ||
4885 | { | 4964 | { |
4965 | struct sg_lb_stats *local, *busiest; | ||
4886 | struct sd_lb_stats sds; | 4966 | struct sd_lb_stats sds; |
4887 | 4967 | ||
4888 | memset(&sds, 0, sizeof(sds)); | 4968 | init_sd_lb_stats(&sds); |
4889 | 4969 | ||
4890 | /* | 4970 | /* |
4891 | * Compute the various statistics relevant for load balancing at | 4971 | * Compute the various statistics relevant for load balancing at |
4892 | * this level. | 4972 | * this level. |
4893 | */ | 4973 | */ |
4894 | update_sd_lb_stats(env, balance, &sds); | 4974 | update_sd_lb_stats(env, &sds); |
4895 | 4975 | local = &sds.local_stat; | |
4896 | /* | 4976 | busiest = &sds.busiest_stat; |
4897 | * this_cpu is not the appropriate cpu to perform load balancing at | ||
4898 | * this level. | ||
4899 | */ | ||
4900 | if (!(*balance)) | ||
4901 | goto ret; | ||
4902 | 4977 | ||
4903 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && | 4978 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && |
4904 | check_asym_packing(env, &sds)) | 4979 | check_asym_packing(env, &sds)) |
4905 | return sds.busiest; | 4980 | return sds.busiest; |
4906 | 4981 | ||
4907 | /* There is no busy sibling group to pull tasks from */ | 4982 | /* There is no busy sibling group to pull tasks from */ |
4908 | if (!sds.busiest || sds.busiest_nr_running == 0) | 4983 | if (!sds.busiest || busiest->sum_nr_running == 0) |
4909 | goto out_balanced; | 4984 | goto out_balanced; |
4910 | 4985 | ||
4911 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; | 4986 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; |
4912 | 4987 | ||
4913 | /* | 4988 | /* |
4914 | * If the busiest group is imbalanced the below checks don't | 4989 | * If the busiest group is imbalanced the below checks don't |
4915 | * work because they assumes all things are equal, which typically | 4990 | * work because they assume all things are equal, which typically |
4916 | * isn't true due to cpus_allowed constraints and the like. | 4991 | * isn't true due to cpus_allowed constraints and the like. |
4917 | */ | 4992 | */ |
4918 | if (sds.group_imb) | 4993 | if (busiest->group_imb) |
4919 | goto force_balance; | 4994 | goto force_balance; |
4920 | 4995 | ||
4921 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 4996 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
4922 | if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | 4997 | if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && |
4923 | !sds.busiest_has_capacity) | 4998 | !busiest->group_has_capacity) |
4924 | goto force_balance; | 4999 | goto force_balance; |
4925 | 5000 | ||
4926 | /* | 5001 | /* |
4927 | * If the local group is more busy than the selected busiest group | 5002 | * If the local group is more busy than the selected busiest group |
4928 | * don't try and pull any tasks. | 5003 | * don't try and pull any tasks. |
4929 | */ | 5004 | */ |
4930 | if (sds.this_load >= sds.max_load) | 5005 | if (local->avg_load >= busiest->avg_load) |
4931 | goto out_balanced; | 5006 | goto out_balanced; |
4932 | 5007 | ||
4933 | /* | 5008 | /* |
4934 | * Don't pull any tasks if this group is already above the domain | 5009 | * Don't pull any tasks if this group is already above the domain |
4935 | * average load. | 5010 | * average load. |
4936 | */ | 5011 | */ |
4937 | if (sds.this_load >= sds.avg_load) | 5012 | if (local->avg_load >= sds.avg_load) |
4938 | goto out_balanced; | 5013 | goto out_balanced; |
4939 | 5014 | ||
4940 | if (env->idle == CPU_IDLE) { | 5015 | if (env->idle == CPU_IDLE) { |
@@ -4944,15 +5019,16 @@ find_busiest_group(struct lb_env *env, int *balance) | |||
4944 | * there is no imbalance between this and busiest group | 5019 | * there is no imbalance between this and busiest group |
4946 | * wrt idle cpus, it is balanced. | 5021 | * wrt idle cpus, it is balanced. |
4946 | */ | 5021 | */ |
4947 | if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && | 5022 | if ((local->idle_cpus < busiest->idle_cpus) && |
4948 | sds.busiest_nr_running <= sds.busiest_group_weight) | 5023 | busiest->sum_nr_running <= busiest->group_weight) |
4949 | goto out_balanced; | 5024 | goto out_balanced; |
4950 | } else { | 5025 | } else { |
4951 | /* | 5026 | /* |
4952 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use | 5027 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use |
4953 | * imbalance_pct to be conservative. | 5028 | * imbalance_pct to be conservative. |
4954 | */ | 5029 | */ |
4955 | if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) | 5030 | if (100 * busiest->avg_load <= |
5031 | env->sd->imbalance_pct * local->avg_load) | ||
4956 | goto out_balanced; | 5032 | goto out_balanced; |
4957 | } | 5033 | } |
4958 | 5034 | ||
@@ -4962,7 +5038,6 @@ force_balance: | |||
4962 | return sds.busiest; | 5038 | return sds.busiest; |
4963 | 5039 | ||
4964 | out_balanced: | 5040 | out_balanced: |
4965 | ret: | ||
4966 | env->imbalance = 0; | 5041 | env->imbalance = 0; |
4967 | return NULL; | 5042 | return NULL; |
4968 | } | 5043 | } |
@@ -4974,10 +5049,10 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
4974 | struct sched_group *group) | 5049 | struct sched_group *group) |
4975 | { | 5050 | { |
4976 | struct rq *busiest = NULL, *rq; | 5051 | struct rq *busiest = NULL, *rq; |
4977 | unsigned long max_load = 0; | 5052 | unsigned long busiest_load = 0, busiest_power = 1; |
4978 | int i; | 5053 | int i; |
4979 | 5054 | ||
4980 | for_each_cpu(i, sched_group_cpus(group)) { | 5055 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
4981 | unsigned long power = power_of(i); | 5056 | unsigned long power = power_of(i); |
4982 | unsigned long capacity = DIV_ROUND_CLOSEST(power, | 5057 | unsigned long capacity = DIV_ROUND_CLOSEST(power, |
4983 | SCHED_POWER_SCALE); | 5058 | SCHED_POWER_SCALE); |
@@ -4986,9 +5061,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
4986 | if (!capacity) | 5061 | if (!capacity) |
4987 | capacity = fix_small_capacity(env->sd, group); | 5062 | capacity = fix_small_capacity(env->sd, group); |
4988 | 5063 | ||
4989 | if (!cpumask_test_cpu(i, env->cpus)) | ||
4990 | continue; | ||
4991 | |||
4992 | rq = cpu_rq(i); | 5064 | rq = cpu_rq(i); |
4993 | wl = weighted_cpuload(i); | 5065 | wl = weighted_cpuload(i); |
4994 | 5066 | ||
@@ -5004,11 +5076,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
5004 | * the weighted_cpuload() scaled with the cpu power, so that | 5076 | * the weighted_cpuload() scaled with the cpu power, so that |
5005 | * the load can be moved away from the cpu that is potentially | 5077 | * the load can be moved away from the cpu that is potentially |
5006 | * running at a lower capacity. | 5078 | * running at a lower capacity. |
5079 | * | ||
5080 | * Thus we're looking for max(wl_i / power_i); crosswise | ||
5081 | * multiplication to rid ourselves of the division works out | ||
5082 | * to: wl_i * power_j > wl_j * power_i, where j is our | ||
5083 | * previous maximum. | ||
5007 | */ | 5084 | */ |
5008 | wl = (wl * SCHED_POWER_SCALE) / power; | 5085 | if (wl * busiest_power > busiest_load * power) { |
5009 | 5086 | busiest_load = wl; | |
5010 | if (wl > max_load) { | 5087 | busiest_power = power; |
5011 | max_load = wl; | ||
5012 | busiest = rq; | 5088 | busiest = rq; |
5013 | } | 5089 | } |
5014 | } | 5090 | } |
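
find_busiest_queue() now avoids the per-cpu division: instead of comparing wl_i/power_i against a running maximum, it keeps the maximum's numerator and denominator and compares cross-products, exactly as the new comment states. A tiny sketch showing the two comparisons agree for positive values:

    #include <stdio.h>

    int main(void)
    {
        /* Candidate run-queues as (weighted load, cpu power) pairs. */
        unsigned long wl[]    = { 900, 1600, 1200 };
        unsigned long power[] = { 1024, 2048,  512 };

        unsigned long busiest_load = 0, busiest_power = 1;
        int busiest = -1;

        for (int i = 0; i < 3; i++) {
            /* wl_i / power_i > wl_j / power_j  <=>
             * wl_i * power_j > wl_j * power_i   (all terms positive),
             * so no division and no precision loss from scaling first. */
            if (wl[i] * busiest_power > busiest_load * power[i]) {
                busiest_load  = wl[i];
                busiest_power = power[i];
                busiest = i;
            }
        }

        printf("busiest cpu: %d (load/power = %lu/%lu)\n",
               busiest, busiest_load, busiest_power);
        return 0;
    }
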
@@ -5045,13 +5121,47 @@ static int need_active_balance(struct lb_env *env) | |||
5045 | 5121 | ||
5046 | static int active_load_balance_cpu_stop(void *data); | 5122 | static int active_load_balance_cpu_stop(void *data); |
5047 | 5123 | ||
5124 | static int should_we_balance(struct lb_env *env) | ||
5125 | { | ||
5126 | struct sched_group *sg = env->sd->groups; | ||
5127 | struct cpumask *sg_cpus, *sg_mask; | ||
5128 | int cpu, balance_cpu = -1; | ||
5129 | |||
5130 | /* | ||
5131 | * In the newly idle case, we will allow all the cpus | ||
5132 | * to do the newly idle load balance. | ||
5133 | */ | ||
5134 | if (env->idle == CPU_NEWLY_IDLE) | ||
5135 | return 1; | ||
5136 | |||
5137 | sg_cpus = sched_group_cpus(sg); | ||
5138 | sg_mask = sched_group_mask(sg); | ||
5139 | /* Try to find first idle cpu */ | ||
5140 | for_each_cpu_and(cpu, sg_cpus, env->cpus) { | ||
5141 | if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) | ||
5142 | continue; | ||
5143 | |||
5144 | balance_cpu = cpu; | ||
5145 | break; | ||
5146 | } | ||
5147 | |||
5148 | if (balance_cpu == -1) | ||
5149 | balance_cpu = group_balance_cpu(sg); | ||
5150 | |||
5151 | /* | ||
5152 | * First idle cpu or the first cpu (busiest) in this sched group | ||
5153 | * is eligible for doing load balancing at this and above domains. | ||
5154 | */ | ||
5155 | return balance_cpu == env->dst_cpu; | ||
5156 | } | ||
5157 | |||
5048 | /* | 5158 | /* |
5049 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 5159 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
5050 | * tasks if there is an imbalance. | 5160 | * tasks if there is an imbalance. |
5051 | */ | 5161 | */ |
5052 | static int load_balance(int this_cpu, struct rq *this_rq, | 5162 | static int load_balance(int this_cpu, struct rq *this_rq, |
5053 | struct sched_domain *sd, enum cpu_idle_type idle, | 5163 | struct sched_domain *sd, enum cpu_idle_type idle, |
5054 | int *balance) | 5164 | int *continue_balancing) |
5055 | { | 5165 | { |
5056 | int ld_moved, cur_ld_moved, active_balance = 0; | 5166 | int ld_moved, cur_ld_moved, active_balance = 0; |
5057 | struct sched_group *group; | 5167 | struct sched_group *group; |
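
should_we_balance() above moves the "who gets to balance" decision out of the stats pass: the first idle CPU in the group's balance mask is chosen, falling back to the group's designated balance CPU, and only that CPU carries on. A hedged userspace sketch of that selection, with plain arrays standing in for cpumasks and an is_idle[] table that is purely illustrative:

    #include <stdio.h>

    /* Return 1 if this_cpu is the one that should balance the group:
     * the first idle cpu wins, otherwise the group's first cpu does. */
    static int should_we_balance(int this_cpu, const int *group_cpus,
                                 const int *is_idle, int n, int newly_idle)
    {
        int balance_cpu = -1;

        /* Newly idle cpus may always try to pull work for themselves. */
        if (newly_idle)
            return 1;

        for (int i = 0; i < n; i++) {
            if (is_idle[group_cpus[i]]) {
                balance_cpu = group_cpus[i];
                break;
            }
        }

        if (balance_cpu == -1)
            balance_cpu = group_cpus[0];

        return balance_cpu == this_cpu;
    }

    int main(void)
    {
        int group[] = { 4, 5, 6, 7 };
        int idle[8] = { 0, 0, 0, 0, 0, 0, 1, 0 };   /* cpu 6 is idle */

        printf("cpu 4 balances: %d\n", should_we_balance(4, group, idle, 4, 0));
        printf("cpu 6 balances: %d\n", should_we_balance(6, group, idle, 4, 0));
        return 0;
    }
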
@@ -5081,11 +5191,12 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
5081 | schedstat_inc(sd, lb_count[idle]); | 5191 | schedstat_inc(sd, lb_count[idle]); |
5082 | 5192 | ||
5083 | redo: | 5193 | redo: |
5084 | group = find_busiest_group(&env, balance); | 5194 | if (!should_we_balance(&env)) { |
5085 | 5195 | *continue_balancing = 0; | |
5086 | if (*balance == 0) | ||
5087 | goto out_balanced; | 5196 | goto out_balanced; |
5197 | } | ||
5088 | 5198 | ||
5199 | group = find_busiest_group(&env); | ||
5089 | if (!group) { | 5200 | if (!group) { |
5090 | schedstat_inc(sd, lb_nobusyg[idle]); | 5201 | schedstat_inc(sd, lb_nobusyg[idle]); |
5091 | goto out_balanced; | 5202 | goto out_balanced; |
@@ -5114,7 +5225,6 @@ redo: | |||
5114 | env.src_rq = busiest; | 5225 | env.src_rq = busiest; |
5115 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 5226 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
5116 | 5227 | ||
5117 | update_h_load(env.src_cpu); | ||
5118 | more_balance: | 5228 | more_balance: |
5119 | local_irq_save(flags); | 5229 | local_irq_save(flags); |
5120 | double_rq_lock(env.dst_rq, busiest); | 5230 | double_rq_lock(env.dst_rq, busiest); |
@@ -5298,7 +5408,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5298 | rcu_read_lock(); | 5408 | rcu_read_lock(); |
5299 | for_each_domain(this_cpu, sd) { | 5409 | for_each_domain(this_cpu, sd) { |
5300 | unsigned long interval; | 5410 | unsigned long interval; |
5301 | int balance = 1; | 5411 | int continue_balancing = 1; |
5302 | 5412 | ||
5303 | if (!(sd->flags & SD_LOAD_BALANCE)) | 5413 | if (!(sd->flags & SD_LOAD_BALANCE)) |
5304 | continue; | 5414 | continue; |
@@ -5306,7 +5416,8 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5306 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 5416 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
5307 | /* If we've pulled tasks over stop searching: */ | 5417 | /* If we've pulled tasks over stop searching: */ |
5308 | pulled_task = load_balance(this_cpu, this_rq, | 5418 | pulled_task = load_balance(this_cpu, this_rq, |
5309 | sd, CPU_NEWLY_IDLE, &balance); | 5419 | sd, CPU_NEWLY_IDLE, |
5420 | &continue_balancing); | ||
5310 | } | 5421 | } |
5311 | 5422 | ||
5312 | interval = msecs_to_jiffies(sd->balance_interval); | 5423 | interval = msecs_to_jiffies(sd->balance_interval); |
@@ -5544,7 +5655,7 @@ void update_max_interval(void) | |||
5544 | */ | 5655 | */ |
5545 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | 5656 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) |
5546 | { | 5657 | { |
5547 | int balance = 1; | 5658 | int continue_balancing = 1; |
5548 | struct rq *rq = cpu_rq(cpu); | 5659 | struct rq *rq = cpu_rq(cpu); |
5549 | unsigned long interval; | 5660 | unsigned long interval; |
5550 | struct sched_domain *sd; | 5661 | struct sched_domain *sd; |
@@ -5576,7 +5687,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
5576 | } | 5687 | } |
5577 | 5688 | ||
5578 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 5689 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
5579 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 5690 | if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { |
5580 | /* | 5691 | /* |
5581 | * The LBF_SOME_PINNED logic could have changed | 5692 | * The LBF_SOME_PINNED logic could have changed |
5582 | * env->dst_cpu, so we can't know our idle | 5693 | * env->dst_cpu, so we can't know our idle |
@@ -5599,7 +5710,7 @@ out: | |||
5599 | * CPU in our sched group which is doing load balancing more | 5710 | * CPU in our sched group which is doing load balancing more |
5600 | * actively. | 5711 | * actively. |
5601 | */ | 5712 | */ |
5602 | if (!balance) | 5713 | if (!continue_balancing) |
5603 | break; | 5714 | break; |
5604 | } | 5715 | } |
5605 | rcu_read_unlock(); | 5716 | rcu_read_unlock(); |
@@ -5818,11 +5929,15 @@ static void task_fork_fair(struct task_struct *p) | |||
5818 | cfs_rq = task_cfs_rq(current); | 5929 | cfs_rq = task_cfs_rq(current); |
5819 | curr = cfs_rq->curr; | 5930 | curr = cfs_rq->curr; |
5820 | 5931 | ||
5821 | if (unlikely(task_cpu(p) != this_cpu)) { | 5932 | /* |
5822 | rcu_read_lock(); | 5933 | * Not only the cpu but also the task_group of the parent might have |
5823 | __set_task_cpu(p, this_cpu); | 5934 | * been changed after parent->se.parent,cfs_rq were copied to |
5824 | rcu_read_unlock(); | 5935 | * child->se.parent,cfs_rq. So call __set_task_cpu() to make those |
5825 | } | 5936 | * of child point to valid ones. |
5937 | */ | ||
5938 | rcu_read_lock(); | ||
5939 | __set_task_cpu(p, this_cpu); | ||
5940 | rcu_read_unlock(); | ||
5826 | 5941 | ||
5827 | update_curr(cfs_rq); | 5942 | update_curr(cfs_rq); |
5828 | 5943 | ||
@@ -5895,11 +6010,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
5895 | * and ensure we don't carry in an old decay_count if we | 6010 | * and ensure we don't carry in an old decay_count if we |
5896 | * switch back. | 6011 | * switch back. |
5897 | */ | 6012 | */ |
5898 | if (p->se.avg.decay_count) { | 6013 | if (se->avg.decay_count) { |
5899 | struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); | 6014 | __synchronize_entity_decay(se); |
5900 | __synchronize_entity_decay(&p->se); | 6015 | subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); |
5901 | subtract_blocked_load_contrib(cfs_rq, | ||
5902 | p->se.avg.load_avg_contrib); | ||
5903 | } | 6016 | } |
5904 | #endif | 6017 | #endif |
5905 | } | 6018 | } |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ef0a7b2439dd..b3c5653e1dca 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -285,7 +285,6 @@ struct cfs_rq { | |||
285 | /* Required to track per-cpu representation of a task_group */ | 285 | /* Required to track per-cpu representation of a task_group */ |
286 | u32 tg_runnable_contrib; | 286 | u32 tg_runnable_contrib; |
287 | unsigned long tg_load_contrib; | 287 | unsigned long tg_load_contrib; |
288 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
289 | 288 | ||
290 | /* | 289 | /* |
291 | * h_load = weight * f(tg) | 290 | * h_load = weight * f(tg) |
@@ -294,6 +293,9 @@ struct cfs_rq { | |||
294 | * this group. | 293 | * this group. |
295 | */ | 294 | */ |
296 | unsigned long h_load; | 295 | unsigned long h_load; |
296 | u64 last_h_load_update; | ||
297 | struct sched_entity *h_load_next; | ||
298 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
297 | #endif /* CONFIG_SMP */ | 299 | #endif /* CONFIG_SMP */ |
298 | 300 | ||
299 | #ifdef CONFIG_FAIR_GROUP_SCHED | 301 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -429,9 +431,6 @@ struct rq { | |||
429 | #ifdef CONFIG_FAIR_GROUP_SCHED | 431 | #ifdef CONFIG_FAIR_GROUP_SCHED |
430 | /* list of leaf cfs_rq on this cpu: */ | 432 | /* list of leaf cfs_rq on this cpu: */ |
431 | struct list_head leaf_cfs_rq_list; | 433 | struct list_head leaf_cfs_rq_list; |
432 | #ifdef CONFIG_SMP | ||
433 | unsigned long h_load_throttle; | ||
434 | #endif /* CONFIG_SMP */ | ||
435 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 434 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
436 | 435 | ||
437 | #ifdef CONFIG_RT_GROUP_SCHED | 436 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -595,6 +594,7 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | |||
595 | } | 594 | } |
596 | 595 | ||
597 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 596 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
597 | DECLARE_PER_CPU(int, sd_llc_size); | ||
598 | DECLARE_PER_CPU(int, sd_llc_id); | 598 | DECLARE_PER_CPU(int, sd_llc_id); |
599 | 599 | ||
600 | struct sched_group_power { | 600 | struct sched_group_power { |
@@ -665,9 +665,9 @@ extern int group_balance_cpu(struct sched_group *sg); | |||
665 | /* | 665 | /* |
666 | * Return the group to which this task belongs. | 666 | * Return the group to which this task belongs. |
667 | * | 667 | * |
668 | * We cannot use task_subsys_state() and friends because the cgroup | 668 | * We cannot use task_css() and friends because the cgroup subsystem |
669 | * subsystem changes that value before the cgroup_subsys::attach() method | 669 | * changes that value before the cgroup_subsys::attach() method is called, |
670 | * is called, therefore we cannot pin it and might observe the wrong value. | 670 | * therefore we cannot pin it and might observe the wrong value. |
671 | * | 671 | * |
672 | * The same is true for autogroup's p->signal->autogroup->tg, the autogroup | 672 | * The same is true for autogroup's p->signal->autogroup->tg, the autogroup |
673 | * core changes this before calling sched_move_task(). | 673 | * core changes this before calling sched_move_task(). |
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 5aef494fc8b4..c7edee71bce8 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
@@ -104,8 +104,9 @@ static inline void sched_info_queued(struct task_struct *t) | |||
104 | } | 104 | } |
105 | 105 | ||
106 | /* | 106 | /* |
107 | * Called when a process ceases being the active-running process, either | 107 | * Called when a process ceases being the active-running process involuntarily |
108 | * voluntarily or involuntarily. Now we can calculate how long we ran. | 108 | * due, typically, to expiring its time slice (this may also be called when |
109 | * switching to the idle task). Now we can calculate how long we ran. | ||
109 | * Also, if the process is still in the TASK_RUNNING state, call | 110 | * Also, if the process is still in the TASK_RUNNING state, call |
110 | * sched_info_queued() to mark that it has now again started waiting on | 111 | * sched_info_queued() to mark that it has now again started waiting on |
111 | * the runqueue. | 112 | * the runqueue. |
diff --git a/kernel/signal.c b/kernel/signal.c index 50e41075ac77..ded28b91fa53 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -3394,7 +3394,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, | |||
3394 | new_ka.sa.sa_restorer = compat_ptr(restorer); | 3394 | new_ka.sa.sa_restorer = compat_ptr(restorer); |
3395 | #endif | 3395 | #endif |
3396 | ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); | 3396 | ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); |
3397 | ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); | 3397 | ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); |
3398 | if (ret) | 3398 | if (ret) |
3399 | return -EFAULT; | 3399 | return -EFAULT; |
3400 | sigset_from_compat(&new_ka.sa.sa_mask, &mask); | 3400 | sigset_from_compat(&new_ka.sa.sa_mask, &mask); |
@@ -3406,7 +3406,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, | |||
3406 | ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), | 3406 | ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), |
3407 | &oact->sa_handler); | 3407 | &oact->sa_handler); |
3408 | ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); | 3408 | ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); |
3409 | ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); | 3409 | ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); |
3410 | #ifdef __ARCH_HAS_SA_RESTORER | 3410 | #ifdef __ARCH_HAS_SA_RESTORER |
3411 | ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), | 3411 | ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), |
3412 | &oact->sa_restorer); | 3412 | &oact->sa_restorer); |
diff --git a/kernel/smp.c b/kernel/smp.c index fe9f773d7114..0564571dcdf7 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -48,10 +48,13 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
48 | cpu_to_node(cpu))) | 48 | cpu_to_node(cpu))) |
49 | return notifier_from_errno(-ENOMEM); | 49 | return notifier_from_errno(-ENOMEM); |
50 | if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, | 50 | if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, |
51 | cpu_to_node(cpu))) | 51 | cpu_to_node(cpu))) { |
52 | free_cpumask_var(cfd->cpumask); | ||
52 | return notifier_from_errno(-ENOMEM); | 53 | return notifier_from_errno(-ENOMEM); |
54 | } | ||
53 | cfd->csd = alloc_percpu(struct call_single_data); | 55 | cfd->csd = alloc_percpu(struct call_single_data); |
54 | if (!cfd->csd) { | 56 | if (!cfd->csd) { |
57 | free_cpumask_var(cfd->cpumask_ipi); | ||
55 | free_cpumask_var(cfd->cpumask); | 58 | free_cpumask_var(cfd->cpumask); |
56 | return notifier_from_errno(-ENOMEM); | 59 | return notifier_from_errno(-ENOMEM); |
57 | } | 60 | } |
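
The hotplug_cfd() hunk above closes a leak: when a later allocation in the CPU_UP_PREPARE path fails, everything allocated before it must be freed before returning the error. The same unwind pattern in plain C (names are illustrative, not the kernel API):

    #include <stdlib.h>
    #include <stdio.h>

    struct cfd {
        void *cpumask;
        void *cpumask_ipi;
        void *csd;
    };

    /* Allocate three resources; on any failure free what was already
     * allocated, mirroring the fix in the hunk above. */
    static int cfd_init(struct cfd *cfd, size_t n)
    {
        cfd->cpumask = malloc(n);
        if (!cfd->cpumask)
            return -1;

        cfd->cpumask_ipi = malloc(n);
        if (!cfd->cpumask_ipi) {
            free(cfd->cpumask);
            return -1;
        }

        cfd->csd = malloc(n);
        if (!cfd->csd) {
            free(cfd->cpumask_ipi);
            free(cfd->cpumask);
            return -1;
        }
        return 0;
    }

    int main(void)
    {
        struct cfd cfd;

        if (cfd_init(&cfd, 64) == 0) {
            puts("all allocations succeeded");
            free(cfd.csd);
            free(cfd.cpumask_ipi);
            free(cfd.cpumask);
        }
        return 0;
    }
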
@@ -186,25 +189,13 @@ void generic_smp_call_function_single_interrupt(void) | |||
186 | 189 | ||
187 | while (!list_empty(&list)) { | 190 | while (!list_empty(&list)) { |
188 | struct call_single_data *csd; | 191 | struct call_single_data *csd; |
189 | unsigned int csd_flags; | ||
190 | 192 | ||
191 | csd = list_entry(list.next, struct call_single_data, list); | 193 | csd = list_entry(list.next, struct call_single_data, list); |
192 | list_del(&csd->list); | 194 | list_del(&csd->list); |
193 | 195 | ||
194 | /* | ||
195 | * 'csd' can be invalid after this call if flags == 0 | ||
196 | * (when called through generic_exec_single()), | ||
197 | * so save them away before making the call: | ||
198 | */ | ||
199 | csd_flags = csd->flags; | ||
200 | |||
201 | csd->func(csd->info); | 196 | csd->func(csd->info); |
202 | 197 | ||
203 | /* | 198 | csd_unlock(csd); |
204 | * Unlocked CSDs are valid through generic_exec_single(): | ||
205 | */ | ||
206 | if (csd_flags & CSD_FLAG_LOCK) | ||
207 | csd_unlock(csd); | ||
208 | } | 199 | } |
209 | } | 200 | } |
210 | 201 | ||
@@ -278,8 +269,6 @@ EXPORT_SYMBOL(smp_call_function_single); | |||
278 | * @wait: If true, wait until function has completed. | 269 | * @wait: If true, wait until function has completed. |
279 | * | 270 | * |
280 | * Returns 0 on success, else a negative status code (if no cpus were online). | 271 | * Returns 0 on success, else a negative status code (if no cpus were online). |
281 | * Note that @wait will be implicitly turned on in case of allocation failures, | ||
282 | * since we fall back to on-stack allocation. | ||
283 | * | 272 | * |
284 | * Selection preference: | 273 | * Selection preference: |
285 | * 1) current cpu if in @mask | 274 | * 1) current cpu if in @mask |
@@ -586,8 +575,10 @@ EXPORT_SYMBOL(on_each_cpu); | |||
586 | * | 575 | * |
587 | * If @wait is true, then returns once @func has returned. | 576 | * If @wait is true, then returns once @func has returned. |
588 | * | 577 | * |
589 | * You must not call this function with disabled interrupts or | 578 | * You must not call this function with disabled interrupts or from a |
590 | * from a hardware interrupt handler or from a bottom half handler. | 579 | * hardware interrupt handler or from a bottom half handler. The |
580 | * exception is that it may be used during early boot while | ||
581 | * early_boot_irqs_disabled is set. | ||
591 | */ | 582 | */ |
592 | void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, | 583 | void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, |
593 | void *info, bool wait) | 584 | void *info, bool wait) |
@@ -596,9 +587,10 @@ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, | |||
596 | 587 | ||
597 | smp_call_function_many(mask, func, info, wait); | 588 | smp_call_function_many(mask, func, info, wait); |
598 | if (cpumask_test_cpu(cpu, mask)) { | 589 | if (cpumask_test_cpu(cpu, mask)) { |
599 | local_irq_disable(); | 590 | unsigned long flags; |
591 | local_irq_save(flags); | ||
600 | func(info); | 592 | func(info); |
601 | local_irq_enable(); | 593 | local_irq_restore(flags); |
602 | } | 594 | } |
603 | put_cpu(); | 595 | put_cpu(); |
604 | } | 596 | } |
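The on_each_cpu_mask() hunk above swaps the unconditional disable/enable pair for save/restore so the function can also be called while interrupts are already off (e.g. during early boot, as the updated comment notes). A small sketch of the difference (function name invented):

#include <linux/irqflags.h>

static void call_with_irqs_off(void (*func)(void *), void *info)
{
	unsigned long flags;

	local_irq_save(flags);	/* records whether IRQs were already disabled */
	func(info);
	/*
	 * Restores that exact state; a bare local_irq_enable() here would
	 * wrongly re-enable IRQs for a caller that had them disabled.
	 */
	local_irq_restore(flags);
}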
diff --git a/kernel/softirq.c b/kernel/softirq.c index be3d3514c325..d7d498d8cc4f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -328,10 +328,19 @@ void irq_enter(void) | |||
328 | 328 | ||
329 | static inline void invoke_softirq(void) | 329 | static inline void invoke_softirq(void) |
330 | { | 330 | { |
331 | if (!force_irqthreads) | 331 | if (!force_irqthreads) { |
332 | __do_softirq(); | 332 | /* |
333 | else | 333 | * We can safely execute softirq on the current stack if |
334 | * it is the irq stack, because it should be near empty | ||
335 | * at this stage. But we have no way to know if the arch | ||
336 | * calls irq_exit() on the irq stack. So call softirq | ||
337 | * in its own stack to prevent from any overrun on top | ||
338 | * of a potentially deep task stack. | ||
339 | */ | ||
340 | do_softirq(); | ||
341 | } else { | ||
334 | wakeup_softirqd(); | 342 | wakeup_softirqd(); |
343 | } | ||
335 | } | 344 | } |
336 | 345 | ||
337 | static inline void tick_irq_exit(void) | 346 | static inline void tick_irq_exit(void) |
@@ -876,7 +885,6 @@ int __init __weak early_irq_init(void) | |||
876 | return 0; | 885 | return 0; |
877 | } | 886 | } |
878 | 887 | ||
879 | #ifdef CONFIG_GENERIC_HARDIRQS | ||
880 | int __init __weak arch_probe_nr_irqs(void) | 888 | int __init __weak arch_probe_nr_irqs(void) |
881 | { | 889 | { |
882 | return NR_IRQS_LEGACY; | 890 | return NR_IRQS_LEGACY; |
@@ -886,4 +894,3 @@ int __init __weak arch_early_irq_init(void) | |||
886 | { | 894 | { |
887 | return 0; | 895 | return 0; |
888 | } | 896 | } |
889 | #endif | ||
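The invoke_softirq() change above goes through do_softirq() so that an architecture providing a dedicated softirq/IRQ stack gets a chance to switch to it. Roughly what the generic do_softirq() entry point looks like in this era (a simplified sketch, not quoted from the tree):

asmlinkage void do_softirq_sketch(void)
{
	__u32 pending;
	unsigned long flags;

	if (in_interrupt())
		return;

	local_irq_save(flags);
	pending = local_softirq_pending();
	if (pending)
		__do_softirq();		/* arch overrides may run this on a separate stack */
	local_irq_restore(flags);
}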
diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 5cdd8065a3ce..4b082b5cac9e 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c | |||
@@ -34,6 +34,20 @@ | |||
34 | #else | 34 | #else |
35 | #define raw_read_can_lock(l) read_can_lock(l) | 35 | #define raw_read_can_lock(l) read_can_lock(l) |
36 | #define raw_write_can_lock(l) write_can_lock(l) | 36 | #define raw_write_can_lock(l) write_can_lock(l) |
37 | |||
38 | /* | ||
39 | * Some architectures can relax in favour of the CPU owning the lock. | ||
40 | */ | ||
41 | #ifndef arch_read_relax | ||
42 | # define arch_read_relax(l) cpu_relax() | ||
43 | #endif | ||
44 | #ifndef arch_write_relax | ||
45 | # define arch_write_relax(l) cpu_relax() | ||
46 | #endif | ||
47 | #ifndef arch_spin_relax | ||
48 | # define arch_spin_relax(l) cpu_relax() | ||
49 | #endif | ||
50 | |||
37 | /* | 51 | /* |
38 | * We build the __lock_function inlines here. They are too large for | 52 | * We build the __lock_function inlines here. They are too large for |
39 | * inlining all over the place, but here is only one user per function | 53 | * inlining all over the place, but here is only one user per function |
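The fallback defines above are consumed by the BUILD_LOCK_OPS()-generated loops later in this file. A condensed sketch of how such a loop uses arch_spin_relax() (simplified from the generated code, not verbatim):

static void __lockfunc spin_lock_sketch(raw_spinlock_t *lock)
{
	for (;;) {
		preempt_disable();
		if (likely(do_raw_spin_trylock(lock)))
			break;
		preempt_enable();

		/* busy-wait politely; defaults to cpu_relax() per the defines above */
		while (!raw_spin_can_lock(lock))
			arch_spin_relax(&lock->raw_lock);
	}
}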
diff --git a/kernel/sys.c b/kernel/sys.c index 771129b299f8..c18ecca575b4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -337,7 +337,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | |||
337 | if (rgid != (gid_t) -1) { | 337 | if (rgid != (gid_t) -1) { |
338 | if (gid_eq(old->gid, krgid) || | 338 | if (gid_eq(old->gid, krgid) || |
339 | gid_eq(old->egid, krgid) || | 339 | gid_eq(old->egid, krgid) || |
340 | nsown_capable(CAP_SETGID)) | 340 | ns_capable(old->user_ns, CAP_SETGID)) |
341 | new->gid = krgid; | 341 | new->gid = krgid; |
342 | else | 342 | else |
343 | goto error; | 343 | goto error; |
@@ -346,7 +346,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | |||
346 | if (gid_eq(old->gid, kegid) || | 346 | if (gid_eq(old->gid, kegid) || |
347 | gid_eq(old->egid, kegid) || | 347 | gid_eq(old->egid, kegid) || |
348 | gid_eq(old->sgid, kegid) || | 348 | gid_eq(old->sgid, kegid) || |
349 | nsown_capable(CAP_SETGID)) | 349 | ns_capable(old->user_ns, CAP_SETGID)) |
350 | new->egid = kegid; | 350 | new->egid = kegid; |
351 | else | 351 | else |
352 | goto error; | 352 | goto error; |
@@ -387,7 +387,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) | |||
387 | old = current_cred(); | 387 | old = current_cred(); |
388 | 388 | ||
389 | retval = -EPERM; | 389 | retval = -EPERM; |
390 | if (nsown_capable(CAP_SETGID)) | 390 | if (ns_capable(old->user_ns, CAP_SETGID)) |
391 | new->gid = new->egid = new->sgid = new->fsgid = kgid; | 391 | new->gid = new->egid = new->sgid = new->fsgid = kgid; |
392 | else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) | 392 | else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) |
393 | new->egid = new->fsgid = kgid; | 393 | new->egid = new->fsgid = kgid; |
@@ -471,7 +471,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | |||
471 | new->uid = kruid; | 471 | new->uid = kruid; |
472 | if (!uid_eq(old->uid, kruid) && | 472 | if (!uid_eq(old->uid, kruid) && |
473 | !uid_eq(old->euid, kruid) && | 473 | !uid_eq(old->euid, kruid) && |
474 | !nsown_capable(CAP_SETUID)) | 474 | !ns_capable(old->user_ns, CAP_SETUID)) |
475 | goto error; | 475 | goto error; |
476 | } | 476 | } |
477 | 477 | ||
@@ -480,7 +480,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | |||
480 | if (!uid_eq(old->uid, keuid) && | 480 | if (!uid_eq(old->uid, keuid) && |
481 | !uid_eq(old->euid, keuid) && | 481 | !uid_eq(old->euid, keuid) && |
482 | !uid_eq(old->suid, keuid) && | 482 | !uid_eq(old->suid, keuid) && |
483 | !nsown_capable(CAP_SETUID)) | 483 | !ns_capable(old->user_ns, CAP_SETUID)) |
484 | goto error; | 484 | goto error; |
485 | } | 485 | } |
486 | 486 | ||
@@ -534,7 +534,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) | |||
534 | old = current_cred(); | 534 | old = current_cred(); |
535 | 535 | ||
536 | retval = -EPERM; | 536 | retval = -EPERM; |
537 | if (nsown_capable(CAP_SETUID)) { | 537 | if (ns_capable(old->user_ns, CAP_SETUID)) { |
538 | new->suid = new->uid = kuid; | 538 | new->suid = new->uid = kuid; |
539 | if (!uid_eq(kuid, old->uid)) { | 539 | if (!uid_eq(kuid, old->uid)) { |
540 | retval = set_user(new); | 540 | retval = set_user(new); |
@@ -591,7 +591,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) | |||
591 | old = current_cred(); | 591 | old = current_cred(); |
592 | 592 | ||
593 | retval = -EPERM; | 593 | retval = -EPERM; |
594 | if (!nsown_capable(CAP_SETUID)) { | 594 | if (!ns_capable(old->user_ns, CAP_SETUID)) { |
595 | if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && | 595 | if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && |
596 | !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) | 596 | !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) |
597 | goto error; | 597 | goto error; |
@@ -673,7 +673,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) | |||
673 | old = current_cred(); | 673 | old = current_cred(); |
674 | 674 | ||
675 | retval = -EPERM; | 675 | retval = -EPERM; |
676 | if (!nsown_capable(CAP_SETGID)) { | 676 | if (!ns_capable(old->user_ns, CAP_SETGID)) { |
677 | if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && | 677 | if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && |
678 | !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) | 678 | !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) |
679 | goto error; | 679 | goto error; |
@@ -744,7 +744,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) | |||
744 | 744 | ||
745 | if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || | 745 | if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || |
746 | uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || | 746 | uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || |
747 | nsown_capable(CAP_SETUID)) { | 747 | ns_capable(old->user_ns, CAP_SETUID)) { |
748 | if (!uid_eq(kuid, old->fsuid)) { | 748 | if (!uid_eq(kuid, old->fsuid)) { |
749 | new->fsuid = kuid; | 749 | new->fsuid = kuid; |
750 | if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) | 750 | if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) |
@@ -783,7 +783,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) | |||
783 | 783 | ||
784 | if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || | 784 | if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || |
785 | gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || | 785 | gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || |
786 | nsown_capable(CAP_SETGID)) { | 786 | ns_capable(old->user_ns, CAP_SETGID)) { |
787 | if (!gid_eq(kgid, old->fsgid)) { | 787 | if (!gid_eq(kgid, old->fsgid)) { |
788 | new->fsgid = kgid; | 788 | new->fsgid = kgid; |
789 | goto change_okay; | 789 | goto change_okay; |
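Every setuid/setgid path above now names the user namespace explicitly instead of going through the removed shorthand. A sketch of the new pattern (the wrapper name is invented for illustration):

#include <linux/cred.h>
#include <linux/capability.h>

static bool may_change_gid(const struct cred *old)
{
	/*
	 * CAP_SETGID must be held relative to the user namespace that owns
	 * the credentials being modified, not merely in some namespace.
	 */
	return ns_capable(old->user_ns, CAP_SETGID);
}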
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 07f6fc468e17..b2f06f3c6a3f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -1225,7 +1225,7 @@ static struct ctl_table vm_table[] = { | |||
1225 | .data = &hugepages_treat_as_movable, | 1225 | .data = &hugepages_treat_as_movable, |
1226 | .maxlen = sizeof(int), | 1226 | .maxlen = sizeof(int), |
1227 | .mode = 0644, | 1227 | .mode = 0644, |
1228 | .proc_handler = hugetlb_treat_movable_handler, | 1228 | .proc_handler = proc_dointvec, |
1229 | }, | 1229 | }, |
1230 | { | 1230 | { |
1231 | .procname = "nr_overcommit_hugepages", | 1231 | .procname = "nr_overcommit_hugepages", |
@@ -1471,14 +1471,14 @@ static struct ctl_table fs_table[] = { | |||
1471 | { | 1471 | { |
1472 | .procname = "inode-nr", | 1472 | .procname = "inode-nr", |
1473 | .data = &inodes_stat, | 1473 | .data = &inodes_stat, |
1474 | .maxlen = 2*sizeof(int), | 1474 | .maxlen = 2*sizeof(long), |
1475 | .mode = 0444, | 1475 | .mode = 0444, |
1476 | .proc_handler = proc_nr_inodes, | 1476 | .proc_handler = proc_nr_inodes, |
1477 | }, | 1477 | }, |
1478 | { | 1478 | { |
1479 | .procname = "inode-state", | 1479 | .procname = "inode-state", |
1480 | .data = &inodes_stat, | 1480 | .data = &inodes_stat, |
1481 | .maxlen = 7*sizeof(int), | 1481 | .maxlen = 7*sizeof(long), |
1482 | .mode = 0444, | 1482 | .mode = 0444, |
1483 | .proc_handler = proc_nr_inodes, | 1483 | .proc_handler = proc_nr_inodes, |
1484 | }, | 1484 | }, |
@@ -1508,7 +1508,7 @@ static struct ctl_table fs_table[] = { | |||
1508 | { | 1508 | { |
1509 | .procname = "dentry-state", | 1509 | .procname = "dentry-state", |
1510 | .data = &dentry_stat, | 1510 | .data = &dentry_stat, |
1511 | .maxlen = 6*sizeof(int), | 1511 | .maxlen = 6*sizeof(long), |
1512 | .mode = 0444, | 1512 | .mode = 0444, |
1513 | .proc_handler = proc_nr_dentry, | 1513 | .proc_handler = proc_nr_dentry, |
1514 | }, | 1514 | }, |
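The maxlen bumps above follow the VFS counters changing type from int to long; a ctl_table entry's .maxlen has to describe the real storage behind .data. A made-up entry showing the pieces that must agree:

static unsigned long example_stats[2];

static struct ctl_table example_table[] = {
	{
		.procname	= "example-nr",
		.data		= &example_stats,
		.maxlen		= 2 * sizeof(unsigned long),	/* must match the array above */
		.mode		= 0444,
		.proc_handler	= proc_doulongvec_minmax,	/* a handler that parses longs */
	},
	{ }
};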
diff --git a/kernel/task_work.c b/kernel/task_work.c index 65bd3c92d6f3..8727032e3a6f 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c | |||
@@ -4,6 +4,23 @@ | |||
4 | 4 | ||
5 | static struct callback_head work_exited; /* all we need is ->next == NULL */ | 5 | static struct callback_head work_exited; /* all we need is ->next == NULL */ |
6 | 6 | ||
7 | /** | ||
8 | * task_work_add - ask the @task to execute @work->func() | ||
9 | * @task: the task which should run the callback | ||
10 | * @work: the callback to run | ||
11 | * @notify: send the notification if true | ||
12 | * | ||
13 | * Queue @work for task_work_run() below and notify the @task if @notify. | ||
14 | * Fails if the @task is exiting/exited and thus it can't process this @work. | ||
15 | * Otherwise @work->func() will be called when the @task returns from kernel | ||
16 | * mode or exits. | ||
17 | * | ||
18 | * This is like the signal handler which runs in kernel mode, but it doesn't | ||
19 | * try to wake up the @task. | ||
20 | * | ||
21 | * RETURNS: | ||
22 | * 0 if succeeds or -ESRCH. | ||
23 | */ | ||
7 | int | 24 | int |
8 | task_work_add(struct task_struct *task, struct callback_head *work, bool notify) | 25 | task_work_add(struct task_struct *task, struct callback_head *work, bool notify) |
9 | { | 26 | { |
@@ -21,11 +38,22 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify) | |||
21 | return 0; | 38 | return 0; |
22 | } | 39 | } |
23 | 40 | ||
41 | /** | ||
42 | * task_work_cancel - cancel a pending work added by task_work_add() | ||
43 | * @task: the task which should execute the work | ||
44 | * @func: identifies the work to remove | ||
45 | * | ||
46 | * Find the last queued pending work with ->func == @func and remove | ||
47 | * it from queue. | ||
48 | * | ||
49 | * RETURNS: | ||
50 | * The found work or NULL if not found. | ||
51 | */ | ||
24 | struct callback_head * | 52 | struct callback_head * |
25 | task_work_cancel(struct task_struct *task, task_work_func_t func) | 53 | task_work_cancel(struct task_struct *task, task_work_func_t func) |
26 | { | 54 | { |
27 | struct callback_head **pprev = &task->task_works; | 55 | struct callback_head **pprev = &task->task_works; |
28 | struct callback_head *work = NULL; | 56 | struct callback_head *work; |
29 | unsigned long flags; | 57 | unsigned long flags; |
30 | /* | 58 | /* |
31 | * If cmpxchg() fails we continue without updating pprev. | 59 | * If cmpxchg() fails we continue without updating pprev. |
@@ -35,7 +63,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) | |||
35 | */ | 63 | */ |
36 | raw_spin_lock_irqsave(&task->pi_lock, flags); | 64 | raw_spin_lock_irqsave(&task->pi_lock, flags); |
37 | while ((work = ACCESS_ONCE(*pprev))) { | 65 | while ((work = ACCESS_ONCE(*pprev))) { |
38 | read_barrier_depends(); | 66 | smp_read_barrier_depends(); |
39 | if (work->func != func) | 67 | if (work->func != func) |
40 | pprev = &work->next; | 68 | pprev = &work->next; |
41 | else if (cmpxchg(pprev, work, work->next) == work) | 69 | else if (cmpxchg(pprev, work, work->next) == work) |
@@ -46,6 +74,14 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) | |||
46 | return work; | 74 | return work; |
47 | } | 75 | } |
48 | 76 | ||
77 | /** | ||
78 | * task_work_run - execute the works added by task_work_add() | ||
79 | * | ||
80 | * Flush the pending works. Should be used by the core kernel code. | ||
81 | * Called before the task returns to the user-mode or stops, or when | ||
82 | * it exits. In the latter case task_work_add() can no longer add the | ||
83 | * new work after task_work_run() returns. | ||
84 | */ | ||
49 | void task_work_run(void) | 85 | void task_work_run(void) |
50 | { | 86 | { |
51 | struct task_struct *task = current; | 87 | struct task_struct *task = current; |
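A usage sketch for the interface documented above (callback and wrapper names are invented; error handling trimmed to the essentials):

#include <linux/task_work.h>
#include <linux/slab.h>
#include <linux/sched.h>

static void my_twork_func(struct callback_head *head)
{
	/* runs when the target task next returns to user mode, or when it exits */
	kfree(head);
}

static int queue_my_twork(struct task_struct *task)
{
	struct callback_head *work = kzalloc(sizeof(*work), GFP_KERNEL);

	if (!work)
		return -ENOMEM;

	init_task_work(work, my_twork_func);
	if (task_work_add(task, work, true)) {	/* true == notify the task */
		kfree(work);			/* task already exiting: -ESRCH */
		return -ESRCH;
	}
	return 0;
}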
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 70f27e89012b..2b62fe86f9ec 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -105,7 +105,6 @@ config NO_HZ_FULL | |||
105 | select RCU_USER_QS | 105 | select RCU_USER_QS |
106 | select RCU_NOCB_CPU | 106 | select RCU_NOCB_CPU |
107 | select VIRT_CPU_ACCOUNTING_GEN | 107 | select VIRT_CPU_ACCOUNTING_GEN |
108 | select CONTEXT_TRACKING_FORCE | ||
109 | select IRQ_WORK | 108 | select IRQ_WORK |
110 | help | 109 | help |
111 | Adaptively try to shutdown the tick whenever possible, even when | 110 | Adaptively try to shutdown the tick whenever possible, even when |
@@ -134,6 +133,56 @@ config NO_HZ_FULL_ALL | |||
134 | Note the boot CPU will still be kept outside the range to | 133 | Note the boot CPU will still be kept outside the range to |
135 | handle the timekeeping duty. | 134 | handle the timekeeping duty. |
136 | 135 | ||
136 | config NO_HZ_FULL_SYSIDLE | ||
137 | bool "Detect full-system idle state for full dynticks system" | ||
138 | depends on NO_HZ_FULL | ||
139 | default n | ||
140 | help | ||
141 | At least one CPU must keep the scheduling-clock tick running for | ||
142 | timekeeping purposes whenever there is a non-idle CPU, where | ||
143 | "non-idle" also includes dynticks CPUs as long as they are | ||
144 | running non-idle tasks. Because the underlying adaptive-tick | ||
145 | support cannot distinguish between all CPUs being idle and | ||
146 | all CPUs each running a single task in dynticks mode, the | ||
147 | underlying support simply ensures that there is always a CPU | ||
148 | handling the scheduling-clock tick, whether or not all CPUs | ||
149 | are idle. This Kconfig option enables scalable detection of | ||
150 | the all-CPUs-idle state, thus allowing the scheduling-clock | ||
151 | tick to be disabled when all CPUs are idle. Note that scalable | ||
152 | detection of the all-CPUs-idle state means that larger systems | ||
153 | will be slower to declare the all-CPUs-idle state. | ||
154 | |||
155 | Say Y if you would like to help debug all-CPUs-idle detection. | ||
156 | |||
157 | Say N if you are unsure. | ||
158 | |||
159 | config NO_HZ_FULL_SYSIDLE_SMALL | ||
160 | int "Number of CPUs above which large-system approach is used" | ||
161 | depends on NO_HZ_FULL_SYSIDLE | ||
162 | range 1 NR_CPUS | ||
163 | default 8 | ||
164 | help | ||
165 | The full-system idle detection mechanism takes a lazy approach | ||
166 | on large systems, as is required to attain decent scalability. | ||
167 | However, on smaller systems, scalability is not anywhere near as | ||
168 | large a concern as is energy efficiency. The sysidle subsystem | ||
169 | therefore uses a fast but non-scalable algorithm for small | ||
170 | systems and a lazier but scalable algorithm for large systems. | ||
171 | This Kconfig parameter defines the number of CPUs in the largest | ||
172 | system that will be considered to be "small". | ||
173 | |||
174 | The default value will be fine in most cases. Battery-powered | ||
175 | systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger | ||
176 | numbers of CPUs, and (3) are suffering from battery-lifetime | ||
177 | problems due to long sysidle latencies might wish to experiment | ||
178 | with larger values for this Kconfig parameter. On the other | ||
179 | hand, they might be even better served by disabling NO_HZ_FULL | ||
180 | entirely, given that NO_HZ_FULL is intended for HPC and | ||
181 | real-time workloads that at present do not tend to be run on | ||
182 | battery-powered systems. | ||
183 | |||
184 | Take the default if you are unsure. | ||
185 | |||
137 | config NO_HZ | 186 | config NO_HZ |
138 | bool "Old Idle dynticks config" | 187 | bool "Old Idle dynticks config" |
139 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS | 188 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 38959c866789..662c5798a685 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -33,29 +33,64 @@ struct ce_unbind { | |||
33 | int res; | 33 | int res; |
34 | }; | 34 | }; |
35 | 35 | ||
36 | /** | 36 | static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, |
37 | * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds | 37 | bool ismax) |
38 | * @latch: value to convert | ||
39 | * @evt: pointer to clock event device descriptor | ||
40 | * | ||
41 | * Math helper, returns latch value converted to nanoseconds (bound checked) | ||
42 | */ | ||
43 | u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) | ||
44 | { | 38 | { |
45 | u64 clc = (u64) latch << evt->shift; | 39 | u64 clc = (u64) latch << evt->shift; |
40 | u64 rnd; | ||
46 | 41 | ||
47 | if (unlikely(!evt->mult)) { | 42 | if (unlikely(!evt->mult)) { |
48 | evt->mult = 1; | 43 | evt->mult = 1; |
49 | WARN_ON(1); | 44 | WARN_ON(1); |
50 | } | 45 | } |
46 | rnd = (u64) evt->mult - 1; | ||
47 | |||
48 | /* | ||
49 | * Upper bound sanity check. If the backwards conversion is | ||
50 | * not equal latch, we know that the above shift overflowed. | ||
51 | */ | ||
52 | if ((clc >> evt->shift) != (u64)latch) | ||
53 | clc = ~0ULL; | ||
54 | |||
55 | /* | ||
56 | * Scaled math oddities: | ||
57 | * | ||
58 | * For mult <= (1 << shift) we can safely add mult - 1 to | ||
59 | * prevent integer rounding loss. So the backwards conversion | ||
60 | * from nsec to device ticks will be correct. | ||
61 | * | ||
62 | * For mult > (1 << shift), i.e. device frequency is > 1GHz we | ||
63 | * need to be careful. Adding mult - 1 will result in a value | ||
64 | * which when converted back to device ticks can be larger | ||
65 | * than latch by up to (mult - 1) >> shift. For the min_delta | ||
66 | * calculation we still want to apply this in order to stay | ||
67 | * above the minimum device ticks limit. For the upper limit | ||
68 | * we would end up with a latch value larger than the upper | ||
69 | * limit of the device, so we omit the add to stay below the | ||
70 | * device upper boundary. | ||
71 | * | ||
72 | * Also omit the add if it would overflow the u64 boundary. | ||
73 | */ | ||
74 | if ((~0ULL - clc > rnd) && | ||
75 | (!ismax || evt->mult <= (1U << evt->shift))) | ||
76 | clc += rnd; | ||
51 | 77 | ||
52 | do_div(clc, evt->mult); | 78 | do_div(clc, evt->mult); |
53 | if (clc < 1000) | ||
54 | clc = 1000; | ||
55 | if (clc > KTIME_MAX) | ||
56 | clc = KTIME_MAX; | ||
57 | 79 | ||
58 | return clc; | 80 | /* Deltas less than 1usec are pointless noise */ |
81 | return clc > 1000 ? clc : 1000; | ||
82 | } | ||
83 | |||
84 | /** | ||
85 | * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds | ||
86 | * @latch: value to convert | ||
87 | * @evt: pointer to clock event device descriptor | ||
88 | * | ||
89 | * Math helper, returns latch value converted to nanoseconds (bound checked) | ||
90 | */ | ||
91 | u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) | ||
92 | { | ||
93 | return cev_delta2ns(latch, evt, false); | ||
59 | } | 94 | } |
60 | EXPORT_SYMBOL_GPL(clockevent_delta2ns); | 95 | EXPORT_SYMBOL_GPL(clockevent_delta2ns); |
61 | 96 | ||
@@ -380,8 +415,8 @@ void clockevents_config(struct clock_event_device *dev, u32 freq) | |||
380 | sec = 600; | 415 | sec = 600; |
381 | 416 | ||
382 | clockevents_calc_mult_shift(dev, freq, sec); | 417 | clockevents_calc_mult_shift(dev, freq, sec); |
383 | dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); | 418 | dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); |
384 | dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); | 419 | dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); |
385 | } | 420 | } |
386 | 421 | ||
387 | /** | 422 | /** |
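For readers who want to poke at the rounding behaviour introduced by cev_delta2ns(), a standalone user-space approximation (no overflow clamp, no 1 usec floor; the mult/shift values are illustrative only):

#include <stdint.h>
#include <stdio.h>

/*
 * ns -> ticks is ns * mult >> shift, so ticks -> ns is (ticks << shift) / mult.
 * Adding mult - 1 before the truncating division keeps the round trip from
 * ever coming back below the original tick count.
 */
static uint64_t delta2ns(uint64_t latch, uint32_t mult, uint32_t shift, int round_up)
{
	uint64_t clc = latch << shift;

	if (round_up)
		clc += (uint64_t)mult - 1;
	return clc / mult;
}

int main(void)
{
	uint32_t mult = 3006477107u, shift = 32;	/* roughly a 700 MHz clockevent */

	printf("7 ticks -> %llu ns (truncated), %llu ns (rounded up)\n",
	       (unsigned long long)delta2ns(7, mult, shift, 0),
	       (unsigned long long)delta2ns(7, mult, shift, 1));
	return 0;
}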
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8f5b3b98577b..bb2215174f05 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -516,13 +516,13 @@ static void sync_cmos_clock(struct work_struct *work) | |||
516 | schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); | 516 | schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); |
517 | } | 517 | } |
518 | 518 | ||
519 | static void notify_cmos_timer(void) | 519 | void ntp_notify_cmos_timer(void) |
520 | { | 520 | { |
521 | schedule_delayed_work(&sync_cmos_work, 0); | 521 | schedule_delayed_work(&sync_cmos_work, 0); |
522 | } | 522 | } |
523 | 523 | ||
524 | #else | 524 | #else |
525 | static inline void notify_cmos_timer(void) { } | 525 | void ntp_notify_cmos_timer(void) { } |
526 | #endif | 526 | #endif |
527 | 527 | ||
528 | 528 | ||
@@ -687,8 +687,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) | |||
687 | if (!(time_status & STA_NANO)) | 687 | if (!(time_status & STA_NANO)) |
688 | txc->time.tv_usec /= NSEC_PER_USEC; | 688 | txc->time.tv_usec /= NSEC_PER_USEC; |
689 | 689 | ||
690 | notify_cmos_timer(); | ||
691 | |||
692 | return result; | 690 | return result; |
693 | } | 691 | } |
694 | 692 | ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e8a1516cc0a3..3612fc77f834 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/irq_work.h> | 23 | #include <linux/irq_work.h> |
24 | #include <linux/posix-timers.h> | 24 | #include <linux/posix-timers.h> |
25 | #include <linux/perf_event.h> | 25 | #include <linux/perf_event.h> |
26 | #include <linux/context_tracking.h> | ||
26 | 27 | ||
27 | #include <asm/irq_regs.h> | 28 | #include <asm/irq_regs.h> |
28 | 29 | ||
@@ -148,8 +149,8 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) | |||
148 | } | 149 | } |
149 | 150 | ||
150 | #ifdef CONFIG_NO_HZ_FULL | 151 | #ifdef CONFIG_NO_HZ_FULL |
151 | static cpumask_var_t nohz_full_mask; | 152 | cpumask_var_t tick_nohz_full_mask; |
152 | bool have_nohz_full_mask; | 153 | bool tick_nohz_full_running; |
153 | 154 | ||
154 | static bool can_stop_full_tick(void) | 155 | static bool can_stop_full_tick(void) |
155 | { | 156 | { |
@@ -182,7 +183,7 @@ static bool can_stop_full_tick(void) | |||
182 | * Don't allow the user to think they can get | 183 | * Don't allow the user to think they can get |
183 | * full NO_HZ with this machine. | 184 | * full NO_HZ with this machine. |
184 | */ | 185 | */ |
185 | WARN_ONCE(have_nohz_full_mask, | 186 | WARN_ONCE(tick_nohz_full_running, |
186 | "NO_HZ FULL will not work with unstable sched clock"); | 187 | "NO_HZ FULL will not work with unstable sched clock"); |
187 | return false; | 188 | return false; |
188 | } | 189 | } |
@@ -197,7 +198,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); | |||
197 | * Re-evaluate the need for the tick on the current CPU | 198 | * Re-evaluate the need for the tick on the current CPU |
198 | * and restart it if necessary. | 199 | * and restart it if necessary. |
199 | */ | 200 | */ |
200 | void tick_nohz_full_check(void) | 201 | void __tick_nohz_full_check(void) |
201 | { | 202 | { |
202 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 203 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
203 | 204 | ||
@@ -211,7 +212,7 @@ void tick_nohz_full_check(void) | |||
211 | 212 | ||
212 | static void nohz_full_kick_work_func(struct irq_work *work) | 213 | static void nohz_full_kick_work_func(struct irq_work *work) |
213 | { | 214 | { |
214 | tick_nohz_full_check(); | 215 | __tick_nohz_full_check(); |
215 | } | 216 | } |
216 | 217 | ||
217 | static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { | 218 | static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { |
@@ -230,7 +231,7 @@ void tick_nohz_full_kick(void) | |||
230 | 231 | ||
231 | static void nohz_full_kick_ipi(void *info) | 232 | static void nohz_full_kick_ipi(void *info) |
232 | { | 233 | { |
233 | tick_nohz_full_check(); | 234 | __tick_nohz_full_check(); |
234 | } | 235 | } |
235 | 236 | ||
236 | /* | 237 | /* |
@@ -239,12 +240,13 @@ static void nohz_full_kick_ipi(void *info) | |||
239 | */ | 240 | */ |
240 | void tick_nohz_full_kick_all(void) | 241 | void tick_nohz_full_kick_all(void) |
241 | { | 242 | { |
242 | if (!have_nohz_full_mask) | 243 | if (!tick_nohz_full_running) |
243 | return; | 244 | return; |
244 | 245 | ||
245 | preempt_disable(); | 246 | preempt_disable(); |
246 | smp_call_function_many(nohz_full_mask, | 247 | smp_call_function_many(tick_nohz_full_mask, |
247 | nohz_full_kick_ipi, NULL, false); | 248 | nohz_full_kick_ipi, NULL, false); |
249 | tick_nohz_full_kick(); | ||
248 | preempt_enable(); | 250 | preempt_enable(); |
249 | } | 251 | } |
250 | 252 | ||
@@ -253,7 +255,7 @@ void tick_nohz_full_kick_all(void) | |||
253 | * It might need the tick due to per task/process properties: | 255 | * It might need the tick due to per task/process properties: |
254 | * perf events, posix cpu timers, ... | 256 | * perf events, posix cpu timers, ... |
255 | */ | 257 | */ |
256 | void tick_nohz_task_switch(struct task_struct *tsk) | 258 | void __tick_nohz_task_switch(struct task_struct *tsk) |
257 | { | 259 | { |
258 | unsigned long flags; | 260 | unsigned long flags; |
259 | 261 | ||
@@ -269,31 +271,23 @@ out: | |||
269 | local_irq_restore(flags); | 271 | local_irq_restore(flags); |
270 | } | 272 | } |
271 | 273 | ||
272 | int tick_nohz_full_cpu(int cpu) | ||
273 | { | ||
274 | if (!have_nohz_full_mask) | ||
275 | return 0; | ||
276 | |||
277 | return cpumask_test_cpu(cpu, nohz_full_mask); | ||
278 | } | ||
279 | |||
280 | /* Parse the boot-time nohz CPU list from the kernel parameters. */ | 274 | /* Parse the boot-time nohz CPU list from the kernel parameters. */ |
281 | static int __init tick_nohz_full_setup(char *str) | 275 | static int __init tick_nohz_full_setup(char *str) |
282 | { | 276 | { |
283 | int cpu; | 277 | int cpu; |
284 | 278 | ||
285 | alloc_bootmem_cpumask_var(&nohz_full_mask); | 279 | alloc_bootmem_cpumask_var(&tick_nohz_full_mask); |
286 | if (cpulist_parse(str, nohz_full_mask) < 0) { | 280 | if (cpulist_parse(str, tick_nohz_full_mask) < 0) { |
287 | pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); | 281 | pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); |
288 | return 1; | 282 | return 1; |
289 | } | 283 | } |
290 | 284 | ||
291 | cpu = smp_processor_id(); | 285 | cpu = smp_processor_id(); |
292 | if (cpumask_test_cpu(cpu, nohz_full_mask)) { | 286 | if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { |
293 | pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); | 287 | pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); |
294 | cpumask_clear_cpu(cpu, nohz_full_mask); | 288 | cpumask_clear_cpu(cpu, tick_nohz_full_mask); |
295 | } | 289 | } |
296 | have_nohz_full_mask = true; | 290 | tick_nohz_full_running = true; |
297 | 291 | ||
298 | return 1; | 292 | return 1; |
299 | } | 293 | } |
@@ -311,7 +305,7 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, | |||
311 | * If we handle the timekeeping duty for full dynticks CPUs, | 305 | * If we handle the timekeeping duty for full dynticks CPUs, |
312 | * we can't safely shutdown that CPU. | 306 | * we can't safely shutdown that CPU. |
313 | */ | 307 | */ |
314 | if (have_nohz_full_mask && tick_do_timer_cpu == cpu) | 308 | if (tick_nohz_full_running && tick_do_timer_cpu == cpu) |
315 | return NOTIFY_BAD; | 309 | return NOTIFY_BAD; |
316 | break; | 310 | break; |
317 | } | 311 | } |
@@ -330,31 +324,34 @@ static int tick_nohz_init_all(void) | |||
330 | int err = -1; | 324 | int err = -1; |
331 | 325 | ||
332 | #ifdef CONFIG_NO_HZ_FULL_ALL | 326 | #ifdef CONFIG_NO_HZ_FULL_ALL |
333 | if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { | 327 | if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { |
334 | pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); | 328 | pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); |
335 | return err; | 329 | return err; |
336 | } | 330 | } |
337 | err = 0; | 331 | err = 0; |
338 | cpumask_setall(nohz_full_mask); | 332 | cpumask_setall(tick_nohz_full_mask); |
339 | cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); | 333 | cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); |
340 | have_nohz_full_mask = true; | 334 | tick_nohz_full_running = true; |
341 | #endif | 335 | #endif |
342 | return err; | 336 | return err; |
343 | } | 337 | } |
344 | 338 | ||
345 | void __init tick_nohz_init(void) | 339 | void __init tick_nohz_init(void) |
346 | { | 340 | { |
347 | if (!have_nohz_full_mask) { | 341 | int cpu; |
342 | |||
343 | if (!tick_nohz_full_running) { | ||
348 | if (tick_nohz_init_all() < 0) | 344 | if (tick_nohz_init_all() < 0) |
349 | return; | 345 | return; |
350 | } | 346 | } |
351 | 347 | ||
348 | for_each_cpu(cpu, tick_nohz_full_mask) | ||
349 | context_tracking_cpu_set(cpu); | ||
350 | |||
352 | cpu_notifier(tick_nohz_cpu_down_callback, 0); | 351 | cpu_notifier(tick_nohz_cpu_down_callback, 0); |
353 | cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); | 352 | cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); |
354 | pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); | 353 | pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); |
355 | } | 354 | } |
356 | #else | ||
357 | #define have_nohz_full_mask (0) | ||
358 | #endif | 355 | #endif |
359 | 356 | ||
360 | /* | 357 | /* |
@@ -732,7 +729,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | |||
732 | return false; | 729 | return false; |
733 | } | 730 | } |
734 | 731 | ||
735 | if (have_nohz_full_mask) { | 732 | if (tick_nohz_full_enabled()) { |
736 | /* | 733 | /* |
737 | * Keep the tick alive to guarantee timekeeping progression | 734 | * Keep the tick alive to guarantee timekeeping progression |
738 | * if there are full dynticks CPUs around | 735 | * if there are full dynticks CPUs around |
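tick_nohz_full_cpu() drops out of this file because, with the mask and the running flag renamed and made global, the header side can test them directly. A sketch of what such inline helpers look like (illustrative; not the exact upstream header):

extern cpumask_var_t tick_nohz_full_mask;
extern bool tick_nohz_full_running;

static inline bool tick_nohz_full_enabled(void)
{
	return tick_nohz_full_running;
}

static inline bool tick_nohz_full_cpu(int cpu)
{
	if (!tick_nohz_full_enabled())
		return false;
	return cpumask_test_cpu(cpu, tick_nohz_full_mask);
}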
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 48b9fffabdc2..947ba25a95a0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -1703,6 +1703,8 @@ int do_adjtimex(struct timex *txc) | |||
1703 | write_seqcount_end(&timekeeper_seq); | 1703 | write_seqcount_end(&timekeeper_seq); |
1704 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1704 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
1705 | 1705 | ||
1706 | ntp_notify_cmos_timer(); | ||
1707 | |||
1706 | return ret; | 1708 | return ret; |
1707 | } | 1709 | } |
1708 | 1710 | ||
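Paired with the ntp.c hunk earlier, the CMOS sync work is now kicked from do_adjtimex() after the timekeeper seqcount and lock are released, instead of from inside __do_adjtimex() while they are held. A trimmed sketch of the resulting order (not the full function):

static int do_adjtimex_order_sketch(struct timex *txc)
{
	struct timespec ts;
	s32 orig_tai = 0;
	unsigned long flags;
	int ret;

	getnstimeofday(&ts);

	raw_spin_lock_irqsave(&timekeeper_lock, flags);
	write_seqcount_begin(&timekeeper_seq);
	ret = __do_adjtimex(txc, &ts, &orig_tai);	/* no longer schedules the work itself */
	write_seqcount_end(&timekeeper_seq);
	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

	ntp_notify_cmos_timer();	/* safe here: the locks are dropped */
	return ret;
}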
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a6d098c6df3f..03cf44ac54d3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -1978,12 +1978,27 @@ int __weak ftrace_arch_code_modify_post_process(void) | |||
1978 | 1978 | ||
1979 | void ftrace_modify_all_code(int command) | 1979 | void ftrace_modify_all_code(int command) |
1980 | { | 1980 | { |
1981 | int update = command & FTRACE_UPDATE_TRACE_FUNC; | ||
1982 | |||
1983 | /* | ||
1984 | * If the ftrace_caller calls a ftrace_ops func directly, | ||
1985 | * we need to make sure that it only traces functions it | ||
1986 | * expects to trace. When doing the switch of functions, | ||
1987 | * we need to update to the ftrace_ops_list_func first | ||
1988 | * before the transition between old and new calls are set, | ||
1989 | * as the ftrace_ops_list_func will check the ops hashes | ||
1990 | * to make sure the ops are having the right functions | ||
1991 | * traced. | ||
1992 | */ | ||
1993 | if (update) | ||
1994 | ftrace_update_ftrace_func(ftrace_ops_list_func); | ||
1995 | |||
1981 | if (command & FTRACE_UPDATE_CALLS) | 1996 | if (command & FTRACE_UPDATE_CALLS) |
1982 | ftrace_replace_code(1); | 1997 | ftrace_replace_code(1); |
1983 | else if (command & FTRACE_DISABLE_CALLS) | 1998 | else if (command & FTRACE_DISABLE_CALLS) |
1984 | ftrace_replace_code(0); | 1999 | ftrace_replace_code(0); |
1985 | 2000 | ||
1986 | if (command & FTRACE_UPDATE_TRACE_FUNC) | 2001 | if (update && ftrace_trace_function != ftrace_ops_list_func) |
1987 | ftrace_update_ftrace_func(ftrace_trace_function); | 2002 | ftrace_update_ftrace_func(ftrace_trace_function); |
1988 | 2003 | ||
1989 | if (command & FTRACE_START_FUNC_RET) | 2004 | if (command & FTRACE_START_FUNC_RET) |
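The resulting flow of ftrace_modify_all_code(), written out linearly for readability (this restates the hunk above rather than adding new upstream code):

static void modify_all_code_order(int command)
{
	int update = command & FTRACE_UPDATE_TRACE_FUNC;

	/* 1. route every call site through the filtering list func first */
	if (update)
		ftrace_update_ftrace_func(ftrace_ops_list_func);

	/* 2. only then flip the per-site calls on or off */
	if (command & FTRACE_UPDATE_CALLS)
		ftrace_replace_code(1);
	else if (command & FTRACE_DISABLE_CALLS)
		ftrace_replace_code(0);

	/*
	 * 3. finally install the real trace function, unless the list func
	 *    already is the target (then step 1 has done the job).
	 */
	if (update && ftrace_trace_function != ftrace_ops_list_func)
		ftrace_update_ftrace_func(ftrace_trace_function);
}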
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 496f94d57698..7974ba20557d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -3166,11 +3166,6 @@ static const struct file_operations show_traces_fops = { | |||
3166 | }; | 3166 | }; |
3167 | 3167 | ||
3168 | /* | 3168 | /* |
3169 | * Only trace on a CPU if the bitmask is set: | ||
3170 | */ | ||
3171 | static cpumask_var_t tracing_cpumask; | ||
3172 | |||
3173 | /* | ||
3174 | * The tracer itself will not take this lock, but still we want | 3169 | * The tracer itself will not take this lock, but still we want |
3175 | * to provide a consistent cpumask to user-space: | 3170 | * to provide a consistent cpumask to user-space: |
3176 | */ | 3171 | */ |
@@ -3186,11 +3181,12 @@ static ssize_t | |||
3186 | tracing_cpumask_read(struct file *filp, char __user *ubuf, | 3181 | tracing_cpumask_read(struct file *filp, char __user *ubuf, |
3187 | size_t count, loff_t *ppos) | 3182 | size_t count, loff_t *ppos) |
3188 | { | 3183 | { |
3184 | struct trace_array *tr = file_inode(filp)->i_private; | ||
3189 | int len; | 3185 | int len; |
3190 | 3186 | ||
3191 | mutex_lock(&tracing_cpumask_update_lock); | 3187 | mutex_lock(&tracing_cpumask_update_lock); |
3192 | 3188 | ||
3193 | len = cpumask_scnprintf(mask_str, count, tracing_cpumask); | 3189 | len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask); |
3194 | if (count - len < 2) { | 3190 | if (count - len < 2) { |
3195 | count = -EINVAL; | 3191 | count = -EINVAL; |
3196 | goto out_err; | 3192 | goto out_err; |
@@ -3208,7 +3204,7 @@ static ssize_t | |||
3208 | tracing_cpumask_write(struct file *filp, const char __user *ubuf, | 3204 | tracing_cpumask_write(struct file *filp, const char __user *ubuf, |
3209 | size_t count, loff_t *ppos) | 3205 | size_t count, loff_t *ppos) |
3210 | { | 3206 | { |
3211 | struct trace_array *tr = filp->private_data; | 3207 | struct trace_array *tr = file_inode(filp)->i_private; |
3212 | cpumask_var_t tracing_cpumask_new; | 3208 | cpumask_var_t tracing_cpumask_new; |
3213 | int err, cpu; | 3209 | int err, cpu; |
3214 | 3210 | ||
@@ -3228,12 +3224,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, | |||
3228 | * Increase/decrease the disabled counter if we are | 3224 | * Increase/decrease the disabled counter if we are |
3229 | * about to flip a bit in the cpumask: | 3225 | * about to flip a bit in the cpumask: |
3230 | */ | 3226 | */ |
3231 | if (cpumask_test_cpu(cpu, tracing_cpumask) && | 3227 | if (cpumask_test_cpu(cpu, tr->tracing_cpumask) && |
3232 | !cpumask_test_cpu(cpu, tracing_cpumask_new)) { | 3228 | !cpumask_test_cpu(cpu, tracing_cpumask_new)) { |
3233 | atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); | 3229 | atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); |
3234 | ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); | 3230 | ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); |
3235 | } | 3231 | } |
3236 | if (!cpumask_test_cpu(cpu, tracing_cpumask) && | 3232 | if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && |
3237 | cpumask_test_cpu(cpu, tracing_cpumask_new)) { | 3233 | cpumask_test_cpu(cpu, tracing_cpumask_new)) { |
3238 | atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); | 3234 | atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); |
3239 | ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); | 3235 | ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); |
@@ -3242,7 +3238,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, | |||
3242 | arch_spin_unlock(&ftrace_max_lock); | 3238 | arch_spin_unlock(&ftrace_max_lock); |
3243 | local_irq_enable(); | 3239 | local_irq_enable(); |
3244 | 3240 | ||
3245 | cpumask_copy(tracing_cpumask, tracing_cpumask_new); | 3241 | cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); |
3246 | 3242 | ||
3247 | mutex_unlock(&tracing_cpumask_update_lock); | 3243 | mutex_unlock(&tracing_cpumask_update_lock); |
3248 | free_cpumask_var(tracing_cpumask_new); | 3244 | free_cpumask_var(tracing_cpumask_new); |
@@ -3256,9 +3252,10 @@ err_unlock: | |||
3256 | } | 3252 | } |
3257 | 3253 | ||
3258 | static const struct file_operations tracing_cpumask_fops = { | 3254 | static const struct file_operations tracing_cpumask_fops = { |
3259 | .open = tracing_open_generic, | 3255 | .open = tracing_open_generic_tr, |
3260 | .read = tracing_cpumask_read, | 3256 | .read = tracing_cpumask_read, |
3261 | .write = tracing_cpumask_write, | 3257 | .write = tracing_cpumask_write, |
3258 | .release = tracing_release_generic_tr, | ||
3262 | .llseek = generic_file_llseek, | 3259 | .llseek = generic_file_llseek, |
3263 | }; | 3260 | }; |
3264 | 3261 | ||
@@ -5938,6 +5935,11 @@ static int new_instance_create(const char *name) | |||
5938 | if (!tr->name) | 5935 | if (!tr->name) |
5939 | goto out_free_tr; | 5936 | goto out_free_tr; |
5940 | 5937 | ||
5938 | if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) | ||
5939 | goto out_free_tr; | ||
5940 | |||
5941 | cpumask_copy(tr->tracing_cpumask, cpu_all_mask); | ||
5942 | |||
5941 | raw_spin_lock_init(&tr->start_lock); | 5943 | raw_spin_lock_init(&tr->start_lock); |
5942 | 5944 | ||
5943 | tr->current_trace = &nop_trace; | 5945 | tr->current_trace = &nop_trace; |
@@ -5969,6 +5971,7 @@ static int new_instance_create(const char *name) | |||
5969 | out_free_tr: | 5971 | out_free_tr: |
5970 | if (tr->trace_buffer.buffer) | 5972 | if (tr->trace_buffer.buffer) |
5971 | ring_buffer_free(tr->trace_buffer.buffer); | 5973 | ring_buffer_free(tr->trace_buffer.buffer); |
5974 | free_cpumask_var(tr->tracing_cpumask); | ||
5972 | kfree(tr->name); | 5975 | kfree(tr->name); |
5973 | kfree(tr); | 5976 | kfree(tr); |
5974 | 5977 | ||
@@ -6098,6 +6101,9 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | |||
6098 | { | 6101 | { |
6099 | int cpu; | 6102 | int cpu; |
6100 | 6103 | ||
6104 | trace_create_file("tracing_cpumask", 0644, d_tracer, | ||
6105 | tr, &tracing_cpumask_fops); | ||
6106 | |||
6101 | trace_create_file("trace_options", 0644, d_tracer, | 6107 | trace_create_file("trace_options", 0644, d_tracer, |
6102 | tr, &tracing_iter_fops); | 6108 | tr, &tracing_iter_fops); |
6103 | 6109 | ||
@@ -6147,9 +6153,6 @@ static __init int tracer_init_debugfs(void) | |||
6147 | 6153 | ||
6148 | init_tracer_debugfs(&global_trace, d_tracer); | 6154 | init_tracer_debugfs(&global_trace, d_tracer); |
6149 | 6155 | ||
6150 | trace_create_file("tracing_cpumask", 0644, d_tracer, | ||
6151 | &global_trace, &tracing_cpumask_fops); | ||
6152 | |||
6153 | trace_create_file("available_tracers", 0444, d_tracer, | 6156 | trace_create_file("available_tracers", 0444, d_tracer, |
6154 | &global_trace, &show_traces_fops); | 6157 | &global_trace, &show_traces_fops); |
6155 | 6158 | ||
@@ -6371,7 +6374,7 @@ __init static int tracer_alloc_buffers(void) | |||
6371 | if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) | 6374 | if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) |
6372 | goto out; | 6375 | goto out; |
6373 | 6376 | ||
6374 | if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) | 6377 | if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL)) |
6375 | goto out_free_buffer_mask; | 6378 | goto out_free_buffer_mask; |
6376 | 6379 | ||
6377 | /* Only allocate trace_printk buffers if a trace_printk exists */ | 6380 | /* Only allocate trace_printk buffers if a trace_printk exists */ |
@@ -6386,7 +6389,7 @@ __init static int tracer_alloc_buffers(void) | |||
6386 | ring_buf_size = 1; | 6389 | ring_buf_size = 1; |
6387 | 6390 | ||
6388 | cpumask_copy(tracing_buffer_mask, cpu_possible_mask); | 6391 | cpumask_copy(tracing_buffer_mask, cpu_possible_mask); |
6389 | cpumask_copy(tracing_cpumask, cpu_all_mask); | 6392 | cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask); |
6390 | 6393 | ||
6391 | raw_spin_lock_init(&global_trace.start_lock); | 6394 | raw_spin_lock_init(&global_trace.start_lock); |
6392 | 6395 | ||
@@ -6441,7 +6444,7 @@ out_free_cpumask: | |||
6441 | #ifdef CONFIG_TRACER_MAX_TRACE | 6444 | #ifdef CONFIG_TRACER_MAX_TRACE |
6442 | free_percpu(global_trace.max_buffer.data); | 6445 | free_percpu(global_trace.max_buffer.data); |
6443 | #endif | 6446 | #endif |
6444 | free_cpumask_var(tracing_cpumask); | 6447 | free_cpumask_var(global_trace.tracing_cpumask); |
6445 | out_free_buffer_mask: | 6448 | out_free_buffer_mask: |
6446 | free_cpumask_var(tracing_buffer_mask); | 6449 | free_cpumask_var(tracing_buffer_mask); |
6447 | out: | 6450 | out: |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index afaae41b0a02..10c86fb7a2b4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -206,6 +206,7 @@ struct trace_array { | |||
206 | struct dentry *event_dir; | 206 | struct dentry *event_dir; |
207 | struct list_head systems; | 207 | struct list_head systems; |
208 | struct list_head events; | 208 | struct list_head events; |
209 | cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ | ||
209 | int ref; | 210 | int ref; |
210 | }; | 211 | }; |
211 | 212 | ||
@@ -1022,6 +1023,9 @@ extern struct list_head ftrace_events; | |||
1022 | extern const char *__start___trace_bprintk_fmt[]; | 1023 | extern const char *__start___trace_bprintk_fmt[]; |
1023 | extern const char *__stop___trace_bprintk_fmt[]; | 1024 | extern const char *__stop___trace_bprintk_fmt[]; |
1024 | 1025 | ||
1026 | extern const char *__start___tracepoint_str[]; | ||
1027 | extern const char *__stop___tracepoint_str[]; | ||
1028 | |||
1025 | void trace_printk_init_buffers(void); | 1029 | void trace_printk_init_buffers(void); |
1026 | void trace_printk_start_comm(void); | 1030 | void trace_printk_start_comm(void); |
1027 | int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); | 1031 | int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); |
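With tracing_cpumask now living in struct trace_array (see the trace.h hunk directly above), per-instance files recover their trace_array from the inode rather than from filp->private_data. A sketch of that retrieval pattern (handler name and buffer handling are invented for brevity):

static ssize_t example_cpumask_read(struct file *filp, char __user *ubuf,
				    size_t count, loff_t *ppos)
{
	/*
	 * tr was passed as the data argument to trace_create_file(), which
	 * ends up in the inode's i_private.
	 */
	struct trace_array *tr = file_inode(filp)->i_private;
	char buf[64];
	int len;

	len = cpumask_scnprintf(buf, sizeof(buf), tr->tracing_cpumask);
	buf[len++] = '\n';
	return simple_read_from_buffer(ubuf, count, ppos, buf, len);
}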
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 29a7ebcfb426..368a4d50cc30 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -1489,12 +1489,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, | |||
1489 | } | 1489 | } |
1490 | 1490 | ||
1491 | static int | 1491 | static int |
1492 | event_create_dir(struct dentry *parent, | 1492 | event_create_dir(struct dentry *parent, struct ftrace_event_file *file) |
1493 | struct ftrace_event_file *file, | ||
1494 | const struct file_operations *id, | ||
1495 | const struct file_operations *enable, | ||
1496 | const struct file_operations *filter, | ||
1497 | const struct file_operations *format) | ||
1498 | { | 1493 | { |
1499 | struct ftrace_event_call *call = file->event_call; | 1494 | struct ftrace_event_call *call = file->event_call; |
1500 | struct trace_array *tr = file->tr; | 1495 | struct trace_array *tr = file->tr; |
@@ -1522,12 +1517,13 @@ event_create_dir(struct dentry *parent, | |||
1522 | 1517 | ||
1523 | if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) | 1518 | if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) |
1524 | trace_create_file("enable", 0644, file->dir, file, | 1519 | trace_create_file("enable", 0644, file->dir, file, |
1525 | enable); | 1520 | &ftrace_enable_fops); |
1526 | 1521 | ||
1527 | #ifdef CONFIG_PERF_EVENTS | 1522 | #ifdef CONFIG_PERF_EVENTS |
1528 | if (call->event.type && call->class->reg) | 1523 | if (call->event.type && call->class->reg) |
1529 | trace_create_file("id", 0444, file->dir, | 1524 | trace_create_file("id", 0444, file->dir, |
1530 | (void *)(long)call->event.type, id); | 1525 | (void *)(long)call->event.type, |
1526 | &ftrace_event_id_fops); | ||
1531 | #endif | 1527 | #endif |
1532 | 1528 | ||
1533 | /* | 1529 | /* |
@@ -1544,10 +1540,10 @@ event_create_dir(struct dentry *parent, | |||
1544 | } | 1540 | } |
1545 | } | 1541 | } |
1546 | trace_create_file("filter", 0644, file->dir, call, | 1542 | trace_create_file("filter", 0644, file->dir, call, |
1547 | filter); | 1543 | &ftrace_event_filter_fops); |
1548 | 1544 | ||
1549 | trace_create_file("format", 0444, file->dir, call, | 1545 | trace_create_file("format", 0444, file->dir, call, |
1550 | format); | 1546 | &ftrace_event_format_fops); |
1551 | 1547 | ||
1552 | return 0; | 1548 | return 0; |
1553 | } | 1549 | } |
@@ -1648,12 +1644,7 @@ trace_create_new_event(struct ftrace_event_call *call, | |||
1648 | 1644 | ||
1649 | /* Add an event to a trace directory */ | 1645 | /* Add an event to a trace directory */ |
1650 | static int | 1646 | static int |
1651 | __trace_add_new_event(struct ftrace_event_call *call, | 1647 | __trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr) |
1652 | struct trace_array *tr, | ||
1653 | const struct file_operations *id, | ||
1654 | const struct file_operations *enable, | ||
1655 | const struct file_operations *filter, | ||
1656 | const struct file_operations *format) | ||
1657 | { | 1648 | { |
1658 | struct ftrace_event_file *file; | 1649 | struct ftrace_event_file *file; |
1659 | 1650 | ||
@@ -1661,7 +1652,7 @@ __trace_add_new_event(struct ftrace_event_call *call, | |||
1661 | if (!file) | 1652 | if (!file) |
1662 | return -ENOMEM; | 1653 | return -ENOMEM; |
1663 | 1654 | ||
1664 | return event_create_dir(tr->event_dir, file, id, enable, filter, format); | 1655 | return event_create_dir(tr->event_dir, file); |
1665 | } | 1656 | } |
1666 | 1657 | ||
1667 | /* | 1658 | /* |
@@ -1683,8 +1674,7 @@ __trace_early_add_new_event(struct ftrace_event_call *call, | |||
1683 | } | 1674 | } |
1684 | 1675 | ||
1685 | struct ftrace_module_file_ops; | 1676 | struct ftrace_module_file_ops; |
1686 | static void __add_event_to_tracers(struct ftrace_event_call *call, | 1677 | static void __add_event_to_tracers(struct ftrace_event_call *call); |
1687 | struct ftrace_module_file_ops *file_ops); | ||
1688 | 1678 | ||
1689 | /* Add an additional event_call dynamically */ | 1679 | /* Add an additional event_call dynamically */ |
1690 | int trace_add_event_call(struct ftrace_event_call *call) | 1680 | int trace_add_event_call(struct ftrace_event_call *call) |
@@ -1695,7 +1685,7 @@ int trace_add_event_call(struct ftrace_event_call *call) | |||
1695 | 1685 | ||
1696 | ret = __register_event(call, NULL); | 1686 | ret = __register_event(call, NULL); |
1697 | if (ret >= 0) | 1687 | if (ret >= 0) |
1698 | __add_event_to_tracers(call, NULL); | 1688 | __add_event_to_tracers(call); |
1699 | 1689 | ||
1700 | mutex_unlock(&event_mutex); | 1690 | mutex_unlock(&event_mutex); |
1701 | mutex_unlock(&trace_types_lock); | 1691 | mutex_unlock(&trace_types_lock); |
@@ -1769,100 +1759,21 @@ int trace_remove_event_call(struct ftrace_event_call *call) | |||
1769 | 1759 | ||
1770 | #ifdef CONFIG_MODULES | 1760 | #ifdef CONFIG_MODULES |
1771 | 1761 | ||
1772 | static LIST_HEAD(ftrace_module_file_list); | ||
1773 | |||
1774 | /* | ||
1775 | * Modules must own their file_operations to keep up with | ||
1776 | * reference counting. | ||
1777 | */ | ||
1778 | struct ftrace_module_file_ops { | ||
1779 | struct list_head list; | ||
1780 | struct module *mod; | ||
1781 | struct file_operations id; | ||
1782 | struct file_operations enable; | ||
1783 | struct file_operations format; | ||
1784 | struct file_operations filter; | ||
1785 | }; | ||
1786 | |||
1787 | static struct ftrace_module_file_ops * | ||
1788 | find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) | ||
1789 | { | ||
1790 | /* | ||
1791 | * As event_calls are added in groups by module, | ||
1792 | * when we find one file_ops, we don't need to search for | ||
1793 | * each call in that module, as the rest should be the | ||
1794 | * same. Only search for a new one if the last one did | ||
1795 | * not match. | ||
1796 | */ | ||
1797 | if (file_ops && mod == file_ops->mod) | ||
1798 | return file_ops; | ||
1799 | |||
1800 | list_for_each_entry(file_ops, &ftrace_module_file_list, list) { | ||
1801 | if (file_ops->mod == mod) | ||
1802 | return file_ops; | ||
1803 | } | ||
1804 | return NULL; | ||
1805 | } | ||
1806 | |||
1807 | static struct ftrace_module_file_ops * | ||
1808 | trace_create_file_ops(struct module *mod) | ||
1809 | { | ||
1810 | struct ftrace_module_file_ops *file_ops; | ||
1811 | |||
1812 | /* | ||
1813 | * This is a bit of a PITA. To allow for correct reference | ||
1814 | * counting, modules must "own" their file_operations. | ||
1815 | * To do this, we allocate the file operations that will be | ||
1816 | * used in the event directory. | ||
1817 | */ | ||
1818 | |||
1819 | file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL); | ||
1820 | if (!file_ops) | ||
1821 | return NULL; | ||
1822 | |||
1823 | file_ops->mod = mod; | ||
1824 | |||
1825 | file_ops->id = ftrace_event_id_fops; | ||
1826 | file_ops->id.owner = mod; | ||
1827 | |||
1828 | file_ops->enable = ftrace_enable_fops; | ||
1829 | file_ops->enable.owner = mod; | ||
1830 | |||
1831 | file_ops->filter = ftrace_event_filter_fops; | ||
1832 | file_ops->filter.owner = mod; | ||
1833 | |||
1834 | file_ops->format = ftrace_event_format_fops; | ||
1835 | file_ops->format.owner = mod; | ||
1836 | |||
1837 | list_add(&file_ops->list, &ftrace_module_file_list); | ||
1838 | |||
1839 | return file_ops; | ||
1840 | } | ||
1841 | |||
1842 | static void trace_module_add_events(struct module *mod) | 1762 | static void trace_module_add_events(struct module *mod) |
1843 | { | 1763 | { |
1844 | struct ftrace_module_file_ops *file_ops = NULL; | ||
1845 | struct ftrace_event_call **call, **start, **end; | 1764 | struct ftrace_event_call **call, **start, **end; |
1846 | 1765 | ||
1847 | start = mod->trace_events; | 1766 | start = mod->trace_events; |
1848 | end = mod->trace_events + mod->num_trace_events; | 1767 | end = mod->trace_events + mod->num_trace_events; |
1849 | 1768 | ||
1850 | if (start == end) | ||
1851 | return; | ||
1852 | |||
1853 | file_ops = trace_create_file_ops(mod); | ||
1854 | if (!file_ops) | ||
1855 | return; | ||
1856 | |||
1857 | for_each_event(call, start, end) { | 1769 | for_each_event(call, start, end) { |
1858 | __register_event(*call, mod); | 1770 | __register_event(*call, mod); |
1859 | __add_event_to_tracers(*call, file_ops); | 1771 | __add_event_to_tracers(*call); |
1860 | } | 1772 | } |
1861 | } | 1773 | } |
1862 | 1774 | ||
1863 | static void trace_module_remove_events(struct module *mod) | 1775 | static void trace_module_remove_events(struct module *mod) |
1864 | { | 1776 | { |
1865 | struct ftrace_module_file_ops *file_ops; | ||
1866 | struct ftrace_event_call *call, *p; | 1777 | struct ftrace_event_call *call, *p; |
1867 | bool clear_trace = false; | 1778 | bool clear_trace = false; |
1868 | 1779 | ||
@@ -1874,16 +1785,6 @@ static void trace_module_remove_events(struct module *mod) | |||
1874 | __trace_remove_event_call(call); | 1785 | __trace_remove_event_call(call); |
1875 | } | 1786 | } |
1876 | } | 1787 | } |
1877 | |||
1878 | /* Now free the file_operations */ | ||
1879 | list_for_each_entry(file_ops, &ftrace_module_file_list, list) { | ||
1880 | if (file_ops->mod == mod) | ||
1881 | break; | ||
1882 | } | ||
1883 | if (&file_ops->list != &ftrace_module_file_list) { | ||
1884 | list_del(&file_ops->list); | ||
1885 | kfree(file_ops); | ||
1886 | } | ||
1887 | up_write(&trace_event_sem); | 1788 | up_write(&trace_event_sem); |
1888 | 1789 | ||
1889 | /* | 1790 | /* |
@@ -1919,67 +1820,21 @@ static int trace_module_notify(struct notifier_block *self, | |||
1919 | return 0; | 1820 | return 0; |
1920 | } | 1821 | } |
1921 | 1822 | ||
1922 | static int | 1823 | static struct notifier_block trace_module_nb = { |
1923 | __trace_add_new_mod_event(struct ftrace_event_call *call, | 1824 | .notifier_call = trace_module_notify, |
1924 | struct trace_array *tr, | 1825 | .priority = 0, |
1925 | struct ftrace_module_file_ops *file_ops) | 1826 | }; |
1926 | { | ||
1927 | return __trace_add_new_event(call, tr, | ||
1928 | &file_ops->id, &file_ops->enable, | ||
1929 | &file_ops->filter, &file_ops->format); | ||
1930 | } | ||
1931 | |||
1932 | #else | ||
1933 | static inline struct ftrace_module_file_ops * | ||
1934 | find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) | ||
1935 | { | ||
1936 | return NULL; | ||
1937 | } | ||
1938 | static inline int trace_module_notify(struct notifier_block *self, | ||
1939 | unsigned long val, void *data) | ||
1940 | { | ||
1941 | return 0; | ||
1942 | } | ||
1943 | static inline int | ||
1944 | __trace_add_new_mod_event(struct ftrace_event_call *call, | ||
1945 | struct trace_array *tr, | ||
1946 | struct ftrace_module_file_ops *file_ops) | ||
1947 | { | ||
1948 | return -ENODEV; | ||
1949 | } | ||
1950 | #endif /* CONFIG_MODULES */ | 1827 | #endif /* CONFIG_MODULES */ |
1951 | 1828 | ||
1952 | /* Create a new event directory structure for a trace directory. */ | 1829 | /* Create a new event directory structure for a trace directory. */ |
1953 | static void | 1830 | static void |
1954 | __trace_add_event_dirs(struct trace_array *tr) | 1831 | __trace_add_event_dirs(struct trace_array *tr) |
1955 | { | 1832 | { |
1956 | struct ftrace_module_file_ops *file_ops = NULL; | ||
1957 | struct ftrace_event_call *call; | 1833 | struct ftrace_event_call *call; |
1958 | int ret; | 1834 | int ret; |
1959 | 1835 | ||
1960 | list_for_each_entry(call, &ftrace_events, list) { | 1836 | list_for_each_entry(call, &ftrace_events, list) { |
1961 | if (call->mod) { | 1837 | ret = __trace_add_new_event(call, tr); |
1962 | /* | ||
1963 | * Directories for events by modules need to | ||
1964 | * keep module ref counts when opened (as we don't | ||
1965 | * want the module to disappear when reading one | ||
1966 | * of these files). The file_ops keep account of | ||
1967 | * the module ref count. | ||
1968 | */ | ||
1969 | file_ops = find_ftrace_file_ops(file_ops, call->mod); | ||
1970 | if (!file_ops) | ||
1971 | continue; /* Warn? */ | ||
1972 | ret = __trace_add_new_mod_event(call, tr, file_ops); | ||
1973 | if (ret < 0) | ||
1974 | pr_warning("Could not create directory for event %s\n", | ||
1975 | call->name); | ||
1976 | continue; | ||
1977 | } | ||
1978 | ret = __trace_add_new_event(call, tr, | ||
1979 | &ftrace_event_id_fops, | ||
1980 | &ftrace_enable_fops, | ||
1981 | &ftrace_event_filter_fops, | ||
1982 | &ftrace_event_format_fops); | ||
1983 | if (ret < 0) | 1838 | if (ret < 0) |
1984 | pr_warning("Could not create directory for event %s\n", | 1839 | pr_warning("Could not create directory for event %s\n", |
1985 | call->name); | 1840 | call->name); |
@@ -2287,11 +2142,7 @@ __trace_early_add_event_dirs(struct trace_array *tr) | |||
2287 | 2142 | ||
2288 | 2143 | ||
2289 | list_for_each_entry(file, &tr->events, list) { | 2144 | list_for_each_entry(file, &tr->events, list) { |
2290 | ret = event_create_dir(tr->event_dir, file, | 2145 | ret = event_create_dir(tr->event_dir, file); |
2291 | &ftrace_event_id_fops, | ||
2292 | &ftrace_enable_fops, | ||
2293 | &ftrace_event_filter_fops, | ||
2294 | &ftrace_event_format_fops); | ||
2295 | if (ret < 0) | 2146 | if (ret < 0) |
2296 | pr_warning("Could not create directory for event %s\n", | 2147 | pr_warning("Could not create directory for event %s\n", |
2297 | file->event_call->name); | 2148 | file->event_call->name); |
@@ -2332,29 +2183,14 @@ __trace_remove_event_dirs(struct trace_array *tr) | |||
2332 | remove_event_file_dir(file); | 2183 | remove_event_file_dir(file); |
2333 | } | 2184 | } |
2334 | 2185 | ||
2335 | static void | 2186 | static void __add_event_to_tracers(struct ftrace_event_call *call) |
2336 | __add_event_to_tracers(struct ftrace_event_call *call, | ||
2337 | struct ftrace_module_file_ops *file_ops) | ||
2338 | { | 2187 | { |
2339 | struct trace_array *tr; | 2188 | struct trace_array *tr; |
2340 | 2189 | ||
2341 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { | 2190 | list_for_each_entry(tr, &ftrace_trace_arrays, list) |
2342 | if (file_ops) | 2191 | __trace_add_new_event(call, tr); |
2343 | __trace_add_new_mod_event(call, tr, file_ops); | ||
2344 | else | ||
2345 | __trace_add_new_event(call, tr, | ||
2346 | &ftrace_event_id_fops, | ||
2347 | &ftrace_enable_fops, | ||
2348 | &ftrace_event_filter_fops, | ||
2349 | &ftrace_event_format_fops); | ||
2350 | } | ||
2351 | } | 2192 | } |
2352 | 2193 | ||
2353 | static struct notifier_block trace_module_nb = { | ||
2354 | .notifier_call = trace_module_notify, | ||
2355 | .priority = 0, | ||
2356 | }; | ||
2357 | |||
2358 | extern struct ftrace_event_call *__start_ftrace_events[]; | 2194 | extern struct ftrace_event_call *__start_ftrace_events[]; |
2359 | extern struct ftrace_event_call *__stop_ftrace_events[]; | 2195 | extern struct ftrace_event_call *__stop_ftrace_events[]; |
2360 | 2196 | ||
@@ -2559,10 +2395,11 @@ static __init int event_trace_init(void) | |||
2559 | if (ret) | 2395 | if (ret) |
2560 | return ret; | 2396 | return ret; |
2561 | 2397 | ||
2398 | #ifdef CONFIG_MODULES | ||
2562 | ret = register_module_notifier(&trace_module_nb); | 2399 | ret = register_module_notifier(&trace_module_nb); |
2563 | if (ret) | 2400 | if (ret) |
2564 | pr_warning("Failed to register trace events module notifier\n"); | 2401 | pr_warning("Failed to register trace events module notifier\n"); |
2565 | 2402 | #endif | |
2566 | return 0; | 2403 | return 0; |
2567 | } | 2404 | } |
2568 | early_initcall(event_trace_memsetup); | 2405 | early_initcall(event_trace_memsetup); |
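The trace_events.c hunks above drop the per-module file_operations bookkeeping and register the module notifier only when CONFIG_MODULES is set. A minimal sketch of that guard pattern, using hypothetical example_* names rather than the tracing code itself:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/printk.h>

#ifdef CONFIG_MODULES
/* Called as modules change state (MODULE_STATE_COMING, _GOING, ...). */
static int example_module_notify(struct notifier_block *self,
				 unsigned long val, void *data)
{
	return 0;
}

static struct notifier_block example_module_nb = {
	.notifier_call = example_module_notify,
	.priority = 0,
};
#endif /* CONFIG_MODULES */

static int __init example_init(void)
{
#ifdef CONFIG_MODULES
	/* On !CONFIG_MODULES kernels the notifier is compiled out entirely. */
	if (register_module_notifier(&example_module_nb))
		pr_warn("Failed to register example module notifier\n");
#endif
	return 0;
}
early_initcall(example_init);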
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index a9077c1b4ad3..2900817ba65c 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
@@ -244,12 +244,31 @@ static const char **find_next(void *v, loff_t *pos) | |||
244 | { | 244 | { |
245 | const char **fmt = v; | 245 | const char **fmt = v; |
246 | int start_index; | 246 | int start_index; |
247 | int last_index; | ||
247 | 248 | ||
248 | start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; | 249 | start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; |
249 | 250 | ||
250 | if (*pos < start_index) | 251 | if (*pos < start_index) |
251 | return __start___trace_bprintk_fmt + *pos; | 252 | return __start___trace_bprintk_fmt + *pos; |
252 | 253 | ||
254 | /* | ||
255 | * The __tracepoint_str section is treated the same as the | ||
256 | * __trace_printk_fmt section. The difference is that the | ||
257 | * __trace_printk_fmt section should only be used by trace_printk() | ||
258 | * in a debugging environment, as if anything exists in that section | ||
259 | * the trace_printk() helper buffers are allocated, which would just | ||
260 | * waste space in a production environment. | ||
261 | * | ||
262 | * The __tracepoint_str sections on the other hand are used by | ||
263 | * tracepoints which need to map pointers to their strings to | ||
264 | * the ASCII text for userspace. | ||
265 | */ | ||
266 | last_index = start_index; | ||
267 | start_index = __stop___tracepoint_str - __start___tracepoint_str; | ||
268 | |||
269 | if (*pos < last_index + start_index) | ||
270 | return __start___tracepoint_str + (*pos - last_index); | ||
271 | |||
253 | return find_next_mod_format(start_index, v, fmt, pos); | 272 | return find_next_mod_format(start_index, v, fmt, pos); |
254 | } | 273 | } |
255 | 274 | ||
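The find_next() change above walks one seq_file position across two back-to-back sections: the trace_bprintk format pointers first, then the tracepoint strings. A small sketch of the same offset arithmetic, with the section bounds passed in as plain arguments instead of the real __start/__stop linker symbols:

#include <linux/types.h>

/* Sketch only: map a flat position onto two consecutive pointer arrays. */
static const char **pick_entry(const char **fmt_start, loff_t fmt_count,
			       const char **str_start, loff_t str_count,
			       loff_t pos)
{
	if (pos < fmt_count)
		return fmt_start + pos;		/* a trace_printk() format */

	pos -= fmt_count;			/* rebase into the next section */
	if (pos < str_count)
		return str_start + pos;		/* a tracepoint string */

	return NULL;				/* past both sections */
}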
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8fd03657bc7d..559329d9bd2f 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -200,8 +200,8 @@ extern char *__bad_type_size(void); | |||
200 | #type, #name, offsetof(typeof(trace), name), \ | 200 | #type, #name, offsetof(typeof(trace), name), \ |
201 | sizeof(trace.name), is_signed_type(type) | 201 | sizeof(trace.name), is_signed_type(type) |
202 | 202 | ||
203 | static | 203 | static int __init |
204 | int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) | 204 | __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) |
205 | { | 205 | { |
206 | int i; | 206 | int i; |
207 | int pos = 0; | 207 | int pos = 0; |
@@ -228,7 +228,7 @@ int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) | |||
228 | return pos; | 228 | return pos; |
229 | } | 229 | } |
230 | 230 | ||
231 | static int set_syscall_print_fmt(struct ftrace_event_call *call) | 231 | static int __init set_syscall_print_fmt(struct ftrace_event_call *call) |
232 | { | 232 | { |
233 | char *print_fmt; | 233 | char *print_fmt; |
234 | int len; | 234 | int len; |
@@ -253,7 +253,7 @@ static int set_syscall_print_fmt(struct ftrace_event_call *call) | |||
253 | return 0; | 253 | return 0; |
254 | } | 254 | } |
255 | 255 | ||
256 | static void free_syscall_print_fmt(struct ftrace_event_call *call) | 256 | static void __init free_syscall_print_fmt(struct ftrace_event_call *call) |
257 | { | 257 | { |
258 | struct syscall_metadata *entry = call->data; | 258 | struct syscall_metadata *entry = call->data; |
259 | 259 | ||
@@ -459,7 +459,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, | |||
459 | mutex_unlock(&syscall_trace_lock); | 459 | mutex_unlock(&syscall_trace_lock); |
460 | } | 460 | } |
461 | 461 | ||
462 | static int init_syscall_trace(struct ftrace_event_call *call) | 462 | static int __init init_syscall_trace(struct ftrace_event_call *call) |
463 | { | 463 | { |
464 | int id; | 464 | int id; |
465 | int num; | 465 | int num; |
diff --git a/kernel/uid16.c b/kernel/uid16.c index f6c83d7ef000..602e5bbbceff 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
@@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) | |||
176 | struct group_info *group_info; | 176 | struct group_info *group_info; |
177 | int retval; | 177 | int retval; |
178 | 178 | ||
179 | if (!nsown_capable(CAP_SETGID)) | 179 | if (!ns_capable(current_user_ns(), CAP_SETGID)) |
180 | return -EPERM; | 180 | return -EPERM; |
181 | if ((unsigned)gidsetsize > NGROUPS_MAX) | 181 | if ((unsigned)gidsetsize > NGROUPS_MAX) |
182 | return -EINVAL; | 182 | return -EINVAL; |
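This hunk (and the utsname.c one further down) spells out the retired nsown_capable() helper as an explicit ns_capable() check against the caller's own user namespace. A hedged sketch of the resulting idiom in a hypothetical privileged setter:

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/errno.h>

/* Hypothetical example: does the caller hold CAP_SYS_ADMIN in its own userns? */
static int example_set_option(void)
{
	if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
		return -EPERM;

	/* privileged update goes here */
	return 0;
}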
diff --git a/kernel/up.c b/kernel/up.c index c54c75e9faf7..630d72bf7e41 100644 --- a/kernel/up.c +++ b/kernel/up.c | |||
@@ -10,12 +10,64 @@ | |||
10 | int smp_call_function_single(int cpu, void (*func) (void *info), void *info, | 10 | int smp_call_function_single(int cpu, void (*func) (void *info), void *info, |
11 | int wait) | 11 | int wait) |
12 | { | 12 | { |
13 | unsigned long flags; | ||
14 | |||
13 | WARN_ON(cpu != 0); | 15 | WARN_ON(cpu != 0); |
14 | 16 | ||
15 | local_irq_disable(); | 17 | local_irq_save(flags); |
16 | (func)(info); | 18 | func(info); |
17 | local_irq_enable(); | 19 | local_irq_restore(flags); |
18 | 20 | ||
19 | return 0; | 21 | return 0; |
20 | } | 22 | } |
21 | EXPORT_SYMBOL(smp_call_function_single); | 23 | EXPORT_SYMBOL(smp_call_function_single); |
24 | |||
25 | int on_each_cpu(smp_call_func_t func, void *info, int wait) | ||
26 | { | ||
27 | unsigned long flags; | ||
28 | |||
29 | local_irq_save(flags); | ||
30 | func(info); | ||
31 | local_irq_restore(flags); | ||
32 | return 0; | ||
33 | } | ||
34 | EXPORT_SYMBOL(on_each_cpu); | ||
35 | |||
36 | /* | ||
37 | * Note we still need to test the mask even for UP | ||
38 | * because we actually can get an empty mask from | ||
39 | * code that on SMP might call us without the local | ||
40 | * CPU in the mask. | ||
41 | */ | ||
42 | void on_each_cpu_mask(const struct cpumask *mask, | ||
43 | smp_call_func_t func, void *info, bool wait) | ||
44 | { | ||
45 | unsigned long flags; | ||
46 | |||
47 | if (cpumask_test_cpu(0, mask)) { | ||
48 | local_irq_save(flags); | ||
49 | func(info); | ||
50 | local_irq_restore(flags); | ||
51 | } | ||
52 | } | ||
53 | EXPORT_SYMBOL(on_each_cpu_mask); | ||
54 | |||
55 | /* | ||
56 | * Preemption is disabled here to make sure the cond_func is called under the | ||
57 | * same conditions in UP and SMP. | ||
58 | */ | ||
59 | void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | ||
60 | smp_call_func_t func, void *info, bool wait, | ||
61 | gfp_t gfp_flags) | ||
62 | { | ||
63 | unsigned long flags; | ||
64 | |||
65 | preempt_disable(); | ||
66 | if (cond_func(0, info)) { | ||
67 | local_irq_save(flags); | ||
68 | func(info); | ||
69 | local_irq_restore(flags); | ||
70 | } | ||
71 | preempt_enable(); | ||
72 | } | ||
73 | EXPORT_SYMBOL(on_each_cpu_cond); | ||
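The UP stubs added above mirror the SMP calling conventions: func runs on CPU 0 with interrupts disabled, and on_each_cpu_cond() consults the condition callback first (gfp_flags is unused on UP but kept for API parity). A usage sketch with hypothetical callbacks:

#include <linux/gfp.h>
#include <linux/smp.h>

/* Hypothetical per-CPU predicate: should func run on this CPU? */
static bool example_cpu_needs_work(int cpu, void *info)
{
	return true;
}

/* Runs with interrupts disabled on each selected CPU (only CPU 0 on UP). */
static void example_do_work(void *info)
{
}

static void example_run_everywhere(void *info)
{
	on_each_cpu_cond(example_cpu_needs_work, example_do_work,
			 info, true, GFP_KERNEL);
}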
diff --git a/kernel/user.c b/kernel/user.c index 69b4c3d48cde..5bbb91988e69 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -51,8 +51,6 @@ struct user_namespace init_user_ns = { | |||
51 | .owner = GLOBAL_ROOT_UID, | 51 | .owner = GLOBAL_ROOT_UID, |
52 | .group = GLOBAL_ROOT_GID, | 52 | .group = GLOBAL_ROOT_GID, |
53 | .proc_inum = PROC_USER_INIT_INO, | 53 | .proc_inum = PROC_USER_INIT_INO, |
54 | .may_mount_sysfs = true, | ||
55 | .may_mount_proc = true, | ||
56 | }; | 54 | }; |
57 | EXPORT_SYMBOL_GPL(init_user_ns); | 55 | EXPORT_SYMBOL_GPL(init_user_ns); |
58 | 56 | ||
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9064b919a406..13fb1134ba58 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -101,8 +101,6 @@ int create_user_ns(struct cred *new) | |||
101 | 101 | ||
102 | set_cred_user_ns(new, ns); | 102 | set_cred_user_ns(new, ns); |
103 | 103 | ||
104 | update_mnt_policy(ns); | ||
105 | |||
106 | return 0; | 104 | return 0; |
107 | } | 105 | } |
108 | 106 | ||
diff --git a/kernel/utsname.c b/kernel/utsname.c index 2fc8576efaa8..fd393124e507 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
@@ -114,7 +114,7 @@ static int utsns_install(struct nsproxy *nsproxy, void *new) | |||
114 | struct uts_namespace *ns = new; | 114 | struct uts_namespace *ns = new; |
115 | 115 | ||
116 | if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || | 116 | if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || |
117 | !nsown_capable(CAP_SYS_ADMIN)) | 117 | !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) |
118 | return -EPERM; | 118 | return -EPERM; |
119 | 119 | ||
120 | get_uts_ns(ns); | 120 | get_uts_ns(ns); |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1241d8c91d5e..4431610f049a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -486,7 +486,52 @@ static struct smp_hotplug_thread watchdog_threads = { | |||
486 | .unpark = watchdog_enable, | 486 | .unpark = watchdog_enable, |
487 | }; | 487 | }; |
488 | 488 | ||
489 | static int watchdog_enable_all_cpus(void) | 489 | static void restart_watchdog_hrtimer(void *info) |
490 | { | ||
491 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | ||
492 | int ret; | ||
493 | |||
494 | /* | ||
495 | * No need to cancel and restart hrtimer if it is currently executing | ||
496 | * because it will reprogram itself with the new period now. | ||
497 | * We should never see it unqueued here because we are running per-cpu | ||
498 | * with interrupts disabled. | ||
499 | */ | ||
500 | ret = hrtimer_try_to_cancel(hrtimer); | ||
501 | if (ret == 1) | ||
502 | hrtimer_start(hrtimer, ns_to_ktime(sample_period), | ||
503 | HRTIMER_MODE_REL_PINNED); | ||
504 | } | ||
505 | |||
506 | static void update_timers(int cpu) | ||
507 | { | ||
508 | struct call_single_data data = {.func = restart_watchdog_hrtimer}; | ||
509 | /* | ||
510 | * Make sure that the perf event counter will adapt to a new | ||
511 | * sampling period. Updating the sampling period directly would | ||
512 | * be much nicer but we do not have an API for that now so | ||
513 | * let's use a big hammer. | ||
514 | * Hrtimer will adopt the new period on the next tick but this | ||
515 | * might be late already so we have to restart the timer as well. | ||
516 | */ | ||
517 | watchdog_nmi_disable(cpu); | ||
518 | __smp_call_function_single(cpu, &data, 1); | ||
519 | watchdog_nmi_enable(cpu); | ||
520 | } | ||
521 | |||
522 | static void update_timers_all_cpus(void) | ||
523 | { | ||
524 | int cpu; | ||
525 | |||
526 | get_online_cpus(); | ||
527 | preempt_disable(); | ||
528 | for_each_online_cpu(cpu) | ||
529 | update_timers(cpu); | ||
530 | preempt_enable(); | ||
531 | put_online_cpus(); | ||
532 | } | ||
533 | |||
534 | static int watchdog_enable_all_cpus(bool sample_period_changed) | ||
490 | { | 535 | { |
491 | int err = 0; | 536 | int err = 0; |
492 | 537 | ||
@@ -496,6 +541,8 @@ static int watchdog_enable_all_cpus(void) | |||
496 | pr_err("Failed to create watchdog threads, disabled\n"); | 541 | pr_err("Failed to create watchdog threads, disabled\n"); |
497 | else | 542 | else |
498 | watchdog_running = 1; | 543 | watchdog_running = 1; |
544 | } else if (sample_period_changed) { | ||
545 | update_timers_all_cpus(); | ||
499 | } | 546 | } |
500 | 547 | ||
501 | return err; | 548 | return err; |
@@ -520,13 +567,15 @@ int proc_dowatchdog(struct ctl_table *table, int write, | |||
520 | void __user *buffer, size_t *lenp, loff_t *ppos) | 567 | void __user *buffer, size_t *lenp, loff_t *ppos) |
521 | { | 568 | { |
522 | int err, old_thresh, old_enabled; | 569 | int err, old_thresh, old_enabled; |
570 | static DEFINE_MUTEX(watchdog_proc_mutex); | ||
523 | 571 | ||
572 | mutex_lock(&watchdog_proc_mutex); | ||
524 | old_thresh = ACCESS_ONCE(watchdog_thresh); | 573 | old_thresh = ACCESS_ONCE(watchdog_thresh); |
525 | old_enabled = ACCESS_ONCE(watchdog_user_enabled); | 574 | old_enabled = ACCESS_ONCE(watchdog_user_enabled); |
526 | 575 | ||
527 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 576 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
528 | if (err || !write) | 577 | if (err || !write) |
529 | return err; | 578 | goto out; |
530 | 579 | ||
531 | set_sample_period(); | 580 | set_sample_period(); |
532 | /* | 581 | /* |
@@ -535,7 +584,7 @@ int proc_dowatchdog(struct ctl_table *table, int write, | |||
535 | * watchdog_*_all_cpus() function takes care of this. | 584 | * watchdog_*_all_cpus() function takes care of this. |
536 | */ | 585 | */ |
537 | if (watchdog_user_enabled && watchdog_thresh) | 586 | if (watchdog_user_enabled && watchdog_thresh) |
538 | err = watchdog_enable_all_cpus(); | 587 | err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); |
539 | else | 588 | else |
540 | watchdog_disable_all_cpus(); | 589 | watchdog_disable_all_cpus(); |
541 | 590 | ||
@@ -544,7 +593,8 @@ int proc_dowatchdog(struct ctl_table *table, int write, | |||
544 | watchdog_thresh = old_thresh; | 593 | watchdog_thresh = old_thresh; |
545 | watchdog_user_enabled = old_enabled; | 594 | watchdog_user_enabled = old_enabled; |
546 | } | 595 | } |
547 | 596 | out: | |
597 | mutex_unlock(&watchdog_proc_mutex); | ||
548 | return err; | 598 | return err; |
549 | } | 599 | } |
550 | #endif /* CONFIG_SYSCTL */ | 600 | #endif /* CONFIG_SYSCTL */ |
@@ -553,14 +603,6 @@ void __init lockup_detector_init(void) | |||
553 | { | 603 | { |
554 | set_sample_period(); | 604 | set_sample_period(); |
555 | 605 | ||
556 | #ifdef CONFIG_NO_HZ_FULL | ||
557 | if (watchdog_user_enabled) { | ||
558 | watchdog_user_enabled = 0; | ||
559 | pr_warning("Disabled lockup detectors by default for full dynticks\n"); | ||
560 | pr_warning("You can reactivate it with 'sysctl -w kernel.watchdog=1'\n"); | ||
561 | } | ||
562 | #endif | ||
563 | |||
564 | if (watchdog_user_enabled) | 606 | if (watchdog_user_enabled) |
565 | watchdog_enable_all_cpus(); | 607 | watchdog_enable_all_cpus(false); |
566 | } | 608 | } |
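The proc_dowatchdog() change above serializes concurrent writers with a function-local static mutex and routes every exit through a single unlock. A hedged sketch of that shape for a hypothetical sysctl handler (the table plumbing is assumed, not taken from the watchdog code):

#include <linux/mutex.h>
#include <linux/sysctl.h>

static int example_proc_handler(struct ctl_table *table, int write,
				void __user *buffer, size_t *lenp, loff_t *ppos)
{
	static DEFINE_MUTEX(example_proc_mutex);
	int err;

	mutex_lock(&example_proc_mutex);
	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (err || !write)
		goto out;		/* reads and parse errors leave early */

	/* apply the new value here; on failure, restore the old one */
out:
	mutex_unlock(&example_proc_mutex);
	return err;
}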
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e93f7b9067d8..987293d03ebc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -16,9 +16,10 @@ | |||
16 | * | 16 | * |
17 | * This is the generic async execution mechanism. Work items are | 17 | * This is the generic async execution mechanism. Work items are |
18 | * executed in process context. The worker pool is shared and | 18 | * executed in process context. The worker pool is shared and |
19 | * automatically managed. There is one worker pool for each CPU and | 19 | * automatically managed. There are two worker pools for each CPU (one for |
20 | * one extra for works which are better served by workers which are | 20 | * normal work items and the other for high priority ones) and some extra |
21 | * not bound to any specific CPU. | 21 | * pools for workqueues which are not bound to any specific CPU - the |
22 | * number of these backing pools is dynamic. | ||
22 | * | 23 | * |
23 | * Please read Documentation/workqueue.txt for details. | 24 | * Please read Documentation/workqueue.txt for details. |
24 | */ | 25 | */ |
@@ -540,6 +541,8 @@ static int worker_pool_assign_id(struct worker_pool *pool) | |||
540 | * This must be called either with pwq_lock held or sched RCU read locked. | 541 | * This must be called either with pwq_lock held or sched RCU read locked. |
541 | * If the pwq needs to be used beyond the locking in effect, the caller is | 542 | * If the pwq needs to be used beyond the locking in effect, the caller is |
542 | * responsible for guaranteeing that the pwq stays online. | 543 | * responsible for guaranteeing that the pwq stays online. |
544 | * | ||
545 | * Return: The unbound pool_workqueue for @node. | ||
543 | */ | 546 | */ |
544 | static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, | 547 | static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, |
545 | int node) | 548 | int node) |
@@ -638,8 +641,6 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) | |||
638 | * get_work_pool - return the worker_pool a given work was associated with | 641 | * get_work_pool - return the worker_pool a given work was associated with |
639 | * @work: the work item of interest | 642 | * @work: the work item of interest |
640 | * | 643 | * |
641 | * Return the worker_pool @work was last associated with. %NULL if none. | ||
642 | * | ||
643 | * Pools are created and destroyed under wq_pool_mutex, and allow read | 644 | * Pools are created and destroyed under wq_pool_mutex, and allow read |
644 | * access under sched-RCU read lock. As such, this function should be | 645 | * access under sched-RCU read lock. As such, this function should be |
645 | * called under wq_pool_mutex or with preemption disabled. | 646 | * called under wq_pool_mutex or with preemption disabled. |
@@ -648,6 +649,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) | |||
648 | * mentioned locking is in effect. If the returned pool needs to be used | 649 | * mentioned locking is in effect. If the returned pool needs to be used |
649 | * beyond the critical section, the caller is responsible for ensuring the | 650 | * beyond the critical section, the caller is responsible for ensuring the |
650 | * returned pool is and stays online. | 651 | * returned pool is and stays online. |
652 | * | ||
653 | * Return: The worker_pool @work was last associated with. %NULL if none. | ||
651 | */ | 654 | */ |
652 | static struct worker_pool *get_work_pool(struct work_struct *work) | 655 | static struct worker_pool *get_work_pool(struct work_struct *work) |
653 | { | 656 | { |
@@ -671,7 +674,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) | |||
671 | * get_work_pool_id - return the worker pool ID a given work is associated with | 674 | * get_work_pool_id - return the worker pool ID a given work is associated with |
672 | * @work: the work item of interest | 675 | * @work: the work item of interest |
673 | * | 676 | * |
674 | * Return the worker_pool ID @work was last associated with. | 677 | * Return: The worker_pool ID @work was last associated with. |
675 | * %WORK_OFFQ_POOL_NONE if none. | 678 | * %WORK_OFFQ_POOL_NONE if none. |
676 | */ | 679 | */ |
677 | static int get_work_pool_id(struct work_struct *work) | 680 | static int get_work_pool_id(struct work_struct *work) |
@@ -830,7 +833,7 @@ void wq_worker_waking_up(struct task_struct *task, int cpu) | |||
830 | * CONTEXT: | 833 | * CONTEXT: |
831 | * spin_lock_irq(rq->lock) | 834 | * spin_lock_irq(rq->lock) |
832 | * | 835 | * |
833 | * RETURNS: | 836 | * Return: |
834 | * Worker task on @cpu to wake up, %NULL if none. | 837 | * Worker task on @cpu to wake up, %NULL if none. |
835 | */ | 838 | */ |
836 | struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) | 839 | struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) |
@@ -965,8 +968,8 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) | |||
965 | * CONTEXT: | 968 | * CONTEXT: |
966 | * spin_lock_irq(pool->lock). | 969 | * spin_lock_irq(pool->lock). |
967 | * | 970 | * |
968 | * RETURNS: | 971 | * Return: |
969 | * Pointer to worker which is executing @work if found, NULL | 972 | * Pointer to worker which is executing @work if found, %NULL |
970 | * otherwise. | 973 | * otherwise. |
971 | */ | 974 | */ |
972 | static struct worker *find_worker_executing_work(struct worker_pool *pool, | 975 | static struct worker *find_worker_executing_work(struct worker_pool *pool, |
@@ -1154,14 +1157,16 @@ out_put: | |||
1154 | * @flags: place to store irq state | 1157 | * @flags: place to store irq state |
1155 | * | 1158 | * |
1156 | * Try to grab PENDING bit of @work. This function can handle @work in any | 1159 | * Try to grab PENDING bit of @work. This function can handle @work in any |
1157 | * stable state - idle, on timer or on worklist. Return values are | 1160 | * stable state - idle, on timer or on worklist. |
1158 | * | 1161 | * |
1162 | * Return: | ||
1159 | * 1 if @work was pending and we successfully stole PENDING | 1163 | * 1 if @work was pending and we successfully stole PENDING |
1160 | * 0 if @work was idle and we claimed PENDING | 1164 | * 0 if @work was idle and we claimed PENDING |
1161 | * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry | 1165 | * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry |
1162 | * -ENOENT if someone else is canceling @work, this state may persist | 1166 | * -ENOENT if someone else is canceling @work, this state may persist |
1163 | * for arbitrarily long | 1167 | * for arbitrarily long |
1164 | * | 1168 | * |
1169 | * Note: | ||
1165 | * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting | 1170 | * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting |
1166 | * interrupted while holding PENDING and @work off queue, irq must be | 1171 | * interrupted while holding PENDING and @work off queue, irq must be |
1167 | * disabled on entry. This, combined with delayed_work->timer being | 1172 | * disabled on entry. This, combined with delayed_work->timer being |
@@ -1403,10 +1408,10 @@ retry: | |||
1403 | * @wq: workqueue to use | 1408 | * @wq: workqueue to use |
1404 | * @work: work to queue | 1409 | * @work: work to queue |
1405 | * | 1410 | * |
1406 | * Returns %false if @work was already on a queue, %true otherwise. | ||
1407 | * | ||
1408 | * We queue the work to a specific CPU, the caller must ensure it | 1411 | * We queue the work to a specific CPU, the caller must ensure it |
1409 | * can't go away. | 1412 | * can't go away. |
1413 | * | ||
1414 | * Return: %false if @work was already on a queue, %true otherwise. | ||
1410 | */ | 1415 | */ |
1411 | bool queue_work_on(int cpu, struct workqueue_struct *wq, | 1416 | bool queue_work_on(int cpu, struct workqueue_struct *wq, |
1412 | struct work_struct *work) | 1417 | struct work_struct *work) |
@@ -1476,7 +1481,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, | |||
1476 | * @dwork: work to queue | 1481 | * @dwork: work to queue |
1477 | * @delay: number of jiffies to wait before queueing | 1482 | * @delay: number of jiffies to wait before queueing |
1478 | * | 1483 | * |
1479 | * Returns %false if @work was already on a queue, %true otherwise. If | 1484 | * Return: %false if @work was already on a queue, %true otherwise. If |
1480 | * @delay is zero and @dwork is idle, it will be scheduled for immediate | 1485 | * @delay is zero and @dwork is idle, it will be scheduled for immediate |
1481 | * execution. | 1486 | * execution. |
1482 | */ | 1487 | */ |
@@ -1512,7 +1517,7 @@ EXPORT_SYMBOL(queue_delayed_work_on); | |||
1512 | * zero, @work is guaranteed to be scheduled immediately regardless of its | 1517 | * zero, @work is guaranteed to be scheduled immediately regardless of its |
1513 | * current state. | 1518 | * current state. |
1514 | * | 1519 | * |
1515 | * Returns %false if @dwork was idle and queued, %true if @dwork was | 1520 | * Return: %false if @dwork was idle and queued, %true if @dwork was |
1516 | * pending and its timer was modified. | 1521 | * pending and its timer was modified. |
1517 | * | 1522 | * |
1518 | * This function is safe to call from any context including IRQ handler. | 1523 | * This function is safe to call from any context including IRQ handler. |
@@ -1627,7 +1632,7 @@ static void worker_leave_idle(struct worker *worker) | |||
1627 | * Might sleep. Called without any lock but returns with pool->lock | 1632 | * Might sleep. Called without any lock but returns with pool->lock |
1628 | * held. | 1633 | * held. |
1629 | * | 1634 | * |
1630 | * RETURNS: | 1635 | * Return: |
1631 | * %true if the associated pool is online (@worker is successfully | 1636 | * %true if the associated pool is online (@worker is successfully |
1632 | * bound), %false if offline. | 1637 | * bound), %false if offline. |
1633 | */ | 1638 | */ |
@@ -1688,7 +1693,7 @@ static struct worker *alloc_worker(void) | |||
1688 | * CONTEXT: | 1693 | * CONTEXT: |
1689 | * Might sleep. Does GFP_KERNEL allocations. | 1694 | * Might sleep. Does GFP_KERNEL allocations. |
1690 | * | 1695 | * |
1691 | * RETURNS: | 1696 | * Return: |
1692 | * Pointer to the newly created worker. | 1697 | * Pointer to the newly created worker. |
1693 | */ | 1698 | */ |
1694 | static struct worker *create_worker(struct worker_pool *pool) | 1699 | static struct worker *create_worker(struct worker_pool *pool) |
@@ -1788,6 +1793,8 @@ static void start_worker(struct worker *worker) | |||
1788 | * @pool: the target pool | 1793 | * @pool: the target pool |
1789 | * | 1794 | * |
1790 | * Grab the managership of @pool and create and start a new worker for it. | 1795 | * Grab the managership of @pool and create and start a new worker for it. |
1796 | * | ||
1797 | * Return: 0 on success. A negative error code otherwise. | ||
1791 | */ | 1798 | */ |
1792 | static int create_and_start_worker(struct worker_pool *pool) | 1799 | static int create_and_start_worker(struct worker_pool *pool) |
1793 | { | 1800 | { |
@@ -1932,7 +1939,7 @@ static void pool_mayday_timeout(unsigned long __pool) | |||
1932 | * multiple times. Does GFP_KERNEL allocations. Called only from | 1939 | * multiple times. Does GFP_KERNEL allocations. Called only from |
1933 | * manager. | 1940 | * manager. |
1934 | * | 1941 | * |
1935 | * RETURNS: | 1942 | * Return: |
1936 | * %false if no action was taken and pool->lock stayed locked, %true | 1943 | * %false if no action was taken and pool->lock stayed locked, %true |
1937 | * otherwise. | 1944 | * otherwise. |
1938 | */ | 1945 | */ |
@@ -1989,7 +1996,7 @@ restart: | |||
1989 | * spin_lock_irq(pool->lock) which may be released and regrabbed | 1996 | * spin_lock_irq(pool->lock) which may be released and regrabbed |
1990 | * multiple times. Called only from manager. | 1997 | * multiple times. Called only from manager. |
1991 | * | 1998 | * |
1992 | * RETURNS: | 1999 | * Return: |
1993 | * %false if no action was taken and pool->lock stayed locked, %true | 2000 | * %false if no action was taken and pool->lock stayed locked, %true |
1994 | * otherwise. | 2001 | * otherwise. |
1995 | */ | 2002 | */ |
@@ -2032,9 +2039,12 @@ static bool maybe_destroy_workers(struct worker_pool *pool) | |||
2032 | * spin_lock_irq(pool->lock) which may be released and regrabbed | 2039 | * spin_lock_irq(pool->lock) which may be released and regrabbed |
2033 | * multiple times. Does GFP_KERNEL allocations. | 2040 | * multiple times. Does GFP_KERNEL allocations. |
2034 | * | 2041 | * |
2035 | * RETURNS: | 2042 | * Return: |
2036 | * spin_lock_irq(pool->lock) which may be released and regrabbed | 2043 | * %false if the pool doesn't need management and the caller can safely start |
2037 | * multiple times. Does GFP_KERNEL allocations. | 2044 | * processing works; %true indicates that the function released pool->lock |
2045 | * and reacquired it to perform some management function and that the | ||
2046 | * conditions that the caller verified while holding the lock before | ||
2047 | * calling the function might no longer be true. | ||
2038 | */ | 2048 | */ |
2039 | static bool manage_workers(struct worker *worker) | 2049 | static bool manage_workers(struct worker *worker) |
2040 | { | 2050 | { |
@@ -2255,6 +2265,8 @@ static void process_scheduled_works(struct worker *worker) | |||
2255 | * work items regardless of their specific target workqueue. The only | 2265 | * work items regardless of their specific target workqueue. The only |
2256 | * exception is work items which belong to workqueues with a rescuer which | 2266 | * exception is work items which belong to workqueues with a rescuer which |
2257 | * will be explained in rescuer_thread(). | 2267 | * will be explained in rescuer_thread(). |
2268 | * | ||
2269 | * Return: 0 | ||
2258 | */ | 2270 | */ |
2259 | static int worker_thread(void *__worker) | 2271 | static int worker_thread(void *__worker) |
2260 | { | 2272 | { |
@@ -2353,6 +2365,8 @@ sleep: | |||
2353 | * those works so that forward progress can be guaranteed. | 2365 | * those works so that forward progress can be guaranteed. |
2354 | * | 2366 | * |
2355 | * This should happen rarely. | 2367 | * This should happen rarely. |
2368 | * | ||
2369 | * Return: 0 | ||
2356 | */ | 2370 | */ |
2357 | static int rescuer_thread(void *__rescuer) | 2371 | static int rescuer_thread(void *__rescuer) |
2358 | { | 2372 | { |
@@ -2525,7 +2539,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, | |||
2525 | * CONTEXT: | 2539 | * CONTEXT: |
2526 | * mutex_lock(wq->mutex). | 2540 | * mutex_lock(wq->mutex). |
2527 | * | 2541 | * |
2528 | * RETURNS: | 2542 | * Return: |
2529 | * %true if @flush_color >= 0 and there's something to flush. %false | 2543 | * %true if @flush_color >= 0 and there's something to flush. %false |
2530 | * otherwise. | 2544 | * otherwise. |
2531 | */ | 2545 | */ |
@@ -2846,7 +2860,7 @@ static bool __flush_work(struct work_struct *work) | |||
2846 | * Wait until @work has finished execution. @work is guaranteed to be idle | 2860 | * Wait until @work has finished execution. @work is guaranteed to be idle |
2847 | * on return if it hasn't been requeued since flush started. | 2861 | * on return if it hasn't been requeued since flush started. |
2848 | * | 2862 | * |
2849 | * RETURNS: | 2863 | * Return: |
2850 | * %true if flush_work() waited for the work to finish execution, | 2864 | * %true if flush_work() waited for the work to finish execution, |
2851 | * %false if it was already idle. | 2865 | * %false if it was already idle. |
2852 | */ | 2866 | */ |
@@ -2898,7 +2912,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) | |||
2898 | * The caller must ensure that the workqueue on which @work was last | 2912 | * The caller must ensure that the workqueue on which @work was last |
2899 | * queued can't be destroyed before this function returns. | 2913 | * queued can't be destroyed before this function returns. |
2900 | * | 2914 | * |
2901 | * RETURNS: | 2915 | * Return: |
2902 | * %true if @work was pending, %false otherwise. | 2916 | * %true if @work was pending, %false otherwise. |
2903 | */ | 2917 | */ |
2904 | bool cancel_work_sync(struct work_struct *work) | 2918 | bool cancel_work_sync(struct work_struct *work) |
@@ -2915,7 +2929,7 @@ EXPORT_SYMBOL_GPL(cancel_work_sync); | |||
2915 | * immediate execution. Like flush_work(), this function only | 2929 | * immediate execution. Like flush_work(), this function only |
2916 | * considers the last queueing instance of @dwork. | 2930 | * considers the last queueing instance of @dwork. |
2917 | * | 2931 | * |
2918 | * RETURNS: | 2932 | * Return: |
2919 | * %true if flush_work() waited for the work to finish execution, | 2933 | * %true if flush_work() waited for the work to finish execution, |
2920 | * %false if it was already idle. | 2934 | * %false if it was already idle. |
2921 | */ | 2935 | */ |
@@ -2933,11 +2947,15 @@ EXPORT_SYMBOL(flush_delayed_work); | |||
2933 | * cancel_delayed_work - cancel a delayed work | 2947 | * cancel_delayed_work - cancel a delayed work |
2934 | * @dwork: delayed_work to cancel | 2948 | * @dwork: delayed_work to cancel |
2935 | * | 2949 | * |
2936 | * Kill off a pending delayed_work. Returns %true if @dwork was pending | 2950 | * Kill off a pending delayed_work. |
2937 | * and canceled; %false if wasn't pending. Note that the work callback | 2951 | * |
2938 | * function may still be running on return, unless it returns %true and the | 2952 | * Return: %true if @dwork was pending and canceled; %false if it wasn't |
2939 | * work doesn't re-arm itself. Explicitly flush or use | 2953 | * pending. |
2940 | * cancel_delayed_work_sync() to wait on it. | 2954 | * |
2955 | * Note: | ||
2956 | * The work callback function may still be running on return, unless | ||
2957 | * it returns %true and the work doesn't re-arm itself. Explicitly flush or | ||
2958 | * use cancel_delayed_work_sync() to wait on it. | ||
2941 | * | 2959 | * |
2942 | * This function is safe to call from any context including IRQ handler. | 2960 | * This function is safe to call from any context including IRQ handler. |
2943 | */ | 2961 | */ |
@@ -2966,7 +2984,7 @@ EXPORT_SYMBOL(cancel_delayed_work); | |||
2966 | * | 2984 | * |
2967 | * This is cancel_work_sync() for delayed works. | 2985 | * This is cancel_work_sync() for delayed works. |
2968 | * | 2986 | * |
2969 | * RETURNS: | 2987 | * Return: |
2970 | * %true if @dwork was pending, %false otherwise. | 2988 | * %true if @dwork was pending, %false otherwise. |
2971 | */ | 2989 | */ |
2972 | bool cancel_delayed_work_sync(struct delayed_work *dwork) | 2990 | bool cancel_delayed_work_sync(struct delayed_work *dwork) |
@@ -2983,7 +3001,7 @@ EXPORT_SYMBOL(cancel_delayed_work_sync); | |||
2983 | * system workqueue and blocks until all CPUs have completed. | 3001 | * system workqueue and blocks until all CPUs have completed. |
2984 | * schedule_on_each_cpu() is very slow. | 3002 | * schedule_on_each_cpu() is very slow. |
2985 | * | 3003 | * |
2986 | * RETURNS: | 3004 | * Return: |
2987 | * 0 on success, -errno on failure. | 3005 | * 0 on success, -errno on failure. |
2988 | */ | 3006 | */ |
2989 | int schedule_on_each_cpu(work_func_t func) | 3007 | int schedule_on_each_cpu(work_func_t func) |
@@ -3051,7 +3069,7 @@ EXPORT_SYMBOL(flush_scheduled_work); | |||
3051 | * Executes the function immediately if process context is available, | 3069 | * Executes the function immediately if process context is available, |
3052 | * otherwise schedules the function for delayed execution. | 3070 | * otherwise schedules the function for delayed execution. |
3053 | * | 3071 | * |
3054 | * Returns: 0 - function was executed | 3072 | * Return: 0 - function was executed |
3055 | * 1 - function was scheduled for execution | 3073 | * 1 - function was scheduled for execution |
3056 | */ | 3074 | */ |
3057 | int execute_in_process_context(work_func_t fn, struct execute_work *ew) | 3075 | int execute_in_process_context(work_func_t fn, struct execute_work *ew) |
@@ -3095,25 +3113,26 @@ static struct workqueue_struct *dev_to_wq(struct device *dev) | |||
3095 | return wq_dev->wq; | 3113 | return wq_dev->wq; |
3096 | } | 3114 | } |
3097 | 3115 | ||
3098 | static ssize_t wq_per_cpu_show(struct device *dev, | 3116 | static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, |
3099 | struct device_attribute *attr, char *buf) | 3117 | char *buf) |
3100 | { | 3118 | { |
3101 | struct workqueue_struct *wq = dev_to_wq(dev); | 3119 | struct workqueue_struct *wq = dev_to_wq(dev); |
3102 | 3120 | ||
3103 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); | 3121 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); |
3104 | } | 3122 | } |
3123 | static DEVICE_ATTR_RO(per_cpu); | ||
3105 | 3124 | ||
3106 | static ssize_t wq_max_active_show(struct device *dev, | 3125 | static ssize_t max_active_show(struct device *dev, |
3107 | struct device_attribute *attr, char *buf) | 3126 | struct device_attribute *attr, char *buf) |
3108 | { | 3127 | { |
3109 | struct workqueue_struct *wq = dev_to_wq(dev); | 3128 | struct workqueue_struct *wq = dev_to_wq(dev); |
3110 | 3129 | ||
3111 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); | 3130 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); |
3112 | } | 3131 | } |
3113 | 3132 | ||
3114 | static ssize_t wq_max_active_store(struct device *dev, | 3133 | static ssize_t max_active_store(struct device *dev, |
3115 | struct device_attribute *attr, | 3134 | struct device_attribute *attr, const char *buf, |
3116 | const char *buf, size_t count) | 3135 | size_t count) |
3117 | { | 3136 | { |
3118 | struct workqueue_struct *wq = dev_to_wq(dev); | 3137 | struct workqueue_struct *wq = dev_to_wq(dev); |
3119 | int val; | 3138 | int val; |
@@ -3124,12 +3143,14 @@ static ssize_t wq_max_active_store(struct device *dev, | |||
3124 | workqueue_set_max_active(wq, val); | 3143 | workqueue_set_max_active(wq, val); |
3125 | return count; | 3144 | return count; |
3126 | } | 3145 | } |
3146 | static DEVICE_ATTR_RW(max_active); | ||
3127 | 3147 | ||
3128 | static struct device_attribute wq_sysfs_attrs[] = { | 3148 | static struct attribute *wq_sysfs_attrs[] = { |
3129 | __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), | 3149 | &dev_attr_per_cpu.attr, |
3130 | __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), | 3150 | &dev_attr_max_active.attr, |
3131 | __ATTR_NULL, | 3151 | NULL, |
3132 | }; | 3152 | }; |
3153 | ATTRIBUTE_GROUPS(wq_sysfs); | ||
3133 | 3154 | ||
3134 | static ssize_t wq_pool_ids_show(struct device *dev, | 3155 | static ssize_t wq_pool_ids_show(struct device *dev, |
3135 | struct device_attribute *attr, char *buf) | 3156 | struct device_attribute *attr, char *buf) |
@@ -3279,7 +3300,7 @@ static struct device_attribute wq_sysfs_unbound_attrs[] = { | |||
3279 | 3300 | ||
3280 | static struct bus_type wq_subsys = { | 3301 | static struct bus_type wq_subsys = { |
3281 | .name = "workqueue", | 3302 | .name = "workqueue", |
3282 | .dev_attrs = wq_sysfs_attrs, | 3303 | .dev_groups = wq_sysfs_groups, |
3283 | }; | 3304 | }; |
3284 | 3305 | ||
3285 | static int __init wq_sysfs_init(void) | 3306 | static int __init wq_sysfs_init(void) |
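The two sysfs hunks above move the workqueue attributes from a bare device_attribute array to attribute groups: DEVICE_ATTR_RO()/DEVICE_ATTR_RW() bind <name>_show/<name>_store callbacks to dev_attr_<name>, ATTRIBUTE_GROUPS() generates the NULL-terminated *_groups table, and the bus publishes it through .dev_groups instead of .dev_attrs. A stripped-down sketch with a hypothetical attribute:

#include <linux/device.h>
#include <linux/sysfs.h>

/* DEVICE_ATTR_RO(enabled) expects a callback named enabled_show(). */
static ssize_t enabled_show(struct device *dev, struct device_attribute *attr,
			    char *buf)
{
	return scnprintf(buf, PAGE_SIZE, "%d\n", 1);	/* placeholder value */
}
static DEVICE_ATTR_RO(enabled);

static struct attribute *example_attrs[] = {
	&dev_attr_enabled.attr,
	NULL,
};
ATTRIBUTE_GROUPS(example);			/* emits example_groups */

static struct bus_type example_subsys = {
	.name		= "example",
	.dev_groups	= example_groups,	/* replaces .dev_attrs */
};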
@@ -3308,7 +3329,7 @@ static void wq_device_release(struct device *dev) | |||
3308 | * apply_workqueue_attrs() may race against userland updating the | 3329 | * apply_workqueue_attrs() may race against userland updating the |
3309 | * attributes. | 3330 | * attributes. |
3310 | * | 3331 | * |
3311 | * Returns 0 on success, -errno on failure. | 3332 | * Return: 0 on success, -errno on failure. |
3312 | */ | 3333 | */ |
3313 | int workqueue_sysfs_register(struct workqueue_struct *wq) | 3334 | int workqueue_sysfs_register(struct workqueue_struct *wq) |
3314 | { | 3335 | { |
@@ -3401,7 +3422,9 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs) | |||
3401 | * @gfp_mask: allocation mask to use | 3422 | * @gfp_mask: allocation mask to use |
3402 | * | 3423 | * |
3403 | * Allocate a new workqueue_attrs, initialize with default settings and | 3424 | * Allocate a new workqueue_attrs, initialize with default settings and |
3404 | * return it. Returns NULL on failure. | 3425 | * return it. |
3426 | * | ||
3427 | * Return: The allocated new workqueue_attr on success. %NULL on failure. | ||
3405 | */ | 3428 | */ |
3406 | struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) | 3429 | struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) |
3407 | { | 3430 | { |
@@ -3460,7 +3483,8 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, | |||
3460 | * @pool: worker_pool to initialize | 3483 | * @pool: worker_pool to initialize |
3461 | * | 3484 | * |
3462 | * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. | 3485 | * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. |
3463 | * Returns 0 on success, -errno on failure. Even on failure, all fields | 3486 | * |
3487 | * Return: 0 on success, -errno on failure. Even on failure, all fields | ||
3464 | * inside @pool proper are initialized and put_unbound_pool() can be called | 3488 | * inside @pool proper are initialized and put_unbound_pool() can be called |
3465 | * on @pool safely to release it. | 3489 | * on @pool safely to release it. |
3466 | */ | 3490 | */ |
@@ -3567,9 +3591,12 @@ static void put_unbound_pool(struct worker_pool *pool) | |||
3567 | * Obtain a worker_pool which has the same attributes as @attrs, bump the | 3591 | * Obtain a worker_pool which has the same attributes as @attrs, bump the |
3568 | * reference count and return it. If there already is a matching | 3592 | * reference count and return it. If there already is a matching |
3569 | * worker_pool, it will be used; otherwise, this function attempts to | 3593 | * worker_pool, it will be used; otherwise, this function attempts to |
3570 | * create a new one. On failure, returns NULL. | 3594 | * create a new one. |
3571 | * | 3595 | * |
3572 | * Should be called with wq_pool_mutex held. | 3596 | * Should be called with wq_pool_mutex held. |
3597 | * | ||
3598 | * Return: On success, a worker_pool with the same attributes as @attrs. | ||
3599 | * On failure, %NULL. | ||
3573 | */ | 3600 | */ |
3574 | static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) | 3601 | static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) |
3575 | { | 3602 | { |
@@ -3805,9 +3832,7 @@ static void free_unbound_pwq(struct pool_workqueue *pwq) | |||
3805 | * | 3832 | * |
3806 | * Calculate the cpumask a workqueue with @attrs should use on @node. If | 3833 | * Calculate the cpumask a workqueue with @attrs should use on @node. If |
3807 | * @cpu_going_down is >= 0, that cpu is considered offline during | 3834 | * @cpu_going_down is >= 0, that cpu is considered offline during |
3808 | * calculation. The result is stored in @cpumask. This function returns | 3835 | * calculation. The result is stored in @cpumask. |
3809 | * %true if the resulting @cpumask is different from @attrs->cpumask, | ||
3810 | * %false if equal. | ||
3811 | * | 3836 | * |
3812 | * If NUMA affinity is not enabled, @attrs->cpumask is always used. If | 3837 | * If NUMA affinity is not enabled, @attrs->cpumask is always used. If |
3813 | * enabled and @node has online CPUs requested by @attrs, the returned | 3838 | * enabled and @node has online CPUs requested by @attrs, the returned |
@@ -3816,6 +3841,9 @@ static void free_unbound_pwq(struct pool_workqueue *pwq) | |||
3816 | * | 3841 | * |
3817 | * The caller is responsible for ensuring that the cpumask of @node stays | 3842 | * The caller is responsible for ensuring that the cpumask of @node stays |
3818 | * stable. | 3843 | * stable. |
3844 | * | ||
3845 | * Return: %true if the resulting @cpumask is different from @attrs->cpumask, | ||
3846 | * %false if equal. | ||
3819 | */ | 3847 | */ |
3820 | static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, | 3848 | static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, |
3821 | int cpu_going_down, cpumask_t *cpumask) | 3849 | int cpu_going_down, cpumask_t *cpumask) |
@@ -3869,8 +3897,9 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, | |||
3869 | * items finish. Note that a work item which repeatedly requeues itself | 3897 | * items finish. Note that a work item which repeatedly requeues itself |
3870 | * back-to-back will stay on its current pwq. | 3898 | * back-to-back will stay on its current pwq. |
3871 | * | 3899 | * |
3872 | * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on | 3900 | * Performs GFP_KERNEL allocations. |
3873 | * failure. | 3901 | * |
3902 | * Return: 0 on success and -errno on failure. | ||
3874 | */ | 3903 | */ |
3875 | int apply_workqueue_attrs(struct workqueue_struct *wq, | 3904 | int apply_workqueue_attrs(struct workqueue_struct *wq, |
3876 | const struct workqueue_attrs *attrs) | 3905 | const struct workqueue_attrs *attrs) |
@@ -4338,6 +4367,8 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active); | |||
4338 | * | 4367 | * |
4339 | * Determine whether %current is a workqueue rescuer. Can be used from | 4368 | * Determine whether %current is a workqueue rescuer. Can be used from |
4340 | * work functions to determine whether it's being run off the rescuer task. | 4369 | * work functions to determine whether it's being run off the rescuer task. |
4370 | * | ||
4371 | * Return: %true if %current is a workqueue rescuer. %false otherwise. | ||
4341 | */ | 4372 | */ |
4342 | bool current_is_workqueue_rescuer(void) | 4373 | bool current_is_workqueue_rescuer(void) |
4343 | { | 4374 | { |
@@ -4361,7 +4392,7 @@ bool current_is_workqueue_rescuer(void) | |||
4361 | * workqueue being congested on one CPU doesn't mean the workqueue is also | 4392 | * workqueue being congested on one CPU doesn't mean the workqueue is also |
4362 | * congested on other CPUs / NUMA nodes. | 4393 | * congested on other CPUs / NUMA nodes. |
4363 | * | 4394 | * |
4364 | * RETURNS: | 4395 | * Return: |
4365 | * %true if congested, %false otherwise. | 4396 | * %true if congested, %false otherwise. |
4366 | */ | 4397 | */ |
4367 | bool workqueue_congested(int cpu, struct workqueue_struct *wq) | 4398 | bool workqueue_congested(int cpu, struct workqueue_struct *wq) |
@@ -4394,7 +4425,7 @@ EXPORT_SYMBOL_GPL(workqueue_congested); | |||
4394 | * synchronization around this function and the test result is | 4425 | * synchronization around this function and the test result is |
4395 | * unreliable and only useful as advisory hints or for debugging. | 4426 | * unreliable and only useful as advisory hints or for debugging. |
4396 | * | 4427 | * |
4397 | * RETURNS: | 4428 | * Return: |
4398 | * OR'd bitmask of WORK_BUSY_* bits. | 4429 | * OR'd bitmask of WORK_BUSY_* bits. |
4399 | */ | 4430 | */ |
4400 | unsigned int work_busy(struct work_struct *work) | 4431 | unsigned int work_busy(struct work_struct *work) |
@@ -4772,9 +4803,10 @@ static void work_for_cpu_fn(struct work_struct *work) | |||
4772 | * @fn: the function to run | 4803 | * @fn: the function to run |
4773 | * @arg: the function arg | 4804 | * @arg: the function arg |
4774 | * | 4805 | * |
4775 | * This will return the value @fn returns. | ||
4776 | * It is up to the caller to ensure that the cpu doesn't go offline. | 4806 | * It is up to the caller to ensure that the cpu doesn't go offline. |
4777 | * The caller must not hold any locks which would prevent @fn from completing. | 4807 | * The caller must not hold any locks which would prevent @fn from completing. |
4808 | * | ||
4809 | * Return: The value @fn returns. | ||
4778 | */ | 4810 | */ |
4779 | long work_on_cpu(int cpu, long (*fn)(void *), void *arg) | 4811 | long work_on_cpu(int cpu, long (*fn)(void *), void *arg) |
4780 | { | 4812 | { |
@@ -4846,7 +4878,7 @@ void freeze_workqueues_begin(void) | |||
4846 | * CONTEXT: | 4878 | * CONTEXT: |
4847 | * Grabs and releases wq_pool_mutex. | 4879 | * Grabs and releases wq_pool_mutex. |
4848 | * | 4880 | * |
4849 | * RETURNS: | 4881 | * Return: |
4850 | * %true if some freezable workqueues are still busy. %false if freezing | 4882 | * %true if some freezable workqueues are still busy. %false if freezing |
4851 | * is complete. | 4883 | * is complete. |
4852 | */ | 4884 | */ |
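The remaining workqueue.c hunks are documentation-only: they rename the ad-hoc "RETURNS:" headings to "Return:", the section name scripts/kernel-doc recognizes, and pull return-value text out of the description body. A minimal kernel-doc sketch of the target shape, for a hypothetical function:

/**
 * example_claim_slot - try to claim a free slot
 * @idx: slot index to claim
 *
 * Claim slot @idx for the caller if nobody else holds it.
 *
 * CONTEXT:
 * Any context; does not sleep.
 *
 * Return:
 * %true if the slot was claimed, %false if it was already taken.
 */
static bool example_claim_slot(int idx)
{
	return true;	/* sketch only */
}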