Diffstat (limited to 'kernel'): 37 files changed, 3010 insertions(+), 2160 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 781845a013ab..e0aeb32415ff 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -81,7 +81,7 @@ | |||
81 | */ | 81 | */ |
82 | #ifdef CONFIG_PROVE_RCU | 82 | #ifdef CONFIG_PROVE_RCU |
83 | DEFINE_MUTEX(cgroup_mutex); | 83 | DEFINE_MUTEX(cgroup_mutex); |
84 | EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ | 84 | EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ |
85 | #else | 85 | #else |
86 | static DEFINE_MUTEX(cgroup_mutex); | 86 | static DEFINE_MUTEX(cgroup_mutex); |
87 | #endif | 87 | #endif |
@@ -117,6 +117,7 @@ struct cfent { | |||
117 | struct list_head node; | 117 | struct list_head node; |
118 | struct dentry *dentry; | 118 | struct dentry *dentry; |
119 | struct cftype *type; | 119 | struct cftype *type; |
120 | struct cgroup_subsys_state *css; | ||
120 | 121 | ||
121 | /* file xattrs */ | 122 | /* file xattrs */ |
122 | struct simple_xattrs xattrs; | 123 | struct simple_xattrs xattrs; |
@@ -159,9 +160,9 @@ struct css_id { | |||
159 | */ | 160 | */ |
160 | struct cgroup_event { | 161 | struct cgroup_event { |
161 | /* | 162 | /* |
162 | * Cgroup which the event belongs to. | 163 | * css which the event belongs to. |
163 | */ | 164 | */ |
164 | struct cgroup *cgrp; | 165 | struct cgroup_subsys_state *css; |
165 | /* | 166 | /* |
166 | * Control file which the event associated. | 167 | * Control file which the event associated. |
167 | */ | 168 | */ |
@@ -215,10 +216,33 @@ static u64 cgroup_serial_nr_next = 1; | |||
215 | */ | 216 | */ |
216 | static int need_forkexit_callback __read_mostly; | 217 | static int need_forkexit_callback __read_mostly; |
217 | 218 | ||
218 | static void cgroup_offline_fn(struct work_struct *work); | 219 | static struct cftype cgroup_base_files[]; |
220 | |||
221 | static void cgroup_destroy_css_killed(struct cgroup *cgrp); | ||
219 | static int cgroup_destroy_locked(struct cgroup *cgrp); | 222 | static int cgroup_destroy_locked(struct cgroup *cgrp); |
220 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | 223 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], |
221 | struct cftype cfts[], bool is_add); | 224 | bool is_add); |
225 | |||
226 | /** | ||
227 | * cgroup_css - obtain a cgroup's css for the specified subsystem | ||
228 | * @cgrp: the cgroup of interest | ||
229 | * @ss: the subsystem of interest (%NULL returns the dummy_css) | ||
230 | * | ||
231 | * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This | ||
232 | * function must be called either under cgroup_mutex or rcu_read_lock() and | ||
233 | * the caller is responsible for pinning the returned css if it wants to | ||
234 | * keep accessing it outside the said locks. This function may return | ||
235 | * %NULL if @cgrp doesn't have @subsys_id enabled. | ||
236 | */ | ||
237 | static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, | ||
238 | struct cgroup_subsys *ss) | ||
239 | { | ||
240 | if (ss) | ||
241 | return rcu_dereference_check(cgrp->subsys[ss->subsys_id], | ||
242 | lockdep_is_held(&cgroup_mutex)); | ||
243 | else | ||
244 | return &cgrp->dummy_css; | ||
245 | } | ||
222 | 246 | ||
223 | /* convenient tests for these bits */ | 247 | /* convenient tests for these bits */ |
224 | static inline bool cgroup_is_dead(const struct cgroup *cgrp) | 248 | static inline bool cgroup_is_dead(const struct cgroup *cgrp) |
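
The new cgroup_css() accessor above replaces open-coded cgrp->subsys[i] dereferences and bakes the locking rule into rcu_dereference_check(): callers must hold either cgroup_mutex or rcu_read_lock(), and must pin the returned css themselves if they keep using it past those locks. A minimal sketch of that contract, assuming a hypothetical caller (pin_css_example) inside kernel/cgroup.c where the static helper is visible:

	/* Illustrative only: shows the pin-before-unlock rule from the comment above. */
	static struct cgroup_subsys_state *
	pin_css_example(struct cgroup *cgrp, struct cgroup_subsys *ss)
	{
		struct cgroup_subsys_state *css;

		rcu_read_lock();			/* satisfies rcu_dereference_check() */
		css = cgroup_css(cgrp, ss);		/* may be NULL if @ss is not enabled */
		if (css && !css_tryget(css))		/* pin while still inside the RCU section */
			css = NULL;
		rcu_read_unlock();

		return css;				/* caller must css_put() when done */
	}
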
@@ -365,9 +389,11 @@ static struct cgrp_cset_link init_cgrp_cset_link; | |||
365 | static int cgroup_init_idr(struct cgroup_subsys *ss, | 389 | static int cgroup_init_idr(struct cgroup_subsys *ss, |
366 | struct cgroup_subsys_state *css); | 390 | struct cgroup_subsys_state *css); |
367 | 391 | ||
368 | /* css_set_lock protects the list of css_set objects, and the | 392 | /* |
369 | * chain of tasks off each css_set. Nests outside task->alloc_lock | 393 | * css_set_lock protects the list of css_set objects, and the chain of |
370 | * due to cgroup_iter_start() */ | 394 | * tasks off each css_set. Nests outside task->alloc_lock due to |
395 | * css_task_iter_start(). | ||
396 | */ | ||
371 | static DEFINE_RWLOCK(css_set_lock); | 397 | static DEFINE_RWLOCK(css_set_lock); |
372 | static int css_set_count; | 398 | static int css_set_count; |
373 | 399 | ||
@@ -392,10 +418,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) | |||
392 | return key; | 418 | return key; |
393 | } | 419 | } |
394 | 420 | ||
395 | /* We don't maintain the lists running through each css_set to its | 421 | /* |
396 | * task until after the first call to cgroup_iter_start(). This | 422 | * We don't maintain the lists running through each css_set to its task |
397 | * reduces the fork()/exit() overhead for people who have cgroups | 423 | * until after the first call to css_task_iter_start(). This reduces the |
398 | * compiled into their kernel but not actually in use */ | 424 | * fork()/exit() overhead for people who have cgroups compiled into their |
425 | * kernel but not actually in use. | ||
426 | */ | ||
399 | static int use_task_css_set_links __read_mostly; | 427 | static int use_task_css_set_links __read_mostly; |
400 | 428 | ||
401 | static void __put_css_set(struct css_set *cset, int taskexit) | 429 | static void __put_css_set(struct css_set *cset, int taskexit) |
@@ -464,7 +492,7 @@ static inline void put_css_set_taskexit(struct css_set *cset) | |||
464 | * @new_cgrp: cgroup that's being entered by the task | 492 | * @new_cgrp: cgroup that's being entered by the task |
465 | * @template: desired set of css pointers in css_set (pre-calculated) | 493 | * @template: desired set of css pointers in css_set (pre-calculated) |
466 | * | 494 | * |
467 | * Returns true if "cg" matches "old_cg" except for the hierarchy | 495 | * Returns true if "cset" matches "old_cset" except for the hierarchy |
468 | * which "new_cgrp" belongs to, for which it should match "new_cgrp". | 496 | * which "new_cgrp" belongs to, for which it should match "new_cgrp". |
469 | */ | 497 | */ |
470 | static bool compare_css_sets(struct css_set *cset, | 498 | static bool compare_css_sets(struct css_set *cset, |
@@ -555,7 +583,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, | |||
555 | /* Subsystem is in this hierarchy. So we want | 583 | /* Subsystem is in this hierarchy. So we want |
556 | * the subsystem state from the new | 584 | * the subsystem state from the new |
557 | * cgroup */ | 585 | * cgroup */ |
558 | template[i] = cgrp->subsys[i]; | 586 | template[i] = cgroup_css(cgrp, ss); |
559 | } else { | 587 | } else { |
560 | /* Subsystem is not in this hierarchy, so we | 588 | /* Subsystem is not in this hierarchy, so we |
561 | * don't want to change the subsystem state */ | 589 | * don't want to change the subsystem state */ |
@@ -803,8 +831,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, | |||
803 | 831 | ||
804 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); | 832 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); |
805 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); | 833 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); |
806 | static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, | 834 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); |
807 | unsigned long subsys_mask); | ||
808 | static const struct inode_operations cgroup_dir_inode_operations; | 835 | static const struct inode_operations cgroup_dir_inode_operations; |
809 | static const struct file_operations proc_cgroupstats_operations; | 836 | static const struct file_operations proc_cgroupstats_operations; |
810 | 837 | ||
@@ -813,8 +840,7 @@ static struct backing_dev_info cgroup_backing_dev_info = { | |||
813 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 840 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, |
814 | }; | 841 | }; |
815 | 842 | ||
816 | static int alloc_css_id(struct cgroup_subsys *ss, | 843 | static int alloc_css_id(struct cgroup_subsys_state *child_css); |
817 | struct cgroup *parent, struct cgroup *child); | ||
818 | 844 | ||
819 | static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) | 845 | static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) |
820 | { | 846 | { |
@@ -845,15 +871,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) | |||
845 | static void cgroup_free_fn(struct work_struct *work) | 871 | static void cgroup_free_fn(struct work_struct *work) |
846 | { | 872 | { |
847 | struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); | 873 | struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); |
848 | struct cgroup_subsys *ss; | ||
849 | 874 | ||
850 | mutex_lock(&cgroup_mutex); | 875 | mutex_lock(&cgroup_mutex); |
851 | /* | ||
852 | * Release the subsystem state objects. | ||
853 | */ | ||
854 | for_each_root_subsys(cgrp->root, ss) | ||
855 | ss->css_free(cgrp); | ||
856 | |||
857 | cgrp->root->number_of_cgroups--; | 876 | cgrp->root->number_of_cgroups--; |
858 | mutex_unlock(&cgroup_mutex); | 877 | mutex_unlock(&cgroup_mutex); |
859 | 878 | ||
@@ -864,8 +883,6 @@ static void cgroup_free_fn(struct work_struct *work) | |||
864 | */ | 883 | */ |
865 | dput(cgrp->parent->dentry); | 884 | dput(cgrp->parent->dentry); |
866 | 885 | ||
867 | ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | ||
868 | |||
869 | /* | 886 | /* |
870 | * Drop the active superblock reference that we took when we | 887 | * Drop the active superblock reference that we took when we |
871 | * created the cgroup. This will free cgrp->root, if we are | 888 | * created the cgroup. This will free cgrp->root, if we are |
@@ -956,27 +973,22 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | |||
956 | } | 973 | } |
957 | 974 | ||
958 | /** | 975 | /** |
959 | * cgroup_clear_directory - selective removal of base and subsystem files | 976 | * cgroup_clear_dir - remove subsys files in a cgroup directory |
960 | * @dir: directory containing the files | 977 | * @cgrp: target cgroup |
961 | * @base_files: true if the base files should be removed | ||
962 | * @subsys_mask: mask of the subsystem ids whose files should be removed | 978 | * @subsys_mask: mask of the subsystem ids whose files should be removed |
963 | */ | 979 | */ |
964 | static void cgroup_clear_directory(struct dentry *dir, bool base_files, | 980 | static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) |
965 | unsigned long subsys_mask) | ||
966 | { | 981 | { |
967 | struct cgroup *cgrp = __d_cgrp(dir); | ||
968 | struct cgroup_subsys *ss; | 982 | struct cgroup_subsys *ss; |
983 | int i; | ||
969 | 984 | ||
970 | for_each_root_subsys(cgrp->root, ss) { | 985 | for_each_subsys(ss, i) { |
971 | struct cftype_set *set; | 986 | struct cftype_set *set; |
972 | if (!test_bit(ss->subsys_id, &subsys_mask)) | 987 | |
988 | if (!test_bit(i, &subsys_mask)) | ||
973 | continue; | 989 | continue; |
974 | list_for_each_entry(set, &ss->cftsets, node) | 990 | list_for_each_entry(set, &ss->cftsets, node) |
975 | cgroup_addrm_files(cgrp, NULL, set->cfts, false); | 991 | cgroup_addrm_files(cgrp, set->cfts, false); |
976 | } | ||
977 | if (base_files) { | ||
978 | while (!list_empty(&cgrp->files)) | ||
979 | cgroup_rm_file(cgrp, NULL); | ||
980 | } | 992 | } |
981 | } | 993 | } |
982 | 994 | ||
@@ -986,9 +998,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files, | |||
986 | static void cgroup_d_remove_dir(struct dentry *dentry) | 998 | static void cgroup_d_remove_dir(struct dentry *dentry) |
987 | { | 999 | { |
988 | struct dentry *parent; | 1000 | struct dentry *parent; |
989 | struct cgroupfs_root *root = dentry->d_sb->s_fs_info; | ||
990 | |||
991 | cgroup_clear_directory(dentry, true, root->subsys_mask); | ||
992 | 1001 | ||
993 | parent = dentry->d_parent; | 1002 | parent = dentry->d_parent; |
994 | spin_lock(&parent->d_lock); | 1003 | spin_lock(&parent->d_lock); |
@@ -1009,79 +1018,84 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
1009 | { | 1018 | { |
1010 | struct cgroup *cgrp = &root->top_cgroup; | 1019 | struct cgroup *cgrp = &root->top_cgroup; |
1011 | struct cgroup_subsys *ss; | 1020 | struct cgroup_subsys *ss; |
1012 | int i; | 1021 | unsigned long pinned = 0; |
1022 | int i, ret; | ||
1013 | 1023 | ||
1014 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); | 1024 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); |
1015 | BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); | 1025 | BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); |
1016 | 1026 | ||
1017 | /* Check that any added subsystems are currently free */ | 1027 | /* Check that any added subsystems are currently free */ |
1018 | for_each_subsys(ss, i) { | 1028 | for_each_subsys(ss, i) { |
1019 | unsigned long bit = 1UL << i; | 1029 | if (!(added_mask & (1 << i))) |
1020 | |||
1021 | if (!(bit & added_mask)) | ||
1022 | continue; | 1030 | continue; |
1023 | 1031 | ||
1032 | /* is the subsystem mounted elsewhere? */ | ||
1024 | if (ss->root != &cgroup_dummy_root) { | 1033 | if (ss->root != &cgroup_dummy_root) { |
1025 | /* Subsystem isn't free */ | 1034 | ret = -EBUSY; |
1026 | return -EBUSY; | 1035 | goto out_put; |
1036 | } | ||
1037 | |||
1038 | /* pin the module */ | ||
1039 | if (!try_module_get(ss->module)) { | ||
1040 | ret = -ENOENT; | ||
1041 | goto out_put; | ||
1027 | } | 1042 | } |
1043 | pinned |= 1 << i; | ||
1028 | } | 1044 | } |
1029 | 1045 | ||
1030 | /* Currently we don't handle adding/removing subsystems when | 1046 | /* subsys could be missing if unloaded between parsing and here */ |
1031 | * any child cgroups exist. This is theoretically supportable | 1047 | if (added_mask != pinned) { |
1032 | * but involves complex error handling, so it's being left until | 1048 | ret = -ENOENT; |
1033 | * later */ | 1049 | goto out_put; |
1034 | if (root->number_of_cgroups > 1) | 1050 | } |
1035 | return -EBUSY; | 1051 | |
1052 | ret = cgroup_populate_dir(cgrp, added_mask); | ||
1053 | if (ret) | ||
1054 | goto out_put; | ||
1055 | |||
1056 | /* | ||
1057 | * Nothing can fail from this point on. Remove files for the | ||
1058 | * removed subsystems and rebind each subsystem. | ||
1059 | */ | ||
1060 | cgroup_clear_dir(cgrp, removed_mask); | ||
1036 | 1061 | ||
1037 | /* Process each subsystem */ | ||
1038 | for_each_subsys(ss, i) { | 1062 | for_each_subsys(ss, i) { |
1039 | unsigned long bit = 1UL << i; | 1063 | unsigned long bit = 1UL << i; |
1040 | 1064 | ||
1041 | if (bit & added_mask) { | 1065 | if (bit & added_mask) { |
1042 | /* We're binding this subsystem to this hierarchy */ | 1066 | /* We're binding this subsystem to this hierarchy */ |
1043 | BUG_ON(cgrp->subsys[i]); | 1067 | BUG_ON(cgroup_css(cgrp, ss)); |
1044 | BUG_ON(!cgroup_dummy_top->subsys[i]); | 1068 | BUG_ON(!cgroup_css(cgroup_dummy_top, ss)); |
1045 | BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); | 1069 | BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top); |
1070 | |||
1071 | rcu_assign_pointer(cgrp->subsys[i], | ||
1072 | cgroup_css(cgroup_dummy_top, ss)); | ||
1073 | cgroup_css(cgrp, ss)->cgroup = cgrp; | ||
1046 | 1074 | ||
1047 | cgrp->subsys[i] = cgroup_dummy_top->subsys[i]; | ||
1048 | cgrp->subsys[i]->cgroup = cgrp; | ||
1049 | list_move(&ss->sibling, &root->subsys_list); | 1075 | list_move(&ss->sibling, &root->subsys_list); |
1050 | ss->root = root; | 1076 | ss->root = root; |
1051 | if (ss->bind) | 1077 | if (ss->bind) |
1052 | ss->bind(cgrp); | 1078 | ss->bind(cgroup_css(cgrp, ss)); |
1053 | 1079 | ||
1054 | /* refcount was already taken, and we're keeping it */ | 1080 | /* refcount was already taken, and we're keeping it */ |
1055 | root->subsys_mask |= bit; | 1081 | root->subsys_mask |= bit; |
1056 | } else if (bit & removed_mask) { | 1082 | } else if (bit & removed_mask) { |
1057 | /* We're removing this subsystem */ | 1083 | /* We're removing this subsystem */ |
1058 | BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); | 1084 | BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss)); |
1059 | BUG_ON(cgrp->subsys[i]->cgroup != cgrp); | 1085 | BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp); |
1060 | 1086 | ||
1061 | if (ss->bind) | 1087 | if (ss->bind) |
1062 | ss->bind(cgroup_dummy_top); | 1088 | ss->bind(cgroup_css(cgroup_dummy_top, ss)); |
1063 | cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; | 1089 | |
1064 | cgrp->subsys[i] = NULL; | 1090 | cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; |
1091 | RCU_INIT_POINTER(cgrp->subsys[i], NULL); | ||
1092 | |||
1065 | cgroup_subsys[i]->root = &cgroup_dummy_root; | 1093 | cgroup_subsys[i]->root = &cgroup_dummy_root; |
1066 | list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); | 1094 | list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); |
1067 | 1095 | ||
1068 | /* subsystem is now free - drop reference on module */ | 1096 | /* subsystem is now free - drop reference on module */ |
1069 | module_put(ss->module); | 1097 | module_put(ss->module); |
1070 | root->subsys_mask &= ~bit; | 1098 | root->subsys_mask &= ~bit; |
1071 | } else if (bit & root->subsys_mask) { | ||
1072 | /* Subsystem state should already exist */ | ||
1073 | BUG_ON(!cgrp->subsys[i]); | ||
1074 | /* | ||
1075 | * a refcount was taken, but we already had one, so | ||
1076 | * drop the extra reference. | ||
1077 | */ | ||
1078 | module_put(ss->module); | ||
1079 | #ifdef CONFIG_MODULE_UNLOAD | ||
1080 | BUG_ON(ss->module && !module_refcount(ss->module)); | ||
1081 | #endif | ||
1082 | } else { | ||
1083 | /* Subsystem state shouldn't exist */ | ||
1084 | BUG_ON(cgrp->subsys[i]); | ||
1085 | } | 1099 | } |
1086 | } | 1100 | } |
1087 | 1101 | ||
@@ -1092,6 +1106,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
1092 | root->flags |= CGRP_ROOT_SUBSYS_BOUND; | 1106 | root->flags |= CGRP_ROOT_SUBSYS_BOUND; |
1093 | 1107 | ||
1094 | return 0; | 1108 | return 0; |
1109 | |||
1110 | out_put: | ||
1111 | for_each_subsys(ss, i) | ||
1112 | if (pinned & (1 << i)) | ||
1113 | module_put(ss->module); | ||
1114 | return ret; | ||
1095 | } | 1115 | } |
1096 | 1116 | ||
1097 | static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) | 1117 | static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) |
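
rebind_subsystems() now pins the subsystem modules itself (the try_module_get() loop earlier in this hunk) and records what it pinned in a local mask so the out_put path can unwind exactly those references; parse_cgroupfs_options() no longer has to take duplicate references up front. A generic sketch of that pin-or-roll-back pattern, with illustrative names (example_pin_modules, mods[]) rather than the kernel's:

	#include <linux/module.h>
	#include <linux/errno.h>

	static int example_pin_modules(struct module **mods, int nr,
				       unsigned long want_mask)
	{
		unsigned long pinned = 0;
		int i;

		for (i = 0; i < nr; i++) {
			if (!(want_mask & (1UL << i)))
				continue;
			if (!try_module_get(mods[i]))	/* raced with module unload */
				goto out_put;
			pinned |= 1UL << i;
		}
		/* all requested modules pinned; nothing can fail past this point */
		return 0;

	out_put:
		for (i = 0; i < nr; i++)
			if (pinned & (1UL << i))
				module_put(mods[i]);
		return -ENOENT;
	}
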
@@ -1142,7 +1162,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1142 | char *token, *o = data; | 1162 | char *token, *o = data; |
1143 | bool all_ss = false, one_ss = false; | 1163 | bool all_ss = false, one_ss = false; |
1144 | unsigned long mask = (unsigned long)-1; | 1164 | unsigned long mask = (unsigned long)-1; |
1145 | bool module_pin_failed = false; | ||
1146 | struct cgroup_subsys *ss; | 1165 | struct cgroup_subsys *ss; |
1147 | int i; | 1166 | int i; |
1148 | 1167 | ||
@@ -1285,52 +1304,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1285 | if (!opts->subsys_mask && !opts->name) | 1304 | if (!opts->subsys_mask && !opts->name) |
1286 | return -EINVAL; | 1305 | return -EINVAL; |
1287 | 1306 | ||
1288 | /* | ||
1289 | * Grab references on all the modules we'll need, so the subsystems | ||
1290 | * don't dance around before rebind_subsystems attaches them. This may | ||
1291 | * take duplicate reference counts on a subsystem that's already used, | ||
1292 | * but rebind_subsystems handles this case. | ||
1293 | */ | ||
1294 | for_each_subsys(ss, i) { | ||
1295 | if (!(opts->subsys_mask & (1UL << i))) | ||
1296 | continue; | ||
1297 | if (!try_module_get(cgroup_subsys[i]->module)) { | ||
1298 | module_pin_failed = true; | ||
1299 | break; | ||
1300 | } | ||
1301 | } | ||
1302 | if (module_pin_failed) { | ||
1303 | /* | ||
1304 | * oops, one of the modules was going away. this means that we | ||
1305 | * raced with a module_delete call, and to the user this is | ||
1306 | * essentially a "subsystem doesn't exist" case. | ||
1307 | */ | ||
1308 | for (i--; i >= 0; i--) { | ||
1309 | /* drop refcounts only on the ones we took */ | ||
1310 | unsigned long bit = 1UL << i; | ||
1311 | |||
1312 | if (!(bit & opts->subsys_mask)) | ||
1313 | continue; | ||
1314 | module_put(cgroup_subsys[i]->module); | ||
1315 | } | ||
1316 | return -ENOENT; | ||
1317 | } | ||
1318 | |||
1319 | return 0; | 1307 | return 0; |
1320 | } | 1308 | } |
1321 | 1309 | ||
1322 | static void drop_parsed_module_refcounts(unsigned long subsys_mask) | ||
1323 | { | ||
1324 | struct cgroup_subsys *ss; | ||
1325 | int i; | ||
1326 | |||
1327 | mutex_lock(&cgroup_mutex); | ||
1328 | for_each_subsys(ss, i) | ||
1329 | if (subsys_mask & (1UL << i)) | ||
1330 | module_put(cgroup_subsys[i]->module); | ||
1331 | mutex_unlock(&cgroup_mutex); | ||
1332 | } | ||
1333 | |||
1334 | static int cgroup_remount(struct super_block *sb, int *flags, char *data) | 1310 | static int cgroup_remount(struct super_block *sb, int *flags, char *data) |
1335 | { | 1311 | { |
1336 | int ret = 0; | 1312 | int ret = 0; |
@@ -1370,22 +1346,15 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1370 | goto out_unlock; | 1346 | goto out_unlock; |
1371 | } | 1347 | } |
1372 | 1348 | ||
1373 | /* | 1349 | /* remounting is not allowed for populated hierarchies */ |
1374 | * Clear out the files of subsystems that should be removed, do | 1350 | if (root->number_of_cgroups > 1) { |
1375 | * this before rebind_subsystems, since rebind_subsystems may | 1351 | ret = -EBUSY; |
1376 | * change this hierarchy's subsys_list. | ||
1377 | */ | ||
1378 | cgroup_clear_directory(cgrp->dentry, false, removed_mask); | ||
1379 | |||
1380 | ret = rebind_subsystems(root, added_mask, removed_mask); | ||
1381 | if (ret) { | ||
1382 | /* rebind_subsystems failed, re-populate the removed files */ | ||
1383 | cgroup_populate_dir(cgrp, false, removed_mask); | ||
1384 | goto out_unlock; | 1352 | goto out_unlock; |
1385 | } | 1353 | } |
1386 | 1354 | ||
1387 | /* re-populate subsystem files */ | 1355 | ret = rebind_subsystems(root, added_mask, removed_mask); |
1388 | cgroup_populate_dir(cgrp, false, added_mask); | 1356 | if (ret) |
1357 | goto out_unlock; | ||
1389 | 1358 | ||
1390 | if (opts.release_agent) | 1359 | if (opts.release_agent) |
1391 | strcpy(root->release_agent_path, opts.release_agent); | 1360 | strcpy(root->release_agent_path, opts.release_agent); |
@@ -1395,8 +1364,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1395 | mutex_unlock(&cgroup_root_mutex); | 1364 | mutex_unlock(&cgroup_root_mutex); |
1396 | mutex_unlock(&cgroup_mutex); | 1365 | mutex_unlock(&cgroup_mutex); |
1397 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | 1366 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); |
1398 | if (ret) | ||
1399 | drop_parsed_module_refcounts(opts.subsys_mask); | ||
1400 | return ret; | 1367 | return ret; |
1401 | } | 1368 | } |
1402 | 1369 | ||
@@ -1416,6 +1383,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1416 | INIT_LIST_HEAD(&cgrp->release_list); | 1383 | INIT_LIST_HEAD(&cgrp->release_list); |
1417 | INIT_LIST_HEAD(&cgrp->pidlists); | 1384 | INIT_LIST_HEAD(&cgrp->pidlists); |
1418 | mutex_init(&cgrp->pidlist_mutex); | 1385 | mutex_init(&cgrp->pidlist_mutex); |
1386 | cgrp->dummy_css.cgroup = cgrp; | ||
1419 | INIT_LIST_HEAD(&cgrp->event_list); | 1387 | INIT_LIST_HEAD(&cgrp->event_list); |
1420 | spin_lock_init(&cgrp->event_list_lock); | 1388 | spin_lock_init(&cgrp->event_list_lock); |
1421 | simple_xattrs_init(&cgrp->xattrs); | 1389 | simple_xattrs_init(&cgrp->xattrs); |
@@ -1431,6 +1399,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) | |||
1431 | cgrp->root = root; | 1399 | cgrp->root = root; |
1432 | RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); | 1400 | RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); |
1433 | init_cgroup_housekeeping(cgrp); | 1401 | init_cgroup_housekeeping(cgrp); |
1402 | idr_init(&root->cgroup_idr); | ||
1434 | } | 1403 | } |
1435 | 1404 | ||
1436 | static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) | 1405 | static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) |
@@ -1503,7 +1472,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) | |||
1503 | */ | 1472 | */ |
1504 | root->subsys_mask = opts->subsys_mask; | 1473 | root->subsys_mask = opts->subsys_mask; |
1505 | root->flags = opts->flags; | 1474 | root->flags = opts->flags; |
1506 | ida_init(&root->cgroup_ida); | ||
1507 | if (opts->release_agent) | 1475 | if (opts->release_agent) |
1508 | strcpy(root->release_agent_path, opts->release_agent); | 1476 | strcpy(root->release_agent_path, opts->release_agent); |
1509 | if (opts->name) | 1477 | if (opts->name) |
@@ -1519,7 +1487,7 @@ static void cgroup_free_root(struct cgroupfs_root *root) | |||
1519 | /* hierarhcy ID shoulid already have been released */ | 1487 | /* hierarhcy ID shoulid already have been released */ |
1520 | WARN_ON_ONCE(root->hierarchy_id); | 1488 | WARN_ON_ONCE(root->hierarchy_id); |
1521 | 1489 | ||
1522 | ida_destroy(&root->cgroup_ida); | 1490 | idr_destroy(&root->cgroup_idr); |
1523 | kfree(root); | 1491 | kfree(root); |
1524 | } | 1492 | } |
1525 | } | 1493 | } |
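
This hunk is part of switching the per-root cgroup ID allocator from an ida to an idr (see the idr_init() added to init_cgroup_root() above and the idr_alloc() call in the mount path below): unlike an ida, an idr maps each allocated ID back to its struct cgroup, which the lookups in later patches rely on. A small sketch of that API using made-up names (example_root, example_root_init):

	#include <linux/idr.h>
	#include <linux/gfp.h>

	struct example_root {
		struct idr cgroup_idr;			/* id -> struct cgroup * */
	};

	static int example_root_init(struct example_root *root, void *top_cgrp)
	{
		int id;

		idr_init(&root->cgroup_idr);
		/* reserve id 0 for the root cgroup, as the mount path below does */
		id = idr_alloc(&root->cgroup_idr, top_cgrp, 0, 1, GFP_KERNEL);
		return id < 0 ? id : 0;
	}

	static void example_root_free(struct example_root *root)
	{
		idr_destroy(&root->cgroup_idr);		/* entries must be idr_remove()d first */
	}
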
@@ -1584,7 +1552,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1584 | int ret = 0; | 1552 | int ret = 0; |
1585 | struct super_block *sb; | 1553 | struct super_block *sb; |
1586 | struct cgroupfs_root *new_root; | 1554 | struct cgroupfs_root *new_root; |
1555 | struct list_head tmp_links; | ||
1587 | struct inode *inode; | 1556 | struct inode *inode; |
1557 | const struct cred *cred; | ||
1588 | 1558 | ||
1589 | /* First find the desired set of subsystems */ | 1559 | /* First find the desired set of subsystems */ |
1590 | mutex_lock(&cgroup_mutex); | 1560 | mutex_lock(&cgroup_mutex); |
@@ -1600,7 +1570,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1600 | new_root = cgroup_root_from_opts(&opts); | 1570 | new_root = cgroup_root_from_opts(&opts); |
1601 | if (IS_ERR(new_root)) { | 1571 | if (IS_ERR(new_root)) { |
1602 | ret = PTR_ERR(new_root); | 1572 | ret = PTR_ERR(new_root); |
1603 | goto drop_modules; | 1573 | goto out_err; |
1604 | } | 1574 | } |
1605 | opts.new_root = new_root; | 1575 | opts.new_root = new_root; |
1606 | 1576 | ||
@@ -1609,17 +1579,15 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1609 | if (IS_ERR(sb)) { | 1579 | if (IS_ERR(sb)) { |
1610 | ret = PTR_ERR(sb); | 1580 | ret = PTR_ERR(sb); |
1611 | cgroup_free_root(opts.new_root); | 1581 | cgroup_free_root(opts.new_root); |
1612 | goto drop_modules; | 1582 | goto out_err; |
1613 | } | 1583 | } |
1614 | 1584 | ||
1615 | root = sb->s_fs_info; | 1585 | root = sb->s_fs_info; |
1616 | BUG_ON(!root); | 1586 | BUG_ON(!root); |
1617 | if (root == opts.new_root) { | 1587 | if (root == opts.new_root) { |
1618 | /* We used the new root structure, so this is a new hierarchy */ | 1588 | /* We used the new root structure, so this is a new hierarchy */ |
1619 | struct list_head tmp_links; | ||
1620 | struct cgroup *root_cgrp = &root->top_cgroup; | 1589 | struct cgroup *root_cgrp = &root->top_cgroup; |
1621 | struct cgroupfs_root *existing_root; | 1590 | struct cgroupfs_root *existing_root; |
1622 | const struct cred *cred; | ||
1623 | int i; | 1591 | int i; |
1624 | struct css_set *cset; | 1592 | struct css_set *cset; |
1625 | 1593 | ||
@@ -1634,6 +1602,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1634 | mutex_lock(&cgroup_mutex); | 1602 | mutex_lock(&cgroup_mutex); |
1635 | mutex_lock(&cgroup_root_mutex); | 1603 | mutex_lock(&cgroup_root_mutex); |
1636 | 1604 | ||
1605 | root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, | ||
1606 | 0, 1, GFP_KERNEL); | ||
1607 | if (root_cgrp->id < 0) | ||
1608 | goto unlock_drop; | ||
1609 | |||
1637 | /* Check for name clashes with existing mounts */ | 1610 | /* Check for name clashes with existing mounts */ |
1638 | ret = -EBUSY; | 1611 | ret = -EBUSY; |
1639 | if (strlen(root->name)) | 1612 | if (strlen(root->name)) |
@@ -1657,26 +1630,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1657 | if (ret) | 1630 | if (ret) |
1658 | goto unlock_drop; | 1631 | goto unlock_drop; |
1659 | 1632 | ||
1633 | sb->s_root->d_fsdata = root_cgrp; | ||
1634 | root_cgrp->dentry = sb->s_root; | ||
1635 | |||
1636 | /* | ||
1637 | * We're inside get_sb() and will call lookup_one_len() to | ||
1638 | * create the root files, which doesn't work if SELinux is | ||
1639 | * in use. The following cred dancing somehow works around | ||
1640 | * it. See 2ce9738ba ("cgroupfs: use init_cred when | ||
1641 | * populating new cgroupfs mount") for more details. | ||
1642 | */ | ||
1643 | cred = override_creds(&init_cred); | ||
1644 | |||
1645 | ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); | ||
1646 | if (ret) | ||
1647 | goto rm_base_files; | ||
1648 | |||
1660 | ret = rebind_subsystems(root, root->subsys_mask, 0); | 1649 | ret = rebind_subsystems(root, root->subsys_mask, 0); |
1661 | if (ret == -EBUSY) { | 1650 | if (ret) |
1662 | free_cgrp_cset_links(&tmp_links); | 1651 | goto rm_base_files; |
1663 | goto unlock_drop; | 1652 | |
1664 | } | 1653 | revert_creds(cred); |
1654 | |||
1665 | /* | 1655 | /* |
1666 | * There must be no failure case after here, since rebinding | 1656 | * There must be no failure case after here, since rebinding |
1667 | * takes care of subsystems' refcounts, which are explicitly | 1657 | * takes care of subsystems' refcounts, which are explicitly |
1668 | * dropped in the failure exit path. | 1658 | * dropped in the failure exit path. |
1669 | */ | 1659 | */ |
1670 | 1660 | ||
1671 | /* EBUSY should be the only error here */ | ||
1672 | BUG_ON(ret); | ||
1673 | |||
1674 | list_add(&root->root_list, &cgroup_roots); | 1661 | list_add(&root->root_list, &cgroup_roots); |
1675 | cgroup_root_count++; | 1662 | cgroup_root_count++; |
1676 | 1663 | ||
1677 | sb->s_root->d_fsdata = root_cgrp; | ||
1678 | root->top_cgroup.dentry = sb->s_root; | ||
1679 | |||
1680 | /* Link the top cgroup in this hierarchy into all | 1664 | /* Link the top cgroup in this hierarchy into all |
1681 | * the css_set objects */ | 1665 | * the css_set objects */ |
1682 | write_lock(&css_set_lock); | 1666 | write_lock(&css_set_lock); |
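
The mount path now creates the base files and binds subsystems while running under init_cred, because lookup_one_len()-based file creation from inside the mount/get_sb path can fail under SELinux with the mounting task's credentials (the comment in the hunk cites commit 2ce9738ba). The override_creds()/revert_creds() pairing looks like this in isolation; treat it as a sketch of the pattern with a hypothetical wrapper and callback, not the exact mount-path code:

	#include <linux/cred.h>

	static void example_with_init_cred(void (*populate)(void))
	{
		const struct cred *old_cred;

		old_cred = override_creds(&init_cred);	/* act as init for the next calls */
		populate();				/* e.g. base-file creation */
		revert_creds(old_cred);			/* restore, including on error paths */
	}
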
@@ -1689,9 +1673,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1689 | BUG_ON(!list_empty(&root_cgrp->children)); | 1673 | BUG_ON(!list_empty(&root_cgrp->children)); |
1690 | BUG_ON(root->number_of_cgroups != 1); | 1674 | BUG_ON(root->number_of_cgroups != 1); |
1691 | 1675 | ||
1692 | cred = override_creds(&init_cred); | ||
1693 | cgroup_populate_dir(root_cgrp, true, root->subsys_mask); | ||
1694 | revert_creds(cred); | ||
1695 | mutex_unlock(&cgroup_root_mutex); | 1676 | mutex_unlock(&cgroup_root_mutex); |
1696 | mutex_unlock(&cgroup_mutex); | 1677 | mutex_unlock(&cgroup_mutex); |
1697 | mutex_unlock(&inode->i_mutex); | 1678 | mutex_unlock(&inode->i_mutex); |
@@ -1711,15 +1692,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1711 | pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); | 1692 | pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); |
1712 | } | 1693 | } |
1713 | } | 1694 | } |
1714 | |||
1715 | /* no subsys rebinding, so refcounts don't change */ | ||
1716 | drop_parsed_module_refcounts(opts.subsys_mask); | ||
1717 | } | 1695 | } |
1718 | 1696 | ||
1719 | kfree(opts.release_agent); | 1697 | kfree(opts.release_agent); |
1720 | kfree(opts.name); | 1698 | kfree(opts.name); |
1721 | return dget(sb->s_root); | 1699 | return dget(sb->s_root); |
1722 | 1700 | ||
1701 | rm_base_files: | ||
1702 | free_cgrp_cset_links(&tmp_links); | ||
1703 | cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); | ||
1704 | revert_creds(cred); | ||
1723 | unlock_drop: | 1705 | unlock_drop: |
1724 | cgroup_exit_root_id(root); | 1706 | cgroup_exit_root_id(root); |
1725 | mutex_unlock(&cgroup_root_mutex); | 1707 | mutex_unlock(&cgroup_root_mutex); |
@@ -1727,8 +1709,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1727 | mutex_unlock(&inode->i_mutex); | 1709 | mutex_unlock(&inode->i_mutex); |
1728 | drop_new_super: | 1710 | drop_new_super: |
1729 | deactivate_locked_super(sb); | 1711 | deactivate_locked_super(sb); |
1730 | drop_modules: | ||
1731 | drop_parsed_module_refcounts(opts.subsys_mask); | ||
1732 | out_err: | 1712 | out_err: |
1733 | kfree(opts.release_agent); | 1713 | kfree(opts.release_agent); |
1734 | kfree(opts.name); | 1714 | kfree(opts.name); |
@@ -1746,6 +1726,7 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
1746 | BUG_ON(root->number_of_cgroups != 1); | 1726 | BUG_ON(root->number_of_cgroups != 1); |
1747 | BUG_ON(!list_empty(&cgrp->children)); | 1727 | BUG_ON(!list_empty(&cgrp->children)); |
1748 | 1728 | ||
1729 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); | ||
1749 | mutex_lock(&cgroup_mutex); | 1730 | mutex_lock(&cgroup_mutex); |
1750 | mutex_lock(&cgroup_root_mutex); | 1731 | mutex_lock(&cgroup_root_mutex); |
1751 | 1732 | ||
@@ -1778,6 +1759,7 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
1778 | 1759 | ||
1779 | mutex_unlock(&cgroup_root_mutex); | 1760 | mutex_unlock(&cgroup_root_mutex); |
1780 | mutex_unlock(&cgroup_mutex); | 1761 | mutex_unlock(&cgroup_mutex); |
1762 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | ||
1781 | 1763 | ||
1782 | simple_xattrs_free(&cgrp->xattrs); | 1764 | simple_xattrs_free(&cgrp->xattrs); |
1783 | 1765 | ||
@@ -1889,7 +1871,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path); | |||
1889 | struct task_and_cgroup { | 1871 | struct task_and_cgroup { |
1890 | struct task_struct *task; | 1872 | struct task_struct *task; |
1891 | struct cgroup *cgrp; | 1873 | struct cgroup *cgrp; |
1892 | struct css_set *cg; | 1874 | struct css_set *cset; |
1893 | }; | 1875 | }; |
1894 | 1876 | ||
1895 | struct cgroup_taskset { | 1877 | struct cgroup_taskset { |
@@ -1939,18 +1921,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) | |||
1939 | EXPORT_SYMBOL_GPL(cgroup_taskset_next); | 1921 | EXPORT_SYMBOL_GPL(cgroup_taskset_next); |
1940 | 1922 | ||
1941 | /** | 1923 | /** |
1942 | * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task | 1924 | * cgroup_taskset_cur_css - return the matching css for the current task |
1943 | * @tset: taskset of interest | 1925 | * @tset: taskset of interest |
1926 | * @subsys_id: the ID of the target subsystem | ||
1944 | * | 1927 | * |
1945 | * Return the cgroup for the current (last returned) task of @tset. This | 1928 | * Return the css for the current (last returned) task of @tset for |
1946 | * function must be preceded by either cgroup_taskset_first() or | 1929 | * subsystem specified by @subsys_id. This function must be preceded by |
1947 | * cgroup_taskset_next(). | 1930 | * either cgroup_taskset_first() or cgroup_taskset_next(). |
1948 | */ | 1931 | */ |
1949 | struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) | 1932 | struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, |
1933 | int subsys_id) | ||
1950 | { | 1934 | { |
1951 | return tset->cur_cgrp; | 1935 | return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]); |
1952 | } | 1936 | } |
1953 | EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); | 1937 | EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); |
1954 | 1938 | ||
1955 | /** | 1939 | /** |
1956 | * cgroup_taskset_size - return the number of tasks in taskset | 1940 | * cgroup_taskset_size - return the number of tasks in taskset |
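
cgroup_taskset_cur_cgroup() becomes cgroup_taskset_cur_css(): subsystem methods that already receive a css (see the can_attach()/attach() call sites further down) now also get per-subsystem state for the task being iterated, keyed by subsys_id. A hedged sketch of a consumer, where example_can_attach() is a made-up subsystem callback and freezer_subsys_id stands in for whichever subsystem ID the real caller would use:

	static int example_can_attach(struct cgroup_subsys_state *css,
				      struct cgroup_taskset *tset)
	{
		struct task_struct *task;

		for (task = cgroup_taskset_first(tset); task;
		     task = cgroup_taskset_next(tset)) {
			/* css of the subsystem state this task is being attached to */
			struct cgroup_subsys_state *cur_css =
				cgroup_taskset_cur_css(tset, freezer_subsys_id);

			/* per-task, per-subsystem checks would go here */
			(void)cur_css;
		}
		return 0;
	}
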
@@ -2089,8 +2073,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2089 | * step 1: check that we can legitimately attach to the cgroup. | 2073 | * step 1: check that we can legitimately attach to the cgroup. |
2090 | */ | 2074 | */ |
2091 | for_each_root_subsys(root, ss) { | 2075 | for_each_root_subsys(root, ss) { |
2076 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); | ||
2077 | |||
2092 | if (ss->can_attach) { | 2078 | if (ss->can_attach) { |
2093 | retval = ss->can_attach(cgrp, &tset); | 2079 | retval = ss->can_attach(css, &tset); |
2094 | if (retval) { | 2080 | if (retval) { |
2095 | failed_ss = ss; | 2081 | failed_ss = ss; |
2096 | goto out_cancel_attach; | 2082 | goto out_cancel_attach; |
@@ -2107,8 +2093,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2107 | 2093 | ||
2108 | tc = flex_array_get(group, i); | 2094 | tc = flex_array_get(group, i); |
2109 | old_cset = task_css_set(tc->task); | 2095 | old_cset = task_css_set(tc->task); |
2110 | tc->cg = find_css_set(old_cset, cgrp); | 2096 | tc->cset = find_css_set(old_cset, cgrp); |
2111 | if (!tc->cg) { | 2097 | if (!tc->cset) { |
2112 | retval = -ENOMEM; | 2098 | retval = -ENOMEM; |
2113 | goto out_put_css_set_refs; | 2099 | goto out_put_css_set_refs; |
2114 | } | 2100 | } |
@@ -2121,7 +2107,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2121 | */ | 2107 | */ |
2122 | for (i = 0; i < group_size; i++) { | 2108 | for (i = 0; i < group_size; i++) { |
2123 | tc = flex_array_get(group, i); | 2109 | tc = flex_array_get(group, i); |
2124 | cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); | 2110 | cgroup_task_migrate(tc->cgrp, tc->task, tc->cset); |
2125 | } | 2111 | } |
2126 | /* nothing is sensitive to fork() after this point. */ | 2112 | /* nothing is sensitive to fork() after this point. */ |
2127 | 2113 | ||
@@ -2129,8 +2115,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, | |||
2129 | * step 4: do subsystem attach callbacks. | 2115 | * step 4: do subsystem attach callbacks. |
2130 | */ | 2116 | */ |
2131 | for_each_root_subsys(root, ss) { | 2117 | for_each_root_subsys(root, ss) { |
2118 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); | ||
2119 | |||
2132 | if (ss->attach) | 2120 | if (ss->attach) |
2133 | ss->attach(cgrp, &tset); | 2121 | ss->attach(css, &tset); |
2134 | } | 2122 | } |
2135 | 2123 | ||
2136 | /* | 2124 | /* |
@@ -2141,18 +2129,20 @@ out_put_css_set_refs: | |||
2141 | if (retval) { | 2129 | if (retval) { |
2142 | for (i = 0; i < group_size; i++) { | 2130 | for (i = 0; i < group_size; i++) { |
2143 | tc = flex_array_get(group, i); | 2131 | tc = flex_array_get(group, i); |
2144 | if (!tc->cg) | 2132 | if (!tc->cset) |
2145 | break; | 2133 | break; |
2146 | put_css_set(tc->cg); | 2134 | put_css_set(tc->cset); |
2147 | } | 2135 | } |
2148 | } | 2136 | } |
2149 | out_cancel_attach: | 2137 | out_cancel_attach: |
2150 | if (retval) { | 2138 | if (retval) { |
2151 | for_each_root_subsys(root, ss) { | 2139 | for_each_root_subsys(root, ss) { |
2140 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); | ||
2141 | |||
2152 | if (ss == failed_ss) | 2142 | if (ss == failed_ss) |
2153 | break; | 2143 | break; |
2154 | if (ss->cancel_attach) | 2144 | if (ss->cancel_attach) |
2155 | ss->cancel_attach(cgrp, &tset); | 2145 | ss->cancel_attach(css, &tset); |
2156 | } | 2146 | } |
2157 | } | 2147 | } |
2158 | out_free_group_list: | 2148 | out_free_group_list: |
@@ -2253,9 +2243,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | |||
2253 | 2243 | ||
2254 | mutex_lock(&cgroup_mutex); | 2244 | mutex_lock(&cgroup_mutex); |
2255 | for_each_active_root(root) { | 2245 | for_each_active_root(root) { |
2256 | struct cgroup *from_cg = task_cgroup_from_root(from, root); | 2246 | struct cgroup *from_cgrp = task_cgroup_from_root(from, root); |
2257 | 2247 | ||
2258 | retval = cgroup_attach_task(from_cg, tsk, false); | 2248 | retval = cgroup_attach_task(from_cgrp, tsk, false); |
2259 | if (retval) | 2249 | if (retval) |
2260 | break; | 2250 | break; |
2261 | } | 2251 | } |
@@ -2265,34 +2255,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | |||
2265 | } | 2255 | } |
2266 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); | 2256 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); |
2267 | 2257 | ||
2268 | static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) | 2258 | static int cgroup_tasks_write(struct cgroup_subsys_state *css, |
2259 | struct cftype *cft, u64 pid) | ||
2269 | { | 2260 | { |
2270 | return attach_task_by_pid(cgrp, pid, false); | 2261 | return attach_task_by_pid(css->cgroup, pid, false); |
2271 | } | 2262 | } |
2272 | 2263 | ||
2273 | static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) | 2264 | static int cgroup_procs_write(struct cgroup_subsys_state *css, |
2265 | struct cftype *cft, u64 tgid) | ||
2274 | { | 2266 | { |
2275 | return attach_task_by_pid(cgrp, tgid, true); | 2267 | return attach_task_by_pid(css->cgroup, tgid, true); |
2276 | } | 2268 | } |
2277 | 2269 | ||
2278 | static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, | 2270 | static int cgroup_release_agent_write(struct cgroup_subsys_state *css, |
2279 | const char *buffer) | 2271 | struct cftype *cft, const char *buffer) |
2280 | { | 2272 | { |
2281 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); | 2273 | BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); |
2282 | if (strlen(buffer) >= PATH_MAX) | 2274 | if (strlen(buffer) >= PATH_MAX) |
2283 | return -EINVAL; | 2275 | return -EINVAL; |
2284 | if (!cgroup_lock_live_group(cgrp)) | 2276 | if (!cgroup_lock_live_group(css->cgroup)) |
2285 | return -ENODEV; | 2277 | return -ENODEV; |
2286 | mutex_lock(&cgroup_root_mutex); | 2278 | mutex_lock(&cgroup_root_mutex); |
2287 | strcpy(cgrp->root->release_agent_path, buffer); | 2279 | strcpy(css->cgroup->root->release_agent_path, buffer); |
2288 | mutex_unlock(&cgroup_root_mutex); | 2280 | mutex_unlock(&cgroup_root_mutex); |
2289 | mutex_unlock(&cgroup_mutex); | 2281 | mutex_unlock(&cgroup_mutex); |
2290 | return 0; | 2282 | return 0; |
2291 | } | 2283 | } |
2292 | 2284 | ||
2293 | static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, | 2285 | static int cgroup_release_agent_show(struct cgroup_subsys_state *css, |
2294 | struct seq_file *seq) | 2286 | struct cftype *cft, struct seq_file *seq) |
2295 | { | 2287 | { |
2288 | struct cgroup *cgrp = css->cgroup; | ||
2289 | |||
2296 | if (!cgroup_lock_live_group(cgrp)) | 2290 | if (!cgroup_lock_live_group(cgrp)) |
2297 | return -ENODEV; | 2291 | return -ENODEV; |
2298 | seq_puts(seq, cgrp->root->release_agent_path); | 2292 | seq_puts(seq, cgrp->root->release_agent_path); |
@@ -2301,20 +2295,20 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, | |||
2301 | return 0; | 2295 | return 0; |
2302 | } | 2296 | } |
2303 | 2297 | ||
2304 | static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, | 2298 | static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, |
2305 | struct seq_file *seq) | 2299 | struct cftype *cft, struct seq_file *seq) |
2306 | { | 2300 | { |
2307 | seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); | 2301 | seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); |
2308 | return 0; | 2302 | return 0; |
2309 | } | 2303 | } |
2310 | 2304 | ||
2311 | /* A buffer size big enough for numbers or short strings */ | 2305 | /* A buffer size big enough for numbers or short strings */ |
2312 | #define CGROUP_LOCAL_BUFFER_SIZE 64 | 2306 | #define CGROUP_LOCAL_BUFFER_SIZE 64 |
2313 | 2307 | ||
2314 | static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, | 2308 | static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, |
2315 | struct file *file, | 2309 | struct cftype *cft, struct file *file, |
2316 | const char __user *userbuf, | 2310 | const char __user *userbuf, size_t nbytes, |
2317 | size_t nbytes, loff_t *unused_ppos) | 2311 | loff_t *unused_ppos) |
2318 | { | 2312 | { |
2319 | char buffer[CGROUP_LOCAL_BUFFER_SIZE]; | 2313 | char buffer[CGROUP_LOCAL_BUFFER_SIZE]; |
2320 | int retval = 0; | 2314 | int retval = 0; |
@@ -2332,22 +2326,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, | |||
2332 | u64 val = simple_strtoull(strstrip(buffer), &end, 0); | 2326 | u64 val = simple_strtoull(strstrip(buffer), &end, 0); |
2333 | if (*end) | 2327 | if (*end) |
2334 | return -EINVAL; | 2328 | return -EINVAL; |
2335 | retval = cft->write_u64(cgrp, cft, val); | 2329 | retval = cft->write_u64(css, cft, val); |
2336 | } else { | 2330 | } else { |
2337 | s64 val = simple_strtoll(strstrip(buffer), &end, 0); | 2331 | s64 val = simple_strtoll(strstrip(buffer), &end, 0); |
2338 | if (*end) | 2332 | if (*end) |
2339 | return -EINVAL; | 2333 | return -EINVAL; |
2340 | retval = cft->write_s64(cgrp, cft, val); | 2334 | retval = cft->write_s64(css, cft, val); |
2341 | } | 2335 | } |
2342 | if (!retval) | 2336 | if (!retval) |
2343 | retval = nbytes; | 2337 | retval = nbytes; |
2344 | return retval; | 2338 | return retval; |
2345 | } | 2339 | } |
2346 | 2340 | ||
2347 | static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, | 2341 | static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, |
2348 | struct file *file, | 2342 | struct cftype *cft, struct file *file, |
2349 | const char __user *userbuf, | 2343 | const char __user *userbuf, size_t nbytes, |
2350 | size_t nbytes, loff_t *unused_ppos) | 2344 | loff_t *unused_ppos) |
2351 | { | 2345 | { |
2352 | char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; | 2346 | char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; |
2353 | int retval = 0; | 2347 | int retval = 0; |
@@ -2370,7 +2364,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, | |||
2370 | } | 2364 | } |
2371 | 2365 | ||
2372 | buffer[nbytes] = 0; /* nul-terminate */ | 2366 | buffer[nbytes] = 0; /* nul-terminate */ |
2373 | retval = cft->write_string(cgrp, cft, strstrip(buffer)); | 2367 | retval = cft->write_string(css, cft, strstrip(buffer)); |
2374 | if (!retval) | 2368 | if (!retval) |
2375 | retval = nbytes; | 2369 | retval = nbytes; |
2376 | out: | 2370 | out: |
@@ -2380,65 +2374,60 @@ out: | |||
2380 | } | 2374 | } |
2381 | 2375 | ||
2382 | static ssize_t cgroup_file_write(struct file *file, const char __user *buf, | 2376 | static ssize_t cgroup_file_write(struct file *file, const char __user *buf, |
2383 | size_t nbytes, loff_t *ppos) | 2377 | size_t nbytes, loff_t *ppos) |
2384 | { | 2378 | { |
2379 | struct cfent *cfe = __d_cfe(file->f_dentry); | ||
2385 | struct cftype *cft = __d_cft(file->f_dentry); | 2380 | struct cftype *cft = __d_cft(file->f_dentry); |
2386 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2381 | struct cgroup_subsys_state *css = cfe->css; |
2387 | 2382 | ||
2388 | if (cgroup_is_dead(cgrp)) | ||
2389 | return -ENODEV; | ||
2390 | if (cft->write) | 2383 | if (cft->write) |
2391 | return cft->write(cgrp, cft, file, buf, nbytes, ppos); | 2384 | return cft->write(css, cft, file, buf, nbytes, ppos); |
2392 | if (cft->write_u64 || cft->write_s64) | 2385 | if (cft->write_u64 || cft->write_s64) |
2393 | return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); | 2386 | return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); |
2394 | if (cft->write_string) | 2387 | if (cft->write_string) |
2395 | return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); | 2388 | return cgroup_write_string(css, cft, file, buf, nbytes, ppos); |
2396 | if (cft->trigger) { | 2389 | if (cft->trigger) { |
2397 | int ret = cft->trigger(cgrp, (unsigned int)cft->private); | 2390 | int ret = cft->trigger(css, (unsigned int)cft->private); |
2398 | return ret ? ret : nbytes; | 2391 | return ret ? ret : nbytes; |
2399 | } | 2392 | } |
2400 | return -EINVAL; | 2393 | return -EINVAL; |
2401 | } | 2394 | } |
2402 | 2395 | ||
2403 | static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, | 2396 | static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, |
2404 | struct file *file, | 2397 | struct cftype *cft, struct file *file, |
2405 | char __user *buf, size_t nbytes, | 2398 | char __user *buf, size_t nbytes, loff_t *ppos) |
2406 | loff_t *ppos) | ||
2407 | { | 2399 | { |
2408 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; | 2400 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; |
2409 | u64 val = cft->read_u64(cgrp, cft); | 2401 | u64 val = cft->read_u64(css, cft); |
2410 | int len = sprintf(tmp, "%llu\n", (unsigned long long) val); | 2402 | int len = sprintf(tmp, "%llu\n", (unsigned long long) val); |
2411 | 2403 | ||
2412 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | 2404 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); |
2413 | } | 2405 | } |
2414 | 2406 | ||
2415 | static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, | 2407 | static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, |
2416 | struct file *file, | 2408 | struct cftype *cft, struct file *file, |
2417 | char __user *buf, size_t nbytes, | 2409 | char __user *buf, size_t nbytes, loff_t *ppos) |
2418 | loff_t *ppos) | ||
2419 | { | 2410 | { |
2420 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; | 2411 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; |
2421 | s64 val = cft->read_s64(cgrp, cft); | 2412 | s64 val = cft->read_s64(css, cft); |
2422 | int len = sprintf(tmp, "%lld\n", (long long) val); | 2413 | int len = sprintf(tmp, "%lld\n", (long long) val); |
2423 | 2414 | ||
2424 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | 2415 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); |
2425 | } | 2416 | } |
2426 | 2417 | ||
2427 | static ssize_t cgroup_file_read(struct file *file, char __user *buf, | 2418 | static ssize_t cgroup_file_read(struct file *file, char __user *buf, |
2428 | size_t nbytes, loff_t *ppos) | 2419 | size_t nbytes, loff_t *ppos) |
2429 | { | 2420 | { |
2421 | struct cfent *cfe = __d_cfe(file->f_dentry); | ||
2430 | struct cftype *cft = __d_cft(file->f_dentry); | 2422 | struct cftype *cft = __d_cft(file->f_dentry); |
2431 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2423 | struct cgroup_subsys_state *css = cfe->css; |
2432 | |||
2433 | if (cgroup_is_dead(cgrp)) | ||
2434 | return -ENODEV; | ||
2435 | 2424 | ||
2436 | if (cft->read) | 2425 | if (cft->read) |
2437 | return cft->read(cgrp, cft, file, buf, nbytes, ppos); | 2426 | return cft->read(css, cft, file, buf, nbytes, ppos); |
2438 | if (cft->read_u64) | 2427 | if (cft->read_u64) |
2439 | return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); | 2428 | return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); |
2440 | if (cft->read_s64) | 2429 | if (cft->read_s64) |
2441 | return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); | 2430 | return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); |
2442 | return -EINVAL; | 2431 | return -EINVAL; |
2443 | } | 2432 | } |
2444 | 2433 | ||
@@ -2447,11 +2436,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, | |||
2447 | * supports string->u64 maps, but can be extended in future. | 2436 | * supports string->u64 maps, but can be extended in future. |
2448 | */ | 2437 | */ |
2449 | 2438 | ||
2450 | struct cgroup_seqfile_state { | ||
2451 | struct cftype *cft; | ||
2452 | struct cgroup *cgroup; | ||
2453 | }; | ||
2454 | |||
2455 | static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) | 2439 | static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) |
2456 | { | 2440 | { |
2457 | struct seq_file *sf = cb->state; | 2441 | struct seq_file *sf = cb->state; |
@@ -2460,69 +2444,86 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) | |||
2460 | 2444 | ||
2461 | static int cgroup_seqfile_show(struct seq_file *m, void *arg) | 2445 | static int cgroup_seqfile_show(struct seq_file *m, void *arg) |
2462 | { | 2446 | { |
2463 | struct cgroup_seqfile_state *state = m->private; | 2447 | struct cfent *cfe = m->private; |
2464 | struct cftype *cft = state->cft; | 2448 | struct cftype *cft = cfe->type; |
2449 | struct cgroup_subsys_state *css = cfe->css; | ||
2450 | |||
2465 | if (cft->read_map) { | 2451 | if (cft->read_map) { |
2466 | struct cgroup_map_cb cb = { | 2452 | struct cgroup_map_cb cb = { |
2467 | .fill = cgroup_map_add, | 2453 | .fill = cgroup_map_add, |
2468 | .state = m, | 2454 | .state = m, |
2469 | }; | 2455 | }; |
2470 | return cft->read_map(state->cgroup, cft, &cb); | 2456 | return cft->read_map(css, cft, &cb); |
2471 | } | 2457 | } |
2472 | return cft->read_seq_string(state->cgroup, cft, m); | 2458 | return cft->read_seq_string(css, cft, m); |
2473 | } | ||
2474 | |||
2475 | static int cgroup_seqfile_release(struct inode *inode, struct file *file) | ||
2476 | { | ||
2477 | struct seq_file *seq = file->private_data; | ||
2478 | kfree(seq->private); | ||
2479 | return single_release(inode, file); | ||
2480 | } | 2459 | } |
2481 | 2460 | ||
2482 | static const struct file_operations cgroup_seqfile_operations = { | 2461 | static const struct file_operations cgroup_seqfile_operations = { |
2483 | .read = seq_read, | 2462 | .read = seq_read, |
2484 | .write = cgroup_file_write, | 2463 | .write = cgroup_file_write, |
2485 | .llseek = seq_lseek, | 2464 | .llseek = seq_lseek, |
2486 | .release = cgroup_seqfile_release, | 2465 | .release = single_release, |
2487 | }; | 2466 | }; |
2488 | 2467 | ||
2489 | static int cgroup_file_open(struct inode *inode, struct file *file) | 2468 | static int cgroup_file_open(struct inode *inode, struct file *file) |
2490 | { | 2469 | { |
2470 | struct cfent *cfe = __d_cfe(file->f_dentry); | ||
2471 | struct cftype *cft = __d_cft(file->f_dentry); | ||
2472 | struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); | ||
2473 | struct cgroup_subsys_state *css; | ||
2491 | int err; | 2474 | int err; |
2492 | struct cftype *cft; | ||
2493 | 2475 | ||
2494 | err = generic_file_open(inode, file); | 2476 | err = generic_file_open(inode, file); |
2495 | if (err) | 2477 | if (err) |
2496 | return err; | 2478 | return err; |
2497 | cft = __d_cft(file->f_dentry); | ||
2498 | 2479 | ||
2499 | if (cft->read_map || cft->read_seq_string) { | 2480 | /* |
2500 | struct cgroup_seqfile_state *state; | 2481 | * If the file belongs to a subsystem, pin the css. Will be |
2482 | * unpinned either on open failure or release. This ensures that | ||
2483 | * @css stays alive for all file operations. | ||
2484 | */ | ||
2485 | rcu_read_lock(); | ||
2486 | css = cgroup_css(cgrp, cft->ss); | ||
2487 | if (cft->ss && !css_tryget(css)) | ||
2488 | css = NULL; | ||
2489 | rcu_read_unlock(); | ||
2501 | 2490 | ||
2502 | state = kzalloc(sizeof(*state), GFP_USER); | 2491 | if (!css) |
2503 | if (!state) | 2492 | return -ENODEV; |
2504 | return -ENOMEM; | ||
2505 | 2493 | ||
2506 | state->cft = cft; | 2494 | /* |
2507 | state->cgroup = __d_cgrp(file->f_dentry->d_parent); | 2495 | * @cfe->css is used by read/write/close to determine the |
2496 | * associated css. @file->private_data would be a better place but | ||
2497 | * that's already used by seqfile. Multiple accessors may use it | ||
2498 | * simultaneously which is okay as the association never changes. | ||
2499 | */ | ||
2500 | WARN_ON_ONCE(cfe->css && cfe->css != css); | ||
2501 | cfe->css = css; | ||
2502 | |||
2503 | if (cft->read_map || cft->read_seq_string) { | ||
2508 | file->f_op = &cgroup_seqfile_operations; | 2504 | file->f_op = &cgroup_seqfile_operations; |
2509 | err = single_open(file, cgroup_seqfile_show, state); | 2505 | err = single_open(file, cgroup_seqfile_show, cfe); |
2510 | if (err < 0) | 2506 | } else if (cft->open) { |
2511 | kfree(state); | ||
2512 | } else if (cft->open) | ||
2513 | err = cft->open(inode, file); | 2507 | err = cft->open(inode, file); |
2514 | else | 2508 | } |
2515 | err = 0; | ||
2516 | 2509 | ||
2510 | if (css->ss && err) | ||
2511 | css_put(css); | ||
2517 | return err; | 2512 | return err; |
2518 | } | 2513 | } |
2519 | 2514 | ||
2520 | static int cgroup_file_release(struct inode *inode, struct file *file) | 2515 | static int cgroup_file_release(struct inode *inode, struct file *file) |
2521 | { | 2516 | { |
2517 | struct cfent *cfe = __d_cfe(file->f_dentry); | ||
2522 | struct cftype *cft = __d_cft(file->f_dentry); | 2518 | struct cftype *cft = __d_cft(file->f_dentry); |
2519 | struct cgroup_subsys_state *css = cfe->css; | ||
2520 | int ret = 0; | ||
2521 | |||
2523 | if (cft->release) | 2522 | if (cft->release) |
2524 | return cft->release(inode, file); | 2523 | ret = cft->release(inode, file); |
2525 | return 0; | 2524 | if (css->ss) |
2525 | css_put(css); | ||
2526 | return ret; | ||
2526 | } | 2527 | } |
2527 | 2528 | ||
2528 | /* | 2529 | /* |
@@ -2736,8 +2737,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) | |||
2736 | return mode; | 2737 | return mode; |
2737 | } | 2738 | } |
2738 | 2739 | ||
2739 | static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, | 2740 | static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) |
2740 | struct cftype *cft) | ||
2741 | { | 2741 | { |
2742 | struct dentry *dir = cgrp->dentry; | 2742 | struct dentry *dir = cgrp->dentry; |
2743 | struct cgroup *parent = __d_cgrp(dir); | 2743 | struct cgroup *parent = __d_cgrp(dir); |
@@ -2747,8 +2747,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2747 | umode_t mode; | 2747 | umode_t mode; |
2748 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; | 2748 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; |
2749 | 2749 | ||
2750 | if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { | 2750 | if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && |
2751 | strcpy(name, subsys->name); | 2751 | !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { |
2752 | strcpy(name, cft->ss->name); | ||
2752 | strcat(name, "."); | 2753 | strcat(name, "."); |
2753 | } | 2754 | } |
2754 | strcat(name, cft->name); | 2755 | strcat(name, cft->name); |
@@ -2782,11 +2783,25 @@ out: | |||
2782 | return error; | 2783 | return error; |
2783 | } | 2784 | } |
2784 | 2785 | ||
2785 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | 2786 | /** |
2786 | struct cftype cfts[], bool is_add) | 2787 | * cgroup_addrm_files - add or remove files to a cgroup directory |
2788 | * @cgrp: the target cgroup | ||
2789 | * @cfts: array of cftypes to be added | ||
2790 | * @is_add: whether to add or remove | ||
2791 | * | ||
2792 | * Depending on @is_add, add or remove files defined by @cfts on @cgrp. | ||
2793 | * For removals, this function never fails. If addition fails, this | ||
2794 | * function doesn't remove files already added. The caller is responsible | ||
2795 | * for cleaning up. | ||
2796 | */ | ||
2797 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | ||
2798 | bool is_add) | ||
2787 | { | 2799 | { |
2788 | struct cftype *cft; | 2800 | struct cftype *cft; |
2789 | int err, ret = 0; | 2801 | int ret; |
2802 | |||
2803 | lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); | ||
2804 | lockdep_assert_held(&cgroup_mutex); | ||
2790 | 2805 | ||
2791 | for (cft = cfts; cft->name[0] != '\0'; cft++) { | 2806 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
2792 | /* does cft->flags tell us to skip this file on @cgrp? */ | 2807 | /* does cft->flags tell us to skip this file on @cgrp? */ |
@@ -2798,16 +2813,17 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2798 | continue; | 2813 | continue; |
2799 | 2814 | ||
2800 | if (is_add) { | 2815 | if (is_add) { |
2801 | err = cgroup_add_file(cgrp, subsys, cft); | 2816 | ret = cgroup_add_file(cgrp, cft); |
2802 | if (err) | 2817 | if (ret) { |
2803 | pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", | 2818 | pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", |
2804 | cft->name, err); | 2819 | cft->name, ret); |
2805 | ret = err; | 2820 | return ret; |
2821 | } | ||
2806 | } else { | 2822 | } else { |
2807 | cgroup_rm_file(cgrp, cft); | 2823 | cgroup_rm_file(cgrp, cft); |
2808 | } | 2824 | } |
2809 | } | 2825 | } |
2810 | return ret; | 2826 | return 0; |
2811 | } | 2827 | } |
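
A hypothetical caller sketch (cgroup_addrm_files() is static, so this only applies within cgroup.c; the surrounding names are illustrative): because addition now stops at the first failure and no longer undoes partial work, a caller pairs the add with an explicit remove on error:

	ret = cgroup_addrm_files(cgrp, cfts, true);
	if (ret)
		/* roll back whatever was created before the failure */
		cgroup_addrm_files(cgrp, cfts, false);
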
2812 | 2828 | ||
2813 | static void cgroup_cfts_prepare(void) | 2829 | static void cgroup_cfts_prepare(void) |
@@ -2816,28 +2832,30 @@ static void cgroup_cfts_prepare(void) | |||
2816 | /* | 2832 | /* |
2817 | * Thanks to the entanglement with vfs inode locking, we can't walk | 2833 | * Thanks to the entanglement with vfs inode locking, we can't walk |
2818 | * the existing cgroups under cgroup_mutex and create files. | 2834 | * the existing cgroups under cgroup_mutex and create files. |
2819 | * Instead, we use cgroup_for_each_descendant_pre() and drop RCU | 2835 | * Instead, we use css_for_each_descendant_pre() and drop RCU read |
2820 | * read lock before calling cgroup_addrm_files(). | 2836 | * lock before calling cgroup_addrm_files(). |
2821 | */ | 2837 | */ |
2822 | mutex_lock(&cgroup_mutex); | 2838 | mutex_lock(&cgroup_mutex); |
2823 | } | 2839 | } |
2824 | 2840 | ||
2825 | static void cgroup_cfts_commit(struct cgroup_subsys *ss, | 2841 | static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) |
2826 | struct cftype *cfts, bool is_add) | ||
2827 | __releases(&cgroup_mutex) | 2842 | __releases(&cgroup_mutex) |
2828 | { | 2843 | { |
2829 | LIST_HEAD(pending); | 2844 | LIST_HEAD(pending); |
2830 | struct cgroup *cgrp, *root = &ss->root->top_cgroup; | 2845 | struct cgroup_subsys *ss = cfts[0].ss; |
2846 | struct cgroup *root = &ss->root->top_cgroup; | ||
2831 | struct super_block *sb = ss->root->sb; | 2847 | struct super_block *sb = ss->root->sb; |
2832 | struct dentry *prev = NULL; | 2848 | struct dentry *prev = NULL; |
2833 | struct inode *inode; | 2849 | struct inode *inode; |
2850 | struct cgroup_subsys_state *css; | ||
2834 | u64 update_before; | 2851 | u64 update_before; |
2852 | int ret = 0; | ||
2835 | 2853 | ||
2836 | /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ | 2854 | /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ |
2837 | if (!cfts || ss->root == &cgroup_dummy_root || | 2855 | if (!cfts || ss->root == &cgroup_dummy_root || |
2838 | !atomic_inc_not_zero(&sb->s_active)) { | 2856 | !atomic_inc_not_zero(&sb->s_active)) { |
2839 | mutex_unlock(&cgroup_mutex); | 2857 | mutex_unlock(&cgroup_mutex); |
2840 | return; | 2858 | return 0; |
2841 | } | 2859 | } |
2842 | 2860 | ||
2843 | /* | 2861 | /* |
@@ -2849,17 +2867,11 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, | |||
2849 | 2867 | ||
2850 | mutex_unlock(&cgroup_mutex); | 2868 | mutex_unlock(&cgroup_mutex); |
2851 | 2869 | ||
2852 | /* @root always needs to be updated */ | ||
2853 | inode = root->dentry->d_inode; | ||
2854 | mutex_lock(&inode->i_mutex); | ||
2855 | mutex_lock(&cgroup_mutex); | ||
2856 | cgroup_addrm_files(root, ss, cfts, is_add); | ||
2857 | mutex_unlock(&cgroup_mutex); | ||
2858 | mutex_unlock(&inode->i_mutex); | ||
2859 | |||
2860 | /* add/rm files for all cgroups created before */ | 2870 | /* add/rm files for all cgroups created before */ |
2861 | rcu_read_lock(); | 2871 | rcu_read_lock(); |
2862 | cgroup_for_each_descendant_pre(cgrp, root) { | 2872 | css_for_each_descendant_pre(css, cgroup_css(root, ss)) { |
2873 | struct cgroup *cgrp = css->cgroup; | ||
2874 | |||
2863 | if (cgroup_is_dead(cgrp)) | 2875 | if (cgroup_is_dead(cgrp)) |
2864 | continue; | 2876 | continue; |
2865 | 2877 | ||
@@ -2873,15 +2885,18 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, | |||
2873 | mutex_lock(&inode->i_mutex); | 2885 | mutex_lock(&inode->i_mutex); |
2874 | mutex_lock(&cgroup_mutex); | 2886 | mutex_lock(&cgroup_mutex); |
2875 | if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) | 2887 | if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) |
2876 | cgroup_addrm_files(cgrp, ss, cfts, is_add); | 2888 | ret = cgroup_addrm_files(cgrp, cfts, is_add); |
2877 | mutex_unlock(&cgroup_mutex); | 2889 | mutex_unlock(&cgroup_mutex); |
2878 | mutex_unlock(&inode->i_mutex); | 2890 | mutex_unlock(&inode->i_mutex); |
2879 | 2891 | ||
2880 | rcu_read_lock(); | 2892 | rcu_read_lock(); |
2893 | if (ret) | ||
2894 | break; | ||
2881 | } | 2895 | } |
2882 | rcu_read_unlock(); | 2896 | rcu_read_unlock(); |
2883 | dput(prev); | 2897 | dput(prev); |
2884 | deactivate_super(sb); | 2898 | deactivate_super(sb); |
2899 | return ret; | ||
2885 | } | 2900 | } |
2886 | 2901 | ||
2887 | /** | 2902 | /** |
@@ -2901,49 +2916,56 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, | |||
2901 | int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | 2916 | int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) |
2902 | { | 2917 | { |
2903 | struct cftype_set *set; | 2918 | struct cftype_set *set; |
2919 | struct cftype *cft; | ||
2920 | int ret; | ||
2904 | 2921 | ||
2905 | set = kzalloc(sizeof(*set), GFP_KERNEL); | 2922 | set = kzalloc(sizeof(*set), GFP_KERNEL); |
2906 | if (!set) | 2923 | if (!set) |
2907 | return -ENOMEM; | 2924 | return -ENOMEM; |
2908 | 2925 | ||
2926 | for (cft = cfts; cft->name[0] != '\0'; cft++) | ||
2927 | cft->ss = ss; | ||
2928 | |||
2909 | cgroup_cfts_prepare(); | 2929 | cgroup_cfts_prepare(); |
2910 | set->cfts = cfts; | 2930 | set->cfts = cfts; |
2911 | list_add_tail(&set->node, &ss->cftsets); | 2931 | list_add_tail(&set->node, &ss->cftsets); |
2912 | cgroup_cfts_commit(ss, cfts, true); | 2932 | ret = cgroup_cfts_commit(cfts, true); |
2913 | 2933 | if (ret) | |
2914 | return 0; | 2934 | cgroup_rm_cftypes(cfts); |
2935 | return ret; | ||
2915 | } | 2936 | } |
2916 | EXPORT_SYMBOL_GPL(cgroup_add_cftypes); | 2937 | EXPORT_SYMBOL_GPL(cgroup_add_cftypes); |
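
A hedged registration sketch for cgroup_add_cftypes(): the subsystem, file name, and callback below are assumptions, but the zero-named terminator and the error handling follow the contract above (a failed commit already removes the set again via cgroup_rm_cftypes()):

	/* hypothetical controller files; the empty entry terminates the array */
	static struct cftype my_ss_files[] = {
		{
			.name = "stat",
			.read_seq_string = my_ss_stat_show,	/* assumed helper */
		},
		{ }	/* terminate */
	};

	/* typically registered from the controller's init path */
	ret = cgroup_add_cftypes(&my_subsys, my_ss_files);
	if (ret)
		pr_warn("my_subsys: failed to add cftypes: %d\n", ret);
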
2917 | 2938 | ||
2918 | /** | 2939 | /** |
2919 | * cgroup_rm_cftypes - remove an array of cftypes from a subsystem | 2940 | * cgroup_rm_cftypes - remove an array of cftypes from a subsystem |
2920 | * @ss: target cgroup subsystem | ||
2921 | * @cfts: zero-length name terminated array of cftypes | 2941 | * @cfts: zero-length name terminated array of cftypes |
2922 | * | 2942 | * |
2923 | * Unregister @cfts from @ss. Files described by @cfts are removed from | 2943 | * Unregister @cfts. Files described by @cfts are removed from all |
2924 | * all existing cgroups to which @ss is attached and all future cgroups | 2944 | * existing cgroups and all future cgroups won't have them either. This |
2925 | * won't have them either. This function can be called anytime whether @ss | 2945 | * function can be called anytime whether @cfts' subsys is attached or not. |
2926 | * is attached or not. | ||
2927 | * | 2946 | * |
2928 | * Returns 0 on successful unregistration, -ENOENT if @cfts is not | 2947 | * Returns 0 on successful unregistration, -ENOENT if @cfts is not |
2929 | * registered with @ss. | 2948 | * registered. |
2930 | */ | 2949 | */ |
2931 | int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | 2950 | int cgroup_rm_cftypes(struct cftype *cfts) |
2932 | { | 2951 | { |
2933 | struct cftype_set *set; | 2952 | struct cftype_set *set; |
2934 | 2953 | ||
2954 | if (!cfts || !cfts[0].ss) | ||
2955 | return -ENOENT; | ||
2956 | |||
2935 | cgroup_cfts_prepare(); | 2957 | cgroup_cfts_prepare(); |
2936 | 2958 | ||
2937 | list_for_each_entry(set, &ss->cftsets, node) { | 2959 | list_for_each_entry(set, &cfts[0].ss->cftsets, node) { |
2938 | if (set->cfts == cfts) { | 2960 | if (set->cfts == cfts) { |
2939 | list_del(&set->node); | 2961 | list_del(&set->node); |
2940 | kfree(set); | 2962 | kfree(set); |
2941 | cgroup_cfts_commit(ss, cfts, false); | 2963 | cgroup_cfts_commit(cfts, false); |
2942 | return 0; | 2964 | return 0; |
2943 | } | 2965 | } |
2944 | } | 2966 | } |
2945 | 2967 | ||
2946 | cgroup_cfts_commit(ss, NULL, false); | 2968 | cgroup_cfts_commit(NULL, false); |
2947 | return -ENOENT; | 2969 | return -ENOENT; |
2948 | } | 2970 | } |
2949 | 2971 | ||
@@ -2966,34 +2988,10 @@ int cgroup_task_count(const struct cgroup *cgrp) | |||
2966 | } | 2988 | } |
2967 | 2989 | ||
2968 | /* | 2990 | /* |
2969 | * Advance a list_head iterator. The iterator should be positioned at | 2991 | * To reduce the fork() overhead for systems that are not actually using |
2970 | * the start of a css_set | 2992 | * their cgroups capability, we don't maintain the lists running through |
2971 | */ | 2993 | * each css_set to its tasks until we see the list actually used - in other |
2972 | static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) | 2994 | * words after the first call to css_task_iter_start(). |
2973 | { | ||
2974 | struct list_head *l = it->cset_link; | ||
2975 | struct cgrp_cset_link *link; | ||
2976 | struct css_set *cset; | ||
2977 | |||
2978 | /* Advance to the next non-empty css_set */ | ||
2979 | do { | ||
2980 | l = l->next; | ||
2981 | if (l == &cgrp->cset_links) { | ||
2982 | it->cset_link = NULL; | ||
2983 | return; | ||
2984 | } | ||
2985 | link = list_entry(l, struct cgrp_cset_link, cset_link); | ||
2986 | cset = link->cset; | ||
2987 | } while (list_empty(&cset->tasks)); | ||
2988 | it->cset_link = l; | ||
2989 | it->task = cset->tasks.next; | ||
2990 | } | ||
2991 | |||
2992 | /* | ||
2993 | * To reduce the fork() overhead for systems that are not actually | ||
2994 | * using their cgroups capability, we don't maintain the lists running | ||
2995 | * through each css_set to its tasks until we see the list actually | ||
2996 | * used - in other words after the first call to cgroup_iter_start(). | ||
2997 | */ | 2995 | */ |
2998 | static void cgroup_enable_task_cg_lists(void) | 2996 | static void cgroup_enable_task_cg_lists(void) |
2999 | { | 2997 | { |
@@ -3024,16 +3022,21 @@ static void cgroup_enable_task_cg_lists(void) | |||
3024 | } | 3022 | } |
3025 | 3023 | ||
3026 | /** | 3024 | /** |
3027 | * cgroup_next_sibling - find the next sibling of a given cgroup | 3025 | * css_next_child - find the next child of a given css |
3028 | * @pos: the current cgroup | 3026 | * @pos_css: the current position (%NULL to initiate traversal) |
3027 | * @parent_css: css whose children to walk | ||
3029 | * | 3028 | * |
3030 | * This function returns the next sibling of @pos and should be called | 3029 | * This function returns the next child of @parent_css and should be called |
3031 | * under RCU read lock. The only requirement is that @pos is accessible. | 3030 | * under RCU read lock. The only requirement is that @parent_css and |
3032 | * The next sibling is guaranteed to be returned regardless of @pos's | 3031 | * @pos_css are accessible. The next sibling is guaranteed to be returned |
3033 | * state. | 3032 | * regardless of their states. |
3034 | */ | 3033 | */ |
3035 | struct cgroup *cgroup_next_sibling(struct cgroup *pos) | 3034 | struct cgroup_subsys_state * |
3035 | css_next_child(struct cgroup_subsys_state *pos_css, | ||
3036 | struct cgroup_subsys_state *parent_css) | ||
3036 | { | 3037 | { |
3038 | struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; | ||
3039 | struct cgroup *cgrp = parent_css->cgroup; | ||
3037 | struct cgroup *next; | 3040 | struct cgroup *next; |
3038 | 3041 | ||
3039 | WARN_ON_ONCE(!rcu_read_lock_held()); | 3042 | WARN_ON_ONCE(!rcu_read_lock_held()); |
@@ -3048,78 +3051,81 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos) | |||
3048 | * safe to dereference from this RCU critical section. If | 3051 | * safe to dereference from this RCU critical section. If |
3049 | * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed | 3052 | * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed |
3050 | * to be visible as %true here. | 3053 | * to be visible as %true here. |
3054 | * | ||
3055 | * If @pos is dead, its next pointer can't be dereferenced; | ||
3056 | * however, as each cgroup is given a monotonically increasing | ||
3057 | * unique serial number and always appended to the sibling list, | ||
3058 | * the next one can be found by walking the parent's children until | ||
3059 | * we see a cgroup with higher serial number than @pos's. While | ||
3060 | * this path can be slower, it's taken only when either the current | ||
3061 | * cgroup is removed or iteration and removal race. | ||
3051 | */ | 3062 | */ |
3052 | if (likely(!cgroup_is_dead(pos))) { | 3063 | if (!pos) { |
3064 | next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); | ||
3065 | } else if (likely(!cgroup_is_dead(pos))) { | ||
3053 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); | 3066 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); |
3054 | if (&next->sibling != &pos->parent->children) | 3067 | } else { |
3055 | return next; | 3068 | list_for_each_entry_rcu(next, &cgrp->children, sibling) |
3056 | return NULL; | 3069 | if (next->serial_nr > pos->serial_nr) |
3070 | break; | ||
3057 | } | 3071 | } |
3058 | 3072 | ||
3059 | /* | 3073 | if (&next->sibling == &cgrp->children) |
3060 | * Can't dereference the next pointer. Each cgroup is given a | 3074 | return NULL; |
3061 | * monotonically increasing unique serial number and always | 3075 | |
3062 | * appended to the sibling list, so the next one can be found by | 3076 | return cgroup_css(next, parent_css->ss); |
3063 | * walking the parent's children until we see a cgroup with higher | ||
3064 | * serial number than @pos's. | ||
3065 | * | ||
3066 | * While this path can be slow, it's taken only when either the | ||
3067 | * current cgroup is removed or iteration and removal race. | ||
3068 | */ | ||
3069 | list_for_each_entry_rcu(next, &pos->parent->children, sibling) | ||
3070 | if (next->serial_nr > pos->serial_nr) | ||
3071 | return next; | ||
3072 | return NULL; | ||
3073 | } | 3077 | } |
3074 | EXPORT_SYMBOL_GPL(cgroup_next_sibling); | 3078 | EXPORT_SYMBOL_GPL(css_next_child); |
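
A minimal child walk built on css_next_child(), essentially what the css_for_each_child() helper used later in this patch expands to (parent_css and do_something() are assumptions):

	struct cgroup_subsys_state *child;

	rcu_read_lock();
	for (child = css_next_child(NULL, parent_css); child;
	     child = css_next_child(child, parent_css))
		do_something(child);	/* must not sleep inside the RCU section */
	rcu_read_unlock();
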
3075 | 3079 | ||
3076 | /** | 3080 | /** |
3077 | * cgroup_next_descendant_pre - find the next descendant for pre-order walk | 3081 | * css_next_descendant_pre - find the next descendant for pre-order walk |
3078 | * @pos: the current position (%NULL to initiate traversal) | 3082 | * @pos: the current position (%NULL to initiate traversal) |
3079 | * @cgroup: cgroup whose descendants to walk | 3083 | * @root: css whose descendants to walk |
3080 | * | 3084 | * |
3081 | * To be used by cgroup_for_each_descendant_pre(). Find the next | 3085 | * To be used by css_for_each_descendant_pre(). Find the next descendant |
3082 | * descendant to visit for pre-order traversal of @cgroup's descendants. | 3086 | * to visit for pre-order traversal of @root's descendants. @root is |
3087 | * included in the iteration and the first node to be visited. | ||
3083 | * | 3088 | * |
3084 | * While this function requires RCU read locking, it doesn't require the | 3089 | * While this function requires RCU read locking, it doesn't require the |
3085 | * whole traversal to be contained in a single RCU critical section. This | 3090 | * whole traversal to be contained in a single RCU critical section. This |
3086 | * function will return the correct next descendant as long as both @pos | 3091 | * function will return the correct next descendant as long as both @pos |
3087 | * and @cgroup are accessible and @pos is a descendant of @cgroup. | 3092 | * and @root are accessible and @pos is a descendant of @root. |
3088 | */ | 3093 | */ |
3089 | struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, | 3094 | struct cgroup_subsys_state * |
3090 | struct cgroup *cgroup) | 3095 | css_next_descendant_pre(struct cgroup_subsys_state *pos, |
3096 | struct cgroup_subsys_state *root) | ||
3091 | { | 3097 | { |
3092 | struct cgroup *next; | 3098 | struct cgroup_subsys_state *next; |
3093 | 3099 | ||
3094 | WARN_ON_ONCE(!rcu_read_lock_held()); | 3100 | WARN_ON_ONCE(!rcu_read_lock_held()); |
3095 | 3101 | ||
3096 | /* if first iteration, pretend we just visited @cgroup */ | 3102 | /* if first iteration, visit @root */ |
3097 | if (!pos) | 3103 | if (!pos) |
3098 | pos = cgroup; | 3104 | return root; |
3099 | 3105 | ||
3100 | /* visit the first child if exists */ | 3106 | /* visit the first child if exists */ |
3101 | next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); | 3107 | next = css_next_child(NULL, pos); |
3102 | if (next) | 3108 | if (next) |
3103 | return next; | 3109 | return next; |
3104 | 3110 | ||
3105 | /* no child, visit my or the closest ancestor's next sibling */ | 3111 | /* no child, visit my or the closest ancestor's next sibling */ |
3106 | while (pos != cgroup) { | 3112 | while (pos != root) { |
3107 | next = cgroup_next_sibling(pos); | 3113 | next = css_next_child(pos, css_parent(pos)); |
3108 | if (next) | 3114 | if (next) |
3109 | return next; | 3115 | return next; |
3110 | pos = pos->parent; | 3116 | pos = css_parent(pos); |
3111 | } | 3117 | } |
3112 | 3118 | ||
3113 | return NULL; | 3119 | return NULL; |
3114 | } | 3120 | } |
3115 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); | 3121 | EXPORT_SYMBOL_GPL(css_next_descendant_pre); |
3116 | 3122 | ||
3117 | /** | 3123 | /** |
3118 | * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup | 3124 | * css_rightmost_descendant - return the rightmost descendant of a css |
3119 | * @pos: cgroup of interest | 3125 | * @pos: css of interest |
3120 | * | 3126 | * |
3121 | * Return the rightmost descendant of @pos. If there's no descendant, | 3127 | * Return the rightmost descendant of @pos. If there's no descendant, @pos |
3122 | * @pos is returned. This can be used during pre-order traversal to skip | 3128 | * is returned. This can be used during pre-order traversal to skip |
3123 | * subtree of @pos. | 3129 | * subtree of @pos. |
3124 | * | 3130 | * |
3125 | * While this function requires RCU read locking, it doesn't require the | 3131 | * While this function requires RCU read locking, it doesn't require the |
@@ -3127,9 +3133,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); | |||
3127 | * function will return the correct rightmost descendant as long as @pos is | 3133 | * function will return the correct rightmost descendant as long as @pos is |
3128 | * accessible. | 3134 | * accessible. |
3129 | */ | 3135 | */ |
3130 | struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) | 3136 | struct cgroup_subsys_state * |
3137 | css_rightmost_descendant(struct cgroup_subsys_state *pos) | ||
3131 | { | 3138 | { |
3132 | struct cgroup *last, *tmp; | 3139 | struct cgroup_subsys_state *last, *tmp; |
3133 | 3140 | ||
3134 | WARN_ON_ONCE(!rcu_read_lock_held()); | 3141 | WARN_ON_ONCE(!rcu_read_lock_held()); |
3135 | 3142 | ||
@@ -3137,82 +3144,138 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) | |||
3137 | last = pos; | 3144 | last = pos; |
3138 | /* ->prev isn't RCU safe, walk ->next till the end */ | 3145 | /* ->prev isn't RCU safe, walk ->next till the end */ |
3139 | pos = NULL; | 3146 | pos = NULL; |
3140 | list_for_each_entry_rcu(tmp, &last->children, sibling) | 3147 | css_for_each_child(tmp, last) |
3141 | pos = tmp; | 3148 | pos = tmp; |
3142 | } while (pos); | 3149 | } while (pos); |
3143 | 3150 | ||
3144 | return last; | 3151 | return last; |
3145 | } | 3152 | } |
3146 | EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); | 3153 | EXPORT_SYMBOL_GPL(css_rightmost_descendant); |
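
The typical use of css_rightmost_descendant() is subtree pruning during a pre-order walk; a sketch assuming the css_for_each_descendant_pre() helper and hypothetical should_visit()/process_css() callbacks:

	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root_css) {
		if (!should_visit(pos)) {
			/* jump to the subtree's last node; the next step leaves it */
			pos = css_rightmost_descendant(pos);
			continue;
		}
		process_css(pos);
	}
	rcu_read_unlock();
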
3147 | 3154 | ||
3148 | static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) | 3155 | static struct cgroup_subsys_state * |
3156 | css_leftmost_descendant(struct cgroup_subsys_state *pos) | ||
3149 | { | 3157 | { |
3150 | struct cgroup *last; | 3158 | struct cgroup_subsys_state *last; |
3151 | 3159 | ||
3152 | do { | 3160 | do { |
3153 | last = pos; | 3161 | last = pos; |
3154 | pos = list_first_or_null_rcu(&pos->children, struct cgroup, | 3162 | pos = css_next_child(NULL, pos); |
3155 | sibling); | ||
3156 | } while (pos); | 3163 | } while (pos); |
3157 | 3164 | ||
3158 | return last; | 3165 | return last; |
3159 | } | 3166 | } |
3160 | 3167 | ||
3161 | /** | 3168 | /** |
3162 | * cgroup_next_descendant_post - find the next descendant for post-order walk | 3169 | * css_next_descendant_post - find the next descendant for post-order walk |
3163 | * @pos: the current position (%NULL to initiate traversal) | 3170 | * @pos: the current position (%NULL to initiate traversal) |
3164 | * @cgroup: cgroup whose descendants to walk | 3171 | * @root: css whose descendants to walk |
3165 | * | 3172 | * |
3166 | * To be used by cgroup_for_each_descendant_post(). Find the next | 3173 | * To be used by css_for_each_descendant_post(). Find the next descendant |
3167 | * descendant to visit for post-order traversal of @cgroup's descendants. | 3174 | * to visit for post-order traversal of @root's descendants. @root is |
3175 | * included in the iteration and the last node to be visited. | ||
3168 | * | 3176 | * |
3169 | * While this function requires RCU read locking, it doesn't require the | 3177 | * While this function requires RCU read locking, it doesn't require the |
3170 | * whole traversal to be contained in a single RCU critical section. This | 3178 | * whole traversal to be contained in a single RCU critical section. This |
3171 | * function will return the correct next descendant as long as both @pos | 3179 | * function will return the correct next descendant as long as both @pos |
3172 | * and @cgroup are accessible and @pos is a descendant of @cgroup. | 3180 | * and @root are accessible and @pos is a descendant of @root. |
3173 | */ | 3181 | */ |
3174 | struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, | 3182 | struct cgroup_subsys_state * |
3175 | struct cgroup *cgroup) | 3183 | css_next_descendant_post(struct cgroup_subsys_state *pos, |
3184 | struct cgroup_subsys_state *root) | ||
3176 | { | 3185 | { |
3177 | struct cgroup *next; | 3186 | struct cgroup_subsys_state *next; |
3178 | 3187 | ||
3179 | WARN_ON_ONCE(!rcu_read_lock_held()); | 3188 | WARN_ON_ONCE(!rcu_read_lock_held()); |
3180 | 3189 | ||
3181 | /* if first iteration, visit the leftmost descendant */ | 3190 | /* if first iteration, visit the leftmost descendant */ |
3182 | if (!pos) { | 3191 | if (!pos) { |
3183 | next = cgroup_leftmost_descendant(cgroup); | 3192 | next = css_leftmost_descendant(root); |
3184 | return next != cgroup ? next : NULL; | 3193 | return next != root ? next : NULL; |
3185 | } | 3194 | } |
3186 | 3195 | ||
3196 | /* if we visited @root, we're done */ | ||
3197 | if (pos == root) | ||
3198 | return NULL; | ||
3199 | |||
3187 | /* if there's an unvisited sibling, visit its leftmost descendant */ | 3200 | /* if there's an unvisited sibling, visit its leftmost descendant */ |
3188 | next = cgroup_next_sibling(pos); | 3201 | next = css_next_child(pos, css_parent(pos)); |
3189 | if (next) | 3202 | if (next) |
3190 | return cgroup_leftmost_descendant(next); | 3203 | return css_leftmost_descendant(next); |
3191 | 3204 | ||
3192 | /* no sibling left, visit parent */ | 3205 | /* no sibling left, visit parent */ |
3193 | next = pos->parent; | 3206 | return css_parent(pos); |
3194 | return next != cgroup ? next : NULL; | 3207 | } |
3208 | EXPORT_SYMBOL_GPL(css_next_descendant_post); | ||
3209 | |||
3210 | /** | ||
3211 | * css_advance_task_iter - advance a task iterator to the next css_set | ||
3212 | * @it: the iterator to advance | ||
3213 | * | ||
3214 | * Advance @it to the next css_set to walk. | ||
3215 | */ | ||
3216 | static void css_advance_task_iter(struct css_task_iter *it) | ||
3217 | { | ||
3218 | struct list_head *l = it->cset_link; | ||
3219 | struct cgrp_cset_link *link; | ||
3220 | struct css_set *cset; | ||
3221 | |||
3222 | /* Advance to the next non-empty css_set */ | ||
3223 | do { | ||
3224 | l = l->next; | ||
3225 | if (l == &it->origin_css->cgroup->cset_links) { | ||
3226 | it->cset_link = NULL; | ||
3227 | return; | ||
3228 | } | ||
3229 | link = list_entry(l, struct cgrp_cset_link, cset_link); | ||
3230 | cset = link->cset; | ||
3231 | } while (list_empty(&cset->tasks)); | ||
3232 | it->cset_link = l; | ||
3233 | it->task = cset->tasks.next; | ||
3195 | } | 3234 | } |
3196 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); | ||
3197 | 3235 | ||
3198 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) | 3236 | /** |
3237 | * css_task_iter_start - initiate task iteration | ||
3238 | * @css: the css to walk tasks of | ||
3239 | * @it: the task iterator to use | ||
3240 | * | ||
3241 | * Initiate iteration through the tasks of @css. The caller can call | ||
3242 | * css_task_iter_next() to walk through the tasks until the function | ||
3243 | * returns NULL. On completion of iteration, css_task_iter_end() must be | ||
3244 | * called. | ||
3245 | * | ||
3246 | * Note that this function acquires a lock which is released when the | ||
3247 | * iteration finishes. The caller can't sleep while iteration is in | ||
3248 | * progress. | ||
3249 | */ | ||
3250 | void css_task_iter_start(struct cgroup_subsys_state *css, | ||
3251 | struct css_task_iter *it) | ||
3199 | __acquires(css_set_lock) | 3252 | __acquires(css_set_lock) |
3200 | { | 3253 | { |
3201 | /* | 3254 | /* |
3202 | * The first time anyone tries to iterate across a cgroup, | 3255 | * The first time anyone tries to iterate across a css, we need to |
3203 | * we need to enable the list linking each css_set to its | 3256 | * enable the list linking each css_set to its tasks, and fix up |
3204 | * tasks, and fix up all existing tasks. | 3257 | * all existing tasks. |
3205 | */ | 3258 | */ |
3206 | if (!use_task_css_set_links) | 3259 | if (!use_task_css_set_links) |
3207 | cgroup_enable_task_cg_lists(); | 3260 | cgroup_enable_task_cg_lists(); |
3208 | 3261 | ||
3209 | read_lock(&css_set_lock); | 3262 | read_lock(&css_set_lock); |
3210 | it->cset_link = &cgrp->cset_links; | 3263 | |
3211 | cgroup_advance_iter(cgrp, it); | 3264 | it->origin_css = css; |
3265 | it->cset_link = &css->cgroup->cset_links; | ||
3266 | |||
3267 | css_advance_task_iter(it); | ||
3212 | } | 3268 | } |
3213 | 3269 | ||
3214 | struct task_struct *cgroup_iter_next(struct cgroup *cgrp, | 3270 | /** |
3215 | struct cgroup_iter *it) | 3271 | * css_task_iter_next - return the next task for the iterator |
3272 | * @it: the task iterator being iterated | ||
3273 | * | ||
3274 | * The "next" function for task iteration. @it should have been | ||
3275 | * initialized via css_task_iter_start(). Returns NULL when the iteration | ||
3276 | * reaches the end. | ||
3277 | */ | ||
3278 | struct task_struct *css_task_iter_next(struct css_task_iter *it) | ||
3216 | { | 3279 | { |
3217 | struct task_struct *res; | 3280 | struct task_struct *res; |
3218 | struct list_head *l = it->task; | 3281 | struct list_head *l = it->task; |
@@ -3226,16 +3289,24 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, | |||
3226 | l = l->next; | 3289 | l = l->next; |
3227 | link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); | 3290 | link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); |
3228 | if (l == &link->cset->tasks) { | 3291 | if (l == &link->cset->tasks) { |
3229 | /* We reached the end of this task list - move on to | 3292 | /* |
3230 | * the next cg_cgroup_link */ | 3293 | * We reached the end of this task list - move on to the |
3231 | cgroup_advance_iter(cgrp, it); | 3294 | * next cgrp_cset_link. |
3295 | */ | ||
3296 | css_advance_task_iter(it); | ||
3232 | } else { | 3297 | } else { |
3233 | it->task = l; | 3298 | it->task = l; |
3234 | } | 3299 | } |
3235 | return res; | 3300 | return res; |
3236 | } | 3301 | } |
3237 | 3302 | ||
3238 | void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) | 3303 | /** |
3304 | * css_task_iter_end - finish task iteration | ||
3305 | * @it: the task iterator to finish | ||
3306 | * | ||
3307 | * Finish task iteration started by css_task_iter_start(). | ||
3308 | */ | ||
3309 | void css_task_iter_end(struct css_task_iter *it) | ||
3239 | __releases(css_set_lock) | 3310 | __releases(css_set_lock) |
3240 | { | 3311 | { |
3241 | read_unlock(&css_set_lock); | 3312 | read_unlock(&css_set_lock); |
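
The three iterator entry points above are meant to be used together; a minimal sketch counting the tasks attached to a css (css_set_lock is read-held for the whole loop, so nothing in it may sleep):

	struct css_task_iter it;
	struct task_struct *task;
	int nr = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		nr++;
	css_task_iter_end(&it);
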
@@ -3276,46 +3347,49 @@ static inline int started_after(void *p1, void *p2) | |||
3276 | } | 3347 | } |
3277 | 3348 | ||
3278 | /** | 3349 | /** |
3279 | * cgroup_scan_tasks - iterate through all the tasks in a cgroup | 3350 | * css_scan_tasks - iterate through all the tasks in a css |
3280 | * @scan: struct cgroup_scanner containing arguments for the scan | 3351 | * @css: the css to iterate tasks of |
3352 | * @test: optional test callback | ||
3353 | * @process: process callback | ||
3354 | * @data: data passed to @test and @process | ||
3355 | * @heap: optional pre-allocated heap used for task iteration | ||
3356 | * | ||
3357 | * Iterate through all the tasks in @css, calling @test for each, and if it | ||
3358 | * returns %true, call @process for it also. | ||
3359 | * | ||
3360 | * @test may be NULL, meaning always true (select all tasks), which | ||
3361 | * effectively duplicates css_task_iter_{start,next,end}() but does not | ||
3362 | * lock css_set_lock for the call to @process. | ||
3281 | * | 3363 | * |
3282 | * Arguments include pointers to callback functions test_task() and | 3364 | * It is guaranteed that @process will act on every task that is a member |
3283 | * process_task(). | 3365 | * of @css for the duration of this call. This function may or may not |
3284 | * Iterate through all the tasks in a cgroup, calling test_task() for each, | 3366 | * call @process for tasks that exit or move to a different css during the |
3285 | * and if it returns true, call process_task() for it also. | 3367 | * call, or are forked or move into the css during the call. |
3286 | * The test_task pointer may be NULL, meaning always true (select all tasks). | ||
3287 | * Effectively duplicates cgroup_iter_{start,next,end}() | ||
3288 | * but does not lock css_set_lock for the call to process_task(). | ||
3289 | * The struct cgroup_scanner may be embedded in any structure of the caller's | ||
3290 | * creation. | ||
3291 | * It is guaranteed that process_task() will act on every task that | ||
3292 | * is a member of the cgroup for the duration of this call. This | ||
3293 | * function may or may not call process_task() for tasks that exit | ||
3294 | * or move to a different cgroup during the call, or are forked or | ||
3295 | * move into the cgroup during the call. | ||
3296 | * | 3368 | * |
3297 | * Note that test_task() may be called with locks held, and may in some | 3369 | * Note that @test may be called with locks held, and may in some |
3298 | * situations be called multiple times for the same task, so it should | 3370 | * situations be called multiple times for the same task, so it should be |
3299 | * be cheap. | 3371 | * cheap. |
3300 | * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been | 3372 | * |
3301 | * pre-allocated and will be used for heap operations (and its "gt" member will | 3373 | * If @heap is non-NULL, a heap has been pre-allocated and will be used for |
3302 | * be overwritten), else a temporary heap will be used (allocation of which | 3374 | * heap operations (and its "gt" member will be overwritten), else a |
3303 | * may cause this function to fail). | 3375 | * temporary heap will be used (allocation of which may cause this function |
3376 | * to fail). | ||
3304 | */ | 3377 | */ |
3305 | int cgroup_scan_tasks(struct cgroup_scanner *scan) | 3378 | int css_scan_tasks(struct cgroup_subsys_state *css, |
3379 | bool (*test)(struct task_struct *, void *), | ||
3380 | void (*process)(struct task_struct *, void *), | ||
3381 | void *data, struct ptr_heap *heap) | ||
3306 | { | 3382 | { |
3307 | int retval, i; | 3383 | int retval, i; |
3308 | struct cgroup_iter it; | 3384 | struct css_task_iter it; |
3309 | struct task_struct *p, *dropped; | 3385 | struct task_struct *p, *dropped; |
3310 | /* Never dereference latest_task, since it's not refcounted */ | 3386 | /* Never dereference latest_task, since it's not refcounted */ |
3311 | struct task_struct *latest_task = NULL; | 3387 | struct task_struct *latest_task = NULL; |
3312 | struct ptr_heap tmp_heap; | 3388 | struct ptr_heap tmp_heap; |
3313 | struct ptr_heap *heap; | ||
3314 | struct timespec latest_time = { 0, 0 }; | 3389 | struct timespec latest_time = { 0, 0 }; |
3315 | 3390 | ||
3316 | if (scan->heap) { | 3391 | if (heap) { |
3317 | /* The caller supplied our heap and pre-allocated its memory */ | 3392 | /* The caller supplied our heap and pre-allocated its memory */ |
3318 | heap = scan->heap; | ||
3319 | heap->gt = &started_after; | 3393 | heap->gt = &started_after; |
3320 | } else { | 3394 | } else { |
3321 | /* We need to allocate our own heap memory */ | 3395 | /* We need to allocate our own heap memory */ |
@@ -3328,25 +3402,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
3328 | 3402 | ||
3329 | again: | 3403 | again: |
3330 | /* | 3404 | /* |
3331 | * Scan tasks in the cgroup, using the scanner's "test_task" callback | 3405 | * Scan tasks in the css, using the @test callback to determine |
3332 | * to determine which are of interest, and using the scanner's | 3406 | * which are of interest, and invoking @process callback on the |
3333 | * "process_task" callback to process any of them that need an update. | 3407 | * ones which need an update. Since we don't want to hold any |
3334 | * Since we don't want to hold any locks during the task updates, | 3408 | * locks during the task updates, gather tasks to be processed in a |
3335 | * gather tasks to be processed in a heap structure. | 3409 | * heap structure. The heap is sorted by descending task start |
3336 | * The heap is sorted by descending task start time. | 3410 | * time. If the statically-sized heap fills up, we overflow tasks |
3337 | * If the statically-sized heap fills up, we overflow tasks that | 3411 | * that started later, and in future iterations only consider tasks |
3338 | * started later, and in future iterations only consider tasks that | 3412 | * that started after the latest task in the previous pass. This |
3339 | * started after the latest task in the previous pass. This | ||
3340 | * guarantees forward progress and that we don't miss any tasks. | 3413 | * guarantees forward progress and that we don't miss any tasks. |
3341 | */ | 3414 | */ |
3342 | heap->size = 0; | 3415 | heap->size = 0; |
3343 | cgroup_iter_start(scan->cg, &it); | 3416 | css_task_iter_start(css, &it); |
3344 | while ((p = cgroup_iter_next(scan->cg, &it))) { | 3417 | while ((p = css_task_iter_next(&it))) { |
3345 | /* | 3418 | /* |
3346 | * Only affect tasks that qualify per the caller's callback, | 3419 | * Only affect tasks that qualify per the caller's callback, |
3347 | * if one was provided | 3420 | * if one was provided |
3348 | */ | 3421 | */ |
3349 | if (scan->test_task && !scan->test_task(p, scan)) | 3422 | if (test && !test(p, data)) |
3350 | continue; | 3423 | continue; |
3351 | /* | 3424 | /* |
3352 | * Only process tasks that started after the last task | 3425 | * Only process tasks that started after the last task |
@@ -3374,7 +3447,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
3374 | * the heap and wasn't inserted | 3447 | * the heap and wasn't inserted |
3375 | */ | 3448 | */ |
3376 | } | 3449 | } |
3377 | cgroup_iter_end(scan->cg, &it); | 3450 | css_task_iter_end(&it); |
3378 | 3451 | ||
3379 | if (heap->size) { | 3452 | if (heap->size) { |
3380 | for (i = 0; i < heap->size; i++) { | 3453 | for (i = 0; i < heap->size; i++) { |
@@ -3384,7 +3457,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
3384 | latest_task = q; | 3457 | latest_task = q; |
3385 | } | 3458 | } |
3386 | /* Process the task per the caller's callback */ | 3459 | /* Process the task per the caller's callback */ |
3387 | scan->process_task(q, scan); | 3460 | process(q, data); |
3388 | put_task_struct(q); | 3461 | put_task_struct(q); |
3389 | } | 3462 | } |
3390 | /* | 3463 | /* |
@@ -3401,10 +3474,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
3401 | return 0; | 3474 | return 0; |
3402 | } | 3475 | } |
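
cgroup_transfer_tasks() below is the in-tree user of the NULL-@test form. A hedged sketch of the filtered form, with assumed callbacks that boost the priority of positive-nice tasks:

	static bool boost_test(struct task_struct *task, void *data)
	{
		return task_nice(task) > 0;	/* keep it cheap, may run under locks */
	}

	static void boost_process(struct task_struct *task, void *data)
	{
		set_user_nice(task, *(long *)data);	/* runs without css_set_lock */
	}

	/* ... */
	long nice = 0;
	ret = css_scan_tasks(css, boost_test, boost_process, &nice, NULL);
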
3403 | 3476 | ||
3404 | static void cgroup_transfer_one_task(struct task_struct *task, | 3477 | static void cgroup_transfer_one_task(struct task_struct *task, void *data) |
3405 | struct cgroup_scanner *scan) | ||
3406 | { | 3478 | { |
3407 | struct cgroup *new_cgroup = scan->data; | 3479 | struct cgroup *new_cgroup = data; |
3408 | 3480 | ||
3409 | mutex_lock(&cgroup_mutex); | 3481 | mutex_lock(&cgroup_mutex); |
3410 | cgroup_attach_task(new_cgroup, task, false); | 3482 | cgroup_attach_task(new_cgroup, task, false); |
@@ -3418,15 +3490,8 @@ static void cgroup_transfer_one_task(struct task_struct *task, | |||
3418 | */ | 3490 | */ |
3419 | int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | 3491 | int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) |
3420 | { | 3492 | { |
3421 | struct cgroup_scanner scan; | 3493 | return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task, |
3422 | 3494 | to, NULL); | |
3423 | scan.cg = from; | ||
3424 | scan.test_task = NULL; /* select all tasks in cgroup */ | ||
3425 | scan.process_task = cgroup_transfer_one_task; | ||
3426 | scan.heap = NULL; | ||
3427 | scan.data = to; | ||
3428 | |||
3429 | return cgroup_scan_tasks(&scan); | ||
3430 | } | 3495 | } |
3431 | 3496 | ||
3432 | /* | 3497 | /* |
@@ -3468,7 +3533,7 @@ struct cgroup_pidlist { | |||
3468 | /* pointer to the cgroup we belong to, for list removal purposes */ | 3533 | /* pointer to the cgroup we belong to, for list removal purposes */ |
3469 | struct cgroup *owner; | 3534 | struct cgroup *owner; |
3470 | /* protects the other fields */ | 3535 | /* protects the other fields */ |
3471 | struct rw_semaphore mutex; | 3536 | struct rw_semaphore rwsem; |
3472 | }; | 3537 | }; |
3473 | 3538 | ||
3474 | /* | 3539 | /* |
@@ -3541,7 +3606,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
3541 | struct pid_namespace *ns = task_active_pid_ns(current); | 3606 | struct pid_namespace *ns = task_active_pid_ns(current); |
3542 | 3607 | ||
3543 | /* | 3608 | /* |
3544 | * We can't drop the pidlist_mutex before taking the l->mutex in case | 3609 | * We can't drop the pidlist_mutex before taking the l->rwsem in case |
3545 | * the last ref-holder is trying to remove l from the list at the same | 3610 | * the last ref-holder is trying to remove l from the list at the same |
3546 | * time. Holding the pidlist_mutex precludes somebody taking whichever | 3611 | * time. Holding the pidlist_mutex precludes somebody taking whichever |
3547 | * list we find out from under us - compare release_pid_array(). | 3612 | * list we find out from under us - compare release_pid_array(). |
@@ -3550,7 +3615,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
3550 | list_for_each_entry(l, &cgrp->pidlists, links) { | 3615 | list_for_each_entry(l, &cgrp->pidlists, links) { |
3551 | if (l->key.type == type && l->key.ns == ns) { | 3616 | if (l->key.type == type && l->key.ns == ns) { |
3552 | /* make sure l doesn't vanish out from under us */ | 3617 | /* make sure l doesn't vanish out from under us */ |
3553 | down_write(&l->mutex); | 3618 | down_write(&l->rwsem); |
3554 | mutex_unlock(&cgrp->pidlist_mutex); | 3619 | mutex_unlock(&cgrp->pidlist_mutex); |
3555 | return l; | 3620 | return l; |
3556 | } | 3621 | } |
@@ -3561,8 +3626,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
3561 | mutex_unlock(&cgrp->pidlist_mutex); | 3626 | mutex_unlock(&cgrp->pidlist_mutex); |
3562 | return l; | 3627 | return l; |
3563 | } | 3628 | } |
3564 | init_rwsem(&l->mutex); | 3629 | init_rwsem(&l->rwsem); |
3565 | down_write(&l->mutex); | 3630 | down_write(&l->rwsem); |
3566 | l->key.type = type; | 3631 | l->key.type = type; |
3567 | l->key.ns = get_pid_ns(ns); | 3632 | l->key.ns = get_pid_ns(ns); |
3568 | l->owner = cgrp; | 3633 | l->owner = cgrp; |
@@ -3580,7 +3645,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
3580 | pid_t *array; | 3645 | pid_t *array; |
3581 | int length; | 3646 | int length; |
3582 | int pid, n = 0; /* used for populating the array */ | 3647 | int pid, n = 0; /* used for populating the array */ |
3583 | struct cgroup_iter it; | 3648 | struct css_task_iter it; |
3584 | struct task_struct *tsk; | 3649 | struct task_struct *tsk; |
3585 | struct cgroup_pidlist *l; | 3650 | struct cgroup_pidlist *l; |
3586 | 3651 | ||
@@ -3595,8 +3660,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
3595 | if (!array) | 3660 | if (!array) |
3596 | return -ENOMEM; | 3661 | return -ENOMEM; |
3597 | /* now, populate the array */ | 3662 | /* now, populate the array */ |
3598 | cgroup_iter_start(cgrp, &it); | 3663 | css_task_iter_start(&cgrp->dummy_css, &it); |
3599 | while ((tsk = cgroup_iter_next(cgrp, &it))) { | 3664 | while ((tsk = css_task_iter_next(&it))) { |
3600 | if (unlikely(n == length)) | 3665 | if (unlikely(n == length)) |
3601 | break; | 3666 | break; |
3602 | /* get tgid or pid for procs or tasks file respectively */ | 3667 | /* get tgid or pid for procs or tasks file respectively */ |
@@ -3607,7 +3672,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
3607 | if (pid > 0) /* make sure to only use valid results */ | 3672 | if (pid > 0) /* make sure to only use valid results */ |
3608 | array[n++] = pid; | 3673 | array[n++] = pid; |
3609 | } | 3674 | } |
3610 | cgroup_iter_end(cgrp, &it); | 3675 | css_task_iter_end(&it); |
3611 | length = n; | 3676 | length = n; |
3612 | /* now sort & (if procs) strip out duplicates */ | 3677 | /* now sort & (if procs) strip out duplicates */ |
3613 | sort(array, length, sizeof(pid_t), cmppid, NULL); | 3678 | sort(array, length, sizeof(pid_t), cmppid, NULL); |
@@ -3623,7 +3688,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
3623 | l->list = array; | 3688 | l->list = array; |
3624 | l->length = length; | 3689 | l->length = length; |
3625 | l->use_count++; | 3690 | l->use_count++; |
3626 | up_write(&l->mutex); | 3691 | up_write(&l->rwsem); |
3627 | *lp = l; | 3692 | *lp = l; |
3628 | return 0; | 3693 | return 0; |
3629 | } | 3694 | } |
@@ -3641,7 +3706,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
3641 | { | 3706 | { |
3642 | int ret = -EINVAL; | 3707 | int ret = -EINVAL; |
3643 | struct cgroup *cgrp; | 3708 | struct cgroup *cgrp; |
3644 | struct cgroup_iter it; | 3709 | struct css_task_iter it; |
3645 | struct task_struct *tsk; | 3710 | struct task_struct *tsk; |
3646 | 3711 | ||
3647 | /* | 3712 | /* |
@@ -3655,8 +3720,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
3655 | ret = 0; | 3720 | ret = 0; |
3656 | cgrp = dentry->d_fsdata; | 3721 | cgrp = dentry->d_fsdata; |
3657 | 3722 | ||
3658 | cgroup_iter_start(cgrp, &it); | 3723 | css_task_iter_start(&cgrp->dummy_css, &it); |
3659 | while ((tsk = cgroup_iter_next(cgrp, &it))) { | 3724 | while ((tsk = css_task_iter_next(&it))) { |
3660 | switch (tsk->state) { | 3725 | switch (tsk->state) { |
3661 | case TASK_RUNNING: | 3726 | case TASK_RUNNING: |
3662 | stats->nr_running++; | 3727 | stats->nr_running++; |
@@ -3676,7 +3741,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
3676 | break; | 3741 | break; |
3677 | } | 3742 | } |
3678 | } | 3743 | } |
3679 | cgroup_iter_end(cgrp, &it); | 3744 | css_task_iter_end(&it); |
3680 | 3745 | ||
3681 | err: | 3746 | err: |
3682 | return ret; | 3747 | return ret; |
@@ -3701,7 +3766,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) | |||
3701 | int index = 0, pid = *pos; | 3766 | int index = 0, pid = *pos; |
3702 | int *iter; | 3767 | int *iter; |
3703 | 3768 | ||
3704 | down_read(&l->mutex); | 3769 | down_read(&l->rwsem); |
3705 | if (pid) { | 3770 | if (pid) { |
3706 | int end = l->length; | 3771 | int end = l->length; |
3707 | 3772 | ||
@@ -3728,7 +3793,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) | |||
3728 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) | 3793 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) |
3729 | { | 3794 | { |
3730 | struct cgroup_pidlist *l = s->private; | 3795 | struct cgroup_pidlist *l = s->private; |
3731 | up_read(&l->mutex); | 3796 | up_read(&l->rwsem); |
3732 | } | 3797 | } |
3733 | 3798 | ||
3734 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) | 3799 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) |
@@ -3774,7 +3839,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) | |||
3774 | * pidlist_mutex, we have to take pidlist_mutex first. | 3839 | * pidlist_mutex, we have to take pidlist_mutex first. |
3775 | */ | 3840 | */ |
3776 | mutex_lock(&l->owner->pidlist_mutex); | 3841 | mutex_lock(&l->owner->pidlist_mutex); |
3777 | down_write(&l->mutex); | 3842 | down_write(&l->rwsem); |
3778 | BUG_ON(!l->use_count); | 3843 | BUG_ON(!l->use_count); |
3779 | if (!--l->use_count) { | 3844 | if (!--l->use_count) { |
3780 | /* we're the last user if refcount is 0; remove and free */ | 3845 | /* we're the last user if refcount is 0; remove and free */ |
@@ -3782,12 +3847,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) | |||
3782 | mutex_unlock(&l->owner->pidlist_mutex); | 3847 | mutex_unlock(&l->owner->pidlist_mutex); |
3783 | pidlist_free(l->list); | 3848 | pidlist_free(l->list); |
3784 | put_pid_ns(l->key.ns); | 3849 | put_pid_ns(l->key.ns); |
3785 | up_write(&l->mutex); | 3850 | up_write(&l->rwsem); |
3786 | kfree(l); | 3851 | kfree(l); |
3787 | return; | 3852 | return; |
3788 | } | 3853 | } |
3789 | mutex_unlock(&l->owner->pidlist_mutex); | 3854 | mutex_unlock(&l->owner->pidlist_mutex); |
3790 | up_write(&l->mutex); | 3855 | up_write(&l->rwsem); |
3791 | } | 3856 | } |
3792 | 3857 | ||
3793 | static int cgroup_pidlist_release(struct inode *inode, struct file *file) | 3858 | static int cgroup_pidlist_release(struct inode *inode, struct file *file) |
@@ -3851,21 +3916,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file) | |||
3851 | return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); | 3916 | return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); |
3852 | } | 3917 | } |
3853 | 3918 | ||
3854 | static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, | 3919 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, |
3855 | struct cftype *cft) | 3920 | struct cftype *cft) |
3856 | { | 3921 | { |
3857 | return notify_on_release(cgrp); | 3922 | return notify_on_release(css->cgroup); |
3858 | } | 3923 | } |
3859 | 3924 | ||
3860 | static int cgroup_write_notify_on_release(struct cgroup *cgrp, | 3925 | static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, |
3861 | struct cftype *cft, | 3926 | struct cftype *cft, u64 val) |
3862 | u64 val) | ||
3863 | { | 3927 | { |
3864 | clear_bit(CGRP_RELEASABLE, &cgrp->flags); | 3928 | clear_bit(CGRP_RELEASABLE, &css->cgroup->flags); |
3865 | if (val) | 3929 | if (val) |
3866 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 3930 | set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); |
3867 | else | 3931 | else |
3868 | clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 3932 | clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); |
3869 | return 0; | 3933 | return 0; |
3870 | } | 3934 | } |
3871 | 3935 | ||
@@ -3895,18 +3959,18 @@ static void cgroup_event_remove(struct work_struct *work) | |||
3895 | { | 3959 | { |
3896 | struct cgroup_event *event = container_of(work, struct cgroup_event, | 3960 | struct cgroup_event *event = container_of(work, struct cgroup_event, |
3897 | remove); | 3961 | remove); |
3898 | struct cgroup *cgrp = event->cgrp; | 3962 | struct cgroup_subsys_state *css = event->css; |
3899 | 3963 | ||
3900 | remove_wait_queue(event->wqh, &event->wait); | 3964 | remove_wait_queue(event->wqh, &event->wait); |
3901 | 3965 | ||
3902 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); | 3966 | event->cft->unregister_event(css, event->cft, event->eventfd); |
3903 | 3967 | ||
3904 | /* Notify userspace the event is going away. */ | 3968 | /* Notify userspace the event is going away. */ |
3905 | eventfd_signal(event->eventfd, 1); | 3969 | eventfd_signal(event->eventfd, 1); |
3906 | 3970 | ||
3907 | eventfd_ctx_put(event->eventfd); | 3971 | eventfd_ctx_put(event->eventfd); |
3908 | kfree(event); | 3972 | kfree(event); |
3909 | cgroup_dput(cgrp); | 3973 | css_put(css); |
3910 | } | 3974 | } |
3911 | 3975 | ||
3912 | /* | 3976 | /* |
@@ -3919,7 +3983,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | |||
3919 | { | 3983 | { |
3920 | struct cgroup_event *event = container_of(wait, | 3984 | struct cgroup_event *event = container_of(wait, |
3921 | struct cgroup_event, wait); | 3985 | struct cgroup_event, wait); |
3922 | struct cgroup *cgrp = event->cgrp; | 3986 | struct cgroup *cgrp = event->css->cgroup; |
3923 | unsigned long flags = (unsigned long)key; | 3987 | unsigned long flags = (unsigned long)key; |
3924 | 3988 | ||
3925 | if (flags & POLLHUP) { | 3989 | if (flags & POLLHUP) { |
@@ -3963,14 +4027,15 @@ static void cgroup_event_ptable_queue_proc(struct file *file, | |||
3963 | * Input must be in format '<event_fd> <control_fd> <args>'. | 4027 | * Input must be in format '<event_fd> <control_fd> <args>'. |
3964 | * Interpretation of args is defined by control file implementation. | 4028 | * Interpretation of args is defined by control file implementation. |
3965 | */ | 4029 | */ |
3966 | static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | 4030 | static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, |
3967 | const char *buffer) | 4031 | struct cftype *cft, const char *buffer) |
3968 | { | 4032 | { |
3969 | struct cgroup_event *event = NULL; | 4033 | struct cgroup *cgrp = dummy_css->cgroup; |
3970 | struct cgroup *cgrp_cfile; | 4034 | struct cgroup_event *event; |
4035 | struct cgroup_subsys_state *cfile_css; | ||
3971 | unsigned int efd, cfd; | 4036 | unsigned int efd, cfd; |
3972 | struct file *efile = NULL; | 4037 | struct file *efile; |
3973 | struct file *cfile = NULL; | 4038 | struct file *cfile; |
3974 | char *endp; | 4039 | char *endp; |
3975 | int ret; | 4040 | int ret; |
3976 | 4041 | ||
@@ -3987,7 +4052,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
3987 | event = kzalloc(sizeof(*event), GFP_KERNEL); | 4052 | event = kzalloc(sizeof(*event), GFP_KERNEL); |
3988 | if (!event) | 4053 | if (!event) |
3989 | return -ENOMEM; | 4054 | return -ENOMEM; |
3990 | event->cgrp = cgrp; | 4055 | |
3991 | INIT_LIST_HEAD(&event->list); | 4056 | INIT_LIST_HEAD(&event->list); |
3992 | init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); | 4057 | init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); |
3993 | init_waitqueue_func_entry(&event->wait, cgroup_event_wake); | 4058 | init_waitqueue_func_entry(&event->wait, cgroup_event_wake); |
@@ -3996,62 +4061,68 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
3996 | efile = eventfd_fget(efd); | 4061 | efile = eventfd_fget(efd); |
3997 | if (IS_ERR(efile)) { | 4062 | if (IS_ERR(efile)) { |
3998 | ret = PTR_ERR(efile); | 4063 | ret = PTR_ERR(efile); |
3999 | goto fail; | 4064 | goto out_kfree; |
4000 | } | 4065 | } |
4001 | 4066 | ||
4002 | event->eventfd = eventfd_ctx_fileget(efile); | 4067 | event->eventfd = eventfd_ctx_fileget(efile); |
4003 | if (IS_ERR(event->eventfd)) { | 4068 | if (IS_ERR(event->eventfd)) { |
4004 | ret = PTR_ERR(event->eventfd); | 4069 | ret = PTR_ERR(event->eventfd); |
4005 | goto fail; | 4070 | goto out_put_efile; |
4006 | } | 4071 | } |
4007 | 4072 | ||
4008 | cfile = fget(cfd); | 4073 | cfile = fget(cfd); |
4009 | if (!cfile) { | 4074 | if (!cfile) { |
4010 | ret = -EBADF; | 4075 | ret = -EBADF; |
4011 | goto fail; | 4076 | goto out_put_eventfd; |
4012 | } | 4077 | } |
4013 | 4078 | ||
4014 | /* the process needs read permission on the control file */ | 4079 | /* the process needs read permission on the control file */ |
4015 | /* AV: shouldn't we check that it's been opened for read instead? */ | 4080 | /* AV: shouldn't we check that it's been opened for read instead? */ |
4016 | ret = inode_permission(file_inode(cfile), MAY_READ); | 4081 | ret = inode_permission(file_inode(cfile), MAY_READ); |
4017 | if (ret < 0) | 4082 | if (ret < 0) |
4018 | goto fail; | 4083 | goto out_put_cfile; |
4019 | 4084 | ||
4020 | event->cft = __file_cft(cfile); | 4085 | event->cft = __file_cft(cfile); |
4021 | if (IS_ERR(event->cft)) { | 4086 | if (IS_ERR(event->cft)) { |
4022 | ret = PTR_ERR(event->cft); | 4087 | ret = PTR_ERR(event->cft); |
4023 | goto fail; | 4088 | goto out_put_cfile; |
4089 | } | ||
4090 | |||
4091 | if (!event->cft->ss) { | ||
4092 | ret = -EBADF; | ||
4093 | goto out_put_cfile; | ||
4024 | } | 4094 | } |
4025 | 4095 | ||
4026 | /* | 4096 | /* |
4027 | * The file to be monitored must be in the same cgroup as | 4097 | * Determine the css of @cfile, verify it belongs to the same |
4028 | * cgroup.event_control is. | 4098 | * cgroup as cgroup.event_control, and associate @event with it. |
4099 | * Remaining events are automatically removed on cgroup destruction | ||
4100 | * but the removal is asynchronous, so take an extra ref. | ||
4029 | */ | 4101 | */ |
4030 | cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); | 4102 | rcu_read_lock(); |
4031 | if (cgrp_cfile != cgrp) { | 4103 | |
4032 | ret = -EINVAL; | 4104 | ret = -EINVAL; |
4033 | goto fail; | 4105 | event->css = cgroup_css(cgrp, event->cft->ss); |
4034 | } | 4106 | cfile_css = css_from_dir(cfile->f_dentry->d_parent, event->cft->ss); |
4107 | if (event->css && event->css == cfile_css && css_tryget(event->css)) | ||
4108 | ret = 0; | ||
4109 | |||
4110 | rcu_read_unlock(); | ||
4111 | if (ret) | ||
4112 | goto out_put_cfile; | ||
4035 | 4113 | ||
4036 | if (!event->cft->register_event || !event->cft->unregister_event) { | 4114 | if (!event->cft->register_event || !event->cft->unregister_event) { |
4037 | ret = -EINVAL; | 4115 | ret = -EINVAL; |
4038 | goto fail; | 4116 | goto out_put_css; |
4039 | } | 4117 | } |
4040 | 4118 | ||
4041 | ret = event->cft->register_event(cgrp, event->cft, | 4119 | ret = event->cft->register_event(event->css, event->cft, |
4042 | event->eventfd, buffer); | 4120 | event->eventfd, buffer); |
4043 | if (ret) | 4121 | if (ret) |
4044 | goto fail; | 4122 | goto out_put_css; |
4045 | 4123 | ||
4046 | efile->f_op->poll(efile, &event->pt); | 4124 | efile->f_op->poll(efile, &event->pt); |
4047 | 4125 | ||
4048 | /* | ||
4049 | * Events should be removed after rmdir of cgroup directory, but before | ||
4050 | * destroying subsystem state objects. Let's take reference to cgroup | ||
4051 | * directory dentry to do that. | ||
4052 | */ | ||
4053 | dget(cgrp->dentry); | ||
4054 | |||
4055 | spin_lock(&cgrp->event_list_lock); | 4126 | spin_lock(&cgrp->event_list_lock); |
4056 | list_add(&event->list, &cgrp->event_list); | 4127 | list_add(&event->list, &cgrp->event_list); |
4057 | spin_unlock(&cgrp->event_list_lock); | 4128 | spin_unlock(&cgrp->event_list_lock); |
@@ -4061,35 +4132,33 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
4061 | 4132 | ||
4062 | return 0; | 4133 | return 0; |
4063 | 4134 | ||
4064 | fail: | 4135 | out_put_css: |
4065 | if (cfile) | 4136 | css_put(event->css); |
4066 | fput(cfile); | 4137 | out_put_cfile: |
4067 | 4138 | fput(cfile); | |
4068 | if (event && event->eventfd && !IS_ERR(event->eventfd)) | 4139 | out_put_eventfd: |
4069 | eventfd_ctx_put(event->eventfd); | 4140 | eventfd_ctx_put(event->eventfd); |
4070 | 4141 | out_put_efile: | |
4071 | if (!IS_ERR_OR_NULL(efile)) | 4142 | fput(efile); |
4072 | fput(efile); | 4143 | out_kfree: |
4073 | |||
4074 | kfree(event); | 4144 | kfree(event); |
4075 | 4145 | ||
4076 | return ret; | 4146 | return ret; |
4077 | } | 4147 | } |
4078 | 4148 | ||
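For context on the interface this function parses: from userspace a notification is registered by writing "<event_fd> <control_fd> <args>" to cgroup.event_control, after which the kernel signals the eventfd. Below is a minimal userspace sketch, assuming a v1 memory hierarchy mounted at /sys/fs/cgroup/memory and an existing group named "demo" (both paths are assumptions, not taken from this patch); the third field is interpreted by the monitored control file, a usage threshold in bytes in the memory controller's case.

    /* sketch: register a memory usage threshold event and wait for it */
    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <fcntl.h>
    #include <sys/eventfd.h>

    int main(void)
    {
            const char *base = "/sys/fs/cgroup/memory/demo"; /* assumed path */
            char path[256], cmd[128];
            uint64_t count;
            int efd, cfd, ecfd;

            efd = eventfd(0, 0);                    /* fd the kernel will signal */

            snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", base);
            cfd = open(path, O_RDONLY);             /* control file to monitor */

            snprintf(path, sizeof(path), "%s/cgroup.event_control", base);
            ecfd = open(path, O_WRONLY);

            if (efd < 0 || cfd < 0 || ecfd < 0) {
                    perror("setup");
                    return 1;
            }

            /* "<event_fd> <control_fd> <threshold in bytes>" */
            snprintf(cmd, sizeof(cmd), "%d %d %d", efd, cfd, 4 * 1024 * 1024);
            if (write(ecfd, cmd, strlen(cmd)) < 0) {
                    perror("cgroup.event_control");
                    return 1;
            }

            /* blocks until the threshold is crossed */
            if (read(efd, &count, sizeof(count)) == sizeof(count))
                    printf("event fired %llu time(s)\n", (unsigned long long)count);
            return 0;
    }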
4079 | static u64 cgroup_clone_children_read(struct cgroup *cgrp, | 4149 | static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, |
4080 | struct cftype *cft) | 4150 | struct cftype *cft) |
4081 | { | 4151 | { |
4082 | return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); | 4152 | return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); |
4083 | } | 4153 | } |
4084 | 4154 | ||
4085 | static int cgroup_clone_children_write(struct cgroup *cgrp, | 4155 | static int cgroup_clone_children_write(struct cgroup_subsys_state *css, |
4086 | struct cftype *cft, | 4156 | struct cftype *cft, u64 val) |
4087 | u64 val) | ||
4088 | { | 4157 | { |
4089 | if (val) | 4158 | if (val) |
4090 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); | 4159 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); |
4091 | else | 4160 | else |
4092 | clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); | 4161 | clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); |
4093 | return 0; | 4162 | return 0; |
4094 | } | 4163 | } |
4095 | 4164 | ||
@@ -4148,36 +4217,34 @@ static struct cftype cgroup_base_files[] = { | |||
4148 | }; | 4217 | }; |
4149 | 4218 | ||
4150 | /** | 4219 | /** |
4151 | * cgroup_populate_dir - selectively creation of files in a directory | 4220 | * cgroup_populate_dir - create subsys files in a cgroup directory |
4152 | * @cgrp: target cgroup | 4221 | * @cgrp: target cgroup |
4153 | * @base_files: true if the base files should be added | ||
4154 | * @subsys_mask: mask of the subsystem ids whose files should be added | 4222 | * @subsys_mask: mask of the subsystem ids whose files should be added |
4223 | * | ||
4224 | * On failure, no file is added. | ||
4155 | */ | 4225 | */ |
4156 | static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, | 4226 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) |
4157 | unsigned long subsys_mask) | ||
4158 | { | 4227 | { |
4159 | int err; | ||
4160 | struct cgroup_subsys *ss; | 4228 | struct cgroup_subsys *ss; |
4161 | 4229 | int i, ret = 0; | |
4162 | if (base_files) { | ||
4163 | err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); | ||
4164 | if (err < 0) | ||
4165 | return err; | ||
4166 | } | ||
4167 | 4230 | ||
4168 | /* process cftsets of each subsystem */ | 4231 | /* process cftsets of each subsystem */ |
4169 | for_each_root_subsys(cgrp->root, ss) { | 4232 | for_each_subsys(ss, i) { |
4170 | struct cftype_set *set; | 4233 | struct cftype_set *set; |
4171 | if (!test_bit(ss->subsys_id, &subsys_mask)) | 4234 | |
4235 | if (!test_bit(i, &subsys_mask)) | ||
4172 | continue; | 4236 | continue; |
4173 | 4237 | ||
4174 | list_for_each_entry(set, &ss->cftsets, node) | 4238 | list_for_each_entry(set, &ss->cftsets, node) { |
4175 | cgroup_addrm_files(cgrp, ss, set->cfts, true); | 4239 | ret = cgroup_addrm_files(cgrp, set->cfts, true); |
4240 | if (ret < 0) | ||
4241 | goto err; | ||
4242 | } | ||
4176 | } | 4243 | } |
4177 | 4244 | ||
4178 | /* This cgroup is ready now */ | 4245 | /* This cgroup is ready now */ |
4179 | for_each_root_subsys(cgrp->root, ss) { | 4246 | for_each_root_subsys(cgrp->root, ss) { |
4180 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4247 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); |
4181 | struct css_id *id = rcu_dereference_protected(css->id, true); | 4248 | struct css_id *id = rcu_dereference_protected(css->id, true); |
4182 | 4249 | ||
4183 | /* | 4250 | /* |
@@ -4190,14 +4257,57 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, | |||
4190 | } | 4257 | } |
4191 | 4258 | ||
4192 | return 0; | 4259 | return 0; |
4260 | err: | ||
4261 | cgroup_clear_dir(cgrp, subsys_mask); | ||
4262 | return ret; | ||
4263 | } | ||
4264 | |||
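The new "On failure, no file is added" rule is the usual create-everything-or-roll-back idiom: any partial progress is undone on the error path, with cgroup_clear_dir() playing the role of the rollback. A userspace sketch of the same shape, with made-up file names and directory, might look like this:

    #include <stdio.h>
    #include <errno.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/stat.h>

    static const char *names[] = { "tasks", "notify_on_release", "procs" };
    #define NR_NAMES 3

    /* create all files in @dir, or none of them */
    static int populate_dir(const char *dir)
    {
            char path[256];
            int i, j, fd, ret;

            for (i = 0; i < NR_NAMES; i++) {
                    snprintf(path, sizeof(path), "%s/%s", dir, names[i]);
                    fd = open(path, O_CREAT | O_EXCL | O_WRONLY, 0644);
                    if (fd < 0)
                            goto err;       /* undo what was already created */
                    close(fd);
            }
            return 0;
    err:
            ret = -errno;
            for (j = 0; j < i; j++) {
                    snprintf(path, sizeof(path), "%s/%s", dir, names[j]);
                    unlink(path);
            }
            return ret;
    }

    int main(void)
    {
            mkdir("/tmp/populate-demo", 0755);
            return populate_dir("/tmp/populate-demo") ? 1 : 0;
    }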
4265 | /* | ||
4266 | * css destruction is four-stage process. | ||
4267 | * | ||
4268 | * 1. Destruction starts. Killing of the percpu_ref is initiated. | ||
4269 | * Implemented in kill_css(). | ||
4270 | * | ||
4271 | * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs | ||
4272 | * and thus css_tryget() is guaranteed to fail, the css can be offlined | ||
4273 | * by invoking offline_css(). After offlining, the base ref is put. | ||
4274 | * Implemented in css_killed_work_fn(). | ||
4275 | * | ||
4276 | * 3. When the percpu_ref reaches zero, the only possible remaining | ||
4277 | * accessors are inside RCU read sections. css_release() schedules the | ||
4278 | * RCU callback. | ||
4279 | * | ||
4280 | * 4. After the grace period, the css can be freed. Implemented in | ||
4281 | * css_free_work_fn(). | ||
4282 | * | ||
4283 | * It is actually hairier because both steps 2 and 4 require process context | ||
4284 | * and thus involve punting to css->destroy_work adding two additional | ||
4285 | * steps to the already complex sequence. | ||
4286 | */ | ||
4287 | static void css_free_work_fn(struct work_struct *work) | ||
4288 | { | ||
4289 | struct cgroup_subsys_state *css = | ||
4290 | container_of(work, struct cgroup_subsys_state, destroy_work); | ||
4291 | struct cgroup *cgrp = css->cgroup; | ||
4292 | |||
4293 | if (css->parent) | ||
4294 | css_put(css->parent); | ||
4295 | |||
4296 | css->ss->css_free(css); | ||
4297 | cgroup_dput(cgrp); | ||
4193 | } | 4298 | } |
4194 | 4299 | ||
4195 | static void css_dput_fn(struct work_struct *work) | 4300 | static void css_free_rcu_fn(struct rcu_head *rcu_head) |
4196 | { | 4301 | { |
4197 | struct cgroup_subsys_state *css = | 4302 | struct cgroup_subsys_state *css = |
4198 | container_of(work, struct cgroup_subsys_state, dput_work); | 4303 | container_of(rcu_head, struct cgroup_subsys_state, rcu_head); |
4199 | 4304 | ||
4200 | cgroup_dput(css->cgroup); | 4305 | /* |
4306 | * css holds an extra ref to @cgrp->dentry which is put on the last | ||
4307 | * css_put(). dput() requires process context which we don't have. | ||
4308 | */ | ||
4309 | INIT_WORK(&css->destroy_work, css_free_work_fn); | ||
4310 | schedule_work(&css->destroy_work); | ||
4201 | } | 4311 | } |
4202 | 4312 | ||
4203 | static void css_release(struct percpu_ref *ref) | 4313 | static void css_release(struct percpu_ref *ref) |
@@ -4205,49 +4315,47 @@ static void css_release(struct percpu_ref *ref) | |||
4205 | struct cgroup_subsys_state *css = | 4315 | struct cgroup_subsys_state *css = |
4206 | container_of(ref, struct cgroup_subsys_state, refcnt); | 4316 | container_of(ref, struct cgroup_subsys_state, refcnt); |
4207 | 4317 | ||
4208 | schedule_work(&css->dput_work); | 4318 | call_rcu(&css->rcu_head, css_free_rcu_fn); |
4209 | } | 4319 | } |
4210 | 4320 | ||
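The free path above chains css_release() -> call_rcu() -> schedule_work() because the final dput() needs process context while the last css_put() may not have it. A rough userspace analogue of that last hop, with invented names, pthreads standing in for the kernel workqueue, and the RCU grace period only noted in a comment, is:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_t worker;                /* stands in for the kernel workqueue */

    struct css_like {
            int refcnt;
    };

    /* ~ css_free_work_fn(): runs in a context that is allowed to block */
    static void *free_work_fn(void *arg)
    {
            struct css_like *css = arg;

            printf("worker: dropping parent ref and freeing css\n");
            free(css);
            return NULL;
    }

    /* ~ css_release()/css_free_rcu_fn(): may run where blocking is not allowed */
    static void release(struct css_like *css)
    {
            /*
             * The kernel inserts an RCU grace period (call_rcu) before this
             * point; the sketch skips it and only models the punt to a
             * separate thread of execution (schedule_work in the kernel).
             */
            pthread_create(&worker, NULL, free_work_fn, css);
    }

    static void css_put_like(struct css_like *css)
    {
            if (--css->refcnt == 0)
                    release(css);
    }

    int main(void)
    {
            struct css_like *css = calloc(1, sizeof(*css));

            css->refcnt = 2;
            css_put_like(css);      /* still referenced, nothing happens */
            css_put_like(css);      /* last ref: free is deferred */
            pthread_join(worker, NULL);
            return 0;
    }

Build with -pthread; the point is only that the dropper of the last reference never runs the blocking cleanup itself.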
4211 | static void init_cgroup_css(struct cgroup_subsys_state *css, | 4321 | static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, |
4212 | struct cgroup_subsys *ss, | 4322 | struct cgroup *cgrp) |
4213 | struct cgroup *cgrp) | ||
4214 | { | 4323 | { |
4215 | css->cgroup = cgrp; | 4324 | css->cgroup = cgrp; |
4325 | css->ss = ss; | ||
4216 | css->flags = 0; | 4326 | css->flags = 0; |
4217 | css->id = NULL; | 4327 | css->id = NULL; |
4218 | if (cgrp == cgroup_dummy_top) | 4328 | |
4329 | if (cgrp->parent) | ||
4330 | css->parent = cgroup_css(cgrp->parent, ss); | ||
4331 | else | ||
4219 | css->flags |= CSS_ROOT; | 4332 | css->flags |= CSS_ROOT; |
4220 | BUG_ON(cgrp->subsys[ss->subsys_id]); | ||
4221 | cgrp->subsys[ss->subsys_id] = css; | ||
4222 | 4333 | ||
4223 | /* | 4334 | BUG_ON(cgroup_css(cgrp, ss)); |
4224 | * css holds an extra ref to @cgrp->dentry which is put on the last | ||
4225 | * css_put(). dput() requires process context, which css_put() may | ||
4226 | * be called without. @css->dput_work will be used to invoke | ||
4227 | * dput() asynchronously from css_put(). | ||
4228 | */ | ||
4229 | INIT_WORK(&css->dput_work, css_dput_fn); | ||
4230 | } | 4335 | } |
4231 | 4336 | ||
4232 | /* invoke ->post_create() on a new CSS and mark it online if successful */ | 4337 | /* invoke ->css_online() on a new CSS and mark it online if successful */ |
4233 | static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | 4338 | static int online_css(struct cgroup_subsys_state *css) |
4234 | { | 4339 | { |
4340 | struct cgroup_subsys *ss = css->ss; | ||
4235 | int ret = 0; | 4341 | int ret = 0; |
4236 | 4342 | ||
4237 | lockdep_assert_held(&cgroup_mutex); | 4343 | lockdep_assert_held(&cgroup_mutex); |
4238 | 4344 | ||
4239 | if (ss->css_online) | 4345 | if (ss->css_online) |
4240 | ret = ss->css_online(cgrp); | 4346 | ret = ss->css_online(css); |
4241 | if (!ret) | 4347 | if (!ret) { |
4242 | cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; | 4348 | css->flags |= CSS_ONLINE; |
4349 | css->cgroup->nr_css++; | ||
4350 | rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); | ||
4351 | } | ||
4243 | return ret; | 4352 | return ret; |
4244 | } | 4353 | } |
4245 | 4354 | ||
4246 | /* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ | 4355 | /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ |
4247 | static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | 4356 | static void offline_css(struct cgroup_subsys_state *css) |
4248 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | ||
4249 | { | 4357 | { |
4250 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4358 | struct cgroup_subsys *ss = css->ss; |
4251 | 4359 | ||
4252 | lockdep_assert_held(&cgroup_mutex); | 4360 | lockdep_assert_held(&cgroup_mutex); |
4253 | 4361 | ||
@@ -4255,9 +4363,11 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
4255 | return; | 4363 | return; |
4256 | 4364 | ||
4257 | if (ss->css_offline) | 4365 | if (ss->css_offline) |
4258 | ss->css_offline(cgrp); | 4366 | ss->css_offline(css); |
4259 | 4367 | ||
4260 | cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; | 4368 | css->flags &= ~CSS_ONLINE; |
4369 | css->cgroup->nr_css--; | ||
4370 | RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); | ||
4261 | } | 4371 | } |
4262 | 4372 | ||
4263 | /* | 4373 | /* |
@@ -4271,6 +4381,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
4271 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | 4381 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, |
4272 | umode_t mode) | 4382 | umode_t mode) |
4273 | { | 4383 | { |
4384 | struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { }; | ||
4274 | struct cgroup *cgrp; | 4385 | struct cgroup *cgrp; |
4275 | struct cgroup_name *name; | 4386 | struct cgroup_name *name; |
4276 | struct cgroupfs_root *root = parent->root; | 4387 | struct cgroupfs_root *root = parent->root; |
@@ -4288,7 +4399,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4288 | goto err_free_cgrp; | 4399 | goto err_free_cgrp; |
4289 | rcu_assign_pointer(cgrp->name, name); | 4400 | rcu_assign_pointer(cgrp->name, name); |
4290 | 4401 | ||
4291 | cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); | 4402 | /* |
4403 | * Temporarily set the pointer to NULL, so idr_find() won't return | ||
4404 | * a half-baked cgroup. | ||
4405 | */ | ||
4406 | cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); | ||
4292 | if (cgrp->id < 0) | 4407 | if (cgrp->id < 0) |
4293 | goto err_free_name; | 4408 | goto err_free_name; |
4294 | 4409 | ||
@@ -4317,6 +4432,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4317 | cgrp->dentry = dentry; | 4432 | cgrp->dentry = dentry; |
4318 | 4433 | ||
4319 | cgrp->parent = parent; | 4434 | cgrp->parent = parent; |
4435 | cgrp->dummy_css.parent = &parent->dummy_css; | ||
4320 | cgrp->root = parent->root; | 4436 | cgrp->root = parent->root; |
4321 | 4437 | ||
4322 | if (notify_on_release(parent)) | 4438 | if (notify_on_release(parent)) |
@@ -4328,22 +4444,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4328 | for_each_root_subsys(root, ss) { | 4444 | for_each_root_subsys(root, ss) { |
4329 | struct cgroup_subsys_state *css; | 4445 | struct cgroup_subsys_state *css; |
4330 | 4446 | ||
4331 | css = ss->css_alloc(cgrp); | 4447 | css = ss->css_alloc(cgroup_css(parent, ss)); |
4332 | if (IS_ERR(css)) { | 4448 | if (IS_ERR(css)) { |
4333 | err = PTR_ERR(css); | 4449 | err = PTR_ERR(css); |
4334 | goto err_free_all; | 4450 | goto err_free_all; |
4335 | } | 4451 | } |
4452 | css_ar[ss->subsys_id] = css; | ||
4336 | 4453 | ||
4337 | err = percpu_ref_init(&css->refcnt, css_release); | 4454 | err = percpu_ref_init(&css->refcnt, css_release); |
4338 | if (err) { | 4455 | if (err) |
4339 | ss->css_free(cgrp); | ||
4340 | goto err_free_all; | 4456 | goto err_free_all; |
4341 | } | ||
4342 | 4457 | ||
4343 | init_cgroup_css(css, ss, cgrp); | 4458 | init_css(css, ss, cgrp); |
4344 | 4459 | ||
4345 | if (ss->use_id) { | 4460 | if (ss->use_id) { |
4346 | err = alloc_css_id(ss, parent, cgrp); | 4461 | err = alloc_css_id(css); |
4347 | if (err) | 4462 | if (err) |
4348 | goto err_free_all; | 4463 | goto err_free_all; |
4349 | } | 4464 | } |
@@ -4365,16 +4480,22 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4365 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); | 4480 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); |
4366 | root->number_of_cgroups++; | 4481 | root->number_of_cgroups++; |
4367 | 4482 | ||
4368 | /* each css holds a ref to the cgroup's dentry */ | 4483 | /* each css holds a ref to the cgroup's dentry and the parent css */ |
4369 | for_each_root_subsys(root, ss) | 4484 | for_each_root_subsys(root, ss) { |
4485 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; | ||
4486 | |||
4370 | dget(dentry); | 4487 | dget(dentry); |
4488 | css_get(css->parent); | ||
4489 | } | ||
4371 | 4490 | ||
4372 | /* hold a ref to the parent's dentry */ | 4491 | /* hold a ref to the parent's dentry */ |
4373 | dget(parent->dentry); | 4492 | dget(parent->dentry); |
4374 | 4493 | ||
4375 | /* creation succeeded, notify subsystems */ | 4494 | /* creation succeeded, notify subsystems */ |
4376 | for_each_root_subsys(root, ss) { | 4495 | for_each_root_subsys(root, ss) { |
4377 | err = online_css(ss, cgrp); | 4496 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; |
4497 | |||
4498 | err = online_css(css); | ||
4378 | if (err) | 4499 | if (err) |
4379 | goto err_destroy; | 4500 | goto err_destroy; |
4380 | 4501 | ||
@@ -4388,7 +4509,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4388 | } | 4509 | } |
4389 | } | 4510 | } |
4390 | 4511 | ||
4391 | err = cgroup_populate_dir(cgrp, true, root->subsys_mask); | 4512 | idr_replace(&root->cgroup_idr, cgrp, cgrp->id); |
4513 | |||
4514 | err = cgroup_addrm_files(cgrp, cgroup_base_files, true); | ||
4515 | if (err) | ||
4516 | goto err_destroy; | ||
4517 | |||
4518 | err = cgroup_populate_dir(cgrp, root->subsys_mask); | ||
4392 | if (err) | 4519 | if (err) |
4393 | goto err_destroy; | 4520 | goto err_destroy; |
4394 | 4521 | ||
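The idr_alloc(..., NULL, ...) at the top of cgroup_create() paired with the idr_replace() just above is the reserve-an-id-first, publish-the-object-last pattern: lookups that race with creation see NULL instead of a half-initialized cgroup. A small userspace sketch of the same idea, using a fixed-size table and C11 atomics in place of the kernel idr (all names invented), is:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define MAX_IDS 64

    struct obj {
            int id;
            char name[32];
    };

    /* table slot: NULL = free, or reserved but not yet published */
    static struct obj *_Atomic table[MAX_IDS];
    static int next_id = 1;         /* reservation would be locked in real code */

    static int reserve_id(void)                 /* ~ idr_alloc(.., NULL, ..) */
    {
            return next_id < MAX_IDS ? next_id++ : -1;
    }

    static void publish(int id, struct obj *o)  /* ~ idr_replace() */
    {
            atomic_store_explicit(&table[id], o, memory_order_release);
    }

    static struct obj *lookup(int id)           /* ~ idr_find() */
    {
            if (id <= 0 || id >= MAX_IDS)
                    return NULL;
            return atomic_load_explicit(&table[id], memory_order_acquire);
    }

    int main(void)
    {
            int id = reserve_id();              /* id visible, object not yet */
            struct obj *o;

            printf("lookup during setup: %p\n", (void *)lookup(id));

            o = calloc(1, sizeof(*o));          /* finish initialization ... */
            o->id = id;
            strcpy(o->name, "demo");
            publish(id, o);                     /* ... then expose it */

            printf("lookup after publish: %s\n", lookup(id)->name);
            return 0;
    }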
@@ -4399,18 +4526,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4399 | 4526 | ||
4400 | err_free_all: | 4527 | err_free_all: |
4401 | for_each_root_subsys(root, ss) { | 4528 | for_each_root_subsys(root, ss) { |
4402 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4529 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; |
4403 | 4530 | ||
4404 | if (css) { | 4531 | if (css) { |
4405 | percpu_ref_cancel_init(&css->refcnt); | 4532 | percpu_ref_cancel_init(&css->refcnt); |
4406 | ss->css_free(cgrp); | 4533 | ss->css_free(css); |
4407 | } | 4534 | } |
4408 | } | 4535 | } |
4409 | mutex_unlock(&cgroup_mutex); | 4536 | mutex_unlock(&cgroup_mutex); |
4410 | /* Release the reference count that we took on the superblock */ | 4537 | /* Release the reference count that we took on the superblock */ |
4411 | deactivate_super(sb); | 4538 | deactivate_super(sb); |
4412 | err_free_id: | 4539 | err_free_id: |
4413 | ida_simple_remove(&root->cgroup_ida, cgrp->id); | 4540 | idr_remove(&root->cgroup_idr, cgrp->id); |
4414 | err_free_name: | 4541 | err_free_name: |
4415 | kfree(rcu_dereference_raw(cgrp->name)); | 4542 | kfree(rcu_dereference_raw(cgrp->name)); |
4416 | err_free_cgrp: | 4543 | err_free_cgrp: |
@@ -4432,22 +4559,84 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
4432 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); | 4559 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); |
4433 | } | 4560 | } |
4434 | 4561 | ||
4435 | static void cgroup_css_killed(struct cgroup *cgrp) | 4562 | /* |
4563 | * This is called when the refcnt of a css is confirmed to be killed. | ||
4564 | * css_tryget() is now guaranteed to fail. | ||
4565 | */ | ||
4566 | static void css_killed_work_fn(struct work_struct *work) | ||
4436 | { | 4567 | { |
4437 | if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) | 4568 | struct cgroup_subsys_state *css = |
4438 | return; | 4569 | container_of(work, struct cgroup_subsys_state, destroy_work); |
4570 | struct cgroup *cgrp = css->cgroup; | ||
4439 | 4571 | ||
4440 | /* percpu ref's of all css's are killed, kick off the next step */ | 4572 | mutex_lock(&cgroup_mutex); |
4441 | INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); | 4573 | |
4442 | schedule_work(&cgrp->destroy_work); | 4574 | /* |
4575 | * css_tryget() is guaranteed to fail now. Tell subsystems to | ||
4576 | * initiate destruction. | ||
4577 | */ | ||
4578 | offline_css(css); | ||
4579 | |||
4580 | /* | ||
4581 | * If @cgrp is marked dead, it's waiting for refs of all css's to | ||
4582 | * be disabled before proceeding to the second phase of cgroup | ||
4583 | * destruction. If we are the last one, kick it off. | ||
4584 | */ | ||
4585 | if (!cgrp->nr_css && cgroup_is_dead(cgrp)) | ||
4586 | cgroup_destroy_css_killed(cgrp); | ||
4587 | |||
4588 | mutex_unlock(&cgroup_mutex); | ||
4589 | |||
4590 | /* | ||
4591 | * Put the css refs from kill_css(). Each css holds an extra | ||
4592 | * reference to the cgroup's dentry and cgroup removal proceeds | ||
4593 | * regardless of css refs. On the last put of each css, whenever | ||
4594 | * that may be, the extra dentry ref is put so that dentry | ||
4595 | * destruction happens only after all css's are released. | ||
4596 | */ | ||
4597 | css_put(css); | ||
4443 | } | 4598 | } |
4444 | 4599 | ||
4445 | static void css_ref_killed_fn(struct percpu_ref *ref) | 4600 | /* css kill confirmation processing requires process context, bounce */ |
4601 | static void css_killed_ref_fn(struct percpu_ref *ref) | ||
4446 | { | 4602 | { |
4447 | struct cgroup_subsys_state *css = | 4603 | struct cgroup_subsys_state *css = |
4448 | container_of(ref, struct cgroup_subsys_state, refcnt); | 4604 | container_of(ref, struct cgroup_subsys_state, refcnt); |
4449 | 4605 | ||
4450 | cgroup_css_killed(css->cgroup); | 4606 | INIT_WORK(&css->destroy_work, css_killed_work_fn); |
4607 | schedule_work(&css->destroy_work); | ||
4608 | } | ||
4609 | |||
4610 | /** | ||
4611 | * kill_css - destroy a css | ||
4612 | * @css: css to destroy | ||
4613 | * | ||
4614 | * This function initiates destruction of @css by removing cgroup interface | ||
4615 | * files and putting its base reference. ->css_offline() will be invoked | ||
4616 | * asynchronously once css_tryget() is guaranteed to fail and when the | ||
4617 | * reference count reaches zero, @css will be released. | ||
4618 | */ | ||
4619 | static void kill_css(struct cgroup_subsys_state *css) | ||
4620 | { | ||
4621 | cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); | ||
4622 | |||
4623 | /* | ||
4624 | * Killing would put the base ref, but we need to keep it alive | ||
4625 | * until after ->css_offline(). | ||
4626 | */ | ||
4627 | css_get(css); | ||
4628 | |||
4629 | /* | ||
4630 | * cgroup core guarantees that, by the time ->css_offline() is | ||
4631 | * invoked, no new css reference will be given out via | ||
4632 | * css_tryget(). We can't simply call percpu_ref_kill() and | ||
4633 | * proceed to offlining css's because percpu_ref_kill() doesn't | ||
4634 | * guarantee that the ref is seen as killed on all CPUs on return. | ||
4635 | * | ||
4636 | * Use percpu_ref_kill_and_confirm() to get notifications as each | ||
4637 | * css is confirmed to be seen as killed on all CPUs. | ||
4638 | */ | ||
4639 | percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); | ||
4451 | } | 4640 | } |
4452 | 4641 | ||
4453 | /** | 4642 | /** |
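kill_css() leans on percpu_ref_kill_and_confirm() so that ->css_offline() only runs once no CPU can still succeed with css_tryget(). A much-simplified single-counter analogue is sketched below; the names are invented, and a real percpu_ref additionally needs an RCU grace period before the confirmation, which a shared atomic counter cannot show.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct simple_ref {
            atomic_long count;      /* starts at 1: the base reference */
            atomic_bool dying;
    };

    static void confirm_kill(struct simple_ref *ref)   /* ~ css_killed_ref_fn() */
    {
            printf("tryget can no longer succeed; safe to offline\n");
    }

    static void release(struct simple_ref *ref)        /* ~ css_release() */
    {
            printf("last reference dropped; free the object\n");
    }

    static bool ref_tryget(struct simple_ref *ref)
    {
            if (atomic_load(&ref->dying))
                    return false;                      /* ~ css_tryget() failing */
            atomic_fetch_add(&ref->count, 1);
            return true;
    }

    static void ref_put(struct simple_ref *ref)
    {
            if (atomic_fetch_sub(&ref->count, 1) == 1)
                    release(ref);
    }

    static void ref_kill_and_confirm(struct simple_ref *ref)
    {
            atomic_store(&ref->dying, true);
            /*
             * percpu_ref must also wait for every CPU to observe the switch
             * away from per-CPU counting before confirming; with one shared
             * atomic the confirmation can happen right away.
             */
            confirm_kill(ref);
            ref_put(ref);                              /* killing drops the base ref */
    }

    int main(void)
    {
            struct simple_ref ref;

            atomic_store(&ref.count, 1);
            atomic_store(&ref.dying, false);

            if (ref_tryget(&ref))
                    ref_put(&ref);                     /* normal get/put */

            ref_kill_and_confirm(&ref);                /* release() runs here */
            return 0;
    }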
@@ -4480,6 +4669,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4480 | struct dentry *d = cgrp->dentry; | 4669 | struct dentry *d = cgrp->dentry; |
4481 | struct cgroup_event *event, *tmp; | 4670 | struct cgroup_event *event, *tmp; |
4482 | struct cgroup_subsys *ss; | 4671 | struct cgroup_subsys *ss; |
4672 | struct cgroup *child; | ||
4483 | bool empty; | 4673 | bool empty; |
4484 | 4674 | ||
4485 | lockdep_assert_held(&d->d_inode->i_mutex); | 4675 | lockdep_assert_held(&d->d_inode->i_mutex); |
@@ -4490,47 +4680,41 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4490 | * @cgrp from being removed while __put_css_set() is in progress. | 4680 | * @cgrp from being removed while __put_css_set() is in progress. |
4491 | */ | 4681 | */ |
4492 | read_lock(&css_set_lock); | 4682 | read_lock(&css_set_lock); |
4493 | empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children); | 4683 | empty = list_empty(&cgrp->cset_links); |
4494 | read_unlock(&css_set_lock); | 4684 | read_unlock(&css_set_lock); |
4495 | if (!empty) | 4685 | if (!empty) |
4496 | return -EBUSY; | 4686 | return -EBUSY; |
4497 | 4687 | ||
4498 | /* | 4688 | /* |
4499 | * Block new css_tryget() by killing css refcnts. cgroup core | 4689 | * Make sure there are no live children. We can't test ->children |
4500 | * guarantees that, by the time ->css_offline() is invoked, no new | 4690 | * emptiness as dead children linger on it while being destroyed; |
4501 | * css reference will be given out via css_tryget(). We can't | 4691 | * otherwise, "rmdir parent/child parent" may fail with -EBUSY. |
4502 | * simply call percpu_ref_kill() and proceed to offlining css's | ||
4503 | * because percpu_ref_kill() doesn't guarantee that the ref is seen | ||
4504 | * as killed on all CPUs on return. | ||
4505 | * | ||
4506 | * Use percpu_ref_kill_and_confirm() to get notifications as each | ||
4507 | * css is confirmed to be seen as killed on all CPUs. The | ||
4508 | * notification callback keeps track of the number of css's to be | ||
4509 | * killed and schedules cgroup_offline_fn() to perform the rest of | ||
4510 | * destruction once the percpu refs of all css's are confirmed to | ||
4511 | * be killed. | ||
4512 | */ | 4692 | */ |
4513 | atomic_set(&cgrp->css_kill_cnt, 1); | 4693 | empty = true; |
4514 | for_each_root_subsys(cgrp->root, ss) { | 4694 | rcu_read_lock(); |
4515 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4695 | list_for_each_entry_rcu(child, &cgrp->children, sibling) { |
4516 | 4696 | empty = cgroup_is_dead(child); | |
4517 | /* | 4697 | if (!empty) |
4518 | * Killing would put the base ref, but we need to keep it | 4698 | break; |
4519 | * alive until after ->css_offline. | ||
4520 | */ | ||
4521 | percpu_ref_get(&css->refcnt); | ||
4522 | |||
4523 | atomic_inc(&cgrp->css_kill_cnt); | ||
4524 | percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn); | ||
4525 | } | 4699 | } |
4526 | cgroup_css_killed(cgrp); | 4700 | rcu_read_unlock(); |
4701 | if (!empty) | ||
4702 | return -EBUSY; | ||
4703 | |||
4704 | /* | ||
4705 | * Initiate massacre of all css's. cgroup_destroy_css_killed() | ||
4706 | * will be invoked to perform the rest of destruction once the | ||
4707 | * percpu refs of all css's are confirmed to be killed. | ||
4708 | */ | ||
4709 | for_each_root_subsys(cgrp->root, ss) | ||
4710 | kill_css(cgroup_css(cgrp, ss)); | ||
4527 | 4711 | ||
4528 | /* | 4712 | /* |
4529 | * Mark @cgrp dead. This prevents further task migration and child | 4713 | * Mark @cgrp dead. This prevents further task migration and child |
4530 | * creation by disabling cgroup_lock_live_group(). Note that | 4714 | * creation by disabling cgroup_lock_live_group(). Note that |
4531 | * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to | 4715 | * CGRP_DEAD assertion is depended upon by css_next_child() to |
4532 | * resume iteration after dropping RCU read lock. See | 4716 | * resume iteration after dropping RCU read lock. See |
4533 | * cgroup_next_sibling() for details. | 4717 | * css_next_child() for details. |
4534 | */ | 4718 | */ |
4535 | set_bit(CGRP_DEAD, &cgrp->flags); | 4719 | set_bit(CGRP_DEAD, &cgrp->flags); |
4536 | 4720 | ||
@@ -4541,9 +4725,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4541 | raw_spin_unlock(&release_list_lock); | 4725 | raw_spin_unlock(&release_list_lock); |
4542 | 4726 | ||
4543 | /* | 4727 | /* |
4544 | * Remove @cgrp directory. The removal puts the base ref but we | 4728 | * If @cgrp has css's attached, the second stage of cgroup |
4545 | * aren't quite done with @cgrp yet, so hold onto it. | 4729 | * destruction is kicked off from css_killed_work_fn() after the |
4730 | * refs of all attached css's are killed. If @cgrp doesn't have | ||
4731 | * any css, we kick it off here. | ||
4546 | */ | 4732 | */ |
4733 | if (!cgrp->nr_css) | ||
4734 | cgroup_destroy_css_killed(cgrp); | ||
4735 | |||
4736 | /* | ||
4737 | * Clear the base files and remove @cgrp directory. The removal | ||
4738 | * puts the base ref but we aren't quite done with @cgrp yet, so | ||
4739 | * hold onto it. | ||
4740 | */ | ||
4741 | cgroup_addrm_files(cgrp, cgroup_base_files, false); | ||
4547 | dget(d); | 4742 | dget(d); |
4548 | cgroup_d_remove_dir(d); | 4743 | cgroup_d_remove_dir(d); |
4549 | 4744 | ||
@@ -4563,50 +4758,36 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4563 | }; | 4758 | }; |
4564 | 4759 | ||
4565 | /** | 4760 | /** |
4566 | * cgroup_offline_fn - the second step of cgroup destruction | 4761 | * cgroup_destroy_css_killed - the second step of cgroup destruction |
4567 | * @work: cgroup->destroy_free_work | 4762 | * @work: cgroup->destroy_free_work |
4568 | * | 4763 | * |
4569 | * This function is invoked from a work item for a cgroup which is being | 4764 | * This function is invoked from a work item for a cgroup which is being |
4570 | * destroyed after the percpu refcnts of all css's are guaranteed to be | 4765 | * destroyed after all css's are offlined and performs the rest of |
4571 | * seen as killed on all CPUs, and performs the rest of destruction. This | 4766 | * destruction. This is the second step of destruction described in the |
4572 | * is the second step of destruction described in the comment above | 4767 | * comment above cgroup_destroy_locked(). |
4573 | * cgroup_destroy_locked(). | ||
4574 | */ | 4768 | */ |
4575 | static void cgroup_offline_fn(struct work_struct *work) | 4769 | static void cgroup_destroy_css_killed(struct cgroup *cgrp) |
4576 | { | 4770 | { |
4577 | struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); | ||
4578 | struct cgroup *parent = cgrp->parent; | 4771 | struct cgroup *parent = cgrp->parent; |
4579 | struct dentry *d = cgrp->dentry; | 4772 | struct dentry *d = cgrp->dentry; |
4580 | struct cgroup_subsys *ss; | ||
4581 | 4773 | ||
4582 | mutex_lock(&cgroup_mutex); | 4774 | lockdep_assert_held(&cgroup_mutex); |
4583 | 4775 | ||
4584 | /* | 4776 | /* delete this cgroup from parent->children */ |
4585 | * css_tryget() is guaranteed to fail now. Tell subsystems to | 4777 | list_del_rcu(&cgrp->sibling); |
4586 | * initiate destruction. | ||
4587 | */ | ||
4588 | for_each_root_subsys(cgrp->root, ss) | ||
4589 | offline_css(ss, cgrp); | ||
4590 | 4778 | ||
4591 | /* | 4779 | /* |
4592 | * Put the css refs from cgroup_destroy_locked(). Each css holds | 4780 | * We should remove the cgroup object from idr before its grace |
4593 | * an extra reference to the cgroup's dentry and cgroup removal | 4781 | * period starts, so we won't be looking up a cgroup while the |
4594 | * proceeds regardless of css refs. On the last put of each css, | 4782 | * cgroup is being freed. |
4595 | * whenever that may be, the extra dentry ref is put so that dentry | ||
4596 | * destruction happens only after all css's are released. | ||
4597 | */ | 4783 | */ |
4598 | for_each_root_subsys(cgrp->root, ss) | 4784 | idr_remove(&cgrp->root->cgroup_idr, cgrp->id); |
4599 | css_put(cgrp->subsys[ss->subsys_id]); | 4785 | cgrp->id = -1; |
4600 | |||
4601 | /* delete this cgroup from parent->children */ | ||
4602 | list_del_rcu(&cgrp->sibling); | ||
4603 | 4786 | ||
4604 | dput(d); | 4787 | dput(d); |
4605 | 4788 | ||
4606 | set_bit(CGRP_RELEASABLE, &parent->flags); | 4789 | set_bit(CGRP_RELEASABLE, &parent->flags); |
4607 | check_for_release(parent); | 4790 | check_for_release(parent); |
4608 | |||
4609 | mutex_unlock(&cgroup_mutex); | ||
4610 | } | 4791 | } |
4611 | 4792 | ||
4612 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | 4793 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) |
@@ -4629,6 +4810,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) | |||
4629 | * deregistration. | 4810 | * deregistration. |
4630 | */ | 4811 | */ |
4631 | if (ss->base_cftypes) { | 4812 | if (ss->base_cftypes) { |
4813 | struct cftype *cft; | ||
4814 | |||
4815 | for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) | ||
4816 | cft->ss = ss; | ||
4817 | |||
4632 | ss->base_cftset.cfts = ss->base_cftypes; | 4818 | ss->base_cftset.cfts = ss->base_cftypes; |
4633 | list_add_tail(&ss->base_cftset.node, &ss->cftsets); | 4819 | list_add_tail(&ss->base_cftset.node, &ss->cftsets); |
4634 | } | 4820 | } |
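The new loop that stamps cft->ss relies on base_cftypes arrays being terminated by an empty-named sentinel entry rather than carrying an explicit length. A standalone sketch of that convention (struct fields invented for the example):

    #include <stdio.h>

    struct cftype_like {
            const char *name;       /* NULL/empty name terminates the array */
            int flags;
            void *ss;               /* back-pointer filled in at registration */
    };

    static struct cftype_like base_files[] = {
            { .name = "tasks" },
            { .name = "notify_on_release" },
            { }                     /* sentinel: terminates the list */
    };

    int main(void)
    {
            static int dummy_ss;
            struct cftype_like *cft;

            /* walk until the sentinel, tagging each entry with its subsystem */
            for (cft = base_files; cft->name && cft->name[0] != '\0'; cft++) {
                    cft->ss = &dummy_ss;
                    printf("registered %s\n", cft->name);
            }
            return 0;
    }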
@@ -4648,10 +4834,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4648 | /* Create the top cgroup state for this subsystem */ | 4834 | /* Create the top cgroup state for this subsystem */ |
4649 | list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); | 4835 | list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); |
4650 | ss->root = &cgroup_dummy_root; | 4836 | ss->root = &cgroup_dummy_root; |
4651 | css = ss->css_alloc(cgroup_dummy_top); | 4837 | css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); |
4652 | /* We don't handle early failures gracefully */ | 4838 | /* We don't handle early failures gracefully */ |
4653 | BUG_ON(IS_ERR(css)); | 4839 | BUG_ON(IS_ERR(css)); |
4654 | init_cgroup_css(css, ss, cgroup_dummy_top); | 4840 | init_css(css, ss, cgroup_dummy_top); |
4655 | 4841 | ||
4656 | /* Update the init_css_set to contain a subsys | 4842 | /* Update the init_css_set to contain a subsys |
4657 | * pointer to this state - since the subsystem is | 4843 | * pointer to this state - since the subsystem is |
@@ -4666,7 +4852,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4666 | * need to invoke fork callbacks here. */ | 4852 | * need to invoke fork callbacks here. */ |
4667 | BUG_ON(!list_empty(&init_task.tasks)); | 4853 | BUG_ON(!list_empty(&init_task.tasks)); |
4668 | 4854 | ||
4669 | BUG_ON(online_css(ss, cgroup_dummy_top)); | 4855 | BUG_ON(online_css(css)); |
4670 | 4856 | ||
4671 | mutex_unlock(&cgroup_mutex); | 4857 | mutex_unlock(&cgroup_mutex); |
4672 | 4858 | ||
@@ -4727,7 +4913,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4727 | * struct, so this can happen first (i.e. before the dummy root | 4913 | * struct, so this can happen first (i.e. before the dummy root |
4728 | * attachment). | 4914 | * attachment). |
4729 | */ | 4915 | */ |
4730 | css = ss->css_alloc(cgroup_dummy_top); | 4916 | css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); |
4731 | if (IS_ERR(css)) { | 4917 | if (IS_ERR(css)) { |
4732 | /* failure case - need to deassign the cgroup_subsys[] slot. */ | 4918 | /* failure case - need to deassign the cgroup_subsys[] slot. */ |
4733 | cgroup_subsys[ss->subsys_id] = NULL; | 4919 | cgroup_subsys[ss->subsys_id] = NULL; |
@@ -4739,8 +4925,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4739 | ss->root = &cgroup_dummy_root; | 4925 | ss->root = &cgroup_dummy_root; |
4740 | 4926 | ||
4741 | /* our new subsystem will be attached to the dummy hierarchy. */ | 4927 | /* our new subsystem will be attached to the dummy hierarchy. */ |
4742 | init_cgroup_css(css, ss, cgroup_dummy_top); | 4928 | init_css(css, ss, cgroup_dummy_top); |
4743 | /* init_idr must be after init_cgroup_css because it sets css->id. */ | 4929 | /* init_idr must be after init_css() because it sets css->id. */ |
4744 | if (ss->use_id) { | 4930 | if (ss->use_id) { |
4745 | ret = cgroup_init_idr(ss, css); | 4931 | ret = cgroup_init_idr(ss, css); |
4746 | if (ret) | 4932 | if (ret) |
@@ -4770,7 +4956,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4770 | } | 4956 | } |
4771 | write_unlock(&css_set_lock); | 4957 | write_unlock(&css_set_lock); |
4772 | 4958 | ||
4773 | ret = online_css(ss, cgroup_dummy_top); | 4959 | ret = online_css(css); |
4774 | if (ret) | 4960 | if (ret) |
4775 | goto err_unload; | 4961 | goto err_unload; |
4776 | 4962 | ||
@@ -4802,14 +4988,14 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4802 | 4988 | ||
4803 | /* | 4989 | /* |
4804 | * we shouldn't be called if the subsystem is in use, and the use of | 4990 | * we shouldn't be called if the subsystem is in use, and the use of |
4805 | * try_module_get in parse_cgroupfs_options should ensure that it | 4991 | * try_module_get() in rebind_subsystems() should ensure that it |
4806 | * doesn't start being used while we're killing it off. | 4992 | * doesn't start being used while we're killing it off. |
4807 | */ | 4993 | */ |
4808 | BUG_ON(ss->root != &cgroup_dummy_root); | 4994 | BUG_ON(ss->root != &cgroup_dummy_root); |
4809 | 4995 | ||
4810 | mutex_lock(&cgroup_mutex); | 4996 | mutex_lock(&cgroup_mutex); |
4811 | 4997 | ||
4812 | offline_css(ss, cgroup_dummy_top); | 4998 | offline_css(cgroup_css(cgroup_dummy_top, ss)); |
4813 | 4999 | ||
4814 | if (ss->use_id) | 5000 | if (ss->use_id) |
4815 | idr_destroy(&ss->idr); | 5001 | idr_destroy(&ss->idr); |
@@ -4843,8 +5029,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4843 | * the cgrp->subsys pointer to find their state. note that this | 5029 | * the cgrp->subsys pointer to find their state. note that this |
4844 | * also takes care of freeing the css_id. | 5030 | * also takes care of freeing the css_id. |
4845 | */ | 5031 | */ |
4846 | ss->css_free(cgroup_dummy_top); | 5032 | ss->css_free(cgroup_css(cgroup_dummy_top, ss)); |
4847 | cgroup_dummy_top->subsys[ss->subsys_id] = NULL; | 5033 | RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); |
4848 | 5034 | ||
4849 | mutex_unlock(&cgroup_mutex); | 5035 | mutex_unlock(&cgroup_mutex); |
4850 | } | 5036 | } |
@@ -4926,6 +5112,10 @@ int __init cgroup_init(void) | |||
4926 | 5112 | ||
4927 | BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); | 5113 | BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); |
4928 | 5114 | ||
5115 | err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top, | ||
5116 | 0, 1, GFP_KERNEL); | ||
5117 | BUG_ON(err < 0); | ||
5118 | |||
4929 | mutex_unlock(&cgroup_root_mutex); | 5119 | mutex_unlock(&cgroup_root_mutex); |
4930 | mutex_unlock(&cgroup_mutex); | 5120 | mutex_unlock(&cgroup_mutex); |
4931 | 5121 | ||
@@ -5082,7 +5272,7 @@ void cgroup_fork(struct task_struct *child) | |||
5082 | * Adds the task to the list running through its css_set if necessary and | 5272 | * Adds the task to the list running through its css_set if necessary and |
5083 | * call the subsystem fork() callbacks. Has to be after the task is | 5273 | * call the subsystem fork() callbacks. Has to be after the task is |
5084 | * visible on the task list in case we race with the first call to | 5274 | * visible on the task list in case we race with the first call to |
5085 | * cgroup_iter_start() - to guarantee that the new task ends up on its | 5275 | * cgroup_task_iter_start() - to guarantee that the new task ends up on its |
5086 | * list. | 5276 | * list. |
5087 | */ | 5277 | */ |
5088 | void cgroup_post_fork(struct task_struct *child) | 5278 | void cgroup_post_fork(struct task_struct *child) |
@@ -5195,10 +5385,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
5195 | */ | 5385 | */ |
5196 | for_each_builtin_subsys(ss, i) { | 5386 | for_each_builtin_subsys(ss, i) { |
5197 | if (ss->exit) { | 5387 | if (ss->exit) { |
5198 | struct cgroup *old_cgrp = cset->subsys[i]->cgroup; | 5388 | struct cgroup_subsys_state *old_css = cset->subsys[i]; |
5199 | struct cgroup *cgrp = task_cgroup(tsk, i); | 5389 | struct cgroup_subsys_state *css = task_css(tsk, i); |
5200 | 5390 | ||
5201 | ss->exit(cgrp, old_cgrp, tsk); | 5391 | ss->exit(css, old_css, tsk); |
5202 | } | 5392 | } |
5203 | } | 5393 | } |
5204 | } | 5394 | } |
@@ -5457,20 +5647,16 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, | |||
5457 | return 0; | 5647 | return 0; |
5458 | } | 5648 | } |
5459 | 5649 | ||
5460 | static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, | 5650 | static int alloc_css_id(struct cgroup_subsys_state *child_css) |
5461 | struct cgroup *child) | ||
5462 | { | 5651 | { |
5463 | int subsys_id, i, depth = 0; | 5652 | struct cgroup_subsys_state *parent_css = css_parent(child_css); |
5464 | struct cgroup_subsys_state *parent_css, *child_css; | ||
5465 | struct css_id *child_id, *parent_id; | 5653 | struct css_id *child_id, *parent_id; |
5654 | int i, depth; | ||
5466 | 5655 | ||
5467 | subsys_id = ss->subsys_id; | ||
5468 | parent_css = parent->subsys[subsys_id]; | ||
5469 | child_css = child->subsys[subsys_id]; | ||
5470 | parent_id = rcu_dereference_protected(parent_css->id, true); | 5656 | parent_id = rcu_dereference_protected(parent_css->id, true); |
5471 | depth = parent_id->depth + 1; | 5657 | depth = parent_id->depth + 1; |
5472 | 5658 | ||
5473 | child_id = get_new_cssid(ss, depth); | 5659 | child_id = get_new_cssid(child_css->ss, depth); |
5474 | if (IS_ERR(child_id)) | 5660 | if (IS_ERR(child_id)) |
5475 | return PTR_ERR(child_id); | 5661 | return PTR_ERR(child_id); |
5476 | 5662 | ||
@@ -5508,31 +5694,56 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) | |||
5508 | } | 5694 | } |
5509 | EXPORT_SYMBOL_GPL(css_lookup); | 5695 | EXPORT_SYMBOL_GPL(css_lookup); |
5510 | 5696 | ||
5511 | /* | 5697 | /** |
5512 | * get corresponding css from file open on cgroupfs directory | 5698 | * css_from_dir - get corresponding css from the dentry of a cgroup dir |
5699 | * @dentry: directory dentry of interest | ||
5700 | * @ss: subsystem of interest | ||
5701 | * | ||
5702 | * Must be called under RCU read lock. The caller is responsible for | ||
5703 | * pinning the returned css if it needs to be accessed outside the RCU | ||
5704 | * critical section. | ||
5513 | */ | 5705 | */ |
5514 | struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | 5706 | struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, |
5707 | struct cgroup_subsys *ss) | ||
5515 | { | 5708 | { |
5516 | struct cgroup *cgrp; | 5709 | struct cgroup *cgrp; |
5517 | struct inode *inode; | ||
5518 | struct cgroup_subsys_state *css; | ||
5519 | 5710 | ||
5520 | inode = file_inode(f); | 5711 | WARN_ON_ONCE(!rcu_read_lock_held()); |
5521 | /* check in cgroup filesystem dir */ | 5712 | |
5522 | if (inode->i_op != &cgroup_dir_inode_operations) | 5713 | /* is @dentry a cgroup dir? */ |
5714 | if (!dentry->d_inode || | ||
5715 | dentry->d_inode->i_op != &cgroup_dir_inode_operations) | ||
5523 | return ERR_PTR(-EBADF); | 5716 | return ERR_PTR(-EBADF); |
5524 | 5717 | ||
5525 | if (id < 0 || id >= CGROUP_SUBSYS_COUNT) | 5718 | cgrp = __d_cgrp(dentry); |
5526 | return ERR_PTR(-EINVAL); | 5719 | return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); |
5720 | } | ||
5527 | 5721 | ||
5528 | /* get cgroup */ | 5722 | /** |
5529 | cgrp = __d_cgrp(f->f_dentry); | 5723 | * css_from_id - lookup css by id |
5530 | css = cgrp->subsys[id]; | 5724 | * @id: the cgroup id |
5531 | return css ? css : ERR_PTR(-ENOENT); | 5725 | * @ss: cgroup subsys to be looked into |
5726 | * | ||
5727 | * Returns the css if there's valid one with @id, otherwise returns NULL. | ||
5728 | * Should be called under rcu_read_lock(). | ||
5729 | */ | ||
5730 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) | ||
5731 | { | ||
5732 | struct cgroup *cgrp; | ||
5733 | |||
5734 | rcu_lockdep_assert(rcu_read_lock_held() || | ||
5735 | lockdep_is_held(&cgroup_mutex), | ||
5736 | "css_from_id() needs proper protection"); | ||
5737 | |||
5738 | cgrp = idr_find(&ss->root->cgroup_idr, id); | ||
5739 | if (cgrp) | ||
5740 | return cgroup_css(cgrp, ss); | ||
5741 | return NULL; | ||
5532 | } | 5742 | } |
5533 | 5743 | ||
5534 | #ifdef CONFIG_CGROUP_DEBUG | 5744 | #ifdef CONFIG_CGROUP_DEBUG |
5535 | static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) | 5745 | static struct cgroup_subsys_state * |
5746 | debug_css_alloc(struct cgroup_subsys_state *parent_css) | ||
5536 | { | 5747 | { |
5537 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | 5748 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); |
5538 | 5749 | ||
@@ -5542,22 +5753,24 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) | |||
5542 | return css; | 5753 | return css; |
5543 | } | 5754 | } |
5544 | 5755 | ||
5545 | static void debug_css_free(struct cgroup *cgrp) | 5756 | static void debug_css_free(struct cgroup_subsys_state *css) |
5546 | { | 5757 | { |
5547 | kfree(cgrp->subsys[debug_subsys_id]); | 5758 | kfree(css); |
5548 | } | 5759 | } |
5549 | 5760 | ||
5550 | static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) | 5761 | static u64 debug_taskcount_read(struct cgroup_subsys_state *css, |
5762 | struct cftype *cft) | ||
5551 | { | 5763 | { |
5552 | return cgroup_task_count(cgrp); | 5764 | return cgroup_task_count(css->cgroup); |
5553 | } | 5765 | } |
5554 | 5766 | ||
5555 | static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) | 5767 | static u64 current_css_set_read(struct cgroup_subsys_state *css, |
5768 | struct cftype *cft) | ||
5556 | { | 5769 | { |
5557 | return (u64)(unsigned long)current->cgroups; | 5770 | return (u64)(unsigned long)current->cgroups; |
5558 | } | 5771 | } |
5559 | 5772 | ||
5560 | static u64 current_css_set_refcount_read(struct cgroup *cgrp, | 5773 | static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, |
5561 | struct cftype *cft) | 5774 | struct cftype *cft) |
5562 | { | 5775 | { |
5563 | u64 count; | 5776 | u64 count; |
@@ -5568,7 +5781,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp, | |||
5568 | return count; | 5781 | return count; |
5569 | } | 5782 | } |
5570 | 5783 | ||
5571 | static int current_css_set_cg_links_read(struct cgroup *cgrp, | 5784 | static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, |
5572 | struct cftype *cft, | 5785 | struct cftype *cft, |
5573 | struct seq_file *seq) | 5786 | struct seq_file *seq) |
5574 | { | 5787 | { |
@@ -5595,14 +5808,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp, | |||
5595 | } | 5808 | } |
5596 | 5809 | ||
5597 | #define MAX_TASKS_SHOWN_PER_CSS 25 | 5810 | #define MAX_TASKS_SHOWN_PER_CSS 25 |
5598 | static int cgroup_css_links_read(struct cgroup *cgrp, | 5811 | static int cgroup_css_links_read(struct cgroup_subsys_state *css, |
5599 | struct cftype *cft, | 5812 | struct cftype *cft, struct seq_file *seq) |
5600 | struct seq_file *seq) | ||
5601 | { | 5813 | { |
5602 | struct cgrp_cset_link *link; | 5814 | struct cgrp_cset_link *link; |
5603 | 5815 | ||
5604 | read_lock(&css_set_lock); | 5816 | read_lock(&css_set_lock); |
5605 | list_for_each_entry(link, &cgrp->cset_links, cset_link) { | 5817 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { |
5606 | struct css_set *cset = link->cset; | 5818 | struct css_set *cset = link->cset; |
5607 | struct task_struct *task; | 5819 | struct task_struct *task; |
5608 | int count = 0; | 5820 | int count = 0; |
@@ -5621,9 +5833,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp, | |||
5621 | return 0; | 5833 | return 0; |
5622 | } | 5834 | } |
5623 | 5835 | ||
5624 | static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) | 5836 | static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) |
5625 | { | 5837 | { |
5626 | return test_bit(CGRP_RELEASABLE, &cgrp->flags); | 5838 | return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); |
5627 | } | 5839 | } |
5628 | 5840 | ||
5629 | static struct cftype debug_files[] = { | 5841 | static struct cftype debug_files[] = { |
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 75dda1ea5026..f0ff64d0ebaa 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -45,25 +45,19 @@ struct freezer { | |||
45 | spinlock_t lock; | 45 | spinlock_t lock; |
46 | }; | 46 | }; |
47 | 47 | ||
48 | static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) | 48 | static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) |
49 | { | 49 | { |
50 | return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), | 50 | return css ? container_of(css, struct freezer, css) : NULL; |
51 | struct freezer, css); | ||
52 | } | 51 | } |
53 | 52 | ||
54 | static inline struct freezer *task_freezer(struct task_struct *task) | 53 | static inline struct freezer *task_freezer(struct task_struct *task) |
55 | { | 54 | { |
56 | return container_of(task_subsys_state(task, freezer_subsys_id), | 55 | return css_freezer(task_css(task, freezer_subsys_id)); |
57 | struct freezer, css); | ||
58 | } | 56 | } |
59 | 57 | ||
60 | static struct freezer *parent_freezer(struct freezer *freezer) | 58 | static struct freezer *parent_freezer(struct freezer *freezer) |
61 | { | 59 | { |
62 | struct cgroup *pcg = freezer->css.cgroup->parent; | 60 | return css_freezer(css_parent(&freezer->css)); |
63 | |||
64 | if (pcg) | ||
65 | return cgroup_freezer(pcg); | ||
66 | return NULL; | ||
67 | } | 61 | } |
68 | 62 | ||
69 | bool cgroup_freezing(struct task_struct *task) | 63 | bool cgroup_freezing(struct task_struct *task) |
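css_freezer() above is the standard container_of() pattern: the generic cgroup_subsys_state is embedded inside the subsystem's own struct, and the wrapper recovers the containing struct from a pointer to the embedded member. A self-contained userspace illustration (the freezer-like fields are invented):

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct css_like {               /* stands in for cgroup_subsys_state */
            unsigned int flags;
    };

    struct freezer_like {
            struct css_like css;    /* embedded generic part */
            unsigned int state;     /* subsystem-specific part */
    };

    static struct freezer_like *css_freezer_like(struct css_like *css)
    {
            return css ? container_of(css, struct freezer_like, css) : NULL;
    }

    int main(void)
    {
            struct freezer_like f = { .state = 42 };
            struct css_like *css = &f.css;  /* what the core hands back */

            /* recover the containing freezer from the embedded css */
            printf("state = %u\n", css_freezer_like(css)->state);
            return 0;
    }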
@@ -92,7 +86,8 @@ static const char *freezer_state_strs(unsigned int state) | |||
92 | 86 | ||
93 | struct cgroup_subsys freezer_subsys; | 87 | struct cgroup_subsys freezer_subsys; |
94 | 88 | ||
95 | static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) | 89 | static struct cgroup_subsys_state * |
90 | freezer_css_alloc(struct cgroup_subsys_state *parent_css) | ||
96 | { | 91 | { |
97 | struct freezer *freezer; | 92 | struct freezer *freezer; |
98 | 93 | ||
@@ -105,22 +100,22 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) | |||
105 | } | 100 | } |
106 | 101 | ||
107 | /** | 102 | /** |
108 | * freezer_css_online - commit creation of a freezer cgroup | 103 | * freezer_css_online - commit creation of a freezer css |
109 | * @cgroup: cgroup being created | 104 | * @css: css being created |
110 | * | 105 | * |
111 | * We're committing to creation of @cgroup. Mark it online and inherit | 106 | * We're committing to creation of @css. Mark it online and inherit |
112 | * parent's freezing state while holding both parent's and our | 107 | * parent's freezing state while holding both parent's and our |
113 | * freezer->lock. | 108 | * freezer->lock. |
114 | */ | 109 | */ |
115 | static int freezer_css_online(struct cgroup *cgroup) | 110 | static int freezer_css_online(struct cgroup_subsys_state *css) |
116 | { | 111 | { |
117 | struct freezer *freezer = cgroup_freezer(cgroup); | 112 | struct freezer *freezer = css_freezer(css); |
118 | struct freezer *parent = parent_freezer(freezer); | 113 | struct freezer *parent = parent_freezer(freezer); |
119 | 114 | ||
120 | /* | 115 | /* |
121 | * The following double locking and freezing state inheritance | 116 | * The following double locking and freezing state inheritance |
122 | * guarantee that @cgroup can never escape ancestors' freezing | 117 | * guarantee that @cgroup can never escape ancestors' freezing |
123 | * states. See cgroup_for_each_descendant_pre() for details. | 118 | * states. See css_for_each_descendant_pre() for details. |
124 | */ | 119 | */ |
125 | if (parent) | 120 | if (parent) |
126 | spin_lock_irq(&parent->lock); | 121 | spin_lock_irq(&parent->lock); |
@@ -141,15 +136,15 @@ static int freezer_css_online(struct cgroup *cgroup) | |||
141 | } | 136 | } |
142 | 137 | ||
143 | /** | 138 | /** |
144 | * freezer_css_offline - initiate destruction of @cgroup | 139 | * freezer_css_offline - initiate destruction of a freezer css |
145 | * @cgroup: cgroup being destroyed | 140 | * @css: css being destroyed |
146 | * | 141 | * |
147 | * @cgroup is going away. Mark it dead and decrement system_freezing_count | 142 | * @css is going away. Mark it dead and decrement system_freezing_count if |
148 | * if it was holding one. | 143 | * it was holding one. |
149 | */ | 144 | */ |
150 | static void freezer_css_offline(struct cgroup *cgroup) | 145 | static void freezer_css_offline(struct cgroup_subsys_state *css) |
151 | { | 146 | { |
152 | struct freezer *freezer = cgroup_freezer(cgroup); | 147 | struct freezer *freezer = css_freezer(css); |
153 | 148 | ||
154 | spin_lock_irq(&freezer->lock); | 149 | spin_lock_irq(&freezer->lock); |
155 | 150 | ||
@@ -161,9 +156,9 @@ static void freezer_css_offline(struct cgroup *cgroup) | |||
161 | spin_unlock_irq(&freezer->lock); | 156 | spin_unlock_irq(&freezer->lock); |
162 | } | 157 | } |
163 | 158 | ||
164 | static void freezer_css_free(struct cgroup *cgroup) | 159 | static void freezer_css_free(struct cgroup_subsys_state *css) |
165 | { | 160 | { |
166 | kfree(cgroup_freezer(cgroup)); | 161 | kfree(css_freezer(css)); |
167 | } | 162 | } |
168 | 163 | ||
169 | /* | 164 | /* |
@@ -175,25 +170,26 @@ static void freezer_css_free(struct cgroup *cgroup) | |||
175 | * @freezer->lock. freezer_attach() makes the new tasks conform to the | 170 | * @freezer->lock. freezer_attach() makes the new tasks conform to the |
176 | * current state and all following state changes can see the new tasks. | 171 | * current state and all following state changes can see the new tasks. |
177 | */ | 172 | */ |
178 | static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) | 173 | static void freezer_attach(struct cgroup_subsys_state *new_css, |
174 | struct cgroup_taskset *tset) | ||
179 | { | 175 | { |
180 | struct freezer *freezer = cgroup_freezer(new_cgrp); | 176 | struct freezer *freezer = css_freezer(new_css); |
181 | struct task_struct *task; | 177 | struct task_struct *task; |
182 | bool clear_frozen = false; | 178 | bool clear_frozen = false; |
183 | 179 | ||
184 | spin_lock_irq(&freezer->lock); | 180 | spin_lock_irq(&freezer->lock); |
185 | 181 | ||
186 | /* | 182 | /* |
187 | * Make the new tasks conform to the current state of @new_cgrp. | 183 | * Make the new tasks conform to the current state of @new_css. |
188 | * For simplicity, when migrating any task to a FROZEN cgroup, we | 184 | * For simplicity, when migrating any task to a FROZEN cgroup, we |
189 | * revert it to FREEZING and let update_if_frozen() determine the | 185 | * revert it to FREEZING and let update_if_frozen() determine the |
190 | * correct state later. | 186 | * correct state later. |
191 | * | 187 | * |
192 | * Tasks in @tset are on @new_cgrp but may not conform to its | 188 | * Tasks in @tset are on @new_css but may not conform to its |
193 | * current state before executing the following - !frozen tasks may | 189 | * current state before executing the following - !frozen tasks may |
194 | * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. | 190 | * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. |
195 | */ | 191 | */ |
196 | cgroup_taskset_for_each(task, new_cgrp, tset) { | 192 | cgroup_taskset_for_each(task, new_css, tset) { |
197 | if (!(freezer->state & CGROUP_FREEZING)) { | 193 | if (!(freezer->state & CGROUP_FREEZING)) { |
198 | __thaw_task(task); | 194 | __thaw_task(task); |
199 | } else { | 195 | } else { |
@@ -231,7 +227,7 @@ static void freezer_fork(struct task_struct *task) | |||
231 | * The root cgroup is non-freezable, so we can skip the | 227 | * The root cgroup is non-freezable, so we can skip the |
232 | * following check. | 228 | * following check. |
233 | */ | 229 | */ |
234 | if (!freezer->css.cgroup->parent) | 230 | if (!parent_freezer(freezer)) |
235 | goto out; | 231 | goto out; |
236 | 232 | ||
237 | spin_lock_irq(&freezer->lock); | 233 | spin_lock_irq(&freezer->lock); |
@@ -244,7 +240,7 @@ out: | |||
244 | 240 | ||
245 | /** | 241 | /** |
246 | * update_if_frozen - update whether a cgroup finished freezing | 242 | * update_if_frozen - update whether a cgroup finished freezing |
247 | * @cgroup: cgroup of interest | 243 | * @css: css of interest |
248 | * | 244 | * |
249 | * Once FREEZING is initiated, transition to FROZEN is lazily updated by | 245 | * Once FREEZING is initiated, transition to FROZEN is lazily updated by |
250 | * calling this function. If the current state is FREEZING but not FROZEN, | 246 | * calling this function. If the current state is FREEZING but not FROZEN, |
@@ -255,14 +251,14 @@ out: | |||
255 | * update_if_frozen() on all descendants prior to invoking this function. | 251 | * update_if_frozen() on all descendants prior to invoking this function. |
256 | * | 252 | * |
257 | * Task states and freezer state might disagree while tasks are being | 253 | * Task states and freezer state might disagree while tasks are being |
258 | * migrated into or out of @cgroup, so we can't verify task states against | 254 | * migrated into or out of @css, so we can't verify task states against |
259 | * @freezer state here. See freezer_attach() for details. | 255 | * @freezer state here. See freezer_attach() for details. |
260 | */ | 256 | */ |
261 | static void update_if_frozen(struct cgroup *cgroup) | 257 | static void update_if_frozen(struct cgroup_subsys_state *css) |
262 | { | 258 | { |
263 | struct freezer *freezer = cgroup_freezer(cgroup); | 259 | struct freezer *freezer = css_freezer(css); |
264 | struct cgroup *pos; | 260 | struct cgroup_subsys_state *pos; |
265 | struct cgroup_iter it; | 261 | struct css_task_iter it; |
266 | struct task_struct *task; | 262 | struct task_struct *task; |
267 | 263 | ||
268 | WARN_ON_ONCE(!rcu_read_lock_held()); | 264 | WARN_ON_ONCE(!rcu_read_lock_held()); |
@@ -274,8 +270,8 @@ static void update_if_frozen(struct cgroup *cgroup) | |||
274 | goto out_unlock; | 270 | goto out_unlock; |
275 | 271 | ||
276 | /* are all (live) children frozen? */ | 272 | /* are all (live) children frozen? */ |
277 | cgroup_for_each_child(pos, cgroup) { | 273 | css_for_each_child(pos, css) { |
278 | struct freezer *child = cgroup_freezer(pos); | 274 | struct freezer *child = css_freezer(pos); |
279 | 275 | ||
280 | if ((child->state & CGROUP_FREEZER_ONLINE) && | 276 | if ((child->state & CGROUP_FREEZER_ONLINE) && |
281 | !(child->state & CGROUP_FROZEN)) | 277 | !(child->state & CGROUP_FROZEN)) |
@@ -283,9 +279,9 @@ static void update_if_frozen(struct cgroup *cgroup) | |||
283 | } | 279 | } |
284 | 280 | ||
285 | /* are all tasks frozen? */ | 281 | /* are all tasks frozen? */ |
286 | cgroup_iter_start(cgroup, &it); | 282 | css_task_iter_start(css, &it); |
287 | 283 | ||
288 | while ((task = cgroup_iter_next(cgroup, &it))) { | 284 | while ((task = css_task_iter_next(&it))) { |
289 | if (freezing(task)) { | 285 | if (freezing(task)) { |
290 | /* | 286 | /* |
291 | * freezer_should_skip() indicates that the task | 287 | * freezer_should_skip() indicates that the task |
@@ -300,52 +296,49 @@ static void update_if_frozen(struct cgroup *cgroup) | |||
300 | 296 | ||
301 | freezer->state |= CGROUP_FROZEN; | 297 | freezer->state |= CGROUP_FROZEN; |
302 | out_iter_end: | 298 | out_iter_end: |
303 | cgroup_iter_end(cgroup, &it); | 299 | css_task_iter_end(&it); |
304 | out_unlock: | 300 | out_unlock: |
305 | spin_unlock_irq(&freezer->lock); | 301 | spin_unlock_irq(&freezer->lock); |
306 | } | 302 | } |
307 | 303 | ||
308 | static int freezer_read(struct cgroup *cgroup, struct cftype *cft, | 304 | static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, |
309 | struct seq_file *m) | 305 | struct seq_file *m) |
310 | { | 306 | { |
311 | struct cgroup *pos; | 307 | struct cgroup_subsys_state *pos; |
312 | 308 | ||
313 | rcu_read_lock(); | 309 | rcu_read_lock(); |
314 | 310 | ||
315 | /* update states bottom-up */ | 311 | /* update states bottom-up */ |
316 | cgroup_for_each_descendant_post(pos, cgroup) | 312 | css_for_each_descendant_post(pos, css) |
317 | update_if_frozen(pos); | 313 | update_if_frozen(pos); |
318 | update_if_frozen(cgroup); | ||
319 | 314 | ||
320 | rcu_read_unlock(); | 315 | rcu_read_unlock(); |
321 | 316 | ||
322 | seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); | 317 | seq_puts(m, freezer_state_strs(css_freezer(css)->state)); |
323 | seq_putc(m, '\n'); | 318 | seq_putc(m, '\n'); |
324 | return 0; | 319 | return 0; |
325 | } | 320 | } |
326 | 321 | ||
327 | static void freeze_cgroup(struct freezer *freezer) | 322 | static void freeze_cgroup(struct freezer *freezer) |
328 | { | 323 | { |
329 | struct cgroup *cgroup = freezer->css.cgroup; | 324 | struct css_task_iter it; |
330 | struct cgroup_iter it; | ||
331 | struct task_struct *task; | 325 | struct task_struct *task; |
332 | 326 | ||
333 | cgroup_iter_start(cgroup, &it); | 327 | css_task_iter_start(&freezer->css, &it); |
334 | while ((task = cgroup_iter_next(cgroup, &it))) | 328 | while ((task = css_task_iter_next(&it))) |
335 | freeze_task(task); | 329 | freeze_task(task); |
336 | cgroup_iter_end(cgroup, &it); | 330 | css_task_iter_end(&it); |
337 | } | 331 | } |
338 | 332 | ||
339 | static void unfreeze_cgroup(struct freezer *freezer) | 333 | static void unfreeze_cgroup(struct freezer *freezer) |
340 | { | 334 | { |
341 | struct cgroup *cgroup = freezer->css.cgroup; | 335 | struct css_task_iter it; |
342 | struct cgroup_iter it; | ||
343 | struct task_struct *task; | 336 | struct task_struct *task; |
344 | 337 | ||
345 | cgroup_iter_start(cgroup, &it); | 338 | css_task_iter_start(&freezer->css, &it); |
346 | while ((task = cgroup_iter_next(cgroup, &it))) | 339 | while ((task = css_task_iter_next(&it))) |
347 | __thaw_task(task); | 340 | __thaw_task(task); |
348 | cgroup_iter_end(cgroup, &it); | 341 | css_task_iter_end(&it); |
349 | } | 342 | } |
350 | 343 | ||
351 | /** | 344 | /** |
@@ -395,12 +388,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, | |||
395 | */ | 388 | */ |
396 | static void freezer_change_state(struct freezer *freezer, bool freeze) | 389 | static void freezer_change_state(struct freezer *freezer, bool freeze) |
397 | { | 390 | { |
398 | struct cgroup *pos; | 391 | struct cgroup_subsys_state *pos; |
399 | |||
400 | /* update @freezer */ | ||
401 | spin_lock_irq(&freezer->lock); | ||
402 | freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); | ||
403 | spin_unlock_irq(&freezer->lock); | ||
404 | 392 | ||
405 | /* | 393 | /* |
406 | * Update all its descendants in pre-order traversal. Each | 394 | * Update all its descendants in pre-order traversal. Each |
@@ -408,24 +396,33 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) | |||
408 | * CGROUP_FREEZING_PARENT. | 396 | * CGROUP_FREEZING_PARENT. |
409 | */ | 397 | */ |
410 | rcu_read_lock(); | 398 | rcu_read_lock(); |
411 | cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { | 399 | css_for_each_descendant_pre(pos, &freezer->css) { |
412 | struct freezer *pos_f = cgroup_freezer(pos); | 400 | struct freezer *pos_f = css_freezer(pos); |
413 | struct freezer *parent = parent_freezer(pos_f); | 401 | struct freezer *parent = parent_freezer(pos_f); |
414 | 402 | ||
415 | /* | ||
416 | * Our update to @parent->state is already visible which is | ||
417 | * all we need. No need to lock @parent. For more info on | ||
418 | * synchronization, see freezer_post_create(). | ||
419 | */ | ||
420 | spin_lock_irq(&pos_f->lock); | 403 | spin_lock_irq(&pos_f->lock); |
421 | freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, | 404 | |
422 | CGROUP_FREEZING_PARENT); | 405 | if (pos_f == freezer) { |
406 | freezer_apply_state(pos_f, freeze, | ||
407 | CGROUP_FREEZING_SELF); | ||
408 | } else { | ||
409 | /* | ||
410 | * Our update to @parent->state is already visible | ||
411 | * which is all we need. No need to lock @parent. | ||
412 | * For more info on synchronization, see | ||
413 | * freezer_post_create(). | ||
414 | */ | ||
415 | freezer_apply_state(pos_f, | ||
416 | parent->state & CGROUP_FREEZING, | ||
417 | CGROUP_FREEZING_PARENT); | ||
418 | } | ||
419 | |||
423 | spin_unlock_irq(&pos_f->lock); | 420 | spin_unlock_irq(&pos_f->lock); |
424 | } | 421 | } |
425 | rcu_read_unlock(); | 422 | rcu_read_unlock(); |
426 | } | 423 | } |
427 | 424 | ||
428 | static int freezer_write(struct cgroup *cgroup, struct cftype *cft, | 425 | static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, |
429 | const char *buffer) | 426 | const char *buffer) |
430 | { | 427 | { |
431 | bool freeze; | 428 | bool freeze; |
@@ -437,20 +434,22 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft, | |||
437 | else | 434 | else |
438 | return -EINVAL; | 435 | return -EINVAL; |
439 | 436 | ||
440 | freezer_change_state(cgroup_freezer(cgroup), freeze); | 437 | freezer_change_state(css_freezer(css), freeze); |
441 | return 0; | 438 | return 0; |
442 | } | 439 | } |
443 | 440 | ||
444 | static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) | 441 | static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, |
442 | struct cftype *cft) | ||
445 | { | 443 | { |
446 | struct freezer *freezer = cgroup_freezer(cgroup); | 444 | struct freezer *freezer = css_freezer(css); |
447 | 445 | ||
448 | return (bool)(freezer->state & CGROUP_FREEZING_SELF); | 446 | return (bool)(freezer->state & CGROUP_FREEZING_SELF); |
449 | } | 447 | } |
450 | 448 | ||
451 | static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) | 449 | static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, |
450 | struct cftype *cft) | ||
452 | { | 451 | { |
453 | struct freezer *freezer = cgroup_freezer(cgroup); | 452 | struct freezer *freezer = css_freezer(css); |
454 | 453 | ||
455 | return (bool)(freezer->state & CGROUP_FREEZING_PARENT); | 454 | return (bool)(freezer->state & CGROUP_FREEZING_PARENT); |
456 | } | 455 | } |
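
The freezer hunks above replace every cgroup-based accessor and iterator with its css-based counterpart: css_freezer()/parent_freezer() for lookups, css_task_iter_start/next/end() for per-task walks, and css_for_each_descendant_pre() for subtree walks that now visit the starting css itself. Below is a minimal sketch of the resulting call pattern, limited to the helpers visible in this diff; the function names are illustrative only and the fragment assumes the surrounding cgroup_freezer.c context rather than being buildable on its own.

/* Walk every task attached to @freezer's css, as freeze_cgroup() now does. */
static void freezer_walk_tasks(struct freezer *freezer)
{
        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(&freezer->css, &it);
        while ((task = css_task_iter_next(&it)))
                freeze_task(task);      /* same per-task call as in the hunk */
        css_task_iter_end(&it);
}

/*
 * Pre-order descendant walk, as freezer_change_state() now does.  Unlike the
 * old cgroup_for_each_descendant_pre(), the iteration starts at @freezer's
 * own css, which is why the hunk adds the "pos_f == freezer" branch.
 */
static void freezer_walk_subtree(struct freezer *freezer)
{
        struct cgroup_subsys_state *pos;

        rcu_read_lock();
        css_for_each_descendant_pre(pos, &freezer->css) {
                struct freezer *pos_f = css_freezer(pos);

                spin_lock_irq(&pos_f->lock);
                /* per-freezer state update goes here */
                spin_unlock_irq(&pos_f->lock);
        }
        rcu_read_unlock();
}
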
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 383f8231e436..247091bf0587 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c | |||
@@ -20,22 +20,33 @@ | |||
20 | #include <linux/hardirq.h> | 20 | #include <linux/hardirq.h> |
21 | #include <linux/export.h> | 21 | #include <linux/export.h> |
22 | 22 | ||
23 | DEFINE_PER_CPU(struct context_tracking, context_tracking) = { | 23 | #define CREATE_TRACE_POINTS |
24 | #ifdef CONFIG_CONTEXT_TRACKING_FORCE | 24 | #include <trace/events/context_tracking.h> |
25 | .active = true, | 25 | |
26 | #endif | 26 | struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE; |
27 | }; | 27 | EXPORT_SYMBOL_GPL(context_tracking_enabled); |
28 | |||
29 | DEFINE_PER_CPU(struct context_tracking, context_tracking); | ||
30 | EXPORT_SYMBOL_GPL(context_tracking); | ||
31 | |||
32 | void context_tracking_cpu_set(int cpu) | ||
33 | { | ||
34 | if (!per_cpu(context_tracking.active, cpu)) { | ||
35 | per_cpu(context_tracking.active, cpu) = true; | ||
36 | static_key_slow_inc(&context_tracking_enabled); | ||
37 | } | ||
38 | } | ||
28 | 39 | ||
29 | /** | 40 | /** |
30 | * user_enter - Inform the context tracking that the CPU is going to | 41 | * context_tracking_user_enter - Inform the context tracking that the CPU is going to |
31 | * enter userspace mode. | 42 | * enter userspace mode. |
32 | * | 43 | * |
33 | * This function must be called right before we switch from the kernel | 44 | * This function must be called right before we switch from the kernel |
34 | * to userspace, when it's guaranteed the remaining kernel instructions | 45 | * to userspace, when it's guaranteed the remaining kernel instructions |
35 | * to execute won't use any RCU read side critical section because this | 46 | * to execute won't use any RCU read side critical section because this |
36 | * function sets RCU in extended quiescent state. | 47 | * function sets RCU in extended quiescent state. |
37 | */ | 48 | */ |
38 | void user_enter(void) | 49 | void context_tracking_user_enter(void) |
39 | { | 50 | { |
40 | unsigned long flags; | 51 | unsigned long flags; |
41 | 52 | ||
@@ -54,17 +65,32 @@ void user_enter(void) | |||
54 | WARN_ON_ONCE(!current->mm); | 65 | WARN_ON_ONCE(!current->mm); |
55 | 66 | ||
56 | local_irq_save(flags); | 67 | local_irq_save(flags); |
57 | if (__this_cpu_read(context_tracking.active) && | 68 | if ( __this_cpu_read(context_tracking.state) != IN_USER) { |
58 | __this_cpu_read(context_tracking.state) != IN_USER) { | 69 | if (__this_cpu_read(context_tracking.active)) { |
70 | trace_user_enter(0); | ||
71 | /* | ||
72 | * At this stage, only low level arch entry code remains and | ||
73 | * then we'll run in userspace. We can assume there won't be | ||
74 | * any RCU read-side critical section until the next call to | ||
75 | * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency | ||
76 | * on the tick. | ||
77 | */ | ||
78 | vtime_user_enter(current); | ||
79 | rcu_user_enter(); | ||
80 | } | ||
59 | /* | 81 | /* |
60 | * At this stage, only low level arch entry code remains and | 82 | * Even if context tracking is disabled on this CPU, because it's outside |
61 | * then we'll run in userspace. We can assume there won't be | 83 | * the full dynticks mask for example, we still have to keep track of the |
62 | * any RCU read-side critical section until the next call to | 84 | * context transitions and states to prevent inconsistency on those of |
63 | * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency | 85 | * other CPUs. |
64 | * on the tick. | 86 | * If a task triggers an exception in userspace, sleep on the exception |
87 | * handler and then migrate to another CPU, that new CPU must know where | ||
88 | * the exception returns by the time we call exception_exit(). | ||
89 | * This information can only be provided by the previous CPU when it called | ||
90 | * exception_enter(). | ||
91 | * OTOH we can spare the calls to vtime and RCU when context_tracking.active | ||
92 | * is false because we know that CPU is not tickless. | ||
65 | */ | 93 | */ |
66 | vtime_user_enter(current); | ||
67 | rcu_user_enter(); | ||
68 | __this_cpu_write(context_tracking.state, IN_USER); | 94 | __this_cpu_write(context_tracking.state, IN_USER); |
69 | } | 95 | } |
70 | local_irq_restore(flags); | 96 | local_irq_restore(flags); |
@@ -87,10 +113,9 @@ void user_enter(void) | |||
87 | */ | 113 | */ |
88 | void __sched notrace preempt_schedule_context(void) | 114 | void __sched notrace preempt_schedule_context(void) |
89 | { | 115 | { |
90 | struct thread_info *ti = current_thread_info(); | ||
91 | enum ctx_state prev_ctx; | 116 | enum ctx_state prev_ctx; |
92 | 117 | ||
93 | if (likely(ti->preempt_count || irqs_disabled())) | 118 | if (likely(!preemptible())) |
94 | return; | 119 | return; |
95 | 120 | ||
96 | /* | 121 | /* |
@@ -112,8 +137,8 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context); | |||
112 | #endif /* CONFIG_PREEMPT */ | 137 | #endif /* CONFIG_PREEMPT */ |
113 | 138 | ||
114 | /** | 139 | /** |
115 | * user_exit - Inform the context tracking that the CPU is | 140 | * context_tracking_user_exit - Inform the context tracking that the CPU is |
116 | * exiting userspace mode and entering the kernel. | 141 | * exiting userspace mode and entering the kernel. |
117 | * | 142 | * |
118 | * This function must be called after we entered the kernel from userspace | 143 | * This function must be called after we entered the kernel from userspace |
119 | * before any use of RCU read side critical section. This potentially include | 144 | * before any use of RCU read side critical section. This potentially include |
@@ -122,7 +147,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context); | |||
122 | * This call supports re-entrancy. This way it can be called from any exception | 147 | * This call supports re-entrancy. This way it can be called from any exception |
123 | * handler without needing to know if we came from userspace or not. | 148 | * handler without needing to know if we came from userspace or not. |
124 | */ | 149 | */ |
125 | void user_exit(void) | 150 | void context_tracking_user_exit(void) |
126 | { | 151 | { |
127 | unsigned long flags; | 152 | unsigned long flags; |
128 | 153 | ||
@@ -131,38 +156,22 @@ void user_exit(void) | |||
131 | 156 | ||
132 | local_irq_save(flags); | 157 | local_irq_save(flags); |
133 | if (__this_cpu_read(context_tracking.state) == IN_USER) { | 158 | if (__this_cpu_read(context_tracking.state) == IN_USER) { |
134 | /* | 159 | if (__this_cpu_read(context_tracking.active)) { |
135 | * We are going to run code that may use RCU. Inform | 160 | /* |
136 | * RCU core about that (ie: we may need the tick again). | 161 | * We are going to run code that may use RCU. Inform |
137 | */ | 162 | * RCU core about that (ie: we may need the tick again). |
138 | rcu_user_exit(); | 163 | */ |
139 | vtime_user_exit(current); | 164 | rcu_user_exit(); |
165 | vtime_user_exit(current); | ||
166 | trace_user_exit(0); | ||
167 | } | ||
140 | __this_cpu_write(context_tracking.state, IN_KERNEL); | 168 | __this_cpu_write(context_tracking.state, IN_KERNEL); |
141 | } | 169 | } |
142 | local_irq_restore(flags); | 170 | local_irq_restore(flags); |
143 | } | 171 | } |
144 | 172 | ||
145 | void guest_enter(void) | ||
146 | { | ||
147 | if (vtime_accounting_enabled()) | ||
148 | vtime_guest_enter(current); | ||
149 | else | ||
150 | __guest_enter(); | ||
151 | } | ||
152 | EXPORT_SYMBOL_GPL(guest_enter); | ||
153 | |||
154 | void guest_exit(void) | ||
155 | { | ||
156 | if (vtime_accounting_enabled()) | ||
157 | vtime_guest_exit(current); | ||
158 | else | ||
159 | __guest_exit(); | ||
160 | } | ||
161 | EXPORT_SYMBOL_GPL(guest_exit); | ||
162 | |||
163 | |||
164 | /** | 173 | /** |
165 | * context_tracking_task_switch - context switch the syscall callbacks | 174 | * __context_tracking_task_switch - context switch the syscall callbacks |
166 | * @prev: the task that is being switched out | 175 | * @prev: the task that is being switched out |
167 | * @next: the task that is being switched in | 176 | * @next: the task that is being switched in |
168 | * | 177 | * |
@@ -174,11 +183,19 @@ EXPORT_SYMBOL_GPL(guest_exit); | |||
174 | * migrate to some CPU that doesn't do the context tracking. As such the TIF | 183 | * migrate to some CPU that doesn't do the context tracking. As such the TIF |
175 | * flag may not be desired there. | 184 | * flag may not be desired there. |
176 | */ | 185 | */ |
177 | void context_tracking_task_switch(struct task_struct *prev, | 186 | void __context_tracking_task_switch(struct task_struct *prev, |
178 | struct task_struct *next) | 187 | struct task_struct *next) |
179 | { | 188 | { |
180 | if (__this_cpu_read(context_tracking.active)) { | 189 | clear_tsk_thread_flag(prev, TIF_NOHZ); |
181 | clear_tsk_thread_flag(prev, TIF_NOHZ); | 190 | set_tsk_thread_flag(next, TIF_NOHZ); |
182 | set_tsk_thread_flag(next, TIF_NOHZ); | ||
183 | } | ||
184 | } | 191 | } |
192 | |||
193 | #ifdef CONFIG_CONTEXT_TRACKING_FORCE | ||
194 | void __init context_tracking_init(void) | ||
195 | { | ||
196 | int cpu; | ||
197 | |||
198 | for_each_possible_cpu(cpu) | ||
199 | context_tracking_cpu_set(cpu); | ||
200 | } | ||
201 | #endif | ||
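
The context-tracking hunks separate the global "is tracking enabled anywhere" question, now answered by the context_tracking_enabled static key, from the per-CPU context_tracking.active flag, and rename the entry points to context_tracking_user_enter()/context_tracking_user_exit(). The sketch below shows how a fast-path wrapper could be gated on that key; the user_enter()-style wrapper is an assumption (the header side is not part of this hunk), while context_tracking_cpu_set() is taken directly from the diff.

/*
 * Assumed fast-path wrapper; the real one lives in
 * <linux/context_tracking.h>, which this diff does not touch.
 */
static inline void sketch_user_enter(void)
{
        /* Patched-out branch until some CPU calls context_tracking_cpu_set(). */
        if (static_key_false(&context_tracking_enabled))
                context_tracking_user_enter();
}

/* From the hunk: enabling tracking on a CPU bumps the global key once. */
void context_tracking_cpu_set(int cpu)
{
        if (!per_cpu(context_tracking.active, cpu)) {
                per_cpu(context_tracking.active, cpu) = true;
                static_key_slow_inc(&context_tracking_enabled);
        }
}
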
diff --git a/kernel/cpu.c b/kernel/cpu.c index b2b227b82123..d7f07a2da5a6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus); | |||
113 | * get_online_cpus() not an api which is called all that often. | 113 | * get_online_cpus() not an api which is called all that often. |
114 | * | 114 | * |
115 | */ | 115 | */ |
116 | static void cpu_hotplug_begin(void) | 116 | void cpu_hotplug_begin(void) |
117 | { | 117 | { |
118 | cpu_hotplug.active_writer = current; | 118 | cpu_hotplug.active_writer = current; |
119 | 119 | ||
@@ -127,7 +127,7 @@ static void cpu_hotplug_begin(void) | |||
127 | } | 127 | } |
128 | } | 128 | } |
129 | 129 | ||
130 | static void cpu_hotplug_done(void) | 130 | void cpu_hotplug_done(void) |
131 | { | 131 | { |
132 | cpu_hotplug.active_writer = NULL; | 132 | cpu_hotplug.active_writer = NULL; |
133 | mutex_unlock(&cpu_hotplug.lock); | 133 | mutex_unlock(&cpu_hotplug.lock); |
@@ -154,10 +154,7 @@ void cpu_hotplug_enable(void) | |||
154 | cpu_maps_update_done(); | 154 | cpu_maps_update_done(); |
155 | } | 155 | } |
156 | 156 | ||
157 | #else /* #if CONFIG_HOTPLUG_CPU */ | 157 | #endif /* CONFIG_HOTPLUG_CPU */ |
158 | static void cpu_hotplug_begin(void) {} | ||
159 | static void cpu_hotplug_done(void) {} | ||
160 | #endif /* #else #if CONFIG_HOTPLUG_CPU */ | ||
161 | 158 | ||
162 | /* Need to know about CPUs going up/down? */ | 159 | /* Need to know about CPUs going up/down? */ |
163 | int __ref register_cpu_notifier(struct notifier_block *nb) | 160 | int __ref register_cpu_notifier(struct notifier_block *nb) |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ea1966db34f2..6bf981e13c43 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -68,10 +68,6 @@ | |||
68 | */ | 68 | */ |
69 | int number_of_cpusets __read_mostly; | 69 | int number_of_cpusets __read_mostly; |
70 | 70 | ||
71 | /* Forward declare cgroup structures */ | ||
72 | struct cgroup_subsys cpuset_subsys; | ||
73 | struct cpuset; | ||
74 | |||
75 | /* See "Frequency meter" comments, below. */ | 71 | /* See "Frequency meter" comments, below. */ |
76 | 72 | ||
77 | struct fmeter { | 73 | struct fmeter { |
@@ -115,27 +111,20 @@ struct cpuset { | |||
115 | int relax_domain_level; | 111 | int relax_domain_level; |
116 | }; | 112 | }; |
117 | 113 | ||
118 | /* Retrieve the cpuset for a cgroup */ | 114 | static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) |
119 | static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) | ||
120 | { | 115 | { |
121 | return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), | 116 | return css ? container_of(css, struct cpuset, css) : NULL; |
122 | struct cpuset, css); | ||
123 | } | 117 | } |
124 | 118 | ||
125 | /* Retrieve the cpuset for a task */ | 119 | /* Retrieve the cpuset for a task */ |
126 | static inline struct cpuset *task_cs(struct task_struct *task) | 120 | static inline struct cpuset *task_cs(struct task_struct *task) |
127 | { | 121 | { |
128 | return container_of(task_subsys_state(task, cpuset_subsys_id), | 122 | return css_cs(task_css(task, cpuset_subsys_id)); |
129 | struct cpuset, css); | ||
130 | } | 123 | } |
131 | 124 | ||
132 | static inline struct cpuset *parent_cs(const struct cpuset *cs) | 125 | static inline struct cpuset *parent_cs(struct cpuset *cs) |
133 | { | 126 | { |
134 | struct cgroup *pcgrp = cs->css.cgroup->parent; | 127 | return css_cs(css_parent(&cs->css)); |
135 | |||
136 | if (pcgrp) | ||
137 | return cgroup_cs(pcgrp); | ||
138 | return NULL; | ||
139 | } | 128 | } |
140 | 129 | ||
141 | #ifdef CONFIG_NUMA | 130 | #ifdef CONFIG_NUMA |
@@ -212,29 +201,30 @@ static struct cpuset top_cpuset = { | |||
212 | /** | 201 | /** |
213 | * cpuset_for_each_child - traverse online children of a cpuset | 202 | * cpuset_for_each_child - traverse online children of a cpuset |
214 | * @child_cs: loop cursor pointing to the current child | 203 | * @child_cs: loop cursor pointing to the current child |
215 | * @pos_cgrp: used for iteration | 204 | * @pos_css: used for iteration |
216 | * @parent_cs: target cpuset to walk children of | 205 | * @parent_cs: target cpuset to walk children of |
217 | * | 206 | * |
218 | * Walk @child_cs through the online children of @parent_cs. Must be used | 207 | * Walk @child_cs through the online children of @parent_cs. Must be used |
219 | * with RCU read locked. | 208 | * with RCU read locked. |
220 | */ | 209 | */ |
221 | #define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ | 210 | #define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ |
222 | cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ | 211 | css_for_each_child((pos_css), &(parent_cs)->css) \ |
223 | if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) | 212 | if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) |
224 | 213 | ||
225 | /** | 214 | /** |
226 | * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants | 215 | * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants |
227 | * @des_cs: loop cursor pointing to the current descendant | 216 | * @des_cs: loop cursor pointing to the current descendant |
228 | * @pos_cgrp: used for iteration | 217 | * @pos_css: used for iteration |
229 | * @root_cs: target cpuset to walk ancestor of | 218 | * @root_cs: target cpuset to walk ancestor of |
230 | * | 219 | * |
231 | * Walk @des_cs through the online descendants of @root_cs. Must be used | 220 | * Walk @des_cs through the online descendants of @root_cs. Must be used |
232 | * with RCU read locked. The caller may modify @pos_cgrp by calling | 221 | * with RCU read locked. The caller may modify @pos_css by calling |
233 | * cgroup_rightmost_descendant() to skip subtree. | 222 | * css_rightmost_descendant() to skip subtree. @root_cs is included in the |
223 | * iteration and the first node to be visited. | ||
234 | */ | 224 | */ |
235 | #define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ | 225 | #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ |
236 | cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ | 226 | css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ |
237 | if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) | 227 | if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) |
238 | 228 | ||
239 | /* | 229 | /* |
240 | * There are two global mutexes guarding cpuset structures - cpuset_mutex | 230 | * There are two global mutexes guarding cpuset structures - cpuset_mutex |
@@ -320,8 +310,7 @@ static struct file_system_type cpuset_fs_type = { | |||
320 | * | 310 | * |
321 | * Call with callback_mutex held. | 311 | * Call with callback_mutex held. |
322 | */ | 312 | */ |
323 | static void guarantee_online_cpus(const struct cpuset *cs, | 313 | static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) |
324 | struct cpumask *pmask) | ||
325 | { | 314 | { |
326 | while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) | 315 | while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) |
327 | cs = parent_cs(cs); | 316 | cs = parent_cs(cs); |
@@ -339,7 +328,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, | |||
339 | * | 328 | * |
340 | * Call with callback_mutex held. | 329 | * Call with callback_mutex held. |
341 | */ | 330 | */ |
342 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 331 | static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) |
343 | { | 332 | { |
344 | while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) | 333 | while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) |
345 | cs = parent_cs(cs); | 334 | cs = parent_cs(cs); |
@@ -384,7 +373,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | |||
384 | * alloc_trial_cpuset - allocate a trial cpuset | 373 | * alloc_trial_cpuset - allocate a trial cpuset |
385 | * @cs: the cpuset that the trial cpuset duplicates | 374 | * @cs: the cpuset that the trial cpuset duplicates |
386 | */ | 375 | */ |
387 | static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) | 376 | static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) |
388 | { | 377 | { |
389 | struct cpuset *trial; | 378 | struct cpuset *trial; |
390 | 379 | ||
@@ -431,9 +420,9 @@ static void free_trial_cpuset(struct cpuset *trial) | |||
431 | * Return 0 if valid, -errno if not. | 420 | * Return 0 if valid, -errno if not. |
432 | */ | 421 | */ |
433 | 422 | ||
434 | static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | 423 | static int validate_change(struct cpuset *cur, struct cpuset *trial) |
435 | { | 424 | { |
436 | struct cgroup *cgrp; | 425 | struct cgroup_subsys_state *css; |
437 | struct cpuset *c, *par; | 426 | struct cpuset *c, *par; |
438 | int ret; | 427 | int ret; |
439 | 428 | ||
@@ -441,7 +430,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
441 | 430 | ||
442 | /* Each of our child cpusets must be a subset of us */ | 431 | /* Each of our child cpusets must be a subset of us */ |
443 | ret = -EBUSY; | 432 | ret = -EBUSY; |
444 | cpuset_for_each_child(c, cgrp, cur) | 433 | cpuset_for_each_child(c, css, cur) |
445 | if (!is_cpuset_subset(c, trial)) | 434 | if (!is_cpuset_subset(c, trial)) |
446 | goto out; | 435 | goto out; |
447 | 436 | ||
@@ -462,7 +451,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
462 | * overlap | 451 | * overlap |
463 | */ | 452 | */ |
464 | ret = -EINVAL; | 453 | ret = -EINVAL; |
465 | cpuset_for_each_child(c, cgrp, par) { | 454 | cpuset_for_each_child(c, css, par) { |
466 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && | 455 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && |
467 | c != cur && | 456 | c != cur && |
468 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) | 457 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) |
@@ -515,13 +504,16 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, | |||
515 | struct cpuset *root_cs) | 504 | struct cpuset *root_cs) |
516 | { | 505 | { |
517 | struct cpuset *cp; | 506 | struct cpuset *cp; |
518 | struct cgroup *pos_cgrp; | 507 | struct cgroup_subsys_state *pos_css; |
519 | 508 | ||
520 | rcu_read_lock(); | 509 | rcu_read_lock(); |
521 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | 510 | cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { |
511 | if (cp == root_cs) | ||
512 | continue; | ||
513 | |||
522 | /* skip the whole subtree if @cp doesn't have any CPU */ | 514 | /* skip the whole subtree if @cp doesn't have any CPU */ |
523 | if (cpumask_empty(cp->cpus_allowed)) { | 515 | if (cpumask_empty(cp->cpus_allowed)) { |
524 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | 516 | pos_css = css_rightmost_descendant(pos_css); |
525 | continue; | 517 | continue; |
526 | } | 518 | } |
527 | 519 | ||
@@ -596,7 +588,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
596 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | 588 | struct sched_domain_attr *dattr; /* attributes for custom domains */ |
597 | int ndoms = 0; /* number of sched domains in result */ | 589 | int ndoms = 0; /* number of sched domains in result */ |
598 | int nslot; /* next empty doms[] struct cpumask slot */ | 590 | int nslot; /* next empty doms[] struct cpumask slot */ |
599 | struct cgroup *pos_cgrp; | 591 | struct cgroup_subsys_state *pos_css; |
600 | 592 | ||
601 | doms = NULL; | 593 | doms = NULL; |
602 | dattr = NULL; | 594 | dattr = NULL; |
@@ -625,7 +617,9 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
625 | csn = 0; | 617 | csn = 0; |
626 | 618 | ||
627 | rcu_read_lock(); | 619 | rcu_read_lock(); |
628 | cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { | 620 | cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { |
621 | if (cp == &top_cpuset) | ||
622 | continue; | ||
629 | /* | 623 | /* |
630 | * Continue traversing beyond @cp iff @cp has some CPUs and | 624 | * Continue traversing beyond @cp iff @cp has some CPUs and |
631 | * isn't load balancing. The former is obvious. The | 625 | * isn't load balancing. The former is obvious. The |
@@ -642,7 +636,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
642 | csa[csn++] = cp; | 636 | csa[csn++] = cp; |
643 | 637 | ||
644 | /* skip @cp's subtree */ | 638 | /* skip @cp's subtree */ |
645 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | 639 | pos_css = css_rightmost_descendant(pos_css); |
646 | } | 640 | } |
647 | rcu_read_unlock(); | 641 | rcu_read_unlock(); |
648 | 642 | ||
@@ -837,52 +831,45 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) | |||
837 | /** | 831 | /** |
838 | * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's | 832 | * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's |
839 | * @tsk: task to test | 833 | * @tsk: task to test |
840 | * @scan: struct cgroup_scanner containing the cgroup of the task | 834 | * @data: cpuset to @tsk belongs to |
841 | * | 835 | * |
842 | * Called by cgroup_scan_tasks() for each task in a cgroup whose | 836 | * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed |
843 | * cpus_allowed mask needs to be changed. | 837 | * mask needs to be changed. |
844 | * | 838 | * |
845 | * We don't need to re-check for the cgroup/cpuset membership, since we're | 839 | * We don't need to re-check for the cgroup/cpuset membership, since we're |
846 | * holding cpuset_mutex at this point. | 840 | * holding cpuset_mutex at this point. |
847 | */ | 841 | */ |
848 | static void cpuset_change_cpumask(struct task_struct *tsk, | 842 | static void cpuset_change_cpumask(struct task_struct *tsk, void *data) |
849 | struct cgroup_scanner *scan) | ||
850 | { | 843 | { |
851 | struct cpuset *cpus_cs; | 844 | struct cpuset *cs = data; |
845 | struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); | ||
852 | 846 | ||
853 | cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg)); | ||
854 | set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); | 847 | set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); |
855 | } | 848 | } |
856 | 849 | ||
857 | /** | 850 | /** |
858 | * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. | 851 | * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. |
859 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed | 852 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed |
860 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 853 | * @heap: if NULL, defer allocating heap memory to css_scan_tasks() |
861 | * | 854 | * |
862 | * Called with cpuset_mutex held | 855 | * Called with cpuset_mutex held |
863 | * | 856 | * |
864 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 857 | * The css_scan_tasks() function will scan all the tasks in a cgroup, |
865 | * calling callback functions for each. | 858 | * calling callback functions for each. |
866 | * | 859 | * |
867 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | 860 | * No return value. It's guaranteed that css_scan_tasks() always returns 0 |
868 | * if @heap != NULL. | 861 | * if @heap != NULL. |
869 | */ | 862 | */ |
870 | static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) | 863 | static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) |
871 | { | 864 | { |
872 | struct cgroup_scanner scan; | 865 | css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); |
873 | |||
874 | scan.cg = cs->css.cgroup; | ||
875 | scan.test_task = NULL; | ||
876 | scan.process_task = cpuset_change_cpumask; | ||
877 | scan.heap = heap; | ||
878 | cgroup_scan_tasks(&scan); | ||
879 | } | 866 | } |
880 | 867 | ||
881 | /* | 868 | /* |
882 | * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. | 869 | * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. |
883 | * @root_cs: the root cpuset of the hierarchy | 870 | * @root_cs: the root cpuset of the hierarchy |
884 | * @update_root: update root cpuset or not? | 871 | * @update_root: update root cpuset or not? |
885 | * @heap: the heap used by cgroup_scan_tasks() | 872 | * @heap: the heap used by css_scan_tasks() |
886 | * | 873 | * |
887 | * This will update cpumasks of tasks in @root_cs and all other empty cpusets | 874 | * This will update cpumasks of tasks in @root_cs and all other empty cpusets |
888 | * which take on cpumask of @root_cs. | 875 | * which take on cpumask of @root_cs. |
@@ -893,17 +880,19 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, | |||
893 | bool update_root, struct ptr_heap *heap) | 880 | bool update_root, struct ptr_heap *heap) |
894 | { | 881 | { |
895 | struct cpuset *cp; | 882 | struct cpuset *cp; |
896 | struct cgroup *pos_cgrp; | 883 | struct cgroup_subsys_state *pos_css; |
897 | |||
898 | if (update_root) | ||
899 | update_tasks_cpumask(root_cs, heap); | ||
900 | 884 | ||
901 | rcu_read_lock(); | 885 | rcu_read_lock(); |
902 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | 886 | cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { |
903 | /* skip the whole subtree if @cp have some CPU */ | 887 | if (cp == root_cs) { |
904 | if (!cpumask_empty(cp->cpus_allowed)) { | 888 | if (!update_root) |
905 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | 889 | continue; |
906 | continue; | 890 | } else { |
891 | /* skip the whole subtree if @cp have some CPU */ | ||
892 | if (!cpumask_empty(cp->cpus_allowed)) { | ||
893 | pos_css = css_rightmost_descendant(pos_css); | ||
894 | continue; | ||
895 | } | ||
907 | } | 896 | } |
908 | if (!css_tryget(&cp->css)) | 897 | if (!css_tryget(&cp->css)) |
909 | continue; | 898 | continue; |
@@ -1059,20 +1048,24 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, | |||
1059 | task_unlock(tsk); | 1048 | task_unlock(tsk); |
1060 | } | 1049 | } |
1061 | 1050 | ||
1051 | struct cpuset_change_nodemask_arg { | ||
1052 | struct cpuset *cs; | ||
1053 | nodemask_t *newmems; | ||
1054 | }; | ||
1055 | |||
1062 | /* | 1056 | /* |
1063 | * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy | 1057 | * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy |
1064 | * of it to cpuset's new mems_allowed, and migrate pages to new nodes if | 1058 | * of it to cpuset's new mems_allowed, and migrate pages to new nodes if |
1065 | * memory_migrate flag is set. Called with cpuset_mutex held. | 1059 | * memory_migrate flag is set. Called with cpuset_mutex held. |
1066 | */ | 1060 | */ |
1067 | static void cpuset_change_nodemask(struct task_struct *p, | 1061 | static void cpuset_change_nodemask(struct task_struct *p, void *data) |
1068 | struct cgroup_scanner *scan) | ||
1069 | { | 1062 | { |
1070 | struct cpuset *cs = cgroup_cs(scan->cg); | 1063 | struct cpuset_change_nodemask_arg *arg = data; |
1064 | struct cpuset *cs = arg->cs; | ||
1071 | struct mm_struct *mm; | 1065 | struct mm_struct *mm; |
1072 | int migrate; | 1066 | int migrate; |
1073 | nodemask_t *newmems = scan->data; | ||
1074 | 1067 | ||
1075 | cpuset_change_task_nodemask(p, newmems); | 1068 | cpuset_change_task_nodemask(p, arg->newmems); |
1076 | 1069 | ||
1077 | mm = get_task_mm(p); | 1070 | mm = get_task_mm(p); |
1078 | if (!mm) | 1071 | if (!mm) |
@@ -1082,7 +1075,7 @@ static void cpuset_change_nodemask(struct task_struct *p, | |||
1082 | 1075 | ||
1083 | mpol_rebind_mm(mm, &cs->mems_allowed); | 1076 | mpol_rebind_mm(mm, &cs->mems_allowed); |
1084 | if (migrate) | 1077 | if (migrate) |
1085 | cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); | 1078 | cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems); |
1086 | mmput(mm); | 1079 | mmput(mm); |
1087 | } | 1080 | } |
1088 | 1081 | ||
@@ -1091,28 +1084,22 @@ static void *cpuset_being_rebound; | |||
1091 | /** | 1084 | /** |
1092 | * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. | 1085 | * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. |
1093 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed | 1086 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed |
1094 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 1087 | * @heap: if NULL, defer allocating heap memory to css_scan_tasks() |
1095 | * | 1088 | * |
1096 | * Called with cpuset_mutex held | 1089 | * Called with cpuset_mutex held. No return value. It's guaranteed that |
1097 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | 1090 | * css_scan_tasks() always returns 0 if @heap != NULL. |
1098 | * if @heap != NULL. | ||
1099 | */ | 1091 | */ |
1100 | static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) | 1092 | static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) |
1101 | { | 1093 | { |
1102 | static nodemask_t newmems; /* protected by cpuset_mutex */ | 1094 | static nodemask_t newmems; /* protected by cpuset_mutex */ |
1103 | struct cgroup_scanner scan; | ||
1104 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); | 1095 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); |
1096 | struct cpuset_change_nodemask_arg arg = { .cs = cs, | ||
1097 | .newmems = &newmems }; | ||
1105 | 1098 | ||
1106 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ | 1099 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ |
1107 | 1100 | ||
1108 | guarantee_online_mems(mems_cs, &newmems); | 1101 | guarantee_online_mems(mems_cs, &newmems); |
1109 | 1102 | ||
1110 | scan.cg = cs->css.cgroup; | ||
1111 | scan.test_task = NULL; | ||
1112 | scan.process_task = cpuset_change_nodemask; | ||
1113 | scan.heap = heap; | ||
1114 | scan.data = &newmems; | ||
1115 | |||
1116 | /* | 1103 | /* |
1117 | * The mpol_rebind_mm() call takes mmap_sem, which we couldn't | 1104 | * The mpol_rebind_mm() call takes mmap_sem, which we couldn't |
1118 | * take while holding tasklist_lock. Forks can happen - the | 1105 | * take while holding tasklist_lock. Forks can happen - the |
@@ -1123,7 +1110,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) | |||
1123 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() | 1110 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() |
1124 | * is idempotent. Also migrate pages in each mm to new nodes. | 1111 | * is idempotent. Also migrate pages in each mm to new nodes. |
1125 | */ | 1112 | */ |
1126 | cgroup_scan_tasks(&scan); | 1113 | css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); |
1127 | 1114 | ||
1128 | /* | 1115 | /* |
1129 | * All the tasks' nodemasks have been updated, update | 1116 | * All the tasks' nodemasks have been updated, update |
@@ -1139,7 +1126,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) | |||
1139 | * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. | 1126 | * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. |
1140 | * @cs: the root cpuset of the hierarchy | 1127 | * @cs: the root cpuset of the hierarchy |
1141 | * @update_root: update the root cpuset or not? | 1128 | * @update_root: update the root cpuset or not? |
1142 | * @heap: the heap used by cgroup_scan_tasks() | 1129 | * @heap: the heap used by css_scan_tasks() |
1143 | * | 1130 | * |
1144 | * This will update nodemasks of tasks in @root_cs and all other empty cpusets | 1131 | * This will update nodemasks of tasks in @root_cs and all other empty cpusets |
1145 | * which take on nodemask of @root_cs. | 1132 | * which take on nodemask of @root_cs. |
@@ -1150,17 +1137,19 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, | |||
1150 | bool update_root, struct ptr_heap *heap) | 1137 | bool update_root, struct ptr_heap *heap) |
1151 | { | 1138 | { |
1152 | struct cpuset *cp; | 1139 | struct cpuset *cp; |
1153 | struct cgroup *pos_cgrp; | 1140 | struct cgroup_subsys_state *pos_css; |
1154 | |||
1155 | if (update_root) | ||
1156 | update_tasks_nodemask(root_cs, heap); | ||
1157 | 1141 | ||
1158 | rcu_read_lock(); | 1142 | rcu_read_lock(); |
1159 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | 1143 | cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { |
1160 | /* skip the whole subtree if @cp have some CPU */ | 1144 | if (cp == root_cs) { |
1161 | if (!nodes_empty(cp->mems_allowed)) { | 1145 | if (!update_root) |
1162 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | 1146 | continue; |
1163 | continue; | 1147 | } else { |
1148 | /* skip the whole subtree if @cp have some CPU */ | ||
1149 | if (!nodes_empty(cp->mems_allowed)) { | ||
1150 | pos_css = css_rightmost_descendant(pos_css); | ||
1151 | continue; | ||
1152 | } | ||
1164 | } | 1153 | } |
1165 | if (!css_tryget(&cp->css)) | 1154 | if (!css_tryget(&cp->css)) |
1166 | continue; | 1155 | continue; |
@@ -1267,44 +1256,39 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
1267 | return 0; | 1256 | return 0; |
1268 | } | 1257 | } |
1269 | 1258 | ||
1270 | /* | 1259 | /** |
1271 | * cpuset_change_flag - make a task's spread flags the same as its cpuset's | 1260 | * cpuset_change_flag - make a task's spread flags the same as its cpuset's |
1272 | * @tsk: task to be updated | 1261 | * @tsk: task to be updated |
1273 | * @scan: struct cgroup_scanner containing the cgroup of the task | 1262 | * @data: cpuset to @tsk belongs to |
1274 | * | 1263 | * |
1275 | * Called by cgroup_scan_tasks() for each task in a cgroup. | 1264 | * Called by css_scan_tasks() for each task in a cgroup. |
1276 | * | 1265 | * |
1277 | * We don't need to re-check for the cgroup/cpuset membership, since we're | 1266 | * We don't need to re-check for the cgroup/cpuset membership, since we're |
1278 | * holding cpuset_mutex at this point. | 1267 | * holding cpuset_mutex at this point. |
1279 | */ | 1268 | */ |
1280 | static void cpuset_change_flag(struct task_struct *tsk, | 1269 | static void cpuset_change_flag(struct task_struct *tsk, void *data) |
1281 | struct cgroup_scanner *scan) | ||
1282 | { | 1270 | { |
1283 | cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); | 1271 | struct cpuset *cs = data; |
1272 | |||
1273 | cpuset_update_task_spread_flag(cs, tsk); | ||
1284 | } | 1274 | } |
1285 | 1275 | ||
1286 | /* | 1276 | /** |
1287 | * update_tasks_flags - update the spread flags of tasks in the cpuset. | 1277 | * update_tasks_flags - update the spread flags of tasks in the cpuset. |
1288 | * @cs: the cpuset in which each task's spread flags needs to be changed | 1278 | * @cs: the cpuset in which each task's spread flags needs to be changed |
1289 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 1279 | * @heap: if NULL, defer allocating heap memory to css_scan_tasks() |
1290 | * | 1280 | * |
1291 | * Called with cpuset_mutex held | 1281 | * Called with cpuset_mutex held |
1292 | * | 1282 | * |
1293 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 1283 | * The css_scan_tasks() function will scan all the tasks in a cgroup, |
1294 | * calling callback functions for each. | 1284 | * calling callback functions for each. |
1295 | * | 1285 | * |
1296 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | 1286 | * No return value. It's guaranteed that css_scan_tasks() always returns 0 |
1297 | * if @heap != NULL. | 1287 | * if @heap != NULL. |
1298 | */ | 1288 | */ |
1299 | static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) | 1289 | static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) |
1300 | { | 1290 | { |
1301 | struct cgroup_scanner scan; | 1291 | css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); |
1302 | |||
1303 | scan.cg = cs->css.cgroup; | ||
1304 | scan.test_task = NULL; | ||
1305 | scan.process_task = cpuset_change_flag; | ||
1306 | scan.heap = heap; | ||
1307 | cgroup_scan_tasks(&scan); | ||
1308 | } | 1292 | } |
1309 | 1293 | ||
1310 | /* | 1294 | /* |
@@ -1462,9 +1446,10 @@ static int fmeter_getrate(struct fmeter *fmp) | |||
1462 | } | 1446 | } |
1463 | 1447 | ||
1464 | /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ | 1448 | /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ |
1465 | static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 1449 | static int cpuset_can_attach(struct cgroup_subsys_state *css, |
1450 | struct cgroup_taskset *tset) | ||
1466 | { | 1451 | { |
1467 | struct cpuset *cs = cgroup_cs(cgrp); | 1452 | struct cpuset *cs = css_cs(css); |
1468 | struct task_struct *task; | 1453 | struct task_struct *task; |
1469 | int ret; | 1454 | int ret; |
1470 | 1455 | ||
@@ -1475,11 +1460,11 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
1475 | * flag is set. | 1460 | * flag is set. |
1476 | */ | 1461 | */ |
1477 | ret = -ENOSPC; | 1462 | ret = -ENOSPC; |
1478 | if (!cgroup_sane_behavior(cgrp) && | 1463 | if (!cgroup_sane_behavior(css->cgroup) && |
1479 | (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) | 1464 | (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) |
1480 | goto out_unlock; | 1465 | goto out_unlock; |
1481 | 1466 | ||
1482 | cgroup_taskset_for_each(task, cgrp, tset) { | 1467 | cgroup_taskset_for_each(task, css, tset) { |
1483 | /* | 1468 | /* |
1484 | * Kthreads which disallow setaffinity shouldn't be moved | 1469 | * Kthreads which disallow setaffinity shouldn't be moved |
1485 | * to a new cpuset; we don't want to change their cpu | 1470 | * to a new cpuset; we don't want to change their cpu |
@@ -1508,11 +1493,11 @@ out_unlock: | |||
1508 | return ret; | 1493 | return ret; |
1509 | } | 1494 | } |
1510 | 1495 | ||
1511 | static void cpuset_cancel_attach(struct cgroup *cgrp, | 1496 | static void cpuset_cancel_attach(struct cgroup_subsys_state *css, |
1512 | struct cgroup_taskset *tset) | 1497 | struct cgroup_taskset *tset) |
1513 | { | 1498 | { |
1514 | mutex_lock(&cpuset_mutex); | 1499 | mutex_lock(&cpuset_mutex); |
1515 | cgroup_cs(cgrp)->attach_in_progress--; | 1500 | css_cs(css)->attach_in_progress--; |
1516 | mutex_unlock(&cpuset_mutex); | 1501 | mutex_unlock(&cpuset_mutex); |
1517 | } | 1502 | } |
1518 | 1503 | ||
@@ -1523,16 +1508,18 @@ static void cpuset_cancel_attach(struct cgroup *cgrp, | |||
1523 | */ | 1508 | */ |
1524 | static cpumask_var_t cpus_attach; | 1509 | static cpumask_var_t cpus_attach; |
1525 | 1510 | ||
1526 | static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 1511 | static void cpuset_attach(struct cgroup_subsys_state *css, |
1512 | struct cgroup_taskset *tset) | ||
1527 | { | 1513 | { |
1528 | /* static buf protected by cpuset_mutex */ | 1514 | /* static buf protected by cpuset_mutex */ |
1529 | static nodemask_t cpuset_attach_nodemask_to; | 1515 | static nodemask_t cpuset_attach_nodemask_to; |
1530 | struct mm_struct *mm; | 1516 | struct mm_struct *mm; |
1531 | struct task_struct *task; | 1517 | struct task_struct *task; |
1532 | struct task_struct *leader = cgroup_taskset_first(tset); | 1518 | struct task_struct *leader = cgroup_taskset_first(tset); |
1533 | struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); | 1519 | struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset, |
1534 | struct cpuset *cs = cgroup_cs(cgrp); | 1520 | cpuset_subsys_id); |
1535 | struct cpuset *oldcs = cgroup_cs(oldcgrp); | 1521 | struct cpuset *cs = css_cs(css); |
1522 | struct cpuset *oldcs = css_cs(oldcss); | ||
1536 | struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); | 1523 | struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); |
1537 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); | 1524 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); |
1538 | 1525 | ||
@@ -1546,7 +1533,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
1546 | 1533 | ||
1547 | guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); | 1534 | guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); |
1548 | 1535 | ||
1549 | cgroup_taskset_for_each(task, cgrp, tset) { | 1536 | cgroup_taskset_for_each(task, css, tset) { |
1550 | /* | 1537 | /* |
1551 | * can_attach beforehand should guarantee that this doesn't | 1538 | * can_attach beforehand should guarantee that this doesn't |
1552 | * fail. TODO: have a better way to handle failure here | 1539 | * fail. TODO: have a better way to handle failure here |
@@ -1608,9 +1595,10 @@ typedef enum { | |||
1608 | FILE_SPREAD_SLAB, | 1595 | FILE_SPREAD_SLAB, |
1609 | } cpuset_filetype_t; | 1596 | } cpuset_filetype_t; |
1610 | 1597 | ||
1611 | static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | 1598 | static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, |
1599 | u64 val) | ||
1612 | { | 1600 | { |
1613 | struct cpuset *cs = cgroup_cs(cgrp); | 1601 | struct cpuset *cs = css_cs(css); |
1614 | cpuset_filetype_t type = cft->private; | 1602 | cpuset_filetype_t type = cft->private; |
1615 | int retval = 0; | 1603 | int retval = 0; |
1616 | 1604 | ||
@@ -1657,9 +1645,10 @@ out_unlock: | |||
1657 | return retval; | 1645 | return retval; |
1658 | } | 1646 | } |
1659 | 1647 | ||
1660 | static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) | 1648 | static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, |
1649 | s64 val) | ||
1661 | { | 1650 | { |
1662 | struct cpuset *cs = cgroup_cs(cgrp); | 1651 | struct cpuset *cs = css_cs(css); |
1663 | cpuset_filetype_t type = cft->private; | 1652 | cpuset_filetype_t type = cft->private; |
1664 | int retval = -ENODEV; | 1653 | int retval = -ENODEV; |
1665 | 1654 | ||
@@ -1683,10 +1672,10 @@ out_unlock: | |||
1683 | /* | 1672 | /* |
1684 | * Common handling for a write to a "cpus" or "mems" file. | 1673 | * Common handling for a write to a "cpus" or "mems" file. |
1685 | */ | 1674 | */ |
1686 | static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | 1675 | static int cpuset_write_resmask(struct cgroup_subsys_state *css, |
1687 | const char *buf) | 1676 | struct cftype *cft, const char *buf) |
1688 | { | 1677 | { |
1689 | struct cpuset *cs = cgroup_cs(cgrp); | 1678 | struct cpuset *cs = css_cs(css); |
1690 | struct cpuset *trialcs; | 1679 | struct cpuset *trialcs; |
1691 | int retval = -ENODEV; | 1680 | int retval = -ENODEV; |
1692 | 1681 | ||
@@ -1765,13 +1754,12 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
1765 | return count; | 1754 | return count; |
1766 | } | 1755 | } |
1767 | 1756 | ||
1768 | static ssize_t cpuset_common_file_read(struct cgroup *cgrp, | 1757 | static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css, |
1769 | struct cftype *cft, | 1758 | struct cftype *cft, struct file *file, |
1770 | struct file *file, | 1759 | char __user *buf, size_t nbytes, |
1771 | char __user *buf, | 1760 | loff_t *ppos) |
1772 | size_t nbytes, loff_t *ppos) | ||
1773 | { | 1761 | { |
1774 | struct cpuset *cs = cgroup_cs(cgrp); | 1762 | struct cpuset *cs = css_cs(css); |
1775 | cpuset_filetype_t type = cft->private; | 1763 | cpuset_filetype_t type = cft->private; |
1776 | char *page; | 1764 | char *page; |
1777 | ssize_t retval = 0; | 1765 | ssize_t retval = 0; |
@@ -1801,9 +1789,9 @@ out: | |||
1801 | return retval; | 1789 | return retval; |
1802 | } | 1790 | } |
1803 | 1791 | ||
1804 | static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) | 1792 | static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) |
1805 | { | 1793 | { |
1806 | struct cpuset *cs = cgroup_cs(cgrp); | 1794 | struct cpuset *cs = css_cs(css); |
1807 | cpuset_filetype_t type = cft->private; | 1795 | cpuset_filetype_t type = cft->private; |
1808 | switch (type) { | 1796 | switch (type) { |
1809 | case FILE_CPU_EXCLUSIVE: | 1797 | case FILE_CPU_EXCLUSIVE: |
@@ -1832,9 +1820,9 @@ static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) | |||
1832 | return 0; | 1820 | return 0; |
1833 | } | 1821 | } |
1834 | 1822 | ||
1835 | static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) | 1823 | static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) |
1836 | { | 1824 | { |
1837 | struct cpuset *cs = cgroup_cs(cgrp); | 1825 | struct cpuset *cs = css_cs(css); |
1838 | cpuset_filetype_t type = cft->private; | 1826 | cpuset_filetype_t type = cft->private; |
1839 | switch (type) { | 1827 | switch (type) { |
1840 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | 1828 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: |
@@ -1949,11 +1937,12 @@ static struct cftype files[] = { | |||
1949 | * cgrp: control group that the new cpuset will be part of | 1937 | * cgrp: control group that the new cpuset will be part of |
1950 | */ | 1938 | */ |
1951 | 1939 | ||
1952 | static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) | 1940 | static struct cgroup_subsys_state * |
1941 | cpuset_css_alloc(struct cgroup_subsys_state *parent_css) | ||
1953 | { | 1942 | { |
1954 | struct cpuset *cs; | 1943 | struct cpuset *cs; |
1955 | 1944 | ||
1956 | if (!cgrp->parent) | 1945 | if (!parent_css) |
1957 | return &top_cpuset.css; | 1946 | return &top_cpuset.css; |
1958 | 1947 | ||
1959 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); | 1948 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); |
@@ -1973,12 +1962,12 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) | |||
1973 | return &cs->css; | 1962 | return &cs->css; |
1974 | } | 1963 | } |
1975 | 1964 | ||
1976 | static int cpuset_css_online(struct cgroup *cgrp) | 1965 | static int cpuset_css_online(struct cgroup_subsys_state *css) |
1977 | { | 1966 | { |
1978 | struct cpuset *cs = cgroup_cs(cgrp); | 1967 | struct cpuset *cs = css_cs(css); |
1979 | struct cpuset *parent = parent_cs(cs); | 1968 | struct cpuset *parent = parent_cs(cs); |
1980 | struct cpuset *tmp_cs; | 1969 | struct cpuset *tmp_cs; |
1981 | struct cgroup *pos_cg; | 1970 | struct cgroup_subsys_state *pos_css; |
1982 | 1971 | ||
1983 | if (!parent) | 1972 | if (!parent) |
1984 | return 0; | 1973 | return 0; |
@@ -1993,7 +1982,7 @@ static int cpuset_css_online(struct cgroup *cgrp) | |||
1993 | 1982 | ||
1994 | number_of_cpusets++; | 1983 | number_of_cpusets++; |
1995 | 1984 | ||
1996 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) | 1985 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) |
1997 | goto out_unlock; | 1986 | goto out_unlock; |
1998 | 1987 | ||
1999 | /* | 1988 | /* |
@@ -2010,7 +1999,7 @@ static int cpuset_css_online(struct cgroup *cgrp) | |||
2010 | * (and likewise for mems) to the new cgroup. | 1999 | * (and likewise for mems) to the new cgroup. |
2011 | */ | 2000 | */ |
2012 | rcu_read_lock(); | 2001 | rcu_read_lock(); |
2013 | cpuset_for_each_child(tmp_cs, pos_cg, parent) { | 2002 | cpuset_for_each_child(tmp_cs, pos_css, parent) { |
2014 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { | 2003 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { |
2015 | rcu_read_unlock(); | 2004 | rcu_read_unlock(); |
2016 | goto out_unlock; | 2005 | goto out_unlock; |
@@ -2027,9 +2016,15 @@ out_unlock: | |||
2027 | return 0; | 2016 | return 0; |
2028 | } | 2017 | } |
2029 | 2018 | ||
2030 | static void cpuset_css_offline(struct cgroup *cgrp) | 2019 | /* |
2020 | * If the cpuset being removed has its flag 'sched_load_balance' | ||
2021 | * enabled, then simulate turning sched_load_balance off, which | ||
2022 | * will call rebuild_sched_domains_locked(). | ||
2023 | */ | ||
2024 | |||
2025 | static void cpuset_css_offline(struct cgroup_subsys_state *css) | ||
2031 | { | 2026 | { |
2032 | struct cpuset *cs = cgroup_cs(cgrp); | 2027 | struct cpuset *cs = css_cs(css); |
2033 | 2028 | ||
2034 | mutex_lock(&cpuset_mutex); | 2029 | mutex_lock(&cpuset_mutex); |
2035 | 2030 | ||
@@ -2042,15 +2037,9 @@ static void cpuset_css_offline(struct cgroup *cgrp) | |||
2042 | mutex_unlock(&cpuset_mutex); | 2037 | mutex_unlock(&cpuset_mutex); |
2043 | } | 2038 | } |
2044 | 2039 | ||
2045 | /* | 2040 | static void cpuset_css_free(struct cgroup_subsys_state *css) |
2046 | * If the cpuset being removed has its flag 'sched_load_balance' | ||
2047 | * enabled, then simulate turning sched_load_balance off, which | ||
2048 | * will call rebuild_sched_domains_locked(). | ||
2049 | */ | ||
2050 | |||
2051 | static void cpuset_css_free(struct cgroup *cgrp) | ||
2052 | { | 2041 | { |
2053 | struct cpuset *cs = cgroup_cs(cgrp); | 2042 | struct cpuset *cs = css_cs(css); |
2054 | 2043 | ||
2055 | free_cpumask_var(cs->cpus_allowed); | 2044 | free_cpumask_var(cs->cpus_allowed); |
2056 | kfree(cs); | 2045 | kfree(cs); |
@@ -2257,11 +2246,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
2257 | /* if cpus or mems changed, we need to propagate to descendants */ | 2246 | /* if cpus or mems changed, we need to propagate to descendants */ |
2258 | if (cpus_updated || mems_updated) { | 2247 | if (cpus_updated || mems_updated) { |
2259 | struct cpuset *cs; | 2248 | struct cpuset *cs; |
2260 | struct cgroup *pos_cgrp; | 2249 | struct cgroup_subsys_state *pos_css; |
2261 | 2250 | ||
2262 | rcu_read_lock(); | 2251 | rcu_read_lock(); |
2263 | cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { | 2252 | cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { |
2264 | if (!css_tryget(&cs->css)) | 2253 | if (cs == &top_cpuset || !css_tryget(&cs->css)) |
2265 | continue; | 2254 | continue; |
2266 | rcu_read_unlock(); | 2255 | rcu_read_unlock(); |
2267 | 2256 | ||
@@ -2350,7 +2339,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) | |||
2350 | 2339 | ||
2351 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) | 2340 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) |
2352 | { | 2341 | { |
2353 | const struct cpuset *cpus_cs; | 2342 | struct cpuset *cpus_cs; |
2354 | 2343 | ||
2355 | rcu_read_lock(); | 2344 | rcu_read_lock(); |
2356 | cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); | 2345 | cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); |
@@ -2423,7 +2412,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) | |||
2423 | * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall | 2412 | * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall |
2424 | * (an unusual configuration), then returns the root cpuset. | 2413 | * (an unusual configuration), then returns the root cpuset. |
2425 | */ | 2414 | */ |
2426 | static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) | 2415 | static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) |
2427 | { | 2416 | { |
2428 | while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) | 2417 | while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) |
2429 | cs = parent_cs(cs); | 2418 | cs = parent_cs(cs); |
@@ -2493,7 +2482,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) | |||
2493 | */ | 2482 | */ |
2494 | int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) | 2483 | int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) |
2495 | { | 2484 | { |
2496 | const struct cpuset *cs; /* current cpuset ancestors */ | 2485 | struct cpuset *cs; /* current cpuset ancestors */ |
2497 | int allowed; /* is allocation in zone z allowed? */ | 2486 | int allowed; /* is allocation in zone z allowed? */ |
2498 | 2487 | ||
2499 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) | 2488 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) |
@@ -2731,7 +2720,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) | |||
2731 | goto out_free; | 2720 | goto out_free; |
2732 | 2721 | ||
2733 | rcu_read_lock(); | 2722 | rcu_read_lock(); |
2734 | css = task_subsys_state(tsk, cpuset_subsys_id); | 2723 | css = task_css(tsk, cpuset_subsys_id); |
2735 | retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); | 2724 | retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); |
2736 | rcu_read_unlock(); | 2725 | rcu_read_unlock(); |
2737 | if (retval < 0) | 2726 | if (retval < 0) |
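The cpuset hunks above are one mechanical conversion: every cftype handler and cgroup_subsys callback now receives a struct cgroup_subsys_state * instead of a struct cgroup *, and the cpuset is recovered from the css with container_of() rather than by a lookup through the cgroup. A minimal sketch of the pattern; the css_cs() body is inferred from how it is used in these hunks, and the handler name is hypothetical:

    /* sketch: map the css handed to the new-style callbacks back to its cpuset */
    static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
    {
            return css ? container_of(css, struct cpuset, css) : NULL;
    }

    /* a read handler in the new style takes the css directly */
    static u64 example_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
    {
            struct cpuset *cs = css_cs(css);

            return is_cpu_exclusive(cs);    /* is_cpu_exclusive() as used above */
    }

The iterators change the same way: cpuset_for_each_child() and cpuset_for_each_descendant_pre() now advance a struct cgroup_subsys_state *pos_css cursor instead of a struct cgroup * one, and the new descendant walk visits the starting css itself, hence the explicit cs == &top_cpuset skip added to the hotplug path.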
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c77206184b8b..97b67df8fbfe 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c | |||
@@ -116,6 +116,9 @@ int get_callchain_buffers(void) | |||
116 | 116 | ||
117 | err = alloc_callchain_buffers(); | 117 | err = alloc_callchain_buffers(); |
118 | exit: | 118 | exit: |
119 | if (err) | ||
120 | atomic_dec(&nr_callchain_events); | ||
121 | |||
119 | mutex_unlock(&callchain_mutex); | 122 | mutex_unlock(&callchain_mutex); |
120 | 123 | ||
121 | return err; | 124 | return err; |
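The callchain fix closes an accounting leak: nr_callchain_events is bumped before the buffers are allocated, so a failed allocation has to drop it again before the mutex is released. A simplified sketch of the resulting shape (the real function also short-circuits when the buffers already exist):

    mutex_lock(&callchain_mutex);
    if (atomic_inc_return(&nr_callchain_events) == 1)
            err = alloc_callchain_buffers();
    if (err)
            atomic_dec(&nr_callchain_events);   /* undo the increment on failure */
    mutex_unlock(&callchain_mutex);
    return err;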
diff --git a/kernel/events/core.c b/kernel/events/core.c index f86599e8c123..2207efc941d1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -145,6 +145,7 @@ static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); | |||
145 | static atomic_t nr_mmap_events __read_mostly; | 145 | static atomic_t nr_mmap_events __read_mostly; |
146 | static atomic_t nr_comm_events __read_mostly; | 146 | static atomic_t nr_comm_events __read_mostly; |
147 | static atomic_t nr_task_events __read_mostly; | 147 | static atomic_t nr_task_events __read_mostly; |
148 | static atomic_t nr_freq_events __read_mostly; | ||
148 | 149 | ||
149 | static LIST_HEAD(pmus); | 150 | static LIST_HEAD(pmus); |
150 | static DEFINE_MUTEX(pmus_lock); | 151 | static DEFINE_MUTEX(pmus_lock); |
@@ -340,8 +341,8 @@ struct perf_cgroup { | |||
340 | static inline struct perf_cgroup * | 341 | static inline struct perf_cgroup * |
341 | perf_cgroup_from_task(struct task_struct *task) | 342 | perf_cgroup_from_task(struct task_struct *task) |
342 | { | 343 | { |
343 | return container_of(task_subsys_state(task, perf_subsys_id), | 344 | return container_of(task_css(task, perf_subsys_id), |
344 | struct perf_cgroup, css); | 345 | struct perf_cgroup, css); |
345 | } | 346 | } |
346 | 347 | ||
347 | static inline bool | 348 | static inline bool |
@@ -591,7 +592,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
591 | if (!f.file) | 592 | if (!f.file) |
592 | return -EBADF; | 593 | return -EBADF; |
593 | 594 | ||
594 | css = cgroup_css_from_dir(f.file, perf_subsys_id); | 595 | rcu_read_lock(); |
596 | |||
597 | css = css_from_dir(f.file->f_dentry, &perf_subsys); | ||
595 | if (IS_ERR(css)) { | 598 | if (IS_ERR(css)) { |
596 | ret = PTR_ERR(css); | 599 | ret = PTR_ERR(css); |
597 | goto out; | 600 | goto out; |
@@ -617,6 +620,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
617 | ret = -EINVAL; | 620 | ret = -EINVAL; |
618 | } | 621 | } |
619 | out: | 622 | out: |
623 | rcu_read_unlock(); | ||
620 | fdput(f); | 624 | fdput(f); |
621 | return ret; | 625 | return ret; |
622 | } | 626 | } |
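css_from_dir() returns an RCU-protected pointer, so the lookup and everything that dereferences its result are now bracketed by rcu_read_lock()/rcu_read_unlock(); anything that must outlive the critical section has to take its own reference before the unlock. A condensed restatement of the new control flow, keeping only the RCU-relevant lines:

    rcu_read_lock();
    css = css_from_dir(f.file->f_dentry, &perf_subsys);
    if (IS_ERR(css)) {
            ret = PTR_ERR(css);
            goto out;
    }
    /* ... pin whatever must escape the critical section, then ... */
    out:
    rcu_read_unlock();
    fdput(f);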
@@ -869,12 +873,8 @@ static void perf_pmu_rotate_start(struct pmu *pmu) | |||
869 | 873 | ||
870 | WARN_ON(!irqs_disabled()); | 874 | WARN_ON(!irqs_disabled()); |
871 | 875 | ||
872 | if (list_empty(&cpuctx->rotation_list)) { | 876 | if (list_empty(&cpuctx->rotation_list)) |
873 | int was_empty = list_empty(head); | ||
874 | list_add(&cpuctx->rotation_list, head); | 877 | list_add(&cpuctx->rotation_list, head); |
875 | if (was_empty) | ||
876 | tick_nohz_full_kick(); | ||
877 | } | ||
878 | } | 878 | } |
879 | 879 | ||
880 | static void get_ctx(struct perf_event_context *ctx) | 880 | static void get_ctx(struct perf_event_context *ctx) |
@@ -1216,6 +1216,9 @@ static void perf_event__id_header_size(struct perf_event *event) | |||
1216 | if (sample_type & PERF_SAMPLE_TIME) | 1216 | if (sample_type & PERF_SAMPLE_TIME) |
1217 | size += sizeof(data->time); | 1217 | size += sizeof(data->time); |
1218 | 1218 | ||
1219 | if (sample_type & PERF_SAMPLE_IDENTIFIER) | ||
1220 | size += sizeof(data->id); | ||
1221 | |||
1219 | if (sample_type & PERF_SAMPLE_ID) | 1222 | if (sample_type & PERF_SAMPLE_ID) |
1220 | size += sizeof(data->id); | 1223 | size += sizeof(data->id); |
1221 | 1224 | ||
@@ -2712,7 +2715,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, | |||
2712 | 2715 | ||
2713 | hwc = &event->hw; | 2716 | hwc = &event->hw; |
2714 | 2717 | ||
2715 | if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { | 2718 | if (hwc->interrupts == MAX_INTERRUPTS) { |
2716 | hwc->interrupts = 0; | 2719 | hwc->interrupts = 0; |
2717 | perf_log_throttle(event, 1); | 2720 | perf_log_throttle(event, 1); |
2718 | event->pmu->start(event, 0); | 2721 | event->pmu->start(event, 0); |
@@ -2811,10 +2814,11 @@ done: | |||
2811 | #ifdef CONFIG_NO_HZ_FULL | 2814 | #ifdef CONFIG_NO_HZ_FULL |
2812 | bool perf_event_can_stop_tick(void) | 2815 | bool perf_event_can_stop_tick(void) |
2813 | { | 2816 | { |
2814 | if (list_empty(&__get_cpu_var(rotation_list))) | 2817 | if (atomic_read(&nr_freq_events) || |
2815 | return true; | 2818 | __this_cpu_read(perf_throttled_count)) |
2816 | else | ||
2817 | return false; | 2819 | return false; |
2820 | else | ||
2821 | return true; | ||
2818 | } | 2822 | } |
2819 | #endif | 2823 | #endif |
2820 | 2824 | ||
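perf_event_can_stop_tick() no longer keys off the rotation list: the tick must keep running only while frequency-sampling events exist anywhere (nr_freq_events) or this CPU still has a throttled event waiting to be unthrottled (perf_throttled_count). The counterpart kicks appear later in this diff: account_event() kicks all nohz_full CPUs when the first attr.freq event is created, and __perf_event_overflow() kicks the local tick when it throttles. The new condition, condensed to its equivalent form:

    bool perf_event_can_stop_tick(void)
    {
            /* keep the tick while freq events or local throttling need it */
            return !atomic_read(&nr_freq_events) &&
                   !__this_cpu_read(perf_throttled_count);
    }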
@@ -3128,36 +3132,63 @@ static void free_event_rcu(struct rcu_head *head) | |||
3128 | static void ring_buffer_put(struct ring_buffer *rb); | 3132 | static void ring_buffer_put(struct ring_buffer *rb); |
3129 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); | 3133 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); |
3130 | 3134 | ||
3131 | static void free_event(struct perf_event *event) | 3135 | static void unaccount_event_cpu(struct perf_event *event, int cpu) |
3132 | { | 3136 | { |
3133 | irq_work_sync(&event->pending); | 3137 | if (event->parent) |
3138 | return; | ||
3139 | |||
3140 | if (has_branch_stack(event)) { | ||
3141 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
3142 | atomic_dec(&per_cpu(perf_branch_stack_events, cpu)); | ||
3143 | } | ||
3144 | if (is_cgroup_event(event)) | ||
3145 | atomic_dec(&per_cpu(perf_cgroup_events, cpu)); | ||
3146 | } | ||
3134 | 3147 | ||
3148 | static void unaccount_event(struct perf_event *event) | ||
3149 | { | ||
3150 | if (event->parent) | ||
3151 | return; | ||
3152 | |||
3153 | if (event->attach_state & PERF_ATTACH_TASK) | ||
3154 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3155 | if (event->attr.mmap || event->attr.mmap_data) | ||
3156 | atomic_dec(&nr_mmap_events); | ||
3157 | if (event->attr.comm) | ||
3158 | atomic_dec(&nr_comm_events); | ||
3159 | if (event->attr.task) | ||
3160 | atomic_dec(&nr_task_events); | ||
3161 | if (event->attr.freq) | ||
3162 | atomic_dec(&nr_freq_events); | ||
3163 | if (is_cgroup_event(event)) | ||
3164 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3165 | if (has_branch_stack(event)) | ||
3166 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3167 | |||
3168 | unaccount_event_cpu(event, event->cpu); | ||
3169 | } | ||
3170 | |||
3171 | static void __free_event(struct perf_event *event) | ||
3172 | { | ||
3135 | if (!event->parent) { | 3173 | if (!event->parent) { |
3136 | if (event->attach_state & PERF_ATTACH_TASK) | ||
3137 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3138 | if (event->attr.mmap || event->attr.mmap_data) | ||
3139 | atomic_dec(&nr_mmap_events); | ||
3140 | if (event->attr.comm) | ||
3141 | atomic_dec(&nr_comm_events); | ||
3142 | if (event->attr.task) | ||
3143 | atomic_dec(&nr_task_events); | ||
3144 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) | 3174 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) |
3145 | put_callchain_buffers(); | 3175 | put_callchain_buffers(); |
3146 | if (is_cgroup_event(event)) { | ||
3147 | atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); | ||
3148 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3149 | } | ||
3150 | |||
3151 | if (has_branch_stack(event)) { | ||
3152 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3153 | /* is system-wide event */ | ||
3154 | if (!(event->attach_state & PERF_ATTACH_TASK)) { | ||
3155 | atomic_dec(&per_cpu(perf_branch_stack_events, | ||
3156 | event->cpu)); | ||
3157 | } | ||
3158 | } | ||
3159 | } | 3176 | } |
3160 | 3177 | ||
3178 | if (event->destroy) | ||
3179 | event->destroy(event); | ||
3180 | |||
3181 | if (event->ctx) | ||
3182 | put_ctx(event->ctx); | ||
3183 | |||
3184 | call_rcu(&event->rcu_head, free_event_rcu); | ||
3185 | } | ||
3186 | static void free_event(struct perf_event *event) | ||
3187 | { | ||
3188 | irq_work_sync(&event->pending); | ||
3189 | |||
3190 | unaccount_event(event); | ||
3191 | |||
3161 | if (event->rb) { | 3192 | if (event->rb) { |
3162 | struct ring_buffer *rb; | 3193 | struct ring_buffer *rb; |
3163 | 3194 | ||
@@ -3180,13 +3211,8 @@ static void free_event(struct perf_event *event) | |||
3180 | if (is_cgroup_event(event)) | 3211 | if (is_cgroup_event(event)) |
3181 | perf_detach_cgroup(event); | 3212 | perf_detach_cgroup(event); |
3182 | 3213 | ||
3183 | if (event->destroy) | ||
3184 | event->destroy(event); | ||
3185 | |||
3186 | if (event->ctx) | ||
3187 | put_ctx(event->ctx); | ||
3188 | 3214 | ||
3189 | call_rcu(&event->rcu_head, free_event_rcu); | 3215 | __free_event(event); |
3190 | } | 3216 | } |
3191 | 3217 | ||
3192 | int perf_event_release_kernel(struct perf_event *event) | 3218 | int perf_event_release_kernel(struct perf_event *event) |
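The teardown is split so that an event which was never fully accounted can still be disposed of cleanly: __free_event() undoes only what perf_event_alloc() itself set up (callchain buffers, the destroy callback, the context reference, the RCU free), while free_event() remains the full path that additionally syncs pending irq_work, reverses the accounting, and detaches the ring buffer and cgroup before falling through to __free_event(). Condensed from the hunks above:

    static void __free_event(struct perf_event *event)
    {
            if (!event->parent &&
                (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
                    put_callchain_buffers();
            if (event->destroy)
                    event->destroy(event);
            if (event->ctx)
                    put_ctx(event->ctx);
            call_rcu(&event->rcu_head, free_event_rcu);
    }

    static void free_event(struct perf_event *event)
    {
            irq_work_sync(&event->pending);
            unaccount_event(event);         /* mirror of account_event() */
            /* ... detach ring buffer and cgroup ... */
            __free_event(event);
    }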
@@ -3544,6 +3570,15 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
3544 | case PERF_EVENT_IOC_PERIOD: | 3570 | case PERF_EVENT_IOC_PERIOD: |
3545 | return perf_event_period(event, (u64 __user *)arg); | 3571 | return perf_event_period(event, (u64 __user *)arg); |
3546 | 3572 | ||
3573 | case PERF_EVENT_IOC_ID: | ||
3574 | { | ||
3575 | u64 id = primary_event_id(event); | ||
3576 | |||
3577 | if (copy_to_user((void __user *)arg, &id, sizeof(id))) | ||
3578 | return -EFAULT; | ||
3579 | return 0; | ||
3580 | } | ||
3581 | |||
3547 | case PERF_EVENT_IOC_SET_OUTPUT: | 3582 | case PERF_EVENT_IOC_SET_OUTPUT: |
3548 | { | 3583 | { |
3549 | int ret; | 3584 | int ret; |
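PERF_EVENT_IOC_ID lets userspace ask an event fd which ID its records will carry, which is what makes the fixed-position PERF_SAMPLE_IDENTIFIER layout (next hunks) usable for demultiplexing several events sharing one ring buffer. A hedged userspace sketch; it assumes uapi headers new enough to define PERF_EVENT_IOC_ID and PERF_SAMPLE_IDENTIFIER, i.e. headers from this series onward:

    #include <linux/perf_event.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <unistd.h>
    #include <stdint.h>
    #include <stdio.h>

    static int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                   int cpu, int group_fd, unsigned long flags)
    {
            return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
    }

    int main(void)
    {
            struct perf_event_attr attr = {
                    .type        = PERF_TYPE_HARDWARE,
                    .size        = sizeof(attr),
                    .config      = PERF_COUNT_HW_CPU_CYCLES,
                    .sample_type = PERF_SAMPLE_IDENTIFIER | PERF_SAMPLE_IP,
            };
            uint64_t id;
            int fd = sys_perf_event_open(&attr, 0, -1, -1, 0);

            if (fd < 0)
                    return 1;
            /* ask the kernel which ID this event's records will carry */
            if (ioctl(fd, PERF_EVENT_IOC_ID, &id) == 0)
                    printf("event id: %llu\n", (unsigned long long)id);
            close(fd);
            return 0;
    }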
@@ -3641,6 +3676,10 @@ void perf_event_update_userpage(struct perf_event *event) | |||
3641 | u64 enabled, running, now; | 3676 | u64 enabled, running, now; |
3642 | 3677 | ||
3643 | rcu_read_lock(); | 3678 | rcu_read_lock(); |
3679 | rb = rcu_dereference(event->rb); | ||
3680 | if (!rb) | ||
3681 | goto unlock; | ||
3682 | |||
3644 | /* | 3683 | /* |
3645 | * compute total_time_enabled, total_time_running | 3684 | * compute total_time_enabled, total_time_running |
3646 | * based on snapshot values taken when the event | 3685 | * based on snapshot values taken when the event |
@@ -3651,12 +3690,8 @@ void perf_event_update_userpage(struct perf_event *event) | |||
3651 | * NMI context | 3690 | * NMI context |
3652 | */ | 3691 | */ |
3653 | calc_timer_values(event, &now, &enabled, &running); | 3692 | calc_timer_values(event, &now, &enabled, &running); |
3654 | rb = rcu_dereference(event->rb); | ||
3655 | if (!rb) | ||
3656 | goto unlock; | ||
3657 | 3693 | ||
3658 | userpg = rb->user_page; | 3694 | userpg = rb->user_page; |
3659 | |||
3660 | /* | 3695 | /* |
3661 | * Disable preemption so as to not let the corresponding user-space | 3696 | * Disable preemption so as to not let the corresponding user-space |
3662 | * spin too long if we get preempted. | 3697 | * spin too long if we get preempted. |
@@ -4251,7 +4286,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header, | |||
4251 | if (sample_type & PERF_SAMPLE_TIME) | 4286 | if (sample_type & PERF_SAMPLE_TIME) |
4252 | data->time = perf_clock(); | 4287 | data->time = perf_clock(); |
4253 | 4288 | ||
4254 | if (sample_type & PERF_SAMPLE_ID) | 4289 | if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) |
4255 | data->id = primary_event_id(event); | 4290 | data->id = primary_event_id(event); |
4256 | 4291 | ||
4257 | if (sample_type & PERF_SAMPLE_STREAM_ID) | 4292 | if (sample_type & PERF_SAMPLE_STREAM_ID) |
@@ -4290,6 +4325,9 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle, | |||
4290 | 4325 | ||
4291 | if (sample_type & PERF_SAMPLE_CPU) | 4326 | if (sample_type & PERF_SAMPLE_CPU) |
4292 | perf_output_put(handle, data->cpu_entry); | 4327 | perf_output_put(handle, data->cpu_entry); |
4328 | |||
4329 | if (sample_type & PERF_SAMPLE_IDENTIFIER) | ||
4330 | perf_output_put(handle, data->id); | ||
4293 | } | 4331 | } |
4294 | 4332 | ||
4295 | void perf_event__output_id_sample(struct perf_event *event, | 4333 | void perf_event__output_id_sample(struct perf_event *event, |
@@ -4355,7 +4393,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
4355 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { | 4393 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { |
4356 | n = 0; | 4394 | n = 0; |
4357 | 4395 | ||
4358 | if (sub != event) | 4396 | if ((sub != event) && |
4397 | (sub->state == PERF_EVENT_STATE_ACTIVE)) | ||
4359 | sub->pmu->read(sub); | 4398 | sub->pmu->read(sub); |
4360 | 4399 | ||
4361 | values[n++] = perf_event_count(sub); | 4400 | values[n++] = perf_event_count(sub); |
@@ -4402,6 +4441,9 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
4402 | 4441 | ||
4403 | perf_output_put(handle, *header); | 4442 | perf_output_put(handle, *header); |
4404 | 4443 | ||
4444 | if (sample_type & PERF_SAMPLE_IDENTIFIER) | ||
4445 | perf_output_put(handle, data->id); | ||
4446 | |||
4405 | if (sample_type & PERF_SAMPLE_IP) | 4447 | if (sample_type & PERF_SAMPLE_IP) |
4406 | perf_output_put(handle, data->ip); | 4448 | perf_output_put(handle, data->ip); |
4407 | 4449 | ||
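Taken together, the PERF_SAMPLE_IDENTIFIER hunks pin the event ID to a fixed offset: it is emitted first in the body of a PERF_RECORD_SAMPLE and last in the id trailer appended to non-sample records, so a consumer can recover the ID without knowing the event's full sample_type. A hedged parsing sketch, assuming the event was opened with PERF_SAMPLE_IDENTIFIER and attr.sample_id_all so non-sample records carry the trailer (needs <linux/perf_event.h>, <stdint.h>, <stddef.h>):

    /* return the identifier of a record read from the perf mmap buffer */
    static uint64_t record_id(const struct perf_event_header *hdr)
    {
            const uint64_t *payload = (const uint64_t *)(hdr + 1);
            size_t words = (hdr->size - sizeof(*hdr)) / sizeof(uint64_t);

            if (hdr->type == PERF_RECORD_SAMPLE)
                    return payload[0];          /* identifier comes first... */
            return payload[words - 1];          /* ...and last in the trailer */
    }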
@@ -4462,20 +4504,6 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
4462 | } | 4504 | } |
4463 | } | 4505 | } |
4464 | 4506 | ||
4465 | if (!event->attr.watermark) { | ||
4466 | int wakeup_events = event->attr.wakeup_events; | ||
4467 | |||
4468 | if (wakeup_events) { | ||
4469 | struct ring_buffer *rb = handle->rb; | ||
4470 | int events = local_inc_return(&rb->events); | ||
4471 | |||
4472 | if (events >= wakeup_events) { | ||
4473 | local_sub(wakeup_events, &rb->events); | ||
4474 | local_inc(&rb->wakeup); | ||
4475 | } | ||
4476 | } | ||
4477 | } | ||
4478 | |||
4479 | if (sample_type & PERF_SAMPLE_BRANCH_STACK) { | 4507 | if (sample_type & PERF_SAMPLE_BRANCH_STACK) { |
4480 | if (data->br_stack) { | 4508 | if (data->br_stack) { |
4481 | size_t size; | 4509 | size_t size; |
@@ -4511,16 +4539,31 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
4511 | } | 4539 | } |
4512 | } | 4540 | } |
4513 | 4541 | ||
4514 | if (sample_type & PERF_SAMPLE_STACK_USER) | 4542 | if (sample_type & PERF_SAMPLE_STACK_USER) { |
4515 | perf_output_sample_ustack(handle, | 4543 | perf_output_sample_ustack(handle, |
4516 | data->stack_user_size, | 4544 | data->stack_user_size, |
4517 | data->regs_user.regs); | 4545 | data->regs_user.regs); |
4546 | } | ||
4518 | 4547 | ||
4519 | if (sample_type & PERF_SAMPLE_WEIGHT) | 4548 | if (sample_type & PERF_SAMPLE_WEIGHT) |
4520 | perf_output_put(handle, data->weight); | 4549 | perf_output_put(handle, data->weight); |
4521 | 4550 | ||
4522 | if (sample_type & PERF_SAMPLE_DATA_SRC) | 4551 | if (sample_type & PERF_SAMPLE_DATA_SRC) |
4523 | perf_output_put(handle, data->data_src.val); | 4552 | perf_output_put(handle, data->data_src.val); |
4553 | |||
4554 | if (!event->attr.watermark) { | ||
4555 | int wakeup_events = event->attr.wakeup_events; | ||
4556 | |||
4557 | if (wakeup_events) { | ||
4558 | struct ring_buffer *rb = handle->rb; | ||
4559 | int events = local_inc_return(&rb->events); | ||
4560 | |||
4561 | if (events >= wakeup_events) { | ||
4562 | local_sub(wakeup_events, &rb->events); | ||
4563 | local_inc(&rb->wakeup); | ||
4564 | } | ||
4565 | } | ||
4566 | } | ||
4524 | } | 4567 | } |
4525 | 4568 | ||
4526 | void perf_prepare_sample(struct perf_event_header *header, | 4569 | void perf_prepare_sample(struct perf_event_header *header, |
@@ -4680,12 +4723,10 @@ perf_event_read_event(struct perf_event *event, | |||
4680 | perf_output_end(&handle); | 4723 | perf_output_end(&handle); |
4681 | } | 4724 | } |
4682 | 4725 | ||
4683 | typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data); | ||
4684 | typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); | 4726 | typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); |
4685 | 4727 | ||
4686 | static void | 4728 | static void |
4687 | perf_event_aux_ctx(struct perf_event_context *ctx, | 4729 | perf_event_aux_ctx(struct perf_event_context *ctx, |
4688 | perf_event_aux_match_cb match, | ||
4689 | perf_event_aux_output_cb output, | 4730 | perf_event_aux_output_cb output, |
4690 | void *data) | 4731 | void *data) |
4691 | { | 4732 | { |
@@ -4696,15 +4737,12 @@ perf_event_aux_ctx(struct perf_event_context *ctx, | |||
4696 | continue; | 4737 | continue; |
4697 | if (!event_filter_match(event)) | 4738 | if (!event_filter_match(event)) |
4698 | continue; | 4739 | continue; |
4699 | if (match(event, data)) | 4740 | output(event, data); |
4700 | output(event, data); | ||
4701 | } | 4741 | } |
4702 | } | 4742 | } |
4703 | 4743 | ||
4704 | static void | 4744 | static void |
4705 | perf_event_aux(perf_event_aux_match_cb match, | 4745 | perf_event_aux(perf_event_aux_output_cb output, void *data, |
4706 | perf_event_aux_output_cb output, | ||
4707 | void *data, | ||
4708 | struct perf_event_context *task_ctx) | 4746 | struct perf_event_context *task_ctx) |
4709 | { | 4747 | { |
4710 | struct perf_cpu_context *cpuctx; | 4748 | struct perf_cpu_context *cpuctx; |
@@ -4717,7 +4755,7 @@ perf_event_aux(perf_event_aux_match_cb match, | |||
4717 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 4755 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
4718 | if (cpuctx->unique_pmu != pmu) | 4756 | if (cpuctx->unique_pmu != pmu) |
4719 | goto next; | 4757 | goto next; |
4720 | perf_event_aux_ctx(&cpuctx->ctx, match, output, data); | 4758 | perf_event_aux_ctx(&cpuctx->ctx, output, data); |
4721 | if (task_ctx) | 4759 | if (task_ctx) |
4722 | goto next; | 4760 | goto next; |
4723 | ctxn = pmu->task_ctx_nr; | 4761 | ctxn = pmu->task_ctx_nr; |
@@ -4725,14 +4763,14 @@ perf_event_aux(perf_event_aux_match_cb match, | |||
4725 | goto next; | 4763 | goto next; |
4726 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | 4764 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); |
4727 | if (ctx) | 4765 | if (ctx) |
4728 | perf_event_aux_ctx(ctx, match, output, data); | 4766 | perf_event_aux_ctx(ctx, output, data); |
4729 | next: | 4767 | next: |
4730 | put_cpu_ptr(pmu->pmu_cpu_context); | 4768 | put_cpu_ptr(pmu->pmu_cpu_context); |
4731 | } | 4769 | } |
4732 | 4770 | ||
4733 | if (task_ctx) { | 4771 | if (task_ctx) { |
4734 | preempt_disable(); | 4772 | preempt_disable(); |
4735 | perf_event_aux_ctx(task_ctx, match, output, data); | 4773 | perf_event_aux_ctx(task_ctx, output, data); |
4736 | preempt_enable(); | 4774 | preempt_enable(); |
4737 | } | 4775 | } |
4738 | rcu_read_unlock(); | 4776 | rcu_read_unlock(); |
@@ -4741,7 +4779,7 @@ next: | |||
4741 | /* | 4779 | /* |
4742 | * task tracking -- fork/exit | 4780 | * task tracking -- fork/exit |
4743 | * | 4781 | * |
4744 | * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task | 4782 | * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task |
4745 | */ | 4783 | */ |
4746 | 4784 | ||
4747 | struct perf_task_event { | 4785 | struct perf_task_event { |
@@ -4759,6 +4797,13 @@ struct perf_task_event { | |||
4759 | } event_id; | 4797 | } event_id; |
4760 | }; | 4798 | }; |
4761 | 4799 | ||
4800 | static int perf_event_task_match(struct perf_event *event) | ||
4801 | { | ||
4802 | return event->attr.comm || event->attr.mmap || | ||
4803 | event->attr.mmap2 || event->attr.mmap_data || | ||
4804 | event->attr.task; | ||
4805 | } | ||
4806 | |||
4762 | static void perf_event_task_output(struct perf_event *event, | 4807 | static void perf_event_task_output(struct perf_event *event, |
4763 | void *data) | 4808 | void *data) |
4764 | { | 4809 | { |
@@ -4768,6 +4813,9 @@ static void perf_event_task_output(struct perf_event *event, | |||
4768 | struct task_struct *task = task_event->task; | 4813 | struct task_struct *task = task_event->task; |
4769 | int ret, size = task_event->event_id.header.size; | 4814 | int ret, size = task_event->event_id.header.size; |
4770 | 4815 | ||
4816 | if (!perf_event_task_match(event)) | ||
4817 | return; | ||
4818 | |||
4771 | perf_event_header__init_id(&task_event->event_id.header, &sample, event); | 4819 | perf_event_header__init_id(&task_event->event_id.header, &sample, event); |
4772 | 4820 | ||
4773 | ret = perf_output_begin(&handle, event, | 4821 | ret = perf_output_begin(&handle, event, |
@@ -4790,13 +4838,6 @@ out: | |||
4790 | task_event->event_id.header.size = size; | 4838 | task_event->event_id.header.size = size; |
4791 | } | 4839 | } |
4792 | 4840 | ||
4793 | static int perf_event_task_match(struct perf_event *event, | ||
4794 | void *data __maybe_unused) | ||
4795 | { | ||
4796 | return event->attr.comm || event->attr.mmap || | ||
4797 | event->attr.mmap_data || event->attr.task; | ||
4798 | } | ||
4799 | |||
4800 | static void perf_event_task(struct task_struct *task, | 4841 | static void perf_event_task(struct task_struct *task, |
4801 | struct perf_event_context *task_ctx, | 4842 | struct perf_event_context *task_ctx, |
4802 | int new) | 4843 | int new) |
@@ -4825,8 +4866,7 @@ static void perf_event_task(struct task_struct *task, | |||
4825 | }, | 4866 | }, |
4826 | }; | 4867 | }; |
4827 | 4868 | ||
4828 | perf_event_aux(perf_event_task_match, | 4869 | perf_event_aux(perf_event_task_output, |
4829 | perf_event_task_output, | ||
4830 | &task_event, | 4870 | &task_event, |
4831 | task_ctx); | 4871 | task_ctx); |
4832 | } | 4872 | } |
@@ -4853,6 +4893,11 @@ struct perf_comm_event { | |||
4853 | } event_id; | 4893 | } event_id; |
4854 | }; | 4894 | }; |
4855 | 4895 | ||
4896 | static int perf_event_comm_match(struct perf_event *event) | ||
4897 | { | ||
4898 | return event->attr.comm; | ||
4899 | } | ||
4900 | |||
4856 | static void perf_event_comm_output(struct perf_event *event, | 4901 | static void perf_event_comm_output(struct perf_event *event, |
4857 | void *data) | 4902 | void *data) |
4858 | { | 4903 | { |
@@ -4862,6 +4907,9 @@ static void perf_event_comm_output(struct perf_event *event, | |||
4862 | int size = comm_event->event_id.header.size; | 4907 | int size = comm_event->event_id.header.size; |
4863 | int ret; | 4908 | int ret; |
4864 | 4909 | ||
4910 | if (!perf_event_comm_match(event)) | ||
4911 | return; | ||
4912 | |||
4865 | perf_event_header__init_id(&comm_event->event_id.header, &sample, event); | 4913 | perf_event_header__init_id(&comm_event->event_id.header, &sample, event); |
4866 | ret = perf_output_begin(&handle, event, | 4914 | ret = perf_output_begin(&handle, event, |
4867 | comm_event->event_id.header.size); | 4915 | comm_event->event_id.header.size); |
@@ -4883,12 +4931,6 @@ out: | |||
4883 | comm_event->event_id.header.size = size; | 4931 | comm_event->event_id.header.size = size; |
4884 | } | 4932 | } |
4885 | 4933 | ||
4886 | static int perf_event_comm_match(struct perf_event *event, | ||
4887 | void *data __maybe_unused) | ||
4888 | { | ||
4889 | return event->attr.comm; | ||
4890 | } | ||
4891 | |||
4892 | static void perf_event_comm_event(struct perf_comm_event *comm_event) | 4934 | static void perf_event_comm_event(struct perf_comm_event *comm_event) |
4893 | { | 4935 | { |
4894 | char comm[TASK_COMM_LEN]; | 4936 | char comm[TASK_COMM_LEN]; |
@@ -4903,8 +4945,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
4903 | 4945 | ||
4904 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; | 4946 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; |
4905 | 4947 | ||
4906 | perf_event_aux(perf_event_comm_match, | 4948 | perf_event_aux(perf_event_comm_output, |
4907 | perf_event_comm_output, | ||
4908 | comm_event, | 4949 | comm_event, |
4909 | NULL); | 4950 | NULL); |
4910 | } | 4951 | } |
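The perf_event_aux() rework removes the separate match callback: the iterator now invokes the output callback for every candidate event, and each output routine filters for itself by calling its own *_match() helper first (perf_event_task_match(), perf_event_comm_match(), and, in the next hunks, perf_event_mmap_match()). The resulting shape of such a callback, with hypothetical names standing in for any side-band record type:

    static void example_output(struct perf_event *event, void *data)
    {
            struct example_side_band *rec = data;   /* hypothetical payload */

            if (!example_match(event))              /* filtering moved in here */
                    return;

            /* ... perf_event_header__init_id(), perf_output_begin(),
             *     put the payload, perf_output_end() ... */
    }

    /* the emit site shrinks to a single callback plus its payload: */
    perf_event_aux(example_output, rec, NULL);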
@@ -4955,6 +4996,9 @@ struct perf_mmap_event { | |||
4955 | 4996 | ||
4956 | const char *file_name; | 4997 | const char *file_name; |
4957 | int file_size; | 4998 | int file_size; |
4999 | int maj, min; | ||
5000 | u64 ino; | ||
5001 | u64 ino_generation; | ||
4958 | 5002 | ||
4959 | struct { | 5003 | struct { |
4960 | struct perf_event_header header; | 5004 | struct perf_event_header header; |
@@ -4967,6 +5011,17 @@ struct perf_mmap_event { | |||
4967 | } event_id; | 5011 | } event_id; |
4968 | }; | 5012 | }; |
4969 | 5013 | ||
5014 | static int perf_event_mmap_match(struct perf_event *event, | ||
5015 | void *data) | ||
5016 | { | ||
5017 | struct perf_mmap_event *mmap_event = data; | ||
5018 | struct vm_area_struct *vma = mmap_event->vma; | ||
5019 | int executable = vma->vm_flags & VM_EXEC; | ||
5020 | |||
5021 | return (!executable && event->attr.mmap_data) || | ||
5022 | (executable && (event->attr.mmap || event->attr.mmap2)); | ||
5023 | } | ||
5024 | |||
4970 | static void perf_event_mmap_output(struct perf_event *event, | 5025 | static void perf_event_mmap_output(struct perf_event *event, |
4971 | void *data) | 5026 | void *data) |
4972 | { | 5027 | { |
@@ -4976,6 +5031,16 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
4976 | int size = mmap_event->event_id.header.size; | 5031 | int size = mmap_event->event_id.header.size; |
4977 | int ret; | 5032 | int ret; |
4978 | 5033 | ||
5034 | if (!perf_event_mmap_match(event, data)) | ||
5035 | return; | ||
5036 | |||
5037 | if (event->attr.mmap2) { | ||
5038 | mmap_event->event_id.header.type = PERF_RECORD_MMAP2; | ||
5039 | mmap_event->event_id.header.size += sizeof(mmap_event->maj); | ||
5040 | mmap_event->event_id.header.size += sizeof(mmap_event->min); | ||
5041 | mmap_event->event_id.header.size += sizeof(mmap_event->ino); | ||
5042 | } | ||
5043 | |||
4979 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); | 5044 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); |
4980 | ret = perf_output_begin(&handle, event, | 5045 | ret = perf_output_begin(&handle, event, |
4981 | mmap_event->event_id.header.size); | 5046 | mmap_event->event_id.header.size); |
@@ -4986,6 +5051,14 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
4986 | mmap_event->event_id.tid = perf_event_tid(event, current); | 5051 | mmap_event->event_id.tid = perf_event_tid(event, current); |
4987 | 5052 | ||
4988 | perf_output_put(&handle, mmap_event->event_id); | 5053 | perf_output_put(&handle, mmap_event->event_id); |
5054 | |||
5055 | if (event->attr.mmap2) { | ||
5056 | perf_output_put(&handle, mmap_event->maj); | ||
5057 | perf_output_put(&handle, mmap_event->min); | ||
5058 | perf_output_put(&handle, mmap_event->ino); | ||
5059 | perf_output_put(&handle, mmap_event->ino_generation); | ||
5060 | } | ||
5061 | |||
4989 | __output_copy(&handle, mmap_event->file_name, | 5062 | __output_copy(&handle, mmap_event->file_name, |
4990 | mmap_event->file_size); | 5063 | mmap_event->file_size); |
4991 | 5064 | ||
@@ -4996,21 +5069,12 @@ out: | |||
4996 | mmap_event->event_id.header.size = size; | 5069 | mmap_event->event_id.header.size = size; |
4997 | } | 5070 | } |
4998 | 5071 | ||
4999 | static int perf_event_mmap_match(struct perf_event *event, | ||
5000 | void *data) | ||
5001 | { | ||
5002 | struct perf_mmap_event *mmap_event = data; | ||
5003 | struct vm_area_struct *vma = mmap_event->vma; | ||
5004 | int executable = vma->vm_flags & VM_EXEC; | ||
5005 | |||
5006 | return (!executable && event->attr.mmap_data) || | ||
5007 | (executable && event->attr.mmap); | ||
5008 | } | ||
5009 | |||
5010 | static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | 5072 | static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) |
5011 | { | 5073 | { |
5012 | struct vm_area_struct *vma = mmap_event->vma; | 5074 | struct vm_area_struct *vma = mmap_event->vma; |
5013 | struct file *file = vma->vm_file; | 5075 | struct file *file = vma->vm_file; |
5076 | int maj = 0, min = 0; | ||
5077 | u64 ino = 0, gen = 0; | ||
5014 | unsigned int size; | 5078 | unsigned int size; |
5015 | char tmp[16]; | 5079 | char tmp[16]; |
5016 | char *buf = NULL; | 5080 | char *buf = NULL; |
@@ -5019,6 +5083,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
5019 | memset(tmp, 0, sizeof(tmp)); | 5083 | memset(tmp, 0, sizeof(tmp)); |
5020 | 5084 | ||
5021 | if (file) { | 5085 | if (file) { |
5086 | struct inode *inode; | ||
5087 | dev_t dev; | ||
5022 | /* | 5088 | /* |
5023 | * d_path works from the end of the rb backwards, so we | 5089 | * d_path works from the end of the rb backwards, so we |
5024 | * need to add enough zero bytes after the string to handle | 5090 | * need to add enough zero bytes after the string to handle |
@@ -5034,6 +5100,13 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
5034 | name = strncpy(tmp, "//toolong", sizeof(tmp)); | 5100 | name = strncpy(tmp, "//toolong", sizeof(tmp)); |
5035 | goto got_name; | 5101 | goto got_name; |
5036 | } | 5102 | } |
5103 | inode = file_inode(vma->vm_file); | ||
5104 | dev = inode->i_sb->s_dev; | ||
5105 | ino = inode->i_ino; | ||
5106 | gen = inode->i_generation; | ||
5107 | maj = MAJOR(dev); | ||
5108 | min = MINOR(dev); | ||
5109 | |||
5037 | } else { | 5110 | } else { |
5038 | if (arch_vma_name(mmap_event->vma)) { | 5111 | if (arch_vma_name(mmap_event->vma)) { |
5039 | name = strncpy(tmp, arch_vma_name(mmap_event->vma), | 5112 | name = strncpy(tmp, arch_vma_name(mmap_event->vma), |
@@ -5064,14 +5137,17 @@ got_name: | |||
5064 | 5137 | ||
5065 | mmap_event->file_name = name; | 5138 | mmap_event->file_name = name; |
5066 | mmap_event->file_size = size; | 5139 | mmap_event->file_size = size; |
5140 | mmap_event->maj = maj; | ||
5141 | mmap_event->min = min; | ||
5142 | mmap_event->ino = ino; | ||
5143 | mmap_event->ino_generation = gen; | ||
5067 | 5144 | ||
5068 | if (!(vma->vm_flags & VM_EXEC)) | 5145 | if (!(vma->vm_flags & VM_EXEC)) |
5069 | mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; | 5146 | mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; |
5070 | 5147 | ||
5071 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; | 5148 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; |
5072 | 5149 | ||
5073 | perf_event_aux(perf_event_mmap_match, | 5150 | perf_event_aux(perf_event_mmap_output, |
5074 | perf_event_mmap_output, | ||
5075 | mmap_event, | 5151 | mmap_event, |
5076 | NULL); | 5152 | NULL); |
5077 | 5153 | ||
@@ -5101,6 +5177,10 @@ void perf_event_mmap(struct vm_area_struct *vma) | |||
5101 | .len = vma->vm_end - vma->vm_start, | 5177 | .len = vma->vm_end - vma->vm_start, |
5102 | .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, | 5178 | .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, |
5103 | }, | 5179 | }, |
5180 | /* .maj (attr_mmap2 only) */ | ||
5181 | /* .min (attr_mmap2 only) */ | ||
5182 | /* .ino (attr_mmap2 only) */ | ||
5183 | /* .ino_generation (attr_mmap2 only) */ | ||
5104 | }; | 5184 | }; |
5105 | 5185 | ||
5106 | perf_event_mmap_event(&mmap_event); | 5186 | perf_event_mmap_event(&mmap_event); |
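With attr.mmap2 set, mappings are reported as PERF_RECORD_MMAP2, which is the old MMAP record with device and inode identity inserted between pgoff and the filename, so tools can still match a mapping to its file after renames or when paths are ambiguous. Approximate emission order of the body as produced above; field names and widths are a sketch, not a uapi quote:

    #include <linux/types.h>

    struct mmap2_record_body {              /* follows the perf_event_header */
            __u32 pid, tid;
            __u64 addr, len, pgoff;
            __u32 maj, min;                 /* device of the backing file */
            __u64 ino, ino_generation;      /* inode identity */
            char  filename[];               /* NUL-padded to a __u64 boundary */
    };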
@@ -5178,6 +5258,7 @@ static int __perf_event_overflow(struct perf_event *event, | |||
5178 | __this_cpu_inc(perf_throttled_count); | 5258 | __this_cpu_inc(perf_throttled_count); |
5179 | hwc->interrupts = MAX_INTERRUPTS; | 5259 | hwc->interrupts = MAX_INTERRUPTS; |
5180 | perf_log_throttle(event, 0); | 5260 | perf_log_throttle(event, 0); |
5261 | tick_nohz_full_kick(); | ||
5181 | ret = 1; | 5262 | ret = 1; |
5182 | } | 5263 | } |
5183 | } | 5264 | } |
@@ -6443,6 +6524,44 @@ unlock: | |||
6443 | return pmu; | 6524 | return pmu; |
6444 | } | 6525 | } |
6445 | 6526 | ||
6527 | static void account_event_cpu(struct perf_event *event, int cpu) | ||
6528 | { | ||
6529 | if (event->parent) | ||
6530 | return; | ||
6531 | |||
6532 | if (has_branch_stack(event)) { | ||
6533 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
6534 | atomic_inc(&per_cpu(perf_branch_stack_events, cpu)); | ||
6535 | } | ||
6536 | if (is_cgroup_event(event)) | ||
6537 | atomic_inc(&per_cpu(perf_cgroup_events, cpu)); | ||
6538 | } | ||
6539 | |||
6540 | static void account_event(struct perf_event *event) | ||
6541 | { | ||
6542 | if (event->parent) | ||
6543 | return; | ||
6544 | |||
6545 | if (event->attach_state & PERF_ATTACH_TASK) | ||
6546 | static_key_slow_inc(&perf_sched_events.key); | ||
6547 | if (event->attr.mmap || event->attr.mmap_data) | ||
6548 | atomic_inc(&nr_mmap_events); | ||
6549 | if (event->attr.comm) | ||
6550 | atomic_inc(&nr_comm_events); | ||
6551 | if (event->attr.task) | ||
6552 | atomic_inc(&nr_task_events); | ||
6553 | if (event->attr.freq) { | ||
6554 | if (atomic_inc_return(&nr_freq_events) == 1) | ||
6555 | tick_nohz_full_kick_all(); | ||
6556 | } | ||
6557 | if (has_branch_stack(event)) | ||
6558 | static_key_slow_inc(&perf_sched_events.key); | ||
6559 | if (is_cgroup_event(event)) | ||
6560 | static_key_slow_inc(&perf_sched_events.key); | ||
6561 | |||
6562 | account_event_cpu(event, event->cpu); | ||
6563 | } | ||
6564 | |||
6446 | /* | 6565 | /* |
6447 | * Allocate and initialize a event structure | 6566 | * Allocate and initialize a event structure |
6448 | */ | 6567 | */ |
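account_event()/account_event_cpu() are the mirror of the unaccount pair earlier in this diff: the static-key bumps, nr_*_events counters and per-CPU cgroup/branch-stack counters that perf_event_open() used to update inline are now done in one place, once the event is known to be viable, which is what allows the error paths to dispose of a half-built event with plain __free_event(). The one behavioural addition is the frequency kick, which pairs with the perf_event_can_stop_tick() change above:

    if (event->attr.freq) {
            /* first freq event anywhere: restart stopped nohz_full ticks */
            if (atomic_inc_return(&nr_freq_events) == 1)
                    tick_nohz_full_kick_all();
    }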
@@ -6457,7 +6576,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
6457 | struct pmu *pmu; | 6576 | struct pmu *pmu; |
6458 | struct perf_event *event; | 6577 | struct perf_event *event; |
6459 | struct hw_perf_event *hwc; | 6578 | struct hw_perf_event *hwc; |
6460 | long err; | 6579 | long err = -EINVAL; |
6461 | 6580 | ||
6462 | if ((unsigned)cpu >= nr_cpu_ids) { | 6581 | if ((unsigned)cpu >= nr_cpu_ids) { |
6463 | if (!task || cpu != -1) | 6582 | if (!task || cpu != -1) |
@@ -6540,49 +6659,35 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
6540 | * we currently do not support PERF_FORMAT_GROUP on inherited events | 6659 | * we currently do not support PERF_FORMAT_GROUP on inherited events |
6541 | */ | 6660 | */ |
6542 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) | 6661 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) |
6543 | goto done; | 6662 | goto err_ns; |
6544 | 6663 | ||
6545 | pmu = perf_init_event(event); | 6664 | pmu = perf_init_event(event); |
6546 | |||
6547 | done: | ||
6548 | err = 0; | ||
6549 | if (!pmu) | 6665 | if (!pmu) |
6550 | err = -EINVAL; | 6666 | goto err_ns; |
6551 | else if (IS_ERR(pmu)) | 6667 | else if (IS_ERR(pmu)) { |
6552 | err = PTR_ERR(pmu); | 6668 | err = PTR_ERR(pmu); |
6553 | 6669 | goto err_ns; | |
6554 | if (err) { | ||
6555 | if (event->ns) | ||
6556 | put_pid_ns(event->ns); | ||
6557 | kfree(event); | ||
6558 | return ERR_PTR(err); | ||
6559 | } | 6670 | } |
6560 | 6671 | ||
6561 | if (!event->parent) { | 6672 | if (!event->parent) { |
6562 | if (event->attach_state & PERF_ATTACH_TASK) | ||
6563 | static_key_slow_inc(&perf_sched_events.key); | ||
6564 | if (event->attr.mmap || event->attr.mmap_data) | ||
6565 | atomic_inc(&nr_mmap_events); | ||
6566 | if (event->attr.comm) | ||
6567 | atomic_inc(&nr_comm_events); | ||
6568 | if (event->attr.task) | ||
6569 | atomic_inc(&nr_task_events); | ||
6570 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { | 6673 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { |
6571 | err = get_callchain_buffers(); | 6674 | err = get_callchain_buffers(); |
6572 | if (err) { | 6675 | if (err) |
6573 | free_event(event); | 6676 | goto err_pmu; |
6574 | return ERR_PTR(err); | ||
6575 | } | ||
6576 | } | ||
6577 | if (has_branch_stack(event)) { | ||
6578 | static_key_slow_inc(&perf_sched_events.key); | ||
6579 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
6580 | atomic_inc(&per_cpu(perf_branch_stack_events, | ||
6581 | event->cpu)); | ||
6582 | } | 6677 | } |
6583 | } | 6678 | } |
6584 | 6679 | ||
6585 | return event; | 6680 | return event; |
6681 | |||
6682 | err_pmu: | ||
6683 | if (event->destroy) | ||
6684 | event->destroy(event); | ||
6685 | err_ns: | ||
6686 | if (event->ns) | ||
6687 | put_pid_ns(event->ns); | ||
6688 | kfree(event); | ||
6689 | |||
6690 | return ERR_PTR(err); | ||
6586 | } | 6691 | } |
6587 | 6692 | ||
6588 | static int perf_copy_attr(struct perf_event_attr __user *uattr, | 6693 | static int perf_copy_attr(struct perf_event_attr __user *uattr, |
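perf_event_alloc() now follows the conventional goto-unwind style: err defaults to -EINVAL, each failure jumps to the label that releases exactly what has been set up so far, and the callchain-buffer failure goes through err_pmu so the pmu's destroy callback runs before the namespace reference and the event itself are dropped. The skeleton of that pattern, with hypothetical names rather than perf's:

    obj->a = setup_a(obj);
    if (IS_ERR(obj->a)) {
            err = PTR_ERR(obj->a);
            goto err_free;
    }
    err = setup_b(obj);
    if (err)
            goto err_a;             /* undo a, then fall through */

    return obj;

    err_a:
    teardown_a(obj->a);
    err_free:
    kfree(obj);
    return ERR_PTR(err);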
@@ -6864,17 +6969,14 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6864 | 6969 | ||
6865 | if (flags & PERF_FLAG_PID_CGROUP) { | 6970 | if (flags & PERF_FLAG_PID_CGROUP) { |
6866 | err = perf_cgroup_connect(pid, event, &attr, group_leader); | 6971 | err = perf_cgroup_connect(pid, event, &attr, group_leader); |
6867 | if (err) | 6972 | if (err) { |
6868 | goto err_alloc; | 6973 | __free_event(event); |
6869 | /* | 6974 | goto err_task; |
6870 | * one more event: | 6975 | } |
6871 | * - that has cgroup constraint on event->cpu | ||
6872 | * - that may need work on context switch | ||
6873 | */ | ||
6874 | atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); | ||
6875 | static_key_slow_inc(&perf_sched_events.key); | ||
6876 | } | 6976 | } |
6877 | 6977 | ||
6978 | account_event(event); | ||
6979 | |||
6878 | /* | 6980 | /* |
6879 | * Special case software events and allow them to be part of | 6981 | * Special case software events and allow them to be part of |
6880 | * any hardware group. | 6982 | * any hardware group. |
@@ -7070,6 +7172,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
7070 | goto err; | 7172 | goto err; |
7071 | } | 7173 | } |
7072 | 7174 | ||
7175 | account_event(event); | ||
7176 | |||
7073 | ctx = find_get_context(event->pmu, task, cpu); | 7177 | ctx = find_get_context(event->pmu, task, cpu); |
7074 | if (IS_ERR(ctx)) { | 7178 | if (IS_ERR(ctx)) { |
7075 | err = PTR_ERR(ctx); | 7179 | err = PTR_ERR(ctx); |
@@ -7106,6 +7210,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) | |||
7106 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, | 7210 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, |
7107 | event_entry) { | 7211 | event_entry) { |
7108 | perf_remove_from_context(event); | 7212 | perf_remove_from_context(event); |
7213 | unaccount_event_cpu(event, src_cpu); | ||
7109 | put_ctx(src_ctx); | 7214 | put_ctx(src_ctx); |
7110 | list_add(&event->event_entry, &events); | 7215 | list_add(&event->event_entry, &events); |
7111 | } | 7216 | } |
@@ -7118,6 +7223,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) | |||
7118 | list_del(&event->event_entry); | 7223 | list_del(&event->event_entry); |
7119 | if (event->state >= PERF_EVENT_STATE_OFF) | 7224 | if (event->state >= PERF_EVENT_STATE_OFF) |
7120 | event->state = PERF_EVENT_STATE_INACTIVE; | 7225 | event->state = PERF_EVENT_STATE_INACTIVE; |
7226 | account_event_cpu(event, dst_cpu); | ||
7121 | perf_install_in_context(dst_ctx, event, dst_cpu); | 7227 | perf_install_in_context(dst_ctx, event, dst_cpu); |
7122 | get_ctx(dst_ctx); | 7228 | get_ctx(dst_ctx); |
7123 | } | 7229 | } |
@@ -7798,7 +7904,8 @@ unlock: | |||
7798 | device_initcall(perf_event_sysfs_init); | 7904 | device_initcall(perf_event_sysfs_init); |
7799 | 7905 | ||
7800 | #ifdef CONFIG_CGROUP_PERF | 7906 | #ifdef CONFIG_CGROUP_PERF |
7801 | static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) | 7907 | static struct cgroup_subsys_state * |
7908 | perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | ||
7802 | { | 7909 | { |
7803 | struct perf_cgroup *jc; | 7910 | struct perf_cgroup *jc; |
7804 | 7911 | ||
@@ -7815,11 +7922,10 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) | |||
7815 | return &jc->css; | 7922 | return &jc->css; |
7816 | } | 7923 | } |
7817 | 7924 | ||
7818 | static void perf_cgroup_css_free(struct cgroup *cont) | 7925 | static void perf_cgroup_css_free(struct cgroup_subsys_state *css) |
7819 | { | 7926 | { |
7820 | struct perf_cgroup *jc; | 7927 | struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css); |
7821 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), | 7928 | |
7822 | struct perf_cgroup, css); | ||
7823 | free_percpu(jc->info); | 7929 | free_percpu(jc->info); |
7824 | kfree(jc); | 7930 | kfree(jc); |
7825 | } | 7931 | } |
@@ -7831,15 +7937,17 @@ static int __perf_cgroup_move(void *info) | |||
7831 | return 0; | 7937 | return 0; |
7832 | } | 7938 | } |
7833 | 7939 | ||
7834 | static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 7940 | static void perf_cgroup_attach(struct cgroup_subsys_state *css, |
7941 | struct cgroup_taskset *tset) | ||
7835 | { | 7942 | { |
7836 | struct task_struct *task; | 7943 | struct task_struct *task; |
7837 | 7944 | ||
7838 | cgroup_taskset_for_each(task, cgrp, tset) | 7945 | cgroup_taskset_for_each(task, css, tset) |
7839 | task_function_call(task, __perf_cgroup_move, task); | 7946 | task_function_call(task, __perf_cgroup_move, task); |
7840 | } | 7947 | } |
7841 | 7948 | ||
7842 | static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, | 7949 | static void perf_cgroup_exit(struct cgroup_subsys_state *css, |
7950 | struct cgroup_subsys_state *old_css, | ||
7843 | struct task_struct *task) | 7951 | struct task_struct *task) |
7844 | { | 7952 | { |
7845 | /* | 7953 | /* |
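The perf cgroup glue gets the same css conversion as cpuset at the top of this diff; the extra detail is that cgroup_taskset_for_each() now takes the css rather than the cgroup when walking the tasks being attached. An attach handler in the new form, with a hypothetical subsystem name standing in for perf:

    static void example_attach(struct cgroup_subsys_state *css,
                               struct cgroup_taskset *tset)
    {
            struct task_struct *task;

            cgroup_taskset_for_each(task, css, tset)
                    /* per-task work; perf sends an IPI via task_function_call() */
                    task_function_call(task, __example_move, task);
    }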
diff --git a/kernel/fork.c b/kernel/fork.c index e23bb19e2a3e..bf46287c91a4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1177,7 +1177,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1177 | * don't allow the creation of threads. | 1177 | * don't allow the creation of threads. |
1178 | */ | 1178 | */ |
1179 | if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && | 1179 | if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && |
1180 | (task_active_pid_ns(current) != current->nsproxy->pid_ns)) | 1180 | (task_active_pid_ns(current) != |
1181 | current->nsproxy->pid_ns_for_children)) | ||
1181 | return ERR_PTR(-EINVAL); | 1182 | return ERR_PTR(-EINVAL); |
1182 | 1183 | ||
1183 | retval = security_task_create(clone_flags); | 1184 | retval = security_task_create(clone_flags); |
@@ -1351,7 +1352,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1351 | 1352 | ||
1352 | if (pid != &init_struct_pid) { | 1353 | if (pid != &init_struct_pid) { |
1353 | retval = -ENOMEM; | 1354 | retval = -ENOMEM; |
1354 | pid = alloc_pid(p->nsproxy->pid_ns); | 1355 | pid = alloc_pid(p->nsproxy->pid_ns_for_children); |
1355 | if (!pid) | 1356 | if (!pid) |
1356 | goto bad_fork_cleanup_io; | 1357 | goto bad_fork_cleanup_io; |
1357 | } | 1358 | } |
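These fork.c hunks and the nsproxy.c hunks at the end of this diff are one rename: the nsproxy field never described the task's own pid namespace, only the one its future children are allocated in, so it becomes pid_ns_for_children and copy_process() compares it against task_active_pid_ns(). The distinction in a short kernel-side sketch, names as in the hunks:

    /* the namespace this task itself lives in, derived from its struct pid */
    struct pid_namespace *active = task_active_pid_ns(current);

    /* the namespace a child forked right now would be placed in */
    struct pid_namespace *for_children =
            current->nsproxy->pid_ns_for_children;

    /* the two diverge after setns() or unshare() of the pid namespace and
     * before the next fork(); copy_process() refuses CLONE_VM or CLONE_NEWPID
     * clones in that window */
    if (active != for_children)
            return ERR_PTR(-EINVAL);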
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 6df614912b9d..3e97fb126e6b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/lockdep.h> | 15 | #include <linux/lockdep.h> |
16 | #include <linux/export.h> | 16 | #include <linux/export.h> |
17 | #include <linux/sysctl.h> | 17 | #include <linux/sysctl.h> |
18 | #include <linux/utsname.h> | ||
18 | 19 | ||
19 | /* | 20 | /* |
20 | * The number of tasks checked: | 21 | * The number of tasks checked: |
@@ -99,10 +100,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
99 | * Ok, the task did not get scheduled for more than 2 minutes, | 100 | * Ok, the task did not get scheduled for more than 2 minutes, |
100 | * complain: | 101 | * complain: |
101 | */ | 102 | */ |
102 | printk(KERN_ERR "INFO: task %s:%d blocked for more than " | 103 | pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", |
103 | "%ld seconds.\n", t->comm, t->pid, timeout); | 104 | t->comm, t->pid, timeout); |
104 | printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | 105 | pr_err(" %s %s %.*s\n", |
105 | " disables this message.\n"); | 106 | print_tainted(), init_utsname()->release, |
107 | (int)strcspn(init_utsname()->version, " "), | ||
108 | init_utsname()->version); | ||
109 | pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | ||
110 | " disables this message.\n"); | ||
106 | sched_show_task(t); | 111 | sched_show_task(t); |
107 | debug_show_held_locks(t); | 112 | debug_show_held_locks(t); |
108 | 113 | ||
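Besides switching to pr_err(), the hung-task report now identifies the kernel it came from: taint flags, release, and the version string cut at its first space, so the long build-host/date tail stays out of the one-line summary. The truncation idiom in isolation:

    const char *ver = init_utsname()->version;

    /* %.*s prints at most the computed length: just the leading "#<buildnr>" token */
    pr_err(" %s %s %.*s\n", print_tainted(), init_utsname()->release,
           (int)strcspn(ver, " "), ver);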
diff --git a/kernel/lglock.c b/kernel/lglock.c index 6535a667a5a7..86ae2aebf004 100644 --- a/kernel/lglock.c +++ b/kernel/lglock.c | |||
@@ -21,7 +21,7 @@ void lg_local_lock(struct lglock *lg) | |||
21 | arch_spinlock_t *lock; | 21 | arch_spinlock_t *lock; |
22 | 22 | ||
23 | preempt_disable(); | 23 | preempt_disable(); |
24 | rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); | 24 | lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); |
25 | lock = this_cpu_ptr(lg->lock); | 25 | lock = this_cpu_ptr(lg->lock); |
26 | arch_spin_lock(lock); | 26 | arch_spin_lock(lock); |
27 | } | 27 | } |
@@ -31,7 +31,7 @@ void lg_local_unlock(struct lglock *lg) | |||
31 | { | 31 | { |
32 | arch_spinlock_t *lock; | 32 | arch_spinlock_t *lock; |
33 | 33 | ||
34 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | 34 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); |
35 | lock = this_cpu_ptr(lg->lock); | 35 | lock = this_cpu_ptr(lg->lock); |
36 | arch_spin_unlock(lock); | 36 | arch_spin_unlock(lock); |
37 | preempt_enable(); | 37 | preempt_enable(); |
@@ -43,7 +43,7 @@ void lg_local_lock_cpu(struct lglock *lg, int cpu) | |||
43 | arch_spinlock_t *lock; | 43 | arch_spinlock_t *lock; |
44 | 44 | ||
45 | preempt_disable(); | 45 | preempt_disable(); |
46 | rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); | 46 | lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); |
47 | lock = per_cpu_ptr(lg->lock, cpu); | 47 | lock = per_cpu_ptr(lg->lock, cpu); |
48 | arch_spin_lock(lock); | 48 | arch_spin_lock(lock); |
49 | } | 49 | } |
@@ -53,7 +53,7 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu) | |||
53 | { | 53 | { |
54 | arch_spinlock_t *lock; | 54 | arch_spinlock_t *lock; |
55 | 55 | ||
56 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | 56 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); |
57 | lock = per_cpu_ptr(lg->lock, cpu); | 57 | lock = per_cpu_ptr(lg->lock, cpu); |
58 | arch_spin_unlock(lock); | 58 | arch_spin_unlock(lock); |
59 | preempt_enable(); | 59 | preempt_enable(); |
@@ -65,7 +65,7 @@ void lg_global_lock(struct lglock *lg) | |||
65 | int i; | 65 | int i; |
66 | 66 | ||
67 | preempt_disable(); | 67 | preempt_disable(); |
68 | rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); | 68 | lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); |
69 | for_each_possible_cpu(i) { | 69 | for_each_possible_cpu(i) { |
70 | arch_spinlock_t *lock; | 70 | arch_spinlock_t *lock; |
71 | lock = per_cpu_ptr(lg->lock, i); | 71 | lock = per_cpu_ptr(lg->lock, i); |
@@ -78,7 +78,7 @@ void lg_global_unlock(struct lglock *lg) | |||
78 | { | 78 | { |
79 | int i; | 79 | int i; |
80 | 80 | ||
81 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | 81 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); |
82 | for_each_possible_cpu(i) { | 82 | for_each_possible_cpu(i) { |
83 | arch_spinlock_t *lock; | 83 | arch_spinlock_t *lock; |
84 | lock = per_cpu_ptr(lg->lock, i); | 84 | lock = per_cpu_ptr(lg->lock, i); |
diff --git a/kernel/mutex.c b/kernel/mutex.c index a52ee7bb830d..6d647aedffea 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -209,11 +209,13 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) | |||
209 | */ | 209 | */ |
210 | static inline int mutex_can_spin_on_owner(struct mutex *lock) | 210 | static inline int mutex_can_spin_on_owner(struct mutex *lock) |
211 | { | 211 | { |
212 | struct task_struct *owner; | ||
212 | int retval = 1; | 213 | int retval = 1; |
213 | 214 | ||
214 | rcu_read_lock(); | 215 | rcu_read_lock(); |
215 | if (lock->owner) | 216 | owner = ACCESS_ONCE(lock->owner); |
216 | retval = lock->owner->on_cpu; | 217 | if (owner) |
218 | retval = owner->on_cpu; | ||
217 | rcu_read_unlock(); | 219 | rcu_read_unlock(); |
218 | /* | 220 | /* |
219 | * if lock->owner is not set, the mutex owner may have just acquired | 221 | * if lock->owner is not set, the mutex owner may have just acquired |
@@ -461,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
461 | * performed the optimistic spinning cannot be done. | 463 | * performed the optimistic spinning cannot be done. |
462 | */ | 464 | */ |
463 | if (ACCESS_ONCE(ww->ctx)) | 465 | if (ACCESS_ONCE(ww->ctx)) |
464 | break; | 466 | goto slowpath; |
465 | } | 467 | } |
466 | 468 | ||
467 | /* | 469 | /* |
@@ -472,7 +474,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
472 | owner = ACCESS_ONCE(lock->owner); | 474 | owner = ACCESS_ONCE(lock->owner); |
473 | if (owner && !mutex_spin_on_owner(lock, owner)) { | 475 | if (owner && !mutex_spin_on_owner(lock, owner)) { |
474 | mspin_unlock(MLOCK(lock), &node); | 476 | mspin_unlock(MLOCK(lock), &node); |
475 | break; | 477 | goto slowpath; |
476 | } | 478 | } |
477 | 479 | ||
478 | if ((atomic_read(&lock->count) == 1) && | 480 | if ((atomic_read(&lock->count) == 1) && |
@@ -499,7 +501,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
499 | * the owner complete. | 501 | * the owner complete. |
500 | */ | 502 | */ |
501 | if (!owner && (need_resched() || rt_task(task))) | 503 | if (!owner && (need_resched() || rt_task(task))) |
502 | break; | 504 | goto slowpath; |
503 | 505 | ||
504 | /* | 506 | /* |
505 | * The cpu_relax() call is a compiler barrier which forces | 507 | * The cpu_relax() call is a compiler barrier which forces |
@@ -513,6 +515,10 @@ slowpath: | |||
513 | #endif | 515 | #endif |
514 | spin_lock_mutex(&lock->wait_lock, flags); | 516 | spin_lock_mutex(&lock->wait_lock, flags); |
515 | 517 | ||
518 | /* once more, can we acquire the lock? */ | ||
519 | if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1)) | ||
520 | goto skip_wait; | ||
521 | |||
516 | debug_mutex_lock_common(lock, &waiter); | 522 | debug_mutex_lock_common(lock, &waiter); |
517 | debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); | 523 | debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); |
518 | 524 | ||
@@ -520,9 +526,6 @@ slowpath: | |||
520 | list_add_tail(&waiter.list, &lock->wait_list); | 526 | list_add_tail(&waiter.list, &lock->wait_list); |
521 | waiter.task = task; | 527 | waiter.task = task; |
522 | 528 | ||
523 | if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1)) | ||
524 | goto done; | ||
525 | |||
526 | lock_contended(&lock->dep_map, ip); | 529 | lock_contended(&lock->dep_map, ip); |
527 | 530 | ||
528 | for (;;) { | 531 | for (;;) { |
@@ -536,7 +539,7 @@ slowpath: | |||
536 | * other waiters: | 539 | * other waiters: |
537 | */ | 540 | */ |
538 | if (MUTEX_SHOW_NO_WAITER(lock) && | 541 | if (MUTEX_SHOW_NO_WAITER(lock) && |
539 | (atomic_xchg(&lock->count, -1) == 1)) | 542 | (atomic_xchg(&lock->count, -1) == 1)) |
540 | break; | 543 | break; |
541 | 544 | ||
542 | /* | 545 | /* |
@@ -561,24 +564,25 @@ slowpath: | |||
561 | schedule_preempt_disabled(); | 564 | schedule_preempt_disabled(); |
562 | spin_lock_mutex(&lock->wait_lock, flags); | 565 | spin_lock_mutex(&lock->wait_lock, flags); |
563 | } | 566 | } |
567 | mutex_remove_waiter(lock, &waiter, current_thread_info()); | ||
568 | /* set it to 0 if there are no waiters left: */ | ||
569 | if (likely(list_empty(&lock->wait_list))) | ||
570 | atomic_set(&lock->count, 0); | ||
571 | debug_mutex_free_waiter(&waiter); | ||
564 | 572 | ||
565 | done: | 573 | skip_wait: |
574 | /* got the lock - cleanup and rejoice! */ | ||
566 | lock_acquired(&lock->dep_map, ip); | 575 | lock_acquired(&lock->dep_map, ip); |
567 | /* got the lock - rejoice! */ | ||
568 | mutex_remove_waiter(lock, &waiter, current_thread_info()); | ||
569 | mutex_set_owner(lock); | 576 | mutex_set_owner(lock); |
570 | 577 | ||
571 | if (!__builtin_constant_p(ww_ctx == NULL)) { | 578 | if (!__builtin_constant_p(ww_ctx == NULL)) { |
572 | struct ww_mutex *ww = container_of(lock, | 579 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); |
573 | struct ww_mutex, | ||
574 | base); | ||
575 | struct mutex_waiter *cur; | 580 | struct mutex_waiter *cur; |
576 | 581 | ||
577 | /* | 582 | /* |
578 | * This branch gets optimized out for the common case, | 583 | * This branch gets optimized out for the common case, |
579 | * and is only important for ww_mutex_lock. | 584 | * and is only important for ww_mutex_lock. |
580 | */ | 585 | */ |
581 | |||
582 | ww_mutex_lock_acquired(ww, ww_ctx); | 586 | ww_mutex_lock_acquired(ww, ww_ctx); |
583 | ww->ctx = ww_ctx; | 587 | ww->ctx = ww_ctx; |
584 | 588 | ||
@@ -592,15 +596,8 @@ done: | |||
592 | } | 596 | } |
593 | } | 597 | } |
594 | 598 | ||
595 | /* set it to 0 if there are no waiters left: */ | ||
596 | if (likely(list_empty(&lock->wait_list))) | ||
597 | atomic_set(&lock->count, 0); | ||
598 | |||
599 | spin_unlock_mutex(&lock->wait_lock, flags); | 599 | spin_unlock_mutex(&lock->wait_lock, flags); |
600 | |||
601 | debug_mutex_free_waiter(&waiter); | ||
602 | preempt_enable(); | 600 | preempt_enable(); |
603 | |||
604 | return 0; | 601 | return 0; |
605 | 602 | ||
606 | err: | 603 | err: |
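The mutex.c hunks above rework the lock slowpath: the optimistic-spin loop now jumps to the slowpath label instead of breaking out of it, the lock is retried once more with atomic_xchg() before the task is put on the wait list (the new skip_wait path), and mutex_can_spin_on_owner() reads lock->owner exactly once into a local. The following is an editor's sketch, not patch text, restating why that single ACCESS_ONCE() load matters: with two separate loads, the owner can drop the mutex between the NULL check and the ->on_cpu dereference.

static inline int can_spin_on_owner_sketch(struct mutex *lock)
{
        struct task_struct *owner;
        int retval = 1;

        rcu_read_lock();
        owner = ACCESS_ONCE(lock->owner);       /* one racy load, kept in a local */
        if (owner)
                retval = owner->on_cpu;         /* owner's task_struct stays valid under
                                                 * RCU even if it released the mutex */
        rcu_read_unlock();
        return retval;
}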
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 364ceab15f0c..997cbb951a3b 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
@@ -29,15 +29,15 @@ | |||
29 | static struct kmem_cache *nsproxy_cachep; | 29 | static struct kmem_cache *nsproxy_cachep; |
30 | 30 | ||
31 | struct nsproxy init_nsproxy = { | 31 | struct nsproxy init_nsproxy = { |
32 | .count = ATOMIC_INIT(1), | 32 | .count = ATOMIC_INIT(1), |
33 | .uts_ns = &init_uts_ns, | 33 | .uts_ns = &init_uts_ns, |
34 | #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) | 34 | #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) |
35 | .ipc_ns = &init_ipc_ns, | 35 | .ipc_ns = &init_ipc_ns, |
36 | #endif | 36 | #endif |
37 | .mnt_ns = NULL, | 37 | .mnt_ns = NULL, |
38 | .pid_ns = &init_pid_ns, | 38 | .pid_ns_for_children = &init_pid_ns, |
39 | #ifdef CONFIG_NET | 39 | #ifdef CONFIG_NET |
40 | .net_ns = &init_net, | 40 | .net_ns = &init_net, |
41 | #endif | 41 | #endif |
42 | }; | 42 | }; |
43 | 43 | ||
@@ -85,9 +85,10 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, | |||
85 | goto out_ipc; | 85 | goto out_ipc; |
86 | } | 86 | } |
87 | 87 | ||
88 | new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); | 88 | new_nsp->pid_ns_for_children = |
89 | if (IS_ERR(new_nsp->pid_ns)) { | 89 | copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children); |
90 | err = PTR_ERR(new_nsp->pid_ns); | 90 | if (IS_ERR(new_nsp->pid_ns_for_children)) { |
91 | err = PTR_ERR(new_nsp->pid_ns_for_children); | ||
91 | goto out_pid; | 92 | goto out_pid; |
92 | } | 93 | } |
93 | 94 | ||
@@ -100,8 +101,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, | |||
100 | return new_nsp; | 101 | return new_nsp; |
101 | 102 | ||
102 | out_net: | 103 | out_net: |
103 | if (new_nsp->pid_ns) | 104 | if (new_nsp->pid_ns_for_children) |
104 | put_pid_ns(new_nsp->pid_ns); | 105 | put_pid_ns(new_nsp->pid_ns_for_children); |
105 | out_pid: | 106 | out_pid: |
106 | if (new_nsp->ipc_ns) | 107 | if (new_nsp->ipc_ns) |
107 | put_ipc_ns(new_nsp->ipc_ns); | 108 | put_ipc_ns(new_nsp->ipc_ns); |
@@ -174,8 +175,8 @@ void free_nsproxy(struct nsproxy *ns) | |||
174 | put_uts_ns(ns->uts_ns); | 175 | put_uts_ns(ns->uts_ns); |
175 | if (ns->ipc_ns) | 176 | if (ns->ipc_ns) |
176 | put_ipc_ns(ns->ipc_ns); | 177 | put_ipc_ns(ns->ipc_ns); |
177 | if (ns->pid_ns) | 178 | if (ns->pid_ns_for_children) |
178 | put_pid_ns(ns->pid_ns); | 179 | put_pid_ns(ns->pid_ns_for_children); |
179 | put_net(ns->net_ns); | 180 | put_net(ns->net_ns); |
180 | kmem_cache_free(nsproxy_cachep, ns); | 181 | kmem_cache_free(nsproxy_cachep, ns); |
181 | } | 182 | } |
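Across nsproxy.c (and pidns_install() below) the nsproxy member pid_ns is renamed to pid_ns_for_children, making explicit that this pointer only selects the pid namespace that newly forked children will enter; the task's own pid namespace still comes from its struct pid. A hedged illustration follows (the helper and its printout are the editor's; task_active_pid_ns() is an existing kernel API):

static void show_pid_ns_split(struct task_struct *tsk)
{
        /* assumes tsk == current, or that the caller pins tsk->nsproxy */
        struct pid_namespace *active   = task_active_pid_ns(tsk);
        struct pid_namespace *children = tsk->nsproxy->pid_ns_for_children;

        /*
         * After setns(fd, CLONE_NEWPID) the two may legitimately differ:
         * the caller keeps the namespace it was born into, while children
         * it forks afterwards start life in the newly installed one.
         */
        pr_info("active pid_ns %p, pid_ns for children %p\n", active, children);
}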
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6917e8edb48e..601bb361c235 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -349,8 +349,8 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) | |||
349 | if (ancestor != active) | 349 | if (ancestor != active) |
350 | return -EINVAL; | 350 | return -EINVAL; |
351 | 351 | ||
352 | put_pid_ns(nsproxy->pid_ns); | 352 | put_pid_ns(nsproxy->pid_ns_for_children); |
353 | nsproxy->pid_ns = get_pid_ns(new); | 353 | nsproxy->pid_ns_for_children = get_pid_ns(new); |
354 | return 0; | 354 | return 0; |
355 | } | 355 | } |
356 | 356 | ||
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b26f5f1e773e..3085e62a80a5 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -39,7 +39,7 @@ static int resume_delay; | |||
39 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; | 39 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; |
40 | dev_t swsusp_resume_device; | 40 | dev_t swsusp_resume_device; |
41 | sector_t swsusp_resume_block; | 41 | sector_t swsusp_resume_block; |
42 | int in_suspend __nosavedata; | 42 | __visible int in_suspend __nosavedata; |
43 | 43 | ||
44 | enum { | 44 | enum { |
45 | HIBERNATION_INVALID, | 45 | HIBERNATION_INVALID, |
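The single hibernate.c change tags in_suspend as __visible. The usual reason for this annotation is that the symbol is referenced from assembly or from outside the current link-time-optimization unit, so whole-program optimization must not localize or drop it; in_suspend is shared with the architecture's low-level resume path on at least some architectures. A rough sketch of what the annotation amounts to on gcc of this era (editor's assumption, not part of the patch):

/* approximately: */
#define __visible __attribute__((externally_visible))

__visible int in_suspend_example;       /* keep globally addressable for asm / LTO */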
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ece04223bb1e..62ee437b5c7e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -210,6 +210,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
210 | goto Platform_wake; | 210 | goto Platform_wake; |
211 | } | 211 | } |
212 | 212 | ||
213 | ftrace_stop(); | ||
213 | error = disable_nonboot_cpus(); | 214 | error = disable_nonboot_cpus(); |
214 | if (error || suspend_test(TEST_CPUS)) | 215 | if (error || suspend_test(TEST_CPUS)) |
215 | goto Enable_cpus; | 216 | goto Enable_cpus; |
@@ -232,6 +233,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
232 | 233 | ||
233 | Enable_cpus: | 234 | Enable_cpus: |
234 | enable_nonboot_cpus(); | 235 | enable_nonboot_cpus(); |
236 | ftrace_start(); | ||
235 | 237 | ||
236 | Platform_wake: | 238 | Platform_wake: |
237 | if (need_suspend_ops(state) && suspend_ops->wake) | 239 | if (need_suspend_ops(state) && suspend_ops->wake) |
@@ -265,7 +267,6 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
265 | goto Close; | 267 | goto Close; |
266 | } | 268 | } |
267 | suspend_console(); | 269 | suspend_console(); |
268 | ftrace_stop(); | ||
269 | suspend_test_start(); | 270 | suspend_test_start(); |
270 | error = dpm_suspend_start(PMSG_SUSPEND); | 271 | error = dpm_suspend_start(PMSG_SUSPEND); |
271 | if (error) { | 272 | if (error) { |
@@ -285,7 +286,6 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
285 | suspend_test_start(); | 286 | suspend_test_start(); |
286 | dpm_resume_end(PMSG_RESUME); | 287 | dpm_resume_end(PMSG_RESUME); |
287 | suspend_test_finish("resume devices"); | 288 | suspend_test_finish("resume devices"); |
288 | ftrace_start(); | ||
289 | resume_console(); | 289 | resume_console(); |
290 | Close: | 290 | Close: |
291 | if (need_suspend_ops(state) && suspend_ops->end) | 291 | if (need_suspend_ops(state) && suspend_ops->end) |
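The suspend.c hunks narrow the window in which function tracing is disabled: rather than bracketing all of suspend_devices_and_enter(), ftrace_stop()/ftrace_start() now wrap only the nonboot-CPU offline and low-level entry portion inside suspend_enter(). The editor's condensation of the resulting order (error handling and platform hooks elided):

static int suspend_enter_order(suspend_state_t state)
{
        int error;

        /* ... dpm_suspend_end(), platform ->prepare() ... */
        ftrace_stop();                  /* tracing disabled only from here ... */
        error = disable_nonboot_cpus();
        if (!error) {
                /* arch_suspend_disable_irqs(), syscore_suspend(),
                 * suspend_ops->enter(state), syscore_resume(), ... */
        }
        enable_nonboot_cpus();
        ftrace_start();                 /* ... to here, before devices resume */
        /* ... platform ->wake() and early device resume ... */
        return error;
}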
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5b5a7080e2a5..b4e8500afdb3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -2226,6 +2226,13 @@ void register_console(struct console *newcon) | |||
2226 | struct console *bcon = NULL; | 2226 | struct console *bcon = NULL; |
2227 | struct console_cmdline *c; | 2227 | struct console_cmdline *c; |
2228 | 2228 | ||
2229 | if (console_drivers) | ||
2230 | for_each_console(bcon) | ||
2231 | if (WARN(bcon == newcon, | ||
2232 | "console '%s%d' already registered\n", | ||
2233 | bcon->name, bcon->index)) | ||
2234 | return; | ||
2235 | |||
2229 | /* | 2236 | /* |
2230 | * before we register a new CON_BOOT console, make sure we don't | 2237 | * before we register a new CON_BOOT console, make sure we don't |
2231 | * already have a valid console | 2238 | * already have a valid console |
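register_console() now scans console_drivers up front and refuses, with a WARN, to register a console that is already on the list; previously a second registration of the same struct console would splice it into the singly linked list twice and could leave the list cyclic. Illustrative only (the driver below is the editor's invention):

static struct console demo_console = {
        .name   = "demo",
        .flags  = CON_PRINTBUFFER,
        .index  = -1,
        /* .write = demo_write,  a real driver supplies this */
};

static int __init demo_console_init(void)
{
        register_console(&demo_console);
        register_console(&demo_console);        /* now: one WARN and an early return */
        return 0;
}

A buggy driver that calls register_console() twice now gets a clear warning instead of silent list corruption.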
diff --git a/kernel/rcu.h b/kernel/rcu.h index 7f8e7590e3e5..77131966c4ad 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h | |||
@@ -67,12 +67,15 @@ | |||
67 | 67 | ||
68 | extern struct debug_obj_descr rcuhead_debug_descr; | 68 | extern struct debug_obj_descr rcuhead_debug_descr; |
69 | 69 | ||
70 | static inline void debug_rcu_head_queue(struct rcu_head *head) | 70 | static inline int debug_rcu_head_queue(struct rcu_head *head) |
71 | { | 71 | { |
72 | debug_object_activate(head, &rcuhead_debug_descr); | 72 | int r1; |
73 | |||
74 | r1 = debug_object_activate(head, &rcuhead_debug_descr); | ||
73 | debug_object_active_state(head, &rcuhead_debug_descr, | 75 | debug_object_active_state(head, &rcuhead_debug_descr, |
74 | STATE_RCU_HEAD_READY, | 76 | STATE_RCU_HEAD_READY, |
75 | STATE_RCU_HEAD_QUEUED); | 77 | STATE_RCU_HEAD_QUEUED); |
78 | return r1; | ||
76 | } | 79 | } |
77 | 80 | ||
78 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) | 81 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) |
@@ -83,8 +86,9 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) | |||
83 | debug_object_deactivate(head, &rcuhead_debug_descr); | 86 | debug_object_deactivate(head, &rcuhead_debug_descr); |
84 | } | 87 | } |
85 | #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 88 | #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
86 | static inline void debug_rcu_head_queue(struct rcu_head *head) | 89 | static inline int debug_rcu_head_queue(struct rcu_head *head) |
87 | { | 90 | { |
91 | return 0; | ||
88 | } | 92 | } |
89 | 93 | ||
90 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) | 94 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) |
@@ -94,7 +98,7 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) | |||
94 | 98 | ||
95 | extern void kfree(const void *); | 99 | extern void kfree(const void *); |
96 | 100 | ||
97 | static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) | 101 | static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) |
98 | { | 102 | { |
99 | unsigned long offset = (unsigned long)head->func; | 103 | unsigned long offset = (unsigned long)head->func; |
100 | 104 | ||
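debug_rcu_head_queue() now forwards the return value of debug_object_activate(), which lets a caller detect that the rcu_head it is about to queue is already queued, in other words a double call_rcu(). The rcutorture changes further down add a test for exactly that. A plausible caller-side use, sketched by the editor (the real __call_rcu() hunk belongs to the same series but is not shown here):

static void rcu_leak_callback_sketch(struct rcu_head *rhp)
{
        /* never invoked: the duplicate rcu_head is deliberately leaked */
}

static bool rcu_reject_duplicate(struct rcu_head *head)
{
        if (debug_rcu_head_queue(head)) {
                /*
                 * Probable double call_rcu(): leak this callback rather
                 * than corrupt the callback list, and complain once.
                 */
                ACCESS_ONCE(head->func) = rcu_leak_callback_sketch;
                WARN_ONCE(1, "duplicate call_rcu() detected, leaking callback\n");
                return true;
        }
        return false;
}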
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index cce6ba8bbace..33eb4620aa17 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -212,43 +212,6 @@ static inline void debug_rcu_head_free(struct rcu_head *head) | |||
212 | } | 212 | } |
213 | 213 | ||
214 | /* | 214 | /* |
215 | * fixup_init is called when: | ||
216 | * - an active object is initialized | ||
217 | */ | ||
218 | static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) | ||
219 | { | ||
220 | struct rcu_head *head = addr; | ||
221 | |||
222 | switch (state) { | ||
223 | case ODEBUG_STATE_ACTIVE: | ||
224 | /* | ||
225 | * Ensure that queued callbacks are all executed. | ||
226 | * If we detect that we are nested in a RCU read-side critical | ||
227 | * section, we should simply fail, otherwise we would deadlock. | ||
228 | * In !PREEMPT configurations, there is no way to tell if we are | ||
229 | * in a RCU read-side critical section or not, so we never | ||
230 | * attempt any fixup and just print a warning. | ||
231 | */ | ||
232 | #ifndef CONFIG_PREEMPT | ||
233 | WARN_ON_ONCE(1); | ||
234 | return 0; | ||
235 | #endif | ||
236 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
237 | irqs_disabled()) { | ||
238 | WARN_ON_ONCE(1); | ||
239 | return 0; | ||
240 | } | ||
241 | rcu_barrier(); | ||
242 | rcu_barrier_sched(); | ||
243 | rcu_barrier_bh(); | ||
244 | debug_object_init(head, &rcuhead_debug_descr); | ||
245 | return 1; | ||
246 | default: | ||
247 | return 0; | ||
248 | } | ||
249 | } | ||
250 | |||
251 | /* | ||
252 | * fixup_activate is called when: | 215 | * fixup_activate is called when: |
253 | * - an active object is activated | 216 | * - an active object is activated |
254 | * - an unknown object is activated (might be a statically initialized object) | 217 | * - an unknown object is activated (might be a statically initialized object) |
@@ -268,69 +231,8 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) | |||
268 | debug_object_init(head, &rcuhead_debug_descr); | 231 | debug_object_init(head, &rcuhead_debug_descr); |
269 | debug_object_activate(head, &rcuhead_debug_descr); | 232 | debug_object_activate(head, &rcuhead_debug_descr); |
270 | return 0; | 233 | return 0; |
271 | |||
272 | case ODEBUG_STATE_ACTIVE: | ||
273 | /* | ||
274 | * Ensure that queued callbacks are all executed. | ||
275 | * If we detect that we are nested in a RCU read-side critical | ||
276 | * section, we should simply fail, otherwise we would deadlock. | ||
277 | * In !PREEMPT configurations, there is no way to tell if we are | ||
278 | * in a RCU read-side critical section or not, so we never | ||
279 | * attempt any fixup and just print a warning. | ||
280 | */ | ||
281 | #ifndef CONFIG_PREEMPT | ||
282 | WARN_ON_ONCE(1); | ||
283 | return 0; | ||
284 | #endif | ||
285 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
286 | irqs_disabled()) { | ||
287 | WARN_ON_ONCE(1); | ||
288 | return 0; | ||
289 | } | ||
290 | rcu_barrier(); | ||
291 | rcu_barrier_sched(); | ||
292 | rcu_barrier_bh(); | ||
293 | debug_object_activate(head, &rcuhead_debug_descr); | ||
294 | return 1; | ||
295 | default: | 234 | default: |
296 | return 0; | ||
297 | } | ||
298 | } | ||
299 | |||
300 | /* | ||
301 | * fixup_free is called when: | ||
302 | * - an active object is freed | ||
303 | */ | ||
304 | static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) | ||
305 | { | ||
306 | struct rcu_head *head = addr; | ||
307 | |||
308 | switch (state) { | ||
309 | case ODEBUG_STATE_ACTIVE: | ||
310 | /* | ||
311 | * Ensure that queued callbacks are all executed. | ||
312 | * If we detect that we are nested in a RCU read-side critical | ||
313 | * section, we should simply fail, otherwise we would deadlock. | ||
314 | * In !PREEMPT configurations, there is no way to tell if we are | ||
315 | * in a RCU read-side critical section or not, so we never | ||
316 | * attempt any fixup and just print a warning. | ||
317 | */ | ||
318 | #ifndef CONFIG_PREEMPT | ||
319 | WARN_ON_ONCE(1); | ||
320 | return 0; | ||
321 | #endif | ||
322 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
323 | irqs_disabled()) { | ||
324 | WARN_ON_ONCE(1); | ||
325 | return 0; | ||
326 | } | ||
327 | rcu_barrier(); | ||
328 | rcu_barrier_sched(); | ||
329 | rcu_barrier_bh(); | ||
330 | debug_object_free(head, &rcuhead_debug_descr); | ||
331 | return 1; | 235 | return 1; |
332 | default: | ||
333 | return 0; | ||
334 | } | 236 | } |
335 | } | 237 | } |
336 | 238 | ||
@@ -369,15 +271,13 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); | |||
369 | 271 | ||
370 | struct debug_obj_descr rcuhead_debug_descr = { | 272 | struct debug_obj_descr rcuhead_debug_descr = { |
371 | .name = "rcu_head", | 273 | .name = "rcu_head", |
372 | .fixup_init = rcuhead_fixup_init, | ||
373 | .fixup_activate = rcuhead_fixup_activate, | 274 | .fixup_activate = rcuhead_fixup_activate, |
374 | .fixup_free = rcuhead_fixup_free, | ||
375 | }; | 275 | }; |
376 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); | 276 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); |
377 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 277 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
378 | 278 | ||
379 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) | 279 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) |
380 | void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp, | 280 | void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, |
381 | unsigned long secs, | 281 | unsigned long secs, |
382 | unsigned long c_old, unsigned long c) | 282 | unsigned long c_old, unsigned long c) |
383 | { | 283 | { |
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index aa344111de3e..9ed6075dc562 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -264,7 +264,7 @@ void rcu_check_callbacks(int cpu, int user) | |||
264 | */ | 264 | */ |
265 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | 265 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) |
266 | { | 266 | { |
267 | char *rn = NULL; | 267 | const char *rn = NULL; |
268 | struct rcu_head *next, *list; | 268 | struct rcu_head *next, *list; |
269 | unsigned long flags; | 269 | unsigned long flags; |
270 | RCU_TRACE(int cb_count = 0); | 270 | RCU_TRACE(int cb_count = 0); |
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 0cd385acccfa..280d06cae352 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -36,7 +36,7 @@ struct rcu_ctrlblk { | |||
36 | RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ | 36 | RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ |
37 | RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ | 37 | RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ |
38 | RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ | 38 | RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ |
39 | RCU_TRACE(char *name); /* Name of RCU type. */ | 39 | RCU_TRACE(const char *name); /* Name of RCU type. */ |
40 | }; | 40 | }; |
41 | 41 | ||
42 | /* Definition for rcupdate control block. */ | 42 | /* Definition for rcupdate control block. */ |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index f4871e52c546..be63101c6175 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -52,72 +52,78 @@ | |||
52 | MODULE_LICENSE("GPL"); | 52 | MODULE_LICENSE("GPL"); |
53 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); | 53 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); |
54 | 54 | ||
55 | static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ | 55 | static int fqs_duration; |
56 | static int nfakewriters = 4; /* # fake writer threads */ | ||
57 | static int stat_interval = 60; /* Interval between stats, in seconds. */ | ||
58 | /* Zero means "only at end of test". */ | ||
59 | static bool verbose; /* Print more debug info. */ | ||
60 | static bool test_no_idle_hz = true; | ||
61 | /* Test RCU support for tickless idle CPUs. */ | ||
62 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ | ||
63 | static int stutter = 5; /* Start/stop testing interval (in sec) */ | ||
64 | static int irqreader = 1; /* RCU readers from irq (timers). */ | ||
65 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ | ||
66 | static int fqs_holdoff; /* Hold time within burst (us). */ | ||
67 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | ||
68 | static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ | ||
69 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ | ||
70 | static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ | ||
71 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ | ||
72 | static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */ | ||
73 | static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */ | ||
74 | static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ | ||
75 | static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ | ||
76 | static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ | ||
77 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ | ||
78 | |||
79 | module_param(nreaders, int, 0444); | ||
80 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | ||
81 | module_param(nfakewriters, int, 0444); | ||
82 | MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); | ||
83 | module_param(stat_interval, int, 0644); | ||
84 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | ||
85 | module_param(verbose, bool, 0444); | ||
86 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | ||
87 | module_param(test_no_idle_hz, bool, 0444); | ||
88 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | ||
89 | module_param(shuffle_interval, int, 0444); | ||
90 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | ||
91 | module_param(stutter, int, 0444); | ||
92 | MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); | ||
93 | module_param(irqreader, int, 0444); | ||
94 | MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); | ||
95 | module_param(fqs_duration, int, 0444); | 56 | module_param(fqs_duration, int, 0444); |
96 | MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); | 57 | MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); |
58 | static int fqs_holdoff; | ||
97 | module_param(fqs_holdoff, int, 0444); | 59 | module_param(fqs_holdoff, int, 0444); |
98 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | 60 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
61 | static int fqs_stutter = 3; | ||
99 | module_param(fqs_stutter, int, 0444); | 62 | module_param(fqs_stutter, int, 0444); |
100 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 63 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
64 | static bool gp_exp; | ||
65 | module_param(gp_exp, bool, 0444); | ||
66 | MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); | ||
67 | static bool gp_normal; | ||
68 | module_param(gp_normal, bool, 0444); | ||
69 | MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); | ||
70 | static int irqreader = 1; | ||
71 | module_param(irqreader, int, 0444); | ||
72 | MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); | ||
73 | static int n_barrier_cbs; | ||
101 | module_param(n_barrier_cbs, int, 0444); | 74 | module_param(n_barrier_cbs, int, 0444); |
102 | MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); | 75 | MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); |
103 | module_param(onoff_interval, int, 0444); | 76 | static int nfakewriters = 4; |
104 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | 77 | module_param(nfakewriters, int, 0444); |
78 | MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); | ||
79 | static int nreaders = -1; | ||
80 | module_param(nreaders, int, 0444); | ||
81 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | ||
82 | static int object_debug; | ||
83 | module_param(object_debug, int, 0444); | ||
84 | MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); | ||
85 | static int onoff_holdoff; | ||
105 | module_param(onoff_holdoff, int, 0444); | 86 | module_param(onoff_holdoff, int, 0444); |
106 | MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); | 87 | MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); |
88 | static int onoff_interval; | ||
89 | module_param(onoff_interval, int, 0444); | ||
90 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | ||
91 | static int shuffle_interval = 3; | ||
92 | module_param(shuffle_interval, int, 0444); | ||
93 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | ||
94 | static int shutdown_secs; | ||
107 | module_param(shutdown_secs, int, 0444); | 95 | module_param(shutdown_secs, int, 0444); |
108 | MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); | 96 | MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); |
97 | static int stall_cpu; | ||
109 | module_param(stall_cpu, int, 0444); | 98 | module_param(stall_cpu, int, 0444); |
110 | MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); | 99 | MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); |
100 | static int stall_cpu_holdoff = 10; | ||
111 | module_param(stall_cpu_holdoff, int, 0444); | 101 | module_param(stall_cpu_holdoff, int, 0444); |
112 | MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); | 102 | MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); |
103 | static int stat_interval = 60; | ||
104 | module_param(stat_interval, int, 0644); | ||
105 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | ||
106 | static int stutter = 5; | ||
107 | module_param(stutter, int, 0444); | ||
108 | MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); | ||
109 | static int test_boost = 1; | ||
113 | module_param(test_boost, int, 0444); | 110 | module_param(test_boost, int, 0444); |
114 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); | 111 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); |
115 | module_param(test_boost_interval, int, 0444); | 112 | static int test_boost_duration = 4; |
116 | MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); | ||
117 | module_param(test_boost_duration, int, 0444); | 113 | module_param(test_boost_duration, int, 0444); |
118 | MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); | 114 | MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); |
115 | static int test_boost_interval = 7; | ||
116 | module_param(test_boost_interval, int, 0444); | ||
117 | MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); | ||
118 | static bool test_no_idle_hz = true; | ||
119 | module_param(test_no_idle_hz, bool, 0444); | ||
120 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | ||
121 | static char *torture_type = "rcu"; | ||
119 | module_param(torture_type, charp, 0444); | 122 | module_param(torture_type, charp, 0444); |
120 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); | 123 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); |
124 | static bool verbose; | ||
125 | module_param(verbose, bool, 0444); | ||
126 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | ||
121 | 127 | ||
122 | #define TORTURE_FLAG "-torture:" | 128 | #define TORTURE_FLAG "-torture:" |
123 | #define PRINTK_STRING(s) \ | 129 | #define PRINTK_STRING(s) \ |
@@ -267,7 +273,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, | |||
267 | * Absorb kthreads into a kernel function that won't return, so that | 273 | * Absorb kthreads into a kernel function that won't return, so that |
268 | * they won't ever access module text or data again. | 274 | * they won't ever access module text or data again. |
269 | */ | 275 | */ |
270 | static void rcutorture_shutdown_absorb(char *title) | 276 | static void rcutorture_shutdown_absorb(const char *title) |
271 | { | 277 | { |
272 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { | 278 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { |
273 | pr_notice( | 279 | pr_notice( |
@@ -337,7 +343,7 @@ rcu_random(struct rcu_random_state *rrsp) | |||
337 | } | 343 | } |
338 | 344 | ||
339 | static void | 345 | static void |
340 | rcu_stutter_wait(char *title) | 346 | rcu_stutter_wait(const char *title) |
341 | { | 347 | { |
342 | while (stutter_pause_test || !rcutorture_runnable) { | 348 | while (stutter_pause_test || !rcutorture_runnable) { |
343 | if (rcutorture_runnable) | 349 | if (rcutorture_runnable) |
@@ -360,13 +366,14 @@ struct rcu_torture_ops { | |||
360 | int (*completed)(void); | 366 | int (*completed)(void); |
361 | void (*deferred_free)(struct rcu_torture *p); | 367 | void (*deferred_free)(struct rcu_torture *p); |
362 | void (*sync)(void); | 368 | void (*sync)(void); |
369 | void (*exp_sync)(void); | ||
363 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 370 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
364 | void (*cb_barrier)(void); | 371 | void (*cb_barrier)(void); |
365 | void (*fqs)(void); | 372 | void (*fqs)(void); |
366 | int (*stats)(char *page); | 373 | int (*stats)(char *page); |
367 | int irq_capable; | 374 | int irq_capable; |
368 | int can_boost; | 375 | int can_boost; |
369 | char *name; | 376 | const char *name; |
370 | }; | 377 | }; |
371 | 378 | ||
372 | static struct rcu_torture_ops *cur_ops; | 379 | static struct rcu_torture_ops *cur_ops; |
@@ -443,81 +450,27 @@ static void rcu_torture_deferred_free(struct rcu_torture *p) | |||
443 | call_rcu(&p->rtort_rcu, rcu_torture_cb); | 450 | call_rcu(&p->rtort_rcu, rcu_torture_cb); |
444 | } | 451 | } |
445 | 452 | ||
446 | static struct rcu_torture_ops rcu_ops = { | ||
447 | .init = NULL, | ||
448 | .readlock = rcu_torture_read_lock, | ||
449 | .read_delay = rcu_read_delay, | ||
450 | .readunlock = rcu_torture_read_unlock, | ||
451 | .completed = rcu_torture_completed, | ||
452 | .deferred_free = rcu_torture_deferred_free, | ||
453 | .sync = synchronize_rcu, | ||
454 | .call = call_rcu, | ||
455 | .cb_barrier = rcu_barrier, | ||
456 | .fqs = rcu_force_quiescent_state, | ||
457 | .stats = NULL, | ||
458 | .irq_capable = 1, | ||
459 | .can_boost = rcu_can_boost(), | ||
460 | .name = "rcu" | ||
461 | }; | ||
462 | |||
463 | static void rcu_sync_torture_deferred_free(struct rcu_torture *p) | ||
464 | { | ||
465 | int i; | ||
466 | struct rcu_torture *rp; | ||
467 | struct rcu_torture *rp1; | ||
468 | |||
469 | cur_ops->sync(); | ||
470 | list_add(&p->rtort_free, &rcu_torture_removed); | ||
471 | list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { | ||
472 | i = rp->rtort_pipe_count; | ||
473 | if (i > RCU_TORTURE_PIPE_LEN) | ||
474 | i = RCU_TORTURE_PIPE_LEN; | ||
475 | atomic_inc(&rcu_torture_wcount[i]); | ||
476 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
477 | rp->rtort_mbtest = 0; | ||
478 | list_del(&rp->rtort_free); | ||
479 | rcu_torture_free(rp); | ||
480 | } | ||
481 | } | ||
482 | } | ||
483 | |||
484 | static void rcu_sync_torture_init(void) | 453 | static void rcu_sync_torture_init(void) |
485 | { | 454 | { |
486 | INIT_LIST_HEAD(&rcu_torture_removed); | 455 | INIT_LIST_HEAD(&rcu_torture_removed); |
487 | } | 456 | } |
488 | 457 | ||
489 | static struct rcu_torture_ops rcu_sync_ops = { | 458 | static struct rcu_torture_ops rcu_ops = { |
490 | .init = rcu_sync_torture_init, | 459 | .init = rcu_sync_torture_init, |
491 | .readlock = rcu_torture_read_lock, | 460 | .readlock = rcu_torture_read_lock, |
492 | .read_delay = rcu_read_delay, | 461 | .read_delay = rcu_read_delay, |
493 | .readunlock = rcu_torture_read_unlock, | 462 | .readunlock = rcu_torture_read_unlock, |
494 | .completed = rcu_torture_completed, | 463 | .completed = rcu_torture_completed, |
495 | .deferred_free = rcu_sync_torture_deferred_free, | 464 | .deferred_free = rcu_torture_deferred_free, |
496 | .sync = synchronize_rcu, | 465 | .sync = synchronize_rcu, |
497 | .call = NULL, | 466 | .exp_sync = synchronize_rcu_expedited, |
498 | .cb_barrier = NULL, | 467 | .call = call_rcu, |
499 | .fqs = rcu_force_quiescent_state, | 468 | .cb_barrier = rcu_barrier, |
500 | .stats = NULL, | ||
501 | .irq_capable = 1, | ||
502 | .can_boost = rcu_can_boost(), | ||
503 | .name = "rcu_sync" | ||
504 | }; | ||
505 | |||
506 | static struct rcu_torture_ops rcu_expedited_ops = { | ||
507 | .init = rcu_sync_torture_init, | ||
508 | .readlock = rcu_torture_read_lock, | ||
509 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
510 | .readunlock = rcu_torture_read_unlock, | ||
511 | .completed = rcu_no_completed, | ||
512 | .deferred_free = rcu_sync_torture_deferred_free, | ||
513 | .sync = synchronize_rcu_expedited, | ||
514 | .call = NULL, | ||
515 | .cb_barrier = NULL, | ||
516 | .fqs = rcu_force_quiescent_state, | 469 | .fqs = rcu_force_quiescent_state, |
517 | .stats = NULL, | 470 | .stats = NULL, |
518 | .irq_capable = 1, | 471 | .irq_capable = 1, |
519 | .can_boost = rcu_can_boost(), | 472 | .can_boost = rcu_can_boost(), |
520 | .name = "rcu_expedited" | 473 | .name = "rcu" |
521 | }; | 474 | }; |
522 | 475 | ||
523 | /* | 476 | /* |
@@ -546,13 +499,14 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | |||
546 | } | 499 | } |
547 | 500 | ||
548 | static struct rcu_torture_ops rcu_bh_ops = { | 501 | static struct rcu_torture_ops rcu_bh_ops = { |
549 | .init = NULL, | 502 | .init = rcu_sync_torture_init, |
550 | .readlock = rcu_bh_torture_read_lock, | 503 | .readlock = rcu_bh_torture_read_lock, |
551 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 504 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
552 | .readunlock = rcu_bh_torture_read_unlock, | 505 | .readunlock = rcu_bh_torture_read_unlock, |
553 | .completed = rcu_bh_torture_completed, | 506 | .completed = rcu_bh_torture_completed, |
554 | .deferred_free = rcu_bh_torture_deferred_free, | 507 | .deferred_free = rcu_bh_torture_deferred_free, |
555 | .sync = synchronize_rcu_bh, | 508 | .sync = synchronize_rcu_bh, |
509 | .exp_sync = synchronize_rcu_bh_expedited, | ||
556 | .call = call_rcu_bh, | 510 | .call = call_rcu_bh, |
557 | .cb_barrier = rcu_barrier_bh, | 511 | .cb_barrier = rcu_barrier_bh, |
558 | .fqs = rcu_bh_force_quiescent_state, | 512 | .fqs = rcu_bh_force_quiescent_state, |
@@ -561,38 +515,6 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
561 | .name = "rcu_bh" | 515 | .name = "rcu_bh" |
562 | }; | 516 | }; |
563 | 517 | ||
564 | static struct rcu_torture_ops rcu_bh_sync_ops = { | ||
565 | .init = rcu_sync_torture_init, | ||
566 | .readlock = rcu_bh_torture_read_lock, | ||
567 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
568 | .readunlock = rcu_bh_torture_read_unlock, | ||
569 | .completed = rcu_bh_torture_completed, | ||
570 | .deferred_free = rcu_sync_torture_deferred_free, | ||
571 | .sync = synchronize_rcu_bh, | ||
572 | .call = NULL, | ||
573 | .cb_barrier = NULL, | ||
574 | .fqs = rcu_bh_force_quiescent_state, | ||
575 | .stats = NULL, | ||
576 | .irq_capable = 1, | ||
577 | .name = "rcu_bh_sync" | ||
578 | }; | ||
579 | |||
580 | static struct rcu_torture_ops rcu_bh_expedited_ops = { | ||
581 | .init = rcu_sync_torture_init, | ||
582 | .readlock = rcu_bh_torture_read_lock, | ||
583 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
584 | .readunlock = rcu_bh_torture_read_unlock, | ||
585 | .completed = rcu_bh_torture_completed, | ||
586 | .deferred_free = rcu_sync_torture_deferred_free, | ||
587 | .sync = synchronize_rcu_bh_expedited, | ||
588 | .call = NULL, | ||
589 | .cb_barrier = NULL, | ||
590 | .fqs = rcu_bh_force_quiescent_state, | ||
591 | .stats = NULL, | ||
592 | .irq_capable = 1, | ||
593 | .name = "rcu_bh_expedited" | ||
594 | }; | ||
595 | |||
596 | /* | 518 | /* |
597 | * Definitions for srcu torture testing. | 519 | * Definitions for srcu torture testing. |
598 | */ | 520 | */ |
@@ -667,6 +589,11 @@ static int srcu_torture_stats(char *page) | |||
667 | return cnt; | 589 | return cnt; |
668 | } | 590 | } |
669 | 591 | ||
592 | static void srcu_torture_synchronize_expedited(void) | ||
593 | { | ||
594 | synchronize_srcu_expedited(&srcu_ctl); | ||
595 | } | ||
596 | |||
670 | static struct rcu_torture_ops srcu_ops = { | 597 | static struct rcu_torture_ops srcu_ops = { |
671 | .init = rcu_sync_torture_init, | 598 | .init = rcu_sync_torture_init, |
672 | .readlock = srcu_torture_read_lock, | 599 | .readlock = srcu_torture_read_lock, |
@@ -675,45 +602,13 @@ static struct rcu_torture_ops srcu_ops = { | |||
675 | .completed = srcu_torture_completed, | 602 | .completed = srcu_torture_completed, |
676 | .deferred_free = srcu_torture_deferred_free, | 603 | .deferred_free = srcu_torture_deferred_free, |
677 | .sync = srcu_torture_synchronize, | 604 | .sync = srcu_torture_synchronize, |
605 | .exp_sync = srcu_torture_synchronize_expedited, | ||
678 | .call = srcu_torture_call, | 606 | .call = srcu_torture_call, |
679 | .cb_barrier = srcu_torture_barrier, | 607 | .cb_barrier = srcu_torture_barrier, |
680 | .stats = srcu_torture_stats, | 608 | .stats = srcu_torture_stats, |
681 | .name = "srcu" | 609 | .name = "srcu" |
682 | }; | 610 | }; |
683 | 611 | ||
684 | static struct rcu_torture_ops srcu_sync_ops = { | ||
685 | .init = rcu_sync_torture_init, | ||
686 | .readlock = srcu_torture_read_lock, | ||
687 | .read_delay = srcu_read_delay, | ||
688 | .readunlock = srcu_torture_read_unlock, | ||
689 | .completed = srcu_torture_completed, | ||
690 | .deferred_free = rcu_sync_torture_deferred_free, | ||
691 | .sync = srcu_torture_synchronize, | ||
692 | .call = NULL, | ||
693 | .cb_barrier = NULL, | ||
694 | .stats = srcu_torture_stats, | ||
695 | .name = "srcu_sync" | ||
696 | }; | ||
697 | |||
698 | static void srcu_torture_synchronize_expedited(void) | ||
699 | { | ||
700 | synchronize_srcu_expedited(&srcu_ctl); | ||
701 | } | ||
702 | |||
703 | static struct rcu_torture_ops srcu_expedited_ops = { | ||
704 | .init = rcu_sync_torture_init, | ||
705 | .readlock = srcu_torture_read_lock, | ||
706 | .read_delay = srcu_read_delay, | ||
707 | .readunlock = srcu_torture_read_unlock, | ||
708 | .completed = srcu_torture_completed, | ||
709 | .deferred_free = rcu_sync_torture_deferred_free, | ||
710 | .sync = srcu_torture_synchronize_expedited, | ||
711 | .call = NULL, | ||
712 | .cb_barrier = NULL, | ||
713 | .stats = srcu_torture_stats, | ||
714 | .name = "srcu_expedited" | ||
715 | }; | ||
716 | |||
717 | /* | 612 | /* |
718 | * Definitions for sched torture testing. | 613 | * Definitions for sched torture testing. |
719 | */ | 614 | */ |
@@ -742,6 +637,8 @@ static struct rcu_torture_ops sched_ops = { | |||
742 | .completed = rcu_no_completed, | 637 | .completed = rcu_no_completed, |
743 | .deferred_free = rcu_sched_torture_deferred_free, | 638 | .deferred_free = rcu_sched_torture_deferred_free, |
744 | .sync = synchronize_sched, | 639 | .sync = synchronize_sched, |
640 | .exp_sync = synchronize_sched_expedited, | ||
641 | .call = call_rcu_sched, | ||
745 | .cb_barrier = rcu_barrier_sched, | 642 | .cb_barrier = rcu_barrier_sched, |
746 | .fqs = rcu_sched_force_quiescent_state, | 643 | .fqs = rcu_sched_force_quiescent_state, |
747 | .stats = NULL, | 644 | .stats = NULL, |
@@ -749,35 +646,6 @@ static struct rcu_torture_ops sched_ops = { | |||
749 | .name = "sched" | 646 | .name = "sched" |
750 | }; | 647 | }; |
751 | 648 | ||
752 | static struct rcu_torture_ops sched_sync_ops = { | ||
753 | .init = rcu_sync_torture_init, | ||
754 | .readlock = sched_torture_read_lock, | ||
755 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
756 | .readunlock = sched_torture_read_unlock, | ||
757 | .completed = rcu_no_completed, | ||
758 | .deferred_free = rcu_sync_torture_deferred_free, | ||
759 | .sync = synchronize_sched, | ||
760 | .cb_barrier = NULL, | ||
761 | .fqs = rcu_sched_force_quiescent_state, | ||
762 | .stats = NULL, | ||
763 | .name = "sched_sync" | ||
764 | }; | ||
765 | |||
766 | static struct rcu_torture_ops sched_expedited_ops = { | ||
767 | .init = rcu_sync_torture_init, | ||
768 | .readlock = sched_torture_read_lock, | ||
769 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
770 | .readunlock = sched_torture_read_unlock, | ||
771 | .completed = rcu_no_completed, | ||
772 | .deferred_free = rcu_sync_torture_deferred_free, | ||
773 | .sync = synchronize_sched_expedited, | ||
774 | .cb_barrier = NULL, | ||
775 | .fqs = rcu_sched_force_quiescent_state, | ||
776 | .stats = NULL, | ||
777 | .irq_capable = 1, | ||
778 | .name = "sched_expedited" | ||
779 | }; | ||
780 | |||
781 | /* | 649 | /* |
782 | * RCU torture priority-boost testing. Runs one real-time thread per | 650 | * RCU torture priority-boost testing. Runs one real-time thread per |
783 | * CPU for moderate bursts, repeatedly registering RCU callbacks and | 651 | * CPU for moderate bursts, repeatedly registering RCU callbacks and |
@@ -927,9 +795,10 @@ rcu_torture_fqs(void *arg) | |||
927 | static int | 795 | static int |
928 | rcu_torture_writer(void *arg) | 796 | rcu_torture_writer(void *arg) |
929 | { | 797 | { |
798 | bool exp; | ||
930 | int i; | 799 | int i; |
931 | long oldbatch = rcu_batches_completed(); | ||
932 | struct rcu_torture *rp; | 800 | struct rcu_torture *rp; |
801 | struct rcu_torture *rp1; | ||
933 | struct rcu_torture *old_rp; | 802 | struct rcu_torture *old_rp; |
934 | static DEFINE_RCU_RANDOM(rand); | 803 | static DEFINE_RCU_RANDOM(rand); |
935 | 804 | ||
@@ -954,10 +823,33 @@ rcu_torture_writer(void *arg) | |||
954 | i = RCU_TORTURE_PIPE_LEN; | 823 | i = RCU_TORTURE_PIPE_LEN; |
955 | atomic_inc(&rcu_torture_wcount[i]); | 824 | atomic_inc(&rcu_torture_wcount[i]); |
956 | old_rp->rtort_pipe_count++; | 825 | old_rp->rtort_pipe_count++; |
957 | cur_ops->deferred_free(old_rp); | 826 | if (gp_normal == gp_exp) |
827 | exp = !!(rcu_random(&rand) & 0x80); | ||
828 | else | ||
829 | exp = gp_exp; | ||
830 | if (!exp) { | ||
831 | cur_ops->deferred_free(old_rp); | ||
832 | } else { | ||
833 | cur_ops->exp_sync(); | ||
834 | list_add(&old_rp->rtort_free, | ||
835 | &rcu_torture_removed); | ||
836 | list_for_each_entry_safe(rp, rp1, | ||
837 | &rcu_torture_removed, | ||
838 | rtort_free) { | ||
839 | i = rp->rtort_pipe_count; | ||
840 | if (i > RCU_TORTURE_PIPE_LEN) | ||
841 | i = RCU_TORTURE_PIPE_LEN; | ||
842 | atomic_inc(&rcu_torture_wcount[i]); | ||
843 | if (++rp->rtort_pipe_count >= | ||
844 | RCU_TORTURE_PIPE_LEN) { | ||
845 | rp->rtort_mbtest = 0; | ||
846 | list_del(&rp->rtort_free); | ||
847 | rcu_torture_free(rp); | ||
848 | } | ||
849 | } | ||
850 | } | ||
958 | } | 851 | } |
959 | rcutorture_record_progress(++rcu_torture_current_version); | 852 | rcutorture_record_progress(++rcu_torture_current_version); |
960 | oldbatch = cur_ops->completed(); | ||
961 | rcu_stutter_wait("rcu_torture_writer"); | 853 | rcu_stutter_wait("rcu_torture_writer"); |
962 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 854 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
963 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); | 855 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); |
@@ -983,10 +875,18 @@ rcu_torture_fakewriter(void *arg) | |||
983 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); | 875 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); |
984 | udelay(rcu_random(&rand) & 0x3ff); | 876 | udelay(rcu_random(&rand) & 0x3ff); |
985 | if (cur_ops->cb_barrier != NULL && | 877 | if (cur_ops->cb_barrier != NULL && |
986 | rcu_random(&rand) % (nfakewriters * 8) == 0) | 878 | rcu_random(&rand) % (nfakewriters * 8) == 0) { |
987 | cur_ops->cb_barrier(); | 879 | cur_ops->cb_barrier(); |
988 | else | 880 | } else if (gp_normal == gp_exp) { |
881 | if (rcu_random(&rand) & 0x80) | ||
882 | cur_ops->sync(); | ||
883 | else | ||
884 | cur_ops->exp_sync(); | ||
885 | } else if (gp_normal) { | ||
989 | cur_ops->sync(); | 886 | cur_ops->sync(); |
887 | } else { | ||
888 | cur_ops->exp_sync(); | ||
889 | } | ||
990 | rcu_stutter_wait("rcu_torture_fakewriter"); | 890 | rcu_stutter_wait("rcu_torture_fakewriter"); |
991 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 891 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
992 | 892 | ||
@@ -1364,7 +1264,7 @@ rcu_torture_stutter(void *arg) | |||
1364 | } | 1264 | } |
1365 | 1265 | ||
1366 | static inline void | 1266 | static inline void |
1367 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) | 1267 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) |
1368 | { | 1268 | { |
1369 | pr_alert("%s" TORTURE_FLAG | 1269 | pr_alert("%s" TORTURE_FLAG |
1370 | "--- %s: nreaders=%d nfakewriters=%d " | 1270 | "--- %s: nreaders=%d nfakewriters=%d " |
@@ -1534,7 +1434,13 @@ rcu_torture_onoff(void *arg) | |||
1534 | torture_type, cpu); | 1434 | torture_type, cpu); |
1535 | starttime = jiffies; | 1435 | starttime = jiffies; |
1536 | n_online_attempts++; | 1436 | n_online_attempts++; |
1537 | if (cpu_up(cpu) == 0) { | 1437 | ret = cpu_up(cpu); |
1438 | if (ret) { | ||
1439 | if (verbose) | ||
1440 | pr_alert("%s" TORTURE_FLAG | ||
1441 | "rcu_torture_onoff task: online %d failed: errno %d\n", | ||
1442 | torture_type, cpu, ret); | ||
1443 | } else { | ||
1538 | if (verbose) | 1444 | if (verbose) |
1539 | pr_alert("%s" TORTURE_FLAG | 1445 | pr_alert("%s" TORTURE_FLAG |
1540 | "rcu_torture_onoff task: onlined %d\n", | 1446 | "rcu_torture_onoff task: onlined %d\n", |
@@ -1934,6 +1840,62 @@ rcu_torture_cleanup(void) | |||
1934 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); | 1840 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); |
1935 | } | 1841 | } |
1936 | 1842 | ||
1843 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | ||
1844 | static void rcu_torture_leak_cb(struct rcu_head *rhp) | ||
1845 | { | ||
1846 | } | ||
1847 | |||
1848 | static void rcu_torture_err_cb(struct rcu_head *rhp) | ||
1849 | { | ||
1850 | /* | ||
1851 | * This -might- happen due to race conditions, but is unlikely. | ||
1852 | * The scenario that leads to this happening is that the | ||
1853 | * first of the pair of duplicate callbacks is queued, | ||
1854 | * someone else starts a grace period that includes that | ||
1855 | * callback, then the second of the pair must wait for the | ||
1856 | * next grace period. Unlikely, but can happen. If it | ||
1857 | * does happen, the debug-objects subsystem won't have splatted. | ||
1858 | */ | ||
1859 | pr_alert("rcutorture: duplicated callback was invoked.\n"); | ||
1860 | } | ||
1861 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
1862 | |||
1863 | /* | ||
1864 | * Verify that double-free causes debug-objects to complain, but only | ||
1865 | * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test | ||
1866 | * cannot be carried out. | ||
1867 | */ | ||
1868 | static void rcu_test_debug_objects(void) | ||
1869 | { | ||
1870 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | ||
1871 | struct rcu_head rh1; | ||
1872 | struct rcu_head rh2; | ||
1873 | |||
1874 | init_rcu_head_on_stack(&rh1); | ||
1875 | init_rcu_head_on_stack(&rh2); | ||
1876 | pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); | ||
1877 | |||
1878 | /* Try to queue the rh2 pair of callbacks for the same grace period. */ | ||
1879 | preempt_disable(); /* Prevent preemption from interrupting test. */ | ||
1880 | rcu_read_lock(); /* Make it impossible to finish a grace period. */ | ||
1881 | call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ | ||
1882 | local_irq_disable(); /* Make it harder to start a new grace period. */ | ||
1883 | call_rcu(&rh2, rcu_torture_leak_cb); | ||
1884 | call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ | ||
1885 | local_irq_enable(); | ||
1886 | rcu_read_unlock(); | ||
1887 | preempt_enable(); | ||
1888 | |||
1889 | /* Wait for them all to get done so we can safely return. */ | ||
1890 | rcu_barrier(); | ||
1891 | pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); | ||
1892 | destroy_rcu_head_on_stack(&rh1); | ||
1893 | destroy_rcu_head_on_stack(&rh2); | ||
1894 | #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
1895 | pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); | ||
1896 | #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
1897 | } | ||
1898 | |||
1937 | static int __init | 1899 | static int __init |
1938 | rcu_torture_init(void) | 1900 | rcu_torture_init(void) |
1939 | { | 1901 | { |
@@ -1941,11 +1903,9 @@ rcu_torture_init(void) | |||
1941 | int cpu; | 1903 | int cpu; |
1942 | int firsterr = 0; | 1904 | int firsterr = 0; |
1943 | int retval; | 1905 | int retval; |
1944 | static struct rcu_torture_ops *torture_ops[] = | 1906 | static struct rcu_torture_ops *torture_ops[] = { |
1945 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, | 1907 | &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, |
1946 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, | 1908 | }; |
1947 | &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, | ||
1948 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; | ||
1949 | 1909 | ||
1950 | mutex_lock(&fullstop_mutex); | 1910 | mutex_lock(&fullstop_mutex); |
1951 | 1911 | ||
@@ -2163,6 +2123,8 @@ rcu_torture_init(void) | |||
2163 | firsterr = retval; | 2123 | firsterr = retval; |
2164 | goto unwind; | 2124 | goto unwind; |
2165 | } | 2125 | } |
2126 | if (object_debug) | ||
2127 | rcu_test_debug_objects(); | ||
2166 | rcutorture_record_test_transition(); | 2128 | rcutorture_record_test_transition(); |
2167 | mutex_unlock(&fullstop_mutex); | 2129 | mutex_unlock(&fullstop_mutex); |
2168 | return 0; | 2130 | return 0; |
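The rcutorture rework above folds the separate *_sync and *_expedited ops variants into the base flavors: each surviving rcu_torture_ops now provides both .sync and .exp_sync, and the new gp_normal/gp_exp module parameters choose between them at run time, with a per-iteration coin flip when neither (or both) is set. The editor's condensation of the selection logic used by the writer and fake-writer threads:

static bool torture_pick_expedited(struct rcu_random_state *rrsp)
{
        if (gp_normal == gp_exp)                /* neither or both set: mix the two */
                return !!(rcu_random(rrsp) & 0x80);
        return gp_exp;                          /* otherwise honour the explicit choice */
}

So loading the module with gp_exp=1 pushes all synchronous grace periods through ->exp_sync(), while object_debug=1 additionally runs rcu_test_debug_objects() at init time to check that a duplicate call_rcu() splats when CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.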
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 068de3a93606..32618b3fe4e6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -53,18 +53,38 @@ | |||
53 | #include <linux/delay.h> | 53 | #include <linux/delay.h> |
54 | #include <linux/stop_machine.h> | 54 | #include <linux/stop_machine.h> |
55 | #include <linux/random.h> | 55 | #include <linux/random.h> |
56 | #include <linux/ftrace_event.h> | ||
57 | #include <linux/suspend.h> | ||
56 | 58 | ||
57 | #include "rcutree.h" | 59 | #include "rcutree.h" |
58 | #include <trace/events/rcu.h> | 60 | #include <trace/events/rcu.h> |
59 | 61 | ||
60 | #include "rcu.h" | 62 | #include "rcu.h" |
61 | 63 | ||
64 | /* | ||
65 | * Strings used in tracepoints need to be exported via the | ||
66 | * tracing system such that tools like perf and trace-cmd can | ||
67 | * translate the string address pointers to actual text. | ||
68 | */ | ||
69 | #define TPS(x) tracepoint_string(x) | ||
70 | |||
62 | /* Data structures. */ | 71 | /* Data structures. */ |
63 | 72 | ||
64 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; | 73 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; |
65 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | 74 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; |
66 | 75 | ||
67 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \ | 76 | /* |
77 | * In order to export the rcu_state name to the tracing tools, it | ||
78 | * needs to be added in the __tracepoint_string section. | ||
79 | * This requires defining a separate variable tp_<sname>_varname | ||
80 | * that points to the string being used, and this will allow | ||
81 | * the tracing userspace tools to be able to decipher the string | ||
82 | * address to the matching string. | ||
83 | */ | ||
84 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ | ||
85 | static char sname##_varname[] = #sname; \ | ||
86 | static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \ | ||
87 | struct rcu_state sname##_state = { \ | ||
68 | .level = { &sname##_state.node[0] }, \ | 88 | .level = { &sname##_state.node[0] }, \ |
69 | .call = cr, \ | 89 | .call = cr, \ |
70 | .fqs_state = RCU_GP_IDLE, \ | 90 | .fqs_state = RCU_GP_IDLE, \ |
@@ -75,16 +95,13 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | |||
75 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 95 | .orphan_donetail = &sname##_state.orphan_donelist, \ |
76 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 96 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
77 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ | 97 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ |
78 | .name = #sname, \ | 98 | .name = sname##_varname, \ |
79 | .abbr = sabbr, \ | 99 | .abbr = sabbr, \ |
80 | } | 100 | }; \ |
81 | 101 | DEFINE_PER_CPU(struct rcu_data, sname##_data) | |
82 | struct rcu_state rcu_sched_state = | ||
83 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); | ||
84 | DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); | ||
85 | 102 | ||
86 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); | 103 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); |
87 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); | 104 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); |
88 | 105 | ||
89 | static struct rcu_state *rcu_state; | 106 | static struct rcu_state *rcu_state; |
90 | LIST_HEAD(rcu_struct_flavors); | 107 | LIST_HEAD(rcu_struct_flavors); |
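The new TPS() wrapper and the reshaped RCU_STATE_INITIALIZER() both serve the tracepoint-string export described in the comments above: constant strings handed to RCU trace events are placed in the __tracepoint_string section so perf and trace-cmd can translate the recorded const char * back into text. For reference, the macro now effectively emits the following per flavor (editor's expansion for rcu_sched, simplified):

static char rcu_sched_varname[] = "rcu_sched";
static const char *tp_rcu_sched_varname __used __tracepoint_string = rcu_sched_varname;
struct rcu_state rcu_sched_state = {
        /* ... usual initializers ... */
        .name = rcu_sched_varname,      /* traced name resolves to real text */
        .abbr = 's',
};
DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);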
@@ -178,7 +195,7 @@ void rcu_sched_qs(int cpu) | |||
178 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); | 195 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); |
179 | 196 | ||
180 | if (rdp->passed_quiesce == 0) | 197 | if (rdp->passed_quiesce == 0) |
181 | trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); | 198 | trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); |
182 | rdp->passed_quiesce = 1; | 199 | rdp->passed_quiesce = 1; |
183 | } | 200 | } |
184 | 201 | ||
@@ -187,7 +204,7 @@ void rcu_bh_qs(int cpu) | |||
187 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); | 204 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); |
188 | 205 | ||
189 | if (rdp->passed_quiesce == 0) | 206 | if (rdp->passed_quiesce == 0) |
190 | trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); | 207 | trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); |
191 | rdp->passed_quiesce = 1; | 208 | rdp->passed_quiesce = 1; |
192 | } | 209 | } |
193 | 210 | ||
@@ -198,16 +215,20 @@ void rcu_bh_qs(int cpu) | |||
198 | */ | 215 | */ |
199 | void rcu_note_context_switch(int cpu) | 216 | void rcu_note_context_switch(int cpu) |
200 | { | 217 | { |
201 | trace_rcu_utilization("Start context switch"); | 218 | trace_rcu_utilization(TPS("Start context switch")); |
202 | rcu_sched_qs(cpu); | 219 | rcu_sched_qs(cpu); |
203 | rcu_preempt_note_context_switch(cpu); | 220 | rcu_preempt_note_context_switch(cpu); |
204 | trace_rcu_utilization("End context switch"); | 221 | trace_rcu_utilization(TPS("End context switch")); |
205 | } | 222 | } |
206 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 223 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
207 | 224 | ||
208 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 225 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
209 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | 226 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, |
210 | .dynticks = ATOMIC_INIT(1), | 227 | .dynticks = ATOMIC_INIT(1), |
228 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
229 | .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, | ||
230 | .dynticks_idle = ATOMIC_INIT(1), | ||
231 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
211 | }; | 232 | }; |
212 | 233 | ||
213 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ | 234 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ |
@@ -226,7 +247,10 @@ module_param(jiffies_till_next_fqs, ulong, 0644); | |||
226 | 247 | ||
227 | static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | 248 | static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, |
228 | struct rcu_data *rdp); | 249 | struct rcu_data *rdp); |
229 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); | 250 | static void force_qs_rnp(struct rcu_state *rsp, |
251 | int (*f)(struct rcu_data *rsp, bool *isidle, | ||
252 | unsigned long *maxj), | ||
253 | bool *isidle, unsigned long *maxj); | ||
230 | static void force_quiescent_state(struct rcu_state *rsp); | 254 | static void force_quiescent_state(struct rcu_state *rsp); |
231 | static int rcu_pending(int cpu); | 255 | static int rcu_pending(int cpu); |
232 | 256 | ||
@@ -345,11 +369,11 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) | |||
345 | static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, | 369 | static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, |
346 | bool user) | 370 | bool user) |
347 | { | 371 | { |
348 | trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); | 372 | trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); |
349 | if (!user && !is_idle_task(current)) { | 373 | if (!user && !is_idle_task(current)) { |
350 | struct task_struct *idle = idle_task(smp_processor_id()); | 374 | struct task_struct *idle = idle_task(smp_processor_id()); |
351 | 375 | ||
352 | trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); | 376 | trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); |
353 | ftrace_dump(DUMP_ORIG); | 377 | ftrace_dump(DUMP_ORIG); |
354 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 378 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
355 | current->pid, current->comm, | 379 | current->pid, current->comm, |
@@ -411,6 +435,7 @@ void rcu_idle_enter(void) | |||
411 | 435 | ||
412 | local_irq_save(flags); | 436 | local_irq_save(flags); |
413 | rcu_eqs_enter(false); | 437 | rcu_eqs_enter(false); |
438 | rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); | ||
414 | local_irq_restore(flags); | 439 | local_irq_restore(flags); |
415 | } | 440 | } |
416 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | 441 | EXPORT_SYMBOL_GPL(rcu_idle_enter); |
@@ -428,27 +453,6 @@ void rcu_user_enter(void) | |||
428 | { | 453 | { |
429 | rcu_eqs_enter(1); | 454 | rcu_eqs_enter(1); |
430 | } | 455 | } |
431 | |||
432 | /** | ||
433 | * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace | ||
434 | * after the current irq returns. | ||
435 | * | ||
436 | * This is similar to rcu_user_enter() but in the context of a non-nesting | ||
437 | * irq. After this call, RCU enters into idle mode when the interrupt | ||
438 | * returns. | ||
439 | */ | ||
440 | void rcu_user_enter_after_irq(void) | ||
441 | { | ||
442 | unsigned long flags; | ||
443 | struct rcu_dynticks *rdtp; | ||
444 | |||
445 | local_irq_save(flags); | ||
446 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
447 | /* Ensure this irq is interrupting a non-idle RCU state. */ | ||
448 | WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK)); | ||
449 | rdtp->dynticks_nesting = 1; | ||
450 | local_irq_restore(flags); | ||
451 | } | ||
452 | #endif /* CONFIG_RCU_USER_QS */ | 456 | #endif /* CONFIG_RCU_USER_QS */ |
453 | 457 | ||
454 | /** | 458 | /** |
@@ -479,9 +483,10 @@ void rcu_irq_exit(void) | |||
479 | rdtp->dynticks_nesting--; | 483 | rdtp->dynticks_nesting--; |
480 | WARN_ON_ONCE(rdtp->dynticks_nesting < 0); | 484 | WARN_ON_ONCE(rdtp->dynticks_nesting < 0); |
481 | if (rdtp->dynticks_nesting) | 485 | if (rdtp->dynticks_nesting) |
482 | trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); | 486 | trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); |
483 | else | 487 | else |
484 | rcu_eqs_enter_common(rdtp, oldval, true); | 488 | rcu_eqs_enter_common(rdtp, oldval, true); |
489 | rcu_sysidle_enter(rdtp, 1); | ||
485 | local_irq_restore(flags); | 490 | local_irq_restore(flags); |
486 | } | 491 | } |
487 | 492 | ||
@@ -501,11 +506,11 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, | |||
501 | smp_mb__after_atomic_inc(); /* See above. */ | 506 | smp_mb__after_atomic_inc(); /* See above. */ |
502 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 507 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); |
503 | rcu_cleanup_after_idle(smp_processor_id()); | 508 | rcu_cleanup_after_idle(smp_processor_id()); |
504 | trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); | 509 | trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); |
505 | if (!user && !is_idle_task(current)) { | 510 | if (!user && !is_idle_task(current)) { |
506 | struct task_struct *idle = idle_task(smp_processor_id()); | 511 | struct task_struct *idle = idle_task(smp_processor_id()); |
507 | 512 | ||
508 | trace_rcu_dyntick("Error on exit: not idle task", | 513 | trace_rcu_dyntick(TPS("Error on exit: not idle task"), |
509 | oldval, rdtp->dynticks_nesting); | 514 | oldval, rdtp->dynticks_nesting); |
510 | ftrace_dump(DUMP_ORIG); | 515 | ftrace_dump(DUMP_ORIG); |
511 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 516 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
@@ -550,6 +555,7 @@ void rcu_idle_exit(void) | |||
550 | 555 | ||
551 | local_irq_save(flags); | 556 | local_irq_save(flags); |
552 | rcu_eqs_exit(false); | 557 | rcu_eqs_exit(false); |
558 | rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); | ||
553 | local_irq_restore(flags); | 559 | local_irq_restore(flags); |
554 | } | 560 | } |
555 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | 561 | EXPORT_SYMBOL_GPL(rcu_idle_exit); |
@@ -565,28 +571,6 @@ void rcu_user_exit(void) | |||
565 | { | 571 | { |
566 | rcu_eqs_exit(1); | 572 | rcu_eqs_exit(1); |
567 | } | 573 | } |
568 | |||
569 | /** | ||
570 | * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace | ||
571 | * idle mode after the current non-nesting irq returns. | ||
572 | * | ||
573 | * This is similar to rcu_user_exit() but in the context of an irq. | ||
574 | * This is called when the irq has interrupted a userspace RCU idle mode | ||
575 | * context. When the current non-nesting interrupt returns after this call, | ||
576 | * the CPU won't restore the RCU idle mode. | ||
577 | */ | ||
578 | void rcu_user_exit_after_irq(void) | ||
579 | { | ||
580 | unsigned long flags; | ||
581 | struct rcu_dynticks *rdtp; | ||
582 | |||
583 | local_irq_save(flags); | ||
584 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
585 | /* Ensure we are interrupting an RCU idle mode. */ | ||
586 | WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK); | ||
587 | rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE; | ||
588 | local_irq_restore(flags); | ||
589 | } | ||
590 | #endif /* CONFIG_RCU_USER_QS */ | 574 | #endif /* CONFIG_RCU_USER_QS */ |
591 | 575 | ||
592 | /** | 576 | /** |
@@ -620,9 +604,10 @@ void rcu_irq_enter(void) | |||
620 | rdtp->dynticks_nesting++; | 604 | rdtp->dynticks_nesting++; |
621 | WARN_ON_ONCE(rdtp->dynticks_nesting == 0); | 605 | WARN_ON_ONCE(rdtp->dynticks_nesting == 0); |
622 | if (oldval) | 606 | if (oldval) |
623 | trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); | 607 | trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); |
624 | else | 608 | else |
625 | rcu_eqs_exit_common(rdtp, oldval, true); | 609 | rcu_eqs_exit_common(rdtp, oldval, true); |
610 | rcu_sysidle_exit(rdtp, 1); | ||
626 | local_irq_restore(flags); | 611 | local_irq_restore(flags); |
627 | } | 612 | } |
628 | 613 | ||
@@ -746,9 +731,11 @@ static int rcu_is_cpu_rrupt_from_idle(void) | |||
746 | * credit them with an implicit quiescent state. Return 1 if this CPU | 731 | * credit them with an implicit quiescent state. Return 1 if this CPU |
747 | * is in dynticks idle mode, which is an extended quiescent state. | 732 | * is in dynticks idle mode, which is an extended quiescent state. |
748 | */ | 733 | */ |
749 | static int dyntick_save_progress_counter(struct rcu_data *rdp) | 734 | static int dyntick_save_progress_counter(struct rcu_data *rdp, |
735 | bool *isidle, unsigned long *maxj) | ||
750 | { | 736 | { |
751 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); | 737 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); |
738 | rcu_sysidle_check_cpu(rdp, isidle, maxj); | ||
752 | return (rdp->dynticks_snap & 0x1) == 0; | 739 | return (rdp->dynticks_snap & 0x1) == 0; |
753 | } | 740 | } |
754 | 741 | ||
@@ -758,7 +745,8 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) | |||
758 | * idle state since the last call to dyntick_save_progress_counter() | 745 | * idle state since the last call to dyntick_save_progress_counter() |
759 | * for this same CPU, or by virtue of having been offline. | 746 | * for this same CPU, or by virtue of having been offline. |
760 | */ | 747 | */ |
761 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | 748 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, |
749 | bool *isidle, unsigned long *maxj) | ||
762 | { | 750 | { |
763 | unsigned int curr; | 751 | unsigned int curr; |
764 | unsigned int snap; | 752 | unsigned int snap; |
@@ -775,7 +763,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
775 | * of the current RCU grace period. | 763 | * of the current RCU grace period. |
776 | */ | 764 | */ |
777 | if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { | 765 | if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { |
778 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); | 766 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); |
779 | rdp->dynticks_fqs++; | 767 | rdp->dynticks_fqs++; |
780 | return 1; | 768 | return 1; |
781 | } | 769 | } |
@@ -795,7 +783,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
795 | return 0; /* Grace period is not old enough. */ | 783 | return 0; /* Grace period is not old enough. */ |
796 | barrier(); | 784 | barrier(); |
797 | if (cpu_is_offline(rdp->cpu)) { | 785 | if (cpu_is_offline(rdp->cpu)) { |
798 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); | 786 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); |
799 | rdp->offline_fqs++; | 787 | rdp->offline_fqs++; |
800 | return 1; | 788 | return 1; |
801 | } | 789 | } |
@@ -1032,7 +1020,7 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, | |||
1032 | * rcu_nocb_wait_gp(). | 1020 | * rcu_nocb_wait_gp(). |
1033 | */ | 1021 | */ |
1034 | static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | 1022 | static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, |
1035 | unsigned long c, char *s) | 1023 | unsigned long c, const char *s) |
1036 | { | 1024 | { |
1037 | trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, | 1025 | trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, |
1038 | rnp->completed, c, rnp->level, | 1026 | rnp->completed, c, rnp->level, |
@@ -1058,9 +1046,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1058 | * grace period is already marked as needed, return to the caller. | 1046 | * grace period is already marked as needed, return to the caller. |
1059 | */ | 1047 | */ |
1060 | c = rcu_cbs_completed(rdp->rsp, rnp); | 1048 | c = rcu_cbs_completed(rdp->rsp, rnp); |
1061 | trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); | 1049 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); |
1062 | if (rnp->need_future_gp[c & 0x1]) { | 1050 | if (rnp->need_future_gp[c & 0x1]) { |
1063 | trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); | 1051 | trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); |
1064 | return c; | 1052 | return c; |
1065 | } | 1053 | } |
1066 | 1054 | ||
@@ -1074,7 +1062,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1074 | if (rnp->gpnum != rnp->completed || | 1062 | if (rnp->gpnum != rnp->completed || |
1075 | ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { | 1063 | ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { |
1076 | rnp->need_future_gp[c & 0x1]++; | 1064 | rnp->need_future_gp[c & 0x1]++; |
1077 | trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); | 1065 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); |
1078 | return c; | 1066 | return c; |
1079 | } | 1067 | } |
1080 | 1068 | ||
@@ -1102,7 +1090,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1102 | * recorded, trace and leave. | 1090 | * recorded, trace and leave. |
1103 | */ | 1091 | */ |
1104 | if (rnp_root->need_future_gp[c & 0x1]) { | 1092 | if (rnp_root->need_future_gp[c & 0x1]) { |
1105 | trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); | 1093 | trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); |
1106 | goto unlock_out; | 1094 | goto unlock_out; |
1107 | } | 1095 | } |
1108 | 1096 | ||
@@ -1111,9 +1099,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1111 | 1099 | ||
1112 | /* If a grace period is not already in progress, start one. */ | 1100 | /* If a grace period is not already in progress, start one. */ |
1113 | if (rnp_root->gpnum != rnp_root->completed) { | 1101 | if (rnp_root->gpnum != rnp_root->completed) { |
1114 | trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); | 1102 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); |
1115 | } else { | 1103 | } else { |
1116 | trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); | 1104 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); |
1117 | rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); | 1105 | rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); |
1118 | } | 1106 | } |
1119 | unlock_out: | 1107 | unlock_out: |
@@ -1137,7 +1125,8 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | |||
1137 | rcu_nocb_gp_cleanup(rsp, rnp); | 1125 | rcu_nocb_gp_cleanup(rsp, rnp); |
1138 | rnp->need_future_gp[c & 0x1] = 0; | 1126 | rnp->need_future_gp[c & 0x1] = 0; |
1139 | needmore = rnp->need_future_gp[(c + 1) & 0x1]; | 1127 | needmore = rnp->need_future_gp[(c + 1) & 0x1]; |
1140 | trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); | 1128 | trace_rcu_future_gp(rnp, rdp, c, |
1129 | needmore ? TPS("CleanupMore") : TPS("Cleanup")); | ||
1141 | return needmore; | 1130 | return needmore; |
1142 | } | 1131 | } |
1143 | 1132 | ||
@@ -1205,9 +1194,9 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1205 | 1194 | ||
1206 | /* Trace depending on how much we were able to accelerate. */ | 1195 | /* Trace depending on how much we were able to accelerate. */ |
1207 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) | 1196 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) |
1208 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB"); | 1197 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); |
1209 | else | 1198 | else |
1210 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB"); | 1199 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); |
1211 | } | 1200 | } |
1212 | 1201 | ||
1213 | /* | 1202 | /* |
@@ -1273,7 +1262,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc | |||
1273 | 1262 | ||
1274 | /* Remember that we saw this grace-period completion. */ | 1263 | /* Remember that we saw this grace-period completion. */ |
1275 | rdp->completed = rnp->completed; | 1264 | rdp->completed = rnp->completed; |
1276 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); | 1265 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); |
1277 | } | 1266 | } |
1278 | 1267 | ||
1279 | if (rdp->gpnum != rnp->gpnum) { | 1268 | if (rdp->gpnum != rnp->gpnum) { |
@@ -1283,7 +1272,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc | |||
1283 | * go looking for one. | 1272 | * go looking for one. |
1284 | */ | 1273 | */ |
1285 | rdp->gpnum = rnp->gpnum; | 1274 | rdp->gpnum = rnp->gpnum; |
1286 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); | 1275 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); |
1287 | rdp->passed_quiesce = 0; | 1276 | rdp->passed_quiesce = 0; |
1288 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); | 1277 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); |
1289 | zero_cpu_stall_ticks(rdp); | 1278 | zero_cpu_stall_ticks(rdp); |
@@ -1315,6 +1304,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1315 | struct rcu_data *rdp; | 1304 | struct rcu_data *rdp; |
1316 | struct rcu_node *rnp = rcu_get_root(rsp); | 1305 | struct rcu_node *rnp = rcu_get_root(rsp); |
1317 | 1306 | ||
1307 | rcu_bind_gp_kthread(); | ||
1318 | raw_spin_lock_irq(&rnp->lock); | 1308 | raw_spin_lock_irq(&rnp->lock); |
1319 | rsp->gp_flags = 0; /* Clear all flags: New grace period. */ | 1309 | rsp->gp_flags = 0; /* Clear all flags: New grace period. */ |
1320 | 1310 | ||
@@ -1326,7 +1316,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1326 | 1316 | ||
1327 | /* Advance to a new grace period and initialize state. */ | 1317 | /* Advance to a new grace period and initialize state. */ |
1328 | rsp->gpnum++; | 1318 | rsp->gpnum++; |
1329 | trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); | 1319 | trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); |
1330 | record_gp_stall_check_time(rsp); | 1320 | record_gp_stall_check_time(rsp); |
1331 | raw_spin_unlock_irq(&rnp->lock); | 1321 | raw_spin_unlock_irq(&rnp->lock); |
1332 | 1322 | ||
@@ -1379,16 +1369,25 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1379 | int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | 1369 | int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) |
1380 | { | 1370 | { |
1381 | int fqs_state = fqs_state_in; | 1371 | int fqs_state = fqs_state_in; |
1372 | bool isidle = false; | ||
1373 | unsigned long maxj; | ||
1382 | struct rcu_node *rnp = rcu_get_root(rsp); | 1374 | struct rcu_node *rnp = rcu_get_root(rsp); |
1383 | 1375 | ||
1384 | rsp->n_force_qs++; | 1376 | rsp->n_force_qs++; |
1385 | if (fqs_state == RCU_SAVE_DYNTICK) { | 1377 | if (fqs_state == RCU_SAVE_DYNTICK) { |
1386 | /* Collect dyntick-idle snapshots. */ | 1378 | /* Collect dyntick-idle snapshots. */ |
1387 | force_qs_rnp(rsp, dyntick_save_progress_counter); | 1379 | if (is_sysidle_rcu_state(rsp)) { |
1380 | isidle = 1; | ||
1381 | maxj = jiffies - ULONG_MAX / 4; | ||
1382 | } | ||
1383 | force_qs_rnp(rsp, dyntick_save_progress_counter, | ||
1384 | &isidle, &maxj); | ||
1385 | rcu_sysidle_report_gp(rsp, isidle, maxj); | ||
1388 | fqs_state = RCU_FORCE_QS; | 1386 | fqs_state = RCU_FORCE_QS; |
1389 | } else { | 1387 | } else { |
1390 | /* Handle dyntick-idle and offline CPUs. */ | 1388 | /* Handle dyntick-idle and offline CPUs. */ |
1391 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs); | 1389 | isidle = 0; |
1390 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); | ||
1392 | } | 1391 | } |
1393 | /* Clear flag to prevent immediate re-entry. */ | 1392 | /* Clear flag to prevent immediate re-entry. */ |
1394 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | 1393 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { |
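A brief editorial note on the isidle/maxj seeding above, since the constant looks odd at first glance:

/*
 * "maxj = jiffies - ULONG_MAX / 4" is a timestamp that ULONG_CMP_LT()
 * treats as far in the past, so the first idle CPU examined by
 * rcu_sysidle_check_cpu() will always advance maxj to its recorded
 * idle-entry time; isidle starts out true only for the flavor that
 * tracks sysidle state and is cleared as soon as any scanned CPU
 * turns out to be non-idle.
 */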
@@ -1448,7 +1447,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
1448 | rcu_nocb_gp_set(rnp, nocb); | 1447 | rcu_nocb_gp_set(rnp, nocb); |
1449 | 1448 | ||
1450 | rsp->completed = rsp->gpnum; /* Declare grace period done. */ | 1449 | rsp->completed = rsp->gpnum; /* Declare grace period done. */ |
1451 | trace_rcu_grace_period(rsp->name, rsp->completed, "end"); | 1450 | trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); |
1452 | rsp->fqs_state = RCU_GP_IDLE; | 1451 | rsp->fqs_state = RCU_GP_IDLE; |
1453 | rdp = this_cpu_ptr(rsp->rda); | 1452 | rdp = this_cpu_ptr(rsp->rda); |
1454 | rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ | 1453 | rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ |
@@ -1558,10 +1557,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1558 | 1557 | ||
1559 | /* | 1558 | /* |
1560 | * We can't do wakeups while holding the rnp->lock, as that | 1559 | * We can't do wakeups while holding the rnp->lock, as that |
1561 | * could cause possible deadlocks with the rq->lock. Deter | 1560 | * could cause possible deadlocks with the rq->lock. Defer |
1562 | * the wakeup to interrupt context. | 1561 | * the wakeup to interrupt context. And don't bother waking |
1562 | * up the running kthread. | ||
1563 | */ | 1563 | */ |
1564 | irq_work_queue(&rsp->wakeup_work); | 1564 | if (current != rsp->gp_kthread) |
1565 | irq_work_queue(&rsp->wakeup_work); | ||
1565 | } | 1566 | } |
1566 | 1567 | ||
1567 | /* | 1568 | /* |
@@ -1857,7 +1858,7 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | |||
1857 | RCU_TRACE(mask = rdp->grpmask); | 1858 | RCU_TRACE(mask = rdp->grpmask); |
1858 | trace_rcu_grace_period(rsp->name, | 1859 | trace_rcu_grace_period(rsp->name, |
1859 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), | 1860 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), |
1860 | "cpuofl"); | 1861 | TPS("cpuofl")); |
1861 | } | 1862 | } |
1862 | 1863 | ||
1863 | /* | 1864 | /* |
@@ -2044,7 +2045,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
2044 | */ | 2045 | */ |
2045 | void rcu_check_callbacks(int cpu, int user) | 2046 | void rcu_check_callbacks(int cpu, int user) |
2046 | { | 2047 | { |
2047 | trace_rcu_utilization("Start scheduler-tick"); | 2048 | trace_rcu_utilization(TPS("Start scheduler-tick")); |
2048 | increment_cpu_stall_ticks(); | 2049 | increment_cpu_stall_ticks(); |
2049 | if (user || rcu_is_cpu_rrupt_from_idle()) { | 2050 | if (user || rcu_is_cpu_rrupt_from_idle()) { |
2050 | 2051 | ||
@@ -2077,7 +2078,7 @@ void rcu_check_callbacks(int cpu, int user) | |||
2077 | rcu_preempt_check_callbacks(cpu); | 2078 | rcu_preempt_check_callbacks(cpu); |
2078 | if (rcu_pending(cpu)) | 2079 | if (rcu_pending(cpu)) |
2079 | invoke_rcu_core(); | 2080 | invoke_rcu_core(); |
2080 | trace_rcu_utilization("End scheduler-tick"); | 2081 | trace_rcu_utilization(TPS("End scheduler-tick")); |
2081 | } | 2082 | } |
2082 | 2083 | ||
2083 | /* | 2084 | /* |
@@ -2087,7 +2088,10 @@ void rcu_check_callbacks(int cpu, int user) | |||
2087 | * | 2088 | * |
2088 | * The caller must have suppressed start of new grace periods. | 2089 | * The caller must have suppressed start of new grace periods. |
2089 | */ | 2090 | */ |
2090 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | 2091 | static void force_qs_rnp(struct rcu_state *rsp, |
2092 | int (*f)(struct rcu_data *rsp, bool *isidle, | ||
2093 | unsigned long *maxj), | ||
2094 | bool *isidle, unsigned long *maxj) | ||
2091 | { | 2095 | { |
2092 | unsigned long bit; | 2096 | unsigned long bit; |
2093 | int cpu; | 2097 | int cpu; |
@@ -2110,9 +2114,12 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
2110 | cpu = rnp->grplo; | 2114 | cpu = rnp->grplo; |
2111 | bit = 1; | 2115 | bit = 1; |
2112 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { | 2116 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { |
2113 | if ((rnp->qsmask & bit) != 0 && | 2117 | if ((rnp->qsmask & bit) != 0) { |
2114 | f(per_cpu_ptr(rsp->rda, cpu))) | 2118 | if ((rnp->qsmaskinit & bit) != 0) |
2115 | mask |= bit; | 2119 | *isidle = 0; |
2120 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) | ||
2121 | mask |= bit; | ||
2122 | } | ||
2116 | } | 2123 | } |
2117 | if (mask != 0) { | 2124 | if (mask != 0) { |
2118 | 2125 | ||
@@ -2208,10 +2215,10 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
2208 | 2215 | ||
2209 | if (cpu_is_offline(smp_processor_id())) | 2216 | if (cpu_is_offline(smp_processor_id())) |
2210 | return; | 2217 | return; |
2211 | trace_rcu_utilization("Start RCU core"); | 2218 | trace_rcu_utilization(TPS("Start RCU core")); |
2212 | for_each_rcu_flavor(rsp) | 2219 | for_each_rcu_flavor(rsp) |
2213 | __rcu_process_callbacks(rsp); | 2220 | __rcu_process_callbacks(rsp); |
2214 | trace_rcu_utilization("End RCU core"); | 2221 | trace_rcu_utilization(TPS("End RCU core")); |
2215 | } | 2222 | } |
2216 | 2223 | ||
2217 | /* | 2224 | /* |
@@ -2287,6 +2294,13 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
2287 | } | 2294 | } |
2288 | 2295 | ||
2289 | /* | 2296 | /* |
2297 | * RCU callback function to leak a callback. | ||
2298 | */ | ||
2299 | static void rcu_leak_callback(struct rcu_head *rhp) | ||
2300 | { | ||
2301 | } | ||
2302 | |||
2303 | /* | ||
2290 | * Helper function for call_rcu() and friends. The cpu argument will | 2304 | * Helper function for call_rcu() and friends. The cpu argument will |
2291 | * normally be -1, indicating "currently running CPU". It may specify | 2305 | * normally be -1, indicating "currently running CPU". It may specify |
2292 | * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() | 2306 | * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() |
@@ -2300,7 +2314,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
2300 | struct rcu_data *rdp; | 2314 | struct rcu_data *rdp; |
2301 | 2315 | ||
2302 | WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ | 2316 | WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ |
2303 | debug_rcu_head_queue(head); | 2317 | if (debug_rcu_head_queue(head)) { |
2318 | /* Probable double call_rcu(), so leak the callback. */ | ||
2319 | ACCESS_ONCE(head->func) = rcu_leak_callback; | ||
2320 | WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); | ||
2321 | return; | ||
2322 | } | ||
2304 | head->func = func; | 2323 | head->func = func; |
2305 | head->next = NULL; | 2324 | head->next = NULL; |
2306 | 2325 | ||
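As an illustration (hypothetical code, not from this patch) of the failure mode the new debug_rcu_head_queue() check catches: posting the same rcu_head twice before the first grace period has elapsed. With the hunk above, and RCU-head debug-objects support enabled, the second post is reported once via WARN_ONCE() and the callback is deliberately leaked instead of corrupting the callback lists.

/* Hypothetical example only; foo/foo_reclaim are invented names. */
struct foo {
	struct rcu_head rh;
	/* ... payload ... */
};

static struct foo shared_foo;

static void foo_reclaim(struct rcu_head *rhp)
{
	/* would normally free or recycle the enclosing object */
}

static void buggy_caller(void)
{
	call_rcu(&shared_foo.rh, foo_reclaim);
	call_rcu(&shared_foo.rh, foo_reclaim);	/* duplicate: now warned and
						 * leaked rather than queued
						 * a second time */
}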
@@ -2720,7 +2739,7 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) | |||
2720 | * Helper function for _rcu_barrier() tracing. If tracing is disabled, | 2739 | * Helper function for _rcu_barrier() tracing. If tracing is disabled, |
2721 | * the compiler is expected to optimize this away. | 2740 | * the compiler is expected to optimize this away. |
2722 | */ | 2741 | */ |
2723 | static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, | 2742 | static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s, |
2724 | int cpu, unsigned long done) | 2743 | int cpu, unsigned long done) |
2725 | { | 2744 | { |
2726 | trace_rcu_barrier(rsp->name, s, cpu, | 2745 | trace_rcu_barrier(rsp->name, s, cpu, |
@@ -2785,9 +2804,20 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
2785 | * transition. The "if" expression below therefore rounds the old | 2804 | * transition. The "if" expression below therefore rounds the old |
2786 | * value up to the next even number and adds two before comparing. | 2805 | * value up to the next even number and adds two before comparing. |
2787 | */ | 2806 | */ |
2788 | snap_done = ACCESS_ONCE(rsp->n_barrier_done); | 2807 | snap_done = rsp->n_barrier_done; |
2789 | _rcu_barrier_trace(rsp, "Check", -1, snap_done); | 2808 | _rcu_barrier_trace(rsp, "Check", -1, snap_done); |
2790 | if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { | 2809 | |
2810 | /* | ||
2811 | * If the value in snap is odd, we needed to wait for the current | ||
2812 | * rcu_barrier() to complete, then wait for the next one, in other | ||
2813 | * words, we need the value of snap_done to be three larger than | ||
2814 | * the value of snap. On the other hand, if the value in snap is | ||
2815 | * even, we only had to wait for the next rcu_barrier() to complete, | ||
2816 | * in other words, we need the value of snap_done to be only two | ||
2817 | * greater than the value of snap. The "(snap + 3) & ~0x1" computes | ||
2818 | * this for us (thank you, Linus!). | ||
2819 | */ | ||
2820 | if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) { | ||
2791 | _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); | 2821 | _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); |
2792 | smp_mb(); /* caller's subsequent code after above check. */ | 2822 | smp_mb(); /* caller's subsequent code after above check. */ |
2793 | mutex_unlock(&rsp->barrier_mutex); | 2823 | mutex_unlock(&rsp->barrier_mutex); |
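A quick worked check of the new "(snap + 3) & ~0x1" threshold, using illustrative values:

/*
 *   snap even, e.g. 4 (no _rcu_barrier() in flight when sampled):
 *       (4 + 3) & ~0x1 = 7 & ~0x1 = 6 = snap + 2
 *   snap odd,  e.g. 5 (an _rcu_barrier() was already in flight):
 *       (5 + 3) & ~0x1 = 8 & ~0x1 = 8 = snap + 3
 *
 * so the single expression produces exactly the two completion counts
 * that the comment above calls for.
 */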
@@ -2930,6 +2960,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
2930 | rdp->blimit = blimit; | 2960 | rdp->blimit = blimit; |
2931 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ | 2961 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ |
2932 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 2962 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
2963 | rcu_sysidle_init_percpu_data(rdp->dynticks); | ||
2933 | atomic_set(&rdp->dynticks->dynticks, | 2964 | atomic_set(&rdp->dynticks->dynticks, |
2934 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); | 2965 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); |
2935 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 2966 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
@@ -2952,7 +2983,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
2952 | rdp->completed = rnp->completed; | 2983 | rdp->completed = rnp->completed; |
2953 | rdp->passed_quiesce = 0; | 2984 | rdp->passed_quiesce = 0; |
2954 | rdp->qs_pending = 0; | 2985 | rdp->qs_pending = 0; |
2955 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); | 2986 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); |
2956 | } | 2987 | } |
2957 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ | 2988 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ |
2958 | rnp = rnp->parent; | 2989 | rnp = rnp->parent; |
@@ -2982,7 +3013,7 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
2982 | struct rcu_node *rnp = rdp->mynode; | 3013 | struct rcu_node *rnp = rdp->mynode; |
2983 | struct rcu_state *rsp; | 3014 | struct rcu_state *rsp; |
2984 | 3015 | ||
2985 | trace_rcu_utilization("Start CPU hotplug"); | 3016 | trace_rcu_utilization(TPS("Start CPU hotplug")); |
2986 | switch (action) { | 3017 | switch (action) { |
2987 | case CPU_UP_PREPARE: | 3018 | case CPU_UP_PREPARE: |
2988 | case CPU_UP_PREPARE_FROZEN: | 3019 | case CPU_UP_PREPARE_FROZEN: |
@@ -3011,7 +3042,26 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
3011 | default: | 3042 | default: |
3012 | break; | 3043 | break; |
3013 | } | 3044 | } |
3014 | trace_rcu_utilization("End CPU hotplug"); | 3045 | trace_rcu_utilization(TPS("End CPU hotplug")); |
3046 | return NOTIFY_OK; | ||
3047 | } | ||
3048 | |||
3049 | static int rcu_pm_notify(struct notifier_block *self, | ||
3050 | unsigned long action, void *hcpu) | ||
3051 | { | ||
3052 | switch (action) { | ||
3053 | case PM_HIBERNATION_PREPARE: | ||
3054 | case PM_SUSPEND_PREPARE: | ||
3055 | if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ | ||
3056 | rcu_expedited = 1; | ||
3057 | break; | ||
3058 | case PM_POST_HIBERNATION: | ||
3059 | case PM_POST_SUSPEND: | ||
3060 | rcu_expedited = 0; | ||
3061 | break; | ||
3062 | default: | ||
3063 | break; | ||
3064 | } | ||
3015 | return NOTIFY_OK; | 3065 | return NOTIFY_OK; |
3016 | } | 3066 | } |
3017 | 3067 | ||
@@ -3256,6 +3306,7 @@ void __init rcu_init(void) | |||
3256 | * or the scheduler are operational. | 3306 | * or the scheduler are operational. |
3257 | */ | 3307 | */ |
3258 | cpu_notifier(rcu_cpu_notify, 0); | 3308 | cpu_notifier(rcu_cpu_notify, 0); |
3309 | pm_notifier(rcu_pm_notify, 0); | ||
3259 | for_each_online_cpu(cpu) | 3310 | for_each_online_cpu(cpu) |
3260 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 3311 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); |
3261 | } | 3312 | } |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index b3832581043c..5f97eab602cd 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -88,6 +88,14 @@ struct rcu_dynticks { | |||
88 | /* Process level is worth LLONG_MAX/2. */ | 88 | /* Process level is worth LLONG_MAX/2. */ |
89 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ | 89 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
90 | atomic_t dynticks; /* Even value for idle, else odd. */ | 90 | atomic_t dynticks; /* Even value for idle, else odd. */ |
91 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
92 | long long dynticks_idle_nesting; | ||
93 | /* irq/process nesting level from idle. */ | ||
94 | atomic_t dynticks_idle; /* Even value for idle, else odd. */ | ||
95 | /* "Idle" excludes userspace execution. */ | ||
96 | unsigned long dynticks_idle_jiffies; | ||
97 | /* End of last non-NMI non-idle period. */ | ||
98 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
91 | #ifdef CONFIG_RCU_FAST_NO_HZ | 99 | #ifdef CONFIG_RCU_FAST_NO_HZ |
92 | bool all_lazy; /* Are all CPU's CBs lazy? */ | 100 | bool all_lazy; /* Are all CPU's CBs lazy? */ |
93 | unsigned long nonlazy_posted; | 101 | unsigned long nonlazy_posted; |
@@ -445,7 +453,7 @@ struct rcu_state { | |||
445 | /* for CPU stalls. */ | 453 | /* for CPU stalls. */ |
446 | unsigned long gp_max; /* Maximum GP duration in */ | 454 | unsigned long gp_max; /* Maximum GP duration in */ |
447 | /* jiffies. */ | 455 | /* jiffies. */ |
448 | char *name; /* Name of structure. */ | 456 | const char *name; /* Name of structure. */ |
449 | char abbr; /* Abbreviated name. */ | 457 | char abbr; /* Abbreviated name. */ |
450 | struct list_head flavors; /* List of RCU flavors. */ | 458 | struct list_head flavors; /* List of RCU flavors. */ |
451 | struct irq_work wakeup_work; /* Postponed wakeups */ | 459 | struct irq_work wakeup_work; /* Postponed wakeups */ |
@@ -545,6 +553,15 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); | |||
545 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); | 553 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); |
546 | static void rcu_kick_nohz_cpu(int cpu); | 554 | static void rcu_kick_nohz_cpu(int cpu); |
547 | static bool init_nocb_callback_list(struct rcu_data *rdp); | 555 | static bool init_nocb_callback_list(struct rcu_data *rdp); |
556 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); | ||
557 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); | ||
558 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
559 | unsigned long *maxj); | ||
560 | static bool is_sysidle_rcu_state(struct rcu_state *rsp); | ||
561 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
562 | unsigned long maxj); | ||
563 | static void rcu_bind_gp_kthread(void); | ||
564 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); | ||
548 | 565 | ||
549 | #endif /* #ifndef RCU_TREE_NONCORE */ | 566 | #endif /* #ifndef RCU_TREE_NONCORE */ |
550 | 567 | ||
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 769e12e3151b..130c97b027f2 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -28,7 +28,7 @@ | |||
28 | #include <linux/gfp.h> | 28 | #include <linux/gfp.h> |
29 | #include <linux/oom.h> | 29 | #include <linux/oom.h> |
30 | #include <linux/smpboot.h> | 30 | #include <linux/smpboot.h> |
31 | #include <linux/tick.h> | 31 | #include "time/tick-internal.h" |
32 | 32 | ||
33 | #define RCU_KTHREAD_PRIO 1 | 33 | #define RCU_KTHREAD_PRIO 1 |
34 | 34 | ||
@@ -110,9 +110,7 @@ static void __init rcu_bootup_announce_oddness(void) | |||
110 | 110 | ||
111 | #ifdef CONFIG_TREE_PREEMPT_RCU | 111 | #ifdef CONFIG_TREE_PREEMPT_RCU |
112 | 112 | ||
113 | struct rcu_state rcu_preempt_state = | 113 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); |
114 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); | ||
115 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); | ||
116 | static struct rcu_state *rcu_state = &rcu_preempt_state; | 114 | static struct rcu_state *rcu_state = &rcu_preempt_state; |
117 | 115 | ||
118 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); | 116 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); |
@@ -169,7 +167,7 @@ static void rcu_preempt_qs(int cpu) | |||
169 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); | 167 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); |
170 | 168 | ||
171 | if (rdp->passed_quiesce == 0) | 169 | if (rdp->passed_quiesce == 0) |
172 | trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); | 170 | trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); |
173 | rdp->passed_quiesce = 1; | 171 | rdp->passed_quiesce = 1; |
174 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | 172 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; |
175 | } | 173 | } |
@@ -388,7 +386,7 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
388 | np = rcu_next_node_entry(t, rnp); | 386 | np = rcu_next_node_entry(t, rnp); |
389 | list_del_init(&t->rcu_node_entry); | 387 | list_del_init(&t->rcu_node_entry); |
390 | t->rcu_blocked_node = NULL; | 388 | t->rcu_blocked_node = NULL; |
391 | trace_rcu_unlock_preempted_task("rcu_preempt", | 389 | trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), |
392 | rnp->gpnum, t->pid); | 390 | rnp->gpnum, t->pid); |
393 | if (&t->rcu_node_entry == rnp->gp_tasks) | 391 | if (&t->rcu_node_entry == rnp->gp_tasks) |
394 | rnp->gp_tasks = np; | 392 | rnp->gp_tasks = np; |
@@ -412,7 +410,7 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
412 | */ | 410 | */ |
413 | empty_exp_now = !rcu_preempted_readers_exp(rnp); | 411 | empty_exp_now = !rcu_preempted_readers_exp(rnp); |
414 | if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { | 412 | if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { |
415 | trace_rcu_quiescent_state_report("preempt_rcu", | 413 | trace_rcu_quiescent_state_report(TPS("preempt_rcu"), |
416 | rnp->gpnum, | 414 | rnp->gpnum, |
417 | 0, rnp->qsmask, | 415 | 0, rnp->qsmask, |
418 | rnp->level, | 416 | rnp->level, |
@@ -1250,12 +1248,12 @@ static int rcu_boost_kthread(void *arg) | |||
1250 | int spincnt = 0; | 1248 | int spincnt = 0; |
1251 | int more2boost; | 1249 | int more2boost; |
1252 | 1250 | ||
1253 | trace_rcu_utilization("Start boost kthread@init"); | 1251 | trace_rcu_utilization(TPS("Start boost kthread@init")); |
1254 | for (;;) { | 1252 | for (;;) { |
1255 | rnp->boost_kthread_status = RCU_KTHREAD_WAITING; | 1253 | rnp->boost_kthread_status = RCU_KTHREAD_WAITING; |
1256 | trace_rcu_utilization("End boost kthread@rcu_wait"); | 1254 | trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); |
1257 | rcu_wait(rnp->boost_tasks || rnp->exp_tasks); | 1255 | rcu_wait(rnp->boost_tasks || rnp->exp_tasks); |
1258 | trace_rcu_utilization("Start boost kthread@rcu_wait"); | 1256 | trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); |
1259 | rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; | 1257 | rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; |
1260 | more2boost = rcu_boost(rnp); | 1258 | more2boost = rcu_boost(rnp); |
1261 | if (more2boost) | 1259 | if (more2boost) |
@@ -1264,14 +1262,14 @@ static int rcu_boost_kthread(void *arg) | |||
1264 | spincnt = 0; | 1262 | spincnt = 0; |
1265 | if (spincnt > 10) { | 1263 | if (spincnt > 10) { |
1266 | rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; | 1264 | rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; |
1267 | trace_rcu_utilization("End boost kthread@rcu_yield"); | 1265 | trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); |
1268 | schedule_timeout_interruptible(2); | 1266 | schedule_timeout_interruptible(2); |
1269 | trace_rcu_utilization("Start boost kthread@rcu_yield"); | 1267 | trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); |
1270 | spincnt = 0; | 1268 | spincnt = 0; |
1271 | } | 1269 | } |
1272 | } | 1270 | } |
1273 | /* NOTREACHED */ | 1271 | /* NOTREACHED */ |
1274 | trace_rcu_utilization("End boost kthread@notreached"); | 1272 | trace_rcu_utilization(TPS("End boost kthread@notreached")); |
1275 | return 0; | 1273 | return 0; |
1276 | } | 1274 | } |
1277 | 1275 | ||
@@ -1419,7 +1417,7 @@ static void rcu_cpu_kthread(unsigned int cpu) | |||
1419 | int spincnt; | 1417 | int spincnt; |
1420 | 1418 | ||
1421 | for (spincnt = 0; spincnt < 10; spincnt++) { | 1419 | for (spincnt = 0; spincnt < 10; spincnt++) { |
1422 | trace_rcu_utilization("Start CPU kthread@rcu_wait"); | 1420 | trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); |
1423 | local_bh_disable(); | 1421 | local_bh_disable(); |
1424 | *statusp = RCU_KTHREAD_RUNNING; | 1422 | *statusp = RCU_KTHREAD_RUNNING; |
1425 | this_cpu_inc(rcu_cpu_kthread_loops); | 1423 | this_cpu_inc(rcu_cpu_kthread_loops); |
@@ -1431,15 +1429,15 @@ static void rcu_cpu_kthread(unsigned int cpu) | |||
1431 | rcu_kthread_do_work(); | 1429 | rcu_kthread_do_work(); |
1432 | local_bh_enable(); | 1430 | local_bh_enable(); |
1433 | if (*workp == 0) { | 1431 | if (*workp == 0) { |
1434 | trace_rcu_utilization("End CPU kthread@rcu_wait"); | 1432 | trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); |
1435 | *statusp = RCU_KTHREAD_WAITING; | 1433 | *statusp = RCU_KTHREAD_WAITING; |
1436 | return; | 1434 | return; |
1437 | } | 1435 | } |
1438 | } | 1436 | } |
1439 | *statusp = RCU_KTHREAD_YIELDING; | 1437 | *statusp = RCU_KTHREAD_YIELDING; |
1440 | trace_rcu_utilization("Start CPU kthread@rcu_yield"); | 1438 | trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); |
1441 | schedule_timeout_interruptible(2); | 1439 | schedule_timeout_interruptible(2); |
1442 | trace_rcu_utilization("End CPU kthread@rcu_yield"); | 1440 | trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); |
1443 | *statusp = RCU_KTHREAD_WAITING; | 1441 | *statusp = RCU_KTHREAD_WAITING; |
1444 | } | 1442 | } |
1445 | 1443 | ||
@@ -2202,7 +2200,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
2202 | * Wait for the grace period. Do so interruptibly to avoid messing | 2200 | * Wait for the grace period. Do so interruptibly to avoid messing |
2203 | * up the load average. | 2201 | * up the load average. |
2204 | */ | 2202 | */ |
2205 | trace_rcu_future_gp(rnp, rdp, c, "StartWait"); | 2203 | trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); |
2206 | for (;;) { | 2204 | for (;;) { |
2207 | wait_event_interruptible( | 2205 | wait_event_interruptible( |
2208 | rnp->nocb_gp_wq[c & 0x1], | 2206 | rnp->nocb_gp_wq[c & 0x1], |
@@ -2210,9 +2208,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
2210 | if (likely(d)) | 2208 | if (likely(d)) |
2211 | break; | 2209 | break; |
2212 | flush_signals(current); | 2210 | flush_signals(current); |
2213 | trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); | 2211 | trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); |
2214 | } | 2212 | } |
2215 | trace_rcu_future_gp(rnp, rdp, c, "EndWait"); | 2213 | trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); |
2216 | smp_mb(); /* Ensure that CB invocation happens after GP end. */ | 2214 | smp_mb(); /* Ensure that CB invocation happens after GP end. */ |
2217 | } | 2215 | } |
2218 | 2216 | ||
@@ -2375,3 +2373,425 @@ static void rcu_kick_nohz_cpu(int cpu) | |||
2375 | smp_send_reschedule(cpu); | 2373 | smp_send_reschedule(cpu); |
2376 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ | 2374 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ |
2377 | } | 2375 | } |
2376 | |||
2377 | |||
2378 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
2379 | |||
2380 | /* | ||
2381 | * Define RCU flavor that holds sysidle state. This needs to be the | ||
2382 | * most active flavor of RCU. | ||
2383 | */ | ||
2384 | #ifdef CONFIG_PREEMPT_RCU | ||
2385 | static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; | ||
2386 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | ||
2387 | static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; | ||
2388 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | ||
2389 | |||
2390 | static int full_sysidle_state; /* Current system-idle state. */ | ||
2391 | #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ | ||
2392 | #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ | ||
2393 | #define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ | ||
2394 | #define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ | ||
2395 | #define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ | ||
2396 | |||
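As a reading aid for the functions that follow (an editorial summary, not part of the patch), the intended progression of full_sysidle_state is:

/*
 *   RCU_SYSIDLE_NOT --> SHORT --> LONG --> FULL --> FULL_NOTED
 *
 * The steps up to RCU_SYSIDLE_FULL are taken by rcu_sysidle(), driven
 * from the grace-period kthread (or from rcu_sys_is_idle() on small
 * systems); the final FULL -> FULL_NOTED step is taken by
 * rcu_sys_is_idle() on the timekeeping CPU; and any non-timekeeping
 * CPU leaving idle drops the state back to RCU_SYSIDLE_NOT via
 * rcu_sysidle_cancel() or rcu_sysidle_force_exit().
 */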
2397 | /* | ||
2398 | * Invoked to note exit from irq or task transition to idle. Note that | ||
2399 | * usermode execution does -not- count as idle here! After all, we want | ||
2400 | * to detect full-system idle states, not RCU quiescent states and grace | ||
2401 | * periods. The caller must have disabled interrupts. | ||
2402 | */ | ||
2403 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | ||
2404 | { | ||
2405 | unsigned long j; | ||
2406 | |||
2407 | /* Adjust nesting, check for fully idle. */ | ||
2408 | if (irq) { | ||
2409 | rdtp->dynticks_idle_nesting--; | ||
2410 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); | ||
2411 | if (rdtp->dynticks_idle_nesting != 0) | ||
2412 | return; /* Still not fully idle. */ | ||
2413 | } else { | ||
2414 | if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == | ||
2415 | DYNTICK_TASK_NEST_VALUE) { | ||
2416 | rdtp->dynticks_idle_nesting = 0; | ||
2417 | } else { | ||
2418 | rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; | ||
2419 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); | ||
2420 | return; /* Still not fully idle. */ | ||
2421 | } | ||
2422 | } | ||
2423 | |||
2424 | /* Record start of fully idle period. */ | ||
2425 | j = jiffies; | ||
2426 | ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; | ||
2427 | smp_mb__before_atomic_inc(); | ||
2428 | atomic_inc(&rdtp->dynticks_idle); | ||
2429 | smp_mb__after_atomic_inc(); | ||
2430 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); | ||
2431 | } | ||
2432 | |||
2433 | /* | ||
2434 | * Unconditionally force exit from full system-idle state. This is | ||
2435 | * invoked when a normal CPU exits idle, but must be called separately | ||
2436 | * for the timekeeping CPU (tick_do_timer_cpu). The reason for this | ||
2437 | * is that the timekeeping CPU is permitted to take scheduling-clock | ||
2438 | * interrupts while the system is in system-idle state, and of course | ||
2439 | * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock | ||
2440 | * interrupt from any other type of interrupt. | ||
2441 | */ | ||
2442 | void rcu_sysidle_force_exit(void) | ||
2443 | { | ||
2444 | int oldstate = ACCESS_ONCE(full_sysidle_state); | ||
2445 | int newoldstate; | ||
2446 | |||
2447 | /* | ||
2448 | * Each pass through the following loop attempts to exit full | ||
2449 | * system-idle state. If contention proves to be a problem, | ||
2450 | * a trylock-based contention tree could be used here. | ||
2451 | */ | ||
2452 | while (oldstate > RCU_SYSIDLE_SHORT) { | ||
2453 | newoldstate = cmpxchg(&full_sysidle_state, | ||
2454 | oldstate, RCU_SYSIDLE_NOT); | ||
2455 | if (oldstate == newoldstate && | ||
2456 | oldstate == RCU_SYSIDLE_FULL_NOTED) { | ||
2457 | rcu_kick_nohz_cpu(tick_do_timer_cpu); | ||
2458 | return; /* We cleared it, done! */ | ||
2459 | } | ||
2460 | oldstate = newoldstate; | ||
2461 | } | ||
2462 | smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ | ||
2463 | } | ||
2464 | |||
2465 | /* | ||
2466 | * Invoked to note entry to irq or task transition from idle. Note that | ||
2467 | * usermode execution does -not- count as idle here! The caller must | ||
2468 | * have disabled interrupts. | ||
2469 | */ | ||
2470 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | ||
2471 | { | ||
2472 | /* Adjust nesting, check for already non-idle. */ | ||
2473 | if (irq) { | ||
2474 | rdtp->dynticks_idle_nesting++; | ||
2475 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); | ||
2476 | if (rdtp->dynticks_idle_nesting != 1) | ||
2477 | return; /* Already non-idle. */ | ||
2478 | } else { | ||
2479 | /* | ||
2480 | * Allow for irq misnesting. Yes, it really is possible | ||
2481 | * to enter an irq handler then never leave it, and maybe | ||
2482 | * also vice versa. Handle both possibilities. | ||
2483 | */ | ||
2484 | if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { | ||
2485 | rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; | ||
2486 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); | ||
2487 | return; /* Already non-idle. */ | ||
2488 | } else { | ||
2489 | rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
2490 | } | ||
2491 | } | ||
2492 | |||
2493 | /* Record end of idle period. */ | ||
2494 | smp_mb__before_atomic_inc(); | ||
2495 | atomic_inc(&rdtp->dynticks_idle); | ||
2496 | smp_mb__after_atomic_inc(); | ||
2497 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); | ||
2498 | |||
2499 | /* | ||
2500 | * If we are the timekeeping CPU, we are permitted to be non-idle | ||
2501 | * during a system-idle state. This must be the case, because | ||
2502 | * the timekeeping CPU has to take scheduling-clock interrupts | ||
2503 | * during the time that the system is transitioning to full | ||
2504 | * system-idle state. This means that the timekeeping CPU must | ||
2505 | * invoke rcu_sysidle_force_exit() directly if it does anything | ||
2506 | * more than take a scheduling-clock interrupt. | ||
2507 | */ | ||
2508 | if (smp_processor_id() == tick_do_timer_cpu) | ||
2509 | return; | ||
2510 | |||
2511 | /* Update system-idle state: We are clearly no longer fully idle! */ | ||
2512 | rcu_sysidle_force_exit(); | ||
2513 | } | ||
2514 | |||
2515 | /* | ||
2516 | * Check to see if the current CPU is idle. Note that usermode execution | ||
2517 | * does not count as idle. The caller must have disabled interrupts. | ||
2518 | */ | ||
2519 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
2520 | unsigned long *maxj) | ||
2521 | { | ||
2522 | int cur; | ||
2523 | unsigned long j; | ||
2524 | struct rcu_dynticks *rdtp = rdp->dynticks; | ||
2525 | |||
2526 | /* | ||
2527 | * If some other CPU has already reported non-idle, if this is | ||
2528 | * not the flavor of RCU that tracks sysidle state, or if this | ||
2529 | * is an offline CPU or the timekeeping CPU, there is nothing to do. | ||
2530 | */ | ||
2531 | if (!*isidle || rdp->rsp != rcu_sysidle_state || | ||
2532 | cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) | ||
2533 | return; | ||
2534 | if (rcu_gp_in_progress(rdp->rsp)) | ||
2535 | WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); | ||
2536 | |||
2537 | /* Pick up current idle and NMI-nesting counter and check. */ | ||
2538 | cur = atomic_read(&rdtp->dynticks_idle); | ||
2539 | if (cur & 0x1) { | ||
2540 | *isidle = false; /* We are not idle! */ | ||
2541 | return; | ||
2542 | } | ||
2543 | smp_mb(); /* Read counters before timestamps. */ | ||
2544 | |||
2545 | /* Pick up timestamps. */ | ||
2546 | j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies); | ||
2547 | /* If this CPU entered idle more recently, update maxj timestamp. */ | ||
2548 | if (ULONG_CMP_LT(*maxj, j)) | ||
2549 | *maxj = j; | ||
2550 | } | ||
2551 | |||
2552 | /* | ||
2553 | * Is this the flavor of RCU that is handling full-system idle? | ||
2554 | */ | ||
2555 | static bool is_sysidle_rcu_state(struct rcu_state *rsp) | ||
2556 | { | ||
2557 | return rsp == rcu_sysidle_state; | ||
2558 | } | ||
2559 | |||
2560 | /* | ||
2561 | * Bind the grace-period kthread for the sysidle flavor of RCU to the | ||
2562 | * timekeeping CPU. | ||
2563 | */ | ||
2564 | static void rcu_bind_gp_kthread(void) | ||
2565 | { | ||
2566 | int cpu = ACCESS_ONCE(tick_do_timer_cpu); | ||
2567 | |||
2568 | if (cpu < 0 || cpu >= nr_cpu_ids) | ||
2569 | return; | ||
2570 | if (raw_smp_processor_id() != cpu) | ||
2571 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
2572 | } | ||
2573 | |||
2574 | /* | ||
2575 | * Return a delay in jiffies based on the number of CPUs, rcu_node | ||
2576 | * leaf fanout, and jiffies tick rate. The idea is to allow larger | ||
2577 | * systems more time to transition to full-idle state in order to | ||
2578 | * avoid the cache thrashing that would otherwise occur on the state variable. | ||
2579 | * Really small systems (less than a couple of tens of CPUs) should | ||
2580 | * instead use a single global atomically incremented counter, and later | ||
2581 | * versions of this will automatically reconfigure themselves accordingly. | ||
2582 | */ | ||
2583 | static unsigned long rcu_sysidle_delay(void) | ||
2584 | { | ||
2585 | if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) | ||
2586 | return 0; | ||
2587 | return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); | ||
2588 | } | ||
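A worked example of the delay computation, with assumed (purely illustrative) values nr_cpu_ids = 4096, rcu_fanout_leaf = 16, HZ = 250:

/*
 *   DIV_ROUND_UP(4096 * 250, 16 * 1000) = DIV_ROUND_UP(1024000, 16000)
 *                                       = 64 jiffies = 256 ms at HZ=250
 *
 * i.e. roughly one millisecond per leaf rcu_node structure
 * (4096 / 16 = 256 leaves); the HZ factor cancels, so the wall-clock
 * delay does not depend on the tick rate.
 */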
2589 | |||
2590 | /* | ||
2591 | * Advance the full-system-idle state. This is invoked when all of | ||
2592 | * the non-timekeeping CPUs are idle. | ||
2593 | */ | ||
2594 | static void rcu_sysidle(unsigned long j) | ||
2595 | { | ||
2596 | /* Check the current state. */ | ||
2597 | switch (ACCESS_ONCE(full_sysidle_state)) { | ||
2598 | case RCU_SYSIDLE_NOT: | ||
2599 | |||
2600 | /* First time all are idle, so note a short idle period. */ | ||
2601 | ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; | ||
2602 | break; | ||
2603 | |||
2604 | case RCU_SYSIDLE_SHORT: | ||
2605 | |||
2606 | /* | ||
2607 | * Idle for a bit, time to advance to next state? | ||
2608 | * cmpxchg failure means race with non-idle, let them win. | ||
2609 | */ | ||
2610 | if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) | ||
2611 | (void)cmpxchg(&full_sysidle_state, | ||
2612 | RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); | ||
2613 | break; | ||
2614 | |||
2615 | case RCU_SYSIDLE_LONG: | ||
2616 | |||
2617 | /* | ||
2618 | * Do an additional check pass before advancing to full. | ||
2619 | * cmpxchg failure means race with non-idle, let them win. | ||
2620 | */ | ||
2621 | if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) | ||
2622 | (void)cmpxchg(&full_sysidle_state, | ||
2623 | RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); | ||
2624 | break; | ||
2625 | |||
2626 | default: | ||
2627 | break; | ||
2628 | } | ||
2629 | } | ||
2630 | |||
2631 | /* | ||
2632 | * Found a non-idle non-timekeeping CPU, so kick the system-idle state | ||
2633 | * back to the beginning. | ||
2634 | */ | ||
2635 | static void rcu_sysidle_cancel(void) | ||
2636 | { | ||
2637 | smp_mb(); | ||
2638 | ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; | ||
2639 | } | ||
2640 | |||
2641 | /* | ||
2642 | * Update the sysidle state based on the results of a force-quiescent-state | ||
2643 | * scan of the CPUs' dyntick-idle state. | ||
2644 | */ | ||
2645 | static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, | ||
2646 | unsigned long maxj, bool gpkt) | ||
2647 | { | ||
2648 | if (rsp != rcu_sysidle_state) | ||
2649 | return; /* Wrong flavor, ignore. */ | ||
2650 | if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) | ||
2651 | return; /* Running state machine from timekeeping CPU. */ | ||
2652 | if (isidle) | ||
2653 | rcu_sysidle(maxj); /* More idle! */ | ||
2654 | else | ||
2655 | rcu_sysidle_cancel(); /* Idle is over. */ | ||
2656 | } | ||
2657 | |||
2658 | /* | ||
2659 | * Wrapper for rcu_sysidle_report() when called from the grace-period | ||
2660 | * kthread's context. | ||
2661 | */ | ||
2662 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
2663 | unsigned long maxj) | ||
2664 | { | ||
2665 | rcu_sysidle_report(rsp, isidle, maxj, true); | ||
2666 | } | ||
2667 | |||
2668 | /* Callback and function for forcing an RCU grace period. */ | ||
2669 | struct rcu_sysidle_head { | ||
2670 | struct rcu_head rh; | ||
2671 | int inuse; | ||
2672 | }; | ||
2673 | |||
2674 | static void rcu_sysidle_cb(struct rcu_head *rhp) | ||
2675 | { | ||
2676 | struct rcu_sysidle_head *rshp; | ||
2677 | |||
2678 | /* | ||
2679 | * The following memory barrier is needed to replace the | ||
2680 | * memory barriers that would normally be in the memory | ||
2681 | * allocator. | ||
2682 | */ | ||
2683 | smp_mb(); /* grace period precedes setting inuse. */ | ||
2684 | |||
2685 | rshp = container_of(rhp, struct rcu_sysidle_head, rh); | ||
2686 | ACCESS_ONCE(rshp->inuse) = 0; | ||
2687 | } | ||
2688 | |||
2689 | /* | ||
2690 | * Check to see if the system is fully idle, other than the timekeeping CPU. | ||
2691 | * The caller must have disabled interrupts. | ||
2692 | */ | ||
2693 | bool rcu_sys_is_idle(void) | ||
2694 | { | ||
2695 | static struct rcu_sysidle_head rsh; | ||
2696 | int rss = ACCESS_ONCE(full_sysidle_state); | ||
2697 | |||
2698 | if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) | ||
2699 | return false; | ||
2700 | |||
2701 | /* Handle small-system case by doing a full scan of CPUs. */ | ||
2702 | if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { | ||
2703 | int oldrss = rss - 1; | ||
2704 | |||
2705 | /* | ||
2706 | * One pass to advance to each state up to _FULL. | ||
2707 | * Give up if any pass fails to advance the state. | ||
2708 | */ | ||
2709 | while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { | ||
2710 | int cpu; | ||
2711 | bool isidle = true; | ||
2712 | unsigned long maxj = jiffies - ULONG_MAX / 4; | ||
2713 | struct rcu_data *rdp; | ||
2714 | |||
2715 | /* Scan all the CPUs looking for nonidle CPUs. */ | ||
2716 | for_each_possible_cpu(cpu) { | ||
2717 | rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); | ||
2718 | rcu_sysidle_check_cpu(rdp, &isidle, &maxj); | ||
2719 | if (!isidle) | ||
2720 | break; | ||
2721 | } | ||
2722 | rcu_sysidle_report(rcu_sysidle_state, | ||
2723 | isidle, maxj, false); | ||
2724 | oldrss = rss; | ||
2725 | rss = ACCESS_ONCE(full_sysidle_state); | ||
2726 | } | ||
2727 | } | ||
2728 | |||
2729 | /* If this is the first observation of an idle period, record it. */ | ||
2730 | if (rss == RCU_SYSIDLE_FULL) { | ||
2731 | rss = cmpxchg(&full_sysidle_state, | ||
2732 | RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); | ||
2733 | return rss == RCU_SYSIDLE_FULL; | ||
2734 | } | ||
2735 | |||
2736 | smp_mb(); /* ensure rss load happens before later caller actions. */ | ||
2737 | |||
2738 | /* If already fully idle, tell the caller (in case of races). */ | ||
2739 | if (rss == RCU_SYSIDLE_FULL_NOTED) | ||
2740 | return true; | ||
2741 | |||
2742 | /* | ||
2743 | * If we aren't there yet, and a grace period is not in flight, | ||
2744 | * initiate a grace period. Either way, tell the caller that | ||
2745 | * we are not there yet. We use an xchg() rather than an assignment | ||
2746 | * to make up for the memory barriers that would otherwise be | ||
2747 | * provided by the memory allocator. | ||
2748 | */ | ||
2749 | if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && | ||
2750 | !rcu_gp_in_progress(rcu_sysidle_state) && | ||
2751 | !rsh.inuse && xchg(&rsh.inuse, 1) == 0) | ||
2752 | call_rcu(&rsh.rh, rcu_sysidle_cb); | ||
2753 | return false; | ||
2754 | } | ||
2755 | |||
2756 | /* | ||
2757 | * Initialize dynticks sysidle state for CPUs coming online. | ||
2758 | */ | ||
2759 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) | ||
2760 | { | ||
2761 | rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; | ||
2762 | } | ||
2763 | |||
2764 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
2765 | |||
2766 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | ||
2767 | { | ||
2768 | } | ||
2769 | |||
2770 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | ||
2771 | { | ||
2772 | } | ||
2773 | |||
2774 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
2775 | unsigned long *maxj) | ||
2776 | { | ||
2777 | } | ||
2778 | |||
2779 | static bool is_sysidle_rcu_state(struct rcu_state *rsp) | ||
2780 | { | ||
2781 | return false; | ||
2782 | } | ||
2783 | |||
2784 | static void rcu_bind_gp_kthread(void) | ||
2785 | { | ||
2786 | } | ||
2787 | |||
2788 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
2789 | unsigned long maxj) | ||
2790 | { | ||
2791 | } | ||
2792 | |||
2793 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) | ||
2794 | { | ||
2795 | } | ||
2796 | |||
2797 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
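Editor's note: the sysidle code above ends with rcu_sys_is_idle() advancing a shared state variable and letting exactly one observer of RCU_SYSIDLE_FULL report full-system idleness by moving it to RCU_SYSIDLE_FULL_NOTED. A minimal userspace model of that final transition follows (the enum values and helper name are illustrative, not the kernel's; C11 atomics stand in for cmpxchg()):

/*
 * Userspace model, not kernel code: exactly one observer of the FULL
 * state wins the compare-and-swap to FULL_NOTED, mirroring the cmpxchg
 * in rcu_sys_is_idle() above.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum sysidle_state {
        SYSIDLE_NOT,            /* some non-timekeeping CPU is non-idle */
        SYSIDLE_FULL = 3,       /* all of them have been idle long enough */
        SYSIDLE_FULL_NOTED,     /* ... and that fact has been reported */
};

static _Atomic int full_sysidle_state = SYSIDLE_FULL;

static bool note_full_sysidle(void)
{
        int expected = SYSIDLE_FULL;

        /* Succeeds for exactly one caller; later callers see FULL_NOTED. */
        return atomic_compare_exchange_strong(&full_sysidle_state,
                                              &expected, SYSIDLE_FULL_NOTED);
}

int main(void)
{
        printf("first observer reports: %d\n", note_full_sysidle());   /* 1 */
        printf("second observer reports: %d\n", note_full_sysidle());  /* 0 */
        return 0;
}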
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 05c39f030314..725aa067ad63 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -2527,13 +2527,11 @@ void __sched schedule_preempt_disabled(void) | |||
2527 | */ | 2527 | */ |
2528 | asmlinkage void __sched notrace preempt_schedule(void) | 2528 | asmlinkage void __sched notrace preempt_schedule(void) |
2529 | { | 2529 | { |
2530 | struct thread_info *ti = current_thread_info(); | ||
2531 | |||
2532 | /* | 2530 | /* |
2533 | * If there is a non-zero preempt_count or interrupts are disabled, | 2531 | * If there is a non-zero preempt_count or interrupts are disabled, |
2534 | * we do not want to preempt the current task. Just return.. | 2532 | * we do not want to preempt the current task. Just return.. |
2535 | */ | 2533 | */ |
2536 | if (likely(ti->preempt_count || irqs_disabled())) | 2534 | if (likely(!preemptible())) |
2537 | return; | 2535 | return; |
2538 | 2536 | ||
2539 | do { | 2537 | do { |
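Editor's note: the hunk above replaces the open-coded "preempt_count set or IRQs off" test with !preemptible(); to the best of my reading, that predicate bundles exactly those two conditions. A small userspace model of the equivalence (the model_* names are stand-ins, not kernel symbols):

#include <assert.h>
#include <stdio.h>

/* Userspace stand-ins for the per-task/per-CPU state the kernel reads. */
static int model_preempt_count;
static int model_irqs_disabled;

/* Same shape as preemptible(): no preemption count held and IRQs on. */
static int model_preemptible(void)
{
        return model_preempt_count == 0 && !model_irqs_disabled;
}

/* The bail-out test the old code spelled out by hand. */
static int old_bail_out(void)
{
        return model_preempt_count || model_irqs_disabled;
}

int main(void)
{
        /* The two forms agree for every combination of the inputs. */
        for (model_preempt_count = 0; model_preempt_count <= 1; model_preempt_count++)
                for (model_irqs_disabled = 0; model_irqs_disabled <= 1; model_irqs_disabled++)
                        assert(old_bail_out() == !model_preemptible());

        puts("old test == !preemptible() in all cases");
        return 0;
}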
@@ -2677,7 +2675,7 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | |||
2677 | if (unlikely(!q)) | 2675 | if (unlikely(!q)) |
2678 | return; | 2676 | return; |
2679 | 2677 | ||
2680 | if (unlikely(!nr_exclusive)) | 2678 | if (unlikely(nr_exclusive != 1)) |
2681 | wake_flags = 0; | 2679 | wake_flags = 0; |
2682 | 2680 | ||
2683 | spin_lock_irqsave(&q->lock, flags); | 2681 | spin_lock_irqsave(&q->lock, flags); |
@@ -4964,7 +4962,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
4964 | SD_BALANCE_FORK | | 4962 | SD_BALANCE_FORK | |
4965 | SD_BALANCE_EXEC | | 4963 | SD_BALANCE_EXEC | |
4966 | SD_SHARE_CPUPOWER | | 4964 | SD_SHARE_CPUPOWER | |
4967 | SD_SHARE_PKG_RESOURCES); | 4965 | SD_SHARE_PKG_RESOURCES | |
4966 | SD_PREFER_SIBLING); | ||
4968 | if (nr_node_ids == 1) | 4967 | if (nr_node_ids == 1) |
4969 | pflags &= ~SD_SERIALIZE; | 4968 | pflags &= ~SD_SERIALIZE; |
4970 | } | 4969 | } |
@@ -5133,18 +5132,23 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
5133 | * two cpus are in the same cache domain, see cpus_share_cache(). | 5132 | * two cpus are in the same cache domain, see cpus_share_cache(). |
5134 | */ | 5133 | */ |
5135 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | 5134 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); |
5135 | DEFINE_PER_CPU(int, sd_llc_size); | ||
5136 | DEFINE_PER_CPU(int, sd_llc_id); | 5136 | DEFINE_PER_CPU(int, sd_llc_id); |
5137 | 5137 | ||
5138 | static void update_top_cache_domain(int cpu) | 5138 | static void update_top_cache_domain(int cpu) |
5139 | { | 5139 | { |
5140 | struct sched_domain *sd; | 5140 | struct sched_domain *sd; |
5141 | int id = cpu; | 5141 | int id = cpu; |
5142 | int size = 1; | ||
5142 | 5143 | ||
5143 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); | 5144 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); |
5144 | if (sd) | 5145 | if (sd) { |
5145 | id = cpumask_first(sched_domain_span(sd)); | 5146 | id = cpumask_first(sched_domain_span(sd)); |
5147 | size = cpumask_weight(sched_domain_span(sd)); | ||
5148 | } | ||
5146 | 5149 | ||
5147 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | 5150 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); |
5151 | per_cpu(sd_llc_size, cpu) = size; | ||
5148 | per_cpu(sd_llc_id, cpu) = id; | 5152 | per_cpu(sd_llc_id, cpu) = id; |
5149 | } | 5153 | } |
5150 | 5154 | ||
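Editor's note: update_top_cache_domain() now caches how many CPUs share the last-level cache (sd_llc_size) next to the first CPU of that span (sd_llc_id); the size is consumed later by wake_wide() in fair.c. A sketch of the derivation, with a 64-bit mask standing in for a cpumask and GCC/Clang builtins assumed:

#include <stdint.h>
#include <stdio.h>

/* First CPU in the span, mirroring cpumask_first(). */
static int span_first(uint64_t span)
{
        return span ? __builtin_ctzll(span) : -1;
}

/* Number of CPUs in the span, mirroring cpumask_weight(). */
static int span_weight(uint64_t span)
{
        return __builtin_popcountll(span);
}

int main(void)
{
        uint64_t llc_span = 0x0f;       /* CPUs 0-3 share a last-level cache */
        int id = 0, size = 1;           /* defaults when no LLC domain exists */

        if (llc_span) {
                id = span_first(llc_span);
                size = span_weight(llc_span);
        }
        printf("sd_llc_id=%d sd_llc_size=%d\n", id, size);      /* 0 and 4 */
        return 0;
}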
@@ -5168,6 +5172,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
5168 | tmp->parent = parent->parent; | 5172 | tmp->parent = parent->parent; |
5169 | if (parent->parent) | 5173 | if (parent->parent) |
5170 | parent->parent->child = tmp; | 5174 | parent->parent->child = tmp; |
5175 | /* | ||
5176 | * Transfer SD_PREFER_SIBLING down in case of a | ||
5177 | * degenerate parent; the spans match for this | ||
5178 | * so the property transfers. | ||
5179 | */ | ||
5180 | if (parent->flags & SD_PREFER_SIBLING) | ||
5181 | tmp->flags |= SD_PREFER_SIBLING; | ||
5171 | destroy_sched_domain(parent, cpu); | 5182 | destroy_sched_domain(parent, cpu); |
5172 | } else | 5183 | } else |
5173 | tmp = tmp->parent; | 5184 | tmp = tmp->parent; |
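Editor's note: the hunk above keeps SD_PREFER_SIBLING alive when a degenerate parent level is unlinked, since the child's span matches the parent's. A generic sketch of that "push the flag down before collapsing the level" step (struct and flag names are illustrative):

#include <stdio.h>
#include <stdlib.h>

#define FLAG_PREFER_SIBLING 0x1u

struct level {
        unsigned int flags;
        struct level *parent;
};

/*
 * Unlink a redundant parent level, but transfer its behavioural flag
 * to the surviving child first, as the scheduler hunk does.
 */
static void collapse_parent(struct level *child)
{
        struct level *parent = child->parent;

        if (!parent)
                return;
        if (parent->flags & FLAG_PREFER_SIBLING)
                child->flags |= FLAG_PREFER_SIBLING;
        child->parent = parent->parent;
        free(parent);
}

int main(void)
{
        struct level *parent = calloc(1, sizeof(*parent));
        struct level child = { .flags = 0, .parent = parent };

        if (!parent)
                return 1;
        parent->flags = FLAG_PREFER_SIBLING;
        collapse_parent(&child);
        printf("child kept PREFER_SIBLING: %u\n",
               child.flags & FLAG_PREFER_SIBLING);
        return 0;
}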
@@ -6234,8 +6245,9 @@ match1: | |||
6234 | ; | 6245 | ; |
6235 | } | 6246 | } |
6236 | 6247 | ||
6248 | n = ndoms_cur; | ||
6237 | if (doms_new == NULL) { | 6249 | if (doms_new == NULL) { |
6238 | ndoms_cur = 0; | 6250 | n = 0; |
6239 | doms_new = &fallback_doms; | 6251 | doms_new = &fallback_doms; |
6240 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); | 6252 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); |
6241 | WARN_ON_ONCE(dattr_new); | 6253 | WARN_ON_ONCE(dattr_new); |
@@ -6243,7 +6255,7 @@ match1: | |||
6243 | 6255 | ||
6244 | /* Build new domains */ | 6256 | /* Build new domains */ |
6245 | for (i = 0; i < ndoms_new; i++) { | 6257 | for (i = 0; i < ndoms_new; i++) { |
6246 | for (j = 0; j < ndoms_cur && !new_topology; j++) { | 6258 | for (j = 0; j < n && !new_topology; j++) { |
6247 | if (cpumask_equal(doms_new[i], doms_cur[j]) | 6259 | if (cpumask_equal(doms_new[i], doms_cur[j]) |
6248 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | 6260 | && dattrs_equal(dattr_new, i, dattr_cur, j)) |
6249 | goto match2; | 6261 | goto match2; |
@@ -6815,7 +6827,7 @@ void sched_move_task(struct task_struct *tsk) | |||
6815 | if (unlikely(running)) | 6827 | if (unlikely(running)) |
6816 | tsk->sched_class->put_prev_task(rq, tsk); | 6828 | tsk->sched_class->put_prev_task(rq, tsk); |
6817 | 6829 | ||
6818 | tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, | 6830 | tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, |
6819 | lockdep_is_held(&tsk->sighand->siglock)), | 6831 | lockdep_is_held(&tsk->sighand->siglock)), |
6820 | struct task_group, css); | 6832 | struct task_group, css); |
6821 | tg = autogroup_task_group(tsk, tg); | 6833 | tg = autogroup_task_group(tsk, tg); |
@@ -7137,23 +7149,22 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
7137 | 7149 | ||
7138 | #ifdef CONFIG_CGROUP_SCHED | 7150 | #ifdef CONFIG_CGROUP_SCHED |
7139 | 7151 | ||
7140 | /* return corresponding task_group object of a cgroup */ | 7152 | static inline struct task_group *css_tg(struct cgroup_subsys_state *css) |
7141 | static inline struct task_group *cgroup_tg(struct cgroup *cgrp) | ||
7142 | { | 7153 | { |
7143 | return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), | 7154 | return css ? container_of(css, struct task_group, css) : NULL; |
7144 | struct task_group, css); | ||
7145 | } | 7155 | } |
7146 | 7156 | ||
7147 | static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) | 7157 | static struct cgroup_subsys_state * |
7158 | cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | ||
7148 | { | 7159 | { |
7149 | struct task_group *tg, *parent; | 7160 | struct task_group *parent = css_tg(parent_css); |
7161 | struct task_group *tg; | ||
7150 | 7162 | ||
7151 | if (!cgrp->parent) { | 7163 | if (!parent) { |
7152 | /* This is early initialization for the top cgroup */ | 7164 | /* This is early initialization for the top cgroup */ |
7153 | return &root_task_group.css; | 7165 | return &root_task_group.css; |
7154 | } | 7166 | } |
7155 | 7167 | ||
7156 | parent = cgroup_tg(cgrp->parent); | ||
7157 | tg = sched_create_group(parent); | 7168 | tg = sched_create_group(parent); |
7158 | if (IS_ERR(tg)) | 7169 | if (IS_ERR(tg)) |
7159 | return ERR_PTR(-ENOMEM); | 7170 | return ERR_PTR(-ENOMEM); |
@@ -7161,41 +7172,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) | |||
7161 | return &tg->css; | 7172 | return &tg->css; |
7162 | } | 7173 | } |
7163 | 7174 | ||
7164 | static int cpu_cgroup_css_online(struct cgroup *cgrp) | 7175 | static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) |
7165 | { | 7176 | { |
7166 | struct task_group *tg = cgroup_tg(cgrp); | 7177 | struct task_group *tg = css_tg(css); |
7167 | struct task_group *parent; | 7178 | struct task_group *parent = css_tg(css_parent(css)); |
7168 | |||
7169 | if (!cgrp->parent) | ||
7170 | return 0; | ||
7171 | 7179 | ||
7172 | parent = cgroup_tg(cgrp->parent); | 7180 | if (parent) |
7173 | sched_online_group(tg, parent); | 7181 | sched_online_group(tg, parent); |
7174 | return 0; | 7182 | return 0; |
7175 | } | 7183 | } |
7176 | 7184 | ||
7177 | static void cpu_cgroup_css_free(struct cgroup *cgrp) | 7185 | static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) |
7178 | { | 7186 | { |
7179 | struct task_group *tg = cgroup_tg(cgrp); | 7187 | struct task_group *tg = css_tg(css); |
7180 | 7188 | ||
7181 | sched_destroy_group(tg); | 7189 | sched_destroy_group(tg); |
7182 | } | 7190 | } |
7183 | 7191 | ||
7184 | static void cpu_cgroup_css_offline(struct cgroup *cgrp) | 7192 | static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) |
7185 | { | 7193 | { |
7186 | struct task_group *tg = cgroup_tg(cgrp); | 7194 | struct task_group *tg = css_tg(css); |
7187 | 7195 | ||
7188 | sched_offline_group(tg); | 7196 | sched_offline_group(tg); |
7189 | } | 7197 | } |
7190 | 7198 | ||
7191 | static int cpu_cgroup_can_attach(struct cgroup *cgrp, | 7199 | static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, |
7192 | struct cgroup_taskset *tset) | 7200 | struct cgroup_taskset *tset) |
7193 | { | 7201 | { |
7194 | struct task_struct *task; | 7202 | struct task_struct *task; |
7195 | 7203 | ||
7196 | cgroup_taskset_for_each(task, cgrp, tset) { | 7204 | cgroup_taskset_for_each(task, css, tset) { |
7197 | #ifdef CONFIG_RT_GROUP_SCHED | 7205 | #ifdef CONFIG_RT_GROUP_SCHED |
7198 | if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) | 7206 | if (!sched_rt_can_attach(css_tg(css), task)) |
7199 | return -EINVAL; | 7207 | return -EINVAL; |
7200 | #else | 7208 | #else |
7201 | /* We don't support RT-tasks being in separate groups */ | 7209 | /* We don't support RT-tasks being in separate groups */ |
@@ -7206,18 +7214,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp, | |||
7206 | return 0; | 7214 | return 0; |
7207 | } | 7215 | } |
7208 | 7216 | ||
7209 | static void cpu_cgroup_attach(struct cgroup *cgrp, | 7217 | static void cpu_cgroup_attach(struct cgroup_subsys_state *css, |
7210 | struct cgroup_taskset *tset) | 7218 | struct cgroup_taskset *tset) |
7211 | { | 7219 | { |
7212 | struct task_struct *task; | 7220 | struct task_struct *task; |
7213 | 7221 | ||
7214 | cgroup_taskset_for_each(task, cgrp, tset) | 7222 | cgroup_taskset_for_each(task, css, tset) |
7215 | sched_move_task(task); | 7223 | sched_move_task(task); |
7216 | } | 7224 | } |
7217 | 7225 | ||
7218 | static void | 7226 | static void cpu_cgroup_exit(struct cgroup_subsys_state *css, |
7219 | cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, | 7227 | struct cgroup_subsys_state *old_css, |
7220 | struct task_struct *task) | 7228 | struct task_struct *task) |
7221 | { | 7229 | { |
7222 | /* | 7230 | /* |
7223 | * cgroup_exit() is called in the copy_process() failure path. | 7231 | * cgroup_exit() is called in the copy_process() failure path. |
@@ -7231,15 +7239,16 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, | |||
7231 | } | 7239 | } |
7232 | 7240 | ||
7233 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7241 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7234 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 7242 | static int cpu_shares_write_u64(struct cgroup_subsys_state *css, |
7235 | u64 shareval) | 7243 | struct cftype *cftype, u64 shareval) |
7236 | { | 7244 | { |
7237 | return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); | 7245 | return sched_group_set_shares(css_tg(css), scale_load(shareval)); |
7238 | } | 7246 | } |
7239 | 7247 | ||
7240 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | 7248 | static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, |
7249 | struct cftype *cft) | ||
7241 | { | 7250 | { |
7242 | struct task_group *tg = cgroup_tg(cgrp); | 7251 | struct task_group *tg = css_tg(css); |
7243 | 7252 | ||
7244 | return (u64) scale_load_down(tg->shares); | 7253 | return (u64) scale_load_down(tg->shares); |
7245 | } | 7254 | } |
@@ -7361,26 +7370,28 @@ long tg_get_cfs_period(struct task_group *tg) | |||
7361 | return cfs_period_us; | 7370 | return cfs_period_us; |
7362 | } | 7371 | } |
7363 | 7372 | ||
7364 | static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) | 7373 | static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, |
7374 | struct cftype *cft) | ||
7365 | { | 7375 | { |
7366 | return tg_get_cfs_quota(cgroup_tg(cgrp)); | 7376 | return tg_get_cfs_quota(css_tg(css)); |
7367 | } | 7377 | } |
7368 | 7378 | ||
7369 | static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, | 7379 | static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, |
7370 | s64 cfs_quota_us) | 7380 | struct cftype *cftype, s64 cfs_quota_us) |
7371 | { | 7381 | { |
7372 | return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); | 7382 | return tg_set_cfs_quota(css_tg(css), cfs_quota_us); |
7373 | } | 7383 | } |
7374 | 7384 | ||
7375 | static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) | 7385 | static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, |
7386 | struct cftype *cft) | ||
7376 | { | 7387 | { |
7377 | return tg_get_cfs_period(cgroup_tg(cgrp)); | 7388 | return tg_get_cfs_period(css_tg(css)); |
7378 | } | 7389 | } |
7379 | 7390 | ||
7380 | static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 7391 | static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, |
7381 | u64 cfs_period_us) | 7392 | struct cftype *cftype, u64 cfs_period_us) |
7382 | { | 7393 | { |
7383 | return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); | 7394 | return tg_set_cfs_period(css_tg(css), cfs_period_us); |
7384 | } | 7395 | } |
7385 | 7396 | ||
7386 | struct cfs_schedulable_data { | 7397 | struct cfs_schedulable_data { |
@@ -7461,10 +7472,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) | |||
7461 | return ret; | 7472 | return ret; |
7462 | } | 7473 | } |
7463 | 7474 | ||
7464 | static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, | 7475 | static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, |
7465 | struct cgroup_map_cb *cb) | 7476 | struct cgroup_map_cb *cb) |
7466 | { | 7477 | { |
7467 | struct task_group *tg = cgroup_tg(cgrp); | 7478 | struct task_group *tg = css_tg(css); |
7468 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; | 7479 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; |
7469 | 7480 | ||
7470 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); | 7481 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); |
@@ -7477,26 +7488,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, | |||
7477 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7488 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7478 | 7489 | ||
7479 | #ifdef CONFIG_RT_GROUP_SCHED | 7490 | #ifdef CONFIG_RT_GROUP_SCHED |
7480 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, | 7491 | static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, |
7481 | s64 val) | 7492 | struct cftype *cft, s64 val) |
7482 | { | 7493 | { |
7483 | return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); | 7494 | return sched_group_set_rt_runtime(css_tg(css), val); |
7484 | } | 7495 | } |
7485 | 7496 | ||
7486 | static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) | 7497 | static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, |
7498 | struct cftype *cft) | ||
7487 | { | 7499 | { |
7488 | return sched_group_rt_runtime(cgroup_tg(cgrp)); | 7500 | return sched_group_rt_runtime(css_tg(css)); |
7489 | } | 7501 | } |
7490 | 7502 | ||
7491 | static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, | 7503 | static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, |
7492 | u64 rt_period_us) | 7504 | struct cftype *cftype, u64 rt_period_us) |
7493 | { | 7505 | { |
7494 | return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); | 7506 | return sched_group_set_rt_period(css_tg(css), rt_period_us); |
7495 | } | 7507 | } |
7496 | 7508 | ||
7497 | static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) | 7509 | static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, |
7510 | struct cftype *cft) | ||
7498 | { | 7511 | { |
7499 | return sched_group_rt_period(cgroup_tg(cgrp)); | 7512 | return sched_group_rt_period(css_tg(css)); |
7500 | } | 7513 | } |
7501 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7514 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7502 | 7515 | ||
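Editor's note: the sched/core.c hunks convert every cgroup callback from taking a struct cgroup to taking a cgroup_subsys_state, so each handler recovers its task_group with a NULL-tolerant container_of() (css_tg). A self-contained model of that accessor shape; the struct names here are stand-ins:

#include <stddef.h>
#include <stdio.h>

struct css {                            /* stand-in for cgroup_subsys_state */
        struct css *parent;
};

struct task_group_model {               /* stand-in for struct task_group */
        int shares;
        struct css css;                 /* embedded, as in the kernel */
};

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* Mirrors css_tg(): a NULL css (e.g. the root's parent) maps to NULL. */
static struct task_group_model *css_tg(struct css *css)
{
        return css ? container_of(css, struct task_group_model, css) : NULL;
}

int main(void)
{
        struct task_group_model tg = { .shares = 1024 };

        printf("shares=%d\n", css_tg(&tg.css)->shares);
        printf("parent of root -> %p\n", (void *)css_tg(NULL));
        return 0;
}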
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index dbb7e2cd95eb..f64722ff0299 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c | |||
@@ -33,30 +33,20 @@ struct cpuacct { | |||
33 | struct kernel_cpustat __percpu *cpustat; | 33 | struct kernel_cpustat __percpu *cpustat; |
34 | }; | 34 | }; |
35 | 35 | ||
36 | /* return cpu accounting group corresponding to this container */ | 36 | static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) |
37 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
38 | { | 37 | { |
39 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | 38 | return css ? container_of(css, struct cpuacct, css) : NULL; |
40 | struct cpuacct, css); | ||
41 | } | 39 | } |
42 | 40 | ||
43 | /* return cpu accounting group to which this task belongs */ | 41 | /* return cpu accounting group to which this task belongs */ |
44 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | 42 | static inline struct cpuacct *task_ca(struct task_struct *tsk) |
45 | { | 43 | { |
46 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | 44 | return css_ca(task_css(tsk, cpuacct_subsys_id)); |
47 | struct cpuacct, css); | ||
48 | } | ||
49 | |||
50 | static inline struct cpuacct *__parent_ca(struct cpuacct *ca) | ||
51 | { | ||
52 | return cgroup_ca(ca->css.cgroup->parent); | ||
53 | } | 45 | } |
54 | 46 | ||
55 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) | 47 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) |
56 | { | 48 | { |
57 | if (!ca->css.cgroup->parent) | 49 | return css_ca(css_parent(&ca->css)); |
58 | return NULL; | ||
59 | return cgroup_ca(ca->css.cgroup->parent); | ||
60 | } | 50 | } |
61 | 51 | ||
62 | static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); | 52 | static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); |
@@ -66,11 +56,12 @@ static struct cpuacct root_cpuacct = { | |||
66 | }; | 56 | }; |
67 | 57 | ||
68 | /* create a new cpu accounting group */ | 58 | /* create a new cpu accounting group */ |
69 | static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) | 59 | static struct cgroup_subsys_state * |
60 | cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) | ||
70 | { | 61 | { |
71 | struct cpuacct *ca; | 62 | struct cpuacct *ca; |
72 | 63 | ||
73 | if (!cgrp->parent) | 64 | if (!parent_css) |
74 | return &root_cpuacct.css; | 65 | return &root_cpuacct.css; |
75 | 66 | ||
76 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); | 67 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); |
@@ -96,9 +87,9 @@ out: | |||
96 | } | 87 | } |
97 | 88 | ||
98 | /* destroy an existing cpu accounting group */ | 89 | /* destroy an existing cpu accounting group */ |
99 | static void cpuacct_css_free(struct cgroup *cgrp) | 90 | static void cpuacct_css_free(struct cgroup_subsys_state *css) |
100 | { | 91 | { |
101 | struct cpuacct *ca = cgroup_ca(cgrp); | 92 | struct cpuacct *ca = css_ca(css); |
102 | 93 | ||
103 | free_percpu(ca->cpustat); | 94 | free_percpu(ca->cpustat); |
104 | free_percpu(ca->cpuusage); | 95 | free_percpu(ca->cpuusage); |
@@ -141,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | |||
141 | } | 132 | } |
142 | 133 | ||
143 | /* return total cpu usage (in nanoseconds) of a group */ | 134 | /* return total cpu usage (in nanoseconds) of a group */ |
144 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | 135 | static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) |
145 | { | 136 | { |
146 | struct cpuacct *ca = cgroup_ca(cgrp); | 137 | struct cpuacct *ca = css_ca(css); |
147 | u64 totalcpuusage = 0; | 138 | u64 totalcpuusage = 0; |
148 | int i; | 139 | int i; |
149 | 140 | ||
@@ -153,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | |||
153 | return totalcpuusage; | 144 | return totalcpuusage; |
154 | } | 145 | } |
155 | 146 | ||
156 | static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, | 147 | static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, |
157 | u64 reset) | 148 | u64 reset) |
158 | { | 149 | { |
159 | struct cpuacct *ca = cgroup_ca(cgrp); | 150 | struct cpuacct *ca = css_ca(css); |
160 | int err = 0; | 151 | int err = 0; |
161 | int i; | 152 | int i; |
162 | 153 | ||
@@ -172,10 +163,10 @@ out: | |||
172 | return err; | 163 | return err; |
173 | } | 164 | } |
174 | 165 | ||
175 | static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, | 166 | static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, |
176 | struct seq_file *m) | 167 | struct cftype *cft, struct seq_file *m) |
177 | { | 168 | { |
178 | struct cpuacct *ca = cgroup_ca(cgroup); | 169 | struct cpuacct *ca = css_ca(css); |
179 | u64 percpu; | 170 | u64 percpu; |
180 | int i; | 171 | int i; |
181 | 172 | ||
@@ -192,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = { | |||
192 | [CPUACCT_STAT_SYSTEM] = "system", | 183 | [CPUACCT_STAT_SYSTEM] = "system", |
193 | }; | 184 | }; |
194 | 185 | ||
195 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, | 186 | static int cpuacct_stats_show(struct cgroup_subsys_state *css, |
196 | struct cgroup_map_cb *cb) | 187 | struct cftype *cft, struct cgroup_map_cb *cb) |
197 | { | 188 | { |
198 | struct cpuacct *ca = cgroup_ca(cgrp); | 189 | struct cpuacct *ca = css_ca(css); |
199 | int cpu; | 190 | int cpu; |
200 | s64 val = 0; | 191 | s64 val = 0; |
201 | 192 | ||
@@ -281,7 +272,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) | |||
281 | while (ca != &root_cpuacct) { | 272 | while (ca != &root_cpuacct) { |
282 | kcpustat = this_cpu_ptr(ca->cpustat); | 273 | kcpustat = this_cpu_ptr(ca->cpustat); |
283 | kcpustat->cpustat[index] += val; | 274 | kcpustat->cpustat[index] += val; |
284 | ca = __parent_ca(ca); | 275 | ca = parent_ca(ca); |
285 | } | 276 | } |
286 | rcu_read_unlock(); | 277 | rcu_read_unlock(); |
287 | } | 278 | } |
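Editor's note: cpuacct_account_field() charges the stat on every group from the task's level up to, but excluding, the root (the root is covered by the global kernel_cpustat update seen in the cputime.c hunk below); after the cleanup the walk uses the single parent_ca() helper. A sketch of that upward accumulation over a plain parent chain (names illustrative):

#include <stdio.h>

struct acct_group {                     /* stand-in for struct cpuacct */
        unsigned long long stat;
        struct acct_group *parent;
};

/* Charge @val on every ancestor below the root, as the hunk's loop does. */
static void account_field(struct acct_group *ca, const struct acct_group *root,
                          unsigned long long val)
{
        while (ca != root) {
                ca->stat += val;
                ca = ca->parent;
        }
}

int main(void)
{
        struct acct_group root = { 0, NULL };
        struct acct_group mid  = { 0, &root };
        struct acct_group leaf = { 0, &mid };

        account_field(&leaf, &root, 5);
        printf("leaf=%llu mid=%llu root=%llu\n",
               leaf.stat, mid.stat, root.stat);         /* 5 5 0 */
        return 0;
}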
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a7959e05a9d5..ace34f95e200 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -121,7 +121,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, | |||
121 | * is the only cgroup, then nothing else should be necessary. | 121 | * is the only cgroup, then nothing else should be necessary. |
122 | * | 122 | * |
123 | */ | 123 | */ |
124 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | 124 | __this_cpu_add(kernel_cpustat.cpustat[index], tmp); |
125 | 125 | ||
126 | cpuacct_account_field(p, index, tmp); | 126 | cpuacct_account_field(p, index, tmp); |
127 | } | 127 | } |
@@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ | |||
378 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | 378 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING |
379 | 379 | ||
380 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH | 380 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH |
381 | void vtime_task_switch(struct task_struct *prev) | 381 | void vtime_common_task_switch(struct task_struct *prev) |
382 | { | 382 | { |
383 | if (!vtime_accounting_enabled()) | ||
384 | return; | ||
385 | |||
386 | if (is_idle_task(prev)) | 383 | if (is_idle_task(prev)) |
387 | vtime_account_idle(prev); | 384 | vtime_account_idle(prev); |
388 | else | 385 | else |
@@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev) | |||
404 | * vtime_account(). | 401 | * vtime_account(). |
405 | */ | 402 | */ |
406 | #ifndef __ARCH_HAS_VTIME_ACCOUNT | 403 | #ifndef __ARCH_HAS_VTIME_ACCOUNT |
407 | void vtime_account_irq_enter(struct task_struct *tsk) | 404 | void vtime_common_account_irq_enter(struct task_struct *tsk) |
408 | { | 405 | { |
409 | if (!vtime_accounting_enabled()) | ||
410 | return; | ||
411 | |||
412 | if (!in_interrupt()) { | 406 | if (!in_interrupt()) { |
413 | /* | 407 | /* |
414 | * If we interrupted user, context_tracking_in_user() | 408 | * If we interrupted user, context_tracking_in_user() |
@@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk) | |||
428 | } | 422 | } |
429 | vtime_account_system(tsk); | 423 | vtime_account_system(tsk); |
430 | } | 424 | } |
431 | EXPORT_SYMBOL_GPL(vtime_account_irq_enter); | 425 | EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); |
432 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | 426 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ |
433 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ | 427 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ |
434 | 428 | ||
@@ -559,12 +553,6 @@ static void cputime_adjust(struct task_cputime *curr, | |||
559 | { | 553 | { |
560 | cputime_t rtime, stime, utime, total; | 554 | cputime_t rtime, stime, utime, total; |
561 | 555 | ||
562 | if (vtime_accounting_enabled()) { | ||
563 | *ut = curr->utime; | ||
564 | *st = curr->stime; | ||
565 | return; | ||
566 | } | ||
567 | |||
568 | stime = curr->stime; | 556 | stime = curr->stime; |
569 | total = stime + curr->utime; | 557 | total = stime + curr->utime; |
570 | 558 | ||
@@ -664,23 +652,17 @@ static void __vtime_account_system(struct task_struct *tsk) | |||
664 | 652 | ||
665 | void vtime_account_system(struct task_struct *tsk) | 653 | void vtime_account_system(struct task_struct *tsk) |
666 | { | 654 | { |
667 | if (!vtime_accounting_enabled()) | ||
668 | return; | ||
669 | |||
670 | write_seqlock(&tsk->vtime_seqlock); | 655 | write_seqlock(&tsk->vtime_seqlock); |
671 | __vtime_account_system(tsk); | 656 | __vtime_account_system(tsk); |
672 | write_sequnlock(&tsk->vtime_seqlock); | 657 | write_sequnlock(&tsk->vtime_seqlock); |
673 | } | 658 | } |
674 | 659 | ||
675 | void vtime_account_irq_exit(struct task_struct *tsk) | 660 | void vtime_gen_account_irq_exit(struct task_struct *tsk) |
676 | { | 661 | { |
677 | if (!vtime_accounting_enabled()) | ||
678 | return; | ||
679 | |||
680 | write_seqlock(&tsk->vtime_seqlock); | 662 | write_seqlock(&tsk->vtime_seqlock); |
663 | __vtime_account_system(tsk); | ||
681 | if (context_tracking_in_user()) | 664 | if (context_tracking_in_user()) |
682 | tsk->vtime_snap_whence = VTIME_USER; | 665 | tsk->vtime_snap_whence = VTIME_USER; |
683 | __vtime_account_system(tsk); | ||
684 | write_sequnlock(&tsk->vtime_seqlock); | 666 | write_sequnlock(&tsk->vtime_seqlock); |
685 | } | 667 | } |
686 | 668 | ||
@@ -688,12 +670,8 @@ void vtime_account_user(struct task_struct *tsk) | |||
688 | { | 670 | { |
689 | cputime_t delta_cpu; | 671 | cputime_t delta_cpu; |
690 | 672 | ||
691 | if (!vtime_accounting_enabled()) | ||
692 | return; | ||
693 | |||
694 | delta_cpu = get_vtime_delta(tsk); | ||
695 | |||
696 | write_seqlock(&tsk->vtime_seqlock); | 673 | write_seqlock(&tsk->vtime_seqlock); |
674 | delta_cpu = get_vtime_delta(tsk); | ||
697 | tsk->vtime_snap_whence = VTIME_SYS; | 675 | tsk->vtime_snap_whence = VTIME_SYS; |
698 | account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); | 676 | account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); |
699 | write_sequnlock(&tsk->vtime_seqlock); | 677 | write_sequnlock(&tsk->vtime_seqlock); |
@@ -701,22 +679,27 @@ void vtime_account_user(struct task_struct *tsk) | |||
701 | 679 | ||
702 | void vtime_user_enter(struct task_struct *tsk) | 680 | void vtime_user_enter(struct task_struct *tsk) |
703 | { | 681 | { |
704 | if (!vtime_accounting_enabled()) | ||
705 | return; | ||
706 | |||
707 | write_seqlock(&tsk->vtime_seqlock); | 682 | write_seqlock(&tsk->vtime_seqlock); |
708 | tsk->vtime_snap_whence = VTIME_USER; | ||
709 | __vtime_account_system(tsk); | 683 | __vtime_account_system(tsk); |
684 | tsk->vtime_snap_whence = VTIME_USER; | ||
710 | write_sequnlock(&tsk->vtime_seqlock); | 685 | write_sequnlock(&tsk->vtime_seqlock); |
711 | } | 686 | } |
712 | 687 | ||
713 | void vtime_guest_enter(struct task_struct *tsk) | 688 | void vtime_guest_enter(struct task_struct *tsk) |
714 | { | 689 | { |
690 | /* | ||
691 | * The flags must be updated under the lock with | ||
692 | * the vtime_snap flush and update. | ||
693 | * That enforces a right ordering and update sequence | ||
694 | * synchronization against the reader (task_gtime()) | ||
695 | * that can thus safely catch up with a tickless delta. | ||
696 | */ | ||
715 | write_seqlock(&tsk->vtime_seqlock); | 697 | write_seqlock(&tsk->vtime_seqlock); |
716 | __vtime_account_system(tsk); | 698 | __vtime_account_system(tsk); |
717 | current->flags |= PF_VCPU; | 699 | current->flags |= PF_VCPU; |
718 | write_sequnlock(&tsk->vtime_seqlock); | 700 | write_sequnlock(&tsk->vtime_seqlock); |
719 | } | 701 | } |
702 | EXPORT_SYMBOL_GPL(vtime_guest_enter); | ||
720 | 703 | ||
721 | void vtime_guest_exit(struct task_struct *tsk) | 704 | void vtime_guest_exit(struct task_struct *tsk) |
722 | { | 705 | { |
@@ -725,6 +708,7 @@ void vtime_guest_exit(struct task_struct *tsk) | |||
725 | current->flags &= ~PF_VCPU; | 708 | current->flags &= ~PF_VCPU; |
726 | write_sequnlock(&tsk->vtime_seqlock); | 709 | write_sequnlock(&tsk->vtime_seqlock); |
727 | } | 710 | } |
711 | EXPORT_SYMBOL_GPL(vtime_guest_exit); | ||
728 | 712 | ||
729 | void vtime_account_idle(struct task_struct *tsk) | 713 | void vtime_account_idle(struct task_struct *tsk) |
730 | { | 714 | { |
@@ -733,11 +717,6 @@ void vtime_account_idle(struct task_struct *tsk) | |||
733 | account_idle_time(delta_cpu); | 717 | account_idle_time(delta_cpu); |
734 | } | 718 | } |
735 | 719 | ||
736 | bool vtime_accounting_enabled(void) | ||
737 | { | ||
738 | return context_tracking_active(); | ||
739 | } | ||
740 | |||
741 | void arch_vtime_task_switch(struct task_struct *prev) | 720 | void arch_vtime_task_switch(struct task_struct *prev) |
742 | { | 721 | { |
743 | write_seqlock(&prev->vtime_seqlock); | 722 | write_seqlock(&prev->vtime_seqlock); |
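Editor's note: the cputime.c hunk drops the repeated "if (!vtime_accounting_enabled()) return;" checks and renames the slow paths to vtime_common_*/vtime_gen_*; presumably the enabled-check moves into thin inline wrappers in the companion header, which is not shown in this diff, so treat the following as an assumption about that pattern rather than the actual header change. A sketch with illustrative names:

#include <stdbool.h>
#include <stdio.h>

static bool vtime_enabled;      /* stand-in for vtime_accounting_enabled() */

/* Slow path kept in the .c file: the real bookkeeping lives here. */
static void vtime_common_task_switch(void)
{
        puts("full vtime bookkeeping (seqlock, idle/system split, ...)");
}

/* Thin wrapper: the fast-path branch stays out of the common helper. */
static inline void vtime_task_switch(void)
{
        if (!vtime_enabled)
                return;
        vtime_common_task_switch();
}

int main(void)
{
        vtime_task_switch();            /* disabled: nothing happens */
        vtime_enabled = true;
        vtime_task_switch();            /* enabled: slow path runs */
        return 0;
}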
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 68f1609ca149..7f0a5e6cdae0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -3018,6 +3018,23 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
3018 | return 0; | 3018 | return 0; |
3019 | } | 3019 | } |
3020 | 3020 | ||
3021 | static void record_wakee(struct task_struct *p) | ||
3022 | { | ||
3023 | /* | ||
3024 | * Rough decay (wiping) to keep the cost down; the exact | ||
3025 | * boundary does not matter, since a really active task | ||
3026 | * will not notice the occasional lost flip. | ||
3027 | */ | ||
3028 | if (jiffies > current->wakee_flip_decay_ts + HZ) { | ||
3029 | current->wakee_flips = 0; | ||
3030 | current->wakee_flip_decay_ts = jiffies; | ||
3031 | } | ||
3032 | |||
3033 | if (current->last_wakee != p) { | ||
3034 | current->last_wakee = p; | ||
3035 | current->wakee_flips++; | ||
3036 | } | ||
3037 | } | ||
3021 | 3038 | ||
3022 | static void task_waking_fair(struct task_struct *p) | 3039 | static void task_waking_fair(struct task_struct *p) |
3023 | { | 3040 | { |
@@ -3038,6 +3055,7 @@ static void task_waking_fair(struct task_struct *p) | |||
3038 | #endif | 3055 | #endif |
3039 | 3056 | ||
3040 | se->vruntime -= min_vruntime; | 3057 | se->vruntime -= min_vruntime; |
3058 | record_wakee(p); | ||
3041 | } | 3059 | } |
3042 | 3060 | ||
3043 | #ifdef CONFIG_FAIR_GROUP_SCHED | 3061 | #ifdef CONFIG_FAIR_GROUP_SCHED |
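Editor's note: record_wakee(), added above, counts how often the waker switches to a different wakee and crudely decays that counter about once per second. A userspace model of the same bookkeeping (MODEL_HZ and the struct are stand-ins for jiffies/HZ and the task_struct fields):

#include <stdio.h>

struct waker {
        unsigned long wakee_flips;
        unsigned long flip_decay_ts;    /* in "jiffies" */
        const void *last_wakee;
};

#define MODEL_HZ 100UL

static void record_wakee(struct waker *w, const void *wakee, unsigned long now)
{
        /* Rough decay: zero the counter roughly once per second. */
        if (now > w->flip_decay_ts + MODEL_HZ) {
                w->wakee_flips = 0;
                w->flip_decay_ts = now;
        }

        /* Count only switches to a *different* wakee. */
        if (w->last_wakee != wakee) {
                w->last_wakee = wakee;
                w->wakee_flips++;
        }
}

int main(void)
{
        struct waker w = { 0, 0, NULL };
        int a, b;

        record_wakee(&w, &a, 10);
        record_wakee(&w, &b, 11);
        record_wakee(&w, &b, 12);               /* same wakee: no flip */
        printf("flips=%lu\n", w.wakee_flips);   /* 2 */
        record_wakee(&w, &a, 500);              /* > HZ later: decayed first */
        printf("flips=%lu\n", w.wakee_flips);   /* 1 */
        return 0;
}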
@@ -3156,6 +3174,28 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, | |||
3156 | 3174 | ||
3157 | #endif | 3175 | #endif |
3158 | 3176 | ||
3177 | static int wake_wide(struct task_struct *p) | ||
3178 | { | ||
3179 | int factor = this_cpu_read(sd_llc_size); | ||
3180 | |||
3181 | /* | ||
3182 | * Yeah, it's the switching-frequency, could means many wakee or | ||
3183 | * rapidly switch, use factor here will just help to automatically | ||
3184 | * adjust the loose-degree, so bigger node will lead to more pull. | ||
3185 | */ | ||
3186 | if (p->wakee_flips > factor) { | ||
3187 | /* | ||
3188 | * wakee is somewhat hot, it needs certain amount of cpu | ||
3189 | * resource, so if waker is far more hot, prefer to leave | ||
3190 | * it alone. | ||
3191 | */ | ||
3192 | if (current->wakee_flips > (factor * p->wakee_flips)) | ||
3193 | return 1; | ||
3194 | } | ||
3195 | |||
3196 | return 0; | ||
3197 | } | ||
3198 | |||
3159 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 3199 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
3160 | { | 3200 | { |
3161 | s64 this_load, load; | 3201 | s64 this_load, load; |
@@ -3165,6 +3205,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
3165 | unsigned long weight; | 3205 | unsigned long weight; |
3166 | int balanced; | 3206 | int balanced; |
3167 | 3207 | ||
3208 | /* | ||
3209 | * If we wake multiple tasks be careful to not bounce | ||
3210 | * ourselves around too much. | ||
3211 | */ | ||
3212 | if (wake_wide(p)) | ||
3213 | return 0; | ||
3214 | |||
3168 | idx = sd->wake_idx; | 3215 | idx = sd->wake_idx; |
3169 | this_cpu = smp_processor_id(); | 3216 | this_cpu = smp_processor_id(); |
3170 | prev_cpu = task_cpu(p); | 3217 | prev_cpu = task_cpu(p); |
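Editor's note: wake_affine() now bails out early when wake_wide() decides the waker/wakee pair is switching too fast to be worth pulling together. A compact model of that decision; the parameters stand in for current->wakee_flips, p->wakee_flips and the per-CPU sd_llc_size:

#include <stdio.h>

/*
 * Model of the wake_wide() test: once the wakee's switching rate
 * outgrows the LLC size and the waker is far hotter still, stop
 * pulling the wakee next to the waker.
 */
static int wake_wide(unsigned long waker_flips, unsigned long wakee_flips,
                     int llc_size)
{
        if (wakee_flips > (unsigned long)llc_size &&
            waker_flips > llc_size * wakee_flips)
                return 1;       /* spread out instead of waking affine */
        return 0;
}

int main(void)
{
        /* A server-ish waker fanning out to many clients on an 8-CPU LLC. */
        printf("%d\n", wake_wide(200, 20, 8));  /* 1: don't pull */
        printf("%d\n", wake_wide(10, 3, 8));    /* 0: affine wakeup is fine */
        return 0;
}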
@@ -4172,47 +4219,48 @@ static void update_blocked_averages(int cpu) | |||
4172 | } | 4219 | } |
4173 | 4220 | ||
4174 | /* | 4221 | /* |
4175 | * Compute the cpu's hierarchical load factor for each task group. | 4222 | * Compute the hierarchical load factor for cfs_rq and all its ascendants. |
4176 | * This needs to be done in a top-down fashion because the load of a child | 4223 | * This needs to be done in a top-down fashion because the load of a child |
4177 | * group is a fraction of its parents load. | 4224 | * group is a fraction of its parents load. |
4178 | */ | 4225 | */ |
4179 | static int tg_load_down(struct task_group *tg, void *data) | 4226 | static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) |
4180 | { | ||
4181 | unsigned long load; | ||
4182 | long cpu = (long)data; | ||
4183 | |||
4184 | if (!tg->parent) { | ||
4185 | load = cpu_rq(cpu)->avg.load_avg_contrib; | ||
4186 | } else { | ||
4187 | load = tg->parent->cfs_rq[cpu]->h_load; | ||
4188 | load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib, | ||
4189 | tg->parent->cfs_rq[cpu]->runnable_load_avg + 1); | ||
4190 | } | ||
4191 | |||
4192 | tg->cfs_rq[cpu]->h_load = load; | ||
4193 | |||
4194 | return 0; | ||
4195 | } | ||
4196 | |||
4197 | static void update_h_load(long cpu) | ||
4198 | { | 4227 | { |
4199 | struct rq *rq = cpu_rq(cpu); | 4228 | struct rq *rq = rq_of(cfs_rq); |
4229 | struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; | ||
4200 | unsigned long now = jiffies; | 4230 | unsigned long now = jiffies; |
4231 | unsigned long load; | ||
4201 | 4232 | ||
4202 | if (rq->h_load_throttle == now) | 4233 | if (cfs_rq->last_h_load_update == now) |
4203 | return; | 4234 | return; |
4204 | 4235 | ||
4205 | rq->h_load_throttle = now; | 4236 | cfs_rq->h_load_next = NULL; |
4237 | for_each_sched_entity(se) { | ||
4238 | cfs_rq = cfs_rq_of(se); | ||
4239 | cfs_rq->h_load_next = se; | ||
4240 | if (cfs_rq->last_h_load_update == now) | ||
4241 | break; | ||
4242 | } | ||
4206 | 4243 | ||
4207 | rcu_read_lock(); | 4244 | if (!se) { |
4208 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 4245 | cfs_rq->h_load = rq->avg.load_avg_contrib; |
4209 | rcu_read_unlock(); | 4246 | cfs_rq->last_h_load_update = now; |
4247 | } | ||
4248 | |||
4249 | while ((se = cfs_rq->h_load_next) != NULL) { | ||
4250 | load = cfs_rq->h_load; | ||
4251 | load = div64_ul(load * se->avg.load_avg_contrib, | ||
4252 | cfs_rq->runnable_load_avg + 1); | ||
4253 | cfs_rq = group_cfs_rq(se); | ||
4254 | cfs_rq->h_load = load; | ||
4255 | cfs_rq->last_h_load_update = now; | ||
4256 | } | ||
4210 | } | 4257 | } |
4211 | 4258 | ||
4212 | static unsigned long task_h_load(struct task_struct *p) | 4259 | static unsigned long task_h_load(struct task_struct *p) |
4213 | { | 4260 | { |
4214 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 4261 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
4215 | 4262 | ||
4263 | update_cfs_rq_h_load(cfs_rq); | ||
4216 | return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, | 4264 | return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, |
4217 | cfs_rq->runnable_load_avg + 1); | 4265 | cfs_rq->runnable_load_avg + 1); |
4218 | } | 4266 | } |
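Editor's note: the walk_tg_tree()-based update_h_load() is replaced by an on-demand update_cfs_rq_h_load(): one upward pass records a downward chain of back-pointers until it reaches a level already updated this jiffy (or the root), then a downward pass scales h_load level by level. A minimal userspace model of those two passes (field names are stand-ins for the cfs_rq members):

#include <stdio.h>

struct cfs_model {
        struct cfs_model *parent;       /* NULL at the root */
        struct cfs_model *next;         /* downward link, filled in pass 1 */
        unsigned long contrib;          /* this group's load at the parent level */
        unsigned long parent_runnable;  /* parent's total runnable load */
        unsigned long h_load;
        unsigned long stamp;            /* "jiffy" of the last update */
};

static void update_h_load(struct cfs_model *cfs, unsigned long now,
                          unsigned long root_load)
{
        struct cfs_model *c;

        if (cfs->stamp == now)
                return;

        /* Pass 1: record the downward path, stopping at a fresh level. */
        cfs->next = NULL;
        for (c = cfs; c->parent; c = c->parent) {
                c->parent->next = c;
                if (c->parent->stamp == now)
                        break;
        }

        if (!c->parent) {               /* reached the root: seed it */
                c->h_load = root_load;
                c->stamp = now;
        } else {
                c = c->parent;          /* resume from the fresh level */
        }

        /* Pass 2: walk back down, scaling by each level's share. */
        while ((cfs = c->next) != NULL) {
                cfs->h_load = c->h_load * cfs->contrib /
                              (cfs->parent_runnable + 1);
                cfs->stamp = now;
                c = cfs;
        }
}

int main(void)
{
        struct cfs_model root = { 0 };
        struct cfs_model mid  = { .parent = &root, .contrib = 512,
                                  .parent_runnable = 1023 };
        struct cfs_model leaf = { .parent = &mid, .contrib = 256,
                                  .parent_runnable = 511 };

        update_h_load(&leaf, 1, 2048);
        printf("mid h_load=%lu leaf h_load=%lu\n", mid.h_load, leaf.h_load);
        return 0;
}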
@@ -4221,10 +4269,6 @@ static inline void update_blocked_averages(int cpu) | |||
4221 | { | 4269 | { |
4222 | } | 4270 | } |
4223 | 4271 | ||
4224 | static inline void update_h_load(long cpu) | ||
4225 | { | ||
4226 | } | ||
4227 | |||
4228 | static unsigned long task_h_load(struct task_struct *p) | 4272 | static unsigned long task_h_load(struct task_struct *p) |
4229 | { | 4273 | { |
4230 | return p->se.avg.load_avg_contrib; | 4274 | return p->se.avg.load_avg_contrib; |
@@ -4233,50 +4277,56 @@ static unsigned long task_h_load(struct task_struct *p) | |||
4233 | 4277 | ||
4234 | /********** Helpers for find_busiest_group ************************/ | 4278 | /********** Helpers for find_busiest_group ************************/ |
4235 | /* | 4279 | /* |
4236 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
4237 | * during load balancing. | ||
4238 | */ | ||
4239 | struct sd_lb_stats { | ||
4240 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
4241 | struct sched_group *this; /* Local group in this sd */ | ||
4242 | unsigned long total_load; /* Total load of all groups in sd */ | ||
4243 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
4244 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
4245 | |||
4246 | /** Statistics of this group */ | ||
4247 | unsigned long this_load; | ||
4248 | unsigned long this_load_per_task; | ||
4249 | unsigned long this_nr_running; | ||
4250 | unsigned long this_has_capacity; | ||
4251 | unsigned int this_idle_cpus; | ||
4252 | |||
4253 | /* Statistics of the busiest group */ | ||
4254 | unsigned int busiest_idle_cpus; | ||
4255 | unsigned long max_load; | ||
4256 | unsigned long busiest_load_per_task; | ||
4257 | unsigned long busiest_nr_running; | ||
4258 | unsigned long busiest_group_capacity; | ||
4259 | unsigned long busiest_has_capacity; | ||
4260 | unsigned int busiest_group_weight; | ||
4261 | |||
4262 | int group_imb; /* Is there imbalance in this sd */ | ||
4263 | }; | ||
4264 | |||
4265 | /* | ||
4266 | * sg_lb_stats - stats of a sched_group required for load_balancing | 4280 | * sg_lb_stats - stats of a sched_group required for load_balancing |
4267 | */ | 4281 | */ |
4268 | struct sg_lb_stats { | 4282 | struct sg_lb_stats { |
4269 | unsigned long avg_load; /*Avg load across the CPUs of the group */ | 4283 | unsigned long avg_load; /*Avg load across the CPUs of the group */ |
4270 | unsigned long group_load; /* Total load over the CPUs of the group */ | 4284 | unsigned long group_load; /* Total load over the CPUs of the group */ |
4271 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | ||
4272 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 4285 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
4273 | unsigned long group_capacity; | 4286 | unsigned long load_per_task; |
4274 | unsigned long idle_cpus; | 4287 | unsigned long group_power; |
4275 | unsigned long group_weight; | 4288 | unsigned int sum_nr_running; /* Nr tasks running in the group */ |
4289 | unsigned int group_capacity; | ||
4290 | unsigned int idle_cpus; | ||
4291 | unsigned int group_weight; | ||
4276 | int group_imb; /* Is there an imbalance in the group ? */ | 4292 | int group_imb; /* Is there an imbalance in the group ? */ |
4277 | int group_has_capacity; /* Is there extra capacity in the group? */ | 4293 | int group_has_capacity; /* Is there extra capacity in the group? */ |
4278 | }; | 4294 | }; |
4279 | 4295 | ||
4296 | /* | ||
4297 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
4298 | * during load balancing. | ||
4299 | */ | ||
4300 | struct sd_lb_stats { | ||
4301 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
4302 | struct sched_group *local; /* Local group in this sd */ | ||
4303 | unsigned long total_load; /* Total load of all groups in sd */ | ||
4304 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
4305 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
4306 | |||
4307 | struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ | ||
4308 | struct sg_lb_stats local_stat; /* Statistics of the local group */ | ||
4309 | }; | ||
4310 | |||
4311 | static inline void init_sd_lb_stats(struct sd_lb_stats *sds) | ||
4312 | { | ||
4313 | /* | ||
4314 | * Skimp on the clearing to avoid duplicate work. We can avoid clearing | ||
4315 | * local_stat because update_sg_lb_stats() does a full clear/assignment. | ||
4316 | * We must however clear busiest_stat::avg_load because | ||
4317 | * update_sd_pick_busiest() reads this before assignment. | ||
4318 | */ | ||
4319 | *sds = (struct sd_lb_stats){ | ||
4320 | .busiest = NULL, | ||
4321 | .local = NULL, | ||
4322 | .total_load = 0UL, | ||
4323 | .total_pwr = 0UL, | ||
4324 | .busiest_stat = { | ||
4325 | .avg_load = 0UL, | ||
4326 | }, | ||
4327 | }; | ||
4328 | } | ||
4329 | |||
4280 | /** | 4330 | /** |
4281 | * get_sd_load_idx - Obtain the load index for a given sched domain. | 4331 | * get_sd_load_idx - Obtain the load index for a given sched domain. |
4282 | * @sd: The sched_domain whose load_idx is to be obtained. | 4332 | * @sd: The sched_domain whose load_idx is to be obtained. |
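Editor's note: the restructuring above folds the busiest-group fields into a nested sg_lb_stats and resets sd_lb_stats with a single compound-literal assignment. A sketch of that idiom with stand-in struct names; note that in C the unnamed members of a compound literal are zero-filled as well, so the explicit .busiest_stat.avg_load mainly documents the one field that the busiest-group comparison reads before it is otherwise written:

#include <stdio.h>

struct group_stats {                    /* stand-in for sg_lb_stats */
        unsigned long avg_load;
        unsigned long group_load;
        unsigned int sum_nr_running;
};

struct domain_stats {                   /* stand-in for sd_lb_stats */
        void *busiest;
        void *local;
        unsigned long total_load;
        unsigned long total_pwr;
        struct group_stats busiest_stat;
        struct group_stats local_stat;
};

static void init_domain_stats(struct domain_stats *sds)
{
        /*
         * One struct assignment resets everything; the explicit
         * .busiest_stat.avg_load spells out the field that is read
         * before it is assigned.
         */
        *sds = (struct domain_stats){
                .busiest = NULL,
                .local = NULL,
                .total_load = 0UL,
                .total_pwr = 0UL,
                .busiest_stat = {
                        .avg_load = 0UL,
                },
        };
}

int main(void)
{
        struct domain_stats sds;

        init_domain_stats(&sds);
        printf("busiest avg_load=%lu, local group_load=%lu\n",
               sds.busiest_stat.avg_load, sds.local_stat.group_load);
        return 0;
}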
@@ -4460,33 +4510,99 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
4460 | return 0; | 4510 | return 0; |
4461 | } | 4511 | } |
4462 | 4512 | ||
4513 | /* | ||
4514 | * Group imbalance indicates (and tries to solve) the problem where balancing | ||
4515 | * groups is inadequate due to tsk_cpus_allowed() constraints. | ||
4516 | * | ||
4517 | * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a | ||
4518 | * cpumask covering 1 cpu of the first group and 3 cpus of the second group. | ||
4519 | * Something like: | ||
4520 | * | ||
4521 | * { 0 1 2 3 } { 4 5 6 7 } | ||
4522 | * * * * * | ||
4523 | * | ||
4524 | * If we were to balance group-wise we'd place two tasks in the first group and | ||
4525 | * two tasks in the second group. Clearly this is undesired as it will overload | ||
4526 | * cpu 3 and leave one of the cpus in the second group unused. | ||
4527 | * | ||
4528 | * The current solution to this issue is detecting the skew in the first group | ||
4529 | * by noticing it has a cpu that is overloaded while the remaining cpus are | ||
4530 | * idle -- or rather, there's a distinct imbalance in the cpus; see | ||
4531 | * sg_imbalanced(). | ||
4532 | * | ||
4533 | * When this is so detected; this group becomes a candidate for busiest; see | ||
4534 | * update_sd_pick_busiest(). And calculate_imbalance() and | ||
4535 | * find_busiest_group() skip some of the usual balance conditions to allow it | ||
4536 | * to create an effective group imbalance. | ||
4537 | * | ||
4538 | * This is a somewhat tricky proposition since the next run might not find the | ||
4539 | * group imbalance and decide the groups need to be balanced again. A most | ||
4540 | * subtle and fragile situation. | ||
4541 | */ | ||
4542 | |||
4543 | struct sg_imb_stats { | ||
4544 | unsigned long max_nr_running, min_nr_running; | ||
4545 | unsigned long max_cpu_load, min_cpu_load; | ||
4546 | }; | ||
4547 | |||
4548 | static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) | ||
4549 | { | ||
4550 | sgi->max_cpu_load = sgi->max_nr_running = 0UL; | ||
4551 | sgi->min_cpu_load = sgi->min_nr_running = ~0UL; | ||
4552 | } | ||
4553 | |||
4554 | static inline void | ||
4555 | update_sg_imb_stats(struct sg_imb_stats *sgi, | ||
4556 | unsigned long load, unsigned long nr_running) | ||
4557 | { | ||
4558 | if (load > sgi->max_cpu_load) | ||
4559 | sgi->max_cpu_load = load; | ||
4560 | if (sgi->min_cpu_load > load) | ||
4561 | sgi->min_cpu_load = load; | ||
4562 | |||
4563 | if (nr_running > sgi->max_nr_running) | ||
4564 | sgi->max_nr_running = nr_running; | ||
4565 | if (sgi->min_nr_running > nr_running) | ||
4566 | sgi->min_nr_running = nr_running; | ||
4567 | } | ||
4568 | |||
4569 | static inline int | ||
4570 | sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) | ||
4571 | { | ||
4572 | /* | ||
4573 | * Consider the group unbalanced when the imbalance is larger | ||
4574 | * than the average weight of a task. | ||
4575 | * | ||
4576 | * APZ: with cgroup the avg task weight can vary wildly and | ||
4577 | * might not be a suitable number - should we keep a | ||
4578 | * normalized nr_running number somewhere that negates | ||
4579 | * the hierarchy? | ||
4580 | */ | ||
4581 | if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && | ||
4582 | (sgi->max_nr_running - sgi->min_nr_running) > 1) | ||
4583 | return 1; | ||
4584 | |||
4585 | return 0; | ||
4586 | } | ||
4587 | |||
4463 | /** | 4588 | /** |
4464 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 4589 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
4465 | * @env: The load balancing environment. | 4590 | * @env: The load balancing environment. |
4466 | * @group: sched_group whose statistics are to be updated. | 4591 | * @group: sched_group whose statistics are to be updated. |
4467 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 4592 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
4468 | * @local_group: Does group contain this_cpu. | 4593 | * @local_group: Does group contain this_cpu. |
4469 | * @balance: Should we balance. | ||
4470 | * @sgs: variable to hold the statistics for this group. | 4594 | * @sgs: variable to hold the statistics for this group. |
4471 | */ | 4595 | */ |
4472 | static inline void update_sg_lb_stats(struct lb_env *env, | 4596 | static inline void update_sg_lb_stats(struct lb_env *env, |
4473 | struct sched_group *group, int load_idx, | 4597 | struct sched_group *group, int load_idx, |
4474 | int local_group, int *balance, struct sg_lb_stats *sgs) | 4598 | int local_group, struct sg_lb_stats *sgs) |
4475 | { | 4599 | { |
4476 | unsigned long nr_running, max_nr_running, min_nr_running; | 4600 | struct sg_imb_stats sgi; |
4477 | unsigned long load, max_cpu_load, min_cpu_load; | 4601 | unsigned long nr_running; |
4478 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 4602 | unsigned long load; |
4479 | unsigned long avg_load_per_task = 0; | ||
4480 | int i; | 4603 | int i; |
4481 | 4604 | ||
4482 | if (local_group) | 4605 | init_sg_imb_stats(&sgi); |
4483 | balance_cpu = group_balance_cpu(group); | ||
4484 | |||
4485 | /* Tally up the load of all CPUs in the group */ | ||
4486 | max_cpu_load = 0; | ||
4487 | min_cpu_load = ~0UL; | ||
4488 | max_nr_running = 0; | ||
4489 | min_nr_running = ~0UL; | ||
4490 | 4606 | ||
4491 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 4607 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
4492 | struct rq *rq = cpu_rq(i); | 4608 | struct rq *rq = cpu_rq(i); |
@@ -4495,24 +4611,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
4495 | 4611 | ||
4496 | /* Bias balancing toward cpus of our domain */ | 4612 | /* Bias balancing toward cpus of our domain */ |
4497 | if (local_group) { | 4613 | if (local_group) { |
4498 | if (idle_cpu(i) && !first_idle_cpu && | ||
4499 | cpumask_test_cpu(i, sched_group_mask(group))) { | ||
4500 | first_idle_cpu = 1; | ||
4501 | balance_cpu = i; | ||
4502 | } | ||
4503 | |||
4504 | load = target_load(i, load_idx); | 4614 | load = target_load(i, load_idx); |
4505 | } else { | 4615 | } else { |
4506 | load = source_load(i, load_idx); | 4616 | load = source_load(i, load_idx); |
4507 | if (load > max_cpu_load) | 4617 | update_sg_imb_stats(&sgi, load, nr_running); |
4508 | max_cpu_load = load; | ||
4509 | if (min_cpu_load > load) | ||
4510 | min_cpu_load = load; | ||
4511 | |||
4512 | if (nr_running > max_nr_running) | ||
4513 | max_nr_running = nr_running; | ||
4514 | if (min_nr_running > nr_running) | ||
4515 | min_nr_running = nr_running; | ||
4516 | } | 4618 | } |
4517 | 4619 | ||
4518 | sgs->group_load += load; | 4620 | sgs->group_load += load; |
@@ -4522,46 +4624,25 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
4522 | sgs->idle_cpus++; | 4624 | sgs->idle_cpus++; |
4523 | } | 4625 | } |
4524 | 4626 | ||
4525 | /* | 4627 | if (local_group && (env->idle != CPU_NEWLY_IDLE || |
4526 | * First idle cpu or the first cpu(busiest) in this sched group | 4628 | time_after_eq(jiffies, group->sgp->next_update))) |
4527 | * is eligible for doing load balancing at this and above | 4629 | update_group_power(env->sd, env->dst_cpu); |
4528 | * domains. In the newly idle case, we will allow all the cpu's | ||
4529 | * to do the newly idle load balance. | ||
4530 | */ | ||
4531 | if (local_group) { | ||
4532 | if (env->idle != CPU_NEWLY_IDLE) { | ||
4533 | if (balance_cpu != env->dst_cpu) { | ||
4534 | *balance = 0; | ||
4535 | return; | ||
4536 | } | ||
4537 | update_group_power(env->sd, env->dst_cpu); | ||
4538 | } else if (time_after_eq(jiffies, group->sgp->next_update)) | ||
4539 | update_group_power(env->sd, env->dst_cpu); | ||
4540 | } | ||
4541 | 4630 | ||
4542 | /* Adjust by relative CPU power of the group */ | 4631 | /* Adjust by relative CPU power of the group */ |
4543 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; | 4632 | sgs->group_power = group->sgp->power; |
4633 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; | ||
4544 | 4634 | ||
4545 | /* | ||
4546 | * Consider the group unbalanced when the imbalance is larger | ||
4547 | * than the average weight of a task. | ||
4548 | * | ||
4549 | * APZ: with cgroup the avg task weight can vary wildly and | ||
4550 | * might not be a suitable number - should we keep a | ||
4551 | * normalized nr_running number somewhere that negates | ||
4552 | * the hierarchy? | ||
4553 | */ | ||
4554 | if (sgs->sum_nr_running) | 4635 | if (sgs->sum_nr_running) |
4555 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 4636 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
4637 | |||
4638 | sgs->group_imb = sg_imbalanced(sgs, &sgi); | ||
4556 | 4639 | ||
4557 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && | 4640 | sgs->group_capacity = |
4558 | (max_nr_running - min_nr_running) > 1) | 4641 | DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); |
4559 | sgs->group_imb = 1; | ||
4560 | 4642 | ||
4561 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, | ||
4562 | SCHED_POWER_SCALE); | ||
4563 | if (!sgs->group_capacity) | 4643 | if (!sgs->group_capacity) |
4564 | sgs->group_capacity = fix_small_capacity(env->sd, group); | 4644 | sgs->group_capacity = fix_small_capacity(env->sd, group); |
4645 | |||
4565 | sgs->group_weight = group->group_weight; | 4646 | sgs->group_weight = group->group_weight; |
4566 | 4647 | ||
4567 | if (sgs->group_capacity > sgs->sum_nr_running) | 4648 | if (sgs->group_capacity > sgs->sum_nr_running) |
@@ -4586,7 +4667,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
4586 | struct sched_group *sg, | 4667 | struct sched_group *sg, |
4587 | struct sg_lb_stats *sgs) | 4668 | struct sg_lb_stats *sgs) |
4588 | { | 4669 | { |
4589 | if (sgs->avg_load <= sds->max_load) | 4670 | if (sgs->avg_load <= sds->busiest_stat.avg_load) |
4590 | return false; | 4671 | return false; |
4591 | 4672 | ||
4592 | if (sgs->sum_nr_running > sgs->group_capacity) | 4673 | if (sgs->sum_nr_running > sgs->group_capacity) |
@@ -4619,11 +4700,11 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
4619 | * @sds: variable to hold the statistics for this sched_domain. | 4700 | * @sds: variable to hold the statistics for this sched_domain. |
4620 | */ | 4701 | */ |
4621 | static inline void update_sd_lb_stats(struct lb_env *env, | 4702 | static inline void update_sd_lb_stats(struct lb_env *env, |
4622 | int *balance, struct sd_lb_stats *sds) | 4703 | struct sd_lb_stats *sds) |
4623 | { | 4704 | { |
4624 | struct sched_domain *child = env->sd->child; | 4705 | struct sched_domain *child = env->sd->child; |
4625 | struct sched_group *sg = env->sd->groups; | 4706 | struct sched_group *sg = env->sd->groups; |
4626 | struct sg_lb_stats sgs; | 4707 | struct sg_lb_stats tmp_sgs; |
4627 | int load_idx, prefer_sibling = 0; | 4708 | int load_idx, prefer_sibling = 0; |
4628 | 4709 | ||
4629 | if (child && child->flags & SD_PREFER_SIBLING) | 4710 | if (child && child->flags & SD_PREFER_SIBLING) |
@@ -4632,17 +4713,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
4632 | load_idx = get_sd_load_idx(env->sd, env->idle); | 4713 | load_idx = get_sd_load_idx(env->sd, env->idle); |
4633 | 4714 | ||
4634 | do { | 4715 | do { |
4716 | struct sg_lb_stats *sgs = &tmp_sgs; | ||
4635 | int local_group; | 4717 | int local_group; |
4636 | 4718 | ||
4637 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); | 4719 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); |
4638 | memset(&sgs, 0, sizeof(sgs)); | 4720 | if (local_group) { |
4639 | update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); | 4721 | sds->local = sg; |
4640 | 4722 | sgs = &sds->local_stat; | |
4641 | if (local_group && !(*balance)) | 4723 | } |
4642 | return; | ||
4643 | 4724 | ||
4644 | sds->total_load += sgs.group_load; | 4725 | memset(sgs, 0, sizeof(*sgs)); |
4645 | sds->total_pwr += sg->sgp->power; | 4726 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); |
4646 | 4727 | ||
4647 | /* | 4728 | /* |
4648 | * In case the child domain prefers tasks go to siblings | 4729 | * In case the child domain prefers tasks go to siblings |
@@ -4654,26 +4735,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
4654 | * heaviest group when it is already under-utilized (possible | 4735 | * heaviest group when it is already under-utilized (possible |
4655 | * with a large weight task outweighs the tasks on the system). | 4736 | * with a large weight task outweighs the tasks on the system). |
4656 | */ | 4737 | */ |
4657 | if (prefer_sibling && !local_group && sds->this_has_capacity) | 4738 | if (prefer_sibling && !local_group && |
4658 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | 4739 | sds->local && sds->local_stat.group_has_capacity) |
4740 | sgs->group_capacity = min(sgs->group_capacity, 1U); | ||
4659 | 4741 | ||
4660 | if (local_group) { | 4742 | /* Now, start updating sd_lb_stats */ |
4661 | sds->this_load = sgs.avg_load; | 4743 | sds->total_load += sgs->group_load; |
4662 | sds->this = sg; | 4744 | sds->total_pwr += sgs->group_power; |
4663 | sds->this_nr_running = sgs.sum_nr_running; | 4745 | |
4664 | sds->this_load_per_task = sgs.sum_weighted_load; | 4746 | if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { |
4665 | sds->this_has_capacity = sgs.group_has_capacity; | ||
4666 | sds->this_idle_cpus = sgs.idle_cpus; | ||
4667 | } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { | ||
4668 | sds->max_load = sgs.avg_load; | ||
4669 | sds->busiest = sg; | 4747 | sds->busiest = sg; |
4670 | sds->busiest_nr_running = sgs.sum_nr_running; | 4748 | sds->busiest_stat = *sgs; |
4671 | sds->busiest_idle_cpus = sgs.idle_cpus; | ||
4672 | sds->busiest_group_capacity = sgs.group_capacity; | ||
4673 | sds->busiest_load_per_task = sgs.sum_weighted_load; | ||
4674 | sds->busiest_has_capacity = sgs.group_has_capacity; | ||
4675 | sds->busiest_group_weight = sgs.group_weight; | ||
4676 | sds->group_imb = sgs.group_imb; | ||
4677 | } | 4749 | } |
4678 | 4750 | ||
4679 | sg = sg->next; | 4751 | sg = sg->next; |
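Pulling the thread together, the per-group and per-domain statistics implied by this hunk roughly look like the sketch below; the field list is inferred from the accesses above (local_stat/busiest_stat, group_power, load_per_task, and so on) and is a reconstruction, not a verbatim copy of the tree:

        struct sched_group;                     /* opaque here */

        struct sg_lb_stats {
                unsigned long avg_load;         /* load per SCHED_POWER_SCALE of power */
                unsigned long group_load;       /* total weighted load over the group */
                unsigned long sum_weighted_load;
                unsigned long load_per_task;
                unsigned long group_power;      /* sum of the group's cpu powers */
                unsigned int sum_nr_running;
                unsigned int group_capacity;
                unsigned int idle_cpus;
                unsigned int group_weight;
                int group_imb;                  /* cpus_allowed-style imbalance */
                int group_has_capacity;         /* room for more tasks */
        };

        struct sd_lb_stats {
                struct sched_group *busiest;    /* busiest group in this domain */
                struct sched_group *local;      /* group containing dst_cpu */
                unsigned long total_load;
                unsigned long total_pwr;
                unsigned long avg_load;         /* domain-wide average */
                struct sg_lb_stats busiest_stat;/* copied wholesale from the busiest group */
                struct sg_lb_stats local_stat;  /* filled in place for the local group */
        };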
@@ -4718,7 +4790,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | |||
4718 | return 0; | 4790 | return 0; |
4719 | 4791 | ||
4720 | env->imbalance = DIV_ROUND_CLOSEST( | 4792 | env->imbalance = DIV_ROUND_CLOSEST( |
4721 | sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); | 4793 | sds->busiest_stat.avg_load * sds->busiest_stat.group_power, |
4794 | SCHED_POWER_SCALE); | ||
4722 | 4795 | ||
4723 | return 1; | 4796 | return 1; |
4724 | } | 4797 | } |
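Numerically, the asym-packing imbalance is unchanged by the switch to the cached busiest_stat fields; it is still, in effect,

        imbalance = round(busiest.avg_load * busiest.group_power / SCHED_POWER_SCALE)

so with purely illustrative values avg_load = 1536 and group_power = 2048 (SCHED_POWER_SCALE being 1024), the result is 1536 * 2048 / 1024 = 3072.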
@@ -4736,24 +4809,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
4736 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | 4809 | unsigned long tmp, pwr_now = 0, pwr_move = 0; |
4737 | unsigned int imbn = 2; | 4810 | unsigned int imbn = 2; |
4738 | unsigned long scaled_busy_load_per_task; | 4811 | unsigned long scaled_busy_load_per_task; |
4812 | struct sg_lb_stats *local, *busiest; | ||
4739 | 4813 | ||
4740 | if (sds->this_nr_running) { | 4814 | local = &sds->local_stat; |
4741 | sds->this_load_per_task /= sds->this_nr_running; | 4815 | busiest = &sds->busiest_stat; |
4742 | if (sds->busiest_load_per_task > | 4816 | |
4743 | sds->this_load_per_task) | 4817 | if (!local->sum_nr_running) |
4744 | imbn = 1; | 4818 | local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); |
4745 | } else { | 4819 | else if (busiest->load_per_task > local->load_per_task) |
4746 | sds->this_load_per_task = | 4820 | imbn = 1; |
4747 | cpu_avg_load_per_task(env->dst_cpu); | ||
4748 | } | ||
4749 | 4821 | ||
4750 | scaled_busy_load_per_task = sds->busiest_load_per_task | 4822 | scaled_busy_load_per_task = |
4751 | * SCHED_POWER_SCALE; | 4823 | (busiest->load_per_task * SCHED_POWER_SCALE) / |
4752 | scaled_busy_load_per_task /= sds->busiest->sgp->power; | 4824 | busiest->group_power; |
4753 | 4825 | ||
4754 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | 4826 | if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >= |
4755 | (scaled_busy_load_per_task * imbn)) { | 4827 | (scaled_busy_load_per_task * imbn)) { |
4756 | env->imbalance = sds->busiest_load_per_task; | 4828 | env->imbalance = busiest->load_per_task; |
4757 | return; | 4829 | return; |
4758 | } | 4830 | } |
4759 | 4831 | ||
@@ -4763,34 +4835,37 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
4763 | * moving them. | 4835 | * moving them. |
4764 | */ | 4836 | */ |
4765 | 4837 | ||
4766 | pwr_now += sds->busiest->sgp->power * | 4838 | pwr_now += busiest->group_power * |
4767 | min(sds->busiest_load_per_task, sds->max_load); | 4839 | min(busiest->load_per_task, busiest->avg_load); |
4768 | pwr_now += sds->this->sgp->power * | 4840 | pwr_now += local->group_power * |
4769 | min(sds->this_load_per_task, sds->this_load); | 4841 | min(local->load_per_task, local->avg_load); |
4770 | pwr_now /= SCHED_POWER_SCALE; | 4842 | pwr_now /= SCHED_POWER_SCALE; |
4771 | 4843 | ||
4772 | /* Amount of load we'd subtract */ | 4844 | /* Amount of load we'd subtract */ |
4773 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / | 4845 | tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / |
4774 | sds->busiest->sgp->power; | 4846 | busiest->group_power; |
4775 | if (sds->max_load > tmp) | 4847 | if (busiest->avg_load > tmp) { |
4776 | pwr_move += sds->busiest->sgp->power * | 4848 | pwr_move += busiest->group_power * |
4777 | min(sds->busiest_load_per_task, sds->max_load - tmp); | 4849 | min(busiest->load_per_task, |
4850 | busiest->avg_load - tmp); | ||
4851 | } | ||
4778 | 4852 | ||
4779 | /* Amount of load we'd add */ | 4853 | /* Amount of load we'd add */ |
4780 | if (sds->max_load * sds->busiest->sgp->power < | 4854 | if (busiest->avg_load * busiest->group_power < |
4781 | sds->busiest_load_per_task * SCHED_POWER_SCALE) | 4855 | busiest->load_per_task * SCHED_POWER_SCALE) { |
4782 | tmp = (sds->max_load * sds->busiest->sgp->power) / | 4856 | tmp = (busiest->avg_load * busiest->group_power) / |
4783 | sds->this->sgp->power; | 4857 | local->group_power; |
4784 | else | 4858 | } else { |
4785 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / | 4859 | tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / |
4786 | sds->this->sgp->power; | 4860 | local->group_power; |
4787 | pwr_move += sds->this->sgp->power * | 4861 | } |
4788 | min(sds->this_load_per_task, sds->this_load + tmp); | 4862 | pwr_move += local->group_power * |
4863 | min(local->load_per_task, local->avg_load + tmp); | ||
4789 | pwr_move /= SCHED_POWER_SCALE; | 4864 | pwr_move /= SCHED_POWER_SCALE; |
4790 | 4865 | ||
4791 | /* Move if we gain throughput */ | 4866 | /* Move if we gain throughput */ |
4792 | if (pwr_move > pwr_now) | 4867 | if (pwr_move > pwr_now) |
4793 | env->imbalance = sds->busiest_load_per_task; | 4868 | env->imbalance = busiest->load_per_task; |
4794 | } | 4869 | } |
4795 | 4870 | ||
4796 | /** | 4871 | /** |
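For readers who want to see the pwr_now/pwr_move estimate in action, here is a user-space sketch that mirrors the arithmetic above with made-up group stats (it is not kernel code, and the numbers are purely illustrative):

        #include <stdio.h>

        #define SCHED_POWER_SCALE 1024UL

        static unsigned long min_ul(unsigned long a, unsigned long b)
        {
                return a < b ? a : b;
        }

        int main(void)
        {
                /* made-up per-group stats, in weighted-load units */
                unsigned long busiest_power = 1024, local_power = 1024;
                unsigned long busiest_avg = 3072, local_avg = 0;        /* avg_load, power-scaled */
                unsigned long busiest_lpt = 1024, local_lpt = 1024;     /* load_per_task */
                unsigned long pwr_now = 0, pwr_move = 0, tmp;

                pwr_now += busiest_power * min_ul(busiest_lpt, busiest_avg);
                pwr_now += local_power * min_ul(local_lpt, local_avg);
                pwr_now /= SCHED_POWER_SCALE;

                /* load we'd subtract from the busiest group */
                tmp = (busiest_lpt * SCHED_POWER_SCALE) / busiest_power;
                if (busiest_avg > tmp)
                        pwr_move += busiest_power * min_ul(busiest_lpt, busiest_avg - tmp);

                /* load we'd add to the local group */
                if (busiest_avg * busiest_power < busiest_lpt * SCHED_POWER_SCALE)
                        tmp = (busiest_avg * busiest_power) / local_power;
                else
                        tmp = (busiest_lpt * SCHED_POWER_SCALE) / local_power;
                pwr_move += local_power * min_ul(local_lpt, local_avg + tmp);
                pwr_move /= SCHED_POWER_SCALE;

                printf("pwr_now=%lu pwr_move=%lu -> %s\n", pwr_now, pwr_move,
                       pwr_move > pwr_now ? "move one task" : "leave it");
                return 0;
        }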
@@ -4802,11 +4877,18 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
4802 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | 4877 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
4803 | { | 4878 | { |
4804 | unsigned long max_pull, load_above_capacity = ~0UL; | 4879 | unsigned long max_pull, load_above_capacity = ~0UL; |
4880 | struct sg_lb_stats *local, *busiest; | ||
4805 | 4881 | ||
4806 | sds->busiest_load_per_task /= sds->busiest_nr_running; | 4882 | local = &sds->local_stat; |
4807 | if (sds->group_imb) { | 4883 | busiest = &sds->busiest_stat; |
4808 | sds->busiest_load_per_task = | 4884 | |
4809 | min(sds->busiest_load_per_task, sds->avg_load); | 4885 | if (busiest->group_imb) { |
4886 | /* | ||
4887 | * In the group_imb case we cannot rely on group-wide averages | ||
4888 | * to ensure cpu-load equilibrium, look at wider averages. XXX | ||
4889 | */ | ||
4890 | busiest->load_per_task = | ||
4891 | min(busiest->load_per_task, sds->avg_load); | ||
4810 | } | 4892 | } |
4811 | 4893 | ||
4812 | /* | 4894 | /* |
@@ -4814,21 +4896,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4814 | * max load less than avg load(as we skip the groups at or below | 4896 | * max load less than avg load(as we skip the groups at or below |
4815 | * its cpu_power, while calculating max_load..) | 4897 | * its cpu_power, while calculating max_load..) |
4816 | */ | 4898 | */ |
4817 | if (sds->max_load < sds->avg_load) { | 4899 | if (busiest->avg_load < sds->avg_load) { |
4818 | env->imbalance = 0; | 4900 | env->imbalance = 0; |
4819 | return fix_small_imbalance(env, sds); | 4901 | return fix_small_imbalance(env, sds); |
4820 | } | 4902 | } |
4821 | 4903 | ||
4822 | if (!sds->group_imb) { | 4904 | if (!busiest->group_imb) { |
4823 | /* | 4905 | /* |
4824 | * Don't want to pull so many tasks that a group would go idle. | 4906 | * Don't want to pull so many tasks that a group would go idle. |
4907 | * Except of course for the group_imb case, since then we might | ||
4908 | * have to drop below capacity to reach cpu-load equilibrium. | ||
4825 | */ | 4909 | */ |
4826 | load_above_capacity = (sds->busiest_nr_running - | 4910 | load_above_capacity = |
4827 | sds->busiest_group_capacity); | 4911 | (busiest->sum_nr_running - busiest->group_capacity); |
4828 | 4912 | ||
4829 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); | 4913 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); |
4830 | 4914 | load_above_capacity /= busiest->group_power; | |
4831 | load_above_capacity /= sds->busiest->sgp->power; | ||
4832 | } | 4915 | } |
4833 | 4916 | ||
4834 | /* | 4917 | /* |
@@ -4838,15 +4921,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4838 | * we also don't want to reduce the group load below the group capacity | 4921 | * we also don't want to reduce the group load below the group capacity |
4839 | * (so that we can implement power-savings policies etc). Thus we look | 4922 | * (so that we can implement power-savings policies etc). Thus we look |
4840 | * for the minimum possible imbalance. | 4923 | * for the minimum possible imbalance. |
4841 | * Be careful of negative numbers as they'll appear as very large values | ||
4842 | * with unsigned longs. | ||
4843 | */ | 4924 | */ |
4844 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | 4925 | max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); |
4845 | 4926 | ||
4846 | /* How much load to actually move to equalise the imbalance */ | 4927 | /* How much load to actually move to equalise the imbalance */ |
4847 | env->imbalance = min(max_pull * sds->busiest->sgp->power, | 4928 | env->imbalance = min( |
4848 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) | 4929 | max_pull * busiest->group_power, |
4849 | / SCHED_POWER_SCALE; | 4930 | (sds->avg_load - local->avg_load) * local->group_power |
4931 | ) / SCHED_POWER_SCALE; | ||
4850 | 4932 | ||
4851 | /* | 4933 | /* |
4852 | * if *imbalance is less than the average load per runnable task | 4934 | * if *imbalance is less than the average load per runnable task |
@@ -4854,9 +4936,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4854 | * a think about bumping its value to force at least one task to be | 4936 | * a think about bumping its value to force at least one task to be |
4855 | * moved | 4937 | * moved |
4856 | */ | 4938 | */ |
4857 | if (env->imbalance < sds->busiest_load_per_task) | 4939 | if (env->imbalance < busiest->load_per_task) |
4858 | return fix_small_imbalance(env, sds); | 4940 | return fix_small_imbalance(env, sds); |
4859 | |||
4860 | } | 4941 | } |
4861 | 4942 | ||
4862 | /******* find_busiest_group() helpers end here *********************/ | 4943 | /******* find_busiest_group() helpers end here *********************/ |
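To make the min() in calculate_imbalance() concrete with illustrative numbers (busiest avg_load 2048, local avg_load 512, domain-wide sds->avg_load 1024, both group powers equal to SCHED_POWER_SCALE = 1024, and load_above_capacity large enough not to clamp):

        max_pull  = min(2048 - 1024, load_above_capacity) = 1024
        imbalance = min(1024 * 1024, (1024 - 512) * 1024) / 1024 = 512

that is, only enough load is pulled to lift the local group to the domain average, not enough to drag the busiest group below it.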
@@ -4872,69 +4953,62 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4872 | * to restore balance. | 4953 | * to restore balance. |
4873 | * | 4954 | * |
4874 | * @env: The load balancing environment. | 4955 | * @env: The load balancing environment. |
4875 | * @balance: Pointer to a variable indicating if this_cpu | ||
4876 | * is the appropriate cpu to perform load balancing at this_level. | ||
4877 | * | 4956 | * |
4878 | * Return: - The busiest group if imbalance exists. | 4957 | * Return: - The busiest group if imbalance exists. |
4879 | * - If no imbalance and user has opted for power-savings balance, | 4958 | * - If no imbalance and user has opted for power-savings balance, |
4880 | * return the least loaded group whose CPUs can be | 4959 | * return the least loaded group whose CPUs can be |
4881 | * put to idle by rebalancing its tasks onto our group. | 4960 | * put to idle by rebalancing its tasks onto our group. |
4882 | */ | 4961 | */ |
4883 | static struct sched_group * | 4962 | static struct sched_group *find_busiest_group(struct lb_env *env) |
4884 | find_busiest_group(struct lb_env *env, int *balance) | ||
4885 | { | 4963 | { |
4964 | struct sg_lb_stats *local, *busiest; | ||
4886 | struct sd_lb_stats sds; | 4965 | struct sd_lb_stats sds; |
4887 | 4966 | ||
4888 | memset(&sds, 0, sizeof(sds)); | 4967 | init_sd_lb_stats(&sds); |
4889 | 4968 | ||
4890 | /* | 4969 | /* |
4891 | * Compute the various statistics relavent for load balancing at | 4970 | * Compute the various statistics relavent for load balancing at |
4892 | * this level. | 4971 | * this level. |
4893 | */ | 4972 | */ |
4894 | update_sd_lb_stats(env, balance, &sds); | 4973 | update_sd_lb_stats(env, &sds); |
4895 | 4974 | local = &sds.local_stat; | |
4896 | /* | 4975 | busiest = &sds.busiest_stat; |
4897 | * this_cpu is not the appropriate cpu to perform load balancing at | ||
4898 | * this level. | ||
4899 | */ | ||
4900 | if (!(*balance)) | ||
4901 | goto ret; | ||
4902 | 4976 | ||
4903 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && | 4977 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && |
4904 | check_asym_packing(env, &sds)) | 4978 | check_asym_packing(env, &sds)) |
4905 | return sds.busiest; | 4979 | return sds.busiest; |
4906 | 4980 | ||
4907 | /* There is no busy sibling group to pull tasks from */ | 4981 | /* There is no busy sibling group to pull tasks from */ |
4908 | if (!sds.busiest || sds.busiest_nr_running == 0) | 4982 | if (!sds.busiest || busiest->sum_nr_running == 0) |
4909 | goto out_balanced; | 4983 | goto out_balanced; |
4910 | 4984 | ||
4911 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; | 4985 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; |
4912 | 4986 | ||
4913 | /* | 4987 | /* |
4914 | * If the busiest group is imbalanced the below checks don't | 4988 | * If the busiest group is imbalanced the below checks don't |
4915 | * work because they assumes all things are equal, which typically | 4989 | * work because they assume all things are equal, which typically |
4916 | * isn't true due to cpus_allowed constraints and the like. | 4990 | * isn't true due to cpus_allowed constraints and the like. |
4917 | */ | 4991 | */ |
4918 | if (sds.group_imb) | 4992 | if (busiest->group_imb) |
4919 | goto force_balance; | 4993 | goto force_balance; |
4920 | 4994 | ||
4921 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 4995 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
4922 | if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | 4996 | if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && |
4923 | !sds.busiest_has_capacity) | 4997 | !busiest->group_has_capacity) |
4924 | goto force_balance; | 4998 | goto force_balance; |
4925 | 4999 | ||
4926 | /* | 5000 | /* |
4927 | * If the local group is more busy than the selected busiest group | 5001 | * If the local group is more busy than the selected busiest group |
4928 | * don't try and pull any tasks. | 5002 | * don't try and pull any tasks. |
4929 | */ | 5003 | */ |
4930 | if (sds.this_load >= sds.max_load) | 5004 | if (local->avg_load >= busiest->avg_load) |
4931 | goto out_balanced; | 5005 | goto out_balanced; |
4932 | 5006 | ||
4933 | /* | 5007 | /* |
4934 | * Don't pull any tasks if this group is already above the domain | 5008 | * Don't pull any tasks if this group is already above the domain |
4935 | * average load. | 5009 | * average load. |
4936 | */ | 5010 | */ |
4937 | if (sds.this_load >= sds.avg_load) | 5011 | if (local->avg_load >= sds.avg_load) |
4938 | goto out_balanced; | 5012 | goto out_balanced; |
4939 | 5013 | ||
4940 | if (env->idle == CPU_IDLE) { | 5014 | if (env->idle == CPU_IDLE) { |
@@ -4944,15 +5018,16 @@ find_busiest_group(struct lb_env *env, int *balance) | |||
4944 | * there is no imbalance between this and busiest group | 5018 | * there is no imbalance between this and busiest group |
4945 | * wrt to idle cpu's, it is balanced. | 5019 | * wrt to idle cpu's, it is balanced. |
4946 | */ | 5020 | */ |
4947 | if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && | 5021 | if ((local->idle_cpus < busiest->idle_cpus) && |
4948 | sds.busiest_nr_running <= sds.busiest_group_weight) | 5022 | busiest->sum_nr_running <= busiest->group_weight) |
4949 | goto out_balanced; | 5023 | goto out_balanced; |
4950 | } else { | 5024 | } else { |
4951 | /* | 5025 | /* |
4952 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use | 5026 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use |
4953 | * imbalance_pct to be conservative. | 5027 | * imbalance_pct to be conservative. |
4954 | */ | 5028 | */ |
4955 | if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) | 5029 | if (100 * busiest->avg_load <= |
5030 | env->sd->imbalance_pct * local->avg_load) | ||
4956 | goto out_balanced; | 5031 | goto out_balanced; |
4957 | } | 5032 | } |
4958 | 5033 | ||
@@ -4962,7 +5037,6 @@ force_balance: | |||
4962 | return sds.busiest; | 5037 | return sds.busiest; |
4963 | 5038 | ||
4964 | out_balanced: | 5039 | out_balanced: |
4965 | ret: | ||
4966 | env->imbalance = 0; | 5040 | env->imbalance = 0; |
4967 | return NULL; | 5041 | return NULL; |
4968 | } | 5042 | } |
@@ -4974,10 +5048,10 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
4974 | struct sched_group *group) | 5048 | struct sched_group *group) |
4975 | { | 5049 | { |
4976 | struct rq *busiest = NULL, *rq; | 5050 | struct rq *busiest = NULL, *rq; |
4977 | unsigned long max_load = 0; | 5051 | unsigned long busiest_load = 0, busiest_power = 1; |
4978 | int i; | 5052 | int i; |
4979 | 5053 | ||
4980 | for_each_cpu(i, sched_group_cpus(group)) { | 5054 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
4981 | unsigned long power = power_of(i); | 5055 | unsigned long power = power_of(i); |
4982 | unsigned long capacity = DIV_ROUND_CLOSEST(power, | 5056 | unsigned long capacity = DIV_ROUND_CLOSEST(power, |
4983 | SCHED_POWER_SCALE); | 5057 | SCHED_POWER_SCALE); |
@@ -4986,9 +5060,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
4986 | if (!capacity) | 5060 | if (!capacity) |
4987 | capacity = fix_small_capacity(env->sd, group); | 5061 | capacity = fix_small_capacity(env->sd, group); |
4988 | 5062 | ||
4989 | if (!cpumask_test_cpu(i, env->cpus)) | ||
4990 | continue; | ||
4991 | |||
4992 | rq = cpu_rq(i); | 5063 | rq = cpu_rq(i); |
4993 | wl = weighted_cpuload(i); | 5064 | wl = weighted_cpuload(i); |
4994 | 5065 | ||
@@ -5004,11 +5075,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
5004 | * the weighted_cpuload() scaled with the cpu power, so that | 5075 | * the weighted_cpuload() scaled with the cpu power, so that |
5005 | * the load can be moved away from the cpu that is potentially | 5076 | * the load can be moved away from the cpu that is potentially |
5006 | * running at a lower capacity. | 5077 | * running at a lower capacity. |
5078 | * | ||
5079 | * Thus we're looking for max(wl_i / power_i), crosswise | ||
5080 | * multiplication to rid ourselves of the division works out | ||
5081 | * to: wl_i * power_j > wl_j * power_i; where j is our | ||
5082 | * previous maximum. | ||
5007 | */ | 5083 | */ |
5008 | wl = (wl * SCHED_POWER_SCALE) / power; | 5084 | if (wl * busiest_power > busiest_load * power) { |
5009 | 5085 | busiest_load = wl; | |
5010 | if (wl > max_load) { | 5086 | busiest_power = power; |
5011 | max_load = wl; | ||
5012 | busiest = rq; | 5087 | busiest = rq; |
5013 | } | 5088 | } |
5014 | } | 5089 | } |
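The cross-multiplication trick described in the new comment can be checked in isolation; a user-space sketch with made-up loads and powers:

        /* comparing wl_i/power_i ratios as wl_i*power_j > wl_j*power_i avoids the
         * per-iteration division (and the precision loss of integer division) */
        #include <stdio.h>

        int main(void)
        {
                /* made-up per-cpu weighted loads and cpu powers */
                unsigned long wl[] = { 900, 1600, 1200 };
                unsigned long power[] = { 1024, 2048, 512 };
                unsigned long busiest_load = 0, busiest_power = 1;
                int busiest = -1;

                for (int i = 0; i < 3; i++) {
                        /* equivalent to: wl[i]/power[i] > busiest_load/busiest_power */
                        if (wl[i] * busiest_power > busiest_load * power[i]) {
                                busiest_load = wl[i];
                                busiest_power = power[i];
                                busiest = i;
                        }
                }
                printf("busiest cpu index: %d (wl/power = %lu/%lu)\n",
                       busiest, busiest_load, busiest_power);
                return 0;
        }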
@@ -5045,13 +5120,47 @@ static int need_active_balance(struct lb_env *env) | |||
5045 | 5120 | ||
5046 | static int active_load_balance_cpu_stop(void *data); | 5121 | static int active_load_balance_cpu_stop(void *data); |
5047 | 5122 | ||
5123 | static int should_we_balance(struct lb_env *env) | ||
5124 | { | ||
5125 | struct sched_group *sg = env->sd->groups; | ||
5126 | struct cpumask *sg_cpus, *sg_mask; | ||
5127 | int cpu, balance_cpu = -1; | ||
5128 | |||
5129 | /* | ||
5130 | * In the newly idle case, we will allow all the cpu's | ||
5131 | * to do the newly idle load balance. | ||
5132 | */ | ||
5133 | if (env->idle == CPU_NEWLY_IDLE) | ||
5134 | return 1; | ||
5135 | |||
5136 | sg_cpus = sched_group_cpus(sg); | ||
5137 | sg_mask = sched_group_mask(sg); | ||
5138 | /* Try to find first idle cpu */ | ||
5139 | for_each_cpu_and(cpu, sg_cpus, env->cpus) { | ||
5140 | if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) | ||
5141 | continue; | ||
5142 | |||
5143 | balance_cpu = cpu; | ||
5144 | break; | ||
5145 | } | ||
5146 | |||
5147 | if (balance_cpu == -1) | ||
5148 | balance_cpu = group_balance_cpu(sg); | ||
5149 | |||
5150 | /* | ||
5151 | * First idle cpu or the first cpu(busiest) in this sched group | ||
5152 | * is eligible for doing load balancing at this and above domains. | ||
5153 | */ | ||
5154 | return balance_cpu != env->dst_cpu; | ||
5155 | } | ||
5156 | |||
5048 | /* | 5157 | /* |
5049 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 5158 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
5050 | * tasks if there is an imbalance. | 5159 | * tasks if there is an imbalance. |
5051 | */ | 5160 | */ |
5052 | static int load_balance(int this_cpu, struct rq *this_rq, | 5161 | static int load_balance(int this_cpu, struct rq *this_rq, |
5053 | struct sched_domain *sd, enum cpu_idle_type idle, | 5162 | struct sched_domain *sd, enum cpu_idle_type idle, |
5054 | int *balance) | 5163 | int *continue_balancing) |
5055 | { | 5164 | { |
5056 | int ld_moved, cur_ld_moved, active_balance = 0; | 5165 | int ld_moved, cur_ld_moved, active_balance = 0; |
5057 | struct sched_group *group; | 5166 | struct sched_group *group; |
@@ -5081,11 +5190,12 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
5081 | schedstat_inc(sd, lb_count[idle]); | 5190 | schedstat_inc(sd, lb_count[idle]); |
5082 | 5191 | ||
5083 | redo: | 5192 | redo: |
5084 | group = find_busiest_group(&env, balance); | 5193 | if (!should_we_balance(&env)) { |
5085 | 5194 | *continue_balancing = 0; | |
5086 | if (*balance == 0) | ||
5087 | goto out_balanced; | 5195 | goto out_balanced; |
5196 | } | ||
5088 | 5197 | ||
5198 | group = find_busiest_group(&env); | ||
5089 | if (!group) { | 5199 | if (!group) { |
5090 | schedstat_inc(sd, lb_nobusyg[idle]); | 5200 | schedstat_inc(sd, lb_nobusyg[idle]); |
5091 | goto out_balanced; | 5201 | goto out_balanced; |
@@ -5114,7 +5224,6 @@ redo: | |||
5114 | env.src_rq = busiest; | 5224 | env.src_rq = busiest; |
5115 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 5225 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
5116 | 5226 | ||
5117 | update_h_load(env.src_cpu); | ||
5118 | more_balance: | 5227 | more_balance: |
5119 | local_irq_save(flags); | 5228 | local_irq_save(flags); |
5120 | double_rq_lock(env.dst_rq, busiest); | 5229 | double_rq_lock(env.dst_rq, busiest); |
@@ -5298,7 +5407,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5298 | rcu_read_lock(); | 5407 | rcu_read_lock(); |
5299 | for_each_domain(this_cpu, sd) { | 5408 | for_each_domain(this_cpu, sd) { |
5300 | unsigned long interval; | 5409 | unsigned long interval; |
5301 | int balance = 1; | 5410 | int continue_balancing = 1; |
5302 | 5411 | ||
5303 | if (!(sd->flags & SD_LOAD_BALANCE)) | 5412 | if (!(sd->flags & SD_LOAD_BALANCE)) |
5304 | continue; | 5413 | continue; |
@@ -5306,7 +5415,8 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5306 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 5415 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
5307 | /* If we've pulled tasks over stop searching: */ | 5416 | /* If we've pulled tasks over stop searching: */ |
5308 | pulled_task = load_balance(this_cpu, this_rq, | 5417 | pulled_task = load_balance(this_cpu, this_rq, |
5309 | sd, CPU_NEWLY_IDLE, &balance); | 5418 | sd, CPU_NEWLY_IDLE, |
5419 | &continue_balancing); | ||
5310 | } | 5420 | } |
5311 | 5421 | ||
5312 | interval = msecs_to_jiffies(sd->balance_interval); | 5422 | interval = msecs_to_jiffies(sd->balance_interval); |
@@ -5544,7 +5654,7 @@ void update_max_interval(void) | |||
5544 | */ | 5654 | */ |
5545 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | 5655 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) |
5546 | { | 5656 | { |
5547 | int balance = 1; | 5657 | int continue_balancing = 1; |
5548 | struct rq *rq = cpu_rq(cpu); | 5658 | struct rq *rq = cpu_rq(cpu); |
5549 | unsigned long interval; | 5659 | unsigned long interval; |
5550 | struct sched_domain *sd; | 5660 | struct sched_domain *sd; |
@@ -5576,7 +5686,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
5576 | } | 5686 | } |
5577 | 5687 | ||
5578 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 5688 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
5579 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 5689 | if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { |
5580 | /* | 5690 | /* |
5581 | * The LBF_SOME_PINNED logic could have changed | 5691 | * The LBF_SOME_PINNED logic could have changed |
5582 | * env->dst_cpu, so we can't know our idle | 5692 | * env->dst_cpu, so we can't know our idle |
@@ -5599,7 +5709,7 @@ out: | |||
5599 | * CPU in our sched group which is doing load balancing more | 5709 | * CPU in our sched group which is doing load balancing more |
5600 | * actively. | 5710 | * actively. |
5601 | */ | 5711 | */ |
5602 | if (!balance) | 5712 | if (!continue_balancing) |
5603 | break; | 5713 | break; |
5604 | } | 5714 | } |
5605 | rcu_read_unlock(); | 5715 | rcu_read_unlock(); |
@@ -5895,11 +6005,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
5895 | * and ensure we don't carry in an old decay_count if we | 6005 | * and ensure we don't carry in an old decay_count if we |
5896 | * switch back. | 6006 | * switch back. |
5897 | */ | 6007 | */ |
5898 | if (p->se.avg.decay_count) { | 6008 | if (se->avg.decay_count) { |
5899 | struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); | 6009 | __synchronize_entity_decay(se); |
5900 | __synchronize_entity_decay(&p->se); | 6010 | subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); |
5901 | subtract_blocked_load_contrib(cfs_rq, | ||
5902 | p->se.avg.load_avg_contrib); | ||
5903 | } | 6011 | } |
5904 | #endif | 6012 | #endif |
5905 | } | 6013 | } |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ef0a7b2439dd..b3c5653e1dca 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -285,7 +285,6 @@ struct cfs_rq { | |||
285 | /* Required to track per-cpu representation of a task_group */ | 285 | /* Required to track per-cpu representation of a task_group */ |
286 | u32 tg_runnable_contrib; | 286 | u32 tg_runnable_contrib; |
287 | unsigned long tg_load_contrib; | 287 | unsigned long tg_load_contrib; |
288 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
289 | 288 | ||
290 | /* | 289 | /* |
291 | * h_load = weight * f(tg) | 290 | * h_load = weight * f(tg) |
@@ -294,6 +293,9 @@ struct cfs_rq { | |||
294 | * this group. | 293 | * this group. |
295 | */ | 294 | */ |
296 | unsigned long h_load; | 295 | unsigned long h_load; |
296 | u64 last_h_load_update; | ||
297 | struct sched_entity *h_load_next; | ||
298 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
297 | #endif /* CONFIG_SMP */ | 299 | #endif /* CONFIG_SMP */ |
298 | 300 | ||
299 | #ifdef CONFIG_FAIR_GROUP_SCHED | 301 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -429,9 +431,6 @@ struct rq { | |||
429 | #ifdef CONFIG_FAIR_GROUP_SCHED | 431 | #ifdef CONFIG_FAIR_GROUP_SCHED |
430 | /* list of leaf cfs_rq on this cpu: */ | 432 | /* list of leaf cfs_rq on this cpu: */ |
431 | struct list_head leaf_cfs_rq_list; | 433 | struct list_head leaf_cfs_rq_list; |
432 | #ifdef CONFIG_SMP | ||
433 | unsigned long h_load_throttle; | ||
434 | #endif /* CONFIG_SMP */ | ||
435 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 434 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
436 | 435 | ||
437 | #ifdef CONFIG_RT_GROUP_SCHED | 436 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -595,6 +594,7 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | |||
595 | } | 594 | } |
596 | 595 | ||
597 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 596 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
597 | DECLARE_PER_CPU(int, sd_llc_size); | ||
598 | DECLARE_PER_CPU(int, sd_llc_id); | 598 | DECLARE_PER_CPU(int, sd_llc_id); |
599 | 599 | ||
600 | struct sched_group_power { | 600 | struct sched_group_power { |
@@ -665,9 +665,9 @@ extern int group_balance_cpu(struct sched_group *sg); | |||
665 | /* | 665 | /* |
666 | * Return the group to which this tasks belongs. | 666 | * Return the group to which this tasks belongs. |
667 | * | 667 | * |
668 | * We cannot use task_subsys_state() and friends because the cgroup | 668 | * We cannot use task_css() and friends because the cgroup subsystem |
669 | * subsystem changes that value before the cgroup_subsys::attach() method | 669 | * changes that value before the cgroup_subsys::attach() method is called, |
670 | * is called, therefore we cannot pin it and might observe the wrong value. | 670 | * therefore we cannot pin it and might observe the wrong value. |
671 | * | 671 | * |
672 | * The same is true for autogroup's p->signal->autogroup->tg, the autogroup | 672 | * The same is true for autogroup's p->signal->autogroup->tg, the autogroup |
673 | * core changes this before calling sched_move_task(). | 673 | * core changes this before calling sched_move_task(). |
diff --git a/kernel/smp.c b/kernel/smp.c index fe9f773d7114..449b707fc20d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -186,25 +186,13 @@ void generic_smp_call_function_single_interrupt(void) | |||
186 | 186 | ||
187 | while (!list_empty(&list)) { | 187 | while (!list_empty(&list)) { |
188 | struct call_single_data *csd; | 188 | struct call_single_data *csd; |
189 | unsigned int csd_flags; | ||
190 | 189 | ||
191 | csd = list_entry(list.next, struct call_single_data, list); | 190 | csd = list_entry(list.next, struct call_single_data, list); |
192 | list_del(&csd->list); | 191 | list_del(&csd->list); |
193 | 192 | ||
194 | /* | ||
195 | * 'csd' can be invalid after this call if flags == 0 | ||
196 | * (when called through generic_exec_single()), | ||
197 | * so save them away before making the call: | ||
198 | */ | ||
199 | csd_flags = csd->flags; | ||
200 | |||
201 | csd->func(csd->info); | 193 | csd->func(csd->info); |
202 | 194 | ||
203 | /* | 195 | csd_unlock(csd); |
204 | * Unlocked CSDs are valid through generic_exec_single(): | ||
205 | */ | ||
206 | if (csd_flags & CSD_FLAG_LOCK) | ||
207 | csd_unlock(csd); | ||
208 | } | 196 | } |
209 | } | 197 | } |
210 | 198 | ||
@@ -278,8 +266,6 @@ EXPORT_SYMBOL(smp_call_function_single); | |||
278 | * @wait: If true, wait until function has completed. | 266 | * @wait: If true, wait until function has completed. |
279 | * | 267 | * |
280 | * Returns 0 on success, else a negative status code (if no cpus were online). | 268 | * Returns 0 on success, else a negative status code (if no cpus were online). |
281 | * Note that @wait will be implicitly turned on in case of allocation failures, | ||
282 | * since we fall back to on-stack allocation. | ||
283 | * | 269 | * |
284 | * Selection preference: | 270 | * Selection preference: |
285 | * 1) current cpu if in @mask | 271 | * 1) current cpu if in @mask |
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 70f27e89012b..2b62fe86f9ec 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -105,7 +105,6 @@ config NO_HZ_FULL | |||
105 | select RCU_USER_QS | 105 | select RCU_USER_QS |
106 | select RCU_NOCB_CPU | 106 | select RCU_NOCB_CPU |
107 | select VIRT_CPU_ACCOUNTING_GEN | 107 | select VIRT_CPU_ACCOUNTING_GEN |
108 | select CONTEXT_TRACKING_FORCE | ||
109 | select IRQ_WORK | 108 | select IRQ_WORK |
110 | help | 109 | help |
111 | Adaptively try to shutdown the tick whenever possible, even when | 110 | Adaptively try to shutdown the tick whenever possible, even when |
@@ -134,6 +133,56 @@ config NO_HZ_FULL_ALL | |||
134 | Note the boot CPU will still be kept outside the range to | 133 | Note the boot CPU will still be kept outside the range to |
135 | handle the timekeeping duty. | 134 | handle the timekeeping duty. |
136 | 135 | ||
136 | config NO_HZ_FULL_SYSIDLE | ||
137 | bool "Detect full-system idle state for full dynticks system" | ||
138 | depends on NO_HZ_FULL | ||
139 | default n | ||
140 | help | ||
141 | At least one CPU must keep the scheduling-clock tick running for | ||
142 | timekeeping purposes whenever there is a non-idle CPU, where | ||
143 | "non-idle" also includes dynticks CPUs as long as they are | ||
144 | running non-idle tasks. Because the underlying adaptive-tick | ||
145 | support cannot distinguish between all CPUs being idle and | ||
146 | all CPUs each running a single task in dynticks mode, the | ||
147 | underlying support simply ensures that there is always a CPU | ||
148 | handling the scheduling-clock tick, whether or not all CPUs | ||
149 | are idle. This Kconfig option enables scalable detection of | ||
150 | the all-CPUs-idle state, thus allowing the scheduling-clock | ||
151 | tick to be disabled when all CPUs are idle. Note that scalable | ||
152 | detection of the all-CPUs-idle state means that larger systems | ||
153 | will be slower to declare the all-CPUs-idle state. | ||
154 | |||
155 | Say Y if you would like to help debug all-CPUs-idle detection. | ||
156 | |||
157 | Say N if you are unsure. | ||
158 | |||
159 | config NO_HZ_FULL_SYSIDLE_SMALL | ||
160 | int "Number of CPUs above which large-system approach is used" | ||
161 | depends on NO_HZ_FULL_SYSIDLE | ||
162 | range 1 NR_CPUS | ||
163 | default 8 | ||
164 | help | ||
165 | The full-system idle detection mechanism takes a lazy approach | ||
166 | on large systems, as is required to attain decent scalability. | ||
167 | However, on smaller systems, scalability is not anywhere near as | ||
168 | large a concern as is energy efficiency. The sysidle subsystem | ||
169 | therefore uses a fast but non-scalable algorithm for small | ||
170 | systems and a lazier but scalable algorithm for large systems. | ||
171 | This Kconfig parameter defines the number of CPUs in the largest | ||
172 | system that will be considered to be "small". | ||
173 | |||
174 | The default value will be fine in most cases. Battery-powered | ||
175 | systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger | ||
176 | numbers of CPUs, and (3) are suffering from battery-lifetime | ||
177 | problems due to long sysidle latencies might wish to experiment | ||
178 | with larger values for this Kconfig parameter. On the other | ||
179 | hand, they might be even better served by disabling NO_HZ_FULL | ||
180 | entirely, given that NO_HZ_FULL is intended for HPC and | ||
181 | real-time workloads that at present do not tend to be run on | ||
182 | battery-powered systems. | ||
183 | |||
184 | Take the default if you are unsure. | ||
185 | |||
137 | config NO_HZ | 186 | config NO_HZ |
138 | bool "Old Idle dynticks config" | 187 | bool "Old Idle dynticks config" |
139 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS | 188 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e8a1516cc0a3..3612fc77f834 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/irq_work.h> | 23 | #include <linux/irq_work.h> |
24 | #include <linux/posix-timers.h> | 24 | #include <linux/posix-timers.h> |
25 | #include <linux/perf_event.h> | 25 | #include <linux/perf_event.h> |
26 | #include <linux/context_tracking.h> | ||
26 | 27 | ||
27 | #include <asm/irq_regs.h> | 28 | #include <asm/irq_regs.h> |
28 | 29 | ||
@@ -148,8 +149,8 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) | |||
148 | } | 149 | } |
149 | 150 | ||
150 | #ifdef CONFIG_NO_HZ_FULL | 151 | #ifdef CONFIG_NO_HZ_FULL |
151 | static cpumask_var_t nohz_full_mask; | 152 | cpumask_var_t tick_nohz_full_mask; |
152 | bool have_nohz_full_mask; | 153 | bool tick_nohz_full_running; |
153 | 154 | ||
154 | static bool can_stop_full_tick(void) | 155 | static bool can_stop_full_tick(void) |
155 | { | 156 | { |
@@ -182,7 +183,7 @@ static bool can_stop_full_tick(void) | |||
182 | * Don't allow the user to think they can get | 183 | * Don't allow the user to think they can get |
183 | * full NO_HZ with this machine. | 184 | * full NO_HZ with this machine. |
184 | */ | 185 | */ |
185 | WARN_ONCE(have_nohz_full_mask, | 186 | WARN_ONCE(tick_nohz_full_running, |
186 | "NO_HZ FULL will not work with unstable sched clock"); | 187 | "NO_HZ FULL will not work with unstable sched clock"); |
187 | return false; | 188 | return false; |
188 | } | 189 | } |
@@ -197,7 +198,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); | |||
197 | * Re-evaluate the need for the tick on the current CPU | 198 | * Re-evaluate the need for the tick on the current CPU |
198 | * and restart it if necessary. | 199 | * and restart it if necessary. |
199 | */ | 200 | */ |
200 | void tick_nohz_full_check(void) | 201 | void __tick_nohz_full_check(void) |
201 | { | 202 | { |
202 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 203 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
203 | 204 | ||
@@ -211,7 +212,7 @@ void tick_nohz_full_check(void) | |||
211 | 212 | ||
212 | static void nohz_full_kick_work_func(struct irq_work *work) | 213 | static void nohz_full_kick_work_func(struct irq_work *work) |
213 | { | 214 | { |
214 | tick_nohz_full_check(); | 215 | __tick_nohz_full_check(); |
215 | } | 216 | } |
216 | 217 | ||
217 | static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { | 218 | static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { |
@@ -230,7 +231,7 @@ void tick_nohz_full_kick(void) | |||
230 | 231 | ||
231 | static void nohz_full_kick_ipi(void *info) | 232 | static void nohz_full_kick_ipi(void *info) |
232 | { | 233 | { |
233 | tick_nohz_full_check(); | 234 | __tick_nohz_full_check(); |
234 | } | 235 | } |
235 | 236 | ||
236 | /* | 237 | /* |
@@ -239,12 +240,13 @@ static void nohz_full_kick_ipi(void *info) | |||
239 | */ | 240 | */ |
240 | void tick_nohz_full_kick_all(void) | 241 | void tick_nohz_full_kick_all(void) |
241 | { | 242 | { |
242 | if (!have_nohz_full_mask) | 243 | if (!tick_nohz_full_running) |
243 | return; | 244 | return; |
244 | 245 | ||
245 | preempt_disable(); | 246 | preempt_disable(); |
246 | smp_call_function_many(nohz_full_mask, | 247 | smp_call_function_many(tick_nohz_full_mask, |
247 | nohz_full_kick_ipi, NULL, false); | 248 | nohz_full_kick_ipi, NULL, false); |
249 | tick_nohz_full_kick(); | ||
248 | preempt_enable(); | 250 | preempt_enable(); |
249 | } | 251 | } |
250 | 252 | ||
@@ -253,7 +255,7 @@ void tick_nohz_full_kick_all(void) | |||
253 | * It might need the tick due to per task/process properties: | 255 | * It might need the tick due to per task/process properties: |
254 | * perf events, posix cpu timers, ... | 256 | * perf events, posix cpu timers, ... |
255 | */ | 257 | */ |
256 | void tick_nohz_task_switch(struct task_struct *tsk) | 258 | void __tick_nohz_task_switch(struct task_struct *tsk) |
257 | { | 259 | { |
258 | unsigned long flags; | 260 | unsigned long flags; |
259 | 261 | ||
@@ -269,31 +271,23 @@ out: | |||
269 | local_irq_restore(flags); | 271 | local_irq_restore(flags); |
270 | } | 272 | } |
271 | 273 | ||
272 | int tick_nohz_full_cpu(int cpu) | ||
273 | { | ||
274 | if (!have_nohz_full_mask) | ||
275 | return 0; | ||
276 | |||
277 | return cpumask_test_cpu(cpu, nohz_full_mask); | ||
278 | } | ||
279 | |||
280 | /* Parse the boot-time nohz CPU list from the kernel parameters. */ | 274 | /* Parse the boot-time nohz CPU list from the kernel parameters. */ |
281 | static int __init tick_nohz_full_setup(char *str) | 275 | static int __init tick_nohz_full_setup(char *str) |
282 | { | 276 | { |
283 | int cpu; | 277 | int cpu; |
284 | 278 | ||
285 | alloc_bootmem_cpumask_var(&nohz_full_mask); | 279 | alloc_bootmem_cpumask_var(&tick_nohz_full_mask); |
286 | if (cpulist_parse(str, nohz_full_mask) < 0) { | 280 | if (cpulist_parse(str, tick_nohz_full_mask) < 0) { |
287 | pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); | 281 | pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); |
288 | return 1; | 282 | return 1; |
289 | } | 283 | } |
290 | 284 | ||
291 | cpu = smp_processor_id(); | 285 | cpu = smp_processor_id(); |
292 | if (cpumask_test_cpu(cpu, nohz_full_mask)) { | 286 | if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { |
293 | pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); | 287 | pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); |
294 | cpumask_clear_cpu(cpu, nohz_full_mask); | 288 | cpumask_clear_cpu(cpu, tick_nohz_full_mask); |
295 | } | 289 | } |
296 | have_nohz_full_mask = true; | 290 | tick_nohz_full_running = true; |
297 | 291 | ||
298 | return 1; | 292 | return 1; |
299 | } | 293 | } |
@@ -311,7 +305,7 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, | |||
311 | * If we handle the timekeeping duty for full dynticks CPUs, | 305 | * If we handle the timekeeping duty for full dynticks CPUs, |
312 | * we can't safely shutdown that CPU. | 306 | * we can't safely shutdown that CPU. |
313 | */ | 307 | */ |
314 | if (have_nohz_full_mask && tick_do_timer_cpu == cpu) | 308 | if (tick_nohz_full_running && tick_do_timer_cpu == cpu) |
315 | return NOTIFY_BAD; | 309 | return NOTIFY_BAD; |
316 | break; | 310 | break; |
317 | } | 311 | } |
@@ -330,31 +324,34 @@ static int tick_nohz_init_all(void) | |||
330 | int err = -1; | 324 | int err = -1; |
331 | 325 | ||
332 | #ifdef CONFIG_NO_HZ_FULL_ALL | 326 | #ifdef CONFIG_NO_HZ_FULL_ALL |
333 | if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { | 327 | if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { |
334 | pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); | 328 | pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); |
335 | return err; | 329 | return err; |
336 | } | 330 | } |
337 | err = 0; | 331 | err = 0; |
338 | cpumask_setall(nohz_full_mask); | 332 | cpumask_setall(tick_nohz_full_mask); |
339 | cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); | 333 | cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); |
340 | have_nohz_full_mask = true; | 334 | tick_nohz_full_running = true; |
341 | #endif | 335 | #endif |
342 | return err; | 336 | return err; |
343 | } | 337 | } |
344 | 338 | ||
345 | void __init tick_nohz_init(void) | 339 | void __init tick_nohz_init(void) |
346 | { | 340 | { |
347 | if (!have_nohz_full_mask) { | 341 | int cpu; |
342 | |||
343 | if (!tick_nohz_full_running) { | ||
348 | if (tick_nohz_init_all() < 0) | 344 | if (tick_nohz_init_all() < 0) |
349 | return; | 345 | return; |
350 | } | 346 | } |
351 | 347 | ||
348 | for_each_cpu(cpu, tick_nohz_full_mask) | ||
349 | context_tracking_cpu_set(cpu); | ||
350 | |||
352 | cpu_notifier(tick_nohz_cpu_down_callback, 0); | 351 | cpu_notifier(tick_nohz_cpu_down_callback, 0); |
353 | cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); | 352 | cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); |
354 | pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); | 353 | pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); |
355 | } | 354 | } |
356 | #else | ||
357 | #define have_nohz_full_mask (0) | ||
358 | #endif | 355 | #endif |
359 | 356 | ||
360 | /* | 357 | /* |
@@ -732,7 +729,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | |||
732 | return false; | 729 | return false; |
733 | } | 730 | } |
734 | 731 | ||
735 | if (have_nohz_full_mask) { | 732 | if (tick_nohz_full_enabled()) { |
736 | /* | 733 | /* |
737 | * Keep the tick alive to guarantee timekeeping progression | 734 | * Keep the tick alive to guarantee timekeeping progression |
738 | * if there are full dynticks CPUs around | 735 | * if there are full dynticks CPUs around |
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 3bdf28323012..61ed862cdd37 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -265,10 +265,9 @@ static inline void timer_list_header(struct seq_file *m, u64 now) | |||
265 | static int timer_list_show(struct seq_file *m, void *v) | 265 | static int timer_list_show(struct seq_file *m, void *v) |
266 | { | 266 | { |
267 | struct timer_list_iter *iter = v; | 267 | struct timer_list_iter *iter = v; |
268 | u64 now = ktime_to_ns(ktime_get()); | ||
269 | 268 | ||
270 | if (iter->cpu == -1 && !iter->second_pass) | 269 | if (iter->cpu == -1 && !iter->second_pass) |
271 | timer_list_header(m, now); | 270 | timer_list_header(m, iter->now); |
272 | else if (!iter->second_pass) | 271 | else if (!iter->second_pass) |
273 | print_cpu(m, iter->cpu, iter->now); | 272 | print_cpu(m, iter->cpu, iter->now); |
274 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | 273 | #ifdef CONFIG_GENERIC_CLOCKEVENTS |
@@ -298,33 +297,41 @@ void sysrq_timer_list_show(void) | |||
298 | return; | 297 | return; |
299 | } | 298 | } |
300 | 299 | ||
301 | static void *timer_list_start(struct seq_file *file, loff_t *offset) | 300 | static void *move_iter(struct timer_list_iter *iter, loff_t offset) |
302 | { | 301 | { |
303 | struct timer_list_iter *iter = file->private; | 302 | for (; offset; offset--) { |
304 | 303 | iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); | |
305 | if (!*offset) { | 304 | if (iter->cpu >= nr_cpu_ids) { |
306 | iter->cpu = -1; | ||
307 | iter->now = ktime_to_ns(ktime_get()); | ||
308 | } else if (iter->cpu >= nr_cpu_ids) { | ||
309 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | 305 | #ifdef CONFIG_GENERIC_CLOCKEVENTS |
310 | if (!iter->second_pass) { | 306 | if (!iter->second_pass) { |
311 | iter->cpu = -1; | 307 | iter->cpu = -1; |
312 | iter->second_pass = true; | 308 | iter->second_pass = true; |
313 | } else | 309 | } else |
314 | return NULL; | 310 | return NULL; |
315 | #else | 311 | #else |
316 | return NULL; | 312 | return NULL; |
317 | #endif | 313 | #endif |
314 | } | ||
318 | } | 315 | } |
319 | return iter; | 316 | return iter; |
320 | } | 317 | } |
321 | 318 | ||
319 | static void *timer_list_start(struct seq_file *file, loff_t *offset) | ||
320 | { | ||
321 | struct timer_list_iter *iter = file->private; | ||
322 | |||
323 | if (!*offset) | ||
324 | iter->now = ktime_to_ns(ktime_get()); | ||
325 | iter->cpu = -1; | ||
326 | iter->second_pass = false; | ||
327 | return move_iter(iter, *offset); | ||
328 | } | ||
329 | |||
322 | static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) | 330 | static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) |
323 | { | 331 | { |
324 | struct timer_list_iter *iter = file->private; | 332 | struct timer_list_iter *iter = file->private; |
325 | iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); | ||
326 | ++*offset; | 333 | ++*offset; |
327 | return timer_list_start(file, offset); | 334 | return move_iter(iter, 1); |
328 | } | 335 | } |
329 | 336 | ||
330 | static void timer_list_stop(struct seq_file *seq, void *v) | 337 | static void timer_list_stop(struct seq_file *seq, void *v) |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index afaae41b0a02..fe39acd4c1aa 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -1022,6 +1022,9 @@ extern struct list_head ftrace_events; | |||
1022 | extern const char *__start___trace_bprintk_fmt[]; | 1022 | extern const char *__start___trace_bprintk_fmt[]; |
1023 | extern const char *__stop___trace_bprintk_fmt[]; | 1023 | extern const char *__stop___trace_bprintk_fmt[]; |
1024 | 1024 | ||
1025 | extern const char *__start___tracepoint_str[]; | ||
1026 | extern const char *__stop___tracepoint_str[]; | ||
1027 | |||
1025 | void trace_printk_init_buffers(void); | 1028 | void trace_printk_init_buffers(void); |
1026 | void trace_printk_start_comm(void); | 1029 | void trace_printk_start_comm(void); |
1027 | int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); | 1030 | int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); |
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index a9077c1b4ad3..2900817ba65c 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
@@ -244,12 +244,31 @@ static const char **find_next(void *v, loff_t *pos) | |||
244 | { | 244 | { |
245 | const char **fmt = v; | 245 | const char **fmt = v; |
246 | int start_index; | 246 | int start_index; |
247 | int last_index; | ||
247 | 248 | ||
248 | start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; | 249 | start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; |
249 | 250 | ||
250 | if (*pos < start_index) | 251 | if (*pos < start_index) |
251 | return __start___trace_bprintk_fmt + *pos; | 252 | return __start___trace_bprintk_fmt + *pos; |
252 | 253 | ||
254 | /* | ||
255 | * The __tracepoint_str section is treated the same as the | ||
256 | * __trace_printk_fmt section. The difference is that the | ||
257 | * __trace_printk_fmt section should only be used by trace_printk() | ||
258 | * in a debugging environment, as if anything exists in that section | ||
259 | * the trace_printk() helper buffers are allocated, which would just | ||
260 | * waste space in a production environment. | ||
261 | * | ||
262 | * The __tracepoint_str sections on the other hand are used by | ||
263 | * tracepoints which need to map pointers to their strings to | ||
264 | * the ASCII text for userspace. | ||
265 | */ | ||
266 | last_index = start_index; | ||
267 | start_index = __stop___tracepoint_str - __start___tracepoint_str; | ||
268 | |||
269 | if (*pos < last_index + start_index) | ||
270 | return __start___tracepoint_str + (*pos - last_index); | ||
271 | |||
253 | return find_next_mod_format(start_index, v, fmt, pos); | 272 | return find_next_mod_format(start_index, v, fmt, pos); |
254 | } | 273 | } |
255 | 274 | ||
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1241d8c91d5e..51c4f34d258e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -553,14 +553,6 @@ void __init lockup_detector_init(void) | |||
553 | { | 553 | { |
554 | set_sample_period(); | 554 | set_sample_period(); |
555 | 555 | ||
556 | #ifdef CONFIG_NO_HZ_FULL | ||
557 | if (watchdog_user_enabled) { | ||
558 | watchdog_user_enabled = 0; | ||
559 | pr_warning("Disabled lockup detectors by default for full dynticks\n"); | ||
560 | pr_warning("You can reactivate it with 'sysctl -w kernel.watchdog=1'\n"); | ||
561 | } | ||
562 | #endif | ||
563 | |||
564 | if (watchdog_user_enabled) | 556 | if (watchdog_user_enabled) |
565 | watchdog_enable_all_cpus(); | 557 | watchdog_enable_all_cpus(); |
566 | } | 558 | } |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7f5d4be22034..29b79852a845 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -16,9 +16,10 @@ | |||
16 | * | 16 | * |
17 | * This is the generic async execution mechanism. Work items as are | 17 | * This is the generic async execution mechanism. Work items as are |
18 | * executed in process context. The worker pool is shared and | 18 | * executed in process context. The worker pool is shared and |
19 | * automatically managed. There is one worker pool for each CPU and | 19 | * automatically managed. There are two worker pools for each CPU (one for |
20 | * one extra for works which are better served by workers which are | 20 | * normal work items and the other for high priority ones) and some extra |
21 | * not bound to any specific CPU. | 21 | * pools for workqueues which are not bound to any specific CPU - the |
22 | * number of these backing pools is dynamic. | ||
22 | * | 23 | * |
23 | * Please read Documentation/workqueue.txt for details. | 24 | * Please read Documentation/workqueue.txt for details. |
24 | */ | 25 | */ |
@@ -2033,8 +2034,11 @@ static bool maybe_destroy_workers(struct worker_pool *pool) | |||
2033 | * multiple times. Does GFP_KERNEL allocations. | 2034 | * multiple times. Does GFP_KERNEL allocations. |
2034 | * | 2035 | * |
2035 | * RETURNS: | 2036 | * RETURNS: |
2036 | * spin_lock_irq(pool->lock) which may be released and regrabbed | 2037 | * %false if the pool don't need management and the caller can safely start |
2037 | * multiple times. Does GFP_KERNEL allocations. | 2038 | * processing works, %true indicates that the function released pool->lock |
2039 | * and reacquired it to perform some management function and that the | ||
2040 | * conditions that the caller verified while holding the lock before | ||
2041 | * calling the function might no longer be true. | ||
2038 | */ | 2042 | */ |
2039 | static bool manage_workers(struct worker *worker) | 2043 | static bool manage_workers(struct worker *worker) |
2040 | { | 2044 | { |
@@ -2201,6 +2205,15 @@ __acquires(&pool->lock) | |||
2201 | dump_stack(); | 2205 | dump_stack(); |
2202 | } | 2206 | } |
2203 | 2207 | ||
2208 | /* | ||
2209 | * The following prevents a kworker from hogging CPU on !PREEMPT | ||
2210 | * kernels, where a requeueing work item waiting for something to | ||
2211 | * happen could deadlock with stop_machine as such work item could | ||
2212 | * indefinitely requeue itself while all other CPUs are trapped in | ||
2213 | * stop_machine. | ||
2214 | */ | ||
2215 | cond_resched(); | ||
2216 | |||
2204 | spin_lock_irq(&pool->lock); | 2217 | spin_lock_irq(&pool->lock); |
2205 | 2218 | ||
2206 | /* clear cpu intensive status */ | 2219 | /* clear cpu intensive status */ |
@@ -3086,25 +3099,26 @@ static struct workqueue_struct *dev_to_wq(struct device *dev) | |||
3086 | return wq_dev->wq; | 3099 | return wq_dev->wq; |
3087 | } | 3100 | } |
3088 | 3101 | ||
3089 | static ssize_t wq_per_cpu_show(struct device *dev, | 3102 | static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, |
3090 | struct device_attribute *attr, char *buf) | 3103 | char *buf) |
3091 | { | 3104 | { |
3092 | struct workqueue_struct *wq = dev_to_wq(dev); | 3105 | struct workqueue_struct *wq = dev_to_wq(dev); |
3093 | 3106 | ||
3094 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); | 3107 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); |
3095 | } | 3108 | } |
3109 | static DEVICE_ATTR_RO(per_cpu); | ||
3096 | 3110 | ||
3097 | static ssize_t wq_max_active_show(struct device *dev, | 3111 | static ssize_t max_active_show(struct device *dev, |
3098 | struct device_attribute *attr, char *buf) | 3112 | struct device_attribute *attr, char *buf) |
3099 | { | 3113 | { |
3100 | struct workqueue_struct *wq = dev_to_wq(dev); | 3114 | struct workqueue_struct *wq = dev_to_wq(dev); |
3101 | 3115 | ||
3102 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); | 3116 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); |
3103 | } | 3117 | } |
3104 | 3118 | ||
3105 | static ssize_t wq_max_active_store(struct device *dev, | 3119 | static ssize_t max_active_store(struct device *dev, |
3106 | struct device_attribute *attr, | 3120 | struct device_attribute *attr, const char *buf, |
3107 | const char *buf, size_t count) | 3121 | size_t count) |
3108 | { | 3122 | { |
3109 | struct workqueue_struct *wq = dev_to_wq(dev); | 3123 | struct workqueue_struct *wq = dev_to_wq(dev); |
3110 | int val; | 3124 | int val; |
@@ -3115,12 +3129,14 @@ static ssize_t wq_max_active_store(struct device *dev, | |||
3115 | workqueue_set_max_active(wq, val); | 3129 | workqueue_set_max_active(wq, val); |
3116 | return count; | 3130 | return count; |
3117 | } | 3131 | } |
3132 | static DEVICE_ATTR_RW(max_active); | ||
3118 | 3133 | ||
3119 | static struct device_attribute wq_sysfs_attrs[] = { | 3134 | static struct attribute *wq_sysfs_attrs[] = { |
3120 | __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), | 3135 | &dev_attr_per_cpu.attr, |
3121 | __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), | 3136 | &dev_attr_max_active.attr, |
3122 | __ATTR_NULL, | 3137 | NULL, |
3123 | }; | 3138 | }; |
3139 | ATTRIBUTE_GROUPS(wq_sysfs); | ||
3124 | 3140 | ||
3125 | static ssize_t wq_pool_ids_show(struct device *dev, | 3141 | static ssize_t wq_pool_ids_show(struct device *dev, |
3126 | struct device_attribute *attr, char *buf) | 3142 | struct device_attribute *attr, char *buf) |
@@ -3270,7 +3286,7 @@ static struct device_attribute wq_sysfs_unbound_attrs[] = { | |||
3270 | 3286 | ||
3271 | static struct bus_type wq_subsys = { | 3287 | static struct bus_type wq_subsys = { |
3272 | .name = "workqueue", | 3288 | .name = "workqueue", |
3273 | .dev_attrs = wq_sysfs_attrs, | 3289 | .dev_groups = wq_sysfs_groups, |
3274 | }; | 3290 | }; |
3275 | 3291 | ||
3276 | static int __init wq_sysfs_init(void) | 3292 | static int __init wq_sysfs_init(void) |
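For context on the sysfs conversion above, the DEVICE_ATTR_RO()/DEVICE_ATTR_RW() plus ATTRIBUTE_GROUPS() pattern generally looks like the sketch below; the "example"/"demo" names are invented for illustration and the snippet only builds inside a kernel tree:

        #include <linux/kernel.h>
        #include <linux/device.h>
        #include <linux/sysfs.h>

        static ssize_t example_show(struct device *dev, struct device_attribute *attr,
                                    char *buf)
        {
                return scnprintf(buf, PAGE_SIZE, "%d\n", 42);
        }
        static DEVICE_ATTR_RO(example);         /* defines dev_attr_example, wired to example_show() */

        static struct attribute *demo_attrs[] = {
                &dev_attr_example.attr,
                NULL,
        };
        ATTRIBUTE_GROUPS(demo);                 /* defines demo_group and demo_groups[] */

        /* a bus/class/device then publishes the groups instead of a dev_attrs array */
        static struct bus_type demo_subsys = {
                .name           = "demo",
                .dev_groups     = demo_groups,
        };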