path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c | 1660
-rw-r--r--  kernel/cgroup_freezer.c | 155
-rw-r--r--  kernel/context_tracking.c | 125
-rw-r--r--  kernel/cpu.c | 9
-rw-r--r--  kernel/cpuset.c | 317
-rw-r--r--  kernel/events/callchain.c | 3
-rw-r--r--  kernel/events/core.c | 410
-rw-r--r--  kernel/fork.c | 5
-rw-r--r--  kernel/hung_task.c | 13
-rw-r--r--  kernel/lglock.c | 12
-rw-r--r--  kernel/mutex.c | 43
-rw-r--r--  kernel/nsproxy.c | 27
-rw-r--r--  kernel/pid_namespace.c | 4
-rw-r--r--  kernel/power/hibernate.c | 2
-rw-r--r--  kernel/power/suspend.c | 4
-rw-r--r--  kernel/printk/printk.c | 7
-rw-r--r--  kernel/rcu.h | 12
-rw-r--r--  kernel/rcupdate.c | 102
-rw-r--r--  kernel/rcutiny.c | 2
-rw-r--r--  kernel/rcutiny_plugin.h | 2
-rw-r--r--  kernel/rcutorture.c | 396
-rw-r--r--  kernel/rcutree.c | 255
-rw-r--r--  kernel/rcutree.h | 19
-rw-r--r--  kernel/rcutree_plugin.h | 460
-rw-r--r--  kernel/sched/core.c | 141
-rw-r--r--  kernel/sched/cpuacct.c | 51
-rw-r--r--  kernel/sched/cputime.c | 55
-rw-r--r--  kernel/sched/fair.c | 616
-rw-r--r--  kernel/sched/sched.h | 14
-rw-r--r--  kernel/smp.c | 16
-rw-r--r--  kernel/time/Kconfig | 51
-rw-r--r--  kernel/time/tick-sched.c | 61
-rw-r--r--  kernel/time/timer_list.c | 41
-rw-r--r--  kernel/trace/trace.h | 3
-rw-r--r--  kernel/trace/trace_printk.c | 19
-rw-r--r--  kernel/watchdog.c | 8
-rw-r--r--  kernel/workqueue.c | 50
37 files changed, 3010 insertions, 2160 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 781845a013ab..e0aeb32415ff 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -81,7 +81,7 @@
  */
 #ifdef CONFIG_PROVE_RCU
 DEFINE_MUTEX(cgroup_mutex);
-EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */
+EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */
 #else
 static DEFINE_MUTEX(cgroup_mutex);
 #endif
@@ -117,6 +117,7 @@ struct cfent {
 	struct list_head node;
 	struct dentry *dentry;
 	struct cftype *type;
+	struct cgroup_subsys_state *css;

 	/* file xattrs */
 	struct simple_xattrs xattrs;
@@ -159,9 +160,9 @@ struct css_id {
  */
 struct cgroup_event {
 	/*
-	 * Cgroup which the event belongs to.
+	 * css which the event belongs to.
 	 */
-	struct cgroup *cgrp;
+	struct cgroup_subsys_state *css;
 	/*
 	 * Control file which the event associated.
 	 */
@@ -215,10 +216,33 @@ static u64 cgroup_serial_nr_next = 1;
  */
 static int need_forkexit_callback __read_mostly;

-static void cgroup_offline_fn(struct work_struct *work);
+static struct cftype cgroup_base_files[];
+
+static void cgroup_destroy_css_killed(struct cgroup *cgrp);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-			      struct cftype cfts[], bool is_add);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+			      bool is_add);
+
+/**
+ * cgroup_css - obtain a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns the dummy_css)
+ *
+ * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
+ * function must be called either under cgroup_mutex or rcu_read_lock() and
+ * the caller is responsible for pinning the returned css if it wants to
+ * keep accessing it outside the said locks. This function may return
+ * %NULL if @cgrp doesn't have @subsys_id enabled.
+ */
+static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
+					      struct cgroup_subsys *ss)
+{
+	if (ss)
+		return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
+					     lockdep_is_held(&cgroup_mutex));
+	else
+		return &cgrp->dummy_css;
+}

 /* convenient tests for these bits */
 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
@@ -365,9 +389,11 @@ static struct cgrp_cset_link init_cgrp_cset_link;
 static int cgroup_init_idr(struct cgroup_subsys *ss,
 			   struct cgroup_subsys_state *css);

-/* css_set_lock protects the list of css_set objects, and the
- * chain of tasks off each css_set. Nests outside task->alloc_lock
- * due to cgroup_iter_start() */
+/*
+ * css_set_lock protects the list of css_set objects, and the chain of
+ * tasks off each css_set. Nests outside task->alloc_lock due to
+ * css_task_iter_start().
+ */
 static DEFINE_RWLOCK(css_set_lock);
 static int css_set_count;

@@ -392,10 +418,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 	return key;
 }

-/* We don't maintain the lists running through each css_set to its
- * task until after the first call to cgroup_iter_start(). This
- * reduces the fork()/exit() overhead for people who have cgroups
- * compiled into their kernel but not actually in use */
+/*
+ * We don't maintain the lists running through each css_set to its task
+ * until after the first call to css_task_iter_start(). This reduces the
+ * fork()/exit() overhead for people who have cgroups compiled into their
+ * kernel but not actually in use.
+ */
 static int use_task_css_set_links __read_mostly;

 static void __put_css_set(struct css_set *cset, int taskexit)
@@ -464,7 +492,7 @@ static inline void put_css_set_taskexit(struct css_set *cset)
  * @new_cgrp: cgroup that's being entered by the task
  * @template: desired set of css pointers in css_set (pre-calculated)
  *
- * Returns true if "cg" matches "old_cg" except for the hierarchy
+ * Returns true if "cset" matches "old_cset" except for the hierarchy
  * which "new_cgrp" belongs to, for which it should match "new_cgrp".
  */
 static bool compare_css_sets(struct css_set *cset,
@@ -555,7 +583,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
 			/* Subsystem is in this hierarchy. So we want
 			 * the subsystem state from the new
 			 * cgroup */
-			template[i] = cgrp->subsys[i];
+			template[i] = cgroup_css(cgrp, ss);
 		} else {
 			/* Subsystem is not in this hierarchy, so we
 			 * don't want to change the subsystem state */
@@ -803,8 +831,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,

 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
-static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
-			       unsigned long subsys_mask);
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
 static const struct inode_operations cgroup_dir_inode_operations;
 static const struct file_operations proc_cgroupstats_operations;

@@ -813,8 +840,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
 	.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };

-static int alloc_css_id(struct cgroup_subsys *ss,
-			struct cgroup *parent, struct cgroup *child);
+static int alloc_css_id(struct cgroup_subsys_state *child_css);

 static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
 {
@@ -845,15 +871,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
 static void cgroup_free_fn(struct work_struct *work)
 {
 	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
-	struct cgroup_subsys *ss;

 	mutex_lock(&cgroup_mutex);
-	/*
-	 * Release the subsystem state objects.
-	 */
-	for_each_root_subsys(cgrp->root, ss)
-		ss->css_free(cgrp);
-
 	cgrp->root->number_of_cgroups--;
 	mutex_unlock(&cgroup_mutex);

@@ -864,8 +883,6 @@ static void cgroup_free_fn(struct work_struct *work)
 	 */
 	dput(cgrp->parent->dentry);

-	ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
-
 	/*
 	 * Drop the active superblock reference that we took when we
 	 * created the cgroup. This will free cgrp->root, if we are
@@ -956,27 +973,22 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 }

 /**
- * cgroup_clear_directory - selective removal of base and subsystem files
- * @dir: directory containing the files
- * @base_files: true if the base files should be removed
+ * cgroup_clear_dir - remove subsys files in a cgroup directory
+ * @cgrp: target cgroup
  * @subsys_mask: mask of the subsystem ids whose files should be removed
  */
-static void cgroup_clear_directory(struct dentry *dir, bool base_files,
-				   unsigned long subsys_mask)
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 {
-	struct cgroup *cgrp = __d_cgrp(dir);
 	struct cgroup_subsys *ss;
+	int i;

-	for_each_root_subsys(cgrp->root, ss) {
+	for_each_subsys(ss, i) {
 		struct cftype_set *set;
-		if (!test_bit(ss->subsys_id, &subsys_mask))
+
+		if (!test_bit(i, &subsys_mask))
 			continue;
 		list_for_each_entry(set, &ss->cftsets, node)
-			cgroup_addrm_files(cgrp, NULL, set->cfts, false);
-	}
-	if (base_files) {
-		while (!list_empty(&cgrp->files))
-			cgroup_rm_file(cgrp, NULL);
+			cgroup_addrm_files(cgrp, set->cfts, false);
 	}
 }

@@ -986,9 +998,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
 static void cgroup_d_remove_dir(struct dentry *dentry)
 {
 	struct dentry *parent;
-	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
-
-	cgroup_clear_directory(dentry, true, root->subsys_mask);

 	parent = dentry->d_parent;
 	spin_lock(&parent->d_lock);
@@ -1009,79 +1018,84 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 {
 	struct cgroup *cgrp = &root->top_cgroup;
 	struct cgroup_subsys *ss;
-	int i;
+	unsigned long pinned = 0;
+	int i, ret;

 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));

 	/* Check that any added subsystems are currently free */
 	for_each_subsys(ss, i) {
-		unsigned long bit = 1UL << i;
-
-		if (!(bit & added_mask))
+		if (!(added_mask & (1 << i)))
 			continue;

+		/* is the subsystem mounted elsewhere? */
 		if (ss->root != &cgroup_dummy_root) {
-			/* Subsystem isn't free */
-			return -EBUSY;
+			ret = -EBUSY;
+			goto out_put;
+		}
+
+		/* pin the module */
+		if (!try_module_get(ss->module)) {
+			ret = -ENOENT;
+			goto out_put;
 		}
+		pinned |= 1 << i;
 	}

-	/* Currently we don't handle adding/removing subsystems when
-	 * any child cgroups exist. This is theoretically supportable
-	 * but involves complex error handling, so it's being left until
-	 * later */
-	if (root->number_of_cgroups > 1)
-		return -EBUSY;
+	/* subsys could be missing if unloaded between parsing and here */
+	if (added_mask != pinned) {
+		ret = -ENOENT;
+		goto out_put;
+	}
+
+	ret = cgroup_populate_dir(cgrp, added_mask);
+	if (ret)
+		goto out_put;
+
+	/*
+	 * Nothing can fail from this point on. Remove files for the
+	 * removed subsystems and rebind each subsystem.
+	 */
+	cgroup_clear_dir(cgrp, removed_mask);

-	/* Process each subsystem */
 	for_each_subsys(ss, i) {
 		unsigned long bit = 1UL << i;

 		if (bit & added_mask) {
 			/* We're binding this subsystem to this hierarchy */
-			BUG_ON(cgrp->subsys[i]);
-			BUG_ON(!cgroup_dummy_top->subsys[i]);
-			BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top);
+			BUG_ON(cgroup_css(cgrp, ss));
+			BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
+			BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
+
+			rcu_assign_pointer(cgrp->subsys[i],
+					   cgroup_css(cgroup_dummy_top, ss));
+			cgroup_css(cgrp, ss)->cgroup = cgrp;

-			cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
-			cgrp->subsys[i]->cgroup = cgrp;
 			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
 			if (ss->bind)
-				ss->bind(cgrp);
+				ss->bind(cgroup_css(cgrp, ss));

 			/* refcount was already taken, and we're keeping it */
 			root->subsys_mask |= bit;
 		} else if (bit & removed_mask) {
 			/* We're removing this subsystem */
-			BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]);
-			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+			BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
+			BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);

 			if (ss->bind)
-				ss->bind(cgroup_dummy_top);
-			cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top;
-			cgrp->subsys[i] = NULL;
+				ss->bind(cgroup_css(cgroup_dummy_top, ss));
+
+			cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top;
+			RCU_INIT_POINTER(cgrp->subsys[i], NULL);
+
 			cgroup_subsys[i]->root = &cgroup_dummy_root;
 			list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);

 			/* subsystem is now free - drop reference on module */
 			module_put(ss->module);
 			root->subsys_mask &= ~bit;
-		} else if (bit & root->subsys_mask) {
-			/* Subsystem state should already exist */
-			BUG_ON(!cgrp->subsys[i]);
-			/*
-			 * a refcount was taken, but we already had one, so
-			 * drop the extra reference.
-			 */
-			module_put(ss->module);
-#ifdef CONFIG_MODULE_UNLOAD
-			BUG_ON(ss->module && !module_refcount(ss->module));
-#endif
-		} else {
-			/* Subsystem state shouldn't exist */
-			BUG_ON(cgrp->subsys[i]);
 		}
 	}

@@ -1092,6 +1106,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 	root->flags |= CGRP_ROOT_SUBSYS_BOUND;

 	return 0;
+
+out_put:
+	for_each_subsys(ss, i)
+		if (pinned & (1 << i))
+			module_put(ss->module);
+	return ret;
 }

 static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
@@ -1142,7 +1162,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	char *token, *o = data;
 	bool all_ss = false, one_ss = false;
 	unsigned long mask = (unsigned long)-1;
-	bool module_pin_failed = false;
 	struct cgroup_subsys *ss;
 	int i;

@@ -1285,52 +1304,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	if (!opts->subsys_mask && !opts->name)
 		return -EINVAL;

-	/*
-	 * Grab references on all the modules we'll need, so the subsystems
-	 * don't dance around before rebind_subsystems attaches them. This may
-	 * take duplicate reference counts on a subsystem that's already used,
-	 * but rebind_subsystems handles this case.
-	 */
-	for_each_subsys(ss, i) {
-		if (!(opts->subsys_mask & (1UL << i)))
-			continue;
-		if (!try_module_get(cgroup_subsys[i]->module)) {
-			module_pin_failed = true;
-			break;
-		}
-	}
-	if (module_pin_failed) {
-		/*
-		 * oops, one of the modules was going away. this means that we
-		 * raced with a module_delete call, and to the user this is
-		 * essentially a "subsystem doesn't exist" case.
-		 */
-		for (i--; i >= 0; i--) {
-			/* drop refcounts only on the ones we took */
-			unsigned long bit = 1UL << i;
-
-			if (!(bit & opts->subsys_mask))
-				continue;
-			module_put(cgroup_subsys[i]->module);
-		}
-		return -ENOENT;
-	}
-
 	return 0;
 }

-static void drop_parsed_module_refcounts(unsigned long subsys_mask)
-{
-	struct cgroup_subsys *ss;
-	int i;
-
-	mutex_lock(&cgroup_mutex);
-	for_each_subsys(ss, i)
-		if (subsys_mask & (1UL << i))
-			module_put(cgroup_subsys[i]->module);
-	mutex_unlock(&cgroup_mutex);
-}
-
 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 {
 	int ret = 0;
@@ -1370,22 +1346,15 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		goto out_unlock;
 	}

-	/*
-	 * Clear out the files of subsystems that should be removed, do
-	 * this before rebind_subsystems, since rebind_subsystems may
-	 * change this hierarchy's subsys_list.
-	 */
-	cgroup_clear_directory(cgrp->dentry, false, removed_mask);
-
-	ret = rebind_subsystems(root, added_mask, removed_mask);
-	if (ret) {
-		/* rebind_subsystems failed, re-populate the removed files */
-		cgroup_populate_dir(cgrp, false, removed_mask);
+	/* remounting is not allowed for populated hierarchies */
+	if (root->number_of_cgroups > 1) {
+		ret = -EBUSY;
 		goto out_unlock;
 	}

-	/* re-populate subsystem files */
-	cgroup_populate_dir(cgrp, false, added_mask);
+	ret = rebind_subsystems(root, added_mask, removed_mask);
+	if (ret)
+		goto out_unlock;

 	if (opts.release_agent)
 		strcpy(root->release_agent_path, opts.release_agent);
@@ -1395,8 +1364,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
-	if (ret)
-		drop_parsed_module_refcounts(opts.subsys_mask);
 	return ret;
 }

@@ -1416,6 +1383,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
+	cgrp->dummy_css.cgroup = cgrp;
 	INIT_LIST_HEAD(&cgrp->event_list);
 	spin_lock_init(&cgrp->event_list_lock);
 	simple_xattrs_init(&cgrp->xattrs);
@@ -1431,6 +1399,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 	cgrp->root = root;
 	RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
 	init_cgroup_housekeeping(cgrp);
+	idr_init(&root->cgroup_idr);
 }

 static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
@@ -1503,7 +1472,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 	 */
 	root->subsys_mask = opts->subsys_mask;
 	root->flags = opts->flags;
-	ida_init(&root->cgroup_ida);
 	if (opts->release_agent)
 		strcpy(root->release_agent_path, opts->release_agent);
 	if (opts->name)
@@ -1519,7 +1487,7 @@ static void cgroup_free_root(struct cgroupfs_root *root)
 		/* hierarhcy ID shoulid already have been released */
 		WARN_ON_ONCE(root->hierarchy_id);

-		ida_destroy(&root->cgroup_ida);
+		idr_destroy(&root->cgroup_idr);
 		kfree(root);
 	}
 }
@@ -1584,7 +1552,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	int ret = 0;
 	struct super_block *sb;
 	struct cgroupfs_root *new_root;
+	struct list_head tmp_links;
 	struct inode *inode;
+	const struct cred *cred;

 	/* First find the desired set of subsystems */
 	mutex_lock(&cgroup_mutex);
@@ -1600,7 +1570,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	new_root = cgroup_root_from_opts(&opts);
 	if (IS_ERR(new_root)) {
 		ret = PTR_ERR(new_root);
-		goto drop_modules;
+		goto out_err;
 	}
 	opts.new_root = new_root;

@@ -1609,17 +1579,15 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	if (IS_ERR(sb)) {
 		ret = PTR_ERR(sb);
 		cgroup_free_root(opts.new_root);
-		goto drop_modules;
+		goto out_err;
 	}

 	root = sb->s_fs_info;
 	BUG_ON(!root);
 	if (root == opts.new_root) {
 		/* We used the new root structure, so this is a new hierarchy */
-		struct list_head tmp_links;
 		struct cgroup *root_cgrp = &root->top_cgroup;
 		struct cgroupfs_root *existing_root;
-		const struct cred *cred;
 		int i;
 		struct css_set *cset;

@@ -1634,6 +1602,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		mutex_lock(&cgroup_mutex);
 		mutex_lock(&cgroup_root_mutex);

+		root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
+					   0, 1, GFP_KERNEL);
+		if (root_cgrp->id < 0)
+			goto unlock_drop;
+
 		/* Check for name clashes with existing mounts */
 		ret = -EBUSY;
 		if (strlen(root->name))
@@ -1657,26 +1630,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		if (ret)
 			goto unlock_drop;

+		sb->s_root->d_fsdata = root_cgrp;
+		root_cgrp->dentry = sb->s_root;
+
+		/*
+		 * We're inside get_sb() and will call lookup_one_len() to
+		 * create the root files, which doesn't work if SELinux is
+		 * in use. The following cred dancing somehow works around
+		 * it. See 2ce9738ba ("cgroupfs: use init_cred when
+		 * populating new cgroupfs mount") for more details.
+		 */
+		cred = override_creds(&init_cred);
+
+		ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
+		if (ret)
+			goto rm_base_files;
+
 		ret = rebind_subsystems(root, root->subsys_mask, 0);
-		if (ret == -EBUSY) {
-			free_cgrp_cset_links(&tmp_links);
-			goto unlock_drop;
-		}
+		if (ret)
+			goto rm_base_files;
+
+		revert_creds(cred);
+
 		/*
 		 * There must be no failure case after here, since rebinding
 		 * takes care of subsystems' refcounts, which are explicitly
 		 * dropped in the failure exit path.
 		 */

-		/* EBUSY should be the only error here */
-		BUG_ON(ret);
-
 		list_add(&root->root_list, &cgroup_roots);
 		cgroup_root_count++;

-		sb->s_root->d_fsdata = root_cgrp;
-		root->top_cgroup.dentry = sb->s_root;
-
 		/* Link the top cgroup in this hierarchy into all
 		 * the css_set objects */
 		write_lock(&css_set_lock);
@@ -1689,9 +1673,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		BUG_ON(!list_empty(&root_cgrp->children));
 		BUG_ON(root->number_of_cgroups != 1);

-		cred = override_creds(&init_cred);
-		cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
-		revert_creds(cred);
 		mutex_unlock(&cgroup_root_mutex);
 		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
@@ -1711,15 +1692,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 				pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
 			}
 		}
-
-		/* no subsys rebinding, so refcounts don't change */
-		drop_parsed_module_refcounts(opts.subsys_mask);
 	}

 	kfree(opts.release_agent);
 	kfree(opts.name);
 	return dget(sb->s_root);

+ rm_base_files:
+	free_cgrp_cset_links(&tmp_links);
+	cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
+	revert_creds(cred);
  unlock_drop:
 	cgroup_exit_root_id(root);
 	mutex_unlock(&cgroup_root_mutex);
@@ -1727,8 +1709,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	mutex_unlock(&inode->i_mutex);
  drop_new_super:
 	deactivate_locked_super(sb);
- drop_modules:
-	drop_parsed_module_refcounts(opts.subsys_mask);
  out_err:
 	kfree(opts.release_agent);
 	kfree(opts.name);
@@ -1746,6 +1726,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	BUG_ON(root->number_of_cgroups != 1);
 	BUG_ON(!list_empty(&cgrp->children));

+	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
 	mutex_lock(&cgroup_root_mutex);

@@ -1778,6 +1759,7 @@ static void cgroup_kill_sb(struct super_block *sb) {

 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);

 	simple_xattrs_free(&cgrp->xattrs);

@@ -1889,7 +1871,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path);
 struct task_and_cgroup {
 	struct task_struct	*task;
 	struct cgroup		*cgrp;
-	struct css_set		*cg;
+	struct css_set		*cset;
 };

 struct cgroup_taskset {
@@ -1939,18 +1921,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
 EXPORT_SYMBOL_GPL(cgroup_taskset_next);

 /**
- * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
+ * cgroup_taskset_cur_css - return the matching css for the current task
  * @tset: taskset of interest
+ * @subsys_id: the ID of the target subsystem
  *
- * Return the cgroup for the current (last returned) task of @tset. This
- * function must be preceded by either cgroup_taskset_first() or
- * cgroup_taskset_next().
+ * Return the css for the current (last returned) task of @tset for
+ * subsystem specified by @subsys_id. This function must be preceded by
+ * either cgroup_taskset_first() or cgroup_taskset_next().
  */
-struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
+struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
+						   int subsys_id)
 {
-	return tset->cur_cgrp;
+	return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
 }
-EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
+EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);

 /**
  * cgroup_taskset_size - return the number of tasks in taskset
@@ -2089,8 +2073,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 	 * step 1: check that we can legitimately attach to the cgroup.
 	 */
 	for_each_root_subsys(root, ss) {
+		struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
+
 		if (ss->can_attach) {
-			retval = ss->can_attach(cgrp, &tset);
+			retval = ss->can_attach(css, &tset);
 			if (retval) {
 				failed_ss = ss;
 				goto out_cancel_attach;
@@ -2107,8 +2093,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,

 		tc = flex_array_get(group, i);
 		old_cset = task_css_set(tc->task);
-		tc->cg = find_css_set(old_cset, cgrp);
-		if (!tc->cg) {
+		tc->cset = find_css_set(old_cset, cgrp);
+		if (!tc->cset) {
 			retval = -ENOMEM;
 			goto out_put_css_set_refs;
 		}
@@ -2121,7 +2107,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 	 */
 	for (i = 0; i < group_size; i++) {
 		tc = flex_array_get(group, i);
-		cgroup_task_migrate(tc->cgrp, tc->task, tc->cg);
+		cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
 	}
 	/* nothing is sensitive to fork() after this point. */

@@ -2129,8 +2115,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
 	 * step 4: do subsystem attach callbacks.
 	 */
 	for_each_root_subsys(root, ss) {
+		struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
+
 		if (ss->attach)
-			ss->attach(cgrp, &tset);
+			ss->attach(css, &tset);
 	}

 	/*
@@ -2141,18 +2129,20 @@ out_put_css_set_refs:
 	if (retval) {
 		for (i = 0; i < group_size; i++) {
 			tc = flex_array_get(group, i);
-			if (!tc->cg)
+			if (!tc->cset)
 				break;
-			put_css_set(tc->cg);
+			put_css_set(tc->cset);
 		}
 	}
 out_cancel_attach:
 	if (retval) {
 		for_each_root_subsys(root, ss) {
+			struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
+
 			if (ss == failed_ss)
 				break;
 			if (ss->cancel_attach)
-				ss->cancel_attach(cgrp, &tset);
+				ss->cancel_attach(css, &tset);
 		}
 	}
 out_free_group_list:
@@ -2253,9 +2243,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)

 	mutex_lock(&cgroup_mutex);
 	for_each_active_root(root) {
-		struct cgroup *from_cg = task_cgroup_from_root(from, root);
+		struct cgroup *from_cgrp = task_cgroup_from_root(from, root);

-		retval = cgroup_attach_task(from_cg, tsk, false);
+		retval = cgroup_attach_task(from_cgrp, tsk, false);
 		if (retval)
 			break;
 	}
@@ -2265,34 +2255,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 }
 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);

-static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
+static int cgroup_tasks_write(struct cgroup_subsys_state *css,
+			      struct cftype *cft, u64 pid)
 {
-	return attach_task_by_pid(cgrp, pid, false);
+	return attach_task_by_pid(css->cgroup, pid, false);
 }

-static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+static int cgroup_procs_write(struct cgroup_subsys_state *css,
+			      struct cftype *cft, u64 tgid)
 {
-	return attach_task_by_pid(cgrp, tgid, true);
+	return attach_task_by_pid(css->cgroup, tgid, true);
 }

-static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
-				      const char *buffer)
+static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
+				      struct cftype *cft, const char *buffer)
 {
-	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+	BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX);
 	if (strlen(buffer) >= PATH_MAX)
 		return -EINVAL;
-	if (!cgroup_lock_live_group(cgrp))
+	if (!cgroup_lock_live_group(css->cgroup))
 		return -ENODEV;
 	mutex_lock(&cgroup_root_mutex);
-	strcpy(cgrp->root->release_agent_path, buffer);
+	strcpy(css->cgroup->root->release_agent_path, buffer);
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 	return 0;
 }

-static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
-				     struct seq_file *seq)
+static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
+				     struct cftype *cft, struct seq_file *seq)
 {
+	struct cgroup *cgrp = css->cgroup;
+
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
 	seq_puts(seq, cgrp->root->release_agent_path);
@@ -2301,20 +2295,20 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
 	return 0;
 }

-static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft,
-				     struct seq_file *seq)
+static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css,
+				     struct cftype *cft, struct seq_file *seq)
 {
-	seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
+	seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup));
 	return 0;
 }

 /* A buffer size big enough for numbers or short strings */
 #define CGROUP_LOCAL_BUFFER_SIZE 64

-static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
-				struct file *file,
-				const char __user *userbuf,
-				size_t nbytes, loff_t *unused_ppos)
+static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css,
+				struct cftype *cft, struct file *file,
+				const char __user *userbuf, size_t nbytes,
+				loff_t *unused_ppos)
 {
 	char buffer[CGROUP_LOCAL_BUFFER_SIZE];
 	int retval = 0;
@@ -2332,22 +2326,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
 		u64 val = simple_strtoull(strstrip(buffer), &end, 0);
 		if (*end)
 			return -EINVAL;
-		retval = cft->write_u64(cgrp, cft, val);
+		retval = cft->write_u64(css, cft, val);
 	} else {
 		s64 val = simple_strtoll(strstrip(buffer), &end, 0);
 		if (*end)
 			return -EINVAL;
-		retval = cft->write_s64(cgrp, cft, val);
+		retval = cft->write_s64(css, cft, val);
 	}
 	if (!retval)
 		retval = nbytes;
 	return retval;
 }

-static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
-				   struct file *file,
-				   const char __user *userbuf,
-				   size_t nbytes, loff_t *unused_ppos)
+static ssize_t cgroup_write_string(struct cgroup_subsys_state *css,
+				   struct cftype *cft, struct file *file,
+				   const char __user *userbuf, size_t nbytes,
+				   loff_t *unused_ppos)
 {
 	char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
 	int retval = 0;
@@ -2370,7 +2364,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
 	}

 	buffer[nbytes] = 0;	/* nul-terminate */
-	retval = cft->write_string(cgrp, cft, strstrip(buffer));
+	retval = cft->write_string(css, cft, strstrip(buffer));
 	if (!retval)
 		retval = nbytes;
 out:
@@ -2380,65 +2374,60 @@ out:
 }

 static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
 				 size_t nbytes, loff_t *ppos)
 {
+	struct cfent *cfe = __d_cfe(file->f_dentry);
 	struct cftype *cft = __d_cft(file->f_dentry);
-	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+	struct cgroup_subsys_state *css = cfe->css;

-	if (cgroup_is_dead(cgrp))
-		return -ENODEV;
 	if (cft->write)
-		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
+		return cft->write(css, cft, file, buf, nbytes, ppos);
 	if (cft->write_u64 || cft->write_s64)
-		return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
+		return cgroup_write_X64(css, cft, file, buf, nbytes, ppos);
 	if (cft->write_string)
-		return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
+		return cgroup_write_string(css, cft, file, buf, nbytes, ppos);
 	if (cft->trigger) {
-		int ret = cft->trigger(cgrp, (unsigned int)cft->private);
+		int ret = cft->trigger(css, (unsigned int)cft->private);
 		return ret ? ret : nbytes;
 	}
 	return -EINVAL;
 }

-static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
-			       struct file *file,
-			       char __user *buf, size_t nbytes,
-			       loff_t *ppos)
+static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css,
+			       struct cftype *cft, struct file *file,
+			       char __user *buf, size_t nbytes, loff_t *ppos)
 {
 	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
-	u64 val = cft->read_u64(cgrp, cft);
+	u64 val = cft->read_u64(css, cft);
 	int len = sprintf(tmp, "%llu\n", (unsigned long long) val);

 	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }

-static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
-			       struct file *file,
-			       char __user *buf, size_t nbytes,
-			       loff_t *ppos)
+static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css,
+			       struct cftype *cft, struct file *file,
+			       char __user *buf, size_t nbytes, loff_t *ppos)
 {
 	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
-	s64 val = cft->read_s64(cgrp, cft);
+	s64 val = cft->read_s64(css, cft);
 	int len = sprintf(tmp, "%lld\n", (long long) val);

 	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }

 static ssize_t cgroup_file_read(struct file *file, char __user *buf,
 				size_t nbytes, loff_t *ppos)
 {
+	struct cfent *cfe = __d_cfe(file->f_dentry);
 	struct cftype *cft = __d_cft(file->f_dentry);
-	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-
-	if (cgroup_is_dead(cgrp))
-		return -ENODEV;
+	struct cgroup_subsys_state *css = cfe->css;

 	if (cft->read)
-		return cft->read(cgrp, cft, file, buf, nbytes, ppos);
+		return cft->read(css, cft, file, buf, nbytes, ppos);
 	if (cft->read_u64)
-		return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
+		return cgroup_read_u64(css, cft, file, buf, nbytes, ppos);
 	if (cft->read_s64)
-		return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
+		return cgroup_read_s64(css, cft, file, buf, nbytes, ppos);
 	return -EINVAL;
 }

@@ -2447,11 +2436,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
 * supports string->u64 maps, but can be extended in future.
 */

-struct cgroup_seqfile_state {
-	struct cftype *cft;
-	struct cgroup *cgroup;
-};
-
 static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
 {
 	struct seq_file *sf = cb->state;
@@ -2460,69 +2444,86 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)

 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
 {
-	struct cgroup_seqfile_state *state = m->private;
-	struct cftype *cft = state->cft;
+	struct cfent *cfe = m->private;
+	struct cftype *cft = cfe->type;
+	struct cgroup_subsys_state *css = cfe->css;
+
 	if (cft->read_map) {
 		struct cgroup_map_cb cb = {
 			.fill = cgroup_map_add,
 			.state = m,
 		};
-		return cft->read_map(state->cgroup, cft, &cb);
+		return cft->read_map(css, cft, &cb);
 	}
-	return cft->read_seq_string(state->cgroup, cft, m);
-}
-
-static int cgroup_seqfile_release(struct inode *inode, struct file *file)
-{
-	struct seq_file *seq = file->private_data;
-	kfree(seq->private);
-	return single_release(inode, file);
+	return cft->read_seq_string(css, cft, m);
 }

 static const struct file_operations cgroup_seqfile_operations = {
 	.read = seq_read,
 	.write = cgroup_file_write,
 	.llseek = seq_lseek,
-	.release = cgroup_seqfile_release,
+	.release = single_release,
 };

 static int cgroup_file_open(struct inode *inode, struct file *file)
 {
+	struct cfent *cfe = __d_cfe(file->f_dentry);
+	struct cftype *cft = __d_cft(file->f_dentry);
+	struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
+	struct cgroup_subsys_state *css;
 	int err;
-	struct cftype *cft;

 	err = generic_file_open(inode, file);
 	if (err)
 		return err;
-	cft = __d_cft(file->f_dentry);

-	if (cft->read_map || cft->read_seq_string) {
-		struct cgroup_seqfile_state *state;
+	/*
+	 * If the file belongs to a subsystem, pin the css. Will be
+	 * unpinned either on open failure or release. This ensures that
+	 * @css stays alive for all file operations.
+	 */
+	rcu_read_lock();
+	css = cgroup_css(cgrp, cft->ss);
+	if (cft->ss && !css_tryget(css))
+		css = NULL;
+	rcu_read_unlock();

-		state = kzalloc(sizeof(*state), GFP_USER);
-		if (!state)
-			return -ENOMEM;
+	if (!css)
+		return -ENODEV;

-		state->cft = cft;
-		state->cgroup = __d_cgrp(file->f_dentry->d_parent);
+	/*
+	 * @cfe->css is used by read/write/close to determine the
+	 * associated css. @file->private_data would be a better place but
+	 * that's already used by seqfile. Multiple accessors may use it
+	 * simultaneously which is okay as the association never changes.
+	 */
+	WARN_ON_ONCE(cfe->css && cfe->css != css);
+	cfe->css = css;
+
+	if (cft->read_map || cft->read_seq_string) {
 		file->f_op = &cgroup_seqfile_operations;
-		err = single_open(file, cgroup_seqfile_show, state);
-		if (err < 0)
-			kfree(state);
-	} else if (cft->open)
+		err = single_open(file, cgroup_seqfile_show, cfe);
+	} else if (cft->open) {
 		err = cft->open(inode, file);
-	else
-		err = 0;
+	}

+	if (css->ss && err)
+		css_put(css);
 	return err;
 }

 static int cgroup_file_release(struct inode *inode, struct file *file)
 {
+	struct cfent *cfe = __d_cfe(file->f_dentry);
 	struct cftype *cft = __d_cft(file->f_dentry);
+	struct cgroup_subsys_state *css = cfe->css;
+	int ret = 0;
+
 	if (cft->release)
-		return cft->release(inode, file);
-	return 0;
+		ret = cft->release(inode, file);
+	if (css->ss)
+		css_put(css);
+	return ret;
 }

 /*
@@ -2736,8 +2737,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
 	return mode;
 }

-static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-			   struct cftype *cft)
+static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
 {
 	struct dentry *dir = cgrp->dentry;
 	struct cgroup *parent = __d_cgrp(dir);
@@ -2747,8 +2747,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 	umode_t mode;
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };

-	if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
-		strcpy(name, subsys->name);
+	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
+	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
+		strcpy(name, cft->ss->name);
 		strcat(name, ".");
 	}
 	strcat(name, cft->name);
@@ -2782,11 +2783,25 @@ out:
 	return error;
 }

-static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-			      struct cftype cfts[], bool is_add)
+/**
+ * cgroup_addrm_files - add or remove files to a cgroup directory
+ * @cgrp: the target cgroup
+ * @cfts: array of cftypes to be added
+ * @is_add: whether to add or remove
+ *
+ * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
+ * For removals, this function never fails. If addition fails, this
+ * function doesn't remove files already added. The caller is responsible
+ * for cleaning up.
+ */
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+			      bool is_add)
 {
 	struct cftype *cft;
-	int err, ret = 0;
+	int ret;
+
+	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
+	lockdep_assert_held(&cgroup_mutex);

 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
 		/* does cft->flags tell us to skip this file on @cgrp? */
@@ -2798,16 +2813,17 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			continue;

 		if (is_add) {
-			err = cgroup_add_file(cgrp, subsys, cft);
-			if (err)
+			ret = cgroup_add_file(cgrp, cft);
+			if (ret) {
 				pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
-					cft->name, err);
-			ret = err;
+					cft->name, ret);
+				return ret;
+			}
 		} else {
 			cgroup_rm_file(cgrp, cft);
 		}
 	}
-	return ret;
+	return 0;
 }

 static void cgroup_cfts_prepare(void)
@@ -2816,28 +2832,30 @@ static void cgroup_cfts_prepare(void)
 	/*
 	 * Thanks to the entanglement with vfs inode locking, we can't walk
 	 * the existing cgroups under cgroup_mutex and create files.
-	 * Instead, we use cgroup_for_each_descendant_pre() and drop RCU
-	 * read lock before calling cgroup_addrm_files().
+	 * Instead, we use css_for_each_descendant_pre() and drop RCU read
+	 * lock before calling cgroup_addrm_files().
 	 */
 	mutex_lock(&cgroup_mutex);
 }

-static void cgroup_cfts_commit(struct cgroup_subsys *ss,
-			       struct cftype *cfts, bool is_add)
+static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
 	__releases(&cgroup_mutex)
 {
 	LIST_HEAD(pending);
-	struct cgroup *cgrp, *root = &ss->root->top_cgroup;
+	struct cgroup_subsys *ss = cfts[0].ss;
+	struct cgroup *root = &ss->root->top_cgroup;
 	struct super_block *sb = ss->root->sb;
 	struct dentry *prev = NULL;
 	struct inode *inode;
+	struct cgroup_subsys_state *css;
 	u64 update_before;
+	int ret = 0;

 	/* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
 	if (!cfts || ss->root == &cgroup_dummy_root ||
 	    !atomic_inc_not_zero(&sb->s_active)) {
 		mutex_unlock(&cgroup_mutex);
-		return;
+		return 0;
 	}

 	/*
@@ -2849,17 +2867,11 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,

 	mutex_unlock(&cgroup_mutex);

-	/* @root always needs to be updated */
-	inode = root->dentry->d_inode;
-	mutex_lock(&inode->i_mutex);
-	mutex_lock(&cgroup_mutex);
-	cgroup_addrm_files(root, ss, cfts, is_add);
-	mutex_unlock(&cgroup_mutex);
-	mutex_unlock(&inode->i_mutex);
-
 	/* add/rm files for all cgroups created before */
 	rcu_read_lock();
-	cgroup_for_each_descendant_pre(cgrp, root) {
+	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
+		struct cgroup *cgrp = css->cgroup;
+
 		if (cgroup_is_dead(cgrp))
 			continue;

@@ -2873,15 +2885,18 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
 		if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
-			cgroup_addrm_files(cgrp, ss, cfts, is_add);
+			ret = cgroup_addrm_files(cgrp, cfts, is_add);
 		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);

 		rcu_read_lock();
+		if (ret)
+			break;
 	}
 	rcu_read_unlock();
 	dput(prev);
 	deactivate_super(sb);
+	return ret;
 }

 /**
@@ -2901,49 +2916,56 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2901int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2916int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2902{ 2917{
2903 struct cftype_set *set; 2918 struct cftype_set *set;
2919 struct cftype *cft;
2920 int ret;
2904 2921
2905 set = kzalloc(sizeof(*set), GFP_KERNEL); 2922 set = kzalloc(sizeof(*set), GFP_KERNEL);
2906 if (!set) 2923 if (!set)
2907 return -ENOMEM; 2924 return -ENOMEM;
2908 2925
2926 for (cft = cfts; cft->name[0] != '\0'; cft++)
2927 cft->ss = ss;
2928
2909 cgroup_cfts_prepare(); 2929 cgroup_cfts_prepare();
2910 set->cfts = cfts; 2930 set->cfts = cfts;
2911 list_add_tail(&set->node, &ss->cftsets); 2931 list_add_tail(&set->node, &ss->cftsets);
2912 cgroup_cfts_commit(ss, cfts, true); 2932 ret = cgroup_cfts_commit(cfts, true);
2913 2933 if (ret)
2914 return 0; 2934 cgroup_rm_cftypes(cfts);
2935 return ret;
2915} 2936}
2916EXPORT_SYMBOL_GPL(cgroup_add_cftypes); 2937EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2917 2938
2918/** 2939/**
2919 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2940 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2920 * @ss: target cgroup subsystem
2921 * @cfts: zero-length name terminated array of cftypes 2941 * @cfts: zero-length name terminated array of cftypes
2922 * 2942 *
2923 * Unregister @cfts from @ss. Files described by @cfts are removed from 2943 * Unregister @cfts. Files described by @cfts are removed from all
2924 * all existing cgroups to which @ss is attached and all future cgroups 2944 * existing cgroups and all future cgroups won't have them either. This
2925 * won't have them either. This function can be called anytime whether @ss 2945 * function can be called anytime whether @cfts' subsys is attached or not.
2926 * is attached or not.
2927 * 2946 *
2928 * Returns 0 on successful unregistration, -ENOENT if @cfts is not 2947 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2929 * registered with @ss. 2948 * registered.
2930 */ 2949 */
2931int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2950int cgroup_rm_cftypes(struct cftype *cfts)
2932{ 2951{
2933 struct cftype_set *set; 2952 struct cftype_set *set;
2934 2953
2954 if (!cfts || !cfts[0].ss)
2955 return -ENOENT;
2956
2935 cgroup_cfts_prepare(); 2957 cgroup_cfts_prepare();
2936 2958
2937 list_for_each_entry(set, &ss->cftsets, node) { 2959 list_for_each_entry(set, &cfts[0].ss->cftsets, node) {
2938 if (set->cfts == cfts) { 2960 if (set->cfts == cfts) {
2939 list_del(&set->node); 2961 list_del(&set->node);
2940 kfree(set); 2962 kfree(set);
2941 cgroup_cfts_commit(ss, cfts, false); 2963 cgroup_cfts_commit(cfts, false);
2942 return 0; 2964 return 0;
2943 } 2965 }
2944 } 2966 }
2945 2967
2946 cgroup_cfts_commit(ss, NULL, false); 2968 cgroup_cfts_commit(NULL, false);
2947 return -ENOENT; 2969 return -ENOENT;
2948} 2970}
2949 2971
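
A minimal sketch, not part of this patch, of how a controller would consume the reworked registration interface above: cgroup_add_cftypes() now records the owning subsystem in each cft->ss and unwinds itself on failure, while cgroup_rm_cftypes() needs only the array. The demo_* names and demo_subsys are hypothetical; the read_u64 signature follows the css-based callbacks that appear later in this diff (e.g. cgroup_read_notify_on_release()).

static u64 demo_weight_read(struct cgroup_subsys_state *css,
                            struct cftype *cft)
{
        return 100;             /* would come from the controller's css state */
}

static struct cftype demo_files[] = {
        {
                .name = "demo.weight",
                .read_u64 = demo_weight_read,
        },
        { }     /* zero-length name terminates the array */
};

static int __init demo_register(void)
{
        /* demo_subsys is a hypothetical struct cgroup_subsys */
        return cgroup_add_cftypes(&demo_subsys, demo_files);
}

Removal is symmetric: cgroup_rm_cftypes(demo_files) drops the files from every existing cgroup of that hierarchy.
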
@@ -2966,34 +2988,10 @@ int cgroup_task_count(const struct cgroup *cgrp)
2966} 2988}
2967 2989
2968/* 2990/*
2969 * Advance a list_head iterator. The iterator should be positioned at 2991 * To reduce the fork() overhead for systems that are not actually using
2970 * the start of a css_set 2992 * their cgroups capability, we don't maintain the lists running through
2971 */ 2993 * each css_set to its tasks until we see the list actually used - in other
2972static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) 2994 * words after the first call to css_task_iter_start().
2973{
2974 struct list_head *l = it->cset_link;
2975 struct cgrp_cset_link *link;
2976 struct css_set *cset;
2977
2978 /* Advance to the next non-empty css_set */
2979 do {
2980 l = l->next;
2981 if (l == &cgrp->cset_links) {
2982 it->cset_link = NULL;
2983 return;
2984 }
2985 link = list_entry(l, struct cgrp_cset_link, cset_link);
2986 cset = link->cset;
2987 } while (list_empty(&cset->tasks));
2988 it->cset_link = l;
2989 it->task = cset->tasks.next;
2990}
2991
2992/*
2993 * To reduce the fork() overhead for systems that are not actually
2994 * using their cgroups capability, we don't maintain the lists running
2995 * through each css_set to its tasks until we see the list actually
2996 * used - in other words after the first call to cgroup_iter_start().
2997 */ 2995 */
2998static void cgroup_enable_task_cg_lists(void) 2996static void cgroup_enable_task_cg_lists(void)
2999{ 2997{
@@ -3024,16 +3022,21 @@ static void cgroup_enable_task_cg_lists(void)
3024} 3022}
3025 3023
3026/** 3024/**
3027 * cgroup_next_sibling - find the next sibling of a given cgroup 3025 * css_next_child - find the next child of a given css
3028 * @pos: the current cgroup 3026 * @pos_css: the current position (%NULL to initiate traversal)
3027 * @parent_css: css whose children to walk
3029 * 3028 *
3030 * This function returns the next sibling of @pos and should be called 3029 * This function returns the next child of @parent_css and should be called
3031 * under RCU read lock. The only requirement is that @pos is accessible. 3030 * under RCU read lock. The only requirement is that @parent_css and
3032 * The next sibling is guaranteed to be returned regardless of @pos's 3031 * @pos_css are accessible. The next sibling is guaranteed to be returned
3033 * state. 3032 * regardless of their states.
3034 */ 3033 */
3035struct cgroup *cgroup_next_sibling(struct cgroup *pos) 3034struct cgroup_subsys_state *
3035css_next_child(struct cgroup_subsys_state *pos_css,
3036 struct cgroup_subsys_state *parent_css)
3036{ 3037{
3038 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
3039 struct cgroup *cgrp = parent_css->cgroup;
3037 struct cgroup *next; 3040 struct cgroup *next;
3038 3041
3039 WARN_ON_ONCE(!rcu_read_lock_held()); 3042 WARN_ON_ONCE(!rcu_read_lock_held());
@@ -3048,78 +3051,81 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos)
3048 * safe to dereference from this RCU critical section. If 3051 * safe to dereference from this RCU critical section. If
3049 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 3052 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
3050 * to be visible as %true here. 3053 * to be visible as %true here.
3054 *
3055 * If @pos is dead, its next pointer can't be dereferenced;
3056 * however, as each cgroup is given a monotonically increasing
3057 * unique serial number and always appended to the sibling list,
3058 * the next one can be found by walking the parent's children until
3059 * we see a cgroup with higher serial number than @pos's. While
3060 * this path can be slower, it's taken only when either the current
3061 * cgroup is removed or iteration and removal race.
3051 */ 3062 */
3052 if (likely(!cgroup_is_dead(pos))) { 3063 if (!pos) {
3064 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
3065 } else if (likely(!cgroup_is_dead(pos))) {
3053 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3066 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3054 if (&next->sibling != &pos->parent->children) 3067 } else {
3055 return next; 3068 list_for_each_entry_rcu(next, &cgrp->children, sibling)
3056 return NULL; 3069 if (next->serial_nr > pos->serial_nr)
3070 break;
3057 } 3071 }
3058 3072
3059 /* 3073 if (&next->sibling == &cgrp->children)
3060 * Can't dereference the next pointer. Each cgroup is given a 3074 return NULL;
3061 * monotonically increasing unique serial number and always 3075
3062 * appended to the sibling list, so the next one can be found by 3076 return cgroup_css(next, parent_css->ss);
3063 * walking the parent's children until we see a cgroup with higher
3064 * serial number than @pos's.
3065 *
3066 * While this path can be slow, it's taken only when either the
3067 * current cgroup is removed or iteration and removal race.
3068 */
3069 list_for_each_entry_rcu(next, &pos->parent->children, sibling)
3070 if (next->serial_nr > pos->serial_nr)
3071 return next;
3072 return NULL;
3073} 3077}
3074EXPORT_SYMBOL_GPL(cgroup_next_sibling); 3078EXPORT_SYMBOL_GPL(css_next_child);
3075 3079
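
A minimal usage sketch for the new child iterator; css_for_each_child(), used further down in css_rightmost_descendant(), is the looping form of css_next_child(). demo_count_children() is a hypothetical helper, and the walk stays inside an RCU read-side section as the comment above requires.

static int demo_count_children(struct cgroup_subsys_state *parent_css)
{
        struct cgroup_subsys_state *child;
        int n = 0;

        rcu_read_lock();
        css_for_each_child(child, parent_css)
                n++;
        rcu_read_unlock();

        return n;
}
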
3076/** 3080/**
3077 * cgroup_next_descendant_pre - find the next descendant for pre-order walk 3081 * css_next_descendant_pre - find the next descendant for pre-order walk
3078 * @pos: the current position (%NULL to initiate traversal) 3082 * @pos: the current position (%NULL to initiate traversal)
3079 * @cgroup: cgroup whose descendants to walk 3083 * @root: css whose descendants to walk
3080 * 3084 *
3081 * To be used by cgroup_for_each_descendant_pre(). Find the next 3085 * To be used by css_for_each_descendant_pre(). Find the next descendant
3082 * descendant to visit for pre-order traversal of @cgroup's descendants. 3086 * to visit for pre-order traversal of @root's descendants. @root is
3087 * included in the iteration and the first node to be visited.
3083 * 3088 *
3084 * While this function requires RCU read locking, it doesn't require the 3089 * While this function requires RCU read locking, it doesn't require the
3085 * whole traversal to be contained in a single RCU critical section. This 3090 * whole traversal to be contained in a single RCU critical section. This
3086 * function will return the correct next descendant as long as both @pos 3091 * function will return the correct next descendant as long as both @pos
3087 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3092 * and @root are accessible and @pos is a descendant of @root.
3088 */ 3093 */
3089struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, 3094struct cgroup_subsys_state *
3090 struct cgroup *cgroup) 3095css_next_descendant_pre(struct cgroup_subsys_state *pos,
3096 struct cgroup_subsys_state *root)
3091{ 3097{
3092 struct cgroup *next; 3098 struct cgroup_subsys_state *next;
3093 3099
3094 WARN_ON_ONCE(!rcu_read_lock_held()); 3100 WARN_ON_ONCE(!rcu_read_lock_held());
3095 3101
3096 /* if first iteration, pretend we just visited @cgroup */ 3102 /* if first iteration, visit @root */
3097 if (!pos) 3103 if (!pos)
3098 pos = cgroup; 3104 return root;
3099 3105
3100 /* visit the first child if exists */ 3106 /* visit the first child if exists */
3101 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); 3107 next = css_next_child(NULL, pos);
3102 if (next) 3108 if (next)
3103 return next; 3109 return next;
3104 3110
3105 /* no child, visit my or the closest ancestor's next sibling */ 3111 /* no child, visit my or the closest ancestor's next sibling */
3106 while (pos != cgroup) { 3112 while (pos != root) {
3107 next = cgroup_next_sibling(pos); 3113 next = css_next_child(pos, css_parent(pos));
3108 if (next) 3114 if (next)
3109 return next; 3115 return next;
3110 pos = pos->parent; 3116 pos = css_parent(pos);
3111 } 3117 }
3112 3118
3113 return NULL; 3119 return NULL;
3114} 3120}
3115EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); 3121EXPORT_SYMBOL_GPL(css_next_descendant_pre);
3116 3122
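
A pre-order walk sketch using css_for_each_descendant_pre(), which this patch already uses in cgroup_cfts_commit(). Note that, unlike the old cgroup walk, @root_css itself is now visited first. demo_visit() is hypothetical and must not sleep here, since this sketch keeps the whole traversal inside a single RCU read-side section.

static void demo_walk_pre(struct cgroup_subsys_state *root_css)
{
        struct cgroup_subsys_state *pos;

        rcu_read_lock();
        css_for_each_descendant_pre(pos, root_css)
                demo_visit(pos);        /* visits @root_css first, then descendants */
        rcu_read_unlock();
}
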
3117/** 3123/**
3118 * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup 3124 * css_rightmost_descendant - return the rightmost descendant of a css
3119 * @pos: cgroup of interest 3125 * @pos: css of interest
3120 * 3126 *
3121 * Return the rightmost descendant of @pos. If there's no descendant, 3127 * Return the rightmost descendant of @pos. If there's no descendant, @pos
3122 * @pos is returned. This can be used during pre-order traversal to skip 3128 * is returned. This can be used during pre-order traversal to skip
3123 * subtree of @pos. 3129 * subtree of @pos.
3124 * 3130 *
3125 * While this function requires RCU read locking, it doesn't require the 3131 * While this function requires RCU read locking, it doesn't require the
@@ -3127,9 +3133,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3127 * function will return the correct rightmost descendant as long as @pos is 3133 * function will return the correct rightmost descendant as long as @pos is
3128 * accessible. 3134 * accessible.
3129 */ 3135 */
3130struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) 3136struct cgroup_subsys_state *
3137css_rightmost_descendant(struct cgroup_subsys_state *pos)
3131{ 3138{
3132 struct cgroup *last, *tmp; 3139 struct cgroup_subsys_state *last, *tmp;
3133 3140
3134 WARN_ON_ONCE(!rcu_read_lock_held()); 3141 WARN_ON_ONCE(!rcu_read_lock_held());
3135 3142
@@ -3137,82 +3144,138 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
3137 last = pos; 3144 last = pos;
3138 /* ->prev isn't RCU safe, walk ->next till the end */ 3145 /* ->prev isn't RCU safe, walk ->next till the end */
3139 pos = NULL; 3146 pos = NULL;
3140 list_for_each_entry_rcu(tmp, &last->children, sibling) 3147 css_for_each_child(tmp, last)
3141 pos = tmp; 3148 pos = tmp;
3142 } while (pos); 3149 } while (pos);
3143 3150
3144 return last; 3151 return last;
3145} 3152}
3146EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); 3153EXPORT_SYMBOL_GPL(css_rightmost_descendant);
3147 3154
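
A sketch of the subtree-skipping idiom the comment above alludes to: jump @pos to its rightmost descendant so the next pre-order step resumes past that subtree. demo_prune() and demo_visit() are hypothetical.

static void demo_walk_pruned(struct cgroup_subsys_state *root_css)
{
        struct cgroup_subsys_state *pos;

        rcu_read_lock();
        css_for_each_descendant_pre(pos, root_css) {
                if (demo_prune(pos)) {
                        /* don't descend into @pos; resume after its subtree */
                        pos = css_rightmost_descendant(pos);
                        continue;
                }
                demo_visit(pos);
        }
        rcu_read_unlock();
}
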
3148static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) 3155static struct cgroup_subsys_state *
3156css_leftmost_descendant(struct cgroup_subsys_state *pos)
3149{ 3157{
3150 struct cgroup *last; 3158 struct cgroup_subsys_state *last;
3151 3159
3152 do { 3160 do {
3153 last = pos; 3161 last = pos;
3154 pos = list_first_or_null_rcu(&pos->children, struct cgroup, 3162 pos = css_next_child(NULL, pos);
3155 sibling);
3156 } while (pos); 3163 } while (pos);
3157 3164
3158 return last; 3165 return last;
3159} 3166}
3160 3167
3161/** 3168/**
3162 * cgroup_next_descendant_post - find the next descendant for post-order walk 3169 * css_next_descendant_post - find the next descendant for post-order walk
3163 * @pos: the current position (%NULL to initiate traversal) 3170 * @pos: the current position (%NULL to initiate traversal)
3164 * @cgroup: cgroup whose descendants to walk 3171 * @root: css whose descendants to walk
3165 * 3172 *
3166 * To be used by cgroup_for_each_descendant_post(). Find the next 3173 * To be used by css_for_each_descendant_post(). Find the next descendant
3167 * descendant to visit for post-order traversal of @cgroup's descendants. 3174 * to visit for post-order traversal of @root's descendants. @root is
3175 * included in the iteration and the last node to be visited.
3168 * 3176 *
3169 * While this function requires RCU read locking, it doesn't require the 3177 * While this function requires RCU read locking, it doesn't require the
3170 * whole traversal to be contained in a single RCU critical section. This 3178 * whole traversal to be contained in a single RCU critical section. This
3171 * function will return the correct next descendant as long as both @pos 3179 * function will return the correct next descendant as long as both @pos
3172 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3180 * and @cgroup are accessible and @pos is a descendant of @cgroup.
3173 */ 3181 */
3174struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, 3182struct cgroup_subsys_state *
3175 struct cgroup *cgroup) 3183css_next_descendant_post(struct cgroup_subsys_state *pos,
3184 struct cgroup_subsys_state *root)
3176{ 3185{
3177 struct cgroup *next; 3186 struct cgroup_subsys_state *next;
3178 3187
3179 WARN_ON_ONCE(!rcu_read_lock_held()); 3188 WARN_ON_ONCE(!rcu_read_lock_held());
3180 3189
3181 /* if first iteration, visit the leftmost descendant */ 3190 /* if first iteration, visit the leftmost descendant */
3182 if (!pos) { 3191 if (!pos) {
3183 next = cgroup_leftmost_descendant(cgroup); 3192 next = css_leftmost_descendant(root);
3184 return next != cgroup ? next : NULL; 3193 return next != root ? next : NULL;
3185 } 3194 }
3186 3195
3196 /* if we visited @root, we're done */
3197 if (pos == root)
3198 return NULL;
3199
3187 /* if there's an unvisited sibling, visit its leftmost descendant */ 3200 /* if there's an unvisited sibling, visit its leftmost descendant */
3188 next = cgroup_next_sibling(pos); 3201 next = css_next_child(pos, css_parent(pos));
3189 if (next) 3202 if (next)
3190 return cgroup_leftmost_descendant(next); 3203 return css_leftmost_descendant(next);
3191 3204
3192 /* no sibling left, visit parent */ 3205 /* no sibling left, visit parent */
3193 next = pos->parent; 3206 return css_parent(pos);
3194 return next != cgroup ? next : NULL; 3207}
3208EXPORT_SYMBOL_GPL(css_next_descendant_post);
3209
3210/**
 3211 * css_advance_task_iter - advance a task iterator to the next css_set

3212 * @it: the iterator to advance
3213 *
3214 * Advance @it to the next css_set to walk.
3215 */
3216static void css_advance_task_iter(struct css_task_iter *it)
3217{
3218 struct list_head *l = it->cset_link;
3219 struct cgrp_cset_link *link;
3220 struct css_set *cset;
3221
3222 /* Advance to the next non-empty css_set */
3223 do {
3224 l = l->next;
3225 if (l == &it->origin_css->cgroup->cset_links) {
3226 it->cset_link = NULL;
3227 return;
3228 }
3229 link = list_entry(l, struct cgrp_cset_link, cset_link);
3230 cset = link->cset;
3231 } while (list_empty(&cset->tasks));
3232 it->cset_link = l;
3233 it->task = cset->tasks.next;
3195} 3234}
3196EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
3197 3235
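
A post-order sketch for completeness, using css_for_each_descendant_post() as referenced in the css_next_descendant_post() comment above: children are visited before their parent and @root_css comes last, which suits teardown-style processing. demo_teardown() is hypothetical.

static void demo_walk_post(struct cgroup_subsys_state *root_css)
{
        struct cgroup_subsys_state *pos;

        rcu_read_lock();
        css_for_each_descendant_post(pos, root_css)
                demo_teardown(pos);     /* @root_css is handled last */
        rcu_read_unlock();
}
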
3198void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 3236/**
3237 * css_task_iter_start - initiate task iteration
3238 * @css: the css to walk tasks of
3239 * @it: the task iterator to use
3240 *
3241 * Initiate iteration through the tasks of @css. The caller can call
3242 * css_task_iter_next() to walk through the tasks until the function
3243 * returns NULL. On completion of iteration, css_task_iter_end() must be
3244 * called.
3245 *
3246 * Note that this function acquires a lock which is released when the
3247 * iteration finishes. The caller can't sleep while iteration is in
3248 * progress.
3249 */
3250void css_task_iter_start(struct cgroup_subsys_state *css,
3251 struct css_task_iter *it)
3199 __acquires(css_set_lock) 3252 __acquires(css_set_lock)
3200{ 3253{
3201 /* 3254 /*
3202 * The first time anyone tries to iterate across a cgroup, 3255 * The first time anyone tries to iterate across a css, we need to
3203 * we need to enable the list linking each css_set to its 3256 * enable the list linking each css_set to its tasks, and fix up
3204 * tasks, and fix up all existing tasks. 3257 * all existing tasks.
3205 */ 3258 */
3206 if (!use_task_css_set_links) 3259 if (!use_task_css_set_links)
3207 cgroup_enable_task_cg_lists(); 3260 cgroup_enable_task_cg_lists();
3208 3261
3209 read_lock(&css_set_lock); 3262 read_lock(&css_set_lock);
3210 it->cset_link = &cgrp->cset_links; 3263
3211 cgroup_advance_iter(cgrp, it); 3264 it->origin_css = css;
3265 it->cset_link = &css->cgroup->cset_links;
3266
3267 css_advance_task_iter(it);
3212} 3268}
3213 3269
3214struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 3270/**
3215 struct cgroup_iter *it) 3271 * css_task_iter_next - return the next task for the iterator
3272 * @it: the task iterator being iterated
3273 *
3274 * The "next" function for task iteration. @it should have been
3275 * initialized via css_task_iter_start(). Returns NULL when the iteration
3276 * reaches the end.
3277 */
3278struct task_struct *css_task_iter_next(struct css_task_iter *it)
3216{ 3279{
3217 struct task_struct *res; 3280 struct task_struct *res;
3218 struct list_head *l = it->task; 3281 struct list_head *l = it->task;
@@ -3226,16 +3289,24 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
3226 l = l->next; 3289 l = l->next;
3227 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); 3290 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
3228 if (l == &link->cset->tasks) { 3291 if (l == &link->cset->tasks) {
3229 /* We reached the end of this task list - move on to 3292 /*
3230 * the next cg_cgroup_link */ 3293 * We reached the end of this task list - move on to the
3231 cgroup_advance_iter(cgrp, it); 3294 * next cgrp_cset_link.
3295 */
3296 css_advance_task_iter(it);
3232 } else { 3297 } else {
3233 it->task = l; 3298 it->task = l;
3234 } 3299 }
3235 return res; 3300 return res;
3236} 3301}
3237 3302
3238void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 3303/**
3304 * css_task_iter_end - finish task iteration
3305 * @it: the task iterator to finish
3306 *
3307 * Finish task iteration started by css_task_iter_start().
3308 */
3309void css_task_iter_end(struct css_task_iter *it)
3239 __releases(css_set_lock) 3310 __releases(css_set_lock)
3240{ 3311{
3241 read_unlock(&css_set_lock); 3312 read_unlock(&css_set_lock);
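
A minimal sketch of the task iterator defined above: because css_set_lock is read-held between css_task_iter_start() and css_task_iter_end(), nothing inside the loop may sleep. demo_count_tasks() is a hypothetical helper.

static int demo_count_tasks(struct cgroup_subsys_state *css)
{
        struct css_task_iter it;
        struct task_struct *task;
        int n = 0;

        css_task_iter_start(css, &it);
        while ((task = css_task_iter_next(&it)))
                n++;
        css_task_iter_end(&it);

        return n;
}
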
@@ -3276,46 +3347,49 @@ static inline int started_after(void *p1, void *p2)
3276} 3347}
3277 3348
3278/** 3349/**
3279 * cgroup_scan_tasks - iterate though all the tasks in a cgroup 3350 * css_scan_tasks - iterate though all the tasks in a css
3280 * @scan: struct cgroup_scanner containing arguments for the scan 3351 * @css: the css to iterate tasks of
3352 * @test: optional test callback
3353 * @process: process callback
3354 * @data: data passed to @test and @process
3355 * @heap: optional pre-allocated heap used for task iteration
3356 *
3357 * Iterate through all the tasks in @css, calling @test for each, and if it
3358 * returns %true, call @process for it also.
3359 *
3360 * @test may be NULL, meaning always true (select all tasks), which
3361 * effectively duplicates css_task_iter_{start,next,end}() but does not
3362 * lock css_set_lock for the call to @process.
3281 * 3363 *
3282 * Arguments include pointers to callback functions test_task() and 3364 * It is guaranteed that @process will act on every task that is a member
3283 * process_task(). 3365 * of @css for the duration of this call. This function may or may not
3284 * Iterate through all the tasks in a cgroup, calling test_task() for each, 3366 * call @process for tasks that exit or move to a different css during the
3285 * and if it returns true, call process_task() for it also. 3367 * call, or are forked or move into the css during the call.
3286 * The test_task pointer may be NULL, meaning always true (select all tasks).
3287 * Effectively duplicates cgroup_iter_{start,next,end}()
3288 * but does not lock css_set_lock for the call to process_task().
3289 * The struct cgroup_scanner may be embedded in any structure of the caller's
3290 * creation.
3291 * It is guaranteed that process_task() will act on every task that
3292 * is a member of the cgroup for the duration of this call. This
3293 * function may or may not call process_task() for tasks that exit
3294 * or move to a different cgroup during the call, or are forked or
3295 * move into the cgroup during the call.
3296 * 3368 *
3297 * Note that test_task() may be called with locks held, and may in some 3369 * Note that @test may be called with locks held, and may in some
3298 * situations be called multiple times for the same task, so it should 3370 * situations be called multiple times for the same task, so it should be
3299 * be cheap. 3371 * cheap.
3300 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been 3372 *
3301 * pre-allocated and will be used for heap operations (and its "gt" member will 3373 * If @heap is non-NULL, a heap has been pre-allocated and will be used for
3302 * be overwritten), else a temporary heap will be used (allocation of which 3374 * heap operations (and its "gt" member will be overwritten), else a
3303 * may cause this function to fail). 3375 * temporary heap will be used (allocation of which may cause this function
3376 * to fail).
3304 */ 3377 */
3305int cgroup_scan_tasks(struct cgroup_scanner *scan) 3378int css_scan_tasks(struct cgroup_subsys_state *css,
3379 bool (*test)(struct task_struct *, void *),
3380 void (*process)(struct task_struct *, void *),
3381 void *data, struct ptr_heap *heap)
3306{ 3382{
3307 int retval, i; 3383 int retval, i;
3308 struct cgroup_iter it; 3384 struct css_task_iter it;
3309 struct task_struct *p, *dropped; 3385 struct task_struct *p, *dropped;
3310 /* Never dereference latest_task, since it's not refcounted */ 3386 /* Never dereference latest_task, since it's not refcounted */
3311 struct task_struct *latest_task = NULL; 3387 struct task_struct *latest_task = NULL;
3312 struct ptr_heap tmp_heap; 3388 struct ptr_heap tmp_heap;
3313 struct ptr_heap *heap;
3314 struct timespec latest_time = { 0, 0 }; 3389 struct timespec latest_time = { 0, 0 };
3315 3390
3316 if (scan->heap) { 3391 if (heap) {
3317 /* The caller supplied our heap and pre-allocated its memory */ 3392 /* The caller supplied our heap and pre-allocated its memory */
3318 heap = scan->heap;
3319 heap->gt = &started_after; 3393 heap->gt = &started_after;
3320 } else { 3394 } else {
3321 /* We need to allocate our own heap memory */ 3395 /* We need to allocate our own heap memory */
@@ -3328,25 +3402,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3328 3402
3329 again: 3403 again:
3330 /* 3404 /*
3331 * Scan tasks in the cgroup, using the scanner's "test_task" callback 3405 * Scan tasks in the css, using the @test callback to determine
3332 * to determine which are of interest, and using the scanner's 3406 * which are of interest, and invoking @process callback on the
3333 * "process_task" callback to process any of them that need an update. 3407 * ones which need an update. Since we don't want to hold any
3334 * Since we don't want to hold any locks during the task updates, 3408 * locks during the task updates, gather tasks to be processed in a
3335 * gather tasks to be processed in a heap structure. 3409 * heap structure. The heap is sorted by descending task start
3336 * The heap is sorted by descending task start time. 3410 * time. If the statically-sized heap fills up, we overflow tasks
3337 * If the statically-sized heap fills up, we overflow tasks that 3411 * that started later, and in future iterations only consider tasks
3338 * started later, and in future iterations only consider tasks that 3412 * that started after the latest task in the previous pass. This
3339 * started after the latest task in the previous pass. This
3340 * guarantees forward progress and that we don't miss any tasks. 3413 * guarantees forward progress and that we don't miss any tasks.
3341 */ 3414 */
3342 heap->size = 0; 3415 heap->size = 0;
3343 cgroup_iter_start(scan->cg, &it); 3416 css_task_iter_start(css, &it);
3344 while ((p = cgroup_iter_next(scan->cg, &it))) { 3417 while ((p = css_task_iter_next(&it))) {
3345 /* 3418 /*
3346 * Only affect tasks that qualify per the caller's callback, 3419 * Only affect tasks that qualify per the caller's callback,
3347 * if he provided one 3420 * if he provided one
3348 */ 3421 */
3349 if (scan->test_task && !scan->test_task(p, scan)) 3422 if (test && !test(p, data))
3350 continue; 3423 continue;
3351 /* 3424 /*
3352 * Only process tasks that started after the last task 3425 * Only process tasks that started after the last task
@@ -3374,7 +3447,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3374 * the heap and wasn't inserted 3447 * the heap and wasn't inserted
3375 */ 3448 */
3376 } 3449 }
3377 cgroup_iter_end(scan->cg, &it); 3450 css_task_iter_end(&it);
3378 3451
3379 if (heap->size) { 3452 if (heap->size) {
3380 for (i = 0; i < heap->size; i++) { 3453 for (i = 0; i < heap->size; i++) {
@@ -3384,7 +3457,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3384 latest_task = q; 3457 latest_task = q;
3385 } 3458 }
3386 /* Process the task per the caller's callback */ 3459 /* Process the task per the caller's callback */
3387 scan->process_task(q, scan); 3460 process(q, data);
3388 put_task_struct(q); 3461 put_task_struct(q);
3389 } 3462 }
3390 /* 3463 /*
@@ -3401,10 +3474,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3401 return 0; 3474 return 0;
3402} 3475}
3403 3476
3404static void cgroup_transfer_one_task(struct task_struct *task, 3477static void cgroup_transfer_one_task(struct task_struct *task, void *data)
3405 struct cgroup_scanner *scan)
3406{ 3478{
3407 struct cgroup *new_cgroup = scan->data; 3479 struct cgroup *new_cgroup = data;
3408 3480
3409 mutex_lock(&cgroup_mutex); 3481 mutex_lock(&cgroup_mutex);
3410 cgroup_attach_task(new_cgroup, task, false); 3482 cgroup_attach_task(new_cgroup, task, false);
@@ -3418,15 +3490,8 @@ static void cgroup_transfer_one_task(struct task_struct *task,
3418 */ 3490 */
3419int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) 3491int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3420{ 3492{
3421 struct cgroup_scanner scan; 3493 return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
3422 3494 to, NULL);
3423 scan.cg = from;
3424 scan.test_task = NULL; /* select all tasks in cgroup */
3425 scan.process_task = cgroup_transfer_one_task;
3426 scan.heap = NULL;
3427 scan.data = to;
3428
3429 return cgroup_scan_tasks(&scan);
3430} 3495}
3431 3496
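
A sketch of css_scan_tasks() with a non-NULL @test callback, in the same spirit as cgroup_transfer_tasks() above but limited to kernel threads. demo_mark_task() is hypothetical; @test may be called with locks held and should stay cheap, while @process is invoked afterwards without css_set_lock held.

static bool demo_is_kthread(struct task_struct *task, void *data)
{
        return task->flags & PF_KTHREAD;
}

static void demo_process(struct task_struct *task, void *data)
{
        demo_mark_task(task, data);     /* runs without css_set_lock held */
}

static int demo_scan_kthreads(struct cgroup_subsys_state *css, void *data)
{
        /* NULL heap: a temporary heap is allocated for this call */
        return css_scan_tasks(css, demo_is_kthread, demo_process, data, NULL);
}
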
3432/* 3497/*
@@ -3468,7 +3533,7 @@ struct cgroup_pidlist {
3468 /* pointer to the cgroup we belong to, for list removal purposes */ 3533 /* pointer to the cgroup we belong to, for list removal purposes */
3469 struct cgroup *owner; 3534 struct cgroup *owner;
3470 /* protects the other fields */ 3535 /* protects the other fields */
3471 struct rw_semaphore mutex; 3536 struct rw_semaphore rwsem;
3472}; 3537};
3473 3538
3474/* 3539/*
@@ -3541,7 +3606,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3541 struct pid_namespace *ns = task_active_pid_ns(current); 3606 struct pid_namespace *ns = task_active_pid_ns(current);
3542 3607
3543 /* 3608 /*
3544 * We can't drop the pidlist_mutex before taking the l->mutex in case 3609 * We can't drop the pidlist_mutex before taking the l->rwsem in case
3545 * the last ref-holder is trying to remove l from the list at the same 3610 * the last ref-holder is trying to remove l from the list at the same
3546 * time. Holding the pidlist_mutex precludes somebody taking whichever 3611 * time. Holding the pidlist_mutex precludes somebody taking whichever
3547 * list we find out from under us - compare release_pid_array(). 3612 * list we find out from under us - compare release_pid_array().
@@ -3550,7 +3615,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3550 list_for_each_entry(l, &cgrp->pidlists, links) { 3615 list_for_each_entry(l, &cgrp->pidlists, links) {
3551 if (l->key.type == type && l->key.ns == ns) { 3616 if (l->key.type == type && l->key.ns == ns) {
3552 /* make sure l doesn't vanish out from under us */ 3617 /* make sure l doesn't vanish out from under us */
3553 down_write(&l->mutex); 3618 down_write(&l->rwsem);
3554 mutex_unlock(&cgrp->pidlist_mutex); 3619 mutex_unlock(&cgrp->pidlist_mutex);
3555 return l; 3620 return l;
3556 } 3621 }
@@ -3561,8 +3626,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3561 mutex_unlock(&cgrp->pidlist_mutex); 3626 mutex_unlock(&cgrp->pidlist_mutex);
3562 return l; 3627 return l;
3563 } 3628 }
3564 init_rwsem(&l->mutex); 3629 init_rwsem(&l->rwsem);
3565 down_write(&l->mutex); 3630 down_write(&l->rwsem);
3566 l->key.type = type; 3631 l->key.type = type;
3567 l->key.ns = get_pid_ns(ns); 3632 l->key.ns = get_pid_ns(ns);
3568 l->owner = cgrp; 3633 l->owner = cgrp;
@@ -3580,7 +3645,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3580 pid_t *array; 3645 pid_t *array;
3581 int length; 3646 int length;
3582 int pid, n = 0; /* used for populating the array */ 3647 int pid, n = 0; /* used for populating the array */
3583 struct cgroup_iter it; 3648 struct css_task_iter it;
3584 struct task_struct *tsk; 3649 struct task_struct *tsk;
3585 struct cgroup_pidlist *l; 3650 struct cgroup_pidlist *l;
3586 3651
@@ -3595,8 +3660,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3595 if (!array) 3660 if (!array)
3596 return -ENOMEM; 3661 return -ENOMEM;
3597 /* now, populate the array */ 3662 /* now, populate the array */
3598 cgroup_iter_start(cgrp, &it); 3663 css_task_iter_start(&cgrp->dummy_css, &it);
3599 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3664 while ((tsk = css_task_iter_next(&it))) {
3600 if (unlikely(n == length)) 3665 if (unlikely(n == length))
3601 break; 3666 break;
3602 /* get tgid or pid for procs or tasks file respectively */ 3667 /* get tgid or pid for procs or tasks file respectively */
@@ -3607,7 +3672,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3607 if (pid > 0) /* make sure to only use valid results */ 3672 if (pid > 0) /* make sure to only use valid results */
3608 array[n++] = pid; 3673 array[n++] = pid;
3609 } 3674 }
3610 cgroup_iter_end(cgrp, &it); 3675 css_task_iter_end(&it);
3611 length = n; 3676 length = n;
3612 /* now sort & (if procs) strip out duplicates */ 3677 /* now sort & (if procs) strip out duplicates */
3613 sort(array, length, sizeof(pid_t), cmppid, NULL); 3678 sort(array, length, sizeof(pid_t), cmppid, NULL);
@@ -3623,7 +3688,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3623 l->list = array; 3688 l->list = array;
3624 l->length = length; 3689 l->length = length;
3625 l->use_count++; 3690 l->use_count++;
3626 up_write(&l->mutex); 3691 up_write(&l->rwsem);
3627 *lp = l; 3692 *lp = l;
3628 return 0; 3693 return 0;
3629} 3694}
@@ -3641,7 +3706,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3641{ 3706{
3642 int ret = -EINVAL; 3707 int ret = -EINVAL;
3643 struct cgroup *cgrp; 3708 struct cgroup *cgrp;
3644 struct cgroup_iter it; 3709 struct css_task_iter it;
3645 struct task_struct *tsk; 3710 struct task_struct *tsk;
3646 3711
3647 /* 3712 /*
@@ -3655,8 +3720,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3655 ret = 0; 3720 ret = 0;
3656 cgrp = dentry->d_fsdata; 3721 cgrp = dentry->d_fsdata;
3657 3722
3658 cgroup_iter_start(cgrp, &it); 3723 css_task_iter_start(&cgrp->dummy_css, &it);
3659 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3724 while ((tsk = css_task_iter_next(&it))) {
3660 switch (tsk->state) { 3725 switch (tsk->state) {
3661 case TASK_RUNNING: 3726 case TASK_RUNNING:
3662 stats->nr_running++; 3727 stats->nr_running++;
@@ -3676,7 +3741,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3676 break; 3741 break;
3677 } 3742 }
3678 } 3743 }
3679 cgroup_iter_end(cgrp, &it); 3744 css_task_iter_end(&it);
3680 3745
3681err: 3746err:
3682 return ret; 3747 return ret;
@@ -3701,7 +3766,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3701 int index = 0, pid = *pos; 3766 int index = 0, pid = *pos;
3702 int *iter; 3767 int *iter;
3703 3768
3704 down_read(&l->mutex); 3769 down_read(&l->rwsem);
3705 if (pid) { 3770 if (pid) {
3706 int end = l->length; 3771 int end = l->length;
3707 3772
@@ -3728,7 +3793,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3728static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3793static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3729{ 3794{
3730 struct cgroup_pidlist *l = s->private; 3795 struct cgroup_pidlist *l = s->private;
3731 up_read(&l->mutex); 3796 up_read(&l->rwsem);
3732} 3797}
3733 3798
3734static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3799static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
@@ -3774,7 +3839,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3774 * pidlist_mutex, we have to take pidlist_mutex first. 3839 * pidlist_mutex, we have to take pidlist_mutex first.
3775 */ 3840 */
3776 mutex_lock(&l->owner->pidlist_mutex); 3841 mutex_lock(&l->owner->pidlist_mutex);
3777 down_write(&l->mutex); 3842 down_write(&l->rwsem);
3778 BUG_ON(!l->use_count); 3843 BUG_ON(!l->use_count);
3779 if (!--l->use_count) { 3844 if (!--l->use_count) {
3780 /* we're the last user if refcount is 0; remove and free */ 3845 /* we're the last user if refcount is 0; remove and free */
@@ -3782,12 +3847,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3782 mutex_unlock(&l->owner->pidlist_mutex); 3847 mutex_unlock(&l->owner->pidlist_mutex);
3783 pidlist_free(l->list); 3848 pidlist_free(l->list);
3784 put_pid_ns(l->key.ns); 3849 put_pid_ns(l->key.ns);
3785 up_write(&l->mutex); 3850 up_write(&l->rwsem);
3786 kfree(l); 3851 kfree(l);
3787 return; 3852 return;
3788 } 3853 }
3789 mutex_unlock(&l->owner->pidlist_mutex); 3854 mutex_unlock(&l->owner->pidlist_mutex);
3790 up_write(&l->mutex); 3855 up_write(&l->rwsem);
3791} 3856}
3792 3857
3793static int cgroup_pidlist_release(struct inode *inode, struct file *file) 3858static int cgroup_pidlist_release(struct inode *inode, struct file *file)
@@ -3851,21 +3916,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file)
3851 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); 3916 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3852} 3917}
3853 3918
3854static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 3919static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3855 struct cftype *cft) 3920 struct cftype *cft)
3856{ 3921{
3857 return notify_on_release(cgrp); 3922 return notify_on_release(css->cgroup);
3858} 3923}
3859 3924
3860static int cgroup_write_notify_on_release(struct cgroup *cgrp, 3925static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3861 struct cftype *cft, 3926 struct cftype *cft, u64 val)
3862 u64 val)
3863{ 3927{
3864 clear_bit(CGRP_RELEASABLE, &cgrp->flags); 3928 clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
3865 if (val) 3929 if (val)
3866 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3930 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3867 else 3931 else
3868 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3932 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3869 return 0; 3933 return 0;
3870} 3934}
3871 3935
@@ -3895,18 +3959,18 @@ static void cgroup_event_remove(struct work_struct *work)
3895{ 3959{
3896 struct cgroup_event *event = container_of(work, struct cgroup_event, 3960 struct cgroup_event *event = container_of(work, struct cgroup_event,
3897 remove); 3961 remove);
3898 struct cgroup *cgrp = event->cgrp; 3962 struct cgroup_subsys_state *css = event->css;
3899 3963
3900 remove_wait_queue(event->wqh, &event->wait); 3964 remove_wait_queue(event->wqh, &event->wait);
3901 3965
3902 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3966 event->cft->unregister_event(css, event->cft, event->eventfd);
3903 3967
3904 /* Notify userspace the event is going away. */ 3968 /* Notify userspace the event is going away. */
3905 eventfd_signal(event->eventfd, 1); 3969 eventfd_signal(event->eventfd, 1);
3906 3970
3907 eventfd_ctx_put(event->eventfd); 3971 eventfd_ctx_put(event->eventfd);
3908 kfree(event); 3972 kfree(event);
3909 cgroup_dput(cgrp); 3973 css_put(css);
3910} 3974}
3911 3975
3912/* 3976/*
@@ -3919,7 +3983,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3919{ 3983{
3920 struct cgroup_event *event = container_of(wait, 3984 struct cgroup_event *event = container_of(wait,
3921 struct cgroup_event, wait); 3985 struct cgroup_event, wait);
3922 struct cgroup *cgrp = event->cgrp; 3986 struct cgroup *cgrp = event->css->cgroup;
3923 unsigned long flags = (unsigned long)key; 3987 unsigned long flags = (unsigned long)key;
3924 3988
3925 if (flags & POLLHUP) { 3989 if (flags & POLLHUP) {
@@ -3963,14 +4027,15 @@ static void cgroup_event_ptable_queue_proc(struct file *file,
3963 * Input must be in format '<event_fd> <control_fd> <args>'. 4027 * Input must be in format '<event_fd> <control_fd> <args>'.
3964 * Interpretation of args is defined by control file implementation. 4028 * Interpretation of args is defined by control file implementation.
3965 */ 4029 */
3966static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, 4030static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
3967 const char *buffer) 4031 struct cftype *cft, const char *buffer)
3968{ 4032{
3969 struct cgroup_event *event = NULL; 4033 struct cgroup *cgrp = dummy_css->cgroup;
3970 struct cgroup *cgrp_cfile; 4034 struct cgroup_event *event;
4035 struct cgroup_subsys_state *cfile_css;
3971 unsigned int efd, cfd; 4036 unsigned int efd, cfd;
3972 struct file *efile = NULL; 4037 struct file *efile;
3973 struct file *cfile = NULL; 4038 struct file *cfile;
3974 char *endp; 4039 char *endp;
3975 int ret; 4040 int ret;
3976 4041
@@ -3987,7 +4052,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3987 event = kzalloc(sizeof(*event), GFP_KERNEL); 4052 event = kzalloc(sizeof(*event), GFP_KERNEL);
3988 if (!event) 4053 if (!event)
3989 return -ENOMEM; 4054 return -ENOMEM;
3990 event->cgrp = cgrp; 4055
3991 INIT_LIST_HEAD(&event->list); 4056 INIT_LIST_HEAD(&event->list);
3992 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); 4057 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3993 init_waitqueue_func_entry(&event->wait, cgroup_event_wake); 4058 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
@@ -3996,62 +4061,68 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3996 efile = eventfd_fget(efd); 4061 efile = eventfd_fget(efd);
3997 if (IS_ERR(efile)) { 4062 if (IS_ERR(efile)) {
3998 ret = PTR_ERR(efile); 4063 ret = PTR_ERR(efile);
3999 goto fail; 4064 goto out_kfree;
4000 } 4065 }
4001 4066
4002 event->eventfd = eventfd_ctx_fileget(efile); 4067 event->eventfd = eventfd_ctx_fileget(efile);
4003 if (IS_ERR(event->eventfd)) { 4068 if (IS_ERR(event->eventfd)) {
4004 ret = PTR_ERR(event->eventfd); 4069 ret = PTR_ERR(event->eventfd);
4005 goto fail; 4070 goto out_put_efile;
4006 } 4071 }
4007 4072
4008 cfile = fget(cfd); 4073 cfile = fget(cfd);
4009 if (!cfile) { 4074 if (!cfile) {
4010 ret = -EBADF; 4075 ret = -EBADF;
4011 goto fail; 4076 goto out_put_eventfd;
4012 } 4077 }
4013 4078
 4014 /* the process needs read permission on control file */ 4079 /* the process needs read permission on control file */
4015 /* AV: shouldn't we check that it's been opened for read instead? */ 4080 /* AV: shouldn't we check that it's been opened for read instead? */
4016 ret = inode_permission(file_inode(cfile), MAY_READ); 4081 ret = inode_permission(file_inode(cfile), MAY_READ);
4017 if (ret < 0) 4082 if (ret < 0)
4018 goto fail; 4083 goto out_put_cfile;
4019 4084
4020 event->cft = __file_cft(cfile); 4085 event->cft = __file_cft(cfile);
4021 if (IS_ERR(event->cft)) { 4086 if (IS_ERR(event->cft)) {
4022 ret = PTR_ERR(event->cft); 4087 ret = PTR_ERR(event->cft);
4023 goto fail; 4088 goto out_put_cfile;
4089 }
4090
4091 if (!event->cft->ss) {
4092 ret = -EBADF;
4093 goto out_put_cfile;
4024 } 4094 }
4025 4095
4026 /* 4096 /*
4027 * The file to be monitored must be in the same cgroup as 4097 * Determine the css of @cfile, verify it belongs to the same
4028 * cgroup.event_control is. 4098 * cgroup as cgroup.event_control, and associate @event with it.
4099 * Remaining events are automatically removed on cgroup destruction
4100 * but the removal is asynchronous, so take an extra ref.
4029 */ 4101 */
4030 cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); 4102 rcu_read_lock();
4031 if (cgrp_cfile != cgrp) { 4103
4032 ret = -EINVAL; 4104 ret = -EINVAL;
4033 goto fail; 4105 event->css = cgroup_css(cgrp, event->cft->ss);
4034 } 4106 cfile_css = css_from_dir(cfile->f_dentry->d_parent, event->cft->ss);
4107 if (event->css && event->css == cfile_css && css_tryget(event->css))
4108 ret = 0;
4109
4110 rcu_read_unlock();
4111 if (ret)
4112 goto out_put_cfile;
4035 4113
4036 if (!event->cft->register_event || !event->cft->unregister_event) { 4114 if (!event->cft->register_event || !event->cft->unregister_event) {
4037 ret = -EINVAL; 4115 ret = -EINVAL;
4038 goto fail; 4116 goto out_put_css;
4039 } 4117 }
4040 4118
4041 ret = event->cft->register_event(cgrp, event->cft, 4119 ret = event->cft->register_event(event->css, event->cft,
4042 event->eventfd, buffer); 4120 event->eventfd, buffer);
4043 if (ret) 4121 if (ret)
4044 goto fail; 4122 goto out_put_css;
4045 4123
4046 efile->f_op->poll(efile, &event->pt); 4124 efile->f_op->poll(efile, &event->pt);
4047 4125
4048 /*
4049 * Events should be removed after rmdir of cgroup directory, but before
4050 * destroying subsystem state objects. Let's take reference to cgroup
4051 * directory dentry to do that.
4052 */
4053 dget(cgrp->dentry);
4054
4055 spin_lock(&cgrp->event_list_lock); 4126 spin_lock(&cgrp->event_list_lock);
4056 list_add(&event->list, &cgrp->event_list); 4127 list_add(&event->list, &cgrp->event_list);
4057 spin_unlock(&cgrp->event_list_lock); 4128 spin_unlock(&cgrp->event_list_lock);
@@ -4061,35 +4132,33 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
4061 4132
4062 return 0; 4133 return 0;
4063 4134
4064fail: 4135out_put_css:
4065 if (cfile) 4136 css_put(event->css);
4066 fput(cfile); 4137out_put_cfile:
4067 4138 fput(cfile);
4068 if (event && event->eventfd && !IS_ERR(event->eventfd)) 4139out_put_eventfd:
4069 eventfd_ctx_put(event->eventfd); 4140 eventfd_ctx_put(event->eventfd);
4070 4141out_put_efile:
4071 if (!IS_ERR_OR_NULL(efile)) 4142 fput(efile);
4072 fput(efile); 4143out_kfree:
4073
4074 kfree(event); 4144 kfree(event);
4075 4145
4076 return ret; 4146 return ret;
4077} 4147}
4078 4148
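
For orientation, a hedged userspace-side sketch of the "<event_fd> <control_fd> <args>" format parsed above, modeled on the memory controller's usage-threshold events. The cgroup paths are illustrative and depend on how the hierarchy is mounted; error handling is mostly omitted.

#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* example paths; adjust to the actual memory hierarchy mount point */
        int efd = eventfd(0, 0);
        int cfd = open("/sys/fs/cgroup/memory/demo/memory.usage_in_bytes", O_RDONLY);
        int ecfd = open("/sys/fs/cgroup/memory/demo/cgroup.event_control", O_WRONLY);
        char buf[64];
        uint64_t count;

        if (efd < 0 || cfd < 0 || ecfd < 0)
                return 1;

        /* "<event_fd> <control_fd> <args>": here the arg is a 50 MiB threshold */
        snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd, 50ULL << 20);
        if (write(ecfd, buf, strlen(buf)) < 0)
                return 1;

        read(efd, &count, sizeof(count));       /* blocks until the event fires */
        return 0;
}
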
4079static u64 cgroup_clone_children_read(struct cgroup *cgrp, 4149static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4080 struct cftype *cft) 4150 struct cftype *cft)
4081{ 4151{
4082 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4152 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4083} 4153}
4084 4154
4085static int cgroup_clone_children_write(struct cgroup *cgrp, 4155static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4086 struct cftype *cft, 4156 struct cftype *cft, u64 val)
4087 u64 val)
4088{ 4157{
4089 if (val) 4158 if (val)
4090 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4159 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4091 else 4160 else
4092 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4161 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4093 return 0; 4162 return 0;
4094} 4163}
4095 4164
@@ -4148,36 +4217,34 @@ static struct cftype cgroup_base_files[] = {
4148}; 4217};
4149 4218
4150/** 4219/**
4151 * cgroup_populate_dir - selectively creation of files in a directory 4220 * cgroup_populate_dir - create subsys files in a cgroup directory
4152 * @cgrp: target cgroup 4221 * @cgrp: target cgroup
4153 * @base_files: true if the base files should be added
4154 * @subsys_mask: mask of the subsystem ids whose files should be added 4222 * @subsys_mask: mask of the subsystem ids whose files should be added
4223 *
4224 * On failure, no file is added.
4155 */ 4225 */
4156static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 4226static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
4157 unsigned long subsys_mask)
4158{ 4227{
4159 int err;
4160 struct cgroup_subsys *ss; 4228 struct cgroup_subsys *ss;
4161 4229 int i, ret = 0;
4162 if (base_files) {
4163 err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
4164 if (err < 0)
4165 return err;
4166 }
4167 4230
4168 /* process cftsets of each subsystem */ 4231 /* process cftsets of each subsystem */
4169 for_each_root_subsys(cgrp->root, ss) { 4232 for_each_subsys(ss, i) {
4170 struct cftype_set *set; 4233 struct cftype_set *set;
4171 if (!test_bit(ss->subsys_id, &subsys_mask)) 4234
4235 if (!test_bit(i, &subsys_mask))
4172 continue; 4236 continue;
4173 4237
4174 list_for_each_entry(set, &ss->cftsets, node) 4238 list_for_each_entry(set, &ss->cftsets, node) {
4175 cgroup_addrm_files(cgrp, ss, set->cfts, true); 4239 ret = cgroup_addrm_files(cgrp, set->cfts, true);
4240 if (ret < 0)
4241 goto err;
4242 }
4176 } 4243 }
4177 4244
4178 /* This cgroup is ready now */ 4245 /* This cgroup is ready now */
4179 for_each_root_subsys(cgrp->root, ss) { 4246 for_each_root_subsys(cgrp->root, ss) {
4180 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4247 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
4181 struct css_id *id = rcu_dereference_protected(css->id, true); 4248 struct css_id *id = rcu_dereference_protected(css->id, true);
4182 4249
4183 /* 4250 /*
@@ -4190,14 +4257,57 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
4190 } 4257 }
4191 4258
4192 return 0; 4259 return 0;
4260err:
4261 cgroup_clear_dir(cgrp, subsys_mask);
4262 return ret;
4263}
4264
4265/*
4266 * css destruction is four-stage process.
4267 *
4268 * 1. Destruction starts. Killing of the percpu_ref is initiated.
4269 * Implemented in kill_css().
4270 *
4271 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4272 * and thus css_tryget() is guaranteed to fail, the css can be offlined
4273 * by invoking offline_css(). After offlining, the base ref is put.
4274 * Implemented in css_killed_work_fn().
4275 *
4276 * 3. When the percpu_ref reaches zero, the only possible remaining
4277 * accessors are inside RCU read sections. css_release() schedules the
4278 * RCU callback.
4279 *
4280 * 4. After the grace period, the css can be freed. Implemented in
4281 * css_free_work_fn().
4282 *
4283 * It is actually hairier because both step 2 and 4 require process context
4284 * and thus involve punting to css->destroy_work adding two additional
4285 * steps to the already complex sequence.
4286 */
4287static void css_free_work_fn(struct work_struct *work)
4288{
4289 struct cgroup_subsys_state *css =
4290 container_of(work, struct cgroup_subsys_state, destroy_work);
4291 struct cgroup *cgrp = css->cgroup;
4292
4293 if (css->parent)
4294 css_put(css->parent);
4295
4296 css->ss->css_free(css);
4297 cgroup_dput(cgrp);
4193} 4298}
4194 4299
4195static void css_dput_fn(struct work_struct *work) 4300static void css_free_rcu_fn(struct rcu_head *rcu_head)
4196{ 4301{
4197 struct cgroup_subsys_state *css = 4302 struct cgroup_subsys_state *css =
4198 container_of(work, struct cgroup_subsys_state, dput_work); 4303 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4199 4304
4200 cgroup_dput(css->cgroup); 4305 /*
4306 * css holds an extra ref to @cgrp->dentry which is put on the last
4307 * css_put(). dput() requires process context which we don't have.
4308 */
4309 INIT_WORK(&css->destroy_work, css_free_work_fn);
4310 schedule_work(&css->destroy_work);
4201} 4311}
4202 4312
4203static void css_release(struct percpu_ref *ref) 4313static void css_release(struct percpu_ref *ref)
@@ -4205,49 +4315,47 @@ static void css_release(struct percpu_ref *ref)
4205 struct cgroup_subsys_state *css = 4315 struct cgroup_subsys_state *css =
4206 container_of(ref, struct cgroup_subsys_state, refcnt); 4316 container_of(ref, struct cgroup_subsys_state, refcnt);
4207 4317
4208 schedule_work(&css->dput_work); 4318 call_rcu(&css->rcu_head, css_free_rcu_fn);
4209} 4319}
4210 4320
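
A sketch of what the four stages above mean for code holding css references: find the css under RCU, pin it with css_tryget() before doing anything that can sleep, and drop it with css_put(). The pattern mirrors the cgroup_write_event_control() rework earlier in this diff; demo_lookup_css() and demo_sleepable_op() are hypothetical.

static int demo_pin_and_use(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *css;

        rcu_read_lock();
        css = demo_lookup_css(cgrp);    /* e.g. via cgroup_css() inside this file */
        if (css && !css_tryget(css))    /* fails if the css is being destroyed */
                css = NULL;
        rcu_read_unlock();

        if (!css)
                return -ENOENT;

        demo_sleepable_op(css);         /* safe: we hold a reference, not just RCU */
        css_put(css);                   /* the final put triggers css_release() above */
        return 0;
}
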
4211static void init_cgroup_css(struct cgroup_subsys_state *css, 4321static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
4212 struct cgroup_subsys *ss, 4322 struct cgroup *cgrp)
4213 struct cgroup *cgrp)
4214{ 4323{
4215 css->cgroup = cgrp; 4324 css->cgroup = cgrp;
4325 css->ss = ss;
4216 css->flags = 0; 4326 css->flags = 0;
4217 css->id = NULL; 4327 css->id = NULL;
4218 if (cgrp == cgroup_dummy_top) 4328
4329 if (cgrp->parent)
4330 css->parent = cgroup_css(cgrp->parent, ss);
4331 else
4219 css->flags |= CSS_ROOT; 4332 css->flags |= CSS_ROOT;
4220 BUG_ON(cgrp->subsys[ss->subsys_id]);
4221 cgrp->subsys[ss->subsys_id] = css;
4222 4333
4223 /* 4334 BUG_ON(cgroup_css(cgrp, ss));
4224 * css holds an extra ref to @cgrp->dentry which is put on the last
4225 * css_put(). dput() requires process context, which css_put() may
4226 * be called without. @css->dput_work will be used to invoke
4227 * dput() asynchronously from css_put().
4228 */
4229 INIT_WORK(&css->dput_work, css_dput_fn);
4230} 4335}
4231 4336
4232/* invoke ->post_create() on a new CSS and mark it online if successful */ 4337/* invoke ->css_online() on a new CSS and mark it online if successful */
4233static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4338static int online_css(struct cgroup_subsys_state *css)
4234{ 4339{
4340 struct cgroup_subsys *ss = css->ss;
4235 int ret = 0; 4341 int ret = 0;
4236 4342
4237 lockdep_assert_held(&cgroup_mutex); 4343 lockdep_assert_held(&cgroup_mutex);
4238 4344
4239 if (ss->css_online) 4345 if (ss->css_online)
4240 ret = ss->css_online(cgrp); 4346 ret = ss->css_online(css);
4241 if (!ret) 4347 if (!ret) {
4242 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; 4348 css->flags |= CSS_ONLINE;
4349 css->cgroup->nr_css++;
4350 rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css);
4351 }
4243 return ret; 4352 return ret;
4244} 4353}
4245 4354
4246/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ 4355/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4247static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4356static void offline_css(struct cgroup_subsys_state *css)
4248 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4249{ 4357{
4250 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4358 struct cgroup_subsys *ss = css->ss;
4251 4359
4252 lockdep_assert_held(&cgroup_mutex); 4360 lockdep_assert_held(&cgroup_mutex);
4253 4361
@@ -4255,9 +4363,11 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4255 return; 4363 return;
4256 4364
4257 if (ss->css_offline) 4365 if (ss->css_offline)
4258 ss->css_offline(cgrp); 4366 ss->css_offline(css);
4259 4367
4260 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; 4368 css->flags &= ~CSS_ONLINE;
4369 css->cgroup->nr_css--;
4370 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
4261} 4371}
4262 4372
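
A sketch of the controller-side callbacks matching the css-based signatures used by online_css()/offline_css() above and the ->css_alloc()/->css_free() calls in cgroup_create() below. struct demo_css and the demo_* names are hypothetical.

struct demo_css {
        struct cgroup_subsys_state css;
        u64 weight;
};

static struct cgroup_subsys_state *
demo_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct demo_css *dc = kzalloc(sizeof(*dc), GFP_KERNEL);

        if (!dc)
                return ERR_PTR(-ENOMEM);
        dc->weight = 100;
        return &dc->css;        /* ->cgroup, ->ss and ->parent are filled in by init_css() */
}

static int demo_css_online(struct cgroup_subsys_state *css)
{
        /* once this returns 0, the css is published and reachable via cgroup_css() */
        return 0;
}

static void demo_css_offline(struct cgroup_subsys_state *css)
{
        /* stop accepting new users; outstanding references may still exist */
}

static void demo_css_free(struct cgroup_subsys_state *css)
{
        kfree(container_of(css, struct demo_css, css));
}
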
4263/* 4373/*
@@ -4271,6 +4381,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4271static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 4381static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4272 umode_t mode) 4382 umode_t mode)
4273{ 4383{
4384 struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
4274 struct cgroup *cgrp; 4385 struct cgroup *cgrp;
4275 struct cgroup_name *name; 4386 struct cgroup_name *name;
4276 struct cgroupfs_root *root = parent->root; 4387 struct cgroupfs_root *root = parent->root;
@@ -4288,7 +4399,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4288 goto err_free_cgrp; 4399 goto err_free_cgrp;
4289 rcu_assign_pointer(cgrp->name, name); 4400 rcu_assign_pointer(cgrp->name, name);
4290 4401
4291 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); 4402 /*
4403 * Temporarily set the pointer to NULL, so idr_find() won't return
4404 * a half-baked cgroup.
4405 */
4406 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
4292 if (cgrp->id < 0) 4407 if (cgrp->id < 0)
4293 goto err_free_name; 4408 goto err_free_name;
4294 4409
@@ -4317,6 +4432,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4317 cgrp->dentry = dentry; 4432 cgrp->dentry = dentry;
4318 4433
4319 cgrp->parent = parent; 4434 cgrp->parent = parent;
4435 cgrp->dummy_css.parent = &parent->dummy_css;
4320 cgrp->root = parent->root; 4436 cgrp->root = parent->root;
4321 4437
4322 if (notify_on_release(parent)) 4438 if (notify_on_release(parent))
@@ -4328,22 +4444,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4328 for_each_root_subsys(root, ss) { 4444 for_each_root_subsys(root, ss) {
4329 struct cgroup_subsys_state *css; 4445 struct cgroup_subsys_state *css;
4330 4446
4331 css = ss->css_alloc(cgrp); 4447 css = ss->css_alloc(cgroup_css(parent, ss));
4332 if (IS_ERR(css)) { 4448 if (IS_ERR(css)) {
4333 err = PTR_ERR(css); 4449 err = PTR_ERR(css);
4334 goto err_free_all; 4450 goto err_free_all;
4335 } 4451 }
4452 css_ar[ss->subsys_id] = css;
4336 4453
4337 err = percpu_ref_init(&css->refcnt, css_release); 4454 err = percpu_ref_init(&css->refcnt, css_release);
4338 if (err) { 4455 if (err)
4339 ss->css_free(cgrp);
4340 goto err_free_all; 4456 goto err_free_all;
4341 }
4342 4457
4343 init_cgroup_css(css, ss, cgrp); 4458 init_css(css, ss, cgrp);
4344 4459
4345 if (ss->use_id) { 4460 if (ss->use_id) {
4346 err = alloc_css_id(ss, parent, cgrp); 4461 err = alloc_css_id(css);
4347 if (err) 4462 if (err)
4348 goto err_free_all; 4463 goto err_free_all;
4349 } 4464 }
@@ -4365,16 +4480,22 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4365 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4480 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4366 root->number_of_cgroups++; 4481 root->number_of_cgroups++;
4367 4482
4368 /* each css holds a ref to the cgroup's dentry */ 4483 /* each css holds a ref to the cgroup's dentry and the parent css */
4369 for_each_root_subsys(root, ss) 4484 for_each_root_subsys(root, ss) {
4485 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4486
4370 dget(dentry); 4487 dget(dentry);
4488 css_get(css->parent);
4489 }
4371 4490
4372 /* hold a ref to the parent's dentry */ 4491 /* hold a ref to the parent's dentry */
4373 dget(parent->dentry); 4492 dget(parent->dentry);
4374 4493
4375 /* creation succeeded, notify subsystems */ 4494 /* creation succeeded, notify subsystems */
4376 for_each_root_subsys(root, ss) { 4495 for_each_root_subsys(root, ss) {
4377 err = online_css(ss, cgrp); 4496 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4497
4498 err = online_css(css);
4378 if (err) 4499 if (err)
4379 goto err_destroy; 4500 goto err_destroy;
4380 4501
@@ -4388,7 +4509,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4388 } 4509 }
4389 } 4510 }
4390 4511
4391 err = cgroup_populate_dir(cgrp, true, root->subsys_mask); 4512 idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4513
4514 err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4515 if (err)
4516 goto err_destroy;
4517
4518 err = cgroup_populate_dir(cgrp, root->subsys_mask);
4392 if (err) 4519 if (err)
4393 goto err_destroy; 4520 goto err_destroy;
4394 4521
@@ -4399,18 +4526,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4399 4526
4400err_free_all: 4527err_free_all:
4401 for_each_root_subsys(root, ss) { 4528 for_each_root_subsys(root, ss) {
4402 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4529 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4403 4530
4404 if (css) { 4531 if (css) {
4405 percpu_ref_cancel_init(&css->refcnt); 4532 percpu_ref_cancel_init(&css->refcnt);
4406 ss->css_free(cgrp); 4533 ss->css_free(css);
4407 } 4534 }
4408 } 4535 }
4409 mutex_unlock(&cgroup_mutex); 4536 mutex_unlock(&cgroup_mutex);
4410 /* Release the reference count that we took on the superblock */ 4537 /* Release the reference count that we took on the superblock */
4411 deactivate_super(sb); 4538 deactivate_super(sb);
4412err_free_id: 4539err_free_id:
4413 ida_simple_remove(&root->cgroup_ida, cgrp->id); 4540 idr_remove(&root->cgroup_idr, cgrp->id);
4414err_free_name: 4541err_free_name:
4415 kfree(rcu_dereference_raw(cgrp->name)); 4542 kfree(rcu_dereference_raw(cgrp->name));
4416err_free_cgrp: 4543err_free_cgrp:
@@ -4432,22 +4559,84 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4432 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4559 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4433} 4560}
4434 4561
4435static void cgroup_css_killed(struct cgroup *cgrp) 4562/*
4563 * This is called when the refcnt of a css is confirmed to be killed.
4564 * css_tryget() is now guaranteed to fail.
4565 */
4566static void css_killed_work_fn(struct work_struct *work)
4436{ 4567{
4437 if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) 4568 struct cgroup_subsys_state *css =
4438 return; 4569 container_of(work, struct cgroup_subsys_state, destroy_work);
4570 struct cgroup *cgrp = css->cgroup;
4439 4571
4440 /* percpu ref's of all css's are killed, kick off the next step */ 4572 mutex_lock(&cgroup_mutex);
4441 INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); 4573
4442 schedule_work(&cgrp->destroy_work); 4574 /*
4575 * css_tryget() is guaranteed to fail now. Tell subsystems to
 4576	 * initiate destruction.
4577 */
4578 offline_css(css);
4579
4580 /*
4581 * If @cgrp is marked dead, it's waiting for refs of all css's to
4582 * be disabled before proceeding to the second phase of cgroup
4583 * destruction. If we are the last one, kick it off.
4584 */
4585 if (!cgrp->nr_css && cgroup_is_dead(cgrp))
4586 cgroup_destroy_css_killed(cgrp);
4587
4588 mutex_unlock(&cgroup_mutex);
4589
4590 /*
4591 * Put the css refs from kill_css(). Each css holds an extra
4592 * reference to the cgroup's dentry and cgroup removal proceeds
4593 * regardless of css refs. On the last put of each css, whenever
4594 * that may be, the extra dentry ref is put so that dentry
4595 * destruction happens only after all css's are released.
4596 */
4597 css_put(css);
4443} 4598}
4444 4599
4445static void css_ref_killed_fn(struct percpu_ref *ref) 4600/* css kill confirmation processing requires process context, bounce */
4601static void css_killed_ref_fn(struct percpu_ref *ref)
4446{ 4602{
4447 struct cgroup_subsys_state *css = 4603 struct cgroup_subsys_state *css =
4448 container_of(ref, struct cgroup_subsys_state, refcnt); 4604 container_of(ref, struct cgroup_subsys_state, refcnt);
4449 4605
4450 cgroup_css_killed(css->cgroup); 4606 INIT_WORK(&css->destroy_work, css_killed_work_fn);
4607 schedule_work(&css->destroy_work);
4608}
4609
4610/**
4611 * kill_css - destroy a css
4612 * @css: css to destroy
4613 *
4614 * This function initiates destruction of @css by removing cgroup interface
4615 * files and putting its base reference. ->css_offline() will be invoked
4616 * asynchronously once css_tryget() is guaranteed to fail and when the
4617 * reference count reaches zero, @css will be released.
4618 */
4619static void kill_css(struct cgroup_subsys_state *css)
4620{
4621 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
4622
4623 /*
4624 * Killing would put the base ref, but we need to keep it alive
4625 * until after ->css_offline().
4626 */
4627 css_get(css);
4628
4629 /*
4630 * cgroup core guarantees that, by the time ->css_offline() is
4631 * invoked, no new css reference will be given out via
4632 * css_tryget(). We can't simply call percpu_ref_kill() and
4633 * proceed to offlining css's because percpu_ref_kill() doesn't
4634 * guarantee that the ref is seen as killed on all CPUs on return.
4635 *
4636 * Use percpu_ref_kill_and_confirm() to get notifications as each
4637 * css is confirmed to be seen as killed on all CPUs.
4638 */
4639 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4451} 4640}
4452 4641
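
The hunk above replaces the per-cgroup kill counter with a per-css confirm-then-bounce sequence: percpu_ref_kill_and_confirm() invokes css_killed_ref_fn() once the kill is visible on every CPU, and that callback defers the real offlining to css_killed_work_fn() in process context. A minimal sketch of the same kill-and-confirm pattern on a hypothetical object follows; everything except the percpu_ref and workqueue calls is invented for illustration.

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/printk.h>

struct foo {
	struct percpu_ref refcnt;
	struct work_struct destroy_work;
};

/* Last reference dropped; may run long after kill_foo(). */
static void foo_release(struct percpu_ref *ref)
{
	kfree(container_of(ref, struct foo, refcnt));
}

/* Process context: runs once the kill is confirmed on all CPUs. */
static void foo_killed_work_fn(struct work_struct *work)
{
	struct foo *foo = container_of(work, struct foo, destroy_work);

	pr_debug("foo offline step\n");
	percpu_ref_put(&foo->refcnt);	/* drop the extra ref from kill_foo() */
}

/* Confirmation callback; may fire from any context, so bounce to a work item. */
static void foo_killed_ref_fn(struct percpu_ref *ref)
{
	struct foo *foo = container_of(ref, struct foo, refcnt);

	INIT_WORK(&foo->destroy_work, foo_killed_work_fn);
	schedule_work(&foo->destroy_work);
}

static void kill_foo(struct foo *foo)
{
	percpu_ref_get(&foo->refcnt);	/* keep @foo alive past the offline step */
	percpu_ref_kill_and_confirm(&foo->refcnt, foo_killed_ref_fn);
}

static struct foo *alloc_foo(void)
{
	struct foo *foo = kzalloc(sizeof(*foo), GFP_KERNEL);

	if (foo && percpu_ref_init(&foo->refcnt, foo_release)) {
		kfree(foo);
		return NULL;
	}
	return foo;
}
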
4453/** 4642/**
@@ -4480,6 +4669,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4480 struct dentry *d = cgrp->dentry; 4669 struct dentry *d = cgrp->dentry;
4481 struct cgroup_event *event, *tmp; 4670 struct cgroup_event *event, *tmp;
4482 struct cgroup_subsys *ss; 4671 struct cgroup_subsys *ss;
4672 struct cgroup *child;
4483 bool empty; 4673 bool empty;
4484 4674
4485 lockdep_assert_held(&d->d_inode->i_mutex); 4675 lockdep_assert_held(&d->d_inode->i_mutex);
@@ -4490,47 +4680,41 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4490 * @cgrp from being removed while __put_css_set() is in progress. 4680 * @cgrp from being removed while __put_css_set() is in progress.
4491 */ 4681 */
4492 read_lock(&css_set_lock); 4682 read_lock(&css_set_lock);
4493 empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children); 4683 empty = list_empty(&cgrp->cset_links);
4494 read_unlock(&css_set_lock); 4684 read_unlock(&css_set_lock);
4495 if (!empty) 4685 if (!empty)
4496 return -EBUSY; 4686 return -EBUSY;
4497 4687
4498 /* 4688 /*
4499	 * Block new css_tryget() by killing css refcnts.  cgroup core 4689	 * Make sure there are no live children.  We can't test ->children
4500 * guarantees that, by the time ->css_offline() is invoked, no new 4690 * emptiness as dead children linger on it while being destroyed;
4501 * css reference will be given out via css_tryget(). We can't 4691 * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
4502 * simply call percpu_ref_kill() and proceed to offlining css's
4503 * because percpu_ref_kill() doesn't guarantee that the ref is seen
4504 * as killed on all CPUs on return.
4505 *
4506 * Use percpu_ref_kill_and_confirm() to get notifications as each
4507 * css is confirmed to be seen as killed on all CPUs. The
4508 * notification callback keeps track of the number of css's to be
4509 * killed and schedules cgroup_offline_fn() to perform the rest of
4510 * destruction once the percpu refs of all css's are confirmed to
4511 * be killed.
4512 */ 4692 */
4513 atomic_set(&cgrp->css_kill_cnt, 1); 4693 empty = true;
4514 for_each_root_subsys(cgrp->root, ss) { 4694 rcu_read_lock();
4515 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4695 list_for_each_entry_rcu(child, &cgrp->children, sibling) {
4516 4696 empty = cgroup_is_dead(child);
4517 /* 4697 if (!empty)
4518 * Killing would put the base ref, but we need to keep it 4698 break;
4519 * alive until after ->css_offline.
4520 */
4521 percpu_ref_get(&css->refcnt);
4522
4523 atomic_inc(&cgrp->css_kill_cnt);
4524 percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
4525 } 4699 }
4526 cgroup_css_killed(cgrp); 4700 rcu_read_unlock();
4701 if (!empty)
4702 return -EBUSY;
4703
4704 /*
4705 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4706 * will be invoked to perform the rest of destruction once the
4707 * percpu refs of all css's are confirmed to be killed.
4708 */
4709 for_each_root_subsys(cgrp->root, ss)
4710 kill_css(cgroup_css(cgrp, ss));
4527 4711
4528 /* 4712 /*
4529 * Mark @cgrp dead. This prevents further task migration and child 4713 * Mark @cgrp dead. This prevents further task migration and child
4530 * creation by disabling cgroup_lock_live_group(). Note that 4714 * creation by disabling cgroup_lock_live_group(). Note that
4531 * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to 4715 * CGRP_DEAD assertion is depended upon by css_next_child() to
4532 * resume iteration after dropping RCU read lock. See 4716 * resume iteration after dropping RCU read lock. See
4533 * cgroup_next_sibling() for details. 4717 * css_next_child() for details.
4534 */ 4718 */
4535 set_bit(CGRP_DEAD, &cgrp->flags); 4719 set_bit(CGRP_DEAD, &cgrp->flags);
4536 4720
@@ -4541,9 +4725,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4541 raw_spin_unlock(&release_list_lock); 4725 raw_spin_unlock(&release_list_lock);
4542 4726
4543 /* 4727 /*
4544 * Remove @cgrp directory. The removal puts the base ref but we 4728 * If @cgrp has css's attached, the second stage of cgroup
4545 * aren't quite done with @cgrp yet, so hold onto it. 4729 * destruction is kicked off from css_killed_work_fn() after the
4730 * refs of all attached css's are killed. If @cgrp doesn't have
4731 * any css, we kick it off here.
4546 */ 4732 */
4733 if (!cgrp->nr_css)
4734 cgroup_destroy_css_killed(cgrp);
4735
4736 /*
4737 * Clear the base files and remove @cgrp directory. The removal
4738 * puts the base ref but we aren't quite done with @cgrp yet, so
4739 * hold onto it.
4740 */
4741 cgroup_addrm_files(cgrp, cgroup_base_files, false);
4547 dget(d); 4742 dget(d);
4548 cgroup_d_remove_dir(d); 4743 cgroup_d_remove_dir(d);
4549 4744
@@ -4563,50 +4758,36 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4563}; 4758};
4564 4759
4565/** 4760/**
4566 * cgroup_offline_fn - the second step of cgroup destruction 4761 * cgroup_destroy_css_killed - the second step of cgroup destruction
4567 * @work: cgroup->destroy_free_work 4762 * @work: cgroup->destroy_free_work
4568 * 4763 *
4569 * This function is invoked from a work item for a cgroup which is being 4764 * This function is invoked from a work item for a cgroup which is being
4570 * destroyed after the percpu refcnts of all css's are guaranteed to be 4765 * destroyed after all css's are offlined and performs the rest of
4571 * seen as killed on all CPUs, and performs the rest of destruction. This 4766 * destruction. This is the second step of destruction described in the
4572 * is the second step of destruction described in the comment above 4767 * comment above cgroup_destroy_locked().
4573 * cgroup_destroy_locked().
4574 */ 4768 */
4575static void cgroup_offline_fn(struct work_struct *work) 4769static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4576{ 4770{
4577 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
4578 struct cgroup *parent = cgrp->parent; 4771 struct cgroup *parent = cgrp->parent;
4579 struct dentry *d = cgrp->dentry; 4772 struct dentry *d = cgrp->dentry;
4580 struct cgroup_subsys *ss;
4581 4773
4582 mutex_lock(&cgroup_mutex); 4774 lockdep_assert_held(&cgroup_mutex);
4583 4775
4584 /* 4776 /* delete this cgroup from parent->children */
4585 * css_tryget() is guaranteed to fail now. Tell subsystems to 4777 list_del_rcu(&cgrp->sibling);
4586 * initate destruction.
4587 */
4588 for_each_root_subsys(cgrp->root, ss)
4589 offline_css(ss, cgrp);
4590 4778
4591 /* 4779 /*
4592 * Put the css refs from cgroup_destroy_locked(). Each css holds 4780 * We should remove the cgroup object from idr before its grace
4593 * an extra reference to the cgroup's dentry and cgroup removal 4781 * period starts, so we won't be looking up a cgroup while the
4594 * proceeds regardless of css refs. On the last put of each css, 4782 * cgroup is being freed.
4595 * whenever that may be, the extra dentry ref is put so that dentry
4596 * destruction happens only after all css's are released.
4597 */ 4783 */
4598 for_each_root_subsys(cgrp->root, ss) 4784 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4599 css_put(cgrp->subsys[ss->subsys_id]); 4785 cgrp->id = -1;
4600
4601 /* delete this cgroup from parent->children */
4602 list_del_rcu(&cgrp->sibling);
4603 4786
4604 dput(d); 4787 dput(d);
4605 4788
4606 set_bit(CGRP_RELEASABLE, &parent->flags); 4789 set_bit(CGRP_RELEASABLE, &parent->flags);
4607 check_for_release(parent); 4790 check_for_release(parent);
4608
4609 mutex_unlock(&cgroup_mutex);
4610} 4791}
4611 4792
4612static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4793static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -4629,6 +4810,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4629 * deregistration. 4810 * deregistration.
4630 */ 4811 */
4631 if (ss->base_cftypes) { 4812 if (ss->base_cftypes) {
4813 struct cftype *cft;
4814
4815 for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++)
4816 cft->ss = ss;
4817
4632 ss->base_cftset.cfts = ss->base_cftypes; 4818 ss->base_cftset.cfts = ss->base_cftypes;
4633 list_add_tail(&ss->base_cftset.node, &ss->cftsets); 4819 list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4634 } 4820 }
@@ -4648,10 +4834,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4648 /* Create the top cgroup state for this subsystem */ 4834 /* Create the top cgroup state for this subsystem */
4649 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); 4835 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4650 ss->root = &cgroup_dummy_root; 4836 ss->root = &cgroup_dummy_root;
4651 css = ss->css_alloc(cgroup_dummy_top); 4837 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4652 /* We don't handle early failures gracefully */ 4838 /* We don't handle early failures gracefully */
4653 BUG_ON(IS_ERR(css)); 4839 BUG_ON(IS_ERR(css));
4654 init_cgroup_css(css, ss, cgroup_dummy_top); 4840 init_css(css, ss, cgroup_dummy_top);
4655 4841
4656 /* Update the init_css_set to contain a subsys 4842 /* Update the init_css_set to contain a subsys
4657 * pointer to this state - since the subsystem is 4843 * pointer to this state - since the subsystem is
@@ -4666,7 +4852,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4666 * need to invoke fork callbacks here. */ 4852 * need to invoke fork callbacks here. */
4667 BUG_ON(!list_empty(&init_task.tasks)); 4853 BUG_ON(!list_empty(&init_task.tasks));
4668 4854
4669 BUG_ON(online_css(ss, cgroup_dummy_top)); 4855 BUG_ON(online_css(css));
4670 4856
4671 mutex_unlock(&cgroup_mutex); 4857 mutex_unlock(&cgroup_mutex);
4672 4858
@@ -4727,7 +4913,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4727 * struct, so this can happen first (i.e. before the dummy root 4913 * struct, so this can happen first (i.e. before the dummy root
4728 * attachment). 4914 * attachment).
4729 */ 4915 */
4730 css = ss->css_alloc(cgroup_dummy_top); 4916 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4731 if (IS_ERR(css)) { 4917 if (IS_ERR(css)) {
4732 /* failure case - need to deassign the cgroup_subsys[] slot. */ 4918 /* failure case - need to deassign the cgroup_subsys[] slot. */
4733 cgroup_subsys[ss->subsys_id] = NULL; 4919 cgroup_subsys[ss->subsys_id] = NULL;
@@ -4739,8 +4925,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4739 ss->root = &cgroup_dummy_root; 4925 ss->root = &cgroup_dummy_root;
4740 4926
4741 /* our new subsystem will be attached to the dummy hierarchy. */ 4927 /* our new subsystem will be attached to the dummy hierarchy. */
4742 init_cgroup_css(css, ss, cgroup_dummy_top); 4928 init_css(css, ss, cgroup_dummy_top);
4743 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4929 /* init_idr must be after init_css() because it sets css->id. */
4744 if (ss->use_id) { 4930 if (ss->use_id) {
4745 ret = cgroup_init_idr(ss, css); 4931 ret = cgroup_init_idr(ss, css);
4746 if (ret) 4932 if (ret)
@@ -4770,7 +4956,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4770 } 4956 }
4771 write_unlock(&css_set_lock); 4957 write_unlock(&css_set_lock);
4772 4958
4773 ret = online_css(ss, cgroup_dummy_top); 4959 ret = online_css(css);
4774 if (ret) 4960 if (ret)
4775 goto err_unload; 4961 goto err_unload;
4776 4962
@@ -4802,14 +4988,14 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4802 4988
4803 /* 4989 /*
4804 * we shouldn't be called if the subsystem is in use, and the use of 4990 * we shouldn't be called if the subsystem is in use, and the use of
4805 * try_module_get in parse_cgroupfs_options should ensure that it 4991 * try_module_get() in rebind_subsystems() should ensure that it
4806 * doesn't start being used while we're killing it off. 4992 * doesn't start being used while we're killing it off.
4807 */ 4993 */
4808 BUG_ON(ss->root != &cgroup_dummy_root); 4994 BUG_ON(ss->root != &cgroup_dummy_root);
4809 4995
4810 mutex_lock(&cgroup_mutex); 4996 mutex_lock(&cgroup_mutex);
4811 4997
4812 offline_css(ss, cgroup_dummy_top); 4998 offline_css(cgroup_css(cgroup_dummy_top, ss));
4813 4999
4814 if (ss->use_id) 5000 if (ss->use_id)
4815 idr_destroy(&ss->idr); 5001 idr_destroy(&ss->idr);
@@ -4843,8 +5029,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4843 * the cgrp->subsys pointer to find their state. note that this 5029 * the cgrp->subsys pointer to find their state. note that this
4844 * also takes care of freeing the css_id. 5030 * also takes care of freeing the css_id.
4845 */ 5031 */
4846 ss->css_free(cgroup_dummy_top); 5032 ss->css_free(cgroup_css(cgroup_dummy_top, ss));
4847 cgroup_dummy_top->subsys[ss->subsys_id] = NULL; 5033 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4848 5034
4849 mutex_unlock(&cgroup_mutex); 5035 mutex_unlock(&cgroup_mutex);
4850} 5036}
@@ -4926,6 +5112,10 @@ int __init cgroup_init(void)
4926 5112
4927 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); 5113 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
4928 5114
5115 err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
5116 0, 1, GFP_KERNEL);
5117 BUG_ON(err < 0);
5118
4929 mutex_unlock(&cgroup_root_mutex); 5119 mutex_unlock(&cgroup_root_mutex);
4930 mutex_unlock(&cgroup_mutex); 5120 mutex_unlock(&cgroup_mutex);
4931 5121
@@ -5082,7 +5272,7 @@ void cgroup_fork(struct task_struct *child)
5082 * Adds the task to the list running through its css_set if necessary and 5272 * Adds the task to the list running through its css_set if necessary and
5083 * call the subsystem fork() callbacks. Has to be after the task is 5273 * call the subsystem fork() callbacks. Has to be after the task is
5084 * visible on the task list in case we race with the first call to 5274 * visible on the task list in case we race with the first call to
5085 * cgroup_iter_start() - to guarantee that the new task ends up on its 5275 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5086 * list. 5276 * list.
5087 */ 5277 */
5088void cgroup_post_fork(struct task_struct *child) 5278void cgroup_post_fork(struct task_struct *child)
@@ -5195,10 +5385,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5195 */ 5385 */
5196 for_each_builtin_subsys(ss, i) { 5386 for_each_builtin_subsys(ss, i) {
5197 if (ss->exit) { 5387 if (ss->exit) {
5198 struct cgroup *old_cgrp = cset->subsys[i]->cgroup; 5388 struct cgroup_subsys_state *old_css = cset->subsys[i];
5199 struct cgroup *cgrp = task_cgroup(tsk, i); 5389 struct cgroup_subsys_state *css = task_css(tsk, i);
5200 5390
5201 ss->exit(cgrp, old_cgrp, tsk); 5391 ss->exit(css, old_css, tsk);
5202 } 5392 }
5203 } 5393 }
5204 } 5394 }
@@ -5457,20 +5647,16 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
5457 return 0; 5647 return 0;
5458} 5648}
5459 5649
5460static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, 5650static int alloc_css_id(struct cgroup_subsys_state *child_css)
5461 struct cgroup *child)
5462{ 5651{
5463 int subsys_id, i, depth = 0; 5652 struct cgroup_subsys_state *parent_css = css_parent(child_css);
5464 struct cgroup_subsys_state *parent_css, *child_css;
5465 struct css_id *child_id, *parent_id; 5653 struct css_id *child_id, *parent_id;
5654 int i, depth;
5466 5655
5467 subsys_id = ss->subsys_id;
5468 parent_css = parent->subsys[subsys_id];
5469 child_css = child->subsys[subsys_id];
5470 parent_id = rcu_dereference_protected(parent_css->id, true); 5656 parent_id = rcu_dereference_protected(parent_css->id, true);
5471 depth = parent_id->depth + 1; 5657 depth = parent_id->depth + 1;
5472 5658
5473 child_id = get_new_cssid(ss, depth); 5659 child_id = get_new_cssid(child_css->ss, depth);
5474 if (IS_ERR(child_id)) 5660 if (IS_ERR(child_id))
5475 return PTR_ERR(child_id); 5661 return PTR_ERR(child_id);
5476 5662
@@ -5508,31 +5694,56 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
5508} 5694}
5509EXPORT_SYMBOL_GPL(css_lookup); 5695EXPORT_SYMBOL_GPL(css_lookup);
5510 5696
5511/* 5697/**
5512 * get corresponding css from file open on cgroupfs directory 5698 * css_from_dir - get corresponding css from the dentry of a cgroup dir
5699 * @dentry: directory dentry of interest
5700 * @ss: subsystem of interest
5701 *
5702 * Must be called under RCU read lock. The caller is responsible for
5703 * pinning the returned css if it needs to be accessed outside the RCU
5704 * critical section.
5513 */ 5705 */
5514struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) 5706struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
5707 struct cgroup_subsys *ss)
5515{ 5708{
5516 struct cgroup *cgrp; 5709 struct cgroup *cgrp;
5517 struct inode *inode;
5518 struct cgroup_subsys_state *css;
5519 5710
5520 inode = file_inode(f); 5711 WARN_ON_ONCE(!rcu_read_lock_held());
5521 /* check in cgroup filesystem dir */ 5712
5522 if (inode->i_op != &cgroup_dir_inode_operations) 5713 /* is @dentry a cgroup dir? */
5714 if (!dentry->d_inode ||
5715 dentry->d_inode->i_op != &cgroup_dir_inode_operations)
5523 return ERR_PTR(-EBADF); 5716 return ERR_PTR(-EBADF);
5524 5717
5525 if (id < 0 || id >= CGROUP_SUBSYS_COUNT) 5718 cgrp = __d_cgrp(dentry);
5526 return ERR_PTR(-EINVAL); 5719 return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT);
5720}
5527 5721
5528 /* get cgroup */ 5722/**
5529 cgrp = __d_cgrp(f->f_dentry); 5723 * css_from_id - lookup css by id
5530 css = cgrp->subsys[id]; 5724 * @id: the cgroup id
5531 return css ? css : ERR_PTR(-ENOENT); 5725 * @ss: cgroup subsys to be looked into
5726 *
 5727 * Returns the css if there's a valid one with @id, otherwise returns NULL.
5728 * Should be called under rcu_read_lock().
5729 */
5730struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5731{
5732 struct cgroup *cgrp;
5733
5734 rcu_lockdep_assert(rcu_read_lock_held() ||
5735 lockdep_is_held(&cgroup_mutex),
5736 "css_from_id() needs proper protection");
5737
5738 cgrp = idr_find(&ss->root->cgroup_idr, id);
5739 if (cgrp)
5740 return cgroup_css(cgrp, ss);
5741 return NULL;
5532} 5742}
5533 5743
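
Both lookup helpers above return an unpinned css under RCU protection, so a caller that wants to use the result outside the read-side critical section has to pin it itself. A small sketch of that, assuming the css_tryget()/css_put() pairing referenced elsewhere in this diff; the helper name is invented.

/* Resolve @id to a pinned css, or NULL.  Caller must css_put() the result. */
static struct cgroup_subsys_state *pin_css_by_id(int id, struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = css_from_id(id, ss);
	if (css && !css_tryget(css))
		css = NULL;	/* lost the race against kill_css() */
	rcu_read_unlock();

	return css;
}
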
5534#ifdef CONFIG_CGROUP_DEBUG 5744#ifdef CONFIG_CGROUP_DEBUG
5535static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) 5745static struct cgroup_subsys_state *
5746debug_css_alloc(struct cgroup_subsys_state *parent_css)
5536{ 5747{
5537 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5748 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5538 5749
@@ -5542,22 +5753,24 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
5542 return css; 5753 return css;
5543} 5754}
5544 5755
5545static void debug_css_free(struct cgroup *cgrp) 5756static void debug_css_free(struct cgroup_subsys_state *css)
5546{ 5757{
5547 kfree(cgrp->subsys[debug_subsys_id]); 5758 kfree(css);
5548} 5759}
5549 5760
5550static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) 5761static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
5762 struct cftype *cft)
5551{ 5763{
5552 return cgroup_task_count(cgrp); 5764 return cgroup_task_count(css->cgroup);
5553} 5765}
5554 5766
5555static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) 5767static u64 current_css_set_read(struct cgroup_subsys_state *css,
5768 struct cftype *cft)
5556{ 5769{
5557 return (u64)(unsigned long)current->cgroups; 5770 return (u64)(unsigned long)current->cgroups;
5558} 5771}
5559 5772
5560static u64 current_css_set_refcount_read(struct cgroup *cgrp, 5773static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5561 struct cftype *cft) 5774 struct cftype *cft)
5562{ 5775{
5563 u64 count; 5776 u64 count;
@@ -5568,7 +5781,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp,
5568 return count; 5781 return count;
5569} 5782}
5570 5783
5571static int current_css_set_cg_links_read(struct cgroup *cgrp, 5784static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
5572 struct cftype *cft, 5785 struct cftype *cft,
5573 struct seq_file *seq) 5786 struct seq_file *seq)
5574{ 5787{
@@ -5595,14 +5808,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp,
5595} 5808}
5596 5809
5597#define MAX_TASKS_SHOWN_PER_CSS 25 5810#define MAX_TASKS_SHOWN_PER_CSS 25
5598static int cgroup_css_links_read(struct cgroup *cgrp, 5811static int cgroup_css_links_read(struct cgroup_subsys_state *css,
5599 struct cftype *cft, 5812 struct cftype *cft, struct seq_file *seq)
5600 struct seq_file *seq)
5601{ 5813{
5602 struct cgrp_cset_link *link; 5814 struct cgrp_cset_link *link;
5603 5815
5604 read_lock(&css_set_lock); 5816 read_lock(&css_set_lock);
5605 list_for_each_entry(link, &cgrp->cset_links, cset_link) { 5817 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5606 struct css_set *cset = link->cset; 5818 struct css_set *cset = link->cset;
5607 struct task_struct *task; 5819 struct task_struct *task;
5608 int count = 0; 5820 int count = 0;
@@ -5621,9 +5833,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp,
5621 return 0; 5833 return 0;
5622} 5834}
5623 5835
5624static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) 5836static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5625{ 5837{
5626 return test_bit(CGRP_RELEASABLE, &cgrp->flags); 5838 return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
5627} 5839}
5628 5840
5629static struct cftype debug_files[] = { 5841static struct cftype debug_files[] = {
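
Taken together, the debug subsystem conversion above shows the shape controllers take after this series: css_alloc() receives the parent css (NULL for a root css), css_free() and the cftype read handlers receive the css directly, and a controller recovers its own state with a container_of() wrapper. A bare-bones controller written against that interface might look like the sketch below; the "demo" subsystem and its fields are purely illustrative.

#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/err.h>

struct demo_state {
	struct cgroup_subsys_state css;
	u64 counter;
};

static inline struct demo_state *css_demo(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct demo_state, css) : NULL;
}

static struct cgroup_subsys_state *
demo_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct demo_state *ds = kzalloc(sizeof(*ds), GFP_KERNEL);

	return ds ? &ds->css : ERR_PTR(-ENOMEM);
}

static void demo_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_demo(css));
}

static u64 demo_counter_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return css_demo(css)->counter;
}
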
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 75dda1ea5026..f0ff64d0ebaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -45,25 +45,19 @@ struct freezer {
45 spinlock_t lock; 45 spinlock_t lock;
46}; 46};
47 47
48static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) 48static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
49{ 49{
50 return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), 50 return css ? container_of(css, struct freezer, css) : NULL;
51 struct freezer, css);
52} 51}
53 52
54static inline struct freezer *task_freezer(struct task_struct *task) 53static inline struct freezer *task_freezer(struct task_struct *task)
55{ 54{
56 return container_of(task_subsys_state(task, freezer_subsys_id), 55 return css_freezer(task_css(task, freezer_subsys_id));
57 struct freezer, css);
58} 56}
59 57
60static struct freezer *parent_freezer(struct freezer *freezer) 58static struct freezer *parent_freezer(struct freezer *freezer)
61{ 59{
62 struct cgroup *pcg = freezer->css.cgroup->parent; 60 return css_freezer(css_parent(&freezer->css));
63
64 if (pcg)
65 return cgroup_freezer(pcg);
66 return NULL;
67} 61}
68 62
69bool cgroup_freezing(struct task_struct *task) 63bool cgroup_freezing(struct task_struct *task)
@@ -92,7 +86,8 @@ static const char *freezer_state_strs(unsigned int state)
92 86
93struct cgroup_subsys freezer_subsys; 87struct cgroup_subsys freezer_subsys;
94 88
95static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) 89static struct cgroup_subsys_state *
90freezer_css_alloc(struct cgroup_subsys_state *parent_css)
96{ 91{
97 struct freezer *freezer; 92 struct freezer *freezer;
98 93
@@ -105,22 +100,22 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
105} 100}
106 101
107/** 102/**
108 * freezer_css_online - commit creation of a freezer cgroup 103 * freezer_css_online - commit creation of a freezer css
109 * @cgroup: cgroup being created 104 * @css: css being created
110 * 105 *
111 * We're committing to creation of @cgroup. Mark it online and inherit 106 * We're committing to creation of @css. Mark it online and inherit
112 * parent's freezing state while holding both parent's and our 107 * parent's freezing state while holding both parent's and our
113 * freezer->lock. 108 * freezer->lock.
114 */ 109 */
115static int freezer_css_online(struct cgroup *cgroup) 110static int freezer_css_online(struct cgroup_subsys_state *css)
116{ 111{
117 struct freezer *freezer = cgroup_freezer(cgroup); 112 struct freezer *freezer = css_freezer(css);
118 struct freezer *parent = parent_freezer(freezer); 113 struct freezer *parent = parent_freezer(freezer);
119 114
120 /* 115 /*
121 * The following double locking and freezing state inheritance 116 * The following double locking and freezing state inheritance
122 * guarantee that @cgroup can never escape ancestors' freezing 117 * guarantee that @cgroup can never escape ancestors' freezing
123 * states. See cgroup_for_each_descendant_pre() for details. 118 * states. See css_for_each_descendant_pre() for details.
124 */ 119 */
125 if (parent) 120 if (parent)
126 spin_lock_irq(&parent->lock); 121 spin_lock_irq(&parent->lock);
@@ -141,15 +136,15 @@ static int freezer_css_online(struct cgroup *cgroup)
141} 136}
142 137
143/** 138/**
144 * freezer_css_offline - initiate destruction of @cgroup 139 * freezer_css_offline - initiate destruction of a freezer css
145 * @cgroup: cgroup being destroyed 140 * @css: css being destroyed
146 * 141 *
147 * @cgroup is going away. Mark it dead and decrement system_freezing_count 142 * @css is going away. Mark it dead and decrement system_freezing_count if
148 * if it was holding one. 143 * it was holding one.
149 */ 144 */
150static void freezer_css_offline(struct cgroup *cgroup) 145static void freezer_css_offline(struct cgroup_subsys_state *css)
151{ 146{
152 struct freezer *freezer = cgroup_freezer(cgroup); 147 struct freezer *freezer = css_freezer(css);
153 148
154 spin_lock_irq(&freezer->lock); 149 spin_lock_irq(&freezer->lock);
155 150
@@ -161,9 +156,9 @@ static void freezer_css_offline(struct cgroup *cgroup)
161 spin_unlock_irq(&freezer->lock); 156 spin_unlock_irq(&freezer->lock);
162} 157}
163 158
164static void freezer_css_free(struct cgroup *cgroup) 159static void freezer_css_free(struct cgroup_subsys_state *css)
165{ 160{
166 kfree(cgroup_freezer(cgroup)); 161 kfree(css_freezer(css));
167} 162}
168 163
169/* 164/*
@@ -175,25 +170,26 @@ static void freezer_css_free(struct cgroup *cgroup)
175 * @freezer->lock. freezer_attach() makes the new tasks conform to the 170 * @freezer->lock. freezer_attach() makes the new tasks conform to the
176 * current state and all following state changes can see the new tasks. 171 * current state and all following state changes can see the new tasks.
177 */ 172 */
178static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) 173static void freezer_attach(struct cgroup_subsys_state *new_css,
174 struct cgroup_taskset *tset)
179{ 175{
180 struct freezer *freezer = cgroup_freezer(new_cgrp); 176 struct freezer *freezer = css_freezer(new_css);
181 struct task_struct *task; 177 struct task_struct *task;
182 bool clear_frozen = false; 178 bool clear_frozen = false;
183 179
184 spin_lock_irq(&freezer->lock); 180 spin_lock_irq(&freezer->lock);
185 181
186 /* 182 /*
187 * Make the new tasks conform to the current state of @new_cgrp. 183 * Make the new tasks conform to the current state of @new_css.
188 * For simplicity, when migrating any task to a FROZEN cgroup, we 184 * For simplicity, when migrating any task to a FROZEN cgroup, we
189 * revert it to FREEZING and let update_if_frozen() determine the 185 * revert it to FREEZING and let update_if_frozen() determine the
190 * correct state later. 186 * correct state later.
191 * 187 *
192 * Tasks in @tset are on @new_cgrp but may not conform to its 188 * Tasks in @tset are on @new_css but may not conform to its
193 * current state before executing the following - !frozen tasks may 189 * current state before executing the following - !frozen tasks may
194 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. 190 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
195 */ 191 */
196 cgroup_taskset_for_each(task, new_cgrp, tset) { 192 cgroup_taskset_for_each(task, new_css, tset) {
197 if (!(freezer->state & CGROUP_FREEZING)) { 193 if (!(freezer->state & CGROUP_FREEZING)) {
198 __thaw_task(task); 194 __thaw_task(task);
199 } else { 195 } else {
@@ -231,7 +227,7 @@ static void freezer_fork(struct task_struct *task)
231 * The root cgroup is non-freezable, so we can skip the 227 * The root cgroup is non-freezable, so we can skip the
232 * following check. 228 * following check.
233 */ 229 */
234 if (!freezer->css.cgroup->parent) 230 if (!parent_freezer(freezer))
235 goto out; 231 goto out;
236 232
237 spin_lock_irq(&freezer->lock); 233 spin_lock_irq(&freezer->lock);
@@ -244,7 +240,7 @@ out:
244 240
245/** 241/**
246 * update_if_frozen - update whether a cgroup finished freezing 242 * update_if_frozen - update whether a cgroup finished freezing
247 * @cgroup: cgroup of interest 243 * @css: css of interest
248 * 244 *
249 * Once FREEZING is initiated, transition to FROZEN is lazily updated by 245 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
250 * calling this function. If the current state is FREEZING but not FROZEN, 246 * calling this function. If the current state is FREEZING but not FROZEN,
@@ -255,14 +251,14 @@ out:
255 * update_if_frozen() on all descendants prior to invoking this function. 251 * update_if_frozen() on all descendants prior to invoking this function.
256 * 252 *
257 * Task states and freezer state might disagree while tasks are being 253 * Task states and freezer state might disagree while tasks are being
258 * migrated into or out of @cgroup, so we can't verify task states against 254 * migrated into or out of @css, so we can't verify task states against
259 * @freezer state here. See freezer_attach() for details. 255 * @freezer state here. See freezer_attach() for details.
260 */ 256 */
261static void update_if_frozen(struct cgroup *cgroup) 257static void update_if_frozen(struct cgroup_subsys_state *css)
262{ 258{
263 struct freezer *freezer = cgroup_freezer(cgroup); 259 struct freezer *freezer = css_freezer(css);
264 struct cgroup *pos; 260 struct cgroup_subsys_state *pos;
265 struct cgroup_iter it; 261 struct css_task_iter it;
266 struct task_struct *task; 262 struct task_struct *task;
267 263
268 WARN_ON_ONCE(!rcu_read_lock_held()); 264 WARN_ON_ONCE(!rcu_read_lock_held());
@@ -274,8 +270,8 @@ static void update_if_frozen(struct cgroup *cgroup)
274 goto out_unlock; 270 goto out_unlock;
275 271
276 /* are all (live) children frozen? */ 272 /* are all (live) children frozen? */
277 cgroup_for_each_child(pos, cgroup) { 273 css_for_each_child(pos, css) {
278 struct freezer *child = cgroup_freezer(pos); 274 struct freezer *child = css_freezer(pos);
279 275
280 if ((child->state & CGROUP_FREEZER_ONLINE) && 276 if ((child->state & CGROUP_FREEZER_ONLINE) &&
281 !(child->state & CGROUP_FROZEN)) 277 !(child->state & CGROUP_FROZEN))
@@ -283,9 +279,9 @@ static void update_if_frozen(struct cgroup *cgroup)
283 } 279 }
284 280
285 /* are all tasks frozen? */ 281 /* are all tasks frozen? */
286 cgroup_iter_start(cgroup, &it); 282 css_task_iter_start(css, &it);
287 283
288 while ((task = cgroup_iter_next(cgroup, &it))) { 284 while ((task = css_task_iter_next(&it))) {
289 if (freezing(task)) { 285 if (freezing(task)) {
290 /* 286 /*
291 * freezer_should_skip() indicates that the task 287 * freezer_should_skip() indicates that the task
@@ -300,52 +296,49 @@ static void update_if_frozen(struct cgroup *cgroup)
300 296
301 freezer->state |= CGROUP_FROZEN; 297 freezer->state |= CGROUP_FROZEN;
302out_iter_end: 298out_iter_end:
303 cgroup_iter_end(cgroup, &it); 299 css_task_iter_end(&it);
304out_unlock: 300out_unlock:
305 spin_unlock_irq(&freezer->lock); 301 spin_unlock_irq(&freezer->lock);
306} 302}
307 303
308static int freezer_read(struct cgroup *cgroup, struct cftype *cft, 304static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft,
309 struct seq_file *m) 305 struct seq_file *m)
310{ 306{
311 struct cgroup *pos; 307 struct cgroup_subsys_state *pos;
312 308
313 rcu_read_lock(); 309 rcu_read_lock();
314 310
315 /* update states bottom-up */ 311 /* update states bottom-up */
316 cgroup_for_each_descendant_post(pos, cgroup) 312 css_for_each_descendant_post(pos, css)
317 update_if_frozen(pos); 313 update_if_frozen(pos);
318 update_if_frozen(cgroup);
319 314
320 rcu_read_unlock(); 315 rcu_read_unlock();
321 316
322 seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); 317 seq_puts(m, freezer_state_strs(css_freezer(css)->state));
323 seq_putc(m, '\n'); 318 seq_putc(m, '\n');
324 return 0; 319 return 0;
325} 320}
326 321
327static void freeze_cgroup(struct freezer *freezer) 322static void freeze_cgroup(struct freezer *freezer)
328{ 323{
329 struct cgroup *cgroup = freezer->css.cgroup; 324 struct css_task_iter it;
330 struct cgroup_iter it;
331 struct task_struct *task; 325 struct task_struct *task;
332 326
333 cgroup_iter_start(cgroup, &it); 327 css_task_iter_start(&freezer->css, &it);
334 while ((task = cgroup_iter_next(cgroup, &it))) 328 while ((task = css_task_iter_next(&it)))
335 freeze_task(task); 329 freeze_task(task);
336 cgroup_iter_end(cgroup, &it); 330 css_task_iter_end(&it);
337} 331}
338 332
339static void unfreeze_cgroup(struct freezer *freezer) 333static void unfreeze_cgroup(struct freezer *freezer)
340{ 334{
341 struct cgroup *cgroup = freezer->css.cgroup; 335 struct css_task_iter it;
342 struct cgroup_iter it;
343 struct task_struct *task; 336 struct task_struct *task;
344 337
345 cgroup_iter_start(cgroup, &it); 338 css_task_iter_start(&freezer->css, &it);
346 while ((task = cgroup_iter_next(cgroup, &it))) 339 while ((task = css_task_iter_next(&it)))
347 __thaw_task(task); 340 __thaw_task(task);
348 cgroup_iter_end(cgroup, &it); 341 css_task_iter_end(&it);
349} 342}
350 343
351/** 344/**
@@ -395,12 +388,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
395 */ 388 */
396static void freezer_change_state(struct freezer *freezer, bool freeze) 389static void freezer_change_state(struct freezer *freezer, bool freeze)
397{ 390{
398 struct cgroup *pos; 391 struct cgroup_subsys_state *pos;
399
400 /* update @freezer */
401 spin_lock_irq(&freezer->lock);
402 freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
403 spin_unlock_irq(&freezer->lock);
404 392
405 /* 393 /*
406 * Update all its descendants in pre-order traversal. Each 394 * Update all its descendants in pre-order traversal. Each
@@ -408,24 +396,33 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
408 * CGROUP_FREEZING_PARENT. 396 * CGROUP_FREEZING_PARENT.
409 */ 397 */
410 rcu_read_lock(); 398 rcu_read_lock();
411 cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { 399 css_for_each_descendant_pre(pos, &freezer->css) {
412 struct freezer *pos_f = cgroup_freezer(pos); 400 struct freezer *pos_f = css_freezer(pos);
413 struct freezer *parent = parent_freezer(pos_f); 401 struct freezer *parent = parent_freezer(pos_f);
414 402
415 /*
416 * Our update to @parent->state is already visible which is
417 * all we need. No need to lock @parent. For more info on
418 * synchronization, see freezer_post_create().
419 */
420 spin_lock_irq(&pos_f->lock); 403 spin_lock_irq(&pos_f->lock);
421 freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, 404
422 CGROUP_FREEZING_PARENT); 405 if (pos_f == freezer) {
406 freezer_apply_state(pos_f, freeze,
407 CGROUP_FREEZING_SELF);
408 } else {
409 /*
410 * Our update to @parent->state is already visible
411 * which is all we need. No need to lock @parent.
412 * For more info on synchronization, see
413 * freezer_post_create().
414 */
415 freezer_apply_state(pos_f,
416 parent->state & CGROUP_FREEZING,
417 CGROUP_FREEZING_PARENT);
418 }
419
423 spin_unlock_irq(&pos_f->lock); 420 spin_unlock_irq(&pos_f->lock);
424 } 421 }
425 rcu_read_unlock(); 422 rcu_read_unlock();
426} 423}
427 424
428static int freezer_write(struct cgroup *cgroup, struct cftype *cft, 425static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
429 const char *buffer) 426 const char *buffer)
430{ 427{
431 bool freeze; 428 bool freeze;
@@ -437,20 +434,22 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
437 else 434 else
438 return -EINVAL; 435 return -EINVAL;
439 436
440 freezer_change_state(cgroup_freezer(cgroup), freeze); 437 freezer_change_state(css_freezer(css), freeze);
441 return 0; 438 return 0;
442} 439}
443 440
444static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) 441static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
442 struct cftype *cft)
445{ 443{
446 struct freezer *freezer = cgroup_freezer(cgroup); 444 struct freezer *freezer = css_freezer(css);
447 445
448 return (bool)(freezer->state & CGROUP_FREEZING_SELF); 446 return (bool)(freezer->state & CGROUP_FREEZING_SELF);
449} 447}
450 448
451static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) 449static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
450 struct cftype *cft)
452{ 451{
453 struct freezer *freezer = cgroup_freezer(cgroup); 452 struct freezer *freezer = css_freezer(css);
454 453
455 return (bool)(freezer->state & CGROUP_FREEZING_PARENT); 454 return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
456} 455}
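
The freezer conversion above is also a compact tour of the new iteration API: css_for_each_child() and css_for_each_descendant_{pre,post}() walk css hierarchies under rcu_read_lock(), while css_task_iter_start()/css_task_iter_next()/css_task_iter_end() replace the old cgroup_iter_* task walk. As a rough illustration of the task iterator only, with a hypothetical helper name:

/* Count the tasks attached to @css using the iterator API shown above. */
static int count_css_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	int n = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		n++;
	css_task_iter_end(&it);

	return n;
}
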
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 383f8231e436..247091bf0587 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -20,22 +20,33 @@
20#include <linux/hardirq.h> 20#include <linux/hardirq.h>
21#include <linux/export.h> 21#include <linux/export.h>
22 22
23DEFINE_PER_CPU(struct context_tracking, context_tracking) = { 23#define CREATE_TRACE_POINTS
24#ifdef CONFIG_CONTEXT_TRACKING_FORCE 24#include <trace/events/context_tracking.h>
25 .active = true, 25
26#endif 26struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;
27}; 27EXPORT_SYMBOL_GPL(context_tracking_enabled);
28
29DEFINE_PER_CPU(struct context_tracking, context_tracking);
30EXPORT_SYMBOL_GPL(context_tracking);
31
32void context_tracking_cpu_set(int cpu)
33{
34 if (!per_cpu(context_tracking.active, cpu)) {
35 per_cpu(context_tracking.active, cpu) = true;
36 static_key_slow_inc(&context_tracking_enabled);
37 }
38}
28 39
29/** 40/**
30 * user_enter - Inform the context tracking that the CPU is going to 41 * context_tracking_user_enter - Inform the context tracking that the CPU is going to
31 * enter userspace mode. 42 * enter userspace mode.
32 * 43 *
33 * This function must be called right before we switch from the kernel 44 * This function must be called right before we switch from the kernel
34 * to userspace, when it's guaranteed the remaining kernel instructions 45 * to userspace, when it's guaranteed the remaining kernel instructions
35 * to execute won't use any RCU read side critical section because this 46 * to execute won't use any RCU read side critical section because this
36 * function sets RCU in extended quiescent state. 47 * function sets RCU in extended quiescent state.
37 */ 48 */
38void user_enter(void) 49void context_tracking_user_enter(void)
39{ 50{
40 unsigned long flags; 51 unsigned long flags;
41 52
@@ -54,17 +65,32 @@ void user_enter(void)
54 WARN_ON_ONCE(!current->mm); 65 WARN_ON_ONCE(!current->mm);
55 66
56 local_irq_save(flags); 67 local_irq_save(flags);
 57 	if (__this_cpu_read(context_tracking.active) && 68 	if (__this_cpu_read(context_tracking.state) != IN_USER) {
58 __this_cpu_read(context_tracking.state) != IN_USER) { 69 if (__this_cpu_read(context_tracking.active)) {
70 trace_user_enter(0);
71 /*
72 * At this stage, only low level arch entry code remains and
73 * then we'll run in userspace. We can assume there won't be
74 * any RCU read-side critical section until the next call to
75 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
76 * on the tick.
77 */
78 vtime_user_enter(current);
79 rcu_user_enter();
80 }
59 /* 81 /*
60 * At this stage, only low level arch entry code remains and 82 * Even if context tracking is disabled on this CPU, because it's outside
61 * then we'll run in userspace. We can assume there won't be 83 * the full dynticks mask for example, we still have to keep track of the
62 * any RCU read-side critical section until the next call to 84 * context transitions and states to prevent inconsistency on those of
63 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency 85 * other CPUs.
 64	 * on the tick. 86	 * If a task triggers an exception in userspace, sleeps in the exception
 87	 * handler and then migrates to another CPU, that new CPU must know where
88 * the exception returns by the time we call exception_exit().
89 * This information can only be provided by the previous CPU when it called
90 * exception_enter().
91 * OTOH we can spare the calls to vtime and RCU when context_tracking.active
92 * is false because we know that CPU is not tickless.
65 */ 93 */
66 vtime_user_enter(current);
67 rcu_user_enter();
68 __this_cpu_write(context_tracking.state, IN_USER); 94 __this_cpu_write(context_tracking.state, IN_USER);
69 } 95 }
70 local_irq_restore(flags); 96 local_irq_restore(flags);
@@ -87,10 +113,9 @@ void user_enter(void)
87 */ 113 */
88void __sched notrace preempt_schedule_context(void) 114void __sched notrace preempt_schedule_context(void)
89{ 115{
90 struct thread_info *ti = current_thread_info();
91 enum ctx_state prev_ctx; 116 enum ctx_state prev_ctx;
92 117
93 if (likely(ti->preempt_count || irqs_disabled())) 118 if (likely(!preemptible()))
94 return; 119 return;
95 120
96 /* 121 /*
@@ -112,8 +137,8 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context);
112#endif /* CONFIG_PREEMPT */ 137#endif /* CONFIG_PREEMPT */
113 138
114/** 139/**
115 * user_exit - Inform the context tracking that the CPU is 140 * context_tracking_user_exit - Inform the context tracking that the CPU is
116 * exiting userspace mode and entering the kernel. 141 * exiting userspace mode and entering the kernel.
117 * 142 *
118 * This function must be called after we entered the kernel from userspace 143 * This function must be called after we entered the kernel from userspace
119 * before any use of RCU read side critical section. This potentially include 144 * before any use of RCU read side critical section. This potentially include
@@ -122,7 +147,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context);
122 * This call supports re-entrancy. This way it can be called from any exception 147 * This call supports re-entrancy. This way it can be called from any exception
123 * handler without needing to know if we came from userspace or not. 148 * handler without needing to know if we came from userspace or not.
124 */ 149 */
125void user_exit(void) 150void context_tracking_user_exit(void)
126{ 151{
127 unsigned long flags; 152 unsigned long flags;
128 153
@@ -131,38 +156,22 @@ void user_exit(void)
131 156
132 local_irq_save(flags); 157 local_irq_save(flags);
133 if (__this_cpu_read(context_tracking.state) == IN_USER) { 158 if (__this_cpu_read(context_tracking.state) == IN_USER) {
134 /* 159 if (__this_cpu_read(context_tracking.active)) {
135 * We are going to run code that may use RCU. Inform 160 /*
136 * RCU core about that (ie: we may need the tick again). 161 * We are going to run code that may use RCU. Inform
137 */ 162 * RCU core about that (ie: we may need the tick again).
138 rcu_user_exit(); 163 */
139 vtime_user_exit(current); 164 rcu_user_exit();
165 vtime_user_exit(current);
166 trace_user_exit(0);
167 }
140 __this_cpu_write(context_tracking.state, IN_KERNEL); 168 __this_cpu_write(context_tracking.state, IN_KERNEL);
141 } 169 }
142 local_irq_restore(flags); 170 local_irq_restore(flags);
143} 171}
144 172
145void guest_enter(void)
146{
147 if (vtime_accounting_enabled())
148 vtime_guest_enter(current);
149 else
150 __guest_enter();
151}
152EXPORT_SYMBOL_GPL(guest_enter);
153
154void guest_exit(void)
155{
156 if (vtime_accounting_enabled())
157 vtime_guest_exit(current);
158 else
159 __guest_exit();
160}
161EXPORT_SYMBOL_GPL(guest_exit);
162
163
164/** 173/**
165 * context_tracking_task_switch - context switch the syscall callbacks 174 * __context_tracking_task_switch - context switch the syscall callbacks
166 * @prev: the task that is being switched out 175 * @prev: the task that is being switched out
167 * @next: the task that is being switched in 176 * @next: the task that is being switched in
168 * 177 *
@@ -174,11 +183,19 @@ EXPORT_SYMBOL_GPL(guest_exit);
174 * migrate to some CPU that doesn't do the context tracking. As such the TIF 183 * migrate to some CPU that doesn't do the context tracking. As such the TIF
175 * flag may not be desired there. 184 * flag may not be desired there.
176 */ 185 */
177void context_tracking_task_switch(struct task_struct *prev, 186void __context_tracking_task_switch(struct task_struct *prev,
178 struct task_struct *next) 187 struct task_struct *next)
179{ 188{
180 if (__this_cpu_read(context_tracking.active)) { 189 clear_tsk_thread_flag(prev, TIF_NOHZ);
181 clear_tsk_thread_flag(prev, TIF_NOHZ); 190 set_tsk_thread_flag(next, TIF_NOHZ);
182 set_tsk_thread_flag(next, TIF_NOHZ);
183 }
184} 191}
192
193#ifdef CONFIG_CONTEXT_TRACKING_FORCE
194void __init context_tracking_init(void)
195{
196 int cpu;
197
198 for_each_possible_cpu(cpu)
199 context_tracking_cpu_set(cpu);
200}
201#endif
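
The context tracking rework above gates the subsystem behind a global static key: context_tracking_cpu_set() marks a CPU active and does static_key_slow_inc() the first time, so the (presumably inlined) entry hooks stay patched-out no-ops until at least one CPU opts in. The fast-path check itself lives in a header outside this hunk; a generic sketch of the same jump-label pattern, with invented names:

#include <linux/jump_label.h>
#include <linux/printk.h>

static struct static_key demo_hook_enabled = STATIC_KEY_INIT_FALSE;

void demo_hook_enable(void)
{
	static_key_slow_inc(&demo_hook_enabled);	/* patch the branch in */
}

/* Cheap when disabled: the branch below stays a NOP until the key is bumped. */
static inline void demo_hook(void)
{
	if (static_key_false(&demo_hook_enabled))
		pr_debug("demo hook ran\n");
}
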
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b2b227b82123..d7f07a2da5a6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
113 * get_online_cpus() not an api which is called all that often. 113 * get_online_cpus() not an api which is called all that often.
114 * 114 *
115 */ 115 */
116static void cpu_hotplug_begin(void) 116void cpu_hotplug_begin(void)
117{ 117{
118 cpu_hotplug.active_writer = current; 118 cpu_hotplug.active_writer = current;
119 119
@@ -127,7 +127,7 @@ static void cpu_hotplug_begin(void)
127 } 127 }
128} 128}
129 129
130static void cpu_hotplug_done(void) 130void cpu_hotplug_done(void)
131{ 131{
132 cpu_hotplug.active_writer = NULL; 132 cpu_hotplug.active_writer = NULL;
133 mutex_unlock(&cpu_hotplug.lock); 133 mutex_unlock(&cpu_hotplug.lock);
@@ -154,10 +154,7 @@ void cpu_hotplug_enable(void)
154 cpu_maps_update_done(); 154 cpu_maps_update_done();
155} 155}
156 156
157#else /* #if CONFIG_HOTPLUG_CPU */ 157#endif /* CONFIG_HOTPLUG_CPU */
158static void cpu_hotplug_begin(void) {}
159static void cpu_hotplug_done(void) {}
160#endif /* #else #if CONFIG_HOTPLUG_CPU */
161 158
162/* Need to know about CPUs going up/down? */ 159/* Need to know about CPUs going up/down? */
163int __ref register_cpu_notifier(struct notifier_block *nb) 160int __ref register_cpu_notifier(struct notifier_block *nb)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ea1966db34f2..6bf981e13c43 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -68,10 +68,6 @@
68 */ 68 */
69int number_of_cpusets __read_mostly; 69int number_of_cpusets __read_mostly;
70 70
71/* Forward declare cgroup structures */
72struct cgroup_subsys cpuset_subsys;
73struct cpuset;
74
75/* See "Frequency meter" comments, below. */ 71/* See "Frequency meter" comments, below. */
76 72
77struct fmeter { 73struct fmeter {
@@ -115,27 +111,20 @@ struct cpuset {
115 int relax_domain_level; 111 int relax_domain_level;
116}; 112};
117 113
118/* Retrieve the cpuset for a cgroup */ 114static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
119static inline struct cpuset *cgroup_cs(struct cgroup *cgrp)
120{ 115{
121 return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), 116 return css ? container_of(css, struct cpuset, css) : NULL;
122 struct cpuset, css);
123} 117}
124 118
125/* Retrieve the cpuset for a task */ 119/* Retrieve the cpuset for a task */
126static inline struct cpuset *task_cs(struct task_struct *task) 120static inline struct cpuset *task_cs(struct task_struct *task)
127{ 121{
128 return container_of(task_subsys_state(task, cpuset_subsys_id), 122 return css_cs(task_css(task, cpuset_subsys_id));
129 struct cpuset, css);
130} 123}
131 124
132static inline struct cpuset *parent_cs(const struct cpuset *cs) 125static inline struct cpuset *parent_cs(struct cpuset *cs)
133{ 126{
134 struct cgroup *pcgrp = cs->css.cgroup->parent; 127 return css_cs(css_parent(&cs->css));
135
136 if (pcgrp)
137 return cgroup_cs(pcgrp);
138 return NULL;
139} 128}
140 129
141#ifdef CONFIG_NUMA 130#ifdef CONFIG_NUMA
@@ -212,29 +201,30 @@ static struct cpuset top_cpuset = {
212/** 201/**
213 * cpuset_for_each_child - traverse online children of a cpuset 202 * cpuset_for_each_child - traverse online children of a cpuset
214 * @child_cs: loop cursor pointing to the current child 203 * @child_cs: loop cursor pointing to the current child
215 * @pos_cgrp: used for iteration 204 * @pos_css: used for iteration
216 * @parent_cs: target cpuset to walk children of 205 * @parent_cs: target cpuset to walk children of
217 * 206 *
218 * Walk @child_cs through the online children of @parent_cs. Must be used 207 * Walk @child_cs through the online children of @parent_cs. Must be used
219 * with RCU read locked. 208 * with RCU read locked.
220 */ 209 */
221#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ 210#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
222 cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ 211 css_for_each_child((pos_css), &(parent_cs)->css) \
223 if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) 212 if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
224 213
225/** 214/**
226 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants 215 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
227 * @des_cs: loop cursor pointing to the current descendant 216 * @des_cs: loop cursor pointing to the current descendant
228 * @pos_cgrp: used for iteration 217 * @pos_css: used for iteration
229 * @root_cs: target cpuset to walk ancestor of 218 * @root_cs: target cpuset to walk ancestor of
230 * 219 *
231 * Walk @des_cs through the online descendants of @root_cs. Must be used 220 * Walk @des_cs through the online descendants of @root_cs. Must be used
232 * with RCU read locked. The caller may modify @pos_cgrp by calling 221 * with RCU read locked. The caller may modify @pos_css by calling
233 * cgroup_rightmost_descendant() to skip subtree. 222 * css_rightmost_descendant() to skip subtree. @root_cs is included in the
223 * iteration and the first node to be visited.
234 */ 224 */
235#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ 225#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
236 cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ 226 css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
237 if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) 227 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
238 228
239/* 229/*
240 * There are two global mutexes guarding cpuset structures - cpuset_mutex 230 * There are two global mutexes guarding cpuset structures - cpuset_mutex
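Note the semantic change the updated comment spells out: css_for_each_descendant_pre() now yields @root_cs itself as the first position, so each converted walker below either wants that (the hotplug path) or skips it with an explicit "if (cp == root_cs) continue;". A toy pre-order walk showing the same convention (hypothetical names, not the kernel iterator):

#include <stdio.h>

struct node {
	const char *name;
	struct node *child[2];	/* tiny fixed fan-out for the sketch */
};

/* pre-order walk that, like css_for_each_descendant_pre(), includes the root */
static void walk_pre(struct node *n, void (*fn)(struct node *, void *), void *data)
{
	if (!n)
		return;
	fn(n, data);
	for (int i = 0; i < 2; i++)
		walk_pre(n->child[i], fn, data);
}

static void print_non_root(struct node *n, void *data)
{
	struct node *root = data;

	if (n == root)		/* callers wanting only proper descendants skip it */
		return;
	printf("%s\n", n->name);
}

int main(void)
{
	struct node b = { "B", { NULL, NULL } };
	struct node c = { "C", { NULL, NULL } };
	struct node a = { "A", { &b, &c } };

	walk_pre(&a, print_non_root, &a);	/* prints B then C, not A */
	return 0;
}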
@@ -320,8 +310,7 @@ static struct file_system_type cpuset_fs_type = {
320 * 310 *
321 * Call with callback_mutex held. 311 * Call with callback_mutex held.
322 */ 312 */
323static void guarantee_online_cpus(const struct cpuset *cs, 313static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
324 struct cpumask *pmask)
325{ 314{
326 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 315 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
327 cs = parent_cs(cs); 316 cs = parent_cs(cs);
@@ -339,7 +328,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
339 * 328 *
340 * Call with callback_mutex held. 329 * Call with callback_mutex held.
341 */ 330 */
342static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 331static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
343{ 332{
344 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) 333 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
345 cs = parent_cs(cs); 334 cs = parent_cs(cs);
@@ -384,7 +373,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
384 * alloc_trial_cpuset - allocate a trial cpuset 373 * alloc_trial_cpuset - allocate a trial cpuset
385 * @cs: the cpuset that the trial cpuset duplicates 374 * @cs: the cpuset that the trial cpuset duplicates
386 */ 375 */
387static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) 376static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
388{ 377{
389 struct cpuset *trial; 378 struct cpuset *trial;
390 379
@@ -431,9 +420,9 @@ static void free_trial_cpuset(struct cpuset *trial)
431 * Return 0 if valid, -errno if not. 420 * Return 0 if valid, -errno if not.
432 */ 421 */
433 422
434static int validate_change(const struct cpuset *cur, const struct cpuset *trial) 423static int validate_change(struct cpuset *cur, struct cpuset *trial)
435{ 424{
436 struct cgroup *cgrp; 425 struct cgroup_subsys_state *css;
437 struct cpuset *c, *par; 426 struct cpuset *c, *par;
438 int ret; 427 int ret;
439 428
@@ -441,7 +430,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
441 430
442 /* Each of our child cpusets must be a subset of us */ 431 /* Each of our child cpusets must be a subset of us */
443 ret = -EBUSY; 432 ret = -EBUSY;
444 cpuset_for_each_child(c, cgrp, cur) 433 cpuset_for_each_child(c, css, cur)
445 if (!is_cpuset_subset(c, trial)) 434 if (!is_cpuset_subset(c, trial))
446 goto out; 435 goto out;
447 436
@@ -462,7 +451,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
462 * overlap 451 * overlap
463 */ 452 */
464 ret = -EINVAL; 453 ret = -EINVAL;
465 cpuset_for_each_child(c, cgrp, par) { 454 cpuset_for_each_child(c, css, par) {
466 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 455 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
467 c != cur && 456 c != cur &&
468 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 457 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -515,13 +504,16 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
515 struct cpuset *root_cs) 504 struct cpuset *root_cs)
516{ 505{
517 struct cpuset *cp; 506 struct cpuset *cp;
518 struct cgroup *pos_cgrp; 507 struct cgroup_subsys_state *pos_css;
519 508
520 rcu_read_lock(); 509 rcu_read_lock();
521 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 510 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
511 if (cp == root_cs)
512 continue;
513
522 /* skip the whole subtree if @cp doesn't have any CPU */ 514 /* skip the whole subtree if @cp doesn't have any CPU */
523 if (cpumask_empty(cp->cpus_allowed)) { 515 if (cpumask_empty(cp->cpus_allowed)) {
524 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 516 pos_css = css_rightmost_descendant(pos_css);
525 continue; 517 continue;
526 } 518 }
527 519
@@ -596,7 +588,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
596 struct sched_domain_attr *dattr; /* attributes for custom domains */ 588 struct sched_domain_attr *dattr; /* attributes for custom domains */
597 int ndoms = 0; /* number of sched domains in result */ 589 int ndoms = 0; /* number of sched domains in result */
598 int nslot; /* next empty doms[] struct cpumask slot */ 590 int nslot; /* next empty doms[] struct cpumask slot */
599 struct cgroup *pos_cgrp; 591 struct cgroup_subsys_state *pos_css;
600 592
601 doms = NULL; 593 doms = NULL;
602 dattr = NULL; 594 dattr = NULL;
@@ -625,7 +617,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
625 csn = 0; 617 csn = 0;
626 618
627 rcu_read_lock(); 619 rcu_read_lock();
628 cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { 620 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
621 if (cp == &top_cpuset)
622 continue;
629 /* 623 /*
630 * Continue traversing beyond @cp iff @cp has some CPUs and 624 * Continue traversing beyond @cp iff @cp has some CPUs and
631 * isn't load balancing. The former is obvious. The 625 * isn't load balancing. The former is obvious. The
@@ -642,7 +636,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
642 csa[csn++] = cp; 636 csa[csn++] = cp;
643 637
644 /* skip @cp's subtree */ 638 /* skip @cp's subtree */
645 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 639 pos_css = css_rightmost_descendant(pos_css);
646 } 640 }
647 rcu_read_unlock(); 641 rcu_read_unlock();
648 642
@@ -837,52 +831,45 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
837/** 831/**
838 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's 832 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
839 * @tsk: task to test 833 * @tsk: task to test
840 * @scan: struct cgroup_scanner containing the cgroup of the task 834 * @data: cpuset to @tsk belongs to
841 * 835 *
842 * Called by cgroup_scan_tasks() for each task in a cgroup whose 836 * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed
843 * cpus_allowed mask needs to be changed. 837 * mask needs to be changed.
844 * 838 *
845 * We don't need to re-check for the cgroup/cpuset membership, since we're 839 * We don't need to re-check for the cgroup/cpuset membership, since we're
846 * holding cpuset_mutex at this point. 840 * holding cpuset_mutex at this point.
847 */ 841 */
848static void cpuset_change_cpumask(struct task_struct *tsk, 842static void cpuset_change_cpumask(struct task_struct *tsk, void *data)
849 struct cgroup_scanner *scan)
850{ 843{
851 struct cpuset *cpus_cs; 844 struct cpuset *cs = data;
845 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
852 846
853 cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg));
854 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); 847 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
855} 848}
856 849
857/** 850/**
858 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 851 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
859 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 852 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
860 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 853 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
861 * 854 *
862 * Called with cpuset_mutex held 855 * Called with cpuset_mutex held
863 * 856 *
864 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 857 * The css_scan_tasks() function will scan all the tasks in a cgroup,
865 * calling callback functions for each. 858 * calling callback functions for each.
866 * 859 *
867 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 860 * No return value. It's guaranteed that css_scan_tasks() always returns 0
868 * if @heap != NULL. 861 * if @heap != NULL.
869 */ 862 */
870static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) 863static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
871{ 864{
872 struct cgroup_scanner scan; 865 css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap);
873
874 scan.cg = cs->css.cgroup;
875 scan.test_task = NULL;
876 scan.process_task = cpuset_change_cpumask;
877 scan.heap = heap;
878 cgroup_scan_tasks(&scan);
879} 866}
880 867
881/* 868/*
882 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. 869 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
883 * @root_cs: the root cpuset of the hierarchy 870 * @root_cs: the root cpuset of the hierarchy
884 * @update_root: update root cpuset or not? 871 * @update_root: update root cpuset or not?
885 * @heap: the heap used by cgroup_scan_tasks() 872 * @heap: the heap used by css_scan_tasks()
886 * 873 *
887 * This will update cpumasks of tasks in @root_cs and all other empty cpusets 874 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
888 * which take on cpumask of @root_cs. 875 * which take on cpumask of @root_cs.
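struct cgroup_scanner is gone: css_scan_tasks() takes the css, an optional test callback, the process callback, an opaque data pointer and the heap as plain arguments, and per-task callbacks such as cpuset_change_cpumask() recover their context by casting that pointer. A compilable sketch of the same callback shape, with invented names standing in for the kernel API:

#include <stdbool.h>
#include <stdio.h>

struct task { int id; int cpu; };

typedef bool (*test_fn)(struct task *t, void *data);
typedef void (*process_fn)(struct task *t, void *data);

/* shape of css_scan_tasks(): test filter, process callback, opaque data */
static void scan_tasks(struct task *tasks, int n,
		       test_fn test, process_fn process, void *data)
{
	for (int i = 0; i < n; i++)
		if (!test || test(&tasks[i], data))
			process(&tasks[i], data);
}

/* analogue of cpuset_change_cpumask(): context arrives through data */
static void set_cpu(struct task *t, void *data)
{
	t->cpu = *(int *)data;
	printf("task %d -> cpu %d\n", t->id, t->cpu);
}

int main(void)
{
	struct task tasks[] = { { 1, 0 }, { 2, 3 } };
	int target = 5;

	scan_tasks(tasks, 2, NULL, set_cpu, &target);	/* NULL test: every task */
	return 0;
}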
@@ -893,17 +880,19 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
893 bool update_root, struct ptr_heap *heap) 880 bool update_root, struct ptr_heap *heap)
894{ 881{
895 struct cpuset *cp; 882 struct cpuset *cp;
896 struct cgroup *pos_cgrp; 883 struct cgroup_subsys_state *pos_css;
897
898 if (update_root)
899 update_tasks_cpumask(root_cs, heap);
900 884
901 rcu_read_lock(); 885 rcu_read_lock();
902 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 886 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
903 /* skip the whole subtree if @cp have some CPU */ 887 if (cp == root_cs) {
904 if (!cpumask_empty(cp->cpus_allowed)) { 888 if (!update_root)
905 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 889 continue;
906 continue; 890 } else {
891 /* skip the whole subtree if @cp have some CPU */
892 if (!cpumask_empty(cp->cpus_allowed)) {
893 pos_css = css_rightmost_descendant(pos_css);
894 continue;
895 }
907 } 896 }
908 if (!css_tryget(&cp->css)) 897 if (!css_tryget(&cp->css))
909 continue; 898 continue;
@@ -1059,20 +1048,24 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1059 task_unlock(tsk); 1048 task_unlock(tsk);
1060} 1049}
1061 1050
1051struct cpuset_change_nodemask_arg {
1052 struct cpuset *cs;
1053 nodemask_t *newmems;
1054};
1055
1062/* 1056/*
1063 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy 1057 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1064 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if 1058 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1065 * memory_migrate flag is set. Called with cpuset_mutex held. 1059 * memory_migrate flag is set. Called with cpuset_mutex held.
1066 */ 1060 */
1067static void cpuset_change_nodemask(struct task_struct *p, 1061static void cpuset_change_nodemask(struct task_struct *p, void *data)
1068 struct cgroup_scanner *scan)
1069{ 1062{
1070 struct cpuset *cs = cgroup_cs(scan->cg); 1063 struct cpuset_change_nodemask_arg *arg = data;
1064 struct cpuset *cs = arg->cs;
1071 struct mm_struct *mm; 1065 struct mm_struct *mm;
1072 int migrate; 1066 int migrate;
1073 nodemask_t *newmems = scan->data;
1074 1067
1075 cpuset_change_task_nodemask(p, newmems); 1068 cpuset_change_task_nodemask(p, arg->newmems);
1076 1069
1077 mm = get_task_mm(p); 1070 mm = get_task_mm(p);
1078 if (!mm) 1071 if (!mm)
@@ -1082,7 +1075,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1082 1075
1083 mpol_rebind_mm(mm, &cs->mems_allowed); 1076 mpol_rebind_mm(mm, &cs->mems_allowed);
1084 if (migrate) 1077 if (migrate)
1085 cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); 1078 cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems);
1086 mmput(mm); 1079 mmput(mm);
1087} 1080}
1088 1081
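Where the old scanner could smuggle two pieces of context (scan->cg and scan->data), the single void * now points at a small on-stack cpuset_change_nodemask_arg bundling the cpuset and the nodemask. The same bundling trick as a hedged userspace sketch:

#include <stdio.h>

struct task { int id; unsigned long mems; };

/* analogue of cpuset_change_nodemask_arg: several parameters behind one void * */
struct change_arg {
	int owner;		/* stands in for the cpuset pointer */
	unsigned long *newmems;	/* stands in for the nodemask pointer */
};

static void change_nodemask(struct task *t, void *data)
{
	struct change_arg *arg = data;

	t->mems = *arg->newmems;
	printf("task %d (owner %d) -> mems %#lx\n", t->id, arg->owner, t->mems);
}

int main(void)
{
	struct task t = { 7, 0 };
	unsigned long newmems = 0x3;
	struct change_arg arg = { .owner = 1, .newmems = &newmems };

	change_nodemask(&t, &arg);	/* one pointer, two pieces of context */
	return 0;
}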
@@ -1091,28 +1084,22 @@ static void *cpuset_being_rebound;
1091/** 1084/**
1092 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1085 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1093 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1086 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1094 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1087 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1095 * 1088 *
1096 * Called with cpuset_mutex held 1089 * Called with cpuset_mutex held. No return value. It's guaranteed that
1097 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1090 * css_scan_tasks() always returns 0 if @heap != NULL.
1098 * if @heap != NULL.
1099 */ 1091 */
1100static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) 1092static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1101{ 1093{
1102 static nodemask_t newmems; /* protected by cpuset_mutex */ 1094 static nodemask_t newmems; /* protected by cpuset_mutex */
1103 struct cgroup_scanner scan;
1104 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1095 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1096 struct cpuset_change_nodemask_arg arg = { .cs = cs,
1097 .newmems = &newmems };
1105 1098
1106 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1099 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1107 1100
1108 guarantee_online_mems(mems_cs, &newmems); 1101 guarantee_online_mems(mems_cs, &newmems);
1109 1102
1110 scan.cg = cs->css.cgroup;
1111 scan.test_task = NULL;
1112 scan.process_task = cpuset_change_nodemask;
1113 scan.heap = heap;
1114 scan.data = &newmems;
1115
1116 /* 1103 /*
1117 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't 1104 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
1118 * take while holding tasklist_lock. Forks can happen - the 1105 * take while holding tasklist_lock. Forks can happen - the
@@ -1123,7 +1110,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1123 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1110 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1124 * is idempotent. Also migrate pages in each mm to new nodes. 1111 * is idempotent. Also migrate pages in each mm to new nodes.
1125 */ 1112 */
1126 cgroup_scan_tasks(&scan); 1113 css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap);
1127 1114
1128 /* 1115 /*
1129 * All the tasks' nodemasks have been updated, update 1116 * All the tasks' nodemasks have been updated, update
@@ -1139,7 +1126,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1139 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. 1126 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
1140 * @cs: the root cpuset of the hierarchy 1127 * @cs: the root cpuset of the hierarchy
1141 * @update_root: update the root cpuset or not? 1128 * @update_root: update the root cpuset or not?
1142 * @heap: the heap used by cgroup_scan_tasks() 1129 * @heap: the heap used by css_scan_tasks()
1143 * 1130 *
1144 * This will update nodemasks of tasks in @root_cs and all other empty cpusets 1131 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1145 * which take on nodemask of @root_cs. 1132 * which take on nodemask of @root_cs.
@@ -1150,17 +1137,19 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1150 bool update_root, struct ptr_heap *heap) 1137 bool update_root, struct ptr_heap *heap)
1151{ 1138{
1152 struct cpuset *cp; 1139 struct cpuset *cp;
1153 struct cgroup *pos_cgrp; 1140 struct cgroup_subsys_state *pos_css;
1154
1155 if (update_root)
1156 update_tasks_nodemask(root_cs, heap);
1157 1141
1158 rcu_read_lock(); 1142 rcu_read_lock();
1159 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 1143 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
1160 /* skip the whole subtree if @cp have some CPU */ 1144 if (cp == root_cs) {
1161 if (!nodes_empty(cp->mems_allowed)) { 1145 if (!update_root)
1162 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 1146 continue;
1163 continue; 1147 } else {
1148 /* skip the whole subtree if @cp have some CPU */
1149 if (!nodes_empty(cp->mems_allowed)) {
1150 pos_css = css_rightmost_descendant(pos_css);
1151 continue;
1152 }
1164 } 1153 }
1165 if (!css_tryget(&cp->css)) 1154 if (!css_tryget(&cp->css))
1166 continue; 1155 continue;
@@ -1267,44 +1256,39 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1267 return 0; 1256 return 0;
1268} 1257}
1269 1258
1270/* 1259/**
1271 * cpuset_change_flag - make a task's spread flags the same as its cpuset's 1260 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1272 * @tsk: task to be updated 1261 * @tsk: task to be updated
1273 * @scan: struct cgroup_scanner containing the cgroup of the task 1262 * @data: cpuset to @tsk belongs to
1274 * 1263 *
1275 * Called by cgroup_scan_tasks() for each task in a cgroup. 1264 * Called by css_scan_tasks() for each task in a cgroup.
1276 * 1265 *
1277 * We don't need to re-check for the cgroup/cpuset membership, since we're 1266 * We don't need to re-check for the cgroup/cpuset membership, since we're
1278 * holding cpuset_mutex at this point. 1267 * holding cpuset_mutex at this point.
1279 */ 1268 */
1280static void cpuset_change_flag(struct task_struct *tsk, 1269static void cpuset_change_flag(struct task_struct *tsk, void *data)
1281 struct cgroup_scanner *scan)
1282{ 1270{
1283 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); 1271 struct cpuset *cs = data;
1272
1273 cpuset_update_task_spread_flag(cs, tsk);
1284} 1274}
1285 1275
1286/* 1276/**
1287 * update_tasks_flags - update the spread flags of tasks in the cpuset. 1277 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1288 * @cs: the cpuset in which each task's spread flags needs to be changed 1278 * @cs: the cpuset in which each task's spread flags needs to be changed
1289 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1279 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1290 * 1280 *
1291 * Called with cpuset_mutex held 1281 * Called with cpuset_mutex held
1292 * 1282 *
1293 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 1283 * The css_scan_tasks() function will scan all the tasks in a cgroup,
1294 * calling callback functions for each. 1284 * calling callback functions for each.
1295 * 1285 *
1296 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1286 * No return value. It's guaranteed that css_scan_tasks() always returns 0
1297 * if @heap != NULL. 1287 * if @heap != NULL.
1298 */ 1288 */
1299static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) 1289static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1300{ 1290{
1301 struct cgroup_scanner scan; 1291 css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap);
1302
1303 scan.cg = cs->css.cgroup;
1304 scan.test_task = NULL;
1305 scan.process_task = cpuset_change_flag;
1306 scan.heap = heap;
1307 cgroup_scan_tasks(&scan);
1308} 1292}
1309 1293
1310/* 1294/*
@@ -1462,9 +1446,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1462} 1446}
1463 1447
1464/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 1448/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1465static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1449static int cpuset_can_attach(struct cgroup_subsys_state *css,
1450 struct cgroup_taskset *tset)
1466{ 1451{
1467 struct cpuset *cs = cgroup_cs(cgrp); 1452 struct cpuset *cs = css_cs(css);
1468 struct task_struct *task; 1453 struct task_struct *task;
1469 int ret; 1454 int ret;
1470 1455
@@ -1475,11 +1460,11 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1475 * flag is set. 1460 * flag is set.
1476 */ 1461 */
1477 ret = -ENOSPC; 1462 ret = -ENOSPC;
1478 if (!cgroup_sane_behavior(cgrp) && 1463 if (!cgroup_sane_behavior(css->cgroup) &&
1479 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1464 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1480 goto out_unlock; 1465 goto out_unlock;
1481 1466
1482 cgroup_taskset_for_each(task, cgrp, tset) { 1467 cgroup_taskset_for_each(task, css, tset) {
1483 /* 1468 /*
1484 * Kthreads which disallow setaffinity shouldn't be moved 1469 * Kthreads which disallow setaffinity shouldn't be moved
1485 * to a new cpuset; we don't want to change their cpu 1470 * to a new cpuset; we don't want to change their cpu
@@ -1508,11 +1493,11 @@ out_unlock:
1508 return ret; 1493 return ret;
1509} 1494}
1510 1495
1511static void cpuset_cancel_attach(struct cgroup *cgrp, 1496static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
1512 struct cgroup_taskset *tset) 1497 struct cgroup_taskset *tset)
1513{ 1498{
1514 mutex_lock(&cpuset_mutex); 1499 mutex_lock(&cpuset_mutex);
1515 cgroup_cs(cgrp)->attach_in_progress--; 1500 css_cs(css)->attach_in_progress--;
1516 mutex_unlock(&cpuset_mutex); 1501 mutex_unlock(&cpuset_mutex);
1517} 1502}
1518 1503
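cpuset_cancel_attach() shrinking to a one-line decrement is a reminder of the three-step migration protocol these methods implement: can_attach() reserves (attach_in_progress is bumped there, outside this hunk), attach() commits, and cancel_attach() rolls a failed reservation back. A toy reserve/commit/rollback shape, with invented names and no locking:

#include <stdio.h>

struct group {
	int capacity;
	int members;
	int attach_in_progress;	/* reservations not yet committed */
};

/* reserve a slot; may refuse */
static int can_attach(struct group *g)
{
	if (g->members + g->attach_in_progress >= g->capacity)
		return -1;
	g->attach_in_progress++;
	return 0;
}

/* undo a reservation when a later step fails */
static void cancel_attach(struct group *g)
{
	g->attach_in_progress--;
}

/* turn the reservation into membership */
static void attach(struct group *g)
{
	g->attach_in_progress--;
	g->members++;
}

int main(void)
{
	struct group g = { .capacity = 1 };

	/* reserve, then pretend a later check failed and roll back */
	if (can_attach(&g) == 0)
		cancel_attach(&g);
	/* reserve again and commit for real */
	if (can_attach(&g) == 0)
		attach(&g);
	printf("members=%d in_progress=%d\n", g.members, g.attach_in_progress);
	return 0;
}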
@@ -1523,16 +1508,18 @@ static void cpuset_cancel_attach(struct cgroup *cgrp,
1523 */ 1508 */
1524static cpumask_var_t cpus_attach; 1509static cpumask_var_t cpus_attach;
1525 1510
1526static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1511static void cpuset_attach(struct cgroup_subsys_state *css,
1512 struct cgroup_taskset *tset)
1527{ 1513{
1528 /* static buf protected by cpuset_mutex */ 1514 /* static buf protected by cpuset_mutex */
1529 static nodemask_t cpuset_attach_nodemask_to; 1515 static nodemask_t cpuset_attach_nodemask_to;
1530 struct mm_struct *mm; 1516 struct mm_struct *mm;
1531 struct task_struct *task; 1517 struct task_struct *task;
1532 struct task_struct *leader = cgroup_taskset_first(tset); 1518 struct task_struct *leader = cgroup_taskset_first(tset);
1533 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); 1519 struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset,
1534 struct cpuset *cs = cgroup_cs(cgrp); 1520 cpuset_subsys_id);
1535 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1521 struct cpuset *cs = css_cs(css);
1522 struct cpuset *oldcs = css_cs(oldcss);
1536 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); 1523 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1537 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1524 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1538 1525
@@ -1546,7 +1533,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1546 1533
1547 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); 1534 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
1548 1535
1549 cgroup_taskset_for_each(task, cgrp, tset) { 1536 cgroup_taskset_for_each(task, css, tset) {
1550 /* 1537 /*
1551 * can_attach beforehand should guarantee that this doesn't 1538 * can_attach beforehand should guarantee that this doesn't
1552 * fail. TODO: have a better way to handle failure here 1539 * fail. TODO: have a better way to handle failure here
@@ -1608,9 +1595,10 @@ typedef enum {
1608 FILE_SPREAD_SLAB, 1595 FILE_SPREAD_SLAB,
1609} cpuset_filetype_t; 1596} cpuset_filetype_t;
1610 1597
1611static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1598static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
1599 u64 val)
1612{ 1600{
1613 struct cpuset *cs = cgroup_cs(cgrp); 1601 struct cpuset *cs = css_cs(css);
1614 cpuset_filetype_t type = cft->private; 1602 cpuset_filetype_t type = cft->private;
1615 int retval = 0; 1603 int retval = 0;
1616 1604
@@ -1657,9 +1645,10 @@ out_unlock:
1657 return retval; 1645 return retval;
1658} 1646}
1659 1647
1660static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) 1648static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
1649 s64 val)
1661{ 1650{
1662 struct cpuset *cs = cgroup_cs(cgrp); 1651 struct cpuset *cs = css_cs(css);
1663 cpuset_filetype_t type = cft->private; 1652 cpuset_filetype_t type = cft->private;
1664 int retval = -ENODEV; 1653 int retval = -ENODEV;
1665 1654
@@ -1683,10 +1672,10 @@ out_unlock:
1683/* 1672/*
1684 * Common handling for a write to a "cpus" or "mems" file. 1673 * Common handling for a write to a "cpus" or "mems" file.
1685 */ 1674 */
1686static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, 1675static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1687 const char *buf) 1676 struct cftype *cft, const char *buf)
1688{ 1677{
1689 struct cpuset *cs = cgroup_cs(cgrp); 1678 struct cpuset *cs = css_cs(css);
1690 struct cpuset *trialcs; 1679 struct cpuset *trialcs;
1691 int retval = -ENODEV; 1680 int retval = -ENODEV;
1692 1681
@@ -1765,13 +1754,12 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1765 return count; 1754 return count;
1766} 1755}
1767 1756
1768static ssize_t cpuset_common_file_read(struct cgroup *cgrp, 1757static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css,
1769 struct cftype *cft, 1758 struct cftype *cft, struct file *file,
1770 struct file *file, 1759 char __user *buf, size_t nbytes,
1771 char __user *buf, 1760 loff_t *ppos)
1772 size_t nbytes, loff_t *ppos)
1773{ 1761{
1774 struct cpuset *cs = cgroup_cs(cgrp); 1762 struct cpuset *cs = css_cs(css);
1775 cpuset_filetype_t type = cft->private; 1763 cpuset_filetype_t type = cft->private;
1776 char *page; 1764 char *page;
1777 ssize_t retval = 0; 1765 ssize_t retval = 0;
@@ -1801,9 +1789,9 @@ out:
1801 return retval; 1789 return retval;
1802} 1790}
1803 1791
1804static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) 1792static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
1805{ 1793{
1806 struct cpuset *cs = cgroup_cs(cgrp); 1794 struct cpuset *cs = css_cs(css);
1807 cpuset_filetype_t type = cft->private; 1795 cpuset_filetype_t type = cft->private;
1808 switch (type) { 1796 switch (type) {
1809 case FILE_CPU_EXCLUSIVE: 1797 case FILE_CPU_EXCLUSIVE:
@@ -1832,9 +1820,9 @@ static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft)
1832 return 0; 1820 return 0;
1833} 1821}
1834 1822
1835static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) 1823static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
1836{ 1824{
1837 struct cpuset *cs = cgroup_cs(cgrp); 1825 struct cpuset *cs = css_cs(css);
1838 cpuset_filetype_t type = cft->private; 1826 cpuset_filetype_t type = cft->private;
1839 switch (type) { 1827 switch (type) {
1840 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1828 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1949,11 +1937,12 @@ static struct cftype files[] = {
1949 * cgrp: control group that the new cpuset will be part of 1937 * cgrp: control group that the new cpuset will be part of
1950 */ 1938 */
1951 1939
1952static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) 1940static struct cgroup_subsys_state *
1941cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1953{ 1942{
1954 struct cpuset *cs; 1943 struct cpuset *cs;
1955 1944
1956 if (!cgrp->parent) 1945 if (!parent_css)
1957 return &top_cpuset.css; 1946 return &top_cpuset.css;
1958 1947
1959 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1948 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
@@ -1973,12 +1962,12 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp)
1973 return &cs->css; 1962 return &cs->css;
1974} 1963}
1975 1964
1976static int cpuset_css_online(struct cgroup *cgrp) 1965static int cpuset_css_online(struct cgroup_subsys_state *css)
1977{ 1966{
1978 struct cpuset *cs = cgroup_cs(cgrp); 1967 struct cpuset *cs = css_cs(css);
1979 struct cpuset *parent = parent_cs(cs); 1968 struct cpuset *parent = parent_cs(cs);
1980 struct cpuset *tmp_cs; 1969 struct cpuset *tmp_cs;
1981 struct cgroup *pos_cg; 1970 struct cgroup_subsys_state *pos_css;
1982 1971
1983 if (!parent) 1972 if (!parent)
1984 return 0; 1973 return 0;
@@ -1993,7 +1982,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
1993 1982
1994 number_of_cpusets++; 1983 number_of_cpusets++;
1995 1984
1996 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) 1985 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1997 goto out_unlock; 1986 goto out_unlock;
1998 1987
1999 /* 1988 /*
@@ -2010,7 +1999,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
2010 * (and likewise for mems) to the new cgroup. 1999 * (and likewise for mems) to the new cgroup.
2011 */ 2000 */
2012 rcu_read_lock(); 2001 rcu_read_lock();
2013 cpuset_for_each_child(tmp_cs, pos_cg, parent) { 2002 cpuset_for_each_child(tmp_cs, pos_css, parent) {
2014 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { 2003 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
2015 rcu_read_unlock(); 2004 rcu_read_unlock();
2016 goto out_unlock; 2005 goto out_unlock;
@@ -2027,9 +2016,15 @@ out_unlock:
2027 return 0; 2016 return 0;
2028} 2017}
2029 2018
2030static void cpuset_css_offline(struct cgroup *cgrp) 2019/*
2020 * If the cpuset being removed has its flag 'sched_load_balance'
2021 * enabled, then simulate turning sched_load_balance off, which
2022 * will call rebuild_sched_domains_locked().
2023 */
2024
2025static void cpuset_css_offline(struct cgroup_subsys_state *css)
2031{ 2026{
2032 struct cpuset *cs = cgroup_cs(cgrp); 2027 struct cpuset *cs = css_cs(css);
2033 2028
2034 mutex_lock(&cpuset_mutex); 2029 mutex_lock(&cpuset_mutex);
2035 2030
@@ -2042,15 +2037,9 @@ static void cpuset_css_offline(struct cgroup *cgrp)
2042 mutex_unlock(&cpuset_mutex); 2037 mutex_unlock(&cpuset_mutex);
2043} 2038}
2044 2039
2045/* 2040static void cpuset_css_free(struct cgroup_subsys_state *css)
2046 * If the cpuset being removed has its flag 'sched_load_balance'
2047 * enabled, then simulate turning sched_load_balance off, which
2048 * will call rebuild_sched_domains_locked().
2049 */
2050
2051static void cpuset_css_free(struct cgroup *cgrp)
2052{ 2041{
2053 struct cpuset *cs = cgroup_cs(cgrp); 2042 struct cpuset *cs = css_cs(css);
2054 2043
2055 free_cpumask_var(cs->cpus_allowed); 2044 free_cpumask_var(cs->cpus_allowed);
2056 kfree(cs); 2045 kfree(cs);
@@ -2257,11 +2246,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2257 /* if cpus or mems changed, we need to propagate to descendants */ 2246 /* if cpus or mems changed, we need to propagate to descendants */
2258 if (cpus_updated || mems_updated) { 2247 if (cpus_updated || mems_updated) {
2259 struct cpuset *cs; 2248 struct cpuset *cs;
2260 struct cgroup *pos_cgrp; 2249 struct cgroup_subsys_state *pos_css;
2261 2250
2262 rcu_read_lock(); 2251 rcu_read_lock();
2263 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { 2252 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2264 if (!css_tryget(&cs->css)) 2253 if (cs == &top_cpuset || !css_tryget(&cs->css))
2265 continue; 2254 continue;
2266 rcu_read_unlock(); 2255 rcu_read_unlock();
2267 2256
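The hotplug propagation loop keeps its shape: under rcu_read_lock() it css_tryget()s each descendant and only drops the RCU lock once it holds a real reference, the sole change being that top_cpuset is skipped now that the iterator yields it. The tryget idiom itself is a conditional increment that refuses to resurrect an object whose count already reached zero; a standalone sketch with C11 atomics (not the kernel's implementation):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* take a reference only if the count is still non-zero */
static bool tryget(atomic_int *ref)
{
	int v = atomic_load(ref);

	while (v > 0) {
		if (atomic_compare_exchange_weak(ref, &v, v + 1))
			return true;
		/* v was reloaded by the failed CAS; retry */
	}
	return false;	/* object is already on its way out */
}

static void put(atomic_int *ref)
{
	atomic_fetch_sub(ref, 1);
}

int main(void)
{
	atomic_int live = 1, dying = 0;

	printf("live:  %s\n", tryget(&live) ? "got ref" : "refused");
	put(&live);
	printf("dying: %s\n", tryget(&dying) ? "got ref" : "refused");
	return 0;
}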
@@ -2350,7 +2339,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2350 2339
2351void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2340void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2352{ 2341{
2353 const struct cpuset *cpus_cs; 2342 struct cpuset *cpus_cs;
2354 2343
2355 rcu_read_lock(); 2344 rcu_read_lock();
2356 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2345 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
@@ -2423,7 +2412,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2423 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall 2412 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
2424 * (an unusual configuration), then returns the root cpuset. 2413 * (an unusual configuration), then returns the root cpuset.
2425 */ 2414 */
2426static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) 2415static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2427{ 2416{
2428 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) 2417 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2429 cs = parent_cs(cs); 2418 cs = parent_cs(cs);
@@ -2493,7 +2482,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2493 */ 2482 */
2494int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) 2483int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2495{ 2484{
2496 const struct cpuset *cs; /* current cpuset ancestors */ 2485 struct cpuset *cs; /* current cpuset ancestors */
2497 int allowed; /* is allocation in zone z allowed? */ 2486 int allowed; /* is allocation in zone z allowed? */
2498 2487
2499 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2488 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
@@ -2731,7 +2720,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2731 goto out_free; 2720 goto out_free;
2732 2721
2733 rcu_read_lock(); 2722 rcu_read_lock();
2734 css = task_subsys_state(tsk, cpuset_subsys_id); 2723 css = task_css(tsk, cpuset_subsys_id);
2735 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2724 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2736 rcu_read_unlock(); 2725 rcu_read_unlock();
2737 if (retval < 0) 2726 if (retval < 0)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c77206184b8b..97b67df8fbfe 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -116,6 +116,9 @@ int get_callchain_buffers(void)
116 116
117 err = alloc_callchain_buffers(); 117 err = alloc_callchain_buffers();
118exit: 118exit:
119 if (err)
120 atomic_dec(&nr_callchain_events);
121
119 mutex_unlock(&callchain_mutex); 122 mutex_unlock(&callchain_mutex);
120 123
121 return err; 124 return err;
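The callchain fix is a plain error-path leak: get_callchain_buffers() bumps nr_callchain_events up front, so a failed allocation has to drop the count again before returning. The shape of that fix as a small standalone sketch (simplified, invented names):

#include <stdio.h>
#include <stdlib.h>

static int nr_users;		/* how many callers hold the shared buffer */
static void *shared_buf;

static int get_buffers(int simulate_failure)
{
	int err = 0;

	nr_users++;			/* account first ... */
	if (!shared_buf) {
		shared_buf = simulate_failure ? NULL : malloc(64);
		if (!shared_buf)
			err = -12;	/* -ENOMEM */
	}
	if (err)
		nr_users--;		/* ... and undo it if setup failed */
	return err;
}

int main(void)
{
	printf("fail: err=%d users=%d\n", get_buffers(1), nr_users);
	printf("ok:   err=%d users=%d\n", get_buffers(0), nr_users);
	free(shared_buf);
	return 0;
}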
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f86599e8c123..2207efc941d1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -145,6 +145,7 @@ static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
145static atomic_t nr_mmap_events __read_mostly; 145static atomic_t nr_mmap_events __read_mostly;
146static atomic_t nr_comm_events __read_mostly; 146static atomic_t nr_comm_events __read_mostly;
147static atomic_t nr_task_events __read_mostly; 147static atomic_t nr_task_events __read_mostly;
148static atomic_t nr_freq_events __read_mostly;
148 149
149static LIST_HEAD(pmus); 150static LIST_HEAD(pmus);
150static DEFINE_MUTEX(pmus_lock); 151static DEFINE_MUTEX(pmus_lock);
@@ -340,8 +341,8 @@ struct perf_cgroup {
340static inline struct perf_cgroup * 341static inline struct perf_cgroup *
341perf_cgroup_from_task(struct task_struct *task) 342perf_cgroup_from_task(struct task_struct *task)
342{ 343{
343 return container_of(task_subsys_state(task, perf_subsys_id), 344 return container_of(task_css(task, perf_subsys_id),
344 struct perf_cgroup, css); 345 struct perf_cgroup, css);
345} 346}
346 347
347static inline bool 348static inline bool
@@ -591,7 +592,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
591 if (!f.file) 592 if (!f.file)
592 return -EBADF; 593 return -EBADF;
593 594
594 css = cgroup_css_from_dir(f.file, perf_subsys_id); 595 rcu_read_lock();
596
597 css = css_from_dir(f.file->f_dentry, &perf_subsys);
595 if (IS_ERR(css)) { 598 if (IS_ERR(css)) {
596 ret = PTR_ERR(css); 599 ret = PTR_ERR(css);
597 goto out; 600 goto out;
@@ -617,6 +620,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
617 ret = -EINVAL; 620 ret = -EINVAL;
618 } 621 }
619out: 622out:
623 rcu_read_unlock();
620 fdput(f); 624 fdput(f);
621 return ret; 625 return ret;
622} 626}
@@ -869,12 +873,8 @@ static void perf_pmu_rotate_start(struct pmu *pmu)
869 873
870 WARN_ON(!irqs_disabled()); 874 WARN_ON(!irqs_disabled());
871 875
872 if (list_empty(&cpuctx->rotation_list)) { 876 if (list_empty(&cpuctx->rotation_list))
873 int was_empty = list_empty(head);
874 list_add(&cpuctx->rotation_list, head); 877 list_add(&cpuctx->rotation_list, head);
875 if (was_empty)
876 tick_nohz_full_kick();
877 }
878} 878}
879 879
880static void get_ctx(struct perf_event_context *ctx) 880static void get_ctx(struct perf_event_context *ctx)
@@ -1216,6 +1216,9 @@ static void perf_event__id_header_size(struct perf_event *event)
1216 if (sample_type & PERF_SAMPLE_TIME) 1216 if (sample_type & PERF_SAMPLE_TIME)
1217 size += sizeof(data->time); 1217 size += sizeof(data->time);
1218 1218
1219 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1220 size += sizeof(data->id);
1221
1219 if (sample_type & PERF_SAMPLE_ID) 1222 if (sample_type & PERF_SAMPLE_ID)
1220 size += sizeof(data->id); 1223 size += sizeof(data->id);
1221 1224
@@ -2712,7 +2715,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2712 2715
2713 hwc = &event->hw; 2716 hwc = &event->hw;
2714 2717
2715 if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { 2718 if (hwc->interrupts == MAX_INTERRUPTS) {
2716 hwc->interrupts = 0; 2719 hwc->interrupts = 0;
2717 perf_log_throttle(event, 1); 2720 perf_log_throttle(event, 1);
2718 event->pmu->start(event, 0); 2721 event->pmu->start(event, 0);
@@ -2811,10 +2814,11 @@ done:
2811#ifdef CONFIG_NO_HZ_FULL 2814#ifdef CONFIG_NO_HZ_FULL
2812bool perf_event_can_stop_tick(void) 2815bool perf_event_can_stop_tick(void)
2813{ 2816{
2814 if (list_empty(&__get_cpu_var(rotation_list))) 2817 if (atomic_read(&nr_freq_events) ||
2815 return true; 2818 __this_cpu_read(perf_throttled_count))
2816 else
2817 return false; 2819 return false;
2820 else
2821 return true;
2818} 2822}
2819#endif 2823#endif
2820 2824
@@ -3128,36 +3132,63 @@ static void free_event_rcu(struct rcu_head *head)
3128static void ring_buffer_put(struct ring_buffer *rb); 3132static void ring_buffer_put(struct ring_buffer *rb);
3129static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); 3133static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
3130 3134
3131static void free_event(struct perf_event *event) 3135static void unaccount_event_cpu(struct perf_event *event, int cpu)
3132{ 3136{
3133 irq_work_sync(&event->pending); 3137 if (event->parent)
3138 return;
3139
3140 if (has_branch_stack(event)) {
3141 if (!(event->attach_state & PERF_ATTACH_TASK))
3142 atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
3143 }
3144 if (is_cgroup_event(event))
3145 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3146}
3134 3147
3148static void unaccount_event(struct perf_event *event)
3149{
3150 if (event->parent)
3151 return;
3152
3153 if (event->attach_state & PERF_ATTACH_TASK)
3154 static_key_slow_dec_deferred(&perf_sched_events);
3155 if (event->attr.mmap || event->attr.mmap_data)
3156 atomic_dec(&nr_mmap_events);
3157 if (event->attr.comm)
3158 atomic_dec(&nr_comm_events);
3159 if (event->attr.task)
3160 atomic_dec(&nr_task_events);
3161 if (event->attr.freq)
3162 atomic_dec(&nr_freq_events);
3163 if (is_cgroup_event(event))
3164 static_key_slow_dec_deferred(&perf_sched_events);
3165 if (has_branch_stack(event))
3166 static_key_slow_dec_deferred(&perf_sched_events);
3167
3168 unaccount_event_cpu(event, event->cpu);
3169}
3170
3171static void __free_event(struct perf_event *event)
3172{
3135 if (!event->parent) { 3173 if (!event->parent) {
3136 if (event->attach_state & PERF_ATTACH_TASK)
3137 static_key_slow_dec_deferred(&perf_sched_events);
3138 if (event->attr.mmap || event->attr.mmap_data)
3139 atomic_dec(&nr_mmap_events);
3140 if (event->attr.comm)
3141 atomic_dec(&nr_comm_events);
3142 if (event->attr.task)
3143 atomic_dec(&nr_task_events);
3144 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 3174 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3145 put_callchain_buffers(); 3175 put_callchain_buffers();
3146 if (is_cgroup_event(event)) {
3147 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
3148 static_key_slow_dec_deferred(&perf_sched_events);
3149 }
3150
3151 if (has_branch_stack(event)) {
3152 static_key_slow_dec_deferred(&perf_sched_events);
3153 /* is system-wide event */
3154 if (!(event->attach_state & PERF_ATTACH_TASK)) {
3155 atomic_dec(&per_cpu(perf_branch_stack_events,
3156 event->cpu));
3157 }
3158 }
3159 } 3176 }
3160 3177
3178 if (event->destroy)
3179 event->destroy(event);
3180
3181 if (event->ctx)
3182 put_ctx(event->ctx);
3183
3184 call_rcu(&event->rcu_head, free_event_rcu);
3185}
3186static void free_event(struct perf_event *event)
3187{
3188 irq_work_sync(&event->pending);
3189
3190 unaccount_event(event);
3191
3161 if (event->rb) { 3192 if (event->rb) {
3162 struct ring_buffer *rb; 3193 struct ring_buffer *rb;
3163 3194
@@ -3180,13 +3211,8 @@ static void free_event(struct perf_event *event)
3180 if (is_cgroup_event(event)) 3211 if (is_cgroup_event(event))
3181 perf_detach_cgroup(event); 3212 perf_detach_cgroup(event);
3182 3213
3183 if (event->destroy)
3184 event->destroy(event);
3185
3186 if (event->ctx)
3187 put_ctx(event->ctx);
3188 3214
3189 call_rcu(&event->rcu_head, free_event_rcu); 3215 __free_event(event);
3190} 3216}
3191 3217
3192int perf_event_release_kernel(struct perf_event *event) 3218int perf_event_release_kernel(struct perf_event *event)
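The larger refactor here splits free_event() into unaccount_event()/unaccount_event_cpu(), which undo the global and per-CPU bookkeeping, and __free_event(), which tears the object itself down; keeping the two concerns separate presumably lets error paths free a partially set-up event without disturbing counters it never incremented. A toy decomposition along the same lines, with simplified counters:

#include <stdio.h>
#include <stdlib.h>

static int nr_mmap_events, nr_comm_events, nr_task_events;

struct event {
	int parent;		/* inherited events never touch the counters */
	int mmap, comm, task;	/* which global counts this event bumped */
	void (*destroy)(struct event *);
};

/* undo the global accounting done at creation time */
static void unaccount_event(struct event *e)
{
	if (e->parent)
		return;
	if (e->mmap)
		nr_mmap_events--;
	if (e->comm)
		nr_comm_events--;
	if (e->task)
		nr_task_events--;
}

/* tear the object itself down, independent of accounting */
static void __free_event(struct event *e)
{
	if (e->destroy)
		e->destroy(e);
	free(e);
}

static void free_event(struct event *e)
{
	unaccount_event(e);
	__free_event(e);
}

int main(void)
{
	struct event *e = calloc(1, sizeof(*e));

	e->mmap = 1;
	nr_mmap_events++;		/* what accounting at creation would do */
	free_event(e);
	printf("nr_mmap_events=%d\n", nr_mmap_events);
	return 0;
}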
@@ -3544,6 +3570,15 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3544 case PERF_EVENT_IOC_PERIOD: 3570 case PERF_EVENT_IOC_PERIOD:
3545 return perf_event_period(event, (u64 __user *)arg); 3571 return perf_event_period(event, (u64 __user *)arg);
3546 3572
3573 case PERF_EVENT_IOC_ID:
3574 {
3575 u64 id = primary_event_id(event);
3576
3577 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
3578 return -EFAULT;
3579 return 0;
3580 }
3581
3547 case PERF_EVENT_IOC_SET_OUTPUT: 3582 case PERF_EVENT_IOC_SET_OUTPUT:
3548 { 3583 {
3549 int ret; 3584 int ret;
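The new PERF_EVENT_IOC_ID ioctl hands userspace the primary event ID it will later see in samples, instead of making it parse the ID back out of a read() with PERF_FORMAT_ID. Assuming a kernel and uapi headers that already carry this ioctl, usage might look like the following sketch:

/* gcc -o ioc_id ioc_id.c  (needs linux/perf_event.h with PERF_EVENT_IOC_ID) */
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t id;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;

	fd = perf_event_open(&attr, 0, -1, -1, 0);	/* current task, any CPU */
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	if (ioctl(fd, PERF_EVENT_IOC_ID, &id) == 0)
		printf("event id: %llu\n", (unsigned long long)id);
	else
		perror("PERF_EVENT_IOC_ID");
	close(fd);
	return 0;
}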
@@ -3641,6 +3676,10 @@ void perf_event_update_userpage(struct perf_event *event)
3641 u64 enabled, running, now; 3676 u64 enabled, running, now;
3642 3677
3643 rcu_read_lock(); 3678 rcu_read_lock();
3679 rb = rcu_dereference(event->rb);
3680 if (!rb)
3681 goto unlock;
3682
3644 /* 3683 /*
3645 * compute total_time_enabled, total_time_running 3684 * compute total_time_enabled, total_time_running
3646 * based on snapshot values taken when the event 3685 * based on snapshot values taken when the event
@@ -3651,12 +3690,8 @@ void perf_event_update_userpage(struct perf_event *event)
3651 * NMI context 3690 * NMI context
3652 */ 3691 */
3653 calc_timer_values(event, &now, &enabled, &running); 3692 calc_timer_values(event, &now, &enabled, &running);
3654 rb = rcu_dereference(event->rb);
3655 if (!rb)
3656 goto unlock;
3657 3693
3658 userpg = rb->user_page; 3694 userpg = rb->user_page;
3659
3660 /* 3695 /*
3661 * Disable preemption so as to not let the corresponding user-space 3696 * Disable preemption so as to not let the corresponding user-space
3662 * spin too long if we get preempted. 3697 * spin too long if we get preempted.
@@ -4251,7 +4286,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
4251 if (sample_type & PERF_SAMPLE_TIME) 4286 if (sample_type & PERF_SAMPLE_TIME)
4252 data->time = perf_clock(); 4287 data->time = perf_clock();
4253 4288
4254 if (sample_type & PERF_SAMPLE_ID) 4289 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
4255 data->id = primary_event_id(event); 4290 data->id = primary_event_id(event);
4256 4291
4257 if (sample_type & PERF_SAMPLE_STREAM_ID) 4292 if (sample_type & PERF_SAMPLE_STREAM_ID)
@@ -4290,6 +4325,9 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4290 4325
4291 if (sample_type & PERF_SAMPLE_CPU) 4326 if (sample_type & PERF_SAMPLE_CPU)
4292 perf_output_put(handle, data->cpu_entry); 4327 perf_output_put(handle, data->cpu_entry);
4328
4329 if (sample_type & PERF_SAMPLE_IDENTIFIER)
4330 perf_output_put(handle, data->id);
4293} 4331}
4294 4332
4295void perf_event__output_id_sample(struct perf_event *event, 4333void perf_event__output_id_sample(struct perf_event *event,
@@ -4355,7 +4393,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4355 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 4393 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4356 n = 0; 4394 n = 0;
4357 4395
4358 if (sub != event) 4396 if ((sub != event) &&
4397 (sub->state == PERF_EVENT_STATE_ACTIVE))
4359 sub->pmu->read(sub); 4398 sub->pmu->read(sub);
4360 4399
4361 values[n++] = perf_event_count(sub); 4400 values[n++] = perf_event_count(sub);
@@ -4402,6 +4441,9 @@ void perf_output_sample(struct perf_output_handle *handle,
4402 4441
4403 perf_output_put(handle, *header); 4442 perf_output_put(handle, *header);
4404 4443
4444 if (sample_type & PERF_SAMPLE_IDENTIFIER)
4445 perf_output_put(handle, data->id);
4446
4405 if (sample_type & PERF_SAMPLE_IP) 4447 if (sample_type & PERF_SAMPLE_IP)
4406 perf_output_put(handle, data->ip); 4448 perf_output_put(handle, data->ip);
4407 4449
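Taken together, these hunks define the point of PERF_SAMPLE_IDENTIFIER: the ID is emitted immediately after the header of a PERF_RECORD_SAMPLE and as the very last u64 of the trailing ID block on other record types, so a parser can locate it at a fixed offset without knowing the event's full sample_type. A hedged parsing sketch built only on that placement (and on the usual 8-byte record header layout):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* minimal stand-in for struct perf_event_header */
struct rec_header {
	uint32_t type;
	uint16_t misc;
	uint16_t size;	/* total record size in bytes, header included */
};

#define REC_SAMPLE 9	/* PERF_RECORD_SAMPLE */

/*
 * With PERF_SAMPLE_IDENTIFIER set (plus sample_id_all for non-samples),
 * the identifier is the first u64 after the header of a sample and the
 * last u64 of any other record.
 */
static uint64_t record_id(const void *rec)
{
	const struct rec_header *h = rec;
	uint64_t id;

	if (h->type == REC_SAMPLE)
		memcpy(&id, (const char *)rec + sizeof(*h), sizeof(id));
	else
		memcpy(&id, (const char *)rec + h->size - sizeof(id), sizeof(id));
	return id;
}

int main(void)
{
	unsigned char buf[sizeof(struct rec_header) + 2 * sizeof(uint64_t)];
	struct rec_header h = { .type = REC_SAMPLE, .size = (uint16_t)sizeof(buf) };
	uint64_t id = 42, payload = 7;

	memcpy(buf, &h, sizeof(h));
	memcpy(buf + sizeof(h), &id, sizeof(id));		/* identifier first */
	memcpy(buf + sizeof(h) + sizeof(id), &payload, sizeof(payload));

	printf("id=%llu\n", (unsigned long long)record_id(buf));
	return 0;
}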
@@ -4462,20 +4504,6 @@ void perf_output_sample(struct perf_output_handle *handle,
4462 } 4504 }
4463 } 4505 }
4464 4506
4465 if (!event->attr.watermark) {
4466 int wakeup_events = event->attr.wakeup_events;
4467
4468 if (wakeup_events) {
4469 struct ring_buffer *rb = handle->rb;
4470 int events = local_inc_return(&rb->events);
4471
4472 if (events >= wakeup_events) {
4473 local_sub(wakeup_events, &rb->events);
4474 local_inc(&rb->wakeup);
4475 }
4476 }
4477 }
4478
4479 if (sample_type & PERF_SAMPLE_BRANCH_STACK) { 4507 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4480 if (data->br_stack) { 4508 if (data->br_stack) {
4481 size_t size; 4509 size_t size;
@@ -4511,16 +4539,31 @@ void perf_output_sample(struct perf_output_handle *handle,
4511 } 4539 }
4512 } 4540 }
4513 4541
4514 if (sample_type & PERF_SAMPLE_STACK_USER) 4542 if (sample_type & PERF_SAMPLE_STACK_USER) {
4515 perf_output_sample_ustack(handle, 4543 perf_output_sample_ustack(handle,
4516 data->stack_user_size, 4544 data->stack_user_size,
4517 data->regs_user.regs); 4545 data->regs_user.regs);
4546 }
4518 4547
4519 if (sample_type & PERF_SAMPLE_WEIGHT) 4548 if (sample_type & PERF_SAMPLE_WEIGHT)
4520 perf_output_put(handle, data->weight); 4549 perf_output_put(handle, data->weight);
4521 4550
4522 if (sample_type & PERF_SAMPLE_DATA_SRC) 4551 if (sample_type & PERF_SAMPLE_DATA_SRC)
4523 perf_output_put(handle, data->data_src.val); 4552 perf_output_put(handle, data->data_src.val);
4553
4554 if (!event->attr.watermark) {
4555 int wakeup_events = event->attr.wakeup_events;
4556
4557 if (wakeup_events) {
4558 struct ring_buffer *rb = handle->rb;
4559 int events = local_inc_return(&rb->events);
4560
4561 if (events >= wakeup_events) {
4562 local_sub(wakeup_events, &rb->events);
4563 local_inc(&rb->wakeup);
4564 }
4565 }
4566 }
4524} 4567}
4525 4568
4526void perf_prepare_sample(struct perf_event_header *header, 4569void perf_prepare_sample(struct perf_event_header *header,
@@ -4680,12 +4723,10 @@ perf_event_read_event(struct perf_event *event,
4680 perf_output_end(&handle); 4723 perf_output_end(&handle);
4681} 4724}
4682 4725
4683typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data);
4684typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); 4726typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
4685 4727
4686static void 4728static void
4687perf_event_aux_ctx(struct perf_event_context *ctx, 4729perf_event_aux_ctx(struct perf_event_context *ctx,
4688 perf_event_aux_match_cb match,
4689 perf_event_aux_output_cb output, 4730 perf_event_aux_output_cb output,
4690 void *data) 4731 void *data)
4691{ 4732{
@@ -4696,15 +4737,12 @@ perf_event_aux_ctx(struct perf_event_context *ctx,
4696 continue; 4737 continue;
4697 if (!event_filter_match(event)) 4738 if (!event_filter_match(event))
4698 continue; 4739 continue;
4699 if (match(event, data)) 4740 output(event, data);
4700 output(event, data);
4701 } 4741 }
4702} 4742}
4703 4743
4704static void 4744static void
4705perf_event_aux(perf_event_aux_match_cb match, 4745perf_event_aux(perf_event_aux_output_cb output, void *data,
4706 perf_event_aux_output_cb output,
4707 void *data,
4708 struct perf_event_context *task_ctx) 4746 struct perf_event_context *task_ctx)
4709{ 4747{
4710 struct perf_cpu_context *cpuctx; 4748 struct perf_cpu_context *cpuctx;
@@ -4717,7 +4755,7 @@ perf_event_aux(perf_event_aux_match_cb match,
4717 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4755 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4718 if (cpuctx->unique_pmu != pmu) 4756 if (cpuctx->unique_pmu != pmu)
4719 goto next; 4757 goto next;
4720 perf_event_aux_ctx(&cpuctx->ctx, match, output, data); 4758 perf_event_aux_ctx(&cpuctx->ctx, output, data);
4721 if (task_ctx) 4759 if (task_ctx)
4722 goto next; 4760 goto next;
4723 ctxn = pmu->task_ctx_nr; 4761 ctxn = pmu->task_ctx_nr;
@@ -4725,14 +4763,14 @@ perf_event_aux(perf_event_aux_match_cb match,
4725 goto next; 4763 goto next;
4726 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); 4764 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4727 if (ctx) 4765 if (ctx)
4728 perf_event_aux_ctx(ctx, match, output, data); 4766 perf_event_aux_ctx(ctx, output, data);
4729next: 4767next:
4730 put_cpu_ptr(pmu->pmu_cpu_context); 4768 put_cpu_ptr(pmu->pmu_cpu_context);
4731 } 4769 }
4732 4770
4733 if (task_ctx) { 4771 if (task_ctx) {
4734 preempt_disable(); 4772 preempt_disable();
4735 perf_event_aux_ctx(task_ctx, match, output, data); 4773 perf_event_aux_ctx(task_ctx, output, data);
4736 preempt_enable(); 4774 preempt_enable();
4737 } 4775 }
4738 rcu_read_unlock(); 4776 rcu_read_unlock();
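perf_event_aux() loses its separate match callback: each output routine now opens with its own "if (!perf_event_*_match(event)) return;", which keeps the filter next to the output code that depends on it (the mmap output, for instance, now also resizes the header per event for mmap2). The before/after shape in a compilable toy:

#include <stdbool.h>
#include <stdio.h>

struct event { int wants_comm; int id; };

typedef void (*output_cb)(struct event *e, void *data);

/* after the patch: a single callback, which filters itself */
static void for_each_event(struct event *evs, int n, output_cb output, void *data)
{
	for (int i = 0; i < n; i++)
		output(&evs[i], data);
}

static bool comm_match(struct event *e)
{
	return e->wants_comm;
}

static void comm_output(struct event *e, void *data)
{
	if (!comm_match(e))	/* the old match callback, inlined at the top */
		return;
	printf("deliver comm record to event %d (%s)\n", e->id, (char *)data);
}

int main(void)
{
	struct event evs[] = { { 1, 0 }, { 0, 1 }, { 1, 2 } };

	for_each_event(evs, 3, comm_output, "bash");
	return 0;
}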
@@ -4741,7 +4779,7 @@ next:
4741/* 4779/*
4742 * task tracking -- fork/exit 4780 * task tracking -- fork/exit
4743 * 4781 *
4744 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task 4782 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
4745 */ 4783 */
4746 4784
4747struct perf_task_event { 4785struct perf_task_event {
@@ -4759,6 +4797,13 @@ struct perf_task_event {
4759 } event_id; 4797 } event_id;
4760}; 4798};
4761 4799
4800static int perf_event_task_match(struct perf_event *event)
4801{
4802 return event->attr.comm || event->attr.mmap ||
4803 event->attr.mmap2 || event->attr.mmap_data ||
4804 event->attr.task;
4805}
4806
4762static void perf_event_task_output(struct perf_event *event, 4807static void perf_event_task_output(struct perf_event *event,
4763 void *data) 4808 void *data)
4764{ 4809{
@@ -4768,6 +4813,9 @@ static void perf_event_task_output(struct perf_event *event,
4768 struct task_struct *task = task_event->task; 4813 struct task_struct *task = task_event->task;
4769 int ret, size = task_event->event_id.header.size; 4814 int ret, size = task_event->event_id.header.size;
4770 4815
4816 if (!perf_event_task_match(event))
4817 return;
4818
4771 perf_event_header__init_id(&task_event->event_id.header, &sample, event); 4819 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
4772 4820
4773 ret = perf_output_begin(&handle, event, 4821 ret = perf_output_begin(&handle, event,
@@ -4790,13 +4838,6 @@ out:
4790 task_event->event_id.header.size = size; 4838 task_event->event_id.header.size = size;
4791} 4839}
4792 4840
4793static int perf_event_task_match(struct perf_event *event,
4794 void *data __maybe_unused)
4795{
4796 return event->attr.comm || event->attr.mmap ||
4797 event->attr.mmap_data || event->attr.task;
4798}
4799
4800static void perf_event_task(struct task_struct *task, 4841static void perf_event_task(struct task_struct *task,
4801 struct perf_event_context *task_ctx, 4842 struct perf_event_context *task_ctx,
4802 int new) 4843 int new)
@@ -4825,8 +4866,7 @@ static void perf_event_task(struct task_struct *task,
4825 }, 4866 },
4826 }; 4867 };
4827 4868
4828 perf_event_aux(perf_event_task_match, 4869 perf_event_aux(perf_event_task_output,
4829 perf_event_task_output,
4830 &task_event, 4870 &task_event,
4831 task_ctx); 4871 task_ctx);
4832} 4872}
@@ -4853,6 +4893,11 @@ struct perf_comm_event {
4853 } event_id; 4893 } event_id;
4854}; 4894};
4855 4895
4896static int perf_event_comm_match(struct perf_event *event)
4897{
4898 return event->attr.comm;
4899}
4900
4856static void perf_event_comm_output(struct perf_event *event, 4901static void perf_event_comm_output(struct perf_event *event,
4857 void *data) 4902 void *data)
4858{ 4903{
@@ -4862,6 +4907,9 @@ static void perf_event_comm_output(struct perf_event *event,
4862 int size = comm_event->event_id.header.size; 4907 int size = comm_event->event_id.header.size;
4863 int ret; 4908 int ret;
4864 4909
4910 if (!perf_event_comm_match(event))
4911 return;
4912
4865 perf_event_header__init_id(&comm_event->event_id.header, &sample, event); 4913 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4866 ret = perf_output_begin(&handle, event, 4914 ret = perf_output_begin(&handle, event,
4867 comm_event->event_id.header.size); 4915 comm_event->event_id.header.size);
@@ -4883,12 +4931,6 @@ out:
4883 comm_event->event_id.header.size = size; 4931 comm_event->event_id.header.size = size;
4884} 4932}
4885 4933
4886static int perf_event_comm_match(struct perf_event *event,
4887 void *data __maybe_unused)
4888{
4889 return event->attr.comm;
4890}
4891
4892static void perf_event_comm_event(struct perf_comm_event *comm_event) 4934static void perf_event_comm_event(struct perf_comm_event *comm_event)
4893{ 4935{
4894 char comm[TASK_COMM_LEN]; 4936 char comm[TASK_COMM_LEN];
@@ -4903,8 +4945,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
4903 4945
4904 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 4946 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
4905 4947
4906 perf_event_aux(perf_event_comm_match, 4948 perf_event_aux(perf_event_comm_output,
4907 perf_event_comm_output,
4908 comm_event, 4949 comm_event,
4909 NULL); 4950 NULL);
4910} 4951}
@@ -4955,6 +4996,9 @@ struct perf_mmap_event {
4955 4996
4956 const char *file_name; 4997 const char *file_name;
4957 int file_size; 4998 int file_size;
4999 int maj, min;
5000 u64 ino;
5001 u64 ino_generation;
4958 5002
4959 struct { 5003 struct {
4960 struct perf_event_header header; 5004 struct perf_event_header header;
@@ -4967,6 +5011,17 @@ struct perf_mmap_event {
4967 } event_id; 5011 } event_id;
4968}; 5012};
4969 5013
5014static int perf_event_mmap_match(struct perf_event *event,
5015 void *data)
5016{
5017 struct perf_mmap_event *mmap_event = data;
5018 struct vm_area_struct *vma = mmap_event->vma;
5019 int executable = vma->vm_flags & VM_EXEC;
5020
5021 return (!executable && event->attr.mmap_data) ||
5022 (executable && (event->attr.mmap || event->attr.mmap2));
5023}
5024
4970static void perf_event_mmap_output(struct perf_event *event, 5025static void perf_event_mmap_output(struct perf_event *event,
4971 void *data) 5026 void *data)
4972{ 5027{
@@ -4976,6 +5031,16 @@ static void perf_event_mmap_output(struct perf_event *event,
4976 int size = mmap_event->event_id.header.size; 5031 int size = mmap_event->event_id.header.size;
4977 int ret; 5032 int ret;
4978 5033
5034 if (!perf_event_mmap_match(event, data))
5035 return;
5036
5037 if (event->attr.mmap2) {
5038 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
5039 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5040 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5041 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5042 }
5043
4979 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 5044 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4980 ret = perf_output_begin(&handle, event, 5045 ret = perf_output_begin(&handle, event,
4981 mmap_event->event_id.header.size); 5046 mmap_event->event_id.header.size);
@@ -4986,6 +5051,14 @@ static void perf_event_mmap_output(struct perf_event *event,
4986 mmap_event->event_id.tid = perf_event_tid(event, current); 5051 mmap_event->event_id.tid = perf_event_tid(event, current);
4987 5052
4988 perf_output_put(&handle, mmap_event->event_id); 5053 perf_output_put(&handle, mmap_event->event_id);
5054
5055 if (event->attr.mmap2) {
5056 perf_output_put(&handle, mmap_event->maj);
5057 perf_output_put(&handle, mmap_event->min);
5058 perf_output_put(&handle, mmap_event->ino);
5059 perf_output_put(&handle, mmap_event->ino_generation);
5060 }
5061
4989 __output_copy(&handle, mmap_event->file_name, 5062 __output_copy(&handle, mmap_event->file_name,
4990 mmap_event->file_size); 5063 mmap_event->file_size);
4991 5064
@@ -4996,21 +5069,12 @@ out:
4996 mmap_event->event_id.header.size = size; 5069 mmap_event->event_id.header.size = size;
4997} 5070}
4998 5071
4999static int perf_event_mmap_match(struct perf_event *event,
5000 void *data)
5001{
5002 struct perf_mmap_event *mmap_event = data;
5003 struct vm_area_struct *vma = mmap_event->vma;
5004 int executable = vma->vm_flags & VM_EXEC;
5005
5006 return (!executable && event->attr.mmap_data) ||
5007 (executable && event->attr.mmap);
5008}
5009
5010static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 5072static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5011{ 5073{
5012 struct vm_area_struct *vma = mmap_event->vma; 5074 struct vm_area_struct *vma = mmap_event->vma;
5013 struct file *file = vma->vm_file; 5075 struct file *file = vma->vm_file;
5076 int maj = 0, min = 0;
5077 u64 ino = 0, gen = 0;
5014 unsigned int size; 5078 unsigned int size;
5015 char tmp[16]; 5079 char tmp[16];
5016 char *buf = NULL; 5080 char *buf = NULL;
@@ -5019,6 +5083,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5019 memset(tmp, 0, sizeof(tmp)); 5083 memset(tmp, 0, sizeof(tmp));
5020 5084
5021 if (file) { 5085 if (file) {
5086 struct inode *inode;
5087 dev_t dev;
5022 /* 5088 /*
5023 * d_path works from the end of the rb backwards, so we 5089 * d_path works from the end of the rb backwards, so we
5024 * need to add enough zero bytes after the string to handle 5090 * need to add enough zero bytes after the string to handle
@@ -5034,6 +5100,13 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5034 name = strncpy(tmp, "//toolong", sizeof(tmp)); 5100 name = strncpy(tmp, "//toolong", sizeof(tmp));
5035 goto got_name; 5101 goto got_name;
5036 } 5102 }
5103 inode = file_inode(vma->vm_file);
5104 dev = inode->i_sb->s_dev;
5105 ino = inode->i_ino;
5106 gen = inode->i_generation;
5107 maj = MAJOR(dev);
5108 min = MINOR(dev);
5109
5037 } else { 5110 } else {
5038 if (arch_vma_name(mmap_event->vma)) { 5111 if (arch_vma_name(mmap_event->vma)) {
5039 name = strncpy(tmp, arch_vma_name(mmap_event->vma), 5112 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
@@ -5064,14 +5137,17 @@ got_name:
5064 5137
5065 mmap_event->file_name = name; 5138 mmap_event->file_name = name;
5066 mmap_event->file_size = size; 5139 mmap_event->file_size = size;
5140 mmap_event->maj = maj;
5141 mmap_event->min = min;
5142 mmap_event->ino = ino;
5143 mmap_event->ino_generation = gen;
5067 5144
5068 if (!(vma->vm_flags & VM_EXEC)) 5145 if (!(vma->vm_flags & VM_EXEC))
5069 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; 5146 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
5070 5147
5071 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 5148 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
5072 5149
5073 perf_event_aux(perf_event_mmap_match, 5150 perf_event_aux(perf_event_mmap_output,
5074 perf_event_mmap_output,
5075 mmap_event, 5151 mmap_event,
5076 NULL); 5152 NULL);
5077 5153
@@ -5101,6 +5177,10 @@ void perf_event_mmap(struct vm_area_struct *vma)
5101 .len = vma->vm_end - vma->vm_start, 5177 .len = vma->vm_end - vma->vm_start,
5102 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, 5178 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
5103 }, 5179 },
5180 /* .maj (attr_mmap2 only) */
5181 /* .min (attr_mmap2 only) */
5182 /* .ino (attr_mmap2 only) */
5183 /* .ino_generation (attr_mmap2 only) */
5104 }; 5184 };
5105 5185
5106 perf_event_mmap_event(&mmap_event); 5186 perf_event_mmap_event(&mmap_event);
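
For mmap2 consumers, the new maj/min/ino fields are taken straight from the mapped file's inode and its superblock device, as the hunk above shows. A rough userspace analogue using stat(2) (i_generation has no portable equivalent here, so it is left out):

#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>

int main(int argc, char **argv)
{
        struct stat st;
        const char *path = argc > 1 ? argv[1] : "/bin/sh";

        if (stat(path, &st) != 0) {
                perror("stat");
                return 1;
        }
        /* Mirrors maj = MAJOR(dev), min = MINOR(dev), ino = inode->i_ino. */
        printf("%s: maj=%u min=%u ino=%llu\n", path,
               major(st.st_dev), minor(st.st_dev),
               (unsigned long long)st.st_ino);
        return 0;
}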
@@ -5178,6 +5258,7 @@ static int __perf_event_overflow(struct perf_event *event,
5178 __this_cpu_inc(perf_throttled_count); 5258 __this_cpu_inc(perf_throttled_count);
5179 hwc->interrupts = MAX_INTERRUPTS; 5259 hwc->interrupts = MAX_INTERRUPTS;
5180 perf_log_throttle(event, 0); 5260 perf_log_throttle(event, 0);
5261 tick_nohz_full_kick();
5181 ret = 1; 5262 ret = 1;
5182 } 5263 }
5183 } 5264 }
@@ -6443,6 +6524,44 @@ unlock:
6443 return pmu; 6524 return pmu;
6444} 6525}
6445 6526
6527static void account_event_cpu(struct perf_event *event, int cpu)
6528{
6529 if (event->parent)
6530 return;
6531
6532 if (has_branch_stack(event)) {
6533 if (!(event->attach_state & PERF_ATTACH_TASK))
6534 atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
6535 }
6536 if (is_cgroup_event(event))
6537 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
6538}
6539
6540static void account_event(struct perf_event *event)
6541{
6542 if (event->parent)
6543 return;
6544
6545 if (event->attach_state & PERF_ATTACH_TASK)
6546 static_key_slow_inc(&perf_sched_events.key);
6547 if (event->attr.mmap || event->attr.mmap_data)
6548 atomic_inc(&nr_mmap_events);
6549 if (event->attr.comm)
6550 atomic_inc(&nr_comm_events);
6551 if (event->attr.task)
6552 atomic_inc(&nr_task_events);
6553 if (event->attr.freq) {
6554 if (atomic_inc_return(&nr_freq_events) == 1)
6555 tick_nohz_full_kick_all();
6556 }
6557 if (has_branch_stack(event))
6558 static_key_slow_inc(&perf_sched_events.key);
6559 if (is_cgroup_event(event))
6560 static_key_slow_inc(&perf_sched_events.key);
6561
6562 account_event_cpu(event, event->cpu);
6563}
6564
6446/* 6565/*
 6447 * Allocate and initialize an event structure 6566 * Allocate and initialize an event structure
6448 */ 6567 */
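
account_event() and account_event_cpu() gather the bookkeeping that used to be scattered across perf_event_alloc() and the syscall path, so every allocation site bumps the same counters in the same way. A simplified userspace model of that consolidation (plain C11 atomics stand in for the kernel's atomic_t and static keys):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct event_attr { bool mmap, comm, task; };
struct event { struct event_attr attr; bool is_child; };

static atomic_int nr_mmap_events, nr_comm_events, nr_task_events;

/* One place that knows which global counters an event contributes to. */
static void account_event(const struct event *e)
{
        if (e->is_child)        /* inherited events piggyback on the parent */
                return;
        if (e->attr.mmap)
                atomic_fetch_add(&nr_mmap_events, 1);
        if (e->attr.comm)
                atomic_fetch_add(&nr_comm_events, 1);
        if (e->attr.task)
                atomic_fetch_add(&nr_task_events, 1);
}

int main(void)
{
        struct event e = { .attr = { .mmap = true, .comm = true } };

        account_event(&e);
        printf("mmap=%d comm=%d task=%d\n",
               atomic_load(&nr_mmap_events),
               atomic_load(&nr_comm_events),
               atomic_load(&nr_task_events));
        return 0;
}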
@@ -6457,7 +6576,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6457 struct pmu *pmu; 6576 struct pmu *pmu;
6458 struct perf_event *event; 6577 struct perf_event *event;
6459 struct hw_perf_event *hwc; 6578 struct hw_perf_event *hwc;
6460 long err; 6579 long err = -EINVAL;
6461 6580
6462 if ((unsigned)cpu >= nr_cpu_ids) { 6581 if ((unsigned)cpu >= nr_cpu_ids) {
6463 if (!task || cpu != -1) 6582 if (!task || cpu != -1)
@@ -6540,49 +6659,35 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6540 * we currently do not support PERF_FORMAT_GROUP on inherited events 6659 * we currently do not support PERF_FORMAT_GROUP on inherited events
6541 */ 6660 */
6542 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 6661 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
6543 goto done; 6662 goto err_ns;
6544 6663
6545 pmu = perf_init_event(event); 6664 pmu = perf_init_event(event);
6546
6547done:
6548 err = 0;
6549 if (!pmu) 6665 if (!pmu)
6550 err = -EINVAL; 6666 goto err_ns;
6551 else if (IS_ERR(pmu)) 6667 else if (IS_ERR(pmu)) {
6552 err = PTR_ERR(pmu); 6668 err = PTR_ERR(pmu);
6553 6669 goto err_ns;
6554 if (err) {
6555 if (event->ns)
6556 put_pid_ns(event->ns);
6557 kfree(event);
6558 return ERR_PTR(err);
6559 } 6670 }
6560 6671
6561 if (!event->parent) { 6672 if (!event->parent) {
6562 if (event->attach_state & PERF_ATTACH_TASK)
6563 static_key_slow_inc(&perf_sched_events.key);
6564 if (event->attr.mmap || event->attr.mmap_data)
6565 atomic_inc(&nr_mmap_events);
6566 if (event->attr.comm)
6567 atomic_inc(&nr_comm_events);
6568 if (event->attr.task)
6569 atomic_inc(&nr_task_events);
6570 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { 6673 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
6571 err = get_callchain_buffers(); 6674 err = get_callchain_buffers();
6572 if (err) { 6675 if (err)
6573 free_event(event); 6676 goto err_pmu;
6574 return ERR_PTR(err);
6575 }
6576 }
6577 if (has_branch_stack(event)) {
6578 static_key_slow_inc(&perf_sched_events.key);
6579 if (!(event->attach_state & PERF_ATTACH_TASK))
6580 atomic_inc(&per_cpu(perf_branch_stack_events,
6581 event->cpu));
6582 } 6677 }
6583 } 6678 }
6584 6679
6585 return event; 6680 return event;
6681
6682err_pmu:
6683 if (event->destroy)
6684 event->destroy(event);
6685err_ns:
6686 if (event->ns)
6687 put_pid_ns(event->ns);
6688 kfree(event);
6689
6690 return ERR_PTR(err);
6586} 6691}
6587 6692
6588static int perf_copy_attr(struct perf_event_attr __user *uattr, 6693static int perf_copy_attr(struct perf_event_attr __user *uattr,
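
The error handling above is reshaped into the usual goto-unwind ladder: each failure jumps to the label that releases only what has already been set up (err_pmu, then err_ns). A small standalone illustration of the idiom, with made-up resource names:

#include <stdio.h>
#include <stdlib.h>

static int setup_thing(void)
{
        char *a, *b;
        int err = -1;

        a = malloc(64);
        if (!a)
                goto err_out;

        b = malloc(64);
        if (!b)
                goto err_a;             /* undo only what already succeeded */

        printf("both resources acquired\n");
        free(b);
        free(a);
        return 0;

err_a:
        free(a);
err_out:
        return err;
}

int main(void)
{
        return setup_thing() ? 1 : 0;
}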
@@ -6864,17 +6969,14 @@ SYSCALL_DEFINE5(perf_event_open,
6864 6969
6865 if (flags & PERF_FLAG_PID_CGROUP) { 6970 if (flags & PERF_FLAG_PID_CGROUP) {
6866 err = perf_cgroup_connect(pid, event, &attr, group_leader); 6971 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6867 if (err) 6972 if (err) {
6868 goto err_alloc; 6973 __free_event(event);
6869 /* 6974 goto err_task;
6870 * one more event: 6975 }
6871 * - that has cgroup constraint on event->cpu
6872 * - that may need work on context switch
6873 */
6874 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6875 static_key_slow_inc(&perf_sched_events.key);
6876 } 6976 }
6877 6977
6978 account_event(event);
6979
6878 /* 6980 /*
6879 * Special case software events and allow them to be part of 6981 * Special case software events and allow them to be part of
6880 * any hardware group. 6982 * any hardware group.
@@ -7070,6 +7172,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7070 goto err; 7172 goto err;
7071 } 7173 }
7072 7174
7175 account_event(event);
7176
7073 ctx = find_get_context(event->pmu, task, cpu); 7177 ctx = find_get_context(event->pmu, task, cpu);
7074 if (IS_ERR(ctx)) { 7178 if (IS_ERR(ctx)) {
7075 err = PTR_ERR(ctx); 7179 err = PTR_ERR(ctx);
@@ -7106,6 +7210,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7106 list_for_each_entry_safe(event, tmp, &src_ctx->event_list, 7210 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
7107 event_entry) { 7211 event_entry) {
7108 perf_remove_from_context(event); 7212 perf_remove_from_context(event);
7213 unaccount_event_cpu(event, src_cpu);
7109 put_ctx(src_ctx); 7214 put_ctx(src_ctx);
7110 list_add(&event->event_entry, &events); 7215 list_add(&event->event_entry, &events);
7111 } 7216 }
@@ -7118,6 +7223,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7118 list_del(&event->event_entry); 7223 list_del(&event->event_entry);
7119 if (event->state >= PERF_EVENT_STATE_OFF) 7224 if (event->state >= PERF_EVENT_STATE_OFF)
7120 event->state = PERF_EVENT_STATE_INACTIVE; 7225 event->state = PERF_EVENT_STATE_INACTIVE;
7226 account_event_cpu(event, dst_cpu);
7121 perf_install_in_context(dst_ctx, event, dst_cpu); 7227 perf_install_in_context(dst_ctx, event, dst_cpu);
7122 get_ctx(dst_ctx); 7228 get_ctx(dst_ctx);
7123 } 7229 }
@@ -7798,7 +7904,8 @@ unlock:
7798device_initcall(perf_event_sysfs_init); 7904device_initcall(perf_event_sysfs_init);
7799 7905
7800#ifdef CONFIG_CGROUP_PERF 7906#ifdef CONFIG_CGROUP_PERF
7801static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) 7907static struct cgroup_subsys_state *
7908perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7802{ 7909{
7803 struct perf_cgroup *jc; 7910 struct perf_cgroup *jc;
7804 7911
@@ -7815,11 +7922,10 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7815 return &jc->css; 7922 return &jc->css;
7816} 7923}
7817 7924
7818static void perf_cgroup_css_free(struct cgroup *cont) 7925static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
7819{ 7926{
7820 struct perf_cgroup *jc; 7927 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
7821 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7928
7822 struct perf_cgroup, css);
7823 free_percpu(jc->info); 7929 free_percpu(jc->info);
7824 kfree(jc); 7930 kfree(jc);
7825} 7931}
@@ -7831,15 +7937,17 @@ static int __perf_cgroup_move(void *info)
7831 return 0; 7937 return 0;
7832} 7938}
7833 7939
7834static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 7940static void perf_cgroup_attach(struct cgroup_subsys_state *css,
7941 struct cgroup_taskset *tset)
7835{ 7942{
7836 struct task_struct *task; 7943 struct task_struct *task;
7837 7944
7838 cgroup_taskset_for_each(task, cgrp, tset) 7945 cgroup_taskset_for_each(task, css, tset)
7839 task_function_call(task, __perf_cgroup_move, task); 7946 task_function_call(task, __perf_cgroup_move, task);
7840} 7947}
7841 7948
7842static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 7949static void perf_cgroup_exit(struct cgroup_subsys_state *css,
7950 struct cgroup_subsys_state *old_css,
7843 struct task_struct *task) 7951 struct task_struct *task)
7844{ 7952{
7845 /* 7953 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index e23bb19e2a3e..bf46287c91a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1177,7 +1177,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1177 * don't allow the creation of threads. 1177 * don't allow the creation of threads.
1178 */ 1178 */
1179 if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && 1179 if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) &&
1180 (task_active_pid_ns(current) != current->nsproxy->pid_ns)) 1180 (task_active_pid_ns(current) !=
1181 current->nsproxy->pid_ns_for_children))
1181 return ERR_PTR(-EINVAL); 1182 return ERR_PTR(-EINVAL);
1182 1183
1183 retval = security_task_create(clone_flags); 1184 retval = security_task_create(clone_flags);
@@ -1351,7 +1352,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1351 1352
1352 if (pid != &init_struct_pid) { 1353 if (pid != &init_struct_pid) {
1353 retval = -ENOMEM; 1354 retval = -ENOMEM;
1354 pid = alloc_pid(p->nsproxy->pid_ns); 1355 pid = alloc_pid(p->nsproxy->pid_ns_for_children);
1355 if (!pid) 1356 if (!pid)
1356 goto bad_fork_cleanup_io; 1357 goto bad_fork_cleanup_io;
1357 } 1358 }
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 6df614912b9d..3e97fb126e6b 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -15,6 +15,7 @@
15#include <linux/lockdep.h> 15#include <linux/lockdep.h>
16#include <linux/export.h> 16#include <linux/export.h>
17#include <linux/sysctl.h> 17#include <linux/sysctl.h>
18#include <linux/utsname.h>
18 19
19/* 20/*
20 * The number of tasks checked: 21 * The number of tasks checked:
@@ -99,10 +100,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
99 * Ok, the task did not get scheduled for more than 2 minutes, 100 * Ok, the task did not get scheduled for more than 2 minutes,
100 * complain: 101 * complain:
101 */ 102 */
102 printk(KERN_ERR "INFO: task %s:%d blocked for more than " 103 pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
103 "%ld seconds.\n", t->comm, t->pid, timeout); 104 t->comm, t->pid, timeout);
104 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" 105 pr_err(" %s %s %.*s\n",
105 " disables this message.\n"); 106 print_tainted(), init_utsname()->release,
107 (int)strcspn(init_utsname()->version, " "),
108 init_utsname()->version);
109 pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
110 " disables this message.\n");
106 sched_show_task(t); 111 sched_show_task(t);
107 debug_show_held_locks(t); 112 debug_show_held_locks(t);
108 113
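
The extra pr_err() line prints the taint flags, the kernel release, and only the first word of the version string, using strcspn() together with a "%.*s" precision. The same trick works verbatim in userspace against uname(2):

#include <stdio.h>
#include <string.h>
#include <sys/utsname.h>

int main(void)
{
        struct utsname u;

        if (uname(&u) != 0) {
                perror("uname");
                return 1;
        }
        /* Print the release and just the leading token of the version. */
        printf("%s %.*s\n", u.release,
               (int)strcspn(u.version, " "), u.version);
        return 0;
}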
diff --git a/kernel/lglock.c b/kernel/lglock.c
index 6535a667a5a7..86ae2aebf004 100644
--- a/kernel/lglock.c
+++ b/kernel/lglock.c
@@ -21,7 +21,7 @@ void lg_local_lock(struct lglock *lg)
21 arch_spinlock_t *lock; 21 arch_spinlock_t *lock;
22 22
23 preempt_disable(); 23 preempt_disable();
24 rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); 24 lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
25 lock = this_cpu_ptr(lg->lock); 25 lock = this_cpu_ptr(lg->lock);
26 arch_spin_lock(lock); 26 arch_spin_lock(lock);
27} 27}
@@ -31,7 +31,7 @@ void lg_local_unlock(struct lglock *lg)
31{ 31{
32 arch_spinlock_t *lock; 32 arch_spinlock_t *lock;
33 33
34 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); 34 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
35 lock = this_cpu_ptr(lg->lock); 35 lock = this_cpu_ptr(lg->lock);
36 arch_spin_unlock(lock); 36 arch_spin_unlock(lock);
37 preempt_enable(); 37 preempt_enable();
@@ -43,7 +43,7 @@ void lg_local_lock_cpu(struct lglock *lg, int cpu)
43 arch_spinlock_t *lock; 43 arch_spinlock_t *lock;
44 44
45 preempt_disable(); 45 preempt_disable();
46 rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); 46 lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
47 lock = per_cpu_ptr(lg->lock, cpu); 47 lock = per_cpu_ptr(lg->lock, cpu);
48 arch_spin_lock(lock); 48 arch_spin_lock(lock);
49} 49}
@@ -53,7 +53,7 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu)
53{ 53{
54 arch_spinlock_t *lock; 54 arch_spinlock_t *lock;
55 55
56 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); 56 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
57 lock = per_cpu_ptr(lg->lock, cpu); 57 lock = per_cpu_ptr(lg->lock, cpu);
58 arch_spin_unlock(lock); 58 arch_spin_unlock(lock);
59 preempt_enable(); 59 preempt_enable();
@@ -65,7 +65,7 @@ void lg_global_lock(struct lglock *lg)
65 int i; 65 int i;
66 66
67 preempt_disable(); 67 preempt_disable();
68 rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); 68 lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
69 for_each_possible_cpu(i) { 69 for_each_possible_cpu(i) {
70 arch_spinlock_t *lock; 70 arch_spinlock_t *lock;
71 lock = per_cpu_ptr(lg->lock, i); 71 lock = per_cpu_ptr(lg->lock, i);
@@ -78,7 +78,7 @@ void lg_global_unlock(struct lglock *lg)
78{ 78{
79 int i; 79 int i;
80 80
81 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); 81 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
82 for_each_possible_cpu(i) { 82 for_each_possible_cpu(i) {
83 arch_spinlock_t *lock; 83 arch_spinlock_t *lock;
84 lock = per_cpu_ptr(lg->lock, i); 84 lock = per_cpu_ptr(lg->lock, i);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a52ee7bb830d..6d647aedffea 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -209,11 +209,13 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
209 */ 209 */
210static inline int mutex_can_spin_on_owner(struct mutex *lock) 210static inline int mutex_can_spin_on_owner(struct mutex *lock)
211{ 211{
212 struct task_struct *owner;
212 int retval = 1; 213 int retval = 1;
213 214
214 rcu_read_lock(); 215 rcu_read_lock();
215 if (lock->owner) 216 owner = ACCESS_ONCE(lock->owner);
216 retval = lock->owner->on_cpu; 217 if (owner)
218 retval = owner->on_cpu;
217 rcu_read_unlock(); 219 rcu_read_unlock();
218 /* 220 /*
219 * if lock->owner is not set, the mutex owner may have just acquired 221 * if lock->owner is not set, the mutex owner may have just acquired
@@ -461,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
461 * performed the optimistic spinning cannot be done. 463 * performed the optimistic spinning cannot be done.
462 */ 464 */
463 if (ACCESS_ONCE(ww->ctx)) 465 if (ACCESS_ONCE(ww->ctx))
464 break; 466 goto slowpath;
465 } 467 }
466 468
467 /* 469 /*
@@ -472,7 +474,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
472 owner = ACCESS_ONCE(lock->owner); 474 owner = ACCESS_ONCE(lock->owner);
473 if (owner && !mutex_spin_on_owner(lock, owner)) { 475 if (owner && !mutex_spin_on_owner(lock, owner)) {
474 mspin_unlock(MLOCK(lock), &node); 476 mspin_unlock(MLOCK(lock), &node);
475 break; 477 goto slowpath;
476 } 478 }
477 479
478 if ((atomic_read(&lock->count) == 1) && 480 if ((atomic_read(&lock->count) == 1) &&
@@ -499,7 +501,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
499 * the owner complete. 501 * the owner complete.
500 */ 502 */
501 if (!owner && (need_resched() || rt_task(task))) 503 if (!owner && (need_resched() || rt_task(task)))
502 break; 504 goto slowpath;
503 505
504 /* 506 /*
505 * The cpu_relax() call is a compiler barrier which forces 507 * The cpu_relax() call is a compiler barrier which forces
@@ -513,6 +515,10 @@ slowpath:
513#endif 515#endif
514 spin_lock_mutex(&lock->wait_lock, flags); 516 spin_lock_mutex(&lock->wait_lock, flags);
515 517
518 /* once more, can we acquire the lock? */
519 if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1))
520 goto skip_wait;
521
516 debug_mutex_lock_common(lock, &waiter); 522 debug_mutex_lock_common(lock, &waiter);
517 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); 523 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
518 524
@@ -520,9 +526,6 @@ slowpath:
520 list_add_tail(&waiter.list, &lock->wait_list); 526 list_add_tail(&waiter.list, &lock->wait_list);
521 waiter.task = task; 527 waiter.task = task;
522 528
523 if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))
524 goto done;
525
526 lock_contended(&lock->dep_map, ip); 529 lock_contended(&lock->dep_map, ip);
527 530
528 for (;;) { 531 for (;;) {
@@ -536,7 +539,7 @@ slowpath:
536 * other waiters: 539 * other waiters:
537 */ 540 */
538 if (MUTEX_SHOW_NO_WAITER(lock) && 541 if (MUTEX_SHOW_NO_WAITER(lock) &&
539 (atomic_xchg(&lock->count, -1) == 1)) 542 (atomic_xchg(&lock->count, -1) == 1))
540 break; 543 break;
541 544
542 /* 545 /*
@@ -561,24 +564,25 @@ slowpath:
561 schedule_preempt_disabled(); 564 schedule_preempt_disabled();
562 spin_lock_mutex(&lock->wait_lock, flags); 565 spin_lock_mutex(&lock->wait_lock, flags);
563 } 566 }
567 mutex_remove_waiter(lock, &waiter, current_thread_info());
568 /* set it to 0 if there are no waiters left: */
569 if (likely(list_empty(&lock->wait_list)))
570 atomic_set(&lock->count, 0);
571 debug_mutex_free_waiter(&waiter);
564 572
565done: 573skip_wait:
574 /* got the lock - cleanup and rejoice! */
566 lock_acquired(&lock->dep_map, ip); 575 lock_acquired(&lock->dep_map, ip);
567 /* got the lock - rejoice! */
568 mutex_remove_waiter(lock, &waiter, current_thread_info());
569 mutex_set_owner(lock); 576 mutex_set_owner(lock);
570 577
571 if (!__builtin_constant_p(ww_ctx == NULL)) { 578 if (!__builtin_constant_p(ww_ctx == NULL)) {
572 struct ww_mutex *ww = container_of(lock, 579 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
573 struct ww_mutex,
574 base);
575 struct mutex_waiter *cur; 580 struct mutex_waiter *cur;
576 581
577 /* 582 /*
578 * This branch gets optimized out for the common case, 583 * This branch gets optimized out for the common case,
579 * and is only important for ww_mutex_lock. 584 * and is only important for ww_mutex_lock.
580 */ 585 */
581
582 ww_mutex_lock_acquired(ww, ww_ctx); 586 ww_mutex_lock_acquired(ww, ww_ctx);
583 ww->ctx = ww_ctx; 587 ww->ctx = ww_ctx;
584 588
@@ -592,15 +596,8 @@ done:
592 } 596 }
593 } 597 }
594 598
595 /* set it to 0 if there are no waiters left: */
596 if (likely(list_empty(&lock->wait_list)))
597 atomic_set(&lock->count, 0);
598
599 spin_unlock_mutex(&lock->wait_lock, flags); 599 spin_unlock_mutex(&lock->wait_lock, flags);
600
601 debug_mutex_free_waiter(&waiter);
602 preempt_enable(); 600 preempt_enable();
603
604 return 0; 601 return 0;
605 602
606err: 603err:
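
The mutex slowpath now retries the lock once more before building a waiter, and the waiter cleanup happens before the common skip_wait exit. A stripped-down userspace model of that "one last trylock before queueing" step, using C11 atomics with 1 = unlocked and 0 = locked (the kernel's -1 "has waiters" state and the wait list are omitted):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_mutex { atomic_int count; };   /* 1 = unlocked, 0 = locked */

static bool try_fast(struct toy_mutex *m)
{
        /* Analogous to atomic_xchg(&lock->count, 0) == 1 in the hunk. */
        return atomic_exchange(&m->count, 0) == 1;
}

static void lock_slowpath(struct toy_mutex *m)
{
        if (try_fast(m)) {
                printf("got the lock without queueing a waiter\n");
                return;
        }
        printf("lock busy: would add a waiter and sleep here\n");
}

int main(void)
{
        struct toy_mutex m = { ATOMIC_VAR_INIT(1) };

        lock_slowpath(&m);      /* first caller wins the retry */
        lock_slowpath(&m);      /* second caller falls through to waiting */
        return 0;
}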
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 364ceab15f0c..997cbb951a3b 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -29,15 +29,15 @@
29static struct kmem_cache *nsproxy_cachep; 29static struct kmem_cache *nsproxy_cachep;
30 30
31struct nsproxy init_nsproxy = { 31struct nsproxy init_nsproxy = {
32 .count = ATOMIC_INIT(1), 32 .count = ATOMIC_INIT(1),
33 .uts_ns = &init_uts_ns, 33 .uts_ns = &init_uts_ns,
34#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) 34#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
35 .ipc_ns = &init_ipc_ns, 35 .ipc_ns = &init_ipc_ns,
36#endif 36#endif
37 .mnt_ns = NULL, 37 .mnt_ns = NULL,
38 .pid_ns = &init_pid_ns, 38 .pid_ns_for_children = &init_pid_ns,
39#ifdef CONFIG_NET 39#ifdef CONFIG_NET
40 .net_ns = &init_net, 40 .net_ns = &init_net,
41#endif 41#endif
42}; 42};
43 43
@@ -85,9 +85,10 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
85 goto out_ipc; 85 goto out_ipc;
86 } 86 }
87 87
88 new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); 88 new_nsp->pid_ns_for_children =
89 if (IS_ERR(new_nsp->pid_ns)) { 89 copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children);
90 err = PTR_ERR(new_nsp->pid_ns); 90 if (IS_ERR(new_nsp->pid_ns_for_children)) {
91 err = PTR_ERR(new_nsp->pid_ns_for_children);
91 goto out_pid; 92 goto out_pid;
92 } 93 }
93 94
@@ -100,8 +101,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
100 return new_nsp; 101 return new_nsp;
101 102
102out_net: 103out_net:
103 if (new_nsp->pid_ns) 104 if (new_nsp->pid_ns_for_children)
104 put_pid_ns(new_nsp->pid_ns); 105 put_pid_ns(new_nsp->pid_ns_for_children);
105out_pid: 106out_pid:
106 if (new_nsp->ipc_ns) 107 if (new_nsp->ipc_ns)
107 put_ipc_ns(new_nsp->ipc_ns); 108 put_ipc_ns(new_nsp->ipc_ns);
@@ -174,8 +175,8 @@ void free_nsproxy(struct nsproxy *ns)
174 put_uts_ns(ns->uts_ns); 175 put_uts_ns(ns->uts_ns);
175 if (ns->ipc_ns) 176 if (ns->ipc_ns)
176 put_ipc_ns(ns->ipc_ns); 177 put_ipc_ns(ns->ipc_ns);
177 if (ns->pid_ns) 178 if (ns->pid_ns_for_children)
178 put_pid_ns(ns->pid_ns); 179 put_pid_ns(ns->pid_ns_for_children);
179 put_net(ns->net_ns); 180 put_net(ns->net_ns);
180 kmem_cache_free(nsproxy_cachep, ns); 181 kmem_cache_free(nsproxy_cachep, ns);
181} 182}
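
The rename to pid_ns_for_children makes the semantics explicit: after unshare(CLONE_NEWPID) the caller stays in its original PID namespace, and only subsequently forked children land in the new one. A privileged userspace sketch of that behaviour (requires CAP_SYS_ADMIN, error handling trimmed):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        if (unshare(CLONE_NEWPID) != 0) {   /* affects children only */
                perror("unshare");
                return 1;
        }
        printf("parent still sees pid %d\n", getpid());

        pid_t child = fork();
        if (child == 0) {
                /* First child of the new namespace becomes its pid 1. */
                printf("child sees pid %d\n", getpid());
                _exit(0);
        }
        waitpid(child, NULL, 0);
        return 0;
}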
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6917e8edb48e..601bb361c235 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -349,8 +349,8 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
349 if (ancestor != active) 349 if (ancestor != active)
350 return -EINVAL; 350 return -EINVAL;
351 351
352 put_pid_ns(nsproxy->pid_ns); 352 put_pid_ns(nsproxy->pid_ns_for_children);
353 nsproxy->pid_ns = get_pid_ns(new); 353 nsproxy->pid_ns_for_children = get_pid_ns(new);
354 return 0; 354 return 0;
355} 355}
356 356
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b26f5f1e773e..3085e62a80a5 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -39,7 +39,7 @@ static int resume_delay;
39static char resume_file[256] = CONFIG_PM_STD_PARTITION; 39static char resume_file[256] = CONFIG_PM_STD_PARTITION;
40dev_t swsusp_resume_device; 40dev_t swsusp_resume_device;
41sector_t swsusp_resume_block; 41sector_t swsusp_resume_block;
42int in_suspend __nosavedata; 42__visible int in_suspend __nosavedata;
43 43
44enum { 44enum {
45 HIBERNATION_INVALID, 45 HIBERNATION_INVALID,
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ece04223bb1e..62ee437b5c7e 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -210,6 +210,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
210 goto Platform_wake; 210 goto Platform_wake;
211 } 211 }
212 212
213 ftrace_stop();
213 error = disable_nonboot_cpus(); 214 error = disable_nonboot_cpus();
214 if (error || suspend_test(TEST_CPUS)) 215 if (error || suspend_test(TEST_CPUS))
215 goto Enable_cpus; 216 goto Enable_cpus;
@@ -232,6 +233,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
232 233
233 Enable_cpus: 234 Enable_cpus:
234 enable_nonboot_cpus(); 235 enable_nonboot_cpus();
236 ftrace_start();
235 237
236 Platform_wake: 238 Platform_wake:
237 if (need_suspend_ops(state) && suspend_ops->wake) 239 if (need_suspend_ops(state) && suspend_ops->wake)
@@ -265,7 +267,6 @@ int suspend_devices_and_enter(suspend_state_t state)
265 goto Close; 267 goto Close;
266 } 268 }
267 suspend_console(); 269 suspend_console();
268 ftrace_stop();
269 suspend_test_start(); 270 suspend_test_start();
270 error = dpm_suspend_start(PMSG_SUSPEND); 271 error = dpm_suspend_start(PMSG_SUSPEND);
271 if (error) { 272 if (error) {
@@ -285,7 +286,6 @@ int suspend_devices_and_enter(suspend_state_t state)
285 suspend_test_start(); 286 suspend_test_start();
286 dpm_resume_end(PMSG_RESUME); 287 dpm_resume_end(PMSG_RESUME);
287 suspend_test_finish("resume devices"); 288 suspend_test_finish("resume devices");
288 ftrace_start();
289 resume_console(); 289 resume_console();
290 Close: 290 Close:
291 if (need_suspend_ops(state) && suspend_ops->end) 291 if (need_suspend_ops(state) && suspend_ops->end)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 5b5a7080e2a5..b4e8500afdb3 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2226,6 +2226,13 @@ void register_console(struct console *newcon)
2226 struct console *bcon = NULL; 2226 struct console *bcon = NULL;
2227 struct console_cmdline *c; 2227 struct console_cmdline *c;
2228 2228
2229 if (console_drivers)
2230 for_each_console(bcon)
2231 if (WARN(bcon == newcon,
2232 "console '%s%d' already registered\n",
2233 bcon->name, bcon->index))
2234 return;
2235
2229 /* 2236 /*
2230 * before we register a new CON_BOOT console, make sure we don't 2237 * before we register a new CON_BOOT console, make sure we don't
2231 * already have a valid console 2238 * already have a valid console
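
register_console() now walks the list of already-registered consoles first and refuses one that is already on it. The same guard in miniature, over a plain singly linked list (toy types, not the kernel's struct console):

#include <stdbool.h>
#include <stdio.h>

struct toy_console {
        const char *name;
        int index;
        struct toy_console *next;
};

static struct toy_console *console_drivers;

static bool register_toy_console(struct toy_console *newcon)
{
        for (struct toy_console *c = console_drivers; c; c = c->next) {
                if (c == newcon) {
                        fprintf(stderr, "console '%s%d' already registered\n",
                                c->name, c->index);
                        return false;
                }
        }
        newcon->next = console_drivers;
        console_drivers = newcon;
        return true;
}

int main(void)
{
        struct toy_console ttyS0 = { "ttyS", 0, NULL };

        register_toy_console(&ttyS0);
        register_toy_console(&ttyS0);   /* second attempt is rejected */
        return 0;
}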
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 7f8e7590e3e5..77131966c4ad 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -67,12 +67,15 @@
67 67
68extern struct debug_obj_descr rcuhead_debug_descr; 68extern struct debug_obj_descr rcuhead_debug_descr;
69 69
70static inline void debug_rcu_head_queue(struct rcu_head *head) 70static inline int debug_rcu_head_queue(struct rcu_head *head)
71{ 71{
72 debug_object_activate(head, &rcuhead_debug_descr); 72 int r1;
73
74 r1 = debug_object_activate(head, &rcuhead_debug_descr);
73 debug_object_active_state(head, &rcuhead_debug_descr, 75 debug_object_active_state(head, &rcuhead_debug_descr,
74 STATE_RCU_HEAD_READY, 76 STATE_RCU_HEAD_READY,
75 STATE_RCU_HEAD_QUEUED); 77 STATE_RCU_HEAD_QUEUED);
78 return r1;
76} 79}
77 80
78static inline void debug_rcu_head_unqueue(struct rcu_head *head) 81static inline void debug_rcu_head_unqueue(struct rcu_head *head)
@@ -83,8 +86,9 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
83 debug_object_deactivate(head, &rcuhead_debug_descr); 86 debug_object_deactivate(head, &rcuhead_debug_descr);
84} 87}
85#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 88#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
86static inline void debug_rcu_head_queue(struct rcu_head *head) 89static inline int debug_rcu_head_queue(struct rcu_head *head)
87{ 90{
91 return 0;
88} 92}
89 93
90static inline void debug_rcu_head_unqueue(struct rcu_head *head) 94static inline void debug_rcu_head_unqueue(struct rcu_head *head)
@@ -94,7 +98,7 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
94 98
95extern void kfree(const void *); 99extern void kfree(const void *);
96 100
97static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) 101static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
98{ 102{
99 unsigned long offset = (unsigned long)head->func; 103 unsigned long offset = (unsigned long)head->func;
100 104
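
debug_rcu_head_queue() now propagates the return value of debug_object_activate(), so a caller can notice that the same rcu_head has been queued twice (the behaviour the new rcutorture object_debug parameter exercises). The shape of that check, reduced to a toy flag:

#include <stdbool.h>
#include <stdio.h>

struct toy_head { bool queued; };

/* Returns nonzero if the head was already queued, i.e. a double call. */
static int toy_head_queue(struct toy_head *h)
{
        int was_queued = h->queued;

        h->queued = true;
        return was_queued;
}

int main(void)
{
        struct toy_head h = { false };

        printf("first queue: %d\n", toy_head_queue(&h));   /* 0: fine */
        printf("second queue: %d\n", toy_head_queue(&h));  /* 1: double call */
        return 0;
}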
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index cce6ba8bbace..33eb4620aa17 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -212,43 +212,6 @@ static inline void debug_rcu_head_free(struct rcu_head *head)
212} 212}
213 213
214/* 214/*
215 * fixup_init is called when:
216 * - an active object is initialized
217 */
218static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
219{
220 struct rcu_head *head = addr;
221
222 switch (state) {
223 case ODEBUG_STATE_ACTIVE:
224 /*
225 * Ensure that queued callbacks are all executed.
226 * If we detect that we are nested in a RCU read-side critical
227 * section, we should simply fail, otherwise we would deadlock.
228 * In !PREEMPT configurations, there is no way to tell if we are
229 * in a RCU read-side critical section or not, so we never
230 * attempt any fixup and just print a warning.
231 */
232#ifndef CONFIG_PREEMPT
233 WARN_ON_ONCE(1);
234 return 0;
235#endif
236 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
237 irqs_disabled()) {
238 WARN_ON_ONCE(1);
239 return 0;
240 }
241 rcu_barrier();
242 rcu_barrier_sched();
243 rcu_barrier_bh();
244 debug_object_init(head, &rcuhead_debug_descr);
245 return 1;
246 default:
247 return 0;
248 }
249}
250
251/*
252 * fixup_activate is called when: 215 * fixup_activate is called when:
253 * - an active object is activated 216 * - an active object is activated
254 * - an unknown object is activated (might be a statically initialized object) 217 * - an unknown object is activated (might be a statically initialized object)
@@ -268,69 +231,8 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
268 debug_object_init(head, &rcuhead_debug_descr); 231 debug_object_init(head, &rcuhead_debug_descr);
269 debug_object_activate(head, &rcuhead_debug_descr); 232 debug_object_activate(head, &rcuhead_debug_descr);
270 return 0; 233 return 0;
271
272 case ODEBUG_STATE_ACTIVE:
273 /*
274 * Ensure that queued callbacks are all executed.
275 * If we detect that we are nested in a RCU read-side critical
276 * section, we should simply fail, otherwise we would deadlock.
277 * In !PREEMPT configurations, there is no way to tell if we are
278 * in a RCU read-side critical section or not, so we never
279 * attempt any fixup and just print a warning.
280 */
281#ifndef CONFIG_PREEMPT
282 WARN_ON_ONCE(1);
283 return 0;
284#endif
285 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
286 irqs_disabled()) {
287 WARN_ON_ONCE(1);
288 return 0;
289 }
290 rcu_barrier();
291 rcu_barrier_sched();
292 rcu_barrier_bh();
293 debug_object_activate(head, &rcuhead_debug_descr);
294 return 1;
295 default: 234 default:
296 return 0;
297 }
298}
299
300/*
301 * fixup_free is called when:
302 * - an active object is freed
303 */
304static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
305{
306 struct rcu_head *head = addr;
307
308 switch (state) {
309 case ODEBUG_STATE_ACTIVE:
310 /*
311 * Ensure that queued callbacks are all executed.
312 * If we detect that we are nested in a RCU read-side critical
313 * section, we should simply fail, otherwise we would deadlock.
314 * In !PREEMPT configurations, there is no way to tell if we are
315 * in a RCU read-side critical section or not, so we never
316 * attempt any fixup and just print a warning.
317 */
318#ifndef CONFIG_PREEMPT
319 WARN_ON_ONCE(1);
320 return 0;
321#endif
322 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
323 irqs_disabled()) {
324 WARN_ON_ONCE(1);
325 return 0;
326 }
327 rcu_barrier();
328 rcu_barrier_sched();
329 rcu_barrier_bh();
330 debug_object_free(head, &rcuhead_debug_descr);
331 return 1; 235 return 1;
332 default:
333 return 0;
334 } 236 }
335} 237}
336 238
@@ -369,15 +271,13 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
369 271
370struct debug_obj_descr rcuhead_debug_descr = { 272struct debug_obj_descr rcuhead_debug_descr = {
371 .name = "rcu_head", 273 .name = "rcu_head",
372 .fixup_init = rcuhead_fixup_init,
373 .fixup_activate = rcuhead_fixup_activate, 274 .fixup_activate = rcuhead_fixup_activate,
374 .fixup_free = rcuhead_fixup_free,
375}; 275};
376EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 276EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
377#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 277#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
378 278
379#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) 279#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
380void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp, 280void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
381 unsigned long secs, 281 unsigned long secs,
382 unsigned long c_old, unsigned long c) 282 unsigned long c_old, unsigned long c)
383{ 283{
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index aa344111de3e..9ed6075dc562 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -264,7 +264,7 @@ void rcu_check_callbacks(int cpu, int user)
264 */ 264 */
265static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) 265static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
266{ 266{
267 char *rn = NULL; 267 const char *rn = NULL;
268 struct rcu_head *next, *list; 268 struct rcu_head *next, *list;
269 unsigned long flags; 269 unsigned long flags;
270 RCU_TRACE(int cb_count = 0); 270 RCU_TRACE(int cb_count = 0);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 0cd385acccfa..280d06cae352 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -36,7 +36,7 @@ struct rcu_ctrlblk {
36 RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ 36 RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
37 RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ 37 RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
38 RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ 38 RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
39 RCU_TRACE(char *name); /* Name of RCU type. */ 39 RCU_TRACE(const char *name); /* Name of RCU type. */
40}; 40};
41 41
42/* Definition for rcupdate control block. */ 42/* Definition for rcupdate control block. */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index f4871e52c546..be63101c6175 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -52,72 +52,78 @@
52MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); 53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
54 54
55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 55static int fqs_duration;
56static int nfakewriters = 4; /* # fake writer threads */
57static int stat_interval = 60; /* Interval between stats, in seconds. */
58 /* Zero means "only at end of test". */
59static bool verbose; /* Print more debug info. */
60static bool test_no_idle_hz = true;
61 /* Test RCU support for tickless idle CPUs. */
62static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
63static int stutter = 5; /* Start/stop testing interval (in sec) */
64static int irqreader = 1; /* RCU readers from irq (timers). */
65static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
66static int fqs_holdoff; /* Hold time within burst (us). */
67static int fqs_stutter = 3; /* Wait time between bursts (s). */
68static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */
69static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
70static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
71static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
72static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */
73static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */
74static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
75static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
76static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
77static char *torture_type = "rcu"; /* What RCU implementation to torture. */
78
79module_param(nreaders, int, 0444);
80MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
81module_param(nfakewriters, int, 0444);
82MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
83module_param(stat_interval, int, 0644);
84MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
85module_param(verbose, bool, 0444);
86MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
87module_param(test_no_idle_hz, bool, 0444);
88MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
89module_param(shuffle_interval, int, 0444);
90MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
91module_param(stutter, int, 0444);
92MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
93module_param(irqreader, int, 0444);
94MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
95module_param(fqs_duration, int, 0444); 56module_param(fqs_duration, int, 0444);
96MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); 57MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable");
58static int fqs_holdoff;
97module_param(fqs_holdoff, int, 0444); 59module_param(fqs_holdoff, int, 0444);
98MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 60MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
61static int fqs_stutter = 3;
99module_param(fqs_stutter, int, 0444); 62module_param(fqs_stutter, int, 0444);
100MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 63MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
64static bool gp_exp;
65module_param(gp_exp, bool, 0444);
66MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives");
67static bool gp_normal;
68module_param(gp_normal, bool, 0444);
69MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives");
70static int irqreader = 1;
71module_param(irqreader, int, 0444);
72MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
73static int n_barrier_cbs;
101module_param(n_barrier_cbs, int, 0444); 74module_param(n_barrier_cbs, int, 0444);
102MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); 75MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
103module_param(onoff_interval, int, 0444); 76static int nfakewriters = 4;
104MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); 77module_param(nfakewriters, int, 0444);
78MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
79static int nreaders = -1;
80module_param(nreaders, int, 0444);
81MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
82static int object_debug;
83module_param(object_debug, int, 0444);
84MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing");
85static int onoff_holdoff;
105module_param(onoff_holdoff, int, 0444); 86module_param(onoff_holdoff, int, 0444);
106MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); 87MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)");
88static int onoff_interval;
89module_param(onoff_interval, int, 0444);
90MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
91static int shuffle_interval = 3;
92module_param(shuffle_interval, int, 0444);
93MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
94static int shutdown_secs;
107module_param(shutdown_secs, int, 0444); 95module_param(shutdown_secs, int, 0444);
108MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); 96MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable.");
97static int stall_cpu;
109module_param(stall_cpu, int, 0444); 98module_param(stall_cpu, int, 0444);
110MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); 99MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable.");
100static int stall_cpu_holdoff = 10;
111module_param(stall_cpu_holdoff, int, 0444); 101module_param(stall_cpu_holdoff, int, 0444);
112MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); 102MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s).");
103static int stat_interval = 60;
104module_param(stat_interval, int, 0644);
105MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
106static int stutter = 5;
107module_param(stutter, int, 0444);
108MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
109static int test_boost = 1;
113module_param(test_boost, int, 0444); 110module_param(test_boost, int, 0444);
114MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 111MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
115module_param(test_boost_interval, int, 0444); 112static int test_boost_duration = 4;
116MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
117module_param(test_boost_duration, int, 0444); 113module_param(test_boost_duration, int, 0444);
118MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); 114MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
115static int test_boost_interval = 7;
116module_param(test_boost_interval, int, 0444);
117MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
118static bool test_no_idle_hz = true;
119module_param(test_no_idle_hz, bool, 0444);
120MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
121static char *torture_type = "rcu";
119module_param(torture_type, charp, 0444); 122module_param(torture_type, charp, 0444);
120MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 123MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
124static bool verbose;
125module_param(verbose, bool, 0444);
126MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
121 127
122#define TORTURE_FLAG "-torture:" 128#define TORTURE_FLAG "-torture:"
123#define PRINTK_STRING(s) \ 129#define PRINTK_STRING(s) \
@@ -267,7 +273,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
267 * Absorb kthreads into a kernel function that won't return, so that 273 * Absorb kthreads into a kernel function that won't return, so that
268 * they won't ever access module text or data again. 274 * they won't ever access module text or data again.
269 */ 275 */
270static void rcutorture_shutdown_absorb(char *title) 276static void rcutorture_shutdown_absorb(const char *title)
271{ 277{
272 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { 278 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
273 pr_notice( 279 pr_notice(
@@ -337,7 +343,7 @@ rcu_random(struct rcu_random_state *rrsp)
337} 343}
338 344
339static void 345static void
340rcu_stutter_wait(char *title) 346rcu_stutter_wait(const char *title)
341{ 347{
342 while (stutter_pause_test || !rcutorture_runnable) { 348 while (stutter_pause_test || !rcutorture_runnable) {
343 if (rcutorture_runnable) 349 if (rcutorture_runnable)
@@ -360,13 +366,14 @@ struct rcu_torture_ops {
360 int (*completed)(void); 366 int (*completed)(void);
361 void (*deferred_free)(struct rcu_torture *p); 367 void (*deferred_free)(struct rcu_torture *p);
362 void (*sync)(void); 368 void (*sync)(void);
369 void (*exp_sync)(void);
363 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 370 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
364 void (*cb_barrier)(void); 371 void (*cb_barrier)(void);
365 void (*fqs)(void); 372 void (*fqs)(void);
366 int (*stats)(char *page); 373 int (*stats)(char *page);
367 int irq_capable; 374 int irq_capable;
368 int can_boost; 375 int can_boost;
369 char *name; 376 const char *name;
370}; 377};
371 378
372static struct rcu_torture_ops *cur_ops; 379static struct rcu_torture_ops *cur_ops;
@@ -443,81 +450,27 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
443 call_rcu(&p->rtort_rcu, rcu_torture_cb); 450 call_rcu(&p->rtort_rcu, rcu_torture_cb);
444} 451}
445 452
446static struct rcu_torture_ops rcu_ops = {
447 .init = NULL,
448 .readlock = rcu_torture_read_lock,
449 .read_delay = rcu_read_delay,
450 .readunlock = rcu_torture_read_unlock,
451 .completed = rcu_torture_completed,
452 .deferred_free = rcu_torture_deferred_free,
453 .sync = synchronize_rcu,
454 .call = call_rcu,
455 .cb_barrier = rcu_barrier,
456 .fqs = rcu_force_quiescent_state,
457 .stats = NULL,
458 .irq_capable = 1,
459 .can_boost = rcu_can_boost(),
460 .name = "rcu"
461};
462
463static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
464{
465 int i;
466 struct rcu_torture *rp;
467 struct rcu_torture *rp1;
468
469 cur_ops->sync();
470 list_add(&p->rtort_free, &rcu_torture_removed);
471 list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
472 i = rp->rtort_pipe_count;
473 if (i > RCU_TORTURE_PIPE_LEN)
474 i = RCU_TORTURE_PIPE_LEN;
475 atomic_inc(&rcu_torture_wcount[i]);
476 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
477 rp->rtort_mbtest = 0;
478 list_del(&rp->rtort_free);
479 rcu_torture_free(rp);
480 }
481 }
482}
483
484static void rcu_sync_torture_init(void) 453static void rcu_sync_torture_init(void)
485{ 454{
486 INIT_LIST_HEAD(&rcu_torture_removed); 455 INIT_LIST_HEAD(&rcu_torture_removed);
487} 456}
488 457
489static struct rcu_torture_ops rcu_sync_ops = { 458static struct rcu_torture_ops rcu_ops = {
490 .init = rcu_sync_torture_init, 459 .init = rcu_sync_torture_init,
491 .readlock = rcu_torture_read_lock, 460 .readlock = rcu_torture_read_lock,
492 .read_delay = rcu_read_delay, 461 .read_delay = rcu_read_delay,
493 .readunlock = rcu_torture_read_unlock, 462 .readunlock = rcu_torture_read_unlock,
494 .completed = rcu_torture_completed, 463 .completed = rcu_torture_completed,
495 .deferred_free = rcu_sync_torture_deferred_free, 464 .deferred_free = rcu_torture_deferred_free,
496 .sync = synchronize_rcu, 465 .sync = synchronize_rcu,
497 .call = NULL, 466 .exp_sync = synchronize_rcu_expedited,
498 .cb_barrier = NULL, 467 .call = call_rcu,
499 .fqs = rcu_force_quiescent_state, 468 .cb_barrier = rcu_barrier,
500 .stats = NULL,
501 .irq_capable = 1,
502 .can_boost = rcu_can_boost(),
503 .name = "rcu_sync"
504};
505
506static struct rcu_torture_ops rcu_expedited_ops = {
507 .init = rcu_sync_torture_init,
508 .readlock = rcu_torture_read_lock,
509 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
510 .readunlock = rcu_torture_read_unlock,
511 .completed = rcu_no_completed,
512 .deferred_free = rcu_sync_torture_deferred_free,
513 .sync = synchronize_rcu_expedited,
514 .call = NULL,
515 .cb_barrier = NULL,
516 .fqs = rcu_force_quiescent_state, 469 .fqs = rcu_force_quiescent_state,
517 .stats = NULL, 470 .stats = NULL,
518 .irq_capable = 1, 471 .irq_capable = 1,
519 .can_boost = rcu_can_boost(), 472 .can_boost = rcu_can_boost(),
520 .name = "rcu_expedited" 473 .name = "rcu"
521}; 474};
522 475
523/* 476/*
@@ -546,13 +499,14 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
546} 499}
547 500
548static struct rcu_torture_ops rcu_bh_ops = { 501static struct rcu_torture_ops rcu_bh_ops = {
549 .init = NULL, 502 .init = rcu_sync_torture_init,
550 .readlock = rcu_bh_torture_read_lock, 503 .readlock = rcu_bh_torture_read_lock,
551 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 504 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
552 .readunlock = rcu_bh_torture_read_unlock, 505 .readunlock = rcu_bh_torture_read_unlock,
553 .completed = rcu_bh_torture_completed, 506 .completed = rcu_bh_torture_completed,
554 .deferred_free = rcu_bh_torture_deferred_free, 507 .deferred_free = rcu_bh_torture_deferred_free,
555 .sync = synchronize_rcu_bh, 508 .sync = synchronize_rcu_bh,
509 .exp_sync = synchronize_rcu_bh_expedited,
556 .call = call_rcu_bh, 510 .call = call_rcu_bh,
557 .cb_barrier = rcu_barrier_bh, 511 .cb_barrier = rcu_barrier_bh,
558 .fqs = rcu_bh_force_quiescent_state, 512 .fqs = rcu_bh_force_quiescent_state,
@@ -561,38 +515,6 @@ static struct rcu_torture_ops rcu_bh_ops = {
561 .name = "rcu_bh" 515 .name = "rcu_bh"
562}; 516};
563 517
564static struct rcu_torture_ops rcu_bh_sync_ops = {
565 .init = rcu_sync_torture_init,
566 .readlock = rcu_bh_torture_read_lock,
567 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
568 .readunlock = rcu_bh_torture_read_unlock,
569 .completed = rcu_bh_torture_completed,
570 .deferred_free = rcu_sync_torture_deferred_free,
571 .sync = synchronize_rcu_bh,
572 .call = NULL,
573 .cb_barrier = NULL,
574 .fqs = rcu_bh_force_quiescent_state,
575 .stats = NULL,
576 .irq_capable = 1,
577 .name = "rcu_bh_sync"
578};
579
580static struct rcu_torture_ops rcu_bh_expedited_ops = {
581 .init = rcu_sync_torture_init,
582 .readlock = rcu_bh_torture_read_lock,
583 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
584 .readunlock = rcu_bh_torture_read_unlock,
585 .completed = rcu_bh_torture_completed,
586 .deferred_free = rcu_sync_torture_deferred_free,
587 .sync = synchronize_rcu_bh_expedited,
588 .call = NULL,
589 .cb_barrier = NULL,
590 .fqs = rcu_bh_force_quiescent_state,
591 .stats = NULL,
592 .irq_capable = 1,
593 .name = "rcu_bh_expedited"
594};
595
596/* 518/*
597 * Definitions for srcu torture testing. 519 * Definitions for srcu torture testing.
598 */ 520 */
@@ -667,6 +589,11 @@ static int srcu_torture_stats(char *page)
667 return cnt; 589 return cnt;
668} 590}
669 591
592static void srcu_torture_synchronize_expedited(void)
593{
594 synchronize_srcu_expedited(&srcu_ctl);
595}
596
670static struct rcu_torture_ops srcu_ops = { 597static struct rcu_torture_ops srcu_ops = {
671 .init = rcu_sync_torture_init, 598 .init = rcu_sync_torture_init,
672 .readlock = srcu_torture_read_lock, 599 .readlock = srcu_torture_read_lock,
@@ -675,45 +602,13 @@ static struct rcu_torture_ops srcu_ops = {
675 .completed = srcu_torture_completed, 602 .completed = srcu_torture_completed,
676 .deferred_free = srcu_torture_deferred_free, 603 .deferred_free = srcu_torture_deferred_free,
677 .sync = srcu_torture_synchronize, 604 .sync = srcu_torture_synchronize,
605 .exp_sync = srcu_torture_synchronize_expedited,
678 .call = srcu_torture_call, 606 .call = srcu_torture_call,
679 .cb_barrier = srcu_torture_barrier, 607 .cb_barrier = srcu_torture_barrier,
680 .stats = srcu_torture_stats, 608 .stats = srcu_torture_stats,
681 .name = "srcu" 609 .name = "srcu"
682}; 610};
683 611
684static struct rcu_torture_ops srcu_sync_ops = {
685 .init = rcu_sync_torture_init,
686 .readlock = srcu_torture_read_lock,
687 .read_delay = srcu_read_delay,
688 .readunlock = srcu_torture_read_unlock,
689 .completed = srcu_torture_completed,
690 .deferred_free = rcu_sync_torture_deferred_free,
691 .sync = srcu_torture_synchronize,
692 .call = NULL,
693 .cb_barrier = NULL,
694 .stats = srcu_torture_stats,
695 .name = "srcu_sync"
696};
697
698static void srcu_torture_synchronize_expedited(void)
699{
700 synchronize_srcu_expedited(&srcu_ctl);
701}
702
703static struct rcu_torture_ops srcu_expedited_ops = {
704 .init = rcu_sync_torture_init,
705 .readlock = srcu_torture_read_lock,
706 .read_delay = srcu_read_delay,
707 .readunlock = srcu_torture_read_unlock,
708 .completed = srcu_torture_completed,
709 .deferred_free = rcu_sync_torture_deferred_free,
710 .sync = srcu_torture_synchronize_expedited,
711 .call = NULL,
712 .cb_barrier = NULL,
713 .stats = srcu_torture_stats,
714 .name = "srcu_expedited"
715};
716
717/* 612/*
718 * Definitions for sched torture testing. 613 * Definitions for sched torture testing.
719 */ 614 */
@@ -742,6 +637,8 @@ static struct rcu_torture_ops sched_ops = {
742 .completed = rcu_no_completed, 637 .completed = rcu_no_completed,
743 .deferred_free = rcu_sched_torture_deferred_free, 638 .deferred_free = rcu_sched_torture_deferred_free,
744 .sync = synchronize_sched, 639 .sync = synchronize_sched,
640 .exp_sync = synchronize_sched_expedited,
641 .call = call_rcu_sched,
745 .cb_barrier = rcu_barrier_sched, 642 .cb_barrier = rcu_barrier_sched,
746 .fqs = rcu_sched_force_quiescent_state, 643 .fqs = rcu_sched_force_quiescent_state,
747 .stats = NULL, 644 .stats = NULL,
@@ -749,35 +646,6 @@ static struct rcu_torture_ops sched_ops = {
749 .name = "sched" 646 .name = "sched"
750}; 647};
751 648
752static struct rcu_torture_ops sched_sync_ops = {
753 .init = rcu_sync_torture_init,
754 .readlock = sched_torture_read_lock,
755 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
756 .readunlock = sched_torture_read_unlock,
757 .completed = rcu_no_completed,
758 .deferred_free = rcu_sync_torture_deferred_free,
759 .sync = synchronize_sched,
760 .cb_barrier = NULL,
761 .fqs = rcu_sched_force_quiescent_state,
762 .stats = NULL,
763 .name = "sched_sync"
764};
765
766static struct rcu_torture_ops sched_expedited_ops = {
767 .init = rcu_sync_torture_init,
768 .readlock = sched_torture_read_lock,
769 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
770 .readunlock = sched_torture_read_unlock,
771 .completed = rcu_no_completed,
772 .deferred_free = rcu_sync_torture_deferred_free,
773 .sync = synchronize_sched_expedited,
774 .cb_barrier = NULL,
775 .fqs = rcu_sched_force_quiescent_state,
776 .stats = NULL,
777 .irq_capable = 1,
778 .name = "sched_expedited"
779};
780
781/* 649/*
782 * RCU torture priority-boost testing. Runs one real-time thread per 650 * RCU torture priority-boost testing. Runs one real-time thread per
783 * CPU for moderate bursts, repeatedly registering RCU callbacks and 651 * CPU for moderate bursts, repeatedly registering RCU callbacks and
@@ -927,9 +795,10 @@ rcu_torture_fqs(void *arg)
927static int 795static int
928rcu_torture_writer(void *arg) 796rcu_torture_writer(void *arg)
929{ 797{
798 bool exp;
930 int i; 799 int i;
931 long oldbatch = rcu_batches_completed();
932 struct rcu_torture *rp; 800 struct rcu_torture *rp;
801 struct rcu_torture *rp1;
933 struct rcu_torture *old_rp; 802 struct rcu_torture *old_rp;
934 static DEFINE_RCU_RANDOM(rand); 803 static DEFINE_RCU_RANDOM(rand);
935 804
@@ -954,10 +823,33 @@ rcu_torture_writer(void *arg)
954 i = RCU_TORTURE_PIPE_LEN; 823 i = RCU_TORTURE_PIPE_LEN;
955 atomic_inc(&rcu_torture_wcount[i]); 824 atomic_inc(&rcu_torture_wcount[i]);
956 old_rp->rtort_pipe_count++; 825 old_rp->rtort_pipe_count++;
957 cur_ops->deferred_free(old_rp); 826 if (gp_normal == gp_exp)
827 exp = !!(rcu_random(&rand) & 0x80);
828 else
829 exp = gp_exp;
830 if (!exp) {
831 cur_ops->deferred_free(old_rp);
832 } else {
833 cur_ops->exp_sync();
834 list_add(&old_rp->rtort_free,
835 &rcu_torture_removed);
836 list_for_each_entry_safe(rp, rp1,
837 &rcu_torture_removed,
838 rtort_free) {
839 i = rp->rtort_pipe_count;
840 if (i > RCU_TORTURE_PIPE_LEN)
841 i = RCU_TORTURE_PIPE_LEN;
842 atomic_inc(&rcu_torture_wcount[i]);
843 if (++rp->rtort_pipe_count >=
844 RCU_TORTURE_PIPE_LEN) {
845 rp->rtort_mbtest = 0;
846 list_del(&rp->rtort_free);
847 rcu_torture_free(rp);
848 }
849 }
850 }
958 } 851 }
959 rcutorture_record_progress(++rcu_torture_current_version); 852 rcutorture_record_progress(++rcu_torture_current_version);
960 oldbatch = cur_ops->completed();
961 rcu_stutter_wait("rcu_torture_writer"); 853 rcu_stutter_wait("rcu_torture_writer");
962 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 854 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
963 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 855 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
@@ -983,10 +875,18 @@ rcu_torture_fakewriter(void *arg)
983 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 875 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
984 udelay(rcu_random(&rand) & 0x3ff); 876 udelay(rcu_random(&rand) & 0x3ff);
985 if (cur_ops->cb_barrier != NULL && 877 if (cur_ops->cb_barrier != NULL &&
986 rcu_random(&rand) % (nfakewriters * 8) == 0) 878 rcu_random(&rand) % (nfakewriters * 8) == 0) {
987 cur_ops->cb_barrier(); 879 cur_ops->cb_barrier();
988 else 880 } else if (gp_normal == gp_exp) {
881 if (rcu_random(&rand) & 0x80)
882 cur_ops->sync();
883 else
884 cur_ops->exp_sync();
885 } else if (gp_normal) {
989 cur_ops->sync(); 886 cur_ops->sync();
887 } else {
888 cur_ops->exp_sync();
889 }
990 rcu_stutter_wait("rcu_torture_fakewriter"); 890 rcu_stutter_wait("rcu_torture_fakewriter");
991 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 891 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
992 892
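Both the writer and fake-writer kthreads now pick a grace-period flavor per iteration: when the gp_normal and gp_exp module parameters are equal (the default, both off), a random bit decides; otherwise the explicitly requested flavor is used. A compact model of that three-way decision, with rand() standing in for rcu_random() and strings standing in for the actual sync calls:

#include <stdio.h>
#include <stdlib.h>

/* Model of the gp_normal/gp_exp selection used by the torture kthreads. */
static int gp_normal;   /* module parameter: force normal grace periods */
static int gp_exp;      /* module parameter: force expedited grace periods */

static const char *pick_flavor(void)
{
        if (gp_normal == gp_exp)                /* neither (or both) forced */
                return (rand() & 0x80) ? "normal" : "expedited";
        return gp_normal ? "normal" : "expedited";
}

int main(void)
{
        int i;

        srand(1);
        for (i = 0; i < 5; i++)
                printf("iteration %d: %s grace period\n", i, pick_flavor());
        return 0;
}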
@@ -1364,7 +1264,7 @@ rcu_torture_stutter(void *arg)
1364} 1264}
1365 1265
1366static inline void 1266static inline void
1367rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) 1267rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
1368{ 1268{
1369 pr_alert("%s" TORTURE_FLAG 1269 pr_alert("%s" TORTURE_FLAG
1370 "--- %s: nreaders=%d nfakewriters=%d " 1270 "--- %s: nreaders=%d nfakewriters=%d "
@@ -1534,7 +1434,13 @@ rcu_torture_onoff(void *arg)
1534 torture_type, cpu); 1434 torture_type, cpu);
1535 starttime = jiffies; 1435 starttime = jiffies;
1536 n_online_attempts++; 1436 n_online_attempts++;
1537 if (cpu_up(cpu) == 0) { 1437 ret = cpu_up(cpu);
1438 if (ret) {
1439 if (verbose)
1440 pr_alert("%s" TORTURE_FLAG
1441 "rcu_torture_onoff task: online %d failed: errno %d\n",
1442 torture_type, cpu, ret);
1443 } else {
1538 if (verbose) 1444 if (verbose)
1539 pr_alert("%s" TORTURE_FLAG 1445 pr_alert("%s" TORTURE_FLAG
1540 "rcu_torture_onoff task: onlined %d\n", 1446 "rcu_torture_onoff task: onlined %d\n",
@@ -1934,6 +1840,62 @@ rcu_torture_cleanup(void)
1934 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); 1840 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1935} 1841}
1936 1842
1843#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
1844static void rcu_torture_leak_cb(struct rcu_head *rhp)
1845{
1846}
1847
1848static void rcu_torture_err_cb(struct rcu_head *rhp)
1849{
1850 /*
1851 * This -might- happen due to race conditions, but is unlikely.
1852 * The scenario that leads to this happening is that the
1853 * first of the pair of duplicate callbacks is queued,
1854 * someone else starts a grace period that includes that
1855 * callback, then the second of the pair must wait for the
1856 * next grace period. Unlikely, but can happen. If it
1857 * does happen, the debug-objects subsystem won't have splatted.
1858 */
1859 pr_alert("rcutorture: duplicated callback was invoked.\n");
1860}
1861#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
1862
1863/*
1864 * Verify that double-free causes debug-objects to complain, but only
1865 * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test
1866 * cannot be carried out.
1867 */
1868static void rcu_test_debug_objects(void)
1869{
1870#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
1871 struct rcu_head rh1;
1872 struct rcu_head rh2;
1873
1874 init_rcu_head_on_stack(&rh1);
1875 init_rcu_head_on_stack(&rh2);
1876 pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n");
1877
1878 /* Try to queue the rh2 pair of callbacks for the same grace period. */
1879 preempt_disable(); /* Prevent preemption from interrupting test. */
1880 rcu_read_lock(); /* Make it impossible to finish a grace period. */
1881 call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */
1882 local_irq_disable(); /* Make it harder to start a new grace period. */
1883 call_rcu(&rh2, rcu_torture_leak_cb);
1884 call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
1885 local_irq_enable();
1886 rcu_read_unlock();
1887 preempt_enable();
1888
1889 /* Wait for them all to get done so we can safely return. */
1890 rcu_barrier();
1891 pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n");
1892 destroy_rcu_head_on_stack(&rh1);
1893 destroy_rcu_head_on_stack(&rh2);
1894#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
1895 pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n");
1896#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
1897}
1898
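rcu_test_debug_objects() passes the same rcu_head to call_rcu() twice while holding rcu_read_lock() with irqs off, so both queuings land in the same grace period and CONFIG_DEBUG_OBJECTS_RCU_HEAD can flag the duplicate. The core of that check is simply "is this head already queued?"; the toy model below, with an invented queued[] table standing in for the debug-objects tracking state, shows the idea:

#include <stdio.h>

/* Toy model of duplicate-callback detection (not the debug-objects code). */
#define MAX_HEADS 16

struct rcu_head_model { int id; };

static struct rcu_head_model *queued[MAX_HEADS];

static int fake_call_rcu(struct rcu_head_model *head)
{
        int i;

        for (i = 0; i < MAX_HEADS; i++)
                if (queued[i] == head) {
                        printf("duplicate call_rcu() on head %d\n", head->id);
                        return -1;      /* kernel would splat and leak the callback */
                }
        for (i = 0; i < MAX_HEADS; i++)
                if (!queued[i]) {
                        queued[i] = head;
                        return 0;
                }
        return -1;
}

int main(void)
{
        struct rcu_head_model rh1 = { 1 }, rh2 = { 2 };

        fake_call_rcu(&rh1);
        fake_call_rcu(&rh2);
        fake_call_rcu(&rh2);            /* second queuing of rh2 is caught */
        return 0;
}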
1937static int __init 1899static int __init
1938rcu_torture_init(void) 1900rcu_torture_init(void)
1939{ 1901{
@@ -1941,11 +1903,9 @@ rcu_torture_init(void)
1941 int cpu; 1903 int cpu;
1942 int firsterr = 0; 1904 int firsterr = 0;
1943 int retval; 1905 int retval;
1944 static struct rcu_torture_ops *torture_ops[] = 1906 static struct rcu_torture_ops *torture_ops[] = {
1945 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1907 &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops,
1946 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1908 };
1947 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops,
1948 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1949 1909
1950 mutex_lock(&fullstop_mutex); 1910 mutex_lock(&fullstop_mutex);
1951 1911
@@ -2163,6 +2123,8 @@ rcu_torture_init(void)
2163 firsterr = retval; 2123 firsterr = retval;
2164 goto unwind; 2124 goto unwind;
2165 } 2125 }
2126 if (object_debug)
2127 rcu_test_debug_objects();
2166 rcutorture_record_test_transition(); 2128 rcutorture_record_test_transition();
2167 mutex_unlock(&fullstop_mutex); 2129 mutex_unlock(&fullstop_mutex);
2168 return 0; 2130 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 068de3a93606..32618b3fe4e6 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -53,18 +53,38 @@
53#include <linux/delay.h> 53#include <linux/delay.h>
54#include <linux/stop_machine.h> 54#include <linux/stop_machine.h>
55#include <linux/random.h> 55#include <linux/random.h>
56#include <linux/ftrace_event.h>
57#include <linux/suspend.h>
56 58
57#include "rcutree.h" 59#include "rcutree.h"
58#include <trace/events/rcu.h> 60#include <trace/events/rcu.h>
59 61
60#include "rcu.h" 62#include "rcu.h"
61 63
64/*
65 * Strings used in tracepoints need to be exported via the
66 * tracing system such that tools like perf and trace-cmd can
67 * translate the string address pointers to actual text.
68 */
69#define TPS(x) tracepoint_string(x)
70
62/* Data structures. */ 71/* Data structures. */
63 72
64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 73static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 74static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
66 75
67#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \ 76/*
77 * In order to export the rcu_state name to the tracing tools, it
78 * needs to be added in the __tracepoint_string section.
79 * This requires defining a separate variable tp_<sname>_varname
80 * that points to the string being used, and this will allow
81 * the tracing userspace tools to be able to decipher the string
82 * address to the matching string.
83 */
84#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
85static char sname##_varname[] = #sname; \
86static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \
87struct rcu_state sname##_state = { \
68 .level = { &sname##_state.node[0] }, \ 88 .level = { &sname##_state.node[0] }, \
69 .call = cr, \ 89 .call = cr, \
70 .fqs_state = RCU_GP_IDLE, \ 90 .fqs_state = RCU_GP_IDLE, \
@@ -75,16 +95,13 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
75 .orphan_donetail = &sname##_state.orphan_donelist, \ 95 .orphan_donetail = &sname##_state.orphan_donelist, \
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 96 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ 97 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
78 .name = #sname, \ 98 .name = sname##_varname, \
79 .abbr = sabbr, \ 99 .abbr = sabbr, \
80} 100}; \
81 101DEFINE_PER_CPU(struct rcu_data, sname##_data)
82struct rcu_state rcu_sched_state =
83 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
84DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
85 102
86struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 103RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
87DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 104RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
88 105
89static struct rcu_state *rcu_state; 106static struct rcu_state *rcu_state;
90LIST_HEAD(rcu_struct_flavors); 107LIST_HEAD(rcu_struct_flavors);
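tracepoint_string() works by keeping a pointer to each string literal in a dedicated __tracepoint_string ELF section, so perf and trace-cmd can translate the raw address recorded in the ring buffer back into text; RCU_STATE_INITIALIZER now does the same for the flavor names via tp_<sname>_varname. The trick can be reproduced in plain userspace C, since GCC and GNU ld emit __start_/__stop_ symbols for any section whose name is a valid C identifier; the section name "tpstr" and the TRACE_STR macro below are stand-ins, not kernel interfaces:

#include <stdio.h>

/* Collect string pointers in a custom section, as __tracepoint_string does. */
#define TRACE_STR(var, text) \
        static const char var##_str[] = text; \
        static const char *var __attribute__((used, section("tpstr"))) = var##_str

TRACE_STR(tp_start, "Start context switch");
TRACE_STR(tp_end,   "End context switch");

/* GNU ld provides these symbols for sections with C-identifier names. */
extern const char *__start_tpstr[], *__stop_tpstr[];

int main(void)
{
        const char **p;

        /* A tool reading the "tpstr" section can resolve address -> text. */
        for (p = __start_tpstr; p < __stop_tpstr; p++)
                printf("%p -> %s\n", (const void *)*p, *p);
        return 0;
}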
@@ -178,7 +195,7 @@ void rcu_sched_qs(int cpu)
178 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 195 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
179 196
180 if (rdp->passed_quiesce == 0) 197 if (rdp->passed_quiesce == 0)
181 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); 198 trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs"));
182 rdp->passed_quiesce = 1; 199 rdp->passed_quiesce = 1;
183} 200}
184 201
@@ -187,7 +204,7 @@ void rcu_bh_qs(int cpu)
187 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 204 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
188 205
189 if (rdp->passed_quiesce == 0) 206 if (rdp->passed_quiesce == 0)
190 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); 207 trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs"));
191 rdp->passed_quiesce = 1; 208 rdp->passed_quiesce = 1;
192} 209}
193 210
@@ -198,16 +215,20 @@ void rcu_bh_qs(int cpu)
198 */ 215 */
199void rcu_note_context_switch(int cpu) 216void rcu_note_context_switch(int cpu)
200{ 217{
201 trace_rcu_utilization("Start context switch"); 218 trace_rcu_utilization(TPS("Start context switch"));
202 rcu_sched_qs(cpu); 219 rcu_sched_qs(cpu);
203 rcu_preempt_note_context_switch(cpu); 220 rcu_preempt_note_context_switch(cpu);
204 trace_rcu_utilization("End context switch"); 221 trace_rcu_utilization(TPS("End context switch"));
205} 222}
206EXPORT_SYMBOL_GPL(rcu_note_context_switch); 223EXPORT_SYMBOL_GPL(rcu_note_context_switch);
207 224
208DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 225DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
209 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 226 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
210 .dynticks = ATOMIC_INIT(1), 227 .dynticks = ATOMIC_INIT(1),
228#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
229 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
230 .dynticks_idle = ATOMIC_INIT(1),
231#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
211}; 232};
212 233
213static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 234static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
@@ -226,7 +247,10 @@ module_param(jiffies_till_next_fqs, ulong, 0644);
226 247
227static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 248static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
228 struct rcu_data *rdp); 249 struct rcu_data *rdp);
229static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); 250static void force_qs_rnp(struct rcu_state *rsp,
251 int (*f)(struct rcu_data *rsp, bool *isidle,
252 unsigned long *maxj),
253 bool *isidle, unsigned long *maxj);
230static void force_quiescent_state(struct rcu_state *rsp); 254static void force_quiescent_state(struct rcu_state *rsp);
231static int rcu_pending(int cpu); 255static int rcu_pending(int cpu);
232 256
@@ -345,11 +369,11 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
345static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, 369static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
346 bool user) 370 bool user)
347{ 371{
348 trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); 372 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
349 if (!user && !is_idle_task(current)) { 373 if (!user && !is_idle_task(current)) {
350 struct task_struct *idle = idle_task(smp_processor_id()); 374 struct task_struct *idle = idle_task(smp_processor_id());
351 375
352 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); 376 trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
353 ftrace_dump(DUMP_ORIG); 377 ftrace_dump(DUMP_ORIG);
354 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 378 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
355 current->pid, current->comm, 379 current->pid, current->comm,
@@ -411,6 +435,7 @@ void rcu_idle_enter(void)
411 435
412 local_irq_save(flags); 436 local_irq_save(flags);
413 rcu_eqs_enter(false); 437 rcu_eqs_enter(false);
438 rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0);
414 local_irq_restore(flags); 439 local_irq_restore(flags);
415} 440}
416EXPORT_SYMBOL_GPL(rcu_idle_enter); 441EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -428,27 +453,6 @@ void rcu_user_enter(void)
428{ 453{
429 rcu_eqs_enter(1); 454 rcu_eqs_enter(1);
430} 455}
431
432/**
433 * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace
434 * after the current irq returns.
435 *
436 * This is similar to rcu_user_enter() but in the context of a non-nesting
437 * irq. After this call, RCU enters into idle mode when the interrupt
438 * returns.
439 */
440void rcu_user_enter_after_irq(void)
441{
442 unsigned long flags;
443 struct rcu_dynticks *rdtp;
444
445 local_irq_save(flags);
446 rdtp = &__get_cpu_var(rcu_dynticks);
447 /* Ensure this irq is interrupting a non-idle RCU state. */
448 WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK));
449 rdtp->dynticks_nesting = 1;
450 local_irq_restore(flags);
451}
452#endif /* CONFIG_RCU_USER_QS */ 456#endif /* CONFIG_RCU_USER_QS */
453 457
454/** 458/**
@@ -479,9 +483,10 @@ void rcu_irq_exit(void)
479 rdtp->dynticks_nesting--; 483 rdtp->dynticks_nesting--;
480 WARN_ON_ONCE(rdtp->dynticks_nesting < 0); 484 WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
481 if (rdtp->dynticks_nesting) 485 if (rdtp->dynticks_nesting)
482 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); 486 trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
483 else 487 else
484 rcu_eqs_enter_common(rdtp, oldval, true); 488 rcu_eqs_enter_common(rdtp, oldval, true);
489 rcu_sysidle_enter(rdtp, 1);
485 local_irq_restore(flags); 490 local_irq_restore(flags);
486} 491}
487 492
@@ -501,11 +506,11 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
501 smp_mb__after_atomic_inc(); /* See above. */ 506 smp_mb__after_atomic_inc(); /* See above. */
502 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 507 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
503 rcu_cleanup_after_idle(smp_processor_id()); 508 rcu_cleanup_after_idle(smp_processor_id());
504 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); 509 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
505 if (!user && !is_idle_task(current)) { 510 if (!user && !is_idle_task(current)) {
506 struct task_struct *idle = idle_task(smp_processor_id()); 511 struct task_struct *idle = idle_task(smp_processor_id());
507 512
508 trace_rcu_dyntick("Error on exit: not idle task", 513 trace_rcu_dyntick(TPS("Error on exit: not idle task"),
509 oldval, rdtp->dynticks_nesting); 514 oldval, rdtp->dynticks_nesting);
510 ftrace_dump(DUMP_ORIG); 515 ftrace_dump(DUMP_ORIG);
511 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 516 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -550,6 +555,7 @@ void rcu_idle_exit(void)
550 555
551 local_irq_save(flags); 556 local_irq_save(flags);
552 rcu_eqs_exit(false); 557 rcu_eqs_exit(false);
558 rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0);
553 local_irq_restore(flags); 559 local_irq_restore(flags);
554} 560}
555EXPORT_SYMBOL_GPL(rcu_idle_exit); 561EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -565,28 +571,6 @@ void rcu_user_exit(void)
565{ 571{
566 rcu_eqs_exit(1); 572 rcu_eqs_exit(1);
567} 573}
568
569/**
570 * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace
571 * idle mode after the current non-nesting irq returns.
572 *
573 * This is similar to rcu_user_exit() but in the context of an irq.
574 * This is called when the irq has interrupted a userspace RCU idle mode
575 * context. When the current non-nesting interrupt returns after this call,
576 * the CPU won't restore the RCU idle mode.
577 */
578void rcu_user_exit_after_irq(void)
579{
580 unsigned long flags;
581 struct rcu_dynticks *rdtp;
582
583 local_irq_save(flags);
584 rdtp = &__get_cpu_var(rcu_dynticks);
585 /* Ensure we are interrupting an RCU idle mode. */
586 WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK);
587 rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE;
588 local_irq_restore(flags);
589}
590#endif /* CONFIG_RCU_USER_QS */ 574#endif /* CONFIG_RCU_USER_QS */
591 575
592/** 576/**
@@ -620,9 +604,10 @@ void rcu_irq_enter(void)
620 rdtp->dynticks_nesting++; 604 rdtp->dynticks_nesting++;
621 WARN_ON_ONCE(rdtp->dynticks_nesting == 0); 605 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
622 if (oldval) 606 if (oldval)
623 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); 607 trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
624 else 608 else
625 rcu_eqs_exit_common(rdtp, oldval, true); 609 rcu_eqs_exit_common(rdtp, oldval, true);
610 rcu_sysidle_exit(rdtp, 1);
626 local_irq_restore(flags); 611 local_irq_restore(flags);
627} 612}
628 613
@@ -746,9 +731,11 @@ static int rcu_is_cpu_rrupt_from_idle(void)
746 * credit them with an implicit quiescent state. Return 1 if this CPU 731 * credit them with an implicit quiescent state. Return 1 if this CPU
747 * is in dynticks idle mode, which is an extended quiescent state. 732 * is in dynticks idle mode, which is an extended quiescent state.
748 */ 733 */
749static int dyntick_save_progress_counter(struct rcu_data *rdp) 734static int dyntick_save_progress_counter(struct rcu_data *rdp,
735 bool *isidle, unsigned long *maxj)
750{ 736{
751 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 737 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
738 rcu_sysidle_check_cpu(rdp, isidle, maxj);
752 return (rdp->dynticks_snap & 0x1) == 0; 739 return (rdp->dynticks_snap & 0x1) == 0;
753} 740}
754 741
@@ -758,7 +745,8 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
758 * idle state since the last call to dyntick_save_progress_counter() 745 * idle state since the last call to dyntick_save_progress_counter()
759 * for this same CPU, or by virtue of having been offline. 746 * for this same CPU, or by virtue of having been offline.
760 */ 747 */
761static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 748static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
749 bool *isidle, unsigned long *maxj)
762{ 750{
763 unsigned int curr; 751 unsigned int curr;
764 unsigned int snap; 752 unsigned int snap;
@@ -775,7 +763,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
775 * of the current RCU grace period. 763 * of the current RCU grace period.
776 */ 764 */
777 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { 765 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
778 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); 766 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
779 rdp->dynticks_fqs++; 767 rdp->dynticks_fqs++;
780 return 1; 768 return 1;
781 } 769 }
@@ -795,7 +783,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
795 return 0; /* Grace period is not old enough. */ 783 return 0; /* Grace period is not old enough. */
796 barrier(); 784 barrier();
797 if (cpu_is_offline(rdp->cpu)) { 785 if (cpu_is_offline(rdp->cpu)) {
798 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); 786 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl"));
799 rdp->offline_fqs++; 787 rdp->offline_fqs++;
800 return 1; 788 return 1;
801 } 789 }
@@ -1032,7 +1020,7 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
1032 * rcu_nocb_wait_gp(). 1020 * rcu_nocb_wait_gp().
1033 */ 1021 */
1034static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, 1022static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1035 unsigned long c, char *s) 1023 unsigned long c, const char *s)
1036{ 1024{
1037 trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, 1025 trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
1038 rnp->completed, c, rnp->level, 1026 rnp->completed, c, rnp->level,
@@ -1058,9 +1046,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1058 * grace period is already marked as needed, return to the caller. 1046 * grace period is already marked as needed, return to the caller.
1059 */ 1047 */
1060 c = rcu_cbs_completed(rdp->rsp, rnp); 1048 c = rcu_cbs_completed(rdp->rsp, rnp);
1061 trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); 1049 trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
1062 if (rnp->need_future_gp[c & 0x1]) { 1050 if (rnp->need_future_gp[c & 0x1]) {
1063 trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); 1051 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
1064 return c; 1052 return c;
1065 } 1053 }
1066 1054
@@ -1074,7 +1062,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1074 if (rnp->gpnum != rnp->completed || 1062 if (rnp->gpnum != rnp->completed ||
1075 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { 1063 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
1076 rnp->need_future_gp[c & 0x1]++; 1064 rnp->need_future_gp[c & 0x1]++;
1077 trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); 1065 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
1078 return c; 1066 return c;
1079 } 1067 }
1080 1068
@@ -1102,7 +1090,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1102 * recorded, trace and leave. 1090 * recorded, trace and leave.
1103 */ 1091 */
1104 if (rnp_root->need_future_gp[c & 0x1]) { 1092 if (rnp_root->need_future_gp[c & 0x1]) {
1105 trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); 1093 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot"));
1106 goto unlock_out; 1094 goto unlock_out;
1107 } 1095 }
1108 1096
@@ -1111,9 +1099,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1111 1099
1112 /* If a grace period is not already in progress, start one. */ 1100 /* If a grace period is not already in progress, start one. */
1113 if (rnp_root->gpnum != rnp_root->completed) { 1101 if (rnp_root->gpnum != rnp_root->completed) {
1114 trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); 1102 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
1115 } else { 1103 } else {
1116 trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); 1104 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
1117 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); 1105 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
1118 } 1106 }
1119unlock_out: 1107unlock_out:
@@ -1137,7 +1125,8 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1137 rcu_nocb_gp_cleanup(rsp, rnp); 1125 rcu_nocb_gp_cleanup(rsp, rnp);
1138 rnp->need_future_gp[c & 0x1] = 0; 1126 rnp->need_future_gp[c & 0x1] = 0;
1139 needmore = rnp->need_future_gp[(c + 1) & 0x1]; 1127 needmore = rnp->need_future_gp[(c + 1) & 0x1];
1140 trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); 1128 trace_rcu_future_gp(rnp, rdp, c,
1129 needmore ? TPS("CleanupMore") : TPS("Cleanup"));
1141 return needmore; 1130 return needmore;
1142} 1131}
1143 1132
@@ -1205,9 +1194,9 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1205 1194
1206 /* Trace depending on how much we were able to accelerate. */ 1195 /* Trace depending on how much we were able to accelerate. */
1207 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1196 if (!*rdp->nxttail[RCU_WAIT_TAIL])
1208 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB"); 1197 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
1209 else 1198 else
1210 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB"); 1199 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
1211} 1200}
1212 1201
1213/* 1202/*
@@ -1273,7 +1262,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
1273 1262
1274 /* Remember that we saw this grace-period completion. */ 1263 /* Remember that we saw this grace-period completion. */
1275 rdp->completed = rnp->completed; 1264 rdp->completed = rnp->completed;
1276 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); 1265 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
1277 } 1266 }
1278 1267
1279 if (rdp->gpnum != rnp->gpnum) { 1268 if (rdp->gpnum != rnp->gpnum) {
@@ -1283,7 +1272,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
1283 * go looking for one. 1272 * go looking for one.
1284 */ 1273 */
1285 rdp->gpnum = rnp->gpnum; 1274 rdp->gpnum = rnp->gpnum;
1286 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); 1275 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
1287 rdp->passed_quiesce = 0; 1276 rdp->passed_quiesce = 0;
1288 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 1277 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1289 zero_cpu_stall_ticks(rdp); 1278 zero_cpu_stall_ticks(rdp);
@@ -1315,6 +1304,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1315 struct rcu_data *rdp; 1304 struct rcu_data *rdp;
1316 struct rcu_node *rnp = rcu_get_root(rsp); 1305 struct rcu_node *rnp = rcu_get_root(rsp);
1317 1306
1307 rcu_bind_gp_kthread();
1318 raw_spin_lock_irq(&rnp->lock); 1308 raw_spin_lock_irq(&rnp->lock);
1319 rsp->gp_flags = 0; /* Clear all flags: New grace period. */ 1309 rsp->gp_flags = 0; /* Clear all flags: New grace period. */
1320 1310
@@ -1326,7 +1316,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1326 1316
1327 /* Advance to a new grace period and initialize state. */ 1317 /* Advance to a new grace period and initialize state. */
1328 rsp->gpnum++; 1318 rsp->gpnum++;
1329 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); 1319 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
1330 record_gp_stall_check_time(rsp); 1320 record_gp_stall_check_time(rsp);
1331 raw_spin_unlock_irq(&rnp->lock); 1321 raw_spin_unlock_irq(&rnp->lock);
1332 1322
@@ -1379,16 +1369,25 @@ static int rcu_gp_init(struct rcu_state *rsp)
1379int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) 1369int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1380{ 1370{
1381 int fqs_state = fqs_state_in; 1371 int fqs_state = fqs_state_in;
1372 bool isidle = false;
1373 unsigned long maxj;
1382 struct rcu_node *rnp = rcu_get_root(rsp); 1374 struct rcu_node *rnp = rcu_get_root(rsp);
1383 1375
1384 rsp->n_force_qs++; 1376 rsp->n_force_qs++;
1385 if (fqs_state == RCU_SAVE_DYNTICK) { 1377 if (fqs_state == RCU_SAVE_DYNTICK) {
1386 /* Collect dyntick-idle snapshots. */ 1378 /* Collect dyntick-idle snapshots. */
1387 force_qs_rnp(rsp, dyntick_save_progress_counter); 1379 if (is_sysidle_rcu_state(rsp)) {
1380 isidle = 1;
1381 maxj = jiffies - ULONG_MAX / 4;
1382 }
1383 force_qs_rnp(rsp, dyntick_save_progress_counter,
1384 &isidle, &maxj);
1385 rcu_sysidle_report_gp(rsp, isidle, maxj);
1388 fqs_state = RCU_FORCE_QS; 1386 fqs_state = RCU_FORCE_QS;
1389 } else { 1387 } else {
1390 /* Handle dyntick-idle and offline CPUs. */ 1388 /* Handle dyntick-idle and offline CPUs. */
1391 force_qs_rnp(rsp, rcu_implicit_dynticks_qs); 1389 isidle = 0;
1390 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
1392 } 1391 }
1393 /* Clear flag to prevent immediate re-entry. */ 1392 /* Clear flag to prevent immediate re-entry. */
1394 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1393 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
@@ -1448,7 +1447,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1448 rcu_nocb_gp_set(rnp, nocb); 1447 rcu_nocb_gp_set(rnp, nocb);
1449 1448
1450 rsp->completed = rsp->gpnum; /* Declare grace period done. */ 1449 rsp->completed = rsp->gpnum; /* Declare grace period done. */
1451 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1450 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
1452 rsp->fqs_state = RCU_GP_IDLE; 1451 rsp->fqs_state = RCU_GP_IDLE;
1453 rdp = this_cpu_ptr(rsp->rda); 1452 rdp = this_cpu_ptr(rsp->rda);
1454 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ 1453 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
@@ -1558,10 +1557,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1558 1557
1559 /* 1558 /*
1560 * We can't do wakeups while holding the rnp->lock, as that 1559 * We can't do wakeups while holding the rnp->lock, as that
1561 * could cause possible deadlocks with the rq->lock. Deter 1560 * could cause possible deadlocks with the rq->lock. Defer
1562 * the wakeup to interrupt context. 1561 * the wakeup to interrupt context. And don't bother waking
1562 * up the running kthread.
1563 */ 1563 */
1564 irq_work_queue(&rsp->wakeup_work); 1564 if (current != rsp->gp_kthread)
1565 irq_work_queue(&rsp->wakeup_work);
1565} 1566}
1566 1567
1567/* 1568/*
@@ -1857,7 +1858,7 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1857 RCU_TRACE(mask = rdp->grpmask); 1858 RCU_TRACE(mask = rdp->grpmask);
1858 trace_rcu_grace_period(rsp->name, 1859 trace_rcu_grace_period(rsp->name,
1859 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 1860 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
1860 "cpuofl"); 1861 TPS("cpuofl"));
1861} 1862}
1862 1863
1863/* 1864/*
@@ -2044,7 +2045,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2044 */ 2045 */
2045void rcu_check_callbacks(int cpu, int user) 2046void rcu_check_callbacks(int cpu, int user)
2046{ 2047{
2047 trace_rcu_utilization("Start scheduler-tick"); 2048 trace_rcu_utilization(TPS("Start scheduler-tick"));
2048 increment_cpu_stall_ticks(); 2049 increment_cpu_stall_ticks();
2049 if (user || rcu_is_cpu_rrupt_from_idle()) { 2050 if (user || rcu_is_cpu_rrupt_from_idle()) {
2050 2051
@@ -2077,7 +2078,7 @@ void rcu_check_callbacks(int cpu, int user)
2077 rcu_preempt_check_callbacks(cpu); 2078 rcu_preempt_check_callbacks(cpu);
2078 if (rcu_pending(cpu)) 2079 if (rcu_pending(cpu))
2079 invoke_rcu_core(); 2080 invoke_rcu_core();
2080 trace_rcu_utilization("End scheduler-tick"); 2081 trace_rcu_utilization(TPS("End scheduler-tick"));
2081} 2082}
2082 2083
2083/* 2084/*
@@ -2087,7 +2088,10 @@ void rcu_check_callbacks(int cpu, int user)
2087 * 2088 *
2088 * The caller must have suppressed start of new grace periods. 2089 * The caller must have suppressed start of new grace periods.
2089 */ 2090 */
2090static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) 2091static void force_qs_rnp(struct rcu_state *rsp,
2092 int (*f)(struct rcu_data *rsp, bool *isidle,
2093 unsigned long *maxj),
2094 bool *isidle, unsigned long *maxj)
2091{ 2095{
2092 unsigned long bit; 2096 unsigned long bit;
2093 int cpu; 2097 int cpu;
@@ -2110,9 +2114,12 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
2110 cpu = rnp->grplo; 2114 cpu = rnp->grplo;
2111 bit = 1; 2115 bit = 1;
2112 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 2116 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
2113 if ((rnp->qsmask & bit) != 0 && 2117 if ((rnp->qsmask & bit) != 0) {
2114 f(per_cpu_ptr(rsp->rda, cpu))) 2118 if ((rnp->qsmaskinit & bit) != 0)
2115 mask |= bit; 2119 *isidle = 0;
2120 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
2121 mask |= bit;
2122 }
2116 } 2123 }
2117 if (mask != 0) { 2124 if (mask != 0) {
2118 2125
@@ -2208,10 +2215,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
2208 2215
2209 if (cpu_is_offline(smp_processor_id())) 2216 if (cpu_is_offline(smp_processor_id()))
2210 return; 2217 return;
2211 trace_rcu_utilization("Start RCU core"); 2218 trace_rcu_utilization(TPS("Start RCU core"));
2212 for_each_rcu_flavor(rsp) 2219 for_each_rcu_flavor(rsp)
2213 __rcu_process_callbacks(rsp); 2220 __rcu_process_callbacks(rsp);
2214 trace_rcu_utilization("End RCU core"); 2221 trace_rcu_utilization(TPS("End RCU core"));
2215} 2222}
2216 2223
2217/* 2224/*
@@ -2287,6 +2294,13 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2287} 2294}
2288 2295
2289/* 2296/*
2297 * RCU callback function to leak a callback.
2298 */
2299static void rcu_leak_callback(struct rcu_head *rhp)
2300{
2301}
2302
2303/*
2290 * Helper function for call_rcu() and friends. The cpu argument will 2304 * Helper function for call_rcu() and friends. The cpu argument will
2291 * normally be -1, indicating "currently running CPU". It may specify 2305 * normally be -1, indicating "currently running CPU". It may specify
2292 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() 2306 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier()
@@ -2300,7 +2314,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2300 struct rcu_data *rdp; 2314 struct rcu_data *rdp;
2301 2315
2302 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ 2316 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
2303 debug_rcu_head_queue(head); 2317 if (debug_rcu_head_queue(head)) {
2318 /* Probable double call_rcu(), so leak the callback. */
2319 ACCESS_ONCE(head->func) = rcu_leak_callback;
2320 WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
2321 return;
2322 }
2304 head->func = func; 2323 head->func = func;
2305 head->next = NULL; 2324 head->next = NULL;
2306 2325
@@ -2720,7 +2739,7 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2720 * Helper function for _rcu_barrier() tracing. If tracing is disabled, 2739 * Helper function for _rcu_barrier() tracing. If tracing is disabled,
2721 * the compiler is expected to optimize this away. 2740 * the compiler is expected to optimize this away.
2722 */ 2741 */
2723static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, 2742static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s,
2724 int cpu, unsigned long done) 2743 int cpu, unsigned long done)
2725{ 2744{
2726 trace_rcu_barrier(rsp->name, s, cpu, 2745 trace_rcu_barrier(rsp->name, s, cpu,
@@ -2785,9 +2804,20 @@ static void _rcu_barrier(struct rcu_state *rsp)
2785 * transition. The "if" expression below therefore rounds the old 2804 * transition. The "if" expression below therefore rounds the old
2786 * value up to the next even number and adds two before comparing. 2805 * value up to the next even number and adds two before comparing.
2787 */ 2806 */
2788 snap_done = ACCESS_ONCE(rsp->n_barrier_done); 2807 snap_done = rsp->n_barrier_done;
2789 _rcu_barrier_trace(rsp, "Check", -1, snap_done); 2808 _rcu_barrier_trace(rsp, "Check", -1, snap_done);
2790 if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { 2809
2810 /*
2811 * If the value in snap is odd, we needed to wait for the current
2812 * rcu_barrier() to complete, then wait for the next one, in other
2813 * words, we need the value of snap_done to be three larger than
2814 * the value of snap. On the other hand, if the value in snap is
2815 * even, we only had to wait for the next rcu_barrier() to complete,
2816 * in other words, we need the value of snap_done to be only two
2817 * greater than the value of snap. The "(snap + 3) & ~0x1" computes
2818 * this for us (thank you, Linus!).
2819 */
2820 if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) {
2791 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); 2821 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
2792 smp_mb(); /* caller's subsequent code after above check. */ 2822 smp_mb(); /* caller's subsequent code after above check. */
2793 mutex_unlock(&rsp->barrier_mutex); 2823 mutex_unlock(&rsp->barrier_mutex);
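The new comment is easy to verify with concrete numbers: n_barrier_done is incremented once when an rcu_barrier() starts (making it odd) and once when it finishes (making it even), so an odd snap must see snap_done advance by three and an even snap by two, which is exactly what (snap + 3) & ~0x1 yields. A short check:

#include <stdio.h>

int main(void)
{
        unsigned long snap;

        /* (snap + 3) & ~0x1UL == snap + 3 for odd snap, snap + 2 for even. */
        for (snap = 4; snap <= 7; snap++)
                printf("snap=%lu  threshold=%lu\n", snap,
                       (snap + 3) & ~0x1UL);
        return 0;
}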
@@ -2930,6 +2960,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2930 rdp->blimit = blimit; 2960 rdp->blimit = blimit;
2931 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ 2961 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
2932 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 2962 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2963 rcu_sysidle_init_percpu_data(rdp->dynticks);
2933 atomic_set(&rdp->dynticks->dynticks, 2964 atomic_set(&rdp->dynticks->dynticks,
2934 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 2965 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2935 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2966 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
@@ -2952,7 +2983,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2952 rdp->completed = rnp->completed; 2983 rdp->completed = rnp->completed;
2953 rdp->passed_quiesce = 0; 2984 rdp->passed_quiesce = 0;
2954 rdp->qs_pending = 0; 2985 rdp->qs_pending = 0;
2955 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); 2986 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
2956 } 2987 }
2957 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 2988 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
2958 rnp = rnp->parent; 2989 rnp = rnp->parent;
@@ -2982,7 +3013,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
2982 struct rcu_node *rnp = rdp->mynode; 3013 struct rcu_node *rnp = rdp->mynode;
2983 struct rcu_state *rsp; 3014 struct rcu_state *rsp;
2984 3015
2985 trace_rcu_utilization("Start CPU hotplug"); 3016 trace_rcu_utilization(TPS("Start CPU hotplug"));
2986 switch (action) { 3017 switch (action) {
2987 case CPU_UP_PREPARE: 3018 case CPU_UP_PREPARE:
2988 case CPU_UP_PREPARE_FROZEN: 3019 case CPU_UP_PREPARE_FROZEN:
@@ -3011,7 +3042,26 @@ static int rcu_cpu_notify(struct notifier_block *self,
3011 default: 3042 default:
3012 break; 3043 break;
3013 } 3044 }
3014 trace_rcu_utilization("End CPU hotplug"); 3045 trace_rcu_utilization(TPS("End CPU hotplug"));
3046 return NOTIFY_OK;
3047}
3048
3049static int rcu_pm_notify(struct notifier_block *self,
3050 unsigned long action, void *hcpu)
3051{
3052 switch (action) {
3053 case PM_HIBERNATION_PREPARE:
3054 case PM_SUSPEND_PREPARE:
3055 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
3056 rcu_expedited = 1;
3057 break;
3058 case PM_POST_HIBERNATION:
3059 case PM_POST_SUSPEND:
3060 rcu_expedited = 0;
3061 break;
3062 default:
3063 break;
3064 }
3015 return NOTIFY_OK; 3065 return NOTIFY_OK;
3016} 3066}
3017 3067
@@ -3256,6 +3306,7 @@ void __init rcu_init(void)
3256 * or the scheduler are operational. 3306 * or the scheduler are operational.
3257 */ 3307 */
3258 cpu_notifier(rcu_cpu_notify, 0); 3308 cpu_notifier(rcu_cpu_notify, 0);
3309 pm_notifier(rcu_pm_notify, 0);
3259 for_each_online_cpu(cpu) 3310 for_each_online_cpu(cpu)
3260 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3311 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3261} 3312}
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index b3832581043c..5f97eab602cd 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -88,6 +88,14 @@ struct rcu_dynticks {
88 /* Process level is worth LLONG_MAX/2. */ 88 /* Process level is worth LLONG_MAX/2. */
89 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 89 int dynticks_nmi_nesting; /* Track NMI nesting level. */
90 atomic_t dynticks; /* Even value for idle, else odd. */ 90 atomic_t dynticks; /* Even value for idle, else odd. */
91#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
92 long long dynticks_idle_nesting;
93 /* irq/process nesting level from idle. */
94 atomic_t dynticks_idle; /* Even value for idle, else odd. */
95 /* "Idle" excludes userspace execution. */
96 unsigned long dynticks_idle_jiffies;
97 /* End of last non-NMI non-idle period. */
98#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
91#ifdef CONFIG_RCU_FAST_NO_HZ 99#ifdef CONFIG_RCU_FAST_NO_HZ
92 bool all_lazy; /* Are all CPU's CBs lazy? */ 100 bool all_lazy; /* Are all CPU's CBs lazy? */
93 unsigned long nonlazy_posted; 101 unsigned long nonlazy_posted;
@@ -445,7 +453,7 @@ struct rcu_state {
445 /* for CPU stalls. */ 453 /* for CPU stalls. */
446 unsigned long gp_max; /* Maximum GP duration in */ 454 unsigned long gp_max; /* Maximum GP duration in */
447 /* jiffies. */ 455 /* jiffies. */
448 char *name; /* Name of structure. */ 456 const char *name; /* Name of structure. */
449 char abbr; /* Abbreviated name. */ 457 char abbr; /* Abbreviated name. */
450 struct list_head flavors; /* List of RCU flavors. */ 458 struct list_head flavors; /* List of RCU flavors. */
451 struct irq_work wakeup_work; /* Postponed wakeups */ 459 struct irq_work wakeup_work; /* Postponed wakeups */
@@ -545,6 +553,15 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
545static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 553static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
546static void rcu_kick_nohz_cpu(int cpu); 554static void rcu_kick_nohz_cpu(int cpu);
547static bool init_nocb_callback_list(struct rcu_data *rdp); 555static bool init_nocb_callback_list(struct rcu_data *rdp);
556static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
557static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
558static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
559 unsigned long *maxj);
560static bool is_sysidle_rcu_state(struct rcu_state *rsp);
561static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
562 unsigned long maxj);
563static void rcu_bind_gp_kthread(void);
564static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
548 565
549#endif /* #ifndef RCU_TREE_NONCORE */ 566#endif /* #ifndef RCU_TREE_NONCORE */
550 567
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 769e12e3151b..130c97b027f2 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -28,7 +28,7 @@
28#include <linux/gfp.h> 28#include <linux/gfp.h>
29#include <linux/oom.h> 29#include <linux/oom.h>
30#include <linux/smpboot.h> 30#include <linux/smpboot.h>
31#include <linux/tick.h> 31#include "time/tick-internal.h"
32 32
33#define RCU_KTHREAD_PRIO 1 33#define RCU_KTHREAD_PRIO 1
34 34
@@ -110,9 +110,7 @@ static void __init rcu_bootup_announce_oddness(void)
110 110
111#ifdef CONFIG_TREE_PREEMPT_RCU 111#ifdef CONFIG_TREE_PREEMPT_RCU
112 112
113struct rcu_state rcu_preempt_state = 113RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
114 RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
115DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
116static struct rcu_state *rcu_state = &rcu_preempt_state; 114static struct rcu_state *rcu_state = &rcu_preempt_state;
117 115
118static int rcu_preempted_readers_exp(struct rcu_node *rnp); 116static int rcu_preempted_readers_exp(struct rcu_node *rnp);
@@ -169,7 +167,7 @@ static void rcu_preempt_qs(int cpu)
169 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 167 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
170 168
171 if (rdp->passed_quiesce == 0) 169 if (rdp->passed_quiesce == 0)
172 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); 170 trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs"));
173 rdp->passed_quiesce = 1; 171 rdp->passed_quiesce = 1;
174 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 172 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
175} 173}
@@ -388,7 +386,7 @@ void rcu_read_unlock_special(struct task_struct *t)
388 np = rcu_next_node_entry(t, rnp); 386 np = rcu_next_node_entry(t, rnp);
389 list_del_init(&t->rcu_node_entry); 387 list_del_init(&t->rcu_node_entry);
390 t->rcu_blocked_node = NULL; 388 t->rcu_blocked_node = NULL;
391 trace_rcu_unlock_preempted_task("rcu_preempt", 389 trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
392 rnp->gpnum, t->pid); 390 rnp->gpnum, t->pid);
393 if (&t->rcu_node_entry == rnp->gp_tasks) 391 if (&t->rcu_node_entry == rnp->gp_tasks)
394 rnp->gp_tasks = np; 392 rnp->gp_tasks = np;
@@ -412,7 +410,7 @@ void rcu_read_unlock_special(struct task_struct *t)
412 */ 410 */
413 empty_exp_now = !rcu_preempted_readers_exp(rnp); 411 empty_exp_now = !rcu_preempted_readers_exp(rnp);
414 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { 412 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
415 trace_rcu_quiescent_state_report("preempt_rcu", 413 trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
416 rnp->gpnum, 414 rnp->gpnum,
417 0, rnp->qsmask, 415 0, rnp->qsmask,
418 rnp->level, 416 rnp->level,
@@ -1250,12 +1248,12 @@ static int rcu_boost_kthread(void *arg)
1250 int spincnt = 0; 1248 int spincnt = 0;
1251 int more2boost; 1249 int more2boost;
1252 1250
1253 trace_rcu_utilization("Start boost kthread@init"); 1251 trace_rcu_utilization(TPS("Start boost kthread@init"));
1254 for (;;) { 1252 for (;;) {
1255 rnp->boost_kthread_status = RCU_KTHREAD_WAITING; 1253 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1256 trace_rcu_utilization("End boost kthread@rcu_wait"); 1254 trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
1257 rcu_wait(rnp->boost_tasks || rnp->exp_tasks); 1255 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1258 trace_rcu_utilization("Start boost kthread@rcu_wait"); 1256 trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
1259 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; 1257 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1260 more2boost = rcu_boost(rnp); 1258 more2boost = rcu_boost(rnp);
1261 if (more2boost) 1259 if (more2boost)
@@ -1264,14 +1262,14 @@ static int rcu_boost_kthread(void *arg)
1264 spincnt = 0; 1262 spincnt = 0;
1265 if (spincnt > 10) { 1263 if (spincnt > 10) {
1266 rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; 1264 rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
1267 trace_rcu_utilization("End boost kthread@rcu_yield"); 1265 trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
1268 schedule_timeout_interruptible(2); 1266 schedule_timeout_interruptible(2);
1269 trace_rcu_utilization("Start boost kthread@rcu_yield"); 1267 trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
1270 spincnt = 0; 1268 spincnt = 0;
1271 } 1269 }
1272 } 1270 }
1273 /* NOTREACHED */ 1271 /* NOTREACHED */
1274 trace_rcu_utilization("End boost kthread@notreached"); 1272 trace_rcu_utilization(TPS("End boost kthread@notreached"));
1275 return 0; 1273 return 0;
1276} 1274}
1277 1275
@@ -1419,7 +1417,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
1419 int spincnt; 1417 int spincnt;
1420 1418
1421 for (spincnt = 0; spincnt < 10; spincnt++) { 1419 for (spincnt = 0; spincnt < 10; spincnt++) {
1422 trace_rcu_utilization("Start CPU kthread@rcu_wait"); 1420 trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
1423 local_bh_disable(); 1421 local_bh_disable();
1424 *statusp = RCU_KTHREAD_RUNNING; 1422 *statusp = RCU_KTHREAD_RUNNING;
1425 this_cpu_inc(rcu_cpu_kthread_loops); 1423 this_cpu_inc(rcu_cpu_kthread_loops);
@@ -1431,15 +1429,15 @@ static void rcu_cpu_kthread(unsigned int cpu)
1431 rcu_kthread_do_work(); 1429 rcu_kthread_do_work();
1432 local_bh_enable(); 1430 local_bh_enable();
1433 if (*workp == 0) { 1431 if (*workp == 0) {
1434 trace_rcu_utilization("End CPU kthread@rcu_wait"); 1432 trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
1435 *statusp = RCU_KTHREAD_WAITING; 1433 *statusp = RCU_KTHREAD_WAITING;
1436 return; 1434 return;
1437 } 1435 }
1438 } 1436 }
1439 *statusp = RCU_KTHREAD_YIELDING; 1437 *statusp = RCU_KTHREAD_YIELDING;
1440 trace_rcu_utilization("Start CPU kthread@rcu_yield"); 1438 trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
1441 schedule_timeout_interruptible(2); 1439 schedule_timeout_interruptible(2);
1442 trace_rcu_utilization("End CPU kthread@rcu_yield"); 1440 trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
1443 *statusp = RCU_KTHREAD_WAITING; 1441 *statusp = RCU_KTHREAD_WAITING;
1444} 1442}
1445 1443
@@ -2202,7 +2200,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2202 * Wait for the grace period. Do so interruptibly to avoid messing 2200 * Wait for the grace period. Do so interruptibly to avoid messing
2203 * up the load average. 2201 * up the load average.
2204 */ 2202 */
2205 trace_rcu_future_gp(rnp, rdp, c, "StartWait"); 2203 trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
2206 for (;;) { 2204 for (;;) {
2207 wait_event_interruptible( 2205 wait_event_interruptible(
2208 rnp->nocb_gp_wq[c & 0x1], 2206 rnp->nocb_gp_wq[c & 0x1],
@@ -2210,9 +2208,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2210 if (likely(d)) 2208 if (likely(d))
2211 break; 2209 break;
2212 flush_signals(current); 2210 flush_signals(current);
2213 trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); 2211 trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
2214 } 2212 }
2215 trace_rcu_future_gp(rnp, rdp, c, "EndWait"); 2213 trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
2216 smp_mb(); /* Ensure that CB invocation happens after GP end. */ 2214 smp_mb(); /* Ensure that CB invocation happens after GP end. */
2217} 2215}
2218 2216
@@ -2375,3 +2373,425 @@ static void rcu_kick_nohz_cpu(int cpu)
2375 smp_send_reschedule(cpu); 2373 smp_send_reschedule(cpu);
2376#endif /* #ifdef CONFIG_NO_HZ_FULL */ 2374#endif /* #ifdef CONFIG_NO_HZ_FULL */
2377} 2375}
2376
2377
2378#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
2379
2380/*
2381 * Define RCU flavor that holds sysidle state. This needs to be the
2382 * most active flavor of RCU.
2383 */
2384#ifdef CONFIG_PREEMPT_RCU
2385static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
2386#else /* #ifdef CONFIG_PREEMPT_RCU */
2387static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
2388#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
2389
2390static int full_sysidle_state; /* Current system-idle state. */
2391#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
2392#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
2393#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */
2394#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */
2395#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */
2396
2397/*
2398 * Invoked to note exit from irq or task transition to idle. Note that
2399 * usermode execution does -not- count as idle here! After all, we want
2400 * to detect full-system idle states, not RCU quiescent states and grace
2401 * periods. The caller must have disabled interrupts.
2402 */
2403static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2404{
2405 unsigned long j;
2406
2407 /* Adjust nesting, check for fully idle. */
2408 if (irq) {
2409 rdtp->dynticks_idle_nesting--;
2410 WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
2411 if (rdtp->dynticks_idle_nesting != 0)
2412 return; /* Still not fully idle. */
2413 } else {
2414 if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
2415 DYNTICK_TASK_NEST_VALUE) {
2416 rdtp->dynticks_idle_nesting = 0;
2417 } else {
2418 rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
2419 WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
2420 return; /* Still not fully idle. */
2421 }
2422 }
2423
2424 /* Record start of fully idle period. */
2425 j = jiffies;
2426 ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
2427 smp_mb__before_atomic_inc();
2428 atomic_inc(&rdtp->dynticks_idle);
2429 smp_mb__after_atomic_inc();
2430 WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
2431}
2432
2433/*
2434 * Unconditionally force exit from full system-idle state. This is
2435 * invoked when a normal CPU exits idle, but must be called separately
2436 * for the timekeeping CPU (tick_do_timer_cpu). The reason for this
2437 * is that the timekeeping CPU is permitted to take scheduling-clock
2438 * interrupts while the system is in system-idle state, and of course
2439 * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
2440 * interrupt from any other type of interrupt.
2441 */
2442void rcu_sysidle_force_exit(void)
2443{
2444 int oldstate = ACCESS_ONCE(full_sysidle_state);
2445 int newoldstate;
2446
2447 /*
2448 * Each pass through the following loop attempts to exit full
2449 * system-idle state. If contention proves to be a problem,
2450 * a trylock-based contention tree could be used here.
2451 */
2452 while (oldstate > RCU_SYSIDLE_SHORT) {
2453 newoldstate = cmpxchg(&full_sysidle_state,
2454 oldstate, RCU_SYSIDLE_NOT);
2455 if (oldstate == newoldstate &&
2456 oldstate == RCU_SYSIDLE_FULL_NOTED) {
2457 rcu_kick_nohz_cpu(tick_do_timer_cpu);
2458 return; /* We cleared it, done! */
2459 }
2460 oldstate = newoldstate;
2461 }
2462 smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
2463}
2464
2465/*
2466 * Invoked to note entry to irq or task transition from idle. Note that
2467 * usermode execution does -not- count as idle here! The caller must
2468 * have disabled interrupts.
2469 */
2470static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2471{
2472 /* Adjust nesting, check for already non-idle. */
2473 if (irq) {
2474 rdtp->dynticks_idle_nesting++;
2475 WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
2476 if (rdtp->dynticks_idle_nesting != 1)
2477 return; /* Already non-idle. */
2478 } else {
2479 /*
2480 * Allow for irq misnesting. Yes, it really is possible
2481 * to enter an irq handler then never leave it, and maybe
2482 * also vice versa. Handle both possibilities.
2483 */
2484 if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
2485 rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
2486 WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
2487 return; /* Already non-idle. */
2488 } else {
2489 rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
2490 }
2491 }
2492
2493 /* Record end of idle period. */
2494 smp_mb__before_atomic_inc();
2495 atomic_inc(&rdtp->dynticks_idle);
2496 smp_mb__after_atomic_inc();
2497 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
2498
2499 /*
2500 * If we are the timekeeping CPU, we are permitted to be non-idle
2501 * during a system-idle state. This must be the case, because
2502 * the timekeeping CPU has to take scheduling-clock interrupts
2503 * during the time that the system is transitioning to full
2504 * system-idle state. This means that the timekeeping CPU must
2505 * invoke rcu_sysidle_force_exit() directly if it does anything
2506 * more than take a scheduling-clock interrupt.
2507 */
2508 if (smp_processor_id() == tick_do_timer_cpu)
2509 return;
2510
2511 /* Update system-idle state: We are clearly no longer fully idle! */
2512 rcu_sysidle_force_exit();
2513}
2514
2515/*
2516 * Check to see if the current CPU is idle. Note that usermode execution
2517 * does not count as idle. The caller must have disabled interrupts.
2518 */
2519static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2520 unsigned long *maxj)
2521{
2522 int cur;
2523 unsigned long j;
2524 struct rcu_dynticks *rdtp = rdp->dynticks;
2525
2526 /*
2527 * If some other CPU has already reported non-idle, if this is
2528 * not the flavor of RCU that tracks sysidle state, or if this
2529 * is an offline or the timekeeping CPU, nothing to do.
2530 */
2531 if (!*isidle || rdp->rsp != rcu_sysidle_state ||
2532 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
2533 return;
2534 if (rcu_gp_in_progress(rdp->rsp))
2535 WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
2536
2537 /* Pick up current idle and NMI-nesting counter and check. */
2538 cur = atomic_read(&rdtp->dynticks_idle);
2539 if (cur & 0x1) {
2540 *isidle = false; /* We are not idle! */
2541 return;
2542 }
2543 smp_mb(); /* Read counters before timestamps. */
2544
2545 /* Pick up timestamps. */
2546 j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
2547 /* If this CPU entered idle more recently, update maxj timestamp. */
2548 if (ULONG_CMP_LT(*maxj, j))
2549 *maxj = j;
2550}
2551
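The "cur & 0x1" test relies on the dynticks parity convention visible in the enter/exit paths above: the counter is incremented on every idle entry and every idle exit, so an even value means the CPU is currently idle and an odd value means it is not. In miniature (the parameter name is illustrative only):

    /* Even counter value == idle, odd == non-idle. */
    static int counter_says_idle(unsigned int dynticks_idle)
    {
            return !(dynticks_idle & 0x1);
    }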
2552/*
2553 * Is this the flavor of RCU that is handling full-system idle?
2554 */
2555static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2556{
2557 return rsp == rcu_sysidle_state;
2558}
2559
2560/*
2561 * Bind the grace-period kthread for the sysidle flavor of RCU to the
2562 * timekeeping CPU.
2563 */
2564static void rcu_bind_gp_kthread(void)
2565{
2566 int cpu = ACCESS_ONCE(tick_do_timer_cpu);
2567
2568 if (cpu < 0 || cpu >= nr_cpu_ids)
2569 return;
2570 if (raw_smp_processor_id() != cpu)
2571 set_cpus_allowed_ptr(current, cpumask_of(cpu));
2572}
2573
2574/*
2575 * Return a delay in jiffies based on the number of CPUs, rcu_node
2576 * leaf fanout, and jiffies tick rate. The idea is to allow larger
2577 * systems more time to transition to full-idle state in order to
2578 * avoid the cache thrashing that would otherwise occur on the state variable.
2579 * Really small systems (fewer than a couple of tens of CPUs) should

2580 * instead use a single global atomically incremented counter, and later
2581 * versions of this will automatically reconfigure themselves accordingly.
2582 */
2583static unsigned long rcu_sysidle_delay(void)
2584{
2585 if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2586 return 0;
2587 return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
2588}
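As a rough worked example of the formula above (HZ and the leaf fanout are configuration-dependent, so the numbers are purely illustrative):

    /*
     *   nr_cpu_ids = 256, HZ = 1000, rcu_fanout_leaf = 16
     *   delay = DIV_ROUND_UP(256 * 1000, 16 * 1000) = 16 jiffies (~16 ms)
     *
     * while a system with nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL
     * skips the delay entirely.
     */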
2589
2590/*
2591 * Advance the full-system-idle state. This is invoked when all of
2592 * the non-timekeeping CPUs are idle.
2593 */
2594static void rcu_sysidle(unsigned long j)
2595{
2596 /* Check the current state. */
2597 switch (ACCESS_ONCE(full_sysidle_state)) {
2598 case RCU_SYSIDLE_NOT:
2599
2600 /* First time all are idle, so note a short idle period. */
2601 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
2602 break;
2603
2604 case RCU_SYSIDLE_SHORT:
2605
2606 /*
2607 * Idle for a bit, time to advance to next state?
2608 * cmpxchg failure means race with non-idle, let them win.
2609 */
2610 if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
2611 (void)cmpxchg(&full_sysidle_state,
2612 RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
2613 break;
2614
2615 case RCU_SYSIDLE_LONG:
2616
2617 /*
2618 * Do an additional check pass before advancing to full.
2619 * cmpxchg failure means race with non-idle, let them win.
2620 */
2621 if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
2622 (void)cmpxchg(&full_sysidle_state,
2623 RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
2624 break;
2625
2626 default:
2627 break;
2628 }
2629}
2630
2631/*
2632 * Found a non-idle non-timekeeping CPU, so kick the system-idle state
2633 * back to the beginning.
2634 */
2635static void rcu_sysidle_cancel(void)
2636{
2637 smp_mb();
2638 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
2639}
2640
2641/*
2642 * Update the sysidle state based on the results of a force-quiescent-state
2643 * scan of the CPUs' dyntick-idle state.
2644 */
2645static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
2646 unsigned long maxj, bool gpkt)
2647{
2648 if (rsp != rcu_sysidle_state)
2649 return; /* Wrong flavor, ignore. */
2650 if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2651 return; /* Running state machine from timekeeping CPU. */
2652 if (isidle)
2653 rcu_sysidle(maxj); /* More idle! */
2654 else
2655 rcu_sysidle_cancel(); /* Idle is over. */
2656}
2657
2658/*
2659 * Wrapper for rcu_sysidle_report() when called from the grace-period
2660 * kthread's context.
2661 */
2662static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2663 unsigned long maxj)
2664{
2665 rcu_sysidle_report(rsp, isidle, maxj, true);
2666}
2667
2668/* Callback and function for forcing an RCU grace period. */
2669struct rcu_sysidle_head {
2670 struct rcu_head rh;
2671 int inuse;
2672};
2673
2674static void rcu_sysidle_cb(struct rcu_head *rhp)
2675{
2676 struct rcu_sysidle_head *rshp;
2677
2678 /*
2679 * The following memory barrier is needed to replace the
2680 * memory barriers that would normally be in the memory
2681 * allocator.
2682 */
2683 smp_mb(); /* grace period precedes setting inuse. */
2684
2685 rshp = container_of(rhp, struct rcu_sysidle_head, rh);
2686 ACCESS_ONCE(rshp->inuse) = 0;
2687}
2688
2689/*
2690 * Check to see if the system is fully idle, other than the timekeeping CPU.
2691 * The caller must have disabled interrupts.
2692 */
2693bool rcu_sys_is_idle(void)
2694{
2695 static struct rcu_sysidle_head rsh;
2696 int rss = ACCESS_ONCE(full_sysidle_state);
2697
2698 if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
2699 return false;
2700
2701 /* Handle small-system case by doing a full scan of CPUs. */
2702 if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
2703 int oldrss = rss - 1;
2704
2705 /*
2706 * One pass to advance to each state up to _FULL.
2707 * Give up if any pass fails to advance the state.
2708 */
2709 while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
2710 int cpu;
2711 bool isidle = true;
2712 unsigned long maxj = jiffies - ULONG_MAX / 4;
2713 struct rcu_data *rdp;
2714
2715 /* Scan all the CPUs looking for nonidle CPUs. */
2716 for_each_possible_cpu(cpu) {
2717 rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu);
2718 rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
2719 if (!isidle)
2720 break;
2721 }
2722 rcu_sysidle_report(rcu_sysidle_state,
2723 isidle, maxj, false);
2724 oldrss = rss;
2725 rss = ACCESS_ONCE(full_sysidle_state);
2726 }
2727 }
2728
2729 /* If this is the first observation of an idle period, record it. */
2730 if (rss == RCU_SYSIDLE_FULL) {
2731 rss = cmpxchg(&full_sysidle_state,
2732 RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
2733 return rss == RCU_SYSIDLE_FULL;
2734 }
2735
2736 smp_mb(); /* ensure rss load happens before later caller actions. */
2737
2738 /* If already fully idle, tell the caller (in case of races). */
2739 if (rss == RCU_SYSIDLE_FULL_NOTED)
2740 return true;
2741
2742 /*
2743 * If we aren't there yet, and a grace period is not in flight,
2744 * initiate a grace period. Either way, tell the caller that
2745 * we are not there yet. We use an xchg() rather than an assignment
2746 * to make up for the memory barriers that would otherwise be
2747 * provided by the memory allocator.
2748 */
2749 if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
2750 !rcu_gp_in_progress(rcu_sysidle_state) &&
2751 !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
2752 call_rcu(&rsh.rh, rcu_sysidle_cb);
2753 return false;
2754}
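The rsh.inuse handling above is a small "claim a static object at most once" idiom: the cheap read filters the common case, and the xchg() guarantees that only one caller posts the callback until it completes and clears the flag. A user-space sketch of the same idiom, with invented names (submit_work() stands in for call_rcu(), and __sync_lock_test_and_set() for xchg()):

    struct once_work {
            int inuse;
            /* payload... */
    };

    static struct once_work w;

    static void maybe_submit(void (*submit_work)(struct once_work *))
    {
            if (!w.inuse && __sync_lock_test_and_set(&w.inuse, 1) == 0)
                    submit_work(&w);        /* the callback clears w.inuse */
    }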
2755
2756/*
2757 * Initialize dynticks sysidle state for CPUs coming online.
2758 */
2759static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2760{
2761 rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
2762}
2763
2764#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2765
2766static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2767{
2768}
2769
2770static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2771{
2772}
2773
2774static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2775 unsigned long *maxj)
2776{
2777}
2778
2779static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2780{
2781 return false;
2782}
2783
2784static void rcu_bind_gp_kthread(void)
2785{
2786}
2787
2788static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2789 unsigned long maxj)
2790{
2791}
2792
2793static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2794{
2795}
2796
2797#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 05c39f030314..725aa067ad63 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2527,13 +2527,11 @@ void __sched schedule_preempt_disabled(void)
2527 */ 2527 */
2528asmlinkage void __sched notrace preempt_schedule(void) 2528asmlinkage void __sched notrace preempt_schedule(void)
2529{ 2529{
2530 struct thread_info *ti = current_thread_info();
2531
2532 /* 2530 /*
2533 * If there is a non-zero preempt_count or interrupts are disabled, 2531 * If there is a non-zero preempt_count or interrupts are disabled,
2534 * we do not want to preempt the current task. Just return.. 2532 * we do not want to preempt the current task. Just return..
2535 */ 2533 */
2536 if (likely(ti->preempt_count || irqs_disabled())) 2534 if (likely(!preemptible()))
2537 return; 2535 return;
2538 2536
2539 do { 2537 do {
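The preemptible() test that replaces the open-coded check folds the same condition into one macro; an approximate expansion (not quoted from the kernel headers, shown only to make the equivalence explicit) is:

    /* Approximate expansion; see include/linux/preempt.h for the real one. */
    #define preemptible()   (preempt_count() == 0 && !irqs_disabled())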
@@ -2677,7 +2675,7 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
2677 if (unlikely(!q)) 2675 if (unlikely(!q))
2678 return; 2676 return;
2679 2677
2680 if (unlikely(!nr_exclusive)) 2678 if (unlikely(nr_exclusive != 1))
2681 wake_flags = 0; 2679 wake_flags = 0;
2682 2680
2683 spin_lock_irqsave(&q->lock, flags); 2681 spin_lock_irqsave(&q->lock, flags);
@@ -4964,7 +4962,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
4964 SD_BALANCE_FORK | 4962 SD_BALANCE_FORK |
4965 SD_BALANCE_EXEC | 4963 SD_BALANCE_EXEC |
4966 SD_SHARE_CPUPOWER | 4964 SD_SHARE_CPUPOWER |
4967 SD_SHARE_PKG_RESOURCES); 4965 SD_SHARE_PKG_RESOURCES |
4966 SD_PREFER_SIBLING);
4968 if (nr_node_ids == 1) 4967 if (nr_node_ids == 1)
4969 pflags &= ~SD_SERIALIZE; 4968 pflags &= ~SD_SERIALIZE;
4970 } 4969 }
@@ -5133,18 +5132,23 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5133 * two cpus are in the same cache domain, see cpus_share_cache(). 5132 * two cpus are in the same cache domain, see cpus_share_cache().
5134 */ 5133 */
5135DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5134DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5135DEFINE_PER_CPU(int, sd_llc_size);
5136DEFINE_PER_CPU(int, sd_llc_id); 5136DEFINE_PER_CPU(int, sd_llc_id);
5137 5137
5138static void update_top_cache_domain(int cpu) 5138static void update_top_cache_domain(int cpu)
5139{ 5139{
5140 struct sched_domain *sd; 5140 struct sched_domain *sd;
5141 int id = cpu; 5141 int id = cpu;
5142 int size = 1;
5142 5143
5143 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 5144 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5144 if (sd) 5145 if (sd) {
5145 id = cpumask_first(sched_domain_span(sd)); 5146 id = cpumask_first(sched_domain_span(sd));
5147 size = cpumask_weight(sched_domain_span(sd));
5148 }
5146 5149
5147 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5150 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5151 per_cpu(sd_llc_size, cpu) = size;
5148 per_cpu(sd_llc_id, cpu) = id; 5152 per_cpu(sd_llc_id, cpu) = id;
5149} 5153}
5150 5154
@@ -5168,6 +5172,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5168 tmp->parent = parent->parent; 5172 tmp->parent = parent->parent;
5169 if (parent->parent) 5173 if (parent->parent)
5170 parent->parent->child = tmp; 5174 parent->parent->child = tmp;
5175 /*
5176 * Transfer SD_PREFER_SIBLING down in case of a
5177 * degenerate parent; the spans match for this
5178 * so the property transfers.
5179 */
5180 if (parent->flags & SD_PREFER_SIBLING)
5181 tmp->flags |= SD_PREFER_SIBLING;
5171 destroy_sched_domain(parent, cpu); 5182 destroy_sched_domain(parent, cpu);
5172 } else 5183 } else
5173 tmp = tmp->parent; 5184 tmp = tmp->parent;
@@ -6234,8 +6245,9 @@ match1:
6234 ; 6245 ;
6235 } 6246 }
6236 6247
6248 n = ndoms_cur;
6237 if (doms_new == NULL) { 6249 if (doms_new == NULL) {
6238 ndoms_cur = 0; 6250 n = 0;
6239 doms_new = &fallback_doms; 6251 doms_new = &fallback_doms;
6240 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 6252 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6241 WARN_ON_ONCE(dattr_new); 6253 WARN_ON_ONCE(dattr_new);
@@ -6243,7 +6255,7 @@ match1:
6243 6255
6244 /* Build new domains */ 6256 /* Build new domains */
6245 for (i = 0; i < ndoms_new; i++) { 6257 for (i = 0; i < ndoms_new; i++) {
6246 for (j = 0; j < ndoms_cur && !new_topology; j++) { 6258 for (j = 0; j < n && !new_topology; j++) {
6247 if (cpumask_equal(doms_new[i], doms_cur[j]) 6259 if (cpumask_equal(doms_new[i], doms_cur[j])
6248 && dattrs_equal(dattr_new, i, dattr_cur, j)) 6260 && dattrs_equal(dattr_new, i, dattr_cur, j))
6249 goto match2; 6261 goto match2;
@@ -6815,7 +6827,7 @@ void sched_move_task(struct task_struct *tsk)
6815 if (unlikely(running)) 6827 if (unlikely(running))
6816 tsk->sched_class->put_prev_task(rq, tsk); 6828 tsk->sched_class->put_prev_task(rq, tsk);
6817 6829
6818 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, 6830 tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
6819 lockdep_is_held(&tsk->sighand->siglock)), 6831 lockdep_is_held(&tsk->sighand->siglock)),
6820 struct task_group, css); 6832 struct task_group, css);
6821 tg = autogroup_task_group(tsk, tg); 6833 tg = autogroup_task_group(tsk, tg);
@@ -7137,23 +7149,22 @@ int sched_rt_handler(struct ctl_table *table, int write,
7137 7149
7138#ifdef CONFIG_CGROUP_SCHED 7150#ifdef CONFIG_CGROUP_SCHED
7139 7151
7140/* return corresponding task_group object of a cgroup */ 7152static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
7141static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7142{ 7153{
7143 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 7154 return css ? container_of(css, struct task_group, css) : NULL;
7144 struct task_group, css);
7145} 7155}
7146 7156
7147static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) 7157static struct cgroup_subsys_state *
7158cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7148{ 7159{
7149 struct task_group *tg, *parent; 7160 struct task_group *parent = css_tg(parent_css);
7161 struct task_group *tg;
7150 7162
7151 if (!cgrp->parent) { 7163 if (!parent) {
7152 /* This is early initialization for the top cgroup */ 7164 /* This is early initialization for the top cgroup */
7153 return &root_task_group.css; 7165 return &root_task_group.css;
7154 } 7166 }
7155 7167
7156 parent = cgroup_tg(cgrp->parent);
7157 tg = sched_create_group(parent); 7168 tg = sched_create_group(parent);
7158 if (IS_ERR(tg)) 7169 if (IS_ERR(tg))
7159 return ERR_PTR(-ENOMEM); 7170 return ERR_PTR(-ENOMEM);
@@ -7161,41 +7172,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7161 return &tg->css; 7172 return &tg->css;
7162} 7173}
7163 7174
7164static int cpu_cgroup_css_online(struct cgroup *cgrp) 7175static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7165{ 7176{
7166 struct task_group *tg = cgroup_tg(cgrp); 7177 struct task_group *tg = css_tg(css);
7167 struct task_group *parent; 7178 struct task_group *parent = css_tg(css_parent(css));
7168
7169 if (!cgrp->parent)
7170 return 0;
7171 7179
7172 parent = cgroup_tg(cgrp->parent); 7180 if (parent)
7173 sched_online_group(tg, parent); 7181 sched_online_group(tg, parent);
7174 return 0; 7182 return 0;
7175} 7183}
7176 7184
7177static void cpu_cgroup_css_free(struct cgroup *cgrp) 7185static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
7178{ 7186{
7179 struct task_group *tg = cgroup_tg(cgrp); 7187 struct task_group *tg = css_tg(css);
7180 7188
7181 sched_destroy_group(tg); 7189 sched_destroy_group(tg);
7182} 7190}
7183 7191
7184static void cpu_cgroup_css_offline(struct cgroup *cgrp) 7192static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7185{ 7193{
7186 struct task_group *tg = cgroup_tg(cgrp); 7194 struct task_group *tg = css_tg(css);
7187 7195
7188 sched_offline_group(tg); 7196 sched_offline_group(tg);
7189} 7197}
7190 7198
7191static int cpu_cgroup_can_attach(struct cgroup *cgrp, 7199static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7192 struct cgroup_taskset *tset) 7200 struct cgroup_taskset *tset)
7193{ 7201{
7194 struct task_struct *task; 7202 struct task_struct *task;
7195 7203
7196 cgroup_taskset_for_each(task, cgrp, tset) { 7204 cgroup_taskset_for_each(task, css, tset) {
7197#ifdef CONFIG_RT_GROUP_SCHED 7205#ifdef CONFIG_RT_GROUP_SCHED
7198 if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) 7206 if (!sched_rt_can_attach(css_tg(css), task))
7199 return -EINVAL; 7207 return -EINVAL;
7200#else 7208#else
7201 /* We don't support RT-tasks being in separate groups */ 7209 /* We don't support RT-tasks being in separate groups */
@@ -7206,18 +7214,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7206 return 0; 7214 return 0;
7207} 7215}
7208 7216
7209static void cpu_cgroup_attach(struct cgroup *cgrp, 7217static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
7210 struct cgroup_taskset *tset) 7218 struct cgroup_taskset *tset)
7211{ 7219{
7212 struct task_struct *task; 7220 struct task_struct *task;
7213 7221
7214 cgroup_taskset_for_each(task, cgrp, tset) 7222 cgroup_taskset_for_each(task, css, tset)
7215 sched_move_task(task); 7223 sched_move_task(task);
7216} 7224}
7217 7225
7218static void 7226static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
7219cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 7227 struct cgroup_subsys_state *old_css,
7220 struct task_struct *task) 7228 struct task_struct *task)
7221{ 7229{
7222 /* 7230 /*
7223 * cgroup_exit() is called in the copy_process() failure path. 7231 * cgroup_exit() is called in the copy_process() failure path.
@@ -7231,15 +7239,16 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7231} 7239}
7232 7240
7233#ifdef CONFIG_FAIR_GROUP_SCHED 7241#ifdef CONFIG_FAIR_GROUP_SCHED
7234static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7242static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
7235 u64 shareval) 7243 struct cftype *cftype, u64 shareval)
7236{ 7244{
7237 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); 7245 return sched_group_set_shares(css_tg(css), scale_load(shareval));
7238} 7246}
7239 7247
7240static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 7248static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
7249 struct cftype *cft)
7241{ 7250{
7242 struct task_group *tg = cgroup_tg(cgrp); 7251 struct task_group *tg = css_tg(css);
7243 7252
7244 return (u64) scale_load_down(tg->shares); 7253 return (u64) scale_load_down(tg->shares);
7245} 7254}
@@ -7361,26 +7370,28 @@ long tg_get_cfs_period(struct task_group *tg)
7361 return cfs_period_us; 7370 return cfs_period_us;
7362} 7371}
7363 7372
7364static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) 7373static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
7374 struct cftype *cft)
7365{ 7375{
7366 return tg_get_cfs_quota(cgroup_tg(cgrp)); 7376 return tg_get_cfs_quota(css_tg(css));
7367} 7377}
7368 7378
7369static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, 7379static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
7370 s64 cfs_quota_us) 7380 struct cftype *cftype, s64 cfs_quota_us)
7371{ 7381{
7372 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); 7382 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
7373} 7383}
7374 7384
7375static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) 7385static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
7386 struct cftype *cft)
7376{ 7387{
7377 return tg_get_cfs_period(cgroup_tg(cgrp)); 7388 return tg_get_cfs_period(css_tg(css));
7378} 7389}
7379 7390
7380static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7391static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
7381 u64 cfs_period_us) 7392 struct cftype *cftype, u64 cfs_period_us)
7382{ 7393{
7383 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); 7394 return tg_set_cfs_period(css_tg(css), cfs_period_us);
7384} 7395}
7385 7396
7386struct cfs_schedulable_data { 7397struct cfs_schedulable_data {
@@ -7461,10 +7472,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7461 return ret; 7472 return ret;
7462} 7473}
7463 7474
7464static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, 7475static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
7465 struct cgroup_map_cb *cb) 7476 struct cgroup_map_cb *cb)
7466{ 7477{
7467 struct task_group *tg = cgroup_tg(cgrp); 7478 struct task_group *tg = css_tg(css);
7468 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7479 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7469 7480
7470 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7481 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
@@ -7477,26 +7488,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7477#endif /* CONFIG_FAIR_GROUP_SCHED */ 7488#endif /* CONFIG_FAIR_GROUP_SCHED */
7478 7489
7479#ifdef CONFIG_RT_GROUP_SCHED 7490#ifdef CONFIG_RT_GROUP_SCHED
7480static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 7491static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
7481 s64 val) 7492 struct cftype *cft, s64 val)
7482{ 7493{
7483 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 7494 return sched_group_set_rt_runtime(css_tg(css), val);
7484} 7495}
7485 7496
7486static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 7497static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
7498 struct cftype *cft)
7487{ 7499{
7488 return sched_group_rt_runtime(cgroup_tg(cgrp)); 7500 return sched_group_rt_runtime(css_tg(css));
7489} 7501}
7490 7502
7491static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 7503static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
7492 u64 rt_period_us) 7504 struct cftype *cftype, u64 rt_period_us)
7493{ 7505{
7494 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 7506 return sched_group_set_rt_period(css_tg(css), rt_period_us);
7495} 7507}
7496 7508
7497static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 7509static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
7510 struct cftype *cft)
7498{ 7511{
7499 return sched_group_rt_period(cgroup_tg(cgrp)); 7512 return sched_group_rt_period(css_tg(css));
7500} 7513}
7501#endif /* CONFIG_RT_GROUP_SCHED */ 7514#endif /* CONFIG_RT_GROUP_SCHED */
7502 7515
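The pattern repeated throughout this conversion — css_tg() here, css_ca() below — is just container_of() plus a NULL check, so that a missing parent css maps to a NULL controller state. A self-contained sketch of the same shape (the struct names are invented, not the kernel's):

    #include <stddef.h>

    struct css { struct css *parent; };
    struct grp { int weight; struct css css; };

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    static struct grp *css_to_grp(struct css *css)
    {
            return css ? container_of(css, struct grp, css) : NULL;
    }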
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dbb7e2cd95eb..f64722ff0299 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -33,30 +33,20 @@ struct cpuacct {
33 struct kernel_cpustat __percpu *cpustat; 33 struct kernel_cpustat __percpu *cpustat;
34}; 34};
35 35
36/* return cpu accounting group corresponding to this container */ 36static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
37static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
38{ 37{
39 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 38 return css ? container_of(css, struct cpuacct, css) : NULL;
40 struct cpuacct, css);
41} 39}
42 40
43/* return cpu accounting group to which this task belongs */ 41/* return cpu accounting group to which this task belongs */
44static inline struct cpuacct *task_ca(struct task_struct *tsk) 42static inline struct cpuacct *task_ca(struct task_struct *tsk)
45{ 43{
46 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 44 return css_ca(task_css(tsk, cpuacct_subsys_id));
47 struct cpuacct, css);
48}
49
50static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
51{
52 return cgroup_ca(ca->css.cgroup->parent);
53} 45}
54 46
55static inline struct cpuacct *parent_ca(struct cpuacct *ca) 47static inline struct cpuacct *parent_ca(struct cpuacct *ca)
56{ 48{
57 if (!ca->css.cgroup->parent) 49 return css_ca(css_parent(&ca->css));
58 return NULL;
59 return cgroup_ca(ca->css.cgroup->parent);
60} 50}
61 51
62static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); 52static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
@@ -66,11 +56,12 @@ static struct cpuacct root_cpuacct = {
66}; 56};
67 57
68/* create a new cpu accounting group */ 58/* create a new cpu accounting group */
69static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) 59static struct cgroup_subsys_state *
60cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
70{ 61{
71 struct cpuacct *ca; 62 struct cpuacct *ca;
72 63
73 if (!cgrp->parent) 64 if (!parent_css)
74 return &root_cpuacct.css; 65 return &root_cpuacct.css;
75 66
76 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 67 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
@@ -96,9 +87,9 @@ out:
96} 87}
97 88
98/* destroy an existing cpu accounting group */ 89/* destroy an existing cpu accounting group */
99static void cpuacct_css_free(struct cgroup *cgrp) 90static void cpuacct_css_free(struct cgroup_subsys_state *css)
100{ 91{
101 struct cpuacct *ca = cgroup_ca(cgrp); 92 struct cpuacct *ca = css_ca(css);
102 93
103 free_percpu(ca->cpustat); 94 free_percpu(ca->cpustat);
104 free_percpu(ca->cpuusage); 95 free_percpu(ca->cpuusage);
@@ -141,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
141} 132}
142 133
143/* return total cpu usage (in nanoseconds) of a group */ 134/* return total cpu usage (in nanoseconds) of a group */
144static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 135static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
145{ 136{
146 struct cpuacct *ca = cgroup_ca(cgrp); 137 struct cpuacct *ca = css_ca(css);
147 u64 totalcpuusage = 0; 138 u64 totalcpuusage = 0;
148 int i; 139 int i;
149 140
@@ -153,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
153 return totalcpuusage; 144 return totalcpuusage;
154} 145}
155 146
156static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 147static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
157 u64 reset) 148 u64 reset)
158{ 149{
159 struct cpuacct *ca = cgroup_ca(cgrp); 150 struct cpuacct *ca = css_ca(css);
160 int err = 0; 151 int err = 0;
161 int i; 152 int i;
162 153
@@ -172,10 +163,10 @@ out:
172 return err; 163 return err;
173} 164}
174 165
175static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 166static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css,
176 struct seq_file *m) 167 struct cftype *cft, struct seq_file *m)
177{ 168{
178 struct cpuacct *ca = cgroup_ca(cgroup); 169 struct cpuacct *ca = css_ca(css);
179 u64 percpu; 170 u64 percpu;
180 int i; 171 int i;
181 172
@@ -192,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = {
192 [CPUACCT_STAT_SYSTEM] = "system", 183 [CPUACCT_STAT_SYSTEM] = "system",
193}; 184};
194 185
195static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 186static int cpuacct_stats_show(struct cgroup_subsys_state *css,
196 struct cgroup_map_cb *cb) 187 struct cftype *cft, struct cgroup_map_cb *cb)
197{ 188{
198 struct cpuacct *ca = cgroup_ca(cgrp); 189 struct cpuacct *ca = css_ca(css);
199 int cpu; 190 int cpu;
200 s64 val = 0; 191 s64 val = 0;
201 192
@@ -281,7 +272,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
281 while (ca != &root_cpuacct) { 272 while (ca != &root_cpuacct) {
282 kcpustat = this_cpu_ptr(ca->cpustat); 273 kcpustat = this_cpu_ptr(ca->cpustat);
283 kcpustat->cpustat[index] += val; 274 kcpustat->cpustat[index] += val;
284 ca = __parent_ca(ca); 275 ca = parent_ca(ca);
285 } 276 }
286 rcu_read_unlock(); 277 rcu_read_unlock();
287} 278}
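cpuacct_account_field(), shown in the last hunk above, charges the same delta to the task's group and every ancestor group, stopping at the root (which is covered separately by the global kernel_cpustat). In miniature, with invented types:

    struct acct { unsigned long long stat; struct acct *parent; };

    static void charge_hierarchy(struct acct *ca, struct acct *root,
                                 unsigned long long delta)
    {
            for (; ca != root; ca = ca->parent)
                    ca->stat += delta;
    }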
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a7959e05a9d5..ace34f95e200 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -121,7 +121,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
121 * is the only cgroup, then nothing else should be necessary. 121 * is the only cgroup, then nothing else should be necessary.
122 * 122 *
123 */ 123 */
124 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; 124 __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
125 125
126 cpuacct_account_field(p, index, tmp); 126 cpuacct_account_field(p, index, tmp);
127} 127}
@@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
378#ifdef CONFIG_VIRT_CPU_ACCOUNTING 378#ifdef CONFIG_VIRT_CPU_ACCOUNTING
379 379
380#ifndef __ARCH_HAS_VTIME_TASK_SWITCH 380#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
381void vtime_task_switch(struct task_struct *prev) 381void vtime_common_task_switch(struct task_struct *prev)
382{ 382{
383 if (!vtime_accounting_enabled())
384 return;
385
386 if (is_idle_task(prev)) 383 if (is_idle_task(prev))
387 vtime_account_idle(prev); 384 vtime_account_idle(prev);
388 else 385 else
@@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev)
404 * vtime_account(). 401 * vtime_account().
405 */ 402 */
406#ifndef __ARCH_HAS_VTIME_ACCOUNT 403#ifndef __ARCH_HAS_VTIME_ACCOUNT
407void vtime_account_irq_enter(struct task_struct *tsk) 404void vtime_common_account_irq_enter(struct task_struct *tsk)
408{ 405{
409 if (!vtime_accounting_enabled())
410 return;
411
412 if (!in_interrupt()) { 406 if (!in_interrupt()) {
413 /* 407 /*
414 * If we interrupted user, context_tracking_in_user() 408 * If we interrupted user, context_tracking_in_user()
@@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
428 } 422 }
429 vtime_account_system(tsk); 423 vtime_account_system(tsk);
430} 424}
431EXPORT_SYMBOL_GPL(vtime_account_irq_enter); 425EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
432#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 426#endif /* __ARCH_HAS_VTIME_ACCOUNT */
433#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ 427#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
434 428
@@ -559,12 +553,6 @@ static void cputime_adjust(struct task_cputime *curr,
559{ 553{
560 cputime_t rtime, stime, utime, total; 554 cputime_t rtime, stime, utime, total;
561 555
562 if (vtime_accounting_enabled()) {
563 *ut = curr->utime;
564 *st = curr->stime;
565 return;
566 }
567
568 stime = curr->stime; 556 stime = curr->stime;
569 total = stime + curr->utime; 557 total = stime + curr->utime;
570 558
@@ -664,23 +652,17 @@ static void __vtime_account_system(struct task_struct *tsk)
664 652
665void vtime_account_system(struct task_struct *tsk) 653void vtime_account_system(struct task_struct *tsk)
666{ 654{
667 if (!vtime_accounting_enabled())
668 return;
669
670 write_seqlock(&tsk->vtime_seqlock); 655 write_seqlock(&tsk->vtime_seqlock);
671 __vtime_account_system(tsk); 656 __vtime_account_system(tsk);
672 write_sequnlock(&tsk->vtime_seqlock); 657 write_sequnlock(&tsk->vtime_seqlock);
673} 658}
674 659
675void vtime_account_irq_exit(struct task_struct *tsk) 660void vtime_gen_account_irq_exit(struct task_struct *tsk)
676{ 661{
677 if (!vtime_accounting_enabled())
678 return;
679
680 write_seqlock(&tsk->vtime_seqlock); 662 write_seqlock(&tsk->vtime_seqlock);
663 __vtime_account_system(tsk);
681 if (context_tracking_in_user()) 664 if (context_tracking_in_user())
682 tsk->vtime_snap_whence = VTIME_USER; 665 tsk->vtime_snap_whence = VTIME_USER;
683 __vtime_account_system(tsk);
684 write_sequnlock(&tsk->vtime_seqlock); 666 write_sequnlock(&tsk->vtime_seqlock);
685} 667}
686 668
@@ -688,12 +670,8 @@ void vtime_account_user(struct task_struct *tsk)
688{ 670{
689 cputime_t delta_cpu; 671 cputime_t delta_cpu;
690 672
691 if (!vtime_accounting_enabled())
692 return;
693
694 delta_cpu = get_vtime_delta(tsk);
695
696 write_seqlock(&tsk->vtime_seqlock); 673 write_seqlock(&tsk->vtime_seqlock);
674 delta_cpu = get_vtime_delta(tsk);
697 tsk->vtime_snap_whence = VTIME_SYS; 675 tsk->vtime_snap_whence = VTIME_SYS;
698 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); 676 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
699 write_sequnlock(&tsk->vtime_seqlock); 677 write_sequnlock(&tsk->vtime_seqlock);
@@ -701,22 +679,27 @@ void vtime_account_user(struct task_struct *tsk)
701 679
702void vtime_user_enter(struct task_struct *tsk) 680void vtime_user_enter(struct task_struct *tsk)
703{ 681{
704 if (!vtime_accounting_enabled())
705 return;
706
707 write_seqlock(&tsk->vtime_seqlock); 682 write_seqlock(&tsk->vtime_seqlock);
708 tsk->vtime_snap_whence = VTIME_USER;
709 __vtime_account_system(tsk); 683 __vtime_account_system(tsk);
684 tsk->vtime_snap_whence = VTIME_USER;
710 write_sequnlock(&tsk->vtime_seqlock); 685 write_sequnlock(&tsk->vtime_seqlock);
711} 686}
712 687
713void vtime_guest_enter(struct task_struct *tsk) 688void vtime_guest_enter(struct task_struct *tsk)
714{ 689{
690 /*
691 * The flags must be updated under the lock with
692 * the vtime_snap flush and update.
 693 * That enforces the right ordering and update-sequence
 694 * synchronization against the reader (task_gtime()),
 695 * which can thus safely catch up with a tickless delta.
696 */
715 write_seqlock(&tsk->vtime_seqlock); 697 write_seqlock(&tsk->vtime_seqlock);
716 __vtime_account_system(tsk); 698 __vtime_account_system(tsk);
717 current->flags |= PF_VCPU; 699 current->flags |= PF_VCPU;
718 write_sequnlock(&tsk->vtime_seqlock); 700 write_sequnlock(&tsk->vtime_seqlock);
719} 701}
702EXPORT_SYMBOL_GPL(vtime_guest_enter);
720 703
721void vtime_guest_exit(struct task_struct *tsk) 704void vtime_guest_exit(struct task_struct *tsk)
722{ 705{
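The comment above is about the seqlock discipline: PF_VCPU and the vtime snapshot must change inside the same write-side critical section so that task_gtime()'s read-side retry loop never observes one without the other. The core of that reader/writer protocol, reduced to a single published value and written with C11 atomics purely for illustration:

    #include <stdatomic.h>

    static _Atomic unsigned seq;                    /* even = stable   */
    static _Atomic unsigned long long gtime;        /* published value */

    static void writer_update(unsigned long long v)
    {
            atomic_fetch_add(&seq, 1);              /* odd: update in flight */
            atomic_store(&gtime, v);
            atomic_fetch_add(&seq, 1);              /* even: update complete */
    }

    static unsigned long long reader_read(void)
    {
            unsigned s;
            unsigned long long v;

            do {
                    s = atomic_load(&seq);
                    v = atomic_load(&gtime);
            } while ((s & 1) || s != atomic_load(&seq));

            return v;
    }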
@@ -725,6 +708,7 @@ void vtime_guest_exit(struct task_struct *tsk)
725 current->flags &= ~PF_VCPU; 708 current->flags &= ~PF_VCPU;
726 write_sequnlock(&tsk->vtime_seqlock); 709 write_sequnlock(&tsk->vtime_seqlock);
727} 710}
711EXPORT_SYMBOL_GPL(vtime_guest_exit);
728 712
729void vtime_account_idle(struct task_struct *tsk) 713void vtime_account_idle(struct task_struct *tsk)
730{ 714{
@@ -733,11 +717,6 @@ void vtime_account_idle(struct task_struct *tsk)
733 account_idle_time(delta_cpu); 717 account_idle_time(delta_cpu);
734} 718}
735 719
736bool vtime_accounting_enabled(void)
737{
738 return context_tracking_active();
739}
740
741void arch_vtime_task_switch(struct task_struct *prev) 720void arch_vtime_task_switch(struct task_struct *prev)
742{ 721{
743 write_seqlock(&prev->vtime_seqlock); 722 write_seqlock(&prev->vtime_seqlock);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 68f1609ca149..7f0a5e6cdae0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3018,6 +3018,23 @@ static unsigned long cpu_avg_load_per_task(int cpu)
3018 return 0; 3018 return 0;
3019} 3019}
3020 3020
3021static void record_wakee(struct task_struct *p)
3022{
3023 /*
3024 * Rough decay (wiping) for cost saving, don't worry
3025 * about the boundary, really active task won't care
3026 * about the loss.
3027 */
3028 if (jiffies > current->wakee_flip_decay_ts + HZ) {
3029 current->wakee_flips = 0;
3030 current->wakee_flip_decay_ts = jiffies;
3031 }
3032
3033 if (current->last_wakee != p) {
3034 current->last_wakee = p;
3035 current->wakee_flips++;
3036 }
3037}
3021 3038
3022static void task_waking_fair(struct task_struct *p) 3039static void task_waking_fair(struct task_struct *p)
3023{ 3040{
@@ -3038,6 +3055,7 @@ static void task_waking_fair(struct task_struct *p)
3038#endif 3055#endif
3039 3056
3040 se->vruntime -= min_vruntime; 3057 se->vruntime -= min_vruntime;
3058 record_wakee(p);
3041} 3059}
3042 3060
3043#ifdef CONFIG_FAIR_GROUP_SCHED 3061#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3156,6 +3174,28 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
3156 3174
3157#endif 3175#endif
3158 3176
3177static int wake_wide(struct task_struct *p)
3178{
3179 int factor = this_cpu_read(sd_llc_size);
3180
3181 /*
3182 * wakee_flips is effectively a switching frequency: a high value can
3183 * mean many different wakees or rapid switching. Using the LLC size as
3184 * the factor auto-scales the threshold, so a bigger node leads to more pull.
3185 */
3186 if (p->wakee_flips > factor) {
3187 /*
3188 * The wakee is somewhat hot and needs a certain amount of CPU;
3189 * if the waker is far hotter still, prefer to leave the wakee
3190 * where it is.
3191 */
3192 if (current->wakee_flips > (factor * p->wakee_flips))
3193 return 1;
3194 }
3195
3196 return 0;
3197}
3198
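Putting the two conditions together: a wakeup is treated as "wide" when the wakee flips between more partners than fit in one LLC and the waker flips between far more still, in which case the affine pull is skipped. A worked example with invented numbers — factor 8 (an 8-CPU LLC), wakee_flips 10, waker_flips 100:

    /* 10 > 8 and 100 > 8 * 10, so this wakeup counts as "wide":
     * wake_affine() is skipped and the wakee stays near its previous CPU. */
    static int wide_example(unsigned int factor,
                            unsigned int waker_flips, unsigned int wakee_flips)
    {
            return wakee_flips > factor && waker_flips > factor * wakee_flips;
    }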
3159static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 3199static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3160{ 3200{
3161 s64 this_load, load; 3201 s64 this_load, load;
@@ -3165,6 +3205,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3165 unsigned long weight; 3205 unsigned long weight;
3166 int balanced; 3206 int balanced;
3167 3207
3208 /*
3209 * If we wake multiple tasks be careful to not bounce
3210 * ourselves around too much.
3211 */
3212 if (wake_wide(p))
3213 return 0;
3214
3168 idx = sd->wake_idx; 3215 idx = sd->wake_idx;
3169 this_cpu = smp_processor_id(); 3216 this_cpu = smp_processor_id();
3170 prev_cpu = task_cpu(p); 3217 prev_cpu = task_cpu(p);
@@ -4172,47 +4219,48 @@ static void update_blocked_averages(int cpu)
4172} 4219}
4173 4220
4174/* 4221/*
4175 * Compute the cpu's hierarchical load factor for each task group. 4222 * Compute the hierarchical load factor for cfs_rq and all its ancestors.
4176 * This needs to be done in a top-down fashion because the load of a child 4223 * This needs to be done in a top-down fashion because the load of a child
4177 * group is a fraction of its parent's load. 4224 * group is a fraction of its parent's load.
4178 */ 4225 */
4179static int tg_load_down(struct task_group *tg, void *data) 4226static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
4180{
4181 unsigned long load;
4182 long cpu = (long)data;
4183
4184 if (!tg->parent) {
4185 load = cpu_rq(cpu)->avg.load_avg_contrib;
4186 } else {
4187 load = tg->parent->cfs_rq[cpu]->h_load;
4188 load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
4189 tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
4190 }
4191
4192 tg->cfs_rq[cpu]->h_load = load;
4193
4194 return 0;
4195}
4196
4197static void update_h_load(long cpu)
4198{ 4227{
4199 struct rq *rq = cpu_rq(cpu); 4228 struct rq *rq = rq_of(cfs_rq);
4229 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
4200 unsigned long now = jiffies; 4230 unsigned long now = jiffies;
4231 unsigned long load;
4201 4232
4202 if (rq->h_load_throttle == now) 4233 if (cfs_rq->last_h_load_update == now)
4203 return; 4234 return;
4204 4235
4205 rq->h_load_throttle = now; 4236 cfs_rq->h_load_next = NULL;
4237 for_each_sched_entity(se) {
4238 cfs_rq = cfs_rq_of(se);
4239 cfs_rq->h_load_next = se;
4240 if (cfs_rq->last_h_load_update == now)
4241 break;
4242 }
4206 4243
4207 rcu_read_lock(); 4244 if (!se) {
4208 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 4245 cfs_rq->h_load = rq->avg.load_avg_contrib;
4209 rcu_read_unlock(); 4246 cfs_rq->last_h_load_update = now;
4247 }
4248
4249 while ((se = cfs_rq->h_load_next) != NULL) {
4250 load = cfs_rq->h_load;
4251 load = div64_ul(load * se->avg.load_avg_contrib,
4252 cfs_rq->runnable_load_avg + 1);
4253 cfs_rq = group_cfs_rq(se);
4254 cfs_rq->h_load = load;
4255 cfs_rq->last_h_load_update = now;
4256 }
4210} 4257}
4211 4258
4212static unsigned long task_h_load(struct task_struct *p) 4259static unsigned long task_h_load(struct task_struct *p)
4213{ 4260{
4214 struct cfs_rq *cfs_rq = task_cfs_rq(p); 4261 struct cfs_rq *cfs_rq = task_cfs_rq(p);
4215 4262
4263 update_cfs_rq_h_load(cfs_rq);
4216 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, 4264 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
4217 cfs_rq->runnable_load_avg + 1); 4265 cfs_rq->runnable_load_avg + 1);
4218} 4266}
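To make the arithmetic concrete (all numbers invented): suppose the root cfs_rq's load contribution is 1024, the child group's entity contributes 512 against a root-level runnable load of 1023, and a task in that group contributes 256 against the group's runnable load of 511. Then:

    /*
     *   child h_load = 1024 * 512 / (1023 + 1) = 512
     *   task_h_load  =  256 * 512 / ( 511 + 1) = 256
     *
     * i.e. the task ends up charged a quarter of the root-level load,
     * reflecting its half share of a group that itself has a half share,
     * exactly what the top-down walk above computes level by level.
     */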
@@ -4221,10 +4269,6 @@ static inline void update_blocked_averages(int cpu)
4221{ 4269{
4222} 4270}
4223 4271
4224static inline void update_h_load(long cpu)
4225{
4226}
4227
4228static unsigned long task_h_load(struct task_struct *p) 4272static unsigned long task_h_load(struct task_struct *p)
4229{ 4273{
4230 return p->se.avg.load_avg_contrib; 4274 return p->se.avg.load_avg_contrib;
@@ -4233,50 +4277,56 @@ static unsigned long task_h_load(struct task_struct *p)
4233 4277
4234/********** Helpers for find_busiest_group ************************/ 4278/********** Helpers for find_busiest_group ************************/
4235/* 4279/*
4236 * sd_lb_stats - Structure to store the statistics of a sched_domain
4237 * during load balancing.
4238 */
4239struct sd_lb_stats {
4240 struct sched_group *busiest; /* Busiest group in this sd */
4241 struct sched_group *this; /* Local group in this sd */
4242 unsigned long total_load; /* Total load of all groups in sd */
4243 unsigned long total_pwr; /* Total power of all groups in sd */
4244 unsigned long avg_load; /* Average load across all groups in sd */
4245
4246 /** Statistics of this group */
4247 unsigned long this_load;
4248 unsigned long this_load_per_task;
4249 unsigned long this_nr_running;
4250 unsigned long this_has_capacity;
4251 unsigned int this_idle_cpus;
4252
4253 /* Statistics of the busiest group */
4254 unsigned int busiest_idle_cpus;
4255 unsigned long max_load;
4256 unsigned long busiest_load_per_task;
4257 unsigned long busiest_nr_running;
4258 unsigned long busiest_group_capacity;
4259 unsigned long busiest_has_capacity;
4260 unsigned int busiest_group_weight;
4261
4262 int group_imb; /* Is there imbalance in this sd */
4263};
4264
4265/*
4266 * sg_lb_stats - stats of a sched_group required for load_balancing 4280 * sg_lb_stats - stats of a sched_group required for load_balancing
4267 */ 4281 */
4268struct sg_lb_stats { 4282struct sg_lb_stats {
4269 unsigned long avg_load; /*Avg load across the CPUs of the group */ 4283 unsigned long avg_load; /*Avg load across the CPUs of the group */
4270 unsigned long group_load; /* Total load over the CPUs of the group */ 4284 unsigned long group_load; /* Total load over the CPUs of the group */
4271 unsigned long sum_nr_running; /* Nr tasks running in the group */
4272 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 4285 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
4273 unsigned long group_capacity; 4286 unsigned long load_per_task;
4274 unsigned long idle_cpus; 4287 unsigned long group_power;
4275 unsigned long group_weight; 4288 unsigned int sum_nr_running; /* Nr tasks running in the group */
4289 unsigned int group_capacity;
4290 unsigned int idle_cpus;
4291 unsigned int group_weight;
4276 int group_imb; /* Is there an imbalance in the group ? */ 4292 int group_imb; /* Is there an imbalance in the group ? */
4277 int group_has_capacity; /* Is there extra capacity in the group? */ 4293 int group_has_capacity; /* Is there extra capacity in the group? */
4278}; 4294};
4279 4295
4296/*
4297 * sd_lb_stats - Structure to store the statistics of a sched_domain
4298 * during load balancing.
4299 */
4300struct sd_lb_stats {
4301 struct sched_group *busiest; /* Busiest group in this sd */
4302 struct sched_group *local; /* Local group in this sd */
4303 unsigned long total_load; /* Total load of all groups in sd */
4304 unsigned long total_pwr; /* Total power of all groups in sd */
4305 unsigned long avg_load; /* Average load across all groups in sd */
4306
4307 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
4308 struct sg_lb_stats local_stat; /* Statistics of the local group */
4309};
4310
4311static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
4312{
4313 /*
4314 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
4315 * local_stat because update_sg_lb_stats() does a full clear/assignment.
4316 * We must however clear busiest_stat::avg_load because
4317 * update_sd_pick_busiest() reads this before assignment.
4318 */
4319 *sds = (struct sd_lb_stats){
4320 .busiest = NULL,
4321 .local = NULL,
4322 .total_load = 0UL,
4323 .total_pwr = 0UL,
4324 .busiest_stat = {
4325 .avg_load = 0UL,
4326 },
4327 };
4328}
4329
4280/** 4330/**
4281 * get_sd_load_idx - Obtain the load index for a given sched domain. 4331 * get_sd_load_idx - Obtain the load index for a given sched domain.
4282 * @sd: The sched_domain whose load_idx is to be obtained. 4332 * @sd: The sched_domain whose load_idx is to be obtained.
@@ -4460,33 +4510,99 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4460 return 0; 4510 return 0;
4461} 4511}
4462 4512
4513/*
4514 * Group imbalance indicates (and tries to solve) the problem where balancing
4515 * groups is inadequate due to tsk_cpus_allowed() constraints.
4516 *
4517 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
4518 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
4519 * Something like:
4520 *
4521 * { 0 1 2 3 } { 4 5 6 7 }
4522 * * * * *
4523 *
4524 * If we were to balance group-wise we'd place two tasks in the first group and
4525 * two tasks in the second group. Clearly this is undesired as it will overload
4526 * cpu 3 and leave one of the cpus in the second group unused.
4527 *
4528 * The current solution to this issue is detecting the skew in the first group
4529 * by noticing it has a cpu that is overloaded while the remaining cpus are
4530 * idle -- or rather, there's a distinct imbalance in the cpus; see
4531 * sg_imbalanced().
4532 *
4533 * When this is detected, the group becomes a candidate for busiest; see
4534 * update_sd_pick_busiest(). calculate_imbalance() and
4535 * find_busiest_group() then avoid some of the usual balance conditions to allow it
4536 * to create an effective group imbalance.
4537 *
4538 * This is a somewhat tricky proposition since the next run might not find the
4539 * group imbalance and decide the groups need to be balanced again. A most
4540 * subtle and fragile situation.
4541 */
4542
4543struct sg_imb_stats {
4544 unsigned long max_nr_running, min_nr_running;
4545 unsigned long max_cpu_load, min_cpu_load;
4546};
4547
4548static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
4549{
4550 sgi->max_cpu_load = sgi->max_nr_running = 0UL;
4551 sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
4552}
4553
4554static inline void
4555update_sg_imb_stats(struct sg_imb_stats *sgi,
4556 unsigned long load, unsigned long nr_running)
4557{
4558 if (load > sgi->max_cpu_load)
4559 sgi->max_cpu_load = load;
4560 if (sgi->min_cpu_load > load)
4561 sgi->min_cpu_load = load;
4562
4563 if (nr_running > sgi->max_nr_running)
4564 sgi->max_nr_running = nr_running;
4565 if (sgi->min_nr_running > nr_running)
4566 sgi->min_nr_running = nr_running;
4567}
4568
4569static inline int
4570sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
4571{
4572 /*
4573 * Consider the group unbalanced when the imbalance is larger
4574 * than the average weight of a task.
4575 *
4576 * APZ: with cgroup the avg task weight can vary wildly and
4577 * might not be a suitable number - should we keep a
4578 * normalized nr_running number somewhere that negates
4579 * the hierarchy?
4580 */
4581 if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
4582 (sgi->max_nr_running - sgi->min_nr_running) > 1)
4583 return 1;
4584
4585 return 0;
4586}
4587
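Plugging the comment's { 0 1 2 3 } example into the test above (load figures invented): the pinned CPU carries two tasks with a combined load of 2048 while its three siblings sit idle at 0, and load_per_task is about 1024, so both conditions hold and the group is flagged imbalanced, making it a busiest candidate despite its low average load.

    /*
     *   max_cpu_load - min_cpu_load     = 2048 - 0 >= load_per_task (1024)
     *   max_nr_running - min_nr_running =    2 - 0 >  1
     */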
4463/** 4588/**
4464 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 4589 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
4465 * @env: The load balancing environment. 4590 * @env: The load balancing environment.
4466 * @group: sched_group whose statistics are to be updated. 4591 * @group: sched_group whose statistics are to be updated.
4467 * @load_idx: Load index of sched_domain of this_cpu for load calc. 4592 * @load_idx: Load index of sched_domain of this_cpu for load calc.
4468 * @local_group: Does group contain this_cpu. 4593 * @local_group: Does group contain this_cpu.
4469 * @balance: Should we balance.
4470 * @sgs: variable to hold the statistics for this group. 4594 * @sgs: variable to hold the statistics for this group.
4471 */ 4595 */
4472static inline void update_sg_lb_stats(struct lb_env *env, 4596static inline void update_sg_lb_stats(struct lb_env *env,
4473 struct sched_group *group, int load_idx, 4597 struct sched_group *group, int load_idx,
4474 int local_group, int *balance, struct sg_lb_stats *sgs) 4598 int local_group, struct sg_lb_stats *sgs)
4475{ 4599{
4476 unsigned long nr_running, max_nr_running, min_nr_running; 4600 struct sg_imb_stats sgi;
4477 unsigned long load, max_cpu_load, min_cpu_load; 4601 unsigned long nr_running;
4478 unsigned int balance_cpu = -1, first_idle_cpu = 0; 4602 unsigned long load;
4479 unsigned long avg_load_per_task = 0;
4480 int i; 4603 int i;
4481 4604
4482 if (local_group) 4605 init_sg_imb_stats(&sgi);
4483 balance_cpu = group_balance_cpu(group);
4484
4485 /* Tally up the load of all CPUs in the group */
4486 max_cpu_load = 0;
4487 min_cpu_load = ~0UL;
4488 max_nr_running = 0;
4489 min_nr_running = ~0UL;
4490 4606
4491 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 4607 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4492 struct rq *rq = cpu_rq(i); 4608 struct rq *rq = cpu_rq(i);
@@ -4495,24 +4611,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4495 4611
4496 /* Bias balancing toward cpus of our domain */ 4612 /* Bias balancing toward cpus of our domain */
4497 if (local_group) { 4613 if (local_group) {
4498 if (idle_cpu(i) && !first_idle_cpu &&
4499 cpumask_test_cpu(i, sched_group_mask(group))) {
4500 first_idle_cpu = 1;
4501 balance_cpu = i;
4502 }
4503
4504 load = target_load(i, load_idx); 4614 load = target_load(i, load_idx);
4505 } else { 4615 } else {
4506 load = source_load(i, load_idx); 4616 load = source_load(i, load_idx);
4507 if (load > max_cpu_load) 4617 update_sg_imb_stats(&sgi, load, nr_running);
4508 max_cpu_load = load;
4509 if (min_cpu_load > load)
4510 min_cpu_load = load;
4511
4512 if (nr_running > max_nr_running)
4513 max_nr_running = nr_running;
4514 if (min_nr_running > nr_running)
4515 min_nr_running = nr_running;
4516 } 4618 }
4517 4619
4518 sgs->group_load += load; 4620 sgs->group_load += load;
@@ -4522,46 +4624,25 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4522 sgs->idle_cpus++; 4624 sgs->idle_cpus++;
4523 } 4625 }
4524 4626
4525 /* 4627 if (local_group && (env->idle != CPU_NEWLY_IDLE ||
4526 * First idle cpu or the first cpu(busiest) in this sched group 4628 time_after_eq(jiffies, group->sgp->next_update)))
4527 * is eligible for doing load balancing at this and above 4629 update_group_power(env->sd, env->dst_cpu);
4528 * domains. In the newly idle case, we will allow all the cpu's
4529 * to do the newly idle load balance.
4530 */
4531 if (local_group) {
4532 if (env->idle != CPU_NEWLY_IDLE) {
4533 if (balance_cpu != env->dst_cpu) {
4534 *balance = 0;
4535 return;
4536 }
4537 update_group_power(env->sd, env->dst_cpu);
4538 } else if (time_after_eq(jiffies, group->sgp->next_update))
4539 update_group_power(env->sd, env->dst_cpu);
4540 }
4541 4630
4542 /* Adjust by relative CPU power of the group */ 4631 /* Adjust by relative CPU power of the group */
4543 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; 4632 sgs->group_power = group->sgp->power;
4633 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
4544 4634
4545 /*
4546 * Consider the group unbalanced when the imbalance is larger
4547 * than the average weight of a task.
4548 *
4549 * APZ: with cgroup the avg task weight can vary wildly and
4550 * might not be a suitable number - should we keep a
4551 * normalized nr_running number somewhere that negates
4552 * the hierarchy?
4553 */
4554 if (sgs->sum_nr_running) 4635 if (sgs->sum_nr_running)
4555 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 4636 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4637
4638 sgs->group_imb = sg_imbalanced(sgs, &sgi);
4556 4639
4557 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && 4640 sgs->group_capacity =
4558 (max_nr_running - min_nr_running) > 1) 4641 DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
4559 sgs->group_imb = 1;
4560 4642
4561 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
4562 SCHED_POWER_SCALE);
4563 if (!sgs->group_capacity) 4643 if (!sgs->group_capacity)
4564 sgs->group_capacity = fix_small_capacity(env->sd, group); 4644 sgs->group_capacity = fix_small_capacity(env->sd, group);
4645
4565 sgs->group_weight = group->group_weight; 4646 sgs->group_weight = group->group_weight;
4566 4647
4567 if (sgs->group_capacity > sgs->sum_nr_running) 4648 if (sgs->group_capacity > sgs->sum_nr_running)
@@ -4586,7 +4667,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
4586 struct sched_group *sg, 4667 struct sched_group *sg,
4587 struct sg_lb_stats *sgs) 4668 struct sg_lb_stats *sgs)
4588{ 4669{
4589 if (sgs->avg_load <= sds->max_load) 4670 if (sgs->avg_load <= sds->busiest_stat.avg_load)
4590 return false; 4671 return false;
4591 4672
4592 if (sgs->sum_nr_running > sgs->group_capacity) 4673 if (sgs->sum_nr_running > sgs->group_capacity)
@@ -4619,11 +4700,11 @@ static bool update_sd_pick_busiest(struct lb_env *env,
4619 * @sds: variable to hold the statistics for this sched_domain. 4700 * @sds: variable to hold the statistics for this sched_domain.
4620 */ 4701 */
4621static inline void update_sd_lb_stats(struct lb_env *env, 4702static inline void update_sd_lb_stats(struct lb_env *env,
4622 int *balance, struct sd_lb_stats *sds) 4703 struct sd_lb_stats *sds)
4623{ 4704{
4624 struct sched_domain *child = env->sd->child; 4705 struct sched_domain *child = env->sd->child;
4625 struct sched_group *sg = env->sd->groups; 4706 struct sched_group *sg = env->sd->groups;
4626 struct sg_lb_stats sgs; 4707 struct sg_lb_stats tmp_sgs;
4627 int load_idx, prefer_sibling = 0; 4708 int load_idx, prefer_sibling = 0;
4628 4709
4629 if (child && child->flags & SD_PREFER_SIBLING) 4710 if (child && child->flags & SD_PREFER_SIBLING)
@@ -4632,17 +4713,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4632 load_idx = get_sd_load_idx(env->sd, env->idle); 4713 load_idx = get_sd_load_idx(env->sd, env->idle);
4633 4714
4634 do { 4715 do {
4716 struct sg_lb_stats *sgs = &tmp_sgs;
4635 int local_group; 4717 int local_group;
4636 4718
4637 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); 4719 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
4638 memset(&sgs, 0, sizeof(sgs)); 4720 if (local_group) {
4639 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); 4721 sds->local = sg;
4640 4722 sgs = &sds->local_stat;
4641 if (local_group && !(*balance)) 4723 }
4642 return;
4643 4724
4644 sds->total_load += sgs.group_load; 4725 memset(sgs, 0, sizeof(*sgs));
4645 sds->total_pwr += sg->sgp->power; 4726 update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
4646 4727
4647 /* 4728 /*
4648 * In case the child domain prefers tasks go to siblings 4729 * In case the child domain prefers tasks go to siblings
@@ -4654,26 +4735,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4654 * heaviest group when it is already under-utilized (possible 4735 * heaviest group when it is already under-utilized (possible
4655 * with a large weight task outweighs the tasks on the system). 4736 * with a large weight task outweighs the tasks on the system).
4656 */ 4737 */
4657 if (prefer_sibling && !local_group && sds->this_has_capacity) 4738 if (prefer_sibling && !local_group &&
4658 sgs.group_capacity = min(sgs.group_capacity, 1UL); 4739 sds->local && sds->local_stat.group_has_capacity)
4740 sgs->group_capacity = min(sgs->group_capacity, 1U);
4659 4741
4660 if (local_group) { 4742 /* Now, start updating sd_lb_stats */
4661 sds->this_load = sgs.avg_load; 4743 sds->total_load += sgs->group_load;
4662 sds->this = sg; 4744 sds->total_pwr += sgs->group_power;
4663 sds->this_nr_running = sgs.sum_nr_running; 4745
4664 sds->this_load_per_task = sgs.sum_weighted_load; 4746 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
4665 sds->this_has_capacity = sgs.group_has_capacity;
4666 sds->this_idle_cpus = sgs.idle_cpus;
4667 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
4668 sds->max_load = sgs.avg_load;
4669 sds->busiest = sg; 4747 sds->busiest = sg;
4670 sds->busiest_nr_running = sgs.sum_nr_running; 4748 sds->busiest_stat = *sgs;
4671 sds->busiest_idle_cpus = sgs.idle_cpus;
4672 sds->busiest_group_capacity = sgs.group_capacity;
4673 sds->busiest_load_per_task = sgs.sum_weighted_load;
4674 sds->busiest_has_capacity = sgs.group_has_capacity;
4675 sds->busiest_group_weight = sgs.group_weight;
4676 sds->group_imb = sgs.group_imb;
4677 } 4749 }
4678 4750
4679 sg = sg->next; 4751 sg = sg->next;
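
The hunk above replaces the long list of scalar this_*/busiest_* fields with two embedded per-group stats blocks. A rough sketch of the resulting containers, reconstructed only from the field accesses visible in this diff (so treat the exact layout as an assumption, not the committed struct definition):

struct sg_lb_stats {
	unsigned long avg_load;		/* group load per unit of cpu power */
	unsigned long group_load;	/* total load over the group's cpus */
	unsigned long sum_weighted_load;
	unsigned long load_per_task;
	unsigned long group_power;
	unsigned int sum_nr_running;
	unsigned int group_capacity;
	unsigned int idle_cpus;
	unsigned int group_weight;
	int group_imb;			/* imbalance caused by affinity etc. */
	int group_has_capacity;		/* group can take more work */
};

struct sd_lb_stats {
	struct sched_group *busiest;	/* busiest group in this sched_domain */
	struct sched_group *local;	/* group containing env->dst_cpu */
	unsigned long total_load;
	unsigned long total_pwr;
	unsigned long avg_load;		/* domain-wide average load */
	struct sg_lb_stats busiest_stat;
	struct sg_lb_stats local_stat;
};

With this shape, recording a new busiest group is a single struct assignment (sds->busiest_stat = *sgs) instead of eight individual field copies.
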
@@ -4718,7 +4790,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4718 return 0; 4790 return 0;
4719 4791
4720 env->imbalance = DIV_ROUND_CLOSEST( 4792 env->imbalance = DIV_ROUND_CLOSEST(
4721 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); 4793 sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
4794 SCHED_POWER_SCALE);
4722 4795
4723 return 1; 4796 return 1;
4724} 4797}
@@ -4736,24 +4809,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4736 unsigned long tmp, pwr_now = 0, pwr_move = 0; 4809 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4737 unsigned int imbn = 2; 4810 unsigned int imbn = 2;
4738 unsigned long scaled_busy_load_per_task; 4811 unsigned long scaled_busy_load_per_task;
4812 struct sg_lb_stats *local, *busiest;
4739 4813
4740 if (sds->this_nr_running) { 4814 local = &sds->local_stat;
4741 sds->this_load_per_task /= sds->this_nr_running; 4815 busiest = &sds->busiest_stat;
4742 if (sds->busiest_load_per_task > 4816
4743 sds->this_load_per_task) 4817 if (!local->sum_nr_running)
4744 imbn = 1; 4818 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
4745 } else { 4819 else if (busiest->load_per_task > local->load_per_task)
4746 sds->this_load_per_task = 4820 imbn = 1;
4747 cpu_avg_load_per_task(env->dst_cpu);
4748 }
4749 4821
4750 scaled_busy_load_per_task = sds->busiest_load_per_task 4822 scaled_busy_load_per_task =
4751 * SCHED_POWER_SCALE; 4823 (busiest->load_per_task * SCHED_POWER_SCALE) /
4752 scaled_busy_load_per_task /= sds->busiest->sgp->power; 4824 busiest->group_power;
4753 4825
4754 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 4826 if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >=
4755 (scaled_busy_load_per_task * imbn)) { 4827 (scaled_busy_load_per_task * imbn)) {
4756 env->imbalance = sds->busiest_load_per_task; 4828 env->imbalance = busiest->load_per_task;
4757 return; 4829 return;
4758 } 4830 }
4759 4831
@@ -4763,34 +4835,37 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4763 * moving them. 4835 * moving them.
4764 */ 4836 */
4765 4837
4766 pwr_now += sds->busiest->sgp->power * 4838 pwr_now += busiest->group_power *
4767 min(sds->busiest_load_per_task, sds->max_load); 4839 min(busiest->load_per_task, busiest->avg_load);
4768 pwr_now += sds->this->sgp->power * 4840 pwr_now += local->group_power *
4769 min(sds->this_load_per_task, sds->this_load); 4841 min(local->load_per_task, local->avg_load);
4770 pwr_now /= SCHED_POWER_SCALE; 4842 pwr_now /= SCHED_POWER_SCALE;
4771 4843
4772 /* Amount of load we'd subtract */ 4844 /* Amount of load we'd subtract */
4773 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / 4845 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
4774 sds->busiest->sgp->power; 4846 busiest->group_power;
4775 if (sds->max_load > tmp) 4847 if (busiest->avg_load > tmp) {
4776 pwr_move += sds->busiest->sgp->power * 4848 pwr_move += busiest->group_power *
4777 min(sds->busiest_load_per_task, sds->max_load - tmp); 4849 min(busiest->load_per_task,
4850 busiest->avg_load - tmp);
4851 }
4778 4852
4779 /* Amount of load we'd add */ 4853 /* Amount of load we'd add */
4780 if (sds->max_load * sds->busiest->sgp->power < 4854 if (busiest->avg_load * busiest->group_power <
4781 sds->busiest_load_per_task * SCHED_POWER_SCALE) 4855 busiest->load_per_task * SCHED_POWER_SCALE) {
4782 tmp = (sds->max_load * sds->busiest->sgp->power) / 4856 tmp = (busiest->avg_load * busiest->group_power) /
4783 sds->this->sgp->power; 4857 local->group_power;
4784 else 4858 } else {
4785 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / 4859 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
4786 sds->this->sgp->power; 4860 local->group_power;
4787 pwr_move += sds->this->sgp->power * 4861 }
4788 min(sds->this_load_per_task, sds->this_load + tmp); 4862 pwr_move += local->group_power *
4863 min(local->load_per_task, local->avg_load + tmp);
4789 pwr_move /= SCHED_POWER_SCALE; 4864 pwr_move /= SCHED_POWER_SCALE;
4790 4865
4791 /* Move if we gain throughput */ 4866 /* Move if we gain throughput */
4792 if (pwr_move > pwr_now) 4867 if (pwr_move > pwr_now)
4793 env->imbalance = sds->busiest_load_per_task; 4868 env->imbalance = busiest->load_per_task;
4794} 4869}
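
The pwr_now/pwr_move arithmetic above estimates whether migrating one task of busiest->load_per_task would increase the total amount of load actually being served. A minimal userspace sketch of that estimate, with made-up numbers and only the common branch (it assumes busiest_avg > tmp and skips the capped "amount of load we'd add" case):

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* hypothetical group stats: busiest has two tasks, local is idle */
	unsigned long busiest_load_per_task = 1024, busiest_avg = 2048;
	unsigned long local_load_per_task = 1024, local_avg = 0;
	unsigned long busiest_power = 1024, local_power = 1024;
	unsigned long pwr_now, pwr_move, tmp;

	/* load served if we do nothing */
	pwr_now  = busiest_power * min_ul(busiest_load_per_task, busiest_avg);
	pwr_now += local_power   * min_ul(local_load_per_task, local_avg);
	pwr_now /= SCHED_POWER_SCALE;

	/* load we'd take off the busiest group ... */
	tmp = busiest_load_per_task * SCHED_POWER_SCALE / busiest_power;
	pwr_move  = busiest_power *
		min_ul(busiest_load_per_task, busiest_avg - tmp);
	/* ... and serve on the local group instead */
	pwr_move += local_power *
		min_ul(local_load_per_task, local_avg + tmp);
	pwr_move /= SCHED_POWER_SCALE;

	printf("pwr_now=%lu pwr_move=%lu -> %s\n", pwr_now, pwr_move,
	       pwr_move > pwr_now ? "move one task" : "leave as is");
	return 0;
}

With these numbers pwr_now is 1024 and pwr_move is 2048, so the single-task move is judged worthwhile.
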
4795 4870
4796/** 4871/**
@@ -4802,11 +4877,18 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4802static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) 4877static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4803{ 4878{
4804 unsigned long max_pull, load_above_capacity = ~0UL; 4879 unsigned long max_pull, load_above_capacity = ~0UL;
4880 struct sg_lb_stats *local, *busiest;
4805 4881
4806 sds->busiest_load_per_task /= sds->busiest_nr_running; 4882 local = &sds->local_stat;
4807 if (sds->group_imb) { 4883 busiest = &sds->busiest_stat;
4808 sds->busiest_load_per_task = 4884
4809 min(sds->busiest_load_per_task, sds->avg_load); 4885 if (busiest->group_imb) {
4886 /*
4887 * In the group_imb case we cannot rely on group-wide averages
4888 * to ensure cpu-load equilibrium, look at wider averages. XXX
4889 */
4890 busiest->load_per_task =
4891 min(busiest->load_per_task, sds->avg_load);
4810 } 4892 }
4811 4893
4812 /* 4894 /*
@@ -4814,21 +4896,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4814 * max load less than avg load(as we skip the groups at or below 4896 * max load less than avg load(as we skip the groups at or below
4815 * its cpu_power, while calculating max_load..) 4897 * its cpu_power, while calculating max_load..)
4816 */ 4898 */
4817 if (sds->max_load < sds->avg_load) { 4899 if (busiest->avg_load < sds->avg_load) {
4818 env->imbalance = 0; 4900 env->imbalance = 0;
4819 return fix_small_imbalance(env, sds); 4901 return fix_small_imbalance(env, sds);
4820 } 4902 }
4821 4903
4822 if (!sds->group_imb) { 4904 if (!busiest->group_imb) {
4823 /* 4905 /*
4824 * Don't want to pull so many tasks that a group would go idle. 4906 * Don't want to pull so many tasks that a group would go idle.
4907 * Except of course for the group_imb case, since then we might
4908 * have to drop below capacity to reach cpu-load equilibrium.
4825 */ 4909 */
4826 load_above_capacity = (sds->busiest_nr_running - 4910 load_above_capacity =
4827 sds->busiest_group_capacity); 4911 (busiest->sum_nr_running - busiest->group_capacity);
4828 4912
4829 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); 4913 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
4830 4914 load_above_capacity /= busiest->group_power;
4831 load_above_capacity /= sds->busiest->sgp->power;
4832 } 4915 }
4833 4916
4834 /* 4917 /*
@@ -4838,15 +4921,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4838 * we also don't want to reduce the group load below the group capacity 4921 * we also don't want to reduce the group load below the group capacity
4839 * (so that we can implement power-savings policies etc). Thus we look 4922 * (so that we can implement power-savings policies etc). Thus we look
4840 * for the minimum possible imbalance. 4923 * for the minimum possible imbalance.
4841 * Be careful of negative numbers as they'll appear as very large values
4842 * with unsigned longs.
4843 */ 4924 */
4844 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 4925 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
4845 4926
4846 /* How much load to actually move to equalise the imbalance */ 4927 /* How much load to actually move to equalise the imbalance */
4847 env->imbalance = min(max_pull * sds->busiest->sgp->power, 4928 env->imbalance = min(
4848 (sds->avg_load - sds->this_load) * sds->this->sgp->power) 4929 max_pull * busiest->group_power,
4849 / SCHED_POWER_SCALE; 4930 (sds->avg_load - local->avg_load) * local->group_power
4931 ) / SCHED_POWER_SCALE;
4850 4932
4851 /* 4933 /*
4852 * if *imbalance is less than the average load per runnable task 4934 * if *imbalance is less than the average load per runnable task
@@ -4854,9 +4936,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4854 * a think about bumping its value to force at least one task to be 4936 * a think about bumping its value to force at least one task to be
4855 * moved 4937 * moved
4856 */ 4938 */
4857 if (env->imbalance < sds->busiest_load_per_task) 4939 if (env->imbalance < busiest->load_per_task)
4858 return fix_small_imbalance(env, sds); 4940 return fix_small_imbalance(env, sds);
4859
4860} 4941}
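
To make the final formula concrete, here is a worked example with made-up numbers (SCHED_POWER_SCALE = 1024, both groups at nominal power 1024, and no capacity cap in effect):

    busiest->avg_load = 1536, local->avg_load = 512, sds->avg_load = 1024
    max_pull  = min(1536 - 1024, load_above_capacity) = 512
    imbalance = min(512 * 1024, (1024 - 512) * 1024) / 1024 = 512

That is, half of the 1024-unit gap between the two groups is pulled, which lands both of them exactly on the domain average.
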
4861 4942
4862/******* find_busiest_group() helpers end here *********************/ 4943/******* find_busiest_group() helpers end here *********************/
@@ -4872,69 +4953,62 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4872 * to restore balance. 4953 * to restore balance.
4873 * 4954 *
4874 * @env: The load balancing environment. 4955 * @env: The load balancing environment.
4875 * @balance: Pointer to a variable indicating if this_cpu
4876 * is the appropriate cpu to perform load balancing at this_level.
4877 * 4956 *
4878 * Return: - The busiest group if imbalance exists. 4957 * Return: - The busiest group if imbalance exists.
4879 * - If no imbalance and user has opted for power-savings balance, 4958 * - If no imbalance and user has opted for power-savings balance,
4880 * return the least loaded group whose CPUs can be 4959 * return the least loaded group whose CPUs can be
4881 * put to idle by rebalancing its tasks onto our group. 4960 * put to idle by rebalancing its tasks onto our group.
4882 */ 4961 */
4883static struct sched_group * 4962static struct sched_group *find_busiest_group(struct lb_env *env)
4884find_busiest_group(struct lb_env *env, int *balance)
4885{ 4963{
4964 struct sg_lb_stats *local, *busiest;
4886 struct sd_lb_stats sds; 4965 struct sd_lb_stats sds;
4887 4966
4888 memset(&sds, 0, sizeof(sds)); 4967 init_sd_lb_stats(&sds);
4889 4968
4890 /* 4969 /*
 4891 * Compute the various statistics relevant for load balancing at 4970 * Compute the various statistics relevant for load balancing at
4892 * this level. 4971 * this level.
4893 */ 4972 */
4894 update_sd_lb_stats(env, balance, &sds); 4973 update_sd_lb_stats(env, &sds);
4895 4974 local = &sds.local_stat;
4896 /* 4975 busiest = &sds.busiest_stat;
4897 * this_cpu is not the appropriate cpu to perform load balancing at
4898 * this level.
4899 */
4900 if (!(*balance))
4901 goto ret;
4902 4976
4903 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && 4977 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4904 check_asym_packing(env, &sds)) 4978 check_asym_packing(env, &sds))
4905 return sds.busiest; 4979 return sds.busiest;
4906 4980
4907 /* There is no busy sibling group to pull tasks from */ 4981 /* There is no busy sibling group to pull tasks from */
4908 if (!sds.busiest || sds.busiest_nr_running == 0) 4982 if (!sds.busiest || busiest->sum_nr_running == 0)
4909 goto out_balanced; 4983 goto out_balanced;
4910 4984
4911 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; 4985 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
4912 4986
4913 /* 4987 /*
4914 * If the busiest group is imbalanced the below checks don't 4988 * If the busiest group is imbalanced the below checks don't
4915 * work because they assumes all things are equal, which typically 4989 * work because they assume all things are equal, which typically
4916 * isn't true due to cpus_allowed constraints and the like. 4990 * isn't true due to cpus_allowed constraints and the like.
4917 */ 4991 */
4918 if (sds.group_imb) 4992 if (busiest->group_imb)
4919 goto force_balance; 4993 goto force_balance;
4920 4994
4921 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 4995 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4922 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 4996 if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
4923 !sds.busiest_has_capacity) 4997 !busiest->group_has_capacity)
4924 goto force_balance; 4998 goto force_balance;
4925 4999
4926 /* 5000 /*
4927 * If the local group is more busy than the selected busiest group 5001 * If the local group is more busy than the selected busiest group
4928 * don't try and pull any tasks. 5002 * don't try and pull any tasks.
4929 */ 5003 */
4930 if (sds.this_load >= sds.max_load) 5004 if (local->avg_load >= busiest->avg_load)
4931 goto out_balanced; 5005 goto out_balanced;
4932 5006
4933 /* 5007 /*
4934 * Don't pull any tasks if this group is already above the domain 5008 * Don't pull any tasks if this group is already above the domain
4935 * average load. 5009 * average load.
4936 */ 5010 */
4937 if (sds.this_load >= sds.avg_load) 5011 if (local->avg_load >= sds.avg_load)
4938 goto out_balanced; 5012 goto out_balanced;
4939 5013
4940 if (env->idle == CPU_IDLE) { 5014 if (env->idle == CPU_IDLE) {
@@ -4944,15 +5018,16 @@ find_busiest_group(struct lb_env *env, int *balance)
4944 * there is no imbalance between this and busiest group 5018 * there is no imbalance between this and busiest group
4945 * wrt to idle cpu's, it is balanced. 5019 * wrt to idle cpu's, it is balanced.
4946 */ 5020 */
4947 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && 5021 if ((local->idle_cpus < busiest->idle_cpus) &&
4948 sds.busiest_nr_running <= sds.busiest_group_weight) 5022 busiest->sum_nr_running <= busiest->group_weight)
4949 goto out_balanced; 5023 goto out_balanced;
4950 } else { 5024 } else {
4951 /* 5025 /*
4952 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use 5026 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4953 * imbalance_pct to be conservative. 5027 * imbalance_pct to be conservative.
4954 */ 5028 */
4955 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) 5029 if (100 * busiest->avg_load <=
5030 env->sd->imbalance_pct * local->avg_load)
4956 goto out_balanced; 5031 goto out_balanced;
4957 } 5032 }
4958 5033
@@ -4962,7 +5037,6 @@ force_balance:
4962 return sds.busiest; 5037 return sds.busiest;
4963 5038
4964out_balanced: 5039out_balanced:
4965ret:
4966 env->imbalance = 0; 5040 env->imbalance = 0;
4967 return NULL; 5041 return NULL;
4968} 5042}
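
Two of the cut-offs above are easy to misread, so a quick numeric illustration (values invented; treating 125 as a typical imbalance_pct is an assumption):

    sds.avg_load      = 1024 * total_load / total_pwr
    busiest->avg_load = 1000, local->avg_load = 800, imbalance_pct = 125
    100 * 1000 = 100000  <=  125 * 800 = 100000  ->  out_balanced

In other words, for a non-idle CPU the busiest group has to be more than imbalance_pct/100 times as loaded as the local group before any tasks are pulled.
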
@@ -4974,10 +5048,10 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4974 struct sched_group *group) 5048 struct sched_group *group)
4975{ 5049{
4976 struct rq *busiest = NULL, *rq; 5050 struct rq *busiest = NULL, *rq;
4977 unsigned long max_load = 0; 5051 unsigned long busiest_load = 0, busiest_power = 1;
4978 int i; 5052 int i;
4979 5053
4980 for_each_cpu(i, sched_group_cpus(group)) { 5054 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4981 unsigned long power = power_of(i); 5055 unsigned long power = power_of(i);
4982 unsigned long capacity = DIV_ROUND_CLOSEST(power, 5056 unsigned long capacity = DIV_ROUND_CLOSEST(power,
4983 SCHED_POWER_SCALE); 5057 SCHED_POWER_SCALE);
@@ -4986,9 +5060,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4986 if (!capacity) 5060 if (!capacity)
4987 capacity = fix_small_capacity(env->sd, group); 5061 capacity = fix_small_capacity(env->sd, group);
4988 5062
4989 if (!cpumask_test_cpu(i, env->cpus))
4990 continue;
4991
4992 rq = cpu_rq(i); 5063 rq = cpu_rq(i);
4993 wl = weighted_cpuload(i); 5064 wl = weighted_cpuload(i);
4994 5065
@@ -5004,11 +5075,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
5004 * the weighted_cpuload() scaled with the cpu power, so that 5075 * the weighted_cpuload() scaled with the cpu power, so that
5005 * the load can be moved away from the cpu that is potentially 5076 * the load can be moved away from the cpu that is potentially
5006 * running at a lower capacity. 5077 * running at a lower capacity.
5078 *
5079 * Thus we're looking for max(wl_i / power_i), crosswise
5080 * multiplication to rid ourselves of the division works out
5081 * to: wl_i * power_j > wl_j * power_i; where j is our
5082 * previous maximum.
5007 */ 5083 */
5008 wl = (wl * SCHED_POWER_SCALE) / power; 5084 if (wl * busiest_power > busiest_load * power) {
5009 5085 busiest_load = wl;
5010 if (wl > max_load) { 5086 busiest_power = power;
5011 max_load = wl;
5012 busiest = rq; 5087 busiest = rq;
5013 } 5088 }
5014 } 5089 }
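
Since the comment above compresses the algebra, here is a small standalone check (illustrative numbers only) that the cross-multiplied comparison agrees with the old scale-then-compare form:

#include <stdio.h>

int main(void)
{
	unsigned long wl_i = 3072, power_i = 2048;	/* candidate cpu   */
	unsigned long wl_j = 2048, power_j = 1024;	/* current busiest */
	unsigned long scale = 1024;			/* SCHED_POWER_SCALE */

	/* old form: scale both loads by cpu power, then compare */
	unsigned long old_i = wl_i * scale / power_i;	/* 1536 */
	unsigned long old_j = wl_j * scale / power_j;	/* 2048 */

	/* new form: wl_i * power_j > wl_j * power_i, no division */
	int new_pick = wl_i * power_j > wl_j * power_i;

	printf("old: %lu vs %lu, new pick=%d\n", old_i, old_j, new_pick);
	return 0;
}

Both forms reject the candidate here, and the division-free version additionally avoids throwing away the low bits of wl to the SCHED_POWER_SCALE truncation.
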
@@ -5045,13 +5120,47 @@ static int need_active_balance(struct lb_env *env)
5045 5120
5046static int active_load_balance_cpu_stop(void *data); 5121static int active_load_balance_cpu_stop(void *data);
5047 5122
5123static int should_we_balance(struct lb_env *env)
5124{
5125 struct sched_group *sg = env->sd->groups;
5126 struct cpumask *sg_cpus, *sg_mask;
5127 int cpu, balance_cpu = -1;
5128
5129 /*
5130 * In the newly idle case, we will allow all the cpu's
5131 * to do the newly idle load balance.
5132 */
5133 if (env->idle == CPU_NEWLY_IDLE)
5134 return 1;
5135
5136 sg_cpus = sched_group_cpus(sg);
5137 sg_mask = sched_group_mask(sg);
5138 /* Try to find first idle cpu */
5139 for_each_cpu_and(cpu, sg_cpus, env->cpus) {
5140 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
5141 continue;
5142
5143 balance_cpu = cpu;
5144 break;
5145 }
5146
5147 if (balance_cpu == -1)
5148 balance_cpu = group_balance_cpu(sg);
5149
5150 /*
5151 * First idle cpu or the first cpu(busiest) in this sched group
5152 * is eligible for doing load balancing at this and above domains.
5153 */
 5154 return balance_cpu == env->dst_cpu;
5155}
5156
5048/* 5157/*
5049 * Check this_cpu to ensure it is balanced within domain. Attempt to move 5158 * Check this_cpu to ensure it is balanced within domain. Attempt to move
5050 * tasks if there is an imbalance. 5159 * tasks if there is an imbalance.
5051 */ 5160 */
5052static int load_balance(int this_cpu, struct rq *this_rq, 5161static int load_balance(int this_cpu, struct rq *this_rq,
5053 struct sched_domain *sd, enum cpu_idle_type idle, 5162 struct sched_domain *sd, enum cpu_idle_type idle,
5054 int *balance) 5163 int *continue_balancing)
5055{ 5164{
5056 int ld_moved, cur_ld_moved, active_balance = 0; 5165 int ld_moved, cur_ld_moved, active_balance = 0;
5057 struct sched_group *group; 5166 struct sched_group *group;
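
should_we_balance() above encodes a simple ownership rule: the first idle CPU in the group (falling back to the designated group balance CPU) is the one that runs the balancer, and every other CPU sets *continue_balancing = 0 and backs off. A toy userspace model of that selection rule, with invented data:

#include <stdio.h>

/* first idle cpu if any, otherwise the designated group balance cpu */
static int pick_balance_cpu(const int *idle, int nr, int group_balance_cpu)
{
	for (int cpu = 0; cpu < nr; cpu++)
		if (idle[cpu])
			return cpu;
	return group_balance_cpu;
}

int main(void)
{
	int idle[4] = { 0, 0, 1, 0 };	/* cpu2 is idle */
	int me = 1;			/* the cpu asking the question */

	int balance_cpu = pick_balance_cpu(idle, 4, 0);
	printf("balance_cpu=%d, cpu%d %s balance here\n", balance_cpu, me,
	       balance_cpu == me ? "should" : "should not");
	return 0;
}
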
@@ -5081,11 +5190,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5081 schedstat_inc(sd, lb_count[idle]); 5190 schedstat_inc(sd, lb_count[idle]);
5082 5191
5083redo: 5192redo:
5084 group = find_busiest_group(&env, balance); 5193 if (!should_we_balance(&env)) {
5085 5194 *continue_balancing = 0;
5086 if (*balance == 0)
5087 goto out_balanced; 5195 goto out_balanced;
5196 }
5088 5197
5198 group = find_busiest_group(&env);
5089 if (!group) { 5199 if (!group) {
5090 schedstat_inc(sd, lb_nobusyg[idle]); 5200 schedstat_inc(sd, lb_nobusyg[idle]);
5091 goto out_balanced; 5201 goto out_balanced;
@@ -5114,7 +5224,6 @@ redo:
5114 env.src_rq = busiest; 5224 env.src_rq = busiest;
5115 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 5225 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
5116 5226
5117 update_h_load(env.src_cpu);
5118more_balance: 5227more_balance:
5119 local_irq_save(flags); 5228 local_irq_save(flags);
5120 double_rq_lock(env.dst_rq, busiest); 5229 double_rq_lock(env.dst_rq, busiest);
@@ -5298,7 +5407,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5298 rcu_read_lock(); 5407 rcu_read_lock();
5299 for_each_domain(this_cpu, sd) { 5408 for_each_domain(this_cpu, sd) {
5300 unsigned long interval; 5409 unsigned long interval;
5301 int balance = 1; 5410 int continue_balancing = 1;
5302 5411
5303 if (!(sd->flags & SD_LOAD_BALANCE)) 5412 if (!(sd->flags & SD_LOAD_BALANCE))
5304 continue; 5413 continue;
@@ -5306,7 +5415,8 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5306 if (sd->flags & SD_BALANCE_NEWIDLE) { 5415 if (sd->flags & SD_BALANCE_NEWIDLE) {
5307 /* If we've pulled tasks over stop searching: */ 5416 /* If we've pulled tasks over stop searching: */
5308 pulled_task = load_balance(this_cpu, this_rq, 5417 pulled_task = load_balance(this_cpu, this_rq,
5309 sd, CPU_NEWLY_IDLE, &balance); 5418 sd, CPU_NEWLY_IDLE,
5419 &continue_balancing);
5310 } 5420 }
5311 5421
5312 interval = msecs_to_jiffies(sd->balance_interval); 5422 interval = msecs_to_jiffies(sd->balance_interval);
@@ -5544,7 +5654,7 @@ void update_max_interval(void)
5544 */ 5654 */
5545static void rebalance_domains(int cpu, enum cpu_idle_type idle) 5655static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5546{ 5656{
5547 int balance = 1; 5657 int continue_balancing = 1;
5548 struct rq *rq = cpu_rq(cpu); 5658 struct rq *rq = cpu_rq(cpu);
5549 unsigned long interval; 5659 unsigned long interval;
5550 struct sched_domain *sd; 5660 struct sched_domain *sd;
@@ -5576,7 +5686,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5576 } 5686 }
5577 5687
5578 if (time_after_eq(jiffies, sd->last_balance + interval)) { 5688 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5579 if (load_balance(cpu, rq, sd, idle, &balance)) { 5689 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
5580 /* 5690 /*
5581 * The LBF_SOME_PINNED logic could have changed 5691 * The LBF_SOME_PINNED logic could have changed
5582 * env->dst_cpu, so we can't know our idle 5692 * env->dst_cpu, so we can't know our idle
@@ -5599,7 +5709,7 @@ out:
5599 * CPU in our sched group which is doing load balancing more 5709 * CPU in our sched group which is doing load balancing more
5600 * actively. 5710 * actively.
5601 */ 5711 */
5602 if (!balance) 5712 if (!continue_balancing)
5603 break; 5713 break;
5604 } 5714 }
5605 rcu_read_unlock(); 5715 rcu_read_unlock();
@@ -5895,11 +6005,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
5895 * and ensure we don't carry in an old decay_count if we 6005 * and ensure we don't carry in an old decay_count if we
5896 * switch back. 6006 * switch back.
5897 */ 6007 */
5898 if (p->se.avg.decay_count) { 6008 if (se->avg.decay_count) {
5899 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); 6009 __synchronize_entity_decay(se);
5900 __synchronize_entity_decay(&p->se); 6010 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
5901 subtract_blocked_load_contrib(cfs_rq,
5902 p->se.avg.load_avg_contrib);
5903 } 6011 }
5904#endif 6012#endif
5905} 6013}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef0a7b2439dd..b3c5653e1dca 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -285,7 +285,6 @@ struct cfs_rq {
285 /* Required to track per-cpu representation of a task_group */ 285 /* Required to track per-cpu representation of a task_group */
286 u32 tg_runnable_contrib; 286 u32 tg_runnable_contrib;
287 unsigned long tg_load_contrib; 287 unsigned long tg_load_contrib;
288#endif /* CONFIG_FAIR_GROUP_SCHED */
289 288
290 /* 289 /*
291 * h_load = weight * f(tg) 290 * h_load = weight * f(tg)
@@ -294,6 +293,9 @@ struct cfs_rq {
294 * this group. 293 * this group.
295 */ 294 */
296 unsigned long h_load; 295 unsigned long h_load;
296 u64 last_h_load_update;
297 struct sched_entity *h_load_next;
298#endif /* CONFIG_FAIR_GROUP_SCHED */
297#endif /* CONFIG_SMP */ 299#endif /* CONFIG_SMP */
298 300
299#ifdef CONFIG_FAIR_GROUP_SCHED 301#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -429,9 +431,6 @@ struct rq {
429#ifdef CONFIG_FAIR_GROUP_SCHED 431#ifdef CONFIG_FAIR_GROUP_SCHED
430 /* list of leaf cfs_rq on this cpu: */ 432 /* list of leaf cfs_rq on this cpu: */
431 struct list_head leaf_cfs_rq_list; 433 struct list_head leaf_cfs_rq_list;
432#ifdef CONFIG_SMP
433 unsigned long h_load_throttle;
434#endif /* CONFIG_SMP */
435#endif /* CONFIG_FAIR_GROUP_SCHED */ 434#endif /* CONFIG_FAIR_GROUP_SCHED */
436 435
437#ifdef CONFIG_RT_GROUP_SCHED 436#ifdef CONFIG_RT_GROUP_SCHED
@@ -595,6 +594,7 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
595} 594}
596 595
597DECLARE_PER_CPU(struct sched_domain *, sd_llc); 596DECLARE_PER_CPU(struct sched_domain *, sd_llc);
597DECLARE_PER_CPU(int, sd_llc_size);
598DECLARE_PER_CPU(int, sd_llc_id); 598DECLARE_PER_CPU(int, sd_llc_id);
599 599
600struct sched_group_power { 600struct sched_group_power {
@@ -665,9 +665,9 @@ extern int group_balance_cpu(struct sched_group *sg);
665/* 665/*
 666 * Return the group to which this task belongs. 666 * Return the group to which this task belongs.
667 * 667 *
668 * We cannot use task_subsys_state() and friends because the cgroup 668 * We cannot use task_css() and friends because the cgroup subsystem
669 * subsystem changes that value before the cgroup_subsys::attach() method 669 * changes that value before the cgroup_subsys::attach() method is called,
670 * is called, therefore we cannot pin it and might observe the wrong value. 670 * therefore we cannot pin it and might observe the wrong value.
671 * 671 *
672 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup 672 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
673 * core changes this before calling sched_move_task(). 673 * core changes this before calling sched_move_task().
diff --git a/kernel/smp.c b/kernel/smp.c
index fe9f773d7114..449b707fc20d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -186,25 +186,13 @@ void generic_smp_call_function_single_interrupt(void)
186 186
187 while (!list_empty(&list)) { 187 while (!list_empty(&list)) {
188 struct call_single_data *csd; 188 struct call_single_data *csd;
189 unsigned int csd_flags;
190 189
191 csd = list_entry(list.next, struct call_single_data, list); 190 csd = list_entry(list.next, struct call_single_data, list);
192 list_del(&csd->list); 191 list_del(&csd->list);
193 192
194 /*
195 * 'csd' can be invalid after this call if flags == 0
196 * (when called through generic_exec_single()),
197 * so save them away before making the call:
198 */
199 csd_flags = csd->flags;
200
201 csd->func(csd->info); 193 csd->func(csd->info);
202 194
203 /* 195 csd_unlock(csd);
204 * Unlocked CSDs are valid through generic_exec_single():
205 */
206 if (csd_flags & CSD_FLAG_LOCK)
207 csd_unlock(csd);
208 } 196 }
209} 197}
210 198
@@ -278,8 +266,6 @@ EXPORT_SYMBOL(smp_call_function_single);
278 * @wait: If true, wait until function has completed. 266 * @wait: If true, wait until function has completed.
279 * 267 *
280 * Returns 0 on success, else a negative status code (if no cpus were online). 268 * Returns 0 on success, else a negative status code (if no cpus were online).
281 * Note that @wait will be implicitly turned on in case of allocation failures,
282 * since we fall back to on-stack allocation.
283 * 269 *
284 * Selection preference: 270 * Selection preference:
285 * 1) current cpu if in @mask 271 * 1) current cpu if in @mask
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 70f27e89012b..2b62fe86f9ec 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -105,7 +105,6 @@ config NO_HZ_FULL
105 select RCU_USER_QS 105 select RCU_USER_QS
106 select RCU_NOCB_CPU 106 select RCU_NOCB_CPU
107 select VIRT_CPU_ACCOUNTING_GEN 107 select VIRT_CPU_ACCOUNTING_GEN
108 select CONTEXT_TRACKING_FORCE
109 select IRQ_WORK 108 select IRQ_WORK
110 help 109 help
111 Adaptively try to shutdown the tick whenever possible, even when 110 Adaptively try to shutdown the tick whenever possible, even when
@@ -134,6 +133,56 @@ config NO_HZ_FULL_ALL
134 Note the boot CPU will still be kept outside the range to 133 Note the boot CPU will still be kept outside the range to
135 handle the timekeeping duty. 134 handle the timekeeping duty.
136 135
136config NO_HZ_FULL_SYSIDLE
137 bool "Detect full-system idle state for full dynticks system"
138 depends on NO_HZ_FULL
139 default n
140 help
141 At least one CPU must keep the scheduling-clock tick running for
142 timekeeping purposes whenever there is a non-idle CPU, where
143 "non-idle" also includes dynticks CPUs as long as they are
144 running non-idle tasks. Because the underlying adaptive-tick
145 support cannot distinguish between all CPUs being idle and
146 all CPUs each running a single task in dynticks mode, the
147 underlying support simply ensures that there is always a CPU
148 handling the scheduling-clock tick, whether or not all CPUs
149 are idle. This Kconfig option enables scalable detection of
150 the all-CPUs-idle state, thus allowing the scheduling-clock
151 tick to be disabled when all CPUs are idle. Note that scalable
152 detection of the all-CPUs-idle state means that larger systems
153 will be slower to declare the all-CPUs-idle state.
154
155 Say Y if you would like to help debug all-CPUs-idle detection.
156
157 Say N if you are unsure.
158
159config NO_HZ_FULL_SYSIDLE_SMALL
160 int "Number of CPUs above which large-system approach is used"
161 depends on NO_HZ_FULL_SYSIDLE
162 range 1 NR_CPUS
163 default 8
164 help
165 The full-system idle detection mechanism takes a lazy approach
166 on large systems, as is required to attain decent scalability.
167 However, on smaller systems, scalability is not anywhere near as
168 large a concern as is energy efficiency. The sysidle subsystem
169 therefore uses a fast but non-scalable algorithm for small
170 systems and a lazier but scalable algorithm for large systems.
171 This Kconfig parameter defines the number of CPUs in the largest
172 system that will be considered to be "small".
173
174 The default value will be fine in most cases. Battery-powered
175 systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger
176 numbers of CPUs, and (3) are suffering from battery-lifetime
177 problems due to long sysidle latencies might wish to experiment
178 with larger values for this Kconfig parameter. On the other
179 hand, they might be even better served by disabling NO_HZ_FULL
180 entirely, given that NO_HZ_FULL is intended for HPC and
181 real-time workloads that at present do not tend to be run on
182 battery-powered systems.
183
184 Take the default if you are unsure.
185
137config NO_HZ 186config NO_HZ
138 bool "Old Idle dynticks config" 187 bool "Old Idle dynticks config"
139 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS 188 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
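
For reference, a configuration fragment exercising the new options could look like the following (illustrative; only the symbol names and the default of 8 are taken from the entries above):

    CONFIG_NO_HZ_FULL=y
    CONFIG_NO_HZ_FULL_SYSIDLE=y
    CONFIG_NO_HZ_FULL_SYSIDLE_SMALL=8
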
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e8a1516cc0a3..3612fc77f834 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -23,6 +23,7 @@
23#include <linux/irq_work.h> 23#include <linux/irq_work.h>
24#include <linux/posix-timers.h> 24#include <linux/posix-timers.h>
25#include <linux/perf_event.h> 25#include <linux/perf_event.h>
26#include <linux/context_tracking.h>
26 27
27#include <asm/irq_regs.h> 28#include <asm/irq_regs.h>
28 29
@@ -148,8 +149,8 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
148} 149}
149 150
150#ifdef CONFIG_NO_HZ_FULL 151#ifdef CONFIG_NO_HZ_FULL
151static cpumask_var_t nohz_full_mask; 152cpumask_var_t tick_nohz_full_mask;
152bool have_nohz_full_mask; 153bool tick_nohz_full_running;
153 154
154static bool can_stop_full_tick(void) 155static bool can_stop_full_tick(void)
155{ 156{
@@ -182,7 +183,7 @@ static bool can_stop_full_tick(void)
182 * Don't allow the user to think they can get 183 * Don't allow the user to think they can get
183 * full NO_HZ with this machine. 184 * full NO_HZ with this machine.
184 */ 185 */
185 WARN_ONCE(have_nohz_full_mask, 186 WARN_ONCE(tick_nohz_full_running,
186 "NO_HZ FULL will not work with unstable sched clock"); 187 "NO_HZ FULL will not work with unstable sched clock");
187 return false; 188 return false;
188 } 189 }
@@ -197,7 +198,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
197 * Re-evaluate the need for the tick on the current CPU 198 * Re-evaluate the need for the tick on the current CPU
198 * and restart it if necessary. 199 * and restart it if necessary.
199 */ 200 */
200void tick_nohz_full_check(void) 201void __tick_nohz_full_check(void)
201{ 202{
202 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 203 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
203 204
@@ -211,7 +212,7 @@ void tick_nohz_full_check(void)
211 212
212static void nohz_full_kick_work_func(struct irq_work *work) 213static void nohz_full_kick_work_func(struct irq_work *work)
213{ 214{
214 tick_nohz_full_check(); 215 __tick_nohz_full_check();
215} 216}
216 217
217static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { 218static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
@@ -230,7 +231,7 @@ void tick_nohz_full_kick(void)
230 231
231static void nohz_full_kick_ipi(void *info) 232static void nohz_full_kick_ipi(void *info)
232{ 233{
233 tick_nohz_full_check(); 234 __tick_nohz_full_check();
234} 235}
235 236
236/* 237/*
@@ -239,12 +240,13 @@ static void nohz_full_kick_ipi(void *info)
239 */ 240 */
240void tick_nohz_full_kick_all(void) 241void tick_nohz_full_kick_all(void)
241{ 242{
242 if (!have_nohz_full_mask) 243 if (!tick_nohz_full_running)
243 return; 244 return;
244 245
245 preempt_disable(); 246 preempt_disable();
246 smp_call_function_many(nohz_full_mask, 247 smp_call_function_many(tick_nohz_full_mask,
247 nohz_full_kick_ipi, NULL, false); 248 nohz_full_kick_ipi, NULL, false);
249 tick_nohz_full_kick();
248 preempt_enable(); 250 preempt_enable();
249} 251}
250 252
@@ -253,7 +255,7 @@ void tick_nohz_full_kick_all(void)
253 * It might need the tick due to per task/process properties: 255 * It might need the tick due to per task/process properties:
254 * perf events, posix cpu timers, ... 256 * perf events, posix cpu timers, ...
255 */ 257 */
256void tick_nohz_task_switch(struct task_struct *tsk) 258void __tick_nohz_task_switch(struct task_struct *tsk)
257{ 259{
258 unsigned long flags; 260 unsigned long flags;
259 261
@@ -269,31 +271,23 @@ out:
269 local_irq_restore(flags); 271 local_irq_restore(flags);
270} 272}
271 273
272int tick_nohz_full_cpu(int cpu)
273{
274 if (!have_nohz_full_mask)
275 return 0;
276
277 return cpumask_test_cpu(cpu, nohz_full_mask);
278}
279
280/* Parse the boot-time nohz CPU list from the kernel parameters. */ 274/* Parse the boot-time nohz CPU list from the kernel parameters. */
281static int __init tick_nohz_full_setup(char *str) 275static int __init tick_nohz_full_setup(char *str)
282{ 276{
283 int cpu; 277 int cpu;
284 278
285 alloc_bootmem_cpumask_var(&nohz_full_mask); 279 alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
286 if (cpulist_parse(str, nohz_full_mask) < 0) { 280 if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
287 pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); 281 pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
288 return 1; 282 return 1;
289 } 283 }
290 284
291 cpu = smp_processor_id(); 285 cpu = smp_processor_id();
292 if (cpumask_test_cpu(cpu, nohz_full_mask)) { 286 if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
293 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); 287 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
294 cpumask_clear_cpu(cpu, nohz_full_mask); 288 cpumask_clear_cpu(cpu, tick_nohz_full_mask);
295 } 289 }
296 have_nohz_full_mask = true; 290 tick_nohz_full_running = true;
297 291
298 return 1; 292 return 1;
299} 293}
@@ -311,7 +305,7 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
311 * If we handle the timekeeping duty for full dynticks CPUs, 305 * If we handle the timekeeping duty for full dynticks CPUs,
312 * we can't safely shutdown that CPU. 306 * we can't safely shutdown that CPU.
313 */ 307 */
314 if (have_nohz_full_mask && tick_do_timer_cpu == cpu) 308 if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
315 return NOTIFY_BAD; 309 return NOTIFY_BAD;
316 break; 310 break;
317 } 311 }
@@ -330,31 +324,34 @@ static int tick_nohz_init_all(void)
330 int err = -1; 324 int err = -1;
331 325
332#ifdef CONFIG_NO_HZ_FULL_ALL 326#ifdef CONFIG_NO_HZ_FULL_ALL
333 if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { 327 if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
334 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); 328 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
335 return err; 329 return err;
336 } 330 }
337 err = 0; 331 err = 0;
338 cpumask_setall(nohz_full_mask); 332 cpumask_setall(tick_nohz_full_mask);
339 cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); 333 cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
340 have_nohz_full_mask = true; 334 tick_nohz_full_running = true;
341#endif 335#endif
342 return err; 336 return err;
343} 337}
344 338
345void __init tick_nohz_init(void) 339void __init tick_nohz_init(void)
346{ 340{
347 if (!have_nohz_full_mask) { 341 int cpu;
342
343 if (!tick_nohz_full_running) {
348 if (tick_nohz_init_all() < 0) 344 if (tick_nohz_init_all() < 0)
349 return; 345 return;
350 } 346 }
351 347
348 for_each_cpu(cpu, tick_nohz_full_mask)
349 context_tracking_cpu_set(cpu);
350
352 cpu_notifier(tick_nohz_cpu_down_callback, 0); 351 cpu_notifier(tick_nohz_cpu_down_callback, 0);
353 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); 352 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask);
354 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); 353 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
355} 354}
356#else
357#define have_nohz_full_mask (0)
358#endif 355#endif
359 356
360/* 357/*
@@ -732,7 +729,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
732 return false; 729 return false;
733 } 730 }
734 731
735 if (have_nohz_full_mask) { 732 if (tick_nohz_full_enabled()) {
736 /* 733 /*
737 * Keep the tick alive to guarantee timekeeping progression 734 * Keep the tick alive to guarantee timekeeping progression
738 * if there are full dynticks CPUs around 735 * if there are full dynticks CPUs around
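
As the warning strings in tick_nohz_full_setup() suggest, the cpumask parsed above comes from the nohz_full= boot parameter. An illustrative command line for an 8-CPU box, leaving CPU 0 out so it can keep the timekeeping tick, would be:

    nohz_full=1-7

With CONFIG_NO_HZ_FULL_ALL the same effect is reached without the parameter: all CPUs except the boot CPU end up in tick_nohz_full_mask.
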
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 3bdf28323012..61ed862cdd37 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -265,10 +265,9 @@ static inline void timer_list_header(struct seq_file *m, u64 now)
265static int timer_list_show(struct seq_file *m, void *v) 265static int timer_list_show(struct seq_file *m, void *v)
266{ 266{
267 struct timer_list_iter *iter = v; 267 struct timer_list_iter *iter = v;
268 u64 now = ktime_to_ns(ktime_get());
269 268
270 if (iter->cpu == -1 && !iter->second_pass) 269 if (iter->cpu == -1 && !iter->second_pass)
271 timer_list_header(m, now); 270 timer_list_header(m, iter->now);
272 else if (!iter->second_pass) 271 else if (!iter->second_pass)
273 print_cpu(m, iter->cpu, iter->now); 272 print_cpu(m, iter->cpu, iter->now);
274#ifdef CONFIG_GENERIC_CLOCKEVENTS 273#ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -298,33 +297,41 @@ void sysrq_timer_list_show(void)
298 return; 297 return;
299} 298}
300 299
301static void *timer_list_start(struct seq_file *file, loff_t *offset) 300static void *move_iter(struct timer_list_iter *iter, loff_t offset)
302{ 301{
303 struct timer_list_iter *iter = file->private; 302 for (; offset; offset--) {
304 303 iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
305 if (!*offset) { 304 if (iter->cpu >= nr_cpu_ids) {
306 iter->cpu = -1;
307 iter->now = ktime_to_ns(ktime_get());
308 } else if (iter->cpu >= nr_cpu_ids) {
309#ifdef CONFIG_GENERIC_CLOCKEVENTS 305#ifdef CONFIG_GENERIC_CLOCKEVENTS
310 if (!iter->second_pass) { 306 if (!iter->second_pass) {
311 iter->cpu = -1; 307 iter->cpu = -1;
312 iter->second_pass = true; 308 iter->second_pass = true;
313 } else 309 } else
314 return NULL; 310 return NULL;
315#else 311#else
316 return NULL; 312 return NULL;
317#endif 313#endif
314 }
318 } 315 }
319 return iter; 316 return iter;
320} 317}
321 318
319static void *timer_list_start(struct seq_file *file, loff_t *offset)
320{
321 struct timer_list_iter *iter = file->private;
322
323 if (!*offset)
324 iter->now = ktime_to_ns(ktime_get());
325 iter->cpu = -1;
326 iter->second_pass = false;
327 return move_iter(iter, *offset);
328}
329
322static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) 330static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset)
323{ 331{
324 struct timer_list_iter *iter = file->private; 332 struct timer_list_iter *iter = file->private;
325 iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
326 ++*offset; 333 ++*offset;
327 return timer_list_start(file, offset); 334 return move_iter(iter, 1);
328} 335}
329 336
330static void timer_list_stop(struct seq_file *seq, void *v) 337static void timer_list_stop(struct seq_file *seq, void *v)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index afaae41b0a02..fe39acd4c1aa 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1022,6 +1022,9 @@ extern struct list_head ftrace_events;
1022extern const char *__start___trace_bprintk_fmt[]; 1022extern const char *__start___trace_bprintk_fmt[];
1023extern const char *__stop___trace_bprintk_fmt[]; 1023extern const char *__stop___trace_bprintk_fmt[];
1024 1024
1025extern const char *__start___tracepoint_str[];
1026extern const char *__stop___tracepoint_str[];
1027
1025void trace_printk_init_buffers(void); 1028void trace_printk_init_buffers(void);
1026void trace_printk_start_comm(void); 1029void trace_printk_start_comm(void);
1027int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); 1030int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index a9077c1b4ad3..2900817ba65c 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -244,12 +244,31 @@ static const char **find_next(void *v, loff_t *pos)
244{ 244{
245 const char **fmt = v; 245 const char **fmt = v;
246 int start_index; 246 int start_index;
247 int last_index;
247 248
248 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; 249 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
249 250
250 if (*pos < start_index) 251 if (*pos < start_index)
251 return __start___trace_bprintk_fmt + *pos; 252 return __start___trace_bprintk_fmt + *pos;
252 253
254 /*
255 * The __tracepoint_str section is treated the same as the
256 * __trace_printk_fmt section. The difference is that the
257 * __trace_printk_fmt section should only be used by trace_printk()
258 * in a debugging environment, as if anything exists in that section
 259 * the trace_printk() helper buffers are allocated, which would just
260 * waste space in a production environment.
261 *
262 * The __tracepoint_str sections on the other hand are used by
263 * tracepoints which need to map pointers to their strings to
264 * the ASCII text for userspace.
265 */
266 last_index = start_index;
267 start_index = __stop___tracepoint_str - __start___tracepoint_str;
268
269 if (*pos < last_index + start_index)
270 return __start___tracepoint_str + (*pos - last_index);
271
253 return find_next_mod_format(start_index, v, fmt, pos); 272 return find_next_mod_format(start_index, v, fmt, pos);
254} 273}
255 274
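
The comment block explains why two sections are walked; the position arithmetic itself is easy to get wrong, so here is a tiny standalone model of it (array contents and sizes are made up):

#include <stdio.h>

int main(void)
{
	const char *bprintk_fmt[] = { "fmt0", "fmt1", "fmt2" };
	const char *tracepoint_str[] = { "str0", "str1" };
	int n_fmt = 3, n_str = 2;

	/* positions [0, n_fmt) come from the bprintk section, positions
	 * [n_fmt, n_fmt + n_str) from the tracepoint_str section */
	for (int pos = 0; pos < n_fmt + n_str; pos++) {
		const char *s = pos < n_fmt ?
			bprintk_fmt[pos] : tracepoint_str[pos - n_fmt];
		printf("pos=%d -> %s\n", pos, s);
	}
	return 0;
}

Anything beyond both sections is then handed to find_next_mod_format(), exactly as in the hunk above.
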
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 1241d8c91d5e..51c4f34d258e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -553,14 +553,6 @@ void __init lockup_detector_init(void)
553{ 553{
554 set_sample_period(); 554 set_sample_period();
555 555
556#ifdef CONFIG_NO_HZ_FULL
557 if (watchdog_user_enabled) {
558 watchdog_user_enabled = 0;
559 pr_warning("Disabled lockup detectors by default for full dynticks\n");
560 pr_warning("You can reactivate it with 'sysctl -w kernel.watchdog=1'\n");
561 }
562#endif
563
564 if (watchdog_user_enabled) 556 if (watchdog_user_enabled)
565 watchdog_enable_all_cpus(); 557 watchdog_enable_all_cpus();
566} 558}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 7f5d4be22034..29b79852a845 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -16,9 +16,10 @@
16 * 16 *
 17 * This is the generic async execution mechanism. Work items are 17 * This is the generic async execution mechanism. Work items are
18 * executed in process context. The worker pool is shared and 18 * executed in process context. The worker pool is shared and
19 * automatically managed. There is one worker pool for each CPU and 19 * automatically managed. There are two worker pools for each CPU (one for
20 * one extra for works which are better served by workers which are 20 * normal work items and the other for high priority ones) and some extra
21 * not bound to any specific CPU. 21 * pools for workqueues which are not bound to any specific CPU - the
22 * number of these backing pools is dynamic.
22 * 23 *
23 * Please read Documentation/workqueue.txt for details. 24 * Please read Documentation/workqueue.txt for details.
24 */ 25 */
@@ -2033,8 +2034,11 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
2033 * multiple times. Does GFP_KERNEL allocations. 2034 * multiple times. Does GFP_KERNEL allocations.
2034 * 2035 *
2035 * RETURNS: 2036 * RETURNS:
 2036 * spin_lock_irq(pool->lock) which may be released and regrabbed 2037 * %false if the pool doesn't need management and the caller can safely start
2037 * multiple times. Does GFP_KERNEL allocations. 2038 * processing works, %true indicates that the function released pool->lock
2039 * and reacquired it to perform some management function and that the
2040 * conditions that the caller verified while holding the lock before
2041 * calling the function might no longer be true.
2038 */ 2042 */
2039static bool manage_workers(struct worker *worker) 2043static bool manage_workers(struct worker *worker)
2040{ 2044{
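
The reworded RETURNS text describes a lock-juggling contract that is not specific to workqueues: the helper returns true when it had to drop the lock, and the caller then revalidates everything it checked beforehand. A minimal userspace sketch of the same pattern, using hypothetical names (this is not the workqueue code itself):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool need_management;	/* stands in for "pool needs a manager" */

/* Returns true if it released and reacquired @lock. */
static bool maybe_manage(void)
{
	if (!need_management)
		return false;

	pthread_mutex_unlock(&lock);
	/* ... slow-path management work done without the lock held ... */
	pthread_mutex_lock(&lock);
	need_management = false;
	return true;
}

int main(void)
{
	pthread_mutex_lock(&lock);
recheck:
	/* conditions checked here are only valid while @lock is held */
	if (maybe_manage())
		goto recheck;	/* the lock was dropped; revalidate */
	/* ... fast path ... */
	pthread_mutex_unlock(&lock);
	return 0;
}
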
@@ -2201,6 +2205,15 @@ __acquires(&pool->lock)
2201 dump_stack(); 2205 dump_stack();
2202 } 2206 }
2203 2207
2208 /*
2209 * The following prevents a kworker from hogging CPU on !PREEMPT
2210 * kernels, where a requeueing work item waiting for something to
2211 * happen could deadlock with stop_machine as such work item could
2212 * indefinitely requeue itself while all other CPUs are trapped in
2213 * stop_machine.
2214 */
2215 cond_resched();
2216
2204 spin_lock_irq(&pool->lock); 2217 spin_lock_irq(&pool->lock);
2205 2218
2206 /* clear cpu intensive status */ 2219 /* clear cpu intensive status */
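
For context, the kind of work item the new cond_resched() protects against looks roughly like this (hypothetical driver code; hw_ready() is an invented helper). On a !PREEMPT kernel such a poll loop could otherwise keep its kworker on the CPU indefinitely:

#include <linux/types.h>
#include <linux/workqueue.h>

static bool hw_ready(void);		/* invented; polls some device state */

static void poll_hw_ready(struct work_struct *work);
static DECLARE_WORK(poll_work, poll_hw_ready);

static void poll_hw_ready(struct work_struct *work)
{
	if (!hw_ready())
		schedule_work(&poll_work);	/* requeue ourselves and retry */
}
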
@@ -3086,25 +3099,26 @@ static struct workqueue_struct *dev_to_wq(struct device *dev)
3086 return wq_dev->wq; 3099 return wq_dev->wq;
3087} 3100}
3088 3101
3089static ssize_t wq_per_cpu_show(struct device *dev, 3102static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
3090 struct device_attribute *attr, char *buf) 3103 char *buf)
3091{ 3104{
3092 struct workqueue_struct *wq = dev_to_wq(dev); 3105 struct workqueue_struct *wq = dev_to_wq(dev);
3093 3106
3094 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); 3107 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
3095} 3108}
3109static DEVICE_ATTR_RO(per_cpu);
3096 3110
3097static ssize_t wq_max_active_show(struct device *dev, 3111static ssize_t max_active_show(struct device *dev,
3098 struct device_attribute *attr, char *buf) 3112 struct device_attribute *attr, char *buf)
3099{ 3113{
3100 struct workqueue_struct *wq = dev_to_wq(dev); 3114 struct workqueue_struct *wq = dev_to_wq(dev);
3101 3115
3102 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); 3116 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
3103} 3117}
3104 3118
3105static ssize_t wq_max_active_store(struct device *dev, 3119static ssize_t max_active_store(struct device *dev,
3106 struct device_attribute *attr, 3120 struct device_attribute *attr, const char *buf,
3107 const char *buf, size_t count) 3121 size_t count)
3108{ 3122{
3109 struct workqueue_struct *wq = dev_to_wq(dev); 3123 struct workqueue_struct *wq = dev_to_wq(dev);
3110 int val; 3124 int val;
@@ -3115,12 +3129,14 @@ static ssize_t wq_max_active_store(struct device *dev,
3115 workqueue_set_max_active(wq, val); 3129 workqueue_set_max_active(wq, val);
3116 return count; 3130 return count;
3117} 3131}
3132static DEVICE_ATTR_RW(max_active);
3118 3133
3119static struct device_attribute wq_sysfs_attrs[] = { 3134static struct attribute *wq_sysfs_attrs[] = {
3120 __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), 3135 &dev_attr_per_cpu.attr,
3121 __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), 3136 &dev_attr_max_active.attr,
3122 __ATTR_NULL, 3137 NULL,
3123}; 3138};
3139ATTRIBUTE_GROUPS(wq_sysfs);
3124 3140
3125static ssize_t wq_pool_ids_show(struct device *dev, 3141static ssize_t wq_pool_ids_show(struct device *dev,
3126 struct device_attribute *attr, char *buf) 3142 struct device_attribute *attr, char *buf)
@@ -3270,7 +3286,7 @@ static struct device_attribute wq_sysfs_unbound_attrs[] = {
3270 3286
3271static struct bus_type wq_subsys = { 3287static struct bus_type wq_subsys = {
3272 .name = "workqueue", 3288 .name = "workqueue",
3273 .dev_attrs = wq_sysfs_attrs, 3289 .dev_groups = wq_sysfs_groups,
3274}; 3290};
3275 3291
3276static int __init wq_sysfs_init(void) 3292static int __init wq_sysfs_init(void)
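
The sysfs conversion above follows the DEVICE_ATTR_RO / DEVICE_ATTR_RW plus ATTRIBUTE_GROUPS pattern. A minimal illustration with a hypothetical "example" attribute (the show function name has to match the attribute name for the macros to wire things up):

#include <linux/device.h>
#include <linux/sysfs.h>

static ssize_t example_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	return scnprintf(buf, PAGE_SIZE, "%d\n", 42);
}
static DEVICE_ATTR_RO(example);		/* defines dev_attr_example */

static struct attribute *example_attrs[] = {
	&dev_attr_example.attr,
	NULL,
};
ATTRIBUTE_GROUPS(example);		/* defines example_groups */

/* a bus/class/device then points .dev_groups (or .groups) at
 * example_groups instead of carrying an open-coded attribute array */
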