Diffstat (limited to 'kernel')
72 files changed, 3988 insertions, 2796 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 470839d1a30e..35ef1185e359 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,7 +2,7 @@
2 | # Makefile for the linux kernel. | 2 | # Makefile for the linux kernel. |
3 | # | 3 | # |
4 | 4 | ||
5 | obj-y = fork.o exec_domain.o panic.o printk.o \ | 5 | obj-y = fork.o exec_domain.o panic.o \ |
6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ | 6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ |
7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ | 7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ |
8 | signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ |
@@ -24,6 +24,7 @@ endif
24 | 24 | ||
25 | obj-y += sched/ | 25 | obj-y += sched/ |
26 | obj-y += power/ | 26 | obj-y += power/ |
27 | obj-y += printk/ | ||
27 | obj-y += cpu/ | 28 | obj-y += cpu/ |
28 | 29 | ||
29 | obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o | 30 | obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o |
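The rest of this diff is in kernel/cgroup.c, and most of its hunks revolve around the new cgroup_css() accessor introduced below, which replaces direct cgrp->subsys[i] dereferences. As a minimal illustrative sketch (not part of the patch; the helper name example_pin_css is invented here), a caller inside cgroup.c would resolve and pin a subsystem's css roughly like this, mirroring the pattern the patch adds to cgroup_file_open():

/*
 * Illustrative sketch only.  Assumes @ss is non-NULL.  Per the
 * kernel-doc the patch adds for cgroup_css(), the call must happen
 * under cgroup_mutex or rcu_read_lock(), and the returned css must be
 * pinned (css_tryget()) before it is used outside those locks.
 */
static struct cgroup_subsys_state *example_pin_css(struct cgroup *cgrp,
						   struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = cgroup_css(cgrp, ss);	/* NULL if @ss isn't enabled on this hierarchy */
	if (css && !css_tryget(css))	/* lost a race with css destruction */
		css = NULL;
	rcu_read_unlock();

	return css;			/* caller drops the reference with css_put() */
}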
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0e0b20b8c5db..e0aeb32415ff 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -81,7 +81,7 @@
81 | */ | 81 | */ |
82 | #ifdef CONFIG_PROVE_RCU | 82 | #ifdef CONFIG_PROVE_RCU |
83 | DEFINE_MUTEX(cgroup_mutex); | 83 | DEFINE_MUTEX(cgroup_mutex); |
84 | EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ | 84 | EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ |
85 | #else | 85 | #else |
86 | static DEFINE_MUTEX(cgroup_mutex); | 86 | static DEFINE_MUTEX(cgroup_mutex); |
87 | #endif | 87 | #endif |
@@ -117,6 +117,7 @@ struct cfent {
117 | struct list_head node; | 117 | struct list_head node; |
118 | struct dentry *dentry; | 118 | struct dentry *dentry; |
119 | struct cftype *type; | 119 | struct cftype *type; |
120 | struct cgroup_subsys_state *css; | ||
120 | 121 | ||
121 | /* file xattrs */ | 122 | /* file xattrs */ |
122 | struct simple_xattrs xattrs; | 123 | struct simple_xattrs xattrs; |
@@ -159,9 +160,9 @@ struct css_id {
159 | */ | 160 | */ |
160 | struct cgroup_event { | 161 | struct cgroup_event { |
161 | /* | 162 | /* |
162 | * Cgroup which the event belongs to. | 163 | * css which the event belongs to. |
163 | */ | 164 | */ |
164 | struct cgroup *cgrp; | 165 | struct cgroup_subsys_state *css; |
165 | /* | 166 | /* |
166 | * Control file which the event associated. | 167 | * Control file which the event associated. |
167 | */ | 168 | */ |
@@ -215,10 +216,33 @@ static u64 cgroup_serial_nr_next = 1;
215 | */ | 216 | */ |
216 | static int need_forkexit_callback __read_mostly; | 217 | static int need_forkexit_callback __read_mostly; |
217 | 218 | ||
218 | static void cgroup_offline_fn(struct work_struct *work); | 219 | static struct cftype cgroup_base_files[]; |
220 | |||
221 | static void cgroup_destroy_css_killed(struct cgroup *cgrp); | ||
219 | static int cgroup_destroy_locked(struct cgroup *cgrp); | 222 | static int cgroup_destroy_locked(struct cgroup *cgrp); |
220 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | 223 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], |
221 | struct cftype cfts[], bool is_add); | 224 | bool is_add); |
225 | |||
226 | /** | ||
227 | * cgroup_css - obtain a cgroup's css for the specified subsystem | ||
228 | * @cgrp: the cgroup of interest | ||
229 | * @ss: the subsystem of interest (%NULL returns the dummy_css) | ||
230 | * | ||
231 | * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This | ||
232 | * function must be called either under cgroup_mutex or rcu_read_lock() and | ||
233 | * the caller is responsible for pinning the returned css if it wants to | ||
234 | * keep accessing it outside the said locks. This function may return | ||
235 | * %NULL if @cgrp doesn't have @subsys_id enabled. | ||
236 | */ | ||
237 | static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, | ||
238 | struct cgroup_subsys *ss) | ||
239 | { | ||
240 | if (ss) | ||
241 | return rcu_dereference_check(cgrp->subsys[ss->subsys_id], | ||
242 | lockdep_is_held(&cgroup_mutex)); | ||
243 | else | ||
244 | return &cgrp->dummy_css; | ||
245 | } | ||
222 | 246 | ||
223 | /* convenient tests for these bits */ | 247 | /* convenient tests for these bits */ |
224 | static inline bool cgroup_is_dead(const struct cgroup *cgrp) | 248 | static inline bool cgroup_is_dead(const struct cgroup *cgrp) |
@@ -365,9 +389,11 @@ static struct cgrp_cset_link init_cgrp_cset_link;
365 | static int cgroup_init_idr(struct cgroup_subsys *ss, | 389 | static int cgroup_init_idr(struct cgroup_subsys *ss, |
366 | struct cgroup_subsys_state *css); | 390 | struct cgroup_subsys_state *css); |
367 | 391 | ||
368 | /* css_set_lock protects the list of css_set objects, and the | 392 | /* |
369 | * chain of tasks off each css_set. Nests outside task->alloc_lock | 393 | * css_set_lock protects the list of css_set objects, and the chain of |
370 | * due to cgroup_iter_start() */ | 394 | * tasks off each css_set. Nests outside task->alloc_lock due to |
395 | * css_task_iter_start(). | ||
396 | */ | ||
371 | static DEFINE_RWLOCK(css_set_lock); | 397 | static DEFINE_RWLOCK(css_set_lock); |
372 | static int css_set_count; | 398 | static int css_set_count; |
373 | 399 | ||
@@ -392,10 +418,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
392 | return key; | 418 | return key; |
393 | } | 419 | } |
394 | 420 | ||
395 | /* We don't maintain the lists running through each css_set to its | 421 | /* |
396 | * task until after the first call to cgroup_iter_start(). This | 422 | * We don't maintain the lists running through each css_set to its task |
397 | * reduces the fork()/exit() overhead for people who have cgroups | 423 | * until after the first call to css_task_iter_start(). This reduces the |
398 | * compiled into their kernel but not actually in use */ | 424 | * fork()/exit() overhead for people who have cgroups compiled into their |
425 | * kernel but not actually in use. | ||
426 | */ | ||
399 | static int use_task_css_set_links __read_mostly; | 427 | static int use_task_css_set_links __read_mostly; |
400 | 428 | ||
401 | static void __put_css_set(struct css_set *cset, int taskexit) | 429 | static void __put_css_set(struct css_set *cset, int taskexit) |
@@ -464,7 +492,7 @@ static inline void put_css_set_taskexit(struct css_set *cset)
464 | * @new_cgrp: cgroup that's being entered by the task | 492 | * @new_cgrp: cgroup that's being entered by the task |
465 | * @template: desired set of css pointers in css_set (pre-calculated) | 493 | * @template: desired set of css pointers in css_set (pre-calculated) |
466 | * | 494 | * |
467 | * Returns true if "cg" matches "old_cg" except for the hierarchy | 495 | * Returns true if "cset" matches "old_cset" except for the hierarchy |
468 | * which "new_cgrp" belongs to, for which it should match "new_cgrp". | 496 | * which "new_cgrp" belongs to, for which it should match "new_cgrp". |
469 | */ | 497 | */ |
470 | static bool compare_css_sets(struct css_set *cset, | 498 | static bool compare_css_sets(struct css_set *cset, |
@@ -555,7 +583,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
555 | /* Subsystem is in this hierarchy. So we want | 583 | /* Subsystem is in this hierarchy. So we want |
556 | * the subsystem state from the new | 584 | * the subsystem state from the new |
557 | * cgroup */ | 585 | * cgroup */ |
558 | template[i] = cgrp->subsys[i]; | 586 | template[i] = cgroup_css(cgrp, ss); |
559 | } else { | 587 | } else { |
560 | /* Subsystem is not in this hierarchy, so we | 588 | /* Subsystem is not in this hierarchy, so we |
561 | * don't want to change the subsystem state */ | 589 | * don't want to change the subsystem state */ |
@@ -803,8 +831,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
803 | 831 | ||
804 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); | 832 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); |
805 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); | 833 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); |
806 | static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, | 834 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); |
807 | unsigned long subsys_mask); | ||
808 | static const struct inode_operations cgroup_dir_inode_operations; | 835 | static const struct inode_operations cgroup_dir_inode_operations; |
809 | static const struct file_operations proc_cgroupstats_operations; | 836 | static const struct file_operations proc_cgroupstats_operations; |
810 | 837 | ||
@@ -813,8 +840,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
813 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 840 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, |
814 | }; | 841 | }; |
815 | 842 | ||
816 | static int alloc_css_id(struct cgroup_subsys *ss, | 843 | static int alloc_css_id(struct cgroup_subsys_state *child_css); |
817 | struct cgroup *parent, struct cgroup *child); | ||
818 | 844 | ||
819 | static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) | 845 | static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) |
820 | { | 846 | { |
@@ -845,15 +871,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
845 | static void cgroup_free_fn(struct work_struct *work) | 871 | static void cgroup_free_fn(struct work_struct *work) |
846 | { | 872 | { |
847 | struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); | 873 | struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); |
848 | struct cgroup_subsys *ss; | ||
849 | 874 | ||
850 | mutex_lock(&cgroup_mutex); | 875 | mutex_lock(&cgroup_mutex); |
851 | /* | ||
852 | * Release the subsystem state objects. | ||
853 | */ | ||
854 | for_each_root_subsys(cgrp->root, ss) | ||
855 | ss->css_free(cgrp); | ||
856 | |||
857 | cgrp->root->number_of_cgroups--; | 876 | cgrp->root->number_of_cgroups--; |
858 | mutex_unlock(&cgroup_mutex); | 877 | mutex_unlock(&cgroup_mutex); |
859 | 878 | ||
@@ -864,8 +883,6 @@ static void cgroup_free_fn(struct work_struct *work)
864 | */ | 883 | */ |
865 | dput(cgrp->parent->dentry); | 884 | dput(cgrp->parent->dentry); |
866 | 885 | ||
867 | ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); | ||
868 | |||
869 | /* | 886 | /* |
870 | * Drop the active superblock reference that we took when we | 887 | * Drop the active superblock reference that we took when we |
871 | * created the cgroup. This will free cgrp->root, if we are | 888 | * created the cgroup. This will free cgrp->root, if we are |
@@ -956,27 +973,22 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
956 | } | 973 | } |
957 | 974 | ||
958 | /** | 975 | /** |
959 | * cgroup_clear_directory - selective removal of base and subsystem files | 976 | * cgroup_clear_dir - remove subsys files in a cgroup directory |
960 | * @dir: directory containing the files | 977 | * @cgrp: target cgroup |
961 | * @base_files: true if the base files should be removed | ||
962 | * @subsys_mask: mask of the subsystem ids whose files should be removed | 978 | * @subsys_mask: mask of the subsystem ids whose files should be removed |
963 | */ | 979 | */ |
964 | static void cgroup_clear_directory(struct dentry *dir, bool base_files, | 980 | static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) |
965 | unsigned long subsys_mask) | ||
966 | { | 981 | { |
967 | struct cgroup *cgrp = __d_cgrp(dir); | ||
968 | struct cgroup_subsys *ss; | 982 | struct cgroup_subsys *ss; |
983 | int i; | ||
969 | 984 | ||
970 | for_each_root_subsys(cgrp->root, ss) { | 985 | for_each_subsys(ss, i) { |
971 | struct cftype_set *set; | 986 | struct cftype_set *set; |
972 | if (!test_bit(ss->subsys_id, &subsys_mask)) | 987 | |
988 | if (!test_bit(i, &subsys_mask)) | ||
973 | continue; | 989 | continue; |
974 | list_for_each_entry(set, &ss->cftsets, node) | 990 | list_for_each_entry(set, &ss->cftsets, node) |
975 | cgroup_addrm_files(cgrp, NULL, set->cfts, false); | 991 | cgroup_addrm_files(cgrp, set->cfts, false); |
976 | } | ||
977 | if (base_files) { | ||
978 | while (!list_empty(&cgrp->files)) | ||
979 | cgroup_rm_file(cgrp, NULL); | ||
980 | } | 992 | } |
981 | } | 993 | } |
982 | 994 | ||
@@ -986,9 +998,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
986 | static void cgroup_d_remove_dir(struct dentry *dentry) | 998 | static void cgroup_d_remove_dir(struct dentry *dentry) |
987 | { | 999 | { |
988 | struct dentry *parent; | 1000 | struct dentry *parent; |
989 | struct cgroupfs_root *root = dentry->d_sb->s_fs_info; | ||
990 | |||
991 | cgroup_clear_directory(dentry, true, root->subsys_mask); | ||
992 | 1001 | ||
993 | parent = dentry->d_parent; | 1002 | parent = dentry->d_parent; |
994 | spin_lock(&parent->d_lock); | 1003 | spin_lock(&parent->d_lock); |
@@ -1009,79 +1018,84 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1009 | { | 1018 | { |
1010 | struct cgroup *cgrp = &root->top_cgroup; | 1019 | struct cgroup *cgrp = &root->top_cgroup; |
1011 | struct cgroup_subsys *ss; | 1020 | struct cgroup_subsys *ss; |
1012 | int i; | 1021 | unsigned long pinned = 0; |
1022 | int i, ret; | ||
1013 | 1023 | ||
1014 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); | 1024 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); |
1015 | BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); | 1025 | BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); |
1016 | 1026 | ||
1017 | /* Check that any added subsystems are currently free */ | 1027 | /* Check that any added subsystems are currently free */ |
1018 | for_each_subsys(ss, i) { | 1028 | for_each_subsys(ss, i) { |
1019 | unsigned long bit = 1UL << i; | 1029 | if (!(added_mask & (1 << i))) |
1020 | |||
1021 | if (!(bit & added_mask)) | ||
1022 | continue; | 1030 | continue; |
1023 | 1031 | ||
1032 | /* is the subsystem mounted elsewhere? */ | ||
1024 | if (ss->root != &cgroup_dummy_root) { | 1033 | if (ss->root != &cgroup_dummy_root) { |
1025 | /* Subsystem isn't free */ | 1034 | ret = -EBUSY; |
1026 | return -EBUSY; | 1035 | goto out_put; |
1027 | } | 1036 | } |
1037 | |||
1038 | /* pin the module */ | ||
1039 | if (!try_module_get(ss->module)) { | ||
1040 | ret = -ENOENT; | ||
1041 | goto out_put; | ||
1042 | } | ||
1043 | pinned |= 1 << i; | ||
1028 | } | 1044 | } |
1029 | 1045 | ||
1030 | /* Currently we don't handle adding/removing subsystems when | 1046 | /* subsys could be missing if unloaded between parsing and here */ |
1031 | * any child cgroups exist. This is theoretically supportable | 1047 | if (added_mask != pinned) { |
1032 | * but involves complex error handling, so it's being left until | 1048 | ret = -ENOENT; |
1033 | * later */ | 1049 | goto out_put; |
1034 | if (root->number_of_cgroups > 1) | 1050 | } |
1035 | return -EBUSY; | 1051 | |
1052 | ret = cgroup_populate_dir(cgrp, added_mask); | ||
1053 | if (ret) | ||
1054 | goto out_put; | ||
1055 | |||
1056 | /* | ||
1057 | * Nothing can fail from this point on. Remove files for the | ||
1058 | * removed subsystems and rebind each subsystem. | ||
1059 | */ | ||
1060 | cgroup_clear_dir(cgrp, removed_mask); | ||
1036 | 1061 | ||
1037 | /* Process each subsystem */ | ||
1038 | for_each_subsys(ss, i) { | 1062 | for_each_subsys(ss, i) { |
1039 | unsigned long bit = 1UL << i; | 1063 | unsigned long bit = 1UL << i; |
1040 | 1064 | ||
1041 | if (bit & added_mask) { | 1065 | if (bit & added_mask) { |
1042 | /* We're binding this subsystem to this hierarchy */ | 1066 | /* We're binding this subsystem to this hierarchy */ |
1043 | BUG_ON(cgrp->subsys[i]); | 1067 | BUG_ON(cgroup_css(cgrp, ss)); |
1044 | BUG_ON(!cgroup_dummy_top->subsys[i]); | 1068 | BUG_ON(!cgroup_css(cgroup_dummy_top, ss)); |
1045 | BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); | 1069 | BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top); |
1070 | |||
1071 | rcu_assign_pointer(cgrp->subsys[i], | ||
1072 | cgroup_css(cgroup_dummy_top, ss)); | ||
1073 | cgroup_css(cgrp, ss)->cgroup = cgrp; | ||
1046 | 1074 | ||
1047 | cgrp->subsys[i] = cgroup_dummy_top->subsys[i]; | ||
1048 | cgrp->subsys[i]->cgroup = cgrp; | ||
1049 | list_move(&ss->sibling, &root->subsys_list); | 1075 | list_move(&ss->sibling, &root->subsys_list); |
1050 | ss->root = root; | 1076 | ss->root = root; |
1051 | if (ss->bind) | 1077 | if (ss->bind) |
1052 | ss->bind(cgrp); | 1078 | ss->bind(cgroup_css(cgrp, ss)); |
1053 | 1079 | ||
1054 | /* refcount was already taken, and we're keeping it */ | 1080 | /* refcount was already taken, and we're keeping it */ |
1055 | root->subsys_mask |= bit; | 1081 | root->subsys_mask |= bit; |
1056 | } else if (bit & removed_mask) { | 1082 | } else if (bit & removed_mask) { |
1057 | /* We're removing this subsystem */ | 1083 | /* We're removing this subsystem */ |
1058 | BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); | 1084 | BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss)); |
1059 | BUG_ON(cgrp->subsys[i]->cgroup != cgrp); | 1085 | BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp); |
1060 | 1086 | ||
1061 | if (ss->bind) | 1087 | if (ss->bind) |
1062 | ss->bind(cgroup_dummy_top); | 1088 | ss->bind(cgroup_css(cgroup_dummy_top, ss)); |
1063 | cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; | 1089 | |
1064 | cgrp->subsys[i] = NULL; | 1090 | cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; |
1091 | RCU_INIT_POINTER(cgrp->subsys[i], NULL); | ||
1092 | |||
1065 | cgroup_subsys[i]->root = &cgroup_dummy_root; | 1093 | cgroup_subsys[i]->root = &cgroup_dummy_root; |
1066 | list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); | 1094 | list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); |
1067 | 1095 | ||
1068 | /* subsystem is now free - drop reference on module */ | 1096 | /* subsystem is now free - drop reference on module */ |
1069 | module_put(ss->module); | 1097 | module_put(ss->module); |
1070 | root->subsys_mask &= ~bit; | 1098 | root->subsys_mask &= ~bit; |
1071 | } else if (bit & root->subsys_mask) { | ||
1072 | /* Subsystem state should already exist */ | ||
1073 | BUG_ON(!cgrp->subsys[i]); | ||
1074 | /* | ||
1075 | * a refcount was taken, but we already had one, so | ||
1076 | * drop the extra reference. | ||
1077 | */ | ||
1078 | module_put(ss->module); | ||
1079 | #ifdef CONFIG_MODULE_UNLOAD | ||
1080 | BUG_ON(ss->module && !module_refcount(ss->module)); | ||
1081 | #endif | ||
1082 | } else { | ||
1083 | /* Subsystem state shouldn't exist */ | ||
1084 | BUG_ON(cgrp->subsys[i]); | ||
1085 | } | 1099 | } |
1086 | } | 1100 | } |
1087 | 1101 | ||
@@ -1092,6 +1106,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1092 | root->flags |= CGRP_ROOT_SUBSYS_BOUND; | 1106 | root->flags |= CGRP_ROOT_SUBSYS_BOUND; |
1093 | 1107 | ||
1094 | return 0; | 1108 | return 0; |
1109 | |||
1110 | out_put: | ||
1111 | for_each_subsys(ss, i) | ||
1112 | if (pinned & (1 << i)) | ||
1113 | module_put(ss->module); | ||
1114 | return ret; | ||
1095 | } | 1115 | } |
1096 | 1116 | ||
1097 | static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) | 1117 | static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) |
@@ -1142,7 +1162,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1142 | char *token, *o = data; | 1162 | char *token, *o = data; |
1143 | bool all_ss = false, one_ss = false; | 1163 | bool all_ss = false, one_ss = false; |
1144 | unsigned long mask = (unsigned long)-1; | 1164 | unsigned long mask = (unsigned long)-1; |
1145 | bool module_pin_failed = false; | ||
1146 | struct cgroup_subsys *ss; | 1165 | struct cgroup_subsys *ss; |
1147 | int i; | 1166 | int i; |
1148 | 1167 | ||
@@ -1285,52 +1304,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1285 | if (!opts->subsys_mask && !opts->name) | 1304 | if (!opts->subsys_mask && !opts->name) |
1286 | return -EINVAL; | 1305 | return -EINVAL; |
1287 | 1306 | ||
1288 | /* | ||
1289 | * Grab references on all the modules we'll need, so the subsystems | ||
1290 | * don't dance around before rebind_subsystems attaches them. This may | ||
1291 | * take duplicate reference counts on a subsystem that's already used, | ||
1292 | * but rebind_subsystems handles this case. | ||
1293 | */ | ||
1294 | for_each_subsys(ss, i) { | ||
1295 | if (!(opts->subsys_mask & (1UL << i))) | ||
1296 | continue; | ||
1297 | if (!try_module_get(cgroup_subsys[i]->module)) { | ||
1298 | module_pin_failed = true; | ||
1299 | break; | ||
1300 | } | ||
1301 | } | ||
1302 | if (module_pin_failed) { | ||
1303 | /* | ||
1304 | * oops, one of the modules was going away. this means that we | ||
1305 | * raced with a module_delete call, and to the user this is | ||
1306 | * essentially a "subsystem doesn't exist" case. | ||
1307 | */ | ||
1308 | for (i--; i >= 0; i--) { | ||
1309 | /* drop refcounts only on the ones we took */ | ||
1310 | unsigned long bit = 1UL << i; | ||
1311 | |||
1312 | if (!(bit & opts->subsys_mask)) | ||
1313 | continue; | ||
1314 | module_put(cgroup_subsys[i]->module); | ||
1315 | } | ||
1316 | return -ENOENT; | ||
1317 | } | ||
1318 | |||
1319 | return 0; | 1307 | return 0; |
1320 | } | 1308 | } |
1321 | 1309 | ||
1322 | static void drop_parsed_module_refcounts(unsigned long subsys_mask) | ||
1323 | { | ||
1324 | struct cgroup_subsys *ss; | ||
1325 | int i; | ||
1326 | |||
1327 | mutex_lock(&cgroup_mutex); | ||
1328 | for_each_subsys(ss, i) | ||
1329 | if (subsys_mask & (1UL << i)) | ||
1330 | module_put(cgroup_subsys[i]->module); | ||
1331 | mutex_unlock(&cgroup_mutex); | ||
1332 | } | ||
1333 | |||
1334 | static int cgroup_remount(struct super_block *sb, int *flags, char *data) | 1310 | static int cgroup_remount(struct super_block *sb, int *flags, char *data) |
1335 | { | 1311 | { |
1336 | int ret = 0; | 1312 | int ret = 0; |
@@ -1370,22 +1346,15 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1370 | goto out_unlock; | 1346 | goto out_unlock; |
1371 | } | 1347 | } |
1372 | 1348 | ||
1373 | /* | 1349 | /* remounting is not allowed for populated hierarchies */ |
1374 | * Clear out the files of subsystems that should be removed, do | 1350 | if (root->number_of_cgroups > 1) { |
1375 | * this before rebind_subsystems, since rebind_subsystems may | 1351 | ret = -EBUSY; |
1376 | * change this hierarchy's subsys_list. | ||
1377 | */ | ||
1378 | cgroup_clear_directory(cgrp->dentry, false, removed_mask); | ||
1379 | |||
1380 | ret = rebind_subsystems(root, added_mask, removed_mask); | ||
1381 | if (ret) { | ||
1382 | /* rebind_subsystems failed, re-populate the removed files */ | ||
1383 | cgroup_populate_dir(cgrp, false, removed_mask); | ||
1384 | goto out_unlock; | 1352 | goto out_unlock; |
1385 | } | 1353 | } |
1386 | 1354 | ||
1387 | /* re-populate subsystem files */ | 1355 | ret = rebind_subsystems(root, added_mask, removed_mask); |
1388 | cgroup_populate_dir(cgrp, false, added_mask); | 1356 | if (ret) |
1357 | goto out_unlock; | ||
1389 | 1358 | ||
1390 | if (opts.release_agent) | 1359 | if (opts.release_agent) |
1391 | strcpy(root->release_agent_path, opts.release_agent); | 1360 | strcpy(root->release_agent_path, opts.release_agent); |
@@ -1395,8 +1364,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1395 | mutex_unlock(&cgroup_root_mutex); | 1364 | mutex_unlock(&cgroup_root_mutex); |
1396 | mutex_unlock(&cgroup_mutex); | 1365 | mutex_unlock(&cgroup_mutex); |
1397 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | 1366 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); |
1398 | if (ret) | ||
1399 | drop_parsed_module_refcounts(opts.subsys_mask); | ||
1400 | return ret; | 1367 | return ret; |
1401 | } | 1368 | } |
1402 | 1369 | ||
@@ -1416,6 +1383,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1416 | INIT_LIST_HEAD(&cgrp->release_list); | 1383 | INIT_LIST_HEAD(&cgrp->release_list); |
1417 | INIT_LIST_HEAD(&cgrp->pidlists); | 1384 | INIT_LIST_HEAD(&cgrp->pidlists); |
1418 | mutex_init(&cgrp->pidlist_mutex); | 1385 | mutex_init(&cgrp->pidlist_mutex); |
1386 | cgrp->dummy_css.cgroup = cgrp; | ||
1419 | INIT_LIST_HEAD(&cgrp->event_list); | 1387 | INIT_LIST_HEAD(&cgrp->event_list); |
1420 | spin_lock_init(&cgrp->event_list_lock); | 1388 | spin_lock_init(&cgrp->event_list_lock); |
1421 | simple_xattrs_init(&cgrp->xattrs); | 1389 | simple_xattrs_init(&cgrp->xattrs); |
@@ -1431,6 +1399,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1431 | cgrp->root = root; | 1399 | cgrp->root = root; |
1432 | RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); | 1400 | RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); |
1433 | init_cgroup_housekeeping(cgrp); | 1401 | init_cgroup_housekeeping(cgrp); |
1402 | idr_init(&root->cgroup_idr); | ||
1434 | } | 1403 | } |
1435 | 1404 | ||
1436 | static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) | 1405 | static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) |
@@ -1503,7 +1472,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1503 | */ | 1472 | */ |
1504 | root->subsys_mask = opts->subsys_mask; | 1473 | root->subsys_mask = opts->subsys_mask; |
1505 | root->flags = opts->flags; | 1474 | root->flags = opts->flags; |
1506 | ida_init(&root->cgroup_ida); | ||
1507 | if (opts->release_agent) | 1475 | if (opts->release_agent) |
1508 | strcpy(root->release_agent_path, opts->release_agent); | 1476 | strcpy(root->release_agent_path, opts->release_agent); |
1509 | if (opts->name) | 1477 | if (opts->name) |
@@ -1519,7 +1487,7 @@ static void cgroup_free_root(struct cgroupfs_root *root)
1519 | /* hierarhcy ID shoulid already have been released */ | 1487 | /* hierarhcy ID shoulid already have been released */ |
1520 | WARN_ON_ONCE(root->hierarchy_id); | 1488 | WARN_ON_ONCE(root->hierarchy_id); |
1521 | 1489 | ||
1522 | ida_destroy(&root->cgroup_ida); | 1490 | idr_destroy(&root->cgroup_idr); |
1523 | kfree(root); | 1491 | kfree(root); |
1524 | } | 1492 | } |
1525 | } | 1493 | } |
@@ -1584,7 +1552,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1584 | int ret = 0; | 1552 | int ret = 0; |
1585 | struct super_block *sb; | 1553 | struct super_block *sb; |
1586 | struct cgroupfs_root *new_root; | 1554 | struct cgroupfs_root *new_root; |
1555 | struct list_head tmp_links; | ||
1587 | struct inode *inode; | 1556 | struct inode *inode; |
1557 | const struct cred *cred; | ||
1588 | 1558 | ||
1589 | /* First find the desired set of subsystems */ | 1559 | /* First find the desired set of subsystems */ |
1590 | mutex_lock(&cgroup_mutex); | 1560 | mutex_lock(&cgroup_mutex); |
@@ -1600,7 +1570,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1600 | new_root = cgroup_root_from_opts(&opts); | 1570 | new_root = cgroup_root_from_opts(&opts); |
1601 | if (IS_ERR(new_root)) { | 1571 | if (IS_ERR(new_root)) { |
1602 | ret = PTR_ERR(new_root); | 1572 | ret = PTR_ERR(new_root); |
1603 | goto drop_modules; | 1573 | goto out_err; |
1604 | } | 1574 | } |
1605 | opts.new_root = new_root; | 1575 | opts.new_root = new_root; |
1606 | 1576 | ||
@@ -1609,17 +1579,15 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1609 | if (IS_ERR(sb)) { | 1579 | if (IS_ERR(sb)) { |
1610 | ret = PTR_ERR(sb); | 1580 | ret = PTR_ERR(sb); |
1611 | cgroup_free_root(opts.new_root); | 1581 | cgroup_free_root(opts.new_root); |
1612 | goto drop_modules; | 1582 | goto out_err; |
1613 | } | 1583 | } |
1614 | 1584 | ||
1615 | root = sb->s_fs_info; | 1585 | root = sb->s_fs_info; |
1616 | BUG_ON(!root); | 1586 | BUG_ON(!root); |
1617 | if (root == opts.new_root) { | 1587 | if (root == opts.new_root) { |
1618 | /* We used the new root structure, so this is a new hierarchy */ | 1588 | /* We used the new root structure, so this is a new hierarchy */ |
1619 | struct list_head tmp_links; | ||
1620 | struct cgroup *root_cgrp = &root->top_cgroup; | 1589 | struct cgroup *root_cgrp = &root->top_cgroup; |
1621 | struct cgroupfs_root *existing_root; | 1590 | struct cgroupfs_root *existing_root; |
1622 | const struct cred *cred; | ||
1623 | int i; | 1591 | int i; |
1624 | struct css_set *cset; | 1592 | struct css_set *cset; |
1625 | 1593 | ||
@@ -1634,6 +1602,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1634 | mutex_lock(&cgroup_mutex); | 1602 | mutex_lock(&cgroup_mutex); |
1635 | mutex_lock(&cgroup_root_mutex); | 1603 | mutex_lock(&cgroup_root_mutex); |
1636 | 1604 | ||
1605 | root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, | ||
1606 | 0, 1, GFP_KERNEL); | ||
1607 | if (root_cgrp->id < 0) | ||
1608 | goto unlock_drop; | ||
1609 | |||
1637 | /* Check for name clashes with existing mounts */ | 1610 | /* Check for name clashes with existing mounts */ |
1638 | ret = -EBUSY; | 1611 | ret = -EBUSY; |
1639 | if (strlen(root->name)) | 1612 | if (strlen(root->name)) |
@@ -1657,26 +1630,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1657 | if (ret) | 1630 | if (ret) |
1658 | goto unlock_drop; | 1631 | goto unlock_drop; |
1659 | 1632 | ||
1633 | sb->s_root->d_fsdata = root_cgrp; | ||
1634 | root_cgrp->dentry = sb->s_root; | ||
1635 | |||
1636 | /* | ||
1637 | * We're inside get_sb() and will call lookup_one_len() to | ||
1638 | * create the root files, which doesn't work if SELinux is | ||
1639 | * in use. The following cred dancing somehow works around | ||
1640 | * it. See 2ce9738ba ("cgroupfs: use init_cred when | ||
1641 | * populating new cgroupfs mount") for more details. | ||
1642 | */ | ||
1643 | cred = override_creds(&init_cred); | ||
1644 | |||
1645 | ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); | ||
1646 | if (ret) | ||
1647 | goto rm_base_files; | ||
1648 | |||
1660 | ret = rebind_subsystems(root, root->subsys_mask, 0); | 1649 | ret = rebind_subsystems(root, root->subsys_mask, 0); |
1661 | if (ret == -EBUSY) { | 1650 | if (ret) |
1662 | free_cgrp_cset_links(&tmp_links); | 1651 | goto rm_base_files; |
1663 | goto unlock_drop; | 1652 | |
1664 | } | 1653 | revert_creds(cred); |
1654 | |||
1665 | /* | 1655 | /* |
1666 | * There must be no failure case after here, since rebinding | 1656 | * There must be no failure case after here, since rebinding |
1667 | * takes care of subsystems' refcounts, which are explicitly | 1657 | * takes care of subsystems' refcounts, which are explicitly |
1668 | * dropped in the failure exit path. | 1658 | * dropped in the failure exit path. |
1669 | */ | 1659 | */ |
1670 | 1660 | ||
1671 | /* EBUSY should be the only error here */ | ||
1672 | BUG_ON(ret); | ||
1673 | |||
1674 | list_add(&root->root_list, &cgroup_roots); | 1661 | list_add(&root->root_list, &cgroup_roots); |
1675 | cgroup_root_count++; | 1662 | cgroup_root_count++; |
1676 | 1663 | ||
1677 | sb->s_root->d_fsdata = root_cgrp; | ||
1678 | root->top_cgroup.dentry = sb->s_root; | ||
1679 | |||
1680 | /* Link the top cgroup in this hierarchy into all | 1664 | /* Link the top cgroup in this hierarchy into all |
1681 | * the css_set objects */ | 1665 | * the css_set objects */ |
1682 | write_lock(&css_set_lock); | 1666 | write_lock(&css_set_lock); |
@@ -1689,9 +1673,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1689 | BUG_ON(!list_empty(&root_cgrp->children)); | 1673 | BUG_ON(!list_empty(&root_cgrp->children)); |
1690 | BUG_ON(root->number_of_cgroups != 1); | 1674 | BUG_ON(root->number_of_cgroups != 1); |
1691 | 1675 | ||
1692 | cred = override_creds(&init_cred); | ||
1693 | cgroup_populate_dir(root_cgrp, true, root->subsys_mask); | ||
1694 | revert_creds(cred); | ||
1695 | mutex_unlock(&cgroup_root_mutex); | 1676 | mutex_unlock(&cgroup_root_mutex); |
1696 | mutex_unlock(&cgroup_mutex); | 1677 | mutex_unlock(&cgroup_mutex); |
1697 | mutex_unlock(&inode->i_mutex); | 1678 | mutex_unlock(&inode->i_mutex); |
@@ -1711,15 +1692,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1711 | pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); | 1692 | pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); |
1712 | } | 1693 | } |
1713 | } | 1694 | } |
1714 | |||
1715 | /* no subsys rebinding, so refcounts don't change */ | ||
1716 | drop_parsed_module_refcounts(opts.subsys_mask); | ||
1717 | } | 1695 | } |
1718 | 1696 | ||
1719 | kfree(opts.release_agent); | 1697 | kfree(opts.release_agent); |
1720 | kfree(opts.name); | 1698 | kfree(opts.name); |
1721 | return dget(sb->s_root); | 1699 | return dget(sb->s_root); |
1722 | 1700 | ||
1701 | rm_base_files: | ||
1702 | free_cgrp_cset_links(&tmp_links); | ||
1703 | cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); | ||
1704 | revert_creds(cred); | ||
1723 | unlock_drop: | 1705 | unlock_drop: |
1724 | cgroup_exit_root_id(root); | 1706 | cgroup_exit_root_id(root); |
1725 | mutex_unlock(&cgroup_root_mutex); | 1707 | mutex_unlock(&cgroup_root_mutex); |
@@ -1727,8 +1709,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1727 | mutex_unlock(&inode->i_mutex); | 1709 | mutex_unlock(&inode->i_mutex); |
1728 | drop_new_super: | 1710 | drop_new_super: |
1729 | deactivate_locked_super(sb); | 1711 | deactivate_locked_super(sb); |
1730 | drop_modules: | ||
1731 | drop_parsed_module_refcounts(opts.subsys_mask); | ||
1732 | out_err: | 1712 | out_err: |
1733 | kfree(opts.release_agent); | 1713 | kfree(opts.release_agent); |
1734 | kfree(opts.name); | 1714 | kfree(opts.name); |
@@ -1746,6 +1726,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1746 | BUG_ON(root->number_of_cgroups != 1); | 1726 | BUG_ON(root->number_of_cgroups != 1); |
1747 | BUG_ON(!list_empty(&cgrp->children)); | 1727 | BUG_ON(!list_empty(&cgrp->children)); |
1748 | 1728 | ||
1729 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); | ||
1749 | mutex_lock(&cgroup_mutex); | 1730 | mutex_lock(&cgroup_mutex); |
1750 | mutex_lock(&cgroup_root_mutex); | 1731 | mutex_lock(&cgroup_root_mutex); |
1751 | 1732 | ||
@@ -1778,6 +1759,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1778 | 1759 | ||
1779 | mutex_unlock(&cgroup_root_mutex); | 1760 | mutex_unlock(&cgroup_root_mutex); |
1780 | mutex_unlock(&cgroup_mutex); | 1761 | mutex_unlock(&cgroup_mutex); |
1762 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | ||
1781 | 1763 | ||
1782 | simple_xattrs_free(&cgrp->xattrs); | 1764 | simple_xattrs_free(&cgrp->xattrs); |
1783 | 1765 | ||
@@ -1845,36 +1827,43 @@ out:
1845 | EXPORT_SYMBOL_GPL(cgroup_path); | 1827 | EXPORT_SYMBOL_GPL(cgroup_path); |
1846 | 1828 | ||
1847 | /** | 1829 | /** |
1848 | * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy | 1830 | * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy |
1849 | * @task: target task | 1831 | * @task: target task |
1850 | * @hierarchy_id: the hierarchy to look up @task's cgroup from | ||
1851 | * @buf: the buffer to write the path into | 1832 | * @buf: the buffer to write the path into |
1852 | * @buflen: the length of the buffer | 1833 | * @buflen: the length of the buffer |
1853 | * | 1834 | * |
1854 | * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and | 1835 | * Determine @task's cgroup on the first (the one with the lowest non-zero |
1855 | * copy its path into @buf. This function grabs cgroup_mutex and shouldn't | 1836 | * hierarchy_id) cgroup hierarchy and copy its path into @buf. This |
1856 | * be used inside locks used by cgroup controller callbacks. | 1837 | * function grabs cgroup_mutex and shouldn't be used inside locks used by |
1838 | * cgroup controller callbacks. | ||
1839 | * | ||
1840 | * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short. | ||
1857 | */ | 1841 | */ |
1858 | int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id, | 1842 | int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) |
1859 | char *buf, size_t buflen) | ||
1860 | { | 1843 | { |
1861 | struct cgroupfs_root *root; | 1844 | struct cgroupfs_root *root; |
1862 | struct cgroup *cgrp = NULL; | 1845 | struct cgroup *cgrp; |
1863 | int ret = -ENOENT; | 1846 | int hierarchy_id = 1, ret = 0; |
1847 | |||
1848 | if (buflen < 2) | ||
1849 | return -ENAMETOOLONG; | ||
1864 | 1850 | ||
1865 | mutex_lock(&cgroup_mutex); | 1851 | mutex_lock(&cgroup_mutex); |
1866 | 1852 | ||
1867 | root = idr_find(&cgroup_hierarchy_idr, hierarchy_id); | 1853 | root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); |
1854 | |||
1868 | if (root) { | 1855 | if (root) { |
1869 | cgrp = task_cgroup_from_root(task, root); | 1856 | cgrp = task_cgroup_from_root(task, root); |
1870 | ret = cgroup_path(cgrp, buf, buflen); | 1857 | ret = cgroup_path(cgrp, buf, buflen); |
1858 | } else { | ||
1859 | /* if no hierarchy exists, everyone is in "/" */ | ||
1860 | memcpy(buf, "/", 2); | ||
1871 | } | 1861 | } |
1872 | 1862 | ||
1873 | mutex_unlock(&cgroup_mutex); | 1863 | mutex_unlock(&cgroup_mutex); |
1874 | |||
1875 | return ret; | 1864 | return ret; |
1876 | } | 1865 | } |
1877 | EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy); | 1866 | EXPORT_SYMBOL_GPL(task_cgroup_path); |
1878 | 1867 | ||
1879 | /* | 1868 | /* |
1880 | * Control Group taskset | 1869 | * Control Group taskset |
@@ -1882,7 +1871,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy);
1882 | struct task_and_cgroup { | 1871 | struct task_and_cgroup { |
1883 | struct task_struct *task; | 1872 | struct task_struct *task; |
1884 | struct cgroup *cgrp; | 1873 | struct cgroup *cgrp; |
1885 | struct css_set *cg; | 1874 | struct css_set *cset; |
1886 | }; | 1875 | }; |
1887 | 1876 | ||
1888 | struct cgroup_taskset { | 1877 | struct cgroup_taskset { |
@@ -1932,18 +1921,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1932 | EXPORT_SYMBOL_GPL(cgroup_taskset_next); | 1921 | EXPORT_SYMBOL_GPL(cgroup_taskset_next); |
1933 | 1922 | ||
1934 | /** | 1923 | /** |
1935 | * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task | 1924 | * cgroup_taskset_cur_css - return the matching css for the current task |
1936 | * @tset: taskset of interest | 1925 | * @tset: taskset of interest |
1926 | * @subsys_id: the ID of the target subsystem | ||
1937 | * | 1927 | * |
1938 | * Return the cgroup for the current (last returned) task of @tset. This | 1928 | * Return the css for the current (last returned) task of @tset for |
1939 | * function must be preceded by either cgroup_taskset_first() or | 1929 | * subsystem specified by @subsys_id. This function must be preceded by |
1940 | * cgroup_taskset_next(). | 1930 | * either cgroup_taskset_first() or cgroup_taskset_next(). |
1941 | */ | 1931 | */ |
1942 | struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) | 1932 | struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, |
1933 | int subsys_id) | ||
1943 | { | 1934 | { |
1944 | return tset->cur_cgrp; | 1935 | return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]); |
1945 | } | 1936 | } |
1946 | EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); | 1937 | EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); |
1947 | 1938 | ||
1948 | /** | 1939 | /** |
1949 | * cgroup_taskset_size - return the number of tasks in taskset | 1940 | * cgroup_taskset_size - return the number of tasks in taskset |
@@ -2082,8 +2073,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2082 | * step 1: check that we can legitimately attach to the cgroup. | 2073 | * step 1: check that we can legitimately attach to the cgroup. |
2083 | */ | 2074 | */ |
2084 | for_each_root_subsys(root, ss) { | 2075 | for_each_root_subsys(root, ss) { |
2076 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); | ||
2077 | |||
2085 | if (ss->can_attach) { | 2078 | if (ss->can_attach) { |
2086 | retval = ss->can_attach(cgrp, &tset); | 2079 | retval = ss->can_attach(css, &tset); |
2087 | if (retval) { | 2080 | if (retval) { |
2088 | failed_ss = ss; | 2081 | failed_ss = ss; |
2089 | goto out_cancel_attach; | 2082 | goto out_cancel_attach; |
@@ -2100,8 +2093,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2100 | 2093 | ||
2101 | tc = flex_array_get(group, i); | 2094 | tc = flex_array_get(group, i); |
2102 | old_cset = task_css_set(tc->task); | 2095 | old_cset = task_css_set(tc->task); |
2103 | tc->cg = find_css_set(old_cset, cgrp); | 2096 | tc->cset = find_css_set(old_cset, cgrp); |
2104 | if (!tc->cg) { | 2097 | if (!tc->cset) { |
2105 | retval = -ENOMEM; | 2098 | retval = -ENOMEM; |
2106 | goto out_put_css_set_refs; | 2099 | goto out_put_css_set_refs; |
2107 | } | 2100 | } |
@@ -2114,7 +2107,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2114 | */ | 2107 | */ |
2115 | for (i = 0; i < group_size; i++) { | 2108 | for (i = 0; i < group_size; i++) { |
2116 | tc = flex_array_get(group, i); | 2109 | tc = flex_array_get(group, i); |
2117 | cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); | 2110 | cgroup_task_migrate(tc->cgrp, tc->task, tc->cset); |
2118 | } | 2111 | } |
2119 | /* nothing is sensitive to fork() after this point. */ | 2112 | /* nothing is sensitive to fork() after this point. */ |
2120 | 2113 | ||
@@ -2122,8 +2115,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2122 | * step 4: do subsystem attach callbacks. | 2115 | * step 4: do subsystem attach callbacks. |
2123 | */ | 2116 | */ |
2124 | for_each_root_subsys(root, ss) { | 2117 | for_each_root_subsys(root, ss) { |
2118 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); | ||
2119 | |||
2125 | if (ss->attach) | 2120 | if (ss->attach) |
2126 | ss->attach(cgrp, &tset); | 2121 | ss->attach(css, &tset); |
2127 | } | 2122 | } |
2128 | 2123 | ||
2129 | /* | 2124 | /* |
@@ -2134,18 +2129,20 @@ out_put_css_set_refs:
2134 | if (retval) { | 2129 | if (retval) { |
2135 | for (i = 0; i < group_size; i++) { | 2130 | for (i = 0; i < group_size; i++) { |
2136 | tc = flex_array_get(group, i); | 2131 | tc = flex_array_get(group, i); |
2137 | if (!tc->cg) | 2132 | if (!tc->cset) |
2138 | break; | 2133 | break; |
2139 | put_css_set(tc->cg); | 2134 | put_css_set(tc->cset); |
2140 | } | 2135 | } |
2141 | } | 2136 | } |
2142 | out_cancel_attach: | 2137 | out_cancel_attach: |
2143 | if (retval) { | 2138 | if (retval) { |
2144 | for_each_root_subsys(root, ss) { | 2139 | for_each_root_subsys(root, ss) { |
2140 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); | ||
2141 | |||
2145 | if (ss == failed_ss) | 2142 | if (ss == failed_ss) |
2146 | break; | 2143 | break; |
2147 | if (ss->cancel_attach) | 2144 | if (ss->cancel_attach) |
2148 | ss->cancel_attach(cgrp, &tset); | 2145 | ss->cancel_attach(css, &tset); |
2149 | } | 2146 | } |
2150 | } | 2147 | } |
2151 | out_free_group_list: | 2148 | out_free_group_list: |
@@ -2246,9 +2243,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2246 | 2243 | ||
2247 | mutex_lock(&cgroup_mutex); | 2244 | mutex_lock(&cgroup_mutex); |
2248 | for_each_active_root(root) { | 2245 | for_each_active_root(root) { |
2249 | struct cgroup *from_cg = task_cgroup_from_root(from, root); | 2246 | struct cgroup *from_cgrp = task_cgroup_from_root(from, root); |
2250 | 2247 | ||
2251 | retval = cgroup_attach_task(from_cg, tsk, false); | 2248 | retval = cgroup_attach_task(from_cgrp, tsk, false); |
2252 | if (retval) | 2249 | if (retval) |
2253 | break; | 2250 | break; |
2254 | } | 2251 | } |
@@ -2258,34 +2255,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2258 | } | 2255 | } |
2259 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); | 2256 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); |
2260 | 2257 | ||
2261 | static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) | 2258 | static int cgroup_tasks_write(struct cgroup_subsys_state *css, |
2259 | struct cftype *cft, u64 pid) | ||
2262 | { | 2260 | { |
2263 | return attach_task_by_pid(cgrp, pid, false); | 2261 | return attach_task_by_pid(css->cgroup, pid, false); |
2264 | } | 2262 | } |
2265 | 2263 | ||
2266 | static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) | 2264 | static int cgroup_procs_write(struct cgroup_subsys_state *css, |
2265 | struct cftype *cft, u64 tgid) | ||
2267 | { | 2266 | { |
2268 | return attach_task_by_pid(cgrp, tgid, true); | 2267 | return attach_task_by_pid(css->cgroup, tgid, true); |
2269 | } | 2268 | } |
2270 | 2269 | ||
2271 | static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, | 2270 | static int cgroup_release_agent_write(struct cgroup_subsys_state *css, |
2272 | const char *buffer) | 2271 | struct cftype *cft, const char *buffer) |
2273 | { | 2272 | { |
2274 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); | 2273 | BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); |
2275 | if (strlen(buffer) >= PATH_MAX) | 2274 | if (strlen(buffer) >= PATH_MAX) |
2276 | return -EINVAL; | 2275 | return -EINVAL; |
2277 | if (!cgroup_lock_live_group(cgrp)) | 2276 | if (!cgroup_lock_live_group(css->cgroup)) |
2278 | return -ENODEV; | 2277 | return -ENODEV; |
2279 | mutex_lock(&cgroup_root_mutex); | 2278 | mutex_lock(&cgroup_root_mutex); |
2280 | strcpy(cgrp->root->release_agent_path, buffer); | 2279 | strcpy(css->cgroup->root->release_agent_path, buffer); |
2281 | mutex_unlock(&cgroup_root_mutex); | 2280 | mutex_unlock(&cgroup_root_mutex); |
2282 | mutex_unlock(&cgroup_mutex); | 2281 | mutex_unlock(&cgroup_mutex); |
2283 | return 0; | 2282 | return 0; |
2284 | } | 2283 | } |
2285 | 2284 | ||
2286 | static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, | 2285 | static int cgroup_release_agent_show(struct cgroup_subsys_state *css, |
2287 | struct seq_file *seq) | 2286 | struct cftype *cft, struct seq_file *seq) |
2288 | { | 2287 | { |
2288 | struct cgroup *cgrp = css->cgroup; | ||
2289 | |||
2289 | if (!cgroup_lock_live_group(cgrp)) | 2290 | if (!cgroup_lock_live_group(cgrp)) |
2290 | return -ENODEV; | 2291 | return -ENODEV; |
2291 | seq_puts(seq, cgrp->root->release_agent_path); | 2292 | seq_puts(seq, cgrp->root->release_agent_path); |
@@ -2294,20 +2295,20 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
2294 | return 0; | 2295 | return 0; |
2295 | } | 2296 | } |
2296 | 2297 | ||
2297 | static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, | 2298 | static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, |
2298 | struct seq_file *seq) | 2299 | struct cftype *cft, struct seq_file *seq) |
2299 | { | 2300 | { |
2300 | seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); | 2301 | seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); |
2301 | return 0; | 2302 | return 0; |
2302 | } | 2303 | } |
2303 | 2304 | ||
2304 | /* A buffer size big enough for numbers or short strings */ | 2305 | /* A buffer size big enough for numbers or short strings */ |
2305 | #define CGROUP_LOCAL_BUFFER_SIZE 64 | 2306 | #define CGROUP_LOCAL_BUFFER_SIZE 64 |
2306 | 2307 | ||
2307 | static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, | 2308 | static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, |
2308 | struct file *file, | 2309 | struct cftype *cft, struct file *file, |
2309 | const char __user *userbuf, | 2310 | const char __user *userbuf, size_t nbytes, |
2310 | size_t nbytes, loff_t *unused_ppos) | 2311 | loff_t *unused_ppos) |
2311 | { | 2312 | { |
2312 | char buffer[CGROUP_LOCAL_BUFFER_SIZE]; | 2313 | char buffer[CGROUP_LOCAL_BUFFER_SIZE]; |
2313 | int retval = 0; | 2314 | int retval = 0; |
@@ -2325,22 +2326,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
2325 | u64 val = simple_strtoull(strstrip(buffer), &end, 0); | 2326 | u64 val = simple_strtoull(strstrip(buffer), &end, 0); |
2326 | if (*end) | 2327 | if (*end) |
2327 | return -EINVAL; | 2328 | return -EINVAL; |
2328 | retval = cft->write_u64(cgrp, cft, val); | 2329 | retval = cft->write_u64(css, cft, val); |
2329 | } else { | 2330 | } else { |
2330 | s64 val = simple_strtoll(strstrip(buffer), &end, 0); | 2331 | s64 val = simple_strtoll(strstrip(buffer), &end, 0); |
2331 | if (*end) | 2332 | if (*end) |
2332 | return -EINVAL; | 2333 | return -EINVAL; |
2333 | retval = cft->write_s64(cgrp, cft, val); | 2334 | retval = cft->write_s64(css, cft, val); |
2334 | } | 2335 | } |
2335 | if (!retval) | 2336 | if (!retval) |
2336 | retval = nbytes; | 2337 | retval = nbytes; |
2337 | return retval; | 2338 | return retval; |
2338 | } | 2339 | } |
2339 | 2340 | ||
2340 | static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, | 2341 | static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, |
2341 | struct file *file, | 2342 | struct cftype *cft, struct file *file, |
2342 | const char __user *userbuf, | 2343 | const char __user *userbuf, size_t nbytes, |
2343 | size_t nbytes, loff_t *unused_ppos) | 2344 | loff_t *unused_ppos) |
2344 | { | 2345 | { |
2345 | char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; | 2346 | char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; |
2346 | int retval = 0; | 2347 | int retval = 0; |
@@ -2363,7 +2364,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
2363 | } | 2364 | } |
2364 | 2365 | ||
2365 | buffer[nbytes] = 0; /* nul-terminate */ | 2366 | buffer[nbytes] = 0; /* nul-terminate */ |
2366 | retval = cft->write_string(cgrp, cft, strstrip(buffer)); | 2367 | retval = cft->write_string(css, cft, strstrip(buffer)); |
2367 | if (!retval) | 2368 | if (!retval) |
2368 | retval = nbytes; | 2369 | retval = nbytes; |
2369 | out: | 2370 | out: |
@@ -2373,65 +2374,60 @@ out:
2373 | } | 2374 | } |
2374 | 2375 | ||
2375 | static ssize_t cgroup_file_write(struct file *file, const char __user *buf, | 2376 | static ssize_t cgroup_file_write(struct file *file, const char __user *buf, |
2376 | size_t nbytes, loff_t *ppos) | 2377 | size_t nbytes, loff_t *ppos) |
2377 | { | 2378 | { |
2379 | struct cfent *cfe = __d_cfe(file->f_dentry); | ||
2378 | struct cftype *cft = __d_cft(file->f_dentry); | 2380 | struct cftype *cft = __d_cft(file->f_dentry); |
2379 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2381 | struct cgroup_subsys_state *css = cfe->css; |
2380 | 2382 | ||
2381 | if (cgroup_is_dead(cgrp)) | ||
2382 | return -ENODEV; | ||
2383 | if (cft->write) | 2383 | if (cft->write) |
2384 | return cft->write(cgrp, cft, file, buf, nbytes, ppos); | 2384 | return cft->write(css, cft, file, buf, nbytes, ppos); |
2385 | if (cft->write_u64 || cft->write_s64) | 2385 | if (cft->write_u64 || cft->write_s64) |
2386 | return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); | 2386 | return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); |
2387 | if (cft->write_string) | 2387 | if (cft->write_string) |
2388 | return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); | 2388 | return cgroup_write_string(css, cft, file, buf, nbytes, ppos); |
2389 | if (cft->trigger) { | 2389 | if (cft->trigger) { |
2390 | int ret = cft->trigger(cgrp, (unsigned int)cft->private); | 2390 | int ret = cft->trigger(css, (unsigned int)cft->private); |
2391 | return ret ? ret : nbytes; | 2391 | return ret ? ret : nbytes; |
2392 | } | 2392 | } |
2393 | return -EINVAL; | 2393 | return -EINVAL; |
2394 | } | 2394 | } |
2395 | 2395 | ||
2396 | static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, | 2396 | static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, |
2397 | struct file *file, | 2397 | struct cftype *cft, struct file *file, |
2398 | char __user *buf, size_t nbytes, | 2398 | char __user *buf, size_t nbytes, loff_t *ppos) |
2399 | loff_t *ppos) | ||
2400 | { | 2399 | { |
2401 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; | 2400 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; |
2402 | u64 val = cft->read_u64(cgrp, cft); | 2401 | u64 val = cft->read_u64(css, cft); |
2403 | int len = sprintf(tmp, "%llu\n", (unsigned long long) val); | 2402 | int len = sprintf(tmp, "%llu\n", (unsigned long long) val); |
2404 | 2403 | ||
2405 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | 2404 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); |
2406 | } | 2405 | } |
2407 | 2406 | ||
2408 | static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, | 2407 | static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, |
2409 | struct file *file, | 2408 | struct cftype *cft, struct file *file, |
2410 | char __user *buf, size_t nbytes, | 2409 | char __user *buf, size_t nbytes, loff_t *ppos) |
2411 | loff_t *ppos) | ||
2412 | { | 2410 | { |
2413 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; | 2411 | char tmp[CGROUP_LOCAL_BUFFER_SIZE]; |
2414 | s64 val = cft->read_s64(cgrp, cft); | 2412 | s64 val = cft->read_s64(css, cft); |
2415 | int len = sprintf(tmp, "%lld\n", (long long) val); | 2413 | int len = sprintf(tmp, "%lld\n", (long long) val); |
2416 | 2414 | ||
2417 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); | 2415 | return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); |
2418 | } | 2416 | } |
2419 | 2417 | ||
2420 | static ssize_t cgroup_file_read(struct file *file, char __user *buf, | 2418 | static ssize_t cgroup_file_read(struct file *file, char __user *buf, |
2421 | size_t nbytes, loff_t *ppos) | 2419 | size_t nbytes, loff_t *ppos) |
2422 | { | 2420 | { |
2421 | struct cfent *cfe = __d_cfe(file->f_dentry); | ||
2423 | struct cftype *cft = __d_cft(file->f_dentry); | 2422 | struct cftype *cft = __d_cft(file->f_dentry); |
2424 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 2423 | struct cgroup_subsys_state *css = cfe->css; |
2425 | |||
2426 | if (cgroup_is_dead(cgrp)) | ||
2427 | return -ENODEV; | ||
2428 | 2424 | ||
2429 | if (cft->read) | 2425 | if (cft->read) |
2430 | return cft->read(cgrp, cft, file, buf, nbytes, ppos); | 2426 | return cft->read(css, cft, file, buf, nbytes, ppos); |
2431 | if (cft->read_u64) | 2427 | if (cft->read_u64) |
2432 | return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); | 2428 | return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); |
2433 | if (cft->read_s64) | 2429 | if (cft->read_s64) |
2434 | return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); | 2430 | return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); |
2435 | return -EINVAL; | 2431 | return -EINVAL; |
2436 | } | 2432 | } |
2437 | 2433 | ||
@@ -2440,11 +2436,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2440 | * supports string->u64 maps, but can be extended in future. | 2436 | * supports string->u64 maps, but can be extended in future. |
2441 | */ | 2437 | */ |
2442 | 2438 | ||
2443 | struct cgroup_seqfile_state { | ||
2444 | struct cftype *cft; | ||
2445 | struct cgroup *cgroup; | ||
2446 | }; | ||
2447 | |||
2448 | static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) | 2439 | static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) |
2449 | { | 2440 | { |
2450 | struct seq_file *sf = cb->state; | 2441 | struct seq_file *sf = cb->state; |
@@ -2453,69 +2444,86 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2453 | 2444 | ||
2454 | static int cgroup_seqfile_show(struct seq_file *m, void *arg) | 2445 | static int cgroup_seqfile_show(struct seq_file *m, void *arg) |
2455 | { | 2446 | { |
2456 | struct cgroup_seqfile_state *state = m->private; | 2447 | struct cfent *cfe = m->private; |
2457 | struct cftype *cft = state->cft; | 2448 | struct cftype *cft = cfe->type; |
2449 | struct cgroup_subsys_state *css = cfe->css; | ||
2450 | |||
2458 | if (cft->read_map) { | 2451 | if (cft->read_map) { |
2459 | struct cgroup_map_cb cb = { | 2452 | struct cgroup_map_cb cb = { |
2460 | .fill = cgroup_map_add, | 2453 | .fill = cgroup_map_add, |
2461 | .state = m, | 2454 | .state = m, |
2462 | }; | 2455 | }; |
2463 | return cft->read_map(state->cgroup, cft, &cb); | 2456 | return cft->read_map(css, cft, &cb); |
2464 | } | 2457 | } |
2465 | return cft->read_seq_string(state->cgroup, cft, m); | 2458 | return cft->read_seq_string(css, cft, m); |
2466 | } | ||
2467 | |||
2468 | static int cgroup_seqfile_release(struct inode *inode, struct file *file) | ||
2469 | { | ||
2470 | struct seq_file *seq = file->private_data; | ||
2471 | kfree(seq->private); | ||
2472 | return single_release(inode, file); | ||
2473 | } | 2459 | } |
2474 | 2460 | ||
2475 | static const struct file_operations cgroup_seqfile_operations = { | 2461 | static const struct file_operations cgroup_seqfile_operations = { |
2476 | .read = seq_read, | 2462 | .read = seq_read, |
2477 | .write = cgroup_file_write, | 2463 | .write = cgroup_file_write, |
2478 | .llseek = seq_lseek, | 2464 | .llseek = seq_lseek, |
2479 | .release = cgroup_seqfile_release, | 2465 | .release = single_release, |
2480 | }; | 2466 | }; |
2481 | 2467 | ||
2482 | static int cgroup_file_open(struct inode *inode, struct file *file) | 2468 | static int cgroup_file_open(struct inode *inode, struct file *file) |
2483 | { | 2469 | { |
2470 | struct cfent *cfe = __d_cfe(file->f_dentry); | ||
2471 | struct cftype *cft = __d_cft(file->f_dentry); | ||
2472 | struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); | ||
2473 | struct cgroup_subsys_state *css; | ||
2484 | int err; | 2474 | int err; |
2485 | struct cftype *cft; | ||
2486 | 2475 | ||
2487 | err = generic_file_open(inode, file); | 2476 | err = generic_file_open(inode, file); |
2488 | if (err) | 2477 | if (err) |
2489 | return err; | 2478 | return err; |
2490 | cft = __d_cft(file->f_dentry); | ||
2491 | 2479 | ||
2492 | if (cft->read_map || cft->read_seq_string) { | 2480 | /* |
2493 | struct cgroup_seqfile_state *state; | 2481 | * If the file belongs to a subsystem, pin the css. Will be |
2482 | * unpinned either on open failure or release. This ensures that | ||
2483 | * @css stays alive for all file operations. | ||
2484 | */ | ||
2485 | rcu_read_lock(); | ||
2486 | css = cgroup_css(cgrp, cft->ss); | ||
2487 | if (cft->ss && !css_tryget(css)) | ||
2488 | css = NULL; | ||
2489 | rcu_read_unlock(); | ||
2494 | 2490 | ||
2495 | state = kzalloc(sizeof(*state), GFP_USER); | 2491 | if (!css) |
2496 | if (!state) | 2492 | return -ENODEV; |
2497 | return -ENOMEM; | 2493 | |
2494 | /* | ||
2495 | * @cfe->css is used by read/write/close to determine the | ||
2496 | * associated css. @file->private_data would be a better place but | ||
2497 | * that's already used by seqfile. Multiple accessors may use it | ||
2498 | * simultaneously which is okay as the association never changes. | ||
2499 | */ | ||
2500 | WARN_ON_ONCE(cfe->css && cfe->css != css); | ||
2501 | cfe->css = css; | ||
2498 | 2502 | ||
2499 | state->cft = cft; | 2503 | if (cft->read_map || cft->read_seq_string) { |
2500 | state->cgroup = __d_cgrp(file->f_dentry->d_parent); | ||
2501 | file->f_op = &cgroup_seqfile_operations; | 2504 | file->f_op = &cgroup_seqfile_operations; |
2502 | err = single_open(file, cgroup_seqfile_show, state); | 2505 | err = single_open(file, cgroup_seqfile_show, cfe); |
2503 | if (err < 0) | 2506 | } else if (cft->open) { |
2504 | kfree(state); | ||
2505 | } else if (cft->open) | ||
2506 | err = cft->open(inode, file); | 2507 | err = cft->open(inode, file); |
2507 | else | 2508 | } |
2508 | err = 0; | ||
2509 | 2509 | ||
2510 | if (css->ss && err) | ||
2511 | css_put(css); | ||
2510 | return err; | 2512 | return err; |
2511 | } | 2513 | } |
2512 | 2514 | ||
2513 | static int cgroup_file_release(struct inode *inode, struct file *file) | 2515 | static int cgroup_file_release(struct inode *inode, struct file *file) |
2514 | { | 2516 | { |
2517 | struct cfent *cfe = __d_cfe(file->f_dentry); | ||
2515 | struct cftype *cft = __d_cft(file->f_dentry); | 2518 | struct cftype *cft = __d_cft(file->f_dentry); |
2519 | struct cgroup_subsys_state *css = cfe->css; | ||
2520 | int ret = 0; | ||
2521 | |||
2516 | if (cft->release) | 2522 | if (cft->release) |
2517 | return cft->release(inode, file); | 2523 | ret = cft->release(inode, file); |
2518 | return 0; | 2524 | if (css->ss) |
2525 | css_put(css); | ||
2526 | return ret; | ||
2519 | } | 2527 | } |
2520 | 2528 | ||
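With cgroup_file_open() now pinning the css for the lifetime of the open file, a controller's read/write handlers simply receive that css as their first argument. A minimal sketch of such a handler pair, assuming a hypothetical per-cgroup state struct (my_cgrp_state, my_css() and the weight field are illustrative, not part of this patch):

struct my_cgrp_state {
	struct cgroup_subsys_state css;	/* embedded css, as controllers typically do */
	u64 weight;
};

static inline struct my_cgrp_state *my_css(struct cgroup_subsys_state *css)
{
	return container_of(css, struct my_cgrp_state, css);
}

/* no css_tryget()/css_put() needed here: open/release above hold the reference */
static u64 my_weight_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return my_css(css)->weight;
}

static int my_weight_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
			       u64 val)
{
	if (!val || val > 1000)		/* illustrative bounds */
		return -EINVAL;
	my_css(css)->weight = val;
	return 0;
}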
2521 | /* | 2529 | /* |
@@ -2729,8 +2737,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) | |||
2729 | return mode; | 2737 | return mode; |
2730 | } | 2738 | } |
2731 | 2739 | ||
2732 | static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, | 2740 | static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) |
2733 | struct cftype *cft) | ||
2734 | { | 2741 | { |
2735 | struct dentry *dir = cgrp->dentry; | 2742 | struct dentry *dir = cgrp->dentry; |
2736 | struct cgroup *parent = __d_cgrp(dir); | 2743 | struct cgroup *parent = __d_cgrp(dir); |
@@ -2740,8 +2747,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2740 | umode_t mode; | 2747 | umode_t mode; |
2741 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; | 2748 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; |
2742 | 2749 | ||
2743 | if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { | 2750 | if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && |
2744 | strcpy(name, subsys->name); | 2751 | !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { |
2752 | strcpy(name, cft->ss->name); | ||
2745 | strcat(name, "."); | 2753 | strcat(name, "."); |
2746 | } | 2754 | } |
2747 | strcat(name, cft->name); | 2755 | strcat(name, cft->name); |
@@ -2775,11 +2783,25 @@ out: | |||
2775 | return error; | 2783 | return error; |
2776 | } | 2784 | } |
2777 | 2785 | ||
2778 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | 2786 | /** |
2779 | struct cftype cfts[], bool is_add) | 2787 | * cgroup_addrm_files - add or remove files to a cgroup directory |
2788 | * @cgrp: the target cgroup | ||
2789 | * @cfts: array of cftypes to be added | ||
2790 | * @is_add: whether to add or remove | ||
2791 | * | ||
2792 | * Depending on @is_add, add or remove files defined by @cfts on @cgrp. | ||
2793 | * For removals, this function never fails. If addition fails, this | ||
2794 | * function doesn't remove files already added. The caller is responsible | ||
2795 | * for cleaning up. | ||
2796 | */ | ||
2797 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | ||
2798 | bool is_add) | ||
2780 | { | 2799 | { |
2781 | struct cftype *cft; | 2800 | struct cftype *cft; |
2782 | int err, ret = 0; | 2801 | int ret; |
2802 | |||
2803 | lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); | ||
2804 | lockdep_assert_held(&cgroup_mutex); | ||
2783 | 2805 | ||
2784 | for (cft = cfts; cft->name[0] != '\0'; cft++) { | 2806 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
2785 | /* does cft->flags tell us to skip this file on @cgrp? */ | 2807 | /* does cft->flags tell us to skip this file on @cgrp? */ |
@@ -2791,16 +2813,17 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
2791 | continue; | 2813 | continue; |
2792 | 2814 | ||
2793 | if (is_add) { | 2815 | if (is_add) { |
2794 | err = cgroup_add_file(cgrp, subsys, cft); | 2816 | ret = cgroup_add_file(cgrp, cft); |
2795 | if (err) | 2817 | if (ret) { |
2796 | pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", | 2818 | pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", |
2797 | cft->name, err); | 2819 | cft->name, ret); |
2798 | ret = err; | 2820 | return ret; |
2821 | } | ||
2799 | } else { | 2822 | } else { |
2800 | cgroup_rm_file(cgrp, cft); | 2823 | cgroup_rm_file(cgrp, cft); |
2801 | } | 2824 | } |
2802 | } | 2825 | } |
2803 | return ret; | 2826 | return 0; |
2804 | } | 2827 | } |
2805 | 2828 | ||
2806 | static void cgroup_cfts_prepare(void) | 2829 | static void cgroup_cfts_prepare(void) |
@@ -2809,28 +2832,30 @@ static void cgroup_cfts_prepare(void) | |||
2809 | /* | 2832 | /* |
2810 | * Thanks to the entanglement with vfs inode locking, we can't walk | 2833 | * Thanks to the entanglement with vfs inode locking, we can't walk |
2811 | * the existing cgroups under cgroup_mutex and create files. | 2834 | * the existing cgroups under cgroup_mutex and create files. |
2812 | * Instead, we use cgroup_for_each_descendant_pre() and drop RCU | 2835 | * Instead, we use css_for_each_descendant_pre() and drop RCU read |
2813 | * read lock before calling cgroup_addrm_files(). | 2836 | * lock before calling cgroup_addrm_files(). |
2814 | */ | 2837 | */ |
2815 | mutex_lock(&cgroup_mutex); | 2838 | mutex_lock(&cgroup_mutex); |
2816 | } | 2839 | } |
2817 | 2840 | ||
2818 | static void cgroup_cfts_commit(struct cgroup_subsys *ss, | 2841 | static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) |
2819 | struct cftype *cfts, bool is_add) | ||
2820 | __releases(&cgroup_mutex) | 2842 | __releases(&cgroup_mutex) |
2821 | { | 2843 | { |
2822 | LIST_HEAD(pending); | 2844 | LIST_HEAD(pending); |
2823 | struct cgroup *cgrp, *root = &ss->root->top_cgroup; | 2845 | struct cgroup_subsys *ss = cfts[0].ss; |
2846 | struct cgroup *root = &ss->root->top_cgroup; | ||
2824 | struct super_block *sb = ss->root->sb; | 2847 | struct super_block *sb = ss->root->sb; |
2825 | struct dentry *prev = NULL; | 2848 | struct dentry *prev = NULL; |
2826 | struct inode *inode; | 2849 | struct inode *inode; |
2850 | struct cgroup_subsys_state *css; | ||
2827 | u64 update_before; | 2851 | u64 update_before; |
2852 | int ret = 0; | ||
2828 | 2853 | ||
2829 | /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ | 2854 | /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ |
2830 | if (!cfts || ss->root == &cgroup_dummy_root || | 2855 | if (!cfts || ss->root == &cgroup_dummy_root || |
2831 | !atomic_inc_not_zero(&sb->s_active)) { | 2856 | !atomic_inc_not_zero(&sb->s_active)) { |
2832 | mutex_unlock(&cgroup_mutex); | 2857 | mutex_unlock(&cgroup_mutex); |
2833 | return; | 2858 | return 0; |
2834 | } | 2859 | } |
2835 | 2860 | ||
2836 | /* | 2861 | /* |
@@ -2842,17 +2867,11 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, | |||
2842 | 2867 | ||
2843 | mutex_unlock(&cgroup_mutex); | 2868 | mutex_unlock(&cgroup_mutex); |
2844 | 2869 | ||
2845 | /* @root always needs to be updated */ | ||
2846 | inode = root->dentry->d_inode; | ||
2847 | mutex_lock(&inode->i_mutex); | ||
2848 | mutex_lock(&cgroup_mutex); | ||
2849 | cgroup_addrm_files(root, ss, cfts, is_add); | ||
2850 | mutex_unlock(&cgroup_mutex); | ||
2851 | mutex_unlock(&inode->i_mutex); | ||
2852 | |||
2853 | /* add/rm files for all cgroups created before */ | 2870 | /* add/rm files for all cgroups created before */ |
2854 | rcu_read_lock(); | 2871 | rcu_read_lock(); |
2855 | cgroup_for_each_descendant_pre(cgrp, root) { | 2872 | css_for_each_descendant_pre(css, cgroup_css(root, ss)) { |
2873 | struct cgroup *cgrp = css->cgroup; | ||
2874 | |||
2856 | if (cgroup_is_dead(cgrp)) | 2875 | if (cgroup_is_dead(cgrp)) |
2857 | continue; | 2876 | continue; |
2858 | 2877 | ||
@@ -2866,15 +2885,18 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, | |||
2866 | mutex_lock(&inode->i_mutex); | 2885 | mutex_lock(&inode->i_mutex); |
2867 | mutex_lock(&cgroup_mutex); | 2886 | mutex_lock(&cgroup_mutex); |
2868 | if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) | 2887 | if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) |
2869 | cgroup_addrm_files(cgrp, ss, cfts, is_add); | 2888 | ret = cgroup_addrm_files(cgrp, cfts, is_add); |
2870 | mutex_unlock(&cgroup_mutex); | 2889 | mutex_unlock(&cgroup_mutex); |
2871 | mutex_unlock(&inode->i_mutex); | 2890 | mutex_unlock(&inode->i_mutex); |
2872 | 2891 | ||
2873 | rcu_read_lock(); | 2892 | rcu_read_lock(); |
2893 | if (ret) | ||
2894 | break; | ||
2874 | } | 2895 | } |
2875 | rcu_read_unlock(); | 2896 | rcu_read_unlock(); |
2876 | dput(prev); | 2897 | dput(prev); |
2877 | deactivate_super(sb); | 2898 | deactivate_super(sb); |
2899 | return ret; | ||
2878 | } | 2900 | } |
2879 | 2901 | ||
2880 | /** | 2902 | /** |
@@ -2894,49 +2916,56 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, | |||
2894 | int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | 2916 | int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) |
2895 | { | 2917 | { |
2896 | struct cftype_set *set; | 2918 | struct cftype_set *set; |
2919 | struct cftype *cft; | ||
2920 | int ret; | ||
2897 | 2921 | ||
2898 | set = kzalloc(sizeof(*set), GFP_KERNEL); | 2922 | set = kzalloc(sizeof(*set), GFP_KERNEL); |
2899 | if (!set) | 2923 | if (!set) |
2900 | return -ENOMEM; | 2924 | return -ENOMEM; |
2901 | 2925 | ||
2926 | for (cft = cfts; cft->name[0] != '\0'; cft++) | ||
2927 | cft->ss = ss; | ||
2928 | |||
2902 | cgroup_cfts_prepare(); | 2929 | cgroup_cfts_prepare(); |
2903 | set->cfts = cfts; | 2930 | set->cfts = cfts; |
2904 | list_add_tail(&set->node, &ss->cftsets); | 2931 | list_add_tail(&set->node, &ss->cftsets); |
2905 | cgroup_cfts_commit(ss, cfts, true); | 2932 | ret = cgroup_cfts_commit(cfts, true); |
2906 | 2933 | if (ret) | |
2907 | return 0; | 2934 | cgroup_rm_cftypes(cfts); |
2935 | return ret; | ||
2908 | } | 2936 | } |
2909 | EXPORT_SYMBOL_GPL(cgroup_add_cftypes); | 2937 | EXPORT_SYMBOL_GPL(cgroup_add_cftypes); |
2910 | 2938 | ||
2911 | /** | 2939 | /** |
2912 | * cgroup_rm_cftypes - remove an array of cftypes from a subsystem | 2940 | * cgroup_rm_cftypes - remove an array of cftypes from a subsystem |
2913 | * @ss: target cgroup subsystem | ||
2914 | * @cfts: zero-length name terminated array of cftypes | 2941 | * @cfts: zero-length name terminated array of cftypes |
2915 | * | 2942 | * |
2916 | * Unregister @cfts from @ss. Files described by @cfts are removed from | 2943 | * Unregister @cfts. Files described by @cfts are removed from all |
2917 | * all existing cgroups to which @ss is attached and all future cgroups | 2944 | * existing cgroups and all future cgroups won't have them either. This |
2918 | * won't have them either. This function can be called anytime whether @ss | 2945 | * function can be called anytime whether @cfts' subsys is attached or not. |
2919 | * is attached or not. | ||
2920 | * | 2946 | * |
2921 | * Returns 0 on successful unregistration, -ENOENT if @cfts is not | 2947 | * Returns 0 on successful unregistration, -ENOENT if @cfts is not |
2922 | * registered with @ss. | 2948 | * registered. |
2923 | */ | 2949 | */ |
2924 | int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | 2950 | int cgroup_rm_cftypes(struct cftype *cfts) |
2925 | { | 2951 | { |
2926 | struct cftype_set *set; | 2952 | struct cftype_set *set; |
2927 | 2953 | ||
2954 | if (!cfts || !cfts[0].ss) | ||
2955 | return -ENOENT; | ||
2956 | |||
2928 | cgroup_cfts_prepare(); | 2957 | cgroup_cfts_prepare(); |
2929 | 2958 | ||
2930 | list_for_each_entry(set, &ss->cftsets, node) { | 2959 | list_for_each_entry(set, &cfts[0].ss->cftsets, node) { |
2931 | if (set->cfts == cfts) { | 2960 | if (set->cfts == cfts) { |
2932 | list_del(&set->node); | 2961 | list_del(&set->node); |
2933 | kfree(set); | 2962 | kfree(set); |
2934 | cgroup_cfts_commit(ss, cfts, false); | 2963 | cgroup_cfts_commit(cfts, false); |
2935 | return 0; | 2964 | return 0; |
2936 | } | 2965 | } |
2937 | } | 2966 | } |
2938 | 2967 | ||
2939 | cgroup_cfts_commit(ss, NULL, false); | 2968 | cgroup_cfts_commit(NULL, false); |
2940 | return -ENOENT; | 2969 | return -ENOENT; |
2941 | } | 2970 | } |
2942 | 2971 | ||
@@ -2959,34 +2988,10 @@ int cgroup_task_count(const struct cgroup *cgrp) | |||
2959 | } | 2988 | } |
2960 | 2989 | ||
2961 | /* | 2990 | /* |
2962 | * Advance a list_head iterator. The iterator should be positioned at | 2991 | * To reduce the fork() overhead for systems that are not actually using |
2963 | * the start of a css_set | 2992 | * their cgroups capability, we don't maintain the lists running through |
2964 | */ | 2993 | * each css_set to its tasks until we see the list actually used - in other |
2965 | static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) | 2994 | * words after the first call to css_task_iter_start(). |
2966 | { | ||
2967 | struct list_head *l = it->cset_link; | ||
2968 | struct cgrp_cset_link *link; | ||
2969 | struct css_set *cset; | ||
2970 | |||
2971 | /* Advance to the next non-empty css_set */ | ||
2972 | do { | ||
2973 | l = l->next; | ||
2974 | if (l == &cgrp->cset_links) { | ||
2975 | it->cset_link = NULL; | ||
2976 | return; | ||
2977 | } | ||
2978 | link = list_entry(l, struct cgrp_cset_link, cset_link); | ||
2979 | cset = link->cset; | ||
2980 | } while (list_empty(&cset->tasks)); | ||
2981 | it->cset_link = l; | ||
2982 | it->task = cset->tasks.next; | ||
2983 | } | ||
2984 | |||
2985 | /* | ||
2986 | * To reduce the fork() overhead for systems that are not actually | ||
2987 | * using their cgroups capability, we don't maintain the lists running | ||
2988 | * through each css_set to its tasks until we see the list actually | ||
2989 | * used - in other words after the first call to cgroup_iter_start(). | ||
2990 | */ | 2995 | */ |
2991 | static void cgroup_enable_task_cg_lists(void) | 2996 | static void cgroup_enable_task_cg_lists(void) |
2992 | { | 2997 | { |
@@ -3017,16 +3022,21 @@ static void cgroup_enable_task_cg_lists(void) | |||
3017 | } | 3022 | } |
3018 | 3023 | ||
3019 | /** | 3024 | /** |
3020 | * cgroup_next_sibling - find the next sibling of a given cgroup | 3025 | * css_next_child - find the next child of a given css |
3021 | * @pos: the current cgroup | 3026 | * @pos_css: the current position (%NULL to initiate traversal) |
3027 | * @parent_css: css whose children to walk | ||
3022 | * | 3028 | * |
3023 | * This function returns the next sibling of @pos and should be called | 3029 | * This function returns the next child of @parent_css and should be called |
3024 | * under RCU read lock. The only requirement is that @pos is accessible. | 3030 | * under RCU read lock. The only requirement is that @parent_css and |
3025 | * The next sibling is guaranteed to be returned regardless of @pos's | 3031 | * @pos_css are accessible. The next sibling is guaranteed to be returned |
3026 | * state. | 3032 | * regardless of their states. |
3027 | */ | 3033 | */ |
3028 | struct cgroup *cgroup_next_sibling(struct cgroup *pos) | 3034 | struct cgroup_subsys_state * |
3035 | css_next_child(struct cgroup_subsys_state *pos_css, | ||
3036 | struct cgroup_subsys_state *parent_css) | ||
3029 | { | 3037 | { |
3038 | struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; | ||
3039 | struct cgroup *cgrp = parent_css->cgroup; | ||
3030 | struct cgroup *next; | 3040 | struct cgroup *next; |
3031 | 3041 | ||
3032 | WARN_ON_ONCE(!rcu_read_lock_held()); | 3042 | WARN_ON_ONCE(!rcu_read_lock_held()); |
@@ -3041,78 +3051,81 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos) | |||
3041 | * safe to dereference from this RCU critical section. If | 3051 | * safe to dereference from this RCU critical section. If |
3042 | * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed | 3052 | * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed |
3043 | * to be visible as %true here. | 3053 | * to be visible as %true here. |
3054 | * | ||
3055 | * If @pos is dead, its next pointer can't be dereferenced; | ||
3056 | * however, as each cgroup is given a monotonically increasing | ||
3057 | * unique serial number and always appended to the sibling list, | ||
3058 | * the next one can be found by walking the parent's children until | ||
3059 | * we see a cgroup with higher serial number than @pos's. While | ||
3060 | * this path can be slower, it's taken only when either the current | ||
3061 | * cgroup is removed or iteration and removal race. | ||
3044 | */ | 3062 | */ |
3045 | if (likely(!cgroup_is_dead(pos))) { | 3063 | if (!pos) { |
3064 | next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); | ||
3065 | } else if (likely(!cgroup_is_dead(pos))) { | ||
3046 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); | 3066 | next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); |
3047 | if (&next->sibling != &pos->parent->children) | 3067 | } else { |
3048 | return next; | 3068 | list_for_each_entry_rcu(next, &cgrp->children, sibling) |
3049 | return NULL; | 3069 | if (next->serial_nr > pos->serial_nr) |
3070 | break; | ||
3050 | } | 3071 | } |
3051 | 3072 | ||
3052 | /* | 3073 | if (&next->sibling == &cgrp->children) |
3053 | * Can't dereference the next pointer. Each cgroup is given a | 3074 | return NULL; |
3054 | * monotonically increasing unique serial number and always | 3075 | |
3055 | * appended to the sibling list, so the next one can be found by | 3076 | return cgroup_css(next, parent_css->ss); |
3056 | * walking the parent's children until we see a cgroup with higher | ||
3057 | * serial number than @pos's. | ||
3058 | * | ||
3059 | * While this path can be slow, it's taken only when either the | ||
3060 | * current cgroup is removed or iteration and removal race. | ||
3061 | */ | ||
3062 | list_for_each_entry_rcu(next, &pos->parent->children, sibling) | ||
3063 | if (next->serial_nr > pos->serial_nr) | ||
3064 | return next; | ||
3065 | return NULL; | ||
3066 | } | 3077 | } |
3067 | EXPORT_SYMBOL_GPL(cgroup_next_sibling); | 3078 | EXPORT_SYMBOL_GPL(css_next_child); |
3068 | 3079 | ||
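A minimal consumer of the new child iterator; the walk must stay under rcu_read_lock(), but as the comment above notes it keeps working even if the current position is removed mid-iteration (parent_css and the counter are illustrative):

struct cgroup_subsys_state *pos = NULL;
int nr_children = 0;

rcu_read_lock();
while ((pos = css_next_child(pos, parent_css)))
	nr_children++;
rcu_read_unlock();

The css_for_each_child() helper used later in this patch expands to the same css_next_child() loop.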
3069 | /** | 3080 | /** |
3070 | * cgroup_next_descendant_pre - find the next descendant for pre-order walk | 3081 | * css_next_descendant_pre - find the next descendant for pre-order walk |
3071 | * @pos: the current position (%NULL to initiate traversal) | 3082 | * @pos: the current position (%NULL to initiate traversal) |
3072 | * @cgroup: cgroup whose descendants to walk | 3083 | * @root: css whose descendants to walk |
3073 | * | 3084 | * |
3074 | * To be used by cgroup_for_each_descendant_pre(). Find the next | 3085 | * To be used by css_for_each_descendant_pre(). Find the next descendant |
3075 | * descendant to visit for pre-order traversal of @cgroup's descendants. | 3086 | * to visit for pre-order traversal of @root's descendants. @root is |
3087 | * included in the iteration and the first node to be visited. | ||
3076 | * | 3088 | * |
3077 | * While this function requires RCU read locking, it doesn't require the | 3089 | * While this function requires RCU read locking, it doesn't require the |
3078 | * whole traversal to be contained in a single RCU critical section. This | 3090 | * whole traversal to be contained in a single RCU critical section. This |
3079 | * function will return the correct next descendant as long as both @pos | 3091 | * function will return the correct next descendant as long as both @pos |
3080 | * and @cgroup are accessible and @pos is a descendant of @cgroup. | 3092 | * and @root are accessible and @pos is a descendant of @root. |
3081 | */ | 3093 | */ |
3082 | struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, | 3094 | struct cgroup_subsys_state * |
3083 | struct cgroup *cgroup) | 3095 | css_next_descendant_pre(struct cgroup_subsys_state *pos, |
3096 | struct cgroup_subsys_state *root) | ||
3084 | { | 3097 | { |
3085 | struct cgroup *next; | 3098 | struct cgroup_subsys_state *next; |
3086 | 3099 | ||
3087 | WARN_ON_ONCE(!rcu_read_lock_held()); | 3100 | WARN_ON_ONCE(!rcu_read_lock_held()); |
3088 | 3101 | ||
3089 | /* if first iteration, pretend we just visited @cgroup */ | 3102 | /* if first iteration, visit @root */ |
3090 | if (!pos) | 3103 | if (!pos) |
3091 | pos = cgroup; | 3104 | return root; |
3092 | 3105 | ||
3093 | /* visit the first child if exists */ | 3106 | /* visit the first child if exists */ |
3094 | next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); | 3107 | next = css_next_child(NULL, pos); |
3095 | if (next) | 3108 | if (next) |
3096 | return next; | 3109 | return next; |
3097 | 3110 | ||
3098 | /* no child, visit my or the closest ancestor's next sibling */ | 3111 | /* no child, visit my or the closest ancestor's next sibling */ |
3099 | while (pos != cgroup) { | 3112 | while (pos != root) { |
3100 | next = cgroup_next_sibling(pos); | 3113 | next = css_next_child(pos, css_parent(pos)); |
3101 | if (next) | 3114 | if (next) |
3102 | return next; | 3115 | return next; |
3103 | pos = pos->parent; | 3116 | pos = css_parent(pos); |
3104 | } | 3117 | } |
3105 | 3118 | ||
3106 | return NULL; | 3119 | return NULL; |
3107 | } | 3120 | } |
3108 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); | 3121 | EXPORT_SYMBOL_GPL(css_next_descendant_pre); |
3109 | 3122 | ||
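Note the behavioural change spelled out in the new kerneldoc: the pre-order walk now starts at @root itself, so callers that only want proper descendants have to skip it. A sketch using the wrapper macro (root_css and the counter are illustrative):

struct cgroup_subsys_state *pos;
int nr_descendants = 0;

rcu_read_lock();
css_for_each_descendant_pre(pos, root_css) {
	if (pos == root_css)
		continue;	/* the origin css is part of the walk now */
	nr_descendants++;
}
rcu_read_unlock();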
3110 | /** | 3123 | /** |
3111 | * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup | 3124 | * css_rightmost_descendant - return the rightmost descendant of a css |
3112 | * @pos: cgroup of interest | 3125 | * @pos: css of interest |
3113 | * | 3126 | * |
3114 | * Return the rightmost descendant of @pos. If there's no descendant, | 3127 | * Return the rightmost descendant of @pos. If there's no descendant, @pos |
3115 | * @pos is returned. This can be used during pre-order traversal to skip | 3128 | * is returned. This can be used during pre-order traversal to skip |
3116 | * subtree of @pos. | 3129 | * subtree of @pos. |
3117 | * | 3130 | * |
3118 | * While this function requires RCU read locking, it doesn't require the | 3131 | * While this function requires RCU read locking, it doesn't require the |
@@ -3120,9 +3133,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); | |||
3120 | * function will return the correct rightmost descendant as long as @pos is | 3133 | * function will return the correct rightmost descendant as long as @pos is |
3121 | * accessible. | 3134 | * accessible. |
3122 | */ | 3135 | */ |
3123 | struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) | 3136 | struct cgroup_subsys_state * |
3137 | css_rightmost_descendant(struct cgroup_subsys_state *pos) | ||
3124 | { | 3138 | { |
3125 | struct cgroup *last, *tmp; | 3139 | struct cgroup_subsys_state *last, *tmp; |
3126 | 3140 | ||
3127 | WARN_ON_ONCE(!rcu_read_lock_held()); | 3141 | WARN_ON_ONCE(!rcu_read_lock_held()); |
3128 | 3142 | ||
@@ -3130,82 +3144,138 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) | |||
3130 | last = pos; | 3144 | last = pos; |
3131 | /* ->prev isn't RCU safe, walk ->next till the end */ | 3145 | /* ->prev isn't RCU safe, walk ->next till the end */ |
3132 | pos = NULL; | 3146 | pos = NULL; |
3133 | list_for_each_entry_rcu(tmp, &last->children, sibling) | 3147 | css_for_each_child(tmp, last) |
3134 | pos = tmp; | 3148 | pos = tmp; |
3135 | } while (pos); | 3149 | } while (pos); |
3136 | 3150 | ||
3137 | return last; | 3151 | return last; |
3138 | } | 3152 | } |
3139 | EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); | 3153 | EXPORT_SYMBOL_GPL(css_rightmost_descendant); |
3140 | 3154 | ||
3141 | static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) | 3155 | static struct cgroup_subsys_state * |
3156 | css_leftmost_descendant(struct cgroup_subsys_state *pos) | ||
3142 | { | 3157 | { |
3143 | struct cgroup *last; | 3158 | struct cgroup_subsys_state *last; |
3144 | 3159 | ||
3145 | do { | 3160 | do { |
3146 | last = pos; | 3161 | last = pos; |
3147 | pos = list_first_or_null_rcu(&pos->children, struct cgroup, | 3162 | pos = css_next_child(NULL, pos); |
3148 | sibling); | ||
3149 | } while (pos); | 3163 | } while (pos); |
3150 | 3164 | ||
3151 | return last; | 3165 | return last; |
3152 | } | 3166 | } |
3153 | 3167 | ||
3154 | /** | 3168 | /** |
3155 | * cgroup_next_descendant_post - find the next descendant for post-order walk | 3169 | * css_next_descendant_post - find the next descendant for post-order walk |
3156 | * @pos: the current position (%NULL to initiate traversal) | 3170 | * @pos: the current position (%NULL to initiate traversal) |
3157 | * @cgroup: cgroup whose descendants to walk | 3171 | * @root: css whose descendants to walk |
3158 | * | 3172 | * |
3159 | * To be used by cgroup_for_each_descendant_post(). Find the next | 3173 | * To be used by css_for_each_descendant_post(). Find the next descendant |
3160 | * descendant to visit for post-order traversal of @cgroup's descendants. | 3174 | * to visit for post-order traversal of @root's descendants. @root is |
3175 | * included in the iteration and the last node to be visited. | ||
3161 | * | 3176 | * |
3162 | * While this function requires RCU read locking, it doesn't require the | 3177 | * While this function requires RCU read locking, it doesn't require the |
3163 | * whole traversal to be contained in a single RCU critical section. This | 3178 | * whole traversal to be contained in a single RCU critical section. This |
3164 | * function will return the correct next descendant as long as both @pos | 3179 | * function will return the correct next descendant as long as both @pos |
3165 | * and @cgroup are accessible and @pos is a descendant of @cgroup. | 3180 | * and @cgroup are accessible and @pos is a descendant of @cgroup. |
3166 | */ | 3181 | */ |
3167 | struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, | 3182 | struct cgroup_subsys_state * |
3168 | struct cgroup *cgroup) | 3183 | css_next_descendant_post(struct cgroup_subsys_state *pos, |
3184 | struct cgroup_subsys_state *root) | ||
3169 | { | 3185 | { |
3170 | struct cgroup *next; | 3186 | struct cgroup_subsys_state *next; |
3171 | 3187 | ||
3172 | WARN_ON_ONCE(!rcu_read_lock_held()); | 3188 | WARN_ON_ONCE(!rcu_read_lock_held()); |
3173 | 3189 | ||
3174 | /* if first iteration, visit the leftmost descendant */ | 3190 | /* if first iteration, visit the leftmost descendant */ |
3175 | if (!pos) { | 3191 | if (!pos) { |
3176 | next = cgroup_leftmost_descendant(cgroup); | 3192 | next = css_leftmost_descendant(root); |
3177 | return next != cgroup ? next : NULL; | 3193 | return next != root ? next : NULL; |
3178 | } | 3194 | } |
3179 | 3195 | ||
3196 | /* if we visited @root, we're done */ | ||
3197 | if (pos == root) | ||
3198 | return NULL; | ||
3199 | |||
3180 | /* if there's an unvisited sibling, visit its leftmost descendant */ | 3200 | /* if there's an unvisited sibling, visit its leftmost descendant */ |
3181 | next = cgroup_next_sibling(pos); | 3201 | next = css_next_child(pos, css_parent(pos)); |
3182 | if (next) | 3202 | if (next) |
3183 | return cgroup_leftmost_descendant(next); | 3203 | return css_leftmost_descendant(next); |
3184 | 3204 | ||
3185 | /* no sibling left, visit parent */ | 3205 | /* no sibling left, visit parent */ |
3186 | next = pos->parent; | 3206 | return css_parent(pos); |
3187 | return next != cgroup ? next : NULL; | 3207 | } |
3208 | EXPORT_SYMBOL_GPL(css_next_descendant_post); | ||
3209 | |||
3210 | /** | ||
3211 | * css_advance_task_iter - advance a task itererator to the next css_set | ||
3212 | * @it: the iterator to advance | ||
3213 | * | ||
3214 | * Advance @it to the next css_set to walk. | ||
3215 | */ | ||
3216 | static void css_advance_task_iter(struct css_task_iter *it) | ||
3217 | { | ||
3218 | struct list_head *l = it->cset_link; | ||
3219 | struct cgrp_cset_link *link; | ||
3220 | struct css_set *cset; | ||
3221 | |||
3222 | /* Advance to the next non-empty css_set */ | ||
3223 | do { | ||
3224 | l = l->next; | ||
3225 | if (l == &it->origin_css->cgroup->cset_links) { | ||
3226 | it->cset_link = NULL; | ||
3227 | return; | ||
3228 | } | ||
3229 | link = list_entry(l, struct cgrp_cset_link, cset_link); | ||
3230 | cset = link->cset; | ||
3231 | } while (list_empty(&cset->tasks)); | ||
3232 | it->cset_link = l; | ||
3233 | it->task = cset->tasks.next; | ||
3188 | } | 3234 | } |
3189 | EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); | ||
3190 | 3235 | ||
3191 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) | 3236 | /** |
3237 | * css_task_iter_start - initiate task iteration | ||
3238 | * @css: the css to walk tasks of | ||
3239 | * @it: the task iterator to use | ||
3240 | * | ||
3241 | * Initiate iteration through the tasks of @css. The caller can call | ||
3242 | * css_task_iter_next() to walk through the tasks until the function | ||
3243 | * returns NULL. On completion of iteration, css_task_iter_end() must be | ||
3244 | * called. | ||
3245 | * | ||
3246 | * Note that this function acquires a lock which is released when the | ||
3247 | * iteration finishes. The caller can't sleep while iteration is in | ||
3248 | * progress. | ||
3249 | */ | ||
3250 | void css_task_iter_start(struct cgroup_subsys_state *css, | ||
3251 | struct css_task_iter *it) | ||
3192 | __acquires(css_set_lock) | 3252 | __acquires(css_set_lock) |
3193 | { | 3253 | { |
3194 | /* | 3254 | /* |
3195 | * The first time anyone tries to iterate across a cgroup, | 3255 | * The first time anyone tries to iterate across a css, we need to |
3196 | * we need to enable the list linking each css_set to its | 3256 | * enable the list linking each css_set to its tasks, and fix up |
3197 | * tasks, and fix up all existing tasks. | 3257 | * all existing tasks. |
3198 | */ | 3258 | */ |
3199 | if (!use_task_css_set_links) | 3259 | if (!use_task_css_set_links) |
3200 | cgroup_enable_task_cg_lists(); | 3260 | cgroup_enable_task_cg_lists(); |
3201 | 3261 | ||
3202 | read_lock(&css_set_lock); | 3262 | read_lock(&css_set_lock); |
3203 | it->cset_link = &cgrp->cset_links; | 3263 | |
3204 | cgroup_advance_iter(cgrp, it); | 3264 | it->origin_css = css; |
3265 | it->cset_link = &css->cgroup->cset_links; | ||
3266 | |||
3267 | css_advance_task_iter(it); | ||
3205 | } | 3268 | } |
3206 | 3269 | ||
3207 | struct task_struct *cgroup_iter_next(struct cgroup *cgrp, | 3270 | /** |
3208 | struct cgroup_iter *it) | 3271 | * css_task_iter_next - return the next task for the iterator |
3272 | * @it: the task iterator being iterated | ||
3273 | * | ||
3274 | * The "next" function for task iteration. @it should have been | ||
3275 | * initialized via css_task_iter_start(). Returns NULL when the iteration | ||
3276 | * reaches the end. | ||
3277 | */ | ||
3278 | struct task_struct *css_task_iter_next(struct css_task_iter *it) | ||
3209 | { | 3279 | { |
3210 | struct task_struct *res; | 3280 | struct task_struct *res; |
3211 | struct list_head *l = it->task; | 3281 | struct list_head *l = it->task; |
@@ -3219,16 +3289,24 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, | |||
3219 | l = l->next; | 3289 | l = l->next; |
3220 | link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); | 3290 | link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); |
3221 | if (l == &link->cset->tasks) { | 3291 | if (l == &link->cset->tasks) { |
3222 | /* We reached the end of this task list - move on to | 3292 | /* |
3223 | * the next cg_cgroup_link */ | 3293 | * We reached the end of this task list - move on to the |
3224 | cgroup_advance_iter(cgrp, it); | 3294 | * next cgrp_cset_link. |
3295 | */ | ||
3296 | css_advance_task_iter(it); | ||
3225 | } else { | 3297 | } else { |
3226 | it->task = l; | 3298 | it->task = l; |
3227 | } | 3299 | } |
3228 | return res; | 3300 | return res; |
3229 | } | 3301 | } |
3230 | 3302 | ||
3231 | void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) | 3303 | /** |
3304 | * css_task_iter_end - finish task iteration | ||
3305 | * @it: the task iterator to finish | ||
3306 | * | ||
3307 | * Finish task iteration started by css_task_iter_start(). | ||
3308 | */ | ||
3309 | void css_task_iter_end(struct css_task_iter *it) | ||
3232 | __releases(css_set_lock) | 3310 | __releases(css_set_lock) |
3233 | { | 3311 | { |
3234 | read_unlock(&css_set_lock); | 3312 | read_unlock(&css_set_lock); |
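A minimal user of the task iterator introduced above; css_set_lock is read-held from _start() to _end(), so nothing in the loop may sleep (the counter is illustrative):

struct css_task_iter it;
struct task_struct *task;
int nr_tasks = 0;

css_task_iter_start(css, &it);
while ((task = css_task_iter_next(&it)))
	nr_tasks++;		/* must not sleep while iterating */
css_task_iter_end(&it);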
@@ -3269,46 +3347,49 @@ static inline int started_after(void *p1, void *p2) | |||
3269 | } | 3347 | } |
3270 | 3348 | ||
3271 | /** | 3349 | /** |
3272 | * cgroup_scan_tasks - iterate though all the tasks in a cgroup | 3350 | * css_scan_tasks - iterate though all the tasks in a css |
3273 | * @scan: struct cgroup_scanner containing arguments for the scan | 3351 | * @css: the css to iterate tasks of |
3352 | * @test: optional test callback | ||
3353 | * @process: process callback | ||
3354 | * @data: data passed to @test and @process | ||
3355 | * @heap: optional pre-allocated heap used for task iteration | ||
3356 | * | ||
3357 | * Iterate through all the tasks in @css, calling @test for each, and if it | ||
3358 | * returns %true, call @process for it also. | ||
3274 | * | 3359 | * |
3275 | * Arguments include pointers to callback functions test_task() and | 3360 | * @test may be NULL, meaning always true (select all tasks), which |
3276 | * process_task(). | 3361 | * effectively duplicates css_task_iter_{start,next,end}() but does not |
3277 | * Iterate through all the tasks in a cgroup, calling test_task() for each, | 3362 | * lock css_set_lock for the call to @process. |
3278 | * and if it returns true, call process_task() for it also. | ||
3279 | * The test_task pointer may be NULL, meaning always true (select all tasks). | ||
3280 | * Effectively duplicates cgroup_iter_{start,next,end}() | ||
3281 | * but does not lock css_set_lock for the call to process_task(). | ||
3282 | * The struct cgroup_scanner may be embedded in any structure of the caller's | ||
3283 | * creation. | ||
3284 | * It is guaranteed that process_task() will act on every task that | ||
3285 | * is a member of the cgroup for the duration of this call. This | ||
3286 | * function may or may not call process_task() for tasks that exit | ||
3287 | * or move to a different cgroup during the call, or are forked or | ||
3288 | * move into the cgroup during the call. | ||
3289 | * | 3363 | * |
3290 | * Note that test_task() may be called with locks held, and may in some | 3364 | * It is guaranteed that @process will act on every task that is a member |
3291 | * situations be called multiple times for the same task, so it should | 3365 | * of @css for the duration of this call. This function may or may not |
3292 | * be cheap. | 3366 | * call @process for tasks that exit or move to a different css during the |
3293 | * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been | 3367 | * call, or are forked or move into the css during the call. |
3294 | * pre-allocated and will be used for heap operations (and its "gt" member will | 3368 | * |
3295 | * be overwritten), else a temporary heap will be used (allocation of which | 3369 | * Note that @test may be called with locks held, and may in some |
3296 | * may cause this function to fail). | 3370 | * situations be called multiple times for the same task, so it should be |
3371 | * cheap. | ||
3372 | * | ||
3373 | * If @heap is non-NULL, a heap has been pre-allocated and will be used for | ||
3374 | * heap operations (and its "gt" member will be overwritten), else a | ||
3375 | * temporary heap will be used (allocation of which may cause this function | ||
3376 | * to fail). | ||
3297 | */ | 3377 | */ |
3298 | int cgroup_scan_tasks(struct cgroup_scanner *scan) | 3378 | int css_scan_tasks(struct cgroup_subsys_state *css, |
3379 | bool (*test)(struct task_struct *, void *), | ||
3380 | void (*process)(struct task_struct *, void *), | ||
3381 | void *data, struct ptr_heap *heap) | ||
3299 | { | 3382 | { |
3300 | int retval, i; | 3383 | int retval, i; |
3301 | struct cgroup_iter it; | 3384 | struct css_task_iter it; |
3302 | struct task_struct *p, *dropped; | 3385 | struct task_struct *p, *dropped; |
3303 | /* Never dereference latest_task, since it's not refcounted */ | 3386 | /* Never dereference latest_task, since it's not refcounted */ |
3304 | struct task_struct *latest_task = NULL; | 3387 | struct task_struct *latest_task = NULL; |
3305 | struct ptr_heap tmp_heap; | 3388 | struct ptr_heap tmp_heap; |
3306 | struct ptr_heap *heap; | ||
3307 | struct timespec latest_time = { 0, 0 }; | 3389 | struct timespec latest_time = { 0, 0 }; |
3308 | 3390 | ||
3309 | if (scan->heap) { | 3391 | if (heap) { |
3310 | /* The caller supplied our heap and pre-allocated its memory */ | 3392 | /* The caller supplied our heap and pre-allocated its memory */ |
3311 | heap = scan->heap; | ||
3312 | heap->gt = &started_after; | 3393 | heap->gt = &started_after; |
3313 | } else { | 3394 | } else { |
3314 | /* We need to allocate our own heap memory */ | 3395 | /* We need to allocate our own heap memory */ |
@@ -3321,25 +3402,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
3321 | 3402 | ||
3322 | again: | 3403 | again: |
3323 | /* | 3404 | /* |
3324 | * Scan tasks in the cgroup, using the scanner's "test_task" callback | 3405 | * Scan tasks in the css, using the @test callback to determine |
3325 | * to determine which are of interest, and using the scanner's | 3406 | * which are of interest, and invoking @process callback on the |
3326 | * "process_task" callback to process any of them that need an update. | 3407 | * ones which need an update. Since we don't want to hold any |
3327 | * Since we don't want to hold any locks during the task updates, | 3408 | * locks during the task updates, gather tasks to be processed in a |
3328 | * gather tasks to be processed in a heap structure. | 3409 | * heap structure. The heap is sorted by descending task start |
3329 | * The heap is sorted by descending task start time. | 3410 | * time. If the statically-sized heap fills up, we overflow tasks |
3330 | * If the statically-sized heap fills up, we overflow tasks that | 3411 | * that started later, and in future iterations only consider tasks |
3331 | * started later, and in future iterations only consider tasks that | 3412 | * that started after the latest task in the previous pass. This |
3332 | * started after the latest task in the previous pass. This | ||
3333 | * guarantees forward progress and that we don't miss any tasks. | 3413 | * guarantees forward progress and that we don't miss any tasks. |
3334 | */ | 3414 | */ |
3335 | heap->size = 0; | 3415 | heap->size = 0; |
3336 | cgroup_iter_start(scan->cg, &it); | 3416 | css_task_iter_start(css, &it); |
3337 | while ((p = cgroup_iter_next(scan->cg, &it))) { | 3417 | while ((p = css_task_iter_next(&it))) { |
3338 | /* | 3418 | /* |
3339 | * Only affect tasks that qualify per the caller's callback, | 3419 | * Only affect tasks that qualify per the caller's callback, |
3340 | * if he provided one | 3420 | * if he provided one |
3341 | */ | 3421 | */ |
3342 | if (scan->test_task && !scan->test_task(p, scan)) | 3422 | if (test && !test(p, data)) |
3343 | continue; | 3423 | continue; |
3344 | /* | 3424 | /* |
3345 | * Only process tasks that started after the last task | 3425 | * Only process tasks that started after the last task |
@@ -3367,7 +3447,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
3367 | * the heap and wasn't inserted | 3447 | * the heap and wasn't inserted |
3368 | */ | 3448 | */ |
3369 | } | 3449 | } |
3370 | cgroup_iter_end(scan->cg, &it); | 3450 | css_task_iter_end(&it); |
3371 | 3451 | ||
3372 | if (heap->size) { | 3452 | if (heap->size) { |
3373 | for (i = 0; i < heap->size; i++) { | 3453 | for (i = 0; i < heap->size; i++) { |
@@ -3377,7 +3457,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
3377 | latest_task = q; | 3457 | latest_task = q; |
3378 | } | 3458 | } |
3379 | /* Process the task per the caller's callback */ | 3459 | /* Process the task per the caller's callback */ |
3380 | scan->process_task(q, scan); | 3460 | process(q, data); |
3381 | put_task_struct(q); | 3461 | put_task_struct(q); |
3382 | } | 3462 | } |
3383 | /* | 3463 | /* |
@@ -3394,10 +3474,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) | |||
3394 | return 0; | 3474 | return 0; |
3395 | } | 3475 | } |
3396 | 3476 | ||
3397 | static void cgroup_transfer_one_task(struct task_struct *task, | 3477 | static void cgroup_transfer_one_task(struct task_struct *task, void *data) |
3398 | struct cgroup_scanner *scan) | ||
3399 | { | 3478 | { |
3400 | struct cgroup *new_cgroup = scan->data; | 3479 | struct cgroup *new_cgroup = data; |
3401 | 3480 | ||
3402 | mutex_lock(&cgroup_mutex); | 3481 | mutex_lock(&cgroup_mutex); |
3403 | cgroup_attach_task(new_cgroup, task, false); | 3482 | cgroup_attach_task(new_cgroup, task, false); |
@@ -3411,15 +3490,8 @@ static void cgroup_transfer_one_task(struct task_struct *task, | |||
3411 | */ | 3490 | */ |
3412 | int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | 3491 | int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) |
3413 | { | 3492 | { |
3414 | struct cgroup_scanner scan; | 3493 | return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task, |
3415 | 3494 | to, NULL); | |
3416 | scan.cg = from; | ||
3417 | scan.test_task = NULL; /* select all tasks in cgroup */ | ||
3418 | scan.process_task = cgroup_transfer_one_task; | ||
3419 | scan.heap = NULL; | ||
3420 | scan.data = to; | ||
3421 | |||
3422 | return cgroup_scan_tasks(&scan); | ||
3423 | } | 3495 | } |
3424 | 3496 | ||
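cgroup_transfer_tasks() above shows the degenerate css_scan_tasks() call: a NULL @test selects every task and @data carries the destination cgroup. A sketch of a caller that does filter, assuming illustrative callbacks (counting non-kernel threads is not something this patch does):

static bool my_is_user_task(struct task_struct *task, void *data)
{
	/* @test may run with locks held and can be called more than once
	 * for the same task, so keep it cheap */
	return !(task->flags & PF_KTHREAD);
}

static void my_count_task(struct task_struct *task, void *data)
{
	atomic_inc((atomic_t *)data);
}

/* caller */
atomic_t nr_user_tasks = ATOMIC_INIT(0);
int ret = css_scan_tasks(css, my_is_user_task, my_count_task,
			 &nr_user_tasks, NULL);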
3425 | /* | 3497 | /* |
@@ -3461,7 +3533,7 @@ struct cgroup_pidlist { | |||
3461 | /* pointer to the cgroup we belong to, for list removal purposes */ | 3533 | /* pointer to the cgroup we belong to, for list removal purposes */ |
3462 | struct cgroup *owner; | 3534 | struct cgroup *owner; |
3463 | /* protects the other fields */ | 3535 | /* protects the other fields */ |
3464 | struct rw_semaphore mutex; | 3536 | struct rw_semaphore rwsem; |
3465 | }; | 3537 | }; |
3466 | 3538 | ||
3467 | /* | 3539 | /* |
@@ -3534,7 +3606,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
3534 | struct pid_namespace *ns = task_active_pid_ns(current); | 3606 | struct pid_namespace *ns = task_active_pid_ns(current); |
3535 | 3607 | ||
3536 | /* | 3608 | /* |
3537 | * We can't drop the pidlist_mutex before taking the l->mutex in case | 3609 | * We can't drop the pidlist_mutex before taking the l->rwsem in case |
3538 | * the last ref-holder is trying to remove l from the list at the same | 3610 | * the last ref-holder is trying to remove l from the list at the same |
3539 | * time. Holding the pidlist_mutex precludes somebody taking whichever | 3611 | * time. Holding the pidlist_mutex precludes somebody taking whichever |
3540 | * list we find out from under us - compare release_pid_array(). | 3612 | * list we find out from under us - compare release_pid_array(). |
@@ -3543,7 +3615,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
3543 | list_for_each_entry(l, &cgrp->pidlists, links) { | 3615 | list_for_each_entry(l, &cgrp->pidlists, links) { |
3544 | if (l->key.type == type && l->key.ns == ns) { | 3616 | if (l->key.type == type && l->key.ns == ns) { |
3545 | /* make sure l doesn't vanish out from under us */ | 3617 | /* make sure l doesn't vanish out from under us */ |
3546 | down_write(&l->mutex); | 3618 | down_write(&l->rwsem); |
3547 | mutex_unlock(&cgrp->pidlist_mutex); | 3619 | mutex_unlock(&cgrp->pidlist_mutex); |
3548 | return l; | 3620 | return l; |
3549 | } | 3621 | } |
@@ -3554,8 +3626,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
3554 | mutex_unlock(&cgrp->pidlist_mutex); | 3626 | mutex_unlock(&cgrp->pidlist_mutex); |
3555 | return l; | 3627 | return l; |
3556 | } | 3628 | } |
3557 | init_rwsem(&l->mutex); | 3629 | init_rwsem(&l->rwsem); |
3558 | down_write(&l->mutex); | 3630 | down_write(&l->rwsem); |
3559 | l->key.type = type; | 3631 | l->key.type = type; |
3560 | l->key.ns = get_pid_ns(ns); | 3632 | l->key.ns = get_pid_ns(ns); |
3561 | l->owner = cgrp; | 3633 | l->owner = cgrp; |
@@ -3573,7 +3645,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
3573 | pid_t *array; | 3645 | pid_t *array; |
3574 | int length; | 3646 | int length; |
3575 | int pid, n = 0; /* used for populating the array */ | 3647 | int pid, n = 0; /* used for populating the array */ |
3576 | struct cgroup_iter it; | 3648 | struct css_task_iter it; |
3577 | struct task_struct *tsk; | 3649 | struct task_struct *tsk; |
3578 | struct cgroup_pidlist *l; | 3650 | struct cgroup_pidlist *l; |
3579 | 3651 | ||
@@ -3588,8 +3660,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
3588 | if (!array) | 3660 | if (!array) |
3589 | return -ENOMEM; | 3661 | return -ENOMEM; |
3590 | /* now, populate the array */ | 3662 | /* now, populate the array */ |
3591 | cgroup_iter_start(cgrp, &it); | 3663 | css_task_iter_start(&cgrp->dummy_css, &it); |
3592 | while ((tsk = cgroup_iter_next(cgrp, &it))) { | 3664 | while ((tsk = css_task_iter_next(&it))) { |
3593 | if (unlikely(n == length)) | 3665 | if (unlikely(n == length)) |
3594 | break; | 3666 | break; |
3595 | /* get tgid or pid for procs or tasks file respectively */ | 3667 | /* get tgid or pid for procs or tasks file respectively */ |
@@ -3600,7 +3672,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
3600 | if (pid > 0) /* make sure to only use valid results */ | 3672 | if (pid > 0) /* make sure to only use valid results */ |
3601 | array[n++] = pid; | 3673 | array[n++] = pid; |
3602 | } | 3674 | } |
3603 | cgroup_iter_end(cgrp, &it); | 3675 | css_task_iter_end(&it); |
3604 | length = n; | 3676 | length = n; |
3605 | /* now sort & (if procs) strip out duplicates */ | 3677 | /* now sort & (if procs) strip out duplicates */ |
3606 | sort(array, length, sizeof(pid_t), cmppid, NULL); | 3678 | sort(array, length, sizeof(pid_t), cmppid, NULL); |
@@ -3616,7 +3688,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, | |||
3616 | l->list = array; | 3688 | l->list = array; |
3617 | l->length = length; | 3689 | l->length = length; |
3618 | l->use_count++; | 3690 | l->use_count++; |
3619 | up_write(&l->mutex); | 3691 | up_write(&l->rwsem); |
3620 | *lp = l; | 3692 | *lp = l; |
3621 | return 0; | 3693 | return 0; |
3622 | } | 3694 | } |
@@ -3634,7 +3706,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
3634 | { | 3706 | { |
3635 | int ret = -EINVAL; | 3707 | int ret = -EINVAL; |
3636 | struct cgroup *cgrp; | 3708 | struct cgroup *cgrp; |
3637 | struct cgroup_iter it; | 3709 | struct css_task_iter it; |
3638 | struct task_struct *tsk; | 3710 | struct task_struct *tsk; |
3639 | 3711 | ||
3640 | /* | 3712 | /* |
@@ -3648,8 +3720,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
3648 | ret = 0; | 3720 | ret = 0; |
3649 | cgrp = dentry->d_fsdata; | 3721 | cgrp = dentry->d_fsdata; |
3650 | 3722 | ||
3651 | cgroup_iter_start(cgrp, &it); | 3723 | css_task_iter_start(&cgrp->dummy_css, &it); |
3652 | while ((tsk = cgroup_iter_next(cgrp, &it))) { | 3724 | while ((tsk = css_task_iter_next(&it))) { |
3653 | switch (tsk->state) { | 3725 | switch (tsk->state) { |
3654 | case TASK_RUNNING: | 3726 | case TASK_RUNNING: |
3655 | stats->nr_running++; | 3727 | stats->nr_running++; |
@@ -3669,7 +3741,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) | |||
3669 | break; | 3741 | break; |
3670 | } | 3742 | } |
3671 | } | 3743 | } |
3672 | cgroup_iter_end(cgrp, &it); | 3744 | css_task_iter_end(&it); |
3673 | 3745 | ||
3674 | err: | 3746 | err: |
3675 | return ret; | 3747 | return ret; |
@@ -3694,7 +3766,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) | |||
3694 | int index = 0, pid = *pos; | 3766 | int index = 0, pid = *pos; |
3695 | int *iter; | 3767 | int *iter; |
3696 | 3768 | ||
3697 | down_read(&l->mutex); | 3769 | down_read(&l->rwsem); |
3698 | if (pid) { | 3770 | if (pid) { |
3699 | int end = l->length; | 3771 | int end = l->length; |
3700 | 3772 | ||
@@ -3721,7 +3793,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) | |||
3721 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) | 3793 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) |
3722 | { | 3794 | { |
3723 | struct cgroup_pidlist *l = s->private; | 3795 | struct cgroup_pidlist *l = s->private; |
3724 | up_read(&l->mutex); | 3796 | up_read(&l->rwsem); |
3725 | } | 3797 | } |
3726 | 3798 | ||
3727 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) | 3799 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) |
@@ -3767,7 +3839,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) | |||
3767 | * pidlist_mutex, we have to take pidlist_mutex first. | 3839 | * pidlist_mutex, we have to take pidlist_mutex first. |
3768 | */ | 3840 | */ |
3769 | mutex_lock(&l->owner->pidlist_mutex); | 3841 | mutex_lock(&l->owner->pidlist_mutex); |
3770 | down_write(&l->mutex); | 3842 | down_write(&l->rwsem); |
3771 | BUG_ON(!l->use_count); | 3843 | BUG_ON(!l->use_count); |
3772 | if (!--l->use_count) { | 3844 | if (!--l->use_count) { |
3773 | /* we're the last user if refcount is 0; remove and free */ | 3845 | /* we're the last user if refcount is 0; remove and free */ |
@@ -3775,12 +3847,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) | |||
3775 | mutex_unlock(&l->owner->pidlist_mutex); | 3847 | mutex_unlock(&l->owner->pidlist_mutex); |
3776 | pidlist_free(l->list); | 3848 | pidlist_free(l->list); |
3777 | put_pid_ns(l->key.ns); | 3849 | put_pid_ns(l->key.ns); |
3778 | up_write(&l->mutex); | 3850 | up_write(&l->rwsem); |
3779 | kfree(l); | 3851 | kfree(l); |
3780 | return; | 3852 | return; |
3781 | } | 3853 | } |
3782 | mutex_unlock(&l->owner->pidlist_mutex); | 3854 | mutex_unlock(&l->owner->pidlist_mutex); |
3783 | up_write(&l->mutex); | 3855 | up_write(&l->rwsem); |
3784 | } | 3856 | } |
3785 | 3857 | ||
3786 | static int cgroup_pidlist_release(struct inode *inode, struct file *file) | 3858 | static int cgroup_pidlist_release(struct inode *inode, struct file *file) |
@@ -3844,21 +3916,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file) | |||
3844 | return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); | 3916 | return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); |
3845 | } | 3917 | } |
3846 | 3918 | ||
3847 | static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, | 3919 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, |
3848 | struct cftype *cft) | 3920 | struct cftype *cft) |
3849 | { | 3921 | { |
3850 | return notify_on_release(cgrp); | 3922 | return notify_on_release(css->cgroup); |
3851 | } | 3923 | } |
3852 | 3924 | ||
3853 | static int cgroup_write_notify_on_release(struct cgroup *cgrp, | 3925 | static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, |
3854 | struct cftype *cft, | 3926 | struct cftype *cft, u64 val) |
3855 | u64 val) | ||
3856 | { | 3927 | { |
3857 | clear_bit(CGRP_RELEASABLE, &cgrp->flags); | 3928 | clear_bit(CGRP_RELEASABLE, &css->cgroup->flags); |
3858 | if (val) | 3929 | if (val) |
3859 | set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 3930 | set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); |
3860 | else | 3931 | else |
3861 | clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 3932 | clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); |
3862 | return 0; | 3933 | return 0; |
3863 | } | 3934 | } |
3864 | 3935 | ||
@@ -3888,18 +3959,18 @@ static void cgroup_event_remove(struct work_struct *work) | |||
3888 | { | 3959 | { |
3889 | struct cgroup_event *event = container_of(work, struct cgroup_event, | 3960 | struct cgroup_event *event = container_of(work, struct cgroup_event, |
3890 | remove); | 3961 | remove); |
3891 | struct cgroup *cgrp = event->cgrp; | 3962 | struct cgroup_subsys_state *css = event->css; |
3892 | 3963 | ||
3893 | remove_wait_queue(event->wqh, &event->wait); | 3964 | remove_wait_queue(event->wqh, &event->wait); |
3894 | 3965 | ||
3895 | event->cft->unregister_event(cgrp, event->cft, event->eventfd); | 3966 | event->cft->unregister_event(css, event->cft, event->eventfd); |
3896 | 3967 | ||
3897 | /* Notify userspace the event is going away. */ | 3968 | /* Notify userspace the event is going away. */ |
3898 | eventfd_signal(event->eventfd, 1); | 3969 | eventfd_signal(event->eventfd, 1); |
3899 | 3970 | ||
3900 | eventfd_ctx_put(event->eventfd); | 3971 | eventfd_ctx_put(event->eventfd); |
3901 | kfree(event); | 3972 | kfree(event); |
3902 | cgroup_dput(cgrp); | 3973 | css_put(css); |
3903 | } | 3974 | } |
3904 | 3975 | ||
3905 | /* | 3976 | /* |
@@ -3912,7 +3983,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, | |||
3912 | { | 3983 | { |
3913 | struct cgroup_event *event = container_of(wait, | 3984 | struct cgroup_event *event = container_of(wait, |
3914 | struct cgroup_event, wait); | 3985 | struct cgroup_event, wait); |
3915 | struct cgroup *cgrp = event->cgrp; | 3986 | struct cgroup *cgrp = event->css->cgroup; |
3916 | unsigned long flags = (unsigned long)key; | 3987 | unsigned long flags = (unsigned long)key; |
3917 | 3988 | ||
3918 | if (flags & POLLHUP) { | 3989 | if (flags & POLLHUP) { |
@@ -3956,14 +4027,15 @@ static void cgroup_event_ptable_queue_proc(struct file *file, | |||
3956 | * Input must be in format '<event_fd> <control_fd> <args>'. | 4027 | * Input must be in format '<event_fd> <control_fd> <args>'. |
3957 | * Interpretation of args is defined by control file implementation. | 4028 | * Interpretation of args is defined by control file implementation. |
3958 | */ | 4029 | */ |
3959 | static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | 4030 | static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, |
3960 | const char *buffer) | 4031 | struct cftype *cft, const char *buffer) |
3961 | { | 4032 | { |
3962 | struct cgroup_event *event = NULL; | 4033 | struct cgroup *cgrp = dummy_css->cgroup; |
3963 | struct cgroup *cgrp_cfile; | 4034 | struct cgroup_event *event; |
4035 | struct cgroup_subsys_state *cfile_css; | ||
3964 | unsigned int efd, cfd; | 4036 | unsigned int efd, cfd; |
3965 | struct file *efile = NULL; | 4037 | struct file *efile; |
3966 | struct file *cfile = NULL; | 4038 | struct file *cfile; |
3967 | char *endp; | 4039 | char *endp; |
3968 | int ret; | 4040 | int ret; |
3969 | 4041 | ||
@@ -3980,7 +4052,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
3980 | event = kzalloc(sizeof(*event), GFP_KERNEL); | 4052 | event = kzalloc(sizeof(*event), GFP_KERNEL); |
3981 | if (!event) | 4053 | if (!event) |
3982 | return -ENOMEM; | 4054 | return -ENOMEM; |
3983 | event->cgrp = cgrp; | 4055 | |
3984 | INIT_LIST_HEAD(&event->list); | 4056 | INIT_LIST_HEAD(&event->list); |
3985 | init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); | 4057 | init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); |
3986 | init_waitqueue_func_entry(&event->wait, cgroup_event_wake); | 4058 | init_waitqueue_func_entry(&event->wait, cgroup_event_wake); |
@@ -3989,62 +4061,68 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
3989 | efile = eventfd_fget(efd); | 4061 | efile = eventfd_fget(efd); |
3990 | if (IS_ERR(efile)) { | 4062 | if (IS_ERR(efile)) { |
3991 | ret = PTR_ERR(efile); | 4063 | ret = PTR_ERR(efile); |
3992 | goto fail; | 4064 | goto out_kfree; |
3993 | } | 4065 | } |
3994 | 4066 | ||
3995 | event->eventfd = eventfd_ctx_fileget(efile); | 4067 | event->eventfd = eventfd_ctx_fileget(efile); |
3996 | if (IS_ERR(event->eventfd)) { | 4068 | if (IS_ERR(event->eventfd)) { |
3997 | ret = PTR_ERR(event->eventfd); | 4069 | ret = PTR_ERR(event->eventfd); |
3998 | goto fail; | 4070 | goto out_put_efile; |
3999 | } | 4071 | } |
4000 | 4072 | ||
4001 | cfile = fget(cfd); | 4073 | cfile = fget(cfd); |
4002 | if (!cfile) { | 4074 | if (!cfile) { |
4003 | ret = -EBADF; | 4075 | ret = -EBADF; |
4004 | goto fail; | 4076 | goto out_put_eventfd; |
4005 | } | 4077 | } |
4006 | 4078 | ||
4007 | /* the process needs read permission on the control file */ | 4079 | /* the process needs read permission on the control file */ |
4008 | /* AV: shouldn't we check that it's been opened for read instead? */ | 4080 | /* AV: shouldn't we check that it's been opened for read instead? */ |
4009 | ret = inode_permission(file_inode(cfile), MAY_READ); | 4081 | ret = inode_permission(file_inode(cfile), MAY_READ); |
4010 | if (ret < 0) | 4082 | if (ret < 0) |
4011 | goto fail; | 4083 | goto out_put_cfile; |
4012 | 4084 | ||
4013 | event->cft = __file_cft(cfile); | 4085 | event->cft = __file_cft(cfile); |
4014 | if (IS_ERR(event->cft)) { | 4086 | if (IS_ERR(event->cft)) { |
4015 | ret = PTR_ERR(event->cft); | 4087 | ret = PTR_ERR(event->cft); |
4016 | goto fail; | 4088 | goto out_put_cfile; |
4089 | } | ||
4090 | |||
4091 | if (!event->cft->ss) { | ||
4092 | ret = -EBADF; | ||
4093 | goto out_put_cfile; | ||
4017 | } | 4094 | } |
4018 | 4095 | ||
4019 | /* | 4096 | /* |
4020 | * The file to be monitored must be in the same cgroup as | 4097 | * Determine the css of @cfile, verify it belongs to the same |
4021 | * cgroup.event_control is. | 4098 | * cgroup as cgroup.event_control, and associate @event with it. |
4099 | * Remaining events are automatically removed on cgroup destruction | ||
4100 | * but the removal is asynchronous, so take an extra ref. | ||
4022 | */ | 4101 | */ |
4023 | cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); | 4102 | rcu_read_lock(); |
4024 | if (cgrp_cfile != cgrp) { | 4103 | |
4025 | ret = -EINVAL; | 4104 | ret = -EINVAL; |
4026 | goto fail; | 4105 | event->css = cgroup_css(cgrp, event->cft->ss); |
4027 | } | 4106 | cfile_css = css_from_dir(cfile->f_dentry->d_parent, event->cft->ss); |
4107 | if (event->css && event->css == cfile_css && css_tryget(event->css)) | ||
4108 | ret = 0; | ||
4109 | |||
4110 | rcu_read_unlock(); | ||
4111 | if (ret) | ||
4112 | goto out_put_cfile; | ||
4028 | 4113 | ||
4029 | if (!event->cft->register_event || !event->cft->unregister_event) { | 4114 | if (!event->cft->register_event || !event->cft->unregister_event) { |
4030 | ret = -EINVAL; | 4115 | ret = -EINVAL; |
4031 | goto fail; | 4116 | goto out_put_css; |
4032 | } | 4117 | } |
4033 | 4118 | ||
4034 | ret = event->cft->register_event(cgrp, event->cft, | 4119 | ret = event->cft->register_event(event->css, event->cft, |
4035 | event->eventfd, buffer); | 4120 | event->eventfd, buffer); |
4036 | if (ret) | 4121 | if (ret) |
4037 | goto fail; | 4122 | goto out_put_css; |
4038 | 4123 | ||
4039 | efile->f_op->poll(efile, &event->pt); | 4124 | efile->f_op->poll(efile, &event->pt); |
4040 | 4125 | ||
4041 | /* | ||
4042 | * Events should be removed after rmdir of cgroup directory, but before | ||
4043 | * destroying subsystem state objects. Let's take reference to cgroup | ||
4044 | * directory dentry to do that. | ||
4045 | */ | ||
4046 | dget(cgrp->dentry); | ||
4047 | |||
4048 | spin_lock(&cgrp->event_list_lock); | 4126 | spin_lock(&cgrp->event_list_lock); |
4049 | list_add(&event->list, &cgrp->event_list); | 4127 | list_add(&event->list, &cgrp->event_list); |
4050 | spin_unlock(&cgrp->event_list_lock); | 4128 | spin_unlock(&cgrp->event_list_lock); |
@@ -4054,35 +4132,33 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
4054 | 4132 | ||
4055 | return 0; | 4133 | return 0; |
4056 | 4134 | ||
4057 | fail: | 4135 | out_put_css: |
4058 | if (cfile) | 4136 | css_put(event->css); |
4059 | fput(cfile); | 4137 | out_put_cfile: |
4060 | 4138 | fput(cfile); | |
4061 | if (event && event->eventfd && !IS_ERR(event->eventfd)) | 4139 | out_put_eventfd: |
4062 | eventfd_ctx_put(event->eventfd); | 4140 | eventfd_ctx_put(event->eventfd); |
4063 | 4141 | out_put_efile: | |
4064 | if (!IS_ERR_OR_NULL(efile)) | 4142 | fput(efile); |
4065 | fput(efile); | 4143 | out_kfree: |
4066 | |||
4067 | kfree(event); | 4144 | kfree(event); |
4068 | 4145 | ||
4069 | return ret; | 4146 | return ret; |
4070 | } | 4147 | } |
4071 | 4148 | ||
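For readers unfamiliar with this interface: the handler above parses the '<event_fd> <control_fd> <args>' string that userspace writes to cgroup.event_control. A rough userspace sketch of a registration follows; the mount point, cgroup name, and the use of memory.usage_in_bytes with a byte threshold are illustrative assumptions, not something this patch defines.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
        /* illustrative cgroup v1 paths; adjust to the local mount */
        int efd = eventfd(0, 0);                                /* <event_fd> */
        int cfd = open("/sys/fs/cgroup/memory/demo/memory.usage_in_bytes",
                       O_RDONLY);                               /* <control_fd> */
        int ecfd = open("/sys/fs/cgroup/memory/demo/cgroup.event_control",
                        O_WRONLY);
        char buf[64];
        uint64_t hits;

        if (efd < 0 || cfd < 0 || ecfd < 0) {
                perror("setup");
                return 1;
        }

        /* "<event_fd> <control_fd> <args>" -- args here is a byte threshold */
        snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd, 64ULL << 20);
        if (write(ecfd, buf, strlen(buf)) < 0) {
                perror("register");
                return 1;
        }

        /* the kernel signals the eventfd when the registered event fires */
        if (read(efd, &hits, sizeof(hits)) == (ssize_t)sizeof(hits))
                printf("event fired %llu time(s)\n", (unsigned long long)hits);
        return 0;
}

Once the write succeeds, the blocking read() on the eventfd returns when the kernel signals the registered event, which is exactly the path cgroup_event_wake()/cgroup_event_remove() manage on the kernel side.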
4072 | static u64 cgroup_clone_children_read(struct cgroup *cgrp, | 4149 | static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, |
4073 | struct cftype *cft) | 4150 | struct cftype *cft) |
4074 | { | 4151 | { |
4075 | return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); | 4152 | return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); |
4076 | } | 4153 | } |
4077 | 4154 | ||
4078 | static int cgroup_clone_children_write(struct cgroup *cgrp, | 4155 | static int cgroup_clone_children_write(struct cgroup_subsys_state *css, |
4079 | struct cftype *cft, | 4156 | struct cftype *cft, u64 val) |
4080 | u64 val) | ||
4081 | { | 4157 | { |
4082 | if (val) | 4158 | if (val) |
4083 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); | 4159 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); |
4084 | else | 4160 | else |
4085 | clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); | 4161 | clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); |
4086 | return 0; | 4162 | return 0; |
4087 | } | 4163 | } |
4088 | 4164 | ||
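The two handlers above show the shape of the converted cftype callbacks: read_u64/write_u64 now receive the cgroup_subsys_state rather than the cgroup. A minimal sketch of how a controller might define such a file under the new signatures; the example_* names and struct are hypothetical, and registration would go through cgroup_add_cftypes() as for any other cftype array.

#include <linux/cgroup.h>
#include <linux/kernel.h>

struct example_state {
        struct cgroup_subsys_state css;
        u64 limit;
};

static inline struct example_state *example_css(struct cgroup_subsys_state *css)
{
        return css ? container_of(css, struct example_state, css) : NULL;
}

static u64 example_limit_read(struct cgroup_subsys_state *css,
                              struct cftype *cft)
{
        return example_css(css)->limit;
}

static int example_limit_write(struct cgroup_subsys_state *css,
                               struct cftype *cft, u64 val)
{
        example_css(css)->limit = val;
        return 0;
}

static struct cftype example_files[] = {
        {
                .name = "limit",
                .read_u64 = example_limit_read,
                .write_u64 = example_limit_write,
        },
        { }     /* terminating entry */
};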
@@ -4141,36 +4217,34 @@ static struct cftype cgroup_base_files[] = { | |||
4141 | }; | 4217 | }; |
4142 | 4218 | ||
4143 | /** | 4219 | /** |
4144 | * cgroup_populate_dir - selectively creation of files in a directory | 4220 | * cgroup_populate_dir - create subsys files in a cgroup directory |
4145 | * @cgrp: target cgroup | 4221 | * @cgrp: target cgroup |
4146 | * @base_files: true if the base files should be added | ||
4147 | * @subsys_mask: mask of the subsystem ids whose files should be added | 4222 | * @subsys_mask: mask of the subsystem ids whose files should be added |
4223 | * | ||
4224 | * On failure, no file is added. | ||
4148 | */ | 4225 | */ |
4149 | static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, | 4226 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) |
4150 | unsigned long subsys_mask) | ||
4151 | { | 4227 | { |
4152 | int err; | ||
4153 | struct cgroup_subsys *ss; | 4228 | struct cgroup_subsys *ss; |
4154 | 4229 | int i, ret = 0; | |
4155 | if (base_files) { | ||
4156 | err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); | ||
4157 | if (err < 0) | ||
4158 | return err; | ||
4159 | } | ||
4160 | 4230 | ||
4161 | /* process cftsets of each subsystem */ | 4231 | /* process cftsets of each subsystem */ |
4162 | for_each_root_subsys(cgrp->root, ss) { | 4232 | for_each_subsys(ss, i) { |
4163 | struct cftype_set *set; | 4233 | struct cftype_set *set; |
4164 | if (!test_bit(ss->subsys_id, &subsys_mask)) | 4234 | |
4235 | if (!test_bit(i, &subsys_mask)) | ||
4165 | continue; | 4236 | continue; |
4166 | 4237 | ||
4167 | list_for_each_entry(set, &ss->cftsets, node) | 4238 | list_for_each_entry(set, &ss->cftsets, node) { |
4168 | cgroup_addrm_files(cgrp, ss, set->cfts, true); | 4239 | ret = cgroup_addrm_files(cgrp, set->cfts, true); |
4240 | if (ret < 0) | ||
4241 | goto err; | ||
4242 | } | ||
4169 | } | 4243 | } |
4170 | 4244 | ||
4171 | /* This cgroup is ready now */ | 4245 | /* This cgroup is ready now */ |
4172 | for_each_root_subsys(cgrp->root, ss) { | 4246 | for_each_root_subsys(cgrp->root, ss) { |
4173 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4247 | struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); |
4174 | struct css_id *id = rcu_dereference_protected(css->id, true); | 4248 | struct css_id *id = rcu_dereference_protected(css->id, true); |
4175 | 4249 | ||
4176 | /* | 4250 | /* |
@@ -4183,14 +4257,57 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, | |||
4183 | } | 4257 | } |
4184 | 4258 | ||
4185 | return 0; | 4259 | return 0; |
4260 | err: | ||
4261 | cgroup_clear_dir(cgrp, subsys_mask); | ||
4262 | return ret; | ||
4186 | } | 4263 | } |
4187 | 4264 | ||
4188 | static void css_dput_fn(struct work_struct *work) | 4265 | /* |
4266 | * css destruction is four-stage process. | ||
4267 | * | ||
4268 | * 1. Destruction starts. Killing of the percpu_ref is initiated. | ||
4269 | * Implemented in kill_css(). | ||
4270 | * | ||
4271 | * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs | ||
4272 | * and thus css_tryget() is guaranteed to fail, the css can be offlined | ||
4273 | * by invoking offline_css(). After offlining, the base ref is put. | ||
4274 | * Implemented in css_killed_work_fn(). | ||
4275 | * | ||
4276 | * 3. When the percpu_ref reaches zero, the only possible remaining | ||
4277 | * accessors are inside RCU read sections. css_release() schedules the | ||
4278 | * RCU callback. | ||
4279 | * | ||
4280 | * 4. After the grace period, the css can be freed. Implemented in | ||
4281 | * css_free_work_fn(). | ||
4282 | * | ||
4283 | * It is actually hairier because both steps 2 and 4 require process context | ||
4284 | * and thus involve punting to css->destroy_work, adding two additional | ||
4285 | * steps to the already complex sequence. | ||
4286 | */ | ||
4287 | static void css_free_work_fn(struct work_struct *work) | ||
4189 | { | 4288 | { |
4190 | struct cgroup_subsys_state *css = | 4289 | struct cgroup_subsys_state *css = |
4191 | container_of(work, struct cgroup_subsys_state, dput_work); | 4290 | container_of(work, struct cgroup_subsys_state, destroy_work); |
4291 | struct cgroup *cgrp = css->cgroup; | ||
4192 | 4292 | ||
4193 | cgroup_dput(css->cgroup); | 4293 | if (css->parent) |
4294 | css_put(css->parent); | ||
4295 | |||
4296 | css->ss->css_free(css); | ||
4297 | cgroup_dput(cgrp); | ||
4298 | } | ||
4299 | |||
4300 | static void css_free_rcu_fn(struct rcu_head *rcu_head) | ||
4301 | { | ||
4302 | struct cgroup_subsys_state *css = | ||
4303 | container_of(rcu_head, struct cgroup_subsys_state, rcu_head); | ||
4304 | |||
4305 | /* | ||
4306 | * css holds an extra ref to @cgrp->dentry which is put on the last | ||
4307 | * css_put(). dput() requires process context which we don't have. | ||
4308 | */ | ||
4309 | INIT_WORK(&css->destroy_work, css_free_work_fn); | ||
4310 | schedule_work(&css->destroy_work); | ||
4194 | } | 4311 | } |
4195 | 4312 | ||
4196 | static void css_release(struct percpu_ref *ref) | 4313 | static void css_release(struct percpu_ref *ref) |
@@ -4198,49 +4315,47 @@ static void css_release(struct percpu_ref *ref) | |||
4198 | struct cgroup_subsys_state *css = | 4315 | struct cgroup_subsys_state *css = |
4199 | container_of(ref, struct cgroup_subsys_state, refcnt); | 4316 | container_of(ref, struct cgroup_subsys_state, refcnt); |
4200 | 4317 | ||
4201 | schedule_work(&css->dput_work); | 4318 | call_rcu(&css->rcu_head, css_free_rcu_fn); |
4202 | } | 4319 | } |
4203 | 4320 | ||
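The css_release() -> css_free_rcu_fn() -> css_free_work_fn() chain above is an instance of a generic deferral pattern: the percpu_ref release callback fires in atomic context, so the final free is bounced through an RCU grace period and then to a workqueue where sleeping is allowed. A stripped-down sketch of that pattern for a hypothetical object (struct foo and all names are illustrative, not part of this patch):

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct foo {
        struct percpu_ref refcnt;
        struct rcu_head rcu_head;
        struct work_struct destroy_work;
        /* ... payload ... */
};

static void foo_free_work_fn(struct work_struct *work)
{
        struct foo *foo = container_of(work, struct foo, destroy_work);

        /* process context: heavyweight cleanup and the final free */
        kfree(foo);
}

static void foo_free_rcu_fn(struct rcu_head *rcu_head)
{
        struct foo *foo = container_of(rcu_head, struct foo, rcu_head);

        /* RCU callbacks run in softirq context, so punt to a workqueue */
        INIT_WORK(&foo->destroy_work, foo_free_work_fn);
        schedule_work(&foo->destroy_work);
}

static void foo_release(struct percpu_ref *ref)
{
        struct foo *foo = container_of(ref, struct foo, refcnt);

        /* last reference dropped: wait out RCU readers before freeing */
        call_rcu(&foo->rcu_head, foo_free_rcu_fn);
}

The only ordering guarantees relied on here are that call_rcu() defers the callback until existing RCU readers have finished and that schedule_work() moves the remaining cleanup into process context.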
4204 | static void init_cgroup_css(struct cgroup_subsys_state *css, | 4321 | static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, |
4205 | struct cgroup_subsys *ss, | 4322 | struct cgroup *cgrp) |
4206 | struct cgroup *cgrp) | ||
4207 | { | 4323 | { |
4208 | css->cgroup = cgrp; | 4324 | css->cgroup = cgrp; |
4325 | css->ss = ss; | ||
4209 | css->flags = 0; | 4326 | css->flags = 0; |
4210 | css->id = NULL; | 4327 | css->id = NULL; |
4211 | if (cgrp == cgroup_dummy_top) | 4328 | |
4329 | if (cgrp->parent) | ||
4330 | css->parent = cgroup_css(cgrp->parent, ss); | ||
4331 | else | ||
4212 | css->flags |= CSS_ROOT; | 4332 | css->flags |= CSS_ROOT; |
4213 | BUG_ON(cgrp->subsys[ss->subsys_id]); | ||
4214 | cgrp->subsys[ss->subsys_id] = css; | ||
4215 | 4333 | ||
4216 | /* | 4334 | BUG_ON(cgroup_css(cgrp, ss)); |
4217 | * css holds an extra ref to @cgrp->dentry which is put on the last | ||
4218 | * css_put(). dput() requires process context, which css_put() may | ||
4219 | * be called without. @css->dput_work will be used to invoke | ||
4220 | * dput() asynchronously from css_put(). | ||
4221 | */ | ||
4222 | INIT_WORK(&css->dput_work, css_dput_fn); | ||
4223 | } | 4335 | } |
4224 | 4336 | ||
4225 | /* invoke ->post_create() on a new CSS and mark it online if successful */ | 4337 | /* invoke ->css_online() on a new CSS and mark it online if successful */ |
4226 | static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | 4338 | static int online_css(struct cgroup_subsys_state *css) |
4227 | { | 4339 | { |
4340 | struct cgroup_subsys *ss = css->ss; | ||
4228 | int ret = 0; | 4341 | int ret = 0; |
4229 | 4342 | ||
4230 | lockdep_assert_held(&cgroup_mutex); | 4343 | lockdep_assert_held(&cgroup_mutex); |
4231 | 4344 | ||
4232 | if (ss->css_online) | 4345 | if (ss->css_online) |
4233 | ret = ss->css_online(cgrp); | 4346 | ret = ss->css_online(css); |
4234 | if (!ret) | 4347 | if (!ret) { |
4235 | cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; | 4348 | css->flags |= CSS_ONLINE; |
4349 | css->cgroup->nr_css++; | ||
4350 | rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); | ||
4351 | } | ||
4236 | return ret; | 4352 | return ret; |
4237 | } | 4353 | } |
4238 | 4354 | ||
4239 | /* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ | 4355 | /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ |
4240 | static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | 4356 | static void offline_css(struct cgroup_subsys_state *css) |
4241 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | ||
4242 | { | 4357 | { |
4243 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4358 | struct cgroup_subsys *ss = css->ss; |
4244 | 4359 | ||
4245 | lockdep_assert_held(&cgroup_mutex); | 4360 | lockdep_assert_held(&cgroup_mutex); |
4246 | 4361 | ||
@@ -4248,9 +4363,11 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
4248 | return; | 4363 | return; |
4249 | 4364 | ||
4250 | if (ss->css_offline) | 4365 | if (ss->css_offline) |
4251 | ss->css_offline(cgrp); | 4366 | ss->css_offline(css); |
4252 | 4367 | ||
4253 | cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; | 4368 | css->flags &= ~CSS_ONLINE; |
4369 | css->cgroup->nr_css--; | ||
4370 | RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); | ||
4254 | } | 4371 | } |
4255 | 4372 | ||
4256 | /* | 4373 | /* |
@@ -4264,6 +4381,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
4264 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | 4381 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, |
4265 | umode_t mode) | 4382 | umode_t mode) |
4266 | { | 4383 | { |
4384 | struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { }; | ||
4267 | struct cgroup *cgrp; | 4385 | struct cgroup *cgrp; |
4268 | struct cgroup_name *name; | 4386 | struct cgroup_name *name; |
4269 | struct cgroupfs_root *root = parent->root; | 4387 | struct cgroupfs_root *root = parent->root; |
@@ -4281,7 +4399,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4281 | goto err_free_cgrp; | 4399 | goto err_free_cgrp; |
4282 | rcu_assign_pointer(cgrp->name, name); | 4400 | rcu_assign_pointer(cgrp->name, name); |
4283 | 4401 | ||
4284 | cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); | 4402 | /* |
4403 | * Temporarily set the pointer to NULL, so idr_find() won't return | ||
4404 | * a half-baked cgroup. | ||
4405 | */ | ||
4406 | cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); | ||
4285 | if (cgrp->id < 0) | 4407 | if (cgrp->id < 0) |
4286 | goto err_free_name; | 4408 | goto err_free_name; |
4287 | 4409 | ||
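The comment above captures a small ordering rule worth calling out: the id is allocated with a NULL pointer and only pointed at the cgroup (via idr_replace(), further down in this patch) once initialization is complete, so idr_find() can never hand out a half-built object. The same two-step publish in isolation, with hypothetical names and assuming the caller already serializes idr modifiers (cgroup_mutex plays that role here):

#include <linux/errno.h>
#include <linux/idr.h>
#include <linux/slab.h>

struct example {
        int id;
        /* ... */
};

static DEFINE_IDR(example_idr);

/* caller is assumed to hold whatever lock serializes idr modifiers */
static int example_install(void)
{
        struct example *ex;
        int id;

        ex = kzalloc(sizeof(*ex), GFP_KERNEL);
        if (!ex)
                return -ENOMEM;

        /* reserve an id but keep the slot NULL while @ex is half-baked */
        id = idr_alloc(&example_idr, NULL, 1, 0, GFP_KERNEL);
        if (id < 0) {
                kfree(ex);
                return id;
        }
        ex->id = id;

        /* ... finish initializing @ex ... */

        /* publish: lookups may return @ex only from this point on */
        idr_replace(&example_idr, ex, id);
        return 0;
}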
@@ -4310,6 +4432,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4310 | cgrp->dentry = dentry; | 4432 | cgrp->dentry = dentry; |
4311 | 4433 | ||
4312 | cgrp->parent = parent; | 4434 | cgrp->parent = parent; |
4435 | cgrp->dummy_css.parent = &parent->dummy_css; | ||
4313 | cgrp->root = parent->root; | 4436 | cgrp->root = parent->root; |
4314 | 4437 | ||
4315 | if (notify_on_release(parent)) | 4438 | if (notify_on_release(parent)) |
@@ -4321,20 +4444,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4321 | for_each_root_subsys(root, ss) { | 4444 | for_each_root_subsys(root, ss) { |
4322 | struct cgroup_subsys_state *css; | 4445 | struct cgroup_subsys_state *css; |
4323 | 4446 | ||
4324 | css = ss->css_alloc(cgrp); | 4447 | css = ss->css_alloc(cgroup_css(parent, ss)); |
4325 | if (IS_ERR(css)) { | 4448 | if (IS_ERR(css)) { |
4326 | err = PTR_ERR(css); | 4449 | err = PTR_ERR(css); |
4327 | goto err_free_all; | 4450 | goto err_free_all; |
4328 | } | 4451 | } |
4452 | css_ar[ss->subsys_id] = css; | ||
4329 | 4453 | ||
4330 | err = percpu_ref_init(&css->refcnt, css_release); | 4454 | err = percpu_ref_init(&css->refcnt, css_release); |
4331 | if (err) | 4455 | if (err) |
4332 | goto err_free_all; | 4456 | goto err_free_all; |
4333 | 4457 | ||
4334 | init_cgroup_css(css, ss, cgrp); | 4458 | init_css(css, ss, cgrp); |
4335 | 4459 | ||
4336 | if (ss->use_id) { | 4460 | if (ss->use_id) { |
4337 | err = alloc_css_id(ss, parent, cgrp); | 4461 | err = alloc_css_id(css); |
4338 | if (err) | 4462 | if (err) |
4339 | goto err_free_all; | 4463 | goto err_free_all; |
4340 | } | 4464 | } |
@@ -4356,16 +4480,22 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4356 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); | 4480 | list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); |
4357 | root->number_of_cgroups++; | 4481 | root->number_of_cgroups++; |
4358 | 4482 | ||
4359 | /* each css holds a ref to the cgroup's dentry */ | 4483 | /* each css holds a ref to the cgroup's dentry and the parent css */ |
4360 | for_each_root_subsys(root, ss) | 4484 | for_each_root_subsys(root, ss) { |
4485 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; | ||
4486 | |||
4361 | dget(dentry); | 4487 | dget(dentry); |
4488 | css_get(css->parent); | ||
4489 | } | ||
4362 | 4490 | ||
4363 | /* hold a ref to the parent's dentry */ | 4491 | /* hold a ref to the parent's dentry */ |
4364 | dget(parent->dentry); | 4492 | dget(parent->dentry); |
4365 | 4493 | ||
4366 | /* creation succeeded, notify subsystems */ | 4494 | /* creation succeeded, notify subsystems */ |
4367 | for_each_root_subsys(root, ss) { | 4495 | for_each_root_subsys(root, ss) { |
4368 | err = online_css(ss, cgrp); | 4496 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; |
4497 | |||
4498 | err = online_css(css); | ||
4369 | if (err) | 4499 | if (err) |
4370 | goto err_destroy; | 4500 | goto err_destroy; |
4371 | 4501 | ||
@@ -4379,7 +4509,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4379 | } | 4509 | } |
4380 | } | 4510 | } |
4381 | 4511 | ||
4382 | err = cgroup_populate_dir(cgrp, true, root->subsys_mask); | 4512 | idr_replace(&root->cgroup_idr, cgrp, cgrp->id); |
4513 | |||
4514 | err = cgroup_addrm_files(cgrp, cgroup_base_files, true); | ||
4515 | if (err) | ||
4516 | goto err_destroy; | ||
4517 | |||
4518 | err = cgroup_populate_dir(cgrp, root->subsys_mask); | ||
4383 | if (err) | 4519 | if (err) |
4384 | goto err_destroy; | 4520 | goto err_destroy; |
4385 | 4521 | ||
@@ -4390,18 +4526,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
4390 | 4526 | ||
4391 | err_free_all: | 4527 | err_free_all: |
4392 | for_each_root_subsys(root, ss) { | 4528 | for_each_root_subsys(root, ss) { |
4393 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4529 | struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; |
4394 | 4530 | ||
4395 | if (css) { | 4531 | if (css) { |
4396 | percpu_ref_cancel_init(&css->refcnt); | 4532 | percpu_ref_cancel_init(&css->refcnt); |
4397 | ss->css_free(cgrp); | 4533 | ss->css_free(css); |
4398 | } | 4534 | } |
4399 | } | 4535 | } |
4400 | mutex_unlock(&cgroup_mutex); | 4536 | mutex_unlock(&cgroup_mutex); |
4401 | /* Release the reference count that we took on the superblock */ | 4537 | /* Release the reference count that we took on the superblock */ |
4402 | deactivate_super(sb); | 4538 | deactivate_super(sb); |
4403 | err_free_id: | 4539 | err_free_id: |
4404 | ida_simple_remove(&root->cgroup_ida, cgrp->id); | 4540 | idr_remove(&root->cgroup_idr, cgrp->id); |
4405 | err_free_name: | 4541 | err_free_name: |
4406 | kfree(rcu_dereference_raw(cgrp->name)); | 4542 | kfree(rcu_dereference_raw(cgrp->name)); |
4407 | err_free_cgrp: | 4543 | err_free_cgrp: |
@@ -4423,22 +4559,84 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
4423 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); | 4559 | return cgroup_create(c_parent, dentry, mode | S_IFDIR); |
4424 | } | 4560 | } |
4425 | 4561 | ||
4426 | static void cgroup_css_killed(struct cgroup *cgrp) | 4562 | /* |
4563 | * This is called when the refcnt of a css is confirmed to be killed. | ||
4564 | * css_tryget() is now guaranteed to fail. | ||
4565 | */ | ||
4566 | static void css_killed_work_fn(struct work_struct *work) | ||
4427 | { | 4567 | { |
4428 | if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) | 4568 | struct cgroup_subsys_state *css = |
4429 | return; | 4569 | container_of(work, struct cgroup_subsys_state, destroy_work); |
4570 | struct cgroup *cgrp = css->cgroup; | ||
4430 | 4571 | ||
4431 | /* percpu ref's of all css's are killed, kick off the next step */ | 4572 | mutex_lock(&cgroup_mutex); |
4432 | INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); | 4573 | |
4433 | schedule_work(&cgrp->destroy_work); | 4574 | /* |
4575 | * css_tryget() is guaranteed to fail now. Tell subsystems to | ||
4576 | * initate destruction. | ||
4577 | */ | ||
4578 | offline_css(css); | ||
4579 | |||
4580 | /* | ||
4581 | * If @cgrp is marked dead, it's waiting for refs of all css's to | ||
4582 | * be disabled before proceeding to the second phase of cgroup | ||
4583 | * destruction. If we are the last one, kick it off. | ||
4584 | */ | ||
4585 | if (!cgrp->nr_css && cgroup_is_dead(cgrp)) | ||
4586 | cgroup_destroy_css_killed(cgrp); | ||
4587 | |||
4588 | mutex_unlock(&cgroup_mutex); | ||
4589 | |||
4590 | /* | ||
4591 | * Put the css refs from kill_css(). Each css holds an extra | ||
4592 | * reference to the cgroup's dentry and cgroup removal proceeds | ||
4593 | * regardless of css refs. On the last put of each css, whenever | ||
4594 | * that may be, the extra dentry ref is put so that dentry | ||
4595 | * destruction happens only after all css's are released. | ||
4596 | */ | ||
4597 | css_put(css); | ||
4434 | } | 4598 | } |
4435 | 4599 | ||
4436 | static void css_ref_killed_fn(struct percpu_ref *ref) | 4600 | /* css kill confirmation processing requires process context, bounce */ |
4601 | static void css_killed_ref_fn(struct percpu_ref *ref) | ||
4437 | { | 4602 | { |
4438 | struct cgroup_subsys_state *css = | 4603 | struct cgroup_subsys_state *css = |
4439 | container_of(ref, struct cgroup_subsys_state, refcnt); | 4604 | container_of(ref, struct cgroup_subsys_state, refcnt); |
4440 | 4605 | ||
4441 | cgroup_css_killed(css->cgroup); | 4606 | INIT_WORK(&css->destroy_work, css_killed_work_fn); |
4607 | schedule_work(&css->destroy_work); | ||
4608 | } | ||
4609 | |||
4610 | /** | ||
4611 | * kill_css - destroy a css | ||
4612 | * @css: css to destroy | ||
4613 | * | ||
4614 | * This function initiates destruction of @css by removing cgroup interface | ||
4615 | * files and putting its base reference. ->css_offline() will be invoked | ||
4616 | * asynchronously once css_tryget() is guaranteed to fail and when the | ||
4617 | * reference count reaches zero, @css will be released. | ||
4618 | */ | ||
4619 | static void kill_css(struct cgroup_subsys_state *css) | ||
4620 | { | ||
4621 | cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); | ||
4622 | |||
4623 | /* | ||
4624 | * Killing would put the base ref, but we need to keep it alive | ||
4625 | * until after ->css_offline(). | ||
4626 | */ | ||
4627 | css_get(css); | ||
4628 | |||
4629 | /* | ||
4630 | * cgroup core guarantees that, by the time ->css_offline() is | ||
4631 | * invoked, no new css reference will be given out via | ||
4632 | * css_tryget(). We can't simply call percpu_ref_kill() and | ||
4633 | * proceed to offlining css's because percpu_ref_kill() doesn't | ||
4634 | * guarantee that the ref is seen as killed on all CPUs on return. | ||
4635 | * | ||
4636 | * Use percpu_ref_kill_and_confirm() to get notifications as each | ||
4637 | * css is confirmed to be seen as killed on all CPUs. | ||
4638 | */ | ||
4639 | percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); | ||
4442 | } | 4640 | } |
4443 | 4641 | ||
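For reference, the kill-side sequencing that kill_css() and css_killed_ref_fn() implement reduces to the following sketch for a hypothetical refcounted object: take an extra reference so the object survives until the offline step, kill the percpu_ref with a confirmation callback, and bounce from that (atomic) callback to a workqueue. Names are illustrative, not part of this patch.

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/workqueue.h>

struct bar {
        struct percpu_ref refcnt;
        struct work_struct destroy_work;
};

static void bar_killed_work_fn(struct work_struct *work)
{
        struct bar *bar = container_of(work, struct bar, destroy_work);

        /* tryget-style lookups now reliably fail; safe to offline @bar */
        /* ... offline @bar ... */
        percpu_ref_put(&bar->refcnt);   /* drop the extra ref from bar_kill() */
}

static void bar_killed_ref_fn(struct percpu_ref *ref)
{
        struct bar *bar = container_of(ref, struct bar, refcnt);

        /* runs once the kill is visible on all CPUs, but in atomic context */
        INIT_WORK(&bar->destroy_work, bar_killed_work_fn);
        schedule_work(&bar->destroy_work);
}

static void bar_kill(struct bar *bar)
{
        /* keep @bar alive across the asynchronous offline step */
        percpu_ref_get(&bar->refcnt);
        percpu_ref_kill_and_confirm(&bar->refcnt, bar_killed_ref_fn);
}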
4444 | /** | 4642 | /** |
@@ -4471,6 +4669,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4471 | struct dentry *d = cgrp->dentry; | 4669 | struct dentry *d = cgrp->dentry; |
4472 | struct cgroup_event *event, *tmp; | 4670 | struct cgroup_event *event, *tmp; |
4473 | struct cgroup_subsys *ss; | 4671 | struct cgroup_subsys *ss; |
4672 | struct cgroup *child; | ||
4474 | bool empty; | 4673 | bool empty; |
4475 | 4674 | ||
4476 | lockdep_assert_held(&d->d_inode->i_mutex); | 4675 | lockdep_assert_held(&d->d_inode->i_mutex); |
@@ -4481,47 +4680,41 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4481 | * @cgrp from being removed while __put_css_set() is in progress. | 4680 | * @cgrp from being removed while __put_css_set() is in progress. |
4482 | */ | 4681 | */ |
4483 | read_lock(&css_set_lock); | 4682 | read_lock(&css_set_lock); |
4484 | empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children); | 4683 | empty = list_empty(&cgrp->cset_links); |
4485 | read_unlock(&css_set_lock); | 4684 | read_unlock(&css_set_lock); |
4486 | if (!empty) | 4685 | if (!empty) |
4487 | return -EBUSY; | 4686 | return -EBUSY; |
4488 | 4687 | ||
4489 | /* | 4688 | /* |
4490 | * Block new css_tryget() by killing css refcnts. cgroup core | 4689 | * Make sure there are no live children. We can't test ->children |
4491 | * guarantees that, by the time ->css_offline() is invoked, no new | 4690 | * emptiness as dead children linger on it while being destroyed; |
4492 | * css reference will be given out via css_tryget(). We can't | 4691 | * otherwise, "rmdir parent/child parent" may fail with -EBUSY. |
4493 | * simply call percpu_ref_kill() and proceed to offlining css's | ||
4494 | * because percpu_ref_kill() doesn't guarantee that the ref is seen | ||
4495 | * as killed on all CPUs on return. | ||
4496 | * | ||
4497 | * Use percpu_ref_kill_and_confirm() to get notifications as each | ||
4498 | * css is confirmed to be seen as killed on all CPUs. The | ||
4499 | * notification callback keeps track of the number of css's to be | ||
4500 | * killed and schedules cgroup_offline_fn() to perform the rest of | ||
4501 | * destruction once the percpu refs of all css's are confirmed to | ||
4502 | * be killed. | ||
4503 | */ | 4692 | */ |
4504 | atomic_set(&cgrp->css_kill_cnt, 1); | 4693 | empty = true; |
4505 | for_each_root_subsys(cgrp->root, ss) { | 4694 | rcu_read_lock(); |
4506 | struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; | 4695 | list_for_each_entry_rcu(child, &cgrp->children, sibling) { |
4507 | 4696 | empty = cgroup_is_dead(child); | |
4508 | /* | 4697 | if (!empty) |
4509 | * Killing would put the base ref, but we need to keep it | 4698 | break; |
4510 | * alive until after ->css_offline. | ||
4511 | */ | ||
4512 | percpu_ref_get(&css->refcnt); | ||
4513 | |||
4514 | atomic_inc(&cgrp->css_kill_cnt); | ||
4515 | percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn); | ||
4516 | } | 4699 | } |
4517 | cgroup_css_killed(cgrp); | 4700 | rcu_read_unlock(); |
4701 | if (!empty) | ||
4702 | return -EBUSY; | ||
4703 | |||
4704 | /* | ||
4705 | * Initiate massacre of all css's. cgroup_destroy_css_killed() | ||
4706 | * will be invoked to perform the rest of destruction once the | ||
4707 | * percpu refs of all css's are confirmed to be killed. | ||
4708 | */ | ||
4709 | for_each_root_subsys(cgrp->root, ss) | ||
4710 | kill_css(cgroup_css(cgrp, ss)); | ||
4518 | 4711 | ||
4519 | /* | 4712 | /* |
4520 | * Mark @cgrp dead. This prevents further task migration and child | 4713 | * Mark @cgrp dead. This prevents further task migration and child |
4521 | * creation by disabling cgroup_lock_live_group(). Note that | 4714 | * creation by disabling cgroup_lock_live_group(). Note that |
4522 | * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to | 4715 | * CGRP_DEAD assertion is depended upon by css_next_child() to |
4523 | * resume iteration after dropping RCU read lock. See | 4716 | * resume iteration after dropping RCU read lock. See |
4524 | * cgroup_next_sibling() for details. | 4717 | * css_next_child() for details. |
4525 | */ | 4718 | */ |
4526 | set_bit(CGRP_DEAD, &cgrp->flags); | 4719 | set_bit(CGRP_DEAD, &cgrp->flags); |
4527 | 4720 | ||
@@ -4532,9 +4725,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4532 | raw_spin_unlock(&release_list_lock); | 4725 | raw_spin_unlock(&release_list_lock); |
4533 | 4726 | ||
4534 | /* | 4727 | /* |
4535 | * Remove @cgrp directory. The removal puts the base ref but we | 4728 | * If @cgrp has css's attached, the second stage of cgroup |
4536 | * aren't quite done with @cgrp yet, so hold onto it. | 4729 | * destruction is kicked off from css_killed_work_fn() after the |
4730 | * refs of all attached css's are killed. If @cgrp doesn't have | ||
4731 | * any css, we kick it off here. | ||
4537 | */ | 4732 | */ |
4733 | if (!cgrp->nr_css) | ||
4734 | cgroup_destroy_css_killed(cgrp); | ||
4735 | |||
4736 | /* | ||
4737 | * Clear the base files and remove @cgrp directory. The removal | ||
4738 | * puts the base ref but we aren't quite done with @cgrp yet, so | ||
4739 | * hold onto it. | ||
4740 | */ | ||
4741 | cgroup_addrm_files(cgrp, cgroup_base_files, false); | ||
4538 | dget(d); | 4742 | dget(d); |
4539 | cgroup_d_remove_dir(d); | 4743 | cgroup_d_remove_dir(d); |
4540 | 4744 | ||
@@ -4554,50 +4758,36 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4554 | }; | 4758 | }; |
4555 | 4759 | ||
4556 | /** | 4760 | /** |
4557 | * cgroup_offline_fn - the second step of cgroup destruction | 4761 | * cgroup_destroy_css_killed - the second step of cgroup destruction |
4558 | * @work: cgroup->destroy_free_work | 4762 | * @work: cgroup->destroy_free_work |
4559 | * | 4763 | * |
4560 | * This function is invoked from a work item for a cgroup which is being | 4764 | * This function is invoked from a work item for a cgroup which is being |
4561 | * destroyed after the percpu refcnts of all css's are guaranteed to be | 4765 | * destroyed after all css's are offlined and performs the rest of |
4562 | * seen as killed on all CPUs, and performs the rest of destruction. This | 4766 | * destruction. This is the second step of destruction described in the |
4563 | * is the second step of destruction described in the comment above | 4767 | * comment above cgroup_destroy_locked(). |
4564 | * cgroup_destroy_locked(). | ||
4565 | */ | 4768 | */ |
4566 | static void cgroup_offline_fn(struct work_struct *work) | 4769 | static void cgroup_destroy_css_killed(struct cgroup *cgrp) |
4567 | { | 4770 | { |
4568 | struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); | ||
4569 | struct cgroup *parent = cgrp->parent; | 4771 | struct cgroup *parent = cgrp->parent; |
4570 | struct dentry *d = cgrp->dentry; | 4772 | struct dentry *d = cgrp->dentry; |
4571 | struct cgroup_subsys *ss; | ||
4572 | 4773 | ||
4573 | mutex_lock(&cgroup_mutex); | 4774 | lockdep_assert_held(&cgroup_mutex); |
4574 | 4775 | ||
4575 | /* | 4776 | /* delete this cgroup from parent->children */ |
4576 | * css_tryget() is guaranteed to fail now. Tell subsystems to | 4777 | list_del_rcu(&cgrp->sibling); |
4577 | * initate destruction. | ||
4578 | */ | ||
4579 | for_each_root_subsys(cgrp->root, ss) | ||
4580 | offline_css(ss, cgrp); | ||
4581 | 4778 | ||
4582 | /* | 4779 | /* |
4583 | * Put the css refs from cgroup_destroy_locked(). Each css holds | 4780 | * We should remove the cgroup object from idr before its grace |
4584 | * an extra reference to the cgroup's dentry and cgroup removal | 4781 | * period starts, so we won't be looking up a cgroup while the |
4585 | * proceeds regardless of css refs. On the last put of each css, | 4782 | * cgroup is being freed. |
4586 | * whenever that may be, the extra dentry ref is put so that dentry | ||
4587 | * destruction happens only after all css's are released. | ||
4588 | */ | 4783 | */ |
4589 | for_each_root_subsys(cgrp->root, ss) | 4784 | idr_remove(&cgrp->root->cgroup_idr, cgrp->id); |
4590 | css_put(cgrp->subsys[ss->subsys_id]); | 4785 | cgrp->id = -1; |
4591 | |||
4592 | /* delete this cgroup from parent->children */ | ||
4593 | list_del_rcu(&cgrp->sibling); | ||
4594 | 4786 | ||
4595 | dput(d); | 4787 | dput(d); |
4596 | 4788 | ||
4597 | set_bit(CGRP_RELEASABLE, &parent->flags); | 4789 | set_bit(CGRP_RELEASABLE, &parent->flags); |
4598 | check_for_release(parent); | 4790 | check_for_release(parent); |
4599 | |||
4600 | mutex_unlock(&cgroup_mutex); | ||
4601 | } | 4791 | } |
4602 | 4792 | ||
4603 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | 4793 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) |
@@ -4620,6 +4810,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) | |||
4620 | * deregistration. | 4810 | * deregistration. |
4621 | */ | 4811 | */ |
4622 | if (ss->base_cftypes) { | 4812 | if (ss->base_cftypes) { |
4813 | struct cftype *cft; | ||
4814 | |||
4815 | for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) | ||
4816 | cft->ss = ss; | ||
4817 | |||
4623 | ss->base_cftset.cfts = ss->base_cftypes; | 4818 | ss->base_cftset.cfts = ss->base_cftypes; |
4624 | list_add_tail(&ss->base_cftset.node, &ss->cftsets); | 4819 | list_add_tail(&ss->base_cftset.node, &ss->cftsets); |
4625 | } | 4820 | } |
@@ -4639,10 +4834,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4639 | /* Create the top cgroup state for this subsystem */ | 4834 | /* Create the top cgroup state for this subsystem */ |
4640 | list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); | 4835 | list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); |
4641 | ss->root = &cgroup_dummy_root; | 4836 | ss->root = &cgroup_dummy_root; |
4642 | css = ss->css_alloc(cgroup_dummy_top); | 4837 | css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); |
4643 | /* We don't handle early failures gracefully */ | 4838 | /* We don't handle early failures gracefully */ |
4644 | BUG_ON(IS_ERR(css)); | 4839 | BUG_ON(IS_ERR(css)); |
4645 | init_cgroup_css(css, ss, cgroup_dummy_top); | 4840 | init_css(css, ss, cgroup_dummy_top); |
4646 | 4841 | ||
4647 | /* Update the init_css_set to contain a subsys | 4842 | /* Update the init_css_set to contain a subsys |
4648 | * pointer to this state - since the subsystem is | 4843 | * pointer to this state - since the subsystem is |
@@ -4657,7 +4852,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
4657 | * need to invoke fork callbacks here. */ | 4852 | * need to invoke fork callbacks here. */ |
4658 | BUG_ON(!list_empty(&init_task.tasks)); | 4853 | BUG_ON(!list_empty(&init_task.tasks)); |
4659 | 4854 | ||
4660 | BUG_ON(online_css(ss, cgroup_dummy_top)); | 4855 | BUG_ON(online_css(css)); |
4661 | 4856 | ||
4662 | mutex_unlock(&cgroup_mutex); | 4857 | mutex_unlock(&cgroup_mutex); |
4663 | 4858 | ||
@@ -4718,7 +4913,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4718 | * struct, so this can happen first (i.e. before the dummy root | 4913 | * struct, so this can happen first (i.e. before the dummy root |
4719 | * attachment). | 4914 | * attachment). |
4720 | */ | 4915 | */ |
4721 | css = ss->css_alloc(cgroup_dummy_top); | 4916 | css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); |
4722 | if (IS_ERR(css)) { | 4917 | if (IS_ERR(css)) { |
4723 | /* failure case - need to deassign the cgroup_subsys[] slot. */ | 4918 | /* failure case - need to deassign the cgroup_subsys[] slot. */ |
4724 | cgroup_subsys[ss->subsys_id] = NULL; | 4919 | cgroup_subsys[ss->subsys_id] = NULL; |
@@ -4730,8 +4925,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4730 | ss->root = &cgroup_dummy_root; | 4925 | ss->root = &cgroup_dummy_root; |
4731 | 4926 | ||
4732 | /* our new subsystem will be attached to the dummy hierarchy. */ | 4927 | /* our new subsystem will be attached to the dummy hierarchy. */ |
4733 | init_cgroup_css(css, ss, cgroup_dummy_top); | 4928 | init_css(css, ss, cgroup_dummy_top); |
4734 | /* init_idr must be after init_cgroup_css because it sets css->id. */ | 4929 | /* init_idr must be after init_css() because it sets css->id. */ |
4735 | if (ss->use_id) { | 4930 | if (ss->use_id) { |
4736 | ret = cgroup_init_idr(ss, css); | 4931 | ret = cgroup_init_idr(ss, css); |
4737 | if (ret) | 4932 | if (ret) |
@@ -4761,7 +4956,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
4761 | } | 4956 | } |
4762 | write_unlock(&css_set_lock); | 4957 | write_unlock(&css_set_lock); |
4763 | 4958 | ||
4764 | ret = online_css(ss, cgroup_dummy_top); | 4959 | ret = online_css(css); |
4765 | if (ret) | 4960 | if (ret) |
4766 | goto err_unload; | 4961 | goto err_unload; |
4767 | 4962 | ||
@@ -4793,14 +4988,14 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4793 | 4988 | ||
4794 | /* | 4989 | /* |
4795 | * we shouldn't be called if the subsystem is in use, and the use of | 4990 | * we shouldn't be called if the subsystem is in use, and the use of |
4796 | * try_module_get in parse_cgroupfs_options should ensure that it | 4991 | * try_module_get() in rebind_subsystems() should ensure that it |
4797 | * doesn't start being used while we're killing it off. | 4992 | * doesn't start being used while we're killing it off. |
4798 | */ | 4993 | */ |
4799 | BUG_ON(ss->root != &cgroup_dummy_root); | 4994 | BUG_ON(ss->root != &cgroup_dummy_root); |
4800 | 4995 | ||
4801 | mutex_lock(&cgroup_mutex); | 4996 | mutex_lock(&cgroup_mutex); |
4802 | 4997 | ||
4803 | offline_css(ss, cgroup_dummy_top); | 4998 | offline_css(cgroup_css(cgroup_dummy_top, ss)); |
4804 | 4999 | ||
4805 | if (ss->use_id) | 5000 | if (ss->use_id) |
4806 | idr_destroy(&ss->idr); | 5001 | idr_destroy(&ss->idr); |
@@ -4834,8 +5029,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
4834 | * the cgrp->subsys pointer to find their state. note that this | 5029 | * the cgrp->subsys pointer to find their state. note that this |
4835 | * also takes care of freeing the css_id. | 5030 | * also takes care of freeing the css_id. |
4836 | */ | 5031 | */ |
4837 | ss->css_free(cgroup_dummy_top); | 5032 | ss->css_free(cgroup_css(cgroup_dummy_top, ss)); |
4838 | cgroup_dummy_top->subsys[ss->subsys_id] = NULL; | 5033 | RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); |
4839 | 5034 | ||
4840 | mutex_unlock(&cgroup_mutex); | 5035 | mutex_unlock(&cgroup_mutex); |
4841 | } | 5036 | } |
@@ -4917,6 +5112,10 @@ int __init cgroup_init(void) | |||
4917 | 5112 | ||
4918 | BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); | 5113 | BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); |
4919 | 5114 | ||
5115 | err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top, | ||
5116 | 0, 1, GFP_KERNEL); | ||
5117 | BUG_ON(err < 0); | ||
5118 | |||
4920 | mutex_unlock(&cgroup_root_mutex); | 5119 | mutex_unlock(&cgroup_root_mutex); |
4921 | mutex_unlock(&cgroup_mutex); | 5120 | mutex_unlock(&cgroup_mutex); |
4922 | 5121 | ||
@@ -5073,7 +5272,7 @@ void cgroup_fork(struct task_struct *child) | |||
5073 | * Adds the task to the list running through its css_set if necessary and | 5272 | * Adds the task to the list running through its css_set if necessary and |
5074 | * call the subsystem fork() callbacks. Has to be after the task is | 5273 | * call the subsystem fork() callbacks. Has to be after the task is |
5075 | * visible on the task list in case we race with the first call to | 5274 | * visible on the task list in case we race with the first call to |
5076 | * cgroup_iter_start() - to guarantee that the new task ends up on its | 5275 | * cgroup_task_iter_start() - to guarantee that the new task ends up on its |
5077 | * list. | 5276 | * list. |
5078 | */ | 5277 | */ |
5079 | void cgroup_post_fork(struct task_struct *child) | 5278 | void cgroup_post_fork(struct task_struct *child) |
@@ -5186,10 +5385,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
5186 | */ | 5385 | */ |
5187 | for_each_builtin_subsys(ss, i) { | 5386 | for_each_builtin_subsys(ss, i) { |
5188 | if (ss->exit) { | 5387 | if (ss->exit) { |
5189 | struct cgroup *old_cgrp = cset->subsys[i]->cgroup; | 5388 | struct cgroup_subsys_state *old_css = cset->subsys[i]; |
5190 | struct cgroup *cgrp = task_cgroup(tsk, i); | 5389 | struct cgroup_subsys_state *css = task_css(tsk, i); |
5191 | 5390 | ||
5192 | ss->exit(cgrp, old_cgrp, tsk); | 5391 | ss->exit(css, old_css, tsk); |
5193 | } | 5392 | } |
5194 | } | 5393 | } |
5195 | } | 5394 | } |
@@ -5448,20 +5647,16 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, | |||
5448 | return 0; | 5647 | return 0; |
5449 | } | 5648 | } |
5450 | 5649 | ||
5451 | static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, | 5650 | static int alloc_css_id(struct cgroup_subsys_state *child_css) |
5452 | struct cgroup *child) | ||
5453 | { | 5651 | { |
5454 | int subsys_id, i, depth = 0; | 5652 | struct cgroup_subsys_state *parent_css = css_parent(child_css); |
5455 | struct cgroup_subsys_state *parent_css, *child_css; | ||
5456 | struct css_id *child_id, *parent_id; | 5653 | struct css_id *child_id, *parent_id; |
5654 | int i, depth; | ||
5457 | 5655 | ||
5458 | subsys_id = ss->subsys_id; | ||
5459 | parent_css = parent->subsys[subsys_id]; | ||
5460 | child_css = child->subsys[subsys_id]; | ||
5461 | parent_id = rcu_dereference_protected(parent_css->id, true); | 5656 | parent_id = rcu_dereference_protected(parent_css->id, true); |
5462 | depth = parent_id->depth + 1; | 5657 | depth = parent_id->depth + 1; |
5463 | 5658 | ||
5464 | child_id = get_new_cssid(ss, depth); | 5659 | child_id = get_new_cssid(child_css->ss, depth); |
5465 | if (IS_ERR(child_id)) | 5660 | if (IS_ERR(child_id)) |
5466 | return PTR_ERR(child_id); | 5661 | return PTR_ERR(child_id); |
5467 | 5662 | ||
@@ -5499,31 +5694,56 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) | |||
5499 | } | 5694 | } |
5500 | EXPORT_SYMBOL_GPL(css_lookup); | 5695 | EXPORT_SYMBOL_GPL(css_lookup); |
5501 | 5696 | ||
5502 | /* | 5697 | /** |
5503 | * get corresponding css from file open on cgroupfs directory | 5698 | * css_from_dir - get corresponding css from the dentry of a cgroup dir |
5699 | * @dentry: directory dentry of interest | ||
5700 | * @ss: subsystem of interest | ||
5701 | * | ||
5702 | * Must be called under RCU read lock. The caller is responsible for | ||
5703 | * pinning the returned css if it needs to be accessed outside the RCU | ||
5704 | * critical section. | ||
5504 | */ | 5705 | */ |
5505 | struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | 5706 | struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, |
5707 | struct cgroup_subsys *ss) | ||
5506 | { | 5708 | { |
5507 | struct cgroup *cgrp; | 5709 | struct cgroup *cgrp; |
5508 | struct inode *inode; | ||
5509 | struct cgroup_subsys_state *css; | ||
5510 | 5710 | ||
5511 | inode = file_inode(f); | 5711 | WARN_ON_ONCE(!rcu_read_lock_held()); |
5512 | /* check in cgroup filesystem dir */ | 5712 | |
5513 | if (inode->i_op != &cgroup_dir_inode_operations) | 5713 | /* is @dentry a cgroup dir? */ |
5714 | if (!dentry->d_inode || | ||
5715 | dentry->d_inode->i_op != &cgroup_dir_inode_operations) | ||
5514 | return ERR_PTR(-EBADF); | 5716 | return ERR_PTR(-EBADF); |
5515 | 5717 | ||
5516 | if (id < 0 || id >= CGROUP_SUBSYS_COUNT) | 5718 | cgrp = __d_cgrp(dentry); |
5517 | return ERR_PTR(-EINVAL); | 5719 | return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); |
5720 | } | ||
5518 | 5721 | ||
5519 | /* get cgroup */ | 5722 | /** |
5520 | cgrp = __d_cgrp(f->f_dentry); | 5723 | * css_from_id - lookup css by id |
5521 | css = cgrp->subsys[id]; | 5724 | * @id: the cgroup id |
5522 | return css ? css : ERR_PTR(-ENOENT); | 5725 | * @ss: cgroup subsys to be looked into |
5726 | * | ||
5727 | * Returns the css if there's valid one with @id, otherwise returns NULL. | ||
5728 | * Should be called under rcu_read_lock(). | ||
5729 | */ | ||
5730 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) | ||
5731 | { | ||
5732 | struct cgroup *cgrp; | ||
5733 | |||
5734 | rcu_lockdep_assert(rcu_read_lock_held() || | ||
5735 | lockdep_is_held(&cgroup_mutex), | ||
5736 | "css_from_id() needs proper protection"); | ||
5737 | |||
5738 | cgrp = idr_find(&ss->root->cgroup_idr, id); | ||
5739 | if (cgrp) | ||
5740 | return cgroup_css(cgrp, ss); | ||
5741 | return NULL; | ||
5523 | } | 5742 | } |
5524 | 5743 | ||
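Both css_from_dir() and css_from_id() return pointers that are only stable under RCU, so a caller either finishes its work inside the read-side critical section or pins the css with css_tryget() before leaving it. A hypothetical helper illustrating that calling convention (the choice of freezer_subsys is arbitrary, and example_pin_css() is not part of this patch):

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

/* hypothetical helper: return a pinned css for a given cgroup id, or NULL */
static struct cgroup_subsys_state *example_pin_css(int id)
{
        struct cgroup_subsys_state *css;

        rcu_read_lock();
        css = css_from_id(id, &freezer_subsys);
        /* pin it before leaving the RCU section, or it may go away */
        if (css && !css_tryget(css))
                css = NULL;
        rcu_read_unlock();

        return css;             /* caller must css_put() when done */
}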
5525 | #ifdef CONFIG_CGROUP_DEBUG | 5744 | #ifdef CONFIG_CGROUP_DEBUG |
5526 | static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) | 5745 | static struct cgroup_subsys_state * |
5746 | debug_css_alloc(struct cgroup_subsys_state *parent_css) | ||
5527 | { | 5747 | { |
5528 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); | 5748 | struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); |
5529 | 5749 | ||
@@ -5533,22 +5753,24 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) | |||
5533 | return css; | 5753 | return css; |
5534 | } | 5754 | } |
5535 | 5755 | ||
5536 | static void debug_css_free(struct cgroup *cgrp) | 5756 | static void debug_css_free(struct cgroup_subsys_state *css) |
5537 | { | 5757 | { |
5538 | kfree(cgrp->subsys[debug_subsys_id]); | 5758 | kfree(css); |
5539 | } | 5759 | } |
5540 | 5760 | ||
5541 | static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) | 5761 | static u64 debug_taskcount_read(struct cgroup_subsys_state *css, |
5762 | struct cftype *cft) | ||
5542 | { | 5763 | { |
5543 | return cgroup_task_count(cgrp); | 5764 | return cgroup_task_count(css->cgroup); |
5544 | } | 5765 | } |
5545 | 5766 | ||
5546 | static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) | 5767 | static u64 current_css_set_read(struct cgroup_subsys_state *css, |
5768 | struct cftype *cft) | ||
5547 | { | 5769 | { |
5548 | return (u64)(unsigned long)current->cgroups; | 5770 | return (u64)(unsigned long)current->cgroups; |
5549 | } | 5771 | } |
5550 | 5772 | ||
5551 | static u64 current_css_set_refcount_read(struct cgroup *cgrp, | 5773 | static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, |
5552 | struct cftype *cft) | 5774 | struct cftype *cft) |
5553 | { | 5775 | { |
5554 | u64 count; | 5776 | u64 count; |
@@ -5559,7 +5781,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp, | |||
5559 | return count; | 5781 | return count; |
5560 | } | 5782 | } |
5561 | 5783 | ||
5562 | static int current_css_set_cg_links_read(struct cgroup *cgrp, | 5784 | static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, |
5563 | struct cftype *cft, | 5785 | struct cftype *cft, |
5564 | struct seq_file *seq) | 5786 | struct seq_file *seq) |
5565 | { | 5787 | { |
@@ -5586,14 +5808,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp, | |||
5586 | } | 5808 | } |
5587 | 5809 | ||
5588 | #define MAX_TASKS_SHOWN_PER_CSS 25 | 5810 | #define MAX_TASKS_SHOWN_PER_CSS 25 |
5589 | static int cgroup_css_links_read(struct cgroup *cgrp, | 5811 | static int cgroup_css_links_read(struct cgroup_subsys_state *css, |
5590 | struct cftype *cft, | 5812 | struct cftype *cft, struct seq_file *seq) |
5591 | struct seq_file *seq) | ||
5592 | { | 5813 | { |
5593 | struct cgrp_cset_link *link; | 5814 | struct cgrp_cset_link *link; |
5594 | 5815 | ||
5595 | read_lock(&css_set_lock); | 5816 | read_lock(&css_set_lock); |
5596 | list_for_each_entry(link, &cgrp->cset_links, cset_link) { | 5817 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { |
5597 | struct css_set *cset = link->cset; | 5818 | struct css_set *cset = link->cset; |
5598 | struct task_struct *task; | 5819 | struct task_struct *task; |
5599 | int count = 0; | 5820 | int count = 0; |
@@ -5612,9 +5833,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp, | |||
5612 | return 0; | 5833 | return 0; |
5613 | } | 5834 | } |
5614 | 5835 | ||
5615 | static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) | 5836 | static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) |
5616 | { | 5837 | { |
5617 | return test_bit(CGRP_RELEASABLE, &cgrp->flags); | 5838 | return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); |
5618 | } | 5839 | } |
5619 | 5840 | ||
5620 | static struct cftype debug_files[] = { | 5841 | static struct cftype debug_files[] = { |
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 75dda1ea5026..f0ff64d0ebaa 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -45,25 +45,19 @@ struct freezer { | |||
45 | spinlock_t lock; | 45 | spinlock_t lock; |
46 | }; | 46 | }; |
47 | 47 | ||
48 | static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) | 48 | static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) |
49 | { | 49 | { |
50 | return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), | 50 | return css ? container_of(css, struct freezer, css) : NULL; |
51 | struct freezer, css); | ||
52 | } | 51 | } |
53 | 52 | ||
54 | static inline struct freezer *task_freezer(struct task_struct *task) | 53 | static inline struct freezer *task_freezer(struct task_struct *task) |
55 | { | 54 | { |
56 | return container_of(task_subsys_state(task, freezer_subsys_id), | 55 | return css_freezer(task_css(task, freezer_subsys_id)); |
57 | struct freezer, css); | ||
58 | } | 56 | } |
59 | 57 | ||
60 | static struct freezer *parent_freezer(struct freezer *freezer) | 58 | static struct freezer *parent_freezer(struct freezer *freezer) |
61 | { | 59 | { |
62 | struct cgroup *pcg = freezer->css.cgroup->parent; | 60 | return css_freezer(css_parent(&freezer->css)); |
63 | |||
64 | if (pcg) | ||
65 | return cgroup_freezer(pcg); | ||
66 | return NULL; | ||
67 | } | 61 | } |
68 | 62 | ||
69 | bool cgroup_freezing(struct task_struct *task) | 63 | bool cgroup_freezing(struct task_struct *task) |
@@ -92,7 +86,8 @@ static const char *freezer_state_strs(unsigned int state) | |||
92 | 86 | ||
93 | struct cgroup_subsys freezer_subsys; | 87 | struct cgroup_subsys freezer_subsys; |
94 | 88 | ||
95 | static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) | 89 | static struct cgroup_subsys_state * |
90 | freezer_css_alloc(struct cgroup_subsys_state *parent_css) | ||
96 | { | 91 | { |
97 | struct freezer *freezer; | 92 | struct freezer *freezer; |
98 | 93 | ||
@@ -105,22 +100,22 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) | |||
105 | } | 100 | } |
106 | 101 | ||
107 | /** | 102 | /** |
108 | * freezer_css_online - commit creation of a freezer cgroup | 103 | * freezer_css_online - commit creation of a freezer css |
109 | * @cgroup: cgroup being created | 104 | * @css: css being created |
110 | * | 105 | * |
111 | * We're committing to creation of @cgroup. Mark it online and inherit | 106 | * We're committing to creation of @css. Mark it online and inherit |
112 | * parent's freezing state while holding both parent's and our | 107 | * parent's freezing state while holding both parent's and our |
113 | * freezer->lock. | 108 | * freezer->lock. |
114 | */ | 109 | */ |
115 | static int freezer_css_online(struct cgroup *cgroup) | 110 | static int freezer_css_online(struct cgroup_subsys_state *css) |
116 | { | 111 | { |
117 | struct freezer *freezer = cgroup_freezer(cgroup); | 112 | struct freezer *freezer = css_freezer(css); |
118 | struct freezer *parent = parent_freezer(freezer); | 113 | struct freezer *parent = parent_freezer(freezer); |
119 | 114 | ||
120 | /* | 115 | /* |
121 | * The following double locking and freezing state inheritance | 116 | * The following double locking and freezing state inheritance |
122 | * guarantee that @cgroup can never escape ancestors' freezing | 117 | * guarantee that @cgroup can never escape ancestors' freezing |
123 | * states. See cgroup_for_each_descendant_pre() for details. | 118 | * states. See css_for_each_descendant_pre() for details. |
124 | */ | 119 | */ |
125 | if (parent) | 120 | if (parent) |
126 | spin_lock_irq(&parent->lock); | 121 | spin_lock_irq(&parent->lock); |
@@ -141,15 +136,15 @@ static int freezer_css_online(struct cgroup *cgroup) | |||
141 | } | 136 | } |
142 | 137 | ||
143 | /** | 138 | /** |
144 | * freezer_css_offline - initiate destruction of @cgroup | 139 | * freezer_css_offline - initiate destruction of a freezer css |
145 | * @cgroup: cgroup being destroyed | 140 | * @css: css being destroyed |
146 | * | 141 | * |
147 | * @cgroup is going away. Mark it dead and decrement system_freezing_count | 142 | * @css is going away. Mark it dead and decrement system_freezing_count if |
148 | * if it was holding one. | 143 | * it was holding one. |
149 | */ | 144 | */ |
150 | static void freezer_css_offline(struct cgroup *cgroup) | 145 | static void freezer_css_offline(struct cgroup_subsys_state *css) |
151 | { | 146 | { |
152 | struct freezer *freezer = cgroup_freezer(cgroup); | 147 | struct freezer *freezer = css_freezer(css); |
153 | 148 | ||
154 | spin_lock_irq(&freezer->lock); | 149 | spin_lock_irq(&freezer->lock); |
155 | 150 | ||
@@ -161,9 +156,9 @@ static void freezer_css_offline(struct cgroup *cgroup) | |||
161 | spin_unlock_irq(&freezer->lock); | 156 | spin_unlock_irq(&freezer->lock); |
162 | } | 157 | } |
163 | 158 | ||
164 | static void freezer_css_free(struct cgroup *cgroup) | 159 | static void freezer_css_free(struct cgroup_subsys_state *css) |
165 | { | 160 | { |
166 | kfree(cgroup_freezer(cgroup)); | 161 | kfree(css_freezer(css)); |
167 | } | 162 | } |
168 | 163 | ||
169 | /* | 164 | /* |
@@ -175,25 +170,26 @@ static void freezer_css_free(struct cgroup *cgroup) | |||
175 | * @freezer->lock. freezer_attach() makes the new tasks conform to the | 170 | * @freezer->lock. freezer_attach() makes the new tasks conform to the |
176 | * current state and all following state changes can see the new tasks. | 171 | * current state and all following state changes can see the new tasks. |
177 | */ | 172 | */ |
178 | static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) | 173 | static void freezer_attach(struct cgroup_subsys_state *new_css, |
174 | struct cgroup_taskset *tset) | ||
179 | { | 175 | { |
180 | struct freezer *freezer = cgroup_freezer(new_cgrp); | 176 | struct freezer *freezer = css_freezer(new_css); |
181 | struct task_struct *task; | 177 | struct task_struct *task; |
182 | bool clear_frozen = false; | 178 | bool clear_frozen = false; |
183 | 179 | ||
184 | spin_lock_irq(&freezer->lock); | 180 | spin_lock_irq(&freezer->lock); |
185 | 181 | ||
186 | /* | 182 | /* |
187 | * Make the new tasks conform to the current state of @new_cgrp. | 183 | * Make the new tasks conform to the current state of @new_css. |
188 | * For simplicity, when migrating any task to a FROZEN cgroup, we | 184 | * For simplicity, when migrating any task to a FROZEN cgroup, we |
189 | * revert it to FREEZING and let update_if_frozen() determine the | 185 | * revert it to FREEZING and let update_if_frozen() determine the |
190 | * correct state later. | 186 | * correct state later. |
191 | * | 187 | * |
192 | * Tasks in @tset are on @new_cgrp but may not conform to its | 188 | * Tasks in @tset are on @new_css but may not conform to its |
193 | * current state before executing the following - !frozen tasks may | 189 | * current state before executing the following - !frozen tasks may |
194 | * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. | 190 | * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. |
195 | */ | 191 | */ |
196 | cgroup_taskset_for_each(task, new_cgrp, tset) { | 192 | cgroup_taskset_for_each(task, new_css, tset) { |
197 | if (!(freezer->state & CGROUP_FREEZING)) { | 193 | if (!(freezer->state & CGROUP_FREEZING)) { |
198 | __thaw_task(task); | 194 | __thaw_task(task); |
199 | } else { | 195 | } else { |
@@ -231,7 +227,7 @@ static void freezer_fork(struct task_struct *task) | |||
231 | * The root cgroup is non-freezable, so we can skip the | 227 | * The root cgroup is non-freezable, so we can skip the |
232 | * following check. | 228 | * following check. |
233 | */ | 229 | */ |
234 | if (!freezer->css.cgroup->parent) | 230 | if (!parent_freezer(freezer)) |
235 | goto out; | 231 | goto out; |
236 | 232 | ||
237 | spin_lock_irq(&freezer->lock); | 233 | spin_lock_irq(&freezer->lock); |
@@ -244,7 +240,7 @@ out: | |||
244 | 240 | ||
245 | /** | 241 | /** |
246 | * update_if_frozen - update whether a cgroup finished freezing | 242 | * update_if_frozen - update whether a cgroup finished freezing |
247 | * @cgroup: cgroup of interest | 243 | * @css: css of interest |
248 | * | 244 | * |
249 | * Once FREEZING is initiated, transition to FROZEN is lazily updated by | 245 | * Once FREEZING is initiated, transition to FROZEN is lazily updated by |
250 | * calling this function. If the current state is FREEZING but not FROZEN, | 246 | * calling this function. If the current state is FREEZING but not FROZEN, |
@@ -255,14 +251,14 @@ out: | |||
255 | * update_if_frozen() on all descendants prior to invoking this function. | 251 | * update_if_frozen() on all descendants prior to invoking this function. |
256 | * | 252 | * |
257 | * Task states and freezer state might disagree while tasks are being | 253 | * Task states and freezer state might disagree while tasks are being |
258 | * migrated into or out of @cgroup, so we can't verify task states against | 254 | * migrated into or out of @css, so we can't verify task states against |
259 | * @freezer state here. See freezer_attach() for details. | 255 | * @freezer state here. See freezer_attach() for details. |
260 | */ | 256 | */ |
261 | static void update_if_frozen(struct cgroup *cgroup) | 257 | static void update_if_frozen(struct cgroup_subsys_state *css) |
262 | { | 258 | { |
263 | struct freezer *freezer = cgroup_freezer(cgroup); | 259 | struct freezer *freezer = css_freezer(css); |
264 | struct cgroup *pos; | 260 | struct cgroup_subsys_state *pos; |
265 | struct cgroup_iter it; | 261 | struct css_task_iter it; |
266 | struct task_struct *task; | 262 | struct task_struct *task; |
267 | 263 | ||
268 | WARN_ON_ONCE(!rcu_read_lock_held()); | 264 | WARN_ON_ONCE(!rcu_read_lock_held()); |
@@ -274,8 +270,8 @@ static void update_if_frozen(struct cgroup *cgroup) | |||
274 | goto out_unlock; | 270 | goto out_unlock; |
275 | 271 | ||
276 | /* are all (live) children frozen? */ | 272 | /* are all (live) children frozen? */ |
277 | cgroup_for_each_child(pos, cgroup) { | 273 | css_for_each_child(pos, css) { |
278 | struct freezer *child = cgroup_freezer(pos); | 274 | struct freezer *child = css_freezer(pos); |
279 | 275 | ||
280 | if ((child->state & CGROUP_FREEZER_ONLINE) && | 276 | if ((child->state & CGROUP_FREEZER_ONLINE) && |
281 | !(child->state & CGROUP_FROZEN)) | 277 | !(child->state & CGROUP_FROZEN)) |
@@ -283,9 +279,9 @@ static void update_if_frozen(struct cgroup *cgroup) | |||
283 | } | 279 | } |
284 | 280 | ||
285 | /* are all tasks frozen? */ | 281 | /* are all tasks frozen? */ |
286 | cgroup_iter_start(cgroup, &it); | 282 | css_task_iter_start(css, &it); |
287 | 283 | ||
288 | while ((task = cgroup_iter_next(cgroup, &it))) { | 284 | while ((task = css_task_iter_next(&it))) { |
289 | if (freezing(task)) { | 285 | if (freezing(task)) { |
290 | /* | 286 | /* |
291 | * freezer_should_skip() indicates that the task | 287 | * freezer_should_skip() indicates that the task |
@@ -300,52 +296,49 @@ static void update_if_frozen(struct cgroup *cgroup) | |||
300 | 296 | ||
301 | freezer->state |= CGROUP_FROZEN; | 297 | freezer->state |= CGROUP_FROZEN; |
302 | out_iter_end: | 298 | out_iter_end: |
303 | cgroup_iter_end(cgroup, &it); | 299 | css_task_iter_end(&it); |
304 | out_unlock: | 300 | out_unlock: |
305 | spin_unlock_irq(&freezer->lock); | 301 | spin_unlock_irq(&freezer->lock); |
306 | } | 302 | } |
307 | 303 | ||
308 | static int freezer_read(struct cgroup *cgroup, struct cftype *cft, | 304 | static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, |
309 | struct seq_file *m) | 305 | struct seq_file *m) |
310 | { | 306 | { |
311 | struct cgroup *pos; | 307 | struct cgroup_subsys_state *pos; |
312 | 308 | ||
313 | rcu_read_lock(); | 309 | rcu_read_lock(); |
314 | 310 | ||
315 | /* update states bottom-up */ | 311 | /* update states bottom-up */ |
316 | cgroup_for_each_descendant_post(pos, cgroup) | 312 | css_for_each_descendant_post(pos, css) |
317 | update_if_frozen(pos); | 313 | update_if_frozen(pos); |
318 | update_if_frozen(cgroup); | ||
319 | 314 | ||
320 | rcu_read_unlock(); | 315 | rcu_read_unlock(); |
321 | 316 | ||
322 | seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); | 317 | seq_puts(m, freezer_state_strs(css_freezer(css)->state)); |
323 | seq_putc(m, '\n'); | 318 | seq_putc(m, '\n'); |
324 | return 0; | 319 | return 0; |
325 | } | 320 | } |
326 | 321 | ||
327 | static void freeze_cgroup(struct freezer *freezer) | 322 | static void freeze_cgroup(struct freezer *freezer) |
328 | { | 323 | { |
329 | struct cgroup *cgroup = freezer->css.cgroup; | 324 | struct css_task_iter it; |
330 | struct cgroup_iter it; | ||
331 | struct task_struct *task; | 325 | struct task_struct *task; |
332 | 326 | ||
333 | cgroup_iter_start(cgroup, &it); | 327 | css_task_iter_start(&freezer->css, &it); |
334 | while ((task = cgroup_iter_next(cgroup, &it))) | 328 | while ((task = css_task_iter_next(&it))) |
335 | freeze_task(task); | 329 | freeze_task(task); |
336 | cgroup_iter_end(cgroup, &it); | 330 | css_task_iter_end(&it); |
337 | } | 331 | } |
338 | 332 | ||
339 | static void unfreeze_cgroup(struct freezer *freezer) | 333 | static void unfreeze_cgroup(struct freezer *freezer) |
340 | { | 334 | { |
341 | struct cgroup *cgroup = freezer->css.cgroup; | 335 | struct css_task_iter it; |
342 | struct cgroup_iter it; | ||
343 | struct task_struct *task; | 336 | struct task_struct *task; |
344 | 337 | ||
345 | cgroup_iter_start(cgroup, &it); | 338 | css_task_iter_start(&freezer->css, &it); |
346 | while ((task = cgroup_iter_next(cgroup, &it))) | 339 | while ((task = css_task_iter_next(&it))) |
347 | __thaw_task(task); | 340 | __thaw_task(task); |
348 | cgroup_iter_end(cgroup, &it); | 341 | css_task_iter_end(&it); |
349 | } | 342 | } |
350 | 343 | ||
351 | /** | 344 | /** |
@@ -395,12 +388,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, | |||
395 | */ | 388 | */ |
396 | static void freezer_change_state(struct freezer *freezer, bool freeze) | 389 | static void freezer_change_state(struct freezer *freezer, bool freeze) |
397 | { | 390 | { |
398 | struct cgroup *pos; | 391 | struct cgroup_subsys_state *pos; |
399 | |||
400 | /* update @freezer */ | ||
401 | spin_lock_irq(&freezer->lock); | ||
402 | freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); | ||
403 | spin_unlock_irq(&freezer->lock); | ||
404 | 392 | ||
405 | /* | 393 | /* |
406 | * Update all its descendants in pre-order traversal. Each | 394 | * Update all its descendants in pre-order traversal. Each |
@@ -408,24 +396,33 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) | |||
408 | * CGROUP_FREEZING_PARENT. | 396 | * CGROUP_FREEZING_PARENT. |
409 | */ | 397 | */ |
410 | rcu_read_lock(); | 398 | rcu_read_lock(); |
411 | cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { | 399 | css_for_each_descendant_pre(pos, &freezer->css) { |
412 | struct freezer *pos_f = cgroup_freezer(pos); | 400 | struct freezer *pos_f = css_freezer(pos); |
413 | struct freezer *parent = parent_freezer(pos_f); | 401 | struct freezer *parent = parent_freezer(pos_f); |
414 | 402 | ||
415 | /* | ||
416 | * Our update to @parent->state is already visible which is | ||
417 | * all we need. No need to lock @parent. For more info on | ||
418 | * synchronization, see freezer_post_create(). | ||
419 | */ | ||
420 | spin_lock_irq(&pos_f->lock); | 403 | spin_lock_irq(&pos_f->lock); |
421 | freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, | 404 | |
422 | CGROUP_FREEZING_PARENT); | 405 | if (pos_f == freezer) { |
406 | freezer_apply_state(pos_f, freeze, | ||
407 | CGROUP_FREEZING_SELF); | ||
408 | } else { | ||
409 | /* | ||
410 | * Our update to @parent->state is already visible | ||
411 | * which is all we need. No need to lock @parent. | ||
412 | * For more info on synchronization, see | ||
413 | * freezer_post_create(). | ||
414 | */ | ||
415 | freezer_apply_state(pos_f, | ||
416 | parent->state & CGROUP_FREEZING, | ||
417 | CGROUP_FREEZING_PARENT); | ||
418 | } | ||
419 | |||
423 | spin_unlock_irq(&pos_f->lock); | 420 | spin_unlock_irq(&pos_f->lock); |
424 | } | 421 | } |
425 | rcu_read_unlock(); | 422 | rcu_read_unlock(); |
426 | } | 423 | } |
427 | 424 | ||
428 | static int freezer_write(struct cgroup *cgroup, struct cftype *cft, | 425 | static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, |
429 | const char *buffer) | 426 | const char *buffer) |
430 | { | 427 | { |
431 | bool freeze; | 428 | bool freeze; |
@@ -437,20 +434,22 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft, | |||
437 | else | 434 | else |
438 | return -EINVAL; | 435 | return -EINVAL; |
439 | 436 | ||
440 | freezer_change_state(cgroup_freezer(cgroup), freeze); | 437 | freezer_change_state(css_freezer(css), freeze); |
441 | return 0; | 438 | return 0; |
442 | } | 439 | } |
443 | 440 | ||
444 | static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) | 441 | static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, |
442 | struct cftype *cft) | ||
445 | { | 443 | { |
446 | struct freezer *freezer = cgroup_freezer(cgroup); | 444 | struct freezer *freezer = css_freezer(css); |
447 | 445 | ||
448 | return (bool)(freezer->state & CGROUP_FREEZING_SELF); | 446 | return (bool)(freezer->state & CGROUP_FREEZING_SELF); |
449 | } | 447 | } |
450 | 448 | ||
451 | static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) | 449 | static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, |
450 | struct cftype *cft) | ||
452 | { | 451 | { |
453 | struct freezer *freezer = cgroup_freezer(cgroup); | 452 | struct freezer *freezer = css_freezer(css); |
454 | 453 | ||
455 | return (bool)(freezer->state & CGROUP_FREEZING_PARENT); | 454 | return (bool)(freezer->state & CGROUP_FREEZING_PARENT); |
456 | } | 455 | } |
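[Editor's note] The freezer hunks above all follow one mechanical pattern: every subsystem callback now takes a struct cgroup_subsys_state * instead of a struct cgroup *, and the controller state is reached through a container_of() helper on the embedded css (the same shape as css_cs() in the cpuset part of this series). A minimal sketch of that shape for a hypothetical controller; struct demo, css_demo() and the demo_* callbacks are illustrative names, not part of this patch:

#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/err.h>

struct demo {
        struct cgroup_subsys_state css;         /* embedded css */
        int some_state;
};

/* same shape as css_freezer()/css_cs() used in this series */
static inline struct demo *css_demo(struct cgroup_subsys_state *css)
{
        return css ? container_of(css, struct demo, css) : NULL;
}

static struct cgroup_subsys_state *
demo_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct demo *d = kzalloc(sizeof(*d), GFP_KERNEL);

        /* css_alloc reports failure with ERR_PTR(), as freezer_css_alloc() does */
        return d ? &d->css : ERR_PTR(-ENOMEM);
}

static void demo_css_free(struct cgroup_subsys_state *css)
{
        kfree(css_demo(css));
}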
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 383f8231e436..247091bf0587 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c | |||
@@ -20,22 +20,33 @@ | |||
20 | #include <linux/hardirq.h> | 20 | #include <linux/hardirq.h> |
21 | #include <linux/export.h> | 21 | #include <linux/export.h> |
22 | 22 | ||
23 | DEFINE_PER_CPU(struct context_tracking, context_tracking) = { | 23 | #define CREATE_TRACE_POINTS |
24 | #ifdef CONFIG_CONTEXT_TRACKING_FORCE | 24 | #include <trace/events/context_tracking.h> |
25 | .active = true, | 25 | |
26 | #endif | 26 | struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE; |
27 | }; | 27 | EXPORT_SYMBOL_GPL(context_tracking_enabled); |
28 | |||
29 | DEFINE_PER_CPU(struct context_tracking, context_tracking); | ||
30 | EXPORT_SYMBOL_GPL(context_tracking); | ||
31 | |||
32 | void context_tracking_cpu_set(int cpu) | ||
33 | { | ||
34 | if (!per_cpu(context_tracking.active, cpu)) { | ||
35 | per_cpu(context_tracking.active, cpu) = true; | ||
36 | static_key_slow_inc(&context_tracking_enabled); | ||
37 | } | ||
38 | } | ||
28 | 39 | ||
29 | /** | 40 | /** |
30 | * user_enter - Inform the context tracking that the CPU is going to | 41 | * context_tracking_user_enter - Inform the context tracking that the CPU is going to |
31 | * enter userspace mode. | 42 | * enter userspace mode. |
32 | * | 43 | * |
33 | * This function must be called right before we switch from the kernel | 44 | * This function must be called right before we switch from the kernel |
34 | * to userspace, when it's guaranteed the remaining kernel instructions | 45 | * to userspace, when it's guaranteed the remaining kernel instructions |
35 | * to execute won't use any RCU read side critical section because this | 46 | * to execute won't use any RCU read side critical section because this |
36 | * function sets RCU in extended quiescent state. | 47 | * function sets RCU in extended quiescent state. |
37 | */ | 48 | */ |
38 | void user_enter(void) | 49 | void context_tracking_user_enter(void) |
39 | { | 50 | { |
40 | unsigned long flags; | 51 | unsigned long flags; |
41 | 52 | ||
@@ -54,17 +65,32 @@ void user_enter(void) | |||
54 | WARN_ON_ONCE(!current->mm); | 65 | WARN_ON_ONCE(!current->mm); |
55 | 66 | ||
56 | local_irq_save(flags); | 67 | local_irq_save(flags); |
57 | if (__this_cpu_read(context_tracking.active) && | 68 | if ( __this_cpu_read(context_tracking.state) != IN_USER) { |
58 | __this_cpu_read(context_tracking.state) != IN_USER) { | 69 | if (__this_cpu_read(context_tracking.active)) { |
70 | trace_user_enter(0); | ||
71 | /* | ||
72 | * At this stage, only low level arch entry code remains and | ||
73 | * then we'll run in userspace. We can assume there won't be | ||
74 | * any RCU read-side critical section until the next call to | ||
75 | * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency | ||
76 | * on the tick. | ||
77 | */ | ||
78 | vtime_user_enter(current); | ||
79 | rcu_user_enter(); | ||
80 | } | ||
59 | /* | 81 | /* |
60 | * At this stage, only low level arch entry code remains and | 82 | * Even if context tracking is disabled on this CPU, because it's outside |
61 | * then we'll run in userspace. We can assume there won't be | 83 | * the full dynticks mask for example, we still have to keep track of the |
62 | * any RCU read-side critical section until the next call to | 84 | * context transitions and states to prevent inconsistency on those of |
63 | * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency | 85 | * other CPUs. |
64 | * on the tick. | 86 | * If a task triggers an exception in userspace, sleep on the exception |
87 | * handler and then migrate to another CPU, that new CPU must know where | ||
88 | * the exception returns by the time we call exception_exit(). | ||
89 | * This information can only be provided by the previous CPU when it called | ||
90 | * exception_enter(). | ||
91 | * OTOH we can spare the calls to vtime and RCU when context_tracking.active | ||
92 | * is false because we know that CPU is not tickless. | ||
65 | */ | 93 | */ |
66 | vtime_user_enter(current); | ||
67 | rcu_user_enter(); | ||
68 | __this_cpu_write(context_tracking.state, IN_USER); | 94 | __this_cpu_write(context_tracking.state, IN_USER); |
69 | } | 95 | } |
70 | local_irq_restore(flags); | 96 | local_irq_restore(flags); |
@@ -87,10 +113,9 @@ void user_enter(void) | |||
87 | */ | 113 | */ |
88 | void __sched notrace preempt_schedule_context(void) | 114 | void __sched notrace preempt_schedule_context(void) |
89 | { | 115 | { |
90 | struct thread_info *ti = current_thread_info(); | ||
91 | enum ctx_state prev_ctx; | 116 | enum ctx_state prev_ctx; |
92 | 117 | ||
93 | if (likely(ti->preempt_count || irqs_disabled())) | 118 | if (likely(!preemptible())) |
94 | return; | 119 | return; |
95 | 120 | ||
96 | /* | 121 | /* |
@@ -112,8 +137,8 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context); | |||
112 | #endif /* CONFIG_PREEMPT */ | 137 | #endif /* CONFIG_PREEMPT */ |
113 | 138 | ||
114 | /** | 139 | /** |
115 | * user_exit - Inform the context tracking that the CPU is | 140 | * context_tracking_user_exit - Inform the context tracking that the CPU is |
116 | * exiting userspace mode and entering the kernel. | 141 | * exiting userspace mode and entering the kernel. |
117 | * | 142 | * |
118 | * This function must be called after we entered the kernel from userspace | 143 | * This function must be called after we entered the kernel from userspace |
119 | * before any use of RCU read side critical section. This potentially include | 144 | * before any use of RCU read side critical section. This potentially include |
@@ -122,7 +147,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context); | |||
122 | * This call supports re-entrancy. This way it can be called from any exception | 147 | * This call supports re-entrancy. This way it can be called from any exception |
123 | * handler without needing to know if we came from userspace or not. | 148 | * handler without needing to know if we came from userspace or not. |
124 | */ | 149 | */ |
125 | void user_exit(void) | 150 | void context_tracking_user_exit(void) |
126 | { | 151 | { |
127 | unsigned long flags; | 152 | unsigned long flags; |
128 | 153 | ||
@@ -131,38 +156,22 @@ void user_exit(void) | |||
131 | 156 | ||
132 | local_irq_save(flags); | 157 | local_irq_save(flags); |
133 | if (__this_cpu_read(context_tracking.state) == IN_USER) { | 158 | if (__this_cpu_read(context_tracking.state) == IN_USER) { |
134 | /* | 159 | if (__this_cpu_read(context_tracking.active)) { |
135 | * We are going to run code that may use RCU. Inform | 160 | /* |
136 | * RCU core about that (ie: we may need the tick again). | 161 | * We are going to run code that may use RCU. Inform |
137 | */ | 162 | * RCU core about that (ie: we may need the tick again). |
138 | rcu_user_exit(); | 163 | */ |
139 | vtime_user_exit(current); | 164 | rcu_user_exit(); |
165 | vtime_user_exit(current); | ||
166 | trace_user_exit(0); | ||
167 | } | ||
140 | __this_cpu_write(context_tracking.state, IN_KERNEL); | 168 | __this_cpu_write(context_tracking.state, IN_KERNEL); |
141 | } | 169 | } |
142 | local_irq_restore(flags); | 170 | local_irq_restore(flags); |
143 | } | 171 | } |
144 | 172 | ||
145 | void guest_enter(void) | ||
146 | { | ||
147 | if (vtime_accounting_enabled()) | ||
148 | vtime_guest_enter(current); | ||
149 | else | ||
150 | __guest_enter(); | ||
151 | } | ||
152 | EXPORT_SYMBOL_GPL(guest_enter); | ||
153 | |||
154 | void guest_exit(void) | ||
155 | { | ||
156 | if (vtime_accounting_enabled()) | ||
157 | vtime_guest_exit(current); | ||
158 | else | ||
159 | __guest_exit(); | ||
160 | } | ||
161 | EXPORT_SYMBOL_GPL(guest_exit); | ||
162 | |||
163 | |||
164 | /** | 173 | /** |
165 | * context_tracking_task_switch - context switch the syscall callbacks | 174 | * __context_tracking_task_switch - context switch the syscall callbacks |
166 | * @prev: the task that is being switched out | 175 | * @prev: the task that is being switched out |
167 | * @next: the task that is being switched in | 176 | * @next: the task that is being switched in |
168 | * | 177 | * |
@@ -174,11 +183,19 @@ EXPORT_SYMBOL_GPL(guest_exit); | |||
174 | * migrate to some CPU that doesn't do the context tracking. As such the TIF | 183 | * migrate to some CPU that doesn't do the context tracking. As such the TIF |
175 | * flag may not be desired there. | 184 | * flag may not be desired there. |
176 | */ | 185 | */ |
177 | void context_tracking_task_switch(struct task_struct *prev, | 186 | void __context_tracking_task_switch(struct task_struct *prev, |
178 | struct task_struct *next) | 187 | struct task_struct *next) |
179 | { | 188 | { |
180 | if (__this_cpu_read(context_tracking.active)) { | 189 | clear_tsk_thread_flag(prev, TIF_NOHZ); |
181 | clear_tsk_thread_flag(prev, TIF_NOHZ); | 190 | set_tsk_thread_flag(next, TIF_NOHZ); |
182 | set_tsk_thread_flag(next, TIF_NOHZ); | ||
183 | } | ||
184 | } | 191 | } |
192 | |||
193 | #ifdef CONFIG_CONTEXT_TRACKING_FORCE | ||
194 | void __init context_tracking_init(void) | ||
195 | { | ||
196 | int cpu; | ||
197 | |||
198 | for_each_possible_cpu(cpu) | ||
199 | context_tracking_cpu_set(cpu); | ||
200 | } | ||
201 | #endif | ||
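[Editor's note] With context_tracking_enabled exported as a static key and the entry points renamed to context_tracking_user_enter()/_exit(), the expected counterpart (it lives in the headers, not in this file) is a pair of thin inline wrappers that keep the old user_enter()/user_exit() names and compile down to a patched-out branch when no CPU has tracking enabled. A hedged sketch of what such wrappers look like, assuming the declarations below are made available by the headers:

#include <linux/jump_label.h>

extern struct static_key context_tracking_enabled;
extern void context_tracking_user_enter(void);
extern void context_tracking_user_exit(void);

static inline bool context_tracking_is_enabled(void)
{
        /* false by default; flipped once by context_tracking_cpu_set() */
        return static_key_false(&context_tracking_enabled);
}

static inline void user_enter(void)
{
        if (context_tracking_is_enabled())
                context_tracking_user_enter();
}

static inline void user_exit(void)
{
        if (context_tracking_is_enabled())
                context_tracking_user_exit();
}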
diff --git a/kernel/cpu.c b/kernel/cpu.c index 198a38883e64..d7f07a2da5a6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus); | |||
113 | * get_online_cpus() not an api which is called all that often. | 113 | * get_online_cpus() not an api which is called all that often. |
114 | * | 114 | * |
115 | */ | 115 | */ |
116 | static void cpu_hotplug_begin(void) | 116 | void cpu_hotplug_begin(void) |
117 | { | 117 | { |
118 | cpu_hotplug.active_writer = current; | 118 | cpu_hotplug.active_writer = current; |
119 | 119 | ||
@@ -127,7 +127,7 @@ static void cpu_hotplug_begin(void) | |||
127 | } | 127 | } |
128 | } | 128 | } |
129 | 129 | ||
130 | static void cpu_hotplug_done(void) | 130 | void cpu_hotplug_done(void) |
131 | { | 131 | { |
132 | cpu_hotplug.active_writer = NULL; | 132 | cpu_hotplug.active_writer = NULL; |
133 | mutex_unlock(&cpu_hotplug.lock); | 133 | mutex_unlock(&cpu_hotplug.lock); |
@@ -154,10 +154,7 @@ void cpu_hotplug_enable(void) | |||
154 | cpu_maps_update_done(); | 154 | cpu_maps_update_done(); |
155 | } | 155 | } |
156 | 156 | ||
157 | #else /* #if CONFIG_HOTPLUG_CPU */ | 157 | #endif /* CONFIG_HOTPLUG_CPU */ |
158 | static void cpu_hotplug_begin(void) {} | ||
159 | static void cpu_hotplug_done(void) {} | ||
160 | #endif /* #else #if CONFIG_HOTPLUG_CPU */ | ||
161 | 158 | ||
162 | /* Need to know about CPUs going up/down? */ | 159 | /* Need to know about CPUs going up/down? */ |
163 | int __ref register_cpu_notifier(struct notifier_block *nb) | 160 | int __ref register_cpu_notifier(struct notifier_block *nb) |
@@ -366,7 +363,7 @@ EXPORT_SYMBOL(cpu_down); | |||
366 | #endif /*CONFIG_HOTPLUG_CPU*/ | 363 | #endif /*CONFIG_HOTPLUG_CPU*/ |
367 | 364 | ||
368 | /* Requires cpu_add_remove_lock to be held */ | 365 | /* Requires cpu_add_remove_lock to be held */ |
369 | static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | 366 | static int _cpu_up(unsigned int cpu, int tasks_frozen) |
370 | { | 367 | { |
371 | int ret, nr_calls = 0; | 368 | int ret, nr_calls = 0; |
372 | void *hcpu = (void *)(long)cpu; | 369 | void *hcpu = (void *)(long)cpu; |
@@ -419,7 +416,7 @@ out: | |||
419 | return ret; | 416 | return ret; |
420 | } | 417 | } |
421 | 418 | ||
422 | int __cpuinit cpu_up(unsigned int cpu) | 419 | int cpu_up(unsigned int cpu) |
423 | { | 420 | { |
424 | int err = 0; | 421 | int err = 0; |
425 | 422 | ||
@@ -618,7 +615,7 @@ core_initcall(cpu_hotplug_pm_sync_init); | |||
618 | * It must be called by the arch code on the new cpu, before the new cpu | 615 | * It must be called by the arch code on the new cpu, before the new cpu |
619 | * enables interrupts and before the "boot" cpu returns from __cpu_up(). | 616 | * enables interrupts and before the "boot" cpu returns from __cpu_up(). |
620 | */ | 617 | */ |
621 | void __cpuinit notify_cpu_starting(unsigned int cpu) | 618 | void notify_cpu_starting(unsigned int cpu) |
622 | { | 619 | { |
623 | unsigned long val = CPU_STARTING; | 620 | unsigned long val = CPU_STARTING; |
624 | 621 | ||
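[Editor's note] cpu_hotplug_begin()/cpu_hotplug_done() lose their static qualifier (and the !CONFIG_HOTPLUG_CPU stubs), so code outside kernel/cpu.c can exclude hotplug writers the same way _cpu_up() does. Illustrative use only: demo_rebuild_cpu_state() is a made-up caller, and it assumes the declarations are exported via <linux/cpu.h> as part of this change; the bracketing with cpu_maps_update_begin()/done() mirrors the existing callers in this file:

#include <linux/cpu.h>

/* hypothetical caller; must not run from within a hotplug notifier */
static void demo_rebuild_cpu_state(void)
{
        cpu_maps_update_begin();        /* takes cpu_add_remove_lock */
        cpu_hotplug_begin();            /* waits out get_online_cpus() readers */

        /* ... rebuild per-CPU state while no CPU can come or go ... */

        cpu_hotplug_done();
        cpu_maps_update_done();
}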
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e5657788fedd..6bf981e13c43 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -68,10 +68,6 @@ | |||
68 | */ | 68 | */ |
69 | int number_of_cpusets __read_mostly; | 69 | int number_of_cpusets __read_mostly; |
70 | 70 | ||
71 | /* Forward declare cgroup structures */ | ||
72 | struct cgroup_subsys cpuset_subsys; | ||
73 | struct cpuset; | ||
74 | |||
75 | /* See "Frequency meter" comments, below. */ | 71 | /* See "Frequency meter" comments, below. */ |
76 | 72 | ||
77 | struct fmeter { | 73 | struct fmeter { |
@@ -115,27 +111,20 @@ struct cpuset { | |||
115 | int relax_domain_level; | 111 | int relax_domain_level; |
116 | }; | 112 | }; |
117 | 113 | ||
118 | /* Retrieve the cpuset for a cgroup */ | 114 | static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) |
119 | static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) | ||
120 | { | 115 | { |
121 | return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), | 116 | return css ? container_of(css, struct cpuset, css) : NULL; |
122 | struct cpuset, css); | ||
123 | } | 117 | } |
124 | 118 | ||
125 | /* Retrieve the cpuset for a task */ | 119 | /* Retrieve the cpuset for a task */ |
126 | static inline struct cpuset *task_cs(struct task_struct *task) | 120 | static inline struct cpuset *task_cs(struct task_struct *task) |
127 | { | 121 | { |
128 | return container_of(task_subsys_state(task, cpuset_subsys_id), | 122 | return css_cs(task_css(task, cpuset_subsys_id)); |
129 | struct cpuset, css); | ||
130 | } | 123 | } |
131 | 124 | ||
132 | static inline struct cpuset *parent_cs(const struct cpuset *cs) | 125 | static inline struct cpuset *parent_cs(struct cpuset *cs) |
133 | { | 126 | { |
134 | struct cgroup *pcgrp = cs->css.cgroup->parent; | 127 | return css_cs(css_parent(&cs->css)); |
135 | |||
136 | if (pcgrp) | ||
137 | return cgroup_cs(pcgrp); | ||
138 | return NULL; | ||
139 | } | 128 | } |
140 | 129 | ||
141 | #ifdef CONFIG_NUMA | 130 | #ifdef CONFIG_NUMA |
@@ -212,29 +201,30 @@ static struct cpuset top_cpuset = { | |||
212 | /** | 201 | /** |
213 | * cpuset_for_each_child - traverse online children of a cpuset | 202 | * cpuset_for_each_child - traverse online children of a cpuset |
214 | * @child_cs: loop cursor pointing to the current child | 203 | * @child_cs: loop cursor pointing to the current child |
215 | * @pos_cgrp: used for iteration | 204 | * @pos_css: used for iteration |
216 | * @parent_cs: target cpuset to walk children of | 205 | * @parent_cs: target cpuset to walk children of |
217 | * | 206 | * |
218 | * Walk @child_cs through the online children of @parent_cs. Must be used | 207 | * Walk @child_cs through the online children of @parent_cs. Must be used |
219 | * with RCU read locked. | 208 | * with RCU read locked. |
220 | */ | 209 | */ |
221 | #define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ | 210 | #define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ |
222 | cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ | 211 | css_for_each_child((pos_css), &(parent_cs)->css) \ |
223 | if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) | 212 | if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) |
224 | 213 | ||
225 | /** | 214 | /** |
226 | * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants | 215 | * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants |
227 | * @des_cs: loop cursor pointing to the current descendant | 216 | * @des_cs: loop cursor pointing to the current descendant |
228 | * @pos_cgrp: used for iteration | 217 | * @pos_css: used for iteration |
229 | * @root_cs: target cpuset to walk ancestor of | 218 | * @root_cs: target cpuset to walk ancestor of |
230 | * | 219 | * |
231 | * Walk @des_cs through the online descendants of @root_cs. Must be used | 220 | * Walk @des_cs through the online descendants of @root_cs. Must be used |
232 | * with RCU read locked. The caller may modify @pos_cgrp by calling | 221 | * with RCU read locked. The caller may modify @pos_css by calling |
233 | * cgroup_rightmost_descendant() to skip subtree. | 222 | * css_rightmost_descendant() to skip subtree. @root_cs is included in the |
223 | * iteration and the first node to be visited. | ||
234 | */ | 224 | */ |
235 | #define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ | 225 | #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ |
236 | cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ | 226 | css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ |
237 | if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) | 227 | if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) |
238 | 228 | ||
239 | /* | 229 | /* |
240 | * There are two global mutexes guarding cpuset structures - cpuset_mutex | 230 | * There are two global mutexes guarding cpuset structures - cpuset_mutex |
@@ -320,8 +310,7 @@ static struct file_system_type cpuset_fs_type = { | |||
320 | * | 310 | * |
321 | * Call with callback_mutex held. | 311 | * Call with callback_mutex held. |
322 | */ | 312 | */ |
323 | static void guarantee_online_cpus(const struct cpuset *cs, | 313 | static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) |
324 | struct cpumask *pmask) | ||
325 | { | 314 | { |
326 | while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) | 315 | while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) |
327 | cs = parent_cs(cs); | 316 | cs = parent_cs(cs); |
@@ -339,7 +328,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, | |||
339 | * | 328 | * |
340 | * Call with callback_mutex held. | 329 | * Call with callback_mutex held. |
341 | */ | 330 | */ |
342 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 331 | static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) |
343 | { | 332 | { |
344 | while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) | 333 | while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) |
345 | cs = parent_cs(cs); | 334 | cs = parent_cs(cs); |
@@ -384,7 +373,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | |||
384 | * alloc_trial_cpuset - allocate a trial cpuset | 373 | * alloc_trial_cpuset - allocate a trial cpuset |
385 | * @cs: the cpuset that the trial cpuset duplicates | 374 | * @cs: the cpuset that the trial cpuset duplicates |
386 | */ | 375 | */ |
387 | static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) | 376 | static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) |
388 | { | 377 | { |
389 | struct cpuset *trial; | 378 | struct cpuset *trial; |
390 | 379 | ||
@@ -431,9 +420,9 @@ static void free_trial_cpuset(struct cpuset *trial) | |||
431 | * Return 0 if valid, -errno if not. | 420 | * Return 0 if valid, -errno if not. |
432 | */ | 421 | */ |
433 | 422 | ||
434 | static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | 423 | static int validate_change(struct cpuset *cur, struct cpuset *trial) |
435 | { | 424 | { |
436 | struct cgroup *cgrp; | 425 | struct cgroup_subsys_state *css; |
437 | struct cpuset *c, *par; | 426 | struct cpuset *c, *par; |
438 | int ret; | 427 | int ret; |
439 | 428 | ||
@@ -441,7 +430,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
441 | 430 | ||
442 | /* Each of our child cpusets must be a subset of us */ | 431 | /* Each of our child cpusets must be a subset of us */ |
443 | ret = -EBUSY; | 432 | ret = -EBUSY; |
444 | cpuset_for_each_child(c, cgrp, cur) | 433 | cpuset_for_each_child(c, css, cur) |
445 | if (!is_cpuset_subset(c, trial)) | 434 | if (!is_cpuset_subset(c, trial)) |
446 | goto out; | 435 | goto out; |
447 | 436 | ||
@@ -462,7 +451,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
462 | * overlap | 451 | * overlap |
463 | */ | 452 | */ |
464 | ret = -EINVAL; | 453 | ret = -EINVAL; |
465 | cpuset_for_each_child(c, cgrp, par) { | 454 | cpuset_for_each_child(c, css, par) { |
466 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && | 455 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && |
467 | c != cur && | 456 | c != cur && |
468 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) | 457 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) |
@@ -475,13 +464,17 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
475 | 464 | ||
476 | /* | 465 | /* |
477 | * Cpusets with tasks - existing or newly being attached - can't | 466 | * Cpusets with tasks - existing or newly being attached - can't |
478 | * have empty cpus_allowed or mems_allowed. | 467 | * be changed to have empty cpus_allowed or mems_allowed. |
479 | */ | 468 | */ |
480 | ret = -ENOSPC; | 469 | ret = -ENOSPC; |
481 | if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && | 470 | if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) { |
482 | (cpumask_empty(trial->cpus_allowed) && | 471 | if (!cpumask_empty(cur->cpus_allowed) && |
483 | nodes_empty(trial->mems_allowed))) | 472 | cpumask_empty(trial->cpus_allowed)) |
484 | goto out; | 473 | goto out; |
474 | if (!nodes_empty(cur->mems_allowed) && | ||
475 | nodes_empty(trial->mems_allowed)) | ||
476 | goto out; | ||
477 | } | ||
485 | 478 | ||
486 | ret = 0; | 479 | ret = 0; |
487 | out: | 480 | out: |
@@ -511,13 +504,16 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, | |||
511 | struct cpuset *root_cs) | 504 | struct cpuset *root_cs) |
512 | { | 505 | { |
513 | struct cpuset *cp; | 506 | struct cpuset *cp; |
514 | struct cgroup *pos_cgrp; | 507 | struct cgroup_subsys_state *pos_css; |
515 | 508 | ||
516 | rcu_read_lock(); | 509 | rcu_read_lock(); |
517 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | 510 | cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { |
511 | if (cp == root_cs) | ||
512 | continue; | ||
513 | |||
518 | /* skip the whole subtree if @cp doesn't have any CPU */ | 514 | /* skip the whole subtree if @cp doesn't have any CPU */ |
519 | if (cpumask_empty(cp->cpus_allowed)) { | 515 | if (cpumask_empty(cp->cpus_allowed)) { |
520 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | 516 | pos_css = css_rightmost_descendant(pos_css); |
521 | continue; | 517 | continue; |
522 | } | 518 | } |
523 | 519 | ||
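[Editor's note] The new "if (cp == root_cs) continue;" checks appear because css_for_each_descendant_pre() now visits @root_cs itself as the first node of the walk. Callers that only want proper descendants must skip it explicitly. A sketch of the resulting walker shape, meant to live inside kernel/cpuset.c next to the helpers used above; walk_proper_descendants() is an illustrative name:

static void walk_proper_descendants(struct cpuset *root_cs)
{
        struct cpuset *cp;
        struct cgroup_subsys_state *pos_css;

        rcu_read_lock();
        cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
                if (cp == root_cs)
                        continue;       /* root is now the first node visited */

                /* prune the subtree once nothing below can contribute */
                if (cpumask_empty(cp->cpus_allowed)) {
                        pos_css = css_rightmost_descendant(pos_css);
                        continue;
                }

                /* ... per-descendant work ... */
        }
        rcu_read_unlock();
}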
@@ -592,7 +588,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
592 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | 588 | struct sched_domain_attr *dattr; /* attributes for custom domains */ |
593 | int ndoms = 0; /* number of sched domains in result */ | 589 | int ndoms = 0; /* number of sched domains in result */ |
594 | int nslot; /* next empty doms[] struct cpumask slot */ | 590 | int nslot; /* next empty doms[] struct cpumask slot */ |
595 | struct cgroup *pos_cgrp; | 591 | struct cgroup_subsys_state *pos_css; |
596 | 592 | ||
597 | doms = NULL; | 593 | doms = NULL; |
598 | dattr = NULL; | 594 | dattr = NULL; |
@@ -621,7 +617,9 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
621 | csn = 0; | 617 | csn = 0; |
622 | 618 | ||
623 | rcu_read_lock(); | 619 | rcu_read_lock(); |
624 | cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { | 620 | cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { |
621 | if (cp == &top_cpuset) | ||
622 | continue; | ||
625 | /* | 623 | /* |
626 | * Continue traversing beyond @cp iff @cp has some CPUs and | 624 | * Continue traversing beyond @cp iff @cp has some CPUs and |
627 | * isn't load balancing. The former is obvious. The | 625 | * isn't load balancing. The former is obvious. The |
@@ -638,7 +636,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
638 | csa[csn++] = cp; | 636 | csa[csn++] = cp; |
639 | 637 | ||
640 | /* skip @cp's subtree */ | 638 | /* skip @cp's subtree */ |
641 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | 639 | pos_css = css_rightmost_descendant(pos_css); |
642 | } | 640 | } |
643 | rcu_read_unlock(); | 641 | rcu_read_unlock(); |
644 | 642 | ||
@@ -833,52 +831,45 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) | |||
833 | /** | 831 | /** |
834 | * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's | 832 | * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's |
835 | * @tsk: task to test | 833 | * @tsk: task to test |
836 | * @scan: struct cgroup_scanner containing the cgroup of the task | 834 | * @data: cpuset to @tsk belongs to |
837 | * | 835 | * |
838 | * Called by cgroup_scan_tasks() for each task in a cgroup whose | 836 | * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed |
839 | * cpus_allowed mask needs to be changed. | 837 | * mask needs to be changed. |
840 | * | 838 | * |
841 | * We don't need to re-check for the cgroup/cpuset membership, since we're | 839 | * We don't need to re-check for the cgroup/cpuset membership, since we're |
842 | * holding cpuset_mutex at this point. | 840 | * holding cpuset_mutex at this point. |
843 | */ | 841 | */ |
844 | static void cpuset_change_cpumask(struct task_struct *tsk, | 842 | static void cpuset_change_cpumask(struct task_struct *tsk, void *data) |
845 | struct cgroup_scanner *scan) | ||
846 | { | 843 | { |
847 | struct cpuset *cpus_cs; | 844 | struct cpuset *cs = data; |
845 | struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); | ||
848 | 846 | ||
849 | cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg)); | ||
850 | set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); | 847 | set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); |
851 | } | 848 | } |
852 | 849 | ||
853 | /** | 850 | /** |
854 | * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. | 851 | * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. |
855 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed | 852 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed |
856 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 853 | * @heap: if NULL, defer allocating heap memory to css_scan_tasks() |
857 | * | 854 | * |
858 | * Called with cpuset_mutex held | 855 | * Called with cpuset_mutex held |
859 | * | 856 | * |
860 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 857 | * The css_scan_tasks() function will scan all the tasks in a cgroup, |
861 | * calling callback functions for each. | 858 | * calling callback functions for each. |
862 | * | 859 | * |
863 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | 860 | * No return value. It's guaranteed that css_scan_tasks() always returns 0 |
864 | * if @heap != NULL. | 861 | * if @heap != NULL. |
865 | */ | 862 | */ |
866 | static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) | 863 | static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) |
867 | { | 864 | { |
868 | struct cgroup_scanner scan; | 865 | css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); |
869 | |||
870 | scan.cg = cs->css.cgroup; | ||
871 | scan.test_task = NULL; | ||
872 | scan.process_task = cpuset_change_cpumask; | ||
873 | scan.heap = heap; | ||
874 | cgroup_scan_tasks(&scan); | ||
875 | } | 866 | } |
876 | 867 | ||
877 | /* | 868 | /* |
878 | * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. | 869 | * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. |
879 | * @root_cs: the root cpuset of the hierarchy | 870 | * @root_cs: the root cpuset of the hierarchy |
880 | * @update_root: update root cpuset or not? | 871 | * @update_root: update root cpuset or not? |
881 | * @heap: the heap used by cgroup_scan_tasks() | 872 | * @heap: the heap used by css_scan_tasks() |
882 | * | 873 | * |
883 | * This will update cpumasks of tasks in @root_cs and all other empty cpusets | 874 | * This will update cpumasks of tasks in @root_cs and all other empty cpusets |
884 | * which take on cpumask of @root_cs. | 875 | * which take on cpumask of @root_cs. |
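[Editor's note] struct cgroup_scanner is gone: update_tasks_cpumask() collapses into a single css_scan_tasks() call and the per-task callback receives an opaque data pointer directly. A sketch of the API as used above, meant to sit inside kernel/cpuset.c; the demo_* names are made up, and the bool-returning test callback is an assumption inferred from the NULL argument passed here:

/* optional filter: return true for tasks the process callback should see */
static bool demo_skip_kthreads(struct task_struct *task, void *data)
{
        return !(task->flags & PF_KTHREAD);
}

/* per-task work: @data is whatever was handed to css_scan_tasks() */
static void demo_bind_to_cpuset(struct task_struct *task, void *data)
{
        struct cpuset *cs = data;

        set_cpus_allowed_ptr(task, cs->cpus_allowed);
}

static void demo_update_tasks(struct cpuset *cs, struct ptr_heap *heap)
{
        /* @heap == NULL defers heap allocation to css_scan_tasks() */
        css_scan_tasks(&cs->css, demo_skip_kthreads, demo_bind_to_cpuset,
                       cs, heap);
}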
@@ -889,17 +880,19 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, | |||
889 | bool update_root, struct ptr_heap *heap) | 880 | bool update_root, struct ptr_heap *heap) |
890 | { | 881 | { |
891 | struct cpuset *cp; | 882 | struct cpuset *cp; |
892 | struct cgroup *pos_cgrp; | 883 | struct cgroup_subsys_state *pos_css; |
893 | |||
894 | if (update_root) | ||
895 | update_tasks_cpumask(root_cs, heap); | ||
896 | 884 | ||
897 | rcu_read_lock(); | 885 | rcu_read_lock(); |
898 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | 886 | cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { |
899 | /* skip the whole subtree if @cp have some CPU */ | 887 | if (cp == root_cs) { |
900 | if (!cpumask_empty(cp->cpus_allowed)) { | 888 | if (!update_root) |
901 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | 889 | continue; |
902 | continue; | 890 | } else { |
891 | /* skip the whole subtree if @cp have some CPU */ | ||
892 | if (!cpumask_empty(cp->cpus_allowed)) { | ||
893 | pos_css = css_rightmost_descendant(pos_css); | ||
894 | continue; | ||
895 | } | ||
903 | } | 896 | } |
904 | if (!css_tryget(&cp->css)) | 897 | if (!css_tryget(&cp->css)) |
905 | continue; | 898 | continue; |
@@ -1055,20 +1048,24 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, | |||
1055 | task_unlock(tsk); | 1048 | task_unlock(tsk); |
1056 | } | 1049 | } |
1057 | 1050 | ||
1051 | struct cpuset_change_nodemask_arg { | ||
1052 | struct cpuset *cs; | ||
1053 | nodemask_t *newmems; | ||
1054 | }; | ||
1055 | |||
1058 | /* | 1056 | /* |
1059 | * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy | 1057 | * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy |
1060 | * of it to cpuset's new mems_allowed, and migrate pages to new nodes if | 1058 | * of it to cpuset's new mems_allowed, and migrate pages to new nodes if |
1061 | * memory_migrate flag is set. Called with cpuset_mutex held. | 1059 | * memory_migrate flag is set. Called with cpuset_mutex held. |
1062 | */ | 1060 | */ |
1063 | static void cpuset_change_nodemask(struct task_struct *p, | 1061 | static void cpuset_change_nodemask(struct task_struct *p, void *data) |
1064 | struct cgroup_scanner *scan) | ||
1065 | { | 1062 | { |
1066 | struct cpuset *cs = cgroup_cs(scan->cg); | 1063 | struct cpuset_change_nodemask_arg *arg = data; |
1064 | struct cpuset *cs = arg->cs; | ||
1067 | struct mm_struct *mm; | 1065 | struct mm_struct *mm; |
1068 | int migrate; | 1066 | int migrate; |
1069 | nodemask_t *newmems = scan->data; | ||
1070 | 1067 | ||
1071 | cpuset_change_task_nodemask(p, newmems); | 1068 | cpuset_change_task_nodemask(p, arg->newmems); |
1072 | 1069 | ||
1073 | mm = get_task_mm(p); | 1070 | mm = get_task_mm(p); |
1074 | if (!mm) | 1071 | if (!mm) |
@@ -1078,7 +1075,7 @@ static void cpuset_change_nodemask(struct task_struct *p, | |||
1078 | 1075 | ||
1079 | mpol_rebind_mm(mm, &cs->mems_allowed); | 1076 | mpol_rebind_mm(mm, &cs->mems_allowed); |
1080 | if (migrate) | 1077 | if (migrate) |
1081 | cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); | 1078 | cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems); |
1082 | mmput(mm); | 1079 | mmput(mm); |
1083 | } | 1080 | } |
1084 | 1081 | ||
@@ -1087,28 +1084,22 @@ static void *cpuset_being_rebound; | |||
1087 | /** | 1084 | /** |
1088 | * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. | 1085 | * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. |
1089 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed | 1086 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed |
1090 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 1087 | * @heap: if NULL, defer allocating heap memory to css_scan_tasks() |
1091 | * | 1088 | * |
1092 | * Called with cpuset_mutex held | 1089 | * Called with cpuset_mutex held. No return value. It's guaranteed that |
1093 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | 1090 | * css_scan_tasks() always returns 0 if @heap != NULL. |
1094 | * if @heap != NULL. | ||
1095 | */ | 1091 | */ |
1096 | static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) | 1092 | static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) |
1097 | { | 1093 | { |
1098 | static nodemask_t newmems; /* protected by cpuset_mutex */ | 1094 | static nodemask_t newmems; /* protected by cpuset_mutex */ |
1099 | struct cgroup_scanner scan; | ||
1100 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); | 1095 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); |
1096 | struct cpuset_change_nodemask_arg arg = { .cs = cs, | ||
1097 | .newmems = &newmems }; | ||
1101 | 1098 | ||
1102 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ | 1099 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ |
1103 | 1100 | ||
1104 | guarantee_online_mems(mems_cs, &newmems); | 1101 | guarantee_online_mems(mems_cs, &newmems); |
1105 | 1102 | ||
1106 | scan.cg = cs->css.cgroup; | ||
1107 | scan.test_task = NULL; | ||
1108 | scan.process_task = cpuset_change_nodemask; | ||
1109 | scan.heap = heap; | ||
1110 | scan.data = &newmems; | ||
1111 | |||
1112 | /* | 1103 | /* |
1113 | * The mpol_rebind_mm() call takes mmap_sem, which we couldn't | 1104 | * The mpol_rebind_mm() call takes mmap_sem, which we couldn't |
1114 | * take while holding tasklist_lock. Forks can happen - the | 1105 | * take while holding tasklist_lock. Forks can happen - the |
@@ -1119,7 +1110,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) | |||
1119 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() | 1110 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() |
1120 | * is idempotent. Also migrate pages in each mm to new nodes. | 1111 | * is idempotent. Also migrate pages in each mm to new nodes. |
1121 | */ | 1112 | */ |
1122 | cgroup_scan_tasks(&scan); | 1113 | css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); |
1123 | 1114 | ||
1124 | /* | 1115 | /* |
1125 | * All the tasks' nodemasks have been updated, update | 1116 | * All the tasks' nodemasks have been updated, update |
@@ -1135,7 +1126,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) | |||
1135 | * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. | 1126 | * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. |
1136 | * @cs: the root cpuset of the hierarchy | 1127 | * @cs: the root cpuset of the hierarchy |
1137 | * @update_root: update the root cpuset or not? | 1128 | * @update_root: update the root cpuset or not? |
1138 | * @heap: the heap used by cgroup_scan_tasks() | 1129 | * @heap: the heap used by css_scan_tasks() |
1139 | * | 1130 | * |
1140 | * This will update nodemasks of tasks in @root_cs and all other empty cpusets | 1131 | * This will update nodemasks of tasks in @root_cs and all other empty cpusets |
1141 | * which take on nodemask of @root_cs. | 1132 | * which take on nodemask of @root_cs. |
@@ -1146,17 +1137,19 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, | |||
1146 | bool update_root, struct ptr_heap *heap) | 1137 | bool update_root, struct ptr_heap *heap) |
1147 | { | 1138 | { |
1148 | struct cpuset *cp; | 1139 | struct cpuset *cp; |
1149 | struct cgroup *pos_cgrp; | 1140 | struct cgroup_subsys_state *pos_css; |
1150 | |||
1151 | if (update_root) | ||
1152 | update_tasks_nodemask(root_cs, heap); | ||
1153 | 1141 | ||
1154 | rcu_read_lock(); | 1142 | rcu_read_lock(); |
1155 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | 1143 | cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { |
1156 | /* skip the whole subtree if @cp have some CPU */ | 1144 | if (cp == root_cs) { |
1157 | if (!nodes_empty(cp->mems_allowed)) { | 1145 | if (!update_root) |
1158 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | 1146 | continue; |
1159 | continue; | 1147 | } else { |
1148 | /* skip the whole subtree if @cp have some CPU */ | ||
1149 | if (!nodes_empty(cp->mems_allowed)) { | ||
1150 | pos_css = css_rightmost_descendant(pos_css); | ||
1151 | continue; | ||
1152 | } | ||
1160 | } | 1153 | } |
1161 | if (!css_tryget(&cp->css)) | 1154 | if (!css_tryget(&cp->css)) |
1162 | continue; | 1155 | continue; |
@@ -1263,44 +1256,39 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
1263 | return 0; | 1256 | return 0; |
1264 | } | 1257 | } |
1265 | 1258 | ||
1266 | /* | 1259 | /** |
1267 | * cpuset_change_flag - make a task's spread flags the same as its cpuset's | 1260 | * cpuset_change_flag - make a task's spread flags the same as its cpuset's |
1268 | * @tsk: task to be updated | 1261 | * @tsk: task to be updated |
1269 | * @scan: struct cgroup_scanner containing the cgroup of the task | 1262 | * @data: cpuset to @tsk belongs to |
1270 | * | 1263 | * |
1271 | * Called by cgroup_scan_tasks() for each task in a cgroup. | 1264 | * Called by css_scan_tasks() for each task in a cgroup. |
1272 | * | 1265 | * |
1273 | * We don't need to re-check for the cgroup/cpuset membership, since we're | 1266 | * We don't need to re-check for the cgroup/cpuset membership, since we're |
1274 | * holding cpuset_mutex at this point. | 1267 | * holding cpuset_mutex at this point. |
1275 | */ | 1268 | */ |
1276 | static void cpuset_change_flag(struct task_struct *tsk, | 1269 | static void cpuset_change_flag(struct task_struct *tsk, void *data) |
1277 | struct cgroup_scanner *scan) | ||
1278 | { | 1270 | { |
1279 | cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); | 1271 | struct cpuset *cs = data; |
1272 | |||
1273 | cpuset_update_task_spread_flag(cs, tsk); | ||
1280 | } | 1274 | } |
1281 | 1275 | ||
1282 | /* | 1276 | /** |
1283 | * update_tasks_flags - update the spread flags of tasks in the cpuset. | 1277 | * update_tasks_flags - update the spread flags of tasks in the cpuset. |
1284 | * @cs: the cpuset in which each task's spread flags needs to be changed | 1278 | * @cs: the cpuset in which each task's spread flags needs to be changed |
1285 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 1279 | * @heap: if NULL, defer allocating heap memory to css_scan_tasks() |
1286 | * | 1280 | * |
1287 | * Called with cpuset_mutex held | 1281 | * Called with cpuset_mutex held |
1288 | * | 1282 | * |
1289 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 1283 | * The css_scan_tasks() function will scan all the tasks in a cgroup, |
1290 | * calling callback functions for each. | 1284 | * calling callback functions for each. |
1291 | * | 1285 | * |
1292 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | 1286 | * No return value. It's guaranteed that css_scan_tasks() always returns 0 |
1293 | * if @heap != NULL. | 1287 | * if @heap != NULL. |
1294 | */ | 1288 | */ |
1295 | static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) | 1289 | static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) |
1296 | { | 1290 | { |
1297 | struct cgroup_scanner scan; | 1291 | css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); |
1298 | |||
1299 | scan.cg = cs->css.cgroup; | ||
1300 | scan.test_task = NULL; | ||
1301 | scan.process_task = cpuset_change_flag; | ||
1302 | scan.heap = heap; | ||
1303 | cgroup_scan_tasks(&scan); | ||
1304 | } | 1292 | } |
1305 | 1293 | ||
1306 | /* | 1294 | /* |
@@ -1458,9 +1446,10 @@ static int fmeter_getrate(struct fmeter *fmp) | |||
1458 | } | 1446 | } |
1459 | 1447 | ||
1460 | /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ | 1448 | /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ |
1461 | static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 1449 | static int cpuset_can_attach(struct cgroup_subsys_state *css, |
1450 | struct cgroup_taskset *tset) | ||
1462 | { | 1451 | { |
1463 | struct cpuset *cs = cgroup_cs(cgrp); | 1452 | struct cpuset *cs = css_cs(css); |
1464 | struct task_struct *task; | 1453 | struct task_struct *task; |
1465 | int ret; | 1454 | int ret; |
1466 | 1455 | ||
@@ -1471,11 +1460,11 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
1471 | * flag is set. | 1460 | * flag is set. |
1472 | */ | 1461 | */ |
1473 | ret = -ENOSPC; | 1462 | ret = -ENOSPC; |
1474 | if (!cgroup_sane_behavior(cgrp) && | 1463 | if (!cgroup_sane_behavior(css->cgroup) && |
1475 | (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) | 1464 | (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) |
1476 | goto out_unlock; | 1465 | goto out_unlock; |
1477 | 1466 | ||
1478 | cgroup_taskset_for_each(task, cgrp, tset) { | 1467 | cgroup_taskset_for_each(task, css, tset) { |
1479 | /* | 1468 | /* |
1480 | * Kthreads which disallow setaffinity shouldn't be moved | 1469 | * Kthreads which disallow setaffinity shouldn't be moved |
1481 | * to a new cpuset; we don't want to change their cpu | 1470 | * to a new cpuset; we don't want to change their cpu |
@@ -1504,11 +1493,11 @@ out_unlock: | |||
1504 | return ret; | 1493 | return ret; |
1505 | } | 1494 | } |
1506 | 1495 | ||
1507 | static void cpuset_cancel_attach(struct cgroup *cgrp, | 1496 | static void cpuset_cancel_attach(struct cgroup_subsys_state *css, |
1508 | struct cgroup_taskset *tset) | 1497 | struct cgroup_taskset *tset) |
1509 | { | 1498 | { |
1510 | mutex_lock(&cpuset_mutex); | 1499 | mutex_lock(&cpuset_mutex); |
1511 | cgroup_cs(cgrp)->attach_in_progress--; | 1500 | css_cs(css)->attach_in_progress--; |
1512 | mutex_unlock(&cpuset_mutex); | 1501 | mutex_unlock(&cpuset_mutex); |
1513 | } | 1502 | } |
1514 | 1503 | ||
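[Editor's note] cpuset_can_attach()/cpuset_cancel_attach() show the other half of the callback conversion: the taskset iterator is now keyed by the destination css rather than a cgroup. A minimal sketch of a can_attach method built from the same calls; demo_can_attach() is illustrative, and the PF_NO_SETAFFINITY test stands in for the "kthreads which disallow setaffinity" check described in the comment above:

#include <linux/cgroup.h>
#include <linux/sched.h>

static int demo_can_attach(struct cgroup_subsys_state *css,
                           struct cgroup_taskset *tset)
{
        struct task_struct *task;

        /* walk only the tasks being migrated into @css */
        cgroup_taskset_for_each(task, css, tset) {
                /* refuse kernel threads whose affinity must not change */
                if (task->flags & PF_NO_SETAFFINITY)
                        return -EINVAL;
        }
        return 0;
}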
@@ -1519,16 +1508,18 @@ static void cpuset_cancel_attach(struct cgroup *cgrp, | |||
1519 | */ | 1508 | */ |
1520 | static cpumask_var_t cpus_attach; | 1509 | static cpumask_var_t cpus_attach; |
1521 | 1510 | ||
1522 | static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 1511 | static void cpuset_attach(struct cgroup_subsys_state *css, |
1512 | struct cgroup_taskset *tset) | ||
1523 | { | 1513 | { |
1524 | /* static buf protected by cpuset_mutex */ | 1514 | /* static buf protected by cpuset_mutex */ |
1525 | static nodemask_t cpuset_attach_nodemask_to; | 1515 | static nodemask_t cpuset_attach_nodemask_to; |
1526 | struct mm_struct *mm; | 1516 | struct mm_struct *mm; |
1527 | struct task_struct *task; | 1517 | struct task_struct *task; |
1528 | struct task_struct *leader = cgroup_taskset_first(tset); | 1518 | struct task_struct *leader = cgroup_taskset_first(tset); |
1529 | struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); | 1519 | struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset, |
1530 | struct cpuset *cs = cgroup_cs(cgrp); | 1520 | cpuset_subsys_id); |
1531 | struct cpuset *oldcs = cgroup_cs(oldcgrp); | 1521 | struct cpuset *cs = css_cs(css); |
1522 | struct cpuset *oldcs = css_cs(oldcss); | ||
1532 | struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); | 1523 | struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); |
1533 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); | 1524 | struct cpuset *mems_cs = effective_nodemask_cpuset(cs); |
1534 | 1525 | ||
@@ -1542,7 +1533,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
1542 | 1533 | ||
1543 | guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); | 1534 | guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); |
1544 | 1535 | ||
1545 | cgroup_taskset_for_each(task, cgrp, tset) { | 1536 | cgroup_taskset_for_each(task, css, tset) { |
1546 | /* | 1537 | /* |
1547 | * can_attach beforehand should guarantee that this doesn't | 1538 | * can_attach beforehand should guarantee that this doesn't |
1548 | * fail. TODO: have a better way to handle failure here | 1539 | * fail. TODO: have a better way to handle failure here |
@@ -1604,15 +1595,18 @@ typedef enum { | |||
1604 | FILE_SPREAD_SLAB, | 1595 | FILE_SPREAD_SLAB, |
1605 | } cpuset_filetype_t; | 1596 | } cpuset_filetype_t; |
1606 | 1597 | ||
1607 | static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | 1598 | static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, |
1599 | u64 val) | ||
1608 | { | 1600 | { |
1609 | struct cpuset *cs = cgroup_cs(cgrp); | 1601 | struct cpuset *cs = css_cs(css); |
1610 | cpuset_filetype_t type = cft->private; | 1602 | cpuset_filetype_t type = cft->private; |
1611 | int retval = -ENODEV; | 1603 | int retval = 0; |
1612 | 1604 | ||
1613 | mutex_lock(&cpuset_mutex); | 1605 | mutex_lock(&cpuset_mutex); |
1614 | if (!is_cpuset_online(cs)) | 1606 | if (!is_cpuset_online(cs)) { |
1607 | retval = -ENODEV; | ||
1615 | goto out_unlock; | 1608 | goto out_unlock; |
1609 | } | ||
1616 | 1610 | ||
1617 | switch (type) { | 1611 | switch (type) { |
1618 | case FILE_CPU_EXCLUSIVE: | 1612 | case FILE_CPU_EXCLUSIVE: |
@@ -1651,9 +1645,10 @@ out_unlock: | |||
1651 | return retval; | 1645 | return retval; |
1652 | } | 1646 | } |
1653 | 1647 | ||
1654 | static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) | 1648 | static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, |
1649 | s64 val) | ||
1655 | { | 1650 | { |
1656 | struct cpuset *cs = cgroup_cs(cgrp); | 1651 | struct cpuset *cs = css_cs(css); |
1657 | cpuset_filetype_t type = cft->private; | 1652 | cpuset_filetype_t type = cft->private; |
1658 | int retval = -ENODEV; | 1653 | int retval = -ENODEV; |
1659 | 1654 | ||
@@ -1677,10 +1672,10 @@ out_unlock: | |||
1677 | /* | 1672 | /* |
1678 | * Common handling for a write to a "cpus" or "mems" file. | 1673 | * Common handling for a write to a "cpus" or "mems" file. |
1679 | */ | 1674 | */ |
1680 | static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | 1675 | static int cpuset_write_resmask(struct cgroup_subsys_state *css, |
1681 | const char *buf) | 1676 | struct cftype *cft, const char *buf) |
1682 | { | 1677 | { |
1683 | struct cpuset *cs = cgroup_cs(cgrp); | 1678 | struct cpuset *cs = css_cs(css); |
1684 | struct cpuset *trialcs; | 1679 | struct cpuset *trialcs; |
1685 | int retval = -ENODEV; | 1680 | int retval = -ENODEV; |
1686 | 1681 | ||
@@ -1759,13 +1754,12 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
1759 | return count; | 1754 | return count; |
1760 | } | 1755 | } |
1761 | 1756 | ||
1762 | static ssize_t cpuset_common_file_read(struct cgroup *cgrp, | 1757 | static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css, |
1763 | struct cftype *cft, | 1758 | struct cftype *cft, struct file *file, |
1764 | struct file *file, | 1759 | char __user *buf, size_t nbytes, |
1765 | char __user *buf, | 1760 | loff_t *ppos) |
1766 | size_t nbytes, loff_t *ppos) | ||
1767 | { | 1761 | { |
1768 | struct cpuset *cs = cgroup_cs(cgrp); | 1762 | struct cpuset *cs = css_cs(css); |
1769 | cpuset_filetype_t type = cft->private; | 1763 | cpuset_filetype_t type = cft->private; |
1770 | char *page; | 1764 | char *page; |
1771 | ssize_t retval = 0; | 1765 | ssize_t retval = 0; |
@@ -1795,9 +1789,9 @@ out: | |||
1795 | return retval; | 1789 | return retval; |
1796 | } | 1790 | } |
1797 | 1791 | ||
1798 | static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) | 1792 | static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) |
1799 | { | 1793 | { |
1800 | struct cpuset *cs = cgroup_cs(cgrp); | 1794 | struct cpuset *cs = css_cs(css); |
1801 | cpuset_filetype_t type = cft->private; | 1795 | cpuset_filetype_t type = cft->private; |
1802 | switch (type) { | 1796 | switch (type) { |
1803 | case FILE_CPU_EXCLUSIVE: | 1797 | case FILE_CPU_EXCLUSIVE: |
@@ -1826,9 +1820,9 @@ static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) | |||
1826 | return 0; | 1820 | return 0; |
1827 | } | 1821 | } |
1828 | 1822 | ||
1829 | static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) | 1823 | static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) |
1830 | { | 1824 | { |
1831 | struct cpuset *cs = cgroup_cs(cgrp); | 1825 | struct cpuset *cs = css_cs(css); |
1832 | cpuset_filetype_t type = cft->private; | 1826 | cpuset_filetype_t type = cft->private; |
1833 | switch (type) { | 1827 | switch (type) { |
1834 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | 1828 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: |
@@ -1943,11 +1937,12 @@ static struct cftype files[] = { | |||
1943 | * cgrp: control group that the new cpuset will be part of | 1937 | * cgrp: control group that the new cpuset will be part of |
1944 | */ | 1938 | */ |
1945 | 1939 | ||
1946 | static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) | 1940 | static struct cgroup_subsys_state * |
1941 | cpuset_css_alloc(struct cgroup_subsys_state *parent_css) | ||
1947 | { | 1942 | { |
1948 | struct cpuset *cs; | 1943 | struct cpuset *cs; |
1949 | 1944 | ||
1950 | if (!cgrp->parent) | 1945 | if (!parent_css) |
1951 | return &top_cpuset.css; | 1946 | return &top_cpuset.css; |
1952 | 1947 | ||
1953 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); | 1948 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); |
@@ -1967,12 +1962,12 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) | |||
1967 | return &cs->css; | 1962 | return &cs->css; |
1968 | } | 1963 | } |
1969 | 1964 | ||
1970 | static int cpuset_css_online(struct cgroup *cgrp) | 1965 | static int cpuset_css_online(struct cgroup_subsys_state *css) |
1971 | { | 1966 | { |
1972 | struct cpuset *cs = cgroup_cs(cgrp); | 1967 | struct cpuset *cs = css_cs(css); |
1973 | struct cpuset *parent = parent_cs(cs); | 1968 | struct cpuset *parent = parent_cs(cs); |
1974 | struct cpuset *tmp_cs; | 1969 | struct cpuset *tmp_cs; |
1975 | struct cgroup *pos_cg; | 1970 | struct cgroup_subsys_state *pos_css; |
1976 | 1971 | ||
1977 | if (!parent) | 1972 | if (!parent) |
1978 | return 0; | 1973 | return 0; |
@@ -1987,7 +1982,7 @@ static int cpuset_css_online(struct cgroup *cgrp) | |||
1987 | 1982 | ||
1988 | number_of_cpusets++; | 1983 | number_of_cpusets++; |
1989 | 1984 | ||
1990 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) | 1985 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) |
1991 | goto out_unlock; | 1986 | goto out_unlock; |
1992 | 1987 | ||
1993 | /* | 1988 | /* |
@@ -2004,7 +1999,7 @@ static int cpuset_css_online(struct cgroup *cgrp) | |||
2004 | * (and likewise for mems) to the new cgroup. | 1999 | * (and likewise for mems) to the new cgroup. |
2005 | */ | 2000 | */ |
2006 | rcu_read_lock(); | 2001 | rcu_read_lock(); |
2007 | cpuset_for_each_child(tmp_cs, pos_cg, parent) { | 2002 | cpuset_for_each_child(tmp_cs, pos_css, parent) { |
2008 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { | 2003 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { |
2009 | rcu_read_unlock(); | 2004 | rcu_read_unlock(); |
2010 | goto out_unlock; | 2005 | goto out_unlock; |
@@ -2021,9 +2016,15 @@ out_unlock: | |||
2021 | return 0; | 2016 | return 0; |
2022 | } | 2017 | } |
2023 | 2018 | ||
2024 | static void cpuset_css_offline(struct cgroup *cgrp) | 2019 | /* |
2020 | * If the cpuset being removed has its flag 'sched_load_balance' | ||
2021 | * enabled, then simulate turning sched_load_balance off, which | ||
2022 | * will call rebuild_sched_domains_locked(). | ||
2023 | */ | ||
2024 | |||
2025 | static void cpuset_css_offline(struct cgroup_subsys_state *css) | ||
2025 | { | 2026 | { |
2026 | struct cpuset *cs = cgroup_cs(cgrp); | 2027 | struct cpuset *cs = css_cs(css); |
2027 | 2028 | ||
2028 | mutex_lock(&cpuset_mutex); | 2029 | mutex_lock(&cpuset_mutex); |
2029 | 2030 | ||
@@ -2036,15 +2037,9 @@ static void cpuset_css_offline(struct cgroup *cgrp) | |||
2036 | mutex_unlock(&cpuset_mutex); | 2037 | mutex_unlock(&cpuset_mutex); |
2037 | } | 2038 | } |
2038 | 2039 | ||
2039 | /* | 2040 | static void cpuset_css_free(struct cgroup_subsys_state *css) |
2040 | * If the cpuset being removed has its flag 'sched_load_balance' | ||
2041 | * enabled, then simulate turning sched_load_balance off, which | ||
2042 | * will call rebuild_sched_domains_locked(). | ||
2043 | */ | ||
2044 | |||
2045 | static void cpuset_css_free(struct cgroup *cgrp) | ||
2046 | { | 2041 | { |
2047 | struct cpuset *cs = cgroup_cs(cgrp); | 2042 | struct cpuset *cs = css_cs(css); |
2048 | 2043 | ||
2049 | free_cpumask_var(cs->cpus_allowed); | 2044 | free_cpumask_var(cs->cpus_allowed); |
2050 | kfree(cs); | 2045 | kfree(cs); |
@@ -2251,11 +2246,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
2251 | /* if cpus or mems changed, we need to propagate to descendants */ | 2246 | /* if cpus or mems changed, we need to propagate to descendants */ |
2252 | if (cpus_updated || mems_updated) { | 2247 | if (cpus_updated || mems_updated) { |
2253 | struct cpuset *cs; | 2248 | struct cpuset *cs; |
2254 | struct cgroup *pos_cgrp; | 2249 | struct cgroup_subsys_state *pos_css; |
2255 | 2250 | ||
2256 | rcu_read_lock(); | 2251 | rcu_read_lock(); |
2257 | cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { | 2252 | cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { |
2258 | if (!css_tryget(&cs->css)) | 2253 | if (cs == &top_cpuset || !css_tryget(&cs->css)) |
2259 | continue; | 2254 | continue; |
2260 | rcu_read_unlock(); | 2255 | rcu_read_unlock(); |
2261 | 2256 | ||
@@ -2344,7 +2339,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) | |||
2344 | 2339 | ||
2345 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) | 2340 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) |
2346 | { | 2341 | { |
2347 | const struct cpuset *cpus_cs; | 2342 | struct cpuset *cpus_cs; |
2348 | 2343 | ||
2349 | rcu_read_lock(); | 2344 | rcu_read_lock(); |
2350 | cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); | 2345 | cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); |
@@ -2417,7 +2412,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) | |||
2417 | * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall | 2412 | * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall |
2418 | * (an unusual configuration), then returns the root cpuset. | 2413 | * (an unusual configuration), then returns the root cpuset. |
2419 | */ | 2414 | */ |
2420 | static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) | 2415 | static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) |
2421 | { | 2416 | { |
2422 | while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) | 2417 | while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) |
2423 | cs = parent_cs(cs); | 2418 | cs = parent_cs(cs); |
@@ -2487,7 +2482,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) | |||
2487 | */ | 2482 | */ |
2488 | int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) | 2483 | int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) |
2489 | { | 2484 | { |
2490 | const struct cpuset *cs; /* current cpuset ancestors */ | 2485 | struct cpuset *cs; /* current cpuset ancestors */ |
2491 | int allowed; /* is allocation in zone z allowed? */ | 2486 | int allowed; /* is allocation in zone z allowed? */ |
2492 | 2487 | ||
2493 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) | 2488 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) |
@@ -2725,7 +2720,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) | |||
2725 | goto out_free; | 2720 | goto out_free; |
2726 | 2721 | ||
2727 | rcu_read_lock(); | 2722 | rcu_read_lock(); |
2728 | css = task_subsys_state(tsk, cpuset_subsys_id); | 2723 | css = task_css(tsk, cpuset_subsys_id); |
2729 | retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); | 2724 | retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); |
2730 | rcu_read_unlock(); | 2725 | rcu_read_unlock(); |
2731 | if (retval < 0) | 2726 | if (retval < 0) |
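The cpuset hunks above are one mechanical conversion: every cgroup_subsys callback (css_alloc/online/offline/free, can_attach, attach, the read/write handlers) now takes a struct cgroup_subsys_state * instead of a struct cgroup *, and the subsystem recovers its own state through a container_of()-style helper (css_cs(), defined earlier in the patch) rather than by looking it up via the cgroup. A minimal sketch of the pattern for a hypothetical subsystem (names below are illustrative, not from the patch):

/* Illustrative only: a made-up "example" subsystem after the conversion. */
struct example_state {
        struct cgroup_subsys_state css;
        u64 some_setting;
};

/* container_of() replaces the old per-cgroup lookup, mirroring css_cs(). */
static inline struct example_state *css_ex(struct cgroup_subsys_state *css)
{
        return css ? container_of(css, struct example_state, css) : NULL;
}

/* Old: static int ex_css_online(struct cgroup *cgrp)
 * New: the css arrives directly; where the owning cgroup is still needed
 * (flag tests, paths), it is reached via css->cgroup, as the hunks above do. */
static int ex_css_online(struct cgroup_subsys_state *css)
{
        struct example_state *ex = css_ex(css);

        if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
                ex->some_setting = 1;
        return 0;
}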
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c77206184b8b..97b67df8fbfe 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c | |||
@@ -116,6 +116,9 @@ int get_callchain_buffers(void) | |||
116 | 116 | ||
117 | err = alloc_callchain_buffers(); | 117 | err = alloc_callchain_buffers(); |
118 | exit: | 118 | exit: |
119 | if (err) | ||
120 | atomic_dec(&nr_callchain_events); | ||
121 | |||
119 | mutex_unlock(&callchain_mutex); | 122 | mutex_unlock(&callchain_mutex); |
120 | 123 | ||
121 | return err; | 124 | return err; |
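The callchain.c change plugs an accounting leak: nr_callchain_events is bumped earlier in get_callchain_buffers() (above this hunk), so when alloc_callchain_buffers() fails the count has to be dropped again before the mutex is released. It is the usual take-a-reference-then-roll-back-on-error shape; a generic sketch with illustrative names:

static DEFINE_MUTEX(resource_mutex);
static atomic_t nr_users;

static int get_resource(void)
{
        int err = 0;

        mutex_lock(&resource_mutex);

        if (atomic_inc_return(&nr_users) == 1)   /* first user allocates */
                err = allocate_buffers();        /* hypothetical helper */

        if (err)                                 /* roll the count back, */
                atomic_dec(&nr_users);           /* as the hunk above does */

        mutex_unlock(&resource_mutex);
        return err;
}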
diff --git a/kernel/events/core.c b/kernel/events/core.c index eba8fb5834ae..2207efc941d1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -145,6 +145,7 @@ static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); | |||
145 | static atomic_t nr_mmap_events __read_mostly; | 145 | static atomic_t nr_mmap_events __read_mostly; |
146 | static atomic_t nr_comm_events __read_mostly; | 146 | static atomic_t nr_comm_events __read_mostly; |
147 | static atomic_t nr_task_events __read_mostly; | 147 | static atomic_t nr_task_events __read_mostly; |
148 | static atomic_t nr_freq_events __read_mostly; | ||
148 | 149 | ||
149 | static LIST_HEAD(pmus); | 150 | static LIST_HEAD(pmus); |
150 | static DEFINE_MUTEX(pmus_lock); | 151 | static DEFINE_MUTEX(pmus_lock); |
@@ -340,8 +341,8 @@ struct perf_cgroup { | |||
340 | static inline struct perf_cgroup * | 341 | static inline struct perf_cgroup * |
341 | perf_cgroup_from_task(struct task_struct *task) | 342 | perf_cgroup_from_task(struct task_struct *task) |
342 | { | 343 | { |
343 | return container_of(task_subsys_state(task, perf_subsys_id), | 344 | return container_of(task_css(task, perf_subsys_id), |
344 | struct perf_cgroup, css); | 345 | struct perf_cgroup, css); |
345 | } | 346 | } |
346 | 347 | ||
347 | static inline bool | 348 | static inline bool |
@@ -591,7 +592,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
591 | if (!f.file) | 592 | if (!f.file) |
592 | return -EBADF; | 593 | return -EBADF; |
593 | 594 | ||
594 | css = cgroup_css_from_dir(f.file, perf_subsys_id); | 595 | rcu_read_lock(); |
596 | |||
597 | css = css_from_dir(f.file->f_dentry, &perf_subsys); | ||
595 | if (IS_ERR(css)) { | 598 | if (IS_ERR(css)) { |
596 | ret = PTR_ERR(css); | 599 | ret = PTR_ERR(css); |
597 | goto out; | 600 | goto out; |
@@ -617,6 +620,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
617 | ret = -EINVAL; | 620 | ret = -EINVAL; |
618 | } | 621 | } |
619 | out: | 622 | out: |
623 | rcu_read_unlock(); | ||
620 | fdput(f); | 624 | fdput(f); |
621 | return ret; | 625 | return ret; |
622 | } | 626 | } |
@@ -869,12 +873,8 @@ static void perf_pmu_rotate_start(struct pmu *pmu) | |||
869 | 873 | ||
870 | WARN_ON(!irqs_disabled()); | 874 | WARN_ON(!irqs_disabled()); |
871 | 875 | ||
872 | if (list_empty(&cpuctx->rotation_list)) { | 876 | if (list_empty(&cpuctx->rotation_list)) |
873 | int was_empty = list_empty(head); | ||
874 | list_add(&cpuctx->rotation_list, head); | 877 | list_add(&cpuctx->rotation_list, head); |
875 | if (was_empty) | ||
876 | tick_nohz_full_kick(); | ||
877 | } | ||
878 | } | 878 | } |
879 | 879 | ||
880 | static void get_ctx(struct perf_event_context *ctx) | 880 | static void get_ctx(struct perf_event_context *ctx) |
@@ -1216,6 +1216,9 @@ static void perf_event__id_header_size(struct perf_event *event) | |||
1216 | if (sample_type & PERF_SAMPLE_TIME) | 1216 | if (sample_type & PERF_SAMPLE_TIME) |
1217 | size += sizeof(data->time); | 1217 | size += sizeof(data->time); |
1218 | 1218 | ||
1219 | if (sample_type & PERF_SAMPLE_IDENTIFIER) | ||
1220 | size += sizeof(data->id); | ||
1221 | |||
1219 | if (sample_type & PERF_SAMPLE_ID) | 1222 | if (sample_type & PERF_SAMPLE_ID) |
1220 | size += sizeof(data->id); | 1223 | size += sizeof(data->id); |
1221 | 1224 | ||
@@ -2712,7 +2715,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, | |||
2712 | 2715 | ||
2713 | hwc = &event->hw; | 2716 | hwc = &event->hw; |
2714 | 2717 | ||
2715 | if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { | 2718 | if (hwc->interrupts == MAX_INTERRUPTS) { |
2716 | hwc->interrupts = 0; | 2719 | hwc->interrupts = 0; |
2717 | perf_log_throttle(event, 1); | 2720 | perf_log_throttle(event, 1); |
2718 | event->pmu->start(event, 0); | 2721 | event->pmu->start(event, 0); |
@@ -2811,10 +2814,11 @@ done: | |||
2811 | #ifdef CONFIG_NO_HZ_FULL | 2814 | #ifdef CONFIG_NO_HZ_FULL |
2812 | bool perf_event_can_stop_tick(void) | 2815 | bool perf_event_can_stop_tick(void) |
2813 | { | 2816 | { |
2814 | if (list_empty(&__get_cpu_var(rotation_list))) | 2817 | if (atomic_read(&nr_freq_events) || |
2815 | return true; | 2818 | __this_cpu_read(perf_throttled_count)) |
2816 | else | ||
2817 | return false; | 2819 | return false; |
2820 | else | ||
2821 | return true; | ||
2818 | } | 2822 | } |
2819 | #endif | 2823 | #endif |
2820 | 2824 | ||
@@ -3128,36 +3132,63 @@ static void free_event_rcu(struct rcu_head *head) | |||
3128 | static void ring_buffer_put(struct ring_buffer *rb); | 3132 | static void ring_buffer_put(struct ring_buffer *rb); |
3129 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); | 3133 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); |
3130 | 3134 | ||
3131 | static void free_event(struct perf_event *event) | 3135 | static void unaccount_event_cpu(struct perf_event *event, int cpu) |
3132 | { | 3136 | { |
3133 | irq_work_sync(&event->pending); | 3137 | if (event->parent) |
3138 | return; | ||
3139 | |||
3140 | if (has_branch_stack(event)) { | ||
3141 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
3142 | atomic_dec(&per_cpu(perf_branch_stack_events, cpu)); | ||
3143 | } | ||
3144 | if (is_cgroup_event(event)) | ||
3145 | atomic_dec(&per_cpu(perf_cgroup_events, cpu)); | ||
3146 | } | ||
3147 | |||
3148 | static void unaccount_event(struct perf_event *event) | ||
3149 | { | ||
3150 | if (event->parent) | ||
3151 | return; | ||
3152 | |||
3153 | if (event->attach_state & PERF_ATTACH_TASK) | ||
3154 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3155 | if (event->attr.mmap || event->attr.mmap_data) | ||
3156 | atomic_dec(&nr_mmap_events); | ||
3157 | if (event->attr.comm) | ||
3158 | atomic_dec(&nr_comm_events); | ||
3159 | if (event->attr.task) | ||
3160 | atomic_dec(&nr_task_events); | ||
3161 | if (event->attr.freq) | ||
3162 | atomic_dec(&nr_freq_events); | ||
3163 | if (is_cgroup_event(event)) | ||
3164 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3165 | if (has_branch_stack(event)) | ||
3166 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3167 | |||
3168 | unaccount_event_cpu(event, event->cpu); | ||
3169 | } | ||
3134 | 3170 | ||
3171 | static void __free_event(struct perf_event *event) | ||
3172 | { | ||
3135 | if (!event->parent) { | 3173 | if (!event->parent) { |
3136 | if (event->attach_state & PERF_ATTACH_TASK) | ||
3137 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3138 | if (event->attr.mmap || event->attr.mmap_data) | ||
3139 | atomic_dec(&nr_mmap_events); | ||
3140 | if (event->attr.comm) | ||
3141 | atomic_dec(&nr_comm_events); | ||
3142 | if (event->attr.task) | ||
3143 | atomic_dec(&nr_task_events); | ||
3144 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) | 3174 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) |
3145 | put_callchain_buffers(); | 3175 | put_callchain_buffers(); |
3146 | if (is_cgroup_event(event)) { | ||
3147 | atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); | ||
3148 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3149 | } | ||
3150 | |||
3151 | if (has_branch_stack(event)) { | ||
3152 | static_key_slow_dec_deferred(&perf_sched_events); | ||
3153 | /* is system-wide event */ | ||
3154 | if (!(event->attach_state & PERF_ATTACH_TASK)) { | ||
3155 | atomic_dec(&per_cpu(perf_branch_stack_events, | ||
3156 | event->cpu)); | ||
3157 | } | ||
3158 | } | ||
3159 | } | 3176 | } |
3160 | 3177 | ||
3178 | if (event->destroy) | ||
3179 | event->destroy(event); | ||
3180 | |||
3181 | if (event->ctx) | ||
3182 | put_ctx(event->ctx); | ||
3183 | |||
3184 | call_rcu(&event->rcu_head, free_event_rcu); | ||
3185 | } | ||
3186 | static void free_event(struct perf_event *event) | ||
3187 | { | ||
3188 | irq_work_sync(&event->pending); | ||
3189 | |||
3190 | unaccount_event(event); | ||
3191 | |||
3161 | if (event->rb) { | 3192 | if (event->rb) { |
3162 | struct ring_buffer *rb; | 3193 | struct ring_buffer *rb; |
3163 | 3194 | ||
@@ -3180,13 +3211,8 @@ static void free_event(struct perf_event *event) | |||
3180 | if (is_cgroup_event(event)) | 3211 | if (is_cgroup_event(event)) |
3181 | perf_detach_cgroup(event); | 3212 | perf_detach_cgroup(event); |
3182 | 3213 | ||
3183 | if (event->destroy) | ||
3184 | event->destroy(event); | ||
3185 | 3214 | ||
3186 | if (event->ctx) | 3215 | __free_event(event); |
3187 | put_ctx(event->ctx); | ||
3188 | |||
3189 | call_rcu(&event->rcu_head, free_event_rcu); | ||
3190 | } | 3216 | } |
3191 | 3217 | ||
3192 | int perf_event_release_kernel(struct perf_event *event) | 3218 | int perf_event_release_kernel(struct perf_event *event) |
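The large hunk above splits event teardown in two: unaccount_event()/unaccount_event_cpu() undo the global and per-CPU bookkeeping that used to be interleaved with freeing, and __free_event() releases the object itself (destroy callback, context reference, RCU free); free_event() becomes a thin wrapper over both. The payoff shows up later in the patch, where the perf_event_open() cgroup-connect error path calls __free_event() alone, because at that point account_event() has not run yet. Condensed, the resulting shape is (sketch, not the literal code):

static void free_event(struct perf_event *event)
{
        irq_work_sync(&event->pending);

        unaccount_event(event);         /* bookkeeping first ... */

        /* ... ring-buffer and cgroup detach elided ... */

        __free_event(event);            /* ... then the object itself */
}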
@@ -3544,6 +3570,15 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
3544 | case PERF_EVENT_IOC_PERIOD: | 3570 | case PERF_EVENT_IOC_PERIOD: |
3545 | return perf_event_period(event, (u64 __user *)arg); | 3571 | return perf_event_period(event, (u64 __user *)arg); |
3546 | 3572 | ||
3573 | case PERF_EVENT_IOC_ID: | ||
3574 | { | ||
3575 | u64 id = primary_event_id(event); | ||
3576 | |||
3577 | if (copy_to_user((void __user *)arg, &id, sizeof(id))) | ||
3578 | return -EFAULT; | ||
3579 | return 0; | ||
3580 | } | ||
3581 | |||
3547 | case PERF_EVENT_IOC_SET_OUTPUT: | 3582 | case PERF_EVENT_IOC_SET_OUTPUT: |
3548 | { | 3583 | { |
3549 | int ret; | 3584 | int ret; |
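PERF_EVENT_IOC_ID simply copies the event's primary ID out to a user-supplied u64, so a tool can map PERF_SAMPLE_ID / PERF_SAMPLE_IDENTIFIER values seen in the ring buffer back to the file descriptor it opened. Assuming a descriptor obtained from perf_event_open(2), usage looks roughly like this:

#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* fd: a file descriptor returned by perf_event_open(2) */
static int print_event_id(int fd)
{
        uint64_t id;

        if (ioctl(fd, PERF_EVENT_IOC_ID, &id) == -1) {
                perror("PERF_EVENT_IOC_ID");
                return -1;
        }
        printf("event id: %llu\n", (unsigned long long)id);
        return 0;
}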
@@ -3641,6 +3676,10 @@ void perf_event_update_userpage(struct perf_event *event) | |||
3641 | u64 enabled, running, now; | 3676 | u64 enabled, running, now; |
3642 | 3677 | ||
3643 | rcu_read_lock(); | 3678 | rcu_read_lock(); |
3679 | rb = rcu_dereference(event->rb); | ||
3680 | if (!rb) | ||
3681 | goto unlock; | ||
3682 | |||
3644 | /* | 3683 | /* |
3645 | * compute total_time_enabled, total_time_running | 3684 | * compute total_time_enabled, total_time_running |
3646 | * based on snapshot values taken when the event | 3685 | * based on snapshot values taken when the event |
@@ -3651,12 +3690,8 @@ void perf_event_update_userpage(struct perf_event *event) | |||
3651 | * NMI context | 3690 | * NMI context |
3652 | */ | 3691 | */ |
3653 | calc_timer_values(event, &now, &enabled, &running); | 3692 | calc_timer_values(event, &now, &enabled, &running); |
3654 | rb = rcu_dereference(event->rb); | ||
3655 | if (!rb) | ||
3656 | goto unlock; | ||
3657 | 3693 | ||
3658 | userpg = rb->user_page; | 3694 | userpg = rb->user_page; |
3659 | |||
3660 | /* | 3695 | /* |
3661 | * Disable preemption so as to not let the corresponding user-space | 3696 | * Disable preemption so as to not let the corresponding user-space |
3662 | * spin too long if we get preempted. | 3697 | * spin too long if we get preempted. |
@@ -4251,7 +4286,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header, | |||
4251 | if (sample_type & PERF_SAMPLE_TIME) | 4286 | if (sample_type & PERF_SAMPLE_TIME) |
4252 | data->time = perf_clock(); | 4287 | data->time = perf_clock(); |
4253 | 4288 | ||
4254 | if (sample_type & PERF_SAMPLE_ID) | 4289 | if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) |
4255 | data->id = primary_event_id(event); | 4290 | data->id = primary_event_id(event); |
4256 | 4291 | ||
4257 | if (sample_type & PERF_SAMPLE_STREAM_ID) | 4292 | if (sample_type & PERF_SAMPLE_STREAM_ID) |
@@ -4290,6 +4325,9 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle, | |||
4290 | 4325 | ||
4291 | if (sample_type & PERF_SAMPLE_CPU) | 4326 | if (sample_type & PERF_SAMPLE_CPU) |
4292 | perf_output_put(handle, data->cpu_entry); | 4327 | perf_output_put(handle, data->cpu_entry); |
4328 | |||
4329 | if (sample_type & PERF_SAMPLE_IDENTIFIER) | ||
4330 | perf_output_put(handle, data->id); | ||
4293 | } | 4331 | } |
4294 | 4332 | ||
4295 | void perf_event__output_id_sample(struct perf_event *event, | 4333 | void perf_event__output_id_sample(struct perf_event *event, |
@@ -4355,7 +4393,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
4355 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { | 4393 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { |
4356 | n = 0; | 4394 | n = 0; |
4357 | 4395 | ||
4358 | if (sub != event) | 4396 | if ((sub != event) && |
4397 | (sub->state == PERF_EVENT_STATE_ACTIVE)) | ||
4359 | sub->pmu->read(sub); | 4398 | sub->pmu->read(sub); |
4360 | 4399 | ||
4361 | values[n++] = perf_event_count(sub); | 4400 | values[n++] = perf_event_count(sub); |
@@ -4402,6 +4441,9 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
4402 | 4441 | ||
4403 | perf_output_put(handle, *header); | 4442 | perf_output_put(handle, *header); |
4404 | 4443 | ||
4444 | if (sample_type & PERF_SAMPLE_IDENTIFIER) | ||
4445 | perf_output_put(handle, data->id); | ||
4446 | |||
4405 | if (sample_type & PERF_SAMPLE_IP) | 4447 | if (sample_type & PERF_SAMPLE_IP) |
4406 | perf_output_put(handle, data->ip); | 4448 | perf_output_put(handle, data->ip); |
4407 | 4449 | ||
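PERF_SAMPLE_IDENTIFIER places the event ID at a fixed position: first thing in the body of a PERF_RECORD_SAMPLE (this hunk) and last in the sample_id trailer of the other record types (the __perf_event__output_id_sample() hunk above), so a parser can find the ID without decoding the rest of sample_type. A minimal user-side sketch, assuming the record has already been copied contiguously out of the mmap ring:

#include <stdint.h>
#include <linux/perf_event.h>

/* For a PERF_RECORD_SAMPLE taken with PERF_SAMPLE_IDENTIFIER set, the id is
 * the first u64 after the header, whatever else sample_type contains. */
static uint64_t sample_identifier(const struct perf_event_header *rec)
{
        const uint64_t *body = (const uint64_t *)(rec + 1);

        return body[0];
}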
@@ -4462,20 +4504,6 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
4462 | } | 4504 | } |
4463 | } | 4505 | } |
4464 | 4506 | ||
4465 | if (!event->attr.watermark) { | ||
4466 | int wakeup_events = event->attr.wakeup_events; | ||
4467 | |||
4468 | if (wakeup_events) { | ||
4469 | struct ring_buffer *rb = handle->rb; | ||
4470 | int events = local_inc_return(&rb->events); | ||
4471 | |||
4472 | if (events >= wakeup_events) { | ||
4473 | local_sub(wakeup_events, &rb->events); | ||
4474 | local_inc(&rb->wakeup); | ||
4475 | } | ||
4476 | } | ||
4477 | } | ||
4478 | |||
4479 | if (sample_type & PERF_SAMPLE_BRANCH_STACK) { | 4507 | if (sample_type & PERF_SAMPLE_BRANCH_STACK) { |
4480 | if (data->br_stack) { | 4508 | if (data->br_stack) { |
4481 | size_t size; | 4509 | size_t size; |
@@ -4511,16 +4539,31 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
4511 | } | 4539 | } |
4512 | } | 4540 | } |
4513 | 4541 | ||
4514 | if (sample_type & PERF_SAMPLE_STACK_USER) | 4542 | if (sample_type & PERF_SAMPLE_STACK_USER) { |
4515 | perf_output_sample_ustack(handle, | 4543 | perf_output_sample_ustack(handle, |
4516 | data->stack_user_size, | 4544 | data->stack_user_size, |
4517 | data->regs_user.regs); | 4545 | data->regs_user.regs); |
4546 | } | ||
4518 | 4547 | ||
4519 | if (sample_type & PERF_SAMPLE_WEIGHT) | 4548 | if (sample_type & PERF_SAMPLE_WEIGHT) |
4520 | perf_output_put(handle, data->weight); | 4549 | perf_output_put(handle, data->weight); |
4521 | 4550 | ||
4522 | if (sample_type & PERF_SAMPLE_DATA_SRC) | 4551 | if (sample_type & PERF_SAMPLE_DATA_SRC) |
4523 | perf_output_put(handle, data->data_src.val); | 4552 | perf_output_put(handle, data->data_src.val); |
4553 | |||
4554 | if (!event->attr.watermark) { | ||
4555 | int wakeup_events = event->attr.wakeup_events; | ||
4556 | |||
4557 | if (wakeup_events) { | ||
4558 | struct ring_buffer *rb = handle->rb; | ||
4559 | int events = local_inc_return(&rb->events); | ||
4560 | |||
4561 | if (events >= wakeup_events) { | ||
4562 | local_sub(wakeup_events, &rb->events); | ||
4563 | local_inc(&rb->wakeup); | ||
4564 | } | ||
4565 | } | ||
4566 | } | ||
4524 | } | 4567 | } |
4525 | 4568 | ||
4526 | void perf_prepare_sample(struct perf_event_header *header, | 4569 | void perf_prepare_sample(struct perf_event_header *header, |
@@ -4680,12 +4723,10 @@ perf_event_read_event(struct perf_event *event, | |||
4680 | perf_output_end(&handle); | 4723 | perf_output_end(&handle); |
4681 | } | 4724 | } |
4682 | 4725 | ||
4683 | typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data); | ||
4684 | typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); | 4726 | typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); |
4685 | 4727 | ||
4686 | static void | 4728 | static void |
4687 | perf_event_aux_ctx(struct perf_event_context *ctx, | 4729 | perf_event_aux_ctx(struct perf_event_context *ctx, |
4688 | perf_event_aux_match_cb match, | ||
4689 | perf_event_aux_output_cb output, | 4730 | perf_event_aux_output_cb output, |
4690 | void *data) | 4731 | void *data) |
4691 | { | 4732 | { |
@@ -4696,15 +4737,12 @@ perf_event_aux_ctx(struct perf_event_context *ctx, | |||
4696 | continue; | 4737 | continue; |
4697 | if (!event_filter_match(event)) | 4738 | if (!event_filter_match(event)) |
4698 | continue; | 4739 | continue; |
4699 | if (match(event, data)) | 4740 | output(event, data); |
4700 | output(event, data); | ||
4701 | } | 4741 | } |
4702 | } | 4742 | } |
4703 | 4743 | ||
4704 | static void | 4744 | static void |
4705 | perf_event_aux(perf_event_aux_match_cb match, | 4745 | perf_event_aux(perf_event_aux_output_cb output, void *data, |
4706 | perf_event_aux_output_cb output, | ||
4707 | void *data, | ||
4708 | struct perf_event_context *task_ctx) | 4746 | struct perf_event_context *task_ctx) |
4709 | { | 4747 | { |
4710 | struct perf_cpu_context *cpuctx; | 4748 | struct perf_cpu_context *cpuctx; |
@@ -4717,7 +4755,7 @@ perf_event_aux(perf_event_aux_match_cb match, | |||
4717 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); | 4755 | cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); |
4718 | if (cpuctx->unique_pmu != pmu) | 4756 | if (cpuctx->unique_pmu != pmu) |
4719 | goto next; | 4757 | goto next; |
4720 | perf_event_aux_ctx(&cpuctx->ctx, match, output, data); | 4758 | perf_event_aux_ctx(&cpuctx->ctx, output, data); |
4721 | if (task_ctx) | 4759 | if (task_ctx) |
4722 | goto next; | 4760 | goto next; |
4723 | ctxn = pmu->task_ctx_nr; | 4761 | ctxn = pmu->task_ctx_nr; |
@@ -4725,14 +4763,14 @@ perf_event_aux(perf_event_aux_match_cb match, | |||
4725 | goto next; | 4763 | goto next; |
4726 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); | 4764 | ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); |
4727 | if (ctx) | 4765 | if (ctx) |
4728 | perf_event_aux_ctx(ctx, match, output, data); | 4766 | perf_event_aux_ctx(ctx, output, data); |
4729 | next: | 4767 | next: |
4730 | put_cpu_ptr(pmu->pmu_cpu_context); | 4768 | put_cpu_ptr(pmu->pmu_cpu_context); |
4731 | } | 4769 | } |
4732 | 4770 | ||
4733 | if (task_ctx) { | 4771 | if (task_ctx) { |
4734 | preempt_disable(); | 4772 | preempt_disable(); |
4735 | perf_event_aux_ctx(task_ctx, match, output, data); | 4773 | perf_event_aux_ctx(task_ctx, output, data); |
4736 | preempt_enable(); | 4774 | preempt_enable(); |
4737 | } | 4775 | } |
4738 | rcu_read_unlock(); | 4776 | rcu_read_unlock(); |
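With the separate match callback gone, perf_event_aux() walks every context with a single output callback, and each side-band record type now filters for itself at the top of its output function (the perf_event_task_output(), perf_event_comm_output() and perf_event_mmap_output() hunks below all gain an early return). The iteration ends up shaped roughly like this (condensed sketch, not the literal code):

static void perf_event_aux_ctx(struct perf_event_context *ctx,
                               perf_event_aux_output_cb output, void *data)
{
        struct perf_event *event;

        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                if (event->state < PERF_EVENT_STATE_INACTIVE)
                        continue;
                if (!event_filter_match(event))
                        continue;
                /* no generic match step: the per-type output callback
                 * rejects events that did not ask for this record */
                output(event, data);
        }
}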
@@ -4741,7 +4779,7 @@ next: | |||
4741 | /* | 4779 | /* |
4742 | * task tracking -- fork/exit | 4780 | * task tracking -- fork/exit |
4743 | * | 4781 | * |
4744 | * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task | 4782 | * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task |
4745 | */ | 4783 | */ |
4746 | 4784 | ||
4747 | struct perf_task_event { | 4785 | struct perf_task_event { |
@@ -4759,6 +4797,13 @@ struct perf_task_event { | |||
4759 | } event_id; | 4797 | } event_id; |
4760 | }; | 4798 | }; |
4761 | 4799 | ||
4800 | static int perf_event_task_match(struct perf_event *event) | ||
4801 | { | ||
4802 | return event->attr.comm || event->attr.mmap || | ||
4803 | event->attr.mmap2 || event->attr.mmap_data || | ||
4804 | event->attr.task; | ||
4805 | } | ||
4806 | |||
4762 | static void perf_event_task_output(struct perf_event *event, | 4807 | static void perf_event_task_output(struct perf_event *event, |
4763 | void *data) | 4808 | void *data) |
4764 | { | 4809 | { |
@@ -4768,6 +4813,9 @@ static void perf_event_task_output(struct perf_event *event, | |||
4768 | struct task_struct *task = task_event->task; | 4813 | struct task_struct *task = task_event->task; |
4769 | int ret, size = task_event->event_id.header.size; | 4814 | int ret, size = task_event->event_id.header.size; |
4770 | 4815 | ||
4816 | if (!perf_event_task_match(event)) | ||
4817 | return; | ||
4818 | |||
4771 | perf_event_header__init_id(&task_event->event_id.header, &sample, event); | 4819 | perf_event_header__init_id(&task_event->event_id.header, &sample, event); |
4772 | 4820 | ||
4773 | ret = perf_output_begin(&handle, event, | 4821 | ret = perf_output_begin(&handle, event, |
@@ -4790,13 +4838,6 @@ out: | |||
4790 | task_event->event_id.header.size = size; | 4838 | task_event->event_id.header.size = size; |
4791 | } | 4839 | } |
4792 | 4840 | ||
4793 | static int perf_event_task_match(struct perf_event *event, | ||
4794 | void *data __maybe_unused) | ||
4795 | { | ||
4796 | return event->attr.comm || event->attr.mmap || | ||
4797 | event->attr.mmap_data || event->attr.task; | ||
4798 | } | ||
4799 | |||
4800 | static void perf_event_task(struct task_struct *task, | 4841 | static void perf_event_task(struct task_struct *task, |
4801 | struct perf_event_context *task_ctx, | 4842 | struct perf_event_context *task_ctx, |
4802 | int new) | 4843 | int new) |
@@ -4825,8 +4866,7 @@ static void perf_event_task(struct task_struct *task, | |||
4825 | }, | 4866 | }, |
4826 | }; | 4867 | }; |
4827 | 4868 | ||
4828 | perf_event_aux(perf_event_task_match, | 4869 | perf_event_aux(perf_event_task_output, |
4829 | perf_event_task_output, | ||
4830 | &task_event, | 4870 | &task_event, |
4831 | task_ctx); | 4871 | task_ctx); |
4832 | } | 4872 | } |
@@ -4853,6 +4893,11 @@ struct perf_comm_event { | |||
4853 | } event_id; | 4893 | } event_id; |
4854 | }; | 4894 | }; |
4855 | 4895 | ||
4896 | static int perf_event_comm_match(struct perf_event *event) | ||
4897 | { | ||
4898 | return event->attr.comm; | ||
4899 | } | ||
4900 | |||
4856 | static void perf_event_comm_output(struct perf_event *event, | 4901 | static void perf_event_comm_output(struct perf_event *event, |
4857 | void *data) | 4902 | void *data) |
4858 | { | 4903 | { |
@@ -4862,6 +4907,9 @@ static void perf_event_comm_output(struct perf_event *event, | |||
4862 | int size = comm_event->event_id.header.size; | 4907 | int size = comm_event->event_id.header.size; |
4863 | int ret; | 4908 | int ret; |
4864 | 4909 | ||
4910 | if (!perf_event_comm_match(event)) | ||
4911 | return; | ||
4912 | |||
4865 | perf_event_header__init_id(&comm_event->event_id.header, &sample, event); | 4913 | perf_event_header__init_id(&comm_event->event_id.header, &sample, event); |
4866 | ret = perf_output_begin(&handle, event, | 4914 | ret = perf_output_begin(&handle, event, |
4867 | comm_event->event_id.header.size); | 4915 | comm_event->event_id.header.size); |
@@ -4883,12 +4931,6 @@ out: | |||
4883 | comm_event->event_id.header.size = size; | 4931 | comm_event->event_id.header.size = size; |
4884 | } | 4932 | } |
4885 | 4933 | ||
4886 | static int perf_event_comm_match(struct perf_event *event, | ||
4887 | void *data __maybe_unused) | ||
4888 | { | ||
4889 | return event->attr.comm; | ||
4890 | } | ||
4891 | |||
4892 | static void perf_event_comm_event(struct perf_comm_event *comm_event) | 4934 | static void perf_event_comm_event(struct perf_comm_event *comm_event) |
4893 | { | 4935 | { |
4894 | char comm[TASK_COMM_LEN]; | 4936 | char comm[TASK_COMM_LEN]; |
@@ -4903,8 +4945,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) | |||
4903 | 4945 | ||
4904 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; | 4946 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; |
4905 | 4947 | ||
4906 | perf_event_aux(perf_event_comm_match, | 4948 | perf_event_aux(perf_event_comm_output, |
4907 | perf_event_comm_output, | ||
4908 | comm_event, | 4949 | comm_event, |
4909 | NULL); | 4950 | NULL); |
4910 | } | 4951 | } |
@@ -4955,6 +4996,9 @@ struct perf_mmap_event { | |||
4955 | 4996 | ||
4956 | const char *file_name; | 4997 | const char *file_name; |
4957 | int file_size; | 4998 | int file_size; |
4999 | int maj, min; | ||
5000 | u64 ino; | ||
5001 | u64 ino_generation; | ||
4958 | 5002 | ||
4959 | struct { | 5003 | struct { |
4960 | struct perf_event_header header; | 5004 | struct perf_event_header header; |
@@ -4967,6 +5011,17 @@ struct perf_mmap_event { | |||
4967 | } event_id; | 5011 | } event_id; |
4968 | }; | 5012 | }; |
4969 | 5013 | ||
5014 | static int perf_event_mmap_match(struct perf_event *event, | ||
5015 | void *data) | ||
5016 | { | ||
5017 | struct perf_mmap_event *mmap_event = data; | ||
5018 | struct vm_area_struct *vma = mmap_event->vma; | ||
5019 | int executable = vma->vm_flags & VM_EXEC; | ||
5020 | |||
5021 | return (!executable && event->attr.mmap_data) || | ||
5022 | (executable && (event->attr.mmap || event->attr.mmap2)); | ||
5023 | } | ||
5024 | |||
4970 | static void perf_event_mmap_output(struct perf_event *event, | 5025 | static void perf_event_mmap_output(struct perf_event *event, |
4971 | void *data) | 5026 | void *data) |
4972 | { | 5027 | { |
@@ -4976,6 +5031,16 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
4976 | int size = mmap_event->event_id.header.size; | 5031 | int size = mmap_event->event_id.header.size; |
4977 | int ret; | 5032 | int ret; |
4978 | 5033 | ||
5034 | if (!perf_event_mmap_match(event, data)) | ||
5035 | return; | ||
5036 | |||
5037 | if (event->attr.mmap2) { | ||
5038 | mmap_event->event_id.header.type = PERF_RECORD_MMAP2; | ||
5039 | mmap_event->event_id.header.size += sizeof(mmap_event->maj); | ||
5040 | mmap_event->event_id.header.size += sizeof(mmap_event->min); | ||
5041 | mmap_event->event_id.header.size += sizeof(mmap_event->ino); | ||
5042 | } | ||
5043 | |||
4979 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); | 5044 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); |
4980 | ret = perf_output_begin(&handle, event, | 5045 | ret = perf_output_begin(&handle, event, |
4981 | mmap_event->event_id.header.size); | 5046 | mmap_event->event_id.header.size); |
@@ -4986,6 +5051,14 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
4986 | mmap_event->event_id.tid = perf_event_tid(event, current); | 5051 | mmap_event->event_id.tid = perf_event_tid(event, current); |
4987 | 5052 | ||
4988 | perf_output_put(&handle, mmap_event->event_id); | 5053 | perf_output_put(&handle, mmap_event->event_id); |
5054 | |||
5055 | if (event->attr.mmap2) { | ||
5056 | perf_output_put(&handle, mmap_event->maj); | ||
5057 | perf_output_put(&handle, mmap_event->min); | ||
5058 | perf_output_put(&handle, mmap_event->ino); | ||
5059 | perf_output_put(&handle, mmap_event->ino_generation); | ||
5060 | } | ||
5061 | |||
4989 | __output_copy(&handle, mmap_event->file_name, | 5062 | __output_copy(&handle, mmap_event->file_name, |
4990 | mmap_event->file_size); | 5063 | mmap_event->file_size); |
4991 | 5064 | ||
@@ -4996,21 +5069,12 @@ out: | |||
4996 | mmap_event->event_id.header.size = size; | 5069 | mmap_event->event_id.header.size = size; |
4997 | } | 5070 | } |
4998 | 5071 | ||
4999 | static int perf_event_mmap_match(struct perf_event *event, | ||
5000 | void *data) | ||
5001 | { | ||
5002 | struct perf_mmap_event *mmap_event = data; | ||
5003 | struct vm_area_struct *vma = mmap_event->vma; | ||
5004 | int executable = vma->vm_flags & VM_EXEC; | ||
5005 | |||
5006 | return (!executable && event->attr.mmap_data) || | ||
5007 | (executable && event->attr.mmap); | ||
5008 | } | ||
5009 | |||
5010 | static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | 5072 | static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) |
5011 | { | 5073 | { |
5012 | struct vm_area_struct *vma = mmap_event->vma; | 5074 | struct vm_area_struct *vma = mmap_event->vma; |
5013 | struct file *file = vma->vm_file; | 5075 | struct file *file = vma->vm_file; |
5076 | int maj = 0, min = 0; | ||
5077 | u64 ino = 0, gen = 0; | ||
5014 | unsigned int size; | 5078 | unsigned int size; |
5015 | char tmp[16]; | 5079 | char tmp[16]; |
5016 | char *buf = NULL; | 5080 | char *buf = NULL; |
@@ -5019,6 +5083,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
5019 | memset(tmp, 0, sizeof(tmp)); | 5083 | memset(tmp, 0, sizeof(tmp)); |
5020 | 5084 | ||
5021 | if (file) { | 5085 | if (file) { |
5086 | struct inode *inode; | ||
5087 | dev_t dev; | ||
5022 | /* | 5088 | /* |
5023 | * d_path works from the end of the rb backwards, so we | 5089 | * d_path works from the end of the rb backwards, so we |
5024 | * need to add enough zero bytes after the string to handle | 5090 | * need to add enough zero bytes after the string to handle |
@@ -5034,6 +5100,13 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
5034 | name = strncpy(tmp, "//toolong", sizeof(tmp)); | 5100 | name = strncpy(tmp, "//toolong", sizeof(tmp)); |
5035 | goto got_name; | 5101 | goto got_name; |
5036 | } | 5102 | } |
5103 | inode = file_inode(vma->vm_file); | ||
5104 | dev = inode->i_sb->s_dev; | ||
5105 | ino = inode->i_ino; | ||
5106 | gen = inode->i_generation; | ||
5107 | maj = MAJOR(dev); | ||
5108 | min = MINOR(dev); | ||
5109 | |||
5037 | } else { | 5110 | } else { |
5038 | if (arch_vma_name(mmap_event->vma)) { | 5111 | if (arch_vma_name(mmap_event->vma)) { |
5039 | name = strncpy(tmp, arch_vma_name(mmap_event->vma), | 5112 | name = strncpy(tmp, arch_vma_name(mmap_event->vma), |
@@ -5064,14 +5137,17 @@ got_name: | |||
5064 | 5137 | ||
5065 | mmap_event->file_name = name; | 5138 | mmap_event->file_name = name; |
5066 | mmap_event->file_size = size; | 5139 | mmap_event->file_size = size; |
5140 | mmap_event->maj = maj; | ||
5141 | mmap_event->min = min; | ||
5142 | mmap_event->ino = ino; | ||
5143 | mmap_event->ino_generation = gen; | ||
5067 | 5144 | ||
5068 | if (!(vma->vm_flags & VM_EXEC)) | 5145 | if (!(vma->vm_flags & VM_EXEC)) |
5069 | mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; | 5146 | mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; |
5070 | 5147 | ||
5071 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; | 5148 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; |
5072 | 5149 | ||
5073 | perf_event_aux(perf_event_mmap_match, | 5150 | perf_event_aux(perf_event_mmap_output, |
5074 | perf_event_mmap_output, | ||
5075 | mmap_event, | 5151 | mmap_event, |
5076 | NULL); | 5152 | NULL); |
5077 | 5153 | ||
@@ -5101,6 +5177,10 @@ void perf_event_mmap(struct vm_area_struct *vma) | |||
5101 | .len = vma->vm_end - vma->vm_start, | 5177 | .len = vma->vm_end - vma->vm_start, |
5102 | .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, | 5178 | .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, |
5103 | }, | 5179 | }, |
5180 | /* .maj (attr_mmap2 only) */ | ||
5181 | /* .min (attr_mmap2 only) */ | ||
5182 | /* .ino (attr_mmap2 only) */ | ||
5183 | /* .ino_generation (attr_mmap2 only) */ | ||
5104 | }; | 5184 | }; |
5105 | 5185 | ||
5106 | perf_event_mmap_event(&mmap_event); | 5186 | perf_event_mmap_event(&mmap_event); |
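Taken together, the mmap hunks introduce PERF_RECORD_MMAP2: events that set attr.mmap2 receive an extended mapping record carrying device and inode identity, while plain attr.mmap consumers keep the classic record. As emitted at this point in the series, the record body is laid out as below (an illustrative user-side mirror, not a kernel header; later kernels extend this record further):

#include <stdint.h>
#include <linux/perf_event.h>

struct mmap2_record {                      /* header.type == PERF_RECORD_MMAP2 */
        struct perf_event_header header;
        uint32_t pid, tid;
        uint64_t addr, len, pgoff;
        uint32_t maj, min;                 /* device of the backing file */
        uint64_t ino, ino_generation;      /* inode identity */
        char     filename[];               /* NUL-padded file name */
        /* followed by the usual sample_id trailer when sample_id_all is set */
};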
@@ -5178,6 +5258,7 @@ static int __perf_event_overflow(struct perf_event *event, | |||
5178 | __this_cpu_inc(perf_throttled_count); | 5258 | __this_cpu_inc(perf_throttled_count); |
5179 | hwc->interrupts = MAX_INTERRUPTS; | 5259 | hwc->interrupts = MAX_INTERRUPTS; |
5180 | perf_log_throttle(event, 0); | 5260 | perf_log_throttle(event, 0); |
5261 | tick_nohz_full_kick(); | ||
5181 | ret = 1; | 5262 | ret = 1; |
5182 | } | 5263 | } |
5183 | } | 5264 | } |
@@ -6234,8 +6315,6 @@ perf_event_mux_interval_ms_store(struct device *dev, | |||
6234 | return count; | 6315 | return count; |
6235 | } | 6316 | } |
6236 | 6317 | ||
6237 | #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) | ||
6238 | |||
6239 | static struct device_attribute pmu_dev_attrs[] = { | 6318 | static struct device_attribute pmu_dev_attrs[] = { |
6240 | __ATTR_RO(type), | 6319 | __ATTR_RO(type), |
6241 | __ATTR_RW(perf_event_mux_interval_ms), | 6320 | __ATTR_RW(perf_event_mux_interval_ms), |
@@ -6445,6 +6524,44 @@ unlock: | |||
6445 | return pmu; | 6524 | return pmu; |
6446 | } | 6525 | } |
6447 | 6526 | ||
6527 | static void account_event_cpu(struct perf_event *event, int cpu) | ||
6528 | { | ||
6529 | if (event->parent) | ||
6530 | return; | ||
6531 | |||
6532 | if (has_branch_stack(event)) { | ||
6533 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
6534 | atomic_inc(&per_cpu(perf_branch_stack_events, cpu)); | ||
6535 | } | ||
6536 | if (is_cgroup_event(event)) | ||
6537 | atomic_inc(&per_cpu(perf_cgroup_events, cpu)); | ||
6538 | } | ||
6539 | |||
6540 | static void account_event(struct perf_event *event) | ||
6541 | { | ||
6542 | if (event->parent) | ||
6543 | return; | ||
6544 | |||
6545 | if (event->attach_state & PERF_ATTACH_TASK) | ||
6546 | static_key_slow_inc(&perf_sched_events.key); | ||
6547 | if (event->attr.mmap || event->attr.mmap_data) | ||
6548 | atomic_inc(&nr_mmap_events); | ||
6549 | if (event->attr.comm) | ||
6550 | atomic_inc(&nr_comm_events); | ||
6551 | if (event->attr.task) | ||
6552 | atomic_inc(&nr_task_events); | ||
6553 | if (event->attr.freq) { | ||
6554 | if (atomic_inc_return(&nr_freq_events) == 1) | ||
6555 | tick_nohz_full_kick_all(); | ||
6556 | } | ||
6557 | if (has_branch_stack(event)) | ||
6558 | static_key_slow_inc(&perf_sched_events.key); | ||
6559 | if (is_cgroup_event(event)) | ||
6560 | static_key_slow_inc(&perf_sched_events.key); | ||
6561 | |||
6562 | account_event_cpu(event, event->cpu); | ||
6563 | } | ||
6564 | |||
6448 | /* | 6565 | /* |
6449 | * Allocate and initialize a event structure | 6566 | * Allocate and initialize a event structure |
6450 | */ | 6567 | */ |
@@ -6459,7 +6576,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
6459 | struct pmu *pmu; | 6576 | struct pmu *pmu; |
6460 | struct perf_event *event; | 6577 | struct perf_event *event; |
6461 | struct hw_perf_event *hwc; | 6578 | struct hw_perf_event *hwc; |
6462 | long err; | 6579 | long err = -EINVAL; |
6463 | 6580 | ||
6464 | if ((unsigned)cpu >= nr_cpu_ids) { | 6581 | if ((unsigned)cpu >= nr_cpu_ids) { |
6465 | if (!task || cpu != -1) | 6582 | if (!task || cpu != -1) |
@@ -6542,49 +6659,35 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
6542 | * we currently do not support PERF_FORMAT_GROUP on inherited events | 6659 | * we currently do not support PERF_FORMAT_GROUP on inherited events |
6543 | */ | 6660 | */ |
6544 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) | 6661 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) |
6545 | goto done; | 6662 | goto err_ns; |
6546 | 6663 | ||
6547 | pmu = perf_init_event(event); | 6664 | pmu = perf_init_event(event); |
6548 | |||
6549 | done: | ||
6550 | err = 0; | ||
6551 | if (!pmu) | 6665 | if (!pmu) |
6552 | err = -EINVAL; | 6666 | goto err_ns; |
6553 | else if (IS_ERR(pmu)) | 6667 | else if (IS_ERR(pmu)) { |
6554 | err = PTR_ERR(pmu); | 6668 | err = PTR_ERR(pmu); |
6555 | 6669 | goto err_ns; | |
6556 | if (err) { | ||
6557 | if (event->ns) | ||
6558 | put_pid_ns(event->ns); | ||
6559 | kfree(event); | ||
6560 | return ERR_PTR(err); | ||
6561 | } | 6670 | } |
6562 | 6671 | ||
6563 | if (!event->parent) { | 6672 | if (!event->parent) { |
6564 | if (event->attach_state & PERF_ATTACH_TASK) | ||
6565 | static_key_slow_inc(&perf_sched_events.key); | ||
6566 | if (event->attr.mmap || event->attr.mmap_data) | ||
6567 | atomic_inc(&nr_mmap_events); | ||
6568 | if (event->attr.comm) | ||
6569 | atomic_inc(&nr_comm_events); | ||
6570 | if (event->attr.task) | ||
6571 | atomic_inc(&nr_task_events); | ||
6572 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { | 6673 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { |
6573 | err = get_callchain_buffers(); | 6674 | err = get_callchain_buffers(); |
6574 | if (err) { | 6675 | if (err) |
6575 | free_event(event); | 6676 | goto err_pmu; |
6576 | return ERR_PTR(err); | ||
6577 | } | ||
6578 | } | ||
6579 | if (has_branch_stack(event)) { | ||
6580 | static_key_slow_inc(&perf_sched_events.key); | ||
6581 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
6582 | atomic_inc(&per_cpu(perf_branch_stack_events, | ||
6583 | event->cpu)); | ||
6584 | } | 6677 | } |
6585 | } | 6678 | } |
6586 | 6679 | ||
6587 | return event; | 6680 | return event; |
6681 | |||
6682 | err_pmu: | ||
6683 | if (event->destroy) | ||
6684 | event->destroy(event); | ||
6685 | err_ns: | ||
6686 | if (event->ns) | ||
6687 | put_pid_ns(event->ns); | ||
6688 | kfree(event); | ||
6689 | |||
6690 | return ERR_PTR(err); | ||
6588 | } | 6691 | } |
6589 | 6692 | ||
6590 | static int perf_copy_attr(struct perf_event_attr __user *uattr, | 6693 | static int perf_copy_attr(struct perf_event_attr __user *uattr, |
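The perf_event_alloc() rework above also replaces the old "compute err, free inline" error handling with the kernel's usual goto ladder: each failure jumps to a label that unwinds only what has been set up so far (err_pmu drops what perf_init_event() created via event->destroy, err_ns drops the pid namespace reference and the allocation). In miniature, with illustrative names:

static struct thing *thing_alloc(void)
{
        struct thing *t;
        long err;

        t = kzalloc(sizeof(*t), GFP_KERNEL);
        if (!t)
                return ERR_PTR(-ENOMEM);

        err = setup_step_one(t);        /* hypothetical setup helpers */
        if (err)
                goto err_free;          /* nothing else to undo yet */

        err = setup_step_two(t);
        if (err)
                goto err_step_one;      /* undo step one, then the rest */

        return t;

err_step_one:
        teardown_step_one(t);
err_free:
        kfree(t);
        return ERR_PTR(err);
}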
@@ -6866,17 +6969,14 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6866 | 6969 | ||
6867 | if (flags & PERF_FLAG_PID_CGROUP) { | 6970 | if (flags & PERF_FLAG_PID_CGROUP) { |
6868 | err = perf_cgroup_connect(pid, event, &attr, group_leader); | 6971 | err = perf_cgroup_connect(pid, event, &attr, group_leader); |
6869 | if (err) | 6972 | if (err) { |
6870 | goto err_alloc; | 6973 | __free_event(event); |
6871 | /* | 6974 | goto err_task; |
6872 | * one more event: | 6975 | } |
6873 | * - that has cgroup constraint on event->cpu | ||
6874 | * - that may need work on context switch | ||
6875 | */ | ||
6876 | atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); | ||
6877 | static_key_slow_inc(&perf_sched_events.key); | ||
6878 | } | 6976 | } |
6879 | 6977 | ||
6978 | account_event(event); | ||
6979 | |||
6880 | /* | 6980 | /* |
6881 | * Special case software events and allow them to be part of | 6981 | * Special case software events and allow them to be part of |
6882 | * any hardware group. | 6982 | * any hardware group. |
@@ -7072,6 +7172,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
7072 | goto err; | 7172 | goto err; |
7073 | } | 7173 | } |
7074 | 7174 | ||
7175 | account_event(event); | ||
7176 | |||
7075 | ctx = find_get_context(event->pmu, task, cpu); | 7177 | ctx = find_get_context(event->pmu, task, cpu); |
7076 | if (IS_ERR(ctx)) { | 7178 | if (IS_ERR(ctx)) { |
7077 | err = PTR_ERR(ctx); | 7179 | err = PTR_ERR(ctx); |
@@ -7108,6 +7210,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) | |||
7108 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, | 7210 | list_for_each_entry_safe(event, tmp, &src_ctx->event_list, |
7109 | event_entry) { | 7211 | event_entry) { |
7110 | perf_remove_from_context(event); | 7212 | perf_remove_from_context(event); |
7213 | unaccount_event_cpu(event, src_cpu); | ||
7111 | put_ctx(src_ctx); | 7214 | put_ctx(src_ctx); |
7112 | list_add(&event->event_entry, &events); | 7215 | list_add(&event->event_entry, &events); |
7113 | } | 7216 | } |
@@ -7120,6 +7223,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) | |||
7120 | list_del(&event->event_entry); | 7223 | list_del(&event->event_entry); |
7121 | if (event->state >= PERF_EVENT_STATE_OFF) | 7224 | if (event->state >= PERF_EVENT_STATE_OFF) |
7122 | event->state = PERF_EVENT_STATE_INACTIVE; | 7225 | event->state = PERF_EVENT_STATE_INACTIVE; |
7226 | account_event_cpu(event, dst_cpu); | ||
7123 | perf_install_in_context(dst_ctx, event, dst_cpu); | 7227 | perf_install_in_context(dst_ctx, event, dst_cpu); |
7124 | get_ctx(dst_ctx); | 7228 | get_ctx(dst_ctx); |
7125 | } | 7229 | } |
@@ -7630,7 +7734,7 @@ static void __init perf_event_init_all_cpus(void) | |||
7630 | } | 7734 | } |
7631 | } | 7735 | } |
7632 | 7736 | ||
7633 | static void __cpuinit perf_event_init_cpu(int cpu) | 7737 | static void perf_event_init_cpu(int cpu) |
7634 | { | 7738 | { |
7635 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); | 7739 | struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); |
7636 | 7740 | ||
@@ -7719,7 +7823,7 @@ static struct notifier_block perf_reboot_notifier = { | |||
7719 | .priority = INT_MIN, | 7823 | .priority = INT_MIN, |
7720 | }; | 7824 | }; |
7721 | 7825 | ||
7722 | static int __cpuinit | 7826 | static int |
7723 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | 7827 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) |
7724 | { | 7828 | { |
7725 | unsigned int cpu = (long)hcpu; | 7829 | unsigned int cpu = (long)hcpu; |
@@ -7800,7 +7904,8 @@ unlock: | |||
7800 | device_initcall(perf_event_sysfs_init); | 7904 | device_initcall(perf_event_sysfs_init); |
7801 | 7905 | ||
7802 | #ifdef CONFIG_CGROUP_PERF | 7906 | #ifdef CONFIG_CGROUP_PERF |
7803 | static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) | 7907 | static struct cgroup_subsys_state * |
7908 | perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | ||
7804 | { | 7909 | { |
7805 | struct perf_cgroup *jc; | 7910 | struct perf_cgroup *jc; |
7806 | 7911 | ||
@@ -7817,11 +7922,10 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) | |||
7817 | return &jc->css; | 7922 | return &jc->css; |
7818 | } | 7923 | } |
7819 | 7924 | ||
7820 | static void perf_cgroup_css_free(struct cgroup *cont) | 7925 | static void perf_cgroup_css_free(struct cgroup_subsys_state *css) |
7821 | { | 7926 | { |
7822 | struct perf_cgroup *jc; | 7927 | struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css); |
7823 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), | 7928 | |
7824 | struct perf_cgroup, css); | ||
7825 | free_percpu(jc->info); | 7929 | free_percpu(jc->info); |
7826 | kfree(jc); | 7930 | kfree(jc); |
7827 | } | 7931 | } |
@@ -7833,15 +7937,17 @@ static int __perf_cgroup_move(void *info) | |||
7833 | return 0; | 7937 | return 0; |
7834 | } | 7938 | } |
7835 | 7939 | ||
7836 | static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 7940 | static void perf_cgroup_attach(struct cgroup_subsys_state *css, |
7941 | struct cgroup_taskset *tset) | ||
7837 | { | 7942 | { |
7838 | struct task_struct *task; | 7943 | struct task_struct *task; |
7839 | 7944 | ||
7840 | cgroup_taskset_for_each(task, cgrp, tset) | 7945 | cgroup_taskset_for_each(task, css, tset) |
7841 | task_function_call(task, __perf_cgroup_move, task); | 7946 | task_function_call(task, __perf_cgroup_move, task); |
7842 | } | 7947 | } |
7843 | 7948 | ||
7844 | static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, | 7949 | static void perf_cgroup_exit(struct cgroup_subsys_state *css, |
7950 | struct cgroup_subsys_state *old_css, | ||
7845 | struct task_struct *task) | 7951 | struct task_struct *task) |
7846 | { | 7952 | { |
7847 | /* | 7953 | /* |
diff --git a/kernel/fork.c b/kernel/fork.c index 66635c80a813..bf46287c91a4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1177,7 +1177,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1177 | * don't allow the creation of threads. | 1177 | * don't allow the creation of threads. |
1178 | */ | 1178 | */ |
1179 | if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && | 1179 | if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && |
1180 | (task_active_pid_ns(current) != current->nsproxy->pid_ns)) | 1180 | (task_active_pid_ns(current) != |
1181 | current->nsproxy->pid_ns_for_children)) | ||
1181 | return ERR_PTR(-EINVAL); | 1182 | return ERR_PTR(-EINVAL); |
1182 | 1183 | ||
1183 | retval = security_task_create(clone_flags); | 1184 | retval = security_task_create(clone_flags); |
@@ -1351,7 +1352,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1351 | 1352 | ||
1352 | if (pid != &init_struct_pid) { | 1353 | if (pid != &init_struct_pid) { |
1353 | retval = -ENOMEM; | 1354 | retval = -ENOMEM; |
1354 | pid = alloc_pid(p->nsproxy->pid_ns); | 1355 | pid = alloc_pid(p->nsproxy->pid_ns_for_children); |
1355 | if (!pid) | 1356 | if (!pid) |
1356 | goto bad_fork_cleanup_io; | 1357 | goto bad_fork_cleanup_io; |
1357 | } | 1358 | } |
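The fork.c hunks switch copy_process() from nsproxy->pid_ns to the new pid_ns_for_children, i.e. the namespace that future children will be created in rather than the one the caller itself occupies; the two differ after setns() or unshare(CLONE_NEWPID), since a task cannot change its own PID namespace. The user-visible behaviour this backs is roughly:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
        /* Needs CAP_SYS_ADMIN. The caller's own PID namespace is unchanged;
         * only the namespace used for future children is replaced. */
        if (unshare(CLONE_NEWPID) == -1) {
                perror("unshare");
                return 1;
        }
        printf("parent still sees itself as pid %d\n", getpid());

        pid_t child = fork();   /* alloc_pid() uses pid_ns_for_children */
        if (child == 0) {
                /* the first child becomes pid 1 of the new namespace */
                printf("child is pid %d\n", getpid());
                _exit(0);
        }
        waitpid(child, NULL, 0);
        return 0;
}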
@@ -1546,7 +1547,7 @@ static inline void init_idle_pids(struct pid_link *links) | |||
1546 | } | 1547 | } |
1547 | } | 1548 | } |
1548 | 1549 | ||
1549 | struct task_struct * __cpuinit fork_idle(int cpu) | 1550 | struct task_struct *fork_idle(int cpu) |
1550 | { | 1551 | { |
1551 | struct task_struct *task; | 1552 | struct task_struct *task; |
1552 | task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); | 1553 | task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); |
@@ -1679,6 +1680,12 @@ SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, | |||
1679 | int __user *, parent_tidptr, | 1680 | int __user *, parent_tidptr, |
1680 | int __user *, child_tidptr, | 1681 | int __user *, child_tidptr, |
1681 | int, tls_val) | 1682 | int, tls_val) |
1683 | #elif defined(CONFIG_CLONE_BACKWARDS3) | ||
1684 | SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp, | ||
1685 | int, stack_size, | ||
1686 | int __user *, parent_tidptr, | ||
1687 | int __user *, child_tidptr, | ||
1688 | int, tls_val) | ||
1682 | #else | 1689 | #else |
1683 | SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, | 1690 | SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, |
1684 | int __user *, parent_tidptr, | 1691 | int __user *, parent_tidptr, |
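Userspace never sees these per-architecture argument orders directly: the C library wrapper takes a callback and a stack pointer and marshals the arguments into whichever layout the kernel's CLONE_BACKWARDS* variant expects. A small, self-contained illustration using the glibc wrapper (nothing here is specific to this patch):

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

static int child_fn(void *arg)
{
	printf("child running as pid %d\n", (int)getpid());
	return 0;
}

int main(void)
{
	const size_t stack_size = 64 * 1024;
	char *stack = malloc(stack_size);
	pid_t pid;

	if (!stack)
		return 1;
	/* The wrapper is handed the stack top; the raw syscall argument
	 * order differs per architecture, which is what the kernel's
	 * CONFIG_CLONE_BACKWARDS* options select. */
	pid = clone(child_fn, stack + stack_size, SIGCHLD, NULL);
	if (pid < 0) {
		perror("clone");
		return 1;
	}
	waitpid(pid, NULL, 0);
	free(stack);
	return 0;
}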
diff --git a/kernel/freezer.c b/kernel/freezer.c index 8b2afc1c9df0..b462fa197517 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
@@ -33,7 +33,7 @@ static DEFINE_SPINLOCK(freezer_lock); | |||
33 | */ | 33 | */ |
34 | bool freezing_slow_path(struct task_struct *p) | 34 | bool freezing_slow_path(struct task_struct *p) |
35 | { | 35 | { |
36 | if (p->flags & PF_NOFREEZE) | 36 | if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) |
37 | return false; | 37 | return false; |
38 | 38 | ||
39 | if (pm_nosig_freezing || cgroup_freezing(p)) | 39 | if (pm_nosig_freezing || cgroup_freezing(p)) |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f0f4fe29cd21..383319bae3f7 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -1659,7 +1659,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, | |||
1659 | /* | 1659 | /* |
1660 | * Functions related to boot-time initialization: | 1660 | * Functions related to boot-time initialization: |
1661 | */ | 1661 | */ |
1662 | static void __cpuinit init_hrtimers_cpu(int cpu) | 1662 | static void init_hrtimers_cpu(int cpu) |
1663 | { | 1663 | { |
1664 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); | 1664 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); |
1665 | int i; | 1665 | int i; |
@@ -1740,7 +1740,7 @@ static void migrate_hrtimers(int scpu) | |||
1740 | 1740 | ||
1741 | #endif /* CONFIG_HOTPLUG_CPU */ | 1741 | #endif /* CONFIG_HOTPLUG_CPU */ |
1742 | 1742 | ||
1743 | static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, | 1743 | static int hrtimer_cpu_notify(struct notifier_block *self, |
1744 | unsigned long action, void *hcpu) | 1744 | unsigned long action, void *hcpu) |
1745 | { | 1745 | { |
1746 | int scpu = (long)hcpu; | 1746 | int scpu = (long)hcpu; |
@@ -1773,7 +1773,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, | |||
1773 | return NOTIFY_OK; | 1773 | return NOTIFY_OK; |
1774 | } | 1774 | } |
1775 | 1775 | ||
1776 | static struct notifier_block __cpuinitdata hrtimers_nb = { | 1776 | static struct notifier_block hrtimers_nb = { |
1777 | .notifier_call = hrtimer_cpu_notify, | 1777 | .notifier_call = hrtimer_cpu_notify, |
1778 | }; | 1778 | }; |
1779 | 1779 | ||
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 6df614912b9d..3e97fb126e6b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/lockdep.h> | 15 | #include <linux/lockdep.h> |
16 | #include <linux/export.h> | 16 | #include <linux/export.h> |
17 | #include <linux/sysctl.h> | 17 | #include <linux/sysctl.h> |
18 | #include <linux/utsname.h> | ||
18 | 19 | ||
19 | /* | 20 | /* |
20 | * The number of tasks checked: | 21 | * The number of tasks checked: |
@@ -99,10 +100,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
99 | * Ok, the task did not get scheduled for more than 2 minutes, | 100 | * Ok, the task did not get scheduled for more than 2 minutes, |
100 | * complain: | 101 | * complain: |
101 | */ | 102 | */ |
102 | printk(KERN_ERR "INFO: task %s:%d blocked for more than " | 103 | pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", |
103 | "%ld seconds.\n", t->comm, t->pid, timeout); | 104 | t->comm, t->pid, timeout); |
104 | printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | 105 | pr_err(" %s %s %.*s\n", |
105 | " disables this message.\n"); | 106 | print_tainted(), init_utsname()->release, |
107 | (int)strcspn(init_utsname()->version, " "), | ||
108 | init_utsname()->version); | ||
109 | pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | ||
110 | " disables this message.\n"); | ||
106 | sched_show_task(t); | 111 | sched_show_task(t); |
107 | debug_show_held_locks(t); | 112 | debug_show_held_locks(t); |
108 | 113 | ||
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 60f48fa0fd0d..297a9247a3b3 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/sort.h> | 13 | #include <linux/sort.h> |
14 | #include <linux/err.h> | 14 | #include <linux/err.h> |
15 | #include <linux/static_key.h> | 15 | #include <linux/static_key.h> |
16 | #include <linux/jump_label_ratelimit.h> | ||
16 | 17 | ||
17 | #ifdef HAVE_JUMP_LABEL | 18 | #ifdef HAVE_JUMP_LABEL |
18 | 19 | ||
diff --git a/kernel/lglock.c b/kernel/lglock.c index 6535a667a5a7..86ae2aebf004 100644 --- a/kernel/lglock.c +++ b/kernel/lglock.c | |||
@@ -21,7 +21,7 @@ void lg_local_lock(struct lglock *lg) | |||
21 | arch_spinlock_t *lock; | 21 | arch_spinlock_t *lock; |
22 | 22 | ||
23 | preempt_disable(); | 23 | preempt_disable(); |
24 | rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); | 24 | lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); |
25 | lock = this_cpu_ptr(lg->lock); | 25 | lock = this_cpu_ptr(lg->lock); |
26 | arch_spin_lock(lock); | 26 | arch_spin_lock(lock); |
27 | } | 27 | } |
@@ -31,7 +31,7 @@ void lg_local_unlock(struct lglock *lg) | |||
31 | { | 31 | { |
32 | arch_spinlock_t *lock; | 32 | arch_spinlock_t *lock; |
33 | 33 | ||
34 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | 34 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); |
35 | lock = this_cpu_ptr(lg->lock); | 35 | lock = this_cpu_ptr(lg->lock); |
36 | arch_spin_unlock(lock); | 36 | arch_spin_unlock(lock); |
37 | preempt_enable(); | 37 | preempt_enable(); |
@@ -43,7 +43,7 @@ void lg_local_lock_cpu(struct lglock *lg, int cpu) | |||
43 | arch_spinlock_t *lock; | 43 | arch_spinlock_t *lock; |
44 | 44 | ||
45 | preempt_disable(); | 45 | preempt_disable(); |
46 | rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); | 46 | lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); |
47 | lock = per_cpu_ptr(lg->lock, cpu); | 47 | lock = per_cpu_ptr(lg->lock, cpu); |
48 | arch_spin_lock(lock); | 48 | arch_spin_lock(lock); |
49 | } | 49 | } |
@@ -53,7 +53,7 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu) | |||
53 | { | 53 | { |
54 | arch_spinlock_t *lock; | 54 | arch_spinlock_t *lock; |
55 | 55 | ||
56 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | 56 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); |
57 | lock = per_cpu_ptr(lg->lock, cpu); | 57 | lock = per_cpu_ptr(lg->lock, cpu); |
58 | arch_spin_unlock(lock); | 58 | arch_spin_unlock(lock); |
59 | preempt_enable(); | 59 | preempt_enable(); |
@@ -65,7 +65,7 @@ void lg_global_lock(struct lglock *lg) | |||
65 | int i; | 65 | int i; |
66 | 66 | ||
67 | preempt_disable(); | 67 | preempt_disable(); |
68 | rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); | 68 | lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); |
69 | for_each_possible_cpu(i) { | 69 | for_each_possible_cpu(i) { |
70 | arch_spinlock_t *lock; | 70 | arch_spinlock_t *lock; |
71 | lock = per_cpu_ptr(lg->lock, i); | 71 | lock = per_cpu_ptr(lg->lock, i); |
@@ -78,7 +78,7 @@ void lg_global_unlock(struct lglock *lg) | |||
78 | { | 78 | { |
79 | int i; | 79 | int i; |
80 | 80 | ||
81 | rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); | 81 | lock_release(&lg->lock_dep_map, 1, _RET_IP_); |
82 | for_each_possible_cpu(i) { | 82 | for_each_possible_cpu(i) { |
83 | arch_spinlock_t *lock; | 83 | arch_spinlock_t *lock; |
84 | lock = per_cpu_ptr(lg->lock, i); | 84 | lock = per_cpu_ptr(lg->lock, i); |
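The lglock API itself is untouched by the hunk above; only the lockdep annotations change, so that taking a local lock is modelled as a shared acquisition of the lock family and a global lock as an exclusive one. For context, a minimal, hypothetical user of the API (names are illustrative):

#include <linux/lglock.h>

/* lg_lock_init(&example_lglock, "example_lglock") gives lockdep a name. */
static DEFINE_STATIC_LGLOCK(example_lglock);

/* Fast path: serialise only against users of this CPU's data. */
static void example_local_update(void)
{
	lg_local_lock(&example_lglock);
	/* touch per-cpu state owned by this CPU */
	lg_local_unlock(&example_lglock);
}

/* Slow path: exclude every CPU's local path while walking all of them. */
static void example_global_walk(void)
{
	lg_global_lock(&example_lglock);
	/* walk the per-cpu state of all CPUs */
	lg_global_unlock(&example_lglock);
}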
diff --git a/kernel/mutex.c b/kernel/mutex.c index ff05f4bd86eb..6d647aedffea 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -209,11 +209,13 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) | |||
209 | */ | 209 | */ |
210 | static inline int mutex_can_spin_on_owner(struct mutex *lock) | 210 | static inline int mutex_can_spin_on_owner(struct mutex *lock) |
211 | { | 211 | { |
212 | struct task_struct *owner; | ||
212 | int retval = 1; | 213 | int retval = 1; |
213 | 214 | ||
214 | rcu_read_lock(); | 215 | rcu_read_lock(); |
215 | if (lock->owner) | 216 | owner = ACCESS_ONCE(lock->owner); |
216 | retval = lock->owner->on_cpu; | 217 | if (owner) |
218 | retval = owner->on_cpu; | ||
217 | rcu_read_unlock(); | 219 | rcu_read_unlock(); |
218 | /* | 220 | /* |
219 | * if lock->owner is not set, the mutex owner may have just acquired | 221 | * if lock->owner is not set, the mutex owner may have just acquired |
@@ -461,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
461 | * performed the optimistic spinning cannot be done. | 463 | * performed the optimistic spinning cannot be done. |
462 | */ | 464 | */ |
463 | if (ACCESS_ONCE(ww->ctx)) | 465 | if (ACCESS_ONCE(ww->ctx)) |
464 | break; | 466 | goto slowpath; |
465 | } | 467 | } |
466 | 468 | ||
467 | /* | 469 | /* |
@@ -472,7 +474,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
472 | owner = ACCESS_ONCE(lock->owner); | 474 | owner = ACCESS_ONCE(lock->owner); |
473 | if (owner && !mutex_spin_on_owner(lock, owner)) { | 475 | if (owner && !mutex_spin_on_owner(lock, owner)) { |
474 | mspin_unlock(MLOCK(lock), &node); | 476 | mspin_unlock(MLOCK(lock), &node); |
475 | break; | 477 | goto slowpath; |
476 | } | 478 | } |
477 | 479 | ||
478 | if ((atomic_read(&lock->count) == 1) && | 480 | if ((atomic_read(&lock->count) == 1) && |
@@ -499,7 +501,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
499 | * the owner complete. | 501 | * the owner complete. |
500 | */ | 502 | */ |
501 | if (!owner && (need_resched() || rt_task(task))) | 503 | if (!owner && (need_resched() || rt_task(task))) |
502 | break; | 504 | goto slowpath; |
503 | 505 | ||
504 | /* | 506 | /* |
505 | * The cpu_relax() call is a compiler barrier which forces | 507 | * The cpu_relax() call is a compiler barrier which forces |
@@ -513,6 +515,10 @@ slowpath: | |||
513 | #endif | 515 | #endif |
514 | spin_lock_mutex(&lock->wait_lock, flags); | 516 | spin_lock_mutex(&lock->wait_lock, flags); |
515 | 517 | ||
518 | /* once more, can we acquire the lock? */ | ||
519 | if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1)) | ||
520 | goto skip_wait; | ||
521 | |||
516 | debug_mutex_lock_common(lock, &waiter); | 522 | debug_mutex_lock_common(lock, &waiter); |
517 | debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); | 523 | debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); |
518 | 524 | ||
@@ -520,9 +526,6 @@ slowpath: | |||
520 | list_add_tail(&waiter.list, &lock->wait_list); | 526 | list_add_tail(&waiter.list, &lock->wait_list); |
521 | waiter.task = task; | 527 | waiter.task = task; |
522 | 528 | ||
523 | if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1)) | ||
524 | goto done; | ||
525 | |||
526 | lock_contended(&lock->dep_map, ip); | 529 | lock_contended(&lock->dep_map, ip); |
527 | 530 | ||
528 | for (;;) { | 531 | for (;;) { |
@@ -536,7 +539,7 @@ slowpath: | |||
536 | * other waiters: | 539 | * other waiters: |
537 | */ | 540 | */ |
538 | if (MUTEX_SHOW_NO_WAITER(lock) && | 541 | if (MUTEX_SHOW_NO_WAITER(lock) && |
539 | (atomic_xchg(&lock->count, -1) == 1)) | 542 | (atomic_xchg(&lock->count, -1) == 1)) |
540 | break; | 543 | break; |
541 | 544 | ||
542 | /* | 545 | /* |
@@ -561,24 +564,25 @@ slowpath: | |||
561 | schedule_preempt_disabled(); | 564 | schedule_preempt_disabled(); |
562 | spin_lock_mutex(&lock->wait_lock, flags); | 565 | spin_lock_mutex(&lock->wait_lock, flags); |
563 | } | 566 | } |
567 | mutex_remove_waiter(lock, &waiter, current_thread_info()); | ||
568 | /* set it to 0 if there are no waiters left: */ | ||
569 | if (likely(list_empty(&lock->wait_list))) | ||
570 | atomic_set(&lock->count, 0); | ||
571 | debug_mutex_free_waiter(&waiter); | ||
564 | 572 | ||
565 | done: | 573 | skip_wait: |
574 | /* got the lock - cleanup and rejoice! */ | ||
566 | lock_acquired(&lock->dep_map, ip); | 575 | lock_acquired(&lock->dep_map, ip); |
567 | /* got the lock - rejoice! */ | ||
568 | mutex_remove_waiter(lock, &waiter, current_thread_info()); | ||
569 | mutex_set_owner(lock); | 576 | mutex_set_owner(lock); |
570 | 577 | ||
571 | if (!__builtin_constant_p(ww_ctx == NULL)) { | 578 | if (!__builtin_constant_p(ww_ctx == NULL)) { |
572 | struct ww_mutex *ww = container_of(lock, | 579 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); |
573 | struct ww_mutex, | ||
574 | base); | ||
575 | struct mutex_waiter *cur; | 580 | struct mutex_waiter *cur; |
576 | 581 | ||
577 | /* | 582 | /* |
578 | * This branch gets optimized out for the common case, | 583 | * This branch gets optimized out for the common case, |
579 | * and is only important for ww_mutex_lock. | 584 | * and is only important for ww_mutex_lock. |
580 | */ | 585 | */ |
581 | |||
582 | ww_mutex_lock_acquired(ww, ww_ctx); | 586 | ww_mutex_lock_acquired(ww, ww_ctx); |
583 | ww->ctx = ww_ctx; | 587 | ww->ctx = ww_ctx; |
584 | 588 | ||
@@ -592,15 +596,8 @@ done: | |||
592 | } | 596 | } |
593 | } | 597 | } |
594 | 598 | ||
595 | /* set it to 0 if there are no waiters left: */ | ||
596 | if (likely(list_empty(&lock->wait_list))) | ||
597 | atomic_set(&lock->count, 0); | ||
598 | |||
599 | spin_unlock_mutex(&lock->wait_lock, flags); | 599 | spin_unlock_mutex(&lock->wait_lock, flags); |
600 | |||
601 | debug_mutex_free_waiter(&waiter); | ||
602 | preempt_enable(); | 600 | preempt_enable(); |
603 | |||
604 | return 0; | 601 | return 0; |
605 | 602 | ||
606 | err: | 603 | err: |
@@ -686,7 +683,7 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | |||
686 | might_sleep(); | 683 | might_sleep(); |
687 | ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, | 684 | ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, |
688 | 0, &ctx->dep_map, _RET_IP_, ctx); | 685 | 0, &ctx->dep_map, _RET_IP_, ctx); |
689 | if (!ret && ctx->acquired > 0) | 686 | if (!ret && ctx->acquired > 1) |
690 | return ww_mutex_deadlock_injection(lock, ctx); | 687 | return ww_mutex_deadlock_injection(lock, ctx); |
691 | 688 | ||
692 | return ret; | 689 | return ret; |
@@ -702,7 +699,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | |||
702 | ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, | 699 | ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, |
703 | 0, &ctx->dep_map, _RET_IP_, ctx); | 700 | 0, &ctx->dep_map, _RET_IP_, ctx); |
704 | 701 | ||
705 | if (!ret && ctx->acquired > 0) | 702 | if (!ret && ctx->acquired > 1) |
706 | return ww_mutex_deadlock_injection(lock, ctx); | 703 | return ww_mutex_deadlock_injection(lock, ctx); |
707 | 704 | ||
708 | return ret; | 705 | return ret; |
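The ww_mutex hunks above only matter to callers that take several locks under one acquire context: with the new `acquired > 1` test, deadlock injection fires only once the context already holds another lock, since backing off with nothing held would be pointless. For reference, a sketch of that multi-lock pattern, with hypothetical struct and class names, modelled on the in-tree ww-mutex documentation:

#include <linux/ww_mutex.h>

static DEFINE_WW_CLASS(example_ww_class);

struct obj {
	struct ww_mutex lock;	/* set up with ww_mutex_init(&obj->lock, &example_ww_class) */
	/* payload */
};

static void use_pair(struct obj *o1, struct obj *o2)
{
	struct obj *objs[2] = { o1, o2 };
	struct obj *contended = NULL;
	struct ww_acquire_ctx ctx;
	int i, ret;

	ww_acquire_init(&ctx, &example_ww_class);
retry:
	for (i = 0; i < 2; i++) {
		if (objs[i] == contended) {
			contended = NULL;	/* already taken via lock_slow below */
			continue;
		}
		ret = ww_mutex_lock(&objs[i]->lock, &ctx);
		if (ret == -EDEADLK) {
			struct obj *busy = objs[i];

			while (i--)		/* drop what we hold... */
				ww_mutex_unlock(&objs[i]->lock);
			if (contended)		/* ...including a slow lock not yet revisited */
				ww_mutex_unlock(&contended->lock);
			ww_mutex_lock_slow(&busy->lock, &ctx);	/* sleep, then own the contended lock */
			contended = busy;
			goto retry;
		}
	}
	ww_acquire_done(&ctx);

	/* both objects locked; do the work */

	ww_mutex_unlock(&o1->lock);
	ww_mutex_unlock(&o2->lock);
	ww_acquire_fini(&ctx);
}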
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 364ceab15f0c..997cbb951a3b 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
@@ -29,15 +29,15 @@ | |||
29 | static struct kmem_cache *nsproxy_cachep; | 29 | static struct kmem_cache *nsproxy_cachep; |
30 | 30 | ||
31 | struct nsproxy init_nsproxy = { | 31 | struct nsproxy init_nsproxy = { |
32 | .count = ATOMIC_INIT(1), | 32 | .count = ATOMIC_INIT(1), |
33 | .uts_ns = &init_uts_ns, | 33 | .uts_ns = &init_uts_ns, |
34 | #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) | 34 | #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) |
35 | .ipc_ns = &init_ipc_ns, | 35 | .ipc_ns = &init_ipc_ns, |
36 | #endif | 36 | #endif |
37 | .mnt_ns = NULL, | 37 | .mnt_ns = NULL, |
38 | .pid_ns = &init_pid_ns, | 38 | .pid_ns_for_children = &init_pid_ns, |
39 | #ifdef CONFIG_NET | 39 | #ifdef CONFIG_NET |
40 | .net_ns = &init_net, | 40 | .net_ns = &init_net, |
41 | #endif | 41 | #endif |
42 | }; | 42 | }; |
43 | 43 | ||
@@ -85,9 +85,10 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, | |||
85 | goto out_ipc; | 85 | goto out_ipc; |
86 | } | 86 | } |
87 | 87 | ||
88 | new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); | 88 | new_nsp->pid_ns_for_children = |
89 | if (IS_ERR(new_nsp->pid_ns)) { | 89 | copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children); |
90 | err = PTR_ERR(new_nsp->pid_ns); | 90 | if (IS_ERR(new_nsp->pid_ns_for_children)) { |
91 | err = PTR_ERR(new_nsp->pid_ns_for_children); | ||
91 | goto out_pid; | 92 | goto out_pid; |
92 | } | 93 | } |
93 | 94 | ||
@@ -100,8 +101,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, | |||
100 | return new_nsp; | 101 | return new_nsp; |
101 | 102 | ||
102 | out_net: | 103 | out_net: |
103 | if (new_nsp->pid_ns) | 104 | if (new_nsp->pid_ns_for_children) |
104 | put_pid_ns(new_nsp->pid_ns); | 105 | put_pid_ns(new_nsp->pid_ns_for_children); |
105 | out_pid: | 106 | out_pid: |
106 | if (new_nsp->ipc_ns) | 107 | if (new_nsp->ipc_ns) |
107 | put_ipc_ns(new_nsp->ipc_ns); | 108 | put_ipc_ns(new_nsp->ipc_ns); |
@@ -174,8 +175,8 @@ void free_nsproxy(struct nsproxy *ns) | |||
174 | put_uts_ns(ns->uts_ns); | 175 | put_uts_ns(ns->uts_ns); |
175 | if (ns->ipc_ns) | 176 | if (ns->ipc_ns) |
176 | put_ipc_ns(ns->ipc_ns); | 177 | put_ipc_ns(ns->ipc_ns); |
177 | if (ns->pid_ns) | 178 | if (ns->pid_ns_for_children) |
178 | put_pid_ns(ns->pid_ns); | 179 | put_pid_ns(ns->pid_ns_for_children); |
179 | put_net(ns->net_ns); | 180 | put_net(ns->net_ns); |
180 | kmem_cache_free(nsproxy_cachep, ns); | 181 | kmem_cache_free(nsproxy_cachep, ns); |
181 | } | 182 | } |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6917e8edb48e..601bb361c235 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -349,8 +349,8 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) | |||
349 | if (ancestor != active) | 349 | if (ancestor != active) |
350 | return -EINVAL; | 350 | return -EINVAL; |
351 | 351 | ||
352 | put_pid_ns(nsproxy->pid_ns); | 352 | put_pid_ns(nsproxy->pid_ns_for_children); |
353 | nsproxy->pid_ns = get_pid_ns(new); | 353 | nsproxy->pid_ns_for_children = get_pid_ns(new); |
354 | return 0; | 354 | return 0; |
355 | } | 355 | } |
356 | 356 | ||
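The rename documents an asymmetry that is easy to miss from userspace: unshare(CLONE_NEWPID) and setns() on a pid namespace never move the calling task, they only set the namespace that future children are born into, which is exactly what nsproxy->pid_ns_for_children now says. A small illustration (needs CAP_SYS_ADMIN; not part of the patch):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t child;

	if (unshare(CLONE_NEWPID) != 0) {
		perror("unshare");	/* typically needs CAP_SYS_ADMIN */
		return 1;
	}
	/* The caller stays in its original pid namespace... */
	printf("parent: getpid() = %d\n", (int)getpid());

	child = fork();
	if (child == 0) {
		/* ...but the first child becomes pid 1 of the new one. */
		printf("child:  getpid() = %d\n", (int)getpid());
		return 0;
	}
	if (child > 0)
		waitpid(child, NULL, 0);
	return 0;
}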
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index c6422ffeda9a..9012ecf7b814 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c | |||
@@ -32,7 +32,8 @@ static void try_to_suspend(struct work_struct *work) | |||
32 | 32 | ||
33 | mutex_lock(&autosleep_lock); | 33 | mutex_lock(&autosleep_lock); |
34 | 34 | ||
35 | if (!pm_save_wakeup_count(initial_count)) { | 35 | if (!pm_save_wakeup_count(initial_count) || |
36 | system_state != SYSTEM_RUNNING) { | ||
36 | mutex_unlock(&autosleep_lock); | 37 | mutex_unlock(&autosleep_lock); |
37 | goto out; | 38 | goto out; |
38 | } | 39 | } |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b26f5f1e773e..3085e62a80a5 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -39,7 +39,7 @@ static int resume_delay; | |||
39 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; | 39 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; |
40 | dev_t swsusp_resume_device; | 40 | dev_t swsusp_resume_device; |
41 | sector_t swsusp_resume_block; | 41 | sector_t swsusp_resume_block; |
42 | int in_suspend __nosavedata; | 42 | __visible int in_suspend __nosavedata; |
43 | 43 | ||
44 | enum { | 44 | enum { |
45 | HIBERNATION_INVALID, | 45 | HIBERNATION_INVALID, |
diff --git a/kernel/power/process.c b/kernel/power/process.c index fc0df8486449..06ec8869dbf1 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -109,6 +109,8 @@ static int try_to_freeze_tasks(bool user_only) | |||
109 | 109 | ||
110 | /** | 110 | /** |
111 | * freeze_processes - Signal user space processes to enter the refrigerator. | 111 | * freeze_processes - Signal user space processes to enter the refrigerator. |
112 | * The current thread will not be frozen. The same process that calls | ||
113 | * freeze_processes must later call thaw_processes. | ||
112 | * | 114 | * |
113 | * On success, returns 0. On failure, -errno and system is fully thawed. | 115 | * On success, returns 0. On failure, -errno and system is fully thawed. |
114 | */ | 116 | */ |
@@ -120,6 +122,9 @@ int freeze_processes(void) | |||
120 | if (error) | 122 | if (error) |
121 | return error; | 123 | return error; |
122 | 124 | ||
125 | /* Make sure this task doesn't get frozen */ | ||
126 | current->flags |= PF_SUSPEND_TASK; | ||
127 | |||
123 | if (!pm_freezing) | 128 | if (!pm_freezing) |
124 | atomic_inc(&system_freezing_cnt); | 129 | atomic_inc(&system_freezing_cnt); |
125 | 130 | ||
@@ -168,6 +173,7 @@ int freeze_kernel_threads(void) | |||
168 | void thaw_processes(void) | 173 | void thaw_processes(void) |
169 | { | 174 | { |
170 | struct task_struct *g, *p; | 175 | struct task_struct *g, *p; |
176 | struct task_struct *curr = current; | ||
171 | 177 | ||
172 | if (pm_freezing) | 178 | if (pm_freezing) |
173 | atomic_dec(&system_freezing_cnt); | 179 | atomic_dec(&system_freezing_cnt); |
@@ -182,10 +188,15 @@ void thaw_processes(void) | |||
182 | 188 | ||
183 | read_lock(&tasklist_lock); | 189 | read_lock(&tasklist_lock); |
184 | do_each_thread(g, p) { | 190 | do_each_thread(g, p) { |
191 | /* No other threads should have PF_SUSPEND_TASK set */ | ||
192 | WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); | ||
185 | __thaw_task(p); | 193 | __thaw_task(p); |
186 | } while_each_thread(g, p); | 194 | } while_each_thread(g, p); |
187 | read_unlock(&tasklist_lock); | 195 | read_unlock(&tasklist_lock); |
188 | 196 | ||
197 | WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); | ||
198 | curr->flags &= ~PF_SUSPEND_TASK; | ||
199 | |||
189 | usermodehelper_enable(); | 200 | usermodehelper_enable(); |
190 | 201 | ||
191 | schedule(); | 202 | schedule(); |
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 06fe28589e9c..a394297f8b2f 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
@@ -296,6 +296,17 @@ int pm_qos_request_active(struct pm_qos_request *req) | |||
296 | } | 296 | } |
297 | EXPORT_SYMBOL_GPL(pm_qos_request_active); | 297 | EXPORT_SYMBOL_GPL(pm_qos_request_active); |
298 | 298 | ||
299 | static void __pm_qos_update_request(struct pm_qos_request *req, | ||
300 | s32 new_value) | ||
301 | { | ||
302 | trace_pm_qos_update_request(req->pm_qos_class, new_value); | ||
303 | |||
304 | if (new_value != req->node.prio) | ||
305 | pm_qos_update_target( | ||
306 | pm_qos_array[req->pm_qos_class]->constraints, | ||
307 | &req->node, PM_QOS_UPDATE_REQ, new_value); | ||
308 | } | ||
309 | |||
299 | /** | 310 | /** |
300 | * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout | 311 | * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout |
301 | * @work: work struct for the delayed work (timeout) | 312 | * @work: work struct for the delayed work (timeout) |
@@ -308,7 +319,7 @@ static void pm_qos_work_fn(struct work_struct *work) | |||
308 | struct pm_qos_request, | 319 | struct pm_qos_request, |
309 | work); | 320 | work); |
310 | 321 | ||
311 | pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); | 322 | __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); |
312 | } | 323 | } |
313 | 324 | ||
314 | /** | 325 | /** |
@@ -364,12 +375,7 @@ void pm_qos_update_request(struct pm_qos_request *req, | |||
364 | } | 375 | } |
365 | 376 | ||
366 | cancel_delayed_work_sync(&req->work); | 377 | cancel_delayed_work_sync(&req->work); |
367 | 378 | __pm_qos_update_request(req, new_value); | |
368 | trace_pm_qos_update_request(req->pm_qos_class, new_value); | ||
369 | if (new_value != req->node.prio) | ||
370 | pm_qos_update_target( | ||
371 | pm_qos_array[req->pm_qos_class]->constraints, | ||
372 | &req->node, PM_QOS_UPDATE_REQ, new_value); | ||
373 | } | 379 | } |
374 | EXPORT_SYMBOL_GPL(pm_qos_update_request); | 380 | EXPORT_SYMBOL_GPL(pm_qos_update_request); |
375 | 381 | ||
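For context, the path fixed above is driven by pm_qos_update_request_timeout(): when the timeout fires, pm_qos_work_fn() drops the request back to PM_QOS_DEFAULT_VALUE, and by going through __pm_qos_update_request() it now skips the cancel_delayed_work_sync() step that pm_qos_update_request() performs, which it would otherwise end up calling on its own work item. A hypothetical driver-side use of the interface (names and values are illustrative):

#include <linux/pm_qos.h>

static struct pm_qos_request example_qos_req;

static void example_open(void)
{
	/* Ask for at most 20us of CPU wakeup latency while active. */
	pm_qos_add_request(&example_qos_req, PM_QOS_CPU_DMA_LATENCY, 20);
}

static void example_burst(void)
{
	/* Loosen this request to 100us for 5ms; when the timeout fires,
	 * pm_qos_work_fn() resets it to PM_QOS_DEFAULT_VALUE. */
	pm_qos_update_request_timeout(&example_qos_req, 100, 5000);
}

static void example_close(void)
{
	pm_qos_remove_request(&example_qos_req);
}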
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ece04223bb1e..62ee437b5c7e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -210,6 +210,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
210 | goto Platform_wake; | 210 | goto Platform_wake; |
211 | } | 211 | } |
212 | 212 | ||
213 | ftrace_stop(); | ||
213 | error = disable_nonboot_cpus(); | 214 | error = disable_nonboot_cpus(); |
214 | if (error || suspend_test(TEST_CPUS)) | 215 | if (error || suspend_test(TEST_CPUS)) |
215 | goto Enable_cpus; | 216 | goto Enable_cpus; |
@@ -232,6 +233,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
232 | 233 | ||
233 | Enable_cpus: | 234 | Enable_cpus: |
234 | enable_nonboot_cpus(); | 235 | enable_nonboot_cpus(); |
236 | ftrace_start(); | ||
235 | 237 | ||
236 | Platform_wake: | 238 | Platform_wake: |
237 | if (need_suspend_ops(state) && suspend_ops->wake) | 239 | if (need_suspend_ops(state) && suspend_ops->wake) |
@@ -265,7 +267,6 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
265 | goto Close; | 267 | goto Close; |
266 | } | 268 | } |
267 | suspend_console(); | 269 | suspend_console(); |
268 | ftrace_stop(); | ||
269 | suspend_test_start(); | 270 | suspend_test_start(); |
270 | error = dpm_suspend_start(PMSG_SUSPEND); | 271 | error = dpm_suspend_start(PMSG_SUSPEND); |
271 | if (error) { | 272 | if (error) { |
@@ -285,7 +286,6 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
285 | suspend_test_start(); | 286 | suspend_test_start(); |
286 | dpm_resume_end(PMSG_RESUME); | 287 | dpm_resume_end(PMSG_RESUME); |
287 | suspend_test_finish("resume devices"); | 288 | suspend_test_finish("resume devices"); |
288 | ftrace_start(); | ||
289 | resume_console(); | 289 | resume_console(); |
290 | Close: | 290 | Close: |
291 | if (need_suspend_ops(state) && suspend_ops->end) | 291 | if (need_suspend_ops(state) && suspend_ops->end) |
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile new file mode 100644 index 000000000000..85405bdcf2b3 --- /dev/null +++ b/kernel/printk/Makefile | |||
@@ -0,0 +1,2 @@ | |||
1 | obj-y = printk.o | ||
2 | obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o | ||
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c new file mode 100644 index 000000000000..276762f3a460 --- /dev/null +++ b/kernel/printk/braille.c | |||
@@ -0,0 +1,49 @@ | |||
1 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
2 | |||
3 | #include <linux/kernel.h> | ||
4 | #include <linux/console.h> | ||
5 | #include <linux/string.h> | ||
6 | |||
7 | #include "console_cmdline.h" | ||
8 | #include "braille.h" | ||
9 | |||
10 | char *_braille_console_setup(char **str, char **brl_options) | ||
11 | { | ||
12 | if (!memcmp(*str, "brl,", 4)) { | ||
13 | *brl_options = ""; | ||
14 | *str += 4; | ||
15 | } else if (!memcmp(*str, "brl=", 4)) { | ||
16 | *brl_options = *str + 4; | ||
17 | *str = strchr(*brl_options, ','); | ||
18 | if (!*str) | ||
19 | pr_err("need port name after brl=\n"); | ||
20 | else | ||
21 | *((*str)++) = 0; | ||
22 | } else | ||
23 | return NULL; | ||
24 | |||
25 | return *str; | ||
26 | } | ||
27 | |||
28 | int | ||
29 | _braille_register_console(struct console *console, struct console_cmdline *c) | ||
30 | { | ||
31 | int rtn = 0; | ||
32 | |||
33 | if (c->brl_options) { | ||
34 | console->flags |= CON_BRL; | ||
35 | rtn = braille_register_console(console, c->index, c->options, | ||
36 | c->brl_options); | ||
37 | } | ||
38 | |||
39 | return rtn; | ||
40 | } | ||
41 | |||
42 | int | ||
43 | _braille_unregister_console(struct console *console) | ||
44 | { | ||
45 | if (console->flags & CON_BRL) | ||
46 | return braille_unregister_console(console); | ||
47 | |||
48 | return 0; | ||
49 | } | ||
diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h new file mode 100644 index 000000000000..769d771145c8 --- /dev/null +++ b/kernel/printk/braille.h | |||
@@ -0,0 +1,48 @@ | |||
1 | #ifndef _PRINTK_BRAILLE_H | ||
2 | #define _PRINTK_BRAILLE_H | ||
3 | |||
4 | #ifdef CONFIG_A11Y_BRAILLE_CONSOLE | ||
5 | |||
6 | static inline void | ||
7 | braille_set_options(struct console_cmdline *c, char *brl_options) | ||
8 | { | ||
9 | c->brl_options = brl_options; | ||
10 | } | ||
11 | |||
12 | char * | ||
13 | _braille_console_setup(char **str, char **brl_options); | ||
14 | |||
15 | int | ||
16 | _braille_register_console(struct console *console, struct console_cmdline *c); | ||
17 | |||
18 | int | ||
19 | _braille_unregister_console(struct console *console); | ||
20 | |||
21 | #else | ||
22 | |||
23 | static inline void | ||
24 | braille_set_options(struct console_cmdline *c, char *brl_options) | ||
25 | { | ||
26 | } | ||
27 | |||
28 | static inline char * | ||
29 | _braille_console_setup(char **str, char **brl_options) | ||
30 | { | ||
31 | return NULL; | ||
32 | } | ||
33 | |||
34 | static inline int | ||
35 | _braille_register_console(struct console *console, struct console_cmdline *c) | ||
36 | { | ||
37 | return 0; | ||
38 | } | ||
39 | |||
40 | static inline int | ||
41 | _braille_unregister_console(struct console *console) | ||
42 | { | ||
43 | return 0; | ||
44 | } | ||
45 | |||
46 | #endif | ||
47 | |||
48 | #endif | ||
diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h new file mode 100644 index 000000000000..cbd69d842341 --- /dev/null +++ b/kernel/printk/console_cmdline.h | |||
@@ -0,0 +1,14 @@ | |||
1 | #ifndef _CONSOLE_CMDLINE_H | ||
2 | #define _CONSOLE_CMDLINE_H | ||
3 | |||
4 | struct console_cmdline | ||
5 | { | ||
6 | char name[8]; /* Name of the driver */ | ||
7 | int index; /* Minor dev. to use */ | ||
8 | char *options; /* Options for the driver */ | ||
9 | #ifdef CONFIG_A11Y_BRAILLE_CONSOLE | ||
10 | char *brl_options; /* Options for braille driver */ | ||
11 | #endif | ||
12 | }; | ||
13 | |||
14 | #endif | ||
diff --git a/kernel/printk.c b/kernel/printk/printk.c index d37d45c90ae6..b4e8500afdb3 100644 --- a/kernel/printk.c +++ b/kernel/printk/printk.c | |||
@@ -51,6 +51,9 @@ | |||
51 | #define CREATE_TRACE_POINTS | 51 | #define CREATE_TRACE_POINTS |
52 | #include <trace/events/printk.h> | 52 | #include <trace/events/printk.h> |
53 | 53 | ||
54 | #include "console_cmdline.h" | ||
55 | #include "braille.h" | ||
56 | |||
54 | /* printk's without a loglevel use this.. */ | 57 | /* printk's without a loglevel use this.. */ |
55 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL | 58 | #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL |
56 | 59 | ||
@@ -105,19 +108,11 @@ static struct console *exclusive_console; | |||
105 | /* | 108 | /* |
106 | * Array of consoles built from command line options (console=) | 109 | * Array of consoles built from command line options (console=) |
107 | */ | 110 | */ |
108 | struct console_cmdline | ||
109 | { | ||
110 | char name[8]; /* Name of the driver */ | ||
111 | int index; /* Minor dev. to use */ | ||
112 | char *options; /* Options for the driver */ | ||
113 | #ifdef CONFIG_A11Y_BRAILLE_CONSOLE | ||
114 | char *brl_options; /* Options for braille driver */ | ||
115 | #endif | ||
116 | }; | ||
117 | 111 | ||
118 | #define MAX_CMDLINECONSOLES 8 | 112 | #define MAX_CMDLINECONSOLES 8 |
119 | 113 | ||
120 | static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; | 114 | static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; |
115 | |||
121 | static int selected_console = -1; | 116 | static int selected_console = -1; |
122 | static int preferred_console = -1; | 117 | static int preferred_console = -1; |
123 | int console_set_on_cmdline; | 118 | int console_set_on_cmdline; |
@@ -178,7 +173,7 @@ static int console_may_schedule; | |||
178 | * 67 "g" | 173 | * 67 "g" |
179 | * 0032 00 00 00 padding to next message header | 174 | * 0032 00 00 00 padding to next message header |
180 | * | 175 | * |
181 | * The 'struct log' buffer header must never be directly exported to | 176 | * The 'struct printk_log' buffer header must never be directly exported to |
182 | * userspace, it is a kernel-private implementation detail that might | 177 | * userspace, it is a kernel-private implementation detail that might |
183 | * need to be changed in the future, when the requirements change. | 178 | * need to be changed in the future, when the requirements change. |
184 | * | 179 | * |
@@ -200,7 +195,7 @@ enum log_flags { | |||
200 | LOG_CONT = 8, /* text is a fragment of a continuation line */ | 195 | LOG_CONT = 8, /* text is a fragment of a continuation line */ |
201 | }; | 196 | }; |
202 | 197 | ||
203 | struct log { | 198 | struct printk_log { |
204 | u64 ts_nsec; /* timestamp in nanoseconds */ | 199 | u64 ts_nsec; /* timestamp in nanoseconds */ |
205 | u16 len; /* length of entire record */ | 200 | u16 len; /* length of entire record */ |
206 | u16 text_len; /* length of text buffer */ | 201 | u16 text_len; /* length of text buffer */ |
@@ -248,7 +243,7 @@ static u32 clear_idx; | |||
248 | #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) | 243 | #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) |
249 | #define LOG_ALIGN 4 | 244 | #define LOG_ALIGN 4 |
250 | #else | 245 | #else |
251 | #define LOG_ALIGN __alignof__(struct log) | 246 | #define LOG_ALIGN __alignof__(struct printk_log) |
252 | #endif | 247 | #endif |
253 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | 248 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) |
254 | static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); | 249 | static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); |
@@ -259,35 +254,35 @@ static u32 log_buf_len = __LOG_BUF_LEN; | |||
259 | static volatile unsigned int logbuf_cpu = UINT_MAX; | 254 | static volatile unsigned int logbuf_cpu = UINT_MAX; |
260 | 255 | ||
261 | /* human readable text of the record */ | 256 | /* human readable text of the record */ |
262 | static char *log_text(const struct log *msg) | 257 | static char *log_text(const struct printk_log *msg) |
263 | { | 258 | { |
264 | return (char *)msg + sizeof(struct log); | 259 | return (char *)msg + sizeof(struct printk_log); |
265 | } | 260 | } |
266 | 261 | ||
267 | /* optional key/value pair dictionary attached to the record */ | 262 | /* optional key/value pair dictionary attached to the record */ |
268 | static char *log_dict(const struct log *msg) | 263 | static char *log_dict(const struct printk_log *msg) |
269 | { | 264 | { |
270 | return (char *)msg + sizeof(struct log) + msg->text_len; | 265 | return (char *)msg + sizeof(struct printk_log) + msg->text_len; |
271 | } | 266 | } |
272 | 267 | ||
273 | /* get record by index; idx must point to valid msg */ | 268 | /* get record by index; idx must point to valid msg */ |
274 | static struct log *log_from_idx(u32 idx) | 269 | static struct printk_log *log_from_idx(u32 idx) |
275 | { | 270 | { |
276 | struct log *msg = (struct log *)(log_buf + idx); | 271 | struct printk_log *msg = (struct printk_log *)(log_buf + idx); |
277 | 272 | ||
278 | /* | 273 | /* |
279 | * A length == 0 record is the end of buffer marker. Wrap around and | 274 | * A length == 0 record is the end of buffer marker. Wrap around and |
280 | * read the message at the start of the buffer. | 275 | * read the message at the start of the buffer. |
281 | */ | 276 | */ |
282 | if (!msg->len) | 277 | if (!msg->len) |
283 | return (struct log *)log_buf; | 278 | return (struct printk_log *)log_buf; |
284 | return msg; | 279 | return msg; |
285 | } | 280 | } |
286 | 281 | ||
287 | /* get next record; idx must point to valid msg */ | 282 | /* get next record; idx must point to valid msg */ |
288 | static u32 log_next(u32 idx) | 283 | static u32 log_next(u32 idx) |
289 | { | 284 | { |
290 | struct log *msg = (struct log *)(log_buf + idx); | 285 | struct printk_log *msg = (struct printk_log *)(log_buf + idx); |
291 | 286 | ||
292 | /* length == 0 indicates the end of the buffer; wrap */ | 287 | /* length == 0 indicates the end of the buffer; wrap */ |
293 | /* | 288 | /* |
@@ -296,7 +291,7 @@ static u32 log_next(u32 idx) | |||
296 | * return the one after that. | 291 | * return the one after that. |
297 | */ | 292 | */ |
298 | if (!msg->len) { | 293 | if (!msg->len) { |
299 | msg = (struct log *)log_buf; | 294 | msg = (struct printk_log *)log_buf; |
300 | return msg->len; | 295 | return msg->len; |
301 | } | 296 | } |
302 | return idx + msg->len; | 297 | return idx + msg->len; |
@@ -308,11 +303,11 @@ static void log_store(int facility, int level, | |||
308 | const char *dict, u16 dict_len, | 303 | const char *dict, u16 dict_len, |
309 | const char *text, u16 text_len) | 304 | const char *text, u16 text_len) |
310 | { | 305 | { |
311 | struct log *msg; | 306 | struct printk_log *msg; |
312 | u32 size, pad_len; | 307 | u32 size, pad_len; |
313 | 308 | ||
314 | /* number of '\0' padding bytes to next message */ | 309 | /* number of '\0' padding bytes to next message */ |
315 | size = sizeof(struct log) + text_len + dict_len; | 310 | size = sizeof(struct printk_log) + text_len + dict_len; |
316 | pad_len = (-size) & (LOG_ALIGN - 1); | 311 | pad_len = (-size) & (LOG_ALIGN - 1); |
317 | size += pad_len; | 312 | size += pad_len; |
318 | 313 | ||
@@ -324,7 +319,7 @@ static void log_store(int facility, int level, | |||
324 | else | 319 | else |
325 | free = log_first_idx - log_next_idx; | 320 | free = log_first_idx - log_next_idx; |
326 | 321 | ||
327 | if (free > size + sizeof(struct log)) | 322 | if (free > size + sizeof(struct printk_log)) |
328 | break; | 323 | break; |
329 | 324 | ||
330 | /* drop old messages until we have enough continuous space */ | 325
@@ -332,18 +327,18 @@ static void log_store(int facility, int level, | |||
332 | log_first_seq++; | 327 | log_first_seq++; |
333 | } | 328 | } |
334 | 329 | ||
335 | if (log_next_idx + size + sizeof(struct log) >= log_buf_len) { | 330 | if (log_next_idx + size + sizeof(struct printk_log) >= log_buf_len) { |
336 | /* | 331 | /* |
337 | * This message + an additional empty header does not fit | 332 | * This message + an additional empty header does not fit |
338 | * at the end of the buffer. Add an empty header with len == 0 | 333 | * at the end of the buffer. Add an empty header with len == 0 |
339 | * to signify a wrap around. | 334 | * to signify a wrap around. |
340 | */ | 335 | */ |
341 | memset(log_buf + log_next_idx, 0, sizeof(struct log)); | 336 | memset(log_buf + log_next_idx, 0, sizeof(struct printk_log)); |
342 | log_next_idx = 0; | 337 | log_next_idx = 0; |
343 | } | 338 | } |
344 | 339 | ||
345 | /* fill message */ | 340 | /* fill message */ |
346 | msg = (struct log *)(log_buf + log_next_idx); | 341 | msg = (struct printk_log *)(log_buf + log_next_idx); |
347 | memcpy(log_text(msg), text, text_len); | 342 | memcpy(log_text(msg), text, text_len); |
348 | msg->text_len = text_len; | 343 | msg->text_len = text_len; |
349 | memcpy(log_dict(msg), dict, dict_len); | 344 | memcpy(log_dict(msg), dict, dict_len); |
@@ -356,7 +351,7 @@ static void log_store(int facility, int level, | |||
356 | else | 351 | else |
357 | msg->ts_nsec = local_clock(); | 352 | msg->ts_nsec = local_clock(); |
358 | memset(log_dict(msg) + dict_len, 0, pad_len); | 353 | memset(log_dict(msg) + dict_len, 0, pad_len); |
359 | msg->len = sizeof(struct log) + text_len + dict_len + pad_len; | 354 | msg->len = sizeof(struct printk_log) + text_len + dict_len + pad_len; |
360 | 355 | ||
361 | /* insert message */ | 356 | /* insert message */ |
362 | log_next_idx += msg->len; | 357 | log_next_idx += msg->len; |
@@ -479,7 +474,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, | |||
479 | size_t count, loff_t *ppos) | 474 | size_t count, loff_t *ppos) |
480 | { | 475 | { |
481 | struct devkmsg_user *user = file->private_data; | 476 | struct devkmsg_user *user = file->private_data; |
482 | struct log *msg; | 477 | struct printk_log *msg; |
483 | u64 ts_usec; | 478 | u64 ts_usec; |
484 | size_t i; | 479 | size_t i; |
485 | char cont = '-'; | 480 | char cont = '-'; |
@@ -724,14 +719,14 @@ void log_buf_kexec_setup(void) | |||
724 | VMCOREINFO_SYMBOL(log_first_idx); | 719 | VMCOREINFO_SYMBOL(log_first_idx); |
725 | VMCOREINFO_SYMBOL(log_next_idx); | 720 | VMCOREINFO_SYMBOL(log_next_idx); |
726 | /* | 721 | /* |
727 | * Export struct log size and field offsets. User space tools can | 722 | * Export struct printk_log size and field offsets. User space tools can |
728 | * parse it and detect any changes to structure down the line. | 723 | * parse it and detect any changes to structure down the line. |
729 | */ | 724 | */ |
730 | VMCOREINFO_STRUCT_SIZE(log); | 725 | VMCOREINFO_STRUCT_SIZE(printk_log); |
731 | VMCOREINFO_OFFSET(log, ts_nsec); | 726 | VMCOREINFO_OFFSET(printk_log, ts_nsec); |
732 | VMCOREINFO_OFFSET(log, len); | 727 | VMCOREINFO_OFFSET(printk_log, len); |
733 | VMCOREINFO_OFFSET(log, text_len); | 728 | VMCOREINFO_OFFSET(printk_log, text_len); |
734 | VMCOREINFO_OFFSET(log, dict_len); | 729 | VMCOREINFO_OFFSET(printk_log, dict_len); |
735 | } | 730 | } |
736 | #endif | 731 | #endif |
737 | 732 | ||
@@ -884,7 +879,7 @@ static size_t print_time(u64 ts, char *buf) | |||
884 | (unsigned long)ts, rem_nsec / 1000); | 879 | (unsigned long)ts, rem_nsec / 1000); |
885 | } | 880 | } |
886 | 881 | ||
887 | static size_t print_prefix(const struct log *msg, bool syslog, char *buf) | 882 | static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf) |
888 | { | 883 | { |
889 | size_t len = 0; | 884 | size_t len = 0; |
890 | unsigned int prefix = (msg->facility << 3) | msg->level; | 885 | unsigned int prefix = (msg->facility << 3) | msg->level; |
@@ -907,7 +902,7 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf) | |||
907 | return len; | 902 | return len; |
908 | } | 903 | } |
909 | 904 | ||
910 | static size_t msg_print_text(const struct log *msg, enum log_flags prev, | 905 | static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, |
911 | bool syslog, char *buf, size_t size) | 906 | bool syslog, char *buf, size_t size) |
912 | { | 907 | { |
913 | const char *text = log_text(msg); | 908 | const char *text = log_text(msg); |
@@ -969,7 +964,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev, | |||
969 | static int syslog_print(char __user *buf, int size) | 964 | static int syslog_print(char __user *buf, int size) |
970 | { | 965 | { |
971 | char *text; | 966 | char *text; |
972 | struct log *msg; | 967 | struct printk_log *msg; |
973 | int len = 0; | 968 | int len = 0; |
974 | 969 | ||
975 | text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); | 970 | text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); |
@@ -1060,7 +1055,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
1060 | idx = clear_idx; | 1055 | idx = clear_idx; |
1061 | prev = 0; | 1056 | prev = 0; |
1062 | while (seq < log_next_seq) { | 1057 | while (seq < log_next_seq) { |
1063 | struct log *msg = log_from_idx(idx); | 1058 | struct printk_log *msg = log_from_idx(idx); |
1064 | 1059 | ||
1065 | len += msg_print_text(msg, prev, true, NULL, 0); | 1060 | len += msg_print_text(msg, prev, true, NULL, 0); |
1066 | prev = msg->flags; | 1061 | prev = msg->flags; |
@@ -1073,7 +1068,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
1073 | idx = clear_idx; | 1068 | idx = clear_idx; |
1074 | prev = 0; | 1069 | prev = 0; |
1075 | while (len > size && seq < log_next_seq) { | 1070 | while (len > size && seq < log_next_seq) { |
1076 | struct log *msg = log_from_idx(idx); | 1071 | struct printk_log *msg = log_from_idx(idx); |
1077 | 1072 | ||
1078 | len -= msg_print_text(msg, prev, true, NULL, 0); | 1073 | len -= msg_print_text(msg, prev, true, NULL, 0); |
1079 | prev = msg->flags; | 1074 | prev = msg->flags; |
@@ -1087,7 +1082,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
1087 | len = 0; | 1082 | len = 0; |
1088 | prev = 0; | 1083 | prev = 0; |
1089 | while (len >= 0 && seq < next_seq) { | 1084 | while (len >= 0 && seq < next_seq) { |
1090 | struct log *msg = log_from_idx(idx); | 1085 | struct printk_log *msg = log_from_idx(idx); |
1091 | int textlen; | 1086 | int textlen; |
1092 | 1087 | ||
1093 | textlen = msg_print_text(msg, prev, true, text, | 1088 | textlen = msg_print_text(msg, prev, true, text, |
@@ -1233,7 +1228,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
1233 | 1228 | ||
1234 | error = 0; | 1229 | error = 0; |
1235 | while (seq < log_next_seq) { | 1230 | while (seq < log_next_seq) { |
1236 | struct log *msg = log_from_idx(idx); | 1231 | struct printk_log *msg = log_from_idx(idx); |
1237 | 1232 | ||
1238 | error += msg_print_text(msg, prev, true, NULL, 0); | 1233 | error += msg_print_text(msg, prev, true, NULL, 0); |
1239 | idx = log_next(idx); | 1234 | idx = log_next(idx); |
@@ -1719,10 +1714,10 @@ static struct cont { | |||
1719 | u8 level; | 1714 | u8 level; |
1720 | bool flushed:1; | 1715 | bool flushed:1; |
1721 | } cont; | 1716 | } cont; |
1722 | static struct log *log_from_idx(u32 idx) { return NULL; } | 1717 | static struct printk_log *log_from_idx(u32 idx) { return NULL; } |
1723 | static u32 log_next(u32 idx) { return 0; } | 1718 | static u32 log_next(u32 idx) { return 0; } |
1724 | static void call_console_drivers(int level, const char *text, size_t len) {} | 1719 | static void call_console_drivers(int level, const char *text, size_t len) {} |
1725 | static size_t msg_print_text(const struct log *msg, enum log_flags prev, | 1720 | static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, |
1726 | bool syslog, char *buf, size_t size) { return 0; } | 1721 | bool syslog, char *buf, size_t size) { return 0; } |
1727 | static size_t cont_print_text(char *text, size_t size) { return 0; } | 1722 | static size_t cont_print_text(char *text, size_t size) { return 0; } |
1728 | 1723 | ||
@@ -1761,23 +1756,23 @@ static int __add_preferred_console(char *name, int idx, char *options, | |||
1761 | * See if this tty is not yet registered, and | 1756 | * See if this tty is not yet registered, and |
1762 | * if we have a slot free. | 1757 | * if we have a slot free. |
1763 | */ | 1758 | */ |
1764 | for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) | 1759 | for (i = 0, c = console_cmdline; |
1765 | if (strcmp(console_cmdline[i].name, name) == 0 && | 1760 | i < MAX_CMDLINECONSOLES && c->name[0]; |
1766 | console_cmdline[i].index == idx) { | 1761 | i++, c++) { |
1767 | if (!brl_options) | 1762 | if (strcmp(c->name, name) == 0 && c->index == idx) { |
1768 | selected_console = i; | 1763 | if (!brl_options) |
1769 | return 0; | 1764 | selected_console = i; |
1765 | return 0; | ||
1770 | } | 1766 | } |
1767 | } | ||
1771 | if (i == MAX_CMDLINECONSOLES) | 1768 | if (i == MAX_CMDLINECONSOLES) |
1772 | return -E2BIG; | 1769 | return -E2BIG; |
1773 | if (!brl_options) | 1770 | if (!brl_options) |
1774 | selected_console = i; | 1771 | selected_console = i; |
1775 | c = &console_cmdline[i]; | ||
1776 | strlcpy(c->name, name, sizeof(c->name)); | 1772 | strlcpy(c->name, name, sizeof(c->name)); |
1777 | c->options = options; | 1773 | c->options = options; |
1778 | #ifdef CONFIG_A11Y_BRAILLE_CONSOLE | 1774 | braille_set_options(c, brl_options); |
1779 | c->brl_options = brl_options; | 1775 | |
1780 | #endif | ||
1781 | c->index = idx; | 1776 | c->index = idx; |
1782 | return 0; | 1777 | return 0; |
1783 | } | 1778 | } |
@@ -1790,20 +1785,8 @@ static int __init console_setup(char *str) | |||
1790 | char *s, *options, *brl_options = NULL; | 1785 | char *s, *options, *brl_options = NULL; |
1791 | int idx; | 1786 | int idx; |
1792 | 1787 | ||
1793 | #ifdef CONFIG_A11Y_BRAILLE_CONSOLE | 1788 | if (_braille_console_setup(&str, &brl_options)) |
1794 | if (!memcmp(str, "brl,", 4)) { | 1789 | return 1; |
1795 | brl_options = ""; | ||
1796 | str += 4; | ||
1797 | } else if (!memcmp(str, "brl=", 4)) { | ||
1798 | brl_options = str + 4; | ||
1799 | str = strchr(brl_options, ','); | ||
1800 | if (!str) { | ||
1801 | printk(KERN_ERR "need port name after brl=\n"); | ||
1802 | return 1; | ||
1803 | } | ||
1804 | *(str++) = 0; | ||
1805 | } | ||
1806 | #endif | ||
1807 | 1790 | ||
1808 | /* | 1791 | /* |
1809 | * Decode str into name, index, options. | 1792 | * Decode str into name, index, options. |
@@ -1858,15 +1841,15 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha | |||
1858 | struct console_cmdline *c; | 1841 | struct console_cmdline *c; |
1859 | int i; | 1842 | int i; |
1860 | 1843 | ||
1861 | for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) | 1844 | for (i = 0, c = console_cmdline; |
1862 | if (strcmp(console_cmdline[i].name, name) == 0 && | 1845 | i < MAX_CMDLINECONSOLES && c->name[0]; |
1863 | console_cmdline[i].index == idx) { | 1846 | i++, c++) |
1864 | c = &console_cmdline[i]; | 1847 | if (strcmp(c->name, name) == 0 && c->index == idx) { |
1865 | strlcpy(c->name, name_new, sizeof(c->name)); | 1848 | strlcpy(c->name, name_new, sizeof(c->name)); |
1866 | c->name[sizeof(c->name) - 1] = 0; | 1849 | c->name[sizeof(c->name) - 1] = 0; |
1867 | c->options = options; | 1850 | c->options = options; |
1868 | c->index = idx_new; | 1851 | c->index = idx_new; |
1869 | return i; | 1852 | return i; |
1870 | } | 1853 | } |
1871 | /* not found */ | 1854 | /* not found */ |
1872 | return -1; | 1855 | return -1; |
@@ -1921,7 +1904,7 @@ void resume_console(void) | |||
1921 | * called when a new CPU comes online (or fails to come up), and ensures | 1904 | * called when a new CPU comes online (or fails to come up), and ensures |
1922 | * that any such output gets printed. | 1905 | * that any such output gets printed. |
1923 | */ | 1906 | */ |
1924 | static int __cpuinit console_cpu_notify(struct notifier_block *self, | 1907 | static int console_cpu_notify(struct notifier_block *self, |
1925 | unsigned long action, void *hcpu) | 1908 | unsigned long action, void *hcpu) |
1926 | { | 1909 | { |
1927 | switch (action) { | 1910 | switch (action) { |
@@ -2046,7 +2029,7 @@ void console_unlock(void) | |||
2046 | console_cont_flush(text, sizeof(text)); | 2029 | console_cont_flush(text, sizeof(text)); |
2047 | again: | 2030 | again: |
2048 | for (;;) { | 2031 | for (;;) { |
2049 | struct log *msg; | 2032 | struct printk_log *msg; |
2050 | size_t len; | 2033 | size_t len; |
2051 | int level; | 2034 | int level; |
2052 | 2035 | ||
@@ -2241,6 +2224,14 @@ void register_console(struct console *newcon) | |||
2241 | int i; | 2224 | int i; |
2242 | unsigned long flags; | 2225 | unsigned long flags; |
2243 | struct console *bcon = NULL; | 2226 | struct console *bcon = NULL; |
2227 | struct console_cmdline *c; | ||
2228 | |||
2229 | if (console_drivers) | ||
2230 | for_each_console(bcon) | ||
2231 | if (WARN(bcon == newcon, | ||
2232 | "console '%s%d' already registered\n", | ||
2233 | bcon->name, bcon->index)) | ||
2234 | return; | ||
2244 | 2235 | ||
2245 | /* | 2236 | /* |
2246 | * before we register a new CON_BOOT console, make sure we don't | 2237 | * before we register a new CON_BOOT console, make sure we don't |
@@ -2288,30 +2279,25 @@ void register_console(struct console *newcon) | |||
2288 | * See if this console matches one we selected on | 2279 | * See if this console matches one we selected on |
2289 | * the command line. | 2280 | * the command line. |
2290 | */ | 2281 | */ |
2291 | for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; | 2282 | for (i = 0, c = console_cmdline; |
2292 | i++) { | 2283 | i < MAX_CMDLINECONSOLES && c->name[0]; |
2293 | if (strcmp(console_cmdline[i].name, newcon->name) != 0) | 2284 | i++, c++) { |
2285 | if (strcmp(c->name, newcon->name) != 0) | ||
2294 | continue; | 2286 | continue; |
2295 | if (newcon->index >= 0 && | 2287 | if (newcon->index >= 0 && |
2296 | newcon->index != console_cmdline[i].index) | 2288 | newcon->index != c->index) |
2297 | continue; | 2289 | continue; |
2298 | if (newcon->index < 0) | 2290 | if (newcon->index < 0) |
2299 | newcon->index = console_cmdline[i].index; | 2291 | newcon->index = c->index; |
2300 | #ifdef CONFIG_A11Y_BRAILLE_CONSOLE | 2292 | |
2301 | if (console_cmdline[i].brl_options) { | 2293 | if (_braille_register_console(newcon, c)) |
2302 | newcon->flags |= CON_BRL; | ||
2303 | braille_register_console(newcon, | ||
2304 | console_cmdline[i].index, | ||
2305 | console_cmdline[i].options, | ||
2306 | console_cmdline[i].brl_options); | ||
2307 | return; | 2294 | return; |
2308 | } | 2295 | |
2309 | #endif | ||
2310 | if (newcon->setup && | 2296 | if (newcon->setup && |
2311 | newcon->setup(newcon, console_cmdline[i].options) != 0) | 2297 | newcon->setup(newcon, console_cmdline[i].options) != 0) |
2312 | break; | 2298 | break; |
2313 | newcon->flags |= CON_ENABLED; | 2299 | newcon->flags |= CON_ENABLED; |
2314 | newcon->index = console_cmdline[i].index; | 2300 | newcon->index = c->index; |
2315 | if (i == selected_console) { | 2301 | if (i == selected_console) { |
2316 | newcon->flags |= CON_CONSDEV; | 2302 | newcon->flags |= CON_CONSDEV; |
2317 | preferred_console = selected_console; | 2303 | preferred_console = selected_console; |
@@ -2394,13 +2380,13 @@ EXPORT_SYMBOL(register_console); | |||
2394 | int unregister_console(struct console *console) | 2380 | int unregister_console(struct console *console) |
2395 | { | 2381 | { |
2396 | struct console *a, *b; | 2382 | struct console *a, *b; |
2397 | int res = 1; | 2383 | int res; |
2398 | 2384 | ||
2399 | #ifdef CONFIG_A11Y_BRAILLE_CONSOLE | 2385 | res = _braille_unregister_console(console); |
2400 | if (console->flags & CON_BRL) | 2386 | if (res) |
2401 | return braille_unregister_console(console); | 2387 | return res; |
2402 | #endif | ||
2403 | 2388 | ||
2389 | res = 1; | ||
2404 | console_lock(); | 2390 | console_lock(); |
2405 | if (console_drivers == console) { | 2391 | if (console_drivers == console) { |
2406 | console_drivers=console->next; | 2392 | console_drivers=console->next; |
@@ -2666,7 +2652,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
2666 | bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, | 2652 | bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, |
2667 | char *line, size_t size, size_t *len) | 2653 | char *line, size_t size, size_t *len) |
2668 | { | 2654 | { |
2669 | struct log *msg; | 2655 | struct printk_log *msg; |
2670 | size_t l = 0; | 2656 | size_t l = 0; |
2671 | bool ret = false; | 2657 | bool ret = false; |
2672 | 2658 | ||
@@ -2778,7 +2764,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, | |||
2778 | idx = dumper->cur_idx; | 2764 | idx = dumper->cur_idx; |
2779 | prev = 0; | 2765 | prev = 0; |
2780 | while (seq < dumper->next_seq) { | 2766 | while (seq < dumper->next_seq) { |
2781 | struct log *msg = log_from_idx(idx); | 2767 | struct printk_log *msg = log_from_idx(idx); |
2782 | 2768 | ||
2783 | l += msg_print_text(msg, prev, true, NULL, 0); | 2769 | l += msg_print_text(msg, prev, true, NULL, 0); |
2784 | idx = log_next(idx); | 2770 | idx = log_next(idx); |
@@ -2791,7 +2777,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, | |||
2791 | idx = dumper->cur_idx; | 2777 | idx = dumper->cur_idx; |
2792 | prev = 0; | 2778 | prev = 0; |
2793 | while (l > size && seq < dumper->next_seq) { | 2779 | while (l > size && seq < dumper->next_seq) { |
2794 | struct log *msg = log_from_idx(idx); | 2780 | struct printk_log *msg = log_from_idx(idx); |
2795 | 2781 | ||
2796 | l -= msg_print_text(msg, prev, true, NULL, 0); | 2782 | l -= msg_print_text(msg, prev, true, NULL, 0); |
2797 | idx = log_next(idx); | 2783 | idx = log_next(idx); |
@@ -2806,7 +2792,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, | |||
2806 | l = 0; | 2792 | l = 0; |
2807 | prev = 0; | 2793 | prev = 0; |
2808 | while (seq < dumper->next_seq) { | 2794 | while (seq < dumper->next_seq) { |
2809 | struct log *msg = log_from_idx(idx); | 2795 | struct printk_log *msg = log_from_idx(idx); |
2810 | 2796 | ||
2811 | l += msg_print_text(msg, prev, syslog, buf + l, size - l); | 2797 | l += msg_print_text(msg, prev, syslog, buf + l, size - l); |
2812 | idx = log_next(idx); | 2798 | idx = log_next(idx); |
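Note: the struct log to struct printk_log rename above is internal to the log ring buffer; consumers of the dumper API are unaffected. For orientation, a minimal client of that API might look like the sketch below (the dumper and callback names are illustrative, not from this patch).

#include <linux/kmsg_dump.h>

/* Sketch: copy the crash-time kernel log out one record at a time. */
static void my_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason)
{
	static char line[256];
	size_t len;

	while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
		;	/* persist 'len' bytes of 'line' somewhere non-volatile */
}

static struct kmsg_dumper my_dumper = {
	.dump = my_dump,
};
/* kmsg_dump_register(&my_dumper) at init, kmsg_dump_unregister(&my_dumper) at exit. */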
diff --git a/kernel/profile.c b/kernel/profile.c index 0bf400737660..6631e1ef55ab 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -331,7 +331,7 @@ out: | |||
331 | put_cpu(); | 331 | put_cpu(); |
332 | } | 332 | } |
333 | 333 | ||
334 | static int __cpuinit profile_cpu_callback(struct notifier_block *info, | 334 | static int profile_cpu_callback(struct notifier_block *info, |
335 | unsigned long action, void *__cpu) | 335 | unsigned long action, void *__cpu) |
336 | { | 336 | { |
337 | int node, cpu = (unsigned long)__cpu; | 337 | int node, cpu = (unsigned long)__cpu; |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4041f5747e73..a146ee327f6a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -469,7 +469,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) | |||
469 | /* Architecture-specific hardware disable .. */ | 469 | /* Architecture-specific hardware disable .. */ |
470 | ptrace_disable(child); | 470 | ptrace_disable(child); |
471 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | 471 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); |
472 | flush_ptrace_hw_breakpoint(child); | ||
473 | 472 | ||
474 | write_lock_irq(&tasklist_lock); | 473 | write_lock_irq(&tasklist_lock); |
475 | /* | 474 | /* |
diff --git a/kernel/rcu.h b/kernel/rcu.h index 7f8e7590e3e5..77131966c4ad 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h | |||
@@ -67,12 +67,15 @@ | |||
67 | 67 | ||
68 | extern struct debug_obj_descr rcuhead_debug_descr; | 68 | extern struct debug_obj_descr rcuhead_debug_descr; |
69 | 69 | ||
70 | static inline void debug_rcu_head_queue(struct rcu_head *head) | 70 | static inline int debug_rcu_head_queue(struct rcu_head *head) |
71 | { | 71 | { |
72 | debug_object_activate(head, &rcuhead_debug_descr); | 72 | int r1; |
73 | |||
74 | r1 = debug_object_activate(head, &rcuhead_debug_descr); | ||
73 | debug_object_active_state(head, &rcuhead_debug_descr, | 75 | debug_object_active_state(head, &rcuhead_debug_descr, |
74 | STATE_RCU_HEAD_READY, | 76 | STATE_RCU_HEAD_READY, |
75 | STATE_RCU_HEAD_QUEUED); | 77 | STATE_RCU_HEAD_QUEUED); |
78 | return r1; | ||
76 | } | 79 | } |
77 | 80 | ||
78 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) | 81 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) |
@@ -83,8 +86,9 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) | |||
83 | debug_object_deactivate(head, &rcuhead_debug_descr); | 86 | debug_object_deactivate(head, &rcuhead_debug_descr); |
84 | } | 87 | } |
85 | #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 88 | #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
86 | static inline void debug_rcu_head_queue(struct rcu_head *head) | 89 | static inline int debug_rcu_head_queue(struct rcu_head *head) |
87 | { | 90 | { |
91 | return 0; | ||
88 | } | 92 | } |
89 | 93 | ||
90 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) | 94 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) |
@@ -94,7 +98,7 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) | |||
94 | 98 | ||
95 | extern void kfree(const void *); | 99 | extern void kfree(const void *); |
96 | 100 | ||
97 | static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) | 101 | static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) |
98 | { | 102 | { |
99 | unsigned long offset = (unsigned long)head->func; | 103 | unsigned long offset = (unsigned long)head->func; |
100 | 104 | ||
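Note: debug_rcu_head_queue() now returns the result of debug_object_activate(), so a callback-queueing path can notice that an rcu_head is already queued. The caller is not part of this hunk; the following is only a sketch of how such a return value might be consumed.

static void call_rcu_sketch(struct rcu_head *head, void (*func)(struct rcu_head *))
{
	if (debug_rcu_head_queue(head)) {
		/* Probable double call_rcu(): drop the duplicate instead of
		 * corrupting the callback list; debug-objects has already
		 * reported the misuse. */
		WARN_ONCE(1, "duplicate call_rcu() for %p\n", head);
		return;
	}
	head->func = func;
	/* ... enqueue head on this CPU's callback list as usual ... */
}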
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index cce6ba8bbace..33eb4620aa17 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -212,43 +212,6 @@ static inline void debug_rcu_head_free(struct rcu_head *head) | |||
212 | } | 212 | } |
213 | 213 | ||
214 | /* | 214 | /* |
215 | * fixup_init is called when: | ||
216 | * - an active object is initialized | ||
217 | */ | ||
218 | static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) | ||
219 | { | ||
220 | struct rcu_head *head = addr; | ||
221 | |||
222 | switch (state) { | ||
223 | case ODEBUG_STATE_ACTIVE: | ||
224 | /* | ||
225 | * Ensure that queued callbacks are all executed. | ||
226 | * If we detect that we are nested in a RCU read-side critical | ||
227 | * section, we should simply fail, otherwise we would deadlock. | ||
228 | * In !PREEMPT configurations, there is no way to tell if we are | ||
229 | * in a RCU read-side critical section or not, so we never | ||
230 | * attempt any fixup and just print a warning. | ||
231 | */ | ||
232 | #ifndef CONFIG_PREEMPT | ||
233 | WARN_ON_ONCE(1); | ||
234 | return 0; | ||
235 | #endif | ||
236 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
237 | irqs_disabled()) { | ||
238 | WARN_ON_ONCE(1); | ||
239 | return 0; | ||
240 | } | ||
241 | rcu_barrier(); | ||
242 | rcu_barrier_sched(); | ||
243 | rcu_barrier_bh(); | ||
244 | debug_object_init(head, &rcuhead_debug_descr); | ||
245 | return 1; | ||
246 | default: | ||
247 | return 0; | ||
248 | } | ||
249 | } | ||
250 | |||
251 | /* | ||
252 | * fixup_activate is called when: | 215 | * fixup_activate is called when: |
253 | * - an active object is activated | 216 | * - an active object is activated |
254 | * - an unknown object is activated (might be a statically initialized object) | 217 | * - an unknown object is activated (might be a statically initialized object) |
@@ -268,69 +231,8 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) | |||
268 | debug_object_init(head, &rcuhead_debug_descr); | 231 | debug_object_init(head, &rcuhead_debug_descr); |
269 | debug_object_activate(head, &rcuhead_debug_descr); | 232 | debug_object_activate(head, &rcuhead_debug_descr); |
270 | return 0; | 233 | return 0; |
271 | |||
272 | case ODEBUG_STATE_ACTIVE: | ||
273 | /* | ||
274 | * Ensure that queued callbacks are all executed. | ||
275 | * If we detect that we are nested in a RCU read-side critical | ||
276 | * section, we should simply fail, otherwise we would deadlock. | ||
277 | * In !PREEMPT configurations, there is no way to tell if we are | ||
278 | * in a RCU read-side critical section or not, so we never | ||
279 | * attempt any fixup and just print a warning. | ||
280 | */ | ||
281 | #ifndef CONFIG_PREEMPT | ||
282 | WARN_ON_ONCE(1); | ||
283 | return 0; | ||
284 | #endif | ||
285 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
286 | irqs_disabled()) { | ||
287 | WARN_ON_ONCE(1); | ||
288 | return 0; | ||
289 | } | ||
290 | rcu_barrier(); | ||
291 | rcu_barrier_sched(); | ||
292 | rcu_barrier_bh(); | ||
293 | debug_object_activate(head, &rcuhead_debug_descr); | ||
294 | return 1; | ||
295 | default: | 234 | default: |
296 | return 0; | ||
297 | } | ||
298 | } | ||
299 | |||
300 | /* | ||
301 | * fixup_free is called when: | ||
302 | * - an active object is freed | ||
303 | */ | ||
304 | static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) | ||
305 | { | ||
306 | struct rcu_head *head = addr; | ||
307 | |||
308 | switch (state) { | ||
309 | case ODEBUG_STATE_ACTIVE: | ||
310 | /* | ||
311 | * Ensure that queued callbacks are all executed. | ||
312 | * If we detect that we are nested in a RCU read-side critical | ||
313 | * section, we should simply fail, otherwise we would deadlock. | ||
314 | * In !PREEMPT configurations, there is no way to tell if we are | ||
315 | * in a RCU read-side critical section or not, so we never | ||
316 | * attempt any fixup and just print a warning. | ||
317 | */ | ||
318 | #ifndef CONFIG_PREEMPT | ||
319 | WARN_ON_ONCE(1); | ||
320 | return 0; | ||
321 | #endif | ||
322 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | ||
323 | irqs_disabled()) { | ||
324 | WARN_ON_ONCE(1); | ||
325 | return 0; | ||
326 | } | ||
327 | rcu_barrier(); | ||
328 | rcu_barrier_sched(); | ||
329 | rcu_barrier_bh(); | ||
330 | debug_object_free(head, &rcuhead_debug_descr); | ||
331 | return 1; | 235 | return 1; |
332 | default: | ||
333 | return 0; | ||
334 | } | 236 | } |
335 | } | 237 | } |
336 | 238 | ||
@@ -369,15 +271,13 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); | |||
369 | 271 | ||
370 | struct debug_obj_descr rcuhead_debug_descr = { | 272 | struct debug_obj_descr rcuhead_debug_descr = { |
371 | .name = "rcu_head", | 273 | .name = "rcu_head", |
372 | .fixup_init = rcuhead_fixup_init, | ||
373 | .fixup_activate = rcuhead_fixup_activate, | 274 | .fixup_activate = rcuhead_fixup_activate, |
374 | .fixup_free = rcuhead_fixup_free, | ||
375 | }; | 275 | }; |
376 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); | 276 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); |
377 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 277 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
378 | 278 | ||
379 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) | 279 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) |
380 | void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp, | 280 | void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, |
381 | unsigned long secs, | 281 | unsigned long secs, |
382 | unsigned long c_old, unsigned long c) | 282 | unsigned long c_old, unsigned long c) |
383 | { | 283 | { |
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index aa344111de3e..9ed6075dc562 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -264,7 +264,7 @@ void rcu_check_callbacks(int cpu, int user) | |||
264 | */ | 264 | */ |
265 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | 265 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) |
266 | { | 266 | { |
267 | char *rn = NULL; | 267 | const char *rn = NULL; |
268 | struct rcu_head *next, *list; | 268 | struct rcu_head *next, *list; |
269 | unsigned long flags; | 269 | unsigned long flags; |
270 | RCU_TRACE(int cb_count = 0); | 270 | RCU_TRACE(int cb_count = 0); |
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 0cd385acccfa..280d06cae352 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -36,7 +36,7 @@ struct rcu_ctrlblk { | |||
36 | RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ | 36 | RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ |
37 | RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ | 37 | RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ |
38 | RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ | 38 | RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ |
39 | RCU_TRACE(char *name); /* Name of RCU type. */ | 39 | RCU_TRACE(const char *name); /* Name of RCU type. */ |
40 | }; | 40 | }; |
41 | 41 | ||
42 | /* Definition for rcupdate control block. */ | 42 | /* Definition for rcupdate control block. */ |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index b1fa5510388d..be63101c6175 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -52,72 +52,78 @@ | |||
52 | MODULE_LICENSE("GPL"); | 52 | MODULE_LICENSE("GPL"); |
53 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); | 53 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); |
54 | 54 | ||
55 | static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ | 55 | static int fqs_duration; |
56 | static int nfakewriters = 4; /* # fake writer threads */ | ||
57 | static int stat_interval = 60; /* Interval between stats, in seconds. */ | ||
58 | /* Zero means "only at end of test". */ | ||
59 | static bool verbose; /* Print more debug info. */ | ||
60 | static bool test_no_idle_hz = true; | ||
61 | /* Test RCU support for tickless idle CPUs. */ | ||
62 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ | ||
63 | static int stutter = 5; /* Start/stop testing interval (in sec) */ | ||
64 | static int irqreader = 1; /* RCU readers from irq (timers). */ | ||
65 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ | ||
66 | static int fqs_holdoff; /* Hold time within burst (us). */ | ||
67 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | ||
68 | static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ | ||
69 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ | ||
70 | static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ | ||
71 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ | ||
72 | static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */ | ||
73 | static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */ | ||
74 | static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ | ||
75 | static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ | ||
76 | static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ | ||
77 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ | ||
78 | |||
79 | module_param(nreaders, int, 0444); | ||
80 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | ||
81 | module_param(nfakewriters, int, 0444); | ||
82 | MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); | ||
83 | module_param(stat_interval, int, 0644); | ||
84 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | ||
85 | module_param(verbose, bool, 0444); | ||
86 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | ||
87 | module_param(test_no_idle_hz, bool, 0444); | ||
88 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | ||
89 | module_param(shuffle_interval, int, 0444); | ||
90 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | ||
91 | module_param(stutter, int, 0444); | ||
92 | MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); | ||
93 | module_param(irqreader, int, 0444); | ||
94 | MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); | ||
95 | module_param(fqs_duration, int, 0444); | 56 | module_param(fqs_duration, int, 0444); |
96 | MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); | 57 | MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); |
58 | static int fqs_holdoff; | ||
97 | module_param(fqs_holdoff, int, 0444); | 59 | module_param(fqs_holdoff, int, 0444); |
98 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | 60 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
61 | static int fqs_stutter = 3; | ||
99 | module_param(fqs_stutter, int, 0444); | 62 | module_param(fqs_stutter, int, 0444); |
100 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 63 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
64 | static bool gp_exp; | ||
65 | module_param(gp_exp, bool, 0444); | ||
66 | MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); | ||
67 | static bool gp_normal; | ||
68 | module_param(gp_normal, bool, 0444); | ||
69 | MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); | ||
70 | static int irqreader = 1; | ||
71 | module_param(irqreader, int, 0444); | ||
72 | MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); | ||
73 | static int n_barrier_cbs; | ||
101 | module_param(n_barrier_cbs, int, 0444); | 74 | module_param(n_barrier_cbs, int, 0444); |
102 | MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); | 75 | MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); |
103 | module_param(onoff_interval, int, 0444); | 76 | static int nfakewriters = 4; |
104 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | 77 | module_param(nfakewriters, int, 0444); |
78 | MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); | ||
79 | static int nreaders = -1; | ||
80 | module_param(nreaders, int, 0444); | ||
81 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | ||
82 | static int object_debug; | ||
83 | module_param(object_debug, int, 0444); | ||
84 | MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); | ||
85 | static int onoff_holdoff; | ||
105 | module_param(onoff_holdoff, int, 0444); | 86 | module_param(onoff_holdoff, int, 0444); |
106 | MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); | 87 | MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); |
88 | static int onoff_interval; | ||
89 | module_param(onoff_interval, int, 0444); | ||
90 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | ||
91 | static int shuffle_interval = 3; | ||
92 | module_param(shuffle_interval, int, 0444); | ||
93 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | ||
94 | static int shutdown_secs; | ||
107 | module_param(shutdown_secs, int, 0444); | 95 | module_param(shutdown_secs, int, 0444); |
108 | MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); | 96 | MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); |
97 | static int stall_cpu; | ||
109 | module_param(stall_cpu, int, 0444); | 98 | module_param(stall_cpu, int, 0444); |
110 | MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); | 99 | MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); |
100 | static int stall_cpu_holdoff = 10; | ||
111 | module_param(stall_cpu_holdoff, int, 0444); | 101 | module_param(stall_cpu_holdoff, int, 0444); |
112 | MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); | 102 | MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); |
103 | static int stat_interval = 60; | ||
104 | module_param(stat_interval, int, 0644); | ||
105 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | ||
106 | static int stutter = 5; | ||
107 | module_param(stutter, int, 0444); | ||
108 | MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); | ||
109 | static int test_boost = 1; | ||
113 | module_param(test_boost, int, 0444); | 110 | module_param(test_boost, int, 0444); |
114 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); | 111 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); |
115 | module_param(test_boost_interval, int, 0444); | 112 | static int test_boost_duration = 4; |
116 | MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); | ||
117 | module_param(test_boost_duration, int, 0444); | 113 | module_param(test_boost_duration, int, 0444); |
118 | MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); | 114 | MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); |
115 | static int test_boost_interval = 7; | ||
116 | module_param(test_boost_interval, int, 0444); | ||
117 | MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); | ||
118 | static bool test_no_idle_hz = true; | ||
119 | module_param(test_no_idle_hz, bool, 0444); | ||
120 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | ||
121 | static char *torture_type = "rcu"; | ||
119 | module_param(torture_type, charp, 0444); | 122 | module_param(torture_type, charp, 0444); |
120 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); | 123 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); |
124 | static bool verbose; | ||
125 | module_param(verbose, bool, 0444); | ||
126 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | ||
121 | 127 | ||
122 | #define TORTURE_FLAG "-torture:" | 128 | #define TORTURE_FLAG "-torture:" |
123 | #define PRINTK_STRING(s) \ | 129 | #define PRINTK_STRING(s) \ |
@@ -267,7 +273,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, | |||
267 | * Absorb kthreads into a kernel function that won't return, so that | 273 | * Absorb kthreads into a kernel function that won't return, so that |
268 | * they won't ever access module text or data again. | 274 | * they won't ever access module text or data again. |
269 | */ | 275 | */ |
270 | static void rcutorture_shutdown_absorb(char *title) | 276 | static void rcutorture_shutdown_absorb(const char *title) |
271 | { | 277 | { |
272 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { | 278 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { |
273 | pr_notice( | 279 | pr_notice( |
@@ -337,7 +343,7 @@ rcu_random(struct rcu_random_state *rrsp) | |||
337 | } | 343 | } |
338 | 344 | ||
339 | static void | 345 | static void |
340 | rcu_stutter_wait(char *title) | 346 | rcu_stutter_wait(const char *title) |
341 | { | 347 | { |
342 | while (stutter_pause_test || !rcutorture_runnable) { | 348 | while (stutter_pause_test || !rcutorture_runnable) { |
343 | if (rcutorture_runnable) | 349 | if (rcutorture_runnable) |
@@ -360,13 +366,14 @@ struct rcu_torture_ops { | |||
360 | int (*completed)(void); | 366 | int (*completed)(void); |
361 | void (*deferred_free)(struct rcu_torture *p); | 367 | void (*deferred_free)(struct rcu_torture *p); |
362 | void (*sync)(void); | 368 | void (*sync)(void); |
369 | void (*exp_sync)(void); | ||
363 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 370 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
364 | void (*cb_barrier)(void); | 371 | void (*cb_barrier)(void); |
365 | void (*fqs)(void); | 372 | void (*fqs)(void); |
366 | int (*stats)(char *page); | 373 | int (*stats)(char *page); |
367 | int irq_capable; | 374 | int irq_capable; |
368 | int can_boost; | 375 | int can_boost; |
369 | char *name; | 376 | const char *name; |
370 | }; | 377 | }; |
371 | 378 | ||
372 | static struct rcu_torture_ops *cur_ops; | 379 | static struct rcu_torture_ops *cur_ops; |
@@ -443,81 +450,27 @@ static void rcu_torture_deferred_free(struct rcu_torture *p) | |||
443 | call_rcu(&p->rtort_rcu, rcu_torture_cb); | 450 | call_rcu(&p->rtort_rcu, rcu_torture_cb); |
444 | } | 451 | } |
445 | 452 | ||
446 | static struct rcu_torture_ops rcu_ops = { | ||
447 | .init = NULL, | ||
448 | .readlock = rcu_torture_read_lock, | ||
449 | .read_delay = rcu_read_delay, | ||
450 | .readunlock = rcu_torture_read_unlock, | ||
451 | .completed = rcu_torture_completed, | ||
452 | .deferred_free = rcu_torture_deferred_free, | ||
453 | .sync = synchronize_rcu, | ||
454 | .call = call_rcu, | ||
455 | .cb_barrier = rcu_barrier, | ||
456 | .fqs = rcu_force_quiescent_state, | ||
457 | .stats = NULL, | ||
458 | .irq_capable = 1, | ||
459 | .can_boost = rcu_can_boost(), | ||
460 | .name = "rcu" | ||
461 | }; | ||
462 | |||
463 | static void rcu_sync_torture_deferred_free(struct rcu_torture *p) | ||
464 | { | ||
465 | int i; | ||
466 | struct rcu_torture *rp; | ||
467 | struct rcu_torture *rp1; | ||
468 | |||
469 | cur_ops->sync(); | ||
470 | list_add(&p->rtort_free, &rcu_torture_removed); | ||
471 | list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { | ||
472 | i = rp->rtort_pipe_count; | ||
473 | if (i > RCU_TORTURE_PIPE_LEN) | ||
474 | i = RCU_TORTURE_PIPE_LEN; | ||
475 | atomic_inc(&rcu_torture_wcount[i]); | ||
476 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
477 | rp->rtort_mbtest = 0; | ||
478 | list_del(&rp->rtort_free); | ||
479 | rcu_torture_free(rp); | ||
480 | } | ||
481 | } | ||
482 | } | ||
483 | |||
484 | static void rcu_sync_torture_init(void) | 453 | static void rcu_sync_torture_init(void) |
485 | { | 454 | { |
486 | INIT_LIST_HEAD(&rcu_torture_removed); | 455 | INIT_LIST_HEAD(&rcu_torture_removed); |
487 | } | 456 | } |
488 | 457 | ||
489 | static struct rcu_torture_ops rcu_sync_ops = { | 458 | static struct rcu_torture_ops rcu_ops = { |
490 | .init = rcu_sync_torture_init, | 459 | .init = rcu_sync_torture_init, |
491 | .readlock = rcu_torture_read_lock, | 460 | .readlock = rcu_torture_read_lock, |
492 | .read_delay = rcu_read_delay, | 461 | .read_delay = rcu_read_delay, |
493 | .readunlock = rcu_torture_read_unlock, | 462 | .readunlock = rcu_torture_read_unlock, |
494 | .completed = rcu_torture_completed, | 463 | .completed = rcu_torture_completed, |
495 | .deferred_free = rcu_sync_torture_deferred_free, | 464 | .deferred_free = rcu_torture_deferred_free, |
496 | .sync = synchronize_rcu, | 465 | .sync = synchronize_rcu, |
497 | .call = NULL, | 466 | .exp_sync = synchronize_rcu_expedited, |
498 | .cb_barrier = NULL, | 467 | .call = call_rcu, |
499 | .fqs = rcu_force_quiescent_state, | 468 | .cb_barrier = rcu_barrier, |
500 | .stats = NULL, | ||
501 | .irq_capable = 1, | ||
502 | .can_boost = rcu_can_boost(), | ||
503 | .name = "rcu_sync" | ||
504 | }; | ||
505 | |||
506 | static struct rcu_torture_ops rcu_expedited_ops = { | ||
507 | .init = rcu_sync_torture_init, | ||
508 | .readlock = rcu_torture_read_lock, | ||
509 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
510 | .readunlock = rcu_torture_read_unlock, | ||
511 | .completed = rcu_no_completed, | ||
512 | .deferred_free = rcu_sync_torture_deferred_free, | ||
513 | .sync = synchronize_rcu_expedited, | ||
514 | .call = NULL, | ||
515 | .cb_barrier = NULL, | ||
516 | .fqs = rcu_force_quiescent_state, | 469 | .fqs = rcu_force_quiescent_state, |
517 | .stats = NULL, | 470 | .stats = NULL, |
518 | .irq_capable = 1, | 471 | .irq_capable = 1, |
519 | .can_boost = rcu_can_boost(), | 472 | .can_boost = rcu_can_boost(), |
520 | .name = "rcu_expedited" | 473 | .name = "rcu" |
521 | }; | 474 | }; |
522 | 475 | ||
523 | /* | 476 | /* |
@@ -546,13 +499,14 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | |||
546 | } | 499 | } |
547 | 500 | ||
548 | static struct rcu_torture_ops rcu_bh_ops = { | 501 | static struct rcu_torture_ops rcu_bh_ops = { |
549 | .init = NULL, | 502 | .init = rcu_sync_torture_init, |
550 | .readlock = rcu_bh_torture_read_lock, | 503 | .readlock = rcu_bh_torture_read_lock, |
551 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | 504 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ |
552 | .readunlock = rcu_bh_torture_read_unlock, | 505 | .readunlock = rcu_bh_torture_read_unlock, |
553 | .completed = rcu_bh_torture_completed, | 506 | .completed = rcu_bh_torture_completed, |
554 | .deferred_free = rcu_bh_torture_deferred_free, | 507 | .deferred_free = rcu_bh_torture_deferred_free, |
555 | .sync = synchronize_rcu_bh, | 508 | .sync = synchronize_rcu_bh, |
509 | .exp_sync = synchronize_rcu_bh_expedited, | ||
556 | .call = call_rcu_bh, | 510 | .call = call_rcu_bh, |
557 | .cb_barrier = rcu_barrier_bh, | 511 | .cb_barrier = rcu_barrier_bh, |
558 | .fqs = rcu_bh_force_quiescent_state, | 512 | .fqs = rcu_bh_force_quiescent_state, |
@@ -561,38 +515,6 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
561 | .name = "rcu_bh" | 515 | .name = "rcu_bh" |
562 | }; | 516 | }; |
563 | 517 | ||
564 | static struct rcu_torture_ops rcu_bh_sync_ops = { | ||
565 | .init = rcu_sync_torture_init, | ||
566 | .readlock = rcu_bh_torture_read_lock, | ||
567 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
568 | .readunlock = rcu_bh_torture_read_unlock, | ||
569 | .completed = rcu_bh_torture_completed, | ||
570 | .deferred_free = rcu_sync_torture_deferred_free, | ||
571 | .sync = synchronize_rcu_bh, | ||
572 | .call = NULL, | ||
573 | .cb_barrier = NULL, | ||
574 | .fqs = rcu_bh_force_quiescent_state, | ||
575 | .stats = NULL, | ||
576 | .irq_capable = 1, | ||
577 | .name = "rcu_bh_sync" | ||
578 | }; | ||
579 | |||
580 | static struct rcu_torture_ops rcu_bh_expedited_ops = { | ||
581 | .init = rcu_sync_torture_init, | ||
582 | .readlock = rcu_bh_torture_read_lock, | ||
583 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
584 | .readunlock = rcu_bh_torture_read_unlock, | ||
585 | .completed = rcu_bh_torture_completed, | ||
586 | .deferred_free = rcu_sync_torture_deferred_free, | ||
587 | .sync = synchronize_rcu_bh_expedited, | ||
588 | .call = NULL, | ||
589 | .cb_barrier = NULL, | ||
590 | .fqs = rcu_bh_force_quiescent_state, | ||
591 | .stats = NULL, | ||
592 | .irq_capable = 1, | ||
593 | .name = "rcu_bh_expedited" | ||
594 | }; | ||
595 | |||
596 | /* | 518 | /* |
597 | * Definitions for srcu torture testing. | 519 | * Definitions for srcu torture testing. |
598 | */ | 520 | */ |
@@ -667,6 +589,11 @@ static int srcu_torture_stats(char *page) | |||
667 | return cnt; | 589 | return cnt; |
668 | } | 590 | } |
669 | 591 | ||
592 | static void srcu_torture_synchronize_expedited(void) | ||
593 | { | ||
594 | synchronize_srcu_expedited(&srcu_ctl); | ||
595 | } | ||
596 | |||
670 | static struct rcu_torture_ops srcu_ops = { | 597 | static struct rcu_torture_ops srcu_ops = { |
671 | .init = rcu_sync_torture_init, | 598 | .init = rcu_sync_torture_init, |
672 | .readlock = srcu_torture_read_lock, | 599 | .readlock = srcu_torture_read_lock, |
@@ -675,45 +602,13 @@ static struct rcu_torture_ops srcu_ops = { | |||
675 | .completed = srcu_torture_completed, | 602 | .completed = srcu_torture_completed, |
676 | .deferred_free = srcu_torture_deferred_free, | 603 | .deferred_free = srcu_torture_deferred_free, |
677 | .sync = srcu_torture_synchronize, | 604 | .sync = srcu_torture_synchronize, |
605 | .exp_sync = srcu_torture_synchronize_expedited, | ||
678 | .call = srcu_torture_call, | 606 | .call = srcu_torture_call, |
679 | .cb_barrier = srcu_torture_barrier, | 607 | .cb_barrier = srcu_torture_barrier, |
680 | .stats = srcu_torture_stats, | 608 | .stats = srcu_torture_stats, |
681 | .name = "srcu" | 609 | .name = "srcu" |
682 | }; | 610 | }; |
683 | 611 | ||
684 | static struct rcu_torture_ops srcu_sync_ops = { | ||
685 | .init = rcu_sync_torture_init, | ||
686 | .readlock = srcu_torture_read_lock, | ||
687 | .read_delay = srcu_read_delay, | ||
688 | .readunlock = srcu_torture_read_unlock, | ||
689 | .completed = srcu_torture_completed, | ||
690 | .deferred_free = rcu_sync_torture_deferred_free, | ||
691 | .sync = srcu_torture_synchronize, | ||
692 | .call = NULL, | ||
693 | .cb_barrier = NULL, | ||
694 | .stats = srcu_torture_stats, | ||
695 | .name = "srcu_sync" | ||
696 | }; | ||
697 | |||
698 | static void srcu_torture_synchronize_expedited(void) | ||
699 | { | ||
700 | synchronize_srcu_expedited(&srcu_ctl); | ||
701 | } | ||
702 | |||
703 | static struct rcu_torture_ops srcu_expedited_ops = { | ||
704 | .init = rcu_sync_torture_init, | ||
705 | .readlock = srcu_torture_read_lock, | ||
706 | .read_delay = srcu_read_delay, | ||
707 | .readunlock = srcu_torture_read_unlock, | ||
708 | .completed = srcu_torture_completed, | ||
709 | .deferred_free = rcu_sync_torture_deferred_free, | ||
710 | .sync = srcu_torture_synchronize_expedited, | ||
711 | .call = NULL, | ||
712 | .cb_barrier = NULL, | ||
713 | .stats = srcu_torture_stats, | ||
714 | .name = "srcu_expedited" | ||
715 | }; | ||
716 | |||
717 | /* | 612 | /* |
718 | * Definitions for sched torture testing. | 613 | * Definitions for sched torture testing. |
719 | */ | 614 | */ |
@@ -742,6 +637,8 @@ static struct rcu_torture_ops sched_ops = { | |||
742 | .completed = rcu_no_completed, | 637 | .completed = rcu_no_completed, |
743 | .deferred_free = rcu_sched_torture_deferred_free, | 638 | .deferred_free = rcu_sched_torture_deferred_free, |
744 | .sync = synchronize_sched, | 639 | .sync = synchronize_sched, |
640 | .exp_sync = synchronize_sched_expedited, | ||
641 | .call = call_rcu_sched, | ||
745 | .cb_barrier = rcu_barrier_sched, | 642 | .cb_barrier = rcu_barrier_sched, |
746 | .fqs = rcu_sched_force_quiescent_state, | 643 | .fqs = rcu_sched_force_quiescent_state, |
747 | .stats = NULL, | 644 | .stats = NULL, |
@@ -749,35 +646,6 @@ static struct rcu_torture_ops sched_ops = { | |||
749 | .name = "sched" | 646 | .name = "sched" |
750 | }; | 647 | }; |
751 | 648 | ||
752 | static struct rcu_torture_ops sched_sync_ops = { | ||
753 | .init = rcu_sync_torture_init, | ||
754 | .readlock = sched_torture_read_lock, | ||
755 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
756 | .readunlock = sched_torture_read_unlock, | ||
757 | .completed = rcu_no_completed, | ||
758 | .deferred_free = rcu_sync_torture_deferred_free, | ||
759 | .sync = synchronize_sched, | ||
760 | .cb_barrier = NULL, | ||
761 | .fqs = rcu_sched_force_quiescent_state, | ||
762 | .stats = NULL, | ||
763 | .name = "sched_sync" | ||
764 | }; | ||
765 | |||
766 | static struct rcu_torture_ops sched_expedited_ops = { | ||
767 | .init = rcu_sync_torture_init, | ||
768 | .readlock = sched_torture_read_lock, | ||
769 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
770 | .readunlock = sched_torture_read_unlock, | ||
771 | .completed = rcu_no_completed, | ||
772 | .deferred_free = rcu_sync_torture_deferred_free, | ||
773 | .sync = synchronize_sched_expedited, | ||
774 | .cb_barrier = NULL, | ||
775 | .fqs = rcu_sched_force_quiescent_state, | ||
776 | .stats = NULL, | ||
777 | .irq_capable = 1, | ||
778 | .name = "sched_expedited" | ||
779 | }; | ||
780 | |||
781 | /* | 649 | /* |
782 | * RCU torture priority-boost testing. Runs one real-time thread per | 650 | * RCU torture priority-boost testing. Runs one real-time thread per |
783 | * CPU for moderate bursts, repeatedly registering RCU callbacks and | 651 | * CPU for moderate bursts, repeatedly registering RCU callbacks and |
@@ -927,9 +795,10 @@ rcu_torture_fqs(void *arg) | |||
927 | static int | 795 | static int |
928 | rcu_torture_writer(void *arg) | 796 | rcu_torture_writer(void *arg) |
929 | { | 797 | { |
798 | bool exp; | ||
930 | int i; | 799 | int i; |
931 | long oldbatch = rcu_batches_completed(); | ||
932 | struct rcu_torture *rp; | 800 | struct rcu_torture *rp; |
801 | struct rcu_torture *rp1; | ||
933 | struct rcu_torture *old_rp; | 802 | struct rcu_torture *old_rp; |
934 | static DEFINE_RCU_RANDOM(rand); | 803 | static DEFINE_RCU_RANDOM(rand); |
935 | 804 | ||
@@ -954,10 +823,33 @@ rcu_torture_writer(void *arg) | |||
954 | i = RCU_TORTURE_PIPE_LEN; | 823 | i = RCU_TORTURE_PIPE_LEN; |
955 | atomic_inc(&rcu_torture_wcount[i]); | 824 | atomic_inc(&rcu_torture_wcount[i]); |
956 | old_rp->rtort_pipe_count++; | 825 | old_rp->rtort_pipe_count++; |
957 | cur_ops->deferred_free(old_rp); | 826 | if (gp_normal == gp_exp) |
827 | exp = !!(rcu_random(&rand) & 0x80); | ||
828 | else | ||
829 | exp = gp_exp; | ||
830 | if (!exp) { | ||
831 | cur_ops->deferred_free(old_rp); | ||
832 | } else { | ||
833 | cur_ops->exp_sync(); | ||
834 | list_add(&old_rp->rtort_free, | ||
835 | &rcu_torture_removed); | ||
836 | list_for_each_entry_safe(rp, rp1, | ||
837 | &rcu_torture_removed, | ||
838 | rtort_free) { | ||
839 | i = rp->rtort_pipe_count; | ||
840 | if (i > RCU_TORTURE_PIPE_LEN) | ||
841 | i = RCU_TORTURE_PIPE_LEN; | ||
842 | atomic_inc(&rcu_torture_wcount[i]); | ||
843 | if (++rp->rtort_pipe_count >= | ||
844 | RCU_TORTURE_PIPE_LEN) { | ||
845 | rp->rtort_mbtest = 0; | ||
846 | list_del(&rp->rtort_free); | ||
847 | rcu_torture_free(rp); | ||
848 | } | ||
849 | } | ||
850 | } | ||
958 | } | 851 | } |
959 | rcutorture_record_progress(++rcu_torture_current_version); | 852 | rcutorture_record_progress(++rcu_torture_current_version); |
960 | oldbatch = cur_ops->completed(); | ||
961 | rcu_stutter_wait("rcu_torture_writer"); | 853 | rcu_stutter_wait("rcu_torture_writer"); |
962 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 854 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
963 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); | 855 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); |
@@ -983,10 +875,18 @@ rcu_torture_fakewriter(void *arg) | |||
983 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); | 875 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); |
984 | udelay(rcu_random(&rand) & 0x3ff); | 876 | udelay(rcu_random(&rand) & 0x3ff); |
985 | if (cur_ops->cb_barrier != NULL && | 877 | if (cur_ops->cb_barrier != NULL && |
986 | rcu_random(&rand) % (nfakewriters * 8) == 0) | 878 | rcu_random(&rand) % (nfakewriters * 8) == 0) { |
987 | cur_ops->cb_barrier(); | 879 | cur_ops->cb_barrier(); |
988 | else | 880 | } else if (gp_normal == gp_exp) { |
881 | if (rcu_random(&rand) & 0x80) | ||
882 | cur_ops->sync(); | ||
883 | else | ||
884 | cur_ops->exp_sync(); | ||
885 | } else if (gp_normal) { | ||
989 | cur_ops->sync(); | 886 | cur_ops->sync(); |
887 | } else { | ||
888 | cur_ops->exp_sync(); | ||
889 | } | ||
990 | rcu_stutter_wait("rcu_torture_fakewriter"); | 890 | rcu_stutter_wait("rcu_torture_fakewriter"); |
991 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 891 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
992 | 892 | ||
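Note: taken together, the rcu_torture_writer() and rcu_torture_fakewriter() hunks above implement one selection rule for the new gp_normal/gp_exp parameters: if exactly one is set it wins, and if neither or both are set the choice is randomized per update. Condensed into a single hypothetical helper, with rnd standing in for rcu_random(&rand):

static bool torture_use_expedited(bool gp_normal, bool gp_exp, unsigned long rnd)
{
	if (gp_normal == gp_exp)	/* neither or both set: mix 50/50 */
		return rnd & 0x80;
	return gp_exp;			/* exactly one set: obey it */
}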
@@ -1364,7 +1264,7 @@ rcu_torture_stutter(void *arg) | |||
1364 | } | 1264 | } |
1365 | 1265 | ||
1366 | static inline void | 1266 | static inline void |
1367 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) | 1267 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) |
1368 | { | 1268 | { |
1369 | pr_alert("%s" TORTURE_FLAG | 1269 | pr_alert("%s" TORTURE_FLAG |
1370 | "--- %s: nreaders=%d nfakewriters=%d " | 1270 | "--- %s: nreaders=%d nfakewriters=%d " |
@@ -1476,7 +1376,7 @@ rcu_torture_shutdown(void *arg) | |||
1476 | * Execute random CPU-hotplug operations at the interval specified | 1376 | * Execute random CPU-hotplug operations at the interval specified |
1477 | * by the onoff_interval. | 1377 | * by the onoff_interval. |
1478 | */ | 1378 | */ |
1479 | static int __cpuinit | 1379 | static int |
1480 | rcu_torture_onoff(void *arg) | 1380 | rcu_torture_onoff(void *arg) |
1481 | { | 1381 | { |
1482 | int cpu; | 1382 | int cpu; |
@@ -1534,7 +1434,13 @@ rcu_torture_onoff(void *arg) | |||
1534 | torture_type, cpu); | 1434 | torture_type, cpu); |
1535 | starttime = jiffies; | 1435 | starttime = jiffies; |
1536 | n_online_attempts++; | 1436 | n_online_attempts++; |
1537 | if (cpu_up(cpu) == 0) { | 1437 | ret = cpu_up(cpu); |
1438 | if (ret) { | ||
1439 | if (verbose) | ||
1440 | pr_alert("%s" TORTURE_FLAG | ||
1441 | "rcu_torture_onoff task: online %d failed: errno %d\n", | ||
1442 | torture_type, cpu, ret); | ||
1443 | } else { | ||
1538 | if (verbose) | 1444 | if (verbose) |
1539 | pr_alert("%s" TORTURE_FLAG | 1445 | pr_alert("%s" TORTURE_FLAG |
1540 | "rcu_torture_onoff task: onlined %d\n", | 1446 | "rcu_torture_onoff task: onlined %d\n", |
@@ -1558,7 +1464,7 @@ rcu_torture_onoff(void *arg) | |||
1558 | return 0; | 1464 | return 0; |
1559 | } | 1465 | } |
1560 | 1466 | ||
1561 | static int __cpuinit | 1467 | static int |
1562 | rcu_torture_onoff_init(void) | 1468 | rcu_torture_onoff_init(void) |
1563 | { | 1469 | { |
1564 | int ret; | 1470 | int ret; |
@@ -1601,7 +1507,7 @@ static void rcu_torture_onoff_cleanup(void) | |||
1601 | * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then | 1507 | * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then |
1602 | * induces a CPU stall for the time specified by stall_cpu. | 1508 | * induces a CPU stall for the time specified by stall_cpu. |
1603 | */ | 1509 | */ |
1604 | static int __cpuinit rcu_torture_stall(void *args) | 1510 | static int rcu_torture_stall(void *args) |
1605 | { | 1511 | { |
1606 | unsigned long stop_at; | 1512 | unsigned long stop_at; |
1607 | 1513 | ||
@@ -1934,6 +1840,62 @@ rcu_torture_cleanup(void) | |||
1934 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); | 1840 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); |
1935 | } | 1841 | } |
1936 | 1842 | ||
1843 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | ||
1844 | static void rcu_torture_leak_cb(struct rcu_head *rhp) | ||
1845 | { | ||
1846 | } | ||
1847 | |||
1848 | static void rcu_torture_err_cb(struct rcu_head *rhp) | ||
1849 | { | ||
1850 | /* | ||
1851 | * This -might- happen due to race conditions, but is unlikely. | ||
1852 | * The scenario that leads to this happening is that the | ||
1853 | * first of the pair of duplicate callbacks is queued, | ||
1854 | * someone else starts a grace period that includes that | ||
1855 | * callback, then the second of the pair must wait for the | ||
1856 | * next grace period. Unlikely, but can happen. If it | ||
1857 | * does happen, the debug-objects subsystem won't have splatted. | ||
1858 | */ | ||
1859 | pr_alert("rcutorture: duplicated callback was invoked.\n"); | ||
1860 | } | ||
1861 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
1862 | |||
1863 | /* | ||
1864 | * Verify that double-free causes debug-objects to complain, but only | ||
1865 | * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test | ||
1866 | * cannot be carried out. | ||
1867 | */ | ||
1868 | static void rcu_test_debug_objects(void) | ||
1869 | { | ||
1870 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | ||
1871 | struct rcu_head rh1; | ||
1872 | struct rcu_head rh2; | ||
1873 | |||
1874 | init_rcu_head_on_stack(&rh1); | ||
1875 | init_rcu_head_on_stack(&rh2); | ||
1876 | pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); | ||
1877 | |||
1878 | /* Try to queue the rh2 pair of callbacks for the same grace period. */ | ||
1879 | preempt_disable(); /* Prevent preemption from interrupting test. */ | ||
1880 | rcu_read_lock(); /* Make it impossible to finish a grace period. */ | ||
1881 | call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ | ||
1882 | local_irq_disable(); /* Make it harder to start a new grace period. */ | ||
1883 | call_rcu(&rh2, rcu_torture_leak_cb); | ||
1884 | call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ | ||
1885 | local_irq_enable(); | ||
1886 | rcu_read_unlock(); | ||
1887 | preempt_enable(); | ||
1888 | |||
1889 | /* Wait for them all to get done so we can safely return. */ | ||
1890 | rcu_barrier(); | ||
1891 | pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); | ||
1892 | destroy_rcu_head_on_stack(&rh1); | ||
1893 | destroy_rcu_head_on_stack(&rh2); | ||
1894 | #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
1895 | pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); | ||
1896 | #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
1897 | } | ||
1898 | |||
1937 | static int __init | 1899 | static int __init |
1938 | rcu_torture_init(void) | 1900 | rcu_torture_init(void) |
1939 | { | 1901 | { |
@@ -1941,11 +1903,9 @@ rcu_torture_init(void) | |||
1941 | int cpu; | 1903 | int cpu; |
1942 | int firsterr = 0; | 1904 | int firsterr = 0; |
1943 | int retval; | 1905 | int retval; |
1944 | static struct rcu_torture_ops *torture_ops[] = | 1906 | static struct rcu_torture_ops *torture_ops[] = { |
1945 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, | 1907 | &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, |
1946 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, | 1908 | }; |
1947 | &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, | ||
1948 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; | ||
1949 | 1909 | ||
1950 | mutex_lock(&fullstop_mutex); | 1910 | mutex_lock(&fullstop_mutex); |
1951 | 1911 | ||
@@ -2163,6 +2123,8 @@ rcu_torture_init(void) | |||
2163 | firsterr = retval; | 2123 | firsterr = retval; |
2164 | goto unwind; | 2124 | goto unwind; |
2165 | } | 2125 | } |
2126 | if (object_debug) | ||
2127 | rcu_test_debug_objects(); | ||
2166 | rcutorture_record_test_transition(); | 2128 | rcutorture_record_test_transition(); |
2167 | mutex_unlock(&fullstop_mutex); | 2129 | mutex_unlock(&fullstop_mutex); |
2168 | return 0; | 2130 | return 0; |
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e08abb9461ac..32618b3fe4e6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -53,18 +53,38 @@ | |||
53 | #include <linux/delay.h> | 53 | #include <linux/delay.h> |
54 | #include <linux/stop_machine.h> | 54 | #include <linux/stop_machine.h> |
55 | #include <linux/random.h> | 55 | #include <linux/random.h> |
56 | #include <linux/ftrace_event.h> | ||
57 | #include <linux/suspend.h> | ||
56 | 58 | ||
57 | #include "rcutree.h" | 59 | #include "rcutree.h" |
58 | #include <trace/events/rcu.h> | 60 | #include <trace/events/rcu.h> |
59 | 61 | ||
60 | #include "rcu.h" | 62 | #include "rcu.h" |
61 | 63 | ||
64 | /* | ||
65 | * Strings used in tracepoints need to be exported via the | ||
66 | * tracing system such that tools like perf and trace-cmd can | ||
67 | * translate the string address pointers to actual text. | ||
68 | */ | ||
69 | #define TPS(x) tracepoint_string(x) | ||
70 | |||
62 | /* Data structures. */ | 71 | /* Data structures. */ |
63 | 72 | ||
64 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; | 73 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; |
65 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | 74 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; |
66 | 75 | ||
67 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \ | 76 | /* |
77 | * In order to export the rcu_state name to the tracing tools, it | ||
78 | * needs to be added in the __tracepoint_string section. | ||
79 | * This requires defining a separate variable tp_<sname>_varname | ||
80 | * that points to the string being used, and this will allow | ||
81 | * the tracing userspace tools to be able to decipher the string | ||
82 | * address to the matching string. | ||
83 | */ | ||
84 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ | ||
85 | static char sname##_varname[] = #sname; \ | ||
86 | static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \ | ||
87 | struct rcu_state sname##_state = { \ | ||
68 | .level = { &sname##_state.node[0] }, \ | 88 | .level = { &sname##_state.node[0] }, \ |
69 | .call = cr, \ | 89 | .call = cr, \ |
70 | .fqs_state = RCU_GP_IDLE, \ | 90 | .fqs_state = RCU_GP_IDLE, \ |
@@ -75,16 +95,13 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | |||
75 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 95 | .orphan_donetail = &sname##_state.orphan_donelist, \ |
76 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 96 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
77 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ | 97 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ |
78 | .name = #sname, \ | 98 | .name = sname##_varname, \ |
79 | .abbr = sabbr, \ | 99 | .abbr = sabbr, \ |
80 | } | 100 | }; \ |
81 | 101 | DEFINE_PER_CPU(struct rcu_data, sname##_data) | |
82 | struct rcu_state rcu_sched_state = | ||
83 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); | ||
84 | DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); | ||
85 | 102 | ||
86 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); | 103 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); |
87 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); | 104 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); |
88 | 105 | ||
89 | static struct rcu_state *rcu_state; | 106 | static struct rcu_state *rcu_state; |
90 | LIST_HEAD(rcu_struct_flavors); | 107 | LIST_HEAD(rcu_struct_flavors); |
@@ -178,7 +195,7 @@ void rcu_sched_qs(int cpu) | |||
178 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); | 195 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); |
179 | 196 | ||
180 | if (rdp->passed_quiesce == 0) | 197 | if (rdp->passed_quiesce == 0) |
181 | trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); | 198 | trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); |
182 | rdp->passed_quiesce = 1; | 199 | rdp->passed_quiesce = 1; |
183 | } | 200 | } |
184 | 201 | ||
@@ -187,7 +204,7 @@ void rcu_bh_qs(int cpu) | |||
187 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); | 204 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); |
188 | 205 | ||
189 | if (rdp->passed_quiesce == 0) | 206 | if (rdp->passed_quiesce == 0) |
190 | trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); | 207 | trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); |
191 | rdp->passed_quiesce = 1; | 208 | rdp->passed_quiesce = 1; |
192 | } | 209 | } |
193 | 210 | ||
@@ -198,16 +215,20 @@ void rcu_bh_qs(int cpu) | |||
198 | */ | 215 | */ |
199 | void rcu_note_context_switch(int cpu) | 216 | void rcu_note_context_switch(int cpu) |
200 | { | 217 | { |
201 | trace_rcu_utilization("Start context switch"); | 218 | trace_rcu_utilization(TPS("Start context switch")); |
202 | rcu_sched_qs(cpu); | 219 | rcu_sched_qs(cpu); |
203 | rcu_preempt_note_context_switch(cpu); | 220 | rcu_preempt_note_context_switch(cpu); |
204 | trace_rcu_utilization("End context switch"); | 221 | trace_rcu_utilization(TPS("End context switch")); |
205 | } | 222 | } |
206 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 223 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
207 | 224 | ||
208 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 225 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
209 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | 226 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, |
210 | .dynticks = ATOMIC_INIT(1), | 227 | .dynticks = ATOMIC_INIT(1), |
228 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
229 | .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, | ||
230 | .dynticks_idle = ATOMIC_INIT(1), | ||
231 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
211 | }; | 232 | }; |
212 | 233 | ||
213 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ | 234 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ |
@@ -226,7 +247,10 @@ module_param(jiffies_till_next_fqs, ulong, 0644); | |||
226 | 247 | ||
227 | static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | 248 | static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, |
228 | struct rcu_data *rdp); | 249 | struct rcu_data *rdp); |
229 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); | 250 | static void force_qs_rnp(struct rcu_state *rsp, |
251 | int (*f)(struct rcu_data *rsp, bool *isidle, | ||
252 | unsigned long *maxj), | ||
253 | bool *isidle, unsigned long *maxj); | ||
230 | static void force_quiescent_state(struct rcu_state *rsp); | 254 | static void force_quiescent_state(struct rcu_state *rsp); |
231 | static int rcu_pending(int cpu); | 255 | static int rcu_pending(int cpu); |
232 | 256 | ||
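Note: force_qs_rnp() and its two callbacks now carry an isidle flag and a maxj timestamp so the forcing scan can double as a full-system-idle probe. The caller is not part of this hunk; a simplified sketch of how the new signature is used:

	bool isidle = false;
	unsigned long maxj = jiffies;

	/* First pass: snapshot each CPU's dynticks counter, noting idle CPUs. */
	force_qs_rnp(rsp, dyntick_save_progress_counter, &isidle, &maxj);
	/* Later passes: check which CPUs passed through idle since the snapshot. */
	force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);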
@@ -345,11 +369,11 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) | |||
345 | static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, | 369 | static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, |
346 | bool user) | 370 | bool user) |
347 | { | 371 | { |
348 | trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); | 372 | trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); |
349 | if (!user && !is_idle_task(current)) { | 373 | if (!user && !is_idle_task(current)) { |
350 | struct task_struct *idle = idle_task(smp_processor_id()); | 374 | struct task_struct *idle = idle_task(smp_processor_id()); |
351 | 375 | ||
352 | trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); | 376 | trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); |
353 | ftrace_dump(DUMP_ORIG); | 377 | ftrace_dump(DUMP_ORIG); |
354 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 378 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
355 | current->pid, current->comm, | 379 | current->pid, current->comm, |
@@ -411,6 +435,7 @@ void rcu_idle_enter(void) | |||
411 | 435 | ||
412 | local_irq_save(flags); | 436 | local_irq_save(flags); |
413 | rcu_eqs_enter(false); | 437 | rcu_eqs_enter(false); |
438 | rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); | ||
414 | local_irq_restore(flags); | 439 | local_irq_restore(flags); |
415 | } | 440 | } |
416 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | 441 | EXPORT_SYMBOL_GPL(rcu_idle_enter); |
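Note: rcu_idle_enter(), rcu_irq_exit(), and their exit-side counterparts now call rcu_sysidle_enter()/rcu_sysidle_exit(), and the FQS scan calls rcu_sysidle_check_cpu(). Those helpers are defined elsewhere in the series; the signatures below are inferred from the call sites here, and the assumption is that they reduce to empty stubs when CONFIG_NO_HZ_FULL_SYSIDLE is not set.

#ifndef CONFIG_NO_HZ_FULL_SYSIDLE
static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) { }
static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) { }
static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
				  unsigned long *maxj) { }
#endif /* #ifndef CONFIG_NO_HZ_FULL_SYSIDLE */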
@@ -428,27 +453,6 @@ void rcu_user_enter(void) | |||
428 | { | 453 | { |
429 | rcu_eqs_enter(1); | 454 | rcu_eqs_enter(1); |
430 | } | 455 | } |
431 | |||
432 | /** | ||
433 | * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace | ||
434 | * after the current irq returns. | ||
435 | * | ||
436 | * This is similar to rcu_user_enter() but in the context of a non-nesting | ||
437 | * irq. After this call, RCU enters into idle mode when the interrupt | ||
438 | * returns. | ||
439 | */ | ||
440 | void rcu_user_enter_after_irq(void) | ||
441 | { | ||
442 | unsigned long flags; | ||
443 | struct rcu_dynticks *rdtp; | ||
444 | |||
445 | local_irq_save(flags); | ||
446 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
447 | /* Ensure this irq is interrupting a non-idle RCU state. */ | ||
448 | WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK)); | ||
449 | rdtp->dynticks_nesting = 1; | ||
450 | local_irq_restore(flags); | ||
451 | } | ||
452 | #endif /* CONFIG_RCU_USER_QS */ | 456 | #endif /* CONFIG_RCU_USER_QS */ |
453 | 457 | ||
454 | /** | 458 | /** |
@@ -479,9 +483,10 @@ void rcu_irq_exit(void) | |||
479 | rdtp->dynticks_nesting--; | 483 | rdtp->dynticks_nesting--; |
480 | WARN_ON_ONCE(rdtp->dynticks_nesting < 0); | 484 | WARN_ON_ONCE(rdtp->dynticks_nesting < 0); |
481 | if (rdtp->dynticks_nesting) | 485 | if (rdtp->dynticks_nesting) |
482 | trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); | 486 | trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); |
483 | else | 487 | else |
484 | rcu_eqs_enter_common(rdtp, oldval, true); | 488 | rcu_eqs_enter_common(rdtp, oldval, true); |
489 | rcu_sysidle_enter(rdtp, 1); | ||
485 | local_irq_restore(flags); | 490 | local_irq_restore(flags); |
486 | } | 491 | } |
487 | 492 | ||
@@ -501,11 +506,11 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, | |||
501 | smp_mb__after_atomic_inc(); /* See above. */ | 506 | smp_mb__after_atomic_inc(); /* See above. */ |
502 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 507 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); |
503 | rcu_cleanup_after_idle(smp_processor_id()); | 508 | rcu_cleanup_after_idle(smp_processor_id()); |
504 | trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); | 509 | trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); |
505 | if (!user && !is_idle_task(current)) { | 510 | if (!user && !is_idle_task(current)) { |
506 | struct task_struct *idle = idle_task(smp_processor_id()); | 511 | struct task_struct *idle = idle_task(smp_processor_id()); |
507 | 512 | ||
508 | trace_rcu_dyntick("Error on exit: not idle task", | 513 | trace_rcu_dyntick(TPS("Error on exit: not idle task"), |
509 | oldval, rdtp->dynticks_nesting); | 514 | oldval, rdtp->dynticks_nesting); |
510 | ftrace_dump(DUMP_ORIG); | 515 | ftrace_dump(DUMP_ORIG); |
511 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 516 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
@@ -550,6 +555,7 @@ void rcu_idle_exit(void) | |||
550 | 555 | ||
551 | local_irq_save(flags); | 556 | local_irq_save(flags); |
552 | rcu_eqs_exit(false); | 557 | rcu_eqs_exit(false); |
558 | rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); | ||
553 | local_irq_restore(flags); | 559 | local_irq_restore(flags); |
554 | } | 560 | } |
555 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | 561 | EXPORT_SYMBOL_GPL(rcu_idle_exit); |
@@ -565,28 +571,6 @@ void rcu_user_exit(void) | |||
565 | { | 571 | { |
566 | rcu_eqs_exit(1); | 572 | rcu_eqs_exit(1); |
567 | } | 573 | } |
568 | |||
569 | /** | ||
570 | * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace | ||
571 | * idle mode after the current non-nesting irq returns. | ||
572 | * | ||
573 | * This is similar to rcu_user_exit() but in the context of an irq. | ||
574 | * This is called when the irq has interrupted a userspace RCU idle mode | ||
575 | * context. When the current non-nesting interrupt returns after this call, | ||
576 | * the CPU won't restore the RCU idle mode. | ||
577 | */ | ||
578 | void rcu_user_exit_after_irq(void) | ||
579 | { | ||
580 | unsigned long flags; | ||
581 | struct rcu_dynticks *rdtp; | ||
582 | |||
583 | local_irq_save(flags); | ||
584 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
585 | /* Ensure we are interrupting an RCU idle mode. */ | ||
586 | WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK); | ||
587 | rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE; | ||
588 | local_irq_restore(flags); | ||
589 | } | ||
590 | #endif /* CONFIG_RCU_USER_QS */ | 574 | #endif /* CONFIG_RCU_USER_QS */ |
591 | 575 | ||
592 | /** | 576 | /** |
@@ -620,9 +604,10 @@ void rcu_irq_enter(void) | |||
620 | rdtp->dynticks_nesting++; | 604 | rdtp->dynticks_nesting++; |
621 | WARN_ON_ONCE(rdtp->dynticks_nesting == 0); | 605 | WARN_ON_ONCE(rdtp->dynticks_nesting == 0); |
622 | if (oldval) | 606 | if (oldval) |
623 | trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); | 607 | trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); |
624 | else | 608 | else |
625 | rcu_eqs_exit_common(rdtp, oldval, true); | 609 | rcu_eqs_exit_common(rdtp, oldval, true); |
610 | rcu_sysidle_exit(rdtp, 1); | ||
626 | local_irq_restore(flags); | 611 | local_irq_restore(flags); |
627 | } | 612 | } |
628 | 613 | ||
@@ -746,9 +731,11 @@ static int rcu_is_cpu_rrupt_from_idle(void) | |||
746 | * credit them with an implicit quiescent state. Return 1 if this CPU | 731 | * credit them with an implicit quiescent state. Return 1 if this CPU |
747 | * is in dynticks idle mode, which is an extended quiescent state. | 732 | * is in dynticks idle mode, which is an extended quiescent state. |
748 | */ | 733 | */ |
749 | static int dyntick_save_progress_counter(struct rcu_data *rdp) | 734 | static int dyntick_save_progress_counter(struct rcu_data *rdp, |
735 | bool *isidle, unsigned long *maxj) | ||
750 | { | 736 | { |
751 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); | 737 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); |
738 | rcu_sysidle_check_cpu(rdp, isidle, maxj); | ||
752 | return (rdp->dynticks_snap & 0x1) == 0; | 739 | return (rdp->dynticks_snap & 0x1) == 0; |
753 | } | 740 | } |
754 | 741 | ||
@@ -758,7 +745,8 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) | |||
758 | * idle state since the last call to dyntick_save_progress_counter() | 745 | * idle state since the last call to dyntick_save_progress_counter() |
759 | * for this same CPU, or by virtue of having been offline. | 746 | * for this same CPU, or by virtue of having been offline. |
760 | */ | 747 | */ |
761 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | 748 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, |
749 | bool *isidle, unsigned long *maxj) | ||
762 | { | 750 | { |
763 | unsigned int curr; | 751 | unsigned int curr; |
764 | unsigned int snap; | 752 | unsigned int snap; |
@@ -775,7 +763,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
775 | * of the current RCU grace period. | 763 | * of the current RCU grace period. |
776 | */ | 764 | */ |
777 | if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { | 765 | if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { |
778 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); | 766 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); |
779 | rdp->dynticks_fqs++; | 767 | rdp->dynticks_fqs++; |
780 | return 1; | 768 | return 1; |
781 | } | 769 | } |
@@ -795,7 +783,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
795 | return 0; /* Grace period is not old enough. */ | 783 | return 0; /* Grace period is not old enough. */ |
796 | barrier(); | 784 | barrier(); |
797 | if (cpu_is_offline(rdp->cpu)) { | 785 | if (cpu_is_offline(rdp->cpu)) { |
798 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); | 786 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); |
799 | rdp->offline_fqs++; | 787 | rdp->offline_fqs++; |
800 | return 1; | 788 | return 1; |
801 | } | 789 | } |
@@ -1032,7 +1020,7 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, | |||
1032 | * rcu_nocb_wait_gp(). | 1020 | * rcu_nocb_wait_gp(). |
1033 | */ | 1021 | */ |
1034 | static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | 1022 | static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, |
1035 | unsigned long c, char *s) | 1023 | unsigned long c, const char *s) |
1036 | { | 1024 | { |
1037 | trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, | 1025 | trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, |
1038 | rnp->completed, c, rnp->level, | 1026 | rnp->completed, c, rnp->level, |
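
The hunk above also loosens trace_rcu_future_gp()'s last parameter from char *s to const char *s, which the switch to TPS()-wrapped trace strings requires (TPS() is presumably the tracepoint_string() wrapper used elsewhere in RCU tracing, and it hands back const char * pointers). A minimal userspace sketch, assuming illustrative names, of why the prototypes are constified:

#include <stdio.h>

/*
 * Userspace sketch, not kernel code: the trace helper now receives pointers
 * that are themselves const char *, so its prototype must be const as well
 * or the compiler warns about discarding the qualifier.
 */
static void trace_future_gp(unsigned long gpnum, unsigned long c, const char *s)
{
	printf("gpnum=%lu c=%lu reason=%s\n", gpnum, c, s);
}

int main(void)
{
	const char *reason = "Startleaf";	/* roughly what TPS("Startleaf") yields */

	trace_future_gp(41, 42, reason);	/* const char * -> const char *: clean */
	return 0;
}
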
@@ -1058,9 +1046,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1058 | * grace period is already marked as needed, return to the caller. | 1046 | * grace period is already marked as needed, return to the caller. |
1059 | */ | 1047 | */ |
1060 | c = rcu_cbs_completed(rdp->rsp, rnp); | 1048 | c = rcu_cbs_completed(rdp->rsp, rnp); |
1061 | trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); | 1049 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); |
1062 | if (rnp->need_future_gp[c & 0x1]) { | 1050 | if (rnp->need_future_gp[c & 0x1]) { |
1063 | trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); | 1051 | trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); |
1064 | return c; | 1052 | return c; |
1065 | } | 1053 | } |
1066 | 1054 | ||
@@ -1074,7 +1062,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1074 | if (rnp->gpnum != rnp->completed || | 1062 | if (rnp->gpnum != rnp->completed || |
1075 | ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { | 1063 | ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { |
1076 | rnp->need_future_gp[c & 0x1]++; | 1064 | rnp->need_future_gp[c & 0x1]++; |
1077 | trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); | 1065 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); |
1078 | return c; | 1066 | return c; |
1079 | } | 1067 | } |
1080 | 1068 | ||
@@ -1102,7 +1090,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1102 | * recorded, trace and leave. | 1090 | * recorded, trace and leave. |
1103 | */ | 1091 | */ |
1104 | if (rnp_root->need_future_gp[c & 0x1]) { | 1092 | if (rnp_root->need_future_gp[c & 0x1]) { |
1105 | trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); | 1093 | trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); |
1106 | goto unlock_out; | 1094 | goto unlock_out; |
1107 | } | 1095 | } |
1108 | 1096 | ||
@@ -1111,9 +1099,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1111 | 1099 | ||
1112 | /* If a grace period is not already in progress, start one. */ | 1100 | /* If a grace period is not already in progress, start one. */ |
1113 | if (rnp_root->gpnum != rnp_root->completed) { | 1101 | if (rnp_root->gpnum != rnp_root->completed) { |
1114 | trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); | 1102 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); |
1115 | } else { | 1103 | } else { |
1116 | trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); | 1104 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); |
1117 | rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); | 1105 | rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); |
1118 | } | 1106 | } |
1119 | unlock_out: | 1107 | unlock_out: |
@@ -1137,7 +1125,8 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | |||
1137 | rcu_nocb_gp_cleanup(rsp, rnp); | 1125 | rcu_nocb_gp_cleanup(rsp, rnp); |
1138 | rnp->need_future_gp[c & 0x1] = 0; | 1126 | rnp->need_future_gp[c & 0x1] = 0; |
1139 | needmore = rnp->need_future_gp[(c + 1) & 0x1]; | 1127 | needmore = rnp->need_future_gp[(c + 1) & 0x1]; |
1140 | trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); | 1128 | trace_rcu_future_gp(rnp, rdp, c, |
1129 | needmore ? TPS("CleanupMore") : TPS("Cleanup")); | ||
1141 | return needmore; | 1130 | return needmore; |
1142 | } | 1131 | } |
1143 | 1132 | ||
@@ -1205,9 +1194,9 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1205 | 1194 | ||
1206 | /* Trace depending on how much we were able to accelerate. */ | 1195 | /* Trace depending on how much we were able to accelerate. */ |
1207 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) | 1196 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) |
1208 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB"); | 1197 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); |
1209 | else | 1198 | else |
1210 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB"); | 1199 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); |
1211 | } | 1200 | } |
1212 | 1201 | ||
1213 | /* | 1202 | /* |
@@ -1273,7 +1262,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc | |||
1273 | 1262 | ||
1274 | /* Remember that we saw this grace-period completion. */ | 1263 | /* Remember that we saw this grace-period completion. */ |
1275 | rdp->completed = rnp->completed; | 1264 | rdp->completed = rnp->completed; |
1276 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); | 1265 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); |
1277 | } | 1266 | } |
1278 | 1267 | ||
1279 | if (rdp->gpnum != rnp->gpnum) { | 1268 | if (rdp->gpnum != rnp->gpnum) { |
@@ -1283,7 +1272,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc | |||
1283 | * go looking for one. | 1272 | * go looking for one. |
1284 | */ | 1273 | */ |
1285 | rdp->gpnum = rnp->gpnum; | 1274 | rdp->gpnum = rnp->gpnum; |
1286 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); | 1275 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); |
1287 | rdp->passed_quiesce = 0; | 1276 | rdp->passed_quiesce = 0; |
1288 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); | 1277 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); |
1289 | zero_cpu_stall_ticks(rdp); | 1278 | zero_cpu_stall_ticks(rdp); |
@@ -1315,6 +1304,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1315 | struct rcu_data *rdp; | 1304 | struct rcu_data *rdp; |
1316 | struct rcu_node *rnp = rcu_get_root(rsp); | 1305 | struct rcu_node *rnp = rcu_get_root(rsp); |
1317 | 1306 | ||
1307 | rcu_bind_gp_kthread(); | ||
1318 | raw_spin_lock_irq(&rnp->lock); | 1308 | raw_spin_lock_irq(&rnp->lock); |
1319 | rsp->gp_flags = 0; /* Clear all flags: New grace period. */ | 1309 | rsp->gp_flags = 0; /* Clear all flags: New grace period. */ |
1320 | 1310 | ||
@@ -1326,7 +1316,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1326 | 1316 | ||
1327 | /* Advance to a new grace period and initialize state. */ | 1317 | /* Advance to a new grace period and initialize state. */ |
1328 | rsp->gpnum++; | 1318 | rsp->gpnum++; |
1329 | trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); | 1319 | trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); |
1330 | record_gp_stall_check_time(rsp); | 1320 | record_gp_stall_check_time(rsp); |
1331 | raw_spin_unlock_irq(&rnp->lock); | 1321 | raw_spin_unlock_irq(&rnp->lock); |
1332 | 1322 | ||
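
rcu_gp_init() above now begins by calling rcu_bind_gp_kthread(), which, as the rcutree_plugin.h hunks later in this diff show, pins the grace-period kthread to the timekeeping CPU (tick_do_timer_cpu) when full-system-idle tracking is configured. A rough userspace analogue of that binding, assuming CPU 0 as the housekeeping CPU purely for the demo:

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

/* Pin the calling thread to one "housekeeping" CPU so its own work cannot
 * disturb the CPUs being watched for idleness. Build with -pthread. */
int main(void)
{
	cpu_set_t mask;
	int err;

	CPU_ZERO(&mask);
	CPU_SET(0, &mask);
	err = pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
	if (err)
		fprintf(stderr, "setaffinity failed: %d\n", err);
	else
		printf("bound to CPU 0\n");
	return 0;
}
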
@@ -1379,16 +1369,25 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1379 | int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | 1369 | int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) |
1380 | { | 1370 | { |
1381 | int fqs_state = fqs_state_in; | 1371 | int fqs_state = fqs_state_in; |
1372 | bool isidle = false; | ||
1373 | unsigned long maxj; | ||
1382 | struct rcu_node *rnp = rcu_get_root(rsp); | 1374 | struct rcu_node *rnp = rcu_get_root(rsp); |
1383 | 1375 | ||
1384 | rsp->n_force_qs++; | 1376 | rsp->n_force_qs++; |
1385 | if (fqs_state == RCU_SAVE_DYNTICK) { | 1377 | if (fqs_state == RCU_SAVE_DYNTICK) { |
1386 | /* Collect dyntick-idle snapshots. */ | 1378 | /* Collect dyntick-idle snapshots. */ |
1387 | force_qs_rnp(rsp, dyntick_save_progress_counter); | 1379 | if (is_sysidle_rcu_state(rsp)) { |
1380 | isidle = 1; | ||
1381 | maxj = jiffies - ULONG_MAX / 4; | ||
1382 | } | ||
1383 | force_qs_rnp(rsp, dyntick_save_progress_counter, | ||
1384 | &isidle, &maxj); | ||
1385 | rcu_sysidle_report_gp(rsp, isidle, maxj); | ||
1388 | fqs_state = RCU_FORCE_QS; | 1386 | fqs_state = RCU_FORCE_QS; |
1389 | } else { | 1387 | } else { |
1390 | /* Handle dyntick-idle and offline CPUs. */ | 1388 | /* Handle dyntick-idle and offline CPUs. */ |
1391 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs); | 1389 | isidle = 0; |
1390 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); | ||
1392 | } | 1391 | } |
1393 | /* Clear flag to prevent immediate re-entry. */ | 1392 | /* Clear flag to prevent immediate re-entry. */ |
1394 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | 1393 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { |
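
The rcu_gp_fqs() hunk above seeds maxj with "jiffies - ULONG_MAX / 4", a timestamp so far in the past that any real idle-entry time compares as newer under the wraparound-safe jiffies comparisons. A small sketch, with ULONG_CMP_GE()/ULONG_CMP_LT() reproduced from memory of their conventional kernel definitions (they are not part of this diff):

#include <limits.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long jiffies = 5;			/* counter recently wrapped */
	unsigned long maxj = jiffies - ULONG_MAX / 4;	/* sentinel: "very old" */

	/* Both print 1: the sentinel is older than anything we will observe. */
	printf("sentinel older than now?  %d\n", ULONG_CMP_LT(maxj, jiffies));
	printf("now at least as new?      %d\n", ULONG_CMP_GE(jiffies, maxj));
	return 0;
}
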
@@ -1448,7 +1447,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
1448 | rcu_nocb_gp_set(rnp, nocb); | 1447 | rcu_nocb_gp_set(rnp, nocb); |
1449 | 1448 | ||
1450 | rsp->completed = rsp->gpnum; /* Declare grace period done. */ | 1449 | rsp->completed = rsp->gpnum; /* Declare grace period done. */ |
1451 | trace_rcu_grace_period(rsp->name, rsp->completed, "end"); | 1450 | trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); |
1452 | rsp->fqs_state = RCU_GP_IDLE; | 1451 | rsp->fqs_state = RCU_GP_IDLE; |
1453 | rdp = this_cpu_ptr(rsp->rda); | 1452 | rdp = this_cpu_ptr(rsp->rda); |
1454 | rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ | 1453 | rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ |
@@ -1558,10 +1557,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1558 | 1557 | ||
1559 | /* | 1558 | /* |
1560 | * We can't do wakeups while holding the rnp->lock, as that | 1559 | * We can't do wakeups while holding the rnp->lock, as that |
1561 | * could cause possible deadlocks with the rq->lock. Deter | 1560 | * could cause possible deadlocks with the rq->lock. Defer |
1562 | * the wakeup to interrupt context. | 1561 | * the wakeup to interrupt context. And don't bother waking |
1562 | * up the running kthread. | ||
1563 | */ | 1563 | */ |
1564 | irq_work_queue(&rsp->wakeup_work); | 1564 | if (current != rsp->gp_kthread) |
1565 | irq_work_queue(&rsp->wakeup_work); | ||
1565 | } | 1566 | } |
1566 | 1567 | ||
1567 | /* | 1568 | /* |
@@ -1857,7 +1858,7 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | |||
1857 | RCU_TRACE(mask = rdp->grpmask); | 1858 | RCU_TRACE(mask = rdp->grpmask); |
1858 | trace_rcu_grace_period(rsp->name, | 1859 | trace_rcu_grace_period(rsp->name, |
1859 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), | 1860 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), |
1860 | "cpuofl"); | 1861 | TPS("cpuofl")); |
1861 | } | 1862 | } |
1862 | 1863 | ||
1863 | /* | 1864 | /* |
@@ -2044,7 +2045,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
2044 | */ | 2045 | */ |
2045 | void rcu_check_callbacks(int cpu, int user) | 2046 | void rcu_check_callbacks(int cpu, int user) |
2046 | { | 2047 | { |
2047 | trace_rcu_utilization("Start scheduler-tick"); | 2048 | trace_rcu_utilization(TPS("Start scheduler-tick")); |
2048 | increment_cpu_stall_ticks(); | 2049 | increment_cpu_stall_ticks(); |
2049 | if (user || rcu_is_cpu_rrupt_from_idle()) { | 2050 | if (user || rcu_is_cpu_rrupt_from_idle()) { |
2050 | 2051 | ||
@@ -2077,7 +2078,7 @@ void rcu_check_callbacks(int cpu, int user) | |||
2077 | rcu_preempt_check_callbacks(cpu); | 2078 | rcu_preempt_check_callbacks(cpu); |
2078 | if (rcu_pending(cpu)) | 2079 | if (rcu_pending(cpu)) |
2079 | invoke_rcu_core(); | 2080 | invoke_rcu_core(); |
2080 | trace_rcu_utilization("End scheduler-tick"); | 2081 | trace_rcu_utilization(TPS("End scheduler-tick")); |
2081 | } | 2082 | } |
2082 | 2083 | ||
2083 | /* | 2084 | /* |
@@ -2087,7 +2088,10 @@ void rcu_check_callbacks(int cpu, int user) | |||
2087 | * | 2088 | * |
2088 | * The caller must have suppressed start of new grace periods. | 2089 | * The caller must have suppressed start of new grace periods. |
2089 | */ | 2090 | */ |
2090 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | 2091 | static void force_qs_rnp(struct rcu_state *rsp, |
2092 | int (*f)(struct rcu_data *rsp, bool *isidle, | ||
2093 | unsigned long *maxj), | ||
2094 | bool *isidle, unsigned long *maxj) | ||
2091 | { | 2095 | { |
2092 | unsigned long bit; | 2096 | unsigned long bit; |
2093 | int cpu; | 2097 | int cpu; |
@@ -2110,9 +2114,12 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
2110 | cpu = rnp->grplo; | 2114 | cpu = rnp->grplo; |
2111 | bit = 1; | 2115 | bit = 1; |
2112 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { | 2116 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { |
2113 | if ((rnp->qsmask & bit) != 0 && | 2117 | if ((rnp->qsmask & bit) != 0) { |
2114 | f(per_cpu_ptr(rsp->rda, cpu))) | 2118 | if ((rnp->qsmaskinit & bit) != 0) |
2115 | mask |= bit; | 2119 | *isidle = 0; |
2120 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) | ||
2121 | mask |= bit; | ||
2122 | } | ||
2116 | } | 2123 | } |
2117 | if (mask != 0) { | 2124 | if (mask != 0) { |
2118 | 2125 | ||
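
force_qs_rnp() now threads two out-parameters through its per-CPU scan function, so a single pass can both record quiescent states and decide whether every watched CPU is idle, and since when. A compact sketch of that calling convention; the per-CPU data layout and names below are made up for illustration:

#include <stdbool.h>
#include <stdio.h>

struct cpu_snap {
	int dynticks;		/* even = idle, odd = busy */
	unsigned long idle_jiffies;
};

static int save_progress(struct cpu_snap *cs, bool *isidle, unsigned long *maxj)
{
	if (cs->dynticks & 0x1)
		*isidle = false;		/* one busy CPU spoils sysidle */
	else if (*maxj < cs->idle_jiffies)
		*maxj = cs->idle_jiffies;	/* latest idle entry wins */
	return (cs->dynticks & 0x1) == 0;	/* idle counts as a quiescent state */
}

static void scan(struct cpu_snap *cpus, int n,
		 int (*f)(struct cpu_snap *, bool *, unsigned long *),
		 bool *isidle, unsigned long *maxj)
{
	for (int i = 0; i < n; i++)
		f(&cpus[i], isidle, maxj);
}

int main(void)
{
	struct cpu_snap cpus[] = { { 2, 100 }, { 4, 250 }, { 6, 180 } };
	bool isidle = true;
	unsigned long maxj = 0;

	scan(cpus, 3, save_progress, &isidle, &maxj);
	printf("isidle=%d maxj=%lu\n", isidle, maxj);	/* isidle=1 maxj=250 */
	return 0;
}
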
@@ -2208,10 +2215,10 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
2208 | 2215 | ||
2209 | if (cpu_is_offline(smp_processor_id())) | 2216 | if (cpu_is_offline(smp_processor_id())) |
2210 | return; | 2217 | return; |
2211 | trace_rcu_utilization("Start RCU core"); | 2218 | trace_rcu_utilization(TPS("Start RCU core")); |
2212 | for_each_rcu_flavor(rsp) | 2219 | for_each_rcu_flavor(rsp) |
2213 | __rcu_process_callbacks(rsp); | 2220 | __rcu_process_callbacks(rsp); |
2214 | trace_rcu_utilization("End RCU core"); | 2221 | trace_rcu_utilization(TPS("End RCU core")); |
2215 | } | 2222 | } |
2216 | 2223 | ||
2217 | /* | 2224 | /* |
@@ -2287,6 +2294,13 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
2287 | } | 2294 | } |
2288 | 2295 | ||
2289 | /* | 2296 | /* |
2297 | * RCU callback function to leak a callback. | ||
2298 | */ | ||
2299 | static void rcu_leak_callback(struct rcu_head *rhp) | ||
2300 | { | ||
2301 | } | ||
2302 | |||
2303 | /* | ||
2290 | * Helper function for call_rcu() and friends. The cpu argument will | 2304 | * Helper function for call_rcu() and friends. The cpu argument will |
2291 | * normally be -1, indicating "currently running CPU". It may specify | 2305 | * normally be -1, indicating "currently running CPU". It may specify |
2292 | * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() | 2306 | * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() |
@@ -2300,7 +2314,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
2300 | struct rcu_data *rdp; | 2314 | struct rcu_data *rdp; |
2301 | 2315 | ||
2302 | WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ | 2316 | WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ |
2303 | debug_rcu_head_queue(head); | 2317 | if (debug_rcu_head_queue(head)) { |
2318 | /* Probable double call_rcu(), so leak the callback. */ | ||
2319 | ACCESS_ONCE(head->func) = rcu_leak_callback; | ||
2320 | WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); | ||
2321 | return; | ||
2322 | } | ||
2304 | head->func = func; | 2323 | head->func = func; |
2305 | head->next = NULL; | 2324 | head->next = NULL; |
2306 | 2325 | ||
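
The __call_rcu() change above turns a probable double call_rcu() into a warning plus a deliberately leaked callback rather than an enqueue. The reason is that rcu_head nodes are chained through their own ->next pointers, so posting the same head twice rewrites its link and corrupts the callback list. A tiny sketch of that corruption on a generic singly linked queue:

#include <stdio.h>

struct node { struct node *next; };

static struct node *head, **tail = &head;

static void enqueue(struct node *n)
{
	n->next = NULL;
	*tail = n;
	tail = &n->next;
}

int main(void)
{
	struct node a, b;

	enqueue(&a);
	enqueue(&b);
	enqueue(&a);	/* double enqueue: resets a.next, leaves b.next dangling */
	printf("a.next == NULL? %d   b.next == &a? %d\n",
	       a.next == NULL, b.next == &a);
	/* The list now reads just "a"; callback b has silently fallen off. */
	return 0;
}
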
@@ -2720,7 +2739,7 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) | |||
2720 | * Helper function for _rcu_barrier() tracing. If tracing is disabled, | 2739 | * Helper function for _rcu_barrier() tracing. If tracing is disabled, |
2721 | * the compiler is expected to optimize this away. | 2740 | * the compiler is expected to optimize this away. |
2722 | */ | 2741 | */ |
2723 | static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, | 2742 | static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s, |
2724 | int cpu, unsigned long done) | 2743 | int cpu, unsigned long done) |
2725 | { | 2744 | { |
2726 | trace_rcu_barrier(rsp->name, s, cpu, | 2745 | trace_rcu_barrier(rsp->name, s, cpu, |
@@ -2785,9 +2804,20 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
2785 | * transition. The "if" expression below therefore rounds the old | 2804 | * transition. The "if" expression below therefore rounds the old |
2786 | * value up to the next even number and adds two before comparing. | 2805 | * value up to the next even number and adds two before comparing. |
2787 | */ | 2806 | */ |
2788 | snap_done = ACCESS_ONCE(rsp->n_barrier_done); | 2807 | snap_done = rsp->n_barrier_done; |
2789 | _rcu_barrier_trace(rsp, "Check", -1, snap_done); | 2808 | _rcu_barrier_trace(rsp, "Check", -1, snap_done); |
2790 | if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { | 2809 | |
2810 | /* | ||
2811 | * If the value in snap is odd, we needed to wait for the current | ||
2812 | * rcu_barrier() to complete, then wait for the next one, in other | ||
2813 | * words, we need the value of snap_done to be three larger than | ||
2814 | * the value of snap. On the other hand, if the value in snap is | ||
2815 | * even, we only had to wait for the next rcu_barrier() to complete, | ||
2816 | * in other words, we need the value of snap_done to be only two | ||
2817 | * greater than the value of snap. The "(snap + 3) & ~0x1" computes | ||
2818 | * this for us (thank you, Linus!). | ||
2819 | */ | ||
2820 | if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) { | ||
2791 | _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); | 2821 | _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); |
2792 | smp_mb(); /* caller's subsequent code after above check. */ | 2822 | smp_mb(); /* caller's subsequent code after above check. */ |
2793 | mutex_unlock(&rsp->barrier_mutex); | 2823 | mutex_unlock(&rsp->barrier_mutex); |
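
The new comment in _rcu_barrier() explains the "(snap + 3) & ~0x1" target: n_barrier_done is odd while a barrier is executing and even otherwise, so the early-exit test needs snap+3 when snap is odd and snap+2 when snap is even. The arithmetic is easy to spot-check in isolation:

#include <stdio.h>

int main(void)
{
	for (unsigned long snap = 4; snap <= 7; snap++)
		printf("snap=%lu -> must wait for n_barrier_done >= %lu\n",
		       snap, (snap + 3) & ~0x1UL);
	/* Prints 6, 8, 8, 10: even snaps need +2, odd snaps need +3. */
	return 0;
}
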
@@ -2910,7 +2940,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
2910 | * can accept some slop in the rsp->completed access due to the fact | 2940 | * can accept some slop in the rsp->completed access due to the fact |
2911 | * that this CPU cannot possibly have any RCU callbacks in flight yet. | 2941 | * that this CPU cannot possibly have any RCU callbacks in flight yet. |
2912 | */ | 2942 | */ |
2913 | static void __cpuinit | 2943 | static void |
2914 | rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | 2944 | rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) |
2915 | { | 2945 | { |
2916 | unsigned long flags; | 2946 | unsigned long flags; |
@@ -2930,6 +2960,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
2930 | rdp->blimit = blimit; | 2960 | rdp->blimit = blimit; |
2931 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ | 2961 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ |
2932 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 2962 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
2963 | rcu_sysidle_init_percpu_data(rdp->dynticks); | ||
2933 | atomic_set(&rdp->dynticks->dynticks, | 2964 | atomic_set(&rdp->dynticks->dynticks, |
2934 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); | 2965 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); |
2935 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 2966 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
@@ -2952,7 +2983,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
2952 | rdp->completed = rnp->completed; | 2983 | rdp->completed = rnp->completed; |
2953 | rdp->passed_quiesce = 0; | 2984 | rdp->passed_quiesce = 0; |
2954 | rdp->qs_pending = 0; | 2985 | rdp->qs_pending = 0; |
2955 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); | 2986 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); |
2956 | } | 2987 | } |
2957 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ | 2988 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ |
2958 | rnp = rnp->parent; | 2989 | rnp = rnp->parent; |
@@ -2962,7 +2993,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
2962 | mutex_unlock(&rsp->onoff_mutex); | 2993 | mutex_unlock(&rsp->onoff_mutex); |
2963 | } | 2994 | } |
2964 | 2995 | ||
2965 | static void __cpuinit rcu_prepare_cpu(int cpu) | 2996 | static void rcu_prepare_cpu(int cpu) |
2966 | { | 2997 | { |
2967 | struct rcu_state *rsp; | 2998 | struct rcu_state *rsp; |
2968 | 2999 | ||
@@ -2974,7 +3005,7 @@ static void __cpuinit rcu_prepare_cpu(int cpu) | |||
2974 | /* | 3005 | /* |
2975 | * Handle CPU online/offline notification events. | 3006 | * Handle CPU online/offline notification events. |
2976 | */ | 3007 | */ |
2977 | static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | 3008 | static int rcu_cpu_notify(struct notifier_block *self, |
2978 | unsigned long action, void *hcpu) | 3009 | unsigned long action, void *hcpu) |
2979 | { | 3010 | { |
2980 | long cpu = (long)hcpu; | 3011 | long cpu = (long)hcpu; |
@@ -2982,7 +3013,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2982 | struct rcu_node *rnp = rdp->mynode; | 3013 | struct rcu_node *rnp = rdp->mynode; |
2983 | struct rcu_state *rsp; | 3014 | struct rcu_state *rsp; |
2984 | 3015 | ||
2985 | trace_rcu_utilization("Start CPU hotplug"); | 3016 | trace_rcu_utilization(TPS("Start CPU hotplug")); |
2986 | switch (action) { | 3017 | switch (action) { |
2987 | case CPU_UP_PREPARE: | 3018 | case CPU_UP_PREPARE: |
2988 | case CPU_UP_PREPARE_FROZEN: | 3019 | case CPU_UP_PREPARE_FROZEN: |
@@ -3011,7 +3042,26 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
3011 | default: | 3042 | default: |
3012 | break; | 3043 | break; |
3013 | } | 3044 | } |
3014 | trace_rcu_utilization("End CPU hotplug"); | 3045 | trace_rcu_utilization(TPS("End CPU hotplug")); |
3046 | return NOTIFY_OK; | ||
3047 | } | ||
3048 | |||
3049 | static int rcu_pm_notify(struct notifier_block *self, | ||
3050 | unsigned long action, void *hcpu) | ||
3051 | { | ||
3052 | switch (action) { | ||
3053 | case PM_HIBERNATION_PREPARE: | ||
3054 | case PM_SUSPEND_PREPARE: | ||
3055 | if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ | ||
3056 | rcu_expedited = 1; | ||
3057 | break; | ||
3058 | case PM_POST_HIBERNATION: | ||
3059 | case PM_POST_SUSPEND: | ||
3060 | rcu_expedited = 0; | ||
3061 | break; | ||
3062 | default: | ||
3063 | break; | ||
3064 | } | ||
3015 | return NOTIFY_OK; | 3065 | return NOTIFY_OK; |
3016 | } | 3066 | } |
3017 | 3067 | ||
@@ -3256,6 +3306,7 @@ void __init rcu_init(void) | |||
3256 | * or the scheduler are operational. | 3306 | * or the scheduler are operational. |
3257 | */ | 3307 | */ |
3258 | cpu_notifier(rcu_cpu_notify, 0); | 3308 | cpu_notifier(rcu_cpu_notify, 0); |
3309 | pm_notifier(rcu_pm_notify, 0); | ||
3259 | for_each_online_cpu(cpu) | 3310 | for_each_online_cpu(cpu) |
3260 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 3311 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); |
3261 | } | 3312 | } |
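
rcu_init() above registers rcu_pm_notify(), which temporarily sets rcu_expedited across suspend and hibernation but only on machines with at most 256 CPUs because, as the comment notes, expediting is bad for large systems. A userspace sketch of that policy with stand-in action values; the PM_* constants and the notifier-chain plumbing are not reproduced here:

#include <stdio.h>

enum pm_action { PM_PREPARE, PM_POST };	/* stand-ins, not the kernel's PM_* values */

static int rcu_expedited;
static int nr_cpu_ids = 8;		/* assumed machine size for the demo */

static void pm_notify(enum pm_action action)
{
	switch (action) {
	case PM_PREPARE:
		if (nr_cpu_ids <= 256)	/* expediting is bad for large systems */
			rcu_expedited = 1;
		break;
	case PM_POST:
		rcu_expedited = 0;
		break;
	}
}

int main(void)
{
	pm_notify(PM_PREPARE);
	printf("rcu_expedited=%d during suspend\n", rcu_expedited);
	pm_notify(PM_POST);
	printf("rcu_expedited=%d after resume\n", rcu_expedited);
	return 0;
}
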
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4a39d364493c..5f97eab602cd 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -88,6 +88,14 @@ struct rcu_dynticks { | |||
88 | /* Process level is worth LLONG_MAX/2. */ | 88 | /* Process level is worth LLONG_MAX/2. */ |
89 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ | 89 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
90 | atomic_t dynticks; /* Even value for idle, else odd. */ | 90 | atomic_t dynticks; /* Even value for idle, else odd. */ |
91 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
92 | long long dynticks_idle_nesting; | ||
93 | /* irq/process nesting level from idle. */ | ||
94 | atomic_t dynticks_idle; /* Even value for idle, else odd. */ | ||
95 | /* "Idle" excludes userspace execution. */ | ||
96 | unsigned long dynticks_idle_jiffies; | ||
97 | /* End of last non-NMI non-idle period. */ | ||
98 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
91 | #ifdef CONFIG_RCU_FAST_NO_HZ | 99 | #ifdef CONFIG_RCU_FAST_NO_HZ |
92 | bool all_lazy; /* Are all CPU's CBs lazy? */ | 100 | bool all_lazy; /* Are all CPU's CBs lazy? */ |
93 | unsigned long nonlazy_posted; | 101 | unsigned long nonlazy_posted; |
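
The three new rcu_dynticks fields above implement a second even/odd counter, dynticks_idle, that tracks idleness excluding userspace execution, plus the jiffies timestamp of the last idle entry. A single-threaded sketch of the parity protocol; the initial value, field names, and helper are assumptions, and the kernel additionally brackets the increments with explicit memory barriers:

#include <stdatomic.h>
#include <stdio.h>

struct cpu_idle {
	atomic_uint dynticks_idle;	/* even = idle, odd = non-idle */
	unsigned long idle_since;	/* jiffies-style timestamp */
};

static void idle_enter(struct cpu_idle *ci, unsigned long now)
{
	ci->idle_since = now;
	atomic_fetch_add(&ci->dynticks_idle, 1);	/* odd -> even */
}

static void idle_exit(struct cpu_idle *ci)
{
	atomic_fetch_add(&ci->dynticks_idle, 1);	/* even -> odd */
}

static int cpu_is_idle(struct cpu_idle *ci)
{
	return (atomic_load(&ci->dynticks_idle) & 0x1) == 0;
}

int main(void)
{
	struct cpu_idle ci;

	atomic_init(&ci.dynticks_idle, 1);	/* assume CPUs start non-idle */
	ci.idle_since = 0;

	idle_enter(&ci, 1000);
	printf("idle? %d since %lu\n", cpu_is_idle(&ci), ci.idle_since);
	idle_exit(&ci);
	printf("idle? %d\n", cpu_is_idle(&ci));
	return 0;
}
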
@@ -445,7 +453,7 @@ struct rcu_state { | |||
445 | /* for CPU stalls. */ | 453 | /* for CPU stalls. */ |
446 | unsigned long gp_max; /* Maximum GP duration in */ | 454 | unsigned long gp_max; /* Maximum GP duration in */ |
447 | /* jiffies. */ | 455 | /* jiffies. */ |
448 | char *name; /* Name of structure. */ | 456 | const char *name; /* Name of structure. */ |
449 | char abbr; /* Abbreviated name. */ | 457 | char abbr; /* Abbreviated name. */ |
450 | struct list_head flavors; /* List of RCU flavors. */ | 458 | struct list_head flavors; /* List of RCU flavors. */ |
451 | struct irq_work wakeup_work; /* Postponed wakeups */ | 459 | struct irq_work wakeup_work; /* Postponed wakeups */ |
@@ -521,10 +529,10 @@ static void invoke_rcu_callbacks_kthread(void); | |||
521 | static bool rcu_is_callbacks_kthread(void); | 529 | static bool rcu_is_callbacks_kthread(void); |
522 | #ifdef CONFIG_RCU_BOOST | 530 | #ifdef CONFIG_RCU_BOOST |
523 | static void rcu_preempt_do_callbacks(void); | 531 | static void rcu_preempt_do_callbacks(void); |
524 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | 532 | static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, |
525 | struct rcu_node *rnp); | 533 | struct rcu_node *rnp); |
526 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 534 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
527 | static void __cpuinit rcu_prepare_kthreads(int cpu); | 535 | static void rcu_prepare_kthreads(int cpu); |
528 | static void rcu_cleanup_after_idle(int cpu); | 536 | static void rcu_cleanup_after_idle(int cpu); |
529 | static void rcu_prepare_for_idle(int cpu); | 537 | static void rcu_prepare_for_idle(int cpu); |
530 | static void rcu_idle_count_callbacks_posted(void); | 538 | static void rcu_idle_count_callbacks_posted(void); |
@@ -545,6 +553,15 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); | |||
545 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); | 553 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); |
546 | static void rcu_kick_nohz_cpu(int cpu); | 554 | static void rcu_kick_nohz_cpu(int cpu); |
547 | static bool init_nocb_callback_list(struct rcu_data *rdp); | 555 | static bool init_nocb_callback_list(struct rcu_data *rdp); |
556 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); | ||
557 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); | ||
558 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
559 | unsigned long *maxj); | ||
560 | static bool is_sysidle_rcu_state(struct rcu_state *rsp); | ||
561 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
562 | unsigned long maxj); | ||
563 | static void rcu_bind_gp_kthread(void); | ||
564 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); | ||
548 | 565 | ||
549 | #endif /* #ifndef RCU_TREE_NONCORE */ | 566 | #endif /* #ifndef RCU_TREE_NONCORE */ |
550 | 567 | ||
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 63098a59216e..130c97b027f2 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -28,7 +28,7 @@ | |||
28 | #include <linux/gfp.h> | 28 | #include <linux/gfp.h> |
29 | #include <linux/oom.h> | 29 | #include <linux/oom.h> |
30 | #include <linux/smpboot.h> | 30 | #include <linux/smpboot.h> |
31 | #include <linux/tick.h> | 31 | #include "time/tick-internal.h" |
32 | 32 | ||
33 | #define RCU_KTHREAD_PRIO 1 | 33 | #define RCU_KTHREAD_PRIO 1 |
34 | 34 | ||
@@ -110,9 +110,7 @@ static void __init rcu_bootup_announce_oddness(void) | |||
110 | 110 | ||
111 | #ifdef CONFIG_TREE_PREEMPT_RCU | 111 | #ifdef CONFIG_TREE_PREEMPT_RCU |
112 | 112 | ||
113 | struct rcu_state rcu_preempt_state = | 113 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); |
114 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); | ||
115 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); | ||
116 | static struct rcu_state *rcu_state = &rcu_preempt_state; | 114 | static struct rcu_state *rcu_state = &rcu_preempt_state; |
117 | 115 | ||
118 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); | 116 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); |
@@ -169,7 +167,7 @@ static void rcu_preempt_qs(int cpu) | |||
169 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); | 167 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); |
170 | 168 | ||
171 | if (rdp->passed_quiesce == 0) | 169 | if (rdp->passed_quiesce == 0) |
172 | trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); | 170 | trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); |
173 | rdp->passed_quiesce = 1; | 171 | rdp->passed_quiesce = 1; |
174 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | 172 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; |
175 | } | 173 | } |
@@ -388,7 +386,7 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
388 | np = rcu_next_node_entry(t, rnp); | 386 | np = rcu_next_node_entry(t, rnp); |
389 | list_del_init(&t->rcu_node_entry); | 387 | list_del_init(&t->rcu_node_entry); |
390 | t->rcu_blocked_node = NULL; | 388 | t->rcu_blocked_node = NULL; |
391 | trace_rcu_unlock_preempted_task("rcu_preempt", | 389 | trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), |
392 | rnp->gpnum, t->pid); | 390 | rnp->gpnum, t->pid); |
393 | if (&t->rcu_node_entry == rnp->gp_tasks) | 391 | if (&t->rcu_node_entry == rnp->gp_tasks) |
394 | rnp->gp_tasks = np; | 392 | rnp->gp_tasks = np; |
@@ -412,7 +410,7 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
412 | */ | 410 | */ |
413 | empty_exp_now = !rcu_preempted_readers_exp(rnp); | 411 | empty_exp_now = !rcu_preempted_readers_exp(rnp); |
414 | if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { | 412 | if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { |
415 | trace_rcu_quiescent_state_report("preempt_rcu", | 413 | trace_rcu_quiescent_state_report(TPS("preempt_rcu"), |
416 | rnp->gpnum, | 414 | rnp->gpnum, |
417 | 0, rnp->qsmask, | 415 | 0, rnp->qsmask, |
418 | rnp->level, | 416 | rnp->level, |
@@ -1250,12 +1248,12 @@ static int rcu_boost_kthread(void *arg) | |||
1250 | int spincnt = 0; | 1248 | int spincnt = 0; |
1251 | int more2boost; | 1249 | int more2boost; |
1252 | 1250 | ||
1253 | trace_rcu_utilization("Start boost kthread@init"); | 1251 | trace_rcu_utilization(TPS("Start boost kthread@init")); |
1254 | for (;;) { | 1252 | for (;;) { |
1255 | rnp->boost_kthread_status = RCU_KTHREAD_WAITING; | 1253 | rnp->boost_kthread_status = RCU_KTHREAD_WAITING; |
1256 | trace_rcu_utilization("End boost kthread@rcu_wait"); | 1254 | trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); |
1257 | rcu_wait(rnp->boost_tasks || rnp->exp_tasks); | 1255 | rcu_wait(rnp->boost_tasks || rnp->exp_tasks); |
1258 | trace_rcu_utilization("Start boost kthread@rcu_wait"); | 1256 | trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); |
1259 | rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; | 1257 | rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; |
1260 | more2boost = rcu_boost(rnp); | 1258 | more2boost = rcu_boost(rnp); |
1261 | if (more2boost) | 1259 | if (more2boost) |
@@ -1264,14 +1262,14 @@ static int rcu_boost_kthread(void *arg) | |||
1264 | spincnt = 0; | 1262 | spincnt = 0; |
1265 | if (spincnt > 10) { | 1263 | if (spincnt > 10) { |
1266 | rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; | 1264 | rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; |
1267 | trace_rcu_utilization("End boost kthread@rcu_yield"); | 1265 | trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); |
1268 | schedule_timeout_interruptible(2); | 1266 | schedule_timeout_interruptible(2); |
1269 | trace_rcu_utilization("Start boost kthread@rcu_yield"); | 1267 | trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); |
1270 | spincnt = 0; | 1268 | spincnt = 0; |
1271 | } | 1269 | } |
1272 | } | 1270 | } |
1273 | /* NOTREACHED */ | 1271 | /* NOTREACHED */ |
1274 | trace_rcu_utilization("End boost kthread@notreached"); | 1272 | trace_rcu_utilization(TPS("End boost kthread@notreached")); |
1275 | return 0; | 1273 | return 0; |
1276 | } | 1274 | } |
1277 | 1275 | ||
@@ -1352,7 +1350,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | |||
1352 | * already exist. We only create this kthread for preemptible RCU. | 1350 | * already exist. We only create this kthread for preemptible RCU. |
1353 | * Returns zero if all is well, a negated errno otherwise. | 1351 | * Returns zero if all is well, a negated errno otherwise. |
1354 | */ | 1352 | */ |
1355 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | 1353 | static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, |
1356 | struct rcu_node *rnp) | 1354 | struct rcu_node *rnp) |
1357 | { | 1355 | { |
1358 | int rnp_index = rnp - &rsp->node[0]; | 1356 | int rnp_index = rnp - &rsp->node[0]; |
@@ -1419,7 +1417,7 @@ static void rcu_cpu_kthread(unsigned int cpu) | |||
1419 | int spincnt; | 1417 | int spincnt; |
1420 | 1418 | ||
1421 | for (spincnt = 0; spincnt < 10; spincnt++) { | 1419 | for (spincnt = 0; spincnt < 10; spincnt++) { |
1422 | trace_rcu_utilization("Start CPU kthread@rcu_wait"); | 1420 | trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); |
1423 | local_bh_disable(); | 1421 | local_bh_disable(); |
1424 | *statusp = RCU_KTHREAD_RUNNING; | 1422 | *statusp = RCU_KTHREAD_RUNNING; |
1425 | this_cpu_inc(rcu_cpu_kthread_loops); | 1423 | this_cpu_inc(rcu_cpu_kthread_loops); |
@@ -1431,15 +1429,15 @@ static void rcu_cpu_kthread(unsigned int cpu) | |||
1431 | rcu_kthread_do_work(); | 1429 | rcu_kthread_do_work(); |
1432 | local_bh_enable(); | 1430 | local_bh_enable(); |
1433 | if (*workp == 0) { | 1431 | if (*workp == 0) { |
1434 | trace_rcu_utilization("End CPU kthread@rcu_wait"); | 1432 | trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); |
1435 | *statusp = RCU_KTHREAD_WAITING; | 1433 | *statusp = RCU_KTHREAD_WAITING; |
1436 | return; | 1434 | return; |
1437 | } | 1435 | } |
1438 | } | 1436 | } |
1439 | *statusp = RCU_KTHREAD_YIELDING; | 1437 | *statusp = RCU_KTHREAD_YIELDING; |
1440 | trace_rcu_utilization("Start CPU kthread@rcu_yield"); | 1438 | trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); |
1441 | schedule_timeout_interruptible(2); | 1439 | schedule_timeout_interruptible(2); |
1442 | trace_rcu_utilization("End CPU kthread@rcu_yield"); | 1440 | trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); |
1443 | *statusp = RCU_KTHREAD_WAITING; | 1441 | *statusp = RCU_KTHREAD_WAITING; |
1444 | } | 1442 | } |
1445 | 1443 | ||
@@ -1507,7 +1505,7 @@ static int __init rcu_spawn_kthreads(void) | |||
1507 | } | 1505 | } |
1508 | early_initcall(rcu_spawn_kthreads); | 1506 | early_initcall(rcu_spawn_kthreads); |
1509 | 1507 | ||
1510 | static void __cpuinit rcu_prepare_kthreads(int cpu) | 1508 | static void rcu_prepare_kthreads(int cpu) |
1511 | { | 1509 | { |
1512 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | 1510 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); |
1513 | struct rcu_node *rnp = rdp->mynode; | 1511 | struct rcu_node *rnp = rdp->mynode; |
@@ -1549,7 +1547,7 @@ static int __init rcu_scheduler_really_started(void) | |||
1549 | } | 1547 | } |
1550 | early_initcall(rcu_scheduler_really_started); | 1548 | early_initcall(rcu_scheduler_really_started); |
1551 | 1549 | ||
1552 | static void __cpuinit rcu_prepare_kthreads(int cpu) | 1550 | static void rcu_prepare_kthreads(int cpu) |
1553 | { | 1551 | { |
1554 | } | 1552 | } |
1555 | 1553 | ||
@@ -2202,7 +2200,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
2202 | * Wait for the grace period. Do so interruptibly to avoid messing | 2200 | * Wait for the grace period. Do so interruptibly to avoid messing |
2203 | * up the load average. | 2201 | * up the load average. |
2204 | */ | 2202 | */ |
2205 | trace_rcu_future_gp(rnp, rdp, c, "StartWait"); | 2203 | trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); |
2206 | for (;;) { | 2204 | for (;;) { |
2207 | wait_event_interruptible( | 2205 | wait_event_interruptible( |
2208 | rnp->nocb_gp_wq[c & 0x1], | 2206 | rnp->nocb_gp_wq[c & 0x1], |
@@ -2210,9 +2208,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) | |||
2210 | if (likely(d)) | 2208 | if (likely(d)) |
2211 | break; | 2209 | break; |
2212 | flush_signals(current); | 2210 | flush_signals(current); |
2213 | trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); | 2211 | trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); |
2214 | } | 2212 | } |
2215 | trace_rcu_future_gp(rnp, rdp, c, "EndWait"); | 2213 | trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); |
2216 | smp_mb(); /* Ensure that CB invocation happens after GP end. */ | 2214 | smp_mb(); /* Ensure that CB invocation happens after GP end. */ |
2217 | } | 2215 | } |
2218 | 2216 | ||
@@ -2375,3 +2373,425 @@ static void rcu_kick_nohz_cpu(int cpu) | |||
2375 | smp_send_reschedule(cpu); | 2373 | smp_send_reschedule(cpu); |
2376 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ | 2374 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ |
2377 | } | 2375 | } |
2376 | |||
2377 | |||
2378 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
2379 | |||
2380 | /* | ||
2381 | * Define RCU flavor that holds sysidle state. This needs to be the | ||
2382 | * most active flavor of RCU. | ||
2383 | */ | ||
2384 | #ifdef CONFIG_PREEMPT_RCU | ||
2385 | static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; | ||
2386 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | ||
2387 | static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; | ||
2388 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | ||
2389 | |||
2390 | static int full_sysidle_state; /* Current system-idle state. */ | ||
2391 | #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ | ||
2392 | #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ | ||
2393 | #define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ | ||
2394 | #define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ | ||
2395 | #define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ | ||
2396 | |||
2397 | /* | ||
2398 | * Invoked to note exit from irq or task transition to idle. Note that | ||
2399 | * usermode execution does -not- count as idle here! After all, we want | ||
2400 | * to detect full-system idle states, not RCU quiescent states and grace | ||
2401 | * periods. The caller must have disabled interrupts. | ||
2402 | */ | ||
2403 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | ||
2404 | { | ||
2405 | unsigned long j; | ||
2406 | |||
2407 | /* Adjust nesting, check for fully idle. */ | ||
2408 | if (irq) { | ||
2409 | rdtp->dynticks_idle_nesting--; | ||
2410 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); | ||
2411 | if (rdtp->dynticks_idle_nesting != 0) | ||
2412 | return; /* Still not fully idle. */ | ||
2413 | } else { | ||
2414 | if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == | ||
2415 | DYNTICK_TASK_NEST_VALUE) { | ||
2416 | rdtp->dynticks_idle_nesting = 0; | ||
2417 | } else { | ||
2418 | rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; | ||
2419 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); | ||
2420 | return; /* Still not fully idle. */ | ||
2421 | } | ||
2422 | } | ||
2423 | |||
2424 | /* Record start of fully idle period. */ | ||
2425 | j = jiffies; | ||
2426 | ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; | ||
2427 | smp_mb__before_atomic_inc(); | ||
2428 | atomic_inc(&rdtp->dynticks_idle); | ||
2429 | smp_mb__after_atomic_inc(); | ||
2430 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); | ||
2431 | } | ||
2432 | |||
2433 | /* | ||
2434 | * Unconditionally force exit from full system-idle state. This is | ||
2435 | * invoked when a normal CPU exits idle, but must be called separately | ||
2436 | * for the timekeeping CPU (tick_do_timer_cpu). The reason for this | ||
2437 | * is that the timekeeping CPU is permitted to take scheduling-clock | ||
2438 | * interrupts while the system is in system-idle state, and of course | ||
2439 | * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock | ||
2440 | * interrupt from any other type of interrupt. | ||
2441 | */ | ||
2442 | void rcu_sysidle_force_exit(void) | ||
2443 | { | ||
2444 | int oldstate = ACCESS_ONCE(full_sysidle_state); | ||
2445 | int newoldstate; | ||
2446 | |||
2447 | /* | ||
2448 | * Each pass through the following loop attempts to exit full | ||
2449 | * system-idle state. If contention proves to be a problem, | ||
2450 | * a trylock-based contention tree could be used here. | ||
2451 | */ | ||
2452 | while (oldstate > RCU_SYSIDLE_SHORT) { | ||
2453 | newoldstate = cmpxchg(&full_sysidle_state, | ||
2454 | oldstate, RCU_SYSIDLE_NOT); | ||
2455 | if (oldstate == newoldstate && | ||
2456 | oldstate == RCU_SYSIDLE_FULL_NOTED) { | ||
2457 | rcu_kick_nohz_cpu(tick_do_timer_cpu); | ||
2458 | return; /* We cleared it, done! */ | ||
2459 | } | ||
2460 | oldstate = newoldstate; | ||
2461 | } | ||
2462 | smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ | ||
2463 | } | ||
2464 | |||
2465 | /* | ||
2466 | * Invoked to note entry to irq or task transition from idle. Note that | ||
2467 | * usermode execution does -not- count as idle here! The caller must | ||
2468 | * have disabled interrupts. | ||
2469 | */ | ||
2470 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | ||
2471 | { | ||
2472 | /* Adjust nesting, check for already non-idle. */ | ||
2473 | if (irq) { | ||
2474 | rdtp->dynticks_idle_nesting++; | ||
2475 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); | ||
2476 | if (rdtp->dynticks_idle_nesting != 1) | ||
2477 | return; /* Already non-idle. */ | ||
2478 | } else { | ||
2479 | /* | ||
2480 | * Allow for irq misnesting. Yes, it really is possible | ||
2481 | * to enter an irq handler then never leave it, and maybe | ||
2482 | * also vice versa. Handle both possibilities. | ||
2483 | */ | ||
2484 | if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { | ||
2485 | rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; | ||
2486 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); | ||
2487 | return; /* Already non-idle. */ | ||
2488 | } else { | ||
2489 | rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
2490 | } | ||
2491 | } | ||
2492 | |||
2493 | /* Record end of idle period. */ | ||
2494 | smp_mb__before_atomic_inc(); | ||
2495 | atomic_inc(&rdtp->dynticks_idle); | ||
2496 | smp_mb__after_atomic_inc(); | ||
2497 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); | ||
2498 | |||
2499 | /* | ||
2500 | * If we are the timekeeping CPU, we are permitted to be non-idle | ||
2501 | * during a system-idle state. This must be the case, because | ||
2502 | * the timekeeping CPU has to take scheduling-clock interrupts | ||
2503 | * during the time that the system is transitioning to full | ||
2504 | * system-idle state. This means that the timekeeping CPU must | ||
2505 | * invoke rcu_sysidle_force_exit() directly if it does anything | ||
2506 | * more than take a scheduling-clock interrupt. | ||
2507 | */ | ||
2508 | if (smp_processor_id() == tick_do_timer_cpu) | ||
2509 | return; | ||
2510 | |||
2511 | /* Update system-idle state: We are clearly no longer fully idle! */ | ||
2512 | rcu_sysidle_force_exit(); | ||
2513 | } | ||
2514 | |||
2515 | /* | ||
2516 | * Check to see if the current CPU is idle. Note that usermode execution | ||
2517 | * does not count as idle. The caller must have disabled interrupts. | ||
2518 | */ | ||
2519 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
2520 | unsigned long *maxj) | ||
2521 | { | ||
2522 | int cur; | ||
2523 | unsigned long j; | ||
2524 | struct rcu_dynticks *rdtp = rdp->dynticks; | ||
2525 | |||
2526 | /* | ||
2527 | * If some other CPU has already reported non-idle, if this is | ||
2528 | * not the flavor of RCU that tracks sysidle state, or if this | ||
2529 | * is an offline or the timekeeping CPU, nothing to do. | ||
2530 | */ | ||
2531 | if (!*isidle || rdp->rsp != rcu_sysidle_state || | ||
2532 | cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) | ||
2533 | return; | ||
2534 | if (rcu_gp_in_progress(rdp->rsp)) | ||
2535 | WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); | ||
2536 | |||
2537 | /* Pick up current idle and NMI-nesting counter and check. */ | ||
2538 | cur = atomic_read(&rdtp->dynticks_idle); | ||
2539 | if (cur & 0x1) { | ||
2540 | *isidle = false; /* We are not idle! */ | ||
2541 | return; | ||
2542 | } | ||
2543 | smp_mb(); /* Read counters before timestamps. */ | ||
2544 | |||
2545 | /* Pick up timestamps. */ | ||
2546 | j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies); | ||
2547 | /* If this CPU entered idle more recently, update maxj timestamp. */ | ||
2548 | if (ULONG_CMP_LT(*maxj, j)) | ||
2549 | *maxj = j; | ||
2550 | } | ||
2551 | |||
2552 | /* | ||
2553 | * Is this the flavor of RCU that is handling full-system idle? | ||
2554 | */ | ||
2555 | static bool is_sysidle_rcu_state(struct rcu_state *rsp) | ||
2556 | { | ||
2557 | return rsp == rcu_sysidle_state; | ||
2558 | } | ||
2559 | |||
2560 | /* | ||
2561 | * Bind the grace-period kthread for the sysidle flavor of RCU to the | ||
2562 | * timekeeping CPU. | ||
2563 | */ | ||
2564 | static void rcu_bind_gp_kthread(void) | ||
2565 | { | ||
2566 | int cpu = ACCESS_ONCE(tick_do_timer_cpu); | ||
2567 | |||
2568 | if (cpu < 0 || cpu >= nr_cpu_ids) | ||
2569 | return; | ||
2570 | if (raw_smp_processor_id() != cpu) | ||
2571 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
2572 | } | ||
2573 | |||
2574 | /* | ||
2575 | * Return a delay in jiffies based on the number of CPUs, rcu_node | ||
2576 | * leaf fanout, and jiffies tick rate. The idea is to allow larger | ||
2577 | * systems more time to transition to full-idle state in order to | ||
2578 | * avoid the cache thrashing that otherwise occurs on the state variable. | ||
2579 | * Really small systems (less than a couple of tens of CPUs) should | ||
2580 | * instead use a single global atomically incremented counter, and later | ||
2581 | * versions of this will automatically reconfigure themselves accordingly. | ||
2582 | */ | ||
2583 | static unsigned long rcu_sysidle_delay(void) | ||
2584 | { | ||
2585 | if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) | ||
2586 | return 0; | ||
2587 | return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); | ||
2588 | } | ||
2589 | |||
2590 | /* | ||
2591 | * Advance the full-system-idle state. This is invoked when all of | ||
2592 | * the non-timekeeping CPUs are idle. | ||
2593 | */ | ||
2594 | static void rcu_sysidle(unsigned long j) | ||
2595 | { | ||
2596 | /* Check the current state. */ | ||
2597 | switch (ACCESS_ONCE(full_sysidle_state)) { | ||
2598 | case RCU_SYSIDLE_NOT: | ||
2599 | |||
2600 | /* First time all are idle, so note a short idle period. */ | ||
2601 | ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; | ||
2602 | break; | ||
2603 | |||
2604 | case RCU_SYSIDLE_SHORT: | ||
2605 | |||
2606 | /* | ||
2607 | * Idle for a bit, time to advance to next state? | ||
2608 | * cmpxchg failure means race with non-idle, let them win. | ||
2609 | */ | ||
2610 | if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) | ||
2611 | (void)cmpxchg(&full_sysidle_state, | ||
2612 | RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); | ||
2613 | break; | ||
2614 | |||
2615 | case RCU_SYSIDLE_LONG: | ||
2616 | |||
2617 | /* | ||
2618 | * Do an additional check pass before advancing to full. | ||
2619 | * cmpxchg failure means race with non-idle, let them win. | ||
2620 | */ | ||
2621 | if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) | ||
2622 | (void)cmpxchg(&full_sysidle_state, | ||
2623 | RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); | ||
2624 | break; | ||
2625 | |||
2626 | default: | ||
2627 | break; | ||
2628 | } | ||
2629 | } | ||
2630 | |||
2631 | /* | ||
2632 | * Found a non-idle non-timekeeping CPU, so kick the system-idle state | ||
2633 | * back to the beginning. | ||
2634 | */ | ||
2635 | static void rcu_sysidle_cancel(void) | ||
2636 | { | ||
2637 | smp_mb(); | ||
2638 | ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; | ||
2639 | } | ||
2640 | |||
2641 | /* | ||
2642 | * Update the sysidle state based on the results of a force-quiescent-state | ||
2643 | * scan of the CPUs' dyntick-idle state. | ||
2644 | */ | ||
2645 | static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, | ||
2646 | unsigned long maxj, bool gpkt) | ||
2647 | { | ||
2648 | if (rsp != rcu_sysidle_state) | ||
2649 | return; /* Wrong flavor, ignore. */ | ||
2650 | if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) | ||
2651 | return; /* Running state machine from timekeeping CPU. */ | ||
2652 | if (isidle) | ||
2653 | rcu_sysidle(maxj); /* More idle! */ | ||
2654 | else | ||
2655 | rcu_sysidle_cancel(); /* Idle is over. */ | ||
2656 | } | ||
2657 | |||
2658 | /* | ||
2659 | * Wrapper for rcu_sysidle_report() when called from the grace-period | ||
2660 | * kthread's context. | ||
2661 | */ | ||
2662 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
2663 | unsigned long maxj) | ||
2664 | { | ||
2665 | rcu_sysidle_report(rsp, isidle, maxj, true); | ||
2666 | } | ||
2667 | |||
2668 | /* Callback and function for forcing an RCU grace period. */ | ||
2669 | struct rcu_sysidle_head { | ||
2670 | struct rcu_head rh; | ||
2671 | int inuse; | ||
2672 | }; | ||
2673 | |||
2674 | static void rcu_sysidle_cb(struct rcu_head *rhp) | ||
2675 | { | ||
2676 | struct rcu_sysidle_head *rshp; | ||
2677 | |||
2678 | /* | ||
2679 | * The following memory barrier is needed to replace the | ||
2680 | * memory barriers that would normally be in the memory | ||
2681 | * allocator. | ||
2682 | */ | ||
2683 | smp_mb(); /* grace period precedes setting inuse. */ | ||
2684 | |||
2685 | rshp = container_of(rhp, struct rcu_sysidle_head, rh); | ||
2686 | ACCESS_ONCE(rshp->inuse) = 0; | ||
2687 | } | ||
2688 | |||
2689 | /* | ||
2690 | * Check to see if the system is fully idle, other than the timekeeping CPU. | ||
2691 | * The caller must have disabled interrupts. | ||
2692 | */ | ||
2693 | bool rcu_sys_is_idle(void) | ||
2694 | { | ||
2695 | static struct rcu_sysidle_head rsh; | ||
2696 | int rss = ACCESS_ONCE(full_sysidle_state); | ||
2697 | |||
2698 | if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) | ||
2699 | return false; | ||
2700 | |||
2701 | /* Handle small-system case by doing a full scan of CPUs. */ | ||
2702 | if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { | ||
2703 | int oldrss = rss - 1; | ||
2704 | |||
2705 | /* | ||
2706 | * One pass to advance to each state up to _FULL. | ||
2707 | * Give up if any pass fails to advance the state. | ||
2708 | */ | ||
2709 | while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { | ||
2710 | int cpu; | ||
2711 | bool isidle = true; | ||
2712 | unsigned long maxj = jiffies - ULONG_MAX / 4; | ||
2713 | struct rcu_data *rdp; | ||
2714 | |||
2715 | /* Scan all the CPUs looking for nonidle CPUs. */ | ||
2716 | for_each_possible_cpu(cpu) { | ||
2717 | rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); | ||
2718 | rcu_sysidle_check_cpu(rdp, &isidle, &maxj); | ||
2719 | if (!isidle) | ||
2720 | break; | ||
2721 | } | ||
2722 | rcu_sysidle_report(rcu_sysidle_state, | ||
2723 | isidle, maxj, false); | ||
2724 | oldrss = rss; | ||
2725 | rss = ACCESS_ONCE(full_sysidle_state); | ||
2726 | } | ||
2727 | } | ||
2728 | |||
2729 | /* If this is the first observation of an idle period, record it. */ | ||
2730 | if (rss == RCU_SYSIDLE_FULL) { | ||
2731 | rss = cmpxchg(&full_sysidle_state, | ||
2732 | RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); | ||
2733 | return rss == RCU_SYSIDLE_FULL; | ||
2734 | } | ||
2735 | |||
2736 | smp_mb(); /* ensure rss load happens before later caller actions. */ | ||
2737 | |||
2738 | /* If already fully idle, tell the caller (in case of races). */ | ||
2739 | if (rss == RCU_SYSIDLE_FULL_NOTED) | ||
2740 | return true; | ||
2741 | |||
2742 | /* | ||
2743 | * If we aren't there yet, and a grace period is not in flight, | ||
2744 | * initiate a grace period. Either way, tell the caller that | ||
2745 | * we are not there yet. We use an xchg() rather than an assignment | ||
2746 | * to make up for the memory barriers that would otherwise be | ||
2747 | * provided by the memory allocator. | ||
2748 | */ | ||
2749 | if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && | ||
2750 | !rcu_gp_in_progress(rcu_sysidle_state) && | ||
2751 | !rsh.inuse && xchg(&rsh.inuse, 1) == 0) | ||
2752 | call_rcu(&rsh.rh, rcu_sysidle_cb); | ||
2753 | return false; | ||
2754 | } | ||
2755 | |||
2756 | /* | ||
2757 | * Initialize dynticks sysidle state for CPUs coming online. | ||
2758 | */ | ||
2759 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) | ||
2760 | { | ||
2761 | rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; | ||
2762 | } | ||
2763 | |||
2764 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
2765 | |||
2766 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | ||
2767 | { | ||
2768 | } | ||
2769 | |||
2770 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | ||
2771 | { | ||
2772 | } | ||
2773 | |||
2774 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
2775 | unsigned long *maxj) | ||
2776 | { | ||
2777 | } | ||
2778 | |||
2779 | static bool is_sysidle_rcu_state(struct rcu_state *rsp) | ||
2780 | { | ||
2781 | return false; | ||
2782 | } | ||
2783 | |||
2784 | static void rcu_bind_gp_kthread(void) | ||
2785 | { | ||
2786 | } | ||
2787 | |||
2788 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
2789 | unsigned long maxj) | ||
2790 | { | ||
2791 | } | ||
2792 | |||
2793 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) | ||
2794 | { | ||
2795 | } | ||
2796 | |||
2797 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
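The rcu_sys_is_idle() path above posts a single statically allocated RCU callback guarded by an ->inuse flag, with xchg() and the explicit smp_mb() standing in for the ordering the memory allocator would otherwise provide. A minimal sketch of that reuse-guard pattern follows; the names (oneshot_cb, oneshot_post) are invented for illustration and are not part of the patch.

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/atomic.h>
#include <linux/compiler.h>

struct oneshot_cb {
        struct rcu_head rh;
        int inuse;                      /* nonzero while a callback is posted */
};

static struct oneshot_cb oneshot;

static void oneshot_cb_fn(struct rcu_head *rhp)
{
        struct oneshot_cb *ocp = container_of(rhp, struct oneshot_cb, rh);

        smp_mb();                       /* grace period precedes clearing ->inuse */
        ACCESS_ONCE(ocp->inuse) = 0;    /* allow the next post */
}

static void oneshot_post(void)
{
        /* xchg() is fully ordered, so at most one caller wins and posts. */
        if (!ACCESS_ONCE(oneshot.inuse) && xchg(&oneshot.inuse, 1) == 0)
                call_rcu(&oneshot.rh, oneshot_cb_fn);
}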
diff --git a/kernel/relay.c b/kernel/relay.c index b91488ba2e5a..5001c9887db1 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -516,7 +516,7 @@ static void setup_callbacks(struct rchan *chan, | |||
516 | * | 516 | * |
517 | * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) | 517 | * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) |
518 | */ | 518 | */ |
519 | static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, | 519 | static int relay_hotcpu_callback(struct notifier_block *nb, |
520 | unsigned long action, | 520 | unsigned long action, |
521 | void *hcpu) | 521 | void *hcpu) |
522 | { | 522 | { |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0d8eb4525e76..725aa067ad63 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -933,6 +933,8 @@ static int effective_prio(struct task_struct *p) | |||
933 | /** | 933 | /** |
934 | * task_curr - is this task currently executing on a CPU? | 934 | * task_curr - is this task currently executing on a CPU? |
935 | * @p: the task in question. | 935 | * @p: the task in question. |
936 | * | ||
937 | * Return: 1 if the task is currently executing. 0 otherwise. | ||
936 | */ | 938 | */ |
937 | inline int task_curr(const struct task_struct *p) | 939 | inline int task_curr(const struct task_struct *p) |
938 | { | 940 | { |
@@ -1482,7 +1484,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) | |||
1482 | * the simpler "current->state = TASK_RUNNING" to mark yourself | 1484 | * the simpler "current->state = TASK_RUNNING" to mark yourself |
1483 | * runnable without the overhead of this. | 1485 | * runnable without the overhead of this. |
1484 | * | 1486 | * |
1485 | * Returns %true if @p was woken up, %false if it was already running | 1487 | * Return: %true if @p was woken up, %false if it was already running |
1486 | * or @state didn't match @p's state. | 1488 | * or @state didn't match @p's state. |
1487 | */ | 1489 | */ |
1488 | static int | 1490 | static int |
@@ -1491,7 +1493,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
1491 | unsigned long flags; | 1493 | unsigned long flags; |
1492 | int cpu, success = 0; | 1494 | int cpu, success = 0; |
1493 | 1495 | ||
1494 | smp_wmb(); | 1496 | /* |
1497 | * If we are going to wake up a thread waiting for CONDITION we | ||
1498 | * need to ensure that CONDITION=1 done by the caller can not be | ||
1499 | * reordered with p->state check below. This pairs with mb() in | ||
1500 | * set_current_state() the waiting thread does. | ||
1501 | */ | ||
1502 | smp_mb__before_spinlock(); | ||
1495 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 1503 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
1496 | if (!(p->state & state)) | 1504 | if (!(p->state & state)) |
1497 | goto out; | 1505 | goto out; |
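The smp_mb__before_spinlock() added here is about the canonical sleep/wakeup pairing described in the new comment. A sketch of the two sides, with CONDITION and sleeper_task as placeholders not taken from this file: the full barrier implied by set_current_state() on the sleeping side pairs with the barrier taken before p->pi_lock on the waking side.

        /* Sleeper: set_current_state() orders the state store before the test. */
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (CONDITION)
                        break;
                schedule();
        }
        __set_current_state(TASK_RUNNING);

        /* Waker: the CONDITION store must not be reordered past the
         * p->state check performed under p->pi_lock in try_to_wake_up(). */
        CONDITION = 1;
        wake_up_process(sleeper_task);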
@@ -1577,8 +1585,9 @@ out: | |||
1577 | * @p: The process to be woken up. | 1585 | * @p: The process to be woken up. |
1578 | * | 1586 | * |
1579 | * Attempt to wake up the nominated process and move it to the set of runnable | 1587 | * Attempt to wake up the nominated process and move it to the set of runnable |
1580 | * processes. Returns 1 if the process was woken up, 0 if it was already | 1588 | * processes. |
1581 | * running. | 1589 | * |
1590 | * Return: 1 if the process was woken up, 0 if it was already running. | ||
1582 | * | 1591 | * |
1583 | * It may be assumed that this function implies a write memory barrier before | 1592 | * It may be assumed that this function implies a write memory barrier before |
1584 | * changing the task state if and only if any tasks are woken up. | 1593 | * changing the task state if and only if any tasks are woken up. |
@@ -2191,6 +2200,8 @@ void scheduler_tick(void) | |||
2191 | * This makes sure that uptime, CFS vruntime, load | 2200 | * This makes sure that uptime, CFS vruntime, load |
2192 | * balancing, etc... continue to move forward, even | 2201 | * balancing, etc... continue to move forward, even |
2193 | * with a very low granularity. | 2202 | * with a very low granularity. |
2203 | * | ||
2204 | * Return: Maximum deferment in nanoseconds. | ||
2194 | */ | 2205 | */ |
2195 | u64 scheduler_tick_max_deferment(void) | 2206 | u64 scheduler_tick_max_deferment(void) |
2196 | { | 2207 | { |
@@ -2394,6 +2405,12 @@ need_resched: | |||
2394 | if (sched_feat(HRTICK)) | 2405 | if (sched_feat(HRTICK)) |
2395 | hrtick_clear(rq); | 2406 | hrtick_clear(rq); |
2396 | 2407 | ||
2408 | /* | ||
2409 | * Make sure that signal_pending_state()->signal_pending() below | ||
2410 | * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) | ||
2411 | * done by the caller to avoid the race with signal_wake_up(). | ||
2412 | */ | ||
2413 | smp_mb__before_spinlock(); | ||
2397 | raw_spin_lock_irq(&rq->lock); | 2414 | raw_spin_lock_irq(&rq->lock); |
2398 | 2415 | ||
2399 | switch_count = &prev->nivcsw; | 2416 | switch_count = &prev->nivcsw; |
@@ -2510,13 +2527,11 @@ void __sched schedule_preempt_disabled(void) | |||
2510 | */ | 2527 | */ |
2511 | asmlinkage void __sched notrace preempt_schedule(void) | 2528 | asmlinkage void __sched notrace preempt_schedule(void) |
2512 | { | 2529 | { |
2513 | struct thread_info *ti = current_thread_info(); | ||
2514 | |||
2515 | /* | 2530 | /* |
2516 | * If there is a non-zero preempt_count or interrupts are disabled, | 2531 | * If there is a non-zero preempt_count or interrupts are disabled, |
2517 | * we do not want to preempt the current task. Just return.. | 2532 | * we do not want to preempt the current task. Just return.. |
2518 | */ | 2533 | */ |
2519 | if (likely(ti->preempt_count || irqs_disabled())) | 2534 | if (likely(!preemptible())) |
2520 | return; | 2535 | return; |
2521 | 2536 | ||
2522 | do { | 2537 | do { |
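preemptible() folds the two old tests into a single helper; as far as I recall it is defined along these lines when CONFIG_PREEMPT_COUNT is set (the exact header varies by kernel version, so treat this as a sketch rather than a quote):

#define preemptible()   (preempt_count() == 0 && !irqs_disabled())

/*
 * So the old and new guards in preempt_schedule() are equivalent:
 *      ti->preempt_count || irqs_disabled()   <=>   !preemptible()
 * but the function no longer needs to fetch current_thread_info().
 */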
@@ -2660,7 +2675,7 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | |||
2660 | if (unlikely(!q)) | 2675 | if (unlikely(!q)) |
2661 | return; | 2676 | return; |
2662 | 2677 | ||
2663 | if (unlikely(!nr_exclusive)) | 2678 | if (unlikely(nr_exclusive != 1)) |
2664 | wake_flags = 0; | 2679 | wake_flags = 0; |
2665 | 2680 | ||
2666 | spin_lock_irqsave(&q->lock, flags); | 2681 | spin_lock_irqsave(&q->lock, flags); |
@@ -2796,8 +2811,8 @@ EXPORT_SYMBOL(wait_for_completion); | |||
2796 | * specified timeout to expire. The timeout is in jiffies. It is not | 2811 | * specified timeout to expire. The timeout is in jiffies. It is not |
2797 | * interruptible. | 2812 | * interruptible. |
2798 | * | 2813 | * |
2799 | * The return value is 0 if timed out, and positive (at least 1, or number of | 2814 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left |
2800 | * jiffies left till timeout) if completed. | 2815 | * till timeout) if completed. |
2801 | */ | 2816 | */ |
2802 | unsigned long __sched | 2817 | unsigned long __sched |
2803 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | 2818 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
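A typical caller distinguishes the two cases the reworded kernel-doc spells out. The helper name and the 500 ms budget below are made up for illustration:

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

static int wait_for_device_ready(struct completion *done)
{
        unsigned long left;

        left = wait_for_completion_timeout(done, msecs_to_jiffies(500));
        if (!left)              /* 0: the timeout elapsed */
                return -ETIMEDOUT;
        return 0;               /* >0: completed with 'left' jiffies to spare */
}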
@@ -2829,8 +2844,8 @@ EXPORT_SYMBOL(wait_for_completion_io); | |||
2829 | * specified timeout to expire. The timeout is in jiffies. It is not | 2844 | * specified timeout to expire. The timeout is in jiffies. It is not |
2830 | * interruptible. The caller is accounted as waiting for IO. | 2845 | * interruptible. The caller is accounted as waiting for IO. |
2831 | * | 2846 | * |
2832 | * The return value is 0 if timed out, and positive (at least 1, or number of | 2847 | * Return: 0 if timed out, and positive (at least 1, or number of jiffies left |
2833 | * jiffies left till timeout) if completed. | 2848 | * till timeout) if completed. |
2834 | */ | 2849 | */ |
2835 | unsigned long __sched | 2850 | unsigned long __sched |
2836 | wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) | 2851 | wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) |
@@ -2846,7 +2861,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout); | |||
2846 | * This waits for completion of a specific task to be signaled. It is | 2861 | * This waits for completion of a specific task to be signaled. It is |
2847 | * interruptible. | 2862 | * interruptible. |
2848 | * | 2863 | * |
2849 | * The return value is -ERESTARTSYS if interrupted, 0 if completed. | 2864 | * Return: -ERESTARTSYS if interrupted, 0 if completed. |
2850 | */ | 2865 | */ |
2851 | int __sched wait_for_completion_interruptible(struct completion *x) | 2866 | int __sched wait_for_completion_interruptible(struct completion *x) |
2852 | { | 2867 | { |
@@ -2865,8 +2880,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); | |||
2865 | * This waits for either a completion of a specific task to be signaled or for a | 2880 | * This waits for either a completion of a specific task to be signaled or for a |
2866 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | 2881 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. |
2867 | * | 2882 | * |
2868 | * The return value is -ERESTARTSYS if interrupted, 0 if timed out, | 2883 | * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, |
2869 | * positive (at least 1, or number of jiffies left till timeout) if completed. | 2884 | * or number of jiffies left till timeout) if completed. |
2870 | */ | 2885 | */ |
2871 | long __sched | 2886 | long __sched |
2872 | wait_for_completion_interruptible_timeout(struct completion *x, | 2887 | wait_for_completion_interruptible_timeout(struct completion *x, |
@@ -2883,7 +2898,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | |||
2883 | * This waits to be signaled for completion of a specific task. It can be | 2898 | * This waits to be signaled for completion of a specific task. It can be |
2884 | * interrupted by a kill signal. | 2899 | * interrupted by a kill signal. |
2885 | * | 2900 | * |
2886 | * The return value is -ERESTARTSYS if interrupted, 0 if completed. | 2901 | * Return: -ERESTARTSYS if interrupted, 0 if completed. |
2887 | */ | 2902 | */ |
2888 | int __sched wait_for_completion_killable(struct completion *x) | 2903 | int __sched wait_for_completion_killable(struct completion *x) |
2889 | { | 2904 | { |
@@ -2903,8 +2918,8 @@ EXPORT_SYMBOL(wait_for_completion_killable); | |||
2903 | * signaled or for a specified timeout to expire. It can be | 2918 | * signaled or for a specified timeout to expire. It can be |
2904 | * interrupted by a kill signal. The timeout is in jiffies. | 2919 | * interrupted by a kill signal. The timeout is in jiffies. |
2905 | * | 2920 | * |
2906 | * The return value is -ERESTARTSYS if interrupted, 0 if timed out, | 2921 | * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, |
2907 | * positive (at least 1, or number of jiffies left till timeout) if completed. | 2922 | * or number of jiffies left till timeout) if completed. |
2908 | */ | 2923 | */ |
2909 | long __sched | 2924 | long __sched |
2910 | wait_for_completion_killable_timeout(struct completion *x, | 2925 | wait_for_completion_killable_timeout(struct completion *x, |
@@ -2918,7 +2933,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout); | |||
2918 | * try_wait_for_completion - try to decrement a completion without blocking | 2933 | * try_wait_for_completion - try to decrement a completion without blocking |
2919 | * @x: completion structure | 2934 | * @x: completion structure |
2920 | * | 2935 | * |
2921 | * Returns: 0 if a decrement cannot be done without blocking | 2936 | * Return: 0 if a decrement cannot be done without blocking |
2922 | * 1 if a decrement succeeded. | 2937 | * 1 if a decrement succeeded. |
2923 | * | 2938 | * |
2924 | * If a completion is being used as a counting completion, | 2939 | * If a completion is being used as a counting completion, |
@@ -2945,7 +2960,7 @@ EXPORT_SYMBOL(try_wait_for_completion); | |||
2945 | * completion_done - Test to see if a completion has any waiters | 2960 | * completion_done - Test to see if a completion has any waiters |
2946 | * @x: completion structure | 2961 | * @x: completion structure |
2947 | * | 2962 | * |
2948 | * Returns: 0 if there are waiters (wait_for_completion() in progress) | 2963 | * Return: 0 if there are waiters (wait_for_completion() in progress) |
2949 | * 1 if there are no waiters. | 2964 | * 1 if there are no waiters. |
2950 | * | 2965 | * |
2951 | */ | 2966 | */ |
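try_wait_for_completion() is the non-blocking counterpart documented above. A hedged usage fragment; pool, use_preallocated_slot() and allocate_slot_inline() are hypothetical names:

        /* Consume one completion count if available, otherwise work inline. */
        if (try_wait_for_completion(&pool->slot_ready))
                use_preallocated_slot(pool);
        else
                allocate_slot_inline(pool);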
@@ -3182,7 +3197,7 @@ SYSCALL_DEFINE1(nice, int, increment) | |||
3182 | * task_prio - return the priority value of a given task. | 3197 | * task_prio - return the priority value of a given task. |
3183 | * @p: the task in question. | 3198 | * @p: the task in question. |
3184 | * | 3199 | * |
3185 | * This is the priority value as seen by users in /proc. | 3200 | * Return: The priority value as seen by users in /proc. |
3186 | * RT tasks are offset by -200. Normal tasks are centered | 3201 | * RT tasks are offset by -200. Normal tasks are centered |
3187 | * around 0, value goes from -16 to +15. | 3202 | * around 0, value goes from -16 to +15. |
3188 | */ | 3203 | */ |
@@ -3194,6 +3209,8 @@ int task_prio(const struct task_struct *p) | |||
3194 | /** | 3209 | /** |
3195 | * task_nice - return the nice value of a given task. | 3210 | * task_nice - return the nice value of a given task. |
3196 | * @p: the task in question. | 3211 | * @p: the task in question. |
3212 | * | ||
3213 | * Return: The nice value [ -20 ... 0 ... 19 ]. | ||
3197 | */ | 3214 | */ |
3198 | int task_nice(const struct task_struct *p) | 3215 | int task_nice(const struct task_struct *p) |
3199 | { | 3216 | { |
@@ -3204,6 +3221,8 @@ EXPORT_SYMBOL(task_nice); | |||
3204 | /** | 3221 | /** |
3205 | * idle_cpu - is a given cpu idle currently? | 3222 | * idle_cpu - is a given cpu idle currently? |
3206 | * @cpu: the processor in question. | 3223 | * @cpu: the processor in question. |
3224 | * | ||
3225 | * Return: 1 if the CPU is currently idle. 0 otherwise. | ||
3207 | */ | 3226 | */ |
3208 | int idle_cpu(int cpu) | 3227 | int idle_cpu(int cpu) |
3209 | { | 3228 | { |
@@ -3226,6 +3245,8 @@ int idle_cpu(int cpu) | |||
3226 | /** | 3245 | /** |
3227 | * idle_task - return the idle task for a given cpu. | 3246 | * idle_task - return the idle task for a given cpu. |
3228 | * @cpu: the processor in question. | 3247 | * @cpu: the processor in question. |
3248 | * | ||
3249 | * Return: The idle task for the cpu @cpu. | ||
3229 | */ | 3250 | */ |
3230 | struct task_struct *idle_task(int cpu) | 3251 | struct task_struct *idle_task(int cpu) |
3231 | { | 3252 | { |
@@ -3235,6 +3256,8 @@ struct task_struct *idle_task(int cpu) | |||
3235 | /** | 3256 | /** |
3236 | * find_process_by_pid - find a process with a matching PID value. | 3257 | * find_process_by_pid - find a process with a matching PID value. |
3237 | * @pid: the pid in question. | 3258 | * @pid: the pid in question. |
3259 | * | ||
3260 | * Return: The task of @pid, if found. %NULL otherwise. | ||
3238 | */ | 3261 | */ |
3239 | static struct task_struct *find_process_by_pid(pid_t pid) | 3262 | static struct task_struct *find_process_by_pid(pid_t pid) |
3240 | { | 3263 | { |
@@ -3432,6 +3455,8 @@ recheck: | |||
3432 | * @policy: new policy. | 3455 | * @policy: new policy. |
3433 | * @param: structure containing the new RT priority. | 3456 | * @param: structure containing the new RT priority. |
3434 | * | 3457 | * |
3458 | * Return: 0 on success. An error code otherwise. | ||
3459 | * | ||
3435 | * NOTE that the task may be already dead. | 3460 | * NOTE that the task may be already dead. |
3436 | */ | 3461 | */ |
3437 | int sched_setscheduler(struct task_struct *p, int policy, | 3462 | int sched_setscheduler(struct task_struct *p, int policy, |
@@ -3451,6 +3476,8 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
3451 | * current context has permission. For example, this is needed in | 3476 | * current context has permission. For example, this is needed in |
3452 | * stop_machine(): we create temporary high priority worker threads, | 3477 | * stop_machine(): we create temporary high priority worker threads, |
3453 | * but our caller might not have that capability. | 3478 | * but our caller might not have that capability. |
3479 | * | ||
3480 | * Return: 0 on success. An error code otherwise. | ||
3454 | */ | 3481 | */ |
3455 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | 3482 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, |
3456 | const struct sched_param *param) | 3483 | const struct sched_param *param) |
@@ -3485,6 +3512,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | |||
3485 | * @pid: the pid in question. | 3512 | * @pid: the pid in question. |
3486 | * @policy: new policy. | 3513 | * @policy: new policy. |
3487 | * @param: structure containing the new RT priority. | 3514 | * @param: structure containing the new RT priority. |
3515 | * | ||
3516 | * Return: 0 on success. An error code otherwise. | ||
3488 | */ | 3517 | */ |
3489 | SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, | 3518 | SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, |
3490 | struct sched_param __user *, param) | 3519 | struct sched_param __user *, param) |
@@ -3500,6 +3529,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, | |||
3500 | * sys_sched_setparam - set/change the RT priority of a thread | 3529 | * sys_sched_setparam - set/change the RT priority of a thread |
3501 | * @pid: the pid in question. | 3530 | * @pid: the pid in question. |
3502 | * @param: structure containing the new RT priority. | 3531 | * @param: structure containing the new RT priority. |
3532 | * | ||
3533 | * Return: 0 on success. An error code otherwise. | ||
3503 | */ | 3534 | */ |
3504 | SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) | 3535 | SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) |
3505 | { | 3536 | { |
@@ -3509,6 +3540,9 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) | |||
3509 | /** | 3540 | /** |
3510 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread | 3541 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread |
3511 | * @pid: the pid in question. | 3542 | * @pid: the pid in question. |
3543 | * | ||
3544 | * Return: On success, the policy of the thread. Otherwise, a negative error | ||
3545 | * code. | ||
3512 | */ | 3546 | */ |
3513 | SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | 3547 | SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) |
3514 | { | 3548 | { |
@@ -3535,6 +3569,9 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | |||
3535 | * sys_sched_getparam - get the RT priority of a thread | 3569 | * sys_sched_getparam - get the RT priority of a thread |
3536 | * @pid: the pid in question. | 3570 | * @pid: the pid in question. |
3537 | * @param: structure containing the RT priority. | 3571 | * @param: structure containing the RT priority. |
3572 | * | ||
3573 | * Return: On success, 0 and the RT priority is in @param. Otherwise, an error | ||
3574 | * code. | ||
3538 | */ | 3575 | */ |
3539 | SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) | 3576 | SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) |
3540 | { | 3577 | { |
@@ -3659,6 +3696,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, | |||
3659 | * @pid: pid of the process | 3696 | * @pid: pid of the process |
3660 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 3697 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
3661 | * @user_mask_ptr: user-space pointer to the new cpu mask | 3698 | * @user_mask_ptr: user-space pointer to the new cpu mask |
3699 | * | ||
3700 | * Return: 0 on success. An error code otherwise. | ||
3662 | */ | 3701 | */ |
3663 | SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, | 3702 | SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, |
3664 | unsigned long __user *, user_mask_ptr) | 3703 | unsigned long __user *, user_mask_ptr) |
@@ -3710,6 +3749,8 @@ out_unlock: | |||
3710 | * @pid: pid of the process | 3749 | * @pid: pid of the process |
3711 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 3750 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
3712 | * @user_mask_ptr: user-space pointer to hold the current cpu mask | 3751 | * @user_mask_ptr: user-space pointer to hold the current cpu mask |
3752 | * | ||
3753 | * Return: 0 on success. An error code otherwise. | ||
3713 | */ | 3754 | */ |
3714 | SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, | 3755 | SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, |
3715 | unsigned long __user *, user_mask_ptr) | 3756 | unsigned long __user *, user_mask_ptr) |
@@ -3744,6 +3785,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, | |||
3744 | * | 3785 | * |
3745 | * This function yields the current CPU to other tasks. If there are no | 3786 | * This function yields the current CPU to other tasks. If there are no |
3746 | * other threads running on this CPU then this function will return. | 3787 | * other threads running on this CPU then this function will return. |
3788 | * | ||
3789 | * Return: 0. | ||
3747 | */ | 3790 | */ |
3748 | SYSCALL_DEFINE0(sched_yield) | 3791 | SYSCALL_DEFINE0(sched_yield) |
3749 | { | 3792 | { |
@@ -3869,7 +3912,7 @@ EXPORT_SYMBOL(yield); | |||
3869 | * It's the caller's job to ensure that the target task struct | 3912 | * It's the caller's job to ensure that the target task struct |
3870 | * can't go away on us before we can do any checks. | 3913 | * can't go away on us before we can do any checks. |
3871 | * | 3914 | * |
3872 | * Returns: | 3915 | * Return: |
3873 | * true (>0) if we indeed boosted the target task. | 3916 | * true (>0) if we indeed boosted the target task. |
3874 | * false (0) if we failed to boost the target. | 3917 | * false (0) if we failed to boost the target. |
3875 | * -ESRCH if there's no task to yield to. | 3918 | * -ESRCH if there's no task to yield to. |
@@ -3972,8 +4015,9 @@ long __sched io_schedule_timeout(long timeout) | |||
3972 | * sys_sched_get_priority_max - return maximum RT priority. | 4015 | * sys_sched_get_priority_max - return maximum RT priority. |
3973 | * @policy: scheduling class. | 4016 | * @policy: scheduling class. |
3974 | * | 4017 | * |
3975 | * this syscall returns the maximum rt_priority that can be used | 4018 | * Return: On success, this syscall returns the maximum |
3976 | * by a given scheduling class. | 4019 | * rt_priority that can be used by a given scheduling class. |
4020 | * On failure, a negative error code is returned. | ||
3977 | */ | 4021 | */ |
3978 | SYSCALL_DEFINE1(sched_get_priority_max, int, policy) | 4022 | SYSCALL_DEFINE1(sched_get_priority_max, int, policy) |
3979 | { | 4023 | { |
@@ -3997,8 +4041,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) | |||
3997 | * sys_sched_get_priority_min - return minimum RT priority. | 4041 | * sys_sched_get_priority_min - return minimum RT priority. |
3998 | * @policy: scheduling class. | 4042 | * @policy: scheduling class. |
3999 | * | 4043 | * |
4000 | * this syscall returns the minimum rt_priority that can be used | 4044 | * Return: On success, this syscall returns the minimum |
4001 | * by a given scheduling class. | 4045 | * rt_priority that can be used by a given scheduling class. |
4046 | * On failure, a negative error code is returned. | ||
4002 | */ | 4047 | */ |
4003 | SYSCALL_DEFINE1(sched_get_priority_min, int, policy) | 4048 | SYSCALL_DEFINE1(sched_get_priority_min, int, policy) |
4004 | { | 4049 | { |
@@ -4024,6 +4069,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) | |||
4024 | * | 4069 | * |
4025 | * this syscall writes the default timeslice value of a given process | 4070 | * this syscall writes the default timeslice value of a given process |
4026 | * into the user-space timespec buffer. A value of '0' means infinity. | 4071 | * into the user-space timespec buffer. A value of '0' means infinity. |
4072 | * | ||
4073 | * Return: On success, 0 and the timeslice is in @interval. Otherwise, | ||
4074 | * an error code. | ||
4027 | */ | 4075 | */ |
4028 | SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | 4076 | SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, |
4029 | struct timespec __user *, interval) | 4077 | struct timespec __user *, interval) |
@@ -4133,7 +4181,7 @@ void show_state_filter(unsigned long state_filter) | |||
4133 | debug_show_all_locks(); | 4181 | debug_show_all_locks(); |
4134 | } | 4182 | } |
4135 | 4183 | ||
4136 | void __cpuinit init_idle_bootup_task(struct task_struct *idle) | 4184 | void init_idle_bootup_task(struct task_struct *idle) |
4137 | { | 4185 | { |
4138 | idle->sched_class = &idle_sched_class; | 4186 | idle->sched_class = &idle_sched_class; |
4139 | } | 4187 | } |
@@ -4146,7 +4194,7 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle) | |||
4146 | * NOTE: this function does not set the idle thread's NEED_RESCHED | 4194 | * NOTE: this function does not set the idle thread's NEED_RESCHED |
4147 | * flag, to make booting more robust. | 4195 | * flag, to make booting more robust. |
4148 | */ | 4196 | */ |
4149 | void __cpuinit init_idle(struct task_struct *idle, int cpu) | 4197 | void init_idle(struct task_struct *idle, int cpu) |
4150 | { | 4198 | { |
4151 | struct rq *rq = cpu_rq(cpu); | 4199 | struct rq *rq = cpu_rq(cpu); |
4152 | unsigned long flags; | 4200 | unsigned long flags; |
@@ -4630,7 +4678,7 @@ static void set_rq_offline(struct rq *rq) | |||
4630 | * migration_call - callback that gets triggered when a CPU is added. | 4678 | * migration_call - callback that gets triggered when a CPU is added. |
4631 | * Here we can start up the necessary migration thread for the new CPU. | 4679 | * Here we can start up the necessary migration thread for the new CPU. |
4632 | */ | 4680 | */ |
4633 | static int __cpuinit | 4681 | static int |
4634 | migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | 4682 | migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) |
4635 | { | 4683 | { |
4636 | int cpu = (long)hcpu; | 4684 | int cpu = (long)hcpu; |
@@ -4684,12 +4732,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
4684 | * happens before everything else. This has to be lower priority than | 4732 | * happens before everything else. This has to be lower priority than |
4685 | * the notifier in the perf_event subsystem, though. | 4733 | * the notifier in the perf_event subsystem, though. |
4686 | */ | 4734 | */ |
4687 | static struct notifier_block __cpuinitdata migration_notifier = { | 4735 | static struct notifier_block migration_notifier = { |
4688 | .notifier_call = migration_call, | 4736 | .notifier_call = migration_call, |
4689 | .priority = CPU_PRI_MIGRATION, | 4737 | .priority = CPU_PRI_MIGRATION, |
4690 | }; | 4738 | }; |
4691 | 4739 | ||
4692 | static int __cpuinit sched_cpu_active(struct notifier_block *nfb, | 4740 | static int sched_cpu_active(struct notifier_block *nfb, |
4693 | unsigned long action, void *hcpu) | 4741 | unsigned long action, void *hcpu) |
4694 | { | 4742 | { |
4695 | switch (action & ~CPU_TASKS_FROZEN) { | 4743 | switch (action & ~CPU_TASKS_FROZEN) { |
@@ -4702,7 +4750,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb, | |||
4702 | } | 4750 | } |
4703 | } | 4751 | } |
4704 | 4752 | ||
4705 | static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, | 4753 | static int sched_cpu_inactive(struct notifier_block *nfb, |
4706 | unsigned long action, void *hcpu) | 4754 | unsigned long action, void *hcpu) |
4707 | { | 4755 | { |
4708 | switch (action & ~CPU_TASKS_FROZEN) { | 4756 | switch (action & ~CPU_TASKS_FROZEN) { |
@@ -4914,7 +4962,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
4914 | SD_BALANCE_FORK | | 4962 | SD_BALANCE_FORK | |
4915 | SD_BALANCE_EXEC | | 4963 | SD_BALANCE_EXEC | |
4916 | SD_SHARE_CPUPOWER | | 4964 | SD_SHARE_CPUPOWER | |
4917 | SD_SHARE_PKG_RESOURCES); | 4965 | SD_SHARE_PKG_RESOURCES | |
4966 | SD_PREFER_SIBLING); | ||
4918 | if (nr_node_ids == 1) | 4967 | if (nr_node_ids == 1) |
4919 | pflags &= ~SD_SERIALIZE; | 4968 | pflags &= ~SD_SERIALIZE; |
4920 | } | 4969 | } |
@@ -5083,18 +5132,23 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
5083 | * two cpus are in the same cache domain, see cpus_share_cache(). | 5132 | * two cpus are in the same cache domain, see cpus_share_cache(). |
5084 | */ | 5133 | */ |
5085 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | 5134 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); |
5135 | DEFINE_PER_CPU(int, sd_llc_size); | ||
5086 | DEFINE_PER_CPU(int, sd_llc_id); | 5136 | DEFINE_PER_CPU(int, sd_llc_id); |
5087 | 5137 | ||
5088 | static void update_top_cache_domain(int cpu) | 5138 | static void update_top_cache_domain(int cpu) |
5089 | { | 5139 | { |
5090 | struct sched_domain *sd; | 5140 | struct sched_domain *sd; |
5091 | int id = cpu; | 5141 | int id = cpu; |
5142 | int size = 1; | ||
5092 | 5143 | ||
5093 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); | 5144 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); |
5094 | if (sd) | 5145 | if (sd) { |
5095 | id = cpumask_first(sched_domain_span(sd)); | 5146 | id = cpumask_first(sched_domain_span(sd)); |
5147 | size = cpumask_weight(sched_domain_span(sd)); | ||
5148 | } | ||
5096 | 5149 | ||
5097 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | 5150 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); |
5151 | per_cpu(sd_llc_size, cpu) = size; | ||
5098 | per_cpu(sd_llc_id, cpu) = id; | 5152 | per_cpu(sd_llc_id, cpu) = id; |
5099 | } | 5153 | } |
5100 | 5154 | ||
@@ -5118,6 +5172,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
5118 | tmp->parent = parent->parent; | 5172 | tmp->parent = parent->parent; |
5119 | if (parent->parent) | 5173 | if (parent->parent) |
5120 | parent->parent->child = tmp; | 5174 | parent->parent->child = tmp; |
5175 | /* | ||
5176 | * Transfer SD_PREFER_SIBLING down in case of a | ||
5177 | * degenerate parent; the spans match for this | ||
5178 | * so the property transfers. | ||
5179 | */ | ||
5180 | if (parent->flags & SD_PREFER_SIBLING) | ||
5181 | tmp->flags |= SD_PREFER_SIBLING; | ||
5121 | destroy_sched_domain(parent, cpu); | 5182 | destroy_sched_domain(parent, cpu); |
5122 | } else | 5183 | } else |
5123 | tmp = tmp->parent; | 5184 | tmp = tmp->parent; |
@@ -6184,8 +6245,9 @@ match1: | |||
6184 | ; | 6245 | ; |
6185 | } | 6246 | } |
6186 | 6247 | ||
6248 | n = ndoms_cur; | ||
6187 | if (doms_new == NULL) { | 6249 | if (doms_new == NULL) { |
6188 | ndoms_cur = 0; | 6250 | n = 0; |
6189 | doms_new = &fallback_doms; | 6251 | doms_new = &fallback_doms; |
6190 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); | 6252 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); |
6191 | WARN_ON_ONCE(dattr_new); | 6253 | WARN_ON_ONCE(dattr_new); |
@@ -6193,7 +6255,7 @@ match1: | |||
6193 | 6255 | ||
6194 | /* Build new domains */ | 6256 | /* Build new domains */ |
6195 | for (i = 0; i < ndoms_new; i++) { | 6257 | for (i = 0; i < ndoms_new; i++) { |
6196 | for (j = 0; j < ndoms_cur && !new_topology; j++) { | 6258 | for (j = 0; j < n && !new_topology; j++) { |
6197 | if (cpumask_equal(doms_new[i], doms_cur[j]) | 6259 | if (cpumask_equal(doms_new[i], doms_cur[j]) |
6198 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | 6260 | && dattrs_equal(dattr_new, i, dattr_cur, j)) |
6199 | goto match2; | 6261 | goto match2; |
@@ -6632,6 +6694,8 @@ void normalize_rt_tasks(void) | |||
6632 | * @cpu: the processor in question. | 6694 | * @cpu: the processor in question. |
6633 | * | 6695 | * |
6634 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6696 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
6697 | * | ||
6698 | * Return: The current task for @cpu. | ||
6635 | */ | 6699 | */ |
6636 | struct task_struct *curr_task(int cpu) | 6700 | struct task_struct *curr_task(int cpu) |
6637 | { | 6701 | { |
@@ -6763,7 +6827,7 @@ void sched_move_task(struct task_struct *tsk) | |||
6763 | if (unlikely(running)) | 6827 | if (unlikely(running)) |
6764 | tsk->sched_class->put_prev_task(rq, tsk); | 6828 | tsk->sched_class->put_prev_task(rq, tsk); |
6765 | 6829 | ||
6766 | tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, | 6830 | tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, |
6767 | lockdep_is_held(&tsk->sighand->siglock)), | 6831 | lockdep_is_held(&tsk->sighand->siglock)), |
6768 | struct task_group, css); | 6832 | struct task_group, css); |
6769 | tg = autogroup_task_group(tsk, tg); | 6833 | tg = autogroup_task_group(tsk, tg); |
@@ -7085,23 +7149,22 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
7085 | 7149 | ||
7086 | #ifdef CONFIG_CGROUP_SCHED | 7150 | #ifdef CONFIG_CGROUP_SCHED |
7087 | 7151 | ||
7088 | /* return corresponding task_group object of a cgroup */ | 7152 | static inline struct task_group *css_tg(struct cgroup_subsys_state *css) |
7089 | static inline struct task_group *cgroup_tg(struct cgroup *cgrp) | ||
7090 | { | 7153 | { |
7091 | return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), | 7154 | return css ? container_of(css, struct task_group, css) : NULL; |
7092 | struct task_group, css); | ||
7093 | } | 7155 | } |
7094 | 7156 | ||
7095 | static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) | 7157 | static struct cgroup_subsys_state * |
7158 | cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | ||
7096 | { | 7159 | { |
7097 | struct task_group *tg, *parent; | 7160 | struct task_group *parent = css_tg(parent_css); |
7161 | struct task_group *tg; | ||
7098 | 7162 | ||
7099 | if (!cgrp->parent) { | 7163 | if (!parent) { |
7100 | /* This is early initialization for the top cgroup */ | 7164 | /* This is early initialization for the top cgroup */ |
7101 | return &root_task_group.css; | 7165 | return &root_task_group.css; |
7102 | } | 7166 | } |
7103 | 7167 | ||
7104 | parent = cgroup_tg(cgrp->parent); | ||
7105 | tg = sched_create_group(parent); | 7168 | tg = sched_create_group(parent); |
7106 | if (IS_ERR(tg)) | 7169 | if (IS_ERR(tg)) |
7107 | return ERR_PTR(-ENOMEM); | 7170 | return ERR_PTR(-ENOMEM); |
@@ -7109,41 +7172,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) | |||
7109 | return &tg->css; | 7172 | return &tg->css; |
7110 | } | 7173 | } |
7111 | 7174 | ||
7112 | static int cpu_cgroup_css_online(struct cgroup *cgrp) | 7175 | static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) |
7113 | { | 7176 | { |
7114 | struct task_group *tg = cgroup_tg(cgrp); | 7177 | struct task_group *tg = css_tg(css); |
7115 | struct task_group *parent; | 7178 | struct task_group *parent = css_tg(css_parent(css)); |
7116 | 7179 | ||
7117 | if (!cgrp->parent) | 7180 | if (parent) |
7118 | return 0; | 7181 | sched_online_group(tg, parent); |
7119 | |||
7120 | parent = cgroup_tg(cgrp->parent); | ||
7121 | sched_online_group(tg, parent); | ||
7122 | return 0; | 7182 | return 0; |
7123 | } | 7183 | } |
7124 | 7184 | ||
7125 | static void cpu_cgroup_css_free(struct cgroup *cgrp) | 7185 | static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) |
7126 | { | 7186 | { |
7127 | struct task_group *tg = cgroup_tg(cgrp); | 7187 | struct task_group *tg = css_tg(css); |
7128 | 7188 | ||
7129 | sched_destroy_group(tg); | 7189 | sched_destroy_group(tg); |
7130 | } | 7190 | } |
7131 | 7191 | ||
7132 | static void cpu_cgroup_css_offline(struct cgroup *cgrp) | 7192 | static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) |
7133 | { | 7193 | { |
7134 | struct task_group *tg = cgroup_tg(cgrp); | 7194 | struct task_group *tg = css_tg(css); |
7135 | 7195 | ||
7136 | sched_offline_group(tg); | 7196 | sched_offline_group(tg); |
7137 | } | 7197 | } |
7138 | 7198 | ||
7139 | static int cpu_cgroup_can_attach(struct cgroup *cgrp, | 7199 | static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, |
7140 | struct cgroup_taskset *tset) | 7200 | struct cgroup_taskset *tset) |
7141 | { | 7201 | { |
7142 | struct task_struct *task; | 7202 | struct task_struct *task; |
7143 | 7203 | ||
7144 | cgroup_taskset_for_each(task, cgrp, tset) { | 7204 | cgroup_taskset_for_each(task, css, tset) { |
7145 | #ifdef CONFIG_RT_GROUP_SCHED | 7205 | #ifdef CONFIG_RT_GROUP_SCHED |
7146 | if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) | 7206 | if (!sched_rt_can_attach(css_tg(css), task)) |
7147 | return -EINVAL; | 7207 | return -EINVAL; |
7148 | #else | 7208 | #else |
7149 | /* We don't support RT-tasks being in separate groups */ | 7209 | /* We don't support RT-tasks being in separate groups */ |
@@ -7154,18 +7214,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp, | |||
7154 | return 0; | 7214 | return 0; |
7155 | } | 7215 | } |
7156 | 7216 | ||
7157 | static void cpu_cgroup_attach(struct cgroup *cgrp, | 7217 | static void cpu_cgroup_attach(struct cgroup_subsys_state *css, |
7158 | struct cgroup_taskset *tset) | 7218 | struct cgroup_taskset *tset) |
7159 | { | 7219 | { |
7160 | struct task_struct *task; | 7220 | struct task_struct *task; |
7161 | 7221 | ||
7162 | cgroup_taskset_for_each(task, cgrp, tset) | 7222 | cgroup_taskset_for_each(task, css, tset) |
7163 | sched_move_task(task); | 7223 | sched_move_task(task); |
7164 | } | 7224 | } |
7165 | 7225 | ||
7166 | static void | 7226 | static void cpu_cgroup_exit(struct cgroup_subsys_state *css, |
7167 | cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, | 7227 | struct cgroup_subsys_state *old_css, |
7168 | struct task_struct *task) | 7228 | struct task_struct *task) |
7169 | { | 7229 | { |
7170 | /* | 7230 | /* |
7171 | * cgroup_exit() is called in the copy_process() failure path. | 7231 | * cgroup_exit() is called in the copy_process() failure path. |
@@ -7179,15 +7239,16 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, | |||
7179 | } | 7239 | } |
7180 | 7240 | ||
7181 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7241 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7182 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 7242 | static int cpu_shares_write_u64(struct cgroup_subsys_state *css, |
7183 | u64 shareval) | 7243 | struct cftype *cftype, u64 shareval) |
7184 | { | 7244 | { |
7185 | return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); | 7245 | return sched_group_set_shares(css_tg(css), scale_load(shareval)); |
7186 | } | 7246 | } |
7187 | 7247 | ||
7188 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | 7248 | static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, |
7249 | struct cftype *cft) | ||
7189 | { | 7250 | { |
7190 | struct task_group *tg = cgroup_tg(cgrp); | 7251 | struct task_group *tg = css_tg(css); |
7191 | 7252 | ||
7192 | return (u64) scale_load_down(tg->shares); | 7253 | return (u64) scale_load_down(tg->shares); |
7193 | } | 7254 | } |
@@ -7309,26 +7370,28 @@ long tg_get_cfs_period(struct task_group *tg) | |||
7309 | return cfs_period_us; | 7370 | return cfs_period_us; |
7310 | } | 7371 | } |
7311 | 7372 | ||
7312 | static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) | 7373 | static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, |
7374 | struct cftype *cft) | ||
7313 | { | 7375 | { |
7314 | return tg_get_cfs_quota(cgroup_tg(cgrp)); | 7376 | return tg_get_cfs_quota(css_tg(css)); |
7315 | } | 7377 | } |
7316 | 7378 | ||
7317 | static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, | 7379 | static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, |
7318 | s64 cfs_quota_us) | 7380 | struct cftype *cftype, s64 cfs_quota_us) |
7319 | { | 7381 | { |
7320 | return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); | 7382 | return tg_set_cfs_quota(css_tg(css), cfs_quota_us); |
7321 | } | 7383 | } |
7322 | 7384 | ||
7323 | static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) | 7385 | static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, |
7386 | struct cftype *cft) | ||
7324 | { | 7387 | { |
7325 | return tg_get_cfs_period(cgroup_tg(cgrp)); | 7388 | return tg_get_cfs_period(css_tg(css)); |
7326 | } | 7389 | } |
7327 | 7390 | ||
7328 | static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 7391 | static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, |
7329 | u64 cfs_period_us) | 7392 | struct cftype *cftype, u64 cfs_period_us) |
7330 | { | 7393 | { |
7331 | return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); | 7394 | return tg_set_cfs_period(css_tg(css), cfs_period_us); |
7332 | } | 7395 | } |
7333 | 7396 | ||
7334 | struct cfs_schedulable_data { | 7397 | struct cfs_schedulable_data { |
@@ -7409,10 +7472,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) | |||
7409 | return ret; | 7472 | return ret; |
7410 | } | 7473 | } |
7411 | 7474 | ||
7412 | static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, | 7475 | static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, |
7413 | struct cgroup_map_cb *cb) | 7476 | struct cgroup_map_cb *cb) |
7414 | { | 7477 | { |
7415 | struct task_group *tg = cgroup_tg(cgrp); | 7478 | struct task_group *tg = css_tg(css); |
7416 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; | 7479 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; |
7417 | 7480 | ||
7418 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); | 7481 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); |
@@ -7425,26 +7488,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, | |||
7425 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7488 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7426 | 7489 | ||
7427 | #ifdef CONFIG_RT_GROUP_SCHED | 7490 | #ifdef CONFIG_RT_GROUP_SCHED |
7428 | static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, | 7491 | static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, |
7429 | s64 val) | 7492 | struct cftype *cft, s64 val) |
7430 | { | 7493 | { |
7431 | return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); | 7494 | return sched_group_set_rt_runtime(css_tg(css), val); |
7432 | } | 7495 | } |
7433 | 7496 | ||
7434 | static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) | 7497 | static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, |
7498 | struct cftype *cft) | ||
7435 | { | 7499 | { |
7436 | return sched_group_rt_runtime(cgroup_tg(cgrp)); | 7500 | return sched_group_rt_runtime(css_tg(css)); |
7437 | } | 7501 | } |
7438 | 7502 | ||
7439 | static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, | 7503 | static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, |
7440 | u64 rt_period_us) | 7504 | struct cftype *cftype, u64 rt_period_us) |
7441 | { | 7505 | { |
7442 | return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); | 7506 | return sched_group_set_rt_period(css_tg(css), rt_period_us); |
7443 | } | 7507 | } |
7444 | 7508 | ||
7445 | static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) | 7509 | static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, |
7510 | struct cftype *cft) | ||
7446 | { | 7511 | { |
7447 | return sched_group_rt_period(cgroup_tg(cgrp)); | 7512 | return sched_group_rt_period(css_tg(css)); |
7448 | } | 7513 | } |
7449 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7514 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7450 | 7515 | ||
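The conversion above follows a consistent css-first shape: the controller embeds a cgroup_subsys_state, recovers its own type with container_of(), maps a NULL css to NULL so the root case becomes a plain '!parent' test, and cftype handlers take the css directly. A minimal sketch of that shape for a hypothetical controller (my_group, css_mygrp and some_setting are invented names):

#include <linux/cgroup.h>

struct my_group {
        struct cgroup_subsys_state css;         /* must be embedded */
        u64 some_setting;
};

static inline struct my_group *css_mygrp(struct cgroup_subsys_state *css)
{
        return css ? container_of(css, struct my_group, css) : NULL;
}

/* cftype handlers now receive the css, as in the hunks above. */
static u64 mygrp_setting_read_u64(struct cgroup_subsys_state *css,
                                  struct cftype *cft)
{
        return css_mygrp(css)->some_setting;
}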
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index dbb7e2cd95eb..f64722ff0299 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c | |||
@@ -33,30 +33,20 @@ struct cpuacct { | |||
33 | struct kernel_cpustat __percpu *cpustat; | 33 | struct kernel_cpustat __percpu *cpustat; |
34 | }; | 34 | }; |
35 | 35 | ||
36 | /* return cpu accounting group corresponding to this container */ | 36 | static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) |
37 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
38 | { | 37 | { |
39 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | 38 | return css ? container_of(css, struct cpuacct, css) : NULL; |
40 | struct cpuacct, css); | ||
41 | } | 39 | } |
42 | 40 | ||
43 | /* return cpu accounting group to which this task belongs */ | 41 | /* return cpu accounting group to which this task belongs */ |
44 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | 42 | static inline struct cpuacct *task_ca(struct task_struct *tsk) |
45 | { | 43 | { |
46 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | 44 | return css_ca(task_css(tsk, cpuacct_subsys_id)); |
47 | struct cpuacct, css); | ||
48 | } | ||
49 | |||
50 | static inline struct cpuacct *__parent_ca(struct cpuacct *ca) | ||
51 | { | ||
52 | return cgroup_ca(ca->css.cgroup->parent); | ||
53 | } | 45 | } |
54 | 46 | ||
55 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) | 47 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) |
56 | { | 48 | { |
57 | if (!ca->css.cgroup->parent) | 49 | return css_ca(css_parent(&ca->css)); |
58 | return NULL; | ||
59 | return cgroup_ca(ca->css.cgroup->parent); | ||
60 | } | 50 | } |
61 | 51 | ||
62 | static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); | 52 | static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); |
@@ -66,11 +56,12 @@ static struct cpuacct root_cpuacct = { | |||
66 | }; | 56 | }; |
67 | 57 | ||
68 | /* create a new cpu accounting group */ | 58 | /* create a new cpu accounting group */ |
69 | static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) | 59 | static struct cgroup_subsys_state * |
60 | cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) | ||
70 | { | 61 | { |
71 | struct cpuacct *ca; | 62 | struct cpuacct *ca; |
72 | 63 | ||
73 | if (!cgrp->parent) | 64 | if (!parent_css) |
74 | return &root_cpuacct.css; | 65 | return &root_cpuacct.css; |
75 | 66 | ||
76 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); | 67 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); |
@@ -96,9 +87,9 @@ out: | |||
96 | } | 87 | } |
97 | 88 | ||
98 | /* destroy an existing cpu accounting group */ | 89 | /* destroy an existing cpu accounting group */ |
99 | static void cpuacct_css_free(struct cgroup *cgrp) | 90 | static void cpuacct_css_free(struct cgroup_subsys_state *css) |
100 | { | 91 | { |
101 | struct cpuacct *ca = cgroup_ca(cgrp); | 92 | struct cpuacct *ca = css_ca(css); |
102 | 93 | ||
103 | free_percpu(ca->cpustat); | 94 | free_percpu(ca->cpustat); |
104 | free_percpu(ca->cpuusage); | 95 | free_percpu(ca->cpuusage); |
@@ -141,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | |||
141 | } | 132 | } |
142 | 133 | ||
143 | /* return total cpu usage (in nanoseconds) of a group */ | 134 | /* return total cpu usage (in nanoseconds) of a group */ |
144 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | 135 | static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) |
145 | { | 136 | { |
146 | struct cpuacct *ca = cgroup_ca(cgrp); | 137 | struct cpuacct *ca = css_ca(css); |
147 | u64 totalcpuusage = 0; | 138 | u64 totalcpuusage = 0; |
148 | int i; | 139 | int i; |
149 | 140 | ||
@@ -153,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | |||
153 | return totalcpuusage; | 144 | return totalcpuusage; |
154 | } | 145 | } |
155 | 146 | ||
156 | static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, | 147 | static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, |
157 | u64 reset) | 148 | u64 reset) |
158 | { | 149 | { |
159 | struct cpuacct *ca = cgroup_ca(cgrp); | 150 | struct cpuacct *ca = css_ca(css); |
160 | int err = 0; | 151 | int err = 0; |
161 | int i; | 152 | int i; |
162 | 153 | ||
@@ -172,10 +163,10 @@ out: | |||
172 | return err; | 163 | return err; |
173 | } | 164 | } |
174 | 165 | ||
175 | static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, | 166 | static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, |
176 | struct seq_file *m) | 167 | struct cftype *cft, struct seq_file *m) |
177 | { | 168 | { |
178 | struct cpuacct *ca = cgroup_ca(cgroup); | 169 | struct cpuacct *ca = css_ca(css); |
179 | u64 percpu; | 170 | u64 percpu; |
180 | int i; | 171 | int i; |
181 | 172 | ||
@@ -192,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = { | |||
192 | [CPUACCT_STAT_SYSTEM] = "system", | 183 | [CPUACCT_STAT_SYSTEM] = "system", |
193 | }; | 184 | }; |
194 | 185 | ||
195 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, | 186 | static int cpuacct_stats_show(struct cgroup_subsys_state *css, |
196 | struct cgroup_map_cb *cb) | 187 | struct cftype *cft, struct cgroup_map_cb *cb) |
197 | { | 188 | { |
198 | struct cpuacct *ca = cgroup_ca(cgrp); | 189 | struct cpuacct *ca = css_ca(css); |
199 | int cpu; | 190 | int cpu; |
200 | s64 val = 0; | 191 | s64 val = 0; |
201 | 192 | ||
@@ -281,7 +272,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) | |||
281 | while (ca != &root_cpuacct) { | 272 | while (ca != &root_cpuacct) { |
282 | kcpustat = this_cpu_ptr(ca->cpustat); | 273 | kcpustat = this_cpu_ptr(ca->cpustat); |
283 | kcpustat->cpustat[index] += val; | 274 | kcpustat->cpustat[index] += val; |
284 | ca = __parent_ca(ca); | 275 | ca = parent_ca(ca); |
285 | } | 276 | } |
286 | rcu_read_unlock(); | 277 | rcu_read_unlock(); |
287 | } | 278 | } |
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 1095e878a46f..8b836b376d91 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c | |||
@@ -62,7 +62,7 @@ static int convert_prio(int prio) | |||
62 | * any discrepancies created by racing against the uncertainty of the current | 62 | * any discrepancies created by racing against the uncertainty of the current |
63 | * priority configuration. | 63 | * priority configuration. |
64 | * | 64 | * |
65 | * Returns: (int)bool - CPUs were found | 65 | * Return: (int)bool - CPUs were found |
66 | */ | 66 | */ |
67 | int cpupri_find(struct cpupri *cp, struct task_struct *p, | 67 | int cpupri_find(struct cpupri *cp, struct task_struct *p, |
68 | struct cpumask *lowest_mask) | 68 | struct cpumask *lowest_mask) |
@@ -203,7 +203,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
203 | * cpupri_init - initialize the cpupri structure | 203 | * cpupri_init - initialize the cpupri structure |
204 | * @cp: The cpupri context | 204 | * @cp: The cpupri context |
205 | * | 205 | * |
206 | * Returns: -ENOMEM if memory fails. | 206 | * Return: -ENOMEM on memory allocation failure. |
207 | */ | 207 | */ |
208 | int cpupri_init(struct cpupri *cp) | 208 | int cpupri_init(struct cpupri *cp) |
209 | { | 209 | { |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a7959e05a9d5..ace34f95e200 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -121,7 +121,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, | |||
121 | * is the only cgroup, then nothing else should be necessary. | 121 | * is the only cgroup, then nothing else should be necessary. |
122 | * | 122 | * |
123 | */ | 123 | */ |
124 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | 124 | __this_cpu_add(kernel_cpustat.cpustat[index], tmp); |
125 | 125 | ||
126 | cpuacct_account_field(p, index, tmp); | 126 | cpuacct_account_field(p, index, tmp); |
127 | } | 127 | } |
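__this_cpu_add() is behaviorally the same as the old __get_cpu_var() read-modify-write, but it operates on this CPU's instance in one step and lets the arch emit a single per-CPU add. A small sketch with an invented counter (my_counter is not a kernel symbol):

#include <linux/percpu.h>

static DEFINE_PER_CPU(u64, my_counter);

static void bump_my_counter(u64 delta)
{
        /* Old form: compute this CPU's slot, then add to it. */
        /*      __get_cpu_var(my_counter) += delta;            */

        /* New form: a single operation on this CPU's instance. */
        __this_cpu_add(my_counter, delta);
}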
@@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ | |||
378 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | 378 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING |
379 | 379 | ||
380 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH | 380 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH |
381 | void vtime_task_switch(struct task_struct *prev) | 381 | void vtime_common_task_switch(struct task_struct *prev) |
382 | { | 382 | { |
383 | if (!vtime_accounting_enabled()) | ||
384 | return; | ||
385 | |||
386 | if (is_idle_task(prev)) | 383 | if (is_idle_task(prev)) |
387 | vtime_account_idle(prev); | 384 | vtime_account_idle(prev); |
388 | else | 385 | else |
@@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev) | |||
404 | * vtime_account(). | 401 | * vtime_account(). |
405 | */ | 402 | */ |
406 | #ifndef __ARCH_HAS_VTIME_ACCOUNT | 403 | #ifndef __ARCH_HAS_VTIME_ACCOUNT |
407 | void vtime_account_irq_enter(struct task_struct *tsk) | 404 | void vtime_common_account_irq_enter(struct task_struct *tsk) |
408 | { | 405 | { |
409 | if (!vtime_accounting_enabled()) | ||
410 | return; | ||
411 | |||
412 | if (!in_interrupt()) { | 406 | if (!in_interrupt()) { |
413 | /* | 407 | /* |
414 | * If we interrupted user, context_tracking_in_user() | 408 | * If we interrupted user, context_tracking_in_user() |
@@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk) | |||
428 | } | 422 | } |
429 | vtime_account_system(tsk); | 423 | vtime_account_system(tsk); |
430 | } | 424 | } |
431 | EXPORT_SYMBOL_GPL(vtime_account_irq_enter); | 425 | EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); |
432 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | 426 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ |
433 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ | 427 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ |
434 | 428 | ||
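The vtime_accounting_enabled() checks deleted from these functions presumably move into inline wrappers in the vtime header that gate the renamed vtime_common_*() bodies. A guess at the wrapper's shape, not a quote from the header:

/* Sketch only: the real wrapper would live in include/linux/vtime.h. */
static inline void vtime_task_switch(struct task_struct *prev)
{
        if (vtime_accounting_enabled())
                vtime_common_task_switch(prev);
}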
@@ -559,12 +553,6 @@ static void cputime_adjust(struct task_cputime *curr, | |||
559 | { | 553 | { |
560 | cputime_t rtime, stime, utime, total; | 554 | cputime_t rtime, stime, utime, total; |
561 | 555 | ||
562 | if (vtime_accounting_enabled()) { | ||
563 | *ut = curr->utime; | ||
564 | *st = curr->stime; | ||
565 | return; | ||
566 | } | ||
567 | |||
568 | stime = curr->stime; | 556 | stime = curr->stime; |
569 | total = stime + curr->utime; | 557 | total = stime + curr->utime; |
570 | 558 | ||
@@ -664,23 +652,17 @@ static void __vtime_account_system(struct task_struct *tsk) | |||
664 | 652 | ||
665 | void vtime_account_system(struct task_struct *tsk) | 653 | void vtime_account_system(struct task_struct *tsk) |
666 | { | 654 | { |
667 | if (!vtime_accounting_enabled()) | ||
668 | return; | ||
669 | |||
670 | write_seqlock(&tsk->vtime_seqlock); | 655 | write_seqlock(&tsk->vtime_seqlock); |
671 | __vtime_account_system(tsk); | 656 | __vtime_account_system(tsk); |
672 | write_sequnlock(&tsk->vtime_seqlock); | 657 | write_sequnlock(&tsk->vtime_seqlock); |
673 | } | 658 | } |
674 | 659 | ||
675 | void vtime_account_irq_exit(struct task_struct *tsk) | 660 | void vtime_gen_account_irq_exit(struct task_struct *tsk) |
676 | { | 661 | { |
677 | if (!vtime_accounting_enabled()) | ||
678 | return; | ||
679 | |||
680 | write_seqlock(&tsk->vtime_seqlock); | 662 | write_seqlock(&tsk->vtime_seqlock); |
663 | __vtime_account_system(tsk); | ||
681 | if (context_tracking_in_user()) | 664 | if (context_tracking_in_user()) |
682 | tsk->vtime_snap_whence = VTIME_USER; | 665 | tsk->vtime_snap_whence = VTIME_USER; |
683 | __vtime_account_system(tsk); | ||
684 | write_sequnlock(&tsk->vtime_seqlock); | 666 | write_sequnlock(&tsk->vtime_seqlock); |
685 | } | 667 | } |
686 | 668 | ||
@@ -688,12 +670,8 @@ void vtime_account_user(struct task_struct *tsk) | |||
688 | { | 670 | { |
689 | cputime_t delta_cpu; | 671 | cputime_t delta_cpu; |
690 | 672 | ||
691 | if (!vtime_accounting_enabled()) | ||
692 | return; | ||
693 | |||
694 | delta_cpu = get_vtime_delta(tsk); | ||
695 | |||
696 | write_seqlock(&tsk->vtime_seqlock); | 673 | write_seqlock(&tsk->vtime_seqlock); |
674 | delta_cpu = get_vtime_delta(tsk); | ||
697 | tsk->vtime_snap_whence = VTIME_SYS; | 675 | tsk->vtime_snap_whence = VTIME_SYS; |
698 | account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); | 676 | account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); |
699 | write_sequnlock(&tsk->vtime_seqlock); | 677 | write_sequnlock(&tsk->vtime_seqlock); |
@@ -701,22 +679,27 @@ void vtime_account_user(struct task_struct *tsk) | |||
701 | 679 | ||
702 | void vtime_user_enter(struct task_struct *tsk) | 680 | void vtime_user_enter(struct task_struct *tsk) |
703 | { | 681 | { |
704 | if (!vtime_accounting_enabled()) | ||
705 | return; | ||
706 | |||
707 | write_seqlock(&tsk->vtime_seqlock); | 682 | write_seqlock(&tsk->vtime_seqlock); |
708 | tsk->vtime_snap_whence = VTIME_USER; | ||
709 | __vtime_account_system(tsk); | 683 | __vtime_account_system(tsk); |
684 | tsk->vtime_snap_whence = VTIME_USER; | ||
710 | write_sequnlock(&tsk->vtime_seqlock); | 685 | write_sequnlock(&tsk->vtime_seqlock); |
711 | } | 686 | } |
712 | 687 | ||
713 | void vtime_guest_enter(struct task_struct *tsk) | 688 | void vtime_guest_enter(struct task_struct *tsk) |
714 | { | 689 | { |
690 | /* | ||
691 | * The flags must be updated under the lock with | ||
692 | * the vtime_snap flush and update. | ||
693 | * That enforces the right ordering and update-sequence | ||
694 | * synchronization against the reader (task_gtime()), | ||
695 | * which can thus safely catch up with a tickless delta. | ||
696 | */ | ||
715 | write_seqlock(&tsk->vtime_seqlock); | 697 | write_seqlock(&tsk->vtime_seqlock); |
716 | __vtime_account_system(tsk); | 698 | __vtime_account_system(tsk); |
717 | current->flags |= PF_VCPU; | 699 | current->flags |= PF_VCPU; |
718 | write_sequnlock(&tsk->vtime_seqlock); | 700 | write_sequnlock(&tsk->vtime_seqlock); |
719 | } | 701 | } |
702 | EXPORT_SYMBOL_GPL(vtime_guest_enter); | ||
720 | 703 | ||
721 | void vtime_guest_exit(struct task_struct *tsk) | 704 | void vtime_guest_exit(struct task_struct *tsk) |
722 | { | 705 | { |
@@ -725,6 +708,7 @@ void vtime_guest_exit(struct task_struct *tsk) | |||
725 | current->flags &= ~PF_VCPU; | 708 | current->flags &= ~PF_VCPU; |
726 | write_sequnlock(&tsk->vtime_seqlock); | 709 | write_sequnlock(&tsk->vtime_seqlock); |
727 | } | 710 | } |
711 | EXPORT_SYMBOL_GPL(vtime_guest_exit); | ||
728 | 712 | ||
729 | void vtime_account_idle(struct task_struct *tsk) | 713 | void vtime_account_idle(struct task_struct *tsk) |
730 | { | 714 | { |
@@ -733,11 +717,6 @@ void vtime_account_idle(struct task_struct *tsk) | |||
733 | account_idle_time(delta_cpu); | 717 | account_idle_time(delta_cpu); |
734 | } | 718 | } |
735 | 719 | ||
736 | bool vtime_accounting_enabled(void) | ||
737 | { | ||
738 | return context_tracking_active(); | ||
739 | } | ||
740 | |||
741 | void arch_vtime_task_switch(struct task_struct *prev) | 720 | void arch_vtime_task_switch(struct task_struct *prev) |
742 | { | 721 | { |
743 | write_seqlock(&prev->vtime_seqlock); | 722 | write_seqlock(&prev->vtime_seqlock); |
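The cputime.c hunks above drop the vtime_accounting_enabled() early returns from the write paths (the checks move to inline wrappers) and rely entirely on tsk->vtime_seqlock for ordering against readers such as task_gtime(), as the vtime_guest_enter() comment notes. As a rough illustration of that read/retry protocol only — a single-threaded userspace C11 sketch with invented field names, not the kernel's seqlock implementation:

#include <stdatomic.h>
#include <stdio.h>

/* Toy seqcount protecting a two-field snapshot (standing in for
 * vtime_snap/vtime_snap_whence). Single-threaded demo of the protocol;
 * the kernel's seqlock adds the locking and memory barriers. */
static atomic_uint seq;
static unsigned long long snap_cputime;
static int snap_whence;

static void vtime_write(unsigned long long delta, int whence)
{
	atomic_fetch_add(&seq, 1);	/* odd: update in progress */
	snap_cputime += delta;
	snap_whence = whence;
	atomic_fetch_add(&seq, 1);	/* even: update complete */
}

static void vtime_read(unsigned long long *cputime, int *whence)
{
	unsigned int start;

	do {
		start = atomic_load(&seq);
		if (start & 1)
			continue;	/* writer active, retry */
		*cputime = snap_cputime;
		*whence = snap_whence;
	} while (atomic_load(&seq) != start);
}

int main(void)
{
	unsigned long long c;
	int w;

	vtime_write(100, 1);	/* e.g. account 100 units, whence ~ VTIME_USER */
	vtime_read(&c, &w);
	printf("cputime=%llu whence=%d\n", c, w);
	return 0;
}

The reader keeps retrying while the sequence is odd or has changed underneath it, which is why every write path above bumps the sequence around the vtime_snap update instead of checking an enable flag first.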
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f77f9c527449..7f0a5e6cdae0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -851,7 +851,7 @@ void task_numa_fault(int node, int pages, bool migrated) | |||
851 | { | 851 | { |
852 | struct task_struct *p = current; | 852 | struct task_struct *p = current; |
853 | 853 | ||
854 | if (!sched_feat_numa(NUMA)) | 854 | if (!numabalancing_enabled) |
855 | return; | 855 | return; |
856 | 856 | ||
857 | /* FIXME: Allocate task-specific structure for placement policy here */ | 857 | /* FIXME: Allocate task-specific structure for placement policy here */ |
@@ -2032,6 +2032,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
2032 | */ | 2032 | */ |
2033 | update_entity_load_avg(curr, 1); | 2033 | update_entity_load_avg(curr, 1); |
2034 | update_cfs_rq_blocked_load(cfs_rq, 1); | 2034 | update_cfs_rq_blocked_load(cfs_rq, 1); |
2035 | update_cfs_shares(cfs_rq); | ||
2035 | 2036 | ||
2036 | #ifdef CONFIG_SCHED_HRTICK | 2037 | #ifdef CONFIG_SCHED_HRTICK |
2037 | /* | 2038 | /* |
@@ -3017,6 +3018,23 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
3017 | return 0; | 3018 | return 0; |
3018 | } | 3019 | } |
3019 | 3020 | ||
3021 | static void record_wakee(struct task_struct *p) | ||
3022 | { | ||
3023 | /* | ||
3024 | * Rough decay (wiping) for cost saving; don't worry | ||
3025 | * about the boundary, a really active task won't care | ||
3026 | * about the loss. | ||
3027 | */ | ||
3028 | if (jiffies > current->wakee_flip_decay_ts + HZ) { | ||
3029 | current->wakee_flips = 0; | ||
3030 | current->wakee_flip_decay_ts = jiffies; | ||
3031 | } | ||
3032 | |||
3033 | if (current->last_wakee != p) { | ||
3034 | current->last_wakee = p; | ||
3035 | current->wakee_flips++; | ||
3036 | } | ||
3037 | } | ||
3020 | 3038 | ||
3021 | static void task_waking_fair(struct task_struct *p) | 3039 | static void task_waking_fair(struct task_struct *p) |
3022 | { | 3040 | { |
@@ -3037,6 +3055,7 @@ static void task_waking_fair(struct task_struct *p) | |||
3037 | #endif | 3055 | #endif |
3038 | 3056 | ||
3039 | se->vruntime -= min_vruntime; | 3057 | se->vruntime -= min_vruntime; |
3058 | record_wakee(p); | ||
3040 | } | 3059 | } |
3041 | 3060 | ||
3042 | #ifdef CONFIG_FAIR_GROUP_SCHED | 3061 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -3155,6 +3174,28 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, | |||
3155 | 3174 | ||
3156 | #endif | 3175 | #endif |
3157 | 3176 | ||
3177 | static int wake_wide(struct task_struct *p) | ||
3178 | { | ||
3179 | int factor = this_cpu_read(sd_llc_size); | ||
3180 | |||
3181 | /* | ||
3182 | * This is the wakee switching frequency: a high flip count can mean | ||
3183 | * many wakees or rapid switching. Using the LLC size as the factor | ||
3184 | * automatically adjusts the threshold, so a bigger node leads to more pulling. | ||
3185 | */ | ||
3186 | if (p->wakee_flips > factor) { | ||
3187 | /* | ||
3188 | * The wakee is somewhat hot and needs a certain amount of cpu | ||
3189 | * resource, so if the waker is far hotter, prefer to leave | ||
3190 | * it alone. | ||
3191 | */ | ||
3192 | if (current->wakee_flips > (factor * p->wakee_flips)) | ||
3193 | return 1; | ||
3194 | } | ||
3195 | |||
3196 | return 0; | ||
3197 | } | ||
3198 | |||
3158 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | 3199 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) |
3159 | { | 3200 | { |
3160 | s64 this_load, load; | 3201 | s64 this_load, load; |
@@ -3164,6 +3205,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
3164 | unsigned long weight; | 3205 | unsigned long weight; |
3165 | int balanced; | 3206 | int balanced; |
3166 | 3207 | ||
3208 | /* | ||
3209 | * If we wake multiple tasks be careful to not bounce | ||
3210 | * ourselves around too much. | ||
3211 | */ | ||
3212 | if (wake_wide(p)) | ||
3213 | return 0; | ||
3214 | |||
3167 | idx = sd->wake_idx; | 3215 | idx = sd->wake_idx; |
3168 | this_cpu = smp_processor_id(); | 3216 | this_cpu = smp_processor_id(); |
3169 | prev_cpu = task_cpu(p); | 3217 | prev_cpu = task_cpu(p); |
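record_wakee() and wake_wide() above form one heuristic: the waker counts how often it switches between distinct wakees (wiped roughly once a second), and wake_affine() gives up on pulling when the wakee is itself flip-heavy while the waker flips far beyond the LLC size. A self-contained userspace model of that decision — the task structure, the one-second decay and the llc_size value are invented for this sketch:

#include <stdio.h>
#include <time.h>

/* Toy model of the wakee-flip bookkeeping added above; the field names
 * mirror the patch but struct task and all numbers are invented. */
struct task {
	struct task *last_wakee;
	unsigned int wakee_flips;
	time_t wakee_flip_decay_ts;
};

static void record_wakee(struct task *waker, struct task *wakee, time_t now)
{
	/* Rough decay: wipe the counter roughly once per second. */
	if (now > waker->wakee_flip_decay_ts + 1) {
		waker->wakee_flips = 0;
		waker->wakee_flip_decay_ts = now;
	}
	if (waker->last_wakee != wakee) {
		waker->last_wakee = wakee;
		waker->wakee_flips++;
	}
}

/* 1 means: don't pull the wakee next to the waker (wake_affine() bails out). */
static int wake_wide(const struct task *waker, const struct task *wakee,
		     unsigned int llc_size)
{
	if (wakee->wakee_flips > llc_size &&
	    waker->wakee_flips > llc_size * wakee->wakee_flips)
		return 1;
	return 0;
}

int main(void)
{
	struct task server = { 0 }, client_a = { 0 }, client_b = { 0 };
	time_t now = time(NULL);
	unsigned int llc_size = 4;	/* assumed LLC span for the sketch */

	/* A 1:N server keeps switching between wakees: many flips. */
	for (int i = 0; i < 100; i++)
		record_wakee(&server, (i & 1) ? &client_a : &client_b, now);
	/* A client switches only occasionally: a handful of flips. */
	for (int i = 0; i < 6; i++)
		record_wakee(&client_a, (i & 1) ? &server : &client_b, now);

	printf("server waking a client: wake_wide=%d\n",
	       wake_wide(&server, &client_a, llc_size));	/* 1: spread out */
	printf("client waking the server: wake_wide=%d\n",
	       wake_wide(&client_a, &server, llc_size));	/* 0: stay affine */
	return 0;
}

Run as-is it prints 1 for the server-to-client wakeup (spread the clients out) and 0 for the client-to-server wakeup (keep the wakeup affine), which is the 1:N server/client pattern the heuristic targets.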
@@ -4171,47 +4219,48 @@ static void update_blocked_averages(int cpu) | |||
4171 | } | 4219 | } |
4172 | 4220 | ||
4173 | /* | 4221 | /* |
4174 | * Compute the cpu's hierarchical load factor for each task group. | 4222 | * Compute the hierarchical load factor for cfs_rq and all its ascendants. |
4175 | * This needs to be done in a top-down fashion because the load of a child | 4223 | * This needs to be done in a top-down fashion because the load of a child |
4176 | * group is a fraction of its parent's load. | 4224 | * group is a fraction of its parent's load. |
4177 | */ | 4225 | */ |
4178 | static int tg_load_down(struct task_group *tg, void *data) | 4226 | static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) |
4179 | { | ||
4180 | unsigned long load; | ||
4181 | long cpu = (long)data; | ||
4182 | |||
4183 | if (!tg->parent) { | ||
4184 | load = cpu_rq(cpu)->avg.load_avg_contrib; | ||
4185 | } else { | ||
4186 | load = tg->parent->cfs_rq[cpu]->h_load; | ||
4187 | load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib, | ||
4188 | tg->parent->cfs_rq[cpu]->runnable_load_avg + 1); | ||
4189 | } | ||
4190 | |||
4191 | tg->cfs_rq[cpu]->h_load = load; | ||
4192 | |||
4193 | return 0; | ||
4194 | } | ||
4195 | |||
4196 | static void update_h_load(long cpu) | ||
4197 | { | 4227 | { |
4198 | struct rq *rq = cpu_rq(cpu); | 4228 | struct rq *rq = rq_of(cfs_rq); |
4229 | struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; | ||
4199 | unsigned long now = jiffies; | 4230 | unsigned long now = jiffies; |
4231 | unsigned long load; | ||
4200 | 4232 | ||
4201 | if (rq->h_load_throttle == now) | 4233 | if (cfs_rq->last_h_load_update == now) |
4202 | return; | 4234 | return; |
4203 | 4235 | ||
4204 | rq->h_load_throttle = now; | 4236 | cfs_rq->h_load_next = NULL; |
4237 | for_each_sched_entity(se) { | ||
4238 | cfs_rq = cfs_rq_of(se); | ||
4239 | cfs_rq->h_load_next = se; | ||
4240 | if (cfs_rq->last_h_load_update == now) | ||
4241 | break; | ||
4242 | } | ||
4205 | 4243 | ||
4206 | rcu_read_lock(); | 4244 | if (!se) { |
4207 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 4245 | cfs_rq->h_load = rq->avg.load_avg_contrib; |
4208 | rcu_read_unlock(); | 4246 | cfs_rq->last_h_load_update = now; |
4247 | } | ||
4248 | |||
4249 | while ((se = cfs_rq->h_load_next) != NULL) { | ||
4250 | load = cfs_rq->h_load; | ||
4251 | load = div64_ul(load * se->avg.load_avg_contrib, | ||
4252 | cfs_rq->runnable_load_avg + 1); | ||
4253 | cfs_rq = group_cfs_rq(se); | ||
4254 | cfs_rq->h_load = load; | ||
4255 | cfs_rq->last_h_load_update = now; | ||
4256 | } | ||
4209 | } | 4257 | } |
4210 | 4258 | ||
4211 | static unsigned long task_h_load(struct task_struct *p) | 4259 | static unsigned long task_h_load(struct task_struct *p) |
4212 | { | 4260 | { |
4213 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 4261 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
4214 | 4262 | ||
4263 | update_cfs_rq_h_load(cfs_rq); | ||
4215 | return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, | 4264 | return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, |
4216 | cfs_rq->runnable_load_avg + 1); | 4265 | cfs_rq->runnable_load_avg + 1); |
4217 | } | 4266 | } |
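The rewritten update_cfs_rq_h_load() above computes the same quantity as the old tg_load_down() walk — each level's h_load is its parent's h_load scaled by the group entity's share of the parent's runnable load — but only along the single chain the task sits on, and only when task_h_load() actually needs it. A toy C walk over one such chain, with invented load numbers and none of the jiffies caching:

#include <stdio.h>

/* One level of the task-group hierarchy on a single cpu. The numbers are
 * invented; se_load_avg_contrib is the group entity's contribution to its
 * parent, parent_runnable_load is the parent cfs_rq's runnable load. */
struct level {
	unsigned long long se_load_avg_contrib;
	unsigned long long parent_runnable_load;
};

int main(void)
{
	/* root rq contribution, then two nested task groups */
	unsigned long long h_load = 2048;	/* rq->avg.load_avg_contrib */
	struct level chain[] = {
		{ .se_load_avg_contrib = 1024, .parent_runnable_load = 2048 },
		{ .se_load_avg_contrib =  512, .parent_runnable_load = 1024 },
	};

	for (unsigned i = 0; i < sizeof(chain) / sizeof(chain[0]); i++) {
		/* h_load(child) = h_load(parent) * se_contrib / (parent_runnable + 1) */
		h_load = h_load * chain[i].se_load_avg_contrib /
			 (chain[i].parent_runnable_load + 1);
		printf("level %u: h_load = %llu\n", i + 1, h_load);
	}

	/* task_h_load() then scales the task's own contribution the same way. */
	unsigned long long task_contrib = 256, cfs_runnable = 512;
	printf("task_h_load = %llu\n",
	       task_contrib * h_load / (cfs_runnable + 1));
	return 0;
}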
@@ -4220,10 +4269,6 @@ static inline void update_blocked_averages(int cpu) | |||
4220 | { | 4269 | { |
4221 | } | 4270 | } |
4222 | 4271 | ||
4223 | static inline void update_h_load(long cpu) | ||
4224 | { | ||
4225 | } | ||
4226 | |||
4227 | static unsigned long task_h_load(struct task_struct *p) | 4272 | static unsigned long task_h_load(struct task_struct *p) |
4228 | { | 4273 | { |
4229 | return p->se.avg.load_avg_contrib; | 4274 | return p->se.avg.load_avg_contrib; |
@@ -4232,54 +4277,62 @@ static unsigned long task_h_load(struct task_struct *p) | |||
4232 | 4277 | ||
4233 | /********** Helpers for find_busiest_group ************************/ | 4278 | /********** Helpers for find_busiest_group ************************/ |
4234 | /* | 4279 | /* |
4235 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
4236 | * during load balancing. | ||
4237 | */ | ||
4238 | struct sd_lb_stats { | ||
4239 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
4240 | struct sched_group *this; /* Local group in this sd */ | ||
4241 | unsigned long total_load; /* Total load of all groups in sd */ | ||
4242 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
4243 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
4244 | |||
4245 | /** Statistics of this group */ | ||
4246 | unsigned long this_load; | ||
4247 | unsigned long this_load_per_task; | ||
4248 | unsigned long this_nr_running; | ||
4249 | unsigned long this_has_capacity; | ||
4250 | unsigned int this_idle_cpus; | ||
4251 | |||
4252 | /* Statistics of the busiest group */ | ||
4253 | unsigned int busiest_idle_cpus; | ||
4254 | unsigned long max_load; | ||
4255 | unsigned long busiest_load_per_task; | ||
4256 | unsigned long busiest_nr_running; | ||
4257 | unsigned long busiest_group_capacity; | ||
4258 | unsigned long busiest_has_capacity; | ||
4259 | unsigned int busiest_group_weight; | ||
4260 | |||
4261 | int group_imb; /* Is there imbalance in this sd */ | ||
4262 | }; | ||
4263 | |||
4264 | /* | ||
4265 | * sg_lb_stats - stats of a sched_group required for load_balancing | 4280 | * sg_lb_stats - stats of a sched_group required for load_balancing |
4266 | */ | 4281 | */ |
4267 | struct sg_lb_stats { | 4282 | struct sg_lb_stats { |
4268 | unsigned long avg_load; /*Avg load across the CPUs of the group */ | 4283 | unsigned long avg_load; /*Avg load across the CPUs of the group */ |
4269 | unsigned long group_load; /* Total load over the CPUs of the group */ | 4284 | unsigned long group_load; /* Total load over the CPUs of the group */ |
4270 | unsigned long sum_nr_running; /* Nr tasks running in the group */ | ||
4271 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 4285 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
4272 | unsigned long group_capacity; | 4286 | unsigned long load_per_task; |
4273 | unsigned long idle_cpus; | 4287 | unsigned long group_power; |
4274 | unsigned long group_weight; | 4288 | unsigned int sum_nr_running; /* Nr tasks running in the group */ |
4289 | unsigned int group_capacity; | ||
4290 | unsigned int idle_cpus; | ||
4291 | unsigned int group_weight; | ||
4275 | int group_imb; /* Is there an imbalance in the group ? */ | 4292 | int group_imb; /* Is there an imbalance in the group ? */ |
4276 | int group_has_capacity; /* Is there extra capacity in the group? */ | 4293 | int group_has_capacity; /* Is there extra capacity in the group? */ |
4277 | }; | 4294 | }; |
4278 | 4295 | ||
4296 | /* | ||
4297 | * sd_lb_stats - Structure to store the statistics of a sched_domain | ||
4298 | * during load balancing. | ||
4299 | */ | ||
4300 | struct sd_lb_stats { | ||
4301 | struct sched_group *busiest; /* Busiest group in this sd */ | ||
4302 | struct sched_group *local; /* Local group in this sd */ | ||
4303 | unsigned long total_load; /* Total load of all groups in sd */ | ||
4304 | unsigned long total_pwr; /* Total power of all groups in sd */ | ||
4305 | unsigned long avg_load; /* Average load across all groups in sd */ | ||
4306 | |||
4307 | struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ | ||
4308 | struct sg_lb_stats local_stat; /* Statistics of the local group */ | ||
4309 | }; | ||
4310 | |||
4311 | static inline void init_sd_lb_stats(struct sd_lb_stats *sds) | ||
4312 | { | ||
4313 | /* | ||
4314 | * Skimp on the clearing to avoid duplicate work. We can avoid clearing | ||
4315 | * local_stat because update_sg_lb_stats() does a full clear/assignment. | ||
4316 | * We must however clear busiest_stat::avg_load because | ||
4317 | * update_sd_pick_busiest() reads this before assignment. | ||
4318 | */ | ||
4319 | *sds = (struct sd_lb_stats){ | ||
4320 | .busiest = NULL, | ||
4321 | .local = NULL, | ||
4322 | .total_load = 0UL, | ||
4323 | .total_pwr = 0UL, | ||
4324 | .busiest_stat = { | ||
4325 | .avg_load = 0UL, | ||
4326 | }, | ||
4327 | }; | ||
4328 | } | ||
4329 | |||
4279 | /** | 4330 | /** |
4280 | * get_sd_load_idx - Obtain the load index for a given sched domain. | 4331 | * get_sd_load_idx - Obtain the load index for a given sched domain. |
4281 | * @sd: The sched_domain whose load_idx is to be obtained. | 4332 | * @sd: The sched_domain whose load_idx is to be obtained. |
4282 | * @idle: The Idle status of the CPU for whose sd load_icx is obtained. | 4333 | * @idle: The Idle status of the CPU for whose sd load_icx is obtained. |
4334 | * | ||
4335 | * Return: The load index. | ||
4283 | */ | 4336 | */ |
4284 | static inline int get_sd_load_idx(struct sched_domain *sd, | 4337 | static inline int get_sd_load_idx(struct sched_domain *sd, |
4285 | enum cpu_idle_type idle) | 4338 | enum cpu_idle_type idle) |
@@ -4457,33 +4510,99 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
4457 | return 0; | 4510 | return 0; |
4458 | } | 4511 | } |
4459 | 4512 | ||
4513 | /* | ||
4514 | * Group imbalance indicates (and tries to solve) the problem where balancing | ||
4515 | * groups is inadequate due to tsk_cpus_allowed() constraints. | ||
4516 | * | ||
4517 | * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a | ||
4518 | * cpumask covering 1 cpu of the first group and 3 cpus of the second group. | ||
4519 | * Something like: | ||
4520 | * | ||
4521 | * { 0 1 2 3 } { 4 5 6 7 } | ||
4522 | * * * * * | ||
4523 | * | ||
4524 | * If we were to balance group-wise we'd place two tasks in the first group and | ||
4525 | * two tasks in the second group. Clearly this is undesired as it will overload | ||
4526 | * cpu 3 and leave one of the cpus in the second group unused. | ||
4527 | * | ||
4528 | * The current solution to this issue is detecting the skew in the first group | ||
4529 | * by noticing it has a cpu that is overloaded while the remaining cpus are | ||
4530 | * idle -- or rather, there's a distinct imbalance in the cpus; see | ||
4531 | * sg_imbalanced(). | ||
4532 | * | ||
4533 | * When this is so detected; this group becomes a candidate for busiest; see | ||
4534 | * update_sd_pick_busiest(). And calculate_imbalance() and | ||
4535 | * find_busiest_group() avoid some of the usual balance conditions to allow it | ||
4536 | * to create an effective group imbalance. | ||
4537 | * | ||
4538 | * This is a somewhat tricky proposition since the next run might not find the | ||
4539 | * group imbalance and decide the groups need to be balanced again. A most | ||
4540 | * subtle and fragile situation. | ||
4541 | */ | ||
4542 | |||
4543 | struct sg_imb_stats { | ||
4544 | unsigned long max_nr_running, min_nr_running; | ||
4545 | unsigned long max_cpu_load, min_cpu_load; | ||
4546 | }; | ||
4547 | |||
4548 | static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) | ||
4549 | { | ||
4550 | sgi->max_cpu_load = sgi->max_nr_running = 0UL; | ||
4551 | sgi->min_cpu_load = sgi->min_nr_running = ~0UL; | ||
4552 | } | ||
4553 | |||
4554 | static inline void | ||
4555 | update_sg_imb_stats(struct sg_imb_stats *sgi, | ||
4556 | unsigned long load, unsigned long nr_running) | ||
4557 | { | ||
4558 | if (load > sgi->max_cpu_load) | ||
4559 | sgi->max_cpu_load = load; | ||
4560 | if (sgi->min_cpu_load > load) | ||
4561 | sgi->min_cpu_load = load; | ||
4562 | |||
4563 | if (nr_running > sgi->max_nr_running) | ||
4564 | sgi->max_nr_running = nr_running; | ||
4565 | if (sgi->min_nr_running > nr_running) | ||
4566 | sgi->min_nr_running = nr_running; | ||
4567 | } | ||
4568 | |||
4569 | static inline int | ||
4570 | sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) | ||
4571 | { | ||
4572 | /* | ||
4573 | * Consider the group unbalanced when the imbalance is larger | ||
4574 | * than the average weight of a task. | ||
4575 | * | ||
4576 | * APZ: with cgroup the avg task weight can vary wildly and | ||
4577 | * might not be a suitable number - should we keep a | ||
4578 | * normalized nr_running number somewhere that negates | ||
4579 | * the hierarchy? | ||
4580 | */ | ||
4581 | if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && | ||
4582 | (sgi->max_nr_running - sgi->min_nr_running) > 1) | ||
4583 | return 1; | ||
4584 | |||
4585 | return 0; | ||
4586 | } | ||
4587 | |||
4460 | /** | 4588 | /** |
4461 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 4589 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
4462 | * @env: The load balancing environment. | 4590 | * @env: The load balancing environment. |
4463 | * @group: sched_group whose statistics are to be updated. | 4591 | * @group: sched_group whose statistics are to be updated. |
4464 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 4592 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
4465 | * @local_group: Does group contain this_cpu. | 4593 | * @local_group: Does group contain this_cpu. |
4466 | * @balance: Should we balance. | ||
4467 | * @sgs: variable to hold the statistics for this group. | 4594 | * @sgs: variable to hold the statistics for this group. |
4468 | */ | 4595 | */ |
4469 | static inline void update_sg_lb_stats(struct lb_env *env, | 4596 | static inline void update_sg_lb_stats(struct lb_env *env, |
4470 | struct sched_group *group, int load_idx, | 4597 | struct sched_group *group, int load_idx, |
4471 | int local_group, int *balance, struct sg_lb_stats *sgs) | 4598 | int local_group, struct sg_lb_stats *sgs) |
4472 | { | 4599 | { |
4473 | unsigned long nr_running, max_nr_running, min_nr_running; | 4600 | struct sg_imb_stats sgi; |
4474 | unsigned long load, max_cpu_load, min_cpu_load; | 4601 | unsigned long nr_running; |
4475 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 4602 | unsigned long load; |
4476 | unsigned long avg_load_per_task = 0; | ||
4477 | int i; | 4603 | int i; |
4478 | 4604 | ||
4479 | if (local_group) | 4605 | init_sg_imb_stats(&sgi); |
4480 | balance_cpu = group_balance_cpu(group); | ||
4481 | |||
4482 | /* Tally up the load of all CPUs in the group */ | ||
4483 | max_cpu_load = 0; | ||
4484 | min_cpu_load = ~0UL; | ||
4485 | max_nr_running = 0; | ||
4486 | min_nr_running = ~0UL; | ||
4487 | 4606 | ||
4488 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 4607 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
4489 | struct rq *rq = cpu_rq(i); | 4608 | struct rq *rq = cpu_rq(i); |
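To see the detection fire on the { 0 1 2 3 } { 4 5 6 7 } example from the comment above, here is a small standalone C run of the sg_imb_stats/sg_imbalanced() logic for the first group, with one overloaded cpu and three idle ones; the per-cpu loads are invented and group_load stands in for sum_weighted_load:

#include <stdio.h>

struct sg_imb_stats {
	unsigned long max_nr_running, min_nr_running;
	unsigned long max_cpu_load, min_cpu_load;
};

static void init_sg_imb_stats(struct sg_imb_stats *sgi)
{
	sgi->max_cpu_load = sgi->max_nr_running = 0UL;
	sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
}

static void update_sg_imb_stats(struct sg_imb_stats *sgi,
				unsigned long load, unsigned long nr_running)
{
	if (load > sgi->max_cpu_load)
		sgi->max_cpu_load = load;
	if (sgi->min_cpu_load > load)
		sgi->min_cpu_load = load;
	if (nr_running > sgi->max_nr_running)
		sgi->max_nr_running = nr_running;
	if (sgi->min_nr_running > nr_running)
		sgi->min_nr_running = nr_running;
}

static int sg_imbalanced(unsigned long load_per_task,
			 const struct sg_imb_stats *sgi)
{
	return (sgi->max_cpu_load - sgi->min_cpu_load) >= load_per_task &&
	       (sgi->max_nr_running - sgi->min_nr_running) > 1;
}

int main(void)
{
	/* Group { 0 1 2 3 }: cpu 0 runs all four pinned tasks, cpus 1-3 idle.
	 * Loads are made-up weighted_cpuload()-style numbers. */
	unsigned long load[4] = { 4096, 0, 0, 0 };
	unsigned long nr[4]   = { 4, 0, 0, 0 };
	struct sg_imb_stats sgi;
	unsigned long group_load = 0, sum_nr = 0;

	init_sg_imb_stats(&sgi);
	for (int i = 0; i < 4; i++) {
		update_sg_imb_stats(&sgi, load[i], nr[i]);
		group_load += load[i];
		sum_nr += nr[i];
	}
	/* average task weight, as in update_sg_lb_stats() */
	unsigned long load_per_task = sum_nr ? group_load / sum_nr : 0;

	printf("group_imb = %d\n", sg_imbalanced(load_per_task, &sgi));
	return 0;
}

The load spread (4096) exceeds the average task weight (1024) and the nr_running spread is greater than one, so the group is flagged imbalanced and becomes a busiest candidate even though its group-wide average looks moderate.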
@@ -4492,24 +4611,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
4492 | 4611 | ||
4493 | /* Bias balancing toward cpus of our domain */ | 4612 | /* Bias balancing toward cpus of our domain */ |
4494 | if (local_group) { | 4613 | if (local_group) { |
4495 | if (idle_cpu(i) && !first_idle_cpu && | ||
4496 | cpumask_test_cpu(i, sched_group_mask(group))) { | ||
4497 | first_idle_cpu = 1; | ||
4498 | balance_cpu = i; | ||
4499 | } | ||
4500 | |||
4501 | load = target_load(i, load_idx); | 4614 | load = target_load(i, load_idx); |
4502 | } else { | 4615 | } else { |
4503 | load = source_load(i, load_idx); | 4616 | load = source_load(i, load_idx); |
4504 | if (load > max_cpu_load) | 4617 | update_sg_imb_stats(&sgi, load, nr_running); |
4505 | max_cpu_load = load; | ||
4506 | if (min_cpu_load > load) | ||
4507 | min_cpu_load = load; | ||
4508 | |||
4509 | if (nr_running > max_nr_running) | ||
4510 | max_nr_running = nr_running; | ||
4511 | if (min_nr_running > nr_running) | ||
4512 | min_nr_running = nr_running; | ||
4513 | } | 4618 | } |
4514 | 4619 | ||
4515 | sgs->group_load += load; | 4620 | sgs->group_load += load; |
@@ -4519,46 +4624,25 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
4519 | sgs->idle_cpus++; | 4624 | sgs->idle_cpus++; |
4520 | } | 4625 | } |
4521 | 4626 | ||
4522 | /* | 4627 | if (local_group && (env->idle != CPU_NEWLY_IDLE || |
4523 | * First idle cpu or the first cpu(busiest) in this sched group | 4628 | time_after_eq(jiffies, group->sgp->next_update))) |
4524 | * is eligible for doing load balancing at this and above | 4629 | update_group_power(env->sd, env->dst_cpu); |
4525 | * domains. In the newly idle case, we will allow all the cpu's | ||
4526 | * to do the newly idle load balance. | ||
4527 | */ | ||
4528 | if (local_group) { | ||
4529 | if (env->idle != CPU_NEWLY_IDLE) { | ||
4530 | if (balance_cpu != env->dst_cpu) { | ||
4531 | *balance = 0; | ||
4532 | return; | ||
4533 | } | ||
4534 | update_group_power(env->sd, env->dst_cpu); | ||
4535 | } else if (time_after_eq(jiffies, group->sgp->next_update)) | ||
4536 | update_group_power(env->sd, env->dst_cpu); | ||
4537 | } | ||
4538 | 4630 | ||
4539 | /* Adjust by relative CPU power of the group */ | 4631 | /* Adjust by relative CPU power of the group */ |
4540 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; | 4632 | sgs->group_power = group->sgp->power; |
4633 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; | ||
4541 | 4634 | ||
4542 | /* | ||
4543 | * Consider the group unbalanced when the imbalance is larger | ||
4544 | * than the average weight of a task. | ||
4545 | * | ||
4546 | * APZ: with cgroup the avg task weight can vary wildly and | ||
4547 | * might not be a suitable number - should we keep a | ||
4548 | * normalized nr_running number somewhere that negates | ||
4549 | * the hierarchy? | ||
4550 | */ | ||
4551 | if (sgs->sum_nr_running) | 4635 | if (sgs->sum_nr_running) |
4552 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 4636 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
4637 | |||
4638 | sgs->group_imb = sg_imbalanced(sgs, &sgi); | ||
4553 | 4639 | ||
4554 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && | 4640 | sgs->group_capacity = |
4555 | (max_nr_running - min_nr_running) > 1) | 4641 | DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); |
4556 | sgs->group_imb = 1; | ||
4557 | 4642 | ||
4558 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, | ||
4559 | SCHED_POWER_SCALE); | ||
4560 | if (!sgs->group_capacity) | 4643 | if (!sgs->group_capacity) |
4561 | sgs->group_capacity = fix_small_capacity(env->sd, group); | 4644 | sgs->group_capacity = fix_small_capacity(env->sd, group); |
4645 | |||
4562 | sgs->group_weight = group->group_weight; | 4646 | sgs->group_weight = group->group_weight; |
4563 | 4647 | ||
4564 | if (sgs->group_capacity > sgs->sum_nr_running) | 4648 | if (sgs->group_capacity > sgs->sum_nr_running) |
@@ -4574,13 +4658,16 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
4574 | * | 4658 | * |
4575 | * Determine if @sg is a busier group than the previously selected | 4659 | * Determine if @sg is a busier group than the previously selected |
4576 | * busiest group. | 4660 | * busiest group. |
4661 | * | ||
4662 | * Return: %true if @sg is a busier group than the previously selected | ||
4663 | * busiest group. %false otherwise. | ||
4577 | */ | 4664 | */ |
4578 | static bool update_sd_pick_busiest(struct lb_env *env, | 4665 | static bool update_sd_pick_busiest(struct lb_env *env, |
4579 | struct sd_lb_stats *sds, | 4666 | struct sd_lb_stats *sds, |
4580 | struct sched_group *sg, | 4667 | struct sched_group *sg, |
4581 | struct sg_lb_stats *sgs) | 4668 | struct sg_lb_stats *sgs) |
4582 | { | 4669 | { |
4583 | if (sgs->avg_load <= sds->max_load) | 4670 | if (sgs->avg_load <= sds->busiest_stat.avg_load) |
4584 | return false; | 4671 | return false; |
4585 | 4672 | ||
4586 | if (sgs->sum_nr_running > sgs->group_capacity) | 4673 | if (sgs->sum_nr_running > sgs->group_capacity) |
@@ -4613,11 +4700,11 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
4613 | * @sds: variable to hold the statistics for this sched_domain. | 4700 | * @sds: variable to hold the statistics for this sched_domain. |
4614 | */ | 4701 | */ |
4615 | static inline void update_sd_lb_stats(struct lb_env *env, | 4702 | static inline void update_sd_lb_stats(struct lb_env *env, |
4616 | int *balance, struct sd_lb_stats *sds) | 4703 | struct sd_lb_stats *sds) |
4617 | { | 4704 | { |
4618 | struct sched_domain *child = env->sd->child; | 4705 | struct sched_domain *child = env->sd->child; |
4619 | struct sched_group *sg = env->sd->groups; | 4706 | struct sched_group *sg = env->sd->groups; |
4620 | struct sg_lb_stats sgs; | 4707 | struct sg_lb_stats tmp_sgs; |
4621 | int load_idx, prefer_sibling = 0; | 4708 | int load_idx, prefer_sibling = 0; |
4622 | 4709 | ||
4623 | if (child && child->flags & SD_PREFER_SIBLING) | 4710 | if (child && child->flags & SD_PREFER_SIBLING) |
@@ -4626,17 +4713,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
4626 | load_idx = get_sd_load_idx(env->sd, env->idle); | 4713 | load_idx = get_sd_load_idx(env->sd, env->idle); |
4627 | 4714 | ||
4628 | do { | 4715 | do { |
4716 | struct sg_lb_stats *sgs = &tmp_sgs; | ||
4629 | int local_group; | 4717 | int local_group; |
4630 | 4718 | ||
4631 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); | 4719 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); |
4632 | memset(&sgs, 0, sizeof(sgs)); | 4720 | if (local_group) { |
4633 | update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); | 4721 | sds->local = sg; |
4634 | 4722 | sgs = &sds->local_stat; | |
4635 | if (local_group && !(*balance)) | 4723 | } |
4636 | return; | ||
4637 | 4724 | ||
4638 | sds->total_load += sgs.group_load; | 4725 | memset(sgs, 0, sizeof(*sgs)); |
4639 | sds->total_pwr += sg->sgp->power; | 4726 | update_sg_lb_stats(env, sg, load_idx, local_group, sgs); |
4640 | 4727 | ||
4641 | /* | 4728 | /* |
4642 | * In case the child domain prefers tasks go to siblings | 4729 | * In case the child domain prefers tasks go to siblings |
@@ -4648,26 +4735,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
4648 | * heaviest group when it is already under-utilized (possible | 4735 | * heaviest group when it is already under-utilized (possible |
4649 | * with a large weight task outweighs the tasks on the system). | 4736 | * with a large weight task outweighs the tasks on the system). |
4650 | */ | 4737 | */ |
4651 | if (prefer_sibling && !local_group && sds->this_has_capacity) | 4738 | if (prefer_sibling && !local_group && |
4652 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | 4739 | sds->local && sds->local_stat.group_has_capacity) |
4740 | sgs->group_capacity = min(sgs->group_capacity, 1U); | ||
4653 | 4741 | ||
4654 | if (local_group) { | 4742 | /* Now, start updating sd_lb_stats */ |
4655 | sds->this_load = sgs.avg_load; | 4743 | sds->total_load += sgs->group_load; |
4656 | sds->this = sg; | 4744 | sds->total_pwr += sgs->group_power; |
4657 | sds->this_nr_running = sgs.sum_nr_running; | 4745 | |
4658 | sds->this_load_per_task = sgs.sum_weighted_load; | 4746 | if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { |
4659 | sds->this_has_capacity = sgs.group_has_capacity; | ||
4660 | sds->this_idle_cpus = sgs.idle_cpus; | ||
4661 | } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { | ||
4662 | sds->max_load = sgs.avg_load; | ||
4663 | sds->busiest = sg; | 4747 | sds->busiest = sg; |
4664 | sds->busiest_nr_running = sgs.sum_nr_running; | 4748 | sds->busiest_stat = *sgs; |
4665 | sds->busiest_idle_cpus = sgs.idle_cpus; | ||
4666 | sds->busiest_group_capacity = sgs.group_capacity; | ||
4667 | sds->busiest_load_per_task = sgs.sum_weighted_load; | ||
4668 | sds->busiest_has_capacity = sgs.group_has_capacity; | ||
4669 | sds->busiest_group_weight = sgs.group_weight; | ||
4670 | sds->group_imb = sgs.group_imb; | ||
4671 | } | 4749 | } |
4672 | 4750 | ||
4673 | sg = sg->next; | 4751 | sg = sg->next; |
@@ -4691,7 +4769,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
4691 | * assuming lower CPU number will be equivalent to lower a SMT thread | 4769 | * assuming lower CPU number will be equivalent to lower a SMT thread |
4692 | * number. | 4770 | * number. |
4693 | * | 4771 | * |
4694 | * Returns 1 when packing is required and a task should be moved to | 4772 | * Return: 1 when packing is required and a task should be moved to |
4695 | * this CPU. The amount of the imbalance is returned in *imbalance. | 4773 | * this CPU. The amount of the imbalance is returned in *imbalance. |
4696 | * | 4774 | * |
4697 | * @env: The load balancing environment. | 4775 | * @env: The load balancing environment. |
@@ -4712,7 +4790,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | |||
4712 | return 0; | 4790 | return 0; |
4713 | 4791 | ||
4714 | env->imbalance = DIV_ROUND_CLOSEST( | 4792 | env->imbalance = DIV_ROUND_CLOSEST( |
4715 | sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); | 4793 | sds->busiest_stat.avg_load * sds->busiest_stat.group_power, |
4794 | SCHED_POWER_SCALE); | ||
4716 | 4795 | ||
4717 | return 1; | 4796 | return 1; |
4718 | } | 4797 | } |
@@ -4730,24 +4809,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
4730 | unsigned long tmp, pwr_now = 0, pwr_move = 0; | 4809 | unsigned long tmp, pwr_now = 0, pwr_move = 0; |
4731 | unsigned int imbn = 2; | 4810 | unsigned int imbn = 2; |
4732 | unsigned long scaled_busy_load_per_task; | 4811 | unsigned long scaled_busy_load_per_task; |
4812 | struct sg_lb_stats *local, *busiest; | ||
4733 | 4813 | ||
4734 | if (sds->this_nr_running) { | 4814 | local = &sds->local_stat; |
4735 | sds->this_load_per_task /= sds->this_nr_running; | 4815 | busiest = &sds->busiest_stat; |
4736 | if (sds->busiest_load_per_task > | 4816 | |
4737 | sds->this_load_per_task) | 4817 | if (!local->sum_nr_running) |
4738 | imbn = 1; | 4818 | local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); |
4739 | } else { | 4819 | else if (busiest->load_per_task > local->load_per_task) |
4740 | sds->this_load_per_task = | 4820 | imbn = 1; |
4741 | cpu_avg_load_per_task(env->dst_cpu); | ||
4742 | } | ||
4743 | 4821 | ||
4744 | scaled_busy_load_per_task = sds->busiest_load_per_task | 4822 | scaled_busy_load_per_task = |
4745 | * SCHED_POWER_SCALE; | 4823 | (busiest->load_per_task * SCHED_POWER_SCALE) / |
4746 | scaled_busy_load_per_task /= sds->busiest->sgp->power; | 4824 | busiest->group_power; |
4747 | 4825 | ||
4748 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | 4826 | if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >= |
4749 | (scaled_busy_load_per_task * imbn)) { | 4827 | (scaled_busy_load_per_task * imbn)) { |
4750 | env->imbalance = sds->busiest_load_per_task; | 4828 | env->imbalance = busiest->load_per_task; |
4751 | return; | 4829 | return; |
4752 | } | 4830 | } |
4753 | 4831 | ||
@@ -4757,34 +4835,37 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
4757 | * moving them. | 4835 | * moving them. |
4758 | */ | 4836 | */ |
4759 | 4837 | ||
4760 | pwr_now += sds->busiest->sgp->power * | 4838 | pwr_now += busiest->group_power * |
4761 | min(sds->busiest_load_per_task, sds->max_load); | 4839 | min(busiest->load_per_task, busiest->avg_load); |
4762 | pwr_now += sds->this->sgp->power * | 4840 | pwr_now += local->group_power * |
4763 | min(sds->this_load_per_task, sds->this_load); | 4841 | min(local->load_per_task, local->avg_load); |
4764 | pwr_now /= SCHED_POWER_SCALE; | 4842 | pwr_now /= SCHED_POWER_SCALE; |
4765 | 4843 | ||
4766 | /* Amount of load we'd subtract */ | 4844 | /* Amount of load we'd subtract */ |
4767 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / | 4845 | tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / |
4768 | sds->busiest->sgp->power; | 4846 | busiest->group_power; |
4769 | if (sds->max_load > tmp) | 4847 | if (busiest->avg_load > tmp) { |
4770 | pwr_move += sds->busiest->sgp->power * | 4848 | pwr_move += busiest->group_power * |
4771 | min(sds->busiest_load_per_task, sds->max_load - tmp); | 4849 | min(busiest->load_per_task, |
4850 | busiest->avg_load - tmp); | ||
4851 | } | ||
4772 | 4852 | ||
4773 | /* Amount of load we'd add */ | 4853 | /* Amount of load we'd add */ |
4774 | if (sds->max_load * sds->busiest->sgp->power < | 4854 | if (busiest->avg_load * busiest->group_power < |
4775 | sds->busiest_load_per_task * SCHED_POWER_SCALE) | 4855 | busiest->load_per_task * SCHED_POWER_SCALE) { |
4776 | tmp = (sds->max_load * sds->busiest->sgp->power) / | 4856 | tmp = (busiest->avg_load * busiest->group_power) / |
4777 | sds->this->sgp->power; | 4857 | local->group_power; |
4778 | else | 4858 | } else { |
4779 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / | 4859 | tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / |
4780 | sds->this->sgp->power; | 4860 | local->group_power; |
4781 | pwr_move += sds->this->sgp->power * | 4861 | } |
4782 | min(sds->this_load_per_task, sds->this_load + tmp); | 4862 | pwr_move += local->group_power * |
4863 | min(local->load_per_task, local->avg_load + tmp); | ||
4783 | pwr_move /= SCHED_POWER_SCALE; | 4864 | pwr_move /= SCHED_POWER_SCALE; |
4784 | 4865 | ||
4785 | /* Move if we gain throughput */ | 4866 | /* Move if we gain throughput */ |
4786 | if (pwr_move > pwr_now) | 4867 | if (pwr_move > pwr_now) |
4787 | env->imbalance = sds->busiest_load_per_task; | 4868 | env->imbalance = busiest->load_per_task; |
4788 | } | 4869 | } |
4789 | 4870 | ||
4790 | /** | 4871 | /** |
@@ -4796,11 +4877,18 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | |||
4796 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | 4877 | static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
4797 | { | 4878 | { |
4798 | unsigned long max_pull, load_above_capacity = ~0UL; | 4879 | unsigned long max_pull, load_above_capacity = ~0UL; |
4880 | struct sg_lb_stats *local, *busiest; | ||
4799 | 4881 | ||
4800 | sds->busiest_load_per_task /= sds->busiest_nr_running; | 4882 | local = &sds->local_stat; |
4801 | if (sds->group_imb) { | 4883 | busiest = &sds->busiest_stat; |
4802 | sds->busiest_load_per_task = | 4884 | |
4803 | min(sds->busiest_load_per_task, sds->avg_load); | 4885 | if (busiest->group_imb) { |
4886 | /* | ||
4887 | * In the group_imb case we cannot rely on group-wide averages | ||
4888 | * to ensure cpu-load equilibrium, look at wider averages. XXX | ||
4889 | */ | ||
4890 | busiest->load_per_task = | ||
4891 | min(busiest->load_per_task, sds->avg_load); | ||
4804 | } | 4892 | } |
4805 | 4893 | ||
4806 | /* | 4894 | /* |
@@ -4808,21 +4896,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4808 | * max load less than avg load(as we skip the groups at or below | 4896 | * max load less than avg load(as we skip the groups at or below |
4809 | * its cpu_power, while calculating max_load..) | 4897 | * its cpu_power, while calculating max_load..) |
4810 | */ | 4898 | */ |
4811 | if (sds->max_load < sds->avg_load) { | 4899 | if (busiest->avg_load < sds->avg_load) { |
4812 | env->imbalance = 0; | 4900 | env->imbalance = 0; |
4813 | return fix_small_imbalance(env, sds); | 4901 | return fix_small_imbalance(env, sds); |
4814 | } | 4902 | } |
4815 | 4903 | ||
4816 | if (!sds->group_imb) { | 4904 | if (!busiest->group_imb) { |
4817 | /* | 4905 | /* |
4818 | * Don't want to pull so many tasks that a group would go idle. | 4906 | * Don't want to pull so many tasks that a group would go idle. |
4907 | * Except of course for the group_imb case, since then we might | ||
4908 | * have to drop below capacity to reach cpu-load equilibrium. | ||
4819 | */ | 4909 | */ |
4820 | load_above_capacity = (sds->busiest_nr_running - | 4910 | load_above_capacity = |
4821 | sds->busiest_group_capacity); | 4911 | (busiest->sum_nr_running - busiest->group_capacity); |
4822 | 4912 | ||
4823 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); | 4913 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); |
4824 | 4914 | load_above_capacity /= busiest->group_power; | |
4825 | load_above_capacity /= sds->busiest->sgp->power; | ||
4826 | } | 4915 | } |
4827 | 4916 | ||
4828 | /* | 4917 | /* |
@@ -4832,15 +4921,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4832 | * we also don't want to reduce the group load below the group capacity | 4921 | * we also don't want to reduce the group load below the group capacity |
4833 | * (so that we can implement power-savings policies etc). Thus we look | 4922 | * (so that we can implement power-savings policies etc). Thus we look |
4834 | * for the minimum possible imbalance. | 4923 | * for the minimum possible imbalance. |
4835 | * Be careful of negative numbers as they'll appear as very large values | ||
4836 | * with unsigned longs. | ||
4837 | */ | 4924 | */ |
4838 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | 4925 | max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); |
4839 | 4926 | ||
4840 | /* How much load to actually move to equalise the imbalance */ | 4927 | /* How much load to actually move to equalise the imbalance */ |
4841 | env->imbalance = min(max_pull * sds->busiest->sgp->power, | 4928 | env->imbalance = min( |
4842 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) | 4929 | max_pull * busiest->group_power, |
4843 | / SCHED_POWER_SCALE; | 4930 | (sds->avg_load - local->avg_load) * local->group_power |
4931 | ) / SCHED_POWER_SCALE; | ||
4844 | 4932 | ||
4845 | /* | 4933 | /* |
4846 | * if *imbalance is less than the average load per runnable task | 4934 | * if *imbalance is less than the average load per runnable task |
@@ -4848,9 +4936,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4848 | * a think about bumping its value to force at least one task to be | 4936 | * a think about bumping its value to force at least one task to be |
4849 | * moved | 4937 | * moved |
4850 | */ | 4938 | */ |
4851 | if (env->imbalance < sds->busiest_load_per_task) | 4939 | if (env->imbalance < busiest->load_per_task) |
4852 | return fix_small_imbalance(env, sds); | 4940 | return fix_small_imbalance(env, sds); |
4853 | |||
4854 | } | 4941 | } |
4855 | 4942 | ||
4856 | /******* find_busiest_group() helpers end here *********************/ | 4943 | /******* find_busiest_group() helpers end here *********************/ |
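For the non-group_imb path of calculate_imbalance() above, a short numeric walk helps: with invented group statistics (and busiest avg_load >= domain avg_load >= local avg_load, which the earlier checks in find_busiest_group() guarantee), the amount to move is capped both by what keeps the busiest group at capacity and by the room the local group has below the domain average:

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL
#define SCHED_LOAD_SCALE  1024UL

int main(void)
{
	/* Invented stats for one busiest and one local group. */
	unsigned long busiest_avg_load = 1536, busiest_power = 2048;
	unsigned long busiest_nr = 5, busiest_capacity = 2;
	unsigned long local_avg_load = 512, local_power = 2048;
	unsigned long sd_avg_load = 1024;	/* domain-wide average */

	/* Don't pull so much that the busiest group drops below capacity. */
	unsigned long load_above_capacity =
		(busiest_nr - busiest_capacity) * SCHED_LOAD_SCALE *
		SCHED_POWER_SCALE / busiest_power;

	unsigned long max_pull = busiest_avg_load - sd_avg_load;
	if (load_above_capacity < max_pull)
		max_pull = load_above_capacity;

	/* How much load to actually move to equalise the imbalance. */
	unsigned long imbalance = max_pull * busiest_power;
	unsigned long room = (sd_avg_load - local_avg_load) * local_power;
	if (room < imbalance)
		imbalance = room;
	imbalance /= SCHED_POWER_SCALE;

	printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);
	return 0;
}

With these numbers max_pull is 512 and the resulting imbalance is 1024; if that had come out smaller than busiest->load_per_task, fix_small_imbalance() would take over, as in the code above.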
@@ -4866,69 +4953,62 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4866 | * to restore balance. | 4953 | * to restore balance. |
4867 | * | 4954 | * |
4868 | * @env: The load balancing environment. | 4955 | * @env: The load balancing environment. |
4869 | * @balance: Pointer to a variable indicating if this_cpu | ||
4870 | * is the appropriate cpu to perform load balancing at this_level. | ||
4871 | * | 4956 | * |
4872 | * Returns: - the busiest group if imbalance exists. | 4957 | * Return: - The busiest group if imbalance exists. |
4873 | * - If no imbalance and user has opted for power-savings balance, | 4958 | * - If no imbalance and user has opted for power-savings balance, |
4874 | * return the least loaded group whose CPUs can be | 4959 | * return the least loaded group whose CPUs can be |
4875 | * put to idle by rebalancing its tasks onto our group. | 4960 | * put to idle by rebalancing its tasks onto our group. |
4876 | */ | 4961 | */ |
4877 | static struct sched_group * | 4962 | static struct sched_group *find_busiest_group(struct lb_env *env) |
4878 | find_busiest_group(struct lb_env *env, int *balance) | ||
4879 | { | 4963 | { |
4964 | struct sg_lb_stats *local, *busiest; | ||
4880 | struct sd_lb_stats sds; | 4965 | struct sd_lb_stats sds; |
4881 | 4966 | ||
4882 | memset(&sds, 0, sizeof(sds)); | 4967 | init_sd_lb_stats(&sds); |
4883 | 4968 | ||
4884 | /* | 4969 | /* |
4885 | * Compute the various statistics relevant for load balancing at | 4970 | * Compute the various statistics relevant for load balancing at |
4886 | * this level. | 4971 | * this level. |
4887 | */ | 4972 | */ |
4888 | update_sd_lb_stats(env, balance, &sds); | 4973 | update_sd_lb_stats(env, &sds); |
4889 | 4974 | local = &sds.local_stat; | |
4890 | /* | 4975 | busiest = &sds.busiest_stat; |
4891 | * this_cpu is not the appropriate cpu to perform load balancing at | ||
4892 | * this level. | ||
4893 | */ | ||
4894 | if (!(*balance)) | ||
4895 | goto ret; | ||
4896 | 4976 | ||
4897 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && | 4977 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && |
4898 | check_asym_packing(env, &sds)) | 4978 | check_asym_packing(env, &sds)) |
4899 | return sds.busiest; | 4979 | return sds.busiest; |
4900 | 4980 | ||
4901 | /* There is no busy sibling group to pull tasks from */ | 4981 | /* There is no busy sibling group to pull tasks from */ |
4902 | if (!sds.busiest || sds.busiest_nr_running == 0) | 4982 | if (!sds.busiest || busiest->sum_nr_running == 0) |
4903 | goto out_balanced; | 4983 | goto out_balanced; |
4904 | 4984 | ||
4905 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; | 4985 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; |
4906 | 4986 | ||
4907 | /* | 4987 | /* |
4908 | * If the busiest group is imbalanced the below checks don't | 4988 | * If the busiest group is imbalanced the below checks don't |
4909 | * work because they assumes all things are equal, which typically | 4989 | * work because they assume all things are equal, which typically |
4910 | * isn't true due to cpus_allowed constraints and the like. | 4990 | * isn't true due to cpus_allowed constraints and the like. |
4911 | */ | 4991 | */ |
4912 | if (sds.group_imb) | 4992 | if (busiest->group_imb) |
4913 | goto force_balance; | 4993 | goto force_balance; |
4914 | 4994 | ||
4915 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 4995 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ |
4916 | if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | 4996 | if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && |
4917 | !sds.busiest_has_capacity) | 4997 | !busiest->group_has_capacity) |
4918 | goto force_balance; | 4998 | goto force_balance; |
4919 | 4999 | ||
4920 | /* | 5000 | /* |
4921 | * If the local group is more busy than the selected busiest group | 5001 | * If the local group is more busy than the selected busiest group |
4922 | * don't try and pull any tasks. | 5002 | * don't try and pull any tasks. |
4923 | */ | 5003 | */ |
4924 | if (sds.this_load >= sds.max_load) | 5004 | if (local->avg_load >= busiest->avg_load) |
4925 | goto out_balanced; | 5005 | goto out_balanced; |
4926 | 5006 | ||
4927 | /* | 5007 | /* |
4928 | * Don't pull any tasks if this group is already above the domain | 5008 | * Don't pull any tasks if this group is already above the domain |
4929 | * average load. | 5009 | * average load. |
4930 | */ | 5010 | */ |
4931 | if (sds.this_load >= sds.avg_load) | 5011 | if (local->avg_load >= sds.avg_load) |
4932 | goto out_balanced; | 5012 | goto out_balanced; |
4933 | 5013 | ||
4934 | if (env->idle == CPU_IDLE) { | 5014 | if (env->idle == CPU_IDLE) { |
@@ -4938,15 +5018,16 @@ find_busiest_group(struct lb_env *env, int *balance) | |||
4938 | * there is no imbalance between this and busiest group | 5018 | * there is no imbalance between this and busiest group |
4939 | * wrt to idle cpu's, it is balanced. | 5019 | * wrt to idle cpu's, it is balanced. |
4940 | */ | 5020 | */ |
4941 | if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && | 5021 | if ((local->idle_cpus < busiest->idle_cpus) && |
4942 | sds.busiest_nr_running <= sds.busiest_group_weight) | 5022 | busiest->sum_nr_running <= busiest->group_weight) |
4943 | goto out_balanced; | 5023 | goto out_balanced; |
4944 | } else { | 5024 | } else { |
4945 | /* | 5025 | /* |
4946 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use | 5026 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use |
4947 | * imbalance_pct to be conservative. | 5027 | * imbalance_pct to be conservative. |
4948 | */ | 5028 | */ |
4949 | if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) | 5029 | if (100 * busiest->avg_load <= |
5030 | env->sd->imbalance_pct * local->avg_load) | ||
4950 | goto out_balanced; | 5031 | goto out_balanced; |
4951 | } | 5032 | } |
4952 | 5033 | ||
@@ -4956,7 +5037,6 @@ force_balance: | |||
4956 | return sds.busiest; | 5037 | return sds.busiest; |
4957 | 5038 | ||
4958 | out_balanced: | 5039 | out_balanced: |
4959 | ret: | ||
4960 | env->imbalance = 0; | 5040 | env->imbalance = 0; |
4961 | return NULL; | 5041 | return NULL; |
4962 | } | 5042 | } |
@@ -4968,10 +5048,10 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
4968 | struct sched_group *group) | 5048 | struct sched_group *group) |
4969 | { | 5049 | { |
4970 | struct rq *busiest = NULL, *rq; | 5050 | struct rq *busiest = NULL, *rq; |
4971 | unsigned long max_load = 0; | 5051 | unsigned long busiest_load = 0, busiest_power = 1; |
4972 | int i; | 5052 | int i; |
4973 | 5053 | ||
4974 | for_each_cpu(i, sched_group_cpus(group)) { | 5054 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
4975 | unsigned long power = power_of(i); | 5055 | unsigned long power = power_of(i); |
4976 | unsigned long capacity = DIV_ROUND_CLOSEST(power, | 5056 | unsigned long capacity = DIV_ROUND_CLOSEST(power, |
4977 | SCHED_POWER_SCALE); | 5057 | SCHED_POWER_SCALE); |
@@ -4980,9 +5060,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
4980 | if (!capacity) | 5060 | if (!capacity) |
4981 | capacity = fix_small_capacity(env->sd, group); | 5061 | capacity = fix_small_capacity(env->sd, group); |
4982 | 5062 | ||
4983 | if (!cpumask_test_cpu(i, env->cpus)) | ||
4984 | continue; | ||
4985 | |||
4986 | rq = cpu_rq(i); | 5063 | rq = cpu_rq(i); |
4987 | wl = weighted_cpuload(i); | 5064 | wl = weighted_cpuload(i); |
4988 | 5065 | ||
@@ -4998,11 +5075,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
4998 | * the weighted_cpuload() scaled with the cpu power, so that | 5075 | * the weighted_cpuload() scaled with the cpu power, so that |
4999 | * the load can be moved away from the cpu that is potentially | 5076 | * the load can be moved away from the cpu that is potentially |
5000 | * running at a lower capacity. | 5077 | * running at a lower capacity. |
5078 | * | ||
5079 | * Thus we're looking for max(wl_i / power_i), crosswise | ||
5080 | * multiplication to rid ourselves of the division works out | ||
5081 | * to: wl_i * power_j > wl_j * power_i; where j is our | ||
5082 | * previous maximum. | ||
5001 | */ | 5083 | */ |
5002 | wl = (wl * SCHED_POWER_SCALE) / power; | 5084 | if (wl * busiest_power > busiest_load * power) { |
5003 | 5085 | busiest_load = wl; | |
5004 | if (wl > max_load) { | 5086 | busiest_power = power; |
5005 | max_load = wl; | ||
5006 | busiest = rq; | 5087 | busiest = rq; |
5007 | } | 5088 | } |
5008 | } | 5089 | } |
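The loop above drops the per-cpu division: the comparison wl_i/power_i > wl_j/power_j is done by cross multiplication, which also avoids the precision loss of scaling by SCHED_POWER_SCALE first. A minimal standalone C version of that selection loop, with invented loads and cpu powers:

#include <stdio.h>

int main(void)
{
	/* weighted_cpuload() and power_of() style numbers, invented. */
	unsigned long wl[]    = { 3000, 2000, 2500 };
	unsigned long power[] = { 2048,  512, 1024 };
	unsigned long busiest_load = 0, busiest_power = 1;
	int busiest = -1;

	for (int i = 0; i < 3; i++) {
		/* wl_i / power_i > wl_j / power_j  <=>  wl_i * power_j > wl_j * power_i */
		if (wl[i] * busiest_power > busiest_load * power[i]) {
			busiest_load = wl[i];
			busiest_power = power[i];
			busiest = i;
		}
	}
	printf("busiest cpu = %d\n", busiest);
	return 0;
}

cpu 1 wins here because 2000/512 is the largest load-per-power ratio even though cpu 0 carries the largest raw load, which is exactly the case the comment describes.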
@@ -5039,13 +5120,47 @@ static int need_active_balance(struct lb_env *env) | |||
5039 | 5120 | ||
5040 | static int active_load_balance_cpu_stop(void *data); | 5121 | static int active_load_balance_cpu_stop(void *data); |
5041 | 5122 | ||
5123 | static int should_we_balance(struct lb_env *env) | ||
5124 | { | ||
5125 | struct sched_group *sg = env->sd->groups; | ||
5126 | struct cpumask *sg_cpus, *sg_mask; | ||
5127 | int cpu, balance_cpu = -1; | ||
5128 | |||
5129 | /* | ||
5130 | * In the newly idle case, we will allow all the cpus | ||
5131 | * to do the newly idle load balance. | ||
5132 | */ | ||
5133 | if (env->idle == CPU_NEWLY_IDLE) | ||
5134 | return 1; | ||
5135 | |||
5136 | sg_cpus = sched_group_cpus(sg); | ||
5137 | sg_mask = sched_group_mask(sg); | ||
5138 | /* Try to find first idle cpu */ | ||
5139 | for_each_cpu_and(cpu, sg_cpus, env->cpus) { | ||
5140 | if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) | ||
5141 | continue; | ||
5142 | |||
5143 | balance_cpu = cpu; | ||
5144 | break; | ||
5145 | } | ||
5146 | |||
5147 | if (balance_cpu == -1) | ||
5148 | balance_cpu = group_balance_cpu(sg); | ||
5149 | |||
5150 | /* | ||
5151 | * First idle cpu or the first cpu(busiest) in this sched group | ||
5152 | * is eligible for doing load balancing at this and above domains. | ||
5153 | */ | ||
5154 | return balance_cpu != env->dst_cpu; | ||
5155 | } | ||
5156 | |||
5042 | /* | 5157 | /* |
5043 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 5158 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
5044 | * tasks if there is an imbalance. | 5159 | * tasks if there is an imbalance. |
5045 | */ | 5160 | */ |
5046 | static int load_balance(int this_cpu, struct rq *this_rq, | 5161 | static int load_balance(int this_cpu, struct rq *this_rq, |
5047 | struct sched_domain *sd, enum cpu_idle_type idle, | 5162 | struct sched_domain *sd, enum cpu_idle_type idle, |
5048 | int *balance) | 5163 | int *continue_balancing) |
5049 | { | 5164 | { |
5050 | int ld_moved, cur_ld_moved, active_balance = 0; | 5165 | int ld_moved, cur_ld_moved, active_balance = 0; |
5051 | struct sched_group *group; | 5166 | struct sched_group *group; |
@@ -5075,11 +5190,12 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
5075 | schedstat_inc(sd, lb_count[idle]); | 5190 | schedstat_inc(sd, lb_count[idle]); |
5076 | 5191 | ||
5077 | redo: | 5192 | redo: |
5078 | group = find_busiest_group(&env, balance); | 5193 | if (!should_we_balance(&env)) { |
5079 | 5194 | *continue_balancing = 0; | |
5080 | if (*balance == 0) | ||
5081 | goto out_balanced; | 5195 | goto out_balanced; |
5196 | } | ||
5082 | 5197 | ||
5198 | group = find_busiest_group(&env); | ||
5083 | if (!group) { | 5199 | if (!group) { |
5084 | schedstat_inc(sd, lb_nobusyg[idle]); | 5200 | schedstat_inc(sd, lb_nobusyg[idle]); |
5085 | goto out_balanced; | 5201 | goto out_balanced; |
@@ -5108,7 +5224,6 @@ redo: | |||
5108 | env.src_rq = busiest; | 5224 | env.src_rq = busiest; |
5109 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 5225 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
5110 | 5226 | ||
5111 | update_h_load(env.src_cpu); | ||
5112 | more_balance: | 5227 | more_balance: |
5113 | local_irq_save(flags); | 5228 | local_irq_save(flags); |
5114 | double_rq_lock(env.dst_rq, busiest); | 5229 | double_rq_lock(env.dst_rq, busiest); |
@@ -5292,7 +5407,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5292 | rcu_read_lock(); | 5407 | rcu_read_lock(); |
5293 | for_each_domain(this_cpu, sd) { | 5408 | for_each_domain(this_cpu, sd) { |
5294 | unsigned long interval; | 5409 | unsigned long interval; |
5295 | int balance = 1; | 5410 | int continue_balancing = 1; |
5296 | 5411 | ||
5297 | if (!(sd->flags & SD_LOAD_BALANCE)) | 5412 | if (!(sd->flags & SD_LOAD_BALANCE)) |
5298 | continue; | 5413 | continue; |
@@ -5300,7 +5415,8 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5300 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 5415 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
5301 | /* If we've pulled tasks over stop searching: */ | 5416 | /* If we've pulled tasks over stop searching: */ |
5302 | pulled_task = load_balance(this_cpu, this_rq, | 5417 | pulled_task = load_balance(this_cpu, this_rq, |
5303 | sd, CPU_NEWLY_IDLE, &balance); | 5418 | sd, CPU_NEWLY_IDLE, |
5419 | &continue_balancing); | ||
5304 | } | 5420 | } |
5305 | 5421 | ||
5306 | interval = msecs_to_jiffies(sd->balance_interval); | 5422 | interval = msecs_to_jiffies(sd->balance_interval); |
@@ -5506,7 +5622,7 @@ void nohz_balance_enter_idle(int cpu) | |||
5506 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | 5622 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); |
5507 | } | 5623 | } |
5508 | 5624 | ||
5509 | static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, | 5625 | static int sched_ilb_notifier(struct notifier_block *nfb, |
5510 | unsigned long action, void *hcpu) | 5626 | unsigned long action, void *hcpu) |
5511 | { | 5627 | { |
5512 | switch (action & ~CPU_TASKS_FROZEN) { | 5628 | switch (action & ~CPU_TASKS_FROZEN) { |
@@ -5538,7 +5654,7 @@ void update_max_interval(void) | |||
5538 | */ | 5654 | */ |
5539 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | 5655 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) |
5540 | { | 5656 | { |
5541 | int balance = 1; | 5657 | int continue_balancing = 1; |
5542 | struct rq *rq = cpu_rq(cpu); | 5658 | struct rq *rq = cpu_rq(cpu); |
5543 | unsigned long interval; | 5659 | unsigned long interval; |
5544 | struct sched_domain *sd; | 5660 | struct sched_domain *sd; |
@@ -5570,7 +5686,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
5570 | } | 5686 | } |
5571 | 5687 | ||
5572 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 5688 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
5573 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 5689 | if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { |
5574 | /* | 5690 | /* |
5575 | * The LBF_SOME_PINNED logic could have changed | 5691 | * The LBF_SOME_PINNED logic could have changed |
5576 | * env->dst_cpu, so we can't know our idle | 5692 | * env->dst_cpu, so we can't know our idle |
@@ -5593,7 +5709,7 @@ out: | |||
5593 | * CPU in our sched group which is doing load balancing more | 5709 | * CPU in our sched group which is doing load balancing more |
5594 | * actively. | 5710 | * actively. |
5595 | */ | 5711 | */ |
5596 | if (!balance) | 5712 | if (!continue_balancing) |
5597 | break; | 5713 | break; |
5598 | } | 5714 | } |
5599 | rcu_read_unlock(); | 5715 | rcu_read_unlock(); |
@@ -5786,7 +5902,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | |||
5786 | entity_tick(cfs_rq, se, queued); | 5902 | entity_tick(cfs_rq, se, queued); |
5787 | } | 5903 | } |
5788 | 5904 | ||
5789 | if (sched_feat_numa(NUMA)) | 5905 | if (numabalancing_enabled) |
5790 | task_tick_numa(rq, curr); | 5906 | task_tick_numa(rq, curr); |
5791 | 5907 | ||
5792 | update_rq_runnable_avg(rq, 1); | 5908 | update_rq_runnable_avg(rq, 1); |
@@ -5889,11 +6005,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
5889 | * and ensure we don't carry in an old decay_count if we | 6005 | * and ensure we don't carry in an old decay_count if we |
5890 | * switch back. | 6006 | * switch back. |
5891 | */ | 6007 | */ |
5892 | if (p->se.avg.decay_count) { | 6008 | if (se->avg.decay_count) { |
5893 | struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); | 6009 | __synchronize_entity_decay(se); |
5894 | __synchronize_entity_decay(&p->se); | 6010 | subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); |
5895 | subtract_blocked_load_contrib(cfs_rq, | ||
5896 | p->se.avg.load_avg_contrib); | ||
5897 | } | 6011 | } |
5898 | #endif | 6012 | #endif |
5899 | } | 6013 | } |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ef0a7b2439dd..b3c5653e1dca 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -285,7 +285,6 @@ struct cfs_rq { | |||
285 | /* Required to track per-cpu representation of a task_group */ | 285 | /* Required to track per-cpu representation of a task_group */ |
286 | u32 tg_runnable_contrib; | 286 | u32 tg_runnable_contrib; |
287 | unsigned long tg_load_contrib; | 287 | unsigned long tg_load_contrib; |
288 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
289 | 288 | ||
290 | /* | 289 | /* |
291 | * h_load = weight * f(tg) | 290 | * h_load = weight * f(tg) |
@@ -294,6 +293,9 @@ struct cfs_rq { | |||
294 | * this group. | 293 | * this group. |
295 | */ | 294 | */ |
296 | unsigned long h_load; | 295 | unsigned long h_load; |
296 | u64 last_h_load_update; | ||
297 | struct sched_entity *h_load_next; | ||
298 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
297 | #endif /* CONFIG_SMP */ | 299 | #endif /* CONFIG_SMP */ |
298 | 300 | ||
299 | #ifdef CONFIG_FAIR_GROUP_SCHED | 301 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -429,9 +431,6 @@ struct rq { | |||
429 | #ifdef CONFIG_FAIR_GROUP_SCHED | 431 | #ifdef CONFIG_FAIR_GROUP_SCHED |
430 | /* list of leaf cfs_rq on this cpu: */ | 432 | /* list of leaf cfs_rq on this cpu: */ |
431 | struct list_head leaf_cfs_rq_list; | 433 | struct list_head leaf_cfs_rq_list; |
432 | #ifdef CONFIG_SMP | ||
433 | unsigned long h_load_throttle; | ||
434 | #endif /* CONFIG_SMP */ | ||
435 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 434 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
436 | 435 | ||
437 | #ifdef CONFIG_RT_GROUP_SCHED | 436 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -595,6 +594,7 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | |||
595 | } | 594 | } |
596 | 595 | ||
597 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 596 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
597 | DECLARE_PER_CPU(int, sd_llc_size); | ||
598 | DECLARE_PER_CPU(int, sd_llc_id); | 598 | DECLARE_PER_CPU(int, sd_llc_id); |
599 | 599 | ||
600 | struct sched_group_power { | 600 | struct sched_group_power { |
@@ -665,9 +665,9 @@ extern int group_balance_cpu(struct sched_group *sg); | |||
665 | /* | 665 | /* |
666 | * Return the group to which this task belongs. | 666 | * Return the group to which this task belongs. |
667 | * | 667 | * |
668 | * We cannot use task_subsys_state() and friends because the cgroup | 668 | * We cannot use task_css() and friends because the cgroup subsystem |
669 | * subsystem changes that value before the cgroup_subsys::attach() method | 669 | * changes that value before the cgroup_subsys::attach() method is called, |
670 | * is called, therefore we cannot pin it and might observe the wrong value. | 670 | * therefore we cannot pin it and might observe the wrong value. |
671 | * | 671 | * |
672 | * The same is true for autogroup's p->signal->autogroup->tg, the autogroup | 672 | * The same is true for autogroup's p->signal->autogroup->tg, the autogroup |
673 | * core changes this before calling sched_move_task(). | 673 | * core changes this before calling sched_move_task(). |
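Editor's note: the new sd_llc_size per-CPU variable complements sd_llc/sd_llc_id and is presumably the number of CPUs sharing the last-level cache with a given CPU, cached so fast paths do not recompute a cpumask weight. A hedged sketch of how it could be filled in when the cached LLC pointers are refreshed; update_top_cache_domain() and SD_SHARE_PKG_RESOURCES are recalled from the scheduler core, and the body is illustrative only.

    DEFINE_PER_CPU(struct sched_domain *, sd_llc);
    DEFINE_PER_CPU(int, sd_llc_size);
    DEFINE_PER_CPU(int, sd_llc_id);

    /* Sketch: cache the LLC domain, its span weight and an id for @cpu. */
    static void update_top_cache_domain(int cpu)
    {
            struct sched_domain *sd;
            int id = cpu, size = 1;

            sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
            if (sd) {
                    id = cpumask_first(sched_domain_span(sd));
                    size = cpumask_weight(sched_domain_span(sd));
            }

            rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
            per_cpu(sd_llc_size, cpu) = size;
            per_cpu(sd_llc_id, cpu) = id;
    }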
diff --git a/kernel/smp.c b/kernel/smp.c index 4dba0f7b72ad..449b707fc20d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -73,7 +73,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
73 | return NOTIFY_OK; | 73 | return NOTIFY_OK; |
74 | } | 74 | } |
75 | 75 | ||
76 | static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { | 76 | static struct notifier_block hotplug_cfd_notifier = { |
77 | .notifier_call = hotplug_cfd, | 77 | .notifier_call = hotplug_cfd, |
78 | }; | 78 | }; |
79 | 79 | ||
@@ -186,25 +186,13 @@ void generic_smp_call_function_single_interrupt(void) | |||
186 | 186 | ||
187 | while (!list_empty(&list)) { | 187 | while (!list_empty(&list)) { |
188 | struct call_single_data *csd; | 188 | struct call_single_data *csd; |
189 | unsigned int csd_flags; | ||
190 | 189 | ||
191 | csd = list_entry(list.next, struct call_single_data, list); | 190 | csd = list_entry(list.next, struct call_single_data, list); |
192 | list_del(&csd->list); | 191 | list_del(&csd->list); |
193 | 192 | ||
194 | /* | ||
195 | * 'csd' can be invalid after this call if flags == 0 | ||
196 | * (when called through generic_exec_single()), | ||
197 | * so save them away before making the call: | ||
198 | */ | ||
199 | csd_flags = csd->flags; | ||
200 | |||
201 | csd->func(csd->info); | 193 | csd->func(csd->info); |
202 | 194 | ||
203 | /* | 195 | csd_unlock(csd); |
204 | * Unlocked CSDs are valid through generic_exec_single(): | ||
205 | */ | ||
206 | if (csd_flags & CSD_FLAG_LOCK) | ||
207 | csd_unlock(csd); | ||
208 | } | 196 | } |
209 | } | 197 | } |
210 | 198 | ||
@@ -278,8 +266,6 @@ EXPORT_SYMBOL(smp_call_function_single); | |||
278 | * @wait: If true, wait until function has completed. | 266 | * @wait: If true, wait until function has completed. |
279 | * | 267 | * |
280 | * Returns 0 on success, else a negative status code (if no cpus were online). | 268 | * Returns 0 on success, else a negative status code (if no cpus were online). |
281 | * Note that @wait will be implicitly turned on in case of allocation failures, | ||
282 | * since we fall back to on-stack allocation. | ||
283 | * | 269 | * |
284 | * Selection preference: | 270 | * Selection preference: |
285 | * 1) current cpu if in @mask | 271 | * 1) current cpu if in @mask |
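Editor's note: the simplification in generic_smp_call_function_single_interrupt() leans on the csd lock protocol: the sender takes the lock bit before queueing, the receiving CPU runs the function and then releases it, and a synchronous sender spins until the bit clears. A rough sketch of that protocol, assuming CSD_FLAG_LOCK is the only flag involved; the helper names follow kernel/smp.c but the bodies here are illustrative.

    /* Illustrative only: the real helpers live in kernel/smp.c. */
    static void csd_lock_wait(struct call_single_data *csd)
    {
            while (csd->flags & CSD_FLAG_LOCK)
                    cpu_relax();
    }

    static void csd_lock(struct call_single_data *csd)
    {
            csd_lock_wait(csd);          /* wait out any previous user */
            csd->flags |= CSD_FLAG_LOCK;
            smp_mb();                    /* order flag set vs. func/info writes */
    }

    static void csd_unlock(struct call_single_data *csd)
    {
            smp_mb();                    /* order func execution vs. release */
            csd->flags &= ~CSD_FLAG_LOCK;
    }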
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 02fc5c933673..eb89e1807408 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
@@ -24,7 +24,7 @@ | |||
24 | */ | 24 | */ |
25 | static DEFINE_PER_CPU(struct task_struct *, idle_threads); | 25 | static DEFINE_PER_CPU(struct task_struct *, idle_threads); |
26 | 26 | ||
27 | struct task_struct * __cpuinit idle_thread_get(unsigned int cpu) | 27 | struct task_struct *idle_thread_get(unsigned int cpu) |
28 | { | 28 | { |
29 | struct task_struct *tsk = per_cpu(idle_threads, cpu); | 29 | struct task_struct *tsk = per_cpu(idle_threads, cpu); |
30 | 30 | ||
diff --git a/kernel/softirq.c b/kernel/softirq.c index ca25e6e704a2..be3d3514c325 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -699,7 +699,7 @@ void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) | |||
699 | } | 699 | } |
700 | EXPORT_SYMBOL(send_remote_softirq); | 700 | EXPORT_SYMBOL(send_remote_softirq); |
701 | 701 | ||
702 | static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, | 702 | static int remote_softirq_cpu_notify(struct notifier_block *self, |
703 | unsigned long action, void *hcpu) | 703 | unsigned long action, void *hcpu) |
704 | { | 704 | { |
705 | /* | 705 | /* |
@@ -728,7 +728,7 @@ static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, | |||
728 | return NOTIFY_OK; | 728 | return NOTIFY_OK; |
729 | } | 729 | } |
730 | 730 | ||
731 | static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { | 731 | static struct notifier_block remote_softirq_cpu_notifier = { |
732 | .notifier_call = remote_softirq_cpu_notify, | 732 | .notifier_call = remote_softirq_cpu_notify, |
733 | }; | 733 | }; |
734 | 734 | ||
@@ -830,7 +830,7 @@ static void takeover_tasklets(unsigned int cpu) | |||
830 | } | 830 | } |
831 | #endif /* CONFIG_HOTPLUG_CPU */ | 831 | #endif /* CONFIG_HOTPLUG_CPU */ |
832 | 832 | ||
833 | static int __cpuinit cpu_callback(struct notifier_block *nfb, | 833 | static int cpu_callback(struct notifier_block *nfb, |
834 | unsigned long action, | 834 | unsigned long action, |
835 | void *hcpu) | 835 | void *hcpu) |
836 | { | 836 | { |
@@ -845,7 +845,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, | |||
845 | return NOTIFY_OK; | 845 | return NOTIFY_OK; |
846 | } | 846 | } |
847 | 847 | ||
848 | static struct notifier_block __cpuinitdata cpu_nfb = { | 848 | static struct notifier_block cpu_nfb = { |
849 | .notifier_call = cpu_callback | 849 | .notifier_call = cpu_callback |
850 | }; | 850 | }; |
851 | 851 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ac09d98490aa..07f6fc468e17 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -2346,7 +2346,11 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, | |||
2346 | int write, void *data) | 2346 | int write, void *data) |
2347 | { | 2347 | { |
2348 | if (write) { | 2348 | if (write) { |
2349 | *valp = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); | 2349 | unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); |
2350 | |||
2351 | if (jif > INT_MAX) | ||
2352 | return 1; | ||
2353 | *valp = (int)jif; | ||
2350 | } else { | 2354 | } else { |
2351 | int val = *valp; | 2355 | int val = *valp; |
2352 | unsigned long lval; | 2356 | unsigned long lval; |
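Editor's note: the added range check matters because *valp is an int while msecs_to_jiffies() returns an unsigned long, so on a 64-bit kernel a huge millisecond value converts to a jiffies count that would silently truncate when stored. A small userspace demonstration of the same arithmetic; HZ = 1000, the simplified msecs_to_jiffies() stand-in, and 64-bit unsigned long are assumptions for illustration.

    #include <limits.h>
    #include <stdio.h>

    #define HZ 1000UL   /* assumed tick rate; real kernels vary */

    /* Simplified stand-in: no rounding or saturation, assumes 64-bit ulong. */
    static unsigned long msecs_to_jiffies(unsigned long ms)
    {
            return ms * HZ / 1000;
    }

    int main(void)
    {
            unsigned long ms = 3000000000UL;          /* ~34.7 days in ms */
            unsigned long jif = msecs_to_jiffies(ms);

            if (jif > INT_MAX)
                    printf("reject: %lu jiffies does not fit in an int\n", jif);
            else
                    printf("store %d\n", (int)jif);
            return 0;
    }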
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 70f27e89012b..2b62fe86f9ec 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -105,7 +105,6 @@ config NO_HZ_FULL | |||
105 | select RCU_USER_QS | 105 | select RCU_USER_QS |
106 | select RCU_NOCB_CPU | 106 | select RCU_NOCB_CPU |
107 | select VIRT_CPU_ACCOUNTING_GEN | 107 | select VIRT_CPU_ACCOUNTING_GEN |
108 | select CONTEXT_TRACKING_FORCE | ||
109 | select IRQ_WORK | 108 | select IRQ_WORK |
110 | help | 109 | help |
111 | Adaptively try to shutdown the tick whenever possible, even when | 110 | Adaptively try to shutdown the tick whenever possible, even when |
@@ -134,6 +133,56 @@ config NO_HZ_FULL_ALL | |||
134 | Note the boot CPU will still be kept outside the range to | 133 | Note the boot CPU will still be kept outside the range to |
135 | handle the timekeeping duty. | 134 | handle the timekeeping duty. |
136 | 135 | ||
136 | config NO_HZ_FULL_SYSIDLE | ||
137 | bool "Detect full-system idle state for full dynticks system" | ||
138 | depends on NO_HZ_FULL | ||
139 | default n | ||
140 | help | ||
141 | At least one CPU must keep the scheduling-clock tick running for | ||
142 | timekeeping purposes whenever there is a non-idle CPU, where | ||
143 | "non-idle" also includes dynticks CPUs as long as they are | ||
144 | running non-idle tasks. Because the underlying adaptive-tick | ||
145 | support cannot distinguish between all CPUs being idle and | ||
146 | all CPUs each running a single task in dynticks mode, the | ||
147 | underlying support simply ensures that there is always a CPU | ||
148 | handling the scheduling-clock tick, whether or not all CPUs | ||
149 | are idle. This Kconfig option enables scalable detection of | ||
150 | the all-CPUs-idle state, thus allowing the scheduling-clock | ||
151 | tick to be disabled when all CPUs are idle. Note that scalable | ||
152 | detection of the all-CPUs-idle state means that larger systems | ||
153 | will be slower to declare the all-CPUs-idle state. | ||
154 | |||
155 | Say Y if you would like to help debug all-CPUs-idle detection. | ||
156 | |||
157 | Say N if you are unsure. | ||
158 | |||
159 | config NO_HZ_FULL_SYSIDLE_SMALL | ||
160 | int "Number of CPUs above which large-system approach is used" | ||
161 | depends on NO_HZ_FULL_SYSIDLE | ||
162 | range 1 NR_CPUS | ||
163 | default 8 | ||
164 | help | ||
165 | The full-system idle detection mechanism takes a lazy approach | ||
166 | on large systems, as is required to attain decent scalability. | ||
167 | However, on smaller systems, scalability is not anywhere near as | ||
168 | large a concern as is energy efficiency. The sysidle subsystem | ||
169 | therefore uses a fast but non-scalable algorithm for small | ||
170 | systems and a lazier but scalable algorithm for large systems. | ||
171 | This Kconfig parameter defines the number of CPUs in the largest | ||
172 | system that will be considered to be "small". | ||
173 | |||
174 | The default value will be fine in most cases. Battery-powered | ||
175 | systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger | ||
176 | numbers of CPUs, and (3) are suffering from battery-lifetime | ||
177 | problems due to long sysidle latencies might wish to experiment | ||
178 | with larger values for this Kconfig parameter. On the other | ||
179 | hand, they might be even better served by disabling NO_HZ_FULL | ||
180 | entirely, given that NO_HZ_FULL is intended for HPC and | ||
181 | real-time workloads that at present do not tend to be run on | ||
182 | battery-powered systems. | ||
183 | |||
184 | Take the default if you are unsure. | ||
185 | |||
137 | config NO_HZ | 186 | config NO_HZ |
138 | bool "Old Idle dynticks config" | 187 | bool "Old Idle dynticks config" |
139 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS | 188 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index a326f27d7f09..0b479a6a22bb 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c | |||
@@ -121,7 +121,7 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) | |||
121 | BUG_ON(bits > 32); | 121 | BUG_ON(bits > 32); |
122 | WARN_ON(!irqs_disabled()); | 122 | WARN_ON(!irqs_disabled()); |
123 | read_sched_clock = read; | 123 | read_sched_clock = read; |
124 | sched_clock_mask = (1 << bits) - 1; | 124 | sched_clock_mask = (1ULL << bits) - 1; |
125 | cd.rate = rate; | 125 | cd.rate = rate; |
126 | 126 | ||
127 | /* calculate the mult/shift to convert counter ticks to ns. */ | 127 | /* calculate the mult/shift to convert counter ticks to ns. */ |
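Editor's note: the 1ULL promotion fixes the bits == 32 case. With a plain int constant, 1 << 32 is undefined behaviour (the shift width equals the width of int) and cannot produce the intended all-ones mask; the 64-bit shift stays in range. A quick standalone check of the arithmetic, for illustration only.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            int bits = 32;

            /*
             * (1 << bits) would be undefined behaviour here. Promoting the
             * constant to 64 bits keeps the shift well-defined and yields
             * the expected mask.
             */
            uint64_t mask = (1ULL << bits) - 1;

            printf("mask for %d bits = 0x%llx\n", bits,
                   (unsigned long long)mask);         /* prints 0xffffffff */
            return 0;
    }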
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 69601726a745..3612fc77f834 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/irq_work.h> | 23 | #include <linux/irq_work.h> |
24 | #include <linux/posix-timers.h> | 24 | #include <linux/posix-timers.h> |
25 | #include <linux/perf_event.h> | 25 | #include <linux/perf_event.h> |
26 | #include <linux/context_tracking.h> | ||
26 | 27 | ||
27 | #include <asm/irq_regs.h> | 28 | #include <asm/irq_regs.h> |
28 | 29 | ||
@@ -148,8 +149,8 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) | |||
148 | } | 149 | } |
149 | 150 | ||
150 | #ifdef CONFIG_NO_HZ_FULL | 151 | #ifdef CONFIG_NO_HZ_FULL |
151 | static cpumask_var_t nohz_full_mask; | 152 | cpumask_var_t tick_nohz_full_mask; |
152 | bool have_nohz_full_mask; | 153 | bool tick_nohz_full_running; |
153 | 154 | ||
154 | static bool can_stop_full_tick(void) | 155 | static bool can_stop_full_tick(void) |
155 | { | 156 | { |
@@ -182,7 +183,8 @@ static bool can_stop_full_tick(void) | |||
182 | * Don't allow the user to think they can get | 183 | * Don't allow the user to think they can get |
183 | * full NO_HZ with this machine. | 184 | * full NO_HZ with this machine. |
184 | */ | 185 | */ |
185 | WARN_ONCE(1, "NO_HZ FULL will not work with unstable sched clock"); | 186 | WARN_ONCE(tick_nohz_full_running, |
187 | "NO_HZ FULL will not work with unstable sched clock"); | ||
186 | return false; | 188 | return false; |
187 | } | 189 | } |
188 | #endif | 190 | #endif |
@@ -196,7 +198,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); | |||
196 | * Re-evaluate the need for the tick on the current CPU | 198 | * Re-evaluate the need for the tick on the current CPU |
197 | * and restart it if necessary. | 199 | * and restart it if necessary. |
198 | */ | 200 | */ |
199 | void tick_nohz_full_check(void) | 201 | void __tick_nohz_full_check(void) |
200 | { | 202 | { |
201 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 203 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
202 | 204 | ||
@@ -210,7 +212,7 @@ void tick_nohz_full_check(void) | |||
210 | 212 | ||
211 | static void nohz_full_kick_work_func(struct irq_work *work) | 213 | static void nohz_full_kick_work_func(struct irq_work *work) |
212 | { | 214 | { |
213 | tick_nohz_full_check(); | 215 | __tick_nohz_full_check(); |
214 | } | 216 | } |
215 | 217 | ||
216 | static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { | 218 | static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { |
@@ -229,7 +231,7 @@ void tick_nohz_full_kick(void) | |||
229 | 231 | ||
230 | static void nohz_full_kick_ipi(void *info) | 232 | static void nohz_full_kick_ipi(void *info) |
231 | { | 233 | { |
232 | tick_nohz_full_check(); | 234 | __tick_nohz_full_check(); |
233 | } | 235 | } |
234 | 236 | ||
235 | /* | 237 | /* |
@@ -238,12 +240,13 @@ static void nohz_full_kick_ipi(void *info) | |||
238 | */ | 240 | */ |
239 | void tick_nohz_full_kick_all(void) | 241 | void tick_nohz_full_kick_all(void) |
240 | { | 242 | { |
241 | if (!have_nohz_full_mask) | 243 | if (!tick_nohz_full_running) |
242 | return; | 244 | return; |
243 | 245 | ||
244 | preempt_disable(); | 246 | preempt_disable(); |
245 | smp_call_function_many(nohz_full_mask, | 247 | smp_call_function_many(tick_nohz_full_mask, |
246 | nohz_full_kick_ipi, NULL, false); | 248 | nohz_full_kick_ipi, NULL, false); |
249 | tick_nohz_full_kick(); | ||
247 | preempt_enable(); | 250 | preempt_enable(); |
248 | } | 251 | } |
249 | 252 | ||
@@ -252,7 +255,7 @@ void tick_nohz_full_kick_all(void) | |||
252 | * It might need the tick due to per task/process properties: | 255 | * It might need the tick due to per task/process properties: |
253 | * perf events, posix cpu timers, ... | 256 | * perf events, posix cpu timers, ... |
254 | */ | 257 | */ |
255 | void tick_nohz_task_switch(struct task_struct *tsk) | 258 | void __tick_nohz_task_switch(struct task_struct *tsk) |
256 | { | 259 | { |
257 | unsigned long flags; | 260 | unsigned long flags; |
258 | 261 | ||
@@ -268,37 +271,29 @@ out: | |||
268 | local_irq_restore(flags); | 271 | local_irq_restore(flags); |
269 | } | 272 | } |
270 | 273 | ||
271 | int tick_nohz_full_cpu(int cpu) | ||
272 | { | ||
273 | if (!have_nohz_full_mask) | ||
274 | return 0; | ||
275 | |||
276 | return cpumask_test_cpu(cpu, nohz_full_mask); | ||
277 | } | ||
278 | |||
279 | /* Parse the boot-time nohz CPU list from the kernel parameters. */ | 274 | /* Parse the boot-time nohz CPU list from the kernel parameters. */ |
280 | static int __init tick_nohz_full_setup(char *str) | 275 | static int __init tick_nohz_full_setup(char *str) |
281 | { | 276 | { |
282 | int cpu; | 277 | int cpu; |
283 | 278 | ||
284 | alloc_bootmem_cpumask_var(&nohz_full_mask); | 279 | alloc_bootmem_cpumask_var(&tick_nohz_full_mask); |
285 | if (cpulist_parse(str, nohz_full_mask) < 0) { | 280 | if (cpulist_parse(str, tick_nohz_full_mask) < 0) { |
286 | pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); | 281 | pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); |
287 | return 1; | 282 | return 1; |
288 | } | 283 | } |
289 | 284 | ||
290 | cpu = smp_processor_id(); | 285 | cpu = smp_processor_id(); |
291 | if (cpumask_test_cpu(cpu, nohz_full_mask)) { | 286 | if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { |
292 | pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); | 287 | pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); |
293 | cpumask_clear_cpu(cpu, nohz_full_mask); | 288 | cpumask_clear_cpu(cpu, tick_nohz_full_mask); |
294 | } | 289 | } |
295 | have_nohz_full_mask = true; | 290 | tick_nohz_full_running = true; |
296 | 291 | ||
297 | return 1; | 292 | return 1; |
298 | } | 293 | } |
299 | __setup("nohz_full=", tick_nohz_full_setup); | 294 | __setup("nohz_full=", tick_nohz_full_setup); |
300 | 295 | ||
301 | static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, | 296 | static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, |
302 | unsigned long action, | 297 | unsigned long action, |
303 | void *hcpu) | 298 | void *hcpu) |
304 | { | 299 | { |
@@ -310,7 +305,7 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, | |||
310 | * If we handle the timekeeping duty for full dynticks CPUs, | 305 | * If we handle the timekeeping duty for full dynticks CPUs, |
311 | * we can't safely shutdown that CPU. | 306 | * we can't safely shutdown that CPU. |
312 | */ | 307 | */ |
313 | if (have_nohz_full_mask && tick_do_timer_cpu == cpu) | 308 | if (tick_nohz_full_running && tick_do_timer_cpu == cpu) |
314 | return NOTIFY_BAD; | 309 | return NOTIFY_BAD; |
315 | break; | 310 | break; |
316 | } | 311 | } |
@@ -329,14 +324,14 @@ static int tick_nohz_init_all(void) | |||
329 | int err = -1; | 324 | int err = -1; |
330 | 325 | ||
331 | #ifdef CONFIG_NO_HZ_FULL_ALL | 326 | #ifdef CONFIG_NO_HZ_FULL_ALL |
332 | if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { | 327 | if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { |
333 | pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); | 328 | pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); |
334 | return err; | 329 | return err; |
335 | } | 330 | } |
336 | err = 0; | 331 | err = 0; |
337 | cpumask_setall(nohz_full_mask); | 332 | cpumask_setall(tick_nohz_full_mask); |
338 | cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); | 333 | cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); |
339 | have_nohz_full_mask = true; | 334 | tick_nohz_full_running = true; |
340 | #endif | 335 | #endif |
341 | return err; | 336 | return err; |
342 | } | 337 | } |
@@ -345,17 +340,18 @@ void __init tick_nohz_init(void) | |||
345 | { | 340 | { |
346 | int cpu; | 341 | int cpu; |
347 | 342 | ||
348 | if (!have_nohz_full_mask) { | 343 | if (!tick_nohz_full_running) { |
349 | if (tick_nohz_init_all() < 0) | 344 | if (tick_nohz_init_all() < 0) |
350 | return; | 345 | return; |
351 | } | 346 | } |
352 | 347 | ||
348 | for_each_cpu(cpu, tick_nohz_full_mask) | ||
349 | context_tracking_cpu_set(cpu); | ||
350 | |||
353 | cpu_notifier(tick_nohz_cpu_down_callback, 0); | 351 | cpu_notifier(tick_nohz_cpu_down_callback, 0); |
354 | cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); | 352 | cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); |
355 | pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); | 353 | pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); |
356 | } | 354 | } |
357 | #else | ||
358 | #define have_nohz_full_mask (0) | ||
359 | #endif | 355 | #endif |
360 | 356 | ||
361 | /* | 357 | /* |
@@ -733,7 +729,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | |||
733 | return false; | 729 | return false; |
734 | } | 730 | } |
735 | 731 | ||
736 | if (have_nohz_full_mask) { | 732 | if (tick_nohz_full_enabled()) { |
737 | /* | 733 | /* |
738 | * Keep the tick alive to guarantee timekeeping progression | 734 | * Keep the tick alive to guarantee timekeeping progression |
739 | * if there are full dynticks CPUs around | 735 | * if there are full dynticks CPUs around |
@@ -827,13 +823,10 @@ void tick_nohz_irq_exit(void) | |||
827 | { | 823 | { |
828 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 824 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
829 | 825 | ||
830 | if (ts->inidle) { | 826 | if (ts->inidle) |
831 | /* Cancel the timer because CPU already waken up from the C-states*/ | ||
832 | menu_hrtimer_cancel(); | ||
833 | __tick_nohz_idle_enter(ts); | 827 | __tick_nohz_idle_enter(ts); |
834 | } else { | 828 | else |
835 | tick_nohz_full_stop_tick(ts); | 829 | tick_nohz_full_stop_tick(ts); |
836 | } | ||
837 | } | 830 | } |
838 | 831 | ||
839 | /** | 832 | /** |
@@ -931,8 +924,6 @@ void tick_nohz_idle_exit(void) | |||
931 | 924 | ||
932 | ts->inidle = 0; | 925 | ts->inidle = 0; |
933 | 926 | ||
934 | /* Cancel the timer because CPU already waken up from the C-states*/ | ||
935 | menu_hrtimer_cancel(); | ||
936 | if (ts->idle_active || ts->tick_stopped) | 927 | if (ts->idle_active || ts->tick_stopped) |
937 | now = ktime_get(); | 928 | now = ktime_get(); |
938 | 929 | ||
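Editor's note: with the mask and the running flag now exported under tick_nohz_full_* names, the per-CPU test removed above (tick_nohz_full_cpu()) presumably moves to a header-side inline, alongside the tick_nohz_full_enabled() predicate used later in this file. A hedged sketch of what such helpers could look like; the exact definitions in include/linux/tick.h are not shown in this diff.

    #ifdef CONFIG_NO_HZ_FULL
    extern cpumask_var_t tick_nohz_full_mask;
    extern bool tick_nohz_full_running;

    static inline bool tick_nohz_full_enabled(void)
    {
            return tick_nohz_full_running;
    }

    static inline bool tick_nohz_full_cpu(int cpu)
    {
            if (!tick_nohz_full_enabled())
                    return false;
            return cpumask_test_cpu(cpu, tick_nohz_full_mask);
    }
    #else
    static inline bool tick_nohz_full_enabled(void) { return false; }
    static inline bool tick_nohz_full_cpu(int cpu) { return false; }
    #endif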
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 3bdf28323012..61ed862cdd37 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -265,10 +265,9 @@ static inline void timer_list_header(struct seq_file *m, u64 now) | |||
265 | static int timer_list_show(struct seq_file *m, void *v) | 265 | static int timer_list_show(struct seq_file *m, void *v) |
266 | { | 266 | { |
267 | struct timer_list_iter *iter = v; | 267 | struct timer_list_iter *iter = v; |
268 | u64 now = ktime_to_ns(ktime_get()); | ||
269 | 268 | ||
270 | if (iter->cpu == -1 && !iter->second_pass) | 269 | if (iter->cpu == -1 && !iter->second_pass) |
271 | timer_list_header(m, now); | 270 | timer_list_header(m, iter->now); |
272 | else if (!iter->second_pass) | 271 | else if (!iter->second_pass) |
273 | print_cpu(m, iter->cpu, iter->now); | 272 | print_cpu(m, iter->cpu, iter->now); |
274 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | 273 | #ifdef CONFIG_GENERIC_CLOCKEVENTS |
@@ -298,33 +297,41 @@ void sysrq_timer_list_show(void) | |||
298 | return; | 297 | return; |
299 | } | 298 | } |
300 | 299 | ||
301 | static void *timer_list_start(struct seq_file *file, loff_t *offset) | 300 | static void *move_iter(struct timer_list_iter *iter, loff_t offset) |
302 | { | 301 | { |
303 | struct timer_list_iter *iter = file->private; | 302 | for (; offset; offset--) { |
304 | 303 | iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); | |
305 | if (!*offset) { | 304 | if (iter->cpu >= nr_cpu_ids) { |
306 | iter->cpu = -1; | ||
307 | iter->now = ktime_to_ns(ktime_get()); | ||
308 | } else if (iter->cpu >= nr_cpu_ids) { | ||
309 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | 305 | #ifdef CONFIG_GENERIC_CLOCKEVENTS |
310 | if (!iter->second_pass) { | 306 | if (!iter->second_pass) { |
311 | iter->cpu = -1; | 307 | iter->cpu = -1; |
312 | iter->second_pass = true; | 308 | iter->second_pass = true; |
313 | } else | 309 | } else |
314 | return NULL; | 310 | return NULL; |
315 | #else | 311 | #else |
316 | return NULL; | 312 | return NULL; |
317 | #endif | 313 | #endif |
314 | } | ||
318 | } | 315 | } |
319 | return iter; | 316 | return iter; |
320 | } | 317 | } |
321 | 318 | ||
319 | static void *timer_list_start(struct seq_file *file, loff_t *offset) | ||
320 | { | ||
321 | struct timer_list_iter *iter = file->private; | ||
322 | |||
323 | if (!*offset) | ||
324 | iter->now = ktime_to_ns(ktime_get()); | ||
325 | iter->cpu = -1; | ||
326 | iter->second_pass = false; | ||
327 | return move_iter(iter, *offset); | ||
328 | } | ||
329 | |||
322 | static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) | 330 | static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) |
323 | { | 331 | { |
324 | struct timer_list_iter *iter = file->private; | 332 | struct timer_list_iter *iter = file->private; |
325 | iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); | ||
326 | ++*offset; | 333 | ++*offset; |
327 | return timer_list_start(file, offset); | 334 | return move_iter(iter, 1); |
328 | } | 335 | } |
329 | 336 | ||
330 | static void timer_list_stop(struct seq_file *seq, void *v) | 337 | static void timer_list_stop(struct seq_file *seq, void *v) |
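Editor's note: the move_iter() refactor keeps the seq_file contract intact: start() positions the iterator according to *offset, next() advances exactly one position and bumps *offset, and both now share one walker covering the per-CPU pass plus the optional clockevents pass. For reference, the usual wiring of these callbacks looks like the sketch below; the timer_list_sops name and the open-handler comment are assumptions, the callbacks are the ones in the diff.

    static const struct seq_operations timer_list_sops = {
            .start = timer_list_start,
            .next  = timer_list_next,
            .stop  = timer_list_stop,
            .show  = timer_list_show,
    };

    /* Typically hooked up from the proc open handler, e.g.:
     *   return seq_open_private(filp, &timer_list_sops,
     *                           sizeof(struct timer_list_iter));
     */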
diff --git a/kernel/timer.c b/kernel/timer.c index 15bc1b41021d..4296d13db3d1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -1505,11 +1505,11 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) | |||
1505 | } | 1505 | } |
1506 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | 1506 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); |
1507 | 1507 | ||
1508 | static int __cpuinit init_timers_cpu(int cpu) | 1508 | static int init_timers_cpu(int cpu) |
1509 | { | 1509 | { |
1510 | int j; | 1510 | int j; |
1511 | struct tvec_base *base; | 1511 | struct tvec_base *base; |
1512 | static char __cpuinitdata tvec_base_done[NR_CPUS]; | 1512 | static char tvec_base_done[NR_CPUS]; |
1513 | 1513 | ||
1514 | if (!tvec_base_done[cpu]) { | 1514 | if (!tvec_base_done[cpu]) { |
1515 | static char boot_done; | 1515 | static char boot_done; |
@@ -1577,7 +1577,7 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea | |||
1577 | } | 1577 | } |
1578 | } | 1578 | } |
1579 | 1579 | ||
1580 | static void __cpuinit migrate_timers(int cpu) | 1580 | static void migrate_timers(int cpu) |
1581 | { | 1581 | { |
1582 | struct tvec_base *old_base; | 1582 | struct tvec_base *old_base; |
1583 | struct tvec_base *new_base; | 1583 | struct tvec_base *new_base; |
@@ -1610,7 +1610,7 @@ static void __cpuinit migrate_timers(int cpu) | |||
1610 | } | 1610 | } |
1611 | #endif /* CONFIG_HOTPLUG_CPU */ | 1611 | #endif /* CONFIG_HOTPLUG_CPU */ |
1612 | 1612 | ||
1613 | static int __cpuinit timer_cpu_notify(struct notifier_block *self, | 1613 | static int timer_cpu_notify(struct notifier_block *self, |
1614 | unsigned long action, void *hcpu) | 1614 | unsigned long action, void *hcpu) |
1615 | { | 1615 | { |
1616 | long cpu = (long)hcpu; | 1616 | long cpu = (long)hcpu; |
@@ -1635,7 +1635,7 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self, | |||
1635 | return NOTIFY_OK; | 1635 | return NOTIFY_OK; |
1636 | } | 1636 | } |
1637 | 1637 | ||
1638 | static struct notifier_block __cpuinitdata timers_nb = { | 1638 | static struct notifier_block timers_nb = { |
1639 | .notifier_call = timer_cpu_notify, | 1639 | .notifier_call = timer_cpu_notify, |
1640 | }; | 1640 | }; |
1641 | 1641 | ||
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 67708f46baae..a6d098c6df3f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -1441,12 +1441,22 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, | |||
1441 | * the hashes are freed with call_rcu_sched(). | 1441 | * the hashes are freed with call_rcu_sched(). |
1442 | */ | 1442 | */ |
1443 | static int | 1443 | static int |
1444 | ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | 1444 | ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) |
1445 | { | 1445 | { |
1446 | struct ftrace_hash *filter_hash; | 1446 | struct ftrace_hash *filter_hash; |
1447 | struct ftrace_hash *notrace_hash; | 1447 | struct ftrace_hash *notrace_hash; |
1448 | int ret; | 1448 | int ret; |
1449 | 1449 | ||
1450 | #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS | ||
1451 | /* | ||
1452 | * There's a small race when adding ops: an ftrace handler | ||
1453 | * that wants regs may be called without them. We cannot | ||
1454 | * allow that handler to be called if regs is NULL. | ||
1455 | */ | ||
1456 | if (regs == NULL && (ops->flags & FTRACE_OPS_FL_SAVE_REGS)) | ||
1457 | return 0; | ||
1458 | #endif | ||
1459 | |||
1450 | filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); | 1460 | filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); |
1451 | notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); | 1461 | notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); |
1452 | 1462 | ||
@@ -2159,12 +2169,57 @@ static cycle_t ftrace_update_time; | |||
2159 | static unsigned long ftrace_update_cnt; | 2169 | static unsigned long ftrace_update_cnt; |
2160 | unsigned long ftrace_update_tot_cnt; | 2170 | unsigned long ftrace_update_tot_cnt; |
2161 | 2171 | ||
2162 | static int ops_traces_mod(struct ftrace_ops *ops) | 2172 | static inline int ops_traces_mod(struct ftrace_ops *ops) |
2163 | { | 2173 | { |
2164 | struct ftrace_hash *hash; | 2174 | /* |
2175 | * Filter_hash being empty will default to trace module. | ||
2176 | * But notrace hash requires a test of individual module functions. | ||
2177 | */ | ||
2178 | return ftrace_hash_empty(ops->filter_hash) && | ||
2179 | ftrace_hash_empty(ops->notrace_hash); | ||
2180 | } | ||
2181 | |||
2182 | /* | ||
2183 | * Check if the current ops references the record. | ||
2184 | * | ||
2185 | * If the ops traces all functions, then it was already accounted for. | ||
2186 | * If the ops does not trace the current record function, skip it. | ||
2187 | * If the ops ignores the function via notrace filter, skip it. | ||
2188 | */ | ||
2189 | static inline bool | ||
2190 | ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) | ||
2191 | { | ||
2192 | /* If ops isn't enabled, ignore it */ | ||
2193 | if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) | ||
2194 | return 0; | ||
2195 | |||
2196 | /* If ops traces all mods, we already accounted for it */ | ||
2197 | if (ops_traces_mod(ops)) | ||
2198 | return 0; | ||
2199 | |||
2200 | /* The function must be in the filter */ | ||
2201 | if (!ftrace_hash_empty(ops->filter_hash) && | ||
2202 | !ftrace_lookup_ip(ops->filter_hash, rec->ip)) | ||
2203 | return 0; | ||
2204 | |||
2205 | /* If in notrace hash, we ignore it too */ | ||
2206 | if (ftrace_lookup_ip(ops->notrace_hash, rec->ip)) | ||
2207 | return 0; | ||
2165 | 2208 | ||
2166 | hash = ops->filter_hash; | 2209 | return 1; |
2167 | return ftrace_hash_empty(hash); | 2210 | } |
2211 | |||
2212 | static int referenced_filters(struct dyn_ftrace *rec) | ||
2213 | { | ||
2214 | struct ftrace_ops *ops; | ||
2215 | int cnt = 0; | ||
2216 | |||
2217 | for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { | ||
2218 | if (ops_references_rec(ops, rec)) | ||
2219 | cnt++; | ||
2220 | } | ||
2221 | |||
2222 | return cnt; | ||
2168 | } | 2223 | } |
2169 | 2224 | ||
2170 | static int ftrace_update_code(struct module *mod) | 2225 | static int ftrace_update_code(struct module *mod) |
@@ -2173,6 +2228,7 @@ static int ftrace_update_code(struct module *mod) | |||
2173 | struct dyn_ftrace *p; | 2228 | struct dyn_ftrace *p; |
2174 | cycle_t start, stop; | 2229 | cycle_t start, stop; |
2175 | unsigned long ref = 0; | 2230 | unsigned long ref = 0; |
2231 | bool test = false; | ||
2176 | int i; | 2232 | int i; |
2177 | 2233 | ||
2178 | /* | 2234 | /* |
@@ -2186,9 +2242,12 @@ static int ftrace_update_code(struct module *mod) | |||
2186 | 2242 | ||
2187 | for (ops = ftrace_ops_list; | 2243 | for (ops = ftrace_ops_list; |
2188 | ops != &ftrace_list_end; ops = ops->next) { | 2244 | ops != &ftrace_list_end; ops = ops->next) { |
2189 | if (ops->flags & FTRACE_OPS_FL_ENABLED && | 2245 | if (ops->flags & FTRACE_OPS_FL_ENABLED) { |
2190 | ops_traces_mod(ops)) | 2246 | if (ops_traces_mod(ops)) |
2191 | ref++; | 2247 | ref++; |
2248 | else | ||
2249 | test = true; | ||
2250 | } | ||
2192 | } | 2251 | } |
2193 | } | 2252 | } |
2194 | 2253 | ||
@@ -2198,12 +2257,16 @@ static int ftrace_update_code(struct module *mod) | |||
2198 | for (pg = ftrace_new_pgs; pg; pg = pg->next) { | 2257 | for (pg = ftrace_new_pgs; pg; pg = pg->next) { |
2199 | 2258 | ||
2200 | for (i = 0; i < pg->index; i++) { | 2259 | for (i = 0; i < pg->index; i++) { |
2260 | int cnt = ref; | ||
2261 | |||
2201 | /* If something went wrong, bail without enabling anything */ | 2262 | /* If something went wrong, bail without enabling anything */ |
2202 | if (unlikely(ftrace_disabled)) | 2263 | if (unlikely(ftrace_disabled)) |
2203 | return -1; | 2264 | return -1; |
2204 | 2265 | ||
2205 | p = &pg->records[i]; | 2266 | p = &pg->records[i]; |
2206 | p->flags = ref; | 2267 | if (test) |
2268 | cnt += referenced_filters(p); | ||
2269 | p->flags = cnt; | ||
2207 | 2270 | ||
2208 | /* | 2271 | /* |
2209 | * Do the initial record conversion from mcount jump | 2272 | * Do the initial record conversion from mcount jump |
@@ -2223,7 +2286,7 @@ static int ftrace_update_code(struct module *mod) | |||
2223 | * conversion puts the module to the correct state, thus | 2286 | * conversion puts the module to the correct state, thus |
2224 | * passing the ftrace_make_call check. | 2287 | * passing the ftrace_make_call check. |
2225 | */ | 2288 | */ |
2226 | if (ftrace_start_up && ref) { | 2289 | if (ftrace_start_up && cnt) { |
2227 | int failed = __ftrace_replace_code(p, 1); | 2290 | int failed = __ftrace_replace_code(p, 1); |
2228 | if (failed) | 2291 | if (failed) |
2229 | ftrace_bug(failed, p->ip); | 2292 | ftrace_bug(failed, p->ip); |
@@ -3374,6 +3437,12 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) | |||
3374 | return add_hash_entry(hash, ip); | 3437 | return add_hash_entry(hash, ip); |
3375 | } | 3438 | } |
3376 | 3439 | ||
3440 | static void ftrace_ops_update_code(struct ftrace_ops *ops) | ||
3441 | { | ||
3442 | if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) | ||
3443 | ftrace_run_update_code(FTRACE_UPDATE_CALLS); | ||
3444 | } | ||
3445 | |||
3377 | static int | 3446 | static int |
3378 | ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | 3447 | ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, |
3379 | unsigned long ip, int remove, int reset, int enable) | 3448 | unsigned long ip, int remove, int reset, int enable) |
@@ -3416,9 +3485,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
3416 | 3485 | ||
3417 | mutex_lock(&ftrace_lock); | 3486 | mutex_lock(&ftrace_lock); |
3418 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); | 3487 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); |
3419 | if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED | 3488 | if (!ret) |
3420 | && ftrace_enabled) | 3489 | ftrace_ops_update_code(ops); |
3421 | ftrace_run_update_code(FTRACE_UPDATE_CALLS); | ||
3422 | 3490 | ||
3423 | mutex_unlock(&ftrace_lock); | 3491 | mutex_unlock(&ftrace_lock); |
3424 | 3492 | ||
@@ -3645,9 +3713,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file) | |||
3645 | mutex_lock(&ftrace_lock); | 3713 | mutex_lock(&ftrace_lock); |
3646 | ret = ftrace_hash_move(iter->ops, filter_hash, | 3714 | ret = ftrace_hash_move(iter->ops, filter_hash, |
3647 | orig_hash, iter->hash); | 3715 | orig_hash, iter->hash); |
3648 | if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) | 3716 | if (!ret) |
3649 | && ftrace_enabled) | 3717 | ftrace_ops_update_code(iter->ops); |
3650 | ftrace_run_update_code(FTRACE_UPDATE_CALLS); | ||
3651 | 3718 | ||
3652 | mutex_unlock(&ftrace_lock); | 3719 | mutex_unlock(&ftrace_lock); |
3653 | } | 3720 | } |
@@ -4218,7 +4285,7 @@ static inline void ftrace_startup_enable(int command) { } | |||
4218 | # define ftrace_shutdown_sysctl() do { } while (0) | 4285 | # define ftrace_shutdown_sysctl() do { } while (0) |
4219 | 4286 | ||
4220 | static inline int | 4287 | static inline int |
4221 | ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | 4288 | ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) |
4222 | { | 4289 | { |
4223 | return 1; | 4290 | return 1; |
4224 | } | 4291 | } |
@@ -4241,7 +4308,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, | |||
4241 | do_for_each_ftrace_op(op, ftrace_control_list) { | 4308 | do_for_each_ftrace_op(op, ftrace_control_list) { |
4242 | if (!(op->flags & FTRACE_OPS_FL_STUB) && | 4309 | if (!(op->flags & FTRACE_OPS_FL_STUB) && |
4243 | !ftrace_function_local_disabled(op) && | 4310 | !ftrace_function_local_disabled(op) && |
4244 | ftrace_ops_test(op, ip)) | 4311 | ftrace_ops_test(op, ip, regs)) |
4245 | op->func(ip, parent_ip, op, regs); | 4312 | op->func(ip, parent_ip, op, regs); |
4246 | } while_for_each_ftrace_op(op); | 4313 | } while_for_each_ftrace_op(op); |
4247 | trace_recursion_clear(TRACE_CONTROL_BIT); | 4314 | trace_recursion_clear(TRACE_CONTROL_BIT); |
@@ -4274,7 +4341,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, | |||
4274 | */ | 4341 | */ |
4275 | preempt_disable_notrace(); | 4342 | preempt_disable_notrace(); |
4276 | do_for_each_ftrace_op(op, ftrace_ops_list) { | 4343 | do_for_each_ftrace_op(op, ftrace_ops_list) { |
4277 | if (ftrace_ops_test(op, ip)) | 4344 | if (ftrace_ops_test(op, ip, regs)) |
4278 | op->func(ip, parent_ip, op, regs); | 4345 | op->func(ip, parent_ip, op, regs); |
4279 | } while_for_each_ftrace_op(op); | 4346 | } while_for_each_ftrace_op(op); |
4280 | preempt_enable_notrace(); | 4347 | preempt_enable_notrace(); |
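Editor's note: the regs check added to ftrace_ops_test() protects callbacks that demand saved registers. An illustrative caller is sketched below; my_regs_callback and my_regs_ops are hypothetical names, while the callback signature, FTRACE_OPS_FL_SAVE_REGS and register_ftrace_function() are the existing ftrace API.

    #include <linux/ftrace.h>

    /*
     * Illustrative only: an ops that insists on pt_regs. With the regs
     * check above, this callback is skipped rather than invoked with
     * regs == NULL during the window while the ops is being added.
     */
    static void my_regs_callback(unsigned long ip, unsigned long parent_ip,
                                 struct ftrace_ops *op, struct pt_regs *regs)
    {
            if (!regs)
                    return;
            /* inspect registers at the traced call site here */
    }

    static struct ftrace_ops my_regs_ops = {
            .func  = my_regs_callback,
            .flags = FTRACE_OPS_FL_SAVE_REGS,
    };

    /* register_ftrace_function(&my_regs_ops) in module init,
     * unregister_ftrace_function(&my_regs_ops) in module exit. */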
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e444ff88f0a4..cc2f66f68dc5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -36,11 +36,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s) | |||
36 | { | 36 | { |
37 | int ret; | 37 | int ret; |
38 | 38 | ||
39 | ret = trace_seq_printf(s, "# compressed entry header\n"); | 39 | ret = trace_seq_puts(s, "# compressed entry header\n"); |
40 | ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); | 40 | ret = trace_seq_puts(s, "\ttype_len : 5 bits\n"); |
41 | ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); | 41 | ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n"); |
42 | ret = trace_seq_printf(s, "\tarray : 32 bits\n"); | 42 | ret = trace_seq_puts(s, "\tarray : 32 bits\n"); |
43 | ret = trace_seq_printf(s, "\n"); | 43 | ret = trace_seq_putc(s, '\n'); |
44 | ret = trace_seq_printf(s, "\tpadding : type == %d\n", | 44 | ret = trace_seq_printf(s, "\tpadding : type == %d\n", |
45 | RINGBUF_TYPE_PADDING); | 45 | RINGBUF_TYPE_PADDING); |
46 | ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", | 46 | ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", |
@@ -1066,7 +1066,7 @@ static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer, | |||
1066 | } | 1066 | } |
1067 | 1067 | ||
1068 | /** | 1068 | /** |
1069 | * check_pages - integrity check of buffer pages | 1069 | * rb_check_pages - integrity check of buffer pages |
1070 | * @cpu_buffer: CPU buffer with pages to test | 1070 | * @cpu_buffer: CPU buffer with pages to test |
1071 | * | 1071 | * |
1072 | * As a safety measure we check to make sure the data pages have not | 1072 | * As a safety measure we check to make sure the data pages have not |
@@ -1258,7 +1258,7 @@ static int rb_cpu_notify(struct notifier_block *self, | |||
1258 | #endif | 1258 | #endif |
1259 | 1259 | ||
1260 | /** | 1260 | /** |
1261 | * ring_buffer_alloc - allocate a new ring_buffer | 1261 | * __ring_buffer_alloc - allocate a new ring_buffer |
1262 | * @size: the size in bytes per cpu that is needed. | 1262 | * @size: the size in bytes per cpu that is needed. |
1263 | * @flags: attributes to set for the ring buffer. | 1263 | * @flags: attributes to set for the ring buffer. |
1264 | * | 1264 | * |
@@ -1607,6 +1607,7 @@ static void update_pages_handler(struct work_struct *work) | |||
1607 | * ring_buffer_resize - resize the ring buffer | 1607 | * ring_buffer_resize - resize the ring buffer |
1608 | * @buffer: the buffer to resize. | 1608 | * @buffer: the buffer to resize. |
1609 | * @size: the new size. | 1609 | * @size: the new size. |
1610 | * @cpu_id: the cpu buffer to resize | ||
1610 | * | 1611 | * |
1611 | * Minimum size is 2 * BUF_PAGE_SIZE. | 1612 | * Minimum size is 2 * BUF_PAGE_SIZE. |
1612 | * | 1613 | * |
@@ -3956,11 +3957,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); | |||
3956 | * expected. | 3957 | * expected. |
3957 | * | 3958 | * |
3958 | * After a sequence of ring_buffer_read_prepare calls, the user is | 3959 | * After a sequence of ring_buffer_read_prepare calls, the user is |
3959 | * expected to make at least one call to ring_buffer_prepare_sync. | 3960 | * expected to make at least one call to ring_buffer_read_prepare_sync. |
3960 | * Afterwards, ring_buffer_read_start is invoked to get things going | 3961 | * Afterwards, ring_buffer_read_start is invoked to get things going |
3961 | * for real. | 3962 | * for real. |
3962 | * | 3963 | * |
3963 | * This overall must be paired with ring_buffer_finish. | 3964 | * This overall must be paired with ring_buffer_read_finish. |
3964 | */ | 3965 | */ |
3965 | struct ring_buffer_iter * | 3966 | struct ring_buffer_iter * |
3966 | ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) | 3967 | ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) |
@@ -4009,7 +4010,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); | |||
4009 | * an intervening ring_buffer_read_prepare_sync must have been | 4010 | * an intervening ring_buffer_read_prepare_sync must have been |
4010 | * performed. | 4011 | * performed. |
4011 | * | 4012 | * |
4012 | * Must be paired with ring_buffer_finish. | 4013 | * Must be paired with ring_buffer_read_finish. |
4013 | */ | 4014 | */ |
4014 | void | 4015 | void |
4015 | ring_buffer_read_start(struct ring_buffer_iter *iter) | 4016 | ring_buffer_read_start(struct ring_buffer_iter *iter) |
@@ -4031,7 +4032,7 @@ ring_buffer_read_start(struct ring_buffer_iter *iter) | |||
4031 | EXPORT_SYMBOL_GPL(ring_buffer_read_start); | 4032 | EXPORT_SYMBOL_GPL(ring_buffer_read_start); |
4032 | 4033 | ||
4033 | /** | 4034 | /** |
4034 | * ring_buffer_finish - finish reading the iterator of the buffer | 4035 | * ring_buffer_read_finish - finish reading the iterator of the buffer |
4035 | * @iter: The iterator retrieved by ring_buffer_start | 4036 | * @iter: The iterator retrieved by ring_buffer_start |
4036 | * | 4037 | * |
4037 | * This re-enables the recording to the buffer, and frees the | 4038 | * This re-enables the recording to the buffer, and frees the |
@@ -4346,6 +4347,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); | |||
4346 | /** | 4347 | /** |
4347 | * ring_buffer_alloc_read_page - allocate a page to read from buffer | 4348 | * ring_buffer_alloc_read_page - allocate a page to read from buffer |
4348 | * @buffer: the buffer to allocate for. | 4349 | * @buffer: the buffer to allocate for. |
4350 | * @cpu: the cpu buffer to allocate. | ||
4349 | * | 4351 | * |
4350 | * This function is used in conjunction with ring_buffer_read_page. | 4352 | * This function is used in conjunction with ring_buffer_read_page. |
4351 | * When reading a full page from the ring buffer, these functions | 4353 | * When reading a full page from the ring buffer, these functions |
@@ -4403,7 +4405,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); | |||
4403 | * to swap with a page in the ring buffer. | 4405 | * to swap with a page in the ring buffer. |
4404 | * | 4406 | * |
4405 | * for example: | 4407 | * for example: |
4406 | * rpage = ring_buffer_alloc_read_page(buffer); | 4408 | * rpage = ring_buffer_alloc_read_page(buffer, cpu); |
4407 | * if (!rpage) | 4409 | * if (!rpage) |
4408 | * return error; | 4410 | * return error; |
4409 | * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); | 4411 | * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); |
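Editor's note: switching constant strings over to trace_seq_puts()/trace_seq_putc() skips the printf format parser and avoids any surprises from literal '%' characters; the formatted variant stays only where a value is actually interpolated. In short, using the calls already shown in this hunk:

    /* Constant text: no format parsing needed. */
    trace_seq_puts(s, "# compressed entry header\n");
    trace_seq_putc(s, '\n');

    /* Formatted text: keep trace_seq_printf() for real conversions. */
    trace_seq_printf(s, "\tpadding : type == %d\n", RINGBUF_TYPE_PADDING);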
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0cd500bffd9b..496f94d57698 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -243,20 +243,25 @@ int filter_current_check_discard(struct ring_buffer *buffer, | |||
243 | } | 243 | } |
244 | EXPORT_SYMBOL_GPL(filter_current_check_discard); | 244 | EXPORT_SYMBOL_GPL(filter_current_check_discard); |
245 | 245 | ||
246 | cycle_t ftrace_now(int cpu) | 246 | cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) |
247 | { | 247 | { |
248 | u64 ts; | 248 | u64 ts; |
249 | 249 | ||
250 | /* Early boot up does not have a buffer yet */ | 250 | /* Early boot up does not have a buffer yet */ |
251 | if (!global_trace.trace_buffer.buffer) | 251 | if (!buf->buffer) |
252 | return trace_clock_local(); | 252 | return trace_clock_local(); |
253 | 253 | ||
254 | ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu); | 254 | ts = ring_buffer_time_stamp(buf->buffer, cpu); |
255 | ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts); | 255 | ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts); |
256 | 256 | ||
257 | return ts; | 257 | return ts; |
258 | } | 258 | } |
259 | 259 | ||
260 | cycle_t ftrace_now(int cpu) | ||
261 | { | ||
262 | return buffer_ftrace_now(&global_trace.trace_buffer, cpu); | ||
263 | } | ||
264 | |||
260 | /** | 265 | /** |
261 | * tracing_is_enabled - Show if global_trace has been disabled | 266 | * tracing_is_enabled - Show if global_trace has been disabled |
262 | * | 267 | * |
@@ -1211,7 +1216,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf) | |||
1211 | /* Make sure all commits have finished */ | 1216 | /* Make sure all commits have finished */ |
1212 | synchronize_sched(); | 1217 | synchronize_sched(); |
1213 | 1218 | ||
1214 | buf->time_start = ftrace_now(buf->cpu); | 1219 | buf->time_start = buffer_ftrace_now(buf, buf->cpu); |
1215 | 1220 | ||
1216 | for_each_online_cpu(cpu) | 1221 | for_each_online_cpu(cpu) |
1217 | ring_buffer_reset_cpu(buffer, cpu); | 1222 | ring_buffer_reset_cpu(buffer, cpu); |
@@ -1219,23 +1224,17 @@ void tracing_reset_online_cpus(struct trace_buffer *buf) | |||
1219 | ring_buffer_record_enable(buffer); | 1224 | ring_buffer_record_enable(buffer); |
1220 | } | 1225 | } |
1221 | 1226 | ||
1222 | void tracing_reset_current(int cpu) | 1227 | /* Must have trace_types_lock held */ |
1223 | { | ||
1224 | tracing_reset(&global_trace.trace_buffer, cpu); | ||
1225 | } | ||
1226 | |||
1227 | void tracing_reset_all_online_cpus(void) | 1228 | void tracing_reset_all_online_cpus(void) |
1228 | { | 1229 | { |
1229 | struct trace_array *tr; | 1230 | struct trace_array *tr; |
1230 | 1231 | ||
1231 | mutex_lock(&trace_types_lock); | ||
1232 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { | 1232 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { |
1233 | tracing_reset_online_cpus(&tr->trace_buffer); | 1233 | tracing_reset_online_cpus(&tr->trace_buffer); |
1234 | #ifdef CONFIG_TRACER_MAX_TRACE | 1234 | #ifdef CONFIG_TRACER_MAX_TRACE |
1235 | tracing_reset_online_cpus(&tr->max_buffer); | 1235 | tracing_reset_online_cpus(&tr->max_buffer); |
1236 | #endif | 1236 | #endif |
1237 | } | 1237 | } |
1238 | mutex_unlock(&trace_types_lock); | ||
1239 | } | 1238 | } |
1240 | 1239 | ||
1241 | #define SAVED_CMDLINES 128 | 1240 | #define SAVED_CMDLINES 128 |
@@ -2843,6 +2842,17 @@ static int s_show(struct seq_file *m, void *v) | |||
2843 | return 0; | 2842 | return 0; |
2844 | } | 2843 | } |
2845 | 2844 | ||
2845 | /* | ||
2846 | * Should be used after trace_array_get(), trace_types_lock | ||
2847 | * ensures that i_cdev was already initialized. | ||
2848 | */ | ||
2849 | static inline int tracing_get_cpu(struct inode *inode) | ||
2850 | { | ||
2851 | if (inode->i_cdev) /* See trace_create_cpu_file() */ | ||
2852 | return (long)inode->i_cdev - 1; | ||
2853 | return RING_BUFFER_ALL_CPUS; | ||
2854 | } | ||
2855 | |||
2846 | static const struct seq_operations tracer_seq_ops = { | 2856 | static const struct seq_operations tracer_seq_ops = { |
2847 | .start = s_start, | 2857 | .start = s_start, |
2848 | .next = s_next, | 2858 | .next = s_next, |
@@ -2851,9 +2861,9 @@ static const struct seq_operations tracer_seq_ops = { | |||
2851 | }; | 2861 | }; |
2852 | 2862 | ||
2853 | static struct trace_iterator * | 2863 | static struct trace_iterator * |
2854 | __tracing_open(struct trace_array *tr, struct trace_cpu *tc, | 2864 | __tracing_open(struct inode *inode, struct file *file, bool snapshot) |
2855 | struct inode *inode, struct file *file, bool snapshot) | ||
2856 | { | 2865 | { |
2866 | struct trace_array *tr = inode->i_private; | ||
2857 | struct trace_iterator *iter; | 2867 | struct trace_iterator *iter; |
2858 | int cpu; | 2868 | int cpu; |
2859 | 2869 | ||
@@ -2894,8 +2904,8 @@ __tracing_open(struct trace_array *tr, struct trace_cpu *tc, | |||
2894 | iter->trace_buffer = &tr->trace_buffer; | 2904 | iter->trace_buffer = &tr->trace_buffer; |
2895 | iter->snapshot = snapshot; | 2905 | iter->snapshot = snapshot; |
2896 | iter->pos = -1; | 2906 | iter->pos = -1; |
2907 | iter->cpu_file = tracing_get_cpu(inode); | ||
2897 | mutex_init(&iter->mutex); | 2908 | mutex_init(&iter->mutex); |
2898 | iter->cpu_file = tc->cpu; | ||
2899 | 2909 | ||
2900 | /* Notify the tracer early; before we stop tracing. */ | 2910 | /* Notify the tracer early; before we stop tracing. */ |
2901 | if (iter->trace && iter->trace->open) | 2911 | if (iter->trace && iter->trace->open) |
@@ -2971,45 +2981,22 @@ static int tracing_open_generic_tr(struct inode *inode, struct file *filp) | |||
2971 | filp->private_data = inode->i_private; | 2981 | filp->private_data = inode->i_private; |
2972 | 2982 | ||
2973 | return 0; | 2983 | return 0; |
2974 | |||
2975 | } | ||
2976 | |||
2977 | static int tracing_open_generic_tc(struct inode *inode, struct file *filp) | ||
2978 | { | ||
2979 | struct trace_cpu *tc = inode->i_private; | ||
2980 | struct trace_array *tr = tc->tr; | ||
2981 | |||
2982 | if (tracing_disabled) | ||
2983 | return -ENODEV; | ||
2984 | |||
2985 | if (trace_array_get(tr) < 0) | ||
2986 | return -ENODEV; | ||
2987 | |||
2988 | filp->private_data = inode->i_private; | ||
2989 | |||
2990 | return 0; | ||
2991 | |||
2992 | } | 2984 | } |
2993 | 2985 | ||
2994 | static int tracing_release(struct inode *inode, struct file *file) | 2986 | static int tracing_release(struct inode *inode, struct file *file) |
2995 | { | 2987 | { |
2988 | struct trace_array *tr = inode->i_private; | ||
2996 | struct seq_file *m = file->private_data; | 2989 | struct seq_file *m = file->private_data; |
2997 | struct trace_iterator *iter; | 2990 | struct trace_iterator *iter; |
2998 | struct trace_array *tr; | ||
2999 | int cpu; | 2991 | int cpu; |
3000 | 2992 | ||
3001 | /* Writes do not use seq_file, need to grab tr from inode */ | ||
3002 | if (!(file->f_mode & FMODE_READ)) { | 2993 | if (!(file->f_mode & FMODE_READ)) { |
3003 | struct trace_cpu *tc = inode->i_private; | 2994 | trace_array_put(tr); |
3004 | |||
3005 | trace_array_put(tc->tr); | ||
3006 | return 0; | 2995 | return 0; |
3007 | } | 2996 | } |
3008 | 2997 | ||
2998 | /* Writes do not use seq_file */ | ||
3009 | iter = m->private; | 2999 | iter = m->private; |
3010 | tr = iter->tr; | ||
3011 | trace_array_put(tr); | ||
3012 | |||
3013 | mutex_lock(&trace_types_lock); | 3000 | mutex_lock(&trace_types_lock); |
3014 | 3001 | ||
3015 | for_each_tracing_cpu(cpu) { | 3002 | for_each_tracing_cpu(cpu) { |
@@ -3023,6 +3010,9 @@ static int tracing_release(struct inode *inode, struct file *file) | |||
3023 | if (!iter->snapshot) | 3010 | if (!iter->snapshot) |
3024 | /* reenable tracing if it was previously enabled */ | 3011 | /* reenable tracing if it was previously enabled */ |
3025 | tracing_start_tr(tr); | 3012 | tracing_start_tr(tr); |
3013 | |||
3014 | __trace_array_put(tr); | ||
3015 | |||
3026 | mutex_unlock(&trace_types_lock); | 3016 | mutex_unlock(&trace_types_lock); |
3027 | 3017 | ||
3028 | mutex_destroy(&iter->mutex); | 3018 | mutex_destroy(&iter->mutex); |
@@ -3042,15 +3032,6 @@ static int tracing_release_generic_tr(struct inode *inode, struct file *file) | |||
3042 | return 0; | 3032 | return 0; |
3043 | } | 3033 | } |
3044 | 3034 | ||
3045 | static int tracing_release_generic_tc(struct inode *inode, struct file *file) | ||
3046 | { | ||
3047 | struct trace_cpu *tc = inode->i_private; | ||
3048 | struct trace_array *tr = tc->tr; | ||
3049 | |||
3050 | trace_array_put(tr); | ||
3051 | return 0; | ||
3052 | } | ||
3053 | |||
3054 | static int tracing_single_release_tr(struct inode *inode, struct file *file) | 3035 | static int tracing_single_release_tr(struct inode *inode, struct file *file) |
3055 | { | 3036 | { |
3056 | struct trace_array *tr = inode->i_private; | 3037 | struct trace_array *tr = inode->i_private; |
@@ -3062,8 +3043,7 @@ static int tracing_single_release_tr(struct inode *inode, struct file *file) | |||
3062 | 3043 | ||
3063 | static int tracing_open(struct inode *inode, struct file *file) | 3044 | static int tracing_open(struct inode *inode, struct file *file) |
3064 | { | 3045 | { |
3065 | struct trace_cpu *tc = inode->i_private; | 3046 | struct trace_array *tr = inode->i_private; |
3066 | struct trace_array *tr = tc->tr; | ||
3067 | struct trace_iterator *iter; | 3047 | struct trace_iterator *iter; |
3068 | int ret = 0; | 3048 | int ret = 0; |
3069 | 3049 | ||
@@ -3071,16 +3051,17 @@ static int tracing_open(struct inode *inode, struct file *file) | |||
3071 | return -ENODEV; | 3051 | return -ENODEV; |
3072 | 3052 | ||
3073 | /* If this file was open for write, then erase contents */ | 3053 | /* If this file was open for write, then erase contents */ |
3074 | if ((file->f_mode & FMODE_WRITE) && | 3054 | if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { |
3075 | (file->f_flags & O_TRUNC)) { | 3055 | int cpu = tracing_get_cpu(inode); |
3076 | if (tc->cpu == RING_BUFFER_ALL_CPUS) | 3056 | |
3057 | if (cpu == RING_BUFFER_ALL_CPUS) | ||
3077 | tracing_reset_online_cpus(&tr->trace_buffer); | 3058 | tracing_reset_online_cpus(&tr->trace_buffer); |
3078 | else | 3059 | else |
3079 | tracing_reset(&tr->trace_buffer, tc->cpu); | 3060 | tracing_reset(&tr->trace_buffer, cpu); |
3080 | } | 3061 | } |
3081 | 3062 | ||
3082 | if (file->f_mode & FMODE_READ) { | 3063 | if (file->f_mode & FMODE_READ) { |
3083 | iter = __tracing_open(tr, tc, inode, file, false); | 3064 | iter = __tracing_open(inode, file, false); |
3084 | if (IS_ERR(iter)) | 3065 | if (IS_ERR(iter)) |
3085 | ret = PTR_ERR(iter); | 3066 | ret = PTR_ERR(iter); |
3086 | else if (trace_flags & TRACE_ITER_LATENCY_FMT) | 3067 | else if (trace_flags & TRACE_ITER_LATENCY_FMT) |
@@ -3447,6 +3428,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, | |||
3447 | static int tracing_trace_options_open(struct inode *inode, struct file *file) | 3428 | static int tracing_trace_options_open(struct inode *inode, struct file *file) |
3448 | { | 3429 | { |
3449 | struct trace_array *tr = inode->i_private; | 3430 | struct trace_array *tr = inode->i_private; |
3431 | int ret; | ||
3450 | 3432 | ||
3451 | if (tracing_disabled) | 3433 | if (tracing_disabled) |
3452 | return -ENODEV; | 3434 | return -ENODEV; |
@@ -3454,7 +3436,11 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file) | |||
3454 | if (trace_array_get(tr) < 0) | 3436 | if (trace_array_get(tr) < 0) |
3455 | return -ENODEV; | 3437 | return -ENODEV; |
3456 | 3438 | ||
3457 | return single_open(file, tracing_trace_options_show, inode->i_private); | 3439 | ret = single_open(file, tracing_trace_options_show, inode->i_private); |
3440 | if (ret < 0) | ||
3441 | trace_array_put(tr); | ||
3442 | |||
3443 | return ret; | ||
3458 | } | 3444 | } |
3459 | 3445 | ||
3460 | static const struct file_operations tracing_iter_fops = { | 3446 | static const struct file_operations tracing_iter_fops = { |
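Editor's note: the recurring pattern in these open/release hunks is to pin the trace_array before the file is handed out and to drop that reference both in ->release() and on every failure path inside ->open(), exactly as tracing_trace_options_open() now does around single_open(). A minimal sketch of the pairing; example_open()/example_show()/example_release() are hypothetical names, while trace_array_get()/trace_array_put() are the helpers visible in the hunks:

    /* Sketch of the refcounted open/release pairing used throughout trace.c. */
    static int example_show(struct seq_file *m, void *v);      /* hypothetical */

    static int example_open(struct inode *inode, struct file *filp)
    {
            struct trace_array *tr = inode->i_private;
            int ret;

            if (trace_array_get(tr) < 0)            /* pin the instance */
                    return -ENODEV;

            ret = single_open(filp, example_show, inode->i_private);
            if (ret < 0)                            /* undo the get on failure */
                    trace_array_put(tr);
            return ret;
    }

    static int example_release(struct inode *inode, struct file *filp)
    {
            trace_array_put(inode->i_private);      /* balance example_open() */
            return single_release(inode, filp);
    }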
@@ -3537,14 +3523,14 @@ static const char readme_msg[] = | |||
3537 | "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" | 3523 | "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" |
3538 | "\t\t\t Read the contents for more information\n" | 3524 | "\t\t\t Read the contents for more information\n" |
3539 | #endif | 3525 | #endif |
3540 | #ifdef CONFIG_STACKTRACE | 3526 | #ifdef CONFIG_STACK_TRACER |
3541 | " stack_trace\t\t- Shows the max stack trace when active\n" | 3527 | " stack_trace\t\t- Shows the max stack trace when active\n" |
3542 | " stack_max_size\t- Shows current max stack size that was traced\n" | 3528 | " stack_max_size\t- Shows current max stack size that was traced\n" |
3543 | "\t\t\t Write into this file to reset the max size (trigger a new trace)\n" | 3529 | "\t\t\t Write into this file to reset the max size (trigger a new trace)\n" |
3544 | #ifdef CONFIG_DYNAMIC_FTRACE | 3530 | #ifdef CONFIG_DYNAMIC_FTRACE |
3545 | " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" | 3531 | " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" |
3546 | #endif | 3532 | #endif |
3547 | #endif /* CONFIG_STACKTRACE */ | 3533 | #endif /* CONFIG_STACK_TRACER */ |
3548 | ; | 3534 | ; |
3549 | 3535 | ||
3550 | static ssize_t | 3536 | static ssize_t |
@@ -3941,8 +3927,7 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, | |||
3941 | 3927 | ||
3942 | static int tracing_open_pipe(struct inode *inode, struct file *filp) | 3928 | static int tracing_open_pipe(struct inode *inode, struct file *filp) |
3943 | { | 3929 | { |
3944 | struct trace_cpu *tc = inode->i_private; | 3930 | struct trace_array *tr = inode->i_private; |
3945 | struct trace_array *tr = tc->tr; | ||
3946 | struct trace_iterator *iter; | 3931 | struct trace_iterator *iter; |
3947 | int ret = 0; | 3932 | int ret = 0; |
3948 | 3933 | ||
@@ -3958,6 +3943,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
3958 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 3943 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); |
3959 | if (!iter) { | 3944 | if (!iter) { |
3960 | ret = -ENOMEM; | 3945 | ret = -ENOMEM; |
3946 | __trace_array_put(tr); | ||
3961 | goto out; | 3947 | goto out; |
3962 | } | 3948 | } |
3963 | 3949 | ||
@@ -3987,9 +3973,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) | |||
3987 | if (trace_clocks[tr->clock_id].in_ns) | 3973 | if (trace_clocks[tr->clock_id].in_ns) |
3988 | iter->iter_flags |= TRACE_FILE_TIME_IN_NS; | 3974 | iter->iter_flags |= TRACE_FILE_TIME_IN_NS; |
3989 | 3975 | ||
3990 | iter->cpu_file = tc->cpu; | 3976 | iter->tr = tr; |
3991 | iter->tr = tc->tr; | 3977 | iter->trace_buffer = &tr->trace_buffer; |
3992 | iter->trace_buffer = &tc->tr->trace_buffer; | 3978 | iter->cpu_file = tracing_get_cpu(inode); |
3993 | mutex_init(&iter->mutex); | 3979 | mutex_init(&iter->mutex); |
3994 | filp->private_data = iter; | 3980 | filp->private_data = iter; |
3995 | 3981 | ||
@@ -4012,8 +3998,7 @@ fail: | |||
4012 | static int tracing_release_pipe(struct inode *inode, struct file *file) | 3998 | static int tracing_release_pipe(struct inode *inode, struct file *file) |
4013 | { | 3999 | { |
4014 | struct trace_iterator *iter = file->private_data; | 4000 | struct trace_iterator *iter = file->private_data; |
4015 | struct trace_cpu *tc = inode->i_private; | 4001 | struct trace_array *tr = inode->i_private; |
4016 | struct trace_array *tr = tc->tr; | ||
4017 | 4002 | ||
4018 | mutex_lock(&trace_types_lock); | 4003 | mutex_lock(&trace_types_lock); |
4019 | 4004 | ||
@@ -4166,6 +4151,7 @@ waitagain: | |||
4166 | memset(&iter->seq, 0, | 4151 | memset(&iter->seq, 0, |
4167 | sizeof(struct trace_iterator) - | 4152 | sizeof(struct trace_iterator) - |
4168 | offsetof(struct trace_iterator, seq)); | 4153 | offsetof(struct trace_iterator, seq)); |
4154 | cpumask_clear(iter->started); | ||
4169 | iter->pos = -1; | 4155 | iter->pos = -1; |
4170 | 4156 | ||
4171 | trace_event_read_lock(); | 4157 | trace_event_read_lock(); |
@@ -4366,15 +4352,16 @@ static ssize_t | |||
4366 | tracing_entries_read(struct file *filp, char __user *ubuf, | 4352 | tracing_entries_read(struct file *filp, char __user *ubuf, |
4367 | size_t cnt, loff_t *ppos) | 4353 | size_t cnt, loff_t *ppos) |
4368 | { | 4354 | { |
4369 | struct trace_cpu *tc = filp->private_data; | 4355 | struct inode *inode = file_inode(filp); |
4370 | struct trace_array *tr = tc->tr; | 4356 | struct trace_array *tr = inode->i_private; |
4357 | int cpu = tracing_get_cpu(inode); | ||
4371 | char buf[64]; | 4358 | char buf[64]; |
4372 | int r = 0; | 4359 | int r = 0; |
4373 | ssize_t ret; | 4360 | ssize_t ret; |
4374 | 4361 | ||
4375 | mutex_lock(&trace_types_lock); | 4362 | mutex_lock(&trace_types_lock); |
4376 | 4363 | ||
4377 | if (tc->cpu == RING_BUFFER_ALL_CPUS) { | 4364 | if (cpu == RING_BUFFER_ALL_CPUS) { |
4378 | int cpu, buf_size_same; | 4365 | int cpu, buf_size_same; |
4379 | unsigned long size; | 4366 | unsigned long size; |
4380 | 4367 | ||
@@ -4401,7 +4388,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, | |||
4401 | } else | 4388 | } else |
4402 | r = sprintf(buf, "X\n"); | 4389 | r = sprintf(buf, "X\n"); |
4403 | } else | 4390 | } else |
4404 | r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10); | 4391 | r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10); |
4405 | 4392 | ||
4406 | mutex_unlock(&trace_types_lock); | 4393 | mutex_unlock(&trace_types_lock); |
4407 | 4394 | ||
@@ -4413,7 +4400,8 @@ static ssize_t | |||
4413 | tracing_entries_write(struct file *filp, const char __user *ubuf, | 4400 | tracing_entries_write(struct file *filp, const char __user *ubuf, |
4414 | size_t cnt, loff_t *ppos) | 4401 | size_t cnt, loff_t *ppos) |
4415 | { | 4402 | { |
4416 | struct trace_cpu *tc = filp->private_data; | 4403 | struct inode *inode = file_inode(filp); |
4404 | struct trace_array *tr = inode->i_private; | ||
4417 | unsigned long val; | 4405 | unsigned long val; |
4418 | int ret; | 4406 | int ret; |
4419 | 4407 | ||
@@ -4427,8 +4415,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
4427 | 4415 | ||
4428 | /* value is in KB */ | 4416 | /* value is in KB */ |
4429 | val <<= 10; | 4417 | val <<= 10; |
4430 | 4418 | ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode)); | |
4431 | ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu); | ||
4432 | if (ret < 0) | 4419 | if (ret < 0) |
4433 | return ret; | 4420 | return ret; |
4434 | 4421 | ||
@@ -4482,7 +4469,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) | |||
4482 | 4469 | ||
4483 | /* disable tracing ? */ | 4470 | /* disable tracing ? */ |
4484 | if (trace_flags & TRACE_ITER_STOP_ON_FREE) | 4471 | if (trace_flags & TRACE_ITER_STOP_ON_FREE) |
4485 | tracing_off(); | 4472 | tracer_tracing_off(tr); |
4486 | /* resize the ring buffer to 0 */ | 4473 | /* resize the ring buffer to 0 */ |
4487 | tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); | 4474 | tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); |
4488 | 4475 | ||
@@ -4647,12 +4634,12 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, | |||
4647 | * New clock may not be consistent with the previous clock. | 4634 | * New clock may not be consistent with the previous clock. |
4648 | * Reset the buffer so that it doesn't have incomparable timestamps. | 4635 | * Reset the buffer so that it doesn't have incomparable timestamps. |
4649 | */ | 4636 | */ |
4650 | tracing_reset_online_cpus(&global_trace.trace_buffer); | 4637 | tracing_reset_online_cpus(&tr->trace_buffer); |
4651 | 4638 | ||
4652 | #ifdef CONFIG_TRACER_MAX_TRACE | 4639 | #ifdef CONFIG_TRACER_MAX_TRACE |
4653 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) | 4640 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) |
4654 | ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); | 4641 | ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); |
4655 | tracing_reset_online_cpus(&global_trace.max_buffer); | 4642 | tracing_reset_online_cpus(&tr->max_buffer); |
4656 | #endif | 4643 | #endif |
4657 | 4644 | ||
4658 | mutex_unlock(&trace_types_lock); | 4645 | mutex_unlock(&trace_types_lock); |
@@ -4689,8 +4676,7 @@ struct ftrace_buffer_info { | |||
4689 | #ifdef CONFIG_TRACER_SNAPSHOT | 4676 | #ifdef CONFIG_TRACER_SNAPSHOT |
4690 | static int tracing_snapshot_open(struct inode *inode, struct file *file) | 4677 | static int tracing_snapshot_open(struct inode *inode, struct file *file) |
4691 | { | 4678 | { |
4692 | struct trace_cpu *tc = inode->i_private; | 4679 | struct trace_array *tr = inode->i_private; |
4693 | struct trace_array *tr = tc->tr; | ||
4694 | struct trace_iterator *iter; | 4680 | struct trace_iterator *iter; |
4695 | struct seq_file *m; | 4681 | struct seq_file *m; |
4696 | int ret = 0; | 4682 | int ret = 0; |
@@ -4699,26 +4685,29 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file) | |||
4699 | return -ENODEV; | 4685 | return -ENODEV; |
4700 | 4686 | ||
4701 | if (file->f_mode & FMODE_READ) { | 4687 | if (file->f_mode & FMODE_READ) { |
4702 | iter = __tracing_open(tr, tc, inode, file, true); | 4688 | iter = __tracing_open(inode, file, true); |
4703 | if (IS_ERR(iter)) | 4689 | if (IS_ERR(iter)) |
4704 | ret = PTR_ERR(iter); | 4690 | ret = PTR_ERR(iter); |
4705 | } else { | 4691 | } else { |
4706 | /* Writes still need the seq_file to hold the private data */ | 4692 | /* Writes still need the seq_file to hold the private data */ |
4693 | ret = -ENOMEM; | ||
4707 | m = kzalloc(sizeof(*m), GFP_KERNEL); | 4694 | m = kzalloc(sizeof(*m), GFP_KERNEL); |
4708 | if (!m) | 4695 | if (!m) |
4709 | return -ENOMEM; | 4696 | goto out; |
4710 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | 4697 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); |
4711 | if (!iter) { | 4698 | if (!iter) { |
4712 | kfree(m); | 4699 | kfree(m); |
4713 | return -ENOMEM; | 4700 | goto out; |
4714 | } | 4701 | } |
4702 | ret = 0; | ||
4703 | |||
4715 | iter->tr = tr; | 4704 | iter->tr = tr; |
4716 | iter->trace_buffer = &tc->tr->max_buffer; | 4705 | iter->trace_buffer = &tr->max_buffer; |
4717 | iter->cpu_file = tc->cpu; | 4706 | iter->cpu_file = tracing_get_cpu(inode); |
4718 | m->private = iter; | 4707 | m->private = iter; |
4719 | file->private_data = m; | 4708 | file->private_data = m; |
4720 | } | 4709 | } |
4721 | 4710 | out: | |
4722 | if (ret < 0) | 4711 | if (ret < 0) |
4723 | trace_array_put(tr); | 4712 | trace_array_put(tr); |
4724 | 4713 | ||
@@ -4873,11 +4862,11 @@ static const struct file_operations tracing_pipe_fops = { | |||
4873 | }; | 4862 | }; |
4874 | 4863 | ||
4875 | static const struct file_operations tracing_entries_fops = { | 4864 | static const struct file_operations tracing_entries_fops = { |
4876 | .open = tracing_open_generic_tc, | 4865 | .open = tracing_open_generic_tr, |
4877 | .read = tracing_entries_read, | 4866 | .read = tracing_entries_read, |
4878 | .write = tracing_entries_write, | 4867 | .write = tracing_entries_write, |
4879 | .llseek = generic_file_llseek, | 4868 | .llseek = generic_file_llseek, |
4880 | .release = tracing_release_generic_tc, | 4869 | .release = tracing_release_generic_tr, |
4881 | }; | 4870 | }; |
4882 | 4871 | ||
4883 | static const struct file_operations tracing_total_entries_fops = { | 4872 | static const struct file_operations tracing_total_entries_fops = { |
@@ -4929,8 +4918,7 @@ static const struct file_operations snapshot_raw_fops = { | |||
4929 | 4918 | ||
4930 | static int tracing_buffers_open(struct inode *inode, struct file *filp) | 4919 | static int tracing_buffers_open(struct inode *inode, struct file *filp) |
4931 | { | 4920 | { |
4932 | struct trace_cpu *tc = inode->i_private; | 4921 | struct trace_array *tr = inode->i_private; |
4933 | struct trace_array *tr = tc->tr; | ||
4934 | struct ftrace_buffer_info *info; | 4922 | struct ftrace_buffer_info *info; |
4935 | int ret; | 4923 | int ret; |
4936 | 4924 | ||
@@ -4948,10 +4936,8 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) | |||
4948 | 4936 | ||
4949 | mutex_lock(&trace_types_lock); | 4937 | mutex_lock(&trace_types_lock); |
4950 | 4938 | ||
4951 | tr->ref++; | ||
4952 | |||
4953 | info->iter.tr = tr; | 4939 | info->iter.tr = tr; |
4954 | info->iter.cpu_file = tc->cpu; | 4940 | info->iter.cpu_file = tracing_get_cpu(inode); |
4955 | info->iter.trace = tr->current_trace; | 4941 | info->iter.trace = tr->current_trace; |
4956 | info->iter.trace_buffer = &tr->trace_buffer; | 4942 | info->iter.trace_buffer = &tr->trace_buffer; |
4957 | info->spare = NULL; | 4943 | info->spare = NULL; |
@@ -5268,14 +5254,14 @@ static ssize_t | |||
5268 | tracing_stats_read(struct file *filp, char __user *ubuf, | 5254 | tracing_stats_read(struct file *filp, char __user *ubuf, |
5269 | size_t count, loff_t *ppos) | 5255 | size_t count, loff_t *ppos) |
5270 | { | 5256 | { |
5271 | struct trace_cpu *tc = filp->private_data; | 5257 | struct inode *inode = file_inode(filp); |
5272 | struct trace_array *tr = tc->tr; | 5258 | struct trace_array *tr = inode->i_private; |
5273 | struct trace_buffer *trace_buf = &tr->trace_buffer; | 5259 | struct trace_buffer *trace_buf = &tr->trace_buffer; |
5260 | int cpu = tracing_get_cpu(inode); | ||
5274 | struct trace_seq *s; | 5261 | struct trace_seq *s; |
5275 | unsigned long cnt; | 5262 | unsigned long cnt; |
5276 | unsigned long long t; | 5263 | unsigned long long t; |
5277 | unsigned long usec_rem; | 5264 | unsigned long usec_rem; |
5278 | int cpu = tc->cpu; | ||
5279 | 5265 | ||
5280 | s = kmalloc(sizeof(*s), GFP_KERNEL); | 5266 | s = kmalloc(sizeof(*s), GFP_KERNEL); |
5281 | if (!s) | 5267 | if (!s) |
@@ -5328,9 +5314,10 @@ tracing_stats_read(struct file *filp, char __user *ubuf, | |||
5328 | } | 5314 | } |
5329 | 5315 | ||
5330 | static const struct file_operations tracing_stats_fops = { | 5316 | static const struct file_operations tracing_stats_fops = { |
5331 | .open = tracing_open_generic, | 5317 | .open = tracing_open_generic_tr, |
5332 | .read = tracing_stats_read, | 5318 | .read = tracing_stats_read, |
5333 | .llseek = generic_file_llseek, | 5319 | .llseek = generic_file_llseek, |
5320 | .release = tracing_release_generic_tr, | ||
5334 | }; | 5321 | }; |
5335 | 5322 | ||
5336 | #ifdef CONFIG_DYNAMIC_FTRACE | 5323 | #ifdef CONFIG_DYNAMIC_FTRACE |
@@ -5519,10 +5506,20 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) | |||
5519 | return tr->percpu_dir; | 5506 | return tr->percpu_dir; |
5520 | } | 5507 | } |
5521 | 5508 | ||
5509 | static struct dentry * | ||
5510 | trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, | ||
5511 | void *data, long cpu, const struct file_operations *fops) | ||
5512 | { | ||
5513 | struct dentry *ret = trace_create_file(name, mode, parent, data, fops); | ||
5514 | |||
5515 | if (ret) /* See tracing_get_cpu() */ | ||
5516 | ret->d_inode->i_cdev = (void *)(cpu + 1); | ||
5517 | return ret; | ||
5518 | } | ||
5519 | |||
5522 | static void | 5520 | static void |
5523 | tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) | 5521 | tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) |
5524 | { | 5522 | { |
5525 | struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu); | ||
5526 | struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); | 5523 | struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); |
5527 | struct dentry *d_cpu; | 5524 | struct dentry *d_cpu; |
5528 | char cpu_dir[30]; /* 30 characters should be more than enough */ | 5525 | char cpu_dir[30]; /* 30 characters should be more than enough */ |
@@ -5538,28 +5535,28 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) | |||
5538 | } | 5535 | } |
5539 | 5536 | ||
5540 | /* per cpu trace_pipe */ | 5537 | /* per cpu trace_pipe */ |
5541 | trace_create_file("trace_pipe", 0444, d_cpu, | 5538 | trace_create_cpu_file("trace_pipe", 0444, d_cpu, |
5542 | (void *)&data->trace_cpu, &tracing_pipe_fops); | 5539 | tr, cpu, &tracing_pipe_fops); |
5543 | 5540 | ||
5544 | /* per cpu trace */ | 5541 | /* per cpu trace */ |
5545 | trace_create_file("trace", 0644, d_cpu, | 5542 | trace_create_cpu_file("trace", 0644, d_cpu, |
5546 | (void *)&data->trace_cpu, &tracing_fops); | 5543 | tr, cpu, &tracing_fops); |
5547 | 5544 | ||
5548 | trace_create_file("trace_pipe_raw", 0444, d_cpu, | 5545 | trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu, |
5549 | (void *)&data->trace_cpu, &tracing_buffers_fops); | 5546 | tr, cpu, &tracing_buffers_fops); |
5550 | 5547 | ||
5551 | trace_create_file("stats", 0444, d_cpu, | 5548 | trace_create_cpu_file("stats", 0444, d_cpu, |
5552 | (void *)&data->trace_cpu, &tracing_stats_fops); | 5549 | tr, cpu, &tracing_stats_fops); |
5553 | 5550 | ||
5554 | trace_create_file("buffer_size_kb", 0444, d_cpu, | 5551 | trace_create_cpu_file("buffer_size_kb", 0444, d_cpu, |
5555 | (void *)&data->trace_cpu, &tracing_entries_fops); | 5552 | tr, cpu, &tracing_entries_fops); |
5556 | 5553 | ||
5557 | #ifdef CONFIG_TRACER_SNAPSHOT | 5554 | #ifdef CONFIG_TRACER_SNAPSHOT |
5558 | trace_create_file("snapshot", 0644, d_cpu, | 5555 | trace_create_cpu_file("snapshot", 0644, d_cpu, |
5559 | (void *)&data->trace_cpu, &snapshot_fops); | 5556 | tr, cpu, &snapshot_fops); |
5560 | 5557 | ||
5561 | trace_create_file("snapshot_raw", 0444, d_cpu, | 5558 | trace_create_cpu_file("snapshot_raw", 0444, d_cpu, |
5562 | (void *)&data->trace_cpu, &snapshot_raw_fops); | 5559 | tr, cpu, &snapshot_raw_fops); |
5563 | #endif | 5560 | #endif |
5564 | } | 5561 | } |
5565 | 5562 | ||
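Editor's note: the per-CPU files above no longer carry a private struct trace_cpu; trace_create_cpu_file() stashes cpu + 1 in the otherwise unused i_cdev field of the new inode, and the handlers recover it through tracing_get_cpu(). That helper is not part of these hunks, so the decode below is only a sketch inferred from the cpu + 1 encoding and the RING_BUFFER_ALL_CPUS fallback seen in its callers:

    /* Sketch: presumed counterpart of trace_create_cpu_file()'s
     * "(void *)(cpu + 1)" trick; a NULL i_cdev means a non-per-cpu file. */
    static inline int tracing_get_cpu(struct inode *inode)
    {
            if (inode->i_cdev)              /* per-cpu file: stored value is cpu + 1 */
                    return (long)inode->i_cdev - 1;
            return RING_BUFFER_ALL_CPUS;    /* top-level file: act on all CPUs */
    }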
@@ -5868,17 +5865,6 @@ struct dentry *trace_instance_dir; | |||
5868 | static void | 5865 | static void |
5869 | init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); | 5866 | init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); |
5870 | 5867 | ||
5871 | static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf) | ||
5872 | { | ||
5873 | int cpu; | ||
5874 | |||
5875 | for_each_tracing_cpu(cpu) { | ||
5876 | memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu)); | ||
5877 | per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu; | ||
5878 | per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr; | ||
5879 | } | ||
5880 | } | ||
5881 | |||
5882 | static int | 5868 | static int |
5883 | allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) | 5869 | allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) |
5884 | { | 5870 | { |
@@ -5896,8 +5882,6 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size | |||
5896 | return -ENOMEM; | 5882 | return -ENOMEM; |
5897 | } | 5883 | } |
5898 | 5884 | ||
5899 | init_trace_buffers(tr, buf); | ||
5900 | |||
5901 | /* Allocate the first page for all buffers */ | 5885 | /* Allocate the first page for all buffers */ |
5902 | set_buffer_entries(&tr->trace_buffer, | 5886 | set_buffer_entries(&tr->trace_buffer, |
5903 | ring_buffer_size(tr->trace_buffer.buffer, 0)); | 5887 | ring_buffer_size(tr->trace_buffer.buffer, 0)); |
@@ -5964,17 +5948,15 @@ static int new_instance_create(const char *name) | |||
5964 | if (allocate_trace_buffers(tr, trace_buf_size) < 0) | 5948 | if (allocate_trace_buffers(tr, trace_buf_size) < 0) |
5965 | goto out_free_tr; | 5949 | goto out_free_tr; |
5966 | 5950 | ||
5967 | /* Holder for file callbacks */ | ||
5968 | tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS; | ||
5969 | tr->trace_cpu.tr = tr; | ||
5970 | |||
5971 | tr->dir = debugfs_create_dir(name, trace_instance_dir); | 5951 | tr->dir = debugfs_create_dir(name, trace_instance_dir); |
5972 | if (!tr->dir) | 5952 | if (!tr->dir) |
5973 | goto out_free_tr; | 5953 | goto out_free_tr; |
5974 | 5954 | ||
5975 | ret = event_trace_add_tracer(tr->dir, tr); | 5955 | ret = event_trace_add_tracer(tr->dir, tr); |
5976 | if (ret) | 5956 | if (ret) { |
5957 | debugfs_remove_recursive(tr->dir); | ||
5977 | goto out_free_tr; | 5958 | goto out_free_tr; |
5959 | } | ||
5978 | 5960 | ||
5979 | init_tracer_debugfs(tr, tr->dir); | 5961 | init_tracer_debugfs(tr, tr->dir); |
5980 | 5962 | ||
@@ -6120,13 +6102,13 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | |||
6120 | tr, &tracing_iter_fops); | 6102 | tr, &tracing_iter_fops); |
6121 | 6103 | ||
6122 | trace_create_file("trace", 0644, d_tracer, | 6104 | trace_create_file("trace", 0644, d_tracer, |
6123 | (void *)&tr->trace_cpu, &tracing_fops); | 6105 | tr, &tracing_fops); |
6124 | 6106 | ||
6125 | trace_create_file("trace_pipe", 0444, d_tracer, | 6107 | trace_create_file("trace_pipe", 0444, d_tracer, |
6126 | (void *)&tr->trace_cpu, &tracing_pipe_fops); | 6108 | tr, &tracing_pipe_fops); |
6127 | 6109 | ||
6128 | trace_create_file("buffer_size_kb", 0644, d_tracer, | 6110 | trace_create_file("buffer_size_kb", 0644, d_tracer, |
6129 | (void *)&tr->trace_cpu, &tracing_entries_fops); | 6111 | tr, &tracing_entries_fops); |
6130 | 6112 | ||
6131 | trace_create_file("buffer_total_size_kb", 0444, d_tracer, | 6113 | trace_create_file("buffer_total_size_kb", 0444, d_tracer, |
6132 | tr, &tracing_total_entries_fops); | 6114 | tr, &tracing_total_entries_fops); |
@@ -6141,11 +6123,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | |||
6141 | &trace_clock_fops); | 6123 | &trace_clock_fops); |
6142 | 6124 | ||
6143 | trace_create_file("tracing_on", 0644, d_tracer, | 6125 | trace_create_file("tracing_on", 0644, d_tracer, |
6144 | tr, &rb_simple_fops); | 6126 | tr, &rb_simple_fops); |
6145 | 6127 | ||
6146 | #ifdef CONFIG_TRACER_SNAPSHOT | 6128 | #ifdef CONFIG_TRACER_SNAPSHOT |
6147 | trace_create_file("snapshot", 0644, d_tracer, | 6129 | trace_create_file("snapshot", 0644, d_tracer, |
6148 | (void *)&tr->trace_cpu, &snapshot_fops); | 6130 | tr, &snapshot_fops); |
6149 | #endif | 6131 | #endif |
6150 | 6132 | ||
6151 | for_each_tracing_cpu(cpu) | 6133 | for_each_tracing_cpu(cpu) |
@@ -6439,10 +6421,6 @@ __init static int tracer_alloc_buffers(void) | |||
6439 | 6421 | ||
6440 | global_trace.flags = TRACE_ARRAY_FL_GLOBAL; | 6422 | global_trace.flags = TRACE_ARRAY_FL_GLOBAL; |
6441 | 6423 | ||
6442 | /* Holder for file callbacks */ | ||
6443 | global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS; | ||
6444 | global_trace.trace_cpu.tr = &global_trace; | ||
6445 | |||
6446 | INIT_LIST_HEAD(&global_trace.systems); | 6424 | INIT_LIST_HEAD(&global_trace.systems); |
6447 | INIT_LIST_HEAD(&global_trace.events); | 6425 | INIT_LIST_HEAD(&global_trace.events); |
6448 | list_add(&global_trace.list, &ftrace_trace_arrays); | 6426 | list_add(&global_trace.list, &ftrace_trace_arrays); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4a4f6e1828b6..fe39acd4c1aa 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -130,19 +130,12 @@ enum trace_flag_type { | |||
130 | 130 | ||
131 | struct trace_array; | 131 | struct trace_array; |
132 | 132 | ||
133 | struct trace_cpu { | ||
134 | struct trace_array *tr; | ||
135 | struct dentry *dir; | ||
136 | int cpu; | ||
137 | }; | ||
138 | |||
139 | /* | 133 | /* |
140 | * The CPU trace array - it consists of thousands of trace entries | 134 | * The CPU trace array - it consists of thousands of trace entries |
141 | * plus some other descriptor data: (for example which task started | 135 | * plus some other descriptor data: (for example which task started |
142 | * the trace, etc.) | 136 | * the trace, etc.) |
143 | */ | 137 | */ |
144 | struct trace_array_cpu { | 138 | struct trace_array_cpu { |
145 | struct trace_cpu trace_cpu; | ||
146 | atomic_t disabled; | 139 | atomic_t disabled; |
147 | void *buffer_page; /* ring buffer spare */ | 140 | void *buffer_page; /* ring buffer spare */ |
148 | 141 | ||
@@ -196,7 +189,6 @@ struct trace_array { | |||
196 | bool allocated_snapshot; | 189 | bool allocated_snapshot; |
197 | #endif | 190 | #endif |
198 | int buffer_disabled; | 191 | int buffer_disabled; |
199 | struct trace_cpu trace_cpu; /* place holder */ | ||
200 | #ifdef CONFIG_FTRACE_SYSCALLS | 192 | #ifdef CONFIG_FTRACE_SYSCALLS |
201 | int sys_refcount_enter; | 193 | int sys_refcount_enter; |
202 | int sys_refcount_exit; | 194 | int sys_refcount_exit; |
@@ -214,7 +206,6 @@ struct trace_array { | |||
214 | struct dentry *event_dir; | 206 | struct dentry *event_dir; |
215 | struct list_head systems; | 207 | struct list_head systems; |
216 | struct list_head events; | 208 | struct list_head events; |
217 | struct task_struct *waiter; | ||
218 | int ref; | 209 | int ref; |
219 | }; | 210 | }; |
220 | 211 | ||
@@ -680,6 +671,15 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, | |||
680 | struct trace_array *tr); | 671 | struct trace_array *tr); |
681 | extern int trace_selftest_startup_branch(struct tracer *trace, | 672 | extern int trace_selftest_startup_branch(struct tracer *trace, |
682 | struct trace_array *tr); | 673 | struct trace_array *tr); |
674 | /* | ||
675 | * Tracer data references selftest functions that only occur | ||
676 | * on boot up. These can be __init functions. Thus, when selftests | ||
677 | * are enabled, then the tracers need to reference __init functions. | ||
678 | */ | ||
679 | #define __tracer_data __refdata | ||
680 | #else | ||
681 | /* Tracers are seldom changed. Optimize when selftests are disabled. */ | ||
682 | #define __tracer_data __read_mostly | ||
683 | #endif /* CONFIG_FTRACE_STARTUP_TEST */ | 683 | #endif /* CONFIG_FTRACE_STARTUP_TEST */ |
684 | 684 | ||
685 | extern void *head_page(struct trace_array_cpu *data); | 685 | extern void *head_page(struct trace_array_cpu *data); |
@@ -1022,6 +1022,9 @@ extern struct list_head ftrace_events; | |||
1022 | extern const char *__start___trace_bprintk_fmt[]; | 1022 | extern const char *__start___trace_bprintk_fmt[]; |
1023 | extern const char *__stop___trace_bprintk_fmt[]; | 1023 | extern const char *__stop___trace_bprintk_fmt[]; |
1024 | 1024 | ||
1025 | extern const char *__start___tracepoint_str[]; | ||
1026 | extern const char *__stop___tracepoint_str[]; | ||
1027 | |||
1025 | void trace_printk_init_buffers(void); | 1028 | void trace_printk_init_buffers(void); |
1026 | void trace_printk_start_comm(void); | 1029 | void trace_printk_start_comm(void); |
1027 | int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); | 1030 | int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); |
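Editor's note: the new __tracer_data annotation in trace.h resolves to __refdata when boot-time selftests are enabled, because the tracer struct may then legitimately reference __init selftest code, and to __read_mostly otherwise; the trace_functions.c hunk further down shows its first user. A purely illustrative tracer definition, with made-up names, to show the intended usage:

    /* Illustration only: a tracer annotated with the new helper macro. */
    static int example_trace_init(struct trace_array *tr);         /* hypothetical */
    static void example_trace_reset(struct trace_array *tr);       /* hypothetical */

    static struct tracer example_tracer __tracer_data = {
            .name   = "example",
            .init   = example_trace_init,
            .reset  = example_trace_reset,
    };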
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 84b1e045faba..80c36bcf66e8 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
@@ -236,6 +236,10 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, | |||
236 | 236 | ||
237 | BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); | 237 | BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); |
238 | 238 | ||
239 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, | ||
240 | "perf buffer not large enough")) | ||
241 | return NULL; | ||
242 | |||
239 | pc = preempt_count(); | 243 | pc = preempt_count(); |
240 | 244 | ||
241 | *rctxp = perf_swevent_get_recursion_context(); | 245 | *rctxp = perf_swevent_get_recursion_context(); |
@@ -266,6 +270,10 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, | |||
266 | struct pt_regs regs; | 270 | struct pt_regs regs; |
267 | int rctx; | 271 | int rctx; |
268 | 272 | ||
273 | head = this_cpu_ptr(event_function.perf_events); | ||
274 | if (hlist_empty(head)) | ||
275 | return; | ||
276 | |||
269 | #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ | 277 | #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ |
270 | sizeof(u64)) - sizeof(u32)) | 278 | sizeof(u64)) - sizeof(u32)) |
271 | 279 | ||
@@ -279,8 +287,6 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, | |||
279 | 287 | ||
280 | entry->ip = ip; | 288 | entry->ip = ip; |
281 | entry->parent_ip = parent_ip; | 289 | entry->parent_ip = parent_ip; |
282 | |||
283 | head = this_cpu_ptr(event_function.perf_events); | ||
284 | perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, | 290 | perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, |
285 | 1, ®s, head, NULL); | 291 | 1, ®s, head, NULL); |
286 | 292 | ||
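Editor's note: two guards are added on the perf side above: perf_trace_buf_prepare() now refuses records larger than PERF_MAX_TRACE_SIZE with a one-shot warning, and the function-call handler checks its per-CPU hlist before doing any buffer work. A condensed sketch of the reordered fast path; the function name is changed and the argument details are abbreviated from what the hunks show, so treat it as an outline rather than the literal body:

    /* Sketch of the reordered handler; the early hlist check now comes first. */
    static void example_function_call(unsigned long ip, unsigned long parent_ip)
    {
            struct ftrace_entry *entry;
            struct hlist_head *head;
            struct pt_regs regs;
            int rctx;

            head = this_cpu_ptr(event_function.perf_events);
            if (hlist_empty(head))          /* nobody listening on this CPU */
                    return;

            perf_fetch_caller_regs(&regs);
            entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
            if (!entry)                     /* NULL also covers the new size guard */
                    return;

            entry->ip = ip;
            entry->parent_ip = parent_ip;
            perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, 1, &regs, head, NULL);
    }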
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7d854290bf81..29a7ebcfb426 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -409,33 +409,42 @@ static void put_system(struct ftrace_subsystem_dir *dir) | |||
409 | mutex_unlock(&event_mutex); | 409 | mutex_unlock(&event_mutex); |
410 | } | 410 | } |
411 | 411 | ||
412 | /* | 412 | static void remove_subsystem(struct ftrace_subsystem_dir *dir) |
413 | * Open and update trace_array ref count. | ||
414 | * Must have the current trace_array passed to it. | ||
415 | */ | ||
416 | static int tracing_open_generic_file(struct inode *inode, struct file *filp) | ||
417 | { | 413 | { |
418 | struct ftrace_event_file *file = inode->i_private; | 414 | if (!dir) |
419 | struct trace_array *tr = file->tr; | 415 | return; |
420 | int ret; | ||
421 | 416 | ||
422 | if (trace_array_get(tr) < 0) | 417 | if (!--dir->nr_events) { |
423 | return -ENODEV; | 418 | debugfs_remove_recursive(dir->entry); |
419 | list_del(&dir->list); | ||
420 | __put_system_dir(dir); | ||
421 | } | ||
422 | } | ||
424 | 423 | ||
425 | ret = tracing_open_generic(inode, filp); | 424 | static void *event_file_data(struct file *filp) |
426 | if (ret < 0) | 425 | { |
427 | trace_array_put(tr); | 426 | return ACCESS_ONCE(file_inode(filp)->i_private); |
428 | return ret; | ||
429 | } | 427 | } |
430 | 428 | ||
431 | static int tracing_release_generic_file(struct inode *inode, struct file *filp) | 429 | static void remove_event_file_dir(struct ftrace_event_file *file) |
432 | { | 430 | { |
433 | struct ftrace_event_file *file = inode->i_private; | 431 | struct dentry *dir = file->dir; |
434 | struct trace_array *tr = file->tr; | 432 | struct dentry *child; |
435 | 433 | ||
436 | trace_array_put(tr); | 434 | if (dir) { |
435 | spin_lock(&dir->d_lock); /* probably unneeded */ | ||
436 | list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) { | ||
437 | if (child->d_inode) /* probably unneeded */ | ||
438 | child->d_inode->i_private = NULL; | ||
439 | } | ||
440 | spin_unlock(&dir->d_lock); | ||
437 | 441 | ||
438 | return 0; | 442 | debugfs_remove_recursive(dir); |
443 | } | ||
444 | |||
445 | list_del(&file->list); | ||
446 | remove_subsystem(file->system); | ||
447 | kmem_cache_free(file_cachep, file); | ||
439 | } | 448 | } |
440 | 449 | ||
441 | /* | 450 | /* |
@@ -679,15 +688,25 @@ static ssize_t | |||
679 | event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, | 688 | event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, |
680 | loff_t *ppos) | 689 | loff_t *ppos) |
681 | { | 690 | { |
682 | struct ftrace_event_file *file = filp->private_data; | 691 | struct ftrace_event_file *file; |
692 | unsigned long flags; | ||
683 | char buf[4] = "0"; | 693 | char buf[4] = "0"; |
684 | 694 | ||
685 | if (file->flags & FTRACE_EVENT_FL_ENABLED && | 695 | mutex_lock(&event_mutex); |
686 | !(file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) | 696 | file = event_file_data(filp); |
697 | if (likely(file)) | ||
698 | flags = file->flags; | ||
699 | mutex_unlock(&event_mutex); | ||
700 | |||
701 | if (!file) | ||
702 | return -ENODEV; | ||
703 | |||
704 | if (flags & FTRACE_EVENT_FL_ENABLED && | ||
705 | !(flags & FTRACE_EVENT_FL_SOFT_DISABLED)) | ||
687 | strcpy(buf, "1"); | 706 | strcpy(buf, "1"); |
688 | 707 | ||
689 | if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED || | 708 | if (flags & FTRACE_EVENT_FL_SOFT_DISABLED || |
690 | file->flags & FTRACE_EVENT_FL_SOFT_MODE) | 709 | flags & FTRACE_EVENT_FL_SOFT_MODE) |
691 | strcat(buf, "*"); | 710 | strcat(buf, "*"); |
692 | 711 | ||
693 | strcat(buf, "\n"); | 712 | strcat(buf, "\n"); |
@@ -699,13 +718,10 @@ static ssize_t | |||
699 | event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, | 718 | event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, |
700 | loff_t *ppos) | 719 | loff_t *ppos) |
701 | { | 720 | { |
702 | struct ftrace_event_file *file = filp->private_data; | 721 | struct ftrace_event_file *file; |
703 | unsigned long val; | 722 | unsigned long val; |
704 | int ret; | 723 | int ret; |
705 | 724 | ||
706 | if (!file) | ||
707 | return -EINVAL; | ||
708 | |||
709 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); | 725 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
710 | if (ret) | 726 | if (ret) |
711 | return ret; | 727 | return ret; |
@@ -717,8 +733,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
717 | switch (val) { | 733 | switch (val) { |
718 | case 0: | 734 | case 0: |
719 | case 1: | 735 | case 1: |
736 | ret = -ENODEV; | ||
720 | mutex_lock(&event_mutex); | 737 | mutex_lock(&event_mutex); |
721 | ret = ftrace_event_enable_disable(file, val); | 738 | file = event_file_data(filp); |
739 | if (likely(file)) | ||
740 | ret = ftrace_event_enable_disable(file, val); | ||
722 | mutex_unlock(&event_mutex); | 741 | mutex_unlock(&event_mutex); |
723 | break; | 742 | break; |
724 | 743 | ||
@@ -825,65 +844,39 @@ enum { | |||
825 | 844 | ||
826 | static void *f_next(struct seq_file *m, void *v, loff_t *pos) | 845 | static void *f_next(struct seq_file *m, void *v, loff_t *pos) |
827 | { | 846 | { |
828 | struct ftrace_event_call *call = m->private; | 847 | struct ftrace_event_call *call = event_file_data(m->private); |
829 | struct ftrace_event_field *field; | ||
830 | struct list_head *common_head = &ftrace_common_fields; | 848 | struct list_head *common_head = &ftrace_common_fields; |
831 | struct list_head *head = trace_get_fields(call); | 849 | struct list_head *head = trace_get_fields(call); |
850 | struct list_head *node = v; | ||
832 | 851 | ||
833 | (*pos)++; | 852 | (*pos)++; |
834 | 853 | ||
835 | switch ((unsigned long)v) { | 854 | switch ((unsigned long)v) { |
836 | case FORMAT_HEADER: | 855 | case FORMAT_HEADER: |
837 | if (unlikely(list_empty(common_head))) | 856 | node = common_head; |
838 | return NULL; | 857 | break; |
839 | |||
840 | field = list_entry(common_head->prev, | ||
841 | struct ftrace_event_field, link); | ||
842 | return field; | ||
843 | 858 | ||
844 | case FORMAT_FIELD_SEPERATOR: | 859 | case FORMAT_FIELD_SEPERATOR: |
845 | if (unlikely(list_empty(head))) | 860 | node = head; |
846 | return NULL; | 861 | break; |
847 | |||
848 | field = list_entry(head->prev, struct ftrace_event_field, link); | ||
849 | return field; | ||
850 | 862 | ||
851 | case FORMAT_PRINTFMT: | 863 | case FORMAT_PRINTFMT: |
852 | /* all done */ | 864 | /* all done */ |
853 | return NULL; | 865 | return NULL; |
854 | } | 866 | } |
855 | 867 | ||
856 | field = v; | 868 | node = node->prev; |
857 | if (field->link.prev == common_head) | 869 | if (node == common_head) |
858 | return (void *)FORMAT_FIELD_SEPERATOR; | 870 | return (void *)FORMAT_FIELD_SEPERATOR; |
859 | else if (field->link.prev == head) | 871 | else if (node == head) |
860 | return (void *)FORMAT_PRINTFMT; | 872 | return (void *)FORMAT_PRINTFMT; |
861 | 873 | else | |
862 | field = list_entry(field->link.prev, struct ftrace_event_field, link); | 874 | return node; |
863 | |||
864 | return field; | ||
865 | } | ||
866 | |||
867 | static void *f_start(struct seq_file *m, loff_t *pos) | ||
868 | { | ||
869 | loff_t l = 0; | ||
870 | void *p; | ||
871 | |||
872 | /* Start by showing the header */ | ||
873 | if (!*pos) | ||
874 | return (void *)FORMAT_HEADER; | ||
875 | |||
876 | p = (void *)FORMAT_HEADER; | ||
877 | do { | ||
878 | p = f_next(m, p, &l); | ||
879 | } while (p && l < *pos); | ||
880 | |||
881 | return p; | ||
882 | } | 875 | } |
883 | 876 | ||
884 | static int f_show(struct seq_file *m, void *v) | 877 | static int f_show(struct seq_file *m, void *v) |
885 | { | 878 | { |
886 | struct ftrace_event_call *call = m->private; | 879 | struct ftrace_event_call *call = event_file_data(m->private); |
887 | struct ftrace_event_field *field; | 880 | struct ftrace_event_field *field; |
888 | const char *array_descriptor; | 881 | const char *array_descriptor; |
889 | 882 | ||
@@ -904,8 +897,7 @@ static int f_show(struct seq_file *m, void *v) | |||
904 | return 0; | 897 | return 0; |
905 | } | 898 | } |
906 | 899 | ||
907 | field = v; | 900 | field = list_entry(v, struct ftrace_event_field, link); |
908 | |||
909 | /* | 901 | /* |
910 | * Smartly shows the array type(except dynamic array). | 902 | * Smartly shows the array type(except dynamic array). |
911 | * Normal: | 903 | * Normal: |
@@ -932,8 +924,25 @@ static int f_show(struct seq_file *m, void *v) | |||
932 | return 0; | 924 | return 0; |
933 | } | 925 | } |
934 | 926 | ||
927 | static void *f_start(struct seq_file *m, loff_t *pos) | ||
928 | { | ||
929 | void *p = (void *)FORMAT_HEADER; | ||
930 | loff_t l = 0; | ||
931 | |||
932 | /* ->stop() is called even if ->start() fails */ | ||
933 | mutex_lock(&event_mutex); | ||
934 | if (!event_file_data(m->private)) | ||
935 | return ERR_PTR(-ENODEV); | ||
936 | |||
937 | while (l < *pos && p) | ||
938 | p = f_next(m, p, &l); | ||
939 | |||
940 | return p; | ||
941 | } | ||
942 | |||
935 | static void f_stop(struct seq_file *m, void *p) | 943 | static void f_stop(struct seq_file *m, void *p) |
936 | { | 944 | { |
945 | mutex_unlock(&event_mutex); | ||
937 | } | 946 | } |
938 | 947 | ||
939 | static const struct seq_operations trace_format_seq_ops = { | 948 | static const struct seq_operations trace_format_seq_ops = { |
@@ -945,7 +954,6 @@ static const struct seq_operations trace_format_seq_ops = { | |||
945 | 954 | ||
946 | static int trace_format_open(struct inode *inode, struct file *file) | 955 | static int trace_format_open(struct inode *inode, struct file *file) |
947 | { | 956 | { |
948 | struct ftrace_event_call *call = inode->i_private; | ||
949 | struct seq_file *m; | 957 | struct seq_file *m; |
950 | int ret; | 958 | int ret; |
951 | 959 | ||
@@ -954,7 +962,7 @@ static int trace_format_open(struct inode *inode, struct file *file) | |||
954 | return ret; | 962 | return ret; |
955 | 963 | ||
956 | m = file->private_data; | 964 | m = file->private_data; |
957 | m->private = call; | 965 | m->private = file; |
958 | 966 | ||
959 | return 0; | 967 | return 0; |
960 | } | 968 | } |
@@ -962,45 +970,47 @@ static int trace_format_open(struct inode *inode, struct file *file) | |||
962 | static ssize_t | 970 | static ssize_t |
963 | event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) | 971 | event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) |
964 | { | 972 | { |
965 | struct ftrace_event_call *call = filp->private_data; | 973 | int id = (long)event_file_data(filp); |
966 | struct trace_seq *s; | 974 | char buf[32]; |
967 | int r; | 975 | int len; |
968 | 976 | ||
969 | if (*ppos) | 977 | if (*ppos) |
970 | return 0; | 978 | return 0; |
971 | 979 | ||
972 | s = kmalloc(sizeof(*s), GFP_KERNEL); | 980 | if (unlikely(!id)) |
973 | if (!s) | 981 | return -ENODEV; |
974 | return -ENOMEM; | ||
975 | 982 | ||
976 | trace_seq_init(s); | 983 | len = sprintf(buf, "%d\n", id); |
977 | trace_seq_printf(s, "%d\n", call->event.type); | ||
978 | 984 | ||
979 | r = simple_read_from_buffer(ubuf, cnt, ppos, | 985 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, len); |
980 | s->buffer, s->len); | ||
981 | kfree(s); | ||
982 | return r; | ||
983 | } | 986 | } |
984 | 987 | ||
985 | static ssize_t | 988 | static ssize_t |
986 | event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, | 989 | event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, |
987 | loff_t *ppos) | 990 | loff_t *ppos) |
988 | { | 991 | { |
989 | struct ftrace_event_call *call = filp->private_data; | 992 | struct ftrace_event_call *call; |
990 | struct trace_seq *s; | 993 | struct trace_seq *s; |
991 | int r; | 994 | int r = -ENODEV; |
992 | 995 | ||
993 | if (*ppos) | 996 | if (*ppos) |
994 | return 0; | 997 | return 0; |
995 | 998 | ||
996 | s = kmalloc(sizeof(*s), GFP_KERNEL); | 999 | s = kmalloc(sizeof(*s), GFP_KERNEL); |
1000 | |||
997 | if (!s) | 1001 | if (!s) |
998 | return -ENOMEM; | 1002 | return -ENOMEM; |
999 | 1003 | ||
1000 | trace_seq_init(s); | 1004 | trace_seq_init(s); |
1001 | 1005 | ||
1002 | print_event_filter(call, s); | 1006 | mutex_lock(&event_mutex); |
1003 | r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); | 1007 | call = event_file_data(filp); |
1008 | if (call) | ||
1009 | print_event_filter(call, s); | ||
1010 | mutex_unlock(&event_mutex); | ||
1011 | |||
1012 | if (call) | ||
1013 | r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); | ||
1004 | 1014 | ||
1005 | kfree(s); | 1015 | kfree(s); |
1006 | 1016 | ||
@@ -1011,9 +1021,9 @@ static ssize_t | |||
1011 | event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, | 1021 | event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, |
1012 | loff_t *ppos) | 1022 | loff_t *ppos) |
1013 | { | 1023 | { |
1014 | struct ftrace_event_call *call = filp->private_data; | 1024 | struct ftrace_event_call *call; |
1015 | char *buf; | 1025 | char *buf; |
1016 | int err; | 1026 | int err = -ENODEV; |
1017 | 1027 | ||
1018 | if (cnt >= PAGE_SIZE) | 1028 | if (cnt >= PAGE_SIZE) |
1019 | return -EINVAL; | 1029 | return -EINVAL; |
@@ -1028,7 +1038,12 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
1028 | } | 1038 | } |
1029 | buf[cnt] = '\0'; | 1039 | buf[cnt] = '\0'; |
1030 | 1040 | ||
1031 | err = apply_event_filter(call, buf); | 1041 | mutex_lock(&event_mutex); |
1042 | call = event_file_data(filp); | ||
1043 | if (call) | ||
1044 | err = apply_event_filter(call, buf); | ||
1045 | mutex_unlock(&event_mutex); | ||
1046 | |||
1032 | free_page((unsigned long) buf); | 1047 | free_page((unsigned long) buf); |
1033 | if (err < 0) | 1048 | if (err < 0) |
1034 | return err; | 1049 | return err; |
@@ -1218,6 +1233,7 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) | |||
1218 | 1233 | ||
1219 | static int ftrace_event_avail_open(struct inode *inode, struct file *file); | 1234 | static int ftrace_event_avail_open(struct inode *inode, struct file *file); |
1220 | static int ftrace_event_set_open(struct inode *inode, struct file *file); | 1235 | static int ftrace_event_set_open(struct inode *inode, struct file *file); |
1236 | static int ftrace_event_release(struct inode *inode, struct file *file); | ||
1221 | 1237 | ||
1222 | static const struct seq_operations show_event_seq_ops = { | 1238 | static const struct seq_operations show_event_seq_ops = { |
1223 | .start = t_start, | 1239 | .start = t_start, |
@@ -1245,14 +1261,13 @@ static const struct file_operations ftrace_set_event_fops = { | |||
1245 | .read = seq_read, | 1261 | .read = seq_read, |
1246 | .write = ftrace_event_write, | 1262 | .write = ftrace_event_write, |
1247 | .llseek = seq_lseek, | 1263 | .llseek = seq_lseek, |
1248 | .release = seq_release, | 1264 | .release = ftrace_event_release, |
1249 | }; | 1265 | }; |
1250 | 1266 | ||
1251 | static const struct file_operations ftrace_enable_fops = { | 1267 | static const struct file_operations ftrace_enable_fops = { |
1252 | .open = tracing_open_generic_file, | 1268 | .open = tracing_open_generic, |
1253 | .read = event_enable_read, | 1269 | .read = event_enable_read, |
1254 | .write = event_enable_write, | 1270 | .write = event_enable_write, |
1255 | .release = tracing_release_generic_file, | ||
1256 | .llseek = default_llseek, | 1271 | .llseek = default_llseek, |
1257 | }; | 1272 | }; |
1258 | 1273 | ||
@@ -1264,7 +1279,6 @@ static const struct file_operations ftrace_event_format_fops = { | |||
1264 | }; | 1279 | }; |
1265 | 1280 | ||
1266 | static const struct file_operations ftrace_event_id_fops = { | 1281 | static const struct file_operations ftrace_event_id_fops = { |
1267 | .open = tracing_open_generic, | ||
1268 | .read = event_id_read, | 1282 | .read = event_id_read, |
1269 | .llseek = default_llseek, | 1283 | .llseek = default_llseek, |
1270 | }; | 1284 | }; |
@@ -1323,6 +1337,15 @@ ftrace_event_open(struct inode *inode, struct file *file, | |||
1323 | return ret; | 1337 | return ret; |
1324 | } | 1338 | } |
1325 | 1339 | ||
1340 | static int ftrace_event_release(struct inode *inode, struct file *file) | ||
1341 | { | ||
1342 | struct trace_array *tr = inode->i_private; | ||
1343 | |||
1344 | trace_array_put(tr); | ||
1345 | |||
1346 | return seq_release(inode, file); | ||
1347 | } | ||
1348 | |||
1326 | static int | 1349 | static int |
1327 | ftrace_event_avail_open(struct inode *inode, struct file *file) | 1350 | ftrace_event_avail_open(struct inode *inode, struct file *file) |
1328 | { | 1351 | { |
@@ -1336,12 +1359,19 @@ ftrace_event_set_open(struct inode *inode, struct file *file) | |||
1336 | { | 1359 | { |
1337 | const struct seq_operations *seq_ops = &show_set_event_seq_ops; | 1360 | const struct seq_operations *seq_ops = &show_set_event_seq_ops; |
1338 | struct trace_array *tr = inode->i_private; | 1361 | struct trace_array *tr = inode->i_private; |
1362 | int ret; | ||
1363 | |||
1364 | if (trace_array_get(tr) < 0) | ||
1365 | return -ENODEV; | ||
1339 | 1366 | ||
1340 | if ((file->f_mode & FMODE_WRITE) && | 1367 | if ((file->f_mode & FMODE_WRITE) && |
1341 | (file->f_flags & O_TRUNC)) | 1368 | (file->f_flags & O_TRUNC)) |
1342 | ftrace_clear_events(tr); | 1369 | ftrace_clear_events(tr); |
1343 | 1370 | ||
1344 | return ftrace_event_open(inode, file, seq_ops); | 1371 | ret = ftrace_event_open(inode, file, seq_ops); |
1372 | if (ret < 0) | ||
1373 | trace_array_put(tr); | ||
1374 | return ret; | ||
1345 | } | 1375 | } |
1346 | 1376 | ||
1347 | static struct event_subsystem * | 1377 | static struct event_subsystem * |
@@ -1496,8 +1526,8 @@ event_create_dir(struct dentry *parent, | |||
1496 | 1526 | ||
1497 | #ifdef CONFIG_PERF_EVENTS | 1527 | #ifdef CONFIG_PERF_EVENTS |
1498 | if (call->event.type && call->class->reg) | 1528 | if (call->event.type && call->class->reg) |
1499 | trace_create_file("id", 0444, file->dir, call, | 1529 | trace_create_file("id", 0444, file->dir, |
1500 | id); | 1530 | (void *)(long)call->event.type, id); |
1501 | #endif | 1531 | #endif |
1502 | 1532 | ||
1503 | /* | 1533 | /* |
@@ -1522,33 +1552,16 @@ event_create_dir(struct dentry *parent, | |||
1522 | return 0; | 1552 | return 0; |
1523 | } | 1553 | } |
1524 | 1554 | ||
1525 | static void remove_subsystem(struct ftrace_subsystem_dir *dir) | ||
1526 | { | ||
1527 | if (!dir) | ||
1528 | return; | ||
1529 | |||
1530 | if (!--dir->nr_events) { | ||
1531 | debugfs_remove_recursive(dir->entry); | ||
1532 | list_del(&dir->list); | ||
1533 | __put_system_dir(dir); | ||
1534 | } | ||
1535 | } | ||
1536 | |||
1537 | static void remove_event_from_tracers(struct ftrace_event_call *call) | 1555 | static void remove_event_from_tracers(struct ftrace_event_call *call) |
1538 | { | 1556 | { |
1539 | struct ftrace_event_file *file; | 1557 | struct ftrace_event_file *file; |
1540 | struct trace_array *tr; | 1558 | struct trace_array *tr; |
1541 | 1559 | ||
1542 | do_for_each_event_file_safe(tr, file) { | 1560 | do_for_each_event_file_safe(tr, file) { |
1543 | |||
1544 | if (file->event_call != call) | 1561 | if (file->event_call != call) |
1545 | continue; | 1562 | continue; |
1546 | 1563 | ||
1547 | list_del(&file->list); | 1564 | remove_event_file_dir(file); |
1548 | debugfs_remove_recursive(file->dir); | ||
1549 | remove_subsystem(file->system); | ||
1550 | kmem_cache_free(file_cachep, file); | ||
1551 | |||
1552 | /* | 1565 | /* |
1553 | * The do_for_each_event_file_safe() is | 1566 | * The do_for_each_event_file_safe() is |
1554 | * a double loop. After finding the call for this | 1567 | * a double loop. After finding the call for this |
@@ -1700,16 +1713,53 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) | |||
1700 | destroy_preds(call); | 1713 | destroy_preds(call); |
1701 | } | 1714 | } |
1702 | 1715 | ||
1716 | static int probe_remove_event_call(struct ftrace_event_call *call) | ||
1717 | { | ||
1718 | struct trace_array *tr; | ||
1719 | struct ftrace_event_file *file; | ||
1720 | |||
1721 | #ifdef CONFIG_PERF_EVENTS | ||
1722 | if (call->perf_refcount) | ||
1723 | return -EBUSY; | ||
1724 | #endif | ||
1725 | do_for_each_event_file(tr, file) { | ||
1726 | if (file->event_call != call) | ||
1727 | continue; | ||
1728 | /* | ||
1729 | * We can't rely on ftrace_event_enable_disable(enable => 0) | ||
1730 | * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress | ||
1731 | * TRACE_REG_UNREGISTER. | ||
1732 | */ | ||
1733 | if (file->flags & FTRACE_EVENT_FL_ENABLED) | ||
1734 | return -EBUSY; | ||
1735 | /* | ||
1736 | * The do_for_each_event_file_safe() is | ||
1737 | * a double loop. After finding the call for this | ||
1738 | * trace_array, we use break to jump to the next | ||
1739 | * trace_array. | ||
1740 | */ | ||
1741 | break; | ||
1742 | } while_for_each_event_file(); | ||
1743 | |||
1744 | __trace_remove_event_call(call); | ||
1745 | |||
1746 | return 0; | ||
1747 | } | ||
1748 | |||
1703 | /* Remove an event_call */ | 1749 | /* Remove an event_call */ |
1704 | void trace_remove_event_call(struct ftrace_event_call *call) | 1750 | int trace_remove_event_call(struct ftrace_event_call *call) |
1705 | { | 1751 | { |
1752 | int ret; | ||
1753 | |||
1706 | mutex_lock(&trace_types_lock); | 1754 | mutex_lock(&trace_types_lock); |
1707 | mutex_lock(&event_mutex); | 1755 | mutex_lock(&event_mutex); |
1708 | down_write(&trace_event_sem); | 1756 | down_write(&trace_event_sem); |
1709 | __trace_remove_event_call(call); | 1757 | ret = probe_remove_event_call(call); |
1710 | up_write(&trace_event_sem); | 1758 | up_write(&trace_event_sem); |
1711 | mutex_unlock(&event_mutex); | 1759 | mutex_unlock(&event_mutex); |
1712 | mutex_unlock(&trace_types_lock); | 1760 | mutex_unlock(&trace_types_lock); |
1761 | |||
1762 | return ret; | ||
1713 | } | 1763 | } |
1714 | 1764 | ||
1715 | #define for_each_event(event, start, end) \ | 1765 | #define for_each_event(event, start, end) \ |
@@ -2278,12 +2328,8 @@ __trace_remove_event_dirs(struct trace_array *tr) | |||
2278 | { | 2328 | { |
2279 | struct ftrace_event_file *file, *next; | 2329 | struct ftrace_event_file *file, *next; |
2280 | 2330 | ||
2281 | list_for_each_entry_safe(file, next, &tr->events, list) { | 2331 | list_for_each_entry_safe(file, next, &tr->events, list) |
2282 | list_del(&file->list); | 2332 | remove_event_file_dir(file); |
2283 | debugfs_remove_recursive(file->dir); | ||
2284 | remove_subsystem(file->system); | ||
2285 | kmem_cache_free(file_cachep, file); | ||
2286 | } | ||
2287 | } | 2333 | } |
2288 | 2334 | ||
2289 | static void | 2335 | static void |
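Editor's note: the trace_events.c conversions above all follow one lifetime rule: remove_event_file_dir() clears ->i_private in every child dentry under event_mutex before removing the directory, so any fops handler that uses event_file_data() must re-read it under the same mutex and treat NULL as "the event was just removed". A minimal sketch of that reader side; the handler and its worker are hypothetical, while the locking mirrors event_enable_read() and event_filter_read() above:

    /* Hypothetical worker standing in for the real per-file operation. */
    static ssize_t do_something_with(struct ftrace_event_file *file,
                                     char __user *ubuf, size_t cnt, loff_t *ppos);

    /* Sketch of the reader side of the i_private protocol used above. */
    static ssize_t example_event_read(struct file *filp, char __user *ubuf,
                                      size_t cnt, loff_t *ppos)
    {
            struct ftrace_event_file *file;
            ssize_t ret = -ENODEV;

            mutex_lock(&event_mutex);
            file = event_file_data(filp);   /* ACCESS_ONCE(i_private) */
            if (file)                       /* NULL => removed concurrently */
                    ret = do_something_with(file, ubuf, cnt, ppos);
            mutex_unlock(&event_mutex);

            return ret;
    }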
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0d883dc057d6..97daa8cf958d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -637,17 +637,15 @@ static void append_filter_err(struct filter_parse_state *ps, | |||
637 | free_page((unsigned long) buf); | 637 | free_page((unsigned long) buf); |
638 | } | 638 | } |
639 | 639 | ||
640 | /* caller must hold event_mutex */ | ||
640 | void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) | 641 | void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) |
641 | { | 642 | { |
642 | struct event_filter *filter; | 643 | struct event_filter *filter = call->filter; |
643 | 644 | ||
644 | mutex_lock(&event_mutex); | ||
645 | filter = call->filter; | ||
646 | if (filter && filter->filter_string) | 645 | if (filter && filter->filter_string) |
647 | trace_seq_printf(s, "%s\n", filter->filter_string); | 646 | trace_seq_printf(s, "%s\n", filter->filter_string); |
648 | else | 647 | else |
649 | trace_seq_printf(s, "none\n"); | 648 | trace_seq_puts(s, "none\n"); |
650 | mutex_unlock(&event_mutex); | ||
651 | } | 649 | } |
652 | 650 | ||
653 | void print_subsystem_event_filter(struct event_subsystem *system, | 651 | void print_subsystem_event_filter(struct event_subsystem *system, |
@@ -660,7 +658,7 @@ void print_subsystem_event_filter(struct event_subsystem *system, | |||
660 | if (filter && filter->filter_string) | 658 | if (filter && filter->filter_string) |
661 | trace_seq_printf(s, "%s\n", filter->filter_string); | 659 | trace_seq_printf(s, "%s\n", filter->filter_string); |
662 | else | 660 | else |
663 | trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); | 661 | trace_seq_puts(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); |
664 | mutex_unlock(&event_mutex); | 662 | mutex_unlock(&event_mutex); |
665 | } | 663 | } |
666 | 664 | ||
@@ -1841,23 +1839,22 @@ static int create_system_filter(struct event_subsystem *system, | |||
1841 | return err; | 1839 | return err; |
1842 | } | 1840 | } |
1843 | 1841 | ||
1842 | /* caller must hold event_mutex */ | ||
1844 | int apply_event_filter(struct ftrace_event_call *call, char *filter_string) | 1843 | int apply_event_filter(struct ftrace_event_call *call, char *filter_string) |
1845 | { | 1844 | { |
1846 | struct event_filter *filter; | 1845 | struct event_filter *filter; |
1847 | int err = 0; | 1846 | int err; |
1848 | |||
1849 | mutex_lock(&event_mutex); | ||
1850 | 1847 | ||
1851 | if (!strcmp(strstrip(filter_string), "0")) { | 1848 | if (!strcmp(strstrip(filter_string), "0")) { |
1852 | filter_disable(call); | 1849 | filter_disable(call); |
1853 | filter = call->filter; | 1850 | filter = call->filter; |
1854 | if (!filter) | 1851 | if (!filter) |
1855 | goto out_unlock; | 1852 | return 0; |
1856 | RCU_INIT_POINTER(call->filter, NULL); | 1853 | RCU_INIT_POINTER(call->filter, NULL); |
1857 | /* Make sure the filter is not being used */ | 1854 | /* Make sure the filter is not being used */ |
1858 | synchronize_sched(); | 1855 | synchronize_sched(); |
1859 | __free_filter(filter); | 1856 | __free_filter(filter); |
1860 | goto out_unlock; | 1857 | return 0; |
1861 | } | 1858 | } |
1862 | 1859 | ||
1863 | err = create_filter(call, filter_string, true, &filter); | 1860 | err = create_filter(call, filter_string, true, &filter); |
@@ -1884,8 +1881,6 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) | |||
1884 | __free_filter(tmp); | 1881 | __free_filter(tmp); |
1885 | } | 1882 | } |
1886 | } | 1883 | } |
1887 | out_unlock: | ||
1888 | mutex_unlock(&event_mutex); | ||
1889 | 1884 | ||
1890 | return err; | 1885 | return err; |
1891 | } | 1886 | } |
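
The hunks above move the event_mutex locking out of print_event_filter() and apply_event_filter() and make it the caller's responsibility. A minimal sketch of that "caller must hold the lock" contract, with lockdep_assert_held() documenting and checking it; the state and function names below are hypothetical, not taken from the tracing code:

#include <linux/mutex.h>
#include <linux/lockdep.h>
#include <linux/trace_seq.h>

static DEFINE_MUTEX(event_mutex);
static char *current_filter;            /* hypothetical state guarded by event_mutex */

/* caller must hold event_mutex */
static void show_filter_locked(struct trace_seq *s)
{
        lockdep_assert_held(&event_mutex);      /* complains (with lockdep) if called unlocked */
        trace_seq_printf(s, "%s\n", current_filter ? current_filter : "none");
}

static void show_filter(struct trace_seq *s)
{
        mutex_lock(&event_mutex);
        show_filter_locked(s);
        mutex_unlock(&event_mutex);
}
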
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index b863f93b30f3..38fe1483c508 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
@@ -199,7 +199,7 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) | |||
199 | return 0; | 199 | return 0; |
200 | } | 200 | } |
201 | 201 | ||
202 | static struct tracer function_trace __read_mostly = | 202 | static struct tracer function_trace __tracer_data = |
203 | { | 203 | { |
204 | .name = "function", | 204 | .name = "function", |
205 | .init = function_trace_init, | 205 | .init = function_trace_init, |
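
Here the function tracer's struct tracer annotation switches from __read_mostly to __tracer_data. The real macro lives in kernel/trace/trace.h and its definition is not shown in this diff; the sketch below only illustrates the general idea of a config-dependent annotation macro, and both branches are assumptions, not the kernel's actual definition:

#include <linux/cache.h>

#ifdef CONFIG_FTRACE_STARTUP_TEST
# define __tracer_data                          /* assumed: selftests may touch tracers at boot */
#else
# define __tracer_data  __read_mostly           /* assumed: otherwise keep them read-mostly */
#endif

struct demo_tracer {
        const char *name;
};

static struct demo_tracer demo __tracer_data = {
        .name = "demo",
};
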
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8388bc99f2ee..b5c09242683d 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -446,7 +446,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid) | |||
446 | 446 | ||
447 | /* First spaces to align center */ | 447 | /* First spaces to align center */ |
448 | for (i = 0; i < spaces / 2; i++) { | 448 | for (i = 0; i < spaces / 2; i++) { |
449 | ret = trace_seq_printf(s, " "); | 449 | ret = trace_seq_putc(s, ' '); |
450 | if (!ret) | 450 | if (!ret) |
451 | return TRACE_TYPE_PARTIAL_LINE; | 451 | return TRACE_TYPE_PARTIAL_LINE; |
452 | } | 452 | } |
@@ -457,7 +457,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid) | |||
457 | 457 | ||
458 | /* Last spaces to align center */ | 458 | /* Last spaces to align center */ |
459 | for (i = 0; i < spaces - (spaces / 2); i++) { | 459 | for (i = 0; i < spaces - (spaces / 2); i++) { |
460 | ret = trace_seq_printf(s, " "); | 460 | ret = trace_seq_putc(s, ' '); |
461 | if (!ret) | 461 | if (!ret) |
462 | return TRACE_TYPE_PARTIAL_LINE; | 462 | return TRACE_TYPE_PARTIAL_LINE; |
463 | } | 463 | } |
@@ -503,7 +503,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) | |||
503 | ------------------------------------------ | 503 | ------------------------------------------ |
504 | 504 | ||
505 | */ | 505 | */ |
506 | ret = trace_seq_printf(s, | 506 | ret = trace_seq_puts(s, |
507 | " ------------------------------------------\n"); | 507 | " ------------------------------------------\n"); |
508 | if (!ret) | 508 | if (!ret) |
509 | return TRACE_TYPE_PARTIAL_LINE; | 509 | return TRACE_TYPE_PARTIAL_LINE; |
@@ -516,7 +516,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) | |||
516 | if (ret == TRACE_TYPE_PARTIAL_LINE) | 516 | if (ret == TRACE_TYPE_PARTIAL_LINE) |
517 | return TRACE_TYPE_PARTIAL_LINE; | 517 | return TRACE_TYPE_PARTIAL_LINE; |
518 | 518 | ||
519 | ret = trace_seq_printf(s, " => "); | 519 | ret = trace_seq_puts(s, " => "); |
520 | if (!ret) | 520 | if (!ret) |
521 | return TRACE_TYPE_PARTIAL_LINE; | 521 | return TRACE_TYPE_PARTIAL_LINE; |
522 | 522 | ||
@@ -524,7 +524,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) | |||
524 | if (ret == TRACE_TYPE_PARTIAL_LINE) | 524 | if (ret == TRACE_TYPE_PARTIAL_LINE) |
525 | return TRACE_TYPE_PARTIAL_LINE; | 525 | return TRACE_TYPE_PARTIAL_LINE; |
526 | 526 | ||
527 | ret = trace_seq_printf(s, | 527 | ret = trace_seq_puts(s, |
528 | "\n ------------------------------------------\n\n"); | 528 | "\n ------------------------------------------\n\n"); |
529 | if (!ret) | 529 | if (!ret) |
530 | return TRACE_TYPE_PARTIAL_LINE; | 530 | return TRACE_TYPE_PARTIAL_LINE; |
@@ -645,7 +645,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, | |||
645 | ret = print_graph_proc(s, pid); | 645 | ret = print_graph_proc(s, pid); |
646 | if (ret == TRACE_TYPE_PARTIAL_LINE) | 646 | if (ret == TRACE_TYPE_PARTIAL_LINE) |
647 | return TRACE_TYPE_PARTIAL_LINE; | 647 | return TRACE_TYPE_PARTIAL_LINE; |
648 | ret = trace_seq_printf(s, " | "); | 648 | ret = trace_seq_puts(s, " | "); |
649 | if (!ret) | 649 | if (!ret) |
650 | return TRACE_TYPE_PARTIAL_LINE; | 650 | return TRACE_TYPE_PARTIAL_LINE; |
651 | } | 651 | } |
@@ -657,9 +657,9 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, | |||
657 | return ret; | 657 | return ret; |
658 | 658 | ||
659 | if (type == TRACE_GRAPH_ENT) | 659 | if (type == TRACE_GRAPH_ENT) |
660 | ret = trace_seq_printf(s, "==========>"); | 660 | ret = trace_seq_puts(s, "==========>"); |
661 | else | 661 | else |
662 | ret = trace_seq_printf(s, "<=========="); | 662 | ret = trace_seq_puts(s, "<=========="); |
663 | 663 | ||
664 | if (!ret) | 664 | if (!ret) |
665 | return TRACE_TYPE_PARTIAL_LINE; | 665 | return TRACE_TYPE_PARTIAL_LINE; |
@@ -668,7 +668,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, | |||
668 | if (ret != TRACE_TYPE_HANDLED) | 668 | if (ret != TRACE_TYPE_HANDLED) |
669 | return ret; | 669 | return ret; |
670 | 670 | ||
671 | ret = trace_seq_printf(s, "\n"); | 671 | ret = trace_seq_putc(s, '\n'); |
672 | 672 | ||
673 | if (!ret) | 673 | if (!ret) |
674 | return TRACE_TYPE_PARTIAL_LINE; | 674 | return TRACE_TYPE_PARTIAL_LINE; |
@@ -705,13 +705,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) | |||
705 | len += strlen(nsecs_str); | 705 | len += strlen(nsecs_str); |
706 | } | 706 | } |
707 | 707 | ||
708 | ret = trace_seq_printf(s, " us "); | 708 | ret = trace_seq_puts(s, " us "); |
709 | if (!ret) | 709 | if (!ret) |
710 | return TRACE_TYPE_PARTIAL_LINE; | 710 | return TRACE_TYPE_PARTIAL_LINE; |
711 | 711 | ||
712 | /* Print remaining spaces to fit the row's width */ | 712 | /* Print remaining spaces to fit the row's width */ |
713 | for (i = len; i < 7; i++) { | 713 | for (i = len; i < 7; i++) { |
714 | ret = trace_seq_printf(s, " "); | 714 | ret = trace_seq_putc(s, ' '); |
715 | if (!ret) | 715 | if (!ret) |
716 | return TRACE_TYPE_PARTIAL_LINE; | 716 | return TRACE_TYPE_PARTIAL_LINE; |
717 | } | 717 | } |
@@ -731,13 +731,13 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, | |||
731 | /* No real data, just filling the column with spaces */ | 731 | /* No real data, just filling the column with spaces */ |
731 | /* No real data, just filling the column with spaces */ | 731 | /* No real data, just filling the column with spaces */ |
732 | switch (duration) { | 732 | switch (duration) { |
733 | case DURATION_FILL_FULL: | 733 | case DURATION_FILL_FULL: |
734 | ret = trace_seq_printf(s, " | "); | 734 | ret = trace_seq_puts(s, " | "); |
735 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | 735 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; |
736 | case DURATION_FILL_START: | 736 | case DURATION_FILL_START: |
737 | ret = trace_seq_printf(s, " "); | 737 | ret = trace_seq_puts(s, " "); |
738 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | 738 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; |
739 | case DURATION_FILL_END: | 739 | case DURATION_FILL_END: |
740 | ret = trace_seq_printf(s, " |"); | 740 | ret = trace_seq_puts(s, " |"); |
741 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | 741 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; |
742 | } | 742 | } |
743 | 743 | ||
@@ -745,10 +745,10 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, | |||
745 | if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { | 745 | if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { |
746 | /* Duration exceeded 100 msecs */ | 746 | /* Duration exceeded 100 msecs */ |
747 | if (duration > 100000ULL) | 747 | if (duration > 100000ULL) |
748 | ret = trace_seq_printf(s, "! "); | 748 | ret = trace_seq_puts(s, "! "); |
749 | /* Duration exceeded 10 msecs */ | 749 | /* Duration exceeded 10 msecs */ |
750 | else if (duration > 10000ULL) | 750 | else if (duration > 10000ULL) |
751 | ret = trace_seq_printf(s, "+ "); | 751 | ret = trace_seq_puts(s, "+ "); |
752 | } | 752 | } |
753 | 753 | ||
754 | /* | 754 | /* |
@@ -757,7 +757,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, | |||
757 | * to fill out the space. | 757 | * to fill out the space. |
758 | */ | 758 | */ |
759 | if (ret == -1) | 759 | if (ret == -1) |
760 | ret = trace_seq_printf(s, " "); | 760 | ret = trace_seq_puts(s, " "); |
761 | 761 | ||
762 | /* Catching here any failure happened above */ | 762 | /* Catching here any failure happened above */ |
763 | if (!ret) | 763 | if (!ret) |
@@ -767,7 +767,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, | |||
767 | if (ret != TRACE_TYPE_HANDLED) | 767 | if (ret != TRACE_TYPE_HANDLED) |
768 | return ret; | 768 | return ret; |
769 | 769 | ||
770 | ret = trace_seq_printf(s, "| "); | 770 | ret = trace_seq_puts(s, "| "); |
771 | if (!ret) | 771 | if (!ret) |
772 | return TRACE_TYPE_PARTIAL_LINE; | 772 | return TRACE_TYPE_PARTIAL_LINE; |
773 | 773 | ||
@@ -817,7 +817,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, | |||
817 | 817 | ||
818 | /* Function */ | 818 | /* Function */ |
819 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { | 819 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { |
820 | ret = trace_seq_printf(s, " "); | 820 | ret = trace_seq_putc(s, ' '); |
821 | if (!ret) | 821 | if (!ret) |
822 | return TRACE_TYPE_PARTIAL_LINE; | 822 | return TRACE_TYPE_PARTIAL_LINE; |
823 | } | 823 | } |
@@ -858,7 +858,7 @@ print_graph_entry_nested(struct trace_iterator *iter, | |||
858 | 858 | ||
859 | /* Function */ | 859 | /* Function */ |
860 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { | 860 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { |
861 | ret = trace_seq_printf(s, " "); | 861 | ret = trace_seq_putc(s, ' '); |
862 | if (!ret) | 862 | if (!ret) |
863 | return TRACE_TYPE_PARTIAL_LINE; | 863 | return TRACE_TYPE_PARTIAL_LINE; |
864 | } | 864 | } |
@@ -917,7 +917,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, | |||
917 | if (ret == TRACE_TYPE_PARTIAL_LINE) | 917 | if (ret == TRACE_TYPE_PARTIAL_LINE) |
918 | return TRACE_TYPE_PARTIAL_LINE; | 918 | return TRACE_TYPE_PARTIAL_LINE; |
919 | 919 | ||
920 | ret = trace_seq_printf(s, " | "); | 920 | ret = trace_seq_puts(s, " | "); |
921 | if (!ret) | 921 | if (!ret) |
922 | return TRACE_TYPE_PARTIAL_LINE; | 922 | return TRACE_TYPE_PARTIAL_LINE; |
923 | } | 923 | } |
@@ -1117,7 +1117,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
1117 | 1117 | ||
1118 | /* Closing brace */ | 1118 | /* Closing brace */ |
1119 | for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { | 1119 | for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { |
1120 | ret = trace_seq_printf(s, " "); | 1120 | ret = trace_seq_putc(s, ' '); |
1121 | if (!ret) | 1121 | if (!ret) |
1122 | return TRACE_TYPE_PARTIAL_LINE; | 1122 | return TRACE_TYPE_PARTIAL_LINE; |
1123 | } | 1123 | } |
@@ -1129,7 +1129,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
1129 | * belongs to, write out the function name. | 1129 | * belongs to, write out the function name. |
1130 | */ | 1130 | */ |
1131 | if (func_match) { | 1131 | if (func_match) { |
1132 | ret = trace_seq_printf(s, "}\n"); | 1132 | ret = trace_seq_puts(s, "}\n"); |
1133 | if (!ret) | 1133 | if (!ret) |
1134 | return TRACE_TYPE_PARTIAL_LINE; | 1134 | return TRACE_TYPE_PARTIAL_LINE; |
1135 | } else { | 1135 | } else { |
@@ -1179,13 +1179,13 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
1179 | /* Indentation */ | 1179 | /* Indentation */ |
1180 | if (depth > 0) | 1180 | if (depth > 0) |
1181 | for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { | 1181 | for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { |
1182 | ret = trace_seq_printf(s, " "); | 1182 | ret = trace_seq_putc(s, ' '); |
1183 | if (!ret) | 1183 | if (!ret) |
1184 | return TRACE_TYPE_PARTIAL_LINE; | 1184 | return TRACE_TYPE_PARTIAL_LINE; |
1185 | } | 1185 | } |
1186 | 1186 | ||
1187 | /* The comment */ | 1187 | /* The comment */ |
1188 | ret = trace_seq_printf(s, "/* "); | 1188 | ret = trace_seq_puts(s, "/* "); |
1189 | if (!ret) | 1189 | if (!ret) |
1190 | return TRACE_TYPE_PARTIAL_LINE; | 1190 | return TRACE_TYPE_PARTIAL_LINE; |
1191 | 1191 | ||
@@ -1216,7 +1216,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
1216 | s->len--; | 1216 | s->len--; |
1217 | } | 1217 | } |
1218 | 1218 | ||
1219 | ret = trace_seq_printf(s, " */\n"); | 1219 | ret = trace_seq_puts(s, " */\n"); |
1220 | if (!ret) | 1220 | if (!ret) |
1221 | return TRACE_TYPE_PARTIAL_LINE; | 1221 | return TRACE_TYPE_PARTIAL_LINE; |
1222 | 1222 | ||
@@ -1448,7 +1448,7 @@ static struct trace_event graph_trace_ret_event = { | |||
1448 | .funcs = &graph_functions | 1448 | .funcs = &graph_functions |
1449 | }; | 1449 | }; |
1450 | 1450 | ||
1451 | static struct tracer graph_trace __read_mostly = { | 1451 | static struct tracer graph_trace __tracer_data = { |
1452 | .name = "function_graph", | 1452 | .name = "function_graph", |
1453 | .open = graph_trace_open, | 1453 | .open = graph_trace_open, |
1454 | .pipe_open = graph_trace_open, | 1454 | .pipe_open = graph_trace_open, |
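
The graph-tracer hunks replace trace_seq_printf() with trace_seq_puts()/trace_seq_putc() wherever the "format" string contains no conversions, skipping the vsnprintf pass entirely. A short sketch of the pattern (in this era all three helpers return non-zero on success and 0 when the seq buffer is full; the function below is illustrative only):

#include <linux/trace_seq.h>

static int emit_row_prefix(struct trace_seq *s, int depth)
{
        int i;

        for (i = 0; i < depth; i++)
                if (!trace_seq_putc(s, ' '))            /* single literal character */
                        return 0;                       /* buffer full: partial line */

        if (!trace_seq_puts(s, "| "))                   /* literal string, no format parsing */
                return 0;

        return trace_seq_printf(s, "%d\n", depth);      /* printf only where formatting is needed */
}
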
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7ed6976493c8..243f6834d026 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -95,7 +95,7 @@ static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp) | |||
95 | } | 95 | } |
96 | 96 | ||
97 | static int register_probe_event(struct trace_probe *tp); | 97 | static int register_probe_event(struct trace_probe *tp); |
98 | static void unregister_probe_event(struct trace_probe *tp); | 98 | static int unregister_probe_event(struct trace_probe *tp); |
99 | 99 | ||
100 | static DEFINE_MUTEX(probe_lock); | 100 | static DEFINE_MUTEX(probe_lock); |
101 | static LIST_HEAD(probe_list); | 101 | static LIST_HEAD(probe_list); |
@@ -243,11 +243,11 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) | |||
243 | static int | 243 | static int |
244 | disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) | 244 | disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) |
245 | { | 245 | { |
246 | struct event_file_link *link = NULL; | ||
247 | int wait = 0; | ||
246 | int ret = 0; | 248 | int ret = 0; |
247 | 249 | ||
248 | if (file) { | 250 | if (file) { |
249 | struct event_file_link *link; | ||
250 | |||
251 | link = find_event_file_link(tp, file); | 251 | link = find_event_file_link(tp, file); |
252 | if (!link) { | 252 | if (!link) { |
253 | ret = -EINVAL; | 253 | ret = -EINVAL; |
@@ -255,10 +255,7 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) | |||
255 | } | 255 | } |
256 | 256 | ||
257 | list_del_rcu(&link->list); | 257 | list_del_rcu(&link->list); |
258 | /* synchronize with kprobe_trace_func/kretprobe_trace_func */ | 258 | wait = 1; |
259 | synchronize_sched(); | ||
260 | kfree(link); | ||
261 | |||
262 | if (!list_empty(&tp->files)) | 259 | if (!list_empty(&tp->files)) |
263 | goto out; | 260 | goto out; |
264 | 261 | ||
@@ -271,8 +268,22 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) | |||
271 | disable_kretprobe(&tp->rp); | 268 | disable_kretprobe(&tp->rp); |
272 | else | 269 | else |
273 | disable_kprobe(&tp->rp.kp); | 270 | disable_kprobe(&tp->rp.kp); |
271 | wait = 1; | ||
274 | } | 272 | } |
275 | out: | 273 | out: |
274 | if (wait) { | ||
275 | /* | ||
276 | * Synchronize with kprobe_trace_func/kretprobe_trace_func | ||
277 | * to ensure the probe is disabled (all running handlers have finished). | ||
278 | * This is not only for kfree(); the caller, | ||
279 | * trace_remove_event_call(), also relies on it when releasing | ||
280 | * event_call related objects, which will be accessed in | ||
281 | * the kprobe_trace_func/kretprobe_trace_func. | ||
282 | */ | ||
283 | synchronize_sched(); | ||
284 | kfree(link); /* Ignored if link == NULL */ | ||
285 | } | ||
286 | |||
276 | return ret; | 287 | return ret; |
277 | } | 288 | } |
278 | 289 | ||
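
The disable_trace_probe() hunk unlinks the event_file_link with list_del_rcu() (and disables the kprobe when the list goes empty), then performs a single synchronize_sched() before kfree(), so the grace period is waited for exactly once on the common exit path. A minimal sketch of the underlying unlink-then-wait-then-free pattern, with a hypothetical element type:

#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/slab.h>

struct file_link {                      /* hypothetical element traversed under RCU */
        struct list_head list;
        void *file;
};

static void remove_link(struct file_link *link)
{
        list_del_rcu(&link->list);      /* existing readers may still see the element */
        synchronize_sched();            /* wait for all preempt-disabled readers to finish */
        kfree(link);                    /* now no handler can still reference it */
}
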
@@ -340,9 +351,12 @@ static int unregister_trace_probe(struct trace_probe *tp) | |||
340 | if (trace_probe_is_enabled(tp)) | 351 | if (trace_probe_is_enabled(tp)) |
341 | return -EBUSY; | 352 | return -EBUSY; |
342 | 353 | ||
354 | /* Will fail if probe is being used by ftrace or perf */ | ||
355 | if (unregister_probe_event(tp)) | ||
356 | return -EBUSY; | ||
357 | |||
343 | __unregister_trace_probe(tp); | 358 | __unregister_trace_probe(tp); |
344 | list_del(&tp->list); | 359 | list_del(&tp->list); |
345 | unregister_probe_event(tp); | ||
346 | 360 | ||
347 | return 0; | 361 | return 0; |
348 | } | 362 | } |
@@ -621,7 +635,9 @@ static int release_all_trace_probes(void) | |||
621 | /* TODO: Use batch unregistration */ | 635 | /* TODO: Use batch unregistration */ |
622 | while (!list_empty(&probe_list)) { | 636 | while (!list_empty(&probe_list)) { |
623 | tp = list_entry(probe_list.next, struct trace_probe, list); | 637 | tp = list_entry(probe_list.next, struct trace_probe, list); |
624 | unregister_trace_probe(tp); | 638 | ret = unregister_trace_probe(tp); |
639 | if (ret) | ||
640 | goto end; | ||
625 | free_trace_probe(tp); | 641 | free_trace_probe(tp); |
626 | } | 642 | } |
627 | 643 | ||
@@ -1087,9 +1103,6 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) | |||
1087 | __size = sizeof(*entry) + tp->size + dsize; | 1103 | __size = sizeof(*entry) + tp->size + dsize; |
1088 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1104 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
1089 | size -= sizeof(u32); | 1105 | size -= sizeof(u32); |
1090 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, | ||
1091 | "profile buffer not large enough")) | ||
1092 | return; | ||
1093 | 1106 | ||
1094 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); | 1107 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); |
1095 | if (!entry) | 1108 | if (!entry) |
@@ -1120,9 +1133,6 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, | |||
1120 | __size = sizeof(*entry) + tp->size + dsize; | 1133 | __size = sizeof(*entry) + tp->size + dsize; |
1121 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1134 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
1122 | size -= sizeof(u32); | 1135 | size -= sizeof(u32); |
1123 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, | ||
1124 | "profile buffer not large enough")) | ||
1125 | return; | ||
1126 | 1136 | ||
1127 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); | 1137 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); |
1128 | if (!entry) | 1138 | if (!entry) |
@@ -1242,11 +1252,15 @@ static int register_probe_event(struct trace_probe *tp) | |||
1242 | return ret; | 1252 | return ret; |
1243 | } | 1253 | } |
1244 | 1254 | ||
1245 | static void unregister_probe_event(struct trace_probe *tp) | 1255 | static int unregister_probe_event(struct trace_probe *tp) |
1246 | { | 1256 | { |
1257 | int ret; | ||
1258 | |||
1247 | /* tp->event is unregistered in trace_remove_event_call() */ | 1259 | /* tp->event is unregistered in trace_remove_event_call() */ |
1248 | trace_remove_event_call(&tp->call); | 1260 | ret = trace_remove_event_call(&tp->call); |
1249 | kfree(tp->call.print_fmt); | 1261 | if (!ret) |
1262 | kfree(tp->call.print_fmt); | ||
1263 | return ret; | ||
1250 | } | 1264 | } |
1251 | 1265 | ||
1252 | /* Make a debugfs interface for controlling probe points */ | 1266 | /* Make a debugfs interface for controlling probe points */ |
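
With this change unregister_probe_event() propagates the return value of trace_remove_event_call(), which now refuses to drop an event that ftrace or perf still has open, and the callers back out with -EBUSY instead of freeing a live event. A hedged restatement of that flow, mirroring the identifiers used in the hunk above (not a standalone translation unit):

/* Only tear the probe down if the event layer agrees to drop the call. */
static int remove_probe_event(struct trace_probe *tp)
{
        int ret;

        ret = trace_remove_event_call(&tp->call);       /* fails while ftrace/perf use it */
        if (ret)
                return ret;                             /* leave the probe registered */

        kfree(tp->call.print_fmt);                      /* safe: no users remain */
        return 0;
}
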
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index a5e8f4878bfa..b3dcfb2f0fef 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c | |||
@@ -90,7 +90,7 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) | |||
90 | if (drv) | 90 | if (drv) |
91 | ret += trace_seq_printf(s, " %s\n", drv->name); | 91 | ret += trace_seq_printf(s, " %s\n", drv->name); |
92 | else | 92 | else |
93 | ret += trace_seq_printf(s, " \n"); | 93 | ret += trace_seq_puts(s, " \n"); |
94 | return ret; | 94 | return ret; |
95 | } | 95 | } |
96 | 96 | ||
@@ -107,7 +107,7 @@ static void mmio_pipe_open(struct trace_iterator *iter) | |||
107 | struct header_iter *hiter; | 107 | struct header_iter *hiter; |
108 | struct trace_seq *s = &iter->seq; | 108 | struct trace_seq *s = &iter->seq; |
109 | 109 | ||
110 | trace_seq_printf(s, "VERSION 20070824\n"); | 110 | trace_seq_puts(s, "VERSION 20070824\n"); |
111 | 111 | ||
112 | hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); | 112 | hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); |
113 | if (!hiter) | 113 | if (!hiter) |
@@ -209,7 +209,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) | |||
209 | (rw->value >> 0) & 0xff, rw->pc, 0); | 209 | (rw->value >> 0) & 0xff, rw->pc, 0); |
210 | break; | 210 | break; |
211 | default: | 211 | default: |
212 | ret = trace_seq_printf(s, "rw what?\n"); | 212 | ret = trace_seq_puts(s, "rw what?\n"); |
213 | break; | 213 | break; |
214 | } | 214 | } |
215 | if (ret) | 215 | if (ret) |
@@ -245,7 +245,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) | |||
245 | secs, usec_rem, m->map_id, 0UL, 0); | 245 | secs, usec_rem, m->map_id, 0UL, 0); |
246 | break; | 246 | break; |
247 | default: | 247 | default: |
248 | ret = trace_seq_printf(s, "map what?\n"); | 248 | ret = trace_seq_puts(s, "map what?\n"); |
249 | break; | 249 | break; |
250 | } | 250 | } |
251 | if (ret) | 251 | if (ret) |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index bb922d9ee51b..34e7cbac0c9c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -78,7 +78,7 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) | |||
78 | 78 | ||
79 | trace_assign_type(field, entry); | 79 | trace_assign_type(field, entry); |
80 | 80 | ||
81 | ret = trace_seq_printf(s, "%s", field->buf); | 81 | ret = trace_seq_puts(s, field->buf); |
82 | if (!ret) | 82 | if (!ret) |
83 | return TRACE_TYPE_PARTIAL_LINE; | 83 | return TRACE_TYPE_PARTIAL_LINE; |
84 | 84 | ||
@@ -558,14 +558,14 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, | |||
558 | if (ret) | 558 | if (ret) |
559 | ret = trace_seq_puts(s, "??"); | 559 | ret = trace_seq_puts(s, "??"); |
560 | if (ret) | 560 | if (ret) |
561 | ret = trace_seq_puts(s, "\n"); | 561 | ret = trace_seq_putc(s, '\n'); |
562 | continue; | 562 | continue; |
563 | } | 563 | } |
564 | if (!ret) | 564 | if (!ret) |
565 | break; | 565 | break; |
566 | if (ret) | 566 | if (ret) |
567 | ret = seq_print_user_ip(s, mm, ip, sym_flags); | 567 | ret = seq_print_user_ip(s, mm, ip, sym_flags); |
568 | ret = trace_seq_puts(s, "\n"); | 568 | ret = trace_seq_putc(s, '\n'); |
569 | } | 569 | } |
570 | 570 | ||
571 | if (mm) | 571 | if (mm) |
@@ -579,7 +579,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) | |||
579 | int ret; | 579 | int ret; |
580 | 580 | ||
581 | if (!ip) | 581 | if (!ip) |
582 | return trace_seq_printf(s, "0"); | 582 | return trace_seq_putc(s, '0'); |
583 | 583 | ||
584 | if (sym_flags & TRACE_ITER_SYM_OFFSET) | 584 | if (sym_flags & TRACE_ITER_SYM_OFFSET) |
585 | ret = seq_print_sym_offset(s, "%s", ip); | 585 | ret = seq_print_sym_offset(s, "%s", ip); |
@@ -964,14 +964,14 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, | |||
964 | goto partial; | 964 | goto partial; |
965 | 965 | ||
966 | if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { | 966 | if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { |
967 | if (!trace_seq_printf(s, " <-")) | 967 | if (!trace_seq_puts(s, " <-")) |
968 | goto partial; | 968 | goto partial; |
969 | if (!seq_print_ip_sym(s, | 969 | if (!seq_print_ip_sym(s, |
970 | field->parent_ip, | 970 | field->parent_ip, |
971 | flags)) | 971 | flags)) |
972 | goto partial; | 972 | goto partial; |
973 | } | 973 | } |
974 | if (!trace_seq_printf(s, "\n")) | 974 | if (!trace_seq_putc(s, '\n')) |
975 | goto partial; | 975 | goto partial; |
976 | 976 | ||
977 | return TRACE_TYPE_HANDLED; | 977 | return TRACE_TYPE_HANDLED; |
@@ -1210,7 +1210,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, | |||
1210 | 1210 | ||
1211 | if (!seq_print_ip_sym(s, *p, flags)) | 1211 | if (!seq_print_ip_sym(s, *p, flags)) |
1212 | goto partial; | 1212 | goto partial; |
1213 | if (!trace_seq_puts(s, "\n")) | 1213 | if (!trace_seq_putc(s, '\n')) |
1214 | goto partial; | 1214 | goto partial; |
1215 | } | 1215 | } |
1216 | 1216 | ||
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index a9077c1b4ad3..2900817ba65c 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
@@ -244,12 +244,31 @@ static const char **find_next(void *v, loff_t *pos) | |||
244 | { | 244 | { |
245 | const char **fmt = v; | 245 | const char **fmt = v; |
246 | int start_index; | 246 | int start_index; |
247 | int last_index; | ||
247 | 248 | ||
248 | start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; | 249 | start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; |
249 | 250 | ||
250 | if (*pos < start_index) | 251 | if (*pos < start_index) |
251 | return __start___trace_bprintk_fmt + *pos; | 252 | return __start___trace_bprintk_fmt + *pos; |
252 | 253 | ||
254 | /* | ||
255 | * The __tracepoint_str section is treated the same as the | ||
256 | * __trace_printk_fmt section. The difference is that the | ||
257 | * __trace_printk_fmt section should only be used by trace_printk() | ||
258 | * in a debugging environment, as if anything exists in that section | ||
259 | * the trace_printk() helper buffers are allocated, which would just | ||
260 | * waste space in a production environment. | ||
261 | * | ||
262 | * The __tracepoint_str sections on the other hand are used by | ||
263 | * tracepoints which need to map pointers to their strings to | ||
264 | * the ASCII text for userspace. | ||
265 | */ | ||
266 | last_index = start_index; | ||
267 | start_index = __stop___tracepoint_str - __start___tracepoint_str; | ||
268 | |||
269 | if (*pos < last_index + start_index) | ||
270 | return __start___tracepoint_str + (*pos - last_index); | ||
271 | |||
253 | return find_next_mod_format(start_index, v, fmt, pos); | 272 | return find_next_mod_format(start_index, v, fmt, pos); |
254 | } | 273 | } |
255 | 274 | ||
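
find_next() now folds the __tracepoint_str section into the same flat seq_file position space as __trace_bprintk_fmt before falling through to module formats. The index arithmetic reduces to mapping one position across two back-to-back arrays; a small standalone C sketch (array names are placeholders):

#include <stddef.h>

/* Map a flat position onto two consecutive sections of format strings. */
static const char *pick_format(const char **a, size_t a_len,
                               const char **b, size_t b_len, size_t pos)
{
        if (pos < a_len)
                return a[pos];          /* still inside the first section */
        pos -= a_len;
        if (pos < b_len)
                return b[pos];          /* offset into the second section */
        return NULL;                    /* past both: fall through to modules */
}
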
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 322e16461072..8fd03657bc7d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -175,7 +175,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags, | |||
175 | entry = syscall_nr_to_meta(syscall); | 175 | entry = syscall_nr_to_meta(syscall); |
176 | 176 | ||
177 | if (!entry) { | 177 | if (!entry) { |
178 | trace_seq_printf(s, "\n"); | 178 | trace_seq_putc(s, '\n'); |
179 | return TRACE_TYPE_HANDLED; | 179 | return TRACE_TYPE_HANDLED; |
180 | } | 180 | } |
181 | 181 | ||
@@ -566,15 +566,15 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
566 | if (!sys_data) | 566 | if (!sys_data) |
567 | return; | 567 | return; |
568 | 568 | ||
569 | head = this_cpu_ptr(sys_data->enter_event->perf_events); | ||
570 | if (hlist_empty(head)) | ||
571 | return; | ||
572 | |||
569 | /* get the size after alignment with the u32 buffer size field */ | 573 | /* get the size after alignment with the u32 buffer size field */ |
570 | size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); | 574 | size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); |
571 | size = ALIGN(size + sizeof(u32), sizeof(u64)); | 575 | size = ALIGN(size + sizeof(u32), sizeof(u64)); |
572 | size -= sizeof(u32); | 576 | size -= sizeof(u32); |
573 | 577 | ||
574 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, | ||
575 | "perf buffer not large enough")) | ||
576 | return; | ||
577 | |||
578 | rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, | 578 | rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, |
579 | sys_data->enter_event->event.type, regs, &rctx); | 579 | sys_data->enter_event->event.type, regs, &rctx); |
580 | if (!rec) | 580 | if (!rec) |
@@ -583,8 +583,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
583 | rec->nr = syscall_nr; | 583 | rec->nr = syscall_nr; |
584 | syscall_get_arguments(current, regs, 0, sys_data->nb_args, | 584 | syscall_get_arguments(current, regs, 0, sys_data->nb_args, |
585 | (unsigned long *)&rec->args); | 585 | (unsigned long *)&rec->args); |
586 | |||
587 | head = this_cpu_ptr(sys_data->enter_event->perf_events); | ||
588 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); | 586 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); |
589 | } | 587 | } |
590 | 588 | ||
@@ -642,18 +640,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
642 | if (!sys_data) | 640 | if (!sys_data) |
643 | return; | 641 | return; |
644 | 642 | ||
643 | head = this_cpu_ptr(sys_data->exit_event->perf_events); | ||
644 | if (hlist_empty(head)) | ||
645 | return; | ||
646 | |||
645 | /* We can probably do that at build time */ | 647 | /* We can probably do that at build time */ |
646 | size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); | 648 | size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); |
647 | size -= sizeof(u32); | 649 | size -= sizeof(u32); |
648 | 650 | ||
649 | /* | ||
650 | * Impossible, but be paranoid with the future | ||
651 | * How to put this check outside runtime? | ||
652 | */ | ||
653 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, | ||
654 | "exit event has grown above perf buffer size")) | ||
655 | return; | ||
656 | |||
657 | rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, | 651 | rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, |
658 | sys_data->exit_event->event.type, regs, &rctx); | 652 | sys_data->exit_event->event.type, regs, &rctx); |
659 | if (!rec) | 653 | if (!rec) |
@@ -661,8 +655,6 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
661 | 655 | ||
662 | rec->nr = syscall_nr; | 656 | rec->nr = syscall_nr; |
663 | rec->ret = syscall_get_return_value(current, regs); | 657 | rec->ret = syscall_get_return_value(current, regs); |
664 | |||
665 | head = this_cpu_ptr(sys_data->exit_event->perf_events); | ||
666 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); | 658 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); |
667 | } | 659 | } |
668 | 660 | ||
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d5d0cd368a56..272261b5f94f 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -70,7 +70,7 @@ struct trace_uprobe { | |||
70 | (sizeof(struct probe_arg) * (n))) | 70 | (sizeof(struct probe_arg) * (n))) |
71 | 71 | ||
72 | static int register_uprobe_event(struct trace_uprobe *tu); | 72 | static int register_uprobe_event(struct trace_uprobe *tu); |
73 | static void unregister_uprobe_event(struct trace_uprobe *tu); | 73 | static int unregister_uprobe_event(struct trace_uprobe *tu); |
74 | 74 | ||
75 | static DEFINE_MUTEX(uprobe_lock); | 75 | static DEFINE_MUTEX(uprobe_lock); |
76 | static LIST_HEAD(uprobe_list); | 76 | static LIST_HEAD(uprobe_list); |
@@ -164,11 +164,17 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou | |||
164 | } | 164 | } |
165 | 165 | ||
166 | /* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */ | 166 | /* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */ |
167 | static void unregister_trace_uprobe(struct trace_uprobe *tu) | 167 | static int unregister_trace_uprobe(struct trace_uprobe *tu) |
168 | { | 168 | { |
169 | int ret; | ||
170 | |||
171 | ret = unregister_uprobe_event(tu); | ||
172 | if (ret) | ||
173 | return ret; | ||
174 | |||
169 | list_del(&tu->list); | 175 | list_del(&tu->list); |
170 | unregister_uprobe_event(tu); | ||
171 | free_trace_uprobe(tu); | 176 | free_trace_uprobe(tu); |
177 | return 0; | ||
172 | } | 178 | } |
173 | 179 | ||
174 | /* Register a trace_uprobe and probe_event */ | 180 | /* Register a trace_uprobe and probe_event */ |
@@ -181,9 +187,12 @@ static int register_trace_uprobe(struct trace_uprobe *tu) | |||
181 | 187 | ||
182 | /* register as an event */ | 188 | /* register as an event */ |
183 | old_tp = find_probe_event(tu->call.name, tu->call.class->system); | 189 | old_tp = find_probe_event(tu->call.name, tu->call.class->system); |
184 | if (old_tp) | 190 | if (old_tp) { |
185 | /* delete old event */ | 191 | /* delete old event */ |
186 | unregister_trace_uprobe(old_tp); | 192 | ret = unregister_trace_uprobe(old_tp); |
193 | if (ret) | ||
194 | goto end; | ||
195 | } | ||
187 | 196 | ||
188 | ret = register_uprobe_event(tu); | 197 | ret = register_uprobe_event(tu); |
189 | if (ret) { | 198 | if (ret) { |
@@ -256,6 +265,8 @@ static int create_trace_uprobe(int argc, char **argv) | |||
256 | group = UPROBE_EVENT_SYSTEM; | 265 | group = UPROBE_EVENT_SYSTEM; |
257 | 266 | ||
258 | if (is_delete) { | 267 | if (is_delete) { |
268 | int ret; | ||
269 | |||
259 | if (!event) { | 270 | if (!event) { |
260 | pr_info("Delete command needs an event name.\n"); | 271 | pr_info("Delete command needs an event name.\n"); |
261 | return -EINVAL; | 272 | return -EINVAL; |
@@ -269,9 +280,9 @@ static int create_trace_uprobe(int argc, char **argv) | |||
269 | return -ENOENT; | 280 | return -ENOENT; |
270 | } | 281 | } |
271 | /* delete an event */ | 282 | /* delete an event */ |
272 | unregister_trace_uprobe(tu); | 283 | ret = unregister_trace_uprobe(tu); |
273 | mutex_unlock(&uprobe_lock); | 284 | mutex_unlock(&uprobe_lock); |
274 | return 0; | 285 | return ret; |
275 | } | 286 | } |
276 | 287 | ||
277 | if (argc < 2) { | 288 | if (argc < 2) { |
@@ -408,16 +419,20 @@ fail_address_parse: | |||
408 | return ret; | 419 | return ret; |
409 | } | 420 | } |
410 | 421 | ||
411 | static void cleanup_all_probes(void) | 422 | static int cleanup_all_probes(void) |
412 | { | 423 | { |
413 | struct trace_uprobe *tu; | 424 | struct trace_uprobe *tu; |
425 | int ret = 0; | ||
414 | 426 | ||
415 | mutex_lock(&uprobe_lock); | 427 | mutex_lock(&uprobe_lock); |
416 | while (!list_empty(&uprobe_list)) { | 428 | while (!list_empty(&uprobe_list)) { |
417 | tu = list_entry(uprobe_list.next, struct trace_uprobe, list); | 429 | tu = list_entry(uprobe_list.next, struct trace_uprobe, list); |
418 | unregister_trace_uprobe(tu); | 430 | ret = unregister_trace_uprobe(tu); |
431 | if (ret) | ||
432 | break; | ||
419 | } | 433 | } |
420 | mutex_unlock(&uprobe_lock); | 434 | mutex_unlock(&uprobe_lock); |
435 | return ret; | ||
421 | } | 436 | } |
422 | 437 | ||
423 | /* Probes listing interfaces */ | 438 | /* Probes listing interfaces */ |
@@ -462,8 +477,13 @@ static const struct seq_operations probes_seq_op = { | |||
462 | 477 | ||
463 | static int probes_open(struct inode *inode, struct file *file) | 478 | static int probes_open(struct inode *inode, struct file *file) |
464 | { | 479 | { |
465 | if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) | 480 | int ret; |
466 | cleanup_all_probes(); | 481 | |
482 | if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { | ||
483 | ret = cleanup_all_probes(); | ||
484 | if (ret) | ||
485 | return ret; | ||
486 | } | ||
467 | 487 | ||
468 | return seq_open(file, &probes_seq_op); | 488 | return seq_open(file, &probes_seq_op); |
469 | } | 489 | } |
@@ -818,8 +838,6 @@ static void uprobe_perf_print(struct trace_uprobe *tu, | |||
818 | 838 | ||
819 | size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); | 839 | size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); |
820 | size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); | 840 | size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); |
821 | if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) | ||
822 | return; | ||
823 | 841 | ||
824 | preempt_disable(); | 842 | preempt_disable(); |
825 | head = this_cpu_ptr(call->perf_events); | 843 | head = this_cpu_ptr(call->perf_events); |
@@ -970,12 +988,17 @@ static int register_uprobe_event(struct trace_uprobe *tu) | |||
970 | return ret; | 988 | return ret; |
971 | } | 989 | } |
972 | 990 | ||
973 | static void unregister_uprobe_event(struct trace_uprobe *tu) | 991 | static int unregister_uprobe_event(struct trace_uprobe *tu) |
974 | { | 992 | { |
993 | int ret; | ||
994 | |||
975 | /* tu->event is unregistered in trace_remove_event_call() */ | 995 | /* tu->event is unregistered in trace_remove_event_call() */ |
976 | trace_remove_event_call(&tu->call); | 996 | ret = trace_remove_event_call(&tu->call); |
997 | if (ret) | ||
998 | return ret; | ||
977 | kfree(tu->call.print_fmt); | 999 | kfree(tu->call.print_fmt); |
978 | tu->call.print_fmt = NULL; | 1000 | tu->call.print_fmt = NULL; |
1001 | return 0; | ||
979 | } | 1002 | } |
980 | 1003 | ||
981 | /* Make a trace interface for controlling probe points */ | 1004 | /* Make a trace interface for controlling probe points */ |
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index d8c30db06c5b..9064b919a406 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -62,6 +62,9 @@ int create_user_ns(struct cred *new) | |||
62 | kgid_t group = new->egid; | 62 | kgid_t group = new->egid; |
63 | int ret; | 63 | int ret; |
64 | 64 | ||
65 | if (parent_ns->level > 32) | ||
66 | return -EUSERS; | ||
67 | |||
65 | /* | 68 | /* |
66 | * Verify that we can not violate the policy of which files | 69 | * Verify that we can not violate the policy of which files |
67 | * may be accessed that is specified by the root directory, | 70 | * may be accessed that is specified by the root directory, |
@@ -92,6 +95,7 @@ int create_user_ns(struct cred *new) | |||
92 | atomic_set(&ns->count, 1); | 95 | atomic_set(&ns->count, 1); |
93 | /* Leave the new->user_ns reference with the new user namespace. */ | 96 | /* Leave the new->user_ns reference with the new user namespace. */ |
94 | ns->parent = parent_ns; | 97 | ns->parent = parent_ns; |
98 | ns->level = parent_ns->level + 1; | ||
95 | ns->owner = owner; | 99 | ns->owner = owner; |
96 | ns->group = group; | 100 | ns->group = group; |
97 | 101 | ||
@@ -105,16 +109,21 @@ int create_user_ns(struct cred *new) | |||
105 | int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) | 109 | int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) |
106 | { | 110 | { |
107 | struct cred *cred; | 111 | struct cred *cred; |
112 | int err = -ENOMEM; | ||
108 | 113 | ||
109 | if (!(unshare_flags & CLONE_NEWUSER)) | 114 | if (!(unshare_flags & CLONE_NEWUSER)) |
110 | return 0; | 115 | return 0; |
111 | 116 | ||
112 | cred = prepare_creds(); | 117 | cred = prepare_creds(); |
113 | if (!cred) | 118 | if (cred) { |
114 | return -ENOMEM; | 119 | err = create_user_ns(cred); |
120 | if (err) | ||
121 | put_cred(cred); | ||
122 | else | ||
123 | *new_cred = cred; | ||
124 | } | ||
115 | 125 | ||
116 | *new_cred = cred; | 126 | return err; |
117 | return create_user_ns(cred); | ||
118 | } | 127 | } |
119 | 128 | ||
120 | void free_user_ns(struct user_namespace *ns) | 129 | void free_user_ns(struct user_namespace *ns) |
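
create_user_ns() now records a nesting level in each namespace and rejects creation with -EUSERS once the parent is more than 32 levels deep, before any allocation happens. A minimal sketch of that depth check; the struct and macro names are hypothetical stand-ins, and the real code uses the literal 32:

#include <linux/errno.h>

#define MAX_USER_NS_LEVEL 32            /* assumed name for the limit used in the diff */

struct user_ns_stub {                   /* hypothetical stand-in for struct user_namespace */
        struct user_ns_stub *parent;
        int level;
};

static int nest_user_ns(struct user_ns_stub *child, struct user_ns_stub *parent)
{
        if (parent->level > MAX_USER_NS_LEVEL)
                return -EUSERS;         /* too many nested user namespaces */

        child->parent = parent;
        child->level = parent->level + 1;
        return 0;
}
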
diff --git a/kernel/wait.c b/kernel/wait.c index ce0daa320a26..d550920e040c 100644 --- a/kernel/wait.c +++ b/kernel/wait.c | |||
@@ -333,7 +333,8 @@ int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, | |||
333 | prepare_to_wait(wq, &q->wait, mode); | 333 | prepare_to_wait(wq, &q->wait, mode); |
334 | val = q->key.flags; | 334 | val = q->key.flags; |
335 | if (atomic_read(val) == 0) | 335 | if (atomic_read(val) == 0) |
336 | ret = (*action)(val); | 336 | break; |
337 | ret = (*action)(val); | ||
337 | } while (!ret && atomic_read(val) != 0); | 338 | } while (!ret && atomic_read(val) != 0); |
338 | finish_wait(wq, &q->wait); | 339 | finish_wait(wq, &q->wait); |
339 | return ret; | 340 | return ret; |
@@ -362,8 +363,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); | |||
362 | 363 | ||
363 | /** | 364 | /** |
364 | * wake_up_atomic_t - Wake up a waiter on a atomic_t | 365 | * wake_up_atomic_t - Wake up a waiter on a atomic_t |
365 | * @word: The word being waited on, a kernel virtual address | 366 | * @p: The atomic_t being waited on, a kernel virtual address |
366 | * @bit: The bit of the word being waited on | ||
367 | * | 367 | * |
368 | * Wake up anyone waiting for the atomic_t to go to zero. | 368 | * Wake up anyone waiting for the atomic_t to go to zero. |
369 | * | 369 | * |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1241d8c91d5e..51c4f34d258e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -553,14 +553,6 @@ void __init lockup_detector_init(void) | |||
553 | { | 553 | { |
554 | set_sample_period(); | 554 | set_sample_period(); |
555 | 555 | ||
556 | #ifdef CONFIG_NO_HZ_FULL | ||
557 | if (watchdog_user_enabled) { | ||
558 | watchdog_user_enabled = 0; | ||
559 | pr_warning("Disabled lockup detectors by default for full dynticks\n"); | ||
560 | pr_warning("You can reactivate it with 'sysctl -w kernel.watchdog=1'\n"); | ||
561 | } | ||
562 | #endif | ||
563 | |||
564 | if (watchdog_user_enabled) | 556 | if (watchdog_user_enabled) |
565 | watchdog_enable_all_cpus(); | 557 | watchdog_enable_all_cpus(); |
566 | } | 558 | } |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4a0c3c..29b79852a845 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -16,9 +16,10 @@ | |||
16 | * | 16 | * |
17 | * This is the generic async execution mechanism. Work items are | 17 | * This is the generic async execution mechanism. Work items are |
18 | * executed in process context. The worker pool is shared and | 18 | * executed in process context. The worker pool is shared and |
19 | * automatically managed. There is one worker pool for each CPU and | 19 | * automatically managed. There are two worker pools for each CPU (one for |
20 | * one extra for works which are better served by workers which are | 20 | * normal work items and the other for high priority ones) and some extra |
21 | * not bound to any specific CPU. | 21 | * pools for workqueues which are not bound to any specific CPU - the |
22 | * number of these backing pools is dynamic. | ||
22 | * | 23 | * |
23 | * Please read Documentation/workqueue.txt for details. | 24 | * Please read Documentation/workqueue.txt for details. |
24 | */ | 25 | */ |
@@ -2033,8 +2034,11 @@ static bool maybe_destroy_workers(struct worker_pool *pool) | |||
2033 | * multiple times. Does GFP_KERNEL allocations. | 2034 | * multiple times. Does GFP_KERNEL allocations. |
2034 | * | 2035 | * |
2035 | * RETURNS: | 2036 | * RETURNS: |
2036 | spin_lock_irq(pool->lock) which may be released and regrabbed | 2037 | %false if the pool doesn't need management and the caller can safely start |
2037 | * multiple times. Does GFP_KERNEL allocations. | 2038 | * processing works, %true indicates that the function released pool->lock |
2039 | * and reacquired it to perform some management function and that the | ||
2040 | * conditions that the caller verified while holding the lock before | ||
2041 | * calling the function might no longer be true. | ||
2038 | */ | 2042 | */ |
2039 | static bool manage_workers(struct worker *worker) | 2043 | static bool manage_workers(struct worker *worker) |
2040 | { | 2044 | { |
@@ -2201,6 +2205,15 @@ __acquires(&pool->lock) | |||
2201 | dump_stack(); | 2205 | dump_stack(); |
2202 | } | 2206 | } |
2203 | 2207 | ||
2208 | /* | ||
2209 | * The following prevents a kworker from hogging CPU on !PREEMPT | ||
2210 | * kernels, where a requeueing work item waiting for something to | ||
2211 | * happen could deadlock with stop_machine as such work item could | ||
2212 | * indefinitely requeue itself while all other CPUs are trapped in | ||
2213 | * stop_machine. | ||
2214 | */ | ||
2215 | cond_resched(); | ||
2216 | |||
2204 | spin_lock_irq(&pool->lock); | 2217 | spin_lock_irq(&pool->lock); |
2205 | 2218 | ||
2206 | /* clear cpu intensive status */ | 2219 | /* clear cpu intensive status */ |
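
The cond_resched() added here gives !PREEMPT kernels a scheduling point between work items, so a work item that keeps requeueing itself cannot pin the kworker and deadlock against stop_machine. A hedged sketch of the kind of self-requeueing work the comment describes (names and the condition flag are hypothetical):

#include <linux/workqueue.h>

static void poll_fn(struct work_struct *work);
static DECLARE_WORK(poll_work, poll_fn);
static bool condition_met;              /* hypothetical condition the work polls for */

static void poll_fn(struct work_struct *work)
{
        if (!condition_met)
                schedule_work(&poll_work);      /* requeue itself and run again */
        /*
         * On !PREEMPT, the cond_resched() in the work-processing loop above is
         * what lets the kworker yield between iterations of a loop like this.
         */
}
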
@@ -2817,6 +2830,19 @@ already_gone: | |||
2817 | return false; | 2830 | return false; |
2818 | } | 2831 | } |
2819 | 2832 | ||
2833 | static bool __flush_work(struct work_struct *work) | ||
2834 | { | ||
2835 | struct wq_barrier barr; | ||
2836 | |||
2837 | if (start_flush_work(work, &barr)) { | ||
2838 | wait_for_completion(&barr.done); | ||
2839 | destroy_work_on_stack(&barr.work); | ||
2840 | return true; | ||
2841 | } else { | ||
2842 | return false; | ||
2843 | } | ||
2844 | } | ||
2845 | |||
2820 | /** | 2846 | /** |
2821 | * flush_work - wait for a work to finish executing the last queueing instance | 2847 | * flush_work - wait for a work to finish executing the last queueing instance |
2822 | * @work: the work to flush | 2848 | * @work: the work to flush |
@@ -2830,18 +2856,10 @@ already_gone: | |||
2830 | */ | 2856 | */ |
2831 | bool flush_work(struct work_struct *work) | 2857 | bool flush_work(struct work_struct *work) |
2832 | { | 2858 | { |
2833 | struct wq_barrier barr; | ||
2834 | |||
2835 | lock_map_acquire(&work->lockdep_map); | 2859 | lock_map_acquire(&work->lockdep_map); |
2836 | lock_map_release(&work->lockdep_map); | 2860 | lock_map_release(&work->lockdep_map); |
2837 | 2861 | ||
2838 | if (start_flush_work(work, &barr)) { | 2862 | return __flush_work(work); |
2839 | wait_for_completion(&barr.done); | ||
2840 | destroy_work_on_stack(&barr.work); | ||
2841 | return true; | ||
2842 | } else { | ||
2843 | return false; | ||
2844 | } | ||
2845 | } | 2863 | } |
2846 | EXPORT_SYMBOL_GPL(flush_work); | 2864 | EXPORT_SYMBOL_GPL(flush_work); |
2847 | 2865 | ||
@@ -3081,25 +3099,26 @@ static struct workqueue_struct *dev_to_wq(struct device *dev) | |||
3081 | return wq_dev->wq; | 3099 | return wq_dev->wq; |
3082 | } | 3100 | } |
3083 | 3101 | ||
3084 | static ssize_t wq_per_cpu_show(struct device *dev, | 3102 | static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, |
3085 | struct device_attribute *attr, char *buf) | 3103 | char *buf) |
3086 | { | 3104 | { |
3087 | struct workqueue_struct *wq = dev_to_wq(dev); | 3105 | struct workqueue_struct *wq = dev_to_wq(dev); |
3088 | 3106 | ||
3089 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); | 3107 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); |
3090 | } | 3108 | } |
3109 | static DEVICE_ATTR_RO(per_cpu); | ||
3091 | 3110 | ||
3092 | static ssize_t wq_max_active_show(struct device *dev, | 3111 | static ssize_t max_active_show(struct device *dev, |
3093 | struct device_attribute *attr, char *buf) | 3112 | struct device_attribute *attr, char *buf) |
3094 | { | 3113 | { |
3095 | struct workqueue_struct *wq = dev_to_wq(dev); | 3114 | struct workqueue_struct *wq = dev_to_wq(dev); |
3096 | 3115 | ||
3097 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); | 3116 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); |
3098 | } | 3117 | } |
3099 | 3118 | ||
3100 | static ssize_t wq_max_active_store(struct device *dev, | 3119 | static ssize_t max_active_store(struct device *dev, |
3101 | struct device_attribute *attr, | 3120 | struct device_attribute *attr, const char *buf, |
3102 | const char *buf, size_t count) | 3121 | size_t count) |
3103 | { | 3122 | { |
3104 | struct workqueue_struct *wq = dev_to_wq(dev); | 3123 | struct workqueue_struct *wq = dev_to_wq(dev); |
3105 | int val; | 3124 | int val; |
@@ -3110,12 +3129,14 @@ static ssize_t wq_max_active_store(struct device *dev, | |||
3110 | workqueue_set_max_active(wq, val); | 3129 | workqueue_set_max_active(wq, val); |
3111 | return count; | 3130 | return count; |
3112 | } | 3131 | } |
3132 | static DEVICE_ATTR_RW(max_active); | ||
3113 | 3133 | ||
3114 | static struct device_attribute wq_sysfs_attrs[] = { | 3134 | static struct attribute *wq_sysfs_attrs[] = { |
3115 | __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), | 3135 | &dev_attr_per_cpu.attr, |
3116 | __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), | 3136 | &dev_attr_max_active.attr, |
3117 | __ATTR_NULL, | 3137 | NULL, |
3118 | }; | 3138 | }; |
3139 | ATTRIBUTE_GROUPS(wq_sysfs); | ||
3119 | 3140 | ||
3120 | static ssize_t wq_pool_ids_show(struct device *dev, | 3141 | static ssize_t wq_pool_ids_show(struct device *dev, |
3121 | struct device_attribute *attr, char *buf) | 3142 | struct device_attribute *attr, char *buf) |
@@ -3265,7 +3286,7 @@ static struct device_attribute wq_sysfs_unbound_attrs[] = { | |||
3265 | 3286 | ||
3266 | static struct bus_type wq_subsys = { | 3287 | static struct bus_type wq_subsys = { |
3267 | .name = "workqueue", | 3288 | .name = "workqueue", |
3268 | .dev_attrs = wq_sysfs_attrs, | 3289 | .dev_groups = wq_sysfs_groups, |
3269 | }; | 3290 | }; |
3270 | 3291 | ||
3271 | static int __init wq_sysfs_init(void) | 3292 | static int __init wq_sysfs_init(void) |
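
These hunks convert the workqueue sysfs attributes from a struct device_attribute array on .dev_attrs to attribute groups on .dev_groups, using DEVICE_ATTR_RO()/DEVICE_ATTR_RW(), which require the show/store callbacks to be named <attr>_show/<attr>_store. A minimal sketch of the pattern on a hypothetical "example" attribute:

#include <linux/device.h>
#include <linux/sysfs.h>
#include <linux/kernel.h>

/* show() must be named example_show for DEVICE_ATTR_RO(example) to find it. */
static ssize_t example_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        return scnprintf(buf, PAGE_SIZE, "%d\n", 42);
}
static DEVICE_ATTR_RO(example);         /* defines dev_attr_example */

static struct attribute *demo_attrs[] = {
        &dev_attr_example.attr,
        NULL,
};
ATTRIBUTE_GROUPS(demo);                 /* provides demo_groups for a .dev_groups pointer */
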
@@ -3411,6 +3432,12 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, | |||
3411 | { | 3432 | { |
3412 | to->nice = from->nice; | 3433 | to->nice = from->nice; |
3413 | cpumask_copy(to->cpumask, from->cpumask); | 3434 | cpumask_copy(to->cpumask, from->cpumask); |
3435 | /* | ||
3436 | * Unlike hash and equality test, this function doesn't ignore | ||
3437 | * ->no_numa as it is used for both pool and wq attrs. Instead, | ||
3438 | * get_unbound_pool() explicitly clears ->no_numa after copying. | ||
3439 | */ | ||
3440 | to->no_numa = from->no_numa; | ||
3414 | } | 3441 | } |
3415 | 3442 | ||
3416 | /* hash value of the content of @attr */ | 3443 | /* hash value of the content of @attr */ |
@@ -3578,6 +3605,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) | |||
3578 | lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ | 3605 | lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ |
3579 | copy_workqueue_attrs(pool->attrs, attrs); | 3606 | copy_workqueue_attrs(pool->attrs, attrs); |
3580 | 3607 | ||
3608 | /* | ||
3609 | * no_numa isn't a worker_pool attribute, always clear it. See | ||
3610 | * 'struct workqueue_attrs' comments for detail. | ||
3611 | */ | ||
3612 | pool->attrs->no_numa = false; | ||
3613 | |||
3581 | /* if cpumask is contained inside a NUMA node, we belong to that node */ | 3614 | /* if cpumask is contained inside a NUMA node, we belong to that node */ |
3582 | if (wq_numa_enabled) { | 3615 | if (wq_numa_enabled) { |
3583 | for_each_node(node) { | 3616 | for_each_node(node) { |
@@ -4644,7 +4677,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) | |||
4644 | * Workqueues should be brought up before normal priority CPU notifiers. | 4677 | * Workqueues should be brought up before normal priority CPU notifiers. |
4645 | * This will be registered high priority CPU notifier. | 4678 | * This will be registered high priority CPU notifier. |
4646 | */ | 4679 | */ |
4647 | static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, | 4680 | static int workqueue_cpu_up_callback(struct notifier_block *nfb, |
4648 | unsigned long action, | 4681 | unsigned long action, |
4649 | void *hcpu) | 4682 | void *hcpu) |
4650 | { | 4683 | { |
@@ -4697,7 +4730,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, | |||
4697 | * Workqueues should be brought down after normal priority CPU notifiers. | 4730 | * Workqueues should be brought down after normal priority CPU notifiers. |
4698 | * This will be registered as low priority CPU notifier. | 4731 | * This will be registered as low priority CPU notifier. |
4699 | */ | 4732 | */ |
4700 | static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, | 4733 | static int workqueue_cpu_down_callback(struct notifier_block *nfb, |
4701 | unsigned long action, | 4734 | unsigned long action, |
4702 | void *hcpu) | 4735 | void *hcpu) |
4703 | { | 4736 | { |
@@ -4756,7 +4789,14 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) | |||
4756 | 4789 | ||
4757 | INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); | 4790 | INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); |
4758 | schedule_work_on(cpu, &wfc.work); | 4791 | schedule_work_on(cpu, &wfc.work); |
4759 | flush_work(&wfc.work); | 4792 | |
4793 | /* | ||
4794 | * The work item is on-stack and can't lead to deadlock through | ||
4795 | * flushing. Use __flush_work() to avoid spurious lockdep warnings | ||
4796 | * when work_on_cpu()s are nested. | ||
4797 | */ | ||
4798 | __flush_work(&wfc.work); | ||
4799 | |||
4760 | return wfc.ret; | 4800 | return wfc.ret; |
4761 | } | 4801 | } |
4762 | EXPORT_SYMBOL_GPL(work_on_cpu); | 4802 | EXPORT_SYMBOL_GPL(work_on_cpu); |
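
Finally, work_on_cpu() now flushes its on-stack work item through __flush_work(), skipping the lockdep acquire/release that produced spurious warnings when work_on_cpu() calls nest. A short usage sketch of work_on_cpu() itself; the callback and wrapper names are hypothetical:

#include <linux/workqueue.h>
#include <linux/topology.h>

static long read_node_id(void *arg)
{
        /* runs inside a kworker bound to the CPU passed to work_on_cpu() */
        return numa_node_id();
}

static long node_of_cpu(int cpu)
{
        /* sleeps until read_node_id() has run on @cpu, then returns its result */
        return work_on_cpu(cpu, read_node_id, NULL);
}
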