path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 3
-rw-r--r--  kernel/cgroup.c | 1687
-rw-r--r--  kernel/cgroup_freezer.c | 155
-rw-r--r--  kernel/context_tracking.c | 125
-rw-r--r--  kernel/cpu.c | 15
-rw-r--r--  kernel/cpuset.c | 337
-rw-r--r--  kernel/events/callchain.c | 3
-rw-r--r--  kernel/events/core.c | 416
-rw-r--r--  kernel/fork.c | 13
-rw-r--r--  kernel/freezer.c | 2
-rw-r--r--  kernel/hrtimer.c | 6
-rw-r--r--  kernel/hung_task.c | 13
-rw-r--r--  kernel/jump_label.c | 1
-rw-r--r--  kernel/lglock.c | 12
-rw-r--r--  kernel/mutex.c | 47
-rw-r--r--  kernel/nsproxy.c | 27
-rw-r--r--  kernel/pid_namespace.c | 4
-rw-r--r--  kernel/power/autosleep.c | 3
-rw-r--r--  kernel/power/hibernate.c | 2
-rw-r--r--  kernel/power/process.c | 11
-rw-r--r--  kernel/power/qos.c | 20
-rw-r--r--  kernel/power/suspend.c | 4
-rw-r--r--  kernel/printk/Makefile | 2
-rw-r--r--  kernel/printk/braille.c | 49
-rw-r--r--  kernel/printk/braille.h | 48
-rw-r--r--  kernel/printk/console_cmdline.h | 14
-rw-r--r--  kernel/printk/printk.c (renamed from kernel/printk.c) | 192
-rw-r--r--  kernel/profile.c | 2
-rw-r--r--  kernel/ptrace.c | 1
-rw-r--r--  kernel/rcu.h | 12
-rw-r--r--  kernel/rcupdate.c | 102
-rw-r--r--  kernel/rcutiny.c | 2
-rw-r--r--  kernel/rcutiny_plugin.h | 2
-rw-r--r--  kernel/rcutorture.c | 402
-rw-r--r--  kernel/rcutree.c | 261
-rw-r--r--  kernel/rcutree.h | 23
-rw-r--r--  kernel/rcutree_plugin.h | 466
-rw-r--r--  kernel/relay.c | 2
-rw-r--r--  kernel/sched/core.c | 249
-rw-r--r--  kernel/sched/cpuacct.c | 51
-rw-r--r--  kernel/sched/cpupri.c | 4
-rw-r--r--  kernel/sched/cputime.c | 55
-rw-r--r--  kernel/sched/fair.c | 632
-rw-r--r--  kernel/sched/sched.h | 14
-rw-r--r--  kernel/smp.c | 18
-rw-r--r--  kernel/smpboot.c | 2
-rw-r--r--  kernel/softirq.c | 8
-rw-r--r--  kernel/sysctl.c | 6
-rw-r--r--  kernel/time/Kconfig | 51
-rw-r--r--  kernel/time/sched_clock.c | 2
-rw-r--r--  kernel/time/tick-sched.c | 71
-rw-r--r--  kernel/time/timer_list.c | 41
-rw-r--r--  kernel/timer.c | 10
-rw-r--r--  kernel/trace/ftrace.c | 105
-rw-r--r--  kernel/trace/ring_buffer.c | 26
-rw-r--r--  kernel/trace/trace.c | 254
-rw-r--r--  kernel/trace/trace.h | 21
-rw-r--r--  kernel/trace/trace_event_perf.c | 10
-rw-r--r--  kernel/trace/trace_events.c | 292
-rw-r--r--  kernel/trace/trace_events_filter.c | 21
-rw-r--r--  kernel/trace/trace_functions.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 54
-rw-r--r--  kernel/trace/trace_kprobe.c | 50
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 8
-rw-r--r--  kernel/trace/trace_output.c | 14
-rw-r--r--  kernel/trace/trace_printk.c | 19
-rw-r--r--  kernel/trace/trace_syscalls.c | 26
-rw-r--r--  kernel/trace/trace_uprobe.c | 53
-rw-r--r--  kernel/user_namespace.c | 17
-rw-r--r--  kernel/wait.c | 6
-rw-r--r--  kernel/watchdog.c | 8
-rw-r--r--  kernel/workqueue.c | 98
72 files changed, 3988 insertions, 2796 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 470839d1a30e..35ef1185e359 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,7 +2,7 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5obj-y = fork.o exec_domain.o panic.o printk.o \ 5obj-y = fork.o exec_domain.o panic.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
@@ -24,6 +24,7 @@ endif
24 24
25obj-y += sched/ 25obj-y += sched/
26obj-y += power/ 26obj-y += power/
27obj-y += printk/
27obj-y += cpu/ 28obj-y += cpu/
28 29
29obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o 30obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0e0b20b8c5db..e0aeb32415ff 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -81,7 +81,7 @@
81 */ 81 */
82#ifdef CONFIG_PROVE_RCU 82#ifdef CONFIG_PROVE_RCU
83DEFINE_MUTEX(cgroup_mutex); 83DEFINE_MUTEX(cgroup_mutex);
84EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ 84EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */
85#else 85#else
86static DEFINE_MUTEX(cgroup_mutex); 86static DEFINE_MUTEX(cgroup_mutex);
87#endif 87#endif
@@ -117,6 +117,7 @@ struct cfent {
117 struct list_head node; 117 struct list_head node;
118 struct dentry *dentry; 118 struct dentry *dentry;
119 struct cftype *type; 119 struct cftype *type;
120 struct cgroup_subsys_state *css;
120 121
121 /* file xattrs */ 122 /* file xattrs */
122 struct simple_xattrs xattrs; 123 struct simple_xattrs xattrs;
@@ -159,9 +160,9 @@ struct css_id {
159 */ 160 */
160struct cgroup_event { 161struct cgroup_event {
161 /* 162 /*
162 * Cgroup which the event belongs to. 163 * css which the event belongs to.
163 */ 164 */
164 struct cgroup *cgrp; 165 struct cgroup_subsys_state *css;
165 /* 166 /*
166 * Control file which the event associated. 167 * Control file which the event associated.
167 */ 168 */
@@ -215,10 +216,33 @@ static u64 cgroup_serial_nr_next = 1;
215 */ 216 */
216static int need_forkexit_callback __read_mostly; 217static int need_forkexit_callback __read_mostly;
217 218
218static void cgroup_offline_fn(struct work_struct *work); 219static struct cftype cgroup_base_files[];
220
221static void cgroup_destroy_css_killed(struct cgroup *cgrp);
219static int cgroup_destroy_locked(struct cgroup *cgrp); 222static int cgroup_destroy_locked(struct cgroup *cgrp);
220static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 223static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
221 struct cftype cfts[], bool is_add); 224 bool is_add);
225
226/**
227 * cgroup_css - obtain a cgroup's css for the specified subsystem
228 * @cgrp: the cgroup of interest
229 * @ss: the subsystem of interest (%NULL returns the dummy_css)
230 *
231 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
232 * function must be called either under cgroup_mutex or rcu_read_lock() and
233 * the caller is responsible for pinning the returned css if it wants to
234 * keep accessing it outside the said locks. This function may return
235 * %NULL if @cgrp doesn't have @subsys_id enabled.
236 */
237static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
238 struct cgroup_subsys *ss)
239{
240 if (ss)
241 return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
242 lockdep_is_held(&cgroup_mutex));
243 else
244 return &cgrp->dummy_css;
245}
222 246
223/* convenient tests for these bits */ 247/* convenient tests for these bits */
224static inline bool cgroup_is_dead(const struct cgroup *cgrp) 248static inline bool cgroup_is_dead(const struct cgroup *cgrp)
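[Note] The cgroup_css() accessor introduced above centralizes the lookup's locking contract: hold cgroup_mutex or rcu_read_lock(), and pin the returned css yourself if it must outlive the lock. A minimal caller sketch of that contract, mirroring the pattern cgroup_file_open() adopts later in this diff; the pin_css() helper name is hypothetical and not part of the patch:

        /* Hypothetical helper: look up and pin @ss's css on @cgrp under RCU. */
        static struct cgroup_subsys_state *pin_css(struct cgroup *cgrp,
                                                   struct cgroup_subsys *ss)
        {
                struct cgroup_subsys_state *css;

                rcu_read_lock();
                css = cgroup_css(cgrp, ss);        /* may be NULL if @ss not enabled */
                if (ss && css && !css_tryget(css)) /* the dummy_css needs no ref */
                        css = NULL;
                rcu_read_unlock();
                return css;                        /* caller drops it with css_put() */
        }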
@@ -365,9 +389,11 @@ static struct cgrp_cset_link init_cgrp_cset_link;
365static int cgroup_init_idr(struct cgroup_subsys *ss, 389static int cgroup_init_idr(struct cgroup_subsys *ss,
366 struct cgroup_subsys_state *css); 390 struct cgroup_subsys_state *css);
367 391
368/* css_set_lock protects the list of css_set objects, and the 392/*
369 * chain of tasks off each css_set. Nests outside task->alloc_lock 393 * css_set_lock protects the list of css_set objects, and the chain of
370 * due to cgroup_iter_start() */ 394 * tasks off each css_set. Nests outside task->alloc_lock due to
395 * css_task_iter_start().
396 */
371static DEFINE_RWLOCK(css_set_lock); 397static DEFINE_RWLOCK(css_set_lock);
372static int css_set_count; 398static int css_set_count;
373 399
@@ -392,10 +418,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
392 return key; 418 return key;
393} 419}
394 420
395/* We don't maintain the lists running through each css_set to its 421/*
396 * task until after the first call to cgroup_iter_start(). This 422 * We don't maintain the lists running through each css_set to its task
397 * reduces the fork()/exit() overhead for people who have cgroups 423 * until after the first call to css_task_iter_start(). This reduces the
398 * compiled into their kernel but not actually in use */ 424 * fork()/exit() overhead for people who have cgroups compiled into their
425 * kernel but not actually in use.
426 */
399static int use_task_css_set_links __read_mostly; 427static int use_task_css_set_links __read_mostly;
400 428
401static void __put_css_set(struct css_set *cset, int taskexit) 429static void __put_css_set(struct css_set *cset, int taskexit)
@@ -464,7 +492,7 @@ static inline void put_css_set_taskexit(struct css_set *cset)
464 * @new_cgrp: cgroup that's being entered by the task 492 * @new_cgrp: cgroup that's being entered by the task
465 * @template: desired set of css pointers in css_set (pre-calculated) 493 * @template: desired set of css pointers in css_set (pre-calculated)
466 * 494 *
467 * Returns true if "cg" matches "old_cg" except for the hierarchy 495 * Returns true if "cset" matches "old_cset" except for the hierarchy
468 * which "new_cgrp" belongs to, for which it should match "new_cgrp". 496 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
469 */ 497 */
470static bool compare_css_sets(struct css_set *cset, 498static bool compare_css_sets(struct css_set *cset,
@@ -555,7 +583,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
555 /* Subsystem is in this hierarchy. So we want 583 /* Subsystem is in this hierarchy. So we want
556 * the subsystem state from the new 584 * the subsystem state from the new
557 * cgroup */ 585 * cgroup */
558 template[i] = cgrp->subsys[i]; 586 template[i] = cgroup_css(cgrp, ss);
559 } else { 587 } else {
560 /* Subsystem is not in this hierarchy, so we 588 /* Subsystem is not in this hierarchy, so we
561 * don't want to change the subsystem state */ 589 * don't want to change the subsystem state */
@@ -803,8 +831,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
803 831
804static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); 832static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
805static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 833static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
806static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 834static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
807 unsigned long subsys_mask);
808static const struct inode_operations cgroup_dir_inode_operations; 835static const struct inode_operations cgroup_dir_inode_operations;
809static const struct file_operations proc_cgroupstats_operations; 836static const struct file_operations proc_cgroupstats_operations;
810 837
@@ -813,8 +840,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
813 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 840 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
814}; 841};
815 842
816static int alloc_css_id(struct cgroup_subsys *ss, 843static int alloc_css_id(struct cgroup_subsys_state *child_css);
817 struct cgroup *parent, struct cgroup *child);
818 844
819static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) 845static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
820{ 846{
@@ -845,15 +871,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
845static void cgroup_free_fn(struct work_struct *work) 871static void cgroup_free_fn(struct work_struct *work)
846{ 872{
847 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 873 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
848 struct cgroup_subsys *ss;
849 874
850 mutex_lock(&cgroup_mutex); 875 mutex_lock(&cgroup_mutex);
851 /*
852 * Release the subsystem state objects.
853 */
854 for_each_root_subsys(cgrp->root, ss)
855 ss->css_free(cgrp);
856
857 cgrp->root->number_of_cgroups--; 876 cgrp->root->number_of_cgroups--;
858 mutex_unlock(&cgroup_mutex); 877 mutex_unlock(&cgroup_mutex);
859 878
@@ -864,8 +883,6 @@ static void cgroup_free_fn(struct work_struct *work)
864 */ 883 */
865 dput(cgrp->parent->dentry); 884 dput(cgrp->parent->dentry);
866 885
867 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
868
869 /* 886 /*
870 * Drop the active superblock reference that we took when we 887 * Drop the active superblock reference that we took when we
871 * created the cgroup. This will free cgrp->root, if we are 888 * created the cgroup. This will free cgrp->root, if we are
@@ -956,27 +973,22 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
956} 973}
957 974
958/** 975/**
959 * cgroup_clear_directory - selective removal of base and subsystem files 976 * cgroup_clear_dir - remove subsys files in a cgroup directory
960 * @dir: directory containing the files 977 * @cgrp: target cgroup
961 * @base_files: true if the base files should be removed
962 * @subsys_mask: mask of the subsystem ids whose files should be removed 978 * @subsys_mask: mask of the subsystem ids whose files should be removed
963 */ 979 */
964static void cgroup_clear_directory(struct dentry *dir, bool base_files, 980static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
965 unsigned long subsys_mask)
966{ 981{
967 struct cgroup *cgrp = __d_cgrp(dir);
968 struct cgroup_subsys *ss; 982 struct cgroup_subsys *ss;
983 int i;
969 984
970 for_each_root_subsys(cgrp->root, ss) { 985 for_each_subsys(ss, i) {
971 struct cftype_set *set; 986 struct cftype_set *set;
972 if (!test_bit(ss->subsys_id, &subsys_mask)) 987
988 if (!test_bit(i, &subsys_mask))
973 continue; 989 continue;
974 list_for_each_entry(set, &ss->cftsets, node) 990 list_for_each_entry(set, &ss->cftsets, node)
975 cgroup_addrm_files(cgrp, NULL, set->cfts, false); 991 cgroup_addrm_files(cgrp, set->cfts, false);
976 }
977 if (base_files) {
978 while (!list_empty(&cgrp->files))
979 cgroup_rm_file(cgrp, NULL);
980 } 992 }
981} 993}
982 994
@@ -986,9 +998,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
986static void cgroup_d_remove_dir(struct dentry *dentry) 998static void cgroup_d_remove_dir(struct dentry *dentry)
987{ 999{
988 struct dentry *parent; 1000 struct dentry *parent;
989 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
990
991 cgroup_clear_directory(dentry, true, root->subsys_mask);
992 1001
993 parent = dentry->d_parent; 1002 parent = dentry->d_parent;
994 spin_lock(&parent->d_lock); 1003 spin_lock(&parent->d_lock);
@@ -1009,79 +1018,84 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1009{ 1018{
1010 struct cgroup *cgrp = &root->top_cgroup; 1019 struct cgroup *cgrp = &root->top_cgroup;
1011 struct cgroup_subsys *ss; 1020 struct cgroup_subsys *ss;
1012 int i; 1021 unsigned long pinned = 0;
1022 int i, ret;
1013 1023
1014 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1024 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1015 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 1025 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
1016 1026
1017 /* Check that any added subsystems are currently free */ 1027 /* Check that any added subsystems are currently free */
1018 for_each_subsys(ss, i) { 1028 for_each_subsys(ss, i) {
1019 unsigned long bit = 1UL << i; 1029 if (!(added_mask & (1 << i)))
1020
1021 if (!(bit & added_mask))
1022 continue; 1030 continue;
1023 1031
1032 /* is the subsystem mounted elsewhere? */
1024 if (ss->root != &cgroup_dummy_root) { 1033 if (ss->root != &cgroup_dummy_root) {
1025 /* Subsystem isn't free */ 1034 ret = -EBUSY;
1026 return -EBUSY; 1035 goto out_put;
1027 } 1036 }
1037
1038 /* pin the module */
1039 if (!try_module_get(ss->module)) {
1040 ret = -ENOENT;
1041 goto out_put;
1042 }
1043 pinned |= 1 << i;
1028 } 1044 }
1029 1045
1030 /* Currently we don't handle adding/removing subsystems when 1046 /* subsys could be missing if unloaded between parsing and here */
1031 * any child cgroups exist. This is theoretically supportable 1047 if (added_mask != pinned) {
1032 * but involves complex error handling, so it's being left until 1048 ret = -ENOENT;
1033 * later */ 1049 goto out_put;
1034 if (root->number_of_cgroups > 1) 1050 }
1035 return -EBUSY; 1051
1052 ret = cgroup_populate_dir(cgrp, added_mask);
1053 if (ret)
1054 goto out_put;
1055
1056 /*
1057 * Nothing can fail from this point on. Remove files for the
1058 * removed subsystems and rebind each subsystem.
1059 */
1060 cgroup_clear_dir(cgrp, removed_mask);
1036 1061
1037 /* Process each subsystem */
1038 for_each_subsys(ss, i) { 1062 for_each_subsys(ss, i) {
1039 unsigned long bit = 1UL << i; 1063 unsigned long bit = 1UL << i;
1040 1064
1041 if (bit & added_mask) { 1065 if (bit & added_mask) {
1042 /* We're binding this subsystem to this hierarchy */ 1066 /* We're binding this subsystem to this hierarchy */
1043 BUG_ON(cgrp->subsys[i]); 1067 BUG_ON(cgroup_css(cgrp, ss));
1044 BUG_ON(!cgroup_dummy_top->subsys[i]); 1068 BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
1045 BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); 1069 BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
1070
1071 rcu_assign_pointer(cgrp->subsys[i],
1072 cgroup_css(cgroup_dummy_top, ss));
1073 cgroup_css(cgrp, ss)->cgroup = cgrp;
1046 1074
1047 cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
1048 cgrp->subsys[i]->cgroup = cgrp;
1049 list_move(&ss->sibling, &root->subsys_list); 1075 list_move(&ss->sibling, &root->subsys_list);
1050 ss->root = root; 1076 ss->root = root;
1051 if (ss->bind) 1077 if (ss->bind)
1052 ss->bind(cgrp); 1078 ss->bind(cgroup_css(cgrp, ss));
1053 1079
1054 /* refcount was already taken, and we're keeping it */ 1080 /* refcount was already taken, and we're keeping it */
1055 root->subsys_mask |= bit; 1081 root->subsys_mask |= bit;
1056 } else if (bit & removed_mask) { 1082 } else if (bit & removed_mask) {
1057 /* We're removing this subsystem */ 1083 /* We're removing this subsystem */
1058 BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); 1084 BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
1059 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1085 BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
1060 1086
1061 if (ss->bind) 1087 if (ss->bind)
1062 ss->bind(cgroup_dummy_top); 1088 ss->bind(cgroup_css(cgroup_dummy_top, ss));
1063 cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; 1089
1064 cgrp->subsys[i] = NULL; 1090 cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top;
1091 RCU_INIT_POINTER(cgrp->subsys[i], NULL);
1092
1065 cgroup_subsys[i]->root = &cgroup_dummy_root; 1093 cgroup_subsys[i]->root = &cgroup_dummy_root;
1066 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); 1094 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
1067 1095
1068 /* subsystem is now free - drop reference on module */ 1096 /* subsystem is now free - drop reference on module */
1069 module_put(ss->module); 1097 module_put(ss->module);
1070 root->subsys_mask &= ~bit; 1098 root->subsys_mask &= ~bit;
1071 } else if (bit & root->subsys_mask) {
1072 /* Subsystem state should already exist */
1073 BUG_ON(!cgrp->subsys[i]);
1074 /*
1075 * a refcount was taken, but we already had one, so
1076 * drop the extra reference.
1077 */
1078 module_put(ss->module);
1079#ifdef CONFIG_MODULE_UNLOAD
1080 BUG_ON(ss->module && !module_refcount(ss->module));
1081#endif
1082 } else {
1083 /* Subsystem state shouldn't exist */
1084 BUG_ON(cgrp->subsys[i]);
1085 } 1099 }
1086 } 1100 }
1087 1101
@@ -1092,6 +1106,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1092 root->flags |= CGRP_ROOT_SUBSYS_BOUND; 1106 root->flags |= CGRP_ROOT_SUBSYS_BOUND;
1093 1107
1094 return 0; 1108 return 0;
1109
1110out_put:
1111 for_each_subsys(ss, i)
1112 if (pinned & (1 << i))
1113 module_put(ss->module);
1114 return ret;
1095} 1115}
1096 1116
1097static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) 1117static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
@@ -1142,7 +1162,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1142 char *token, *o = data; 1162 char *token, *o = data;
1143 bool all_ss = false, one_ss = false; 1163 bool all_ss = false, one_ss = false;
1144 unsigned long mask = (unsigned long)-1; 1164 unsigned long mask = (unsigned long)-1;
1145 bool module_pin_failed = false;
1146 struct cgroup_subsys *ss; 1165 struct cgroup_subsys *ss;
1147 int i; 1166 int i;
1148 1167
@@ -1285,52 +1304,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1285 if (!opts->subsys_mask && !opts->name) 1304 if (!opts->subsys_mask && !opts->name)
1286 return -EINVAL; 1305 return -EINVAL;
1287 1306
1288 /*
1289 * Grab references on all the modules we'll need, so the subsystems
1290 * don't dance around before rebind_subsystems attaches them. This may
1291 * take duplicate reference counts on a subsystem that's already used,
1292 * but rebind_subsystems handles this case.
1293 */
1294 for_each_subsys(ss, i) {
1295 if (!(opts->subsys_mask & (1UL << i)))
1296 continue;
1297 if (!try_module_get(cgroup_subsys[i]->module)) {
1298 module_pin_failed = true;
1299 break;
1300 }
1301 }
1302 if (module_pin_failed) {
1303 /*
1304 * oops, one of the modules was going away. this means that we
1305 * raced with a module_delete call, and to the user this is
1306 * essentially a "subsystem doesn't exist" case.
1307 */
1308 for (i--; i >= 0; i--) {
1309 /* drop refcounts only on the ones we took */
1310 unsigned long bit = 1UL << i;
1311
1312 if (!(bit & opts->subsys_mask))
1313 continue;
1314 module_put(cgroup_subsys[i]->module);
1315 }
1316 return -ENOENT;
1317 }
1318
1319 return 0; 1307 return 0;
1320} 1308}
1321 1309
1322static void drop_parsed_module_refcounts(unsigned long subsys_mask)
1323{
1324 struct cgroup_subsys *ss;
1325 int i;
1326
1327 mutex_lock(&cgroup_mutex);
1328 for_each_subsys(ss, i)
1329 if (subsys_mask & (1UL << i))
1330 module_put(cgroup_subsys[i]->module);
1331 mutex_unlock(&cgroup_mutex);
1332}
1333
1334static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1310static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1335{ 1311{
1336 int ret = 0; 1312 int ret = 0;
@@ -1370,22 +1346,15 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1370 goto out_unlock; 1346 goto out_unlock;
1371 } 1347 }
1372 1348
1373 /* 1349 /* remounting is not allowed for populated hierarchies */
1374 * Clear out the files of subsystems that should be removed, do 1350 if (root->number_of_cgroups > 1) {
1375 * this before rebind_subsystems, since rebind_subsystems may 1351 ret = -EBUSY;
1376 * change this hierarchy's subsys_list.
1377 */
1378 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1379
1380 ret = rebind_subsystems(root, added_mask, removed_mask);
1381 if (ret) {
1382 /* rebind_subsystems failed, re-populate the removed files */
1383 cgroup_populate_dir(cgrp, false, removed_mask);
1384 goto out_unlock; 1352 goto out_unlock;
1385 } 1353 }
1386 1354
1387 /* re-populate subsystem files */ 1355 ret = rebind_subsystems(root, added_mask, removed_mask);
1388 cgroup_populate_dir(cgrp, false, added_mask); 1356 if (ret)
1357 goto out_unlock;
1389 1358
1390 if (opts.release_agent) 1359 if (opts.release_agent)
1391 strcpy(root->release_agent_path, opts.release_agent); 1360 strcpy(root->release_agent_path, opts.release_agent);
@@ -1395,8 +1364,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1395 mutex_unlock(&cgroup_root_mutex); 1364 mutex_unlock(&cgroup_root_mutex);
1396 mutex_unlock(&cgroup_mutex); 1365 mutex_unlock(&cgroup_mutex);
1397 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1366 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1398 if (ret)
1399 drop_parsed_module_refcounts(opts.subsys_mask);
1400 return ret; 1367 return ret;
1401} 1368}
1402 1369
@@ -1416,6 +1383,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1416 INIT_LIST_HEAD(&cgrp->release_list); 1383 INIT_LIST_HEAD(&cgrp->release_list);
1417 INIT_LIST_HEAD(&cgrp->pidlists); 1384 INIT_LIST_HEAD(&cgrp->pidlists);
1418 mutex_init(&cgrp->pidlist_mutex); 1385 mutex_init(&cgrp->pidlist_mutex);
1386 cgrp->dummy_css.cgroup = cgrp;
1419 INIT_LIST_HEAD(&cgrp->event_list); 1387 INIT_LIST_HEAD(&cgrp->event_list);
1420 spin_lock_init(&cgrp->event_list_lock); 1388 spin_lock_init(&cgrp->event_list_lock);
1421 simple_xattrs_init(&cgrp->xattrs); 1389 simple_xattrs_init(&cgrp->xattrs);
@@ -1431,6 +1399,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1431 cgrp->root = root; 1399 cgrp->root = root;
1432 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); 1400 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
1433 init_cgroup_housekeeping(cgrp); 1401 init_cgroup_housekeeping(cgrp);
1402 idr_init(&root->cgroup_idr);
1434} 1403}
1435 1404
1436static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) 1405static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
@@ -1503,7 +1472,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1503 */ 1472 */
1504 root->subsys_mask = opts->subsys_mask; 1473 root->subsys_mask = opts->subsys_mask;
1505 root->flags = opts->flags; 1474 root->flags = opts->flags;
1506 ida_init(&root->cgroup_ida);
1507 if (opts->release_agent) 1475 if (opts->release_agent)
1508 strcpy(root->release_agent_path, opts->release_agent); 1476 strcpy(root->release_agent_path, opts->release_agent);
1509 if (opts->name) 1477 if (opts->name)
@@ -1519,7 +1487,7 @@ static void cgroup_free_root(struct cgroupfs_root *root)
1519 /* hierarhcy ID shoulid already have been released */ 1487 /* hierarhcy ID shoulid already have been released */
1520 WARN_ON_ONCE(root->hierarchy_id); 1488 WARN_ON_ONCE(root->hierarchy_id);
1521 1489
1522 ida_destroy(&root->cgroup_ida); 1490 idr_destroy(&root->cgroup_idr);
1523 kfree(root); 1491 kfree(root);
1524 } 1492 }
1525} 1493}
@@ -1584,7 +1552,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1584 int ret = 0; 1552 int ret = 0;
1585 struct super_block *sb; 1553 struct super_block *sb;
1586 struct cgroupfs_root *new_root; 1554 struct cgroupfs_root *new_root;
1555 struct list_head tmp_links;
1587 struct inode *inode; 1556 struct inode *inode;
1557 const struct cred *cred;
1588 1558
1589 /* First find the desired set of subsystems */ 1559 /* First find the desired set of subsystems */
1590 mutex_lock(&cgroup_mutex); 1560 mutex_lock(&cgroup_mutex);
@@ -1600,7 +1570,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1600 new_root = cgroup_root_from_opts(&opts); 1570 new_root = cgroup_root_from_opts(&opts);
1601 if (IS_ERR(new_root)) { 1571 if (IS_ERR(new_root)) {
1602 ret = PTR_ERR(new_root); 1572 ret = PTR_ERR(new_root);
1603 goto drop_modules; 1573 goto out_err;
1604 } 1574 }
1605 opts.new_root = new_root; 1575 opts.new_root = new_root;
1606 1576
@@ -1609,17 +1579,15 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1609 if (IS_ERR(sb)) { 1579 if (IS_ERR(sb)) {
1610 ret = PTR_ERR(sb); 1580 ret = PTR_ERR(sb);
1611 cgroup_free_root(opts.new_root); 1581 cgroup_free_root(opts.new_root);
1612 goto drop_modules; 1582 goto out_err;
1613 } 1583 }
1614 1584
1615 root = sb->s_fs_info; 1585 root = sb->s_fs_info;
1616 BUG_ON(!root); 1586 BUG_ON(!root);
1617 if (root == opts.new_root) { 1587 if (root == opts.new_root) {
1618 /* We used the new root structure, so this is a new hierarchy */ 1588 /* We used the new root structure, so this is a new hierarchy */
1619 struct list_head tmp_links;
1620 struct cgroup *root_cgrp = &root->top_cgroup; 1589 struct cgroup *root_cgrp = &root->top_cgroup;
1621 struct cgroupfs_root *existing_root; 1590 struct cgroupfs_root *existing_root;
1622 const struct cred *cred;
1623 int i; 1591 int i;
1624 struct css_set *cset; 1592 struct css_set *cset;
1625 1593
@@ -1634,6 +1602,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1634 mutex_lock(&cgroup_mutex); 1602 mutex_lock(&cgroup_mutex);
1635 mutex_lock(&cgroup_root_mutex); 1603 mutex_lock(&cgroup_root_mutex);
1636 1604
1605 root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
1606 0, 1, GFP_KERNEL);
1607 if (root_cgrp->id < 0)
1608 goto unlock_drop;
1609
1637 /* Check for name clashes with existing mounts */ 1610 /* Check for name clashes with existing mounts */
1638 ret = -EBUSY; 1611 ret = -EBUSY;
1639 if (strlen(root->name)) 1612 if (strlen(root->name))
@@ -1657,26 +1630,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1657 if (ret) 1630 if (ret)
1658 goto unlock_drop; 1631 goto unlock_drop;
1659 1632
1633 sb->s_root->d_fsdata = root_cgrp;
1634 root_cgrp->dentry = sb->s_root;
1635
1636 /*
1637 * We're inside get_sb() and will call lookup_one_len() to
1638 * create the root files, which doesn't work if SELinux is
1639 * in use. The following cred dancing somehow works around
1640 * it. See 2ce9738ba ("cgroupfs: use init_cred when
1641 * populating new cgroupfs mount") for more details.
1642 */
1643 cred = override_creds(&init_cred);
1644
1645 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1646 if (ret)
1647 goto rm_base_files;
1648
1660 ret = rebind_subsystems(root, root->subsys_mask, 0); 1649 ret = rebind_subsystems(root, root->subsys_mask, 0);
1661 if (ret == -EBUSY) { 1650 if (ret)
1662 free_cgrp_cset_links(&tmp_links); 1651 goto rm_base_files;
1663 goto unlock_drop; 1652
1664 } 1653 revert_creds(cred);
1654
1665 /* 1655 /*
1666 * There must be no failure case after here, since rebinding 1656 * There must be no failure case after here, since rebinding
1667 * takes care of subsystems' refcounts, which are explicitly 1657 * takes care of subsystems' refcounts, which are explicitly
1668 * dropped in the failure exit path. 1658 * dropped in the failure exit path.
1669 */ 1659 */
1670 1660
1671 /* EBUSY should be the only error here */
1672 BUG_ON(ret);
1673
1674 list_add(&root->root_list, &cgroup_roots); 1661 list_add(&root->root_list, &cgroup_roots);
1675 cgroup_root_count++; 1662 cgroup_root_count++;
1676 1663
1677 sb->s_root->d_fsdata = root_cgrp;
1678 root->top_cgroup.dentry = sb->s_root;
1679
1680 /* Link the top cgroup in this hierarchy into all 1664 /* Link the top cgroup in this hierarchy into all
1681 * the css_set objects */ 1665 * the css_set objects */
1682 write_lock(&css_set_lock); 1666 write_lock(&css_set_lock);
@@ -1689,9 +1673,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1689 BUG_ON(!list_empty(&root_cgrp->children)); 1673 BUG_ON(!list_empty(&root_cgrp->children));
1690 BUG_ON(root->number_of_cgroups != 1); 1674 BUG_ON(root->number_of_cgroups != 1);
1691 1675
1692 cred = override_creds(&init_cred);
1693 cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
1694 revert_creds(cred);
1695 mutex_unlock(&cgroup_root_mutex); 1676 mutex_unlock(&cgroup_root_mutex);
1696 mutex_unlock(&cgroup_mutex); 1677 mutex_unlock(&cgroup_mutex);
1697 mutex_unlock(&inode->i_mutex); 1678 mutex_unlock(&inode->i_mutex);
@@ -1711,15 +1692,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1711 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1692 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1712 } 1693 }
1713 } 1694 }
1714
1715 /* no subsys rebinding, so refcounts don't change */
1716 drop_parsed_module_refcounts(opts.subsys_mask);
1717 } 1695 }
1718 1696
1719 kfree(opts.release_agent); 1697 kfree(opts.release_agent);
1720 kfree(opts.name); 1698 kfree(opts.name);
1721 return dget(sb->s_root); 1699 return dget(sb->s_root);
1722 1700
1701 rm_base_files:
1702 free_cgrp_cset_links(&tmp_links);
1703 cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
1704 revert_creds(cred);
1723 unlock_drop: 1705 unlock_drop:
1724 cgroup_exit_root_id(root); 1706 cgroup_exit_root_id(root);
1725 mutex_unlock(&cgroup_root_mutex); 1707 mutex_unlock(&cgroup_root_mutex);
@@ -1727,8 +1709,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1727 mutex_unlock(&inode->i_mutex); 1709 mutex_unlock(&inode->i_mutex);
1728 drop_new_super: 1710 drop_new_super:
1729 deactivate_locked_super(sb); 1711 deactivate_locked_super(sb);
1730 drop_modules:
1731 drop_parsed_module_refcounts(opts.subsys_mask);
1732 out_err: 1712 out_err:
1733 kfree(opts.release_agent); 1713 kfree(opts.release_agent);
1734 kfree(opts.name); 1714 kfree(opts.name);
@@ -1746,6 +1726,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1746 BUG_ON(root->number_of_cgroups != 1); 1726 BUG_ON(root->number_of_cgroups != 1);
1747 BUG_ON(!list_empty(&cgrp->children)); 1727 BUG_ON(!list_empty(&cgrp->children));
1748 1728
1729 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1749 mutex_lock(&cgroup_mutex); 1730 mutex_lock(&cgroup_mutex);
1750 mutex_lock(&cgroup_root_mutex); 1731 mutex_lock(&cgroup_root_mutex);
1751 1732
@@ -1778,6 +1759,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1778 1759
1779 mutex_unlock(&cgroup_root_mutex); 1760 mutex_unlock(&cgroup_root_mutex);
1780 mutex_unlock(&cgroup_mutex); 1761 mutex_unlock(&cgroup_mutex);
1762 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1781 1763
1782 simple_xattrs_free(&cgrp->xattrs); 1764 simple_xattrs_free(&cgrp->xattrs);
1783 1765
@@ -1845,36 +1827,43 @@ out:
1845EXPORT_SYMBOL_GPL(cgroup_path); 1827EXPORT_SYMBOL_GPL(cgroup_path);
1846 1828
1847/** 1829/**
1848 * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy 1830 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1849 * @task: target task 1831 * @task: target task
1850 * @hierarchy_id: the hierarchy to look up @task's cgroup from
1851 * @buf: the buffer to write the path into 1832 * @buf: the buffer to write the path into
1852 * @buflen: the length of the buffer 1833 * @buflen: the length of the buffer
1853 * 1834 *
1854 * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and 1835 * Determine @task's cgroup on the first (the one with the lowest non-zero
1855 * copy its path into @buf. This function grabs cgroup_mutex and shouldn't 1836 * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
1856 * be used inside locks used by cgroup controller callbacks. 1837 * function grabs cgroup_mutex and shouldn't be used inside locks used by
1838 * cgroup controller callbacks.
1839 *
1840 * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short.
1857 */ 1841 */
1858int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id, 1842int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1859 char *buf, size_t buflen)
1860{ 1843{
1861 struct cgroupfs_root *root; 1844 struct cgroupfs_root *root;
1862 struct cgroup *cgrp = NULL; 1845 struct cgroup *cgrp;
1863 int ret = -ENOENT; 1846 int hierarchy_id = 1, ret = 0;
1847
1848 if (buflen < 2)
1849 return -ENAMETOOLONG;
1864 1850
1865 mutex_lock(&cgroup_mutex); 1851 mutex_lock(&cgroup_mutex);
1866 1852
1867 root = idr_find(&cgroup_hierarchy_idr, hierarchy_id); 1853 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1854
1868 if (root) { 1855 if (root) {
1869 cgrp = task_cgroup_from_root(task, root); 1856 cgrp = task_cgroup_from_root(task, root);
1870 ret = cgroup_path(cgrp, buf, buflen); 1857 ret = cgroup_path(cgrp, buf, buflen);
1858 } else {
1859 /* if no hierarchy exists, everyone is in "/" */
1860 memcpy(buf, "/", 2);
1871 } 1861 }
1872 1862
1873 mutex_unlock(&cgroup_mutex); 1863 mutex_unlock(&cgroup_mutex);
1874
1875 return ret; 1864 return ret;
1876} 1865}
1877EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy); 1866EXPORT_SYMBOL_GPL(task_cgroup_path);
1878 1867
1879/* 1868/*
1880 * Control Group taskset 1869 * Control Group taskset
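[Note] task_cgroup_path() now reports @task's path on the lowest-numbered live hierarchy (or "/" when none is mounted) and needs a buffer of at least two bytes. A hedged in-kernel usage sketch; the PATH_MAX-sized buffer and the printk calls are illustrative only, and the function must not be called while already nested inside cgroup_mutex:

        char buf[PATH_MAX];

        if (!task_cgroup_path(current, buf, sizeof(buf)))
                pr_info("%s: running in cgroup %s\n", __func__, buf);
        else
                pr_warn("%s: cgroup path did not fit in %zu bytes\n",
                        __func__, sizeof(buf));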
@@ -1882,7 +1871,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy);
1882struct task_and_cgroup { 1871struct task_and_cgroup {
1883 struct task_struct *task; 1872 struct task_struct *task;
1884 struct cgroup *cgrp; 1873 struct cgroup *cgrp;
1885 struct css_set *cg; 1874 struct css_set *cset;
1886}; 1875};
1887 1876
1888struct cgroup_taskset { 1877struct cgroup_taskset {
@@ -1932,18 +1921,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1932EXPORT_SYMBOL_GPL(cgroup_taskset_next); 1921EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1933 1922
1934/** 1923/**
1935 * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task 1924 * cgroup_taskset_cur_css - return the matching css for the current task
1936 * @tset: taskset of interest 1925 * @tset: taskset of interest
1926 * @subsys_id: the ID of the target subsystem
1937 * 1927 *
1938 * Return the cgroup for the current (last returned) task of @tset. This 1928 * Return the css for the current (last returned) task of @tset for
1939 * function must be preceded by either cgroup_taskset_first() or 1929 * subsystem specified by @subsys_id. This function must be preceded by
1940 * cgroup_taskset_next(). 1930 * either cgroup_taskset_first() or cgroup_taskset_next().
1941 */ 1931 */
1942struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) 1932struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
1933 int subsys_id)
1943{ 1934{
1944 return tset->cur_cgrp; 1935 return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
1945} 1936}
1946EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); 1937EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
1947 1938
1948/** 1939/**
1949 * cgroup_taskset_size - return the number of tasks in taskset 1940 * cgroup_taskset_size - return the number of tasks in taskset
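[Note] cgroup_taskset_cur_css() keeps the existing iteration rules (call it only after cgroup_taskset_first() or cgroup_taskset_next()) but hands back a css instead of a cgroup. A sketch of the loop a converted ->can_attach() callback might use; freezer_subsys_id is only an example of a subsystem ID constant:

        struct task_struct *task;

        for (task = cgroup_taskset_first(tset); task;
             task = cgroup_taskset_next(tset)) {
                struct cgroup_subsys_state *css =
                        cgroup_taskset_cur_css(tset, freezer_subsys_id);

                /* @css is the css the current @task belongs to for that subsystem */
        }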
@@ -2082,8 +2073,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2082 * step 1: check that we can legitimately attach to the cgroup. 2073 * step 1: check that we can legitimately attach to the cgroup.
2083 */ 2074 */
2084 for_each_root_subsys(root, ss) { 2075 for_each_root_subsys(root, ss) {
2076 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
2077
2085 if (ss->can_attach) { 2078 if (ss->can_attach) {
2086 retval = ss->can_attach(cgrp, &tset); 2079 retval = ss->can_attach(css, &tset);
2087 if (retval) { 2080 if (retval) {
2088 failed_ss = ss; 2081 failed_ss = ss;
2089 goto out_cancel_attach; 2082 goto out_cancel_attach;
@@ -2100,8 +2093,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2100 2093
2101 tc = flex_array_get(group, i); 2094 tc = flex_array_get(group, i);
2102 old_cset = task_css_set(tc->task); 2095 old_cset = task_css_set(tc->task);
2103 tc->cg = find_css_set(old_cset, cgrp); 2096 tc->cset = find_css_set(old_cset, cgrp);
2104 if (!tc->cg) { 2097 if (!tc->cset) {
2105 retval = -ENOMEM; 2098 retval = -ENOMEM;
2106 goto out_put_css_set_refs; 2099 goto out_put_css_set_refs;
2107 } 2100 }
@@ -2114,7 +2107,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2114 */ 2107 */
2115 for (i = 0; i < group_size; i++) { 2108 for (i = 0; i < group_size; i++) {
2116 tc = flex_array_get(group, i); 2109 tc = flex_array_get(group, i);
2117 cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); 2110 cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
2118 } 2111 }
2119 /* nothing is sensitive to fork() after this point. */ 2112 /* nothing is sensitive to fork() after this point. */
2120 2113
@@ -2122,8 +2115,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2122 * step 4: do subsystem attach callbacks. 2115 * step 4: do subsystem attach callbacks.
2123 */ 2116 */
2124 for_each_root_subsys(root, ss) { 2117 for_each_root_subsys(root, ss) {
2118 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
2119
2125 if (ss->attach) 2120 if (ss->attach)
2126 ss->attach(cgrp, &tset); 2121 ss->attach(css, &tset);
2127 } 2122 }
2128 2123
2129 /* 2124 /*
@@ -2134,18 +2129,20 @@ out_put_css_set_refs:
2134 if (retval) { 2129 if (retval) {
2135 for (i = 0; i < group_size; i++) { 2130 for (i = 0; i < group_size; i++) {
2136 tc = flex_array_get(group, i); 2131 tc = flex_array_get(group, i);
2137 if (!tc->cg) 2132 if (!tc->cset)
2138 break; 2133 break;
2139 put_css_set(tc->cg); 2134 put_css_set(tc->cset);
2140 } 2135 }
2141 } 2136 }
2142out_cancel_attach: 2137out_cancel_attach:
2143 if (retval) { 2138 if (retval) {
2144 for_each_root_subsys(root, ss) { 2139 for_each_root_subsys(root, ss) {
2140 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
2141
2145 if (ss == failed_ss) 2142 if (ss == failed_ss)
2146 break; 2143 break;
2147 if (ss->cancel_attach) 2144 if (ss->cancel_attach)
2148 ss->cancel_attach(cgrp, &tset); 2145 ss->cancel_attach(css, &tset);
2149 } 2146 }
2150 } 2147 }
2151out_free_group_list: 2148out_free_group_list:
@@ -2246,9 +2243,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2246 2243
2247 mutex_lock(&cgroup_mutex); 2244 mutex_lock(&cgroup_mutex);
2248 for_each_active_root(root) { 2245 for_each_active_root(root) {
2249 struct cgroup *from_cg = task_cgroup_from_root(from, root); 2246 struct cgroup *from_cgrp = task_cgroup_from_root(from, root);
2250 2247
2251 retval = cgroup_attach_task(from_cg, tsk, false); 2248 retval = cgroup_attach_task(from_cgrp, tsk, false);
2252 if (retval) 2249 if (retval)
2253 break; 2250 break;
2254 } 2251 }
@@ -2258,34 +2255,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2258} 2255}
2259EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 2256EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2260 2257
2261static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2258static int cgroup_tasks_write(struct cgroup_subsys_state *css,
2259 struct cftype *cft, u64 pid)
2262{ 2260{
2263 return attach_task_by_pid(cgrp, pid, false); 2261 return attach_task_by_pid(css->cgroup, pid, false);
2264} 2262}
2265 2263
2266static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) 2264static int cgroup_procs_write(struct cgroup_subsys_state *css,
2265 struct cftype *cft, u64 tgid)
2267{ 2266{
2268 return attach_task_by_pid(cgrp, tgid, true); 2267 return attach_task_by_pid(css->cgroup, tgid, true);
2269} 2268}
2270 2269
2271static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 2270static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2272 const char *buffer) 2271 struct cftype *cft, const char *buffer)
2273{ 2272{
2274 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 2273 BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX);
2275 if (strlen(buffer) >= PATH_MAX) 2274 if (strlen(buffer) >= PATH_MAX)
2276 return -EINVAL; 2275 return -EINVAL;
2277 if (!cgroup_lock_live_group(cgrp)) 2276 if (!cgroup_lock_live_group(css->cgroup))
2278 return -ENODEV; 2277 return -ENODEV;
2279 mutex_lock(&cgroup_root_mutex); 2278 mutex_lock(&cgroup_root_mutex);
2280 strcpy(cgrp->root->release_agent_path, buffer); 2279 strcpy(css->cgroup->root->release_agent_path, buffer);
2281 mutex_unlock(&cgroup_root_mutex); 2280 mutex_unlock(&cgroup_root_mutex);
2282 mutex_unlock(&cgroup_mutex); 2281 mutex_unlock(&cgroup_mutex);
2283 return 0; 2282 return 0;
2284} 2283}
2285 2284
2286static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, 2285static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
2287 struct seq_file *seq) 2286 struct cftype *cft, struct seq_file *seq)
2288{ 2287{
2288 struct cgroup *cgrp = css->cgroup;
2289
2289 if (!cgroup_lock_live_group(cgrp)) 2290 if (!cgroup_lock_live_group(cgrp))
2290 return -ENODEV; 2291 return -ENODEV;
2291 seq_puts(seq, cgrp->root->release_agent_path); 2292 seq_puts(seq, cgrp->root->release_agent_path);
@@ -2294,20 +2295,20 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
2294 return 0; 2295 return 0;
2295} 2296}
2296 2297
2297static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, 2298static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css,
2298 struct seq_file *seq) 2299 struct cftype *cft, struct seq_file *seq)
2299{ 2300{
2300 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); 2301 seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup));
2301 return 0; 2302 return 0;
2302} 2303}
2303 2304
2304/* A buffer size big enough for numbers or short strings */ 2305/* A buffer size big enough for numbers or short strings */
2305#define CGROUP_LOCAL_BUFFER_SIZE 64 2306#define CGROUP_LOCAL_BUFFER_SIZE 64
2306 2307
2307static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, 2308static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css,
2308 struct file *file, 2309 struct cftype *cft, struct file *file,
2309 const char __user *userbuf, 2310 const char __user *userbuf, size_t nbytes,
2310 size_t nbytes, loff_t *unused_ppos) 2311 loff_t *unused_ppos)
2311{ 2312{
2312 char buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2313 char buffer[CGROUP_LOCAL_BUFFER_SIZE];
2313 int retval = 0; 2314 int retval = 0;
@@ -2325,22 +2326,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
2325 u64 val = simple_strtoull(strstrip(buffer), &end, 0); 2326 u64 val = simple_strtoull(strstrip(buffer), &end, 0);
2326 if (*end) 2327 if (*end)
2327 return -EINVAL; 2328 return -EINVAL;
2328 retval = cft->write_u64(cgrp, cft, val); 2329 retval = cft->write_u64(css, cft, val);
2329 } else { 2330 } else {
2330 s64 val = simple_strtoll(strstrip(buffer), &end, 0); 2331 s64 val = simple_strtoll(strstrip(buffer), &end, 0);
2331 if (*end) 2332 if (*end)
2332 return -EINVAL; 2333 return -EINVAL;
2333 retval = cft->write_s64(cgrp, cft, val); 2334 retval = cft->write_s64(css, cft, val);
2334 } 2335 }
2335 if (!retval) 2336 if (!retval)
2336 retval = nbytes; 2337 retval = nbytes;
2337 return retval; 2338 return retval;
2338} 2339}
2339 2340
2340static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, 2341static ssize_t cgroup_write_string(struct cgroup_subsys_state *css,
2341 struct file *file, 2342 struct cftype *cft, struct file *file,
2342 const char __user *userbuf, 2343 const char __user *userbuf, size_t nbytes,
2343 size_t nbytes, loff_t *unused_ppos) 2344 loff_t *unused_ppos)
2344{ 2345{
2345 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2346 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
2346 int retval = 0; 2347 int retval = 0;
@@ -2363,7 +2364,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
2363 } 2364 }
2364 2365
2365 buffer[nbytes] = 0; /* nul-terminate */ 2366 buffer[nbytes] = 0; /* nul-terminate */
2366 retval = cft->write_string(cgrp, cft, strstrip(buffer)); 2367 retval = cft->write_string(css, cft, strstrip(buffer));
2367 if (!retval) 2368 if (!retval)
2368 retval = nbytes; 2369 retval = nbytes;
2369out: 2370out:
@@ -2373,65 +2374,60 @@ out:
2373} 2374}
2374 2375
2375static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 2376static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2376 size_t nbytes, loff_t *ppos) 2377 size_t nbytes, loff_t *ppos)
2377{ 2378{
2379 struct cfent *cfe = __d_cfe(file->f_dentry);
2378 struct cftype *cft = __d_cft(file->f_dentry); 2380 struct cftype *cft = __d_cft(file->f_dentry);
2379 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2381 struct cgroup_subsys_state *css = cfe->css;
2380 2382
2381 if (cgroup_is_dead(cgrp))
2382 return -ENODEV;
2383 if (cft->write) 2383 if (cft->write)
2384 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 2384 return cft->write(css, cft, file, buf, nbytes, ppos);
2385 if (cft->write_u64 || cft->write_s64) 2385 if (cft->write_u64 || cft->write_s64)
2386 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); 2386 return cgroup_write_X64(css, cft, file, buf, nbytes, ppos);
2387 if (cft->write_string) 2387 if (cft->write_string)
2388 return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); 2388 return cgroup_write_string(css, cft, file, buf, nbytes, ppos);
2389 if (cft->trigger) { 2389 if (cft->trigger) {
2390 int ret = cft->trigger(cgrp, (unsigned int)cft->private); 2390 int ret = cft->trigger(css, (unsigned int)cft->private);
2391 return ret ? ret : nbytes; 2391 return ret ? ret : nbytes;
2392 } 2392 }
2393 return -EINVAL; 2393 return -EINVAL;
2394} 2394}
2395 2395
2396static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, 2396static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css,
2397 struct file *file, 2397 struct cftype *cft, struct file *file,
2398 char __user *buf, size_t nbytes, 2398 char __user *buf, size_t nbytes, loff_t *ppos)
2399 loff_t *ppos)
2400{ 2399{
2401 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2400 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2402 u64 val = cft->read_u64(cgrp, cft); 2401 u64 val = cft->read_u64(css, cft);
2403 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 2402 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2404 2403
2405 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2404 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2406} 2405}
2407 2406
2408static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, 2407static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css,
2409 struct file *file, 2408 struct cftype *cft, struct file *file,
2410 char __user *buf, size_t nbytes, 2409 char __user *buf, size_t nbytes, loff_t *ppos)
2411 loff_t *ppos)
2412{ 2410{
2413 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2411 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2414 s64 val = cft->read_s64(cgrp, cft); 2412 s64 val = cft->read_s64(css, cft);
2415 int len = sprintf(tmp, "%lld\n", (long long) val); 2413 int len = sprintf(tmp, "%lld\n", (long long) val);
2416 2414
2417 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2415 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2418} 2416}
2419 2417
2420static ssize_t cgroup_file_read(struct file *file, char __user *buf, 2418static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2421 size_t nbytes, loff_t *ppos) 2419 size_t nbytes, loff_t *ppos)
2422{ 2420{
2421 struct cfent *cfe = __d_cfe(file->f_dentry);
2423 struct cftype *cft = __d_cft(file->f_dentry); 2422 struct cftype *cft = __d_cft(file->f_dentry);
2424 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2423 struct cgroup_subsys_state *css = cfe->css;
2425
2426 if (cgroup_is_dead(cgrp))
2427 return -ENODEV;
2428 2424
2429 if (cft->read) 2425 if (cft->read)
2430 return cft->read(cgrp, cft, file, buf, nbytes, ppos); 2426 return cft->read(css, cft, file, buf, nbytes, ppos);
2431 if (cft->read_u64) 2427 if (cft->read_u64)
2432 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); 2428 return cgroup_read_u64(css, cft, file, buf, nbytes, ppos);
2433 if (cft->read_s64) 2429 if (cft->read_s64)
2434 return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); 2430 return cgroup_read_s64(css, cft, file, buf, nbytes, ppos);
2435 return -EINVAL; 2431 return -EINVAL;
2436} 2432}
2437 2433
@@ -2440,11 +2436,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2440 * supports string->u64 maps, but can be extended in future. 2436 * supports string->u64 maps, but can be extended in future.
2441 */ 2437 */
2442 2438
2443struct cgroup_seqfile_state {
2444 struct cftype *cft;
2445 struct cgroup *cgroup;
2446};
2447
2448static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) 2439static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2449{ 2440{
2450 struct seq_file *sf = cb->state; 2441 struct seq_file *sf = cb->state;
@@ -2453,69 +2444,86 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2453 2444
2454static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2445static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2455{ 2446{
2456 struct cgroup_seqfile_state *state = m->private; 2447 struct cfent *cfe = m->private;
2457 struct cftype *cft = state->cft; 2448 struct cftype *cft = cfe->type;
2449 struct cgroup_subsys_state *css = cfe->css;
2450
2458 if (cft->read_map) { 2451 if (cft->read_map) {
2459 struct cgroup_map_cb cb = { 2452 struct cgroup_map_cb cb = {
2460 .fill = cgroup_map_add, 2453 .fill = cgroup_map_add,
2461 .state = m, 2454 .state = m,
2462 }; 2455 };
2463 return cft->read_map(state->cgroup, cft, &cb); 2456 return cft->read_map(css, cft, &cb);
2464 } 2457 }
2465 return cft->read_seq_string(state->cgroup, cft, m); 2458 return cft->read_seq_string(css, cft, m);
2466}
2467
2468static int cgroup_seqfile_release(struct inode *inode, struct file *file)
2469{
2470 struct seq_file *seq = file->private_data;
2471 kfree(seq->private);
2472 return single_release(inode, file);
2473} 2459}
2474 2460
2475static const struct file_operations cgroup_seqfile_operations = { 2461static const struct file_operations cgroup_seqfile_operations = {
2476 .read = seq_read, 2462 .read = seq_read,
2477 .write = cgroup_file_write, 2463 .write = cgroup_file_write,
2478 .llseek = seq_lseek, 2464 .llseek = seq_lseek,
2479 .release = cgroup_seqfile_release, 2465 .release = single_release,
2480}; 2466};
2481 2467
2482static int cgroup_file_open(struct inode *inode, struct file *file) 2468static int cgroup_file_open(struct inode *inode, struct file *file)
2483{ 2469{
2470 struct cfent *cfe = __d_cfe(file->f_dentry);
2471 struct cftype *cft = __d_cft(file->f_dentry);
2472 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
2473 struct cgroup_subsys_state *css;
2484 int err; 2474 int err;
2485 struct cftype *cft;
2486 2475
2487 err = generic_file_open(inode, file); 2476 err = generic_file_open(inode, file);
2488 if (err) 2477 if (err)
2489 return err; 2478 return err;
2490 cft = __d_cft(file->f_dentry);
2491 2479
2492 if (cft->read_map || cft->read_seq_string) { 2480 /*
2493 struct cgroup_seqfile_state *state; 2481 * If the file belongs to a subsystem, pin the css. Will be
2482 * unpinned either on open failure or release. This ensures that
2483 * @css stays alive for all file operations.
2484 */
2485 rcu_read_lock();
2486 css = cgroup_css(cgrp, cft->ss);
2487 if (cft->ss && !css_tryget(css))
2488 css = NULL;
2489 rcu_read_unlock();
2494 2490
2495 state = kzalloc(sizeof(*state), GFP_USER); 2491 if (!css)
2496 if (!state) 2492 return -ENODEV;
2497 return -ENOMEM; 2493
2494 /*
2495 * @cfe->css is used by read/write/close to determine the
2496 * associated css. @file->private_data would be a better place but
2497 * that's already used by seqfile. Multiple accessors may use it
2498 * simultaneously which is okay as the association never changes.
2499 */
2500 WARN_ON_ONCE(cfe->css && cfe->css != css);
2501 cfe->css = css;
2498 2502
2499 state->cft = cft; 2503 if (cft->read_map || cft->read_seq_string) {
2500 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
2501 file->f_op = &cgroup_seqfile_operations; 2504 file->f_op = &cgroup_seqfile_operations;
2502 err = single_open(file, cgroup_seqfile_show, state); 2505 err = single_open(file, cgroup_seqfile_show, cfe);
2503 if (err < 0) 2506 } else if (cft->open) {
2504 kfree(state);
2505 } else if (cft->open)
2506 err = cft->open(inode, file); 2507 err = cft->open(inode, file);
2507 else 2508 }
2508 err = 0;
2509 2509
2510 if (css->ss && err)
2511 css_put(css);
2510 return err; 2512 return err;
2511} 2513}
2512 2514
2513static int cgroup_file_release(struct inode *inode, struct file *file) 2515static int cgroup_file_release(struct inode *inode, struct file *file)
2514{ 2516{
2517 struct cfent *cfe = __d_cfe(file->f_dentry);
2515 struct cftype *cft = __d_cft(file->f_dentry); 2518 struct cftype *cft = __d_cft(file->f_dentry);
2519 struct cgroup_subsys_state *css = cfe->css;
2520 int ret = 0;
2521
2516 if (cft->release) 2522 if (cft->release)
2517 return cft->release(inode, file); 2523 ret = cft->release(inode, file);
2518 return 0; 2524 if (css->ss)
2525 css_put(css);
2526 return ret;
2519} 2527}
2520 2528
2521/* 2529/*
@@ -2729,8 +2737,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2729 return mode; 2737 return mode;
2730} 2738}
2731 2739
2732static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2740static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2733 struct cftype *cft)
2734{ 2741{
2735 struct dentry *dir = cgrp->dentry; 2742 struct dentry *dir = cgrp->dentry;
2736 struct cgroup *parent = __d_cgrp(dir); 2743 struct cgroup *parent = __d_cgrp(dir);
@@ -2740,8 +2747,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2740 umode_t mode; 2747 umode_t mode;
2741 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2748 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2742 2749
2743 if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { 2750 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
2744 strcpy(name, subsys->name); 2751 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2752 strcpy(name, cft->ss->name);
2745 strcat(name, "."); 2753 strcat(name, ".");
2746 } 2754 }
2747 strcat(name, cft->name); 2755 strcat(name, cft->name);
@@ -2775,11 +2783,25 @@ out:
2775 return error; 2783 return error;
2776} 2784}
2777 2785
2778static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2786/**
2779 struct cftype cfts[], bool is_add) 2787 * cgroup_addrm_files - add or remove files to a cgroup directory
2788 * @cgrp: the target cgroup
2789 * @cfts: array of cftypes to be added
2790 * @is_add: whether to add or remove
2791 *
2792 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
2793 * For removals, this function never fails. If addition fails, this
2794 * function doesn't remove files already added. The caller is responsible
2795 * for cleaning up.
2796 */
2797static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2798 bool is_add)
2780{ 2799{
2781 struct cftype *cft; 2800 struct cftype *cft;
2782 int err, ret = 0; 2801 int ret;
2802
2803 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
2804 lockdep_assert_held(&cgroup_mutex);
2783 2805
2784 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2806 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2785 /* does cft->flags tell us to skip this file on @cgrp? */ 2807 /* does cft->flags tell us to skip this file on @cgrp? */
@@ -2791,16 +2813,17 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2791 continue; 2813 continue;
2792 2814
2793 if (is_add) { 2815 if (is_add) {
2794 err = cgroup_add_file(cgrp, subsys, cft); 2816 ret = cgroup_add_file(cgrp, cft);
2795 if (err) 2817 if (ret) {
2796 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", 2818 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2797 cft->name, err); 2819 cft->name, ret);
2798 ret = err; 2820 return ret;
2821 }
2799 } else { 2822 } else {
2800 cgroup_rm_file(cgrp, cft); 2823 cgroup_rm_file(cgrp, cft);
2801 } 2824 }
2802 } 2825 }
2803 return ret; 2826 return 0;
2804} 2827}
2805 2828
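Aside: note the asymmetry the new comment spells out, removal cannot fail but addition can stop part-way, and the caller (cgroup_populate_dir() later in this patch) rolls back by removing the whole set. A minimal sketch of that contract, with made-up names:

#include <stdio.h>

#define MAX_FILES 4

struct dir {
	const char *files[MAX_FILES];
	int nr;
};

/* may fail part-way through; never undoes what it already added */
static int add_files(struct dir *d, const char * const *names, int n)
{
	for (int i = 0; i < n; i++) {
		if (d->nr == MAX_FILES)
			return -1;
		d->files[d->nr++] = names[i];
	}
	return 0;
}

/* removal never fails; names that were never added are simply ignored */
static void remove_files(struct dir *d, const char * const *names, int n)
{
	for (int i = 0; i < n; i++)
		for (int j = 0; j < d->nr; j++)
			if (d->files[j] == names[i]) {	/* same pointers, == is fine */
				d->files[j] = d->files[--d->nr];
				break;
			}
}

int main(void)
{
	static const char * const names[] = {
		"alpha", "beta", "gamma", "delta", "epsilon",
	};
	struct dir d = { .nr = 0 };

	if (add_files(&d, names, 5)) {
		remove_files(&d, names, 5);	/* caller-side rollback */
		fprintf(stderr, "add failed, rolled back, %d entries left\n", d.nr);
	}
	return 0;
}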
2806static void cgroup_cfts_prepare(void) 2829static void cgroup_cfts_prepare(void)
@@ -2809,28 +2832,30 @@ static void cgroup_cfts_prepare(void)
2809 /* 2832 /*
2810 * Thanks to the entanglement with vfs inode locking, we can't walk 2833 * Thanks to the entanglement with vfs inode locking, we can't walk
2811 * the existing cgroups under cgroup_mutex and create files. 2834 * the existing cgroups under cgroup_mutex and create files.
2812 * Instead, we use cgroup_for_each_descendant_pre() and drop RCU 2835 * Instead, we use css_for_each_descendant_pre() and drop RCU read
2813 * read lock before calling cgroup_addrm_files(). 2836 * lock before calling cgroup_addrm_files().
2814 */ 2837 */
2815 mutex_lock(&cgroup_mutex); 2838 mutex_lock(&cgroup_mutex);
2816} 2839}
2817 2840
2818static void cgroup_cfts_commit(struct cgroup_subsys *ss, 2841static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2819 struct cftype *cfts, bool is_add)
2820 __releases(&cgroup_mutex) 2842 __releases(&cgroup_mutex)
2821{ 2843{
2822 LIST_HEAD(pending); 2844 LIST_HEAD(pending);
2823 struct cgroup *cgrp, *root = &ss->root->top_cgroup; 2845 struct cgroup_subsys *ss = cfts[0].ss;
2846 struct cgroup *root = &ss->root->top_cgroup;
2824 struct super_block *sb = ss->root->sb; 2847 struct super_block *sb = ss->root->sb;
2825 struct dentry *prev = NULL; 2848 struct dentry *prev = NULL;
2826 struct inode *inode; 2849 struct inode *inode;
2850 struct cgroup_subsys_state *css;
2827 u64 update_before; 2851 u64 update_before;
2852 int ret = 0;
2828 2853
2829 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2854 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2830 if (!cfts || ss->root == &cgroup_dummy_root || 2855 if (!cfts || ss->root == &cgroup_dummy_root ||
2831 !atomic_inc_not_zero(&sb->s_active)) { 2856 !atomic_inc_not_zero(&sb->s_active)) {
2832 mutex_unlock(&cgroup_mutex); 2857 mutex_unlock(&cgroup_mutex);
2833 return; 2858 return 0;
2834 } 2859 }
2835 2860
2836 /* 2861 /*
@@ -2842,17 +2867,11 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2842 2867
2843 mutex_unlock(&cgroup_mutex); 2868 mutex_unlock(&cgroup_mutex);
2844 2869
2845 /* @root always needs to be updated */
2846 inode = root->dentry->d_inode;
2847 mutex_lock(&inode->i_mutex);
2848 mutex_lock(&cgroup_mutex);
2849 cgroup_addrm_files(root, ss, cfts, is_add);
2850 mutex_unlock(&cgroup_mutex);
2851 mutex_unlock(&inode->i_mutex);
2852
2853 /* add/rm files for all cgroups created before */ 2870 /* add/rm files for all cgroups created before */
2854 rcu_read_lock(); 2871 rcu_read_lock();
2855 cgroup_for_each_descendant_pre(cgrp, root) { 2872 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
2873 struct cgroup *cgrp = css->cgroup;
2874
2856 if (cgroup_is_dead(cgrp)) 2875 if (cgroup_is_dead(cgrp))
2857 continue; 2876 continue;
2858 2877
@@ -2866,15 +2885,18 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2866 mutex_lock(&inode->i_mutex); 2885 mutex_lock(&inode->i_mutex);
2867 mutex_lock(&cgroup_mutex); 2886 mutex_lock(&cgroup_mutex);
2868 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) 2887 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2869 cgroup_addrm_files(cgrp, ss, cfts, is_add); 2888 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2870 mutex_unlock(&cgroup_mutex); 2889 mutex_unlock(&cgroup_mutex);
2871 mutex_unlock(&inode->i_mutex); 2890 mutex_unlock(&inode->i_mutex);
2872 2891
2873 rcu_read_lock(); 2892 rcu_read_lock();
2893 if (ret)
2894 break;
2874 } 2895 }
2875 rcu_read_unlock(); 2896 rcu_read_unlock();
2876 dput(prev); 2897 dput(prev);
2877 deactivate_super(sb); 2898 deactivate_super(sb);
2899 return ret;
2878} 2900}
2879 2901
2880/** 2902/**
@@ -2894,49 +2916,56 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2894int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2916int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2895{ 2917{
2896 struct cftype_set *set; 2918 struct cftype_set *set;
2919 struct cftype *cft;
2920 int ret;
2897 2921
2898 set = kzalloc(sizeof(*set), GFP_KERNEL); 2922 set = kzalloc(sizeof(*set), GFP_KERNEL);
2899 if (!set) 2923 if (!set)
2900 return -ENOMEM; 2924 return -ENOMEM;
2901 2925
2926 for (cft = cfts; cft->name[0] != '\0'; cft++)
2927 cft->ss = ss;
2928
2902 cgroup_cfts_prepare(); 2929 cgroup_cfts_prepare();
2903 set->cfts = cfts; 2930 set->cfts = cfts;
2904 list_add_tail(&set->node, &ss->cftsets); 2931 list_add_tail(&set->node, &ss->cftsets);
2905 cgroup_cfts_commit(ss, cfts, true); 2932 ret = cgroup_cfts_commit(cfts, true);
2906 2933 if (ret)
2907 return 0; 2934 cgroup_rm_cftypes(cfts);
2935 return ret;
2908} 2936}
2909EXPORT_SYMBOL_GPL(cgroup_add_cftypes); 2937EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2910 2938
2911/** 2939/**
2912 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2940 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2913 * @ss: target cgroup subsystem
2914 * @cfts: zero-length name terminated array of cftypes 2941 * @cfts: zero-length name terminated array of cftypes
2915 * 2942 *
2916 * Unregister @cfts from @ss. Files described by @cfts are removed from 2943 * Unregister @cfts. Files described by @cfts are removed from all
2917 * all existing cgroups to which @ss is attached and all future cgroups 2944 * existing cgroups and all future cgroups won't have them either. This
2918 * won't have them either. This function can be called anytime whether @ss 2945 * function can be called anytime whether @cfts' subsys is attached or not.
2919 * is attached or not.
2920 * 2946 *
2921 * Returns 0 on successful unregistration, -ENOENT if @cfts is not 2947 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2922 * registered with @ss. 2948 * registered.
2923 */ 2949 */
2924int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2950int cgroup_rm_cftypes(struct cftype *cfts)
2925{ 2951{
2926 struct cftype_set *set; 2952 struct cftype_set *set;
2927 2953
2954 if (!cfts || !cfts[0].ss)
2955 return -ENOENT;
2956
2928 cgroup_cfts_prepare(); 2957 cgroup_cfts_prepare();
2929 2958
2930 list_for_each_entry(set, &ss->cftsets, node) { 2959 list_for_each_entry(set, &cfts[0].ss->cftsets, node) {
2931 if (set->cfts == cfts) { 2960 if (set->cfts == cfts) {
2932 list_del(&set->node); 2961 list_del(&set->node);
2933 kfree(set); 2962 kfree(set);
2934 cgroup_cfts_commit(ss, cfts, false); 2963 cgroup_cfts_commit(cfts, false);
2935 return 0; 2964 return 0;
2936 } 2965 }
2937 } 2966 }
2938 2967
2939 cgroup_cfts_commit(ss, NULL, false); 2968 cgroup_cfts_commit(NULL, false);
2940 return -ENOENT; 2969 return -ENOENT;
2941} 2970}
2942 2971
@@ -2959,34 +2988,10 @@ int cgroup_task_count(const struct cgroup *cgrp)
2959} 2988}
2960 2989
2961/* 2990/*
2962 * Advance a list_head iterator. The iterator should be positioned at 2991 * To reduce the fork() overhead for systems that are not actually using
2963 * the start of a css_set 2992 * their cgroups capability, we don't maintain the lists running through
2964 */ 2993 * each css_set to its tasks until we see the list actually used - in other
2965static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) 2994 * words after the first call to css_task_iter_start().
2966{
2967 struct list_head *l = it->cset_link;
2968 struct cgrp_cset_link *link;
2969 struct css_set *cset;
2970
2971 /* Advance to the next non-empty css_set */
2972 do {
2973 l = l->next;
2974 if (l == &cgrp->cset_links) {
2975 it->cset_link = NULL;
2976 return;
2977 }
2978 link = list_entry(l, struct cgrp_cset_link, cset_link);
2979 cset = link->cset;
2980 } while (list_empty(&cset->tasks));
2981 it->cset_link = l;
2982 it->task = cset->tasks.next;
2983}
2984
2985/*
2986 * To reduce the fork() overhead for systems that are not actually
2987 * using their cgroups capability, we don't maintain the lists running
2988 * through each css_set to its tasks until we see the list actually
2989 * used - in other words after the first call to cgroup_iter_start().
2990 */ 2995 */
2991static void cgroup_enable_task_cg_lists(void) 2996static void cgroup_enable_task_cg_lists(void)
2992{ 2997{
@@ -3017,16 +3022,21 @@ static void cgroup_enable_task_cg_lists(void)
3017} 3022}
3018 3023
3019/** 3024/**
3020 * cgroup_next_sibling - find the next sibling of a given cgroup 3025 * css_next_child - find the next child of a given css
3021 * @pos: the current cgroup 3026 * @pos_css: the current position (%NULL to initiate traversal)
3027 * @parent_css: css whose children to walk
3022 * 3028 *
3023 * This function returns the next sibling of @pos and should be called 3029 * This function returns the next child of @parent_css and should be called
3024 * under RCU read lock. The only requirement is that @pos is accessible. 3030 * under RCU read lock. The only requirement is that @parent_css and
3025 * The next sibling is guaranteed to be returned regardless of @pos's 3031 * @pos_css are accessible. The next sibling is guaranteed to be returned
3026 * state. 3032 * regardless of their states.
3027 */ 3033 */
3028struct cgroup *cgroup_next_sibling(struct cgroup *pos) 3034struct cgroup_subsys_state *
3035css_next_child(struct cgroup_subsys_state *pos_css,
3036 struct cgroup_subsys_state *parent_css)
3029{ 3037{
3038 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
3039 struct cgroup *cgrp = parent_css->cgroup;
3030 struct cgroup *next; 3040 struct cgroup *next;
3031 3041
3032 WARN_ON_ONCE(!rcu_read_lock_held()); 3042 WARN_ON_ONCE(!rcu_read_lock_held());
@@ -3041,78 +3051,81 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos)
3041 * safe to dereference from this RCU critical section. If 3051 * safe to dereference from this RCU critical section. If
3042 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 3052 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
3043 * to be visible as %true here. 3053 * to be visible as %true here.
3054 *
3055 * If @pos is dead, its next pointer can't be dereferenced;
3056 * however, as each cgroup is given a monotonically increasing
3057 * unique serial number and always appended to the sibling list,
3058 * the next one can be found by walking the parent's children until
3059 * we see a cgroup with higher serial number than @pos's. While
3060 * this path can be slower, it's taken only when either the current
3061 * cgroup is removed or iteration and removal race.
3044 */ 3062 */
3045 if (likely(!cgroup_is_dead(pos))) { 3063 if (!pos) {
3064 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
3065 } else if (likely(!cgroup_is_dead(pos))) {
3046 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3066 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3047 if (&next->sibling != &pos->parent->children) 3067 } else {
3048 return next; 3068 list_for_each_entry_rcu(next, &cgrp->children, sibling)
3049 return NULL; 3069 if (next->serial_nr > pos->serial_nr)
3070 break;
3050 } 3071 }
3051 3072
3052 /* 3073 if (&next->sibling == &cgrp->children)
3053 * Can't dereference the next pointer. Each cgroup is given a 3074 return NULL;
3054 * monotonically increasing unique serial number and always 3075
3055 * appended to the sibling list, so the next one can be found by 3076 return cgroup_css(next, parent_css->ss);
3056 * walking the parent's children until we see a cgroup with higher
3057 * serial number than @pos's.
3058 *
3059 * While this path can be slow, it's taken only when either the
3060 * current cgroup is removed or iteration and removal race.
3061 */
3062 list_for_each_entry_rcu(next, &pos->parent->children, sibling)
3063 if (next->serial_nr > pos->serial_nr)
3064 return next;
3065 return NULL;
3066} 3077}
3067EXPORT_SYMBOL_GPL(cgroup_next_sibling); 3078EXPORT_SYMBOL_GPL(css_next_child);
3068 3079
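Aside: the slow path below works only because serial numbers grow monotonically and children are always appended, so iteration can resume past a removed position. A self-contained sketch of that resume-by-serial-number idea (plain lists, invented names, no RCU):

#include <stdbool.h>
#include <stddef.h>

struct node {
	unsigned long long serial;	/* monotonically increasing */
	bool dead;
	struct node *next_sibling;	/* parent's child list, append-only */
};

struct parent {
	struct node *first_child;
};

/* find the child after @pos; if @pos was removed, resume by serial number */
static struct node *next_child(struct node *pos, struct parent *p)
{
	struct node *n;

	if (!pos)
		return p->first_child;
	if (!pos->dead)
		return pos->next_sibling;

	/* @pos may no longer be linked: walk until a higher serial number */
	for (n = p->first_child; n; n = n->next_sibling)
		if (n->serial > pos->serial)
			return n;
	return NULL;
}

int main(void)
{
	struct node a = { .serial = 1 }, b = { .serial = 2 }, c = { .serial = 3 };
	struct parent p = { .first_child = &a };

	a.next_sibling = &b;
	b.next_sibling = &c;

	b.dead = true;			/* b is removed mid-iteration */
	return next_child(&b, &p) == &c ? 0 : 1;
}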
3069/** 3080/**
3070 * cgroup_next_descendant_pre - find the next descendant for pre-order walk 3081 * css_next_descendant_pre - find the next descendant for pre-order walk
3071 * @pos: the current position (%NULL to initiate traversal) 3082 * @pos: the current position (%NULL to initiate traversal)
3072 * @cgroup: cgroup whose descendants to walk 3083 * @root: css whose descendants to walk
3073 * 3084 *
3074 * To be used by cgroup_for_each_descendant_pre(). Find the next 3085 * To be used by css_for_each_descendant_pre(). Find the next descendant
3075 * descendant to visit for pre-order traversal of @cgroup's descendants. 3086 * to visit for pre-order traversal of @root's descendants. @root is
3087 * included in the iteration and the first node to be visited.
3076 * 3088 *
3077 * While this function requires RCU read locking, it doesn't require the 3089 * While this function requires RCU read locking, it doesn't require the
3078 * whole traversal to be contained in a single RCU critical section. This 3090 * whole traversal to be contained in a single RCU critical section. This
3079 * function will return the correct next descendant as long as both @pos 3091 * function will return the correct next descendant as long as both @pos
3080 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3092 * and @root are accessible and @pos is a descendant of @root.
3081 */ 3093 */
3082struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, 3094struct cgroup_subsys_state *
3083 struct cgroup *cgroup) 3095css_next_descendant_pre(struct cgroup_subsys_state *pos,
3096 struct cgroup_subsys_state *root)
3084{ 3097{
3085 struct cgroup *next; 3098 struct cgroup_subsys_state *next;
3086 3099
3087 WARN_ON_ONCE(!rcu_read_lock_held()); 3100 WARN_ON_ONCE(!rcu_read_lock_held());
3088 3101
3089 /* if first iteration, pretend we just visited @cgroup */ 3102 /* if first iteration, visit @root */
3090 if (!pos) 3103 if (!pos)
3091 pos = cgroup; 3104 return root;
3092 3105
3093 /* visit the first child if exists */ 3106 /* visit the first child if exists */
3094 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); 3107 next = css_next_child(NULL, pos);
3095 if (next) 3108 if (next)
3096 return next; 3109 return next;
3097 3110
3098 /* no child, visit my or the closest ancestor's next sibling */ 3111 /* no child, visit my or the closest ancestor's next sibling */
3099 while (pos != cgroup) { 3112 while (pos != root) {
3100 next = cgroup_next_sibling(pos); 3113 next = css_next_child(pos, css_parent(pos));
3101 if (next) 3114 if (next)
3102 return next; 3115 return next;
3103 pos = pos->parent; 3116 pos = css_parent(pos);
3104 } 3117 }
3105 3118
3106 return NULL; 3119 return NULL;
3107} 3120}
3108EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); 3121EXPORT_SYMBOL_GPL(css_next_descendant_pre);
3109 3122
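Aside: the pre-order walk needs no stack, only first-child/next-sibling/parent links: descend to the first child if there is one, otherwise climb until an ancestor has an unvisited sibling. The same shape as a standalone sketch (toy types, not kernel code):

#include <stdio.h>

struct tnode {
	const char *name;
	struct tnode *parent;
	struct tnode *first_child;
	struct tnode *next_sibling;
};

static struct tnode *pre_next(struct tnode *pos, struct tnode *root)
{
	if (!pos)
		return root;			/* root is visited first */
	if (pos->first_child)
		return pos->first_child;	/* descend */
	while (pos != root) {			/* climb until a sibling exists */
		if (pos->next_sibling)
			return pos->next_sibling;
		pos = pos->parent;
	}
	return NULL;
}

int main(void)
{
	struct tnode r = { "r" }, a = { "a", &r }, b = { "b", &r }, c = { "c", &a };

	r.first_child = &a;
	a.next_sibling = &b;
	a.first_child = &c;

	for (struct tnode *p = pre_next(NULL, &r); p; p = pre_next(p, &r))
		printf("%s ", p->name);		/* prints: r a c b */
	printf("\n");
	return 0;
}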
3110/** 3123/**
3111 * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup 3124 * css_rightmost_descendant - return the rightmost descendant of a css
3112 * @pos: cgroup of interest 3125 * @pos: css of interest
3113 * 3126 *
3114 * Return the rightmost descendant of @pos. If there's no descendant, 3127 * Return the rightmost descendant of @pos. If there's no descendant, @pos
3115 * @pos is returned. This can be used during pre-order traversal to skip 3128 * is returned. This can be used during pre-order traversal to skip
3116 * subtree of @pos. 3129 * subtree of @pos.
3117 * 3130 *
3118 * While this function requires RCU read locking, it doesn't require the 3131 * While this function requires RCU read locking, it doesn't require the
@@ -3120,9 +3133,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3120 * function will return the correct rightmost descendant as long as @pos is 3133 * function will return the correct rightmost descendant as long as @pos is
3121 * accessible. 3134 * accessible.
3122 */ 3135 */
3123struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) 3136struct cgroup_subsys_state *
3137css_rightmost_descendant(struct cgroup_subsys_state *pos)
3124{ 3138{
3125 struct cgroup *last, *tmp; 3139 struct cgroup_subsys_state *last, *tmp;
3126 3140
3127 WARN_ON_ONCE(!rcu_read_lock_held()); 3141 WARN_ON_ONCE(!rcu_read_lock_held());
3128 3142
@@ -3130,82 +3144,138 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
3130 last = pos; 3144 last = pos;
3131 /* ->prev isn't RCU safe, walk ->next till the end */ 3145 /* ->prev isn't RCU safe, walk ->next till the end */
3132 pos = NULL; 3146 pos = NULL;
3133 list_for_each_entry_rcu(tmp, &last->children, sibling) 3147 css_for_each_child(tmp, last)
3134 pos = tmp; 3148 pos = tmp;
3135 } while (pos); 3149 } while (pos);
3136 3150
3137 return last; 3151 return last;
3138} 3152}
3139EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); 3153EXPORT_SYMBOL_GPL(css_rightmost_descendant);
3140 3154
3141static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) 3155static struct cgroup_subsys_state *
3156css_leftmost_descendant(struct cgroup_subsys_state *pos)
3142{ 3157{
3143 struct cgroup *last; 3158 struct cgroup_subsys_state *last;
3144 3159
3145 do { 3160 do {
3146 last = pos; 3161 last = pos;
3147 pos = list_first_or_null_rcu(&pos->children, struct cgroup, 3162 pos = css_next_child(NULL, pos);
3148 sibling);
3149 } while (pos); 3163 } while (pos);
3150 3164
3151 return last; 3165 return last;
3152} 3166}
3153 3167
3154/** 3168/**
3155 * cgroup_next_descendant_post - find the next descendant for post-order walk 3169 * css_next_descendant_post - find the next descendant for post-order walk
3156 * @pos: the current position (%NULL to initiate traversal) 3170 * @pos: the current position (%NULL to initiate traversal)
3157 * @cgroup: cgroup whose descendants to walk 3171 * @root: css whose descendants to walk
3158 * 3172 *
3159 * To be used by cgroup_for_each_descendant_post(). Find the next 3173 * To be used by css_for_each_descendant_post(). Find the next descendant
3160 * descendant to visit for post-order traversal of @cgroup's descendants. 3174 * to visit for post-order traversal of @root's descendants. @root is
3175 * included in the iteration and the last node to be visited.
3161 * 3176 *
3162 * While this function requires RCU read locking, it doesn't require the 3177 * While this function requires RCU read locking, it doesn't require the
3163 * whole traversal to be contained in a single RCU critical section. This 3178 * whole traversal to be contained in a single RCU critical section. This
3164 * function will return the correct next descendant as long as both @pos 3179 * function will return the correct next descendant as long as both @pos
3165 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3180 * and @cgroup are accessible and @pos is a descendant of @cgroup.
3166 */ 3181 */
3167struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, 3182struct cgroup_subsys_state *
3168 struct cgroup *cgroup) 3183css_next_descendant_post(struct cgroup_subsys_state *pos,
3184 struct cgroup_subsys_state *root)
3169{ 3185{
3170 struct cgroup *next; 3186 struct cgroup_subsys_state *next;
3171 3187
3172 WARN_ON_ONCE(!rcu_read_lock_held()); 3188 WARN_ON_ONCE(!rcu_read_lock_held());
3173 3189
3174 /* if first iteration, visit the leftmost descendant */ 3190 /* if first iteration, visit the leftmost descendant */
3175 if (!pos) { 3191 if (!pos) {
3176 next = cgroup_leftmost_descendant(cgroup); 3192 next = css_leftmost_descendant(root);
3177 return next != cgroup ? next : NULL; 3193 return next != root ? next : NULL;
3178 } 3194 }
3179 3195
3196 /* if we visited @root, we're done */
3197 if (pos == root)
3198 return NULL;
3199
3180 /* if there's an unvisited sibling, visit its leftmost descendant */ 3200 /* if there's an unvisited sibling, visit its leftmost descendant */
3181 next = cgroup_next_sibling(pos); 3201 next = css_next_child(pos, css_parent(pos));
3182 if (next) 3202 if (next)
3183 return cgroup_leftmost_descendant(next); 3203 return css_leftmost_descendant(next);
3184 3204
3185 /* no sibling left, visit parent */ 3205 /* no sibling left, visit parent */
3186 next = pos->parent; 3206 return css_parent(pos);
3187 return next != cgroup ? next : NULL; 3207}
3208EXPORT_SYMBOL_GPL(css_next_descendant_post);
3209
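Aside: post-order is the mirror image, start from the leftmost descendant and, after finishing a node, jump to the leftmost descendant of its next sibling or fall back to the parent. A simplified standalone sketch that always includes the root (toy types, invented names):

#include <stdio.h>

struct tnode {
	const char *name;
	struct tnode *parent;
	struct tnode *first_child;
	struct tnode *next_sibling;
};

static struct tnode *leftmost(struct tnode *pos)
{
	while (pos->first_child)
		pos = pos->first_child;
	return pos;
}

static struct tnode *post_next(struct tnode *pos, struct tnode *root)
{
	if (!pos)
		return leftmost(root);		/* deepest-left node comes first */
	if (pos == root)
		return NULL;			/* root is visited last */
	if (pos->next_sibling)
		return leftmost(pos->next_sibling);
	return pos->parent;			/* subtree done, emit the parent */
}

int main(void)
{
	struct tnode r = { "r" }, a = { "a", &r }, b = { "b", &r }, c = { "c", &a };

	r.first_child = &a;
	a.next_sibling = &b;
	a.first_child = &c;

	for (struct tnode *p = post_next(NULL, &r); p; p = post_next(p, &r))
		printf("%s ", p->name);		/* prints: c a b r */
	printf("\n");
	return 0;
}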
3210/**
 3211 * css_advance_task_iter - advance a task iterator to the next css_set
3212 * @it: the iterator to advance
3213 *
3214 * Advance @it to the next css_set to walk.
3215 */
3216static void css_advance_task_iter(struct css_task_iter *it)
3217{
3218 struct list_head *l = it->cset_link;
3219 struct cgrp_cset_link *link;
3220 struct css_set *cset;
3221
3222 /* Advance to the next non-empty css_set */
3223 do {
3224 l = l->next;
3225 if (l == &it->origin_css->cgroup->cset_links) {
3226 it->cset_link = NULL;
3227 return;
3228 }
3229 link = list_entry(l, struct cgrp_cset_link, cset_link);
3230 cset = link->cset;
3231 } while (list_empty(&cset->tasks));
3232 it->cset_link = l;
3233 it->task = cset->tasks.next;
3188} 3234}
3189EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
3190 3235
3191void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 3236/**
3237 * css_task_iter_start - initiate task iteration
3238 * @css: the css to walk tasks of
3239 * @it: the task iterator to use
3240 *
3241 * Initiate iteration through the tasks of @css. The caller can call
3242 * css_task_iter_next() to walk through the tasks until the function
3243 * returns NULL. On completion of iteration, css_task_iter_end() must be
3244 * called.
3245 *
3246 * Note that this function acquires a lock which is released when the
3247 * iteration finishes. The caller can't sleep while iteration is in
3248 * progress.
3249 */
3250void css_task_iter_start(struct cgroup_subsys_state *css,
3251 struct css_task_iter *it)
3192 __acquires(css_set_lock) 3252 __acquires(css_set_lock)
3193{ 3253{
3194 /* 3254 /*
3195 * The first time anyone tries to iterate across a cgroup, 3255 * The first time anyone tries to iterate across a css, we need to
3196 * we need to enable the list linking each css_set to its 3256 * enable the list linking each css_set to its tasks, and fix up
3197 * tasks, and fix up all existing tasks. 3257 * all existing tasks.
3198 */ 3258 */
3199 if (!use_task_css_set_links) 3259 if (!use_task_css_set_links)
3200 cgroup_enable_task_cg_lists(); 3260 cgroup_enable_task_cg_lists();
3201 3261
3202 read_lock(&css_set_lock); 3262 read_lock(&css_set_lock);
3203 it->cset_link = &cgrp->cset_links; 3263
3204 cgroup_advance_iter(cgrp, it); 3264 it->origin_css = css;
3265 it->cset_link = &css->cgroup->cset_links;
3266
3267 css_advance_task_iter(it);
3205} 3268}
3206 3269
3207struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 3270/**
3208 struct cgroup_iter *it) 3271 * css_task_iter_next - return the next task for the iterator
3272 * @it: the task iterator being iterated
3273 *
3274 * The "next" function for task iteration. @it should have been
3275 * initialized via css_task_iter_start(). Returns NULL when the iteration
3276 * reaches the end.
3277 */
3278struct task_struct *css_task_iter_next(struct css_task_iter *it)
3209{ 3279{
3210 struct task_struct *res; 3280 struct task_struct *res;
3211 struct list_head *l = it->task; 3281 struct list_head *l = it->task;
@@ -3219,16 +3289,24 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
3219 l = l->next; 3289 l = l->next;
3220 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); 3290 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
3221 if (l == &link->cset->tasks) { 3291 if (l == &link->cset->tasks) {
3222 /* We reached the end of this task list - move on to 3292 /*
3223 * the next cg_cgroup_link */ 3293 * We reached the end of this task list - move on to the
3224 cgroup_advance_iter(cgrp, it); 3294 * next cgrp_cset_link.
3295 */
3296 css_advance_task_iter(it);
3225 } else { 3297 } else {
3226 it->task = l; 3298 it->task = l;
3227 } 3299 }
3228 return res; 3300 return res;
3229} 3301}
3230 3302
3231void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 3303/**
3304 * css_task_iter_end - finish task iteration
3305 * @it: the task iterator to finish
3306 *
3307 * Finish task iteration started by css_task_iter_start().
3308 */
3309void css_task_iter_end(struct css_task_iter *it)
3232 __releases(css_set_lock) 3310 __releases(css_set_lock)
3233{ 3311{
3234 read_unlock(&css_set_lock); 3312 read_unlock(&css_set_lock);
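Aside: css_task_iter_start() leaves css_set_lock held until css_task_iter_end(), so the caller must not sleep between the two. A userspace analogue of that start/next/end contract around a reader lock (pthread rwlock, invented names):

#include <pthread.h>
#include <stdio.h>

struct list {
	pthread_rwlock_t lock;
	int vals[8];
	int nr;
};

struct list_iter {
	struct list *l;
	int idx;
};

/* takes the lock; the caller must call iter_end() and must not block meanwhile */
static void iter_start(struct list *l, struct list_iter *it)
{
	pthread_rwlock_rdlock(&l->lock);
	it->l = l;
	it->idx = 0;
}

static int *iter_next(struct list_iter *it)
{
	if (it->idx >= it->l->nr)
		return NULL;
	return &it->l->vals[it->idx++];
}

static void iter_end(struct list_iter *it)
{
	pthread_rwlock_unlock(&it->l->lock);
}

int main(void)
{
	static struct list l = { .lock = PTHREAD_RWLOCK_INITIALIZER,
				 .vals = { 3, 1, 4 }, .nr = 3 };
	struct list_iter it;
	int *v;

	iter_start(&l, &it);
	while ((v = iter_next(&it)))
		printf("%d\n", *v);
	iter_end(&it);
	return 0;
}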
@@ -3269,46 +3347,49 @@ static inline int started_after(void *p1, void *p2)
3269} 3347}
3270 3348
3271/** 3349/**
3272 * cgroup_scan_tasks - iterate though all the tasks in a cgroup 3350 * css_scan_tasks - iterate though all the tasks in a css
3273 * @scan: struct cgroup_scanner containing arguments for the scan 3351 * @css: the css to iterate tasks of
3352 * @test: optional test callback
3353 * @process: process callback
3354 * @data: data passed to @test and @process
3355 * @heap: optional pre-allocated heap used for task iteration
3356 *
3357 * Iterate through all the tasks in @css, calling @test for each, and if it
3358 * returns %true, call @process for it also.
3274 * 3359 *
3275 * Arguments include pointers to callback functions test_task() and 3360 * @test may be NULL, meaning always true (select all tasks), which
3276 * process_task(). 3361 * effectively duplicates css_task_iter_{start,next,end}() but does not
3277 * Iterate through all the tasks in a cgroup, calling test_task() for each, 3362 * lock css_set_lock for the call to @process.
3278 * and if it returns true, call process_task() for it also.
3279 * The test_task pointer may be NULL, meaning always true (select all tasks).
3280 * Effectively duplicates cgroup_iter_{start,next,end}()
3281 * but does not lock css_set_lock for the call to process_task().
3282 * The struct cgroup_scanner may be embedded in any structure of the caller's
3283 * creation.
3284 * It is guaranteed that process_task() will act on every task that
3285 * is a member of the cgroup for the duration of this call. This
3286 * function may or may not call process_task() for tasks that exit
3287 * or move to a different cgroup during the call, or are forked or
3288 * move into the cgroup during the call.
3289 * 3363 *
3290 * Note that test_task() may be called with locks held, and may in some 3364 * It is guaranteed that @process will act on every task that is a member
3291 * situations be called multiple times for the same task, so it should 3365 * of @css for the duration of this call. This function may or may not
3292 * be cheap. 3366 * call @process for tasks that exit or move to a different css during the
3293 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been 3367 * call, or are forked or move into the css during the call.
3294 * pre-allocated and will be used for heap operations (and its "gt" member will 3368 *
3295 * be overwritten), else a temporary heap will be used (allocation of which 3369 * Note that @test may be called with locks held, and may in some
3296 * may cause this function to fail). 3370 * situations be called multiple times for the same task, so it should be
3371 * cheap.
3372 *
3373 * If @heap is non-NULL, a heap has been pre-allocated and will be used for
3374 * heap operations (and its "gt" member will be overwritten), else a
3375 * temporary heap will be used (allocation of which may cause this function
3376 * to fail).
3297 */ 3377 */
3298int cgroup_scan_tasks(struct cgroup_scanner *scan) 3378int css_scan_tasks(struct cgroup_subsys_state *css,
3379 bool (*test)(struct task_struct *, void *),
3380 void (*process)(struct task_struct *, void *),
3381 void *data, struct ptr_heap *heap)
3299{ 3382{
3300 int retval, i; 3383 int retval, i;
3301 struct cgroup_iter it; 3384 struct css_task_iter it;
3302 struct task_struct *p, *dropped; 3385 struct task_struct *p, *dropped;
3303 /* Never dereference latest_task, since it's not refcounted */ 3386 /* Never dereference latest_task, since it's not refcounted */
3304 struct task_struct *latest_task = NULL; 3387 struct task_struct *latest_task = NULL;
3305 struct ptr_heap tmp_heap; 3388 struct ptr_heap tmp_heap;
3306 struct ptr_heap *heap;
3307 struct timespec latest_time = { 0, 0 }; 3389 struct timespec latest_time = { 0, 0 };
3308 3390
3309 if (scan->heap) { 3391 if (heap) {
3310 /* The caller supplied our heap and pre-allocated its memory */ 3392 /* The caller supplied our heap and pre-allocated its memory */
3311 heap = scan->heap;
3312 heap->gt = &started_after; 3393 heap->gt = &started_after;
3313 } else { 3394 } else {
3314 /* We need to allocate our own heap memory */ 3395 /* We need to allocate our own heap memory */
@@ -3321,25 +3402,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3321 3402
3322 again: 3403 again:
3323 /* 3404 /*
3324 * Scan tasks in the cgroup, using the scanner's "test_task" callback 3405 * Scan tasks in the css, using the @test callback to determine
3325 * to determine which are of interest, and using the scanner's 3406 * which are of interest, and invoking @process callback on the
3326 * "process_task" callback to process any of them that need an update. 3407 * ones which need an update. Since we don't want to hold any
3327 * Since we don't want to hold any locks during the task updates, 3408 * locks during the task updates, gather tasks to be processed in a
3328 * gather tasks to be processed in a heap structure. 3409 * heap structure. The heap is sorted by descending task start
3329 * The heap is sorted by descending task start time. 3410 * time. If the statically-sized heap fills up, we overflow tasks
3330 * If the statically-sized heap fills up, we overflow tasks that 3411 * that started later, and in future iterations only consider tasks
3331 * started later, and in future iterations only consider tasks that 3412 * that started after the latest task in the previous pass. This
3332 * started after the latest task in the previous pass. This
3333 * guarantees forward progress and that we don't miss any tasks. 3413 * guarantees forward progress and that we don't miss any tasks.
3334 */ 3414 */
3335 heap->size = 0; 3415 heap->size = 0;
3336 cgroup_iter_start(scan->cg, &it); 3416 css_task_iter_start(css, &it);
3337 while ((p = cgroup_iter_next(scan->cg, &it))) { 3417 while ((p = css_task_iter_next(&it))) {
3338 /* 3418 /*
3339 * Only affect tasks that qualify per the caller's callback, 3419 * Only affect tasks that qualify per the caller's callback,
3340 * if he provided one 3420 * if he provided one
3341 */ 3421 */
3342 if (scan->test_task && !scan->test_task(p, scan)) 3422 if (test && !test(p, data))
3343 continue; 3423 continue;
3344 /* 3424 /*
3345 * Only process tasks that started after the last task 3425 * Only process tasks that started after the last task
@@ -3367,7 +3447,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3367 * the heap and wasn't inserted 3447 * the heap and wasn't inserted
3368 */ 3448 */
3369 } 3449 }
3370 cgroup_iter_end(scan->cg, &it); 3450 css_task_iter_end(&it);
3371 3451
3372 if (heap->size) { 3452 if (heap->size) {
3373 for (i = 0; i < heap->size; i++) { 3453 for (i = 0; i < heap->size; i++) {
@@ -3377,7 +3457,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3377 latest_task = q; 3457 latest_task = q;
3378 } 3458 }
3379 /* Process the task per the caller's callback */ 3459 /* Process the task per the caller's callback */
3380 scan->process_task(q, scan); 3460 process(q, data);
3381 put_task_struct(q); 3461 put_task_struct(q);
3382 } 3462 }
3383 /* 3463 /*
@@ -3394,10 +3474,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3394 return 0; 3474 return 0;
3395} 3475}
3396 3476
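Aside: the pass structure above, gather under the lock into a bounded max-heap keyed by start time, process with the lock dropped, then repeat considering only tasks that started after the newest one already handled, can be restated over plain integers. In this sketch a small array stands in for the heap and all names are made up:

#include <stdio.h>

#define BATCH 3

/* one pass: keep up to BATCH values strictly greater than watermark,
 * evicting the current maximum when full (the role the max-heap plays
 * in css_scan_tasks()); returns how many were kept */
static int gather(const int *vals, int n, int watermark, int *batch)
{
	int kept = 0;

	for (int i = 0; i < n; i++) {
		int v = vals[i];

		if (v <= watermark)
			continue;
		if (kept < BATCH) {
			batch[kept++] = v;
			continue;
		}
		int max_idx = 0;
		for (int j = 1; j < BATCH; j++)
			if (batch[j] > batch[max_idx])
				max_idx = j;
		if (v < batch[max_idx])
			batch[max_idx] = v;	/* overflow the later starter */
	}
	return kept;
}

int main(void)
{
	const int start[] = { 7, 2, 9, 4, 1, 8, 6, 3 };	/* "start times" */
	int batch[BATCH];
	int watermark = 0;		/* nothing processed yet */

	for (;;) {
		int kept = gather(start, 8, watermark, batch);

		if (!kept)
			break;
		for (int i = 0; i < kept; i++) {
			printf("process %d\n", batch[i]);
			if (batch[i] > watermark)
				watermark = batch[i];
		}
	}
	return 0;
}

Every value is processed exactly once and forward progress is guaranteed, which is the property the original comment promises for tasks that exist for the whole scan.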
3397static void cgroup_transfer_one_task(struct task_struct *task, 3477static void cgroup_transfer_one_task(struct task_struct *task, void *data)
3398 struct cgroup_scanner *scan)
3399{ 3478{
3400 struct cgroup *new_cgroup = scan->data; 3479 struct cgroup *new_cgroup = data;
3401 3480
3402 mutex_lock(&cgroup_mutex); 3481 mutex_lock(&cgroup_mutex);
3403 cgroup_attach_task(new_cgroup, task, false); 3482 cgroup_attach_task(new_cgroup, task, false);
@@ -3411,15 +3490,8 @@ static void cgroup_transfer_one_task(struct task_struct *task,
3411 */ 3490 */
3412int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) 3491int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3413{ 3492{
3414 struct cgroup_scanner scan; 3493 return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
3415 3494 to, NULL);
3416 scan.cg = from;
3417 scan.test_task = NULL; /* select all tasks in cgroup */
3418 scan.process_task = cgroup_transfer_one_task;
3419 scan.heap = NULL;
3420 scan.data = to;
3421
3422 return cgroup_scan_tasks(&scan);
3423} 3495}
3424 3496
3425/* 3497/*
@@ -3461,7 +3533,7 @@ struct cgroup_pidlist {
3461 /* pointer to the cgroup we belong to, for list removal purposes */ 3533 /* pointer to the cgroup we belong to, for list removal purposes */
3462 struct cgroup *owner; 3534 struct cgroup *owner;
3463 /* protects the other fields */ 3535 /* protects the other fields */
3464 struct rw_semaphore mutex; 3536 struct rw_semaphore rwsem;
3465}; 3537};
3466 3538
3467/* 3539/*
@@ -3534,7 +3606,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3534 struct pid_namespace *ns = task_active_pid_ns(current); 3606 struct pid_namespace *ns = task_active_pid_ns(current);
3535 3607
3536 /* 3608 /*
3537 * We can't drop the pidlist_mutex before taking the l->mutex in case 3609 * We can't drop the pidlist_mutex before taking the l->rwsem in case
3538 * the last ref-holder is trying to remove l from the list at the same 3610 * the last ref-holder is trying to remove l from the list at the same
3539 * time. Holding the pidlist_mutex precludes somebody taking whichever 3611 * time. Holding the pidlist_mutex precludes somebody taking whichever
3540 * list we find out from under us - compare release_pid_array(). 3612 * list we find out from under us - compare release_pid_array().
@@ -3543,7 +3615,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3543 list_for_each_entry(l, &cgrp->pidlists, links) { 3615 list_for_each_entry(l, &cgrp->pidlists, links) {
3544 if (l->key.type == type && l->key.ns == ns) { 3616 if (l->key.type == type && l->key.ns == ns) {
3545 /* make sure l doesn't vanish out from under us */ 3617 /* make sure l doesn't vanish out from under us */
3546 down_write(&l->mutex); 3618 down_write(&l->rwsem);
3547 mutex_unlock(&cgrp->pidlist_mutex); 3619 mutex_unlock(&cgrp->pidlist_mutex);
3548 return l; 3620 return l;
3549 } 3621 }
@@ -3554,8 +3626,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3554 mutex_unlock(&cgrp->pidlist_mutex); 3626 mutex_unlock(&cgrp->pidlist_mutex);
3555 return l; 3627 return l;
3556 } 3628 }
3557 init_rwsem(&l->mutex); 3629 init_rwsem(&l->rwsem);
3558 down_write(&l->mutex); 3630 down_write(&l->rwsem);
3559 l->key.type = type; 3631 l->key.type = type;
3560 l->key.ns = get_pid_ns(ns); 3632 l->key.ns = get_pid_ns(ns);
3561 l->owner = cgrp; 3633 l->owner = cgrp;
@@ -3573,7 +3645,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3573 pid_t *array; 3645 pid_t *array;
3574 int length; 3646 int length;
3575 int pid, n = 0; /* used for populating the array */ 3647 int pid, n = 0; /* used for populating the array */
3576 struct cgroup_iter it; 3648 struct css_task_iter it;
3577 struct task_struct *tsk; 3649 struct task_struct *tsk;
3578 struct cgroup_pidlist *l; 3650 struct cgroup_pidlist *l;
3579 3651
@@ -3588,8 +3660,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3588 if (!array) 3660 if (!array)
3589 return -ENOMEM; 3661 return -ENOMEM;
3590 /* now, populate the array */ 3662 /* now, populate the array */
3591 cgroup_iter_start(cgrp, &it); 3663 css_task_iter_start(&cgrp->dummy_css, &it);
3592 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3664 while ((tsk = css_task_iter_next(&it))) {
3593 if (unlikely(n == length)) 3665 if (unlikely(n == length))
3594 break; 3666 break;
3595 /* get tgid or pid for procs or tasks file respectively */ 3667 /* get tgid or pid for procs or tasks file respectively */
@@ -3600,7 +3672,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3600 if (pid > 0) /* make sure to only use valid results */ 3672 if (pid > 0) /* make sure to only use valid results */
3601 array[n++] = pid; 3673 array[n++] = pid;
3602 } 3674 }
3603 cgroup_iter_end(cgrp, &it); 3675 css_task_iter_end(&it);
3604 length = n; 3676 length = n;
3605 /* now sort & (if procs) strip out duplicates */ 3677 /* now sort & (if procs) strip out duplicates */
3606 sort(array, length, sizeof(pid_t), cmppid, NULL); 3678 sort(array, length, sizeof(pid_t), cmppid, NULL);
@@ -3616,7 +3688,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3616 l->list = array; 3688 l->list = array;
3617 l->length = length; 3689 l->length = length;
3618 l->use_count++; 3690 l->use_count++;
3619 up_write(&l->mutex); 3691 up_write(&l->rwsem);
3620 *lp = l; 3692 *lp = l;
3621 return 0; 3693 return 0;
3622} 3694}
@@ -3634,7 +3706,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3634{ 3706{
3635 int ret = -EINVAL; 3707 int ret = -EINVAL;
3636 struct cgroup *cgrp; 3708 struct cgroup *cgrp;
3637 struct cgroup_iter it; 3709 struct css_task_iter it;
3638 struct task_struct *tsk; 3710 struct task_struct *tsk;
3639 3711
3640 /* 3712 /*
@@ -3648,8 +3720,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3648 ret = 0; 3720 ret = 0;
3649 cgrp = dentry->d_fsdata; 3721 cgrp = dentry->d_fsdata;
3650 3722
3651 cgroup_iter_start(cgrp, &it); 3723 css_task_iter_start(&cgrp->dummy_css, &it);
3652 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3724 while ((tsk = css_task_iter_next(&it))) {
3653 switch (tsk->state) { 3725 switch (tsk->state) {
3654 case TASK_RUNNING: 3726 case TASK_RUNNING:
3655 stats->nr_running++; 3727 stats->nr_running++;
@@ -3669,7 +3741,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3669 break; 3741 break;
3670 } 3742 }
3671 } 3743 }
3672 cgroup_iter_end(cgrp, &it); 3744 css_task_iter_end(&it);
3673 3745
3674err: 3746err:
3675 return ret; 3747 return ret;
@@ -3694,7 +3766,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3694 int index = 0, pid = *pos; 3766 int index = 0, pid = *pos;
3695 int *iter; 3767 int *iter;
3696 3768
3697 down_read(&l->mutex); 3769 down_read(&l->rwsem);
3698 if (pid) { 3770 if (pid) {
3699 int end = l->length; 3771 int end = l->length;
3700 3772
@@ -3721,7 +3793,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3721static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3793static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3722{ 3794{
3723 struct cgroup_pidlist *l = s->private; 3795 struct cgroup_pidlist *l = s->private;
3724 up_read(&l->mutex); 3796 up_read(&l->rwsem);
3725} 3797}
3726 3798
3727static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3799static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
@@ -3767,7 +3839,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3767 * pidlist_mutex, we have to take pidlist_mutex first. 3839 * pidlist_mutex, we have to take pidlist_mutex first.
3768 */ 3840 */
3769 mutex_lock(&l->owner->pidlist_mutex); 3841 mutex_lock(&l->owner->pidlist_mutex);
3770 down_write(&l->mutex); 3842 down_write(&l->rwsem);
3771 BUG_ON(!l->use_count); 3843 BUG_ON(!l->use_count);
3772 if (!--l->use_count) { 3844 if (!--l->use_count) {
3773 /* we're the last user if refcount is 0; remove and free */ 3845 /* we're the last user if refcount is 0; remove and free */
@@ -3775,12 +3847,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3775 mutex_unlock(&l->owner->pidlist_mutex); 3847 mutex_unlock(&l->owner->pidlist_mutex);
3776 pidlist_free(l->list); 3848 pidlist_free(l->list);
3777 put_pid_ns(l->key.ns); 3849 put_pid_ns(l->key.ns);
3778 up_write(&l->mutex); 3850 up_write(&l->rwsem);
3779 kfree(l); 3851 kfree(l);
3780 return; 3852 return;
3781 } 3853 }
3782 mutex_unlock(&l->owner->pidlist_mutex); 3854 mutex_unlock(&l->owner->pidlist_mutex);
3783 up_write(&l->mutex); 3855 up_write(&l->rwsem);
3784} 3856}
3785 3857
3786static int cgroup_pidlist_release(struct inode *inode, struct file *file) 3858static int cgroup_pidlist_release(struct inode *inode, struct file *file)
@@ -3844,21 +3916,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file)
3844 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); 3916 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3845} 3917}
3846 3918
3847static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 3919static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3848 struct cftype *cft) 3920 struct cftype *cft)
3849{ 3921{
3850 return notify_on_release(cgrp); 3922 return notify_on_release(css->cgroup);
3851} 3923}
3852 3924
3853static int cgroup_write_notify_on_release(struct cgroup *cgrp, 3925static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3854 struct cftype *cft, 3926 struct cftype *cft, u64 val)
3855 u64 val)
3856{ 3927{
3857 clear_bit(CGRP_RELEASABLE, &cgrp->flags); 3928 clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
3858 if (val) 3929 if (val)
3859 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3930 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3860 else 3931 else
3861 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3932 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3862 return 0; 3933 return 0;
3863} 3934}
3864 3935
@@ -3888,18 +3959,18 @@ static void cgroup_event_remove(struct work_struct *work)
3888{ 3959{
3889 struct cgroup_event *event = container_of(work, struct cgroup_event, 3960 struct cgroup_event *event = container_of(work, struct cgroup_event,
3890 remove); 3961 remove);
3891 struct cgroup *cgrp = event->cgrp; 3962 struct cgroup_subsys_state *css = event->css;
3892 3963
3893 remove_wait_queue(event->wqh, &event->wait); 3964 remove_wait_queue(event->wqh, &event->wait);
3894 3965
3895 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3966 event->cft->unregister_event(css, event->cft, event->eventfd);
3896 3967
3897 /* Notify userspace the event is going away. */ 3968 /* Notify userspace the event is going away. */
3898 eventfd_signal(event->eventfd, 1); 3969 eventfd_signal(event->eventfd, 1);
3899 3970
3900 eventfd_ctx_put(event->eventfd); 3971 eventfd_ctx_put(event->eventfd);
3901 kfree(event); 3972 kfree(event);
3902 cgroup_dput(cgrp); 3973 css_put(css);
3903} 3974}
3904 3975
3905/* 3976/*
@@ -3912,7 +3983,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3912{ 3983{
3913 struct cgroup_event *event = container_of(wait, 3984 struct cgroup_event *event = container_of(wait,
3914 struct cgroup_event, wait); 3985 struct cgroup_event, wait);
3915 struct cgroup *cgrp = event->cgrp; 3986 struct cgroup *cgrp = event->css->cgroup;
3916 unsigned long flags = (unsigned long)key; 3987 unsigned long flags = (unsigned long)key;
3917 3988
3918 if (flags & POLLHUP) { 3989 if (flags & POLLHUP) {
@@ -3956,14 +4027,15 @@ static void cgroup_event_ptable_queue_proc(struct file *file,
3956 * Input must be in format '<event_fd> <control_fd> <args>'. 4027 * Input must be in format '<event_fd> <control_fd> <args>'.
3957 * Interpretation of args is defined by control file implementation. 4028 * Interpretation of args is defined by control file implementation.
3958 */ 4029 */
3959static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, 4030static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
3960 const char *buffer) 4031 struct cftype *cft, const char *buffer)
3961{ 4032{
3962 struct cgroup_event *event = NULL; 4033 struct cgroup *cgrp = dummy_css->cgroup;
3963 struct cgroup *cgrp_cfile; 4034 struct cgroup_event *event;
4035 struct cgroup_subsys_state *cfile_css;
3964 unsigned int efd, cfd; 4036 unsigned int efd, cfd;
3965 struct file *efile = NULL; 4037 struct file *efile;
3966 struct file *cfile = NULL; 4038 struct file *cfile;
3967 char *endp; 4039 char *endp;
3968 int ret; 4040 int ret;
3969 4041
@@ -3980,7 +4052,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3980 event = kzalloc(sizeof(*event), GFP_KERNEL); 4052 event = kzalloc(sizeof(*event), GFP_KERNEL);
3981 if (!event) 4053 if (!event)
3982 return -ENOMEM; 4054 return -ENOMEM;
3983 event->cgrp = cgrp; 4055
3984 INIT_LIST_HEAD(&event->list); 4056 INIT_LIST_HEAD(&event->list);
3985 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); 4057 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3986 init_waitqueue_func_entry(&event->wait, cgroup_event_wake); 4058 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
@@ -3989,62 +4061,68 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3989 efile = eventfd_fget(efd); 4061 efile = eventfd_fget(efd);
3990 if (IS_ERR(efile)) { 4062 if (IS_ERR(efile)) {
3991 ret = PTR_ERR(efile); 4063 ret = PTR_ERR(efile);
3992 goto fail; 4064 goto out_kfree;
3993 } 4065 }
3994 4066
3995 event->eventfd = eventfd_ctx_fileget(efile); 4067 event->eventfd = eventfd_ctx_fileget(efile);
3996 if (IS_ERR(event->eventfd)) { 4068 if (IS_ERR(event->eventfd)) {
3997 ret = PTR_ERR(event->eventfd); 4069 ret = PTR_ERR(event->eventfd);
3998 goto fail; 4070 goto out_put_efile;
3999 } 4071 }
4000 4072
4001 cfile = fget(cfd); 4073 cfile = fget(cfd);
4002 if (!cfile) { 4074 if (!cfile) {
4003 ret = -EBADF; 4075 ret = -EBADF;
4004 goto fail; 4076 goto out_put_eventfd;
4005 } 4077 }
4006 4078
4007 /* the process need read permission on control file */ 4079 /* the process need read permission on control file */
4008 /* AV: shouldn't we check that it's been opened for read instead? */ 4080 /* AV: shouldn't we check that it's been opened for read instead? */
4009 ret = inode_permission(file_inode(cfile), MAY_READ); 4081 ret = inode_permission(file_inode(cfile), MAY_READ);
4010 if (ret < 0) 4082 if (ret < 0)
4011 goto fail; 4083 goto out_put_cfile;
4012 4084
4013 event->cft = __file_cft(cfile); 4085 event->cft = __file_cft(cfile);
4014 if (IS_ERR(event->cft)) { 4086 if (IS_ERR(event->cft)) {
4015 ret = PTR_ERR(event->cft); 4087 ret = PTR_ERR(event->cft);
4016 goto fail; 4088 goto out_put_cfile;
4089 }
4090
4091 if (!event->cft->ss) {
4092 ret = -EBADF;
4093 goto out_put_cfile;
4017 } 4094 }
4018 4095
4019 /* 4096 /*
4020 * The file to be monitored must be in the same cgroup as 4097 * Determine the css of @cfile, verify it belongs to the same
4021 * cgroup.event_control is. 4098 * cgroup as cgroup.event_control, and associate @event with it.
4099 * Remaining events are automatically removed on cgroup destruction
4100 * but the removal is asynchronous, so take an extra ref.
4022 */ 4101 */
4023 cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); 4102 rcu_read_lock();
4024 if (cgrp_cfile != cgrp) { 4103
4025 ret = -EINVAL; 4104 ret = -EINVAL;
4026 goto fail; 4105 event->css = cgroup_css(cgrp, event->cft->ss);
4027 } 4106 cfile_css = css_from_dir(cfile->f_dentry->d_parent, event->cft->ss);
4107 if (event->css && event->css == cfile_css && css_tryget(event->css))
4108 ret = 0;
4109
4110 rcu_read_unlock();
4111 if (ret)
4112 goto out_put_cfile;
4028 4113
4029 if (!event->cft->register_event || !event->cft->unregister_event) { 4114 if (!event->cft->register_event || !event->cft->unregister_event) {
4030 ret = -EINVAL; 4115 ret = -EINVAL;
4031 goto fail; 4116 goto out_put_css;
4032 } 4117 }
4033 4118
4034 ret = event->cft->register_event(cgrp, event->cft, 4119 ret = event->cft->register_event(event->css, event->cft,
4035 event->eventfd, buffer); 4120 event->eventfd, buffer);
4036 if (ret) 4121 if (ret)
4037 goto fail; 4122 goto out_put_css;
4038 4123
4039 efile->f_op->poll(efile, &event->pt); 4124 efile->f_op->poll(efile, &event->pt);
4040 4125
4041 /*
4042 * Events should be removed after rmdir of cgroup directory, but before
4043 * destroying subsystem state objects. Let's take reference to cgroup
4044 * directory dentry to do that.
4045 */
4046 dget(cgrp->dentry);
4047
4048 spin_lock(&cgrp->event_list_lock); 4126 spin_lock(&cgrp->event_list_lock);
4049 list_add(&event->list, &cgrp->event_list); 4127 list_add(&event->list, &cgrp->event_list);
4050 spin_unlock(&cgrp->event_list_lock); 4128 spin_unlock(&cgrp->event_list_lock);
@@ -4054,35 +4132,33 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
4054 4132
4055 return 0; 4133 return 0;
4056 4134
4057fail: 4135out_put_css:
4058 if (cfile) 4136 css_put(event->css);
4059 fput(cfile); 4137out_put_cfile:
4060 4138 fput(cfile);
4061 if (event && event->eventfd && !IS_ERR(event->eventfd)) 4139out_put_eventfd:
4062 eventfd_ctx_put(event->eventfd); 4140 eventfd_ctx_put(event->eventfd);
4063 4141out_put_efile:
4064 if (!IS_ERR_OR_NULL(efile)) 4142 fput(efile);
4065 fput(efile); 4143out_kfree:
4066
4067 kfree(event); 4144 kfree(event);
4068 4145
4069 return ret; 4146 return ret;
4070} 4147}
4071 4148
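Aside: the rewritten error paths replace the old single fail: label, which had to test every pointer before undoing it, with one label per acquired resource unwound in reverse order. The idiom in a self-contained userspace form, with placeholder setup/undo steps:

#include <stdio.h>
#include <stdlib.h>

/* stand-ins for the eventfd/file/register steps in the function above */
static int setup_a(void)  { return 0; }
static int setup_b(void)  { return 0; }
static int setup_c(void)  { return -1; }	/* pretend this step fails */
static void undo_a(void)  { puts("undo a"); }
static void undo_b(void)  { puts("undo b"); }

static int do_setup(void)
{
	int ret;

	ret = setup_a();
	if (ret)
		goto out;
	ret = setup_b();
	if (ret)
		goto out_undo_a;
	ret = setup_c();
	if (ret)
		goto out_undo_b;
	return 0;		/* success: nothing is undone */

out_undo_b:
	undo_b();
out_undo_a:
	undo_a();
out:
	return ret;
}

int main(void)
{
	return do_setup() ? EXIT_FAILURE : EXIT_SUCCESS;
}

Because the labels fall through in reverse acquisition order, each failure site jumps to exactly the cleanup it needs and no NULL/IS_ERR re-checks are required.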
4072static u64 cgroup_clone_children_read(struct cgroup *cgrp, 4149static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4073 struct cftype *cft) 4150 struct cftype *cft)
4074{ 4151{
4075 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4152 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4076} 4153}
4077 4154
4078static int cgroup_clone_children_write(struct cgroup *cgrp, 4155static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4079 struct cftype *cft, 4156 struct cftype *cft, u64 val)
4080 u64 val)
4081{ 4157{
4082 if (val) 4158 if (val)
4083 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4159 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4084 else 4160 else
4085 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4161 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4086 return 0; 4162 return 0;
4087} 4163}
4088 4164
@@ -4141,36 +4217,34 @@ static struct cftype cgroup_base_files[] = {
4141}; 4217};
4142 4218
4143/** 4219/**
4144 * cgroup_populate_dir - selectively creation of files in a directory 4220 * cgroup_populate_dir - create subsys files in a cgroup directory
4145 * @cgrp: target cgroup 4221 * @cgrp: target cgroup
4146 * @base_files: true if the base files should be added
4147 * @subsys_mask: mask of the subsystem ids whose files should be added 4222 * @subsys_mask: mask of the subsystem ids whose files should be added
4223 *
4224 * On failure, no file is added.
4148 */ 4225 */
4149static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 4226static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
4150 unsigned long subsys_mask)
4151{ 4227{
4152 int err;
4153 struct cgroup_subsys *ss; 4228 struct cgroup_subsys *ss;
4154 4229 int i, ret = 0;
4155 if (base_files) {
4156 err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
4157 if (err < 0)
4158 return err;
4159 }
4160 4230
4161 /* process cftsets of each subsystem */ 4231 /* process cftsets of each subsystem */
4162 for_each_root_subsys(cgrp->root, ss) { 4232 for_each_subsys(ss, i) {
4163 struct cftype_set *set; 4233 struct cftype_set *set;
4164 if (!test_bit(ss->subsys_id, &subsys_mask)) 4234
4235 if (!test_bit(i, &subsys_mask))
4165 continue; 4236 continue;
4166 4237
4167 list_for_each_entry(set, &ss->cftsets, node) 4238 list_for_each_entry(set, &ss->cftsets, node) {
4168 cgroup_addrm_files(cgrp, ss, set->cfts, true); 4239 ret = cgroup_addrm_files(cgrp, set->cfts, true);
4240 if (ret < 0)
4241 goto err;
4242 }
4169 } 4243 }
4170 4244
4171 /* This cgroup is ready now */ 4245 /* This cgroup is ready now */
4172 for_each_root_subsys(cgrp->root, ss) { 4246 for_each_root_subsys(cgrp->root, ss) {
4173 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4247 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
4174 struct css_id *id = rcu_dereference_protected(css->id, true); 4248 struct css_id *id = rcu_dereference_protected(css->id, true);
4175 4249
4176 /* 4250 /*
@@ -4183,14 +4257,57 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
4183 } 4257 }
4184 4258
4185 return 0; 4259 return 0;
4260err:
4261 cgroup_clear_dir(cgrp, subsys_mask);
4262 return ret;
4186} 4263}
4187 4264
4188static void css_dput_fn(struct work_struct *work) 4265/*
4266 * css destruction is four-stage process.
4267 *
4268 * 1. Destruction starts. Killing of the percpu_ref is initiated.
4269 * Implemented in kill_css().
4270 *
4271 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4272 * and thus css_tryget() is guaranteed to fail, the css can be offlined
4273 * by invoking offline_css(). After offlining, the base ref is put.
4274 * Implemented in css_killed_work_fn().
4275 *
4276 * 3. When the percpu_ref reaches zero, the only possible remaining
4277 * accessors are inside RCU read sections. css_release() schedules the
4278 * RCU callback.
4279 *
4280 * 4. After the grace period, the css can be freed. Implemented in
4281 * css_free_work_fn().
4282 *
4283 * It is actually hairier because both step 2 and 4 require process context
4284 * and thus involve punting to css->destroy_work adding two additional
4285 * steps to the already complex sequence.
4286 */
4287static void css_free_work_fn(struct work_struct *work)
4189{ 4288{
4190 struct cgroup_subsys_state *css = 4289 struct cgroup_subsys_state *css =
4191 container_of(work, struct cgroup_subsys_state, dput_work); 4290 container_of(work, struct cgroup_subsys_state, destroy_work);
4291 struct cgroup *cgrp = css->cgroup;
4192 4292
4193 cgroup_dput(css->cgroup); 4293 if (css->parent)
4294 css_put(css->parent);
4295
4296 css->ss->css_free(css);
4297 cgroup_dput(cgrp);
4298}
4299
4300static void css_free_rcu_fn(struct rcu_head *rcu_head)
4301{
4302 struct cgroup_subsys_state *css =
4303 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4304
4305 /*
4306 * css holds an extra ref to @cgrp->dentry which is put on the last
4307 * css_put(). dput() requires process context which we don't have.
4308 */
4309 INIT_WORK(&css->destroy_work, css_free_work_fn);
4310 schedule_work(&css->destroy_work);
4194} 4311}
4195 4312
4196static void css_release(struct percpu_ref *ref) 4313static void css_release(struct percpu_ref *ref)
@@ -4198,49 +4315,47 @@ static void css_release(struct percpu_ref *ref)
4198 struct cgroup_subsys_state *css = 4315 struct cgroup_subsys_state *css =
4199 container_of(ref, struct cgroup_subsys_state, refcnt); 4316 container_of(ref, struct cgroup_subsys_state, refcnt);
4200 4317
4201 schedule_work(&css->dput_work); 4318 call_rcu(&css->rcu_head, css_free_rcu_fn);
4202} 4319}
4203 4320
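Aside: both css_free_rcu_fn() and css_free_work_fn() recover the css from an embedded member via container_of(), which is what lets one object be handed through an RCU callback and then a work item without a separate back-pointer. A standalone illustration of that macro (the surrounding types here are invented):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct callback_head {			/* stand-in for rcu_head/work_struct */
	void (*func)(struct callback_head *head);
};

struct widget {
	int id;
	struct callback_head cb;	/* embedded, like css->rcu_head */
};

static void widget_free_fn(struct callback_head *head)
{
	/* recover the enclosing widget from the embedded member */
	struct widget *w = container_of(head, struct widget, cb);

	printf("freeing widget %d\n", w->id);
}

int main(void)
{
	struct widget w = { .id = 42 };

	w.cb.func = widget_free_fn;
	w.cb.func(&w.cb);		/* a real callback would run later */
	return 0;
}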
4204static void init_cgroup_css(struct cgroup_subsys_state *css, 4321static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
4205 struct cgroup_subsys *ss, 4322 struct cgroup *cgrp)
4206 struct cgroup *cgrp)
4207{ 4323{
4208 css->cgroup = cgrp; 4324 css->cgroup = cgrp;
4325 css->ss = ss;
4209 css->flags = 0; 4326 css->flags = 0;
4210 css->id = NULL; 4327 css->id = NULL;
4211 if (cgrp == cgroup_dummy_top) 4328
4329 if (cgrp->parent)
4330 css->parent = cgroup_css(cgrp->parent, ss);
4331 else
4212 css->flags |= CSS_ROOT; 4332 css->flags |= CSS_ROOT;
4213 BUG_ON(cgrp->subsys[ss->subsys_id]);
4214 cgrp->subsys[ss->subsys_id] = css;
4215 4333
4216 /* 4334 BUG_ON(cgroup_css(cgrp, ss));
4217 * css holds an extra ref to @cgrp->dentry which is put on the last
4218 * css_put(). dput() requires process context, which css_put() may
4219 * be called without. @css->dput_work will be used to invoke
4220 * dput() asynchronously from css_put().
4221 */
4222 INIT_WORK(&css->dput_work, css_dput_fn);
4223} 4335}
4224 4336
4225/* invoke ->post_create() on a new CSS and mark it online if successful */ 4337/* invoke ->css_online() on a new CSS and mark it online if successful */
4226static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4338static int online_css(struct cgroup_subsys_state *css)
4227{ 4339{
4340 struct cgroup_subsys *ss = css->ss;
4228 int ret = 0; 4341 int ret = 0;
4229 4342
4230 lockdep_assert_held(&cgroup_mutex); 4343 lockdep_assert_held(&cgroup_mutex);
4231 4344
4232 if (ss->css_online) 4345 if (ss->css_online)
4233 ret = ss->css_online(cgrp); 4346 ret = ss->css_online(css);
4234 if (!ret) 4347 if (!ret) {
4235 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; 4348 css->flags |= CSS_ONLINE;
4349 css->cgroup->nr_css++;
4350 rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css);
4351 }
4236 return ret; 4352 return ret;
4237} 4353}
4238 4354
4239/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ 4355/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4240static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4356static void offline_css(struct cgroup_subsys_state *css)
4241 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4242{ 4357{
4243 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4358 struct cgroup_subsys *ss = css->ss;
4244 4359
4245 lockdep_assert_held(&cgroup_mutex); 4360 lockdep_assert_held(&cgroup_mutex);
4246 4361
@@ -4248,9 +4363,11 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4248 return; 4363 return;
4249 4364
4250 if (ss->css_offline) 4365 if (ss->css_offline)
4251 ss->css_offline(cgrp); 4366 ss->css_offline(css);
4252 4367
4253 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; 4368 css->flags &= ~CSS_ONLINE;
4369 css->cgroup->nr_css--;
4370 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
4254} 4371}
4255 4372
4256/* 4373/*
@@ -4264,6 +4381,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4264static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 4381static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4265 umode_t mode) 4382 umode_t mode)
4266{ 4383{
4384 struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
4267 struct cgroup *cgrp; 4385 struct cgroup *cgrp;
4268 struct cgroup_name *name; 4386 struct cgroup_name *name;
4269 struct cgroupfs_root *root = parent->root; 4387 struct cgroupfs_root *root = parent->root;
@@ -4281,7 +4399,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4281 goto err_free_cgrp; 4399 goto err_free_cgrp;
4282 rcu_assign_pointer(cgrp->name, name); 4400 rcu_assign_pointer(cgrp->name, name);
4283 4401
4284 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); 4402 /*
4403 * Temporarily set the pointer to NULL, so idr_find() won't return
4404 * a half-baked cgroup.
4405 */
4406 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
4285 if (cgrp->id < 0) 4407 if (cgrp->id < 0)
4286 goto err_free_name; 4408 goto err_free_name;
4287 4409
@@ -4310,6 +4432,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4310 cgrp->dentry = dentry; 4432 cgrp->dentry = dentry;
4311 4433
4312 cgrp->parent = parent; 4434 cgrp->parent = parent;
4435 cgrp->dummy_css.parent = &parent->dummy_css;
4313 cgrp->root = parent->root; 4436 cgrp->root = parent->root;
4314 4437
4315 if (notify_on_release(parent)) 4438 if (notify_on_release(parent))
@@ -4321,20 +4444,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4321 for_each_root_subsys(root, ss) { 4444 for_each_root_subsys(root, ss) {
4322 struct cgroup_subsys_state *css; 4445 struct cgroup_subsys_state *css;
4323 4446
4324 css = ss->css_alloc(cgrp); 4447 css = ss->css_alloc(cgroup_css(parent, ss));
4325 if (IS_ERR(css)) { 4448 if (IS_ERR(css)) {
4326 err = PTR_ERR(css); 4449 err = PTR_ERR(css);
4327 goto err_free_all; 4450 goto err_free_all;
4328 } 4451 }
4452 css_ar[ss->subsys_id] = css;
4329 4453
4330 err = percpu_ref_init(&css->refcnt, css_release); 4454 err = percpu_ref_init(&css->refcnt, css_release);
4331 if (err) 4455 if (err)
4332 goto err_free_all; 4456 goto err_free_all;
4333 4457
4334 init_cgroup_css(css, ss, cgrp); 4458 init_css(css, ss, cgrp);
4335 4459
4336 if (ss->use_id) { 4460 if (ss->use_id) {
4337 err = alloc_css_id(ss, parent, cgrp); 4461 err = alloc_css_id(css);
4338 if (err) 4462 if (err)
4339 goto err_free_all; 4463 goto err_free_all;
4340 } 4464 }
@@ -4356,16 +4480,22 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4356 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4480 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4357 root->number_of_cgroups++; 4481 root->number_of_cgroups++;
4358 4482
4359 /* each css holds a ref to the cgroup's dentry */ 4483 /* each css holds a ref to the cgroup's dentry and the parent css */
4360 for_each_root_subsys(root, ss) 4484 for_each_root_subsys(root, ss) {
4485 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4486
4361 dget(dentry); 4487 dget(dentry);
4488 css_get(css->parent);
4489 }
4362 4490
4363 /* hold a ref to the parent's dentry */ 4491 /* hold a ref to the parent's dentry */
4364 dget(parent->dentry); 4492 dget(parent->dentry);
4365 4493
4366 /* creation succeeded, notify subsystems */ 4494 /* creation succeeded, notify subsystems */
4367 for_each_root_subsys(root, ss) { 4495 for_each_root_subsys(root, ss) {
4368 err = online_css(ss, cgrp); 4496 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4497
4498 err = online_css(css);
4369 if (err) 4499 if (err)
4370 goto err_destroy; 4500 goto err_destroy;
4371 4501
@@ -4379,7 +4509,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4379 } 4509 }
4380 } 4510 }
4381 4511
4382 err = cgroup_populate_dir(cgrp, true, root->subsys_mask); 4512 idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4513
4514 err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4515 if (err)
4516 goto err_destroy;
4517
4518 err = cgroup_populate_dir(cgrp, root->subsys_mask);
4383 if (err) 4519 if (err)
4384 goto err_destroy; 4520 goto err_destroy;
4385 4521
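The temporary-NULL comment earlier in cgroup_create() is the key to the idr conversion: the id is reserved early with a NULL pointer so concurrent idr_find() callers can never see a half-constructed cgroup, and the real pointer is only installed with idr_replace() once creation has fully succeeded, right before the base files are populated above. A reduced sketch of the reserve-early/publish-late pattern — struct item and the item_* names are invented, and locking against concurrent creators (cgroup_mutex in the real code) is omitted:

#include <linux/idr.h>
#include <linux/slab.h>

static DEFINE_IDR(item_idr);

struct item {
	int id;
	/* payload */
};

static struct item *item_create(void)
{
	struct item *it = kzalloc(sizeof(*it), GFP_KERNEL);

	if (!it)
		return NULL;

	/* reserve an id, but publish NULL so lookups see nothing yet */
	it->id = idr_alloc(&item_idr, NULL, 1, 0, GFP_KERNEL);
	if (it->id < 0) {
		kfree(it);
		return NULL;
	}

	/* ... failure-prone setup; errors can still idr_remove(&item_idr, it->id) ... */

	/* only now does idr_find() start returning the item */
	idr_replace(&item_idr, it, it->id);
	return it;
}

static struct item *item_lookup(int id)
{
	/* NULL both for unused ids and for ids whose item is still being built */
	return idr_find(&item_idr, id);
}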
@@ -4390,18 +4526,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4390 4526
4391err_free_all: 4527err_free_all:
4392 for_each_root_subsys(root, ss) { 4528 for_each_root_subsys(root, ss) {
4393 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4529 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4394 4530
4395 if (css) { 4531 if (css) {
4396 percpu_ref_cancel_init(&css->refcnt); 4532 percpu_ref_cancel_init(&css->refcnt);
4397 ss->css_free(cgrp); 4533 ss->css_free(css);
4398 } 4534 }
4399 } 4535 }
4400 mutex_unlock(&cgroup_mutex); 4536 mutex_unlock(&cgroup_mutex);
4401 /* Release the reference count that we took on the superblock */ 4537 /* Release the reference count that we took on the superblock */
4402 deactivate_super(sb); 4538 deactivate_super(sb);
4403err_free_id: 4539err_free_id:
4404 ida_simple_remove(&root->cgroup_ida, cgrp->id); 4540 idr_remove(&root->cgroup_idr, cgrp->id);
4405err_free_name: 4541err_free_name:
4406 kfree(rcu_dereference_raw(cgrp->name)); 4542 kfree(rcu_dereference_raw(cgrp->name));
4407err_free_cgrp: 4543err_free_cgrp:
@@ -4423,22 +4559,84 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4423 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4559 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4424} 4560}
4425 4561
4426static void cgroup_css_killed(struct cgroup *cgrp) 4562/*
4563 * This is called when the refcnt of a css is confirmed to be killed.
4564 * css_tryget() is now guaranteed to fail.
4565 */
4566static void css_killed_work_fn(struct work_struct *work)
4427{ 4567{
4428 if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) 4568 struct cgroup_subsys_state *css =
4429 return; 4569 container_of(work, struct cgroup_subsys_state, destroy_work);
4570 struct cgroup *cgrp = css->cgroup;
4430 4571
4431 /* percpu ref's of all css's are killed, kick off the next step */ 4572 mutex_lock(&cgroup_mutex);
4432 INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); 4573
4433 schedule_work(&cgrp->destroy_work); 4574 /*
4575 * css_tryget() is guaranteed to fail now. Tell subsystems to
 4576 * initiate destruction.
4577 */
4578 offline_css(css);
4579
4580 /*
4581 * If @cgrp is marked dead, it's waiting for refs of all css's to
4582 * be disabled before proceeding to the second phase of cgroup
4583 * destruction. If we are the last one, kick it off.
4584 */
4585 if (!cgrp->nr_css && cgroup_is_dead(cgrp))
4586 cgroup_destroy_css_killed(cgrp);
4587
4588 mutex_unlock(&cgroup_mutex);
4589
4590 /*
4591 * Put the css refs from kill_css(). Each css holds an extra
4592 * reference to the cgroup's dentry and cgroup removal proceeds
4593 * regardless of css refs. On the last put of each css, whenever
4594 * that may be, the extra dentry ref is put so that dentry
4595 * destruction happens only after all css's are released.
4596 */
4597 css_put(css);
4434} 4598}
4435 4599
4436static void css_ref_killed_fn(struct percpu_ref *ref) 4600/* css kill confirmation processing requires process context, bounce */
4601static void css_killed_ref_fn(struct percpu_ref *ref)
4437{ 4602{
4438 struct cgroup_subsys_state *css = 4603 struct cgroup_subsys_state *css =
4439 container_of(ref, struct cgroup_subsys_state, refcnt); 4604 container_of(ref, struct cgroup_subsys_state, refcnt);
4440 4605
4441 cgroup_css_killed(css->cgroup); 4606 INIT_WORK(&css->destroy_work, css_killed_work_fn);
4607 schedule_work(&css->destroy_work);
4608}
4609
4610/**
4611 * kill_css - destroy a css
4612 * @css: css to destroy
4613 *
4614 * This function initiates destruction of @css by removing cgroup interface
4615 * files and putting its base reference. ->css_offline() will be invoked
4616 * asynchronously once css_tryget() is guaranteed to fail and when the
4617 * reference count reaches zero, @css will be released.
4618 */
4619static void kill_css(struct cgroup_subsys_state *css)
4620{
4621 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
4622
4623 /*
4624 * Killing would put the base ref, but we need to keep it alive
4625 * until after ->css_offline().
4626 */
4627 css_get(css);
4628
4629 /*
4630 * cgroup core guarantees that, by the time ->css_offline() is
4631 * invoked, no new css reference will be given out via
4632 * css_tryget(). We can't simply call percpu_ref_kill() and
4633 * proceed to offlining css's because percpu_ref_kill() doesn't
4634 * guarantee that the ref is seen as killed on all CPUs on return.
4635 *
4636 * Use percpu_ref_kill_and_confirm() to get notifications as each
4637 * css is confirmed to be seen as killed on all CPUs.
4638 */
4639 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4442} 4640}
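kill_css() above relies on percpu_ref_kill_and_confirm() rather than plain percpu_ref_kill() because only the _and_confirm variant reports, via the confirmation callback, the point at which the kill is visible on every CPU and tryget can no longer succeed. The callback runs in atomic context, so the actual offlining is punted to a work item. Stripped of the cgroup specifics (the obj_* names are hypothetical; the ref's release callback, not shown, eventually frees the object as in the earlier sketch):

#include <linux/percpu-refcount.h>
#include <linux/workqueue.h>

struct obj {
	struct percpu_ref refcnt;
	struct work_struct destroy_work;
};

static void obj_offline(struct obj *o)
{
	/* teardown that must not race with new, successful tryget()s */
}

/* process context, runs once percpu_ref_tryget() fails everywhere */
static void obj_killed_work_fn(struct work_struct *work)
{
	struct obj *o = container_of(work, struct obj, destroy_work);

	obj_offline(o);
	percpu_ref_put(&o->refcnt);	/* drop the ref taken in obj_kill() */
}

/* confirmation callback: atomic context, bounce to a workqueue */
static void obj_killed_ref_fn(struct percpu_ref *ref)
{
	struct obj *o = container_of(ref, struct obj, refcnt);

	INIT_WORK(&o->destroy_work, obj_killed_work_fn);
	schedule_work(&o->destroy_work);
}

static void obj_kill(struct obj *o)
{
	/* keep @o pinned until obj_offline() has run */
	percpu_ref_get(&o->refcnt);

	/*
	 * Plain percpu_ref_kill() gives no guarantee that the kill is
	 * visible on all CPUs when it returns; the _and_confirm variant
	 * calls back once that is true.
	 */
	percpu_ref_kill_and_confirm(&o->refcnt, obj_killed_ref_fn);
}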
4443 4641
4444/** 4642/**
@@ -4471,6 +4669,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4471 struct dentry *d = cgrp->dentry; 4669 struct dentry *d = cgrp->dentry;
4472 struct cgroup_event *event, *tmp; 4670 struct cgroup_event *event, *tmp;
4473 struct cgroup_subsys *ss; 4671 struct cgroup_subsys *ss;
4672 struct cgroup *child;
4474 bool empty; 4673 bool empty;
4475 4674
4476 lockdep_assert_held(&d->d_inode->i_mutex); 4675 lockdep_assert_held(&d->d_inode->i_mutex);
@@ -4481,47 +4680,41 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4481 * @cgrp from being removed while __put_css_set() is in progress. 4680 * @cgrp from being removed while __put_css_set() is in progress.
4482 */ 4681 */
4483 read_lock(&css_set_lock); 4682 read_lock(&css_set_lock);
4484 empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children); 4683 empty = list_empty(&cgrp->cset_links);
4485 read_unlock(&css_set_lock); 4684 read_unlock(&css_set_lock);
4486 if (!empty) 4685 if (!empty)
4487 return -EBUSY; 4686 return -EBUSY;
4488 4687
4489 /* 4688 /*
 4490 * Block new css_tryget() by killing css refcnts. cgroup core 4689 * Make sure there are no live children. We can't test ->children
4491 * guarantees that, by the time ->css_offline() is invoked, no new 4690 * emptiness as dead children linger on it while being destroyed;
4492 * css reference will be given out via css_tryget(). We can't 4691 * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
4493 * simply call percpu_ref_kill() and proceed to offlining css's
4494 * because percpu_ref_kill() doesn't guarantee that the ref is seen
4495 * as killed on all CPUs on return.
4496 *
4497 * Use percpu_ref_kill_and_confirm() to get notifications as each
4498 * css is confirmed to be seen as killed on all CPUs. The
4499 * notification callback keeps track of the number of css's to be
4500 * killed and schedules cgroup_offline_fn() to perform the rest of
4501 * destruction once the percpu refs of all css's are confirmed to
4502 * be killed.
4503 */ 4692 */
4504 atomic_set(&cgrp->css_kill_cnt, 1); 4693 empty = true;
4505 for_each_root_subsys(cgrp->root, ss) { 4694 rcu_read_lock();
4506 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4695 list_for_each_entry_rcu(child, &cgrp->children, sibling) {
4507 4696 empty = cgroup_is_dead(child);
4508 /* 4697 if (!empty)
4509 * Killing would put the base ref, but we need to keep it 4698 break;
4510 * alive until after ->css_offline.
4511 */
4512 percpu_ref_get(&css->refcnt);
4513
4514 atomic_inc(&cgrp->css_kill_cnt);
4515 percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
4516 } 4699 }
4517 cgroup_css_killed(cgrp); 4700 rcu_read_unlock();
4701 if (!empty)
4702 return -EBUSY;
4703
4704 /*
4705 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4706 * will be invoked to perform the rest of destruction once the
4707 * percpu refs of all css's are confirmed to be killed.
4708 */
4709 for_each_root_subsys(cgrp->root, ss)
4710 kill_css(cgroup_css(cgrp, ss));
4518 4711
4519 /* 4712 /*
4520 * Mark @cgrp dead. This prevents further task migration and child 4713 * Mark @cgrp dead. This prevents further task migration and child
4521 * creation by disabling cgroup_lock_live_group(). Note that 4714 * creation by disabling cgroup_lock_live_group(). Note that
4522 * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to 4715 * CGRP_DEAD assertion is depended upon by css_next_child() to
4523 * resume iteration after dropping RCU read lock. See 4716 * resume iteration after dropping RCU read lock. See
4524 * cgroup_next_sibling() for details. 4717 * css_next_child() for details.
4525 */ 4718 */
4526 set_bit(CGRP_DEAD, &cgrp->flags); 4719 set_bit(CGRP_DEAD, &cgrp->flags);
4527 4720
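The child check above deliberately avoids list_empty(&cgrp->children): a child that has been rmdir'd but is still waiting out its RCU grace period stays linked on ->children, and treating it as live would make "rmdir parent/child parent" fail with -EBUSY. For readability, the added lines gathered into one helper (this only restates the hunk; cgroup_is_dead() and the ->children/->sibling fields are the kernel-internal ones the hunk itself uses):

#include <linux/rculist.h>
#include <linux/rcupdate.h>

/* true iff every child still on the RCU-protected list is already dead */
static bool no_live_children(struct cgroup *cgrp)
{
	struct cgroup *child;
	bool ret = true;

	rcu_read_lock();
	list_for_each_entry_rcu(child, &cgrp->children, sibling) {
		if (!cgroup_is_dead(child)) {
			ret = false;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}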
@@ -4532,9 +4725,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4532 raw_spin_unlock(&release_list_lock); 4725 raw_spin_unlock(&release_list_lock);
4533 4726
4534 /* 4727 /*
4535 * Remove @cgrp directory. The removal puts the base ref but we 4728 * If @cgrp has css's attached, the second stage of cgroup
4536 * aren't quite done with @cgrp yet, so hold onto it. 4729 * destruction is kicked off from css_killed_work_fn() after the
4730 * refs of all attached css's are killed. If @cgrp doesn't have
4731 * any css, we kick it off here.
4537 */ 4732 */
4733 if (!cgrp->nr_css)
4734 cgroup_destroy_css_killed(cgrp);
4735
4736 /*
4737 * Clear the base files and remove @cgrp directory. The removal
4738 * puts the base ref but we aren't quite done with @cgrp yet, so
4739 * hold onto it.
4740 */
4741 cgroup_addrm_files(cgrp, cgroup_base_files, false);
4538 dget(d); 4742 dget(d);
4539 cgroup_d_remove_dir(d); 4743 cgroup_d_remove_dir(d);
4540 4744
@@ -4554,50 +4758,36 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4554}; 4758};
4555 4759
4556/** 4760/**
4557 * cgroup_offline_fn - the second step of cgroup destruction 4761 * cgroup_destroy_css_killed - the second step of cgroup destruction
4558 * @work: cgroup->destroy_free_work 4762 * @work: cgroup->destroy_free_work
4559 * 4763 *
4560 * This function is invoked from a work item for a cgroup which is being 4764 * This function is invoked from a work item for a cgroup which is being
4561 * destroyed after the percpu refcnts of all css's are guaranteed to be 4765 * destroyed after all css's are offlined and performs the rest of
4562 * seen as killed on all CPUs, and performs the rest of destruction. This 4766 * destruction. This is the second step of destruction described in the
4563 * is the second step of destruction described in the comment above 4767 * comment above cgroup_destroy_locked().
4564 * cgroup_destroy_locked().
4565 */ 4768 */
4566static void cgroup_offline_fn(struct work_struct *work) 4769static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4567{ 4770{
4568 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
4569 struct cgroup *parent = cgrp->parent; 4771 struct cgroup *parent = cgrp->parent;
4570 struct dentry *d = cgrp->dentry; 4772 struct dentry *d = cgrp->dentry;
4571 struct cgroup_subsys *ss;
4572 4773
4573 mutex_lock(&cgroup_mutex); 4774 lockdep_assert_held(&cgroup_mutex);
4574 4775
4575 /* 4776 /* delete this cgroup from parent->children */
4576 * css_tryget() is guaranteed to fail now. Tell subsystems to 4777 list_del_rcu(&cgrp->sibling);
4577 * initate destruction.
4578 */
4579 for_each_root_subsys(cgrp->root, ss)
4580 offline_css(ss, cgrp);
4581 4778
4582 /* 4779 /*
4583 * Put the css refs from cgroup_destroy_locked(). Each css holds 4780 * We should remove the cgroup object from idr before its grace
4584 * an extra reference to the cgroup's dentry and cgroup removal 4781 * period starts, so we won't be looking up a cgroup while the
4585 * proceeds regardless of css refs. On the last put of each css, 4782 * cgroup is being freed.
4586 * whenever that may be, the extra dentry ref is put so that dentry
4587 * destruction happens only after all css's are released.
4588 */ 4783 */
4589 for_each_root_subsys(cgrp->root, ss) 4784 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4590 css_put(cgrp->subsys[ss->subsys_id]); 4785 cgrp->id = -1;
4591
4592 /* delete this cgroup from parent->children */
4593 list_del_rcu(&cgrp->sibling);
4594 4786
4595 dput(d); 4787 dput(d);
4596 4788
4597 set_bit(CGRP_RELEASABLE, &parent->flags); 4789 set_bit(CGRP_RELEASABLE, &parent->flags);
4598 check_for_release(parent); 4790 check_for_release(parent);
4599
4600 mutex_unlock(&cgroup_mutex);
4601} 4791}
4602 4792
4603static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4793static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -4620,6 +4810,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4620 * deregistration. 4810 * deregistration.
4621 */ 4811 */
4622 if (ss->base_cftypes) { 4812 if (ss->base_cftypes) {
4813 struct cftype *cft;
4814
4815 for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++)
4816 cft->ss = ss;
4817
4623 ss->base_cftset.cfts = ss->base_cftypes; 4818 ss->base_cftset.cfts = ss->base_cftypes;
4624 list_add_tail(&ss->base_cftset.node, &ss->cftsets); 4819 list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4625 } 4820 }
@@ -4639,10 +4834,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4639 /* Create the top cgroup state for this subsystem */ 4834 /* Create the top cgroup state for this subsystem */
4640 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); 4835 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4641 ss->root = &cgroup_dummy_root; 4836 ss->root = &cgroup_dummy_root;
4642 css = ss->css_alloc(cgroup_dummy_top); 4837 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4643 /* We don't handle early failures gracefully */ 4838 /* We don't handle early failures gracefully */
4644 BUG_ON(IS_ERR(css)); 4839 BUG_ON(IS_ERR(css));
4645 init_cgroup_css(css, ss, cgroup_dummy_top); 4840 init_css(css, ss, cgroup_dummy_top);
4646 4841
4647 /* Update the init_css_set to contain a subsys 4842 /* Update the init_css_set to contain a subsys
4648 * pointer to this state - since the subsystem is 4843 * pointer to this state - since the subsystem is
@@ -4657,7 +4852,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4657 * need to invoke fork callbacks here. */ 4852 * need to invoke fork callbacks here. */
4658 BUG_ON(!list_empty(&init_task.tasks)); 4853 BUG_ON(!list_empty(&init_task.tasks));
4659 4854
4660 BUG_ON(online_css(ss, cgroup_dummy_top)); 4855 BUG_ON(online_css(css));
4661 4856
4662 mutex_unlock(&cgroup_mutex); 4857 mutex_unlock(&cgroup_mutex);
4663 4858
@@ -4718,7 +4913,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4718 * struct, so this can happen first (i.e. before the dummy root 4913 * struct, so this can happen first (i.e. before the dummy root
4719 * attachment). 4914 * attachment).
4720 */ 4915 */
4721 css = ss->css_alloc(cgroup_dummy_top); 4916 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4722 if (IS_ERR(css)) { 4917 if (IS_ERR(css)) {
4723 /* failure case - need to deassign the cgroup_subsys[] slot. */ 4918 /* failure case - need to deassign the cgroup_subsys[] slot. */
4724 cgroup_subsys[ss->subsys_id] = NULL; 4919 cgroup_subsys[ss->subsys_id] = NULL;
@@ -4730,8 +4925,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4730 ss->root = &cgroup_dummy_root; 4925 ss->root = &cgroup_dummy_root;
4731 4926
4732 /* our new subsystem will be attached to the dummy hierarchy. */ 4927 /* our new subsystem will be attached to the dummy hierarchy. */
4733 init_cgroup_css(css, ss, cgroup_dummy_top); 4928 init_css(css, ss, cgroup_dummy_top);
4734 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4929 /* init_idr must be after init_css() because it sets css->id. */
4735 if (ss->use_id) { 4930 if (ss->use_id) {
4736 ret = cgroup_init_idr(ss, css); 4931 ret = cgroup_init_idr(ss, css);
4737 if (ret) 4932 if (ret)
@@ -4761,7 +4956,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4761 } 4956 }
4762 write_unlock(&css_set_lock); 4957 write_unlock(&css_set_lock);
4763 4958
4764 ret = online_css(ss, cgroup_dummy_top); 4959 ret = online_css(css);
4765 if (ret) 4960 if (ret)
4766 goto err_unload; 4961 goto err_unload;
4767 4962
@@ -4793,14 +4988,14 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4793 4988
4794 /* 4989 /*
4795 * we shouldn't be called if the subsystem is in use, and the use of 4990 * we shouldn't be called if the subsystem is in use, and the use of
4796 * try_module_get in parse_cgroupfs_options should ensure that it 4991 * try_module_get() in rebind_subsystems() should ensure that it
4797 * doesn't start being used while we're killing it off. 4992 * doesn't start being used while we're killing it off.
4798 */ 4993 */
4799 BUG_ON(ss->root != &cgroup_dummy_root); 4994 BUG_ON(ss->root != &cgroup_dummy_root);
4800 4995
4801 mutex_lock(&cgroup_mutex); 4996 mutex_lock(&cgroup_mutex);
4802 4997
4803 offline_css(ss, cgroup_dummy_top); 4998 offline_css(cgroup_css(cgroup_dummy_top, ss));
4804 4999
4805 if (ss->use_id) 5000 if (ss->use_id)
4806 idr_destroy(&ss->idr); 5001 idr_destroy(&ss->idr);
@@ -4834,8 +5029,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4834 * the cgrp->subsys pointer to find their state. note that this 5029 * the cgrp->subsys pointer to find their state. note that this
4835 * also takes care of freeing the css_id. 5030 * also takes care of freeing the css_id.
4836 */ 5031 */
4837 ss->css_free(cgroup_dummy_top); 5032 ss->css_free(cgroup_css(cgroup_dummy_top, ss));
4838 cgroup_dummy_top->subsys[ss->subsys_id] = NULL; 5033 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4839 5034
4840 mutex_unlock(&cgroup_mutex); 5035 mutex_unlock(&cgroup_mutex);
4841} 5036}
@@ -4917,6 +5112,10 @@ int __init cgroup_init(void)
4917 5112
4918 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); 5113 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
4919 5114
5115 err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
5116 0, 1, GFP_KERNEL);
5117 BUG_ON(err < 0);
5118
4920 mutex_unlock(&cgroup_root_mutex); 5119 mutex_unlock(&cgroup_root_mutex);
4921 mutex_unlock(&cgroup_mutex); 5120 mutex_unlock(&cgroup_mutex);
4922 5121
@@ -5073,7 +5272,7 @@ void cgroup_fork(struct task_struct *child)
5073 * Adds the task to the list running through its css_set if necessary and 5272 * Adds the task to the list running through its css_set if necessary and
5074 * call the subsystem fork() callbacks. Has to be after the task is 5273 * call the subsystem fork() callbacks. Has to be after the task is
5075 * visible on the task list in case we race with the first call to 5274 * visible on the task list in case we race with the first call to
5076 * cgroup_iter_start() - to guarantee that the new task ends up on its 5275 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5077 * list. 5276 * list.
5078 */ 5277 */
5079void cgroup_post_fork(struct task_struct *child) 5278void cgroup_post_fork(struct task_struct *child)
@@ -5186,10 +5385,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5186 */ 5385 */
5187 for_each_builtin_subsys(ss, i) { 5386 for_each_builtin_subsys(ss, i) {
5188 if (ss->exit) { 5387 if (ss->exit) {
5189 struct cgroup *old_cgrp = cset->subsys[i]->cgroup; 5388 struct cgroup_subsys_state *old_css = cset->subsys[i];
5190 struct cgroup *cgrp = task_cgroup(tsk, i); 5389 struct cgroup_subsys_state *css = task_css(tsk, i);
5191 5390
5192 ss->exit(cgrp, old_cgrp, tsk); 5391 ss->exit(css, old_css, tsk);
5193 } 5392 }
5194 } 5393 }
5195 } 5394 }
@@ -5448,20 +5647,16 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
5448 return 0; 5647 return 0;
5449} 5648}
5450 5649
5451static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, 5650static int alloc_css_id(struct cgroup_subsys_state *child_css)
5452 struct cgroup *child)
5453{ 5651{
5454 int subsys_id, i, depth = 0; 5652 struct cgroup_subsys_state *parent_css = css_parent(child_css);
5455 struct cgroup_subsys_state *parent_css, *child_css;
5456 struct css_id *child_id, *parent_id; 5653 struct css_id *child_id, *parent_id;
5654 int i, depth;
5457 5655
5458 subsys_id = ss->subsys_id;
5459 parent_css = parent->subsys[subsys_id];
5460 child_css = child->subsys[subsys_id];
5461 parent_id = rcu_dereference_protected(parent_css->id, true); 5656 parent_id = rcu_dereference_protected(parent_css->id, true);
5462 depth = parent_id->depth + 1; 5657 depth = parent_id->depth + 1;
5463 5658
5464 child_id = get_new_cssid(ss, depth); 5659 child_id = get_new_cssid(child_css->ss, depth);
5465 if (IS_ERR(child_id)) 5660 if (IS_ERR(child_id))
5466 return PTR_ERR(child_id); 5661 return PTR_ERR(child_id);
5467 5662
@@ -5499,31 +5694,56 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
5499} 5694}
5500EXPORT_SYMBOL_GPL(css_lookup); 5695EXPORT_SYMBOL_GPL(css_lookup);
5501 5696
5502/* 5697/**
5503 * get corresponding css from file open on cgroupfs directory 5698 * css_from_dir - get corresponding css from the dentry of a cgroup dir
5699 * @dentry: directory dentry of interest
5700 * @ss: subsystem of interest
5701 *
5702 * Must be called under RCU read lock. The caller is responsible for
5703 * pinning the returned css if it needs to be accessed outside the RCU
5704 * critical section.
5504 */ 5705 */
5505struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) 5706struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
5707 struct cgroup_subsys *ss)
5506{ 5708{
5507 struct cgroup *cgrp; 5709 struct cgroup *cgrp;
5508 struct inode *inode;
5509 struct cgroup_subsys_state *css;
5510 5710
5511 inode = file_inode(f); 5711 WARN_ON_ONCE(!rcu_read_lock_held());
5512 /* check in cgroup filesystem dir */ 5712
5513 if (inode->i_op != &cgroup_dir_inode_operations) 5713 /* is @dentry a cgroup dir? */
5714 if (!dentry->d_inode ||
5715 dentry->d_inode->i_op != &cgroup_dir_inode_operations)
5514 return ERR_PTR(-EBADF); 5716 return ERR_PTR(-EBADF);
5515 5717
5516 if (id < 0 || id >= CGROUP_SUBSYS_COUNT) 5718 cgrp = __d_cgrp(dentry);
5517 return ERR_PTR(-EINVAL); 5719 return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT);
5720}
5518 5721
5519 /* get cgroup */ 5722/**
5520 cgrp = __d_cgrp(f->f_dentry); 5723 * css_from_id - lookup css by id
5521 css = cgrp->subsys[id]; 5724 * @id: the cgroup id
5522 return css ? css : ERR_PTR(-ENOENT); 5725 * @ss: cgroup subsys to be looked into
5726 *
5727 * Returns the css if there's valid one with @id, otherwise returns NULL.
5728 * Should be called under rcu_read_lock().
5729 */
5730struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5731{
5732 struct cgroup *cgrp;
5733
5734 rcu_lockdep_assert(rcu_read_lock_held() ||
5735 lockdep_is_held(&cgroup_mutex),
5736 "css_from_id() needs proper protection");
5737
5738 cgrp = idr_find(&ss->root->cgroup_idr, id);
5739 if (cgrp)
5740 return cgroup_css(cgrp, ss);
5741 return NULL;
5523} 5742}
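Both lookup helpers above only promise that the returned css stays valid for the current RCU read-side critical section; a caller that wants to keep using it afterwards has to pin it with css_tryget() before dropping the lock, precisely because the css may already be in the kill/offline sequence described earlier. A sketch of the expected calling convention (the wrapper name is made up):

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

/* look up a css by id and return it with a reference held, or NULL */
static struct cgroup_subsys_state *get_css_by_id(int id,
						 struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = css_from_id(id, ss);
	if (css && !css_tryget(css))
		css = NULL;		/* found, but already being killed */
	rcu_read_unlock();

	return css;	/* caller eventually drops it with css_put() */
}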
5524 5743
5525#ifdef CONFIG_CGROUP_DEBUG 5744#ifdef CONFIG_CGROUP_DEBUG
5526static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) 5745static struct cgroup_subsys_state *
5746debug_css_alloc(struct cgroup_subsys_state *parent_css)
5527{ 5747{
5528 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5748 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5529 5749
@@ -5533,22 +5753,24 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
5533 return css; 5753 return css;
5534} 5754}
5535 5755
5536static void debug_css_free(struct cgroup *cgrp) 5756static void debug_css_free(struct cgroup_subsys_state *css)
5537{ 5757{
5538 kfree(cgrp->subsys[debug_subsys_id]); 5758 kfree(css);
5539} 5759}
5540 5760
5541static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) 5761static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
5762 struct cftype *cft)
5542{ 5763{
5543 return cgroup_task_count(cgrp); 5764 return cgroup_task_count(css->cgroup);
5544} 5765}
5545 5766
5546static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) 5767static u64 current_css_set_read(struct cgroup_subsys_state *css,
5768 struct cftype *cft)
5547{ 5769{
5548 return (u64)(unsigned long)current->cgroups; 5770 return (u64)(unsigned long)current->cgroups;
5549} 5771}
5550 5772
5551static u64 current_css_set_refcount_read(struct cgroup *cgrp, 5773static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5552 struct cftype *cft) 5774 struct cftype *cft)
5553{ 5775{
5554 u64 count; 5776 u64 count;
@@ -5559,7 +5781,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp,
5559 return count; 5781 return count;
5560} 5782}
5561 5783
5562static int current_css_set_cg_links_read(struct cgroup *cgrp, 5784static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
5563 struct cftype *cft, 5785 struct cftype *cft,
5564 struct seq_file *seq) 5786 struct seq_file *seq)
5565{ 5787{
@@ -5586,14 +5808,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp,
5586} 5808}
5587 5809
5588#define MAX_TASKS_SHOWN_PER_CSS 25 5810#define MAX_TASKS_SHOWN_PER_CSS 25
5589static int cgroup_css_links_read(struct cgroup *cgrp, 5811static int cgroup_css_links_read(struct cgroup_subsys_state *css,
5590 struct cftype *cft, 5812 struct cftype *cft, struct seq_file *seq)
5591 struct seq_file *seq)
5592{ 5813{
5593 struct cgrp_cset_link *link; 5814 struct cgrp_cset_link *link;
5594 5815
5595 read_lock(&css_set_lock); 5816 read_lock(&css_set_lock);
5596 list_for_each_entry(link, &cgrp->cset_links, cset_link) { 5817 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5597 struct css_set *cset = link->cset; 5818 struct css_set *cset = link->cset;
5598 struct task_struct *task; 5819 struct task_struct *task;
5599 int count = 0; 5820 int count = 0;
@@ -5612,9 +5833,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp,
5612 return 0; 5833 return 0;
5613} 5834}
5614 5835
5615static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) 5836static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5616{ 5837{
5617 return test_bit(CGRP_RELEASABLE, &cgrp->flags); 5838 return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
5618} 5839}
5619 5840
5620static struct cftype debug_files[] = { 5841static struct cftype debug_files[] = {
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 75dda1ea5026..f0ff64d0ebaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -45,25 +45,19 @@ struct freezer {
45 spinlock_t lock; 45 spinlock_t lock;
46}; 46};
47 47
48static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) 48static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
49{ 49{
50 return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), 50 return css ? container_of(css, struct freezer, css) : NULL;
51 struct freezer, css);
52} 51}
53 52
54static inline struct freezer *task_freezer(struct task_struct *task) 53static inline struct freezer *task_freezer(struct task_struct *task)
55{ 54{
56 return container_of(task_subsys_state(task, freezer_subsys_id), 55 return css_freezer(task_css(task, freezer_subsys_id));
57 struct freezer, css);
58} 56}
59 57
60static struct freezer *parent_freezer(struct freezer *freezer) 58static struct freezer *parent_freezer(struct freezer *freezer)
61{ 59{
62 struct cgroup *pcg = freezer->css.cgroup->parent; 60 return css_freezer(css_parent(&freezer->css));
63
64 if (pcg)
65 return cgroup_freezer(pcg);
66 return NULL;
67} 61}
68 62
69bool cgroup_freezing(struct task_struct *task) 63bool cgroup_freezing(struct task_struct *task)
@@ -92,7 +86,8 @@ static const char *freezer_state_strs(unsigned int state)
92 86
93struct cgroup_subsys freezer_subsys; 87struct cgroup_subsys freezer_subsys;
94 88
95static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) 89static struct cgroup_subsys_state *
90freezer_css_alloc(struct cgroup_subsys_state *parent_css)
96{ 91{
97 struct freezer *freezer; 92 struct freezer *freezer;
98 93
@@ -105,22 +100,22 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
105} 100}
106 101
107/** 102/**
108 * freezer_css_online - commit creation of a freezer cgroup 103 * freezer_css_online - commit creation of a freezer css
109 * @cgroup: cgroup being created 104 * @css: css being created
110 * 105 *
111 * We're committing to creation of @cgroup. Mark it online and inherit 106 * We're committing to creation of @css. Mark it online and inherit
112 * parent's freezing state while holding both parent's and our 107 * parent's freezing state while holding both parent's and our
113 * freezer->lock. 108 * freezer->lock.
114 */ 109 */
115static int freezer_css_online(struct cgroup *cgroup) 110static int freezer_css_online(struct cgroup_subsys_state *css)
116{ 111{
117 struct freezer *freezer = cgroup_freezer(cgroup); 112 struct freezer *freezer = css_freezer(css);
118 struct freezer *parent = parent_freezer(freezer); 113 struct freezer *parent = parent_freezer(freezer);
119 114
120 /* 115 /*
121 * The following double locking and freezing state inheritance 116 * The following double locking and freezing state inheritance
122 * guarantee that @cgroup can never escape ancestors' freezing 117 * guarantee that @cgroup can never escape ancestors' freezing
123 * states. See cgroup_for_each_descendant_pre() for details. 118 * states. See css_for_each_descendant_pre() for details.
124 */ 119 */
125 if (parent) 120 if (parent)
126 spin_lock_irq(&parent->lock); 121 spin_lock_irq(&parent->lock);
@@ -141,15 +136,15 @@ static int freezer_css_online(struct cgroup *cgroup)
141} 136}
142 137
143/** 138/**
144 * freezer_css_offline - initiate destruction of @cgroup 139 * freezer_css_offline - initiate destruction of a freezer css
145 * @cgroup: cgroup being destroyed 140 * @css: css being destroyed
146 * 141 *
147 * @cgroup is going away. Mark it dead and decrement system_freezing_count 142 * @css is going away. Mark it dead and decrement system_freezing_count if
148 * if it was holding one. 143 * it was holding one.
149 */ 144 */
150static void freezer_css_offline(struct cgroup *cgroup) 145static void freezer_css_offline(struct cgroup_subsys_state *css)
151{ 146{
152 struct freezer *freezer = cgroup_freezer(cgroup); 147 struct freezer *freezer = css_freezer(css);
153 148
154 spin_lock_irq(&freezer->lock); 149 spin_lock_irq(&freezer->lock);
155 150
@@ -161,9 +156,9 @@ static void freezer_css_offline(struct cgroup *cgroup)
161 spin_unlock_irq(&freezer->lock); 156 spin_unlock_irq(&freezer->lock);
162} 157}
163 158
164static void freezer_css_free(struct cgroup *cgroup) 159static void freezer_css_free(struct cgroup_subsys_state *css)
165{ 160{
166 kfree(cgroup_freezer(cgroup)); 161 kfree(css_freezer(css));
167} 162}
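The freezer changes above are the template for converting any controller to the css-based callbacks: embed a cgroup_subsys_state in the controller's per-group state, recover it with container_of(), walk up through css_parent(), and accept css pointers in css_alloc/css_online/css_offline/css_free. A bare-bones controller following the same shape — struct demo and the demo_* names are invented, and the cgroup_subsys registration glue is omitted:

#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/slab.h>

struct demo {
	struct cgroup_subsys_state css;
	/* controller-private state */
};

static inline struct demo *css_demo(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct demo, css) : NULL;
}

static inline struct demo *parent_demo(struct demo *d)
{
	return css_demo(css_parent(&d->css));
}

static struct cgroup_subsys_state *
demo_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct demo *d = kzalloc(sizeof(*d), GFP_KERNEL);

	return d ? &d->css : ERR_PTR(-ENOMEM);
}

static int demo_css_online(struct cgroup_subsys_state *css)
{
	/* inherit state from parent_demo(css_demo(css)), if any */
	return 0;
}

static void demo_css_offline(struct cgroup_subsys_state *css)
{
	/* undo whatever css_online() set up */
}

static void demo_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_demo(css));
}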
168 163
169/* 164/*
@@ -175,25 +170,26 @@ static void freezer_css_free(struct cgroup *cgroup)
175 * @freezer->lock. freezer_attach() makes the new tasks conform to the 170 * @freezer->lock. freezer_attach() makes the new tasks conform to the
176 * current state and all following state changes can see the new tasks. 171 * current state and all following state changes can see the new tasks.
177 */ 172 */
178static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) 173static void freezer_attach(struct cgroup_subsys_state *new_css,
174 struct cgroup_taskset *tset)
179{ 175{
180 struct freezer *freezer = cgroup_freezer(new_cgrp); 176 struct freezer *freezer = css_freezer(new_css);
181 struct task_struct *task; 177 struct task_struct *task;
182 bool clear_frozen = false; 178 bool clear_frozen = false;
183 179
184 spin_lock_irq(&freezer->lock); 180 spin_lock_irq(&freezer->lock);
185 181
186 /* 182 /*
187 * Make the new tasks conform to the current state of @new_cgrp. 183 * Make the new tasks conform to the current state of @new_css.
188 * For simplicity, when migrating any task to a FROZEN cgroup, we 184 * For simplicity, when migrating any task to a FROZEN cgroup, we
189 * revert it to FREEZING and let update_if_frozen() determine the 185 * revert it to FREEZING and let update_if_frozen() determine the
190 * correct state later. 186 * correct state later.
191 * 187 *
192 * Tasks in @tset are on @new_cgrp but may not conform to its 188 * Tasks in @tset are on @new_css but may not conform to its
193 * current state before executing the following - !frozen tasks may 189 * current state before executing the following - !frozen tasks may
194 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. 190 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
195 */ 191 */
196 cgroup_taskset_for_each(task, new_cgrp, tset) { 192 cgroup_taskset_for_each(task, new_css, tset) {
197 if (!(freezer->state & CGROUP_FREEZING)) { 193 if (!(freezer->state & CGROUP_FREEZING)) {
198 __thaw_task(task); 194 __thaw_task(task);
199 } else { 195 } else {
@@ -231,7 +227,7 @@ static void freezer_fork(struct task_struct *task)
231 * The root cgroup is non-freezable, so we can skip the 227 * The root cgroup is non-freezable, so we can skip the
232 * following check. 228 * following check.
233 */ 229 */
234 if (!freezer->css.cgroup->parent) 230 if (!parent_freezer(freezer))
235 goto out; 231 goto out;
236 232
237 spin_lock_irq(&freezer->lock); 233 spin_lock_irq(&freezer->lock);
@@ -244,7 +240,7 @@ out:
244 240
245/** 241/**
246 * update_if_frozen - update whether a cgroup finished freezing 242 * update_if_frozen - update whether a cgroup finished freezing
247 * @cgroup: cgroup of interest 243 * @css: css of interest
248 * 244 *
249 * Once FREEZING is initiated, transition to FROZEN is lazily updated by 245 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
250 * calling this function. If the current state is FREEZING but not FROZEN, 246 * calling this function. If the current state is FREEZING but not FROZEN,
@@ -255,14 +251,14 @@ out:
255 * update_if_frozen() on all descendants prior to invoking this function. 251 * update_if_frozen() on all descendants prior to invoking this function.
256 * 252 *
257 * Task states and freezer state might disagree while tasks are being 253 * Task states and freezer state might disagree while tasks are being
258 * migrated into or out of @cgroup, so we can't verify task states against 254 * migrated into or out of @css, so we can't verify task states against
259 * @freezer state here. See freezer_attach() for details. 255 * @freezer state here. See freezer_attach() for details.
260 */ 256 */
261static void update_if_frozen(struct cgroup *cgroup) 257static void update_if_frozen(struct cgroup_subsys_state *css)
262{ 258{
263 struct freezer *freezer = cgroup_freezer(cgroup); 259 struct freezer *freezer = css_freezer(css);
264 struct cgroup *pos; 260 struct cgroup_subsys_state *pos;
265 struct cgroup_iter it; 261 struct css_task_iter it;
266 struct task_struct *task; 262 struct task_struct *task;
267 263
268 WARN_ON_ONCE(!rcu_read_lock_held()); 264 WARN_ON_ONCE(!rcu_read_lock_held());
@@ -274,8 +270,8 @@ static void update_if_frozen(struct cgroup *cgroup)
274 goto out_unlock; 270 goto out_unlock;
275 271
276 /* are all (live) children frozen? */ 272 /* are all (live) children frozen? */
277 cgroup_for_each_child(pos, cgroup) { 273 css_for_each_child(pos, css) {
278 struct freezer *child = cgroup_freezer(pos); 274 struct freezer *child = css_freezer(pos);
279 275
280 if ((child->state & CGROUP_FREEZER_ONLINE) && 276 if ((child->state & CGROUP_FREEZER_ONLINE) &&
281 !(child->state & CGROUP_FROZEN)) 277 !(child->state & CGROUP_FROZEN))
@@ -283,9 +279,9 @@ static void update_if_frozen(struct cgroup *cgroup)
283 } 279 }
284 280
285 /* are all tasks frozen? */ 281 /* are all tasks frozen? */
286 cgroup_iter_start(cgroup, &it); 282 css_task_iter_start(css, &it);
287 283
288 while ((task = cgroup_iter_next(cgroup, &it))) { 284 while ((task = css_task_iter_next(&it))) {
289 if (freezing(task)) { 285 if (freezing(task)) {
290 /* 286 /*
291 * freezer_should_skip() indicates that the task 287 * freezer_should_skip() indicates that the task
@@ -300,52 +296,49 @@ static void update_if_frozen(struct cgroup *cgroup)
300 296
301 freezer->state |= CGROUP_FROZEN; 297 freezer->state |= CGROUP_FROZEN;
302out_iter_end: 298out_iter_end:
303 cgroup_iter_end(cgroup, &it); 299 css_task_iter_end(&it);
304out_unlock: 300out_unlock:
305 spin_unlock_irq(&freezer->lock); 301 spin_unlock_irq(&freezer->lock);
306} 302}
307 303
308static int freezer_read(struct cgroup *cgroup, struct cftype *cft, 304static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft,
309 struct seq_file *m) 305 struct seq_file *m)
310{ 306{
311 struct cgroup *pos; 307 struct cgroup_subsys_state *pos;
312 308
313 rcu_read_lock(); 309 rcu_read_lock();
314 310
315 /* update states bottom-up */ 311 /* update states bottom-up */
316 cgroup_for_each_descendant_post(pos, cgroup) 312 css_for_each_descendant_post(pos, css)
317 update_if_frozen(pos); 313 update_if_frozen(pos);
318 update_if_frozen(cgroup);
319 314
320 rcu_read_unlock(); 315 rcu_read_unlock();
321 316
322 seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); 317 seq_puts(m, freezer_state_strs(css_freezer(css)->state));
323 seq_putc(m, '\n'); 318 seq_putc(m, '\n');
324 return 0; 319 return 0;
325} 320}
326 321
327static void freeze_cgroup(struct freezer *freezer) 322static void freeze_cgroup(struct freezer *freezer)
328{ 323{
329 struct cgroup *cgroup = freezer->css.cgroup; 324 struct css_task_iter it;
330 struct cgroup_iter it;
331 struct task_struct *task; 325 struct task_struct *task;
332 326
333 cgroup_iter_start(cgroup, &it); 327 css_task_iter_start(&freezer->css, &it);
334 while ((task = cgroup_iter_next(cgroup, &it))) 328 while ((task = css_task_iter_next(&it)))
335 freeze_task(task); 329 freeze_task(task);
336 cgroup_iter_end(cgroup, &it); 330 css_task_iter_end(&it);
337} 331}
338 332
339static void unfreeze_cgroup(struct freezer *freezer) 333static void unfreeze_cgroup(struct freezer *freezer)
340{ 334{
341 struct cgroup *cgroup = freezer->css.cgroup; 335 struct css_task_iter it;
342 struct cgroup_iter it;
343 struct task_struct *task; 336 struct task_struct *task;
344 337
345 cgroup_iter_start(cgroup, &it); 338 css_task_iter_start(&freezer->css, &it);
346 while ((task = cgroup_iter_next(cgroup, &it))) 339 while ((task = css_task_iter_next(&it)))
347 __thaw_task(task); 340 __thaw_task(task);
348 cgroup_iter_end(cgroup, &it); 341 css_task_iter_end(&it);
349} 342}
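freeze_cgroup() and unfreeze_cgroup() above show the whole of the new task-iteration API: walks start from a css rather than a cgroup, and the iterator carries all the state. The generic shape, with the freezer-specific work factored out into a callback (the helper name is invented; the core holds internal locks across the walk in this series, so the callback must not sleep — freeze_task() and __thaw_task() don't):

#include <linux/cgroup.h>
#include <linux/sched.h>

/* apply @fn to every task currently attached to @css */
static void for_each_css_task(struct cgroup_subsys_state *css,
			      void (*fn)(struct task_struct *task))
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		fn(task);
	css_task_iter_end(&it);
}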
350 343
351/** 344/**
@@ -395,12 +388,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
395 */ 388 */
396static void freezer_change_state(struct freezer *freezer, bool freeze) 389static void freezer_change_state(struct freezer *freezer, bool freeze)
397{ 390{
398 struct cgroup *pos; 391 struct cgroup_subsys_state *pos;
399
400 /* update @freezer */
401 spin_lock_irq(&freezer->lock);
402 freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
403 spin_unlock_irq(&freezer->lock);
404 392
405 /* 393 /*
406 * Update all its descendants in pre-order traversal. Each 394 * Update all its descendants in pre-order traversal. Each
@@ -408,24 +396,33 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
408 * CGROUP_FREEZING_PARENT. 396 * CGROUP_FREEZING_PARENT.
409 */ 397 */
410 rcu_read_lock(); 398 rcu_read_lock();
411 cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { 399 css_for_each_descendant_pre(pos, &freezer->css) {
412 struct freezer *pos_f = cgroup_freezer(pos); 400 struct freezer *pos_f = css_freezer(pos);
413 struct freezer *parent = parent_freezer(pos_f); 401 struct freezer *parent = parent_freezer(pos_f);
414 402
415 /*
416 * Our update to @parent->state is already visible which is
417 * all we need. No need to lock @parent. For more info on
418 * synchronization, see freezer_post_create().
419 */
420 spin_lock_irq(&pos_f->lock); 403 spin_lock_irq(&pos_f->lock);
421 freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, 404
422 CGROUP_FREEZING_PARENT); 405 if (pos_f == freezer) {
406 freezer_apply_state(pos_f, freeze,
407 CGROUP_FREEZING_SELF);
408 } else {
409 /*
410 * Our update to @parent->state is already visible
411 * which is all we need. No need to lock @parent.
412 * For more info on synchronization, see
413 * freezer_post_create().
414 */
415 freezer_apply_state(pos_f,
416 parent->state & CGROUP_FREEZING,
417 CGROUP_FREEZING_PARENT);
418 }
419
423 spin_unlock_irq(&pos_f->lock); 420 spin_unlock_irq(&pos_f->lock);
424 } 421 }
425 rcu_read_unlock(); 422 rcu_read_unlock();
426} 423}
427 424
428static int freezer_write(struct cgroup *cgroup, struct cftype *cft, 425static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
429 const char *buffer) 426 const char *buffer)
430{ 427{
431 bool freeze; 428 bool freeze;
@@ -437,20 +434,22 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
437 else 434 else
438 return -EINVAL; 435 return -EINVAL;
439 436
440 freezer_change_state(cgroup_freezer(cgroup), freeze); 437 freezer_change_state(css_freezer(css), freeze);
441 return 0; 438 return 0;
442} 439}
443 440
444static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) 441static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
442 struct cftype *cft)
445{ 443{
446 struct freezer *freezer = cgroup_freezer(cgroup); 444 struct freezer *freezer = css_freezer(css);
447 445
448 return (bool)(freezer->state & CGROUP_FREEZING_SELF); 446 return (bool)(freezer->state & CGROUP_FREEZING_SELF);
449} 447}
450 448
451static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) 449static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
450 struct cftype *cft)
452{ 451{
453 struct freezer *freezer = cgroup_freezer(cgroup); 452 struct freezer *freezer = css_freezer(css);
454 453
455 return (bool)(freezer->state & CGROUP_FREEZING_PARENT); 454 return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
456} 455}
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 383f8231e436..247091bf0587 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -20,22 +20,33 @@
20#include <linux/hardirq.h> 20#include <linux/hardirq.h>
21#include <linux/export.h> 21#include <linux/export.h>
22 22
23DEFINE_PER_CPU(struct context_tracking, context_tracking) = { 23#define CREATE_TRACE_POINTS
24#ifdef CONFIG_CONTEXT_TRACKING_FORCE 24#include <trace/events/context_tracking.h>
25 .active = true, 25
26#endif 26struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;
27}; 27EXPORT_SYMBOL_GPL(context_tracking_enabled);
28
29DEFINE_PER_CPU(struct context_tracking, context_tracking);
30EXPORT_SYMBOL_GPL(context_tracking);
31
32void context_tracking_cpu_set(int cpu)
33{
34 if (!per_cpu(context_tracking.active, cpu)) {
35 per_cpu(context_tracking.active, cpu) = true;
36 static_key_slow_inc(&context_tracking_enabled);
37 }
38}
28 39
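context_tracking_cpu_set() above pairs a per-CPU flag with a global static key: the key is bumped the first time any CPU enables tracking, so the hot-path callers (presumably the renamed user_enter()/user_exit() wrappers on the header side, gated on context_tracking_enabled) stay a patched-out branch on systems that never use it. The enable-once pattern in isolation — the feature_* names are invented, only the jump-label and per-CPU APIs are real:

#include <linux/jump_label.h>
#include <linux/percpu.h>

static struct static_key feature_enabled = STATIC_KEY_INIT_FALSE;
static DEFINE_PER_CPU(bool, feature_active);

/* slow path, called rarely (e.g. boot); safe to call more than once per CPU */
void feature_enable_on(int cpu)
{
	if (!per_cpu(feature_active, cpu)) {
		per_cpu(feature_active, cpu) = true;
		static_key_slow_inc(&feature_enabled);
	}
}

static void feature_slow_work(void)
{
	/* the expensive per-event bookkeeping */
}

/* hot path: compiles to a no-op branch until the key is first incremented */
static inline void feature_hook(void)
{
	if (static_key_false(&feature_enabled))
		feature_slow_work();
}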
29/** 40/**
30 * user_enter - Inform the context tracking that the CPU is going to 41 * context_tracking_user_enter - Inform the context tracking that the CPU is going to
31 * enter userspace mode. 42 * enter userspace mode.
32 * 43 *
33 * This function must be called right before we switch from the kernel 44 * This function must be called right before we switch from the kernel
34 * to userspace, when it's guaranteed the remaining kernel instructions 45 * to userspace, when it's guaranteed the remaining kernel instructions
35 * to execute won't use any RCU read side critical section because this 46 * to execute won't use any RCU read side critical section because this
36 * function sets RCU in extended quiescent state. 47 * function sets RCU in extended quiescent state.
37 */ 48 */
38void user_enter(void) 49void context_tracking_user_enter(void)
39{ 50{
40 unsigned long flags; 51 unsigned long flags;
41 52
@@ -54,17 +65,32 @@ void user_enter(void)
54 WARN_ON_ONCE(!current->mm); 65 WARN_ON_ONCE(!current->mm);
55 66
56 local_irq_save(flags); 67 local_irq_save(flags);
 57 if (__this_cpu_read(context_tracking.active) && 68 if (__this_cpu_read(context_tracking.state) != IN_USER) {
58 __this_cpu_read(context_tracking.state) != IN_USER) { 69 if (__this_cpu_read(context_tracking.active)) {
70 trace_user_enter(0);
71 /*
72 * At this stage, only low level arch entry code remains and
73 * then we'll run in userspace. We can assume there won't be
74 * any RCU read-side critical section until the next call to
75 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
76 * on the tick.
77 */
78 vtime_user_enter(current);
79 rcu_user_enter();
80 }
59 /* 81 /*
60 * At this stage, only low level arch entry code remains and 82 * Even if context tracking is disabled on this CPU, because it's outside
61 * then we'll run in userspace. We can assume there won't be 83 * the full dynticks mask for example, we still have to keep track of the
62 * any RCU read-side critical section until the next call to 84 * context transitions and states to prevent inconsistency on those of
63 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency 85 * other CPUs.
64 * on the tick. 86 * If a task triggers an exception in userspace, sleep on the exception
87 * handler and then migrate to another CPU, that new CPU must know where
88 * the exception returns by the time we call exception_exit().
89 * This information can only be provided by the previous CPU when it called
90 * exception_enter().
91 * OTOH we can spare the calls to vtime and RCU when context_tracking.active
92 * is false because we know that CPU is not tickless.
65 */ 93 */
66 vtime_user_enter(current);
67 rcu_user_enter();
68 __this_cpu_write(context_tracking.state, IN_USER); 94 __this_cpu_write(context_tracking.state, IN_USER);
69 } 95 }
70 local_irq_restore(flags); 96 local_irq_restore(flags);
@@ -87,10 +113,9 @@ void user_enter(void)
87 */ 113 */
88void __sched notrace preempt_schedule_context(void) 114void __sched notrace preempt_schedule_context(void)
89{ 115{
90 struct thread_info *ti = current_thread_info();
91 enum ctx_state prev_ctx; 116 enum ctx_state prev_ctx;
92 117
93 if (likely(ti->preempt_count || irqs_disabled())) 118 if (likely(!preemptible()))
94 return; 119 return;
95 120
96 /* 121 /*
@@ -112,8 +137,8 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context);
112#endif /* CONFIG_PREEMPT */ 137#endif /* CONFIG_PREEMPT */
113 138
114/** 139/**
115 * user_exit - Inform the context tracking that the CPU is 140 * context_tracking_user_exit - Inform the context tracking that the CPU is
116 * exiting userspace mode and entering the kernel. 141 * exiting userspace mode and entering the kernel.
117 * 142 *
118 * This function must be called after we entered the kernel from userspace 143 * This function must be called after we entered the kernel from userspace
119 * before any use of RCU read side critical section. This potentially include 144 * before any use of RCU read side critical section. This potentially include
@@ -122,7 +147,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context);
122 * This call supports re-entrancy. This way it can be called from any exception 147 * This call supports re-entrancy. This way it can be called from any exception
123 * handler without needing to know if we came from userspace or not. 148 * handler without needing to know if we came from userspace or not.
124 */ 149 */
125void user_exit(void) 150void context_tracking_user_exit(void)
126{ 151{
127 unsigned long flags; 152 unsigned long flags;
128 153
@@ -131,38 +156,22 @@ void user_exit(void)
131 156
132 local_irq_save(flags); 157 local_irq_save(flags);
133 if (__this_cpu_read(context_tracking.state) == IN_USER) { 158 if (__this_cpu_read(context_tracking.state) == IN_USER) {
134 /* 159 if (__this_cpu_read(context_tracking.active)) {
135 * We are going to run code that may use RCU. Inform 160 /*
136 * RCU core about that (ie: we may need the tick again). 161 * We are going to run code that may use RCU. Inform
137 */ 162 * RCU core about that (ie: we may need the tick again).
138 rcu_user_exit(); 163 */
139 vtime_user_exit(current); 164 rcu_user_exit();
165 vtime_user_exit(current);
166 trace_user_exit(0);
167 }
140 __this_cpu_write(context_tracking.state, IN_KERNEL); 168 __this_cpu_write(context_tracking.state, IN_KERNEL);
141 } 169 }
142 local_irq_restore(flags); 170 local_irq_restore(flags);
143} 171}
144 172
145void guest_enter(void)
146{
147 if (vtime_accounting_enabled())
148 vtime_guest_enter(current);
149 else
150 __guest_enter();
151}
152EXPORT_SYMBOL_GPL(guest_enter);
153
154void guest_exit(void)
155{
156 if (vtime_accounting_enabled())
157 vtime_guest_exit(current);
158 else
159 __guest_exit();
160}
161EXPORT_SYMBOL_GPL(guest_exit);
162
163
164/** 173/**
165 * context_tracking_task_switch - context switch the syscall callbacks 174 * __context_tracking_task_switch - context switch the syscall callbacks
166 * @prev: the task that is being switched out 175 * @prev: the task that is being switched out
167 * @next: the task that is being switched in 176 * @next: the task that is being switched in
168 * 177 *
@@ -174,11 +183,19 @@ EXPORT_SYMBOL_GPL(guest_exit);
174 * migrate to some CPU that doesn't do the context tracking. As such the TIF 183 * migrate to some CPU that doesn't do the context tracking. As such the TIF
175 * flag may not be desired there. 184 * flag may not be desired there.
176 */ 185 */
177void context_tracking_task_switch(struct task_struct *prev, 186void __context_tracking_task_switch(struct task_struct *prev,
178 struct task_struct *next) 187 struct task_struct *next)
179{ 188{
180 if (__this_cpu_read(context_tracking.active)) { 189 clear_tsk_thread_flag(prev, TIF_NOHZ);
181 clear_tsk_thread_flag(prev, TIF_NOHZ); 190 set_tsk_thread_flag(next, TIF_NOHZ);
182 set_tsk_thread_flag(next, TIF_NOHZ);
183 }
184} 191}
192
193#ifdef CONFIG_CONTEXT_TRACKING_FORCE
194void __init context_tracking_init(void)
195{
196 int cpu;
197
198 for_each_possible_cpu(cpu)
199 context_tracking_cpu_set(cpu);
200}
201#endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 198a38883e64..d7f07a2da5a6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
113 * get_online_cpus() not an api which is called all that often. 113 * get_online_cpus() not an api which is called all that often.
114 * 114 *
115 */ 115 */
116static void cpu_hotplug_begin(void) 116void cpu_hotplug_begin(void)
117{ 117{
118 cpu_hotplug.active_writer = current; 118 cpu_hotplug.active_writer = current;
119 119
@@ -127,7 +127,7 @@ static void cpu_hotplug_begin(void)
127 } 127 }
128} 128}
129 129
130static void cpu_hotplug_done(void) 130void cpu_hotplug_done(void)
131{ 131{
132 cpu_hotplug.active_writer = NULL; 132 cpu_hotplug.active_writer = NULL;
133 mutex_unlock(&cpu_hotplug.lock); 133 mutex_unlock(&cpu_hotplug.lock);
@@ -154,10 +154,7 @@ void cpu_hotplug_enable(void)
154 cpu_maps_update_done(); 154 cpu_maps_update_done();
155} 155}
156 156
157#else /* #if CONFIG_HOTPLUG_CPU */ 157#endif /* CONFIG_HOTPLUG_CPU */
158static void cpu_hotplug_begin(void) {}
159static void cpu_hotplug_done(void) {}
160#endif /* #else #if CONFIG_HOTPLUG_CPU */
161 158
162/* Need to know about CPUs going up/down? */ 159/* Need to know about CPUs going up/down? */
163int __ref register_cpu_notifier(struct notifier_block *nb) 160int __ref register_cpu_notifier(struct notifier_block *nb)
@@ -366,7 +363,7 @@ EXPORT_SYMBOL(cpu_down);
366#endif /*CONFIG_HOTPLUG_CPU*/ 363#endif /*CONFIG_HOTPLUG_CPU*/
367 364
368/* Requires cpu_add_remove_lock to be held */ 365/* Requires cpu_add_remove_lock to be held */
369static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) 366static int _cpu_up(unsigned int cpu, int tasks_frozen)
370{ 367{
371 int ret, nr_calls = 0; 368 int ret, nr_calls = 0;
372 void *hcpu = (void *)(long)cpu; 369 void *hcpu = (void *)(long)cpu;
@@ -419,7 +416,7 @@ out:
419 return ret; 416 return ret;
420} 417}
421 418
422int __cpuinit cpu_up(unsigned int cpu) 419int cpu_up(unsigned int cpu)
423{ 420{
424 int err = 0; 421 int err = 0;
425 422
@@ -618,7 +615,7 @@ core_initcall(cpu_hotplug_pm_sync_init);
618 * It must be called by the arch code on the new cpu, before the new cpu 615 * It must be called by the arch code on the new cpu, before the new cpu
619 * enables interrupts and before the "boot" cpu returns from __cpu_up(). 616 * enables interrupts and before the "boot" cpu returns from __cpu_up().
620 */ 617 */
621void __cpuinit notify_cpu_starting(unsigned int cpu) 618void notify_cpu_starting(unsigned int cpu)
622{ 619{
623 unsigned long val = CPU_STARTING; 620 unsigned long val = CPU_STARTING;
624 621
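
With cpu_hotplug_begin()/cpu_hotplug_done() no longer static, code outside kernel/cpu.c can take the hotplug writer side itself. A hedged sketch of such a caller, assuming the usual declarations in <linux/cpu.h> and the same lock ordering that _cpu_up()/_cpu_down() use; hypothetical_reconfigure_cpus() is an invented name.

#include <linux/cpu.h>

static void hypothetical_reconfigure_cpus(void)
{
	cpu_maps_update_begin();	/* serialize against other hotplug writers */
	cpu_hotplug_begin();		/* wait for get_online_cpus() readers to drain */

	/* ... touch state that a concurrent CPU up/down must not race with ... */

	cpu_hotplug_done();
	cpu_maps_update_done();
}
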
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e5657788fedd..6bf981e13c43 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -68,10 +68,6 @@
68 */ 68 */
69int number_of_cpusets __read_mostly; 69int number_of_cpusets __read_mostly;
70 70
71/* Forward declare cgroup structures */
72struct cgroup_subsys cpuset_subsys;
73struct cpuset;
74
75/* See "Frequency meter" comments, below. */ 71/* See "Frequency meter" comments, below. */
76 72
77struct fmeter { 73struct fmeter {
@@ -115,27 +111,20 @@ struct cpuset {
115 int relax_domain_level; 111 int relax_domain_level;
116}; 112};
117 113
118/* Retrieve the cpuset for a cgroup */ 114static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
119static inline struct cpuset *cgroup_cs(struct cgroup *cgrp)
120{ 115{
121 return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), 116 return css ? container_of(css, struct cpuset, css) : NULL;
122 struct cpuset, css);
123} 117}
124 118
125/* Retrieve the cpuset for a task */ 119/* Retrieve the cpuset for a task */
126static inline struct cpuset *task_cs(struct task_struct *task) 120static inline struct cpuset *task_cs(struct task_struct *task)
127{ 121{
128 return container_of(task_subsys_state(task, cpuset_subsys_id), 122 return css_cs(task_css(task, cpuset_subsys_id));
129 struct cpuset, css);
130} 123}
131 124
132static inline struct cpuset *parent_cs(const struct cpuset *cs) 125static inline struct cpuset *parent_cs(struct cpuset *cs)
133{ 126{
134 struct cgroup *pcgrp = cs->css.cgroup->parent; 127 return css_cs(css_parent(&cs->css));
135
136 if (pcgrp)
137 return cgroup_cs(pcgrp);
138 return NULL;
139} 128}
140 129
141#ifdef CONFIG_NUMA 130#ifdef CONFIG_NUMA
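
css_cs() above is the container_of() accessor every converted controller now carries; the NULL test makes the root group's nonexistent parent come back as NULL instead of a bogus pointer. The same shape for a made-up controller, purely to show the pattern; struct foo_group, css_foo() and parent_foo() are invented.

#include <linux/cgroup.h>

struct foo_group {
	struct cgroup_subsys_state css;		/* embedded, so container_of() can recover us */
	int weight;
};

static inline struct foo_group *css_foo(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct foo_group, css) : NULL;
}

static inline struct foo_group *parent_foo(struct foo_group *fg)
{
	return css_foo(css_parent(&fg->css));	/* NULL for the root group */
}
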
@@ -212,29 +201,30 @@ static struct cpuset top_cpuset = {
212/** 201/**
213 * cpuset_for_each_child - traverse online children of a cpuset 202 * cpuset_for_each_child - traverse online children of a cpuset
214 * @child_cs: loop cursor pointing to the current child 203 * @child_cs: loop cursor pointing to the current child
215 * @pos_cgrp: used for iteration 204 * @pos_css: used for iteration
216 * @parent_cs: target cpuset to walk children of 205 * @parent_cs: target cpuset to walk children of
217 * 206 *
218 * Walk @child_cs through the online children of @parent_cs. Must be used 207 * Walk @child_cs through the online children of @parent_cs. Must be used
219 * with RCU read locked. 208 * with RCU read locked.
220 */ 209 */
221#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ 210#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
222 cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ 211 css_for_each_child((pos_css), &(parent_cs)->css) \
223 if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) 212 if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
224 213
225/** 214/**
226 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants 215 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
227 * @des_cs: loop cursor pointing to the current descendant 216 * @des_cs: loop cursor pointing to the current descendant
228 * @pos_cgrp: used for iteration 217 * @pos_css: used for iteration
229 * @root_cs: target cpuset to walk ancestor of 218 * @root_cs: target cpuset to walk ancestor of
230 * 219 *
231 * Walk @des_cs through the online descendants of @root_cs. Must be used 220 * Walk @des_cs through the online descendants of @root_cs. Must be used
232 * with RCU read locked. The caller may modify @pos_cgrp by calling 221 * with RCU read locked. The caller may modify @pos_css by calling
233 * cgroup_rightmost_descendant() to skip subtree. 222 * css_rightmost_descendant() to skip subtree. @root_cs is included in the
223 * iteration and the first node to be visited.
234 */ 224 */
235#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ 225#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
236 cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ 226 css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
237 if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) 227 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
238 228
239/* 229/*
240 * There are two global mutexes guarding cpuset structures - cpuset_mutex 230 * There are two global mutexes guarding cpuset structures - cpuset_mutex
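
Because css_for_each_descendant_pre() now visits the root of the walk first, every converted loop in this file either handles @root_cs explicitly or skips it. An invented walker, written as if it lived in kernel/cpuset.c, showing both that skip and the css_rightmost_descendant() subtree pruning:

static int hypothetical_count_populated_descendants(struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;
	int n = 0;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
		if (cp == root_cs)
			continue;	/* the walk now starts at @root_cs itself */

		if (cpumask_empty(cp->cpus_allowed)) {
			/* nothing of interest below here, prune the subtree */
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}
		n++;
	}
	rcu_read_unlock();

	return n;
}
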
@@ -320,8 +310,7 @@ static struct file_system_type cpuset_fs_type = {
320 * 310 *
321 * Call with callback_mutex held. 311 * Call with callback_mutex held.
322 */ 312 */
323static void guarantee_online_cpus(const struct cpuset *cs, 313static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
324 struct cpumask *pmask)
325{ 314{
326 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 315 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
327 cs = parent_cs(cs); 316 cs = parent_cs(cs);
@@ -339,7 +328,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
339 * 328 *
340 * Call with callback_mutex held. 329 * Call with callback_mutex held.
341 */ 330 */
342static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 331static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
343{ 332{
344 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) 333 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
345 cs = parent_cs(cs); 334 cs = parent_cs(cs);
@@ -384,7 +373,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
384 * alloc_trial_cpuset - allocate a trial cpuset 373 * alloc_trial_cpuset - allocate a trial cpuset
385 * @cs: the cpuset that the trial cpuset duplicates 374 * @cs: the cpuset that the trial cpuset duplicates
386 */ 375 */
387static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) 376static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
388{ 377{
389 struct cpuset *trial; 378 struct cpuset *trial;
390 379
@@ -431,9 +420,9 @@ static void free_trial_cpuset(struct cpuset *trial)
431 * Return 0 if valid, -errno if not. 420 * Return 0 if valid, -errno if not.
432 */ 421 */
433 422
434static int validate_change(const struct cpuset *cur, const struct cpuset *trial) 423static int validate_change(struct cpuset *cur, struct cpuset *trial)
435{ 424{
436 struct cgroup *cgrp; 425 struct cgroup_subsys_state *css;
437 struct cpuset *c, *par; 426 struct cpuset *c, *par;
438 int ret; 427 int ret;
439 428
@@ -441,7 +430,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
441 430
442 /* Each of our child cpusets must be a subset of us */ 431 /* Each of our child cpusets must be a subset of us */
443 ret = -EBUSY; 432 ret = -EBUSY;
444 cpuset_for_each_child(c, cgrp, cur) 433 cpuset_for_each_child(c, css, cur)
445 if (!is_cpuset_subset(c, trial)) 434 if (!is_cpuset_subset(c, trial))
446 goto out; 435 goto out;
447 436
@@ -462,7 +451,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
462 * overlap 451 * overlap
463 */ 452 */
464 ret = -EINVAL; 453 ret = -EINVAL;
465 cpuset_for_each_child(c, cgrp, par) { 454 cpuset_for_each_child(c, css, par) {
466 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 455 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
467 c != cur && 456 c != cur &&
468 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 457 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -475,13 +464,17 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
475 464
476 /* 465 /*
477 * Cpusets with tasks - existing or newly being attached - can't 466 * Cpusets with tasks - existing or newly being attached - can't
478 * have empty cpus_allowed or mems_allowed. 467 * be changed to have empty cpus_allowed or mems_allowed.
479 */ 468 */
480 ret = -ENOSPC; 469 ret = -ENOSPC;
481 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && 470 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) {
482 (cpumask_empty(trial->cpus_allowed) && 471 if (!cpumask_empty(cur->cpus_allowed) &&
483 nodes_empty(trial->mems_allowed))) 472 cpumask_empty(trial->cpus_allowed))
484 goto out; 473 goto out;
474 if (!nodes_empty(cur->mems_allowed) &&
475 nodes_empty(trial->mems_allowed))
476 goto out;
477 }
485 478
486 ret = 0; 479 ret = 0;
487out: 480out:
@@ -511,13 +504,16 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
511 struct cpuset *root_cs) 504 struct cpuset *root_cs)
512{ 505{
513 struct cpuset *cp; 506 struct cpuset *cp;
514 struct cgroup *pos_cgrp; 507 struct cgroup_subsys_state *pos_css;
515 508
516 rcu_read_lock(); 509 rcu_read_lock();
517 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 510 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
511 if (cp == root_cs)
512 continue;
513
518 /* skip the whole subtree if @cp doesn't have any CPU */ 514 /* skip the whole subtree if @cp doesn't have any CPU */
519 if (cpumask_empty(cp->cpus_allowed)) { 515 if (cpumask_empty(cp->cpus_allowed)) {
520 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 516 pos_css = css_rightmost_descendant(pos_css);
521 continue; 517 continue;
522 } 518 }
523 519
@@ -592,7 +588,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
592 struct sched_domain_attr *dattr; /* attributes for custom domains */ 588 struct sched_domain_attr *dattr; /* attributes for custom domains */
593 int ndoms = 0; /* number of sched domains in result */ 589 int ndoms = 0; /* number of sched domains in result */
594 int nslot; /* next empty doms[] struct cpumask slot */ 590 int nslot; /* next empty doms[] struct cpumask slot */
595 struct cgroup *pos_cgrp; 591 struct cgroup_subsys_state *pos_css;
596 592
597 doms = NULL; 593 doms = NULL;
598 dattr = NULL; 594 dattr = NULL;
@@ -621,7 +617,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
621 csn = 0; 617 csn = 0;
622 618
623 rcu_read_lock(); 619 rcu_read_lock();
624 cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { 620 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
621 if (cp == &top_cpuset)
622 continue;
625 /* 623 /*
626 * Continue traversing beyond @cp iff @cp has some CPUs and 624 * Continue traversing beyond @cp iff @cp has some CPUs and
627 * isn't load balancing. The former is obvious. The 625 * isn't load balancing. The former is obvious. The
@@ -638,7 +636,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
638 csa[csn++] = cp; 636 csa[csn++] = cp;
639 637
640 /* skip @cp's subtree */ 638 /* skip @cp's subtree */
641 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 639 pos_css = css_rightmost_descendant(pos_css);
642 } 640 }
643 rcu_read_unlock(); 641 rcu_read_unlock();
644 642
@@ -833,52 +831,45 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
833/** 831/**
834 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's 832 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
835 * @tsk: task to test 833 * @tsk: task to test
836 * @scan: struct cgroup_scanner containing the cgroup of the task 834 * @data: cpuset to @tsk belongs to
837 * 835 *
838 * Called by cgroup_scan_tasks() for each task in a cgroup whose 836 * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed
839 * cpus_allowed mask needs to be changed. 837 * mask needs to be changed.
840 * 838 *
841 * We don't need to re-check for the cgroup/cpuset membership, since we're 839 * We don't need to re-check for the cgroup/cpuset membership, since we're
842 * holding cpuset_mutex at this point. 840 * holding cpuset_mutex at this point.
843 */ 841 */
844static void cpuset_change_cpumask(struct task_struct *tsk, 842static void cpuset_change_cpumask(struct task_struct *tsk, void *data)
845 struct cgroup_scanner *scan)
846{ 843{
847 struct cpuset *cpus_cs; 844 struct cpuset *cs = data;
845 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
848 846
849 cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg));
850 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); 847 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
851} 848}
852 849
853/** 850/**
854 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 851 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
855 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 852 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
856 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 853 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
857 * 854 *
858 * Called with cpuset_mutex held 855 * Called with cpuset_mutex held
859 * 856 *
860 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 857 * The css_scan_tasks() function will scan all the tasks in a cgroup,
861 * calling callback functions for each. 858 * calling callback functions for each.
862 * 859 *
863 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 860 * No return value. It's guaranteed that css_scan_tasks() always returns 0
864 * if @heap != NULL. 861 * if @heap != NULL.
865 */ 862 */
866static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) 863static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
867{ 864{
868 struct cgroup_scanner scan; 865 css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap);
869
870 scan.cg = cs->css.cgroup;
871 scan.test_task = NULL;
872 scan.process_task = cpuset_change_cpumask;
873 scan.heap = heap;
874 cgroup_scan_tasks(&scan);
875} 866}
876 867
877/* 868/*
878 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. 869 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
879 * @root_cs: the root cpuset of the hierarchy 870 * @root_cs: the root cpuset of the hierarchy
880 * @update_root: update root cpuset or not? 871 * @update_root: update root cpuset or not?
881 * @heap: the heap used by cgroup_scan_tasks() 872 * @heap: the heap used by css_scan_tasks()
882 * 873 *
883 * This will update cpumasks of tasks in @root_cs and all other empty cpusets 874 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
884 * which take on cpumask of @root_cs. 875 * which take on cpumask of @root_cs.
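
The conversion above collapses the old five-field cgroup_scanner setup into one css_scan_tasks() call: an optional test callback (NULL means every task), a process callback that receives an opaque void *data cookie, and an optional pre-allocated heap (NULL lets css_scan_tasks() allocate its own). An invented example of the same calling convention; set_user_nice() is a real kernel helper, everything prefixed hypothetical_ is not in the patch.

struct hypothetical_arg {
	int nice;
};

static void hypothetical_set_nice(struct task_struct *task, void *data)
{
	struct hypothetical_arg *arg = data;

	set_user_nice(task, arg->nice);
}

static void hypothetical_renice_cpuset(struct cpuset *cs, int nice)
{
	struct hypothetical_arg arg = { .nice = nice };

	/* NULL test callback: visit every task; NULL heap: allocate one internally. */
	css_scan_tasks(&cs->css, NULL, hypothetical_set_nice, &arg, NULL);
}
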
@@ -889,17 +880,19 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
889 bool update_root, struct ptr_heap *heap) 880 bool update_root, struct ptr_heap *heap)
890{ 881{
891 struct cpuset *cp; 882 struct cpuset *cp;
892 struct cgroup *pos_cgrp; 883 struct cgroup_subsys_state *pos_css;
893
894 if (update_root)
895 update_tasks_cpumask(root_cs, heap);
896 884
897 rcu_read_lock(); 885 rcu_read_lock();
898 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 886 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
899 /* skip the whole subtree if @cp has some CPUs */ 887 if (cp == root_cs) {
900 if (!cpumask_empty(cp->cpus_allowed)) { 888 if (!update_root)
901 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 889 continue;
902 continue; 890 } else {
891 /* skip the whole subtree if @cp has some CPUs */
892 if (!cpumask_empty(cp->cpus_allowed)) {
893 pos_css = css_rightmost_descendant(pos_css);
894 continue;
895 }
903 } 896 }
904 if (!css_tryget(&cp->css)) 897 if (!css_tryget(&cp->css))
905 continue; 898 continue;
@@ -1055,20 +1048,24 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1055 task_unlock(tsk); 1048 task_unlock(tsk);
1056} 1049}
1057 1050
1051struct cpuset_change_nodemask_arg {
1052 struct cpuset *cs;
1053 nodemask_t *newmems;
1054};
1055
1058/* 1056/*
1059 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy 1057 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1060 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if 1058 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1061 * memory_migrate flag is set. Called with cpuset_mutex held. 1059 * memory_migrate flag is set. Called with cpuset_mutex held.
1062 */ 1060 */
1063static void cpuset_change_nodemask(struct task_struct *p, 1061static void cpuset_change_nodemask(struct task_struct *p, void *data)
1064 struct cgroup_scanner *scan)
1065{ 1062{
1066 struct cpuset *cs = cgroup_cs(scan->cg); 1063 struct cpuset_change_nodemask_arg *arg = data;
1064 struct cpuset *cs = arg->cs;
1067 struct mm_struct *mm; 1065 struct mm_struct *mm;
1068 int migrate; 1066 int migrate;
1069 nodemask_t *newmems = scan->data;
1070 1067
1071 cpuset_change_task_nodemask(p, newmems); 1068 cpuset_change_task_nodemask(p, arg->newmems);
1072 1069
1073 mm = get_task_mm(p); 1070 mm = get_task_mm(p);
1074 if (!mm) 1071 if (!mm)
@@ -1078,7 +1075,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1078 1075
1079 mpol_rebind_mm(mm, &cs->mems_allowed); 1076 mpol_rebind_mm(mm, &cs->mems_allowed);
1080 if (migrate) 1077 if (migrate)
1081 cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); 1078 cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems);
1082 mmput(mm); 1079 mmput(mm);
1083} 1080}
1084 1081
@@ -1087,28 +1084,22 @@ static void *cpuset_being_rebound;
1087/** 1084/**
1088 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1085 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1089 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1086 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1090 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1087 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1091 * 1088 *
1092 * Called with cpuset_mutex held 1089 * Called with cpuset_mutex held. No return value. It's guaranteed that
1093 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1090 * css_scan_tasks() always returns 0 if @heap != NULL.
1094 * if @heap != NULL.
1095 */ 1091 */
1096static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) 1092static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1097{ 1093{
1098 static nodemask_t newmems; /* protected by cpuset_mutex */ 1094 static nodemask_t newmems; /* protected by cpuset_mutex */
1099 struct cgroup_scanner scan;
1100 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1095 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1096 struct cpuset_change_nodemask_arg arg = { .cs = cs,
1097 .newmems = &newmems };
1101 1098
1102 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1099 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1103 1100
1104 guarantee_online_mems(mems_cs, &newmems); 1101 guarantee_online_mems(mems_cs, &newmems);
1105 1102
1106 scan.cg = cs->css.cgroup;
1107 scan.test_task = NULL;
1108 scan.process_task = cpuset_change_nodemask;
1109 scan.heap = heap;
1110 scan.data = &newmems;
1111
1112 /* 1103 /*
1113 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't 1104 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
1114 * take while holding tasklist_lock. Forks can happen - the 1105 * take while holding tasklist_lock. Forks can happen - the
@@ -1119,7 +1110,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1119 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1110 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1120 * is idempotent. Also migrate pages in each mm to new nodes. 1111 * is idempotent. Also migrate pages in each mm to new nodes.
1121 */ 1112 */
1122 cgroup_scan_tasks(&scan); 1113 css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap);
1123 1114
1124 /* 1115 /*
1125 * All the tasks' nodemasks have been updated, update 1116 * All the tasks' nodemasks have been updated, update
@@ -1135,7 +1126,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1135 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. 1126 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
1136 * @cs: the root cpuset of the hierarchy 1127 * @cs: the root cpuset of the hierarchy
1137 * @update_root: update the root cpuset or not? 1128 * @update_root: update the root cpuset or not?
1138 * @heap: the heap used by cgroup_scan_tasks() 1129 * @heap: the heap used by css_scan_tasks()
1139 * 1130 *
1140 * This will update nodemasks of tasks in @root_cs and all other empty cpusets 1131 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1141 * which take on nodemask of @root_cs. 1132 * which take on nodemask of @root_cs.
@@ -1146,17 +1137,19 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1146 bool update_root, struct ptr_heap *heap) 1137 bool update_root, struct ptr_heap *heap)
1147{ 1138{
1148 struct cpuset *cp; 1139 struct cpuset *cp;
1149 struct cgroup *pos_cgrp; 1140 struct cgroup_subsys_state *pos_css;
1150
1151 if (update_root)
1152 update_tasks_nodemask(root_cs, heap);
1153 1141
1154 rcu_read_lock(); 1142 rcu_read_lock();
1155 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 1143 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
1156 /* skip the whole subtree if @cp has some memory */ 1144 if (cp == root_cs) {
1157 if (!nodes_empty(cp->mems_allowed)) { 1145 if (!update_root)
1158 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 1146 continue;
1159 continue; 1147 } else {
1148 /* skip the whole subtree if @cp has some memory */
1149 if (!nodes_empty(cp->mems_allowed)) {
1150 pos_css = css_rightmost_descendant(pos_css);
1151 continue;
1152 }
1160 } 1153 }
1161 if (!css_tryget(&cp->css)) 1154 if (!css_tryget(&cp->css))
1162 continue; 1155 continue;
@@ -1263,44 +1256,39 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1263 return 0; 1256 return 0;
1264} 1257}
1265 1258
1266/* 1259/**
1267 * cpuset_change_flag - make a task's spread flags the same as its cpuset's 1260 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1268 * @tsk: task to be updated 1261 * @tsk: task to be updated
1269 * @scan: struct cgroup_scanner containing the cgroup of the task 1262 * @data: cpuset to @tsk belongs to
1270 * 1263 *
1271 * Called by cgroup_scan_tasks() for each task in a cgroup. 1264 * Called by css_scan_tasks() for each task in a cgroup.
1272 * 1265 *
1273 * We don't need to re-check for the cgroup/cpuset membership, since we're 1266 * We don't need to re-check for the cgroup/cpuset membership, since we're
1274 * holding cpuset_mutex at this point. 1267 * holding cpuset_mutex at this point.
1275 */ 1268 */
1276static void cpuset_change_flag(struct task_struct *tsk, 1269static void cpuset_change_flag(struct task_struct *tsk, void *data)
1277 struct cgroup_scanner *scan)
1278{ 1270{
1279 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); 1271 struct cpuset *cs = data;
1272
1273 cpuset_update_task_spread_flag(cs, tsk);
1280} 1274}
1281 1275
1282/* 1276/**
1283 * update_tasks_flags - update the spread flags of tasks in the cpuset. 1277 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1284 * @cs: the cpuset in which each task's spread flags needs to be changed 1278 * @cs: the cpuset in which each task's spread flags needs to be changed
1285 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1279 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1286 * 1280 *
1287 * Called with cpuset_mutex held 1281 * Called with cpuset_mutex held
1288 * 1282 *
1289 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 1283 * The css_scan_tasks() function will scan all the tasks in a cgroup,
1290 * calling callback functions for each. 1284 * calling callback functions for each.
1291 * 1285 *
1292 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1286 * No return value. It's guaranteed that css_scan_tasks() always returns 0
1293 * if @heap != NULL. 1287 * if @heap != NULL.
1294 */ 1288 */
1295static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) 1289static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1296{ 1290{
1297 struct cgroup_scanner scan; 1291 css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap);
1298
1299 scan.cg = cs->css.cgroup;
1300 scan.test_task = NULL;
1301 scan.process_task = cpuset_change_flag;
1302 scan.heap = heap;
1303 cgroup_scan_tasks(&scan);
1304} 1292}
1305 1293
1306/* 1294/*
@@ -1458,9 +1446,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1458} 1446}
1459 1447
1460/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 1448/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1461static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1449static int cpuset_can_attach(struct cgroup_subsys_state *css,
1450 struct cgroup_taskset *tset)
1462{ 1451{
1463 struct cpuset *cs = cgroup_cs(cgrp); 1452 struct cpuset *cs = css_cs(css);
1464 struct task_struct *task; 1453 struct task_struct *task;
1465 int ret; 1454 int ret;
1466 1455
@@ -1471,11 +1460,11 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1471 * flag is set. 1460 * flag is set.
1472 */ 1461 */
1473 ret = -ENOSPC; 1462 ret = -ENOSPC;
1474 if (!cgroup_sane_behavior(cgrp) && 1463 if (!cgroup_sane_behavior(css->cgroup) &&
1475 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1464 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1476 goto out_unlock; 1465 goto out_unlock;
1477 1466
1478 cgroup_taskset_for_each(task, cgrp, tset) { 1467 cgroup_taskset_for_each(task, css, tset) {
1479 /* 1468 /*
1480 * Kthreads which disallow setaffinity shouldn't be moved 1469 * Kthreads which disallow setaffinity shouldn't be moved
1481 * to a new cpuset; we don't want to change their cpu 1470 * to a new cpuset; we don't want to change their cpu
@@ -1504,11 +1493,11 @@ out_unlock:
1504 return ret; 1493 return ret;
1505} 1494}
1506 1495
1507static void cpuset_cancel_attach(struct cgroup *cgrp, 1496static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
1508 struct cgroup_taskset *tset) 1497 struct cgroup_taskset *tset)
1509{ 1498{
1510 mutex_lock(&cpuset_mutex); 1499 mutex_lock(&cpuset_mutex);
1511 cgroup_cs(cgrp)->attach_in_progress--; 1500 css_cs(css)->attach_in_progress--;
1512 mutex_unlock(&cpuset_mutex); 1501 mutex_unlock(&cpuset_mutex);
1513} 1502}
1514 1503
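
All of the cgroup methods in this file now receive the subsystem's own css rather than the raw cgroup, and cgroup_taskset_for_each() is keyed on that css as well. A bare-bones sketch of the new can_attach shape for an imaginary controller; foo_can_attach() is invented, and the PF_NO_SETAFFINITY test only mirrors the kthread check cpuset performs above.

static int foo_can_attach(struct cgroup_subsys_state *css,
			  struct cgroup_taskset *tset)
{
	struct task_struct *task;

	cgroup_taskset_for_each(task, css, tset) {
		/* e.g. refuse tasks whose affinity is pinned by the kernel */
		if (task->flags & PF_NO_SETAFFINITY)
			return -EINVAL;
	}

	return 0;
}
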
@@ -1519,16 +1508,18 @@ static void cpuset_cancel_attach(struct cgroup *cgrp,
1519 */ 1508 */
1520static cpumask_var_t cpus_attach; 1509static cpumask_var_t cpus_attach;
1521 1510
1522static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1511static void cpuset_attach(struct cgroup_subsys_state *css,
1512 struct cgroup_taskset *tset)
1523{ 1513{
1524 /* static buf protected by cpuset_mutex */ 1514 /* static buf protected by cpuset_mutex */
1525 static nodemask_t cpuset_attach_nodemask_to; 1515 static nodemask_t cpuset_attach_nodemask_to;
1526 struct mm_struct *mm; 1516 struct mm_struct *mm;
1527 struct task_struct *task; 1517 struct task_struct *task;
1528 struct task_struct *leader = cgroup_taskset_first(tset); 1518 struct task_struct *leader = cgroup_taskset_first(tset);
1529 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); 1519 struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset,
1530 struct cpuset *cs = cgroup_cs(cgrp); 1520 cpuset_subsys_id);
1531 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1521 struct cpuset *cs = css_cs(css);
1522 struct cpuset *oldcs = css_cs(oldcss);
1532 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); 1523 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1533 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1524 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1534 1525
@@ -1542,7 +1533,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1542 1533
1543 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); 1534 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
1544 1535
1545 cgroup_taskset_for_each(task, cgrp, tset) { 1536 cgroup_taskset_for_each(task, css, tset) {
1546 /* 1537 /*
1547 * can_attach beforehand should guarantee that this doesn't 1538 * can_attach beforehand should guarantee that this doesn't
1548 * fail. TODO: have a better way to handle failure here 1539 * fail. TODO: have a better way to handle failure here
@@ -1604,15 +1595,18 @@ typedef enum {
1604 FILE_SPREAD_SLAB, 1595 FILE_SPREAD_SLAB,
1605} cpuset_filetype_t; 1596} cpuset_filetype_t;
1606 1597
1607static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1598static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
1599 u64 val)
1608{ 1600{
1609 struct cpuset *cs = cgroup_cs(cgrp); 1601 struct cpuset *cs = css_cs(css);
1610 cpuset_filetype_t type = cft->private; 1602 cpuset_filetype_t type = cft->private;
1611 int retval = -ENODEV; 1603 int retval = 0;
1612 1604
1613 mutex_lock(&cpuset_mutex); 1605 mutex_lock(&cpuset_mutex);
1614 if (!is_cpuset_online(cs)) 1606 if (!is_cpuset_online(cs)) {
1607 retval = -ENODEV;
1615 goto out_unlock; 1608 goto out_unlock;
1609 }
1616 1610
1617 switch (type) { 1611 switch (type) {
1618 case FILE_CPU_EXCLUSIVE: 1612 case FILE_CPU_EXCLUSIVE:
@@ -1651,9 +1645,10 @@ out_unlock:
1651 return retval; 1645 return retval;
1652} 1646}
1653 1647
1654static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) 1648static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
1649 s64 val)
1655{ 1650{
1656 struct cpuset *cs = cgroup_cs(cgrp); 1651 struct cpuset *cs = css_cs(css);
1657 cpuset_filetype_t type = cft->private; 1652 cpuset_filetype_t type = cft->private;
1658 int retval = -ENODEV; 1653 int retval = -ENODEV;
1659 1654
@@ -1677,10 +1672,10 @@ out_unlock:
1677/* 1672/*
1678 * Common handling for a write to a "cpus" or "mems" file. 1673 * Common handling for a write to a "cpus" or "mems" file.
1679 */ 1674 */
1680static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, 1675static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1681 const char *buf) 1676 struct cftype *cft, const char *buf)
1682{ 1677{
1683 struct cpuset *cs = cgroup_cs(cgrp); 1678 struct cpuset *cs = css_cs(css);
1684 struct cpuset *trialcs; 1679 struct cpuset *trialcs;
1685 int retval = -ENODEV; 1680 int retval = -ENODEV;
1686 1681
@@ -1759,13 +1754,12 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1759 return count; 1754 return count;
1760} 1755}
1761 1756
1762static ssize_t cpuset_common_file_read(struct cgroup *cgrp, 1757static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css,
1763 struct cftype *cft, 1758 struct cftype *cft, struct file *file,
1764 struct file *file, 1759 char __user *buf, size_t nbytes,
1765 char __user *buf, 1760 loff_t *ppos)
1766 size_t nbytes, loff_t *ppos)
1767{ 1761{
1768 struct cpuset *cs = cgroup_cs(cgrp); 1762 struct cpuset *cs = css_cs(css);
1769 cpuset_filetype_t type = cft->private; 1763 cpuset_filetype_t type = cft->private;
1770 char *page; 1764 char *page;
1771 ssize_t retval = 0; 1765 ssize_t retval = 0;
@@ -1795,9 +1789,9 @@ out:
1795 return retval; 1789 return retval;
1796} 1790}
1797 1791
1798static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) 1792static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
1799{ 1793{
1800 struct cpuset *cs = cgroup_cs(cgrp); 1794 struct cpuset *cs = css_cs(css);
1801 cpuset_filetype_t type = cft->private; 1795 cpuset_filetype_t type = cft->private;
1802 switch (type) { 1796 switch (type) {
1803 case FILE_CPU_EXCLUSIVE: 1797 case FILE_CPU_EXCLUSIVE:
@@ -1826,9 +1820,9 @@ static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft)
1826 return 0; 1820 return 0;
1827} 1821}
1828 1822
1829static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) 1823static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
1830{ 1824{
1831 struct cpuset *cs = cgroup_cs(cgrp); 1825 struct cpuset *cs = css_cs(css);
1832 cpuset_filetype_t type = cft->private; 1826 cpuset_filetype_t type = cft->private;
1833 switch (type) { 1827 switch (type) {
1834 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1828 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1943,11 +1937,12 @@ static struct cftype files[] = {
1943 * cgrp: control group that the new cpuset will be part of 1937 * cgrp: control group that the new cpuset will be part of
1944 */ 1938 */
1945 1939
1946static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) 1940static struct cgroup_subsys_state *
1941cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1947{ 1942{
1948 struct cpuset *cs; 1943 struct cpuset *cs;
1949 1944
1950 if (!cgrp->parent) 1945 if (!parent_css)
1951 return &top_cpuset.css; 1946 return &top_cpuset.css;
1952 1947
1953 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1948 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
@@ -1967,12 +1962,12 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp)
1967 return &cs->css; 1962 return &cs->css;
1968} 1963}
1969 1964
1970static int cpuset_css_online(struct cgroup *cgrp) 1965static int cpuset_css_online(struct cgroup_subsys_state *css)
1971{ 1966{
1972 struct cpuset *cs = cgroup_cs(cgrp); 1967 struct cpuset *cs = css_cs(css);
1973 struct cpuset *parent = parent_cs(cs); 1968 struct cpuset *parent = parent_cs(cs);
1974 struct cpuset *tmp_cs; 1969 struct cpuset *tmp_cs;
1975 struct cgroup *pos_cg; 1970 struct cgroup_subsys_state *pos_css;
1976 1971
1977 if (!parent) 1972 if (!parent)
1978 return 0; 1973 return 0;
@@ -1987,7 +1982,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
1987 1982
1988 number_of_cpusets++; 1983 number_of_cpusets++;
1989 1984
1990 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) 1985 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1991 goto out_unlock; 1986 goto out_unlock;
1992 1987
1993 /* 1988 /*
@@ -2004,7 +1999,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
2004 * (and likewise for mems) to the new cgroup. 1999 * (and likewise for mems) to the new cgroup.
2005 */ 2000 */
2006 rcu_read_lock(); 2001 rcu_read_lock();
2007 cpuset_for_each_child(tmp_cs, pos_cg, parent) { 2002 cpuset_for_each_child(tmp_cs, pos_css, parent) {
2008 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { 2003 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
2009 rcu_read_unlock(); 2004 rcu_read_unlock();
2010 goto out_unlock; 2005 goto out_unlock;
@@ -2021,9 +2016,15 @@ out_unlock:
2021 return 0; 2016 return 0;
2022} 2017}
2023 2018
2024static void cpuset_css_offline(struct cgroup *cgrp) 2019/*
2020 * If the cpuset being removed has its flag 'sched_load_balance'
2021 * enabled, then simulate turning sched_load_balance off, which
2022 * will call rebuild_sched_domains_locked().
2023 */
2024
2025static void cpuset_css_offline(struct cgroup_subsys_state *css)
2025{ 2026{
2026 struct cpuset *cs = cgroup_cs(cgrp); 2027 struct cpuset *cs = css_cs(css);
2027 2028
2028 mutex_lock(&cpuset_mutex); 2029 mutex_lock(&cpuset_mutex);
2029 2030
@@ -2036,15 +2037,9 @@ static void cpuset_css_offline(struct cgroup *cgrp)
2036 mutex_unlock(&cpuset_mutex); 2037 mutex_unlock(&cpuset_mutex);
2037} 2038}
2038 2039
2039/* 2040static void cpuset_css_free(struct cgroup_subsys_state *css)
2040 * If the cpuset being removed has its flag 'sched_load_balance'
2041 * enabled, then simulate turning sched_load_balance off, which
2042 * will call rebuild_sched_domains_locked().
2043 */
2044
2045static void cpuset_css_free(struct cgroup *cgrp)
2046{ 2041{
2047 struct cpuset *cs = cgroup_cs(cgrp); 2042 struct cpuset *cs = css_cs(css);
2048 2043
2049 free_cpumask_var(cs->cpus_allowed); 2044 free_cpumask_var(cs->cpus_allowed);
2050 kfree(cs); 2045 kfree(cs);
@@ -2251,11 +2246,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2251 /* if cpus or mems changed, we need to propagate to descendants */ 2246 /* if cpus or mems changed, we need to propagate to descendants */
2252 if (cpus_updated || mems_updated) { 2247 if (cpus_updated || mems_updated) {
2253 struct cpuset *cs; 2248 struct cpuset *cs;
2254 struct cgroup *pos_cgrp; 2249 struct cgroup_subsys_state *pos_css;
2255 2250
2256 rcu_read_lock(); 2251 rcu_read_lock();
2257 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { 2252 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2258 if (!css_tryget(&cs->css)) 2253 if (cs == &top_cpuset || !css_tryget(&cs->css))
2259 continue; 2254 continue;
2260 rcu_read_unlock(); 2255 rcu_read_unlock();
2261 2256
@@ -2344,7 +2339,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2344 2339
2345void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2340void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2346{ 2341{
2347 const struct cpuset *cpus_cs; 2342 struct cpuset *cpus_cs;
2348 2343
2349 rcu_read_lock(); 2344 rcu_read_lock();
2350 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2345 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
@@ -2417,7 +2412,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2417 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall 2412 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
2418 * (an unusual configuration), then returns the root cpuset. 2413 * (an unusual configuration), then returns the root cpuset.
2419 */ 2414 */
2420static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) 2415static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2421{ 2416{
2422 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) 2417 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2423 cs = parent_cs(cs); 2418 cs = parent_cs(cs);
@@ -2487,7 +2482,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2487 */ 2482 */
2488int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) 2483int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2489{ 2484{
2490 const struct cpuset *cs; /* current cpuset ancestors */ 2485 struct cpuset *cs; /* current cpuset ancestors */
2491 int allowed; /* is allocation in zone z allowed? */ 2486 int allowed; /* is allocation in zone z allowed? */
2492 2487
2493 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2488 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
@@ -2725,7 +2720,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2725 goto out_free; 2720 goto out_free;
2726 2721
2727 rcu_read_lock(); 2722 rcu_read_lock();
2728 css = task_subsys_state(tsk, cpuset_subsys_id); 2723 css = task_css(tsk, cpuset_subsys_id);
2729 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2724 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2730 rcu_read_unlock(); 2725 rcu_read_unlock();
2731 if (retval < 0) 2726 if (retval < 0)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c77206184b8b..97b67df8fbfe 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -116,6 +116,9 @@ int get_callchain_buffers(void)
116 116
117 err = alloc_callchain_buffers(); 117 err = alloc_callchain_buffers();
118exit: 118exit:
119 if (err)
120 atomic_dec(&nr_callchain_events);
121
119 mutex_unlock(&callchain_mutex); 122 mutex_unlock(&callchain_mutex);
120 123
121 return err; 124 return err;
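
The two added lines close an accounting leak: get_callchain_buffers() bumps nr_callchain_events before attempting the allocation, so a failed allocation has to drop that reference again or no later put_callchain_buffers() will ever free anything. The same pattern in miniature; the hypothetical_ names are invented, and the real code does all of this under callchain_mutex.

static atomic_t hypothetical_users = ATOMIC_INIT(0);

extern int hypothetical_alloc_buffers(void);	/* stand-in for alloc_callchain_buffers() */

static int hypothetical_get_buffers(void)
{
	int err = 0;

	if (atomic_inc_return(&hypothetical_users) > 1)
		return 0;			/* already allocated by an earlier get */

	err = hypothetical_alloc_buffers();
	if (err)
		atomic_dec(&hypothetical_users);	/* roll the get back on failure */

	return err;
}
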
diff --git a/kernel/events/core.c b/kernel/events/core.c
index eba8fb5834ae..2207efc941d1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -145,6 +145,7 @@ static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
145static atomic_t nr_mmap_events __read_mostly; 145static atomic_t nr_mmap_events __read_mostly;
146static atomic_t nr_comm_events __read_mostly; 146static atomic_t nr_comm_events __read_mostly;
147static atomic_t nr_task_events __read_mostly; 147static atomic_t nr_task_events __read_mostly;
148static atomic_t nr_freq_events __read_mostly;
148 149
149static LIST_HEAD(pmus); 150static LIST_HEAD(pmus);
150static DEFINE_MUTEX(pmus_lock); 151static DEFINE_MUTEX(pmus_lock);
@@ -340,8 +341,8 @@ struct perf_cgroup {
340static inline struct perf_cgroup * 341static inline struct perf_cgroup *
341perf_cgroup_from_task(struct task_struct *task) 342perf_cgroup_from_task(struct task_struct *task)
342{ 343{
343 return container_of(task_subsys_state(task, perf_subsys_id), 344 return container_of(task_css(task, perf_subsys_id),
344 struct perf_cgroup, css); 345 struct perf_cgroup, css);
345} 346}
346 347
347static inline bool 348static inline bool
@@ -591,7 +592,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
591 if (!f.file) 592 if (!f.file)
592 return -EBADF; 593 return -EBADF;
593 594
594 css = cgroup_css_from_dir(f.file, perf_subsys_id); 595 rcu_read_lock();
596
597 css = css_from_dir(f.file->f_dentry, &perf_subsys);
595 if (IS_ERR(css)) { 598 if (IS_ERR(css)) {
596 ret = PTR_ERR(css); 599 ret = PTR_ERR(css);
597 goto out; 600 goto out;
@@ -617,6 +620,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
617 ret = -EINVAL; 620 ret = -EINVAL;
618 } 621 }
619out: 622out:
623 rcu_read_unlock();
620 fdput(f); 624 fdput(f);
621 return ret; 625 return ret;
622} 626}
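
cgroup_css_from_dir() is replaced by css_from_dir(), which hands back an RCU-protected pointer; hence the rcu_read_lock()/rcu_read_unlock() pair now wraps both the lookup and everything that dereferences the result. The general rule, as an invented caller; some_subsys and hypothetical_use_css() are placeholders, and a caller that needs the css after the unlock must take a reference first.

static int hypothetical_lookup(struct file *file)
{
	struct cgroup_subsys_state *css;
	int ret;

	rcu_read_lock();
	css = css_from_dir(file->f_dentry, &some_subsys);
	if (IS_ERR(css))
		ret = PTR_ERR(css);
	else
		ret = hypothetical_use_css(css);	/* dereference only inside the RCU section */
	rcu_read_unlock();

	return ret;
}
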
@@ -869,12 +873,8 @@ static void perf_pmu_rotate_start(struct pmu *pmu)
869 873
870 WARN_ON(!irqs_disabled()); 874 WARN_ON(!irqs_disabled());
871 875
872 if (list_empty(&cpuctx->rotation_list)) { 876 if (list_empty(&cpuctx->rotation_list))
873 int was_empty = list_empty(head);
874 list_add(&cpuctx->rotation_list, head); 877 list_add(&cpuctx->rotation_list, head);
875 if (was_empty)
876 tick_nohz_full_kick();
877 }
878} 878}
879 879
880static void get_ctx(struct perf_event_context *ctx) 880static void get_ctx(struct perf_event_context *ctx)
@@ -1216,6 +1216,9 @@ static void perf_event__id_header_size(struct perf_event *event)
1216 if (sample_type & PERF_SAMPLE_TIME) 1216 if (sample_type & PERF_SAMPLE_TIME)
1217 size += sizeof(data->time); 1217 size += sizeof(data->time);
1218 1218
1219 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1220 size += sizeof(data->id);
1221
1219 if (sample_type & PERF_SAMPLE_ID) 1222 if (sample_type & PERF_SAMPLE_ID)
1220 size += sizeof(data->id); 1223 size += sizeof(data->id);
1221 1224
@@ -2712,7 +2715,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2712 2715
2713 hwc = &event->hw; 2716 hwc = &event->hw;
2714 2717
2715 if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { 2718 if (hwc->interrupts == MAX_INTERRUPTS) {
2716 hwc->interrupts = 0; 2719 hwc->interrupts = 0;
2717 perf_log_throttle(event, 1); 2720 perf_log_throttle(event, 1);
2718 event->pmu->start(event, 0); 2721 event->pmu->start(event, 0);
@@ -2811,10 +2814,11 @@ done:
2811#ifdef CONFIG_NO_HZ_FULL 2814#ifdef CONFIG_NO_HZ_FULL
2812bool perf_event_can_stop_tick(void) 2815bool perf_event_can_stop_tick(void)
2813{ 2816{
2814 if (list_empty(&__get_cpu_var(rotation_list))) 2817 if (atomic_read(&nr_freq_events) ||
2815 return true; 2818 __this_cpu_read(perf_throttled_count))
2816 else
2817 return false; 2819 return false;
2820 else
2821 return true;
2818} 2822}
2819#endif 2823#endif
2820 2824
@@ -3128,36 +3132,63 @@ static void free_event_rcu(struct rcu_head *head)
3128static void ring_buffer_put(struct ring_buffer *rb); 3132static void ring_buffer_put(struct ring_buffer *rb);
3129static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); 3133static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
3130 3134
3131static void free_event(struct perf_event *event) 3135static void unaccount_event_cpu(struct perf_event *event, int cpu)
3132{ 3136{
3133 irq_work_sync(&event->pending); 3137 if (event->parent)
3138 return;
3139
3140 if (has_branch_stack(event)) {
3141 if (!(event->attach_state & PERF_ATTACH_TASK))
3142 atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
3143 }
3144 if (is_cgroup_event(event))
3145 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3146}
3147
3148static void unaccount_event(struct perf_event *event)
3149{
3150 if (event->parent)
3151 return;
3152
3153 if (event->attach_state & PERF_ATTACH_TASK)
3154 static_key_slow_dec_deferred(&perf_sched_events);
3155 if (event->attr.mmap || event->attr.mmap_data)
3156 atomic_dec(&nr_mmap_events);
3157 if (event->attr.comm)
3158 atomic_dec(&nr_comm_events);
3159 if (event->attr.task)
3160 atomic_dec(&nr_task_events);
3161 if (event->attr.freq)
3162 atomic_dec(&nr_freq_events);
3163 if (is_cgroup_event(event))
3164 static_key_slow_dec_deferred(&perf_sched_events);
3165 if (has_branch_stack(event))
3166 static_key_slow_dec_deferred(&perf_sched_events);
3167
3168 unaccount_event_cpu(event, event->cpu);
3169}
3134 3170
3171static void __free_event(struct perf_event *event)
3172{
3135 if (!event->parent) { 3173 if (!event->parent) {
3136 if (event->attach_state & PERF_ATTACH_TASK)
3137 static_key_slow_dec_deferred(&perf_sched_events);
3138 if (event->attr.mmap || event->attr.mmap_data)
3139 atomic_dec(&nr_mmap_events);
3140 if (event->attr.comm)
3141 atomic_dec(&nr_comm_events);
3142 if (event->attr.task)
3143 atomic_dec(&nr_task_events);
3144 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 3174 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3145 put_callchain_buffers(); 3175 put_callchain_buffers();
3146 if (is_cgroup_event(event)) {
3147 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
3148 static_key_slow_dec_deferred(&perf_sched_events);
3149 }
3150
3151 if (has_branch_stack(event)) {
3152 static_key_slow_dec_deferred(&perf_sched_events);
3153 /* is system-wide event */
3154 if (!(event->attach_state & PERF_ATTACH_TASK)) {
3155 atomic_dec(&per_cpu(perf_branch_stack_events,
3156 event->cpu));
3157 }
3158 }
3159 } 3176 }
3160 3177
3178 if (event->destroy)
3179 event->destroy(event);
3180
3181 if (event->ctx)
3182 put_ctx(event->ctx);
3183
3184 call_rcu(&event->rcu_head, free_event_rcu);
3185}
3186static void free_event(struct perf_event *event)
3187{
3188 irq_work_sync(&event->pending);
3189
3190 unaccount_event(event);
3191
3161 if (event->rb) { 3192 if (event->rb) {
3162 struct ring_buffer *rb; 3193 struct ring_buffer *rb;
3163 3194
@@ -3180,13 +3211,8 @@ static void free_event(struct perf_event *event)
3180 if (is_cgroup_event(event)) 3211 if (is_cgroup_event(event))
3181 perf_detach_cgroup(event); 3212 perf_detach_cgroup(event);
3182 3213
3183 if (event->destroy)
3184 event->destroy(event);
3185 3214
3186 if (event->ctx) 3215 __free_event(event);
3187 put_ctx(event->ctx);
3188
3189 call_rcu(&event->rcu_head, free_event_rcu);
3190} 3216}
3191 3217
3192int perf_event_release_kernel(struct perf_event *event) 3218int perf_event_release_kernel(struct perf_event *event)
@@ -3544,6 +3570,15 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3544 case PERF_EVENT_IOC_PERIOD: 3570 case PERF_EVENT_IOC_PERIOD:
3545 return perf_event_period(event, (u64 __user *)arg); 3571 return perf_event_period(event, (u64 __user *)arg);
3546 3572
3573 case PERF_EVENT_IOC_ID:
3574 {
3575 u64 id = primary_event_id(event);
3576
3577 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
3578 return -EFAULT;
3579 return 0;
3580 }
3581
3547 case PERF_EVENT_IOC_SET_OUTPUT: 3582 case PERF_EVENT_IOC_SET_OUTPUT:
3548 { 3583 {
3549 int ret; 3584 int ret;
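
PERF_EVENT_IOC_ID copies the event's primary ID out to a u64 the caller supplies, so user space can learn the ID it will later match against PERF_SAMPLE_ID / PERF_SAMPLE_IDENTIFIER records without first parsing the ring buffer. A small stand-alone user-space example, assuming uapi headers new enough to carry this ioctl; the choice of a software CPU-clock event is arbitrary.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	__u64 id;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;

	/* count this task, on any CPU */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	if (ioctl(fd, PERF_EVENT_IOC_ID, &id) == 0)
		printf("event id: %llu\n", (unsigned long long)id);

	close(fd);
	return 0;
}
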
@@ -3641,6 +3676,10 @@ void perf_event_update_userpage(struct perf_event *event)
3641 u64 enabled, running, now; 3676 u64 enabled, running, now;
3642 3677
3643 rcu_read_lock(); 3678 rcu_read_lock();
3679 rb = rcu_dereference(event->rb);
3680 if (!rb)
3681 goto unlock;
3682
3644 /* 3683 /*
3645 * compute total_time_enabled, total_time_running 3684 * compute total_time_enabled, total_time_running
3646 * based on snapshot values taken when the event 3685 * based on snapshot values taken when the event
@@ -3651,12 +3690,8 @@ void perf_event_update_userpage(struct perf_event *event)
3651 * NMI context 3690 * NMI context
3652 */ 3691 */
3653 calc_timer_values(event, &now, &enabled, &running); 3692 calc_timer_values(event, &now, &enabled, &running);
3654 rb = rcu_dereference(event->rb);
3655 if (!rb)
3656 goto unlock;
3657 3693
3658 userpg = rb->user_page; 3694 userpg = rb->user_page;
3659
3660 /* 3695 /*
3661 * Disable preemption so as to not let the corresponding user-space 3696 * Disable preemption so as to not let the corresponding user-space
3662 * spin too long if we get preempted. 3697 * spin too long if we get preempted.
@@ -4251,7 +4286,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
4251 if (sample_type & PERF_SAMPLE_TIME) 4286 if (sample_type & PERF_SAMPLE_TIME)
4252 data->time = perf_clock(); 4287 data->time = perf_clock();
4253 4288
4254 if (sample_type & PERF_SAMPLE_ID) 4289 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
4255 data->id = primary_event_id(event); 4290 data->id = primary_event_id(event);
4256 4291
4257 if (sample_type & PERF_SAMPLE_STREAM_ID) 4292 if (sample_type & PERF_SAMPLE_STREAM_ID)
@@ -4290,6 +4325,9 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4290 4325
4291 if (sample_type & PERF_SAMPLE_CPU) 4326 if (sample_type & PERF_SAMPLE_CPU)
4292 perf_output_put(handle, data->cpu_entry); 4327 perf_output_put(handle, data->cpu_entry);
4328
4329 if (sample_type & PERF_SAMPLE_IDENTIFIER)
4330 perf_output_put(handle, data->id);
4293} 4331}
4294 4332
4295void perf_event__output_id_sample(struct perf_event *event, 4333void perf_event__output_id_sample(struct perf_event *event,
@@ -4355,7 +4393,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4355 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 4393 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4356 n = 0; 4394 n = 0;
4357 4395
4358 if (sub != event) 4396 if ((sub != event) &&
4397 (sub->state == PERF_EVENT_STATE_ACTIVE))
4359 sub->pmu->read(sub); 4398 sub->pmu->read(sub);
4360 4399
4361 values[n++] = perf_event_count(sub); 4400 values[n++] = perf_event_count(sub);
@@ -4402,6 +4441,9 @@ void perf_output_sample(struct perf_output_handle *handle,
4402 4441
4403 perf_output_put(handle, *header); 4442 perf_output_put(handle, *header);
4404 4443
4444 if (sample_type & PERF_SAMPLE_IDENTIFIER)
4445 perf_output_put(handle, data->id);
4446
4405 if (sample_type & PERF_SAMPLE_IP) 4447 if (sample_type & PERF_SAMPLE_IP)
4406 perf_output_put(handle, data->ip); 4448 perf_output_put(handle, data->ip);
4407 4449
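
PERF_SAMPLE_IDENTIFIER exists so a parser can find the event ID at a fixed offset: this hunk emits it as the very first word after the header of a PERF_RECORD_SAMPLE, and the earlier __perf_event__output_id_sample() hunk emits it last in the id trailer of every other record type, independent of which other sample_type bits are set. A rough user-space illustration of the sample-record side; the struct is descriptive, not a real uapi type.

#include <linux/perf_event.h>
#include <linux/types.h>

/* Invented view of a PERF_RECORD_SAMPLE whose event set PERF_SAMPLE_IDENTIFIER. */
struct hypothetical_sample_view {
	struct perf_event_header header;
	__u64 identifier;	/* always the first word of the body */
	/* ...the layout of everything below depends on sample_type... */
};

static __u64 hypothetical_sample_id(const void *record)
{
	const struct hypothetical_sample_view *s = record;

	return s->identifier;
}
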
@@ -4462,20 +4504,6 @@ void perf_output_sample(struct perf_output_handle *handle,
4462 } 4504 }
4463 } 4505 }
4464 4506
4465 if (!event->attr.watermark) {
4466 int wakeup_events = event->attr.wakeup_events;
4467
4468 if (wakeup_events) {
4469 struct ring_buffer *rb = handle->rb;
4470 int events = local_inc_return(&rb->events);
4471
4472 if (events >= wakeup_events) {
4473 local_sub(wakeup_events, &rb->events);
4474 local_inc(&rb->wakeup);
4475 }
4476 }
4477 }
4478
4479 if (sample_type & PERF_SAMPLE_BRANCH_STACK) { 4507 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4480 if (data->br_stack) { 4508 if (data->br_stack) {
4481 size_t size; 4509 size_t size;
@@ -4511,16 +4539,31 @@ void perf_output_sample(struct perf_output_handle *handle,
4511 } 4539 }
4512 } 4540 }
4513 4541
4514 if (sample_type & PERF_SAMPLE_STACK_USER) 4542 if (sample_type & PERF_SAMPLE_STACK_USER) {
4515 perf_output_sample_ustack(handle, 4543 perf_output_sample_ustack(handle,
4516 data->stack_user_size, 4544 data->stack_user_size,
4517 data->regs_user.regs); 4545 data->regs_user.regs);
4546 }
4518 4547
4519 if (sample_type & PERF_SAMPLE_WEIGHT) 4548 if (sample_type & PERF_SAMPLE_WEIGHT)
4520 perf_output_put(handle, data->weight); 4549 perf_output_put(handle, data->weight);
4521 4550
4522 if (sample_type & PERF_SAMPLE_DATA_SRC) 4551 if (sample_type & PERF_SAMPLE_DATA_SRC)
4523 perf_output_put(handle, data->data_src.val); 4552 perf_output_put(handle, data->data_src.val);
4553
4554 if (!event->attr.watermark) {
4555 int wakeup_events = event->attr.wakeup_events;
4556
4557 if (wakeup_events) {
4558 struct ring_buffer *rb = handle->rb;
4559 int events = local_inc_return(&rb->events);
4560
4561 if (events >= wakeup_events) {
4562 local_sub(wakeup_events, &rb->events);
4563 local_inc(&rb->wakeup);
4564 }
4565 }
4566 }
4524} 4567}
4525 4568
4526void perf_prepare_sample(struct perf_event_header *header, 4569void perf_prepare_sample(struct perf_event_header *header,
@@ -4680,12 +4723,10 @@ perf_event_read_event(struct perf_event *event,
4680 perf_output_end(&handle); 4723 perf_output_end(&handle);
4681} 4724}
4682 4725
4683typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data);
4684typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); 4726typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
4685 4727
4686static void 4728static void
4687perf_event_aux_ctx(struct perf_event_context *ctx, 4729perf_event_aux_ctx(struct perf_event_context *ctx,
4688 perf_event_aux_match_cb match,
4689 perf_event_aux_output_cb output, 4730 perf_event_aux_output_cb output,
4690 void *data) 4731 void *data)
4691{ 4732{
@@ -4696,15 +4737,12 @@ perf_event_aux_ctx(struct perf_event_context *ctx,
4696 continue; 4737 continue;
4697 if (!event_filter_match(event)) 4738 if (!event_filter_match(event))
4698 continue; 4739 continue;
4699 if (match(event, data)) 4740 output(event, data);
4700 output(event, data);
4701 } 4741 }
4702} 4742}
4703 4743
4704static void 4744static void
4705perf_event_aux(perf_event_aux_match_cb match, 4745perf_event_aux(perf_event_aux_output_cb output, void *data,
4706 perf_event_aux_output_cb output,
4707 void *data,
4708 struct perf_event_context *task_ctx) 4746 struct perf_event_context *task_ctx)
4709{ 4747{
4710 struct perf_cpu_context *cpuctx; 4748 struct perf_cpu_context *cpuctx;
@@ -4717,7 +4755,7 @@ perf_event_aux(perf_event_aux_match_cb match,
4717 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4755 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4718 if (cpuctx->unique_pmu != pmu) 4756 if (cpuctx->unique_pmu != pmu)
4719 goto next; 4757 goto next;
4720 perf_event_aux_ctx(&cpuctx->ctx, match, output, data); 4758 perf_event_aux_ctx(&cpuctx->ctx, output, data);
4721 if (task_ctx) 4759 if (task_ctx)
4722 goto next; 4760 goto next;
4723 ctxn = pmu->task_ctx_nr; 4761 ctxn = pmu->task_ctx_nr;
@@ -4725,14 +4763,14 @@ perf_event_aux(perf_event_aux_match_cb match,
4725 goto next; 4763 goto next;
4726 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); 4764 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4727 if (ctx) 4765 if (ctx)
4728 perf_event_aux_ctx(ctx, match, output, data); 4766 perf_event_aux_ctx(ctx, output, data);
4729next: 4767next:
4730 put_cpu_ptr(pmu->pmu_cpu_context); 4768 put_cpu_ptr(pmu->pmu_cpu_context);
4731 } 4769 }
4732 4770
4733 if (task_ctx) { 4771 if (task_ctx) {
4734 preempt_disable(); 4772 preempt_disable();
4735 perf_event_aux_ctx(task_ctx, match, output, data); 4773 perf_event_aux_ctx(task_ctx, output, data);
4736 preempt_enable(); 4774 preempt_enable();
4737 } 4775 }
4738 rcu_read_unlock(); 4776 rcu_read_unlock();
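The hunks above drop the separate perf_event_aux_match_cb: perf_event_aux_ctx() now calls the output callback for every event that passes the generic filters, and each output routine does its own record-type check at the top (the per-type match helpers reappear next to their output functions further down). A standalone sketch of that shape, plain C with illustrative names only:

/* Sketch: the filter is folded into the visit callback instead of being
 * passed as a second function pointer. */
#include <stdio.h>

struct ev {
        int wants_comm;
        int id;
};

typedef void (aux_output_cb)(struct ev *e, void *data);

static void aux_ctx(struct ev *evs, int n, aux_output_cb *output, void *data)
{
        int i;

        for (i = 0; i < n; i++)
                output(&evs[i], data);          /* no match() hook any more */
}

static void comm_output(struct ev *e, void *data)
{
        if (!e->wants_comm)                     /* per-type check lives here now */
                return;
        printf("comm record for event %d: %s\n", e->id, (const char *)data);
}

int main(void)
{
        struct ev evs[] = { { 1, 0 }, { 0, 1 }, { 1, 2 } };

        aux_ctx(evs, 3, comm_output, "bash");
        return 0;
}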
@@ -4741,7 +4779,7 @@ next:
4741/* 4779/*
4742 * task tracking -- fork/exit 4780 * task tracking -- fork/exit
4743 * 4781 *
4744 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task 4782 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
4745 */ 4783 */
4746 4784
4747struct perf_task_event { 4785struct perf_task_event {
@@ -4759,6 +4797,13 @@ struct perf_task_event {
4759 } event_id; 4797 } event_id;
4760}; 4798};
4761 4799
4800static int perf_event_task_match(struct perf_event *event)
4801{
4802 return event->attr.comm || event->attr.mmap ||
4803 event->attr.mmap2 || event->attr.mmap_data ||
4804 event->attr.task;
4805}
4806
4762static void perf_event_task_output(struct perf_event *event, 4807static void perf_event_task_output(struct perf_event *event,
4763 void *data) 4808 void *data)
4764{ 4809{
@@ -4768,6 +4813,9 @@ static void perf_event_task_output(struct perf_event *event,
4768 struct task_struct *task = task_event->task; 4813 struct task_struct *task = task_event->task;
4769 int ret, size = task_event->event_id.header.size; 4814 int ret, size = task_event->event_id.header.size;
4770 4815
4816 if (!perf_event_task_match(event))
4817 return;
4818
4771 perf_event_header__init_id(&task_event->event_id.header, &sample, event); 4819 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
4772 4820
4773 ret = perf_output_begin(&handle, event, 4821 ret = perf_output_begin(&handle, event,
@@ -4790,13 +4838,6 @@ out:
4790 task_event->event_id.header.size = size; 4838 task_event->event_id.header.size = size;
4791} 4839}
4792 4840
4793static int perf_event_task_match(struct perf_event *event,
4794 void *data __maybe_unused)
4795{
4796 return event->attr.comm || event->attr.mmap ||
4797 event->attr.mmap_data || event->attr.task;
4798}
4799
4800static void perf_event_task(struct task_struct *task, 4841static void perf_event_task(struct task_struct *task,
4801 struct perf_event_context *task_ctx, 4842 struct perf_event_context *task_ctx,
4802 int new) 4843 int new)
@@ -4825,8 +4866,7 @@ static void perf_event_task(struct task_struct *task,
4825 }, 4866 },
4826 }; 4867 };
4827 4868
4828 perf_event_aux(perf_event_task_match, 4869 perf_event_aux(perf_event_task_output,
4829 perf_event_task_output,
4830 &task_event, 4870 &task_event,
4831 task_ctx); 4871 task_ctx);
4832} 4872}
@@ -4853,6 +4893,11 @@ struct perf_comm_event {
4853 } event_id; 4893 } event_id;
4854}; 4894};
4855 4895
4896static int perf_event_comm_match(struct perf_event *event)
4897{
4898 return event->attr.comm;
4899}
4900
4856static void perf_event_comm_output(struct perf_event *event, 4901static void perf_event_comm_output(struct perf_event *event,
4857 void *data) 4902 void *data)
4858{ 4903{
@@ -4862,6 +4907,9 @@ static void perf_event_comm_output(struct perf_event *event,
4862 int size = comm_event->event_id.header.size; 4907 int size = comm_event->event_id.header.size;
4863 int ret; 4908 int ret;
4864 4909
4910 if (!perf_event_comm_match(event))
4911 return;
4912
4865 perf_event_header__init_id(&comm_event->event_id.header, &sample, event); 4913 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4866 ret = perf_output_begin(&handle, event, 4914 ret = perf_output_begin(&handle, event,
4867 comm_event->event_id.header.size); 4915 comm_event->event_id.header.size);
@@ -4883,12 +4931,6 @@ out:
4883 comm_event->event_id.header.size = size; 4931 comm_event->event_id.header.size = size;
4884} 4932}
4885 4933
4886static int perf_event_comm_match(struct perf_event *event,
4887 void *data __maybe_unused)
4888{
4889 return event->attr.comm;
4890}
4891
4892static void perf_event_comm_event(struct perf_comm_event *comm_event) 4934static void perf_event_comm_event(struct perf_comm_event *comm_event)
4893{ 4935{
4894 char comm[TASK_COMM_LEN]; 4936 char comm[TASK_COMM_LEN];
@@ -4903,8 +4945,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
4903 4945
4904 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 4946 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
4905 4947
4906 perf_event_aux(perf_event_comm_match, 4948 perf_event_aux(perf_event_comm_output,
4907 perf_event_comm_output,
4908 comm_event, 4949 comm_event,
4909 NULL); 4950 NULL);
4910} 4951}
@@ -4955,6 +4996,9 @@ struct perf_mmap_event {
4955 4996
4956 const char *file_name; 4997 const char *file_name;
4957 int file_size; 4998 int file_size;
4999 int maj, min;
5000 u64 ino;
5001 u64 ino_generation;
4958 5002
4959 struct { 5003 struct {
4960 struct perf_event_header header; 5004 struct perf_event_header header;
@@ -4967,6 +5011,17 @@ struct perf_mmap_event {
4967 } event_id; 5011 } event_id;
4968}; 5012};
4969 5013
5014static int perf_event_mmap_match(struct perf_event *event,
5015 void *data)
5016{
5017 struct perf_mmap_event *mmap_event = data;
5018 struct vm_area_struct *vma = mmap_event->vma;
5019 int executable = vma->vm_flags & VM_EXEC;
5020
5021 return (!executable && event->attr.mmap_data) ||
5022 (executable && (event->attr.mmap || event->attr.mmap2));
5023}
5024
4970static void perf_event_mmap_output(struct perf_event *event, 5025static void perf_event_mmap_output(struct perf_event *event,
4971 void *data) 5026 void *data)
4972{ 5027{
@@ -4976,6 +5031,16 @@ static void perf_event_mmap_output(struct perf_event *event,
4976 int size = mmap_event->event_id.header.size; 5031 int size = mmap_event->event_id.header.size;
4977 int ret; 5032 int ret;
4978 5033
5034 if (!perf_event_mmap_match(event, data))
5035 return;
5036
5037 if (event->attr.mmap2) {
5038 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
5039 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5040 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5041 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5042 }
5043
4979 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 5044 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4980 ret = perf_output_begin(&handle, event, 5045 ret = perf_output_begin(&handle, event,
4981 mmap_event->event_id.header.size); 5046 mmap_event->event_id.header.size);
@@ -4986,6 +5051,14 @@ static void perf_event_mmap_output(struct perf_event *event,
4986 mmap_event->event_id.tid = perf_event_tid(event, current); 5051 mmap_event->event_id.tid = perf_event_tid(event, current);
4987 5052
4988 perf_output_put(&handle, mmap_event->event_id); 5053 perf_output_put(&handle, mmap_event->event_id);
5054
5055 if (event->attr.mmap2) {
5056 perf_output_put(&handle, mmap_event->maj);
5057 perf_output_put(&handle, mmap_event->min);
5058 perf_output_put(&handle, mmap_event->ino);
5059 perf_output_put(&handle, mmap_event->ino_generation);
5060 }
5061
4989 __output_copy(&handle, mmap_event->file_name, 5062 __output_copy(&handle, mmap_event->file_name,
4990 mmap_event->file_size); 5063 mmap_event->file_size);
4991 5064
@@ -4996,21 +5069,12 @@ out:
4996 mmap_event->event_id.header.size = size; 5069 mmap_event->event_id.header.size = size;
4997} 5070}
4998 5071
4999static int perf_event_mmap_match(struct perf_event *event,
5000 void *data)
5001{
5002 struct perf_mmap_event *mmap_event = data;
5003 struct vm_area_struct *vma = mmap_event->vma;
5004 int executable = vma->vm_flags & VM_EXEC;
5005
5006 return (!executable && event->attr.mmap_data) ||
5007 (executable && event->attr.mmap);
5008}
5009
5010static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 5072static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5011{ 5073{
5012 struct vm_area_struct *vma = mmap_event->vma; 5074 struct vm_area_struct *vma = mmap_event->vma;
5013 struct file *file = vma->vm_file; 5075 struct file *file = vma->vm_file;
5076 int maj = 0, min = 0;
5077 u64 ino = 0, gen = 0;
5014 unsigned int size; 5078 unsigned int size;
5015 char tmp[16]; 5079 char tmp[16];
5016 char *buf = NULL; 5080 char *buf = NULL;
@@ -5019,6 +5083,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5019 memset(tmp, 0, sizeof(tmp)); 5083 memset(tmp, 0, sizeof(tmp));
5020 5084
5021 if (file) { 5085 if (file) {
5086 struct inode *inode;
5087 dev_t dev;
5022 /* 5088 /*
5023 * d_path works from the end of the rb backwards, so we 5089 * d_path works from the end of the rb backwards, so we
5024 * need to add enough zero bytes after the string to handle 5090 * need to add enough zero bytes after the string to handle
@@ -5034,6 +5100,13 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5034 name = strncpy(tmp, "//toolong", sizeof(tmp)); 5100 name = strncpy(tmp, "//toolong", sizeof(tmp));
5035 goto got_name; 5101 goto got_name;
5036 } 5102 }
5103 inode = file_inode(vma->vm_file);
5104 dev = inode->i_sb->s_dev;
5105 ino = inode->i_ino;
5106 gen = inode->i_generation;
5107 maj = MAJOR(dev);
5108 min = MINOR(dev);
5109
5037 } else { 5110 } else {
5038 if (arch_vma_name(mmap_event->vma)) { 5111 if (arch_vma_name(mmap_event->vma)) {
5039 name = strncpy(tmp, arch_vma_name(mmap_event->vma), 5112 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
@@ -5064,14 +5137,17 @@ got_name:
5064 5137
5065 mmap_event->file_name = name; 5138 mmap_event->file_name = name;
5066 mmap_event->file_size = size; 5139 mmap_event->file_size = size;
5140 mmap_event->maj = maj;
5141 mmap_event->min = min;
5142 mmap_event->ino = ino;
5143 mmap_event->ino_generation = gen;
5067 5144
5068 if (!(vma->vm_flags & VM_EXEC)) 5145 if (!(vma->vm_flags & VM_EXEC))
5069 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; 5146 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
5070 5147
5071 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 5148 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
5072 5149
5073 perf_event_aux(perf_event_mmap_match, 5150 perf_event_aux(perf_event_mmap_output,
5074 perf_event_mmap_output,
5075 mmap_event, 5151 mmap_event,
5076 NULL); 5152 NULL);
5077 5153
@@ -5101,6 +5177,10 @@ void perf_event_mmap(struct vm_area_struct *vma)
5101 .len = vma->vm_end - vma->vm_start, 5177 .len = vma->vm_end - vma->vm_start,
5102 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, 5178 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
5103 }, 5179 },
5180 /* .maj (attr_mmap2 only) */
5181 /* .min (attr_mmap2 only) */
5182 /* .ino (attr_mmap2 only) */
5183 /* .ino_generation (attr_mmap2 only) */
5104 }; 5184 };
5105 5185
5106 perf_event_mmap_event(&mmap_event); 5186 perf_event_mmap_event(&mmap_event);
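Taken together, the mmap2 hunks extend the mmap record with the backing file's device and inode identity and write those fields between the existing record body and the file name. What a ring-buffer consumer would see for PERF_RECORD_MMAP2 is sketched below, laid out to follow the perf_output_put() order above; the struct name and the pid/tid/addr part are assumptions carried over from the pre-existing PERF_RECORD_MMAP layout rather than something spelled out in this diff.

/* Sketch: on-the-wire view of an mmap2 record as implied by the output
 * order above. Field names are illustrative. */
#include <stdint.h>

struct sketch_mmap2_record {
        /* struct perf_event_header */
        uint32_t type;                  /* PERF_RECORD_MMAP2 */
        uint16_t misc;                  /* PERF_RECORD_MISC_MMAP_DATA for data mappings */
        uint16_t size;

        /* pre-existing event_id body */
        uint32_t pid, tid;
        uint64_t addr;                  /* vma->vm_start */
        uint64_t len;                   /* vm_end - vm_start */
        uint64_t pgoff;                 /* vm_pgoff << PAGE_SHIFT */

        /* added by this patch */
        uint32_t maj, min;              /* MAJOR()/MINOR() of inode->i_sb->s_dev */
        uint64_t ino;                   /* inode->i_ino */
        uint64_t ino_generation;        /* inode->i_generation */

        char filename[];                /* NUL-padded; length is folded into size */
};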
@@ -5178,6 +5258,7 @@ static int __perf_event_overflow(struct perf_event *event,
5178 __this_cpu_inc(perf_throttled_count); 5258 __this_cpu_inc(perf_throttled_count);
5179 hwc->interrupts = MAX_INTERRUPTS; 5259 hwc->interrupts = MAX_INTERRUPTS;
5180 perf_log_throttle(event, 0); 5260 perf_log_throttle(event, 0);
5261 tick_nohz_full_kick();
5181 ret = 1; 5262 ret = 1;
5182 } 5263 }
5183 } 5264 }
@@ -6234,8 +6315,6 @@ perf_event_mux_interval_ms_store(struct device *dev,
6234 return count; 6315 return count;
6235} 6316}
6236 6317
6237#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
6238
6239static struct device_attribute pmu_dev_attrs[] = { 6318static struct device_attribute pmu_dev_attrs[] = {
6240 __ATTR_RO(type), 6319 __ATTR_RO(type),
6241 __ATTR_RW(perf_event_mux_interval_ms), 6320 __ATTR_RW(perf_event_mux_interval_ms),
@@ -6445,6 +6524,44 @@ unlock:
6445 return pmu; 6524 return pmu;
6446} 6525}
6447 6526
6527static void account_event_cpu(struct perf_event *event, int cpu)
6528{
6529 if (event->parent)
6530 return;
6531
6532 if (has_branch_stack(event)) {
6533 if (!(event->attach_state & PERF_ATTACH_TASK))
6534 atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
6535 }
6536 if (is_cgroup_event(event))
6537 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
6538}
6539
6540static void account_event(struct perf_event *event)
6541{
6542 if (event->parent)
6543 return;
6544
6545 if (event->attach_state & PERF_ATTACH_TASK)
6546 static_key_slow_inc(&perf_sched_events.key);
6547 if (event->attr.mmap || event->attr.mmap_data)
6548 atomic_inc(&nr_mmap_events);
6549 if (event->attr.comm)
6550 atomic_inc(&nr_comm_events);
6551 if (event->attr.task)
6552 atomic_inc(&nr_task_events);
6553 if (event->attr.freq) {
6554 if (atomic_inc_return(&nr_freq_events) == 1)
6555 tick_nohz_full_kick_all();
6556 }
6557 if (has_branch_stack(event))
6558 static_key_slow_inc(&perf_sched_events.key);
6559 if (is_cgroup_event(event))
6560 static_key_slow_inc(&perf_sched_events.key);
6561
6562 account_event_cpu(event, event->cpu);
6563}
6564
6448/* 6565/*
6449 * Allocate and initialize a event structure 6566 * Allocate and initialize a event structure
6450 */ 6567 */
@@ -6459,7 +6576,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6459 struct pmu *pmu; 6576 struct pmu *pmu;
6460 struct perf_event *event; 6577 struct perf_event *event;
6461 struct hw_perf_event *hwc; 6578 struct hw_perf_event *hwc;
6462 long err; 6579 long err = -EINVAL;
6463 6580
6464 if ((unsigned)cpu >= nr_cpu_ids) { 6581 if ((unsigned)cpu >= nr_cpu_ids) {
6465 if (!task || cpu != -1) 6582 if (!task || cpu != -1)
@@ -6542,49 +6659,35 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6542 * we currently do not support PERF_FORMAT_GROUP on inherited events 6659 * we currently do not support PERF_FORMAT_GROUP on inherited events
6543 */ 6660 */
6544 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 6661 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
6545 goto done; 6662 goto err_ns;
6546 6663
6547 pmu = perf_init_event(event); 6664 pmu = perf_init_event(event);
6548
6549done:
6550 err = 0;
6551 if (!pmu) 6665 if (!pmu)
6552 err = -EINVAL; 6666 goto err_ns;
6553 else if (IS_ERR(pmu)) 6667 else if (IS_ERR(pmu)) {
6554 err = PTR_ERR(pmu); 6668 err = PTR_ERR(pmu);
6555 6669 goto err_ns;
6556 if (err) {
6557 if (event->ns)
6558 put_pid_ns(event->ns);
6559 kfree(event);
6560 return ERR_PTR(err);
6561 } 6670 }
6562 6671
6563 if (!event->parent) { 6672 if (!event->parent) {
6564 if (event->attach_state & PERF_ATTACH_TASK)
6565 static_key_slow_inc(&perf_sched_events.key);
6566 if (event->attr.mmap || event->attr.mmap_data)
6567 atomic_inc(&nr_mmap_events);
6568 if (event->attr.comm)
6569 atomic_inc(&nr_comm_events);
6570 if (event->attr.task)
6571 atomic_inc(&nr_task_events);
6572 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { 6673 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
6573 err = get_callchain_buffers(); 6674 err = get_callchain_buffers();
6574 if (err) { 6675 if (err)
6575 free_event(event); 6676 goto err_pmu;
6576 return ERR_PTR(err);
6577 }
6578 }
6579 if (has_branch_stack(event)) {
6580 static_key_slow_inc(&perf_sched_events.key);
6581 if (!(event->attach_state & PERF_ATTACH_TASK))
6582 atomic_inc(&per_cpu(perf_branch_stack_events,
6583 event->cpu));
6584 } 6677 }
6585 } 6678 }
6586 6679
6587 return event; 6680 return event;
6681
6682err_pmu:
6683 if (event->destroy)
6684 event->destroy(event);
6685err_ns:
6686 if (event->ns)
6687 put_pid_ns(event->ns);
6688 kfree(event);
6689
6690 return ERR_PTR(err);
6588} 6691}
6589 6692
6590static int perf_copy_attr(struct perf_event_attr __user *uattr, 6693static int perf_copy_attr(struct perf_event_attr __user *uattr,
@@ -6866,17 +6969,14 @@ SYSCALL_DEFINE5(perf_event_open,
6866 6969
6867 if (flags & PERF_FLAG_PID_CGROUP) { 6970 if (flags & PERF_FLAG_PID_CGROUP) {
6868 err = perf_cgroup_connect(pid, event, &attr, group_leader); 6971 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6869 if (err) 6972 if (err) {
6870 goto err_alloc; 6973 __free_event(event);
6871 /* 6974 goto err_task;
6872 * one more event: 6975 }
6873 * - that has cgroup constraint on event->cpu
6874 * - that may need work on context switch
6875 */
6876 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6877 static_key_slow_inc(&perf_sched_events.key);
6878 } 6976 }
6879 6977
6978 account_event(event);
6979
6880 /* 6980 /*
6881 * Special case software events and allow them to be part of 6981 * Special case software events and allow them to be part of
6882 * any hardware group. 6982 * any hardware group.
@@ -7072,6 +7172,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7072 goto err; 7172 goto err;
7073 } 7173 }
7074 7174
7175 account_event(event);
7176
7075 ctx = find_get_context(event->pmu, task, cpu); 7177 ctx = find_get_context(event->pmu, task, cpu);
7076 if (IS_ERR(ctx)) { 7178 if (IS_ERR(ctx)) {
7077 err = PTR_ERR(ctx); 7179 err = PTR_ERR(ctx);
@@ -7108,6 +7210,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7108 list_for_each_entry_safe(event, tmp, &src_ctx->event_list, 7210 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
7109 event_entry) { 7211 event_entry) {
7110 perf_remove_from_context(event); 7212 perf_remove_from_context(event);
7213 unaccount_event_cpu(event, src_cpu);
7111 put_ctx(src_ctx); 7214 put_ctx(src_ctx);
7112 list_add(&event->event_entry, &events); 7215 list_add(&event->event_entry, &events);
7113 } 7216 }
@@ -7120,6 +7223,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7120 list_del(&event->event_entry); 7223 list_del(&event->event_entry);
7121 if (event->state >= PERF_EVENT_STATE_OFF) 7224 if (event->state >= PERF_EVENT_STATE_OFF)
7122 event->state = PERF_EVENT_STATE_INACTIVE; 7225 event->state = PERF_EVENT_STATE_INACTIVE;
7226 account_event_cpu(event, dst_cpu);
7123 perf_install_in_context(dst_ctx, event, dst_cpu); 7227 perf_install_in_context(dst_ctx, event, dst_cpu);
7124 get_ctx(dst_ctx); 7228 get_ctx(dst_ctx);
7125 } 7229 }
@@ -7630,7 +7734,7 @@ static void __init perf_event_init_all_cpus(void)
7630 } 7734 }
7631} 7735}
7632 7736
7633static void __cpuinit perf_event_init_cpu(int cpu) 7737static void perf_event_init_cpu(int cpu)
7634{ 7738{
7635 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 7739 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7636 7740
@@ -7719,7 +7823,7 @@ static struct notifier_block perf_reboot_notifier = {
7719 .priority = INT_MIN, 7823 .priority = INT_MIN,
7720}; 7824};
7721 7825
7722static int __cpuinit 7826static int
7723perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) 7827perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
7724{ 7828{
7725 unsigned int cpu = (long)hcpu; 7829 unsigned int cpu = (long)hcpu;
@@ -7800,7 +7904,8 @@ unlock:
7800device_initcall(perf_event_sysfs_init); 7904device_initcall(perf_event_sysfs_init);
7801 7905
7802#ifdef CONFIG_CGROUP_PERF 7906#ifdef CONFIG_CGROUP_PERF
7803static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) 7907static struct cgroup_subsys_state *
7908perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7804{ 7909{
7805 struct perf_cgroup *jc; 7910 struct perf_cgroup *jc;
7806 7911
@@ -7817,11 +7922,10 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7817 return &jc->css; 7922 return &jc->css;
7818} 7923}
7819 7924
7820static void perf_cgroup_css_free(struct cgroup *cont) 7925static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
7821{ 7926{
7822 struct perf_cgroup *jc; 7927 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
7823 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7928
7824 struct perf_cgroup, css);
7825 free_percpu(jc->info); 7929 free_percpu(jc->info);
7826 kfree(jc); 7930 kfree(jc);
7827} 7931}
@@ -7833,15 +7937,17 @@ static int __perf_cgroup_move(void *info)
7833 return 0; 7937 return 0;
7834} 7938}
7835 7939
7836static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 7940static void perf_cgroup_attach(struct cgroup_subsys_state *css,
7941 struct cgroup_taskset *tset)
7837{ 7942{
7838 struct task_struct *task; 7943 struct task_struct *task;
7839 7944
7840 cgroup_taskset_for_each(task, cgrp, tset) 7945 cgroup_taskset_for_each(task, css, tset)
7841 task_function_call(task, __perf_cgroup_move, task); 7946 task_function_call(task, __perf_cgroup_move, task);
7842} 7947}
7843 7948
7844static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 7949static void perf_cgroup_exit(struct cgroup_subsys_state *css,
7950 struct cgroup_subsys_state *old_css,
7845 struct task_struct *task) 7951 struct task_struct *task)
7846{ 7952{
7847 /* 7953 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 66635c80a813..bf46287c91a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1177,7 +1177,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1177 * don't allow the creation of threads. 1177 * don't allow the creation of threads.
1178 */ 1178 */
1179 if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && 1179 if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) &&
1180 (task_active_pid_ns(current) != current->nsproxy->pid_ns)) 1180 (task_active_pid_ns(current) !=
1181 current->nsproxy->pid_ns_for_children))
1181 return ERR_PTR(-EINVAL); 1182 return ERR_PTR(-EINVAL);
1182 1183
1183 retval = security_task_create(clone_flags); 1184 retval = security_task_create(clone_flags);
@@ -1351,7 +1352,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1351 1352
1352 if (pid != &init_struct_pid) { 1353 if (pid != &init_struct_pid) {
1353 retval = -ENOMEM; 1354 retval = -ENOMEM;
1354 pid = alloc_pid(p->nsproxy->pid_ns); 1355 pid = alloc_pid(p->nsproxy->pid_ns_for_children);
1355 if (!pid) 1356 if (!pid)
1356 goto bad_fork_cleanup_io; 1357 goto bad_fork_cleanup_io;
1357 } 1358 }
@@ -1546,7 +1547,7 @@ static inline void init_idle_pids(struct pid_link *links)
1546 } 1547 }
1547} 1548}
1548 1549
1549struct task_struct * __cpuinit fork_idle(int cpu) 1550struct task_struct *fork_idle(int cpu)
1550{ 1551{
1551 struct task_struct *task; 1552 struct task_struct *task;
1552 task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); 1553 task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
@@ -1679,6 +1680,12 @@ SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
1679 int __user *, parent_tidptr, 1680 int __user *, parent_tidptr,
1680 int __user *, child_tidptr, 1681 int __user *, child_tidptr,
1681 int, tls_val) 1682 int, tls_val)
1683#elif defined(CONFIG_CLONE_BACKWARDS3)
1684SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
1685 int, stack_size,
1686 int __user *, parent_tidptr,
1687 int __user *, child_tidptr,
1688 int, tls_val)
1682#else 1689#else
1683SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, 1690SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1684 int __user *, parent_tidptr, 1691 int __user *, parent_tidptr,
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 8b2afc1c9df0..b462fa197517 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -33,7 +33,7 @@ static DEFINE_SPINLOCK(freezer_lock);
33 */ 33 */
34bool freezing_slow_path(struct task_struct *p) 34bool freezing_slow_path(struct task_struct *p)
35{ 35{
36 if (p->flags & PF_NOFREEZE) 36 if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
37 return false; 37 return false;
38 38
39 if (pm_nosig_freezing || cgroup_freezing(p)) 39 if (pm_nosig_freezing || cgroup_freezing(p))
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f0f4fe29cd21..383319bae3f7 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1659,7 +1659,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
1659/* 1659/*
1660 * Functions related to boot-time initialization: 1660 * Functions related to boot-time initialization:
1661 */ 1661 */
1662static void __cpuinit init_hrtimers_cpu(int cpu) 1662static void init_hrtimers_cpu(int cpu)
1663{ 1663{
1664 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1664 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1665 int i; 1665 int i;
@@ -1740,7 +1740,7 @@ static void migrate_hrtimers(int scpu)
1740 1740
1741#endif /* CONFIG_HOTPLUG_CPU */ 1741#endif /* CONFIG_HOTPLUG_CPU */
1742 1742
1743static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, 1743static int hrtimer_cpu_notify(struct notifier_block *self,
1744 unsigned long action, void *hcpu) 1744 unsigned long action, void *hcpu)
1745{ 1745{
1746 int scpu = (long)hcpu; 1746 int scpu = (long)hcpu;
@@ -1773,7 +1773,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1773 return NOTIFY_OK; 1773 return NOTIFY_OK;
1774} 1774}
1775 1775
1776static struct notifier_block __cpuinitdata hrtimers_nb = { 1776static struct notifier_block hrtimers_nb = {
1777 .notifier_call = hrtimer_cpu_notify, 1777 .notifier_call = hrtimer_cpu_notify,
1778}; 1778};
1779 1779
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 6df614912b9d..3e97fb126e6b 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -15,6 +15,7 @@
15#include <linux/lockdep.h> 15#include <linux/lockdep.h>
16#include <linux/export.h> 16#include <linux/export.h>
17#include <linux/sysctl.h> 17#include <linux/sysctl.h>
18#include <linux/utsname.h>
18 19
19/* 20/*
20 * The number of tasks checked: 21 * The number of tasks checked:
@@ -99,10 +100,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
99 * Ok, the task did not get scheduled for more than 2 minutes, 100 * Ok, the task did not get scheduled for more than 2 minutes,
100 * complain: 101 * complain:
101 */ 102 */
102 printk(KERN_ERR "INFO: task %s:%d blocked for more than " 103 pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
103 "%ld seconds.\n", t->comm, t->pid, timeout); 104 t->comm, t->pid, timeout);
104 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" 105 pr_err(" %s %s %.*s\n",
105 " disables this message.\n"); 106 print_tainted(), init_utsname()->release,
107 (int)strcspn(init_utsname()->version, " "),
108 init_utsname()->version);
109 pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
110 " disables this message.\n");
106 sched_show_task(t); 111 sched_show_task(t);
107 debug_show_held_locks(t); 112 debug_show_held_locks(t);
108 113
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 60f48fa0fd0d..297a9247a3b3 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -13,6 +13,7 @@
13#include <linux/sort.h> 13#include <linux/sort.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/static_key.h> 15#include <linux/static_key.h>
16#include <linux/jump_label_ratelimit.h>
16 17
17#ifdef HAVE_JUMP_LABEL 18#ifdef HAVE_JUMP_LABEL
18 19
diff --git a/kernel/lglock.c b/kernel/lglock.c
index 6535a667a5a7..86ae2aebf004 100644
--- a/kernel/lglock.c
+++ b/kernel/lglock.c
@@ -21,7 +21,7 @@ void lg_local_lock(struct lglock *lg)
21 arch_spinlock_t *lock; 21 arch_spinlock_t *lock;
22 22
23 preempt_disable(); 23 preempt_disable();
24 rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); 24 lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
25 lock = this_cpu_ptr(lg->lock); 25 lock = this_cpu_ptr(lg->lock);
26 arch_spin_lock(lock); 26 arch_spin_lock(lock);
27} 27}
@@ -31,7 +31,7 @@ void lg_local_unlock(struct lglock *lg)
31{ 31{
32 arch_spinlock_t *lock; 32 arch_spinlock_t *lock;
33 33
34 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); 34 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
35 lock = this_cpu_ptr(lg->lock); 35 lock = this_cpu_ptr(lg->lock);
36 arch_spin_unlock(lock); 36 arch_spin_unlock(lock);
37 preempt_enable(); 37 preempt_enable();
@@ -43,7 +43,7 @@ void lg_local_lock_cpu(struct lglock *lg, int cpu)
43 arch_spinlock_t *lock; 43 arch_spinlock_t *lock;
44 44
45 preempt_disable(); 45 preempt_disable();
46 rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); 46 lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
47 lock = per_cpu_ptr(lg->lock, cpu); 47 lock = per_cpu_ptr(lg->lock, cpu);
48 arch_spin_lock(lock); 48 arch_spin_lock(lock);
49} 49}
@@ -53,7 +53,7 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu)
53{ 53{
54 arch_spinlock_t *lock; 54 arch_spinlock_t *lock;
55 55
56 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); 56 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
57 lock = per_cpu_ptr(lg->lock, cpu); 57 lock = per_cpu_ptr(lg->lock, cpu);
58 arch_spin_unlock(lock); 58 arch_spin_unlock(lock);
59 preempt_enable(); 59 preempt_enable();
@@ -65,7 +65,7 @@ void lg_global_lock(struct lglock *lg)
65 int i; 65 int i;
66 66
67 preempt_disable(); 67 preempt_disable();
68 rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); 68 lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
69 for_each_possible_cpu(i) { 69 for_each_possible_cpu(i) {
70 arch_spinlock_t *lock; 70 arch_spinlock_t *lock;
71 lock = per_cpu_ptr(lg->lock, i); 71 lock = per_cpu_ptr(lg->lock, i);
@@ -78,7 +78,7 @@ void lg_global_unlock(struct lglock *lg)
78{ 78{
79 int i; 79 int i;
80 80
81 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); 81 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
82 for_each_possible_cpu(i) { 82 for_each_possible_cpu(i) {
83 arch_spinlock_t *lock; 83 arch_spinlock_t *lock;
84 lock = per_cpu_ptr(lg->lock, i); 84 lock = per_cpu_ptr(lg->lock, i);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index ff05f4bd86eb..6d647aedffea 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -209,11 +209,13 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
209 */ 209 */
210static inline int mutex_can_spin_on_owner(struct mutex *lock) 210static inline int mutex_can_spin_on_owner(struct mutex *lock)
211{ 211{
212 struct task_struct *owner;
212 int retval = 1; 213 int retval = 1;
213 214
214 rcu_read_lock(); 215 rcu_read_lock();
215 if (lock->owner) 216 owner = ACCESS_ONCE(lock->owner);
216 retval = lock->owner->on_cpu; 217 if (owner)
218 retval = owner->on_cpu;
217 rcu_read_unlock(); 219 rcu_read_unlock();
218 /* 220 /*
219 * if lock->owner is not set, the mutex owner may have just acquired 221 * if lock->owner is not set, the mutex owner may have just acquired
@@ -461,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
461 * performed the optimistic spinning cannot be done. 463 * performed the optimistic spinning cannot be done.
462 */ 464 */
463 if (ACCESS_ONCE(ww->ctx)) 465 if (ACCESS_ONCE(ww->ctx))
464 break; 466 goto slowpath;
465 } 467 }
466 468
467 /* 469 /*
@@ -472,7 +474,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
472 owner = ACCESS_ONCE(lock->owner); 474 owner = ACCESS_ONCE(lock->owner);
473 if (owner && !mutex_spin_on_owner(lock, owner)) { 475 if (owner && !mutex_spin_on_owner(lock, owner)) {
474 mspin_unlock(MLOCK(lock), &node); 476 mspin_unlock(MLOCK(lock), &node);
475 break; 477 goto slowpath;
476 } 478 }
477 479
478 if ((atomic_read(&lock->count) == 1) && 480 if ((atomic_read(&lock->count) == 1) &&
@@ -499,7 +501,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
499 * the owner complete. 501 * the owner complete.
500 */ 502 */
501 if (!owner && (need_resched() || rt_task(task))) 503 if (!owner && (need_resched() || rt_task(task)))
502 break; 504 goto slowpath;
503 505
504 /* 506 /*
505 * The cpu_relax() call is a compiler barrier which forces 507 * The cpu_relax() call is a compiler barrier which forces
@@ -513,6 +515,10 @@ slowpath:
513#endif 515#endif
514 spin_lock_mutex(&lock->wait_lock, flags); 516 spin_lock_mutex(&lock->wait_lock, flags);
515 517
518 /* once more, can we acquire the lock? */
519 if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1))
520 goto skip_wait;
521
516 debug_mutex_lock_common(lock, &waiter); 522 debug_mutex_lock_common(lock, &waiter);
517 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); 523 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
518 524
@@ -520,9 +526,6 @@ slowpath:
520 list_add_tail(&waiter.list, &lock->wait_list); 526 list_add_tail(&waiter.list, &lock->wait_list);
521 waiter.task = task; 527 waiter.task = task;
522 528
523 if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))
524 goto done;
525
526 lock_contended(&lock->dep_map, ip); 529 lock_contended(&lock->dep_map, ip);
527 530
528 for (;;) { 531 for (;;) {
@@ -536,7 +539,7 @@ slowpath:
536 * other waiters: 539 * other waiters:
537 */ 540 */
538 if (MUTEX_SHOW_NO_WAITER(lock) && 541 if (MUTEX_SHOW_NO_WAITER(lock) &&
539 (atomic_xchg(&lock->count, -1) == 1)) 542 (atomic_xchg(&lock->count, -1) == 1))
540 break; 543 break;
541 544
542 /* 545 /*
@@ -561,24 +564,25 @@ slowpath:
561 schedule_preempt_disabled(); 564 schedule_preempt_disabled();
562 spin_lock_mutex(&lock->wait_lock, flags); 565 spin_lock_mutex(&lock->wait_lock, flags);
563 } 566 }
567 mutex_remove_waiter(lock, &waiter, current_thread_info());
568 /* set it to 0 if there are no waiters left: */
569 if (likely(list_empty(&lock->wait_list)))
570 atomic_set(&lock->count, 0);
571 debug_mutex_free_waiter(&waiter);
564 572
565done: 573skip_wait:
574 /* got the lock - cleanup and rejoice! */
566 lock_acquired(&lock->dep_map, ip); 575 lock_acquired(&lock->dep_map, ip);
567 /* got the lock - rejoice! */
568 mutex_remove_waiter(lock, &waiter, current_thread_info());
569 mutex_set_owner(lock); 576 mutex_set_owner(lock);
570 577
571 if (!__builtin_constant_p(ww_ctx == NULL)) { 578 if (!__builtin_constant_p(ww_ctx == NULL)) {
572 struct ww_mutex *ww = container_of(lock, 579 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
573 struct ww_mutex,
574 base);
575 struct mutex_waiter *cur; 580 struct mutex_waiter *cur;
576 581
577 /* 582 /*
578 * This branch gets optimized out for the common case, 583 * This branch gets optimized out for the common case,
579 * and is only important for ww_mutex_lock. 584 * and is only important for ww_mutex_lock.
580 */ 585 */
581
582 ww_mutex_lock_acquired(ww, ww_ctx); 586 ww_mutex_lock_acquired(ww, ww_ctx);
583 ww->ctx = ww_ctx; 587 ww->ctx = ww_ctx;
584 588
@@ -592,15 +596,8 @@ done:
592 } 596 }
593 } 597 }
594 598
595 /* set it to 0 if there are no waiters left: */
596 if (likely(list_empty(&lock->wait_list)))
597 atomic_set(&lock->count, 0);
598
599 spin_unlock_mutex(&lock->wait_lock, flags); 599 spin_unlock_mutex(&lock->wait_lock, flags);
600
601 debug_mutex_free_waiter(&waiter);
602 preempt_enable(); 600 preempt_enable();
603
604 return 0; 601 return 0;
605 602
606err: 603err:
@@ -686,7 +683,7 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
686 might_sleep(); 683 might_sleep();
687 ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 684 ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
688 0, &ctx->dep_map, _RET_IP_, ctx); 685 0, &ctx->dep_map, _RET_IP_, ctx);
689 if (!ret && ctx->acquired > 0) 686 if (!ret && ctx->acquired > 1)
690 return ww_mutex_deadlock_injection(lock, ctx); 687 return ww_mutex_deadlock_injection(lock, ctx);
691 688
692 return ret; 689 return ret;
@@ -702,7 +699,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
702 ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 699 ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
703 0, &ctx->dep_map, _RET_IP_, ctx); 700 0, &ctx->dep_map, _RET_IP_, ctx);
704 701
705 if (!ret && ctx->acquired > 0) 702 if (!ret && ctx->acquired > 1)
706 return ww_mutex_deadlock_injection(lock, ctx); 703 return ww_mutex_deadlock_injection(lock, ctx);
707 704
708 return ret; 705 return ret;
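The first mutex hunk replaces the double read of lock->owner with a single ACCESS_ONCE() snapshot: without it the compiler may legally reload the pointer between the NULL check and the dereference, and the owner can be cleared by another CPU in that window. A minimal sketch of the hazard and the fix, with illustrative type names:

/* Sketch: check-then-use on a pointer that another CPU may clear. */
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

struct task_ish {
        int on_cpu;
};

struct lock_ish {
        struct task_ish *owner;         /* set and cleared by other CPUs */
};

static int can_spin_buggy(struct lock_ish *lock)
{
        /* BUG: lock->owner may be re-read after the check and be NULL here. */
        if (lock->owner)
                return lock->owner->on_cpu;
        return 1;
}

static int can_spin_fixed(struct lock_ish *lock)
{
        /* one snapshot: either we saw an owner and keep using that pointer,
         * or we saw NULL and never dereference anything */
        struct task_ish *owner = ACCESS_ONCE(lock->owner);

        return owner ? owner->on_cpu : 1;
}

The later hunks change the optimistic-spin exits from break to goto slowpath and add one more trylock (skip_wait) once wait_lock is held, so a task that gets the lock on that retry never touches the wait list at all.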
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 364ceab15f0c..997cbb951a3b 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -29,15 +29,15 @@
29static struct kmem_cache *nsproxy_cachep; 29static struct kmem_cache *nsproxy_cachep;
30 30
31struct nsproxy init_nsproxy = { 31struct nsproxy init_nsproxy = {
32 .count = ATOMIC_INIT(1), 32 .count = ATOMIC_INIT(1),
33 .uts_ns = &init_uts_ns, 33 .uts_ns = &init_uts_ns,
34#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) 34#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
35 .ipc_ns = &init_ipc_ns, 35 .ipc_ns = &init_ipc_ns,
36#endif 36#endif
37 .mnt_ns = NULL, 37 .mnt_ns = NULL,
38 .pid_ns = &init_pid_ns, 38 .pid_ns_for_children = &init_pid_ns,
39#ifdef CONFIG_NET 39#ifdef CONFIG_NET
40 .net_ns = &init_net, 40 .net_ns = &init_net,
41#endif 41#endif
42}; 42};
43 43
@@ -85,9 +85,10 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
85 goto out_ipc; 85 goto out_ipc;
86 } 86 }
87 87
88 new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); 88 new_nsp->pid_ns_for_children =
89 if (IS_ERR(new_nsp->pid_ns)) { 89 copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children);
90 err = PTR_ERR(new_nsp->pid_ns); 90 if (IS_ERR(new_nsp->pid_ns_for_children)) {
91 err = PTR_ERR(new_nsp->pid_ns_for_children);
91 goto out_pid; 92 goto out_pid;
92 } 93 }
93 94
@@ -100,8 +101,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
100 return new_nsp; 101 return new_nsp;
101 102
102out_net: 103out_net:
103 if (new_nsp->pid_ns) 104 if (new_nsp->pid_ns_for_children)
104 put_pid_ns(new_nsp->pid_ns); 105 put_pid_ns(new_nsp->pid_ns_for_children);
105out_pid: 106out_pid:
106 if (new_nsp->ipc_ns) 107 if (new_nsp->ipc_ns)
107 put_ipc_ns(new_nsp->ipc_ns); 108 put_ipc_ns(new_nsp->ipc_ns);
@@ -174,8 +175,8 @@ void free_nsproxy(struct nsproxy *ns)
174 put_uts_ns(ns->uts_ns); 175 put_uts_ns(ns->uts_ns);
175 if (ns->ipc_ns) 176 if (ns->ipc_ns)
176 put_ipc_ns(ns->ipc_ns); 177 put_ipc_ns(ns->ipc_ns);
177 if (ns->pid_ns) 178 if (ns->pid_ns_for_children)
178 put_pid_ns(ns->pid_ns); 179 put_pid_ns(ns->pid_ns_for_children);
179 put_net(ns->net_ns); 180 put_net(ns->net_ns);
180 kmem_cache_free(nsproxy_cachep, ns); 181 kmem_cache_free(nsproxy_cachep, ns);
181} 182}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6917e8edb48e..601bb361c235 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -349,8 +349,8 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
349 if (ancestor != active) 349 if (ancestor != active)
350 return -EINVAL; 350 return -EINVAL;
351 351
352 put_pid_ns(nsproxy->pid_ns); 352 put_pid_ns(nsproxy->pid_ns_for_children);
353 nsproxy->pid_ns = get_pid_ns(new); 353 nsproxy->pid_ns_for_children = get_pid_ns(new);
354 return 0; 354 return 0;
355} 355}
356 356
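The pid_ns field is renamed to pid_ns_for_children throughout because, after unshare(CLONE_NEWPID) or setns() on a PID namespace, the calling task keeps its own namespace and only later-forked children land in the new one; the old name suggested otherwise. A small user-space demonstration of that semantic, assuming root (CAP_SYS_ADMIN) and a kernel with PID namespaces enabled:

/* Sketch: unshare(CLONE_NEWPID) affects children, not the caller itself. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        pid_t child;

        if (unshare(CLONE_NEWPID) < 0) {
                perror("unshare");
                return 1;
        }

        /* still the original namespace: getpid() is unchanged here */
        printf("parent: pid %d after unshare\n", getpid());

        child = fork();
        if (child == 0) {
                /* first task of the new namespace: PID 1 inside it */
                printf("child : pid %d in the new namespace\n", getpid());
                _exit(0);
        }
        waitpid(child, NULL, 0);
        return 0;
}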
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index c6422ffeda9a..9012ecf7b814 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -32,7 +32,8 @@ static void try_to_suspend(struct work_struct *work)
32 32
33 mutex_lock(&autosleep_lock); 33 mutex_lock(&autosleep_lock);
34 34
35 if (!pm_save_wakeup_count(initial_count)) { 35 if (!pm_save_wakeup_count(initial_count) ||
36 system_state != SYSTEM_RUNNING) {
36 mutex_unlock(&autosleep_lock); 37 mutex_unlock(&autosleep_lock);
37 goto out; 38 goto out;
38 } 39 }
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b26f5f1e773e..3085e62a80a5 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -39,7 +39,7 @@ static int resume_delay;
39static char resume_file[256] = CONFIG_PM_STD_PARTITION; 39static char resume_file[256] = CONFIG_PM_STD_PARTITION;
40dev_t swsusp_resume_device; 40dev_t swsusp_resume_device;
41sector_t swsusp_resume_block; 41sector_t swsusp_resume_block;
42int in_suspend __nosavedata; 42__visible int in_suspend __nosavedata;
43 43
44enum { 44enum {
45 HIBERNATION_INVALID, 45 HIBERNATION_INVALID,
diff --git a/kernel/power/process.c b/kernel/power/process.c
index fc0df8486449..06ec8869dbf1 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -109,6 +109,8 @@ static int try_to_freeze_tasks(bool user_only)
109 109
110/** 110/**
111 * freeze_processes - Signal user space processes to enter the refrigerator. 111 * freeze_processes - Signal user space processes to enter the refrigerator.
112 * The current thread will not be frozen. The same process that calls
113 * freeze_processes must later call thaw_processes.
112 * 114 *
113 * On success, returns 0. On failure, -errno and system is fully thawed. 115 * On success, returns 0. On failure, -errno and system is fully thawed.
114 */ 116 */
@@ -120,6 +122,9 @@ int freeze_processes(void)
120 if (error) 122 if (error)
121 return error; 123 return error;
122 124
125 /* Make sure this task doesn't get frozen */
126 current->flags |= PF_SUSPEND_TASK;
127
123 if (!pm_freezing) 128 if (!pm_freezing)
124 atomic_inc(&system_freezing_cnt); 129 atomic_inc(&system_freezing_cnt);
125 130
@@ -168,6 +173,7 @@ int freeze_kernel_threads(void)
168void thaw_processes(void) 173void thaw_processes(void)
169{ 174{
170 struct task_struct *g, *p; 175 struct task_struct *g, *p;
176 struct task_struct *curr = current;
171 177
172 if (pm_freezing) 178 if (pm_freezing)
173 atomic_dec(&system_freezing_cnt); 179 atomic_dec(&system_freezing_cnt);
@@ -182,10 +188,15 @@ void thaw_processes(void)
182 188
183 read_lock(&tasklist_lock); 189 read_lock(&tasklist_lock);
184 do_each_thread(g, p) { 190 do_each_thread(g, p) {
191 /* No other threads should have PF_SUSPEND_TASK set */
192 WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK));
185 __thaw_task(p); 193 __thaw_task(p);
186 } while_each_thread(g, p); 194 } while_each_thread(g, p);
187 read_unlock(&tasklist_lock); 195 read_unlock(&tasklist_lock);
188 196
197 WARN_ON(!(curr->flags & PF_SUSPEND_TASK));
198 curr->flags &= ~PF_SUSPEND_TASK;
199
189 usermodehelper_enable(); 200 usermodehelper_enable();
190 201
191 schedule(); 202 schedule();
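Together with the freezer.c hunk earlier, the change above marks the task that called freeze_processes() with PF_SUSPEND_TASK so freezing_slow_path() never tries to freeze it, and thaw_processes() later clears the flag (warning if any other task carries it). A condensed sketch of the expected pairing in a suspend-style caller; the surrounding steps are elided and the wrapper name is made up:

/* Sketch: the task that freezes user space is exempt from freezing and is
 * expected to be the one that thaws it again. */
static int enter_sleep_state_sketch(void)
{
        int error;

        error = freeze_processes();     /* sets current->flags |= PF_SUSPEND_TASK */
        if (error)
                return error;           /* on failure the system is already thawed */

        /* ... suspend devices, disable nonboot CPUs, enter the platform state ... */

        thaw_processes();               /* clears PF_SUSPEND_TASK on current */
        return 0;
}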
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 06fe28589e9c..a394297f8b2f 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -296,6 +296,17 @@ int pm_qos_request_active(struct pm_qos_request *req)
296} 296}
297EXPORT_SYMBOL_GPL(pm_qos_request_active); 297EXPORT_SYMBOL_GPL(pm_qos_request_active);
298 298
299static void __pm_qos_update_request(struct pm_qos_request *req,
300 s32 new_value)
301{
302 trace_pm_qos_update_request(req->pm_qos_class, new_value);
303
304 if (new_value != req->node.prio)
305 pm_qos_update_target(
306 pm_qos_array[req->pm_qos_class]->constraints,
307 &req->node, PM_QOS_UPDATE_REQ, new_value);
308}
309
299/** 310/**
300 * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout 311 * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout
301 * @work: work struct for the delayed work (timeout) 312 * @work: work struct for the delayed work (timeout)
@@ -308,7 +319,7 @@ static void pm_qos_work_fn(struct work_struct *work)
308 struct pm_qos_request, 319 struct pm_qos_request,
309 work); 320 work);
310 321
311 pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); 322 __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
312} 323}
313 324
314/** 325/**
@@ -364,12 +375,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
364 } 375 }
365 376
366 cancel_delayed_work_sync(&req->work); 377 cancel_delayed_work_sync(&req->work);
367 378 __pm_qos_update_request(req, new_value);
368 trace_pm_qos_update_request(req->pm_qos_class, new_value);
369 if (new_value != req->node.prio)
370 pm_qos_update_target(
371 pm_qos_array[req->pm_qos_class]->constraints,
372 &req->node, PM_QOS_UPDATE_REQ, new_value);
373} 379}
374EXPORT_SYMBOL_GPL(pm_qos_update_request); 380EXPORT_SYMBOL_GPL(pm_qos_update_request);
375 381
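The new __pm_qos_update_request() exists so that pm_qos_work_fn(), which runs as the delayed work item itself, can apply the timeout without going through pm_qos_update_request(): the public helper calls cancel_delayed_work_sync(), and waiting synchronously on the work that is currently executing you is a self-deadlock. A generic sketch of the same split using the standard workqueue API; the structure and names are illustrative:

/* Sketch: keep the raw update in a helper the work function can call, and
 * do the cancel only in the public, process-context entry point. */
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct throttled_value {
        struct delayed_work work;
        int value;
};

static void __update_value(struct throttled_value *tv, int new_value)
{
        if (new_value != tv->value)     /* safe from the work item itself */
                tv->value = new_value;
}

static void timeout_fn(struct work_struct *work)
{
        struct throttled_value *tv =
                container_of(to_delayed_work(work), struct throttled_value, work);

        __update_value(tv, 0);          /* no cancel here: we *are* the work */
}

static void init_value(struct throttled_value *tv)
{
        INIT_DELAYED_WORK(&tv->work, timeout_fn);
        tv->value = -1;
}

static void update_value(struct throttled_value *tv, int new_value)
{
        cancel_delayed_work_sync(&tv->work);    /* fine from process context */
        __update_value(tv, new_value);
}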
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ece04223bb1e..62ee437b5c7e 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -210,6 +210,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
210 goto Platform_wake; 210 goto Platform_wake;
211 } 211 }
212 212
213 ftrace_stop();
213 error = disable_nonboot_cpus(); 214 error = disable_nonboot_cpus();
214 if (error || suspend_test(TEST_CPUS)) 215 if (error || suspend_test(TEST_CPUS))
215 goto Enable_cpus; 216 goto Enable_cpus;
@@ -232,6 +233,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
232 233
233 Enable_cpus: 234 Enable_cpus:
234 enable_nonboot_cpus(); 235 enable_nonboot_cpus();
236 ftrace_start();
235 237
236 Platform_wake: 238 Platform_wake:
237 if (need_suspend_ops(state) && suspend_ops->wake) 239 if (need_suspend_ops(state) && suspend_ops->wake)
@@ -265,7 +267,6 @@ int suspend_devices_and_enter(suspend_state_t state)
265 goto Close; 267 goto Close;
266 } 268 }
267 suspend_console(); 269 suspend_console();
268 ftrace_stop();
269 suspend_test_start(); 270 suspend_test_start();
270 error = dpm_suspend_start(PMSG_SUSPEND); 271 error = dpm_suspend_start(PMSG_SUSPEND);
271 if (error) { 272 if (error) {
@@ -285,7 +286,6 @@ int suspend_devices_and_enter(suspend_state_t state)
285 suspend_test_start(); 286 suspend_test_start();
286 dpm_resume_end(PMSG_RESUME); 287 dpm_resume_end(PMSG_RESUME);
287 suspend_test_finish("resume devices"); 288 suspend_test_finish("resume devices");
288 ftrace_start();
289 resume_console(); 289 resume_console();
290 Close: 290 Close:
291 if (need_suspend_ops(state) && suspend_ops->end) 291 if (need_suspend_ops(state) && suspend_ops->end)
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
new file mode 100644
index 000000000000..85405bdcf2b3
--- /dev/null
+++ b/kernel/printk/Makefile
@@ -0,0 +1,2 @@
1obj-y = printk.o
2obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
new file mode 100644
index 000000000000..276762f3a460
--- /dev/null
+++ b/kernel/printk/braille.c
@@ -0,0 +1,49 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
3#include <linux/kernel.h>
4#include <linux/console.h>
5#include <linux/string.h>
6
7#include "console_cmdline.h"
8#include "braille.h"
9
10char *_braille_console_setup(char **str, char **brl_options)
11{
12 if (!memcmp(*str, "brl,", 4)) {
13 *brl_options = "";
14 *str += 4;
15 } else if (!memcmp(str, "brl=", 4)) {
16 *brl_options = *str + 4;
17 *str = strchr(*brl_options, ',');
18 if (!*str)
19 pr_err("need port name after brl=\n");
20 else
21 *((*str)++) = 0;
22 } else
23 return NULL;
24
25 return *str;
26}
27
28int
29_braille_register_console(struct console *console, struct console_cmdline *c)
30{
31 int rtn = 0;
32
33 if (c->brl_options) {
34 console->flags |= CON_BRL;
35 rtn = braille_register_console(console, c->index, c->options,
36 c->brl_options);
37 }
38
39 return rtn;
40}
41
42int
43_braille_unregister_console(struct console *console)
44{
45 if (console->flags & CON_BRL)
46 return braille_unregister_console(console);
47
48 return 0;
49}
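For reference, the two forms this parser accepts on the kernel command line are console=brl,<port> (braille console on that port with default driver options) and console=brl=<options>,<port>; the port name below is only an example and the options placeholder is left unfilled on purpose:

        console=brl,ttyS0
        console=brl=<options>,ttyS0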
diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h
new file mode 100644
index 000000000000..769d771145c8
--- /dev/null
+++ b/kernel/printk/braille.h
@@ -0,0 +1,48 @@
1#ifndef _PRINTK_BRAILLE_H
2#define _PRINTK_BRAILLE_H
3
4#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
5
6static inline void
7braille_set_options(struct console_cmdline *c, char *brl_options)
8{
9 c->brl_options = brl_options;
10}
11
12char *
13_braille_console_setup(char **str, char **brl_options);
14
15int
16_braille_register_console(struct console *console, struct console_cmdline *c);
17
18int
19_braille_unregister_console(struct console *console);
20
21#else
22
23static inline void
24braille_set_options(struct console_cmdline *c, char *brl_options)
25{
26}
27
28static inline char *
29_braille_console_setup(char **str, char **brl_options)
30{
31 return NULL;
32}
33
34static inline int
35_braille_register_console(struct console *console, struct console_cmdline *c)
36{
37 return 0;
38}
39
40static inline int
41_braille_unregister_console(struct console *console)
42{
43 return 0;
44}
45
46#endif
47
48#endif
diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h
new file mode 100644
index 000000000000..cbd69d842341
--- /dev/null
+++ b/kernel/printk/console_cmdline.h
@@ -0,0 +1,14 @@
1#ifndef _CONSOLE_CMDLINE_H
2#define _CONSOLE_CMDLINE_H
3
4struct console_cmdline
5{
6 char name[8]; /* Name of the driver */
7 int index; /* Minor dev. to use */
8 char *options; /* Options for the driver */
9#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
10 char *brl_options; /* Options for braille driver */
11#endif
12};
13
14#endif
diff --git a/kernel/printk.c b/kernel/printk/printk.c
index d37d45c90ae6..b4e8500afdb3 100644
--- a/kernel/printk.c
+++ b/kernel/printk/printk.c
@@ -51,6 +51,9 @@
51#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
52#include <trace/events/printk.h> 52#include <trace/events/printk.h>
53 53
54#include "console_cmdline.h"
55#include "braille.h"
56
54/* printk's without a loglevel use this.. */ 57/* printk's without a loglevel use this.. */
55#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL 58#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
56 59
@@ -105,19 +108,11 @@ static struct console *exclusive_console;
105/* 108/*
106 * Array of consoles built from command line options (console=) 109 * Array of consoles built from command line options (console=)
107 */ 110 */
108struct console_cmdline
109{
110 char name[8]; /* Name of the driver */
111 int index; /* Minor dev. to use */
112 char *options; /* Options for the driver */
113#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
114 char *brl_options; /* Options for braille driver */
115#endif
116};
117 111
118#define MAX_CMDLINECONSOLES 8 112#define MAX_CMDLINECONSOLES 8
119 113
120static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; 114static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
115
121static int selected_console = -1; 116static int selected_console = -1;
122static int preferred_console = -1; 117static int preferred_console = -1;
123int console_set_on_cmdline; 118int console_set_on_cmdline;
@@ -178,7 +173,7 @@ static int console_may_schedule;
178 * 67 "g" 173 * 67 "g"
179 * 0032 00 00 00 padding to next message header 174 * 0032 00 00 00 padding to next message header
180 * 175 *
181 * The 'struct log' buffer header must never be directly exported to 176 * The 'struct printk_log' buffer header must never be directly exported to
182 * userspace, it is a kernel-private implementation detail that might 177 * userspace, it is a kernel-private implementation detail that might
183 * need to be changed in the future, when the requirements change. 178 * need to be changed in the future, when the requirements change.
184 * 179 *
@@ -200,7 +195,7 @@ enum log_flags {
200 LOG_CONT = 8, /* text is a fragment of a continuation line */ 195 LOG_CONT = 8, /* text is a fragment of a continuation line */
201}; 196};
202 197
203struct log { 198struct printk_log {
204 u64 ts_nsec; /* timestamp in nanoseconds */ 199 u64 ts_nsec; /* timestamp in nanoseconds */
205 u16 len; /* length of entire record */ 200 u16 len; /* length of entire record */
206 u16 text_len; /* length of text buffer */ 201 u16 text_len; /* length of text buffer */
@@ -248,7 +243,7 @@ static u32 clear_idx;
248#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 243#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
249#define LOG_ALIGN 4 244#define LOG_ALIGN 4
250#else 245#else
251#define LOG_ALIGN __alignof__(struct log) 246#define LOG_ALIGN __alignof__(struct printk_log)
252#endif 247#endif
253#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 248#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
254static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); 249static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
@@ -259,35 +254,35 @@ static u32 log_buf_len = __LOG_BUF_LEN;
259static volatile unsigned int logbuf_cpu = UINT_MAX; 254static volatile unsigned int logbuf_cpu = UINT_MAX;
260 255
261/* human readable text of the record */ 256/* human readable text of the record */
262static char *log_text(const struct log *msg) 257static char *log_text(const struct printk_log *msg)
263{ 258{
264 return (char *)msg + sizeof(struct log); 259 return (char *)msg + sizeof(struct printk_log);
265} 260}
266 261
267/* optional key/value pair dictionary attached to the record */ 262/* optional key/value pair dictionary attached to the record */
268static char *log_dict(const struct log *msg) 263static char *log_dict(const struct printk_log *msg)
269{ 264{
270 return (char *)msg + sizeof(struct log) + msg->text_len; 265 return (char *)msg + sizeof(struct printk_log) + msg->text_len;
271} 266}
272 267
273/* get record by index; idx must point to valid msg */ 268/* get record by index; idx must point to valid msg */
274static struct log *log_from_idx(u32 idx) 269static struct printk_log *log_from_idx(u32 idx)
275{ 270{
276 struct log *msg = (struct log *)(log_buf + idx); 271 struct printk_log *msg = (struct printk_log *)(log_buf + idx);
277 272
278 /* 273 /*
279 * A length == 0 record is the end of buffer marker. Wrap around and 274 * A length == 0 record is the end of buffer marker. Wrap around and
280 * read the message at the start of the buffer. 275 * read the message at the start of the buffer.
281 */ 276 */
282 if (!msg->len) 277 if (!msg->len)
283 return (struct log *)log_buf; 278 return (struct printk_log *)log_buf;
284 return msg; 279 return msg;
285} 280}
286 281
287/* get next record; idx must point to valid msg */ 282/* get next record; idx must point to valid msg */
288static u32 log_next(u32 idx) 283static u32 log_next(u32 idx)
289{ 284{
290 struct log *msg = (struct log *)(log_buf + idx); 285 struct printk_log *msg = (struct printk_log *)(log_buf + idx);
291 286
292 /* length == 0 indicates the end of the buffer; wrap */ 287 /* length == 0 indicates the end of the buffer; wrap */
293 /* 288 /*
@@ -296,7 +291,7 @@ static u32 log_next(u32 idx)
296 * return the one after that. 291 * return the one after that.
297 */ 292 */
298 if (!msg->len) { 293 if (!msg->len) {
299 msg = (struct log *)log_buf; 294 msg = (struct printk_log *)log_buf;
300 return msg->len; 295 return msg->len;
301 } 296 }
302 return idx + msg->len; 297 return idx + msg->len;
@@ -308,11 +303,11 @@ static void log_store(int facility, int level,
308 const char *dict, u16 dict_len, 303 const char *dict, u16 dict_len,
309 const char *text, u16 text_len) 304 const char *text, u16 text_len)
310{ 305{
311 struct log *msg; 306 struct printk_log *msg;
312 u32 size, pad_len; 307 u32 size, pad_len;
313 308
314 /* number of '\0' padding bytes to next message */ 309 /* number of '\0' padding bytes to next message */
315 size = sizeof(struct log) + text_len + dict_len; 310 size = sizeof(struct printk_log) + text_len + dict_len;
316 pad_len = (-size) & (LOG_ALIGN - 1); 311 pad_len = (-size) & (LOG_ALIGN - 1);
317 size += pad_len; 312 size += pad_len;
318 313
@@ -324,7 +319,7 @@ static void log_store(int facility, int level,
324 else 319 else
325 free = log_first_idx - log_next_idx; 320 free = log_first_idx - log_next_idx;
326 321
327 if (free > size + sizeof(struct log)) 322 if (free > size + sizeof(struct printk_log))
328 break; 323 break;
329 324
330 /* drop old messages until we have enough contiuous space */ 325 /* drop old messages until we have enough contiuous space */
@@ -332,18 +327,18 @@ static void log_store(int facility, int level,
332 log_first_seq++; 327 log_first_seq++;
333 } 328 }
334 329
335 if (log_next_idx + size + sizeof(struct log) >= log_buf_len) { 330 if (log_next_idx + size + sizeof(struct printk_log) >= log_buf_len) {
336 /* 331 /*
337 * This message + an additional empty header does not fit 332 * This message + an additional empty header does not fit
338 * at the end of the buffer. Add an empty header with len == 0 333 * at the end of the buffer. Add an empty header with len == 0
339 * to signify a wrap around. 334 * to signify a wrap around.
340 */ 335 */
341 memset(log_buf + log_next_idx, 0, sizeof(struct log)); 336 memset(log_buf + log_next_idx, 0, sizeof(struct printk_log));
342 log_next_idx = 0; 337 log_next_idx = 0;
343 } 338 }
344 339
345 /* fill message */ 340 /* fill message */
346 msg = (struct log *)(log_buf + log_next_idx); 341 msg = (struct printk_log *)(log_buf + log_next_idx);
347 memcpy(log_text(msg), text, text_len); 342 memcpy(log_text(msg), text, text_len);
348 msg->text_len = text_len; 343 msg->text_len = text_len;
349 memcpy(log_dict(msg), dict, dict_len); 344 memcpy(log_dict(msg), dict, dict_len);
@@ -356,7 +351,7 @@ static void log_store(int facility, int level,
356 else 351 else
357 msg->ts_nsec = local_clock(); 352 msg->ts_nsec = local_clock();
358 memset(log_dict(msg) + dict_len, 0, pad_len); 353 memset(log_dict(msg) + dict_len, 0, pad_len);
359 msg->len = sizeof(struct log) + text_len + dict_len + pad_len; 354 msg->len = sizeof(struct printk_log) + text_len + dict_len + pad_len;
360 355
361 /* insert message */ 356 /* insert message */
362 log_next_idx += msg->len; 357 log_next_idx += msg->len;
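The size arithmetic in log_store() above is compact: a record is the printk_log header plus the text and dict payloads, rounded up to LOG_ALIGN with the branch-free expression (-size) & (LOG_ALIGN - 1), and a header with len == 0 marks the wrap point. The following stand-alone user-space sketch restates just that rounding; it assumes only that LOG_ALIGN is a power of two, and the struct below is abbreviated for illustration rather than the kernel's exact layout.

	#include <stdint.h>
	#include <stdio.h>

	#define LOG_ALIGN 8u                    /* assumed power-of-two alignment */

	struct printk_log {                     /* abbreviated for illustration */
		uint64_t ts_nsec;
		uint16_t len;
		uint16_t text_len;
		uint16_t dict_len;
		uint8_t  facility;
		uint8_t  level;
	};

	static uint32_t record_size(uint16_t text_len, uint16_t dict_len)
	{
		uint32_t size = sizeof(struct printk_log) + text_len + dict_len;
		uint32_t pad_len = (-size) & (LOG_ALIGN - 1);   /* round up, no division */

		return size + pad_len;          /* what msg->len ends up holding */
	}

	int main(void)
	{
		/* 16-byte header + 13 text bytes = 29, padded up to 32 */
		printf("%u\n", record_size(13, 0));
		return 0;
	}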
@@ -479,7 +474,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
479 size_t count, loff_t *ppos) 474 size_t count, loff_t *ppos)
480{ 475{
481 struct devkmsg_user *user = file->private_data; 476 struct devkmsg_user *user = file->private_data;
482 struct log *msg; 477 struct printk_log *msg;
483 u64 ts_usec; 478 u64 ts_usec;
484 size_t i; 479 size_t i;
485 char cont = '-'; 480 char cont = '-';
@@ -724,14 +719,14 @@ void log_buf_kexec_setup(void)
724 VMCOREINFO_SYMBOL(log_first_idx); 719 VMCOREINFO_SYMBOL(log_first_idx);
725 VMCOREINFO_SYMBOL(log_next_idx); 720 VMCOREINFO_SYMBOL(log_next_idx);
726 /* 721 /*
727 * Export struct log size and field offsets. User space tools can 722 * Export struct printk_log size and field offsets. User space tools can
728 * parse it and detect any changes to structure down the line. 723 * parse it and detect any changes to structure down the line.
729 */ 724 */
730 VMCOREINFO_STRUCT_SIZE(log); 725 VMCOREINFO_STRUCT_SIZE(printk_log);
731 VMCOREINFO_OFFSET(log, ts_nsec); 726 VMCOREINFO_OFFSET(printk_log, ts_nsec);
732 VMCOREINFO_OFFSET(log, len); 727 VMCOREINFO_OFFSET(printk_log, len);
733 VMCOREINFO_OFFSET(log, text_len); 728 VMCOREINFO_OFFSET(printk_log, text_len);
734 VMCOREINFO_OFFSET(log, dict_len); 729 VMCOREINFO_OFFSET(printk_log, dict_len);
735} 730}
736#endif 731#endif
737 732
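The VMCOREINFO_* lines above export sizeof(struct printk_log) and its field offsets into the crash-dump note, so post-mortem tools can walk the log buffer without hard-coding the layout. The note is plain text of the form SIZE(printk_log)=... and OFFSET(printk_log.ts_nsec)=..., so a consumer only needs to parse key=value pairs. The parser below is a hypothetical stand-alone illustration of that, not code taken from any dump tool.

	#include <stdio.h>
	#include <string.h>

	/* Parse one vmcoreinfo line such as "OFFSET(printk_log.ts_nsec)=0". */
	static int parse_offset(const char *line, const char *field, unsigned long *val)
	{
		char key[64];

		snprintf(key, sizeof(key), "OFFSET(printk_log.%s)=", field);
		if (strncmp(line, key, strlen(key)) != 0)
			return 0;               /* not the field we are looking for */
		return sscanf(line + strlen(key), "%lu", val) == 1;
	}

	int main(void)
	{
		unsigned long off;

		if (parse_offset("OFFSET(printk_log.ts_nsec)=0", "ts_nsec", &off))
			printf("ts_nsec lives at offset %lu\n", off);
		return 0;
	}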
@@ -884,7 +879,7 @@ static size_t print_time(u64 ts, char *buf)
884 (unsigned long)ts, rem_nsec / 1000); 879 (unsigned long)ts, rem_nsec / 1000);
885} 880}
886 881
887static size_t print_prefix(const struct log *msg, bool syslog, char *buf) 882static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf)
888{ 883{
889 size_t len = 0; 884 size_t len = 0;
890 unsigned int prefix = (msg->facility << 3) | msg->level; 885 unsigned int prefix = (msg->facility << 3) | msg->level;
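The prefix computed above is the standard syslog PRI value: the facility shifted left by three bits OR'd with the severity, emitted as "<PRI>" ahead of the message text when syslog framing is requested. Kernel messages use facility 0, so the familiar "<6>" is simply KERN_INFO. A small stand-alone illustration of the encoding and its inverse:

	#include <stdio.h>

	int main(void)
	{
		unsigned int facility = 0;              /* LOG_KERN */
		unsigned int level = 6;                 /* KERN_INFO */
		unsigned int pri = (facility << 3) | level;

		printf("<%u>example message\n", pri);   /* prints "<6>example message" */
		printf("facility=%u level=%u\n", pri >> 3, pri & 7);
		return 0;
	}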
@@ -907,7 +902,7 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf)
907 return len; 902 return len;
908} 903}
909 904
910static size_t msg_print_text(const struct log *msg, enum log_flags prev, 905static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
911 bool syslog, char *buf, size_t size) 906 bool syslog, char *buf, size_t size)
912{ 907{
913 const char *text = log_text(msg); 908 const char *text = log_text(msg);
@@ -969,7 +964,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev,
969static int syslog_print(char __user *buf, int size) 964static int syslog_print(char __user *buf, int size)
970{ 965{
971 char *text; 966 char *text;
972 struct log *msg; 967 struct printk_log *msg;
973 int len = 0; 968 int len = 0;
974 969
975 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); 970 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
@@ -1060,7 +1055,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1060 idx = clear_idx; 1055 idx = clear_idx;
1061 prev = 0; 1056 prev = 0;
1062 while (seq < log_next_seq) { 1057 while (seq < log_next_seq) {
1063 struct log *msg = log_from_idx(idx); 1058 struct printk_log *msg = log_from_idx(idx);
1064 1059
1065 len += msg_print_text(msg, prev, true, NULL, 0); 1060 len += msg_print_text(msg, prev, true, NULL, 0);
1066 prev = msg->flags; 1061 prev = msg->flags;
@@ -1073,7 +1068,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1073 idx = clear_idx; 1068 idx = clear_idx;
1074 prev = 0; 1069 prev = 0;
1075 while (len > size && seq < log_next_seq) { 1070 while (len > size && seq < log_next_seq) {
1076 struct log *msg = log_from_idx(idx); 1071 struct printk_log *msg = log_from_idx(idx);
1077 1072
1078 len -= msg_print_text(msg, prev, true, NULL, 0); 1073 len -= msg_print_text(msg, prev, true, NULL, 0);
1079 prev = msg->flags; 1074 prev = msg->flags;
@@ -1087,7 +1082,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1087 len = 0; 1082 len = 0;
1088 prev = 0; 1083 prev = 0;
1089 while (len >= 0 && seq < next_seq) { 1084 while (len >= 0 && seq < next_seq) {
1090 struct log *msg = log_from_idx(idx); 1085 struct printk_log *msg = log_from_idx(idx);
1091 int textlen; 1086 int textlen;
1092 1087
1093 textlen = msg_print_text(msg, prev, true, text, 1088 textlen = msg_print_text(msg, prev, true, text,
@@ -1233,7 +1228,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1233 1228
1234 error = 0; 1229 error = 0;
1235 while (seq < log_next_seq) { 1230 while (seq < log_next_seq) {
1236 struct log *msg = log_from_idx(idx); 1231 struct printk_log *msg = log_from_idx(idx);
1237 1232
1238 error += msg_print_text(msg, prev, true, NULL, 0); 1233 error += msg_print_text(msg, prev, true, NULL, 0);
1239 idx = log_next(idx); 1234 idx = log_next(idx);
@@ -1719,10 +1714,10 @@ static struct cont {
1719 u8 level; 1714 u8 level;
1720 bool flushed:1; 1715 bool flushed:1;
1721} cont; 1716} cont;
1722static struct log *log_from_idx(u32 idx) { return NULL; } 1717static struct printk_log *log_from_idx(u32 idx) { return NULL; }
1723static u32 log_next(u32 idx) { return 0; } 1718static u32 log_next(u32 idx) { return 0; }
1724static void call_console_drivers(int level, const char *text, size_t len) {} 1719static void call_console_drivers(int level, const char *text, size_t len) {}
1725static size_t msg_print_text(const struct log *msg, enum log_flags prev, 1720static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
1726 bool syslog, char *buf, size_t size) { return 0; } 1721 bool syslog, char *buf, size_t size) { return 0; }
1727static size_t cont_print_text(char *text, size_t size) { return 0; } 1722static size_t cont_print_text(char *text, size_t size) { return 0; }
1728 1723
@@ -1761,23 +1756,23 @@ static int __add_preferred_console(char *name, int idx, char *options,
1761 * See if this tty is not yet registered, and 1756 * See if this tty is not yet registered, and
1762 * if we have a slot free. 1757 * if we have a slot free.
1763 */ 1758 */
1764 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) 1759 for (i = 0, c = console_cmdline;
1765 if (strcmp(console_cmdline[i].name, name) == 0 && 1760 i < MAX_CMDLINECONSOLES && c->name[0];
1766 console_cmdline[i].index == idx) { 1761 i++, c++) {
1767 if (!brl_options) 1762 if (strcmp(c->name, name) == 0 && c->index == idx) {
1768 selected_console = i; 1763 if (!brl_options)
1769 return 0; 1764 selected_console = i;
1765 return 0;
1770 } 1766 }
1767 }
1771 if (i == MAX_CMDLINECONSOLES) 1768 if (i == MAX_CMDLINECONSOLES)
1772 return -E2BIG; 1769 return -E2BIG;
1773 if (!brl_options) 1770 if (!brl_options)
1774 selected_console = i; 1771 selected_console = i;
1775 c = &console_cmdline[i];
1776 strlcpy(c->name, name, sizeof(c->name)); 1772 strlcpy(c->name, name, sizeof(c->name));
1777 c->options = options; 1773 c->options = options;
1778#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 1774 braille_set_options(c, brl_options);
1779 c->brl_options = brl_options; 1775
1780#endif
1781 c->index = idx; 1776 c->index = idx;
1782 return 0; 1777 return 0;
1783} 1778}
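This hunk, together with the console_setup(), register_console() and unregister_console() hunks below, replaces the inline CONFIG_A11Y_BRAILLE_CONSOLE #ifdef blocks with calls to braille_set_options(), _braille_console_setup(), _braille_register_console() and _braille_unregister_console(), so the callers stay #ifdef-free and the braille-disabled configuration compiles down to trivial stubs. A plausible shape for those stubs when the option is off is sketched here; it is an assumption about the helper header introduced elsewhere in this series, not a hunk from it.

	static inline void braille_set_options(struct console_cmdline *c,
					       char *brl_options)
	{
	}

	static inline int _braille_console_setup(char **str, char **brl_options)
	{
		return 0;	/* not a braille console; continue normal parsing */
	}

	static inline int _braille_register_console(struct console *console,
						    struct console_cmdline *c)
	{
		return 0;	/* not handled; fall through to normal registration */
	}

	static inline int _braille_unregister_console(struct console *console)
	{
		return 0;	/* nothing to do; caller proceeds with its own teardown */
	}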
@@ -1790,20 +1785,8 @@ static int __init console_setup(char *str)
1790 char *s, *options, *brl_options = NULL; 1785 char *s, *options, *brl_options = NULL;
1791 int idx; 1786 int idx;
1792 1787
1793#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 1788 if (_braille_console_setup(&str, &brl_options))
1794 if (!memcmp(str, "brl,", 4)) { 1789 return 1;
1795 brl_options = "";
1796 str += 4;
1797 } else if (!memcmp(str, "brl=", 4)) {
1798 brl_options = str + 4;
1799 str = strchr(brl_options, ',');
1800 if (!str) {
1801 printk(KERN_ERR "need port name after brl=\n");
1802 return 1;
1803 }
1804 *(str++) = 0;
1805 }
1806#endif
1807 1790
1808 /* 1791 /*
1809 * Decode str into name, index, options. 1792 * Decode str into name, index, options.
@@ -1858,15 +1841,15 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1858 struct console_cmdline *c; 1841 struct console_cmdline *c;
1859 int i; 1842 int i;
1860 1843
1861 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) 1844 for (i = 0, c = console_cmdline;
1862 if (strcmp(console_cmdline[i].name, name) == 0 && 1845 i < MAX_CMDLINECONSOLES && c->name[0];
1863 console_cmdline[i].index == idx) { 1846 i++, c++)
1864 c = &console_cmdline[i]; 1847 if (strcmp(c->name, name) == 0 && c->index == idx) {
1865 strlcpy(c->name, name_new, sizeof(c->name)); 1848 strlcpy(c->name, name_new, sizeof(c->name));
1866 c->name[sizeof(c->name) - 1] = 0; 1849 c->name[sizeof(c->name) - 1] = 0;
1867 c->options = options; 1850 c->options = options;
1868 c->index = idx_new; 1851 c->index = idx_new;
1869 return i; 1852 return i;
1870 } 1853 }
1871 /* not found */ 1854 /* not found */
1872 return -1; 1855 return -1;
@@ -1921,7 +1904,7 @@ void resume_console(void)
1921 * called when a new CPU comes online (or fails to come up), and ensures 1904 * called when a new CPU comes online (or fails to come up), and ensures
1922 * that any such output gets printed. 1905 * that any such output gets printed.
1923 */ 1906 */
1924static int __cpuinit console_cpu_notify(struct notifier_block *self, 1907static int console_cpu_notify(struct notifier_block *self,
1925 unsigned long action, void *hcpu) 1908 unsigned long action, void *hcpu)
1926{ 1909{
1927 switch (action) { 1910 switch (action) {
@@ -2046,7 +2029,7 @@ void console_unlock(void)
2046 console_cont_flush(text, sizeof(text)); 2029 console_cont_flush(text, sizeof(text));
2047again: 2030again:
2048 for (;;) { 2031 for (;;) {
2049 struct log *msg; 2032 struct printk_log *msg;
2050 size_t len; 2033 size_t len;
2051 int level; 2034 int level;
2052 2035
@@ -2241,6 +2224,14 @@ void register_console(struct console *newcon)
2241 int i; 2224 int i;
2242 unsigned long flags; 2225 unsigned long flags;
2243 struct console *bcon = NULL; 2226 struct console *bcon = NULL;
2227 struct console_cmdline *c;
2228
2229 if (console_drivers)
2230 for_each_console(bcon)
2231 if (WARN(bcon == newcon,
2232 "console '%s%d' already registered\n",
2233 bcon->name, bcon->index))
2234 return;
2244 2235
2245 /* 2236 /*
2246 * before we register a new CON_BOOT console, make sure we don't 2237 * before we register a new CON_BOOT console, make sure we don't
@@ -2288,30 +2279,25 @@ void register_console(struct console *newcon)
2288 * See if this console matches one we selected on 2279 * See if this console matches one we selected on
2289 * the command line. 2280 * the command line.
2290 */ 2281 */
2291 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; 2282 for (i = 0, c = console_cmdline;
2292 i++) { 2283 i < MAX_CMDLINECONSOLES && c->name[0];
2293 if (strcmp(console_cmdline[i].name, newcon->name) != 0) 2284 i++, c++) {
2285 if (strcmp(c->name, newcon->name) != 0)
2294 continue; 2286 continue;
2295 if (newcon->index >= 0 && 2287 if (newcon->index >= 0 &&
2296 newcon->index != console_cmdline[i].index) 2288 newcon->index != c->index)
2297 continue; 2289 continue;
2298 if (newcon->index < 0) 2290 if (newcon->index < 0)
2299 newcon->index = console_cmdline[i].index; 2291 newcon->index = c->index;
2300#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 2292
2301 if (console_cmdline[i].brl_options) { 2293 if (_braille_register_console(newcon, c))
2302 newcon->flags |= CON_BRL;
2303 braille_register_console(newcon,
2304 console_cmdline[i].index,
2305 console_cmdline[i].options,
2306 console_cmdline[i].brl_options);
2307 return; 2294 return;
2308 } 2295
2309#endif
2310 if (newcon->setup && 2296 if (newcon->setup &&
2311 newcon->setup(newcon, console_cmdline[i].options) != 0) 2297 newcon->setup(newcon, console_cmdline[i].options) != 0)
2312 break; 2298 break;
2313 newcon->flags |= CON_ENABLED; 2299 newcon->flags |= CON_ENABLED;
2314 newcon->index = console_cmdline[i].index; 2300 newcon->index = c->index;
2315 if (i == selected_console) { 2301 if (i == selected_console) {
2316 newcon->flags |= CON_CONSDEV; 2302 newcon->flags |= CON_CONSDEV;
2317 preferred_console = selected_console; 2303 preferred_console = selected_console;
@@ -2394,13 +2380,13 @@ EXPORT_SYMBOL(register_console);
2394int unregister_console(struct console *console) 2380int unregister_console(struct console *console)
2395{ 2381{
2396 struct console *a, *b; 2382 struct console *a, *b;
2397 int res = 1; 2383 int res;
2398 2384
2399#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 2385 res = _braille_unregister_console(console);
2400 if (console->flags & CON_BRL) 2386 if (res)
2401 return braille_unregister_console(console); 2387 return res;
2402#endif
2403 2388
2389 res = 1;
2404 console_lock(); 2390 console_lock();
2405 if (console_drivers == console) { 2391 if (console_drivers == console) {
2406 console_drivers=console->next; 2392 console_drivers=console->next;
@@ -2666,7 +2652,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)
2666bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, 2652bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
2667 char *line, size_t size, size_t *len) 2653 char *line, size_t size, size_t *len)
2668{ 2654{
2669 struct log *msg; 2655 struct printk_log *msg;
2670 size_t l = 0; 2656 size_t l = 0;
2671 bool ret = false; 2657 bool ret = false;
2672 2658
@@ -2778,7 +2764,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2778 idx = dumper->cur_idx; 2764 idx = dumper->cur_idx;
2779 prev = 0; 2765 prev = 0;
2780 while (seq < dumper->next_seq) { 2766 while (seq < dumper->next_seq) {
2781 struct log *msg = log_from_idx(idx); 2767 struct printk_log *msg = log_from_idx(idx);
2782 2768
2783 l += msg_print_text(msg, prev, true, NULL, 0); 2769 l += msg_print_text(msg, prev, true, NULL, 0);
2784 idx = log_next(idx); 2770 idx = log_next(idx);
@@ -2791,7 +2777,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2791 idx = dumper->cur_idx; 2777 idx = dumper->cur_idx;
2792 prev = 0; 2778 prev = 0;
2793 while (l > size && seq < dumper->next_seq) { 2779 while (l > size && seq < dumper->next_seq) {
2794 struct log *msg = log_from_idx(idx); 2780 struct printk_log *msg = log_from_idx(idx);
2795 2781
2796 l -= msg_print_text(msg, prev, true, NULL, 0); 2782 l -= msg_print_text(msg, prev, true, NULL, 0);
2797 idx = log_next(idx); 2783 idx = log_next(idx);
@@ -2806,7 +2792,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
2806 l = 0; 2792 l = 0;
2807 prev = 0; 2793 prev = 0;
2808 while (seq < dumper->next_seq) { 2794 while (seq < dumper->next_seq) {
2809 struct log *msg = log_from_idx(idx); 2795 struct printk_log *msg = log_from_idx(idx);
2810 2796
2811 l += msg_print_text(msg, prev, syslog, buf + l, size - l); 2797 l += msg_print_text(msg, prev, syslog, buf + l, size - l);
2812 idx = log_next(idx); 2798 idx = log_next(idx);
diff --git a/kernel/profile.c b/kernel/profile.c
index 0bf400737660..6631e1ef55ab 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -331,7 +331,7 @@ out:
331 put_cpu(); 331 put_cpu();
332} 332}
333 333
334static int __cpuinit profile_cpu_callback(struct notifier_block *info, 334static int profile_cpu_callback(struct notifier_block *info,
335 unsigned long action, void *__cpu) 335 unsigned long action, void *__cpu)
336{ 336{
337 int node, cpu = (unsigned long)__cpu; 337 int node, cpu = (unsigned long)__cpu;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 4041f5747e73..a146ee327f6a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -469,7 +469,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
469 /* Architecture-specific hardware disable .. */ 469 /* Architecture-specific hardware disable .. */
470 ptrace_disable(child); 470 ptrace_disable(child);
471 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); 471 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
472 flush_ptrace_hw_breakpoint(child);
473 472
474 write_lock_irq(&tasklist_lock); 473 write_lock_irq(&tasklist_lock);
475 /* 474 /*
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 7f8e7590e3e5..77131966c4ad 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -67,12 +67,15 @@
67 67
68extern struct debug_obj_descr rcuhead_debug_descr; 68extern struct debug_obj_descr rcuhead_debug_descr;
69 69
70static inline void debug_rcu_head_queue(struct rcu_head *head) 70static inline int debug_rcu_head_queue(struct rcu_head *head)
71{ 71{
72 debug_object_activate(head, &rcuhead_debug_descr); 72 int r1;
73
74 r1 = debug_object_activate(head, &rcuhead_debug_descr);
73 debug_object_active_state(head, &rcuhead_debug_descr, 75 debug_object_active_state(head, &rcuhead_debug_descr,
74 STATE_RCU_HEAD_READY, 76 STATE_RCU_HEAD_READY,
75 STATE_RCU_HEAD_QUEUED); 77 STATE_RCU_HEAD_QUEUED);
78 return r1;
76} 79}
77 80
78static inline void debug_rcu_head_unqueue(struct rcu_head *head) 81static inline void debug_rcu_head_unqueue(struct rcu_head *head)
@@ -83,8 +86,9 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
83 debug_object_deactivate(head, &rcuhead_debug_descr); 86 debug_object_deactivate(head, &rcuhead_debug_descr);
84} 87}
85#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 88#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
86static inline void debug_rcu_head_queue(struct rcu_head *head) 89static inline int debug_rcu_head_queue(struct rcu_head *head)
87{ 90{
91 return 0;
88} 92}
89 93
90static inline void debug_rcu_head_unqueue(struct rcu_head *head) 94static inline void debug_rcu_head_unqueue(struct rcu_head *head)
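debug_rcu_head_queue() now propagates the return value of debug_object_activate(), and the !CONFIG_DEBUG_OBJECTS_RCU_HEAD stub returns 0, so a caller can detect when activation was refused, which is what happens when the same rcu_head is handed to call_rcu() twice before the first callback has run. The fragment below is a hedged sketch of a caller-side check built on that return value; the surrounding function and its name are illustrative, not a hunk from this patch.

	#include <linux/kernel.h>
	#include <linux/rcupdate.h>
	#include "rcu.h"

	/* Illustrative enqueue path: refuse (and leak) a duplicate rcu_head
	 * rather than corrupting the callback list. */
	static void example_queue_callback(struct rcu_head *head,
					   void (*func)(struct rcu_head *head))
	{
		if (debug_rcu_head_queue(head)) {
			WARN_ONCE(1, "duplicate call_rcu(), leaking callback\n");
			return;
		}
		head->func = func;
		/* ... link head onto the per-CPU callback list ... */
	}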
@@ -94,7 +98,7 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
94 98
95extern void kfree(const void *); 99extern void kfree(const void *);
96 100
97static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) 101static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
98{ 102{
99 unsigned long offset = (unsigned long)head->func; 103 unsigned long offset = (unsigned long)head->func;
100 104
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index cce6ba8bbace..33eb4620aa17 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -212,43 +212,6 @@ static inline void debug_rcu_head_free(struct rcu_head *head)
212} 212}
213 213
214/* 214/*
215 * fixup_init is called when:
216 * - an active object is initialized
217 */
218static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
219{
220 struct rcu_head *head = addr;
221
222 switch (state) {
223 case ODEBUG_STATE_ACTIVE:
224 /*
225 * Ensure that queued callbacks are all executed.
226 * If we detect that we are nested in a RCU read-side critical
227 * section, we should simply fail, otherwise we would deadlock.
228 * In !PREEMPT configurations, there is no way to tell if we are
229 * in a RCU read-side critical section or not, so we never
230 * attempt any fixup and just print a warning.
231 */
232#ifndef CONFIG_PREEMPT
233 WARN_ON_ONCE(1);
234 return 0;
235#endif
236 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
237 irqs_disabled()) {
238 WARN_ON_ONCE(1);
239 return 0;
240 }
241 rcu_barrier();
242 rcu_barrier_sched();
243 rcu_barrier_bh();
244 debug_object_init(head, &rcuhead_debug_descr);
245 return 1;
246 default:
247 return 0;
248 }
249}
250
251/*
252 * fixup_activate is called when: 215 * fixup_activate is called when:
253 * - an active object is activated 216 * - an active object is activated
254 * - an unknown object is activated (might be a statically initialized object) 217 * - an unknown object is activated (might be a statically initialized object)
@@ -268,69 +231,8 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
268 debug_object_init(head, &rcuhead_debug_descr); 231 debug_object_init(head, &rcuhead_debug_descr);
269 debug_object_activate(head, &rcuhead_debug_descr); 232 debug_object_activate(head, &rcuhead_debug_descr);
270 return 0; 233 return 0;
271
272 case ODEBUG_STATE_ACTIVE:
273 /*
274 * Ensure that queued callbacks are all executed.
275 * If we detect that we are nested in a RCU read-side critical
276 * section, we should simply fail, otherwise we would deadlock.
277 * In !PREEMPT configurations, there is no way to tell if we are
278 * in a RCU read-side critical section or not, so we never
279 * attempt any fixup and just print a warning.
280 */
281#ifndef CONFIG_PREEMPT
282 WARN_ON_ONCE(1);
283 return 0;
284#endif
285 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
286 irqs_disabled()) {
287 WARN_ON_ONCE(1);
288 return 0;
289 }
290 rcu_barrier();
291 rcu_barrier_sched();
292 rcu_barrier_bh();
293 debug_object_activate(head, &rcuhead_debug_descr);
294 return 1;
295 default: 234 default:
296 return 0;
297 }
298}
299
300/*
301 * fixup_free is called when:
302 * - an active object is freed
303 */
304static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
305{
306 struct rcu_head *head = addr;
307
308 switch (state) {
309 case ODEBUG_STATE_ACTIVE:
310 /*
311 * Ensure that queued callbacks are all executed.
312 * If we detect that we are nested in a RCU read-side critical
313 * section, we should simply fail, otherwise we would deadlock.
314 * In !PREEMPT configurations, there is no way to tell if we are
315 * in a RCU read-side critical section or not, so we never
316 * attempt any fixup and just print a warning.
317 */
318#ifndef CONFIG_PREEMPT
319 WARN_ON_ONCE(1);
320 return 0;
321#endif
322 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
323 irqs_disabled()) {
324 WARN_ON_ONCE(1);
325 return 0;
326 }
327 rcu_barrier();
328 rcu_barrier_sched();
329 rcu_barrier_bh();
330 debug_object_free(head, &rcuhead_debug_descr);
331 return 1; 235 return 1;
332 default:
333 return 0;
334 } 236 }
335} 237}
336 238
@@ -369,15 +271,13 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
369 271
370struct debug_obj_descr rcuhead_debug_descr = { 272struct debug_obj_descr rcuhead_debug_descr = {
371 .name = "rcu_head", 273 .name = "rcu_head",
372 .fixup_init = rcuhead_fixup_init,
373 .fixup_activate = rcuhead_fixup_activate, 274 .fixup_activate = rcuhead_fixup_activate,
374 .fixup_free = rcuhead_fixup_free,
375}; 275};
376EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 276EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
377#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 277#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
378 278
379#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) 279#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
380void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp, 280void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
381 unsigned long secs, 281 unsigned long secs,
382 unsigned long c_old, unsigned long c) 282 unsigned long c_old, unsigned long c)
383{ 283{
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index aa344111de3e..9ed6075dc562 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -264,7 +264,7 @@ void rcu_check_callbacks(int cpu, int user)
264 */ 264 */
265static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) 265static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
266{ 266{
267 char *rn = NULL; 267 const char *rn = NULL;
268 struct rcu_head *next, *list; 268 struct rcu_head *next, *list;
269 unsigned long flags; 269 unsigned long flags;
270 RCU_TRACE(int cb_count = 0); 270 RCU_TRACE(int cb_count = 0);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 0cd385acccfa..280d06cae352 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -36,7 +36,7 @@ struct rcu_ctrlblk {
36 RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ 36 RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
37 RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ 37 RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
38 RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ 38 RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
39 RCU_TRACE(char *name); /* Name of RCU type. */ 39 RCU_TRACE(const char *name); /* Name of RCU type. */
40}; 40};
41 41
42/* Definition for rcupdate control block. */ 42/* Definition for rcupdate control block. */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index b1fa5510388d..be63101c6175 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -52,72 +52,78 @@
52MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); 53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
54 54
55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 55static int fqs_duration;
56static int nfakewriters = 4; /* # fake writer threads */
57static int stat_interval = 60; /* Interval between stats, in seconds. */
58 /* Zero means "only at end of test". */
59static bool verbose; /* Print more debug info. */
60static bool test_no_idle_hz = true;
61 /* Test RCU support for tickless idle CPUs. */
62static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
63static int stutter = 5; /* Start/stop testing interval (in sec) */
64static int irqreader = 1; /* RCU readers from irq (timers). */
65static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
66static int fqs_holdoff; /* Hold time within burst (us). */
67static int fqs_stutter = 3; /* Wait time between bursts (s). */
68static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */
69static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
70static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
71static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
72static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */
73static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */
74static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
75static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
76static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
77static char *torture_type = "rcu"; /* What RCU implementation to torture. */
78
79module_param(nreaders, int, 0444);
80MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
81module_param(nfakewriters, int, 0444);
82MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
83module_param(stat_interval, int, 0644);
84MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
85module_param(verbose, bool, 0444);
86MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
87module_param(test_no_idle_hz, bool, 0444);
88MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
89module_param(shuffle_interval, int, 0444);
90MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
91module_param(stutter, int, 0444);
92MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
93module_param(irqreader, int, 0444);
94MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
95module_param(fqs_duration, int, 0444); 56module_param(fqs_duration, int, 0444);
96MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); 57MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable");
58static int fqs_holdoff;
97module_param(fqs_holdoff, int, 0444); 59module_param(fqs_holdoff, int, 0444);
98MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 60MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
61static int fqs_stutter = 3;
99module_param(fqs_stutter, int, 0444); 62module_param(fqs_stutter, int, 0444);
100MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 63MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
64static bool gp_exp;
65module_param(gp_exp, bool, 0444);
66MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives");
67static bool gp_normal;
68module_param(gp_normal, bool, 0444);
69MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives");
70static int irqreader = 1;
71module_param(irqreader, int, 0444);
72MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
73static int n_barrier_cbs;
101module_param(n_barrier_cbs, int, 0444); 74module_param(n_barrier_cbs, int, 0444);
102MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); 75MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
103module_param(onoff_interval, int, 0444); 76static int nfakewriters = 4;
104MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); 77module_param(nfakewriters, int, 0444);
78MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
79static int nreaders = -1;
80module_param(nreaders, int, 0444);
81MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
82static int object_debug;
83module_param(object_debug, int, 0444);
84MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing");
85static int onoff_holdoff;
105module_param(onoff_holdoff, int, 0444); 86module_param(onoff_holdoff, int, 0444);
106MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); 87MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)");
88static int onoff_interval;
89module_param(onoff_interval, int, 0444);
90MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
91static int shuffle_interval = 3;
92module_param(shuffle_interval, int, 0444);
93MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
94static int shutdown_secs;
107module_param(shutdown_secs, int, 0444); 95module_param(shutdown_secs, int, 0444);
108MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); 96MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable.");
97static int stall_cpu;
109module_param(stall_cpu, int, 0444); 98module_param(stall_cpu, int, 0444);
110MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); 99MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable.");
100static int stall_cpu_holdoff = 10;
111module_param(stall_cpu_holdoff, int, 0444); 101module_param(stall_cpu_holdoff, int, 0444);
112MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); 102MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s).");
103static int stat_interval = 60;
104module_param(stat_interval, int, 0644);
105MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
106static int stutter = 5;
107module_param(stutter, int, 0444);
108MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
109static int test_boost = 1;
113module_param(test_boost, int, 0444); 110module_param(test_boost, int, 0444);
114MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 111MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
115module_param(test_boost_interval, int, 0444); 112static int test_boost_duration = 4;
116MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
117module_param(test_boost_duration, int, 0444); 113module_param(test_boost_duration, int, 0444);
118MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); 114MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
115static int test_boost_interval = 7;
116module_param(test_boost_interval, int, 0444);
117MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
118static bool test_no_idle_hz = true;
119module_param(test_no_idle_hz, bool, 0444);
120MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
121static char *torture_type = "rcu";
119module_param(torture_type, charp, 0444); 122module_param(torture_type, charp, 0444);
120MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 123MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
124static bool verbose;
125module_param(verbose, bool, 0444);
126MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
121 127
122#define TORTURE_FLAG "-torture:" 128#define TORTURE_FLAG "-torture:"
123#define PRINTK_STRING(s) \ 129#define PRINTK_STRING(s) \
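Besides adding the gp_exp, gp_normal and object_debug knobs, this hunk reorganizes the existing parameters so that each variable, its module_param() and its MODULE_PARM_DESC() sit together, with the whole set kept in alphabetical order, instead of a block of declarations followed by a separate block of descriptions. The pattern looks like this, using a made-up parameter name purely for illustration:

	#include <linux/module.h>

	static int example_knob = 1;
	module_param(example_knob, int, 0444);
	MODULE_PARM_DESC(example_knob, "Example read-only knob (documentation string)");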
@@ -267,7 +273,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
267 * Absorb kthreads into a kernel function that won't return, so that 273 * Absorb kthreads into a kernel function that won't return, so that
268 * they won't ever access module text or data again. 274 * they won't ever access module text or data again.
269 */ 275 */
270static void rcutorture_shutdown_absorb(char *title) 276static void rcutorture_shutdown_absorb(const char *title)
271{ 277{
272 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { 278 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
273 pr_notice( 279 pr_notice(
@@ -337,7 +343,7 @@ rcu_random(struct rcu_random_state *rrsp)
337} 343}
338 344
339static void 345static void
340rcu_stutter_wait(char *title) 346rcu_stutter_wait(const char *title)
341{ 347{
342 while (stutter_pause_test || !rcutorture_runnable) { 348 while (stutter_pause_test || !rcutorture_runnable) {
343 if (rcutorture_runnable) 349 if (rcutorture_runnable)
@@ -360,13 +366,14 @@ struct rcu_torture_ops {
360 int (*completed)(void); 366 int (*completed)(void);
361 void (*deferred_free)(struct rcu_torture *p); 367 void (*deferred_free)(struct rcu_torture *p);
362 void (*sync)(void); 368 void (*sync)(void);
369 void (*exp_sync)(void);
363 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 370 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
364 void (*cb_barrier)(void); 371 void (*cb_barrier)(void);
365 void (*fqs)(void); 372 void (*fqs)(void);
366 int (*stats)(char *page); 373 int (*stats)(char *page);
367 int irq_capable; 374 int irq_capable;
368 int can_boost; 375 int can_boost;
369 char *name; 376 const char *name;
370}; 377};
371 378
372static struct rcu_torture_ops *cur_ops; 379static struct rcu_torture_ops *cur_ops;
@@ -443,81 +450,27 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
443 call_rcu(&p->rtort_rcu, rcu_torture_cb); 450 call_rcu(&p->rtort_rcu, rcu_torture_cb);
444} 451}
445 452
446static struct rcu_torture_ops rcu_ops = {
447 .init = NULL,
448 .readlock = rcu_torture_read_lock,
449 .read_delay = rcu_read_delay,
450 .readunlock = rcu_torture_read_unlock,
451 .completed = rcu_torture_completed,
452 .deferred_free = rcu_torture_deferred_free,
453 .sync = synchronize_rcu,
454 .call = call_rcu,
455 .cb_barrier = rcu_barrier,
456 .fqs = rcu_force_quiescent_state,
457 .stats = NULL,
458 .irq_capable = 1,
459 .can_boost = rcu_can_boost(),
460 .name = "rcu"
461};
462
463static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
464{
465 int i;
466 struct rcu_torture *rp;
467 struct rcu_torture *rp1;
468
469 cur_ops->sync();
470 list_add(&p->rtort_free, &rcu_torture_removed);
471 list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
472 i = rp->rtort_pipe_count;
473 if (i > RCU_TORTURE_PIPE_LEN)
474 i = RCU_TORTURE_PIPE_LEN;
475 atomic_inc(&rcu_torture_wcount[i]);
476 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
477 rp->rtort_mbtest = 0;
478 list_del(&rp->rtort_free);
479 rcu_torture_free(rp);
480 }
481 }
482}
483
484static void rcu_sync_torture_init(void) 453static void rcu_sync_torture_init(void)
485{ 454{
486 INIT_LIST_HEAD(&rcu_torture_removed); 455 INIT_LIST_HEAD(&rcu_torture_removed);
487} 456}
488 457
489static struct rcu_torture_ops rcu_sync_ops = { 458static struct rcu_torture_ops rcu_ops = {
490 .init = rcu_sync_torture_init, 459 .init = rcu_sync_torture_init,
491 .readlock = rcu_torture_read_lock, 460 .readlock = rcu_torture_read_lock,
492 .read_delay = rcu_read_delay, 461 .read_delay = rcu_read_delay,
493 .readunlock = rcu_torture_read_unlock, 462 .readunlock = rcu_torture_read_unlock,
494 .completed = rcu_torture_completed, 463 .completed = rcu_torture_completed,
495 .deferred_free = rcu_sync_torture_deferred_free, 464 .deferred_free = rcu_torture_deferred_free,
496 .sync = synchronize_rcu, 465 .sync = synchronize_rcu,
497 .call = NULL, 466 .exp_sync = synchronize_rcu_expedited,
498 .cb_barrier = NULL, 467 .call = call_rcu,
499 .fqs = rcu_force_quiescent_state, 468 .cb_barrier = rcu_barrier,
500 .stats = NULL,
501 .irq_capable = 1,
502 .can_boost = rcu_can_boost(),
503 .name = "rcu_sync"
504};
505
506static struct rcu_torture_ops rcu_expedited_ops = {
507 .init = rcu_sync_torture_init,
508 .readlock = rcu_torture_read_lock,
509 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
510 .readunlock = rcu_torture_read_unlock,
511 .completed = rcu_no_completed,
512 .deferred_free = rcu_sync_torture_deferred_free,
513 .sync = synchronize_rcu_expedited,
514 .call = NULL,
515 .cb_barrier = NULL,
516 .fqs = rcu_force_quiescent_state, 469 .fqs = rcu_force_quiescent_state,
517 .stats = NULL, 470 .stats = NULL,
518 .irq_capable = 1, 471 .irq_capable = 1,
519 .can_boost = rcu_can_boost(), 472 .can_boost = rcu_can_boost(),
520 .name = "rcu_expedited" 473 .name = "rcu"
521}; 474};
522 475
523/* 476/*
@@ -546,13 +499,14 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
546} 499}
547 500
548static struct rcu_torture_ops rcu_bh_ops = { 501static struct rcu_torture_ops rcu_bh_ops = {
549 .init = NULL, 502 .init = rcu_sync_torture_init,
550 .readlock = rcu_bh_torture_read_lock, 503 .readlock = rcu_bh_torture_read_lock,
551 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 504 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
552 .readunlock = rcu_bh_torture_read_unlock, 505 .readunlock = rcu_bh_torture_read_unlock,
553 .completed = rcu_bh_torture_completed, 506 .completed = rcu_bh_torture_completed,
554 .deferred_free = rcu_bh_torture_deferred_free, 507 .deferred_free = rcu_bh_torture_deferred_free,
555 .sync = synchronize_rcu_bh, 508 .sync = synchronize_rcu_bh,
509 .exp_sync = synchronize_rcu_bh_expedited,
556 .call = call_rcu_bh, 510 .call = call_rcu_bh,
557 .cb_barrier = rcu_barrier_bh, 511 .cb_barrier = rcu_barrier_bh,
558 .fqs = rcu_bh_force_quiescent_state, 512 .fqs = rcu_bh_force_quiescent_state,
@@ -561,38 +515,6 @@ static struct rcu_torture_ops rcu_bh_ops = {
561 .name = "rcu_bh" 515 .name = "rcu_bh"
562}; 516};
563 517
564static struct rcu_torture_ops rcu_bh_sync_ops = {
565 .init = rcu_sync_torture_init,
566 .readlock = rcu_bh_torture_read_lock,
567 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
568 .readunlock = rcu_bh_torture_read_unlock,
569 .completed = rcu_bh_torture_completed,
570 .deferred_free = rcu_sync_torture_deferred_free,
571 .sync = synchronize_rcu_bh,
572 .call = NULL,
573 .cb_barrier = NULL,
574 .fqs = rcu_bh_force_quiescent_state,
575 .stats = NULL,
576 .irq_capable = 1,
577 .name = "rcu_bh_sync"
578};
579
580static struct rcu_torture_ops rcu_bh_expedited_ops = {
581 .init = rcu_sync_torture_init,
582 .readlock = rcu_bh_torture_read_lock,
583 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
584 .readunlock = rcu_bh_torture_read_unlock,
585 .completed = rcu_bh_torture_completed,
586 .deferred_free = rcu_sync_torture_deferred_free,
587 .sync = synchronize_rcu_bh_expedited,
588 .call = NULL,
589 .cb_barrier = NULL,
590 .fqs = rcu_bh_force_quiescent_state,
591 .stats = NULL,
592 .irq_capable = 1,
593 .name = "rcu_bh_expedited"
594};
595
596/* 518/*
597 * Definitions for srcu torture testing. 519 * Definitions for srcu torture testing.
598 */ 520 */
@@ -667,6 +589,11 @@ static int srcu_torture_stats(char *page)
667 return cnt; 589 return cnt;
668} 590}
669 591
592static void srcu_torture_synchronize_expedited(void)
593{
594 synchronize_srcu_expedited(&srcu_ctl);
595}
596
670static struct rcu_torture_ops srcu_ops = { 597static struct rcu_torture_ops srcu_ops = {
671 .init = rcu_sync_torture_init, 598 .init = rcu_sync_torture_init,
672 .readlock = srcu_torture_read_lock, 599 .readlock = srcu_torture_read_lock,
@@ -675,45 +602,13 @@ static struct rcu_torture_ops srcu_ops = {
675 .completed = srcu_torture_completed, 602 .completed = srcu_torture_completed,
676 .deferred_free = srcu_torture_deferred_free, 603 .deferred_free = srcu_torture_deferred_free,
677 .sync = srcu_torture_synchronize, 604 .sync = srcu_torture_synchronize,
605 .exp_sync = srcu_torture_synchronize_expedited,
678 .call = srcu_torture_call, 606 .call = srcu_torture_call,
679 .cb_barrier = srcu_torture_barrier, 607 .cb_barrier = srcu_torture_barrier,
680 .stats = srcu_torture_stats, 608 .stats = srcu_torture_stats,
681 .name = "srcu" 609 .name = "srcu"
682}; 610};
683 611
684static struct rcu_torture_ops srcu_sync_ops = {
685 .init = rcu_sync_torture_init,
686 .readlock = srcu_torture_read_lock,
687 .read_delay = srcu_read_delay,
688 .readunlock = srcu_torture_read_unlock,
689 .completed = srcu_torture_completed,
690 .deferred_free = rcu_sync_torture_deferred_free,
691 .sync = srcu_torture_synchronize,
692 .call = NULL,
693 .cb_barrier = NULL,
694 .stats = srcu_torture_stats,
695 .name = "srcu_sync"
696};
697
698static void srcu_torture_synchronize_expedited(void)
699{
700 synchronize_srcu_expedited(&srcu_ctl);
701}
702
703static struct rcu_torture_ops srcu_expedited_ops = {
704 .init = rcu_sync_torture_init,
705 .readlock = srcu_torture_read_lock,
706 .read_delay = srcu_read_delay,
707 .readunlock = srcu_torture_read_unlock,
708 .completed = srcu_torture_completed,
709 .deferred_free = rcu_sync_torture_deferred_free,
710 .sync = srcu_torture_synchronize_expedited,
711 .call = NULL,
712 .cb_barrier = NULL,
713 .stats = srcu_torture_stats,
714 .name = "srcu_expedited"
715};
716
717/* 612/*
718 * Definitions for sched torture testing. 613 * Definitions for sched torture testing.
719 */ 614 */
@@ -742,6 +637,8 @@ static struct rcu_torture_ops sched_ops = {
742 .completed = rcu_no_completed, 637 .completed = rcu_no_completed,
743 .deferred_free = rcu_sched_torture_deferred_free, 638 .deferred_free = rcu_sched_torture_deferred_free,
744 .sync = synchronize_sched, 639 .sync = synchronize_sched,
640 .exp_sync = synchronize_sched_expedited,
641 .call = call_rcu_sched,
745 .cb_barrier = rcu_barrier_sched, 642 .cb_barrier = rcu_barrier_sched,
746 .fqs = rcu_sched_force_quiescent_state, 643 .fqs = rcu_sched_force_quiescent_state,
747 .stats = NULL, 644 .stats = NULL,
@@ -749,35 +646,6 @@ static struct rcu_torture_ops sched_ops = {
749 .name = "sched" 646 .name = "sched"
750}; 647};
751 648
752static struct rcu_torture_ops sched_sync_ops = {
753 .init = rcu_sync_torture_init,
754 .readlock = sched_torture_read_lock,
755 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
756 .readunlock = sched_torture_read_unlock,
757 .completed = rcu_no_completed,
758 .deferred_free = rcu_sync_torture_deferred_free,
759 .sync = synchronize_sched,
760 .cb_barrier = NULL,
761 .fqs = rcu_sched_force_quiescent_state,
762 .stats = NULL,
763 .name = "sched_sync"
764};
765
766static struct rcu_torture_ops sched_expedited_ops = {
767 .init = rcu_sync_torture_init,
768 .readlock = sched_torture_read_lock,
769 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
770 .readunlock = sched_torture_read_unlock,
771 .completed = rcu_no_completed,
772 .deferred_free = rcu_sync_torture_deferred_free,
773 .sync = synchronize_sched_expedited,
774 .cb_barrier = NULL,
775 .fqs = rcu_sched_force_quiescent_state,
776 .stats = NULL,
777 .irq_capable = 1,
778 .name = "sched_expedited"
779};
780
781/* 649/*
782 * RCU torture priority-boost testing. Runs one real-time thread per 650 * RCU torture priority-boost testing. Runs one real-time thread per
783 * CPU for moderate bursts, repeatedly registering RCU callbacks and 651 * CPU for moderate bursts, repeatedly registering RCU callbacks and
@@ -927,9 +795,10 @@ rcu_torture_fqs(void *arg)
927static int 795static int
928rcu_torture_writer(void *arg) 796rcu_torture_writer(void *arg)
929{ 797{
798 bool exp;
930 int i; 799 int i;
931 long oldbatch = rcu_batches_completed();
932 struct rcu_torture *rp; 800 struct rcu_torture *rp;
801 struct rcu_torture *rp1;
933 struct rcu_torture *old_rp; 802 struct rcu_torture *old_rp;
934 static DEFINE_RCU_RANDOM(rand); 803 static DEFINE_RCU_RANDOM(rand);
935 804
@@ -954,10 +823,33 @@ rcu_torture_writer(void *arg)
954 i = RCU_TORTURE_PIPE_LEN; 823 i = RCU_TORTURE_PIPE_LEN;
955 atomic_inc(&rcu_torture_wcount[i]); 824 atomic_inc(&rcu_torture_wcount[i]);
956 old_rp->rtort_pipe_count++; 825 old_rp->rtort_pipe_count++;
957 cur_ops->deferred_free(old_rp); 826 if (gp_normal == gp_exp)
827 exp = !!(rcu_random(&rand) & 0x80);
828 else
829 exp = gp_exp;
830 if (!exp) {
831 cur_ops->deferred_free(old_rp);
832 } else {
833 cur_ops->exp_sync();
834 list_add(&old_rp->rtort_free,
835 &rcu_torture_removed);
836 list_for_each_entry_safe(rp, rp1,
837 &rcu_torture_removed,
838 rtort_free) {
839 i = rp->rtort_pipe_count;
840 if (i > RCU_TORTURE_PIPE_LEN)
841 i = RCU_TORTURE_PIPE_LEN;
842 atomic_inc(&rcu_torture_wcount[i]);
843 if (++rp->rtort_pipe_count >=
844 RCU_TORTURE_PIPE_LEN) {
845 rp->rtort_mbtest = 0;
846 list_del(&rp->rtort_free);
847 rcu_torture_free(rp);
848 }
849 }
850 }
958 } 851 }
959 rcutorture_record_progress(++rcu_torture_current_version); 852 rcutorture_record_progress(++rcu_torture_current_version);
960 oldbatch = cur_ops->completed();
961 rcu_stutter_wait("rcu_torture_writer"); 853 rcu_stutter_wait("rcu_torture_writer");
962 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 854 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
963 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 855 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
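The new gp_normal/gp_exp handling above acts as a three-way switch: with neither parameter set (the default) or both set, the writer randomly alternates between deferred freeing via call_rcu() and an expedited synchronous grace period; with exactly one set, that flavor is used every time. A stand-alone restatement of the decision, with rcu_random() replaced by the C library's rand() purely for illustration:

	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>

	static bool choose_expedited(bool gp_normal, bool gp_exp)
	{
		if (gp_normal == gp_exp)        /* neither or both: mix flavors */
			return rand() & 0x80;
		return gp_exp;                  /* exactly one set: honor it */
	}

	int main(void)
	{
		printf("gp_exp only    -> %d\n", choose_expedited(false, true));  /* 1 */
		printf("gp_normal only -> %d\n", choose_expedited(true, false));  /* 0 */
		return 0;
	}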
@@ -983,10 +875,18 @@ rcu_torture_fakewriter(void *arg)
983 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 875 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
984 udelay(rcu_random(&rand) & 0x3ff); 876 udelay(rcu_random(&rand) & 0x3ff);
985 if (cur_ops->cb_barrier != NULL && 877 if (cur_ops->cb_barrier != NULL &&
986 rcu_random(&rand) % (nfakewriters * 8) == 0) 878 rcu_random(&rand) % (nfakewriters * 8) == 0) {
987 cur_ops->cb_barrier(); 879 cur_ops->cb_barrier();
988 else 880 } else if (gp_normal == gp_exp) {
881 if (rcu_random(&rand) & 0x80)
882 cur_ops->sync();
883 else
884 cur_ops->exp_sync();
885 } else if (gp_normal) {
989 cur_ops->sync(); 886 cur_ops->sync();
887 } else {
888 cur_ops->exp_sync();
889 }
990 rcu_stutter_wait("rcu_torture_fakewriter"); 890 rcu_stutter_wait("rcu_torture_fakewriter");
991 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 891 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
992 892
@@ -1364,7 +1264,7 @@ rcu_torture_stutter(void *arg)
1364} 1264}
1365 1265
1366static inline void 1266static inline void
1367rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) 1267rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
1368{ 1268{
1369 pr_alert("%s" TORTURE_FLAG 1269 pr_alert("%s" TORTURE_FLAG
1370 "--- %s: nreaders=%d nfakewriters=%d " 1270 "--- %s: nreaders=%d nfakewriters=%d "
@@ -1476,7 +1376,7 @@ rcu_torture_shutdown(void *arg)
1476 * Execute random CPU-hotplug operations at the interval specified 1376 * Execute random CPU-hotplug operations at the interval specified
1477 * by the onoff_interval. 1377 * by the onoff_interval.
1478 */ 1378 */
1479static int __cpuinit 1379static int
1480rcu_torture_onoff(void *arg) 1380rcu_torture_onoff(void *arg)
1481{ 1381{
1482 int cpu; 1382 int cpu;
@@ -1534,7 +1434,13 @@ rcu_torture_onoff(void *arg)
1534 torture_type, cpu); 1434 torture_type, cpu);
1535 starttime = jiffies; 1435 starttime = jiffies;
1536 n_online_attempts++; 1436 n_online_attempts++;
1537 if (cpu_up(cpu) == 0) { 1437 ret = cpu_up(cpu);
1438 if (ret) {
1439 if (verbose)
1440 pr_alert("%s" TORTURE_FLAG
1441 "rcu_torture_onoff task: online %d failed: errno %d\n",
1442 torture_type, cpu, ret);
1443 } else {
1538 if (verbose) 1444 if (verbose)
1539 pr_alert("%s" TORTURE_FLAG 1445 pr_alert("%s" TORTURE_FLAG
1540 "rcu_torture_onoff task: onlined %d\n", 1446 "rcu_torture_onoff task: onlined %d\n",
@@ -1558,7 +1464,7 @@ rcu_torture_onoff(void *arg)
1558 return 0; 1464 return 0;
1559} 1465}
1560 1466
1561static int __cpuinit 1467static int
1562rcu_torture_onoff_init(void) 1468rcu_torture_onoff_init(void)
1563{ 1469{
1564 int ret; 1470 int ret;
@@ -1601,7 +1507,7 @@ static void rcu_torture_onoff_cleanup(void)
1601 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then 1507 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
1602 * induces a CPU stall for the time specified by stall_cpu. 1508 * induces a CPU stall for the time specified by stall_cpu.
1603 */ 1509 */
1604static int __cpuinit rcu_torture_stall(void *args) 1510static int rcu_torture_stall(void *args)
1605{ 1511{
1606 unsigned long stop_at; 1512 unsigned long stop_at;
1607 1513
@@ -1934,6 +1840,62 @@ rcu_torture_cleanup(void)
1934 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); 1840 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1935} 1841}
1936 1842
1843#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
1844static void rcu_torture_leak_cb(struct rcu_head *rhp)
1845{
1846}
1847
1848static void rcu_torture_err_cb(struct rcu_head *rhp)
1849{
1850 /*
1851 * This -might- happen due to race conditions, but is unlikely.
1852 * The scenario that leads to this happening is that the
1853 * first of the pair of duplicate callbacks is queued,
1854 * someone else starts a grace period that includes that
1855 * callback, then the second of the pair must wait for the
1856 * next grace period. Unlikely, but can happen. If it
1857 * does happen, the debug-objects subsystem won't have splatted.
1858 */
1859 pr_alert("rcutorture: duplicated callback was invoked.\n");
1860}
1861#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
1862
1863/*
1864 * Verify that double-free causes debug-objects to complain, but only
1865 * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test
1866 * cannot be carried out.
1867 */
1868static void rcu_test_debug_objects(void)
1869{
1870#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
1871 struct rcu_head rh1;
1872 struct rcu_head rh2;
1873
1874 init_rcu_head_on_stack(&rh1);
1875 init_rcu_head_on_stack(&rh2);
1876 pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n");
1877
1878 /* Try to queue the rh2 pair of callbacks for the same grace period. */
1879 preempt_disable(); /* Prevent preemption from interrupting test. */
1880 rcu_read_lock(); /* Make it impossible to finish a grace period. */
1881 call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */
1882 local_irq_disable(); /* Make it harder to start a new grace period. */
1883 call_rcu(&rh2, rcu_torture_leak_cb);
1884 call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
1885 local_irq_enable();
1886 rcu_read_unlock();
1887 preempt_enable();
1888
1889 /* Wait for them all to get done so we can safely return. */
1890 rcu_barrier();
1891 pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n");
1892 destroy_rcu_head_on_stack(&rh1);
1893 destroy_rcu_head_on_stack(&rh2);
1894#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
1895 pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n");
1896#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
1897}
1898
1937static int __init 1899static int __init
1938rcu_torture_init(void) 1900rcu_torture_init(void)
1939{ 1901{
@@ -1941,11 +1903,9 @@ rcu_torture_init(void)
1941 int cpu; 1903 int cpu;
1942 int firsterr = 0; 1904 int firsterr = 0;
1943 int retval; 1905 int retval;
1944 static struct rcu_torture_ops *torture_ops[] = 1906 static struct rcu_torture_ops *torture_ops[] = {
1945 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1907 &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops,
1946 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1908 };
1947 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops,
1948 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1949 1909
1950 mutex_lock(&fullstop_mutex); 1910 mutex_lock(&fullstop_mutex);
1951 1911
@@ -2163,6 +2123,8 @@ rcu_torture_init(void)
2163 firsterr = retval; 2123 firsterr = retval;
2164 goto unwind; 2124 goto unwind;
2165 } 2125 }
2126 if (object_debug)
2127 rcu_test_debug_objects();
2166 rcutorture_record_test_transition(); 2128 rcutorture_record_test_transition();
2167 mutex_unlock(&fullstop_mutex); 2129 mutex_unlock(&fullstop_mutex);
2168 return 0; 2130 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e08abb9461ac..32618b3fe4e6 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -53,18 +53,38 @@
53#include <linux/delay.h> 53#include <linux/delay.h>
54#include <linux/stop_machine.h> 54#include <linux/stop_machine.h>
55#include <linux/random.h> 55#include <linux/random.h>
56#include <linux/ftrace_event.h>
57#include <linux/suspend.h>
56 58
57#include "rcutree.h" 59#include "rcutree.h"
58#include <trace/events/rcu.h> 60#include <trace/events/rcu.h>
59 61
60#include "rcu.h" 62#include "rcu.h"
61 63
64/*
65 * Strings used in tracepoints need to be exported via the
66 * tracing system such that tools like perf and trace-cmd can
67 * translate the string address pointers to actual text.
68 */
69#define TPS(x) tracepoint_string(x)
70
62/* Data structures. */ 71/* Data structures. */
63 72
64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 73static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 74static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
66 75
67#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \ 76/*
77 * In order to export the rcu_state name to the tracing tools, it
78 * needs to be added in the __tracepoint_string section.
79 * This requires defining a separate variable tp_<sname>_varname
80 * that points to the string being used, which allows the
81 * userspace tracing tools to resolve the recorded string
82 * address back to the matching text.
83 */
84#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
85static char sname##_varname[] = #sname; \
86static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \
87struct rcu_state sname##_state = { \
68 .level = { &sname##_state.node[0] }, \ 88 .level = { &sname##_state.node[0] }, \
69 .call = cr, \ 89 .call = cr, \
70 .fqs_state = RCU_GP_IDLE, \ 90 .fqs_state = RCU_GP_IDLE, \
@@ -75,16 +95,13 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
75 .orphan_donetail = &sname##_state.orphan_donelist, \ 95 .orphan_donetail = &sname##_state.orphan_donelist, \
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 96 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ 97 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
78 .name = #sname, \ 98 .name = sname##_varname, \
79 .abbr = sabbr, \ 99 .abbr = sabbr, \
80} 100}; \
81 101DEFINE_PER_CPU(struct rcu_data, sname##_data)
82struct rcu_state rcu_sched_state =
83 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
84DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
85 102
86struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 103RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
87DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 104RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
88 105
89static struct rcu_state *rcu_state; 106static struct rcu_state *rcu_state;
90LIST_HEAD(rcu_struct_flavors); 107LIST_HEAD(rcu_struct_flavors);
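Both the TPS() wrapper and the reworked RCU_STATE_INITIALIZER() serve the goal described in the comments above: any constant string whose address is passed to a tracepoint must also be placed in the __tracepoint_string section, so that perf and trace-cmd can translate the recorded pointer back into text. For strings written inline at a trace call, TPS() does this; for the per-flavor state name, the macro now emits the tp_<sname>_varname alias. A minimal sketch of the inline usage, reusing the trace_rcu_grace_period() call that appears later in this diff; the flavor name "rcu_example" and the helper function are made up for the example.

	#include <linux/ftrace_event.h>		/* tracepoint_string() */
	#include <trace/events/rcu.h>

	#define TPS(x) tracepoint_string(x)

	static void example_report_qs(unsigned long gpnum)
	{
		/* Both strings end up in __tracepoint_string, so user space
		 * can resolve the pointers the trace buffer records. */
		trace_rcu_grace_period(TPS("rcu_example"), gpnum, TPS("cpuqs"));
	}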
@@ -178,7 +195,7 @@ void rcu_sched_qs(int cpu)
178 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 195 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
179 196
180 if (rdp->passed_quiesce == 0) 197 if (rdp->passed_quiesce == 0)
181 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); 198 trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs"));
182 rdp->passed_quiesce = 1; 199 rdp->passed_quiesce = 1;
183} 200}
184 201
@@ -187,7 +204,7 @@ void rcu_bh_qs(int cpu)
187 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 204 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
188 205
189 if (rdp->passed_quiesce == 0) 206 if (rdp->passed_quiesce == 0)
190 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); 207 trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs"));
191 rdp->passed_quiesce = 1; 208 rdp->passed_quiesce = 1;
192} 209}
193 210
@@ -198,16 +215,20 @@ void rcu_bh_qs(int cpu)
198 */ 215 */
199void rcu_note_context_switch(int cpu) 216void rcu_note_context_switch(int cpu)
200{ 217{
201 trace_rcu_utilization("Start context switch"); 218 trace_rcu_utilization(TPS("Start context switch"));
202 rcu_sched_qs(cpu); 219 rcu_sched_qs(cpu);
203 rcu_preempt_note_context_switch(cpu); 220 rcu_preempt_note_context_switch(cpu);
204 trace_rcu_utilization("End context switch"); 221 trace_rcu_utilization(TPS("End context switch"));
205} 222}
206EXPORT_SYMBOL_GPL(rcu_note_context_switch); 223EXPORT_SYMBOL_GPL(rcu_note_context_switch);
207 224
208DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 225DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
209 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 226 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
210 .dynticks = ATOMIC_INIT(1), 227 .dynticks = ATOMIC_INIT(1),
228#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
229 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
230 .dynticks_idle = ATOMIC_INIT(1),
231#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
211}; 232};
212 233
213static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 234static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
@@ -226,7 +247,10 @@ module_param(jiffies_till_next_fqs, ulong, 0644);
226 247
227static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 248static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
228 struct rcu_data *rdp); 249 struct rcu_data *rdp);
229static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); 250static void force_qs_rnp(struct rcu_state *rsp,
251 int (*f)(struct rcu_data *rsp, bool *isidle,
252 unsigned long *maxj),
253 bool *isidle, unsigned long *maxj);
230static void force_quiescent_state(struct rcu_state *rsp); 254static void force_quiescent_state(struct rcu_state *rsp);
231static int rcu_pending(int cpu); 255static int rcu_pending(int cpu);
232 256
@@ -345,11 +369,11 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
345static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, 369static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
346 bool user) 370 bool user)
347{ 371{
348 trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); 372 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
349 if (!user && !is_idle_task(current)) { 373 if (!user && !is_idle_task(current)) {
350 struct task_struct *idle = idle_task(smp_processor_id()); 374 struct task_struct *idle = idle_task(smp_processor_id());
351 375
352 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); 376 trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
353 ftrace_dump(DUMP_ORIG); 377 ftrace_dump(DUMP_ORIG);
354 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 378 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
355 current->pid, current->comm, 379 current->pid, current->comm,
@@ -411,6 +435,7 @@ void rcu_idle_enter(void)
411 435
412 local_irq_save(flags); 436 local_irq_save(flags);
413 rcu_eqs_enter(false); 437 rcu_eqs_enter(false);
438 rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0);
414 local_irq_restore(flags); 439 local_irq_restore(flags);
415} 440}
416EXPORT_SYMBOL_GPL(rcu_idle_enter); 441EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -428,27 +453,6 @@ void rcu_user_enter(void)
428{ 453{
429 rcu_eqs_enter(1); 454 rcu_eqs_enter(1);
430} 455}
431
432/**
433 * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace
434 * after the current irq returns.
435 *
436 * This is similar to rcu_user_enter() but in the context of a non-nesting
437 * irq. After this call, RCU enters into idle mode when the interrupt
438 * returns.
439 */
440void rcu_user_enter_after_irq(void)
441{
442 unsigned long flags;
443 struct rcu_dynticks *rdtp;
444
445 local_irq_save(flags);
446 rdtp = &__get_cpu_var(rcu_dynticks);
447 /* Ensure this irq is interrupting a non-idle RCU state. */
448 WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK));
449 rdtp->dynticks_nesting = 1;
450 local_irq_restore(flags);
451}
452#endif /* CONFIG_RCU_USER_QS */ 456#endif /* CONFIG_RCU_USER_QS */
453 457
454/** 458/**
@@ -479,9 +483,10 @@ void rcu_irq_exit(void)
479 rdtp->dynticks_nesting--; 483 rdtp->dynticks_nesting--;
480 WARN_ON_ONCE(rdtp->dynticks_nesting < 0); 484 WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
481 if (rdtp->dynticks_nesting) 485 if (rdtp->dynticks_nesting)
482 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); 486 trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
483 else 487 else
484 rcu_eqs_enter_common(rdtp, oldval, true); 488 rcu_eqs_enter_common(rdtp, oldval, true);
489 rcu_sysidle_enter(rdtp, 1);
485 local_irq_restore(flags); 490 local_irq_restore(flags);
486} 491}
487 492
@@ -501,11 +506,11 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
501 smp_mb__after_atomic_inc(); /* See above. */ 506 smp_mb__after_atomic_inc(); /* See above. */
502 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 507 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
503 rcu_cleanup_after_idle(smp_processor_id()); 508 rcu_cleanup_after_idle(smp_processor_id());
504 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); 509 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
505 if (!user && !is_idle_task(current)) { 510 if (!user && !is_idle_task(current)) {
506 struct task_struct *idle = idle_task(smp_processor_id()); 511 struct task_struct *idle = idle_task(smp_processor_id());
507 512
508 trace_rcu_dyntick("Error on exit: not idle task", 513 trace_rcu_dyntick(TPS("Error on exit: not idle task"),
509 oldval, rdtp->dynticks_nesting); 514 oldval, rdtp->dynticks_nesting);
510 ftrace_dump(DUMP_ORIG); 515 ftrace_dump(DUMP_ORIG);
511 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 516 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -550,6 +555,7 @@ void rcu_idle_exit(void)
550 555
551 local_irq_save(flags); 556 local_irq_save(flags);
552 rcu_eqs_exit(false); 557 rcu_eqs_exit(false);
558 rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0);
553 local_irq_restore(flags); 559 local_irq_restore(flags);
554} 560}
555EXPORT_SYMBOL_GPL(rcu_idle_exit); 561EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -565,28 +571,6 @@ void rcu_user_exit(void)
565{ 571{
566 rcu_eqs_exit(1); 572 rcu_eqs_exit(1);
567} 573}
568
569/**
570 * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace
571 * idle mode after the current non-nesting irq returns.
572 *
573 * This is similar to rcu_user_exit() but in the context of an irq.
574 * This is called when the irq has interrupted a userspace RCU idle mode
575 * context. When the current non-nesting interrupt returns after this call,
576 * the CPU won't restore the RCU idle mode.
577 */
578void rcu_user_exit_after_irq(void)
579{
580 unsigned long flags;
581 struct rcu_dynticks *rdtp;
582
583 local_irq_save(flags);
584 rdtp = &__get_cpu_var(rcu_dynticks);
585 /* Ensure we are interrupting an RCU idle mode. */
586 WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK);
587 rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE;
588 local_irq_restore(flags);
589}
590#endif /* CONFIG_RCU_USER_QS */ 574#endif /* CONFIG_RCU_USER_QS */
591 575
592/** 576/**
@@ -620,9 +604,10 @@ void rcu_irq_enter(void)
620 rdtp->dynticks_nesting++; 604 rdtp->dynticks_nesting++;
621 WARN_ON_ONCE(rdtp->dynticks_nesting == 0); 605 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
622 if (oldval) 606 if (oldval)
623 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); 607 trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
624 else 608 else
625 rcu_eqs_exit_common(rdtp, oldval, true); 609 rcu_eqs_exit_common(rdtp, oldval, true);
610 rcu_sysidle_exit(rdtp, 1);
626 local_irq_restore(flags); 611 local_irq_restore(flags);
627} 612}
628 613
@@ -746,9 +731,11 @@ static int rcu_is_cpu_rrupt_from_idle(void)
746 * credit them with an implicit quiescent state. Return 1 if this CPU 731 * credit them with an implicit quiescent state. Return 1 if this CPU
747 * is in dynticks idle mode, which is an extended quiescent state. 732 * is in dynticks idle mode, which is an extended quiescent state.
748 */ 733 */
749static int dyntick_save_progress_counter(struct rcu_data *rdp) 734static int dyntick_save_progress_counter(struct rcu_data *rdp,
735 bool *isidle, unsigned long *maxj)
750{ 736{
751 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 737 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
738 rcu_sysidle_check_cpu(rdp, isidle, maxj);
752 return (rdp->dynticks_snap & 0x1) == 0; 739 return (rdp->dynticks_snap & 0x1) == 0;
753} 740}
754 741
@@ -758,7 +745,8 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
758 * idle state since the last call to dyntick_save_progress_counter() 745 * idle state since the last call to dyntick_save_progress_counter()
759 * for this same CPU, or by virtue of having been offline. 746 * for this same CPU, or by virtue of having been offline.
760 */ 747 */
761static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 748static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
749 bool *isidle, unsigned long *maxj)
762{ 750{
763 unsigned int curr; 751 unsigned int curr;
764 unsigned int snap; 752 unsigned int snap;
@@ -775,7 +763,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
775 * of the current RCU grace period. 763 * of the current RCU grace period.
776 */ 764 */
777 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { 765 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
778 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); 766 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
779 rdp->dynticks_fqs++; 767 rdp->dynticks_fqs++;
780 return 1; 768 return 1;
781 } 769 }
@@ -795,7 +783,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
795 return 0; /* Grace period is not old enough. */ 783 return 0; /* Grace period is not old enough. */
796 barrier(); 784 barrier();
797 if (cpu_is_offline(rdp->cpu)) { 785 if (cpu_is_offline(rdp->cpu)) {
798 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); 786 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl"));
799 rdp->offline_fqs++; 787 rdp->offline_fqs++;
800 return 1; 788 return 1;
801 } 789 }
@@ -1032,7 +1020,7 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
1032 * rcu_nocb_wait_gp(). 1020 * rcu_nocb_wait_gp().
1033 */ 1021 */
1034static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, 1022static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1035 unsigned long c, char *s) 1023 unsigned long c, const char *s)
1036{ 1024{
1037 trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, 1025 trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
1038 rnp->completed, c, rnp->level, 1026 rnp->completed, c, rnp->level,
@@ -1058,9 +1046,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1058 * grace period is already marked as needed, return to the caller. 1046 * grace period is already marked as needed, return to the caller.
1059 */ 1047 */
1060 c = rcu_cbs_completed(rdp->rsp, rnp); 1048 c = rcu_cbs_completed(rdp->rsp, rnp);
1061 trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); 1049 trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
1062 if (rnp->need_future_gp[c & 0x1]) { 1050 if (rnp->need_future_gp[c & 0x1]) {
1063 trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); 1051 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
1064 return c; 1052 return c;
1065 } 1053 }
1066 1054
@@ -1074,7 +1062,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1074 if (rnp->gpnum != rnp->completed || 1062 if (rnp->gpnum != rnp->completed ||
1075 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { 1063 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
1076 rnp->need_future_gp[c & 0x1]++; 1064 rnp->need_future_gp[c & 0x1]++;
1077 trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); 1065 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
1078 return c; 1066 return c;
1079 } 1067 }
1080 1068
@@ -1102,7 +1090,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1102 * recorded, trace and leave. 1090 * recorded, trace and leave.
1103 */ 1091 */
1104 if (rnp_root->need_future_gp[c & 0x1]) { 1092 if (rnp_root->need_future_gp[c & 0x1]) {
1105 trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); 1093 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot"));
1106 goto unlock_out; 1094 goto unlock_out;
1107 } 1095 }
1108 1096
@@ -1111,9 +1099,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1111 1099
1112 /* If a grace period is not already in progress, start one. */ 1100 /* If a grace period is not already in progress, start one. */
1113 if (rnp_root->gpnum != rnp_root->completed) { 1101 if (rnp_root->gpnum != rnp_root->completed) {
1114 trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); 1102 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
1115 } else { 1103 } else {
1116 trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); 1104 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
1117 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); 1105 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
1118 } 1106 }
1119unlock_out: 1107unlock_out:
@@ -1137,7 +1125,8 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1137 rcu_nocb_gp_cleanup(rsp, rnp); 1125 rcu_nocb_gp_cleanup(rsp, rnp);
1138 rnp->need_future_gp[c & 0x1] = 0; 1126 rnp->need_future_gp[c & 0x1] = 0;
1139 needmore = rnp->need_future_gp[(c + 1) & 0x1]; 1127 needmore = rnp->need_future_gp[(c + 1) & 0x1];
1140 trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); 1128 trace_rcu_future_gp(rnp, rdp, c,
1129 needmore ? TPS("CleanupMore") : TPS("Cleanup"));
1141 return needmore; 1130 return needmore;
1142} 1131}
1143 1132
@@ -1205,9 +1194,9 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1205 1194
1206 /* Trace depending on how much we were able to accelerate. */ 1195 /* Trace depending on how much we were able to accelerate. */
1207 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1196 if (!*rdp->nxttail[RCU_WAIT_TAIL])
1208 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB"); 1197 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
1209 else 1198 else
1210 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB"); 1199 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
1211} 1200}
1212 1201
1213/* 1202/*
@@ -1273,7 +1262,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
1273 1262
1274 /* Remember that we saw this grace-period completion. */ 1263 /* Remember that we saw this grace-period completion. */
1275 rdp->completed = rnp->completed; 1264 rdp->completed = rnp->completed;
1276 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); 1265 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
1277 } 1266 }
1278 1267
1279 if (rdp->gpnum != rnp->gpnum) { 1268 if (rdp->gpnum != rnp->gpnum) {
@@ -1283,7 +1272,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
1283 * go looking for one. 1272 * go looking for one.
1284 */ 1273 */
1285 rdp->gpnum = rnp->gpnum; 1274 rdp->gpnum = rnp->gpnum;
1286 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); 1275 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
1287 rdp->passed_quiesce = 0; 1276 rdp->passed_quiesce = 0;
1288 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 1277 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1289 zero_cpu_stall_ticks(rdp); 1278 zero_cpu_stall_ticks(rdp);
@@ -1315,6 +1304,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1315 struct rcu_data *rdp; 1304 struct rcu_data *rdp;
1316 struct rcu_node *rnp = rcu_get_root(rsp); 1305 struct rcu_node *rnp = rcu_get_root(rsp);
1317 1306
1307 rcu_bind_gp_kthread();
1318 raw_spin_lock_irq(&rnp->lock); 1308 raw_spin_lock_irq(&rnp->lock);
1319 rsp->gp_flags = 0; /* Clear all flags: New grace period. */ 1309 rsp->gp_flags = 0; /* Clear all flags: New grace period. */
1320 1310
@@ -1326,7 +1316,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1326 1316
1327 /* Advance to a new grace period and initialize state. */ 1317 /* Advance to a new grace period and initialize state. */
1328 rsp->gpnum++; 1318 rsp->gpnum++;
1329 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); 1319 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
1330 record_gp_stall_check_time(rsp); 1320 record_gp_stall_check_time(rsp);
1331 raw_spin_unlock_irq(&rnp->lock); 1321 raw_spin_unlock_irq(&rnp->lock);
1332 1322
@@ -1379,16 +1369,25 @@ static int rcu_gp_init(struct rcu_state *rsp)
1379int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) 1369int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1380{ 1370{
1381 int fqs_state = fqs_state_in; 1371 int fqs_state = fqs_state_in;
1372 bool isidle = false;
1373 unsigned long maxj;
1382 struct rcu_node *rnp = rcu_get_root(rsp); 1374 struct rcu_node *rnp = rcu_get_root(rsp);
1383 1375
1384 rsp->n_force_qs++; 1376 rsp->n_force_qs++;
1385 if (fqs_state == RCU_SAVE_DYNTICK) { 1377 if (fqs_state == RCU_SAVE_DYNTICK) {
1386 /* Collect dyntick-idle snapshots. */ 1378 /* Collect dyntick-idle snapshots. */
1387 force_qs_rnp(rsp, dyntick_save_progress_counter); 1379 if (is_sysidle_rcu_state(rsp)) {
1380 isidle = 1;
1381 maxj = jiffies - ULONG_MAX / 4;
1382 }
1383 force_qs_rnp(rsp, dyntick_save_progress_counter,
1384 &isidle, &maxj);
1385 rcu_sysidle_report_gp(rsp, isidle, maxj);
1388 fqs_state = RCU_FORCE_QS; 1386 fqs_state = RCU_FORCE_QS;
1389 } else { 1387 } else {
1390 /* Handle dyntick-idle and offline CPUs. */ 1388 /* Handle dyntick-idle and offline CPUs. */
1391 force_qs_rnp(rsp, rcu_implicit_dynticks_qs); 1389 isidle = 0;
1390 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
1392 } 1391 }
1393 /* Clear flag to prevent immediate re-entry. */ 1392 /* Clear flag to prevent immediate re-entry. */
1394 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1393 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
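One detail worth spelling out is the maxj seed above: "jiffies - ULONG_MAX / 4" is a timestamp a quarter of the counter range in the past, so under wrap-safe jiffies comparisons any genuine idle-entry time reported by a CPU looks newer and replaces it. A minimal sketch of the idea, using a local helper rather than the kernel's exact ULONG_CMP_* macros:

/* Wrap-safe "a is later than b" for unsigned long timestamps (sketch only). */
static inline int later_than(unsigned long a, unsigned long b)
{
	return (long)(a - b) > 0;
}

static void maxj_seed_example(void)
{
	unsigned long maxj = jiffies - ULONG_MAX / 4;	/* effectively "very long ago" */
	unsigned long j = jiffies;			/* a CPU's idle-entry time */

	if (later_than(j, maxj))	/* true for any recent j, so maxj advances */
		maxj = j;
}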
@@ -1448,7 +1447,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1448 rcu_nocb_gp_set(rnp, nocb); 1447 rcu_nocb_gp_set(rnp, nocb);
1449 1448
1450 rsp->completed = rsp->gpnum; /* Declare grace period done. */ 1449 rsp->completed = rsp->gpnum; /* Declare grace period done. */
1451 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1450 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
1452 rsp->fqs_state = RCU_GP_IDLE; 1451 rsp->fqs_state = RCU_GP_IDLE;
1453 rdp = this_cpu_ptr(rsp->rda); 1452 rdp = this_cpu_ptr(rsp->rda);
1454 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ 1453 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
@@ -1558,10 +1557,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1558 1557
1559 /* 1558 /*
1560 * We can't do wakeups while holding the rnp->lock, as that 1559 * We can't do wakeups while holding the rnp->lock, as that
1561 * could cause possible deadlocks with the rq->lock. Deter 1560 * could cause possible deadlocks with the rq->lock. Defer
1562 * the wakeup to interrupt context. 1561 * the wakeup to interrupt context. And don't bother waking
1562 * up the running kthread.
1563 */ 1563 */
1564 irq_work_queue(&rsp->wakeup_work); 1564 if (current != rsp->gp_kthread)
1565 irq_work_queue(&rsp->wakeup_work);
1565} 1566}
1566 1567
1567/* 1568/*
@@ -1857,7 +1858,7 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1857 RCU_TRACE(mask = rdp->grpmask); 1858 RCU_TRACE(mask = rdp->grpmask);
1858 trace_rcu_grace_period(rsp->name, 1859 trace_rcu_grace_period(rsp->name,
1859 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 1860 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
1860 "cpuofl"); 1861 TPS("cpuofl"));
1861} 1862}
1862 1863
1863/* 1864/*
@@ -2044,7 +2045,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2044 */ 2045 */
2045void rcu_check_callbacks(int cpu, int user) 2046void rcu_check_callbacks(int cpu, int user)
2046{ 2047{
2047 trace_rcu_utilization("Start scheduler-tick"); 2048 trace_rcu_utilization(TPS("Start scheduler-tick"));
2048 increment_cpu_stall_ticks(); 2049 increment_cpu_stall_ticks();
2049 if (user || rcu_is_cpu_rrupt_from_idle()) { 2050 if (user || rcu_is_cpu_rrupt_from_idle()) {
2050 2051
@@ -2077,7 +2078,7 @@ void rcu_check_callbacks(int cpu, int user)
2077 rcu_preempt_check_callbacks(cpu); 2078 rcu_preempt_check_callbacks(cpu);
2078 if (rcu_pending(cpu)) 2079 if (rcu_pending(cpu))
2079 invoke_rcu_core(); 2080 invoke_rcu_core();
2080 trace_rcu_utilization("End scheduler-tick"); 2081 trace_rcu_utilization(TPS("End scheduler-tick"));
2081} 2082}
2082 2083
2083/* 2084/*
@@ -2087,7 +2088,10 @@ void rcu_check_callbacks(int cpu, int user)
2087 * 2088 *
2088 * The caller must have suppressed start of new grace periods. 2089 * The caller must have suppressed start of new grace periods.
2089 */ 2090 */
2090static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) 2091static void force_qs_rnp(struct rcu_state *rsp,
2092 int (*f)(struct rcu_data *rsp, bool *isidle,
2093 unsigned long *maxj),
2094 bool *isidle, unsigned long *maxj)
2091{ 2095{
2092 unsigned long bit; 2096 unsigned long bit;
2093 int cpu; 2097 int cpu;
@@ -2110,9 +2114,12 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
2110 cpu = rnp->grplo; 2114 cpu = rnp->grplo;
2111 bit = 1; 2115 bit = 1;
2112 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 2116 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
2113 if ((rnp->qsmask & bit) != 0 && 2117 if ((rnp->qsmask & bit) != 0) {
2114 f(per_cpu_ptr(rsp->rda, cpu))) 2118 if ((rnp->qsmaskinit & bit) != 0)
2115 mask |= bit; 2119 *isidle = 0;
2120 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
2121 mask |= bit;
2122 }
2116 } 2123 }
2117 if (mask != 0) { 2124 if (mask != 0) {
2118 2125
@@ -2208,10 +2215,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
2208 2215
2209 if (cpu_is_offline(smp_processor_id())) 2216 if (cpu_is_offline(smp_processor_id()))
2210 return; 2217 return;
2211 trace_rcu_utilization("Start RCU core"); 2218 trace_rcu_utilization(TPS("Start RCU core"));
2212 for_each_rcu_flavor(rsp) 2219 for_each_rcu_flavor(rsp)
2213 __rcu_process_callbacks(rsp); 2220 __rcu_process_callbacks(rsp);
2214 trace_rcu_utilization("End RCU core"); 2221 trace_rcu_utilization(TPS("End RCU core"));
2215} 2222}
2216 2223
2217/* 2224/*
@@ -2287,6 +2294,13 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2287} 2294}
2288 2295
2289/* 2296/*
2297 * RCU callback function to leak a callback.
2298 */
2299static void rcu_leak_callback(struct rcu_head *rhp)
2300{
2301}
2302
2303/*
2290 * Helper function for call_rcu() and friends. The cpu argument will 2304 * Helper function for call_rcu() and friends. The cpu argument will
2291 * normally be -1, indicating "currently running CPU". It may specify 2305 * normally be -1, indicating "currently running CPU". It may specify
2292 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() 2306 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier()
@@ -2300,7 +2314,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2300 struct rcu_data *rdp; 2314 struct rcu_data *rdp;
2301 2315
2302 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ 2316 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
2303 debug_rcu_head_queue(head); 2317 if (debug_rcu_head_queue(head)) {
2318 /* Probable double call_rcu(), so leak the callback. */
2319 ACCESS_ONCE(head->func) = rcu_leak_callback;
2320 WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
2321 return;
2322 }
2304 head->func = func; 2323 head->func = func;
2305 head->next = NULL; 2324 head->next = NULL;
2306 2325
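The class of caller bug the new debug_rcu_head_queue() check is aimed at, sketched with an illustrative structure and callback (struct foo, foo_reclaim, and foo_release are made-up names, not from this patch):

struct foo {
	struct rcu_head rh;
	/* ... payload ... */
};

static void foo_reclaim(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct foo, rh));
}

static void foo_release(struct foo *p)
{
	/* Buggy caller queuing the same rcu_head twice.  With debug objects
	 * enabled, the second call now warns and deliberately "leaks" the
	 * callback via rcu_leak_callback instead of corrupting the list. */
	call_rcu(&p->rh, foo_reclaim);
	call_rcu(&p->rh, foo_reclaim);	/* duplicate -- caught by the new check */
}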
@@ -2720,7 +2739,7 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2720 * Helper function for _rcu_barrier() tracing. If tracing is disabled, 2739 * Helper function for _rcu_barrier() tracing. If tracing is disabled,
2721 * the compiler is expected to optimize this away. 2740 * the compiler is expected to optimize this away.
2722 */ 2741 */
2723static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, 2742static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s,
2724 int cpu, unsigned long done) 2743 int cpu, unsigned long done)
2725{ 2744{
2726 trace_rcu_barrier(rsp->name, s, cpu, 2745 trace_rcu_barrier(rsp->name, s, cpu,
@@ -2785,9 +2804,20 @@ static void _rcu_barrier(struct rcu_state *rsp)
2785 * transition. The "if" expression below therefore rounds the old 2804 * transition. The "if" expression below therefore rounds the old
2786 * value up to the next even number and adds two before comparing. 2805 * value up to the next even number and adds two before comparing.
2787 */ 2806 */
2788 snap_done = ACCESS_ONCE(rsp->n_barrier_done); 2807 snap_done = rsp->n_barrier_done;
2789 _rcu_barrier_trace(rsp, "Check", -1, snap_done); 2808 _rcu_barrier_trace(rsp, "Check", -1, snap_done);
2790 if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { 2809
2810 /*
2811 * If the value in snap is odd, we needed to wait for the current
2812 * rcu_barrier() to complete, then wait for the next one, in other
2813 * words, we need the value of snap_done to be three larger than
2814 * the value of snap. On the other hand, if the value in snap is
2815 * even, we only had to wait for the next rcu_barrier() to complete,
2816 * in other words, we need the value of snap_done to be only two
2817 * greater than the value of snap. The "(snap + 3) & ~0x1" computes
2818 * this for us (thank you, Linus!).
2819 */
2820 if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) {
2791 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); 2821 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
2792 smp_mb(); /* caller's subsequent code after above check. */ 2822 smp_mb(); /* caller's subsequent code after above check. */
2793 mutex_unlock(&rsp->barrier_mutex); 2823 mutex_unlock(&rsp->barrier_mutex);
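A quick numeric check of the "(snap + 3) & ~0x1" bound described in the new comment:

/*
 * snap even, e.g. 4:  (4 + 3) & ~0x1 = 6 = snap + 2
 *	-> only the next rcu_barrier() needed to complete.
 * snap odd,  e.g. 5:  (5 + 3) & ~0x1 = 8 = snap + 3
 *	-> the in-flight rcu_barrier() plus the next one needed to complete.
 */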
@@ -2910,7 +2940,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2910 * can accept some slop in the rsp->completed access due to the fact 2940 * can accept some slop in the rsp->completed access due to the fact
2911 * that this CPU cannot possibly have any RCU callbacks in flight yet. 2941 * that this CPU cannot possibly have any RCU callbacks in flight yet.
2912 */ 2942 */
2913static void __cpuinit 2943static void
2914rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) 2944rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2915{ 2945{
2916 unsigned long flags; 2946 unsigned long flags;
@@ -2930,6 +2960,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2930 rdp->blimit = blimit; 2960 rdp->blimit = blimit;
2931 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ 2961 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
2932 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 2962 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2963 rcu_sysidle_init_percpu_data(rdp->dynticks);
2933 atomic_set(&rdp->dynticks->dynticks, 2964 atomic_set(&rdp->dynticks->dynticks,
2934 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 2965 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2935 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2966 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
@@ -2952,7 +2983,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2952 rdp->completed = rnp->completed; 2983 rdp->completed = rnp->completed;
2953 rdp->passed_quiesce = 0; 2984 rdp->passed_quiesce = 0;
2954 rdp->qs_pending = 0; 2985 rdp->qs_pending = 0;
2955 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); 2986 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
2956 } 2987 }
2957 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 2988 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
2958 rnp = rnp->parent; 2989 rnp = rnp->parent;
@@ -2962,7 +2993,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2962 mutex_unlock(&rsp->onoff_mutex); 2993 mutex_unlock(&rsp->onoff_mutex);
2963} 2994}
2964 2995
2965static void __cpuinit rcu_prepare_cpu(int cpu) 2996static void rcu_prepare_cpu(int cpu)
2966{ 2997{
2967 struct rcu_state *rsp; 2998 struct rcu_state *rsp;
2968 2999
@@ -2974,7 +3005,7 @@ static void __cpuinit rcu_prepare_cpu(int cpu)
2974/* 3005/*
2975 * Handle CPU online/offline notification events. 3006 * Handle CPU online/offline notification events.
2976 */ 3007 */
2977static int __cpuinit rcu_cpu_notify(struct notifier_block *self, 3008static int rcu_cpu_notify(struct notifier_block *self,
2978 unsigned long action, void *hcpu) 3009 unsigned long action, void *hcpu)
2979{ 3010{
2980 long cpu = (long)hcpu; 3011 long cpu = (long)hcpu;
@@ -2982,7 +3013,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2982 struct rcu_node *rnp = rdp->mynode; 3013 struct rcu_node *rnp = rdp->mynode;
2983 struct rcu_state *rsp; 3014 struct rcu_state *rsp;
2984 3015
2985 trace_rcu_utilization("Start CPU hotplug"); 3016 trace_rcu_utilization(TPS("Start CPU hotplug"));
2986 switch (action) { 3017 switch (action) {
2987 case CPU_UP_PREPARE: 3018 case CPU_UP_PREPARE:
2988 case CPU_UP_PREPARE_FROZEN: 3019 case CPU_UP_PREPARE_FROZEN:
@@ -3011,7 +3042,26 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
3011 default: 3042 default:
3012 break; 3043 break;
3013 } 3044 }
3014 trace_rcu_utilization("End CPU hotplug"); 3045 trace_rcu_utilization(TPS("End CPU hotplug"));
3046 return NOTIFY_OK;
3047}
3048
3049static int rcu_pm_notify(struct notifier_block *self,
3050 unsigned long action, void *hcpu)
3051{
3052 switch (action) {
3053 case PM_HIBERNATION_PREPARE:
3054 case PM_SUSPEND_PREPARE:
3055 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
3056 rcu_expedited = 1;
3057 break;
3058 case PM_POST_HIBERNATION:
3059 case PM_POST_SUSPEND:
3060 rcu_expedited = 0;
3061 break;
3062 default:
3063 break;
3064 }
3015 return NOTIFY_OK; 3065 return NOTIFY_OK;
3016} 3066}
3017 3067
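The intended effect of the new notifier, in outline; this leans on the existing rcu_expedited switch, so the sketch below describes behavior rather than code added by the patch:

/*
 * PM_SUSPEND_PREPARE / PM_HIBERNATION_PREPARE, nr_cpu_ids <= 256:
 *	rcu_expedited = 1;	-> synchronize_rcu() and friends take their
 *				   expedited paths while suspend is in progress
 * PM_POST_SUSPEND / PM_POST_HIBERNATION:
 *	rcu_expedited = 0;	-> normal grace periods resume
 */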
@@ -3256,6 +3306,7 @@ void __init rcu_init(void)
3256 * or the scheduler are operational. 3306 * or the scheduler are operational.
3257 */ 3307 */
3258 cpu_notifier(rcu_cpu_notify, 0); 3308 cpu_notifier(rcu_cpu_notify, 0);
3309 pm_notifier(rcu_pm_notify, 0);
3259 for_each_online_cpu(cpu) 3310 for_each_online_cpu(cpu)
3260 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3311 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3261} 3312}
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4a39d364493c..5f97eab602cd 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -88,6 +88,14 @@ struct rcu_dynticks {
88 /* Process level is worth LLONG_MAX/2. */ 88 /* Process level is worth LLONG_MAX/2. */
89 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 89 int dynticks_nmi_nesting; /* Track NMI nesting level. */
90 atomic_t dynticks; /* Even value for idle, else odd. */ 90 atomic_t dynticks; /* Even value for idle, else odd. */
91#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
92 long long dynticks_idle_nesting;
93 /* irq/process nesting level from idle. */
94 atomic_t dynticks_idle; /* Even value for idle, else odd. */
95 /* "Idle" excludes userspace execution. */
96 unsigned long dynticks_idle_jiffies;
97 /* End of last non-NMI non-idle period. */
98#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
91#ifdef CONFIG_RCU_FAST_NO_HZ 99#ifdef CONFIG_RCU_FAST_NO_HZ
92 bool all_lazy; /* Are all CPU's CBs lazy? */ 100 bool all_lazy; /* Are all CPU's CBs lazy? */
93 unsigned long nonlazy_posted; 101 unsigned long nonlazy_posted;
@@ -445,7 +453,7 @@ struct rcu_state {
445 /* for CPU stalls. */ 453 /* for CPU stalls. */
446 unsigned long gp_max; /* Maximum GP duration in */ 454 unsigned long gp_max; /* Maximum GP duration in */
447 /* jiffies. */ 455 /* jiffies. */
448 char *name; /* Name of structure. */ 456 const char *name; /* Name of structure. */
449 char abbr; /* Abbreviated name. */ 457 char abbr; /* Abbreviated name. */
450 struct list_head flavors; /* List of RCU flavors. */ 458 struct list_head flavors; /* List of RCU flavors. */
451 struct irq_work wakeup_work; /* Postponed wakeups */ 459 struct irq_work wakeup_work; /* Postponed wakeups */
@@ -521,10 +529,10 @@ static void invoke_rcu_callbacks_kthread(void);
521static bool rcu_is_callbacks_kthread(void); 529static bool rcu_is_callbacks_kthread(void);
522#ifdef CONFIG_RCU_BOOST 530#ifdef CONFIG_RCU_BOOST
523static void rcu_preempt_do_callbacks(void); 531static void rcu_preempt_do_callbacks(void);
524static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 532static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
525 struct rcu_node *rnp); 533 struct rcu_node *rnp);
526#endif /* #ifdef CONFIG_RCU_BOOST */ 534#endif /* #ifdef CONFIG_RCU_BOOST */
527static void __cpuinit rcu_prepare_kthreads(int cpu); 535static void rcu_prepare_kthreads(int cpu);
528static void rcu_cleanup_after_idle(int cpu); 536static void rcu_cleanup_after_idle(int cpu);
529static void rcu_prepare_for_idle(int cpu); 537static void rcu_prepare_for_idle(int cpu);
530static void rcu_idle_count_callbacks_posted(void); 538static void rcu_idle_count_callbacks_posted(void);
@@ -545,6 +553,15 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
545static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 553static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
546static void rcu_kick_nohz_cpu(int cpu); 554static void rcu_kick_nohz_cpu(int cpu);
547static bool init_nocb_callback_list(struct rcu_data *rdp); 555static bool init_nocb_callback_list(struct rcu_data *rdp);
556static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
557static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
558static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
559 unsigned long *maxj);
560static bool is_sysidle_rcu_state(struct rcu_state *rsp);
561static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
562 unsigned long maxj);
563static void rcu_bind_gp_kthread(void);
564static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
548 565
549#endif /* #ifndef RCU_TREE_NONCORE */ 566#endif /* #ifndef RCU_TREE_NONCORE */
550 567
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 63098a59216e..130c97b027f2 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -28,7 +28,7 @@
28#include <linux/gfp.h> 28#include <linux/gfp.h>
29#include <linux/oom.h> 29#include <linux/oom.h>
30#include <linux/smpboot.h> 30#include <linux/smpboot.h>
31#include <linux/tick.h> 31#include "time/tick-internal.h"
32 32
33#define RCU_KTHREAD_PRIO 1 33#define RCU_KTHREAD_PRIO 1
34 34
@@ -110,9 +110,7 @@ static void __init rcu_bootup_announce_oddness(void)
110 110
111#ifdef CONFIG_TREE_PREEMPT_RCU 111#ifdef CONFIG_TREE_PREEMPT_RCU
112 112
113struct rcu_state rcu_preempt_state = 113RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
114 RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
115DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
116static struct rcu_state *rcu_state = &rcu_preempt_state; 114static struct rcu_state *rcu_state = &rcu_preempt_state;
117 115
118static int rcu_preempted_readers_exp(struct rcu_node *rnp); 116static int rcu_preempted_readers_exp(struct rcu_node *rnp);
@@ -169,7 +167,7 @@ static void rcu_preempt_qs(int cpu)
169 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 167 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
170 168
171 if (rdp->passed_quiesce == 0) 169 if (rdp->passed_quiesce == 0)
172 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); 170 trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs"));
173 rdp->passed_quiesce = 1; 171 rdp->passed_quiesce = 1;
174 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 172 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
175} 173}
@@ -388,7 +386,7 @@ void rcu_read_unlock_special(struct task_struct *t)
388 np = rcu_next_node_entry(t, rnp); 386 np = rcu_next_node_entry(t, rnp);
389 list_del_init(&t->rcu_node_entry); 387 list_del_init(&t->rcu_node_entry);
390 t->rcu_blocked_node = NULL; 388 t->rcu_blocked_node = NULL;
391 trace_rcu_unlock_preempted_task("rcu_preempt", 389 trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
392 rnp->gpnum, t->pid); 390 rnp->gpnum, t->pid);
393 if (&t->rcu_node_entry == rnp->gp_tasks) 391 if (&t->rcu_node_entry == rnp->gp_tasks)
394 rnp->gp_tasks = np; 392 rnp->gp_tasks = np;
@@ -412,7 +410,7 @@ void rcu_read_unlock_special(struct task_struct *t)
412 */ 410 */
413 empty_exp_now = !rcu_preempted_readers_exp(rnp); 411 empty_exp_now = !rcu_preempted_readers_exp(rnp);
414 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { 412 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
415 trace_rcu_quiescent_state_report("preempt_rcu", 413 trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
416 rnp->gpnum, 414 rnp->gpnum,
417 0, rnp->qsmask, 415 0, rnp->qsmask,
418 rnp->level, 416 rnp->level,
@@ -1250,12 +1248,12 @@ static int rcu_boost_kthread(void *arg)
1250 int spincnt = 0; 1248 int spincnt = 0;
1251 int more2boost; 1249 int more2boost;
1252 1250
1253 trace_rcu_utilization("Start boost kthread@init"); 1251 trace_rcu_utilization(TPS("Start boost kthread@init"));
1254 for (;;) { 1252 for (;;) {
1255 rnp->boost_kthread_status = RCU_KTHREAD_WAITING; 1253 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1256 trace_rcu_utilization("End boost kthread@rcu_wait"); 1254 trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
1257 rcu_wait(rnp->boost_tasks || rnp->exp_tasks); 1255 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1258 trace_rcu_utilization("Start boost kthread@rcu_wait"); 1256 trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
1259 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; 1257 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1260 more2boost = rcu_boost(rnp); 1258 more2boost = rcu_boost(rnp);
1261 if (more2boost) 1259 if (more2boost)
@@ -1264,14 +1262,14 @@ static int rcu_boost_kthread(void *arg)
1264 spincnt = 0; 1262 spincnt = 0;
1265 if (spincnt > 10) { 1263 if (spincnt > 10) {
1266 rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; 1264 rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
1267 trace_rcu_utilization("End boost kthread@rcu_yield"); 1265 trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
1268 schedule_timeout_interruptible(2); 1266 schedule_timeout_interruptible(2);
1269 trace_rcu_utilization("Start boost kthread@rcu_yield"); 1267 trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
1270 spincnt = 0; 1268 spincnt = 0;
1271 } 1269 }
1272 } 1270 }
1273 /* NOTREACHED */ 1271 /* NOTREACHED */
1274 trace_rcu_utilization("End boost kthread@notreached"); 1272 trace_rcu_utilization(TPS("End boost kthread@notreached"));
1275 return 0; 1273 return 0;
1276} 1274}
1277 1275
@@ -1352,7 +1350,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1352 * already exist. We only create this kthread for preemptible RCU. 1350 * already exist. We only create this kthread for preemptible RCU.
1353 * Returns zero if all is well, a negated errno otherwise. 1351 * Returns zero if all is well, a negated errno otherwise.
1354 */ 1352 */
1355static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 1353static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1356 struct rcu_node *rnp) 1354 struct rcu_node *rnp)
1357{ 1355{
1358 int rnp_index = rnp - &rsp->node[0]; 1356 int rnp_index = rnp - &rsp->node[0];
@@ -1419,7 +1417,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
1419 int spincnt; 1417 int spincnt;
1420 1418
1421 for (spincnt = 0; spincnt < 10; spincnt++) { 1419 for (spincnt = 0; spincnt < 10; spincnt++) {
1422 trace_rcu_utilization("Start CPU kthread@rcu_wait"); 1420 trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
1423 local_bh_disable(); 1421 local_bh_disable();
1424 *statusp = RCU_KTHREAD_RUNNING; 1422 *statusp = RCU_KTHREAD_RUNNING;
1425 this_cpu_inc(rcu_cpu_kthread_loops); 1423 this_cpu_inc(rcu_cpu_kthread_loops);
@@ -1431,15 +1429,15 @@ static void rcu_cpu_kthread(unsigned int cpu)
1431 rcu_kthread_do_work(); 1429 rcu_kthread_do_work();
1432 local_bh_enable(); 1430 local_bh_enable();
1433 if (*workp == 0) { 1431 if (*workp == 0) {
1434 trace_rcu_utilization("End CPU kthread@rcu_wait"); 1432 trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
1435 *statusp = RCU_KTHREAD_WAITING; 1433 *statusp = RCU_KTHREAD_WAITING;
1436 return; 1434 return;
1437 } 1435 }
1438 } 1436 }
1439 *statusp = RCU_KTHREAD_YIELDING; 1437 *statusp = RCU_KTHREAD_YIELDING;
1440 trace_rcu_utilization("Start CPU kthread@rcu_yield"); 1438 trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
1441 schedule_timeout_interruptible(2); 1439 schedule_timeout_interruptible(2);
1442 trace_rcu_utilization("End CPU kthread@rcu_yield"); 1440 trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
1443 *statusp = RCU_KTHREAD_WAITING; 1441 *statusp = RCU_KTHREAD_WAITING;
1444} 1442}
1445 1443
@@ -1507,7 +1505,7 @@ static int __init rcu_spawn_kthreads(void)
1507} 1505}
1508early_initcall(rcu_spawn_kthreads); 1506early_initcall(rcu_spawn_kthreads);
1509 1507
1510static void __cpuinit rcu_prepare_kthreads(int cpu) 1508static void rcu_prepare_kthreads(int cpu)
1511{ 1509{
1512 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 1510 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1513 struct rcu_node *rnp = rdp->mynode; 1511 struct rcu_node *rnp = rdp->mynode;
@@ -1549,7 +1547,7 @@ static int __init rcu_scheduler_really_started(void)
1549} 1547}
1550early_initcall(rcu_scheduler_really_started); 1548early_initcall(rcu_scheduler_really_started);
1551 1549
1552static void __cpuinit rcu_prepare_kthreads(int cpu) 1550static void rcu_prepare_kthreads(int cpu)
1553{ 1551{
1554} 1552}
1555 1553
@@ -2202,7 +2200,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2202 * Wait for the grace period. Do so interruptibly to avoid messing 2200 * Wait for the grace period. Do so interruptibly to avoid messing
2203 * up the load average. 2201 * up the load average.
2204 */ 2202 */
2205 trace_rcu_future_gp(rnp, rdp, c, "StartWait"); 2203 trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
2206 for (;;) { 2204 for (;;) {
2207 wait_event_interruptible( 2205 wait_event_interruptible(
2208 rnp->nocb_gp_wq[c & 0x1], 2206 rnp->nocb_gp_wq[c & 0x1],
@@ -2210,9 +2208,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2210 if (likely(d)) 2208 if (likely(d))
2211 break; 2209 break;
2212 flush_signals(current); 2210 flush_signals(current);
2213 trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); 2211 trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
2214 } 2212 }
2215 trace_rcu_future_gp(rnp, rdp, c, "EndWait"); 2213 trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
2216 smp_mb(); /* Ensure that CB invocation happens after GP end. */ 2214 smp_mb(); /* Ensure that CB invocation happens after GP end. */
2217} 2215}
2218 2216
@@ -2375,3 +2373,425 @@ static void rcu_kick_nohz_cpu(int cpu)
2375 smp_send_reschedule(cpu); 2373 smp_send_reschedule(cpu);
2376#endif /* #ifdef CONFIG_NO_HZ_FULL */ 2374#endif /* #ifdef CONFIG_NO_HZ_FULL */
2377} 2375}
2376
2377
2378#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
2379
2380/*
2381 * Define RCU flavor that holds sysidle state. This needs to be the
2382 * most active flavor of RCU.
2383 */
2384#ifdef CONFIG_PREEMPT_RCU
2385static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
2386#else /* #ifdef CONFIG_PREEMPT_RCU */
2387static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
2388#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
2389
2390static int full_sysidle_state; /* Current system-idle state. */
2391#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
2392#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
2393#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */
2394#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */
2395#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */
2396
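Pieced together from the helpers below, the state variable is intended to move like this (summary sketch):

/*
 *	RCU_SYSIDLE_NOT   --(all non-timekeeping CPUs idle)--------> RCU_SYSIDLE_SHORT
 *	RCU_SYSIDLE_SHORT --(still idle after rcu_sysidle_delay())-> RCU_SYSIDLE_LONG
 *	RCU_SYSIDLE_LONG  --(still idle after another delay)-------> RCU_SYSIDLE_FULL
 *	RCU_SYSIDLE_FULL  --(timekeeping CPU sees it via rcu_sys_is_idle())-> RCU_SYSIDLE_FULL_NOTED
 *	any state         --(some CPU found non-idle)--------------> RCU_SYSIDLE_NOT
 */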
2397/*
2398 * Invoked to note exit from irq or task transition to idle. Note that
2399 * usermode execution does -not- count as idle here! After all, we want
2400 * to detect full-system idle states, not RCU quiescent states and grace
2401 * periods. The caller must have disabled interrupts.
2402 */
2403static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2404{
2405 unsigned long j;
2406
2407 /* Adjust nesting, check for fully idle. */
2408 if (irq) {
2409 rdtp->dynticks_idle_nesting--;
2410 WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
2411 if (rdtp->dynticks_idle_nesting != 0)
2412 return; /* Still not fully idle. */
2413 } else {
2414 if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
2415 DYNTICK_TASK_NEST_VALUE) {
2416 rdtp->dynticks_idle_nesting = 0;
2417 } else {
2418 rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
2419 WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
2420 return; /* Still not fully idle. */
2421 }
2422 }
2423
2424 /* Record start of fully idle period. */
2425 j = jiffies;
2426 ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
2427 smp_mb__before_atomic_inc();
2428 atomic_inc(&rdtp->dynticks_idle);
2429 smp_mb__after_atomic_inc();
2430 WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
2431}
2432
2433/*
2434 * Unconditionally force exit from full system-idle state. This is
2435 * invoked when a normal CPU exits idle, but must be called separately
2436 * for the timekeeping CPU (tick_do_timer_cpu). The reason for this
2437 * is that the timekeeping CPU is permitted to take scheduling-clock
2438 * interrupts while the system is in system-idle state, and of course
2439 * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
2440 * interrupt from any other type of interrupt.
2441 */
2442void rcu_sysidle_force_exit(void)
2443{
2444 int oldstate = ACCESS_ONCE(full_sysidle_state);
2445 int newoldstate;
2446
2447 /*
2448 * Each pass through the following loop attempts to exit full
2449 * system-idle state. If contention proves to be a problem,
2450 * a trylock-based contention tree could be used here.
2451 */
2452 while (oldstate > RCU_SYSIDLE_SHORT) {
2453 newoldstate = cmpxchg(&full_sysidle_state,
2454 oldstate, RCU_SYSIDLE_NOT);
2455 if (oldstate == newoldstate &&
2456 oldstate == RCU_SYSIDLE_FULL_NOTED) {
2457 rcu_kick_nohz_cpu(tick_do_timer_cpu);
2458 return; /* We cleared it, done! */
2459 }
2460 oldstate = newoldstate;
2461 }
2462 smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
2463}
2464
2465/*
2466 * Invoked to note entry to irq or task transition from idle. Note that
2467 * usermode execution does -not- count as idle here! The caller must
2468 * have disabled interrupts.
2469 */
2470static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2471{
2472 /* Adjust nesting, check for already non-idle. */
2473 if (irq) {
2474 rdtp->dynticks_idle_nesting++;
2475 WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
2476 if (rdtp->dynticks_idle_nesting != 1)
2477 return; /* Already non-idle. */
2478 } else {
2479 /*
2480 * Allow for irq misnesting. Yes, it really is possible
2481 * to enter an irq handler then never leave it, and maybe
2482 * also vice versa. Handle both possibilities.
2483 */
2484 if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
2485 rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
2486 WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
2487 return; /* Already non-idle. */
2488 } else {
2489 rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
2490 }
2491 }
2492
2493 /* Record end of idle period. */
2494 smp_mb__before_atomic_inc();
2495 atomic_inc(&rdtp->dynticks_idle);
2496 smp_mb__after_atomic_inc();
2497 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
2498
2499 /*
2500 * If we are the timekeeping CPU, we are permitted to be non-idle
2501 * during a system-idle state. This must be the case, because
2502 * the timekeeping CPU has to take scheduling-clock interrupts
2503 * during the time that the system is transitioning to full
2504 * system-idle state. This means that the timekeeping CPU must
2505 * invoke rcu_sysidle_force_exit() directly if it does anything
2506 * more than take a scheduling-clock interrupt.
2507 */
2508 if (smp_processor_id() == tick_do_timer_cpu)
2509 return;
2510
2511 /* Update system-idle state: We are clearly no longer fully idle! */
2512 rcu_sysidle_force_exit();
2513}
2514
2515/*
2516 * Check to see if the current CPU is idle. Note that usermode execution
2517 * does not count as idle. The caller must have disabled interrupts.
2518 */
2519static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2520 unsigned long *maxj)
2521{
2522 int cur;
2523 unsigned long j;
2524 struct rcu_dynticks *rdtp = rdp->dynticks;
2525
2526 /*
2527 * If some other CPU has already reported non-idle, if this is
2528 * not the flavor of RCU that tracks sysidle state, or if this
2529 * is an offline or the timekeeping CPU, nothing to do.
2530 */
2531 if (!*isidle || rdp->rsp != rcu_sysidle_state ||
2532 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
2533 return;
2534 if (rcu_gp_in_progress(rdp->rsp))
2535 WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
2536
2537 /* Pick up current idle and NMI-nesting counter and check. */
2538 cur = atomic_read(&rdtp->dynticks_idle);
2539 if (cur & 0x1) {
2540 *isidle = false; /* We are not idle! */
2541 return;
2542 }
2543 smp_mb(); /* Read counters before timestamps. */
2544
2545 /* Pick up timestamps. */
2546 j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
2547 /* If this CPU entered idle more recently, update maxj timestamp. */
2548 if (ULONG_CMP_LT(*maxj, j))
2549 *maxj = j;
2550}
2551
2552/*
2553 * Is this the flavor of RCU that is handling full-system idle?
2554 */
2555static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2556{
2557 return rsp == rcu_sysidle_state;
2558}
2559
2560/*
2561 * Bind the grace-period kthread for the sysidle flavor of RCU to the
2562 * timekeeping CPU.
2563 */
2564static void rcu_bind_gp_kthread(void)
2565{
2566 int cpu = ACCESS_ONCE(tick_do_timer_cpu);
2567
2568 if (cpu < 0 || cpu >= nr_cpu_ids)
2569 return;
2570 if (raw_smp_processor_id() != cpu)
2571 set_cpus_allowed_ptr(current, cpumask_of(cpu));
2572}
2573
2574/*
2575 * Return a delay in jiffies based on the number of CPUs, rcu_node
2576 * leaf fanout, and jiffies tick rate. The idea is to allow larger
2577 * systems more time to transition to full-idle state in order to
 2578 * avoid the cache thrashing that would otherwise occur on the state variable.
2579 * Really small systems (less than a couple of tens of CPUs) should
2580 * instead use a single global atomically incremented counter, and later
2581 * versions of this will automatically reconfigure themselves accordingly.
2582 */
2583static unsigned long rcu_sysidle_delay(void)
2584{
2585 if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2586 return 0;
2587 return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
2588}
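A worked example of the delay formula with illustrative configuration values (HZ and rcu_fanout_leaf are assumptions here, not taken from the patch):

/*
 * nr_cpu_ids = 4096, HZ = 1000, rcu_fanout_leaf = 16:
 *	DIV_ROUND_UP(4096 * 1000, 16 * 1000) = 256 jiffies
 * nr_cpu_ids = 64, HZ = 250, rcu_fanout_leaf = 16:
 *	DIV_ROUND_UP(64 * 250, 16 * 1000) = 1 jiffy
 *	(a system at or below CONFIG_NO_HZ_FULL_SYSIDLE_SMALL would take the
 *	 early return above and use 0 instead)
 */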
2589
2590/*
2591 * Advance the full-system-idle state. This is invoked when all of
2592 * the non-timekeeping CPUs are idle.
2593 */
2594static void rcu_sysidle(unsigned long j)
2595{
2596 /* Check the current state. */
2597 switch (ACCESS_ONCE(full_sysidle_state)) {
2598 case RCU_SYSIDLE_NOT:
2599
2600 /* First time all are idle, so note a short idle period. */
2601 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
2602 break;
2603
2604 case RCU_SYSIDLE_SHORT:
2605
2606 /*
2607 * Idle for a bit, time to advance to next state?
2608 * cmpxchg failure means race with non-idle, let them win.
2609 */
2610 if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
2611 (void)cmpxchg(&full_sysidle_state,
2612 RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
2613 break;
2614
2615 case RCU_SYSIDLE_LONG:
2616
2617 /*
2618 * Do an additional check pass before advancing to full.
2619 * cmpxchg failure means race with non-idle, let them win.
2620 */
2621 if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
2622 (void)cmpxchg(&full_sysidle_state,
2623 RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
2624 break;
2625
2626 default:
2627 break;
2628 }
2629}
2630
2631/*
2632 * Found a non-idle non-timekeeping CPU, so kick the system-idle state
2633 * back to the beginning.
2634 */
2635static void rcu_sysidle_cancel(void)
2636{
2637 smp_mb();
2638 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
2639}
2640
2641/*
2642 * Update the sysidle state based on the results of a force-quiescent-state
2643 * scan of the CPUs' dyntick-idle state.
2644 */
2645static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
2646 unsigned long maxj, bool gpkt)
2647{
2648 if (rsp != rcu_sysidle_state)
2649 return; /* Wrong flavor, ignore. */
2650 if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2651 return; /* Running state machine from timekeeping CPU. */
2652 if (isidle)
2653 rcu_sysidle(maxj); /* More idle! */
2654 else
2655 rcu_sysidle_cancel(); /* Idle is over. */
2656}
2657
2658/*
2659 * Wrapper for rcu_sysidle_report() when called from the grace-period
2660 * kthread's context.
2661 */
2662static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2663 unsigned long maxj)
2664{
2665 rcu_sysidle_report(rsp, isidle, maxj, true);
2666}
2667
2668/* Callback and function for forcing an RCU grace period. */
2669struct rcu_sysidle_head {
2670 struct rcu_head rh;
2671 int inuse;
2672};
2673
2674static void rcu_sysidle_cb(struct rcu_head *rhp)
2675{
2676 struct rcu_sysidle_head *rshp;
2677
2678 /*
2679 * The following memory barrier is needed to replace the
2680 * memory barriers that would normally be in the memory
2681 * allocator.
2682 */
2683 smp_mb(); /* grace period precedes setting inuse. */
2684
2685 rshp = container_of(rhp, struct rcu_sysidle_head, rh);
2686 ACCESS_ONCE(rshp->inuse) = 0;
2687}
2688
2689/*
2690 * Check to see if the system is fully idle, other than the timekeeping CPU.
2691 * The caller must have disabled interrupts.
2692 */
2693bool rcu_sys_is_idle(void)
2694{
2695 static struct rcu_sysidle_head rsh;
2696 int rss = ACCESS_ONCE(full_sysidle_state);
2697
2698 if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
2699 return false;
2700
2701 /* Handle small-system case by doing a full scan of CPUs. */
2702 if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
2703 int oldrss = rss - 1;
2704
2705 /*
2706 * One pass to advance to each state up to _FULL.
2707 * Give up if any pass fails to advance the state.
2708 */
2709 while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
2710 int cpu;
2711 bool isidle = true;
2712 unsigned long maxj = jiffies - ULONG_MAX / 4;
2713 struct rcu_data *rdp;
2714
2715 /* Scan all the CPUs looking for nonidle CPUs. */
2716 for_each_possible_cpu(cpu) {
2717 rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu);
2718 rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
2719 if (!isidle)
2720 break;
2721 }
2722 rcu_sysidle_report(rcu_sysidle_state,
2723 isidle, maxj, false);
2724 oldrss = rss;
2725 rss = ACCESS_ONCE(full_sysidle_state);
2726 }
2727 }
2728
2729 /* If this is the first observation of an idle period, record it. */
2730 if (rss == RCU_SYSIDLE_FULL) {
2731 rss = cmpxchg(&full_sysidle_state,
2732 RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
2733 return rss == RCU_SYSIDLE_FULL;
2734 }
2735
2736 smp_mb(); /* ensure rss load happens before later caller actions. */
2737
2738 /* If already fully idle, tell the caller (in case of races). */
2739 if (rss == RCU_SYSIDLE_FULL_NOTED)
2740 return true;
2741
2742 /*
2743 * If we aren't there yet, and a grace period is not in flight,
2744 * initiate a grace period. Either way, tell the caller that
2745 * we are not there yet. We use an xchg() rather than an assignment
2746 * to make up for the memory barriers that would otherwise be
2747 * provided by the memory allocator.
2748 */
2749 if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
2750 !rcu_gp_in_progress(rcu_sysidle_state) &&
2751 !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
2752 call_rcu(&rsh.rh, rcu_sysidle_cb);
2753 return false;
2754}
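How the timekeeping side might consume this interface; the caller below is hypothetical, since the actual tick-side hookup is not part of this diff:

/* Runs on tick_do_timer_cpu with interrupts disabled. */
static void tick_side_check(void)
{
	if (rcu_sys_is_idle()) {
		/* All non-timekeeping CPUs have been idle long enough;
		 * the timekeeping CPU may now also stop its tick. */
	} else {
		/* Keep ticking; rcu_sys_is_idle() may have started a grace
		 * period to re-sample the other CPUs. */
	}
}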
2755
2756/*
2757 * Initialize dynticks sysidle state for CPUs coming online.
2758 */
2759static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2760{
2761 rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
2762}
2763
2764#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2765
2766static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2767{
2768}
2769
2770static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2771{
2772}
2773
2774static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2775 unsigned long *maxj)
2776{
2777}
2778
2779static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2780{
2781 return false;
2782}
2783
2784static void rcu_bind_gp_kthread(void)
2785{
2786}
2787
2788static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2789 unsigned long maxj)
2790{
2791}
2792
2793static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2794{
2795}
2796
2797#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
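
The rcu_sys_is_idle()/rcu_sysidle_cb() pair above keeps at most one forcing callback in flight: a statically allocated head is claimed with a cheap test plus xchg(), and the callback releases it once the grace period has completed. Below is a minimal userspace restatement of that pattern in C11 atomics; the oneshot_* names and the use of <stdatomic.h> are illustrative stand-ins for the kernel primitives, not kernel APIs.

/*
 * Userspace sketch (not kernel code) of the "at most one callback in
 * flight" idiom: a static request head plus an inuse flag claimed with
 * an atomic exchange and released by the completion handler.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct oneshot_head {
	atomic_int inuse;		/* 0: free, 1: a request is in flight */
};

static struct oneshot_head req;		/* static, like rsh above */

/* Completion handler: the analogue of rcu_sysidle_cb(). */
static void oneshot_done(struct oneshot_head *h)
{
	/* Release ordering stands in for the smp_mb() in the callback. */
	atomic_store_explicit(&h->inuse, 0, memory_order_release);
}

/* Try to start one deferred request; false if one is already pending. */
static bool oneshot_submit(struct oneshot_head *h)
{
	if (atomic_load_explicit(&h->inuse, memory_order_relaxed))
		return false;		/* cheap early test, like !rsh.inuse */
	if (atomic_exchange_explicit(&h->inuse, 1, memory_order_acq_rel))
		return false;		/* lost the race; someone else owns it */
	/* ...queue the real work here; the worker calls oneshot_done()... */
	return true;
}

int main(void)
{
	printf("first submit:  %d\n", oneshot_submit(&req));	/* 1 */
	printf("second submit: %d\n", oneshot_submit(&req));	/* 0, still in flight */
	oneshot_done(&req);
	printf("after done:    %d\n", oneshot_submit(&req));	/* 1 again */
	return 0;
}
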
diff --git a/kernel/relay.c b/kernel/relay.c
index b91488ba2e5a..5001c9887db1 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -516,7 +516,7 @@ static void setup_callbacks(struct rchan *chan,
516 * 516 *
517 * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) 517 * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
518 */ 518 */
519static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, 519static int relay_hotcpu_callback(struct notifier_block *nb,
520 unsigned long action, 520 unsigned long action,
521 void *hcpu) 521 void *hcpu)
522{ 522{
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d8eb4525e76..725aa067ad63 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -933,6 +933,8 @@ static int effective_prio(struct task_struct *p)
933/** 933/**
934 * task_curr - is this task currently executing on a CPU? 934 * task_curr - is this task currently executing on a CPU?
935 * @p: the task in question. 935 * @p: the task in question.
936 *
937 * Return: 1 if the task is currently executing. 0 otherwise.
936 */ 938 */
937inline int task_curr(const struct task_struct *p) 939inline int task_curr(const struct task_struct *p)
938{ 940{
@@ -1482,7 +1484,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
1482 * the simpler "current->state = TASK_RUNNING" to mark yourself 1484 * the simpler "current->state = TASK_RUNNING" to mark yourself
1483 * runnable without the overhead of this. 1485 * runnable without the overhead of this.
1484 * 1486 *
1485 * Returns %true if @p was woken up, %false if it was already running 1487 * Return: %true if @p was woken up, %false if it was already running
1486 * or @state didn't match @p's state. 1488 * or @state didn't match @p's state.
1487 */ 1489 */
1488static int 1490static int
@@ -1491,7 +1493,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1491 unsigned long flags; 1493 unsigned long flags;
1492 int cpu, success = 0; 1494 int cpu, success = 0;
1493 1495
1494 smp_wmb(); 1496 /*
1497 * If we are going to wake up a thread waiting for CONDITION we
1498 * need to ensure that CONDITION=1 done by the caller can not be
1499 * reordered with p->state check below. This pairs with mb() in
1500 * set_current_state() the waiting thread does.
1501 */
1502 smp_mb__before_spinlock();
1495 raw_spin_lock_irqsave(&p->pi_lock, flags); 1503 raw_spin_lock_irqsave(&p->pi_lock, flags);
1496 if (!(p->state & state)) 1504 if (!(p->state & state))
1497 goto out; 1505 goto out;
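
The smp_mb__before_spinlock() added above pairs with the full barrier in set_current_state() on the sleeping side; together they form the classic store-buffering pattern the new comment describes. A minimal userspace analogue in C11 atomics follows, with the actual sleeping and waking stubbed out and every name invented for illustration.

/*
 * Waiter: publish "I am going to sleep", then re-check the condition.
 * Waker:  publish the condition, then check whether anyone is asleep.
 * With both full fences it is impossible for the waiter to miss the
 * condition *and* the waker to miss the sleeper, so no wakeup is lost.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool condition;		/* CONDITION in the comment above */
static atomic_bool sleeping;		/* stands in for p->state */

static void waiter(void)
{
	atomic_store_explicit(&sleeping, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* barrier in set_current_state() */
	if (!atomic_load_explicit(&condition, memory_order_relaxed)) {
		/* really go to sleep here */
	}
}

static void waker(void)
{
	atomic_store_explicit(&condition, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb__before_spinlock() + lock */
	if (atomic_load_explicit(&sleeping, memory_order_relaxed)) {
		/* issue the wakeup here */
	}
}

int main(void)
{
	waker();
	waiter();
	return 0;
}

Dropping either fence reintroduces the lost-wakeup race; a store-only barrier such as the smp_wmb() being removed here does not order the condition store against the later p->state load, which is why the full barrier is needed.
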
@@ -1577,8 +1585,9 @@ out:
1577 * @p: The process to be woken up. 1585 * @p: The process to be woken up.
1578 * 1586 *
1579 * Attempt to wake up the nominated process and move it to the set of runnable 1587 * Attempt to wake up the nominated process and move it to the set of runnable
1580 * processes. Returns 1 if the process was woken up, 0 if it was already 1588 * processes.
1581 * running. 1589 *
1590 * Return: 1 if the process was woken up, 0 if it was already running.
1582 * 1591 *
1583 * It may be assumed that this function implies a write memory barrier before 1592 * It may be assumed that this function implies a write memory barrier before
1584 * changing the task state if and only if any tasks are woken up. 1593 * changing the task state if and only if any tasks are woken up.
@@ -2191,6 +2200,8 @@ void scheduler_tick(void)
2191 * This makes sure that uptime, CFS vruntime, load 2200 * This makes sure that uptime, CFS vruntime, load
2192 * balancing, etc... continue to move forward, even 2201 * balancing, etc... continue to move forward, even
2193 * with a very low granularity. 2202 * with a very low granularity.
2203 *
2204 * Return: Maximum deferment in nanoseconds.
2194 */ 2205 */
2195u64 scheduler_tick_max_deferment(void) 2206u64 scheduler_tick_max_deferment(void)
2196{ 2207{
@@ -2394,6 +2405,12 @@ need_resched:
2394 if (sched_feat(HRTICK)) 2405 if (sched_feat(HRTICK))
2395 hrtick_clear(rq); 2406 hrtick_clear(rq);
2396 2407
2408 /*
2409 * Make sure that signal_pending_state()->signal_pending() below
2410 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
2411 * done by the caller to avoid the race with signal_wake_up().
2412 */
2413 smp_mb__before_spinlock();
2397 raw_spin_lock_irq(&rq->lock); 2414 raw_spin_lock_irq(&rq->lock);
2398 2415
2399 switch_count = &prev->nivcsw; 2416 switch_count = &prev->nivcsw;
@@ -2510,13 +2527,11 @@ void __sched schedule_preempt_disabled(void)
2510 */ 2527 */
2511asmlinkage void __sched notrace preempt_schedule(void) 2528asmlinkage void __sched notrace preempt_schedule(void)
2512{ 2529{
2513 struct thread_info *ti = current_thread_info();
2514
2515 /* 2530 /*
2516 * If there is a non-zero preempt_count or interrupts are disabled, 2531 * If there is a non-zero preempt_count or interrupts are disabled,
2517 * we do not want to preempt the current task. Just return.. 2532 * we do not want to preempt the current task. Just return..
2518 */ 2533 */
2519 if (likely(ti->preempt_count || irqs_disabled())) 2534 if (likely(!preemptible()))
2520 return; 2535 return;
2521 2536
2522 do { 2537 do {
@@ -2660,7 +2675,7 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
2660 if (unlikely(!q)) 2675 if (unlikely(!q))
2661 return; 2676 return;
2662 2677
2663 if (unlikely(!nr_exclusive)) 2678 if (unlikely(nr_exclusive != 1))
2664 wake_flags = 0; 2679 wake_flags = 0;
2665 2680
2666 spin_lock_irqsave(&q->lock, flags); 2681 spin_lock_irqsave(&q->lock, flags);
@@ -2796,8 +2811,8 @@ EXPORT_SYMBOL(wait_for_completion);
2796 * specified timeout to expire. The timeout is in jiffies. It is not 2811 * specified timeout to expire. The timeout is in jiffies. It is not
2797 * interruptible. 2812 * interruptible.
2798 * 2813 *
2799 * The return value is 0 if timed out, and positive (at least 1, or number of 2814 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
2800 * jiffies left till timeout) if completed. 2815 * till timeout) if completed.
2801 */ 2816 */
2802unsigned long __sched 2817unsigned long __sched
2803wait_for_completion_timeout(struct completion *x, unsigned long timeout) 2818wait_for_completion_timeout(struct completion *x, unsigned long timeout)
@@ -2829,8 +2844,8 @@ EXPORT_SYMBOL(wait_for_completion_io);
2829 * specified timeout to expire. The timeout is in jiffies. It is not 2844 * specified timeout to expire. The timeout is in jiffies. It is not
2830 * interruptible. The caller is accounted as waiting for IO. 2845 * interruptible. The caller is accounted as waiting for IO.
2831 * 2846 *
2832 * The return value is 0 if timed out, and positive (at least 1, or number of 2847 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
2833 * jiffies left till timeout) if completed. 2848 * till timeout) if completed.
2834 */ 2849 */
2835unsigned long __sched 2850unsigned long __sched
2836wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) 2851wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
@@ -2846,7 +2861,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout);
2846 * This waits for completion of a specific task to be signaled. It is 2861 * This waits for completion of a specific task to be signaled. It is
2847 * interruptible. 2862 * interruptible.
2848 * 2863 *
2849 * The return value is -ERESTARTSYS if interrupted, 0 if completed. 2864 * Return: -ERESTARTSYS if interrupted, 0 if completed.
2850 */ 2865 */
2851int __sched wait_for_completion_interruptible(struct completion *x) 2866int __sched wait_for_completion_interruptible(struct completion *x)
2852{ 2867{
@@ -2865,8 +2880,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
2865 * This waits for either a completion of a specific task to be signaled or for a 2880 * This waits for either a completion of a specific task to be signaled or for a
2866 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 2881 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
2867 * 2882 *
2868 * The return value is -ERESTARTSYS if interrupted, 0 if timed out, 2883 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
2869 * positive (at least 1, or number of jiffies left till timeout) if completed. 2884 * or number of jiffies left till timeout) if completed.
2870 */ 2885 */
2871long __sched 2886long __sched
2872wait_for_completion_interruptible_timeout(struct completion *x, 2887wait_for_completion_interruptible_timeout(struct completion *x,
@@ -2883,7 +2898,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
2883 * This waits to be signaled for completion of a specific task. It can be 2898 * This waits to be signaled for completion of a specific task. It can be
2884 * interrupted by a kill signal. 2899 * interrupted by a kill signal.
2885 * 2900 *
2886 * The return value is -ERESTARTSYS if interrupted, 0 if completed. 2901 * Return: -ERESTARTSYS if interrupted, 0 if completed.
2887 */ 2902 */
2888int __sched wait_for_completion_killable(struct completion *x) 2903int __sched wait_for_completion_killable(struct completion *x)
2889{ 2904{
@@ -2903,8 +2918,8 @@ EXPORT_SYMBOL(wait_for_completion_killable);
2903 * signaled or for a specified timeout to expire. It can be 2918 * signaled or for a specified timeout to expire. It can be
2904 * interrupted by a kill signal. The timeout is in jiffies. 2919 * interrupted by a kill signal. The timeout is in jiffies.
2905 * 2920 *
2906 * The return value is -ERESTARTSYS if interrupted, 0 if timed out, 2921 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
2907 * positive (at least 1, or number of jiffies left till timeout) if completed. 2922 * or number of jiffies left till timeout) if completed.
2908 */ 2923 */
2909long __sched 2924long __sched
2910wait_for_completion_killable_timeout(struct completion *x, 2925wait_for_completion_killable_timeout(struct completion *x,
@@ -2918,7 +2933,7 @@ EXPORT_SYMBOL(wait_for_completion_killable_timeout);
2918 * try_wait_for_completion - try to decrement a completion without blocking 2933 * try_wait_for_completion - try to decrement a completion without blocking
2919 * @x: completion structure 2934 * @x: completion structure
2920 * 2935 *
2921 * Returns: 0 if a decrement cannot be done without blocking 2936 * Return: 0 if a decrement cannot be done without blocking
2922 * 1 if a decrement succeeded. 2937 * 1 if a decrement succeeded.
2923 * 2938 *
2924 * If a completion is being used as a counting completion, 2939 * If a completion is being used as a counting completion,
@@ -2945,7 +2960,7 @@ EXPORT_SYMBOL(try_wait_for_completion);
2945 * completion_done - Test to see if a completion has any waiters 2960 * completion_done - Test to see if a completion has any waiters
2946 * @x: completion structure 2961 * @x: completion structure
2947 * 2962 *
2948 * Returns: 0 if there are waiters (wait_for_completion() in progress) 2963 * Return: 0 if there are waiters (wait_for_completion() in progress)
2949 * 1 if there are no waiters. 2964 * 1 if there are no waiters.
2950 * 2965 *
2951 */ 2966 */
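
Most of the comment-only hunks in this file convert free-form "Returns ..." text into the kernel-doc "Return:" section so documentation tooling can extract it. For reference, a minimal kernel-doc-shaped comment on a hypothetical function; struct widget and frob_widget() are invented for the example and are not kernel interfaces.

#include <stdio.h>

struct widget { int frobbed; };

/**
 * frob_widget - flip a widget into the frobnicated state
 * @w: the widget to operate on
 *
 * Return: 0 on success, -1 if @w was already frobnicated.
 */
static int frob_widget(struct widget *w)
{
	if (w->frobbed)
		return -1;
	w->frobbed = 1;
	return 0;
}

int main(void)
{
	struct widget w = { 0 };
	int first = frob_widget(&w);
	int second = frob_widget(&w);

	printf("first=%d second=%d\n", first, second);	/* 0, then -1 */
	return 0;
}
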
@@ -3182,7 +3197,7 @@ SYSCALL_DEFINE1(nice, int, increment)
3182 * task_prio - return the priority value of a given task. 3197 * task_prio - return the priority value of a given task.
3183 * @p: the task in question. 3198 * @p: the task in question.
3184 * 3199 *
3185 * This is the priority value as seen by users in /proc. 3200 * Return: The priority value as seen by users in /proc.
3186 * RT tasks are offset by -200. Normal tasks are centered 3201 * RT tasks are offset by -200. Normal tasks are centered
3187 * around 0, value goes from -16 to +15. 3202 * around 0, value goes from -16 to +15.
3188 */ 3203 */
@@ -3194,6 +3209,8 @@ int task_prio(const struct task_struct *p)
3194/** 3209/**
3195 * task_nice - return the nice value of a given task. 3210 * task_nice - return the nice value of a given task.
3196 * @p: the task in question. 3211 * @p: the task in question.
3212 *
3213 * Return: The nice value [ -20 ... 0 ... 19 ].
3197 */ 3214 */
3198int task_nice(const struct task_struct *p) 3215int task_nice(const struct task_struct *p)
3199{ 3216{
@@ -3204,6 +3221,8 @@ EXPORT_SYMBOL(task_nice);
3204/** 3221/**
3205 * idle_cpu - is a given cpu idle currently? 3222 * idle_cpu - is a given cpu idle currently?
3206 * @cpu: the processor in question. 3223 * @cpu: the processor in question.
3224 *
3225 * Return: 1 if the CPU is currently idle. 0 otherwise.
3207 */ 3226 */
3208int idle_cpu(int cpu) 3227int idle_cpu(int cpu)
3209{ 3228{
@@ -3226,6 +3245,8 @@ int idle_cpu(int cpu)
3226/** 3245/**
3227 * idle_task - return the idle task for a given cpu. 3246 * idle_task - return the idle task for a given cpu.
3228 * @cpu: the processor in question. 3247 * @cpu: the processor in question.
3248 *
3249 * Return: The idle task for the cpu @cpu.
3229 */ 3250 */
3230struct task_struct *idle_task(int cpu) 3251struct task_struct *idle_task(int cpu)
3231{ 3252{
@@ -3235,6 +3256,8 @@ struct task_struct *idle_task(int cpu)
3235/** 3256/**
3236 * find_process_by_pid - find a process with a matching PID value. 3257 * find_process_by_pid - find a process with a matching PID value.
3237 * @pid: the pid in question. 3258 * @pid: the pid in question.
3259 *
3260 * The task of @pid, if found. %NULL otherwise.
3238 */ 3261 */
3239static struct task_struct *find_process_by_pid(pid_t pid) 3262static struct task_struct *find_process_by_pid(pid_t pid)
3240{ 3263{
@@ -3432,6 +3455,8 @@ recheck:
3432 * @policy: new policy. 3455 * @policy: new policy.
3433 * @param: structure containing the new RT priority. 3456 * @param: structure containing the new RT priority.
3434 * 3457 *
3458 * Return: 0 on success. An error code otherwise.
3459 *
3435 * NOTE that the task may be already dead. 3460 * NOTE that the task may be already dead.
3436 */ 3461 */
3437int sched_setscheduler(struct task_struct *p, int policy, 3462int sched_setscheduler(struct task_struct *p, int policy,
@@ -3451,6 +3476,8 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
3451 * current context has permission. For example, this is needed in 3476 * current context has permission. For example, this is needed in
3452 * stop_machine(): we create temporary high priority worker threads, 3477 * stop_machine(): we create temporary high priority worker threads,
3453 * but our caller might not have that capability. 3478 * but our caller might not have that capability.
3479 *
3480 * Return: 0 on success. An error code otherwise.
3454 */ 3481 */
3455int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3482int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3456 const struct sched_param *param) 3483 const struct sched_param *param)
@@ -3485,6 +3512,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3485 * @pid: the pid in question. 3512 * @pid: the pid in question.
3486 * @policy: new policy. 3513 * @policy: new policy.
3487 * @param: structure containing the new RT priority. 3514 * @param: structure containing the new RT priority.
3515 *
3516 * Return: 0 on success. An error code otherwise.
3488 */ 3517 */
3489SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 3518SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3490 struct sched_param __user *, param) 3519 struct sched_param __user *, param)
@@ -3500,6 +3529,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3500 * sys_sched_setparam - set/change the RT priority of a thread 3529 * sys_sched_setparam - set/change the RT priority of a thread
3501 * @pid: the pid in question. 3530 * @pid: the pid in question.
3502 * @param: structure containing the new RT priority. 3531 * @param: structure containing the new RT priority.
3532 *
3533 * Return: 0 on success. An error code otherwise.
3503 */ 3534 */
3504SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3535SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3505{ 3536{
@@ -3509,6 +3540,9 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3509/** 3540/**
3510 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3541 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3511 * @pid: the pid in question. 3542 * @pid: the pid in question.
3543 *
3544 * Return: On success, the policy of the thread. Otherwise, a negative error
3545 * code.
3512 */ 3546 */
3513SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 3547SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3514{ 3548{
@@ -3535,6 +3569,9 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3535 * sys_sched_getparam - get the RT priority of a thread 3569 * sys_sched_getparam - get the RT priority of a thread
3536 * @pid: the pid in question. 3570 * @pid: the pid in question.
3537 * @param: structure containing the RT priority. 3571 * @param: structure containing the RT priority.
3572 *
3573 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
3574 * code.
3538 */ 3575 */
3539SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 3576SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3540{ 3577{
@@ -3659,6 +3696,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3659 * @pid: pid of the process 3696 * @pid: pid of the process
3660 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3697 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3661 * @user_mask_ptr: user-space pointer to the new cpu mask 3698 * @user_mask_ptr: user-space pointer to the new cpu mask
3699 *
3700 * Return: 0 on success. An error code otherwise.
3662 */ 3701 */
3663SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 3702SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
3664 unsigned long __user *, user_mask_ptr) 3703 unsigned long __user *, user_mask_ptr)
@@ -3710,6 +3749,8 @@ out_unlock:
3710 * @pid: pid of the process 3749 * @pid: pid of the process
3711 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3750 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3712 * @user_mask_ptr: user-space pointer to hold the current cpu mask 3751 * @user_mask_ptr: user-space pointer to hold the current cpu mask
3752 *
3753 * Return: 0 on success. An error code otherwise.
3713 */ 3754 */
3714SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 3755SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
3715 unsigned long __user *, user_mask_ptr) 3756 unsigned long __user *, user_mask_ptr)
@@ -3744,6 +3785,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
3744 * 3785 *
3745 * This function yields the current CPU to other tasks. If there are no 3786 * This function yields the current CPU to other tasks. If there are no
3746 * other threads running on this CPU then this function will return. 3787 * other threads running on this CPU then this function will return.
3788 *
3789 * Return: 0.
3747 */ 3790 */
3748SYSCALL_DEFINE0(sched_yield) 3791SYSCALL_DEFINE0(sched_yield)
3749{ 3792{
@@ -3869,7 +3912,7 @@ EXPORT_SYMBOL(yield);
3869 * It's the caller's job to ensure that the target task struct 3912 * It's the caller's job to ensure that the target task struct
3870 * can't go away on us before we can do any checks. 3913 * can't go away on us before we can do any checks.
3871 * 3914 *
3872 * Returns: 3915 * Return:
3873 * true (>0) if we indeed boosted the target task. 3916 * true (>0) if we indeed boosted the target task.
3874 * false (0) if we failed to boost the target. 3917 * false (0) if we failed to boost the target.
3875 * -ESRCH if there's no task to yield to. 3918 * -ESRCH if there's no task to yield to.
@@ -3972,8 +4015,9 @@ long __sched io_schedule_timeout(long timeout)
3972 * sys_sched_get_priority_max - return maximum RT priority. 4015 * sys_sched_get_priority_max - return maximum RT priority.
3973 * @policy: scheduling class. 4016 * @policy: scheduling class.
3974 * 4017 *
3975 * this syscall returns the maximum rt_priority that can be used 4018 * Return: On success, this syscall returns the maximum
3976 * by a given scheduling class. 4019 * rt_priority that can be used by a given scheduling class.
4020 * On failure, a negative error code is returned.
3977 */ 4021 */
3978SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 4022SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
3979{ 4023{
@@ -3997,8 +4041,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
3997 * sys_sched_get_priority_min - return minimum RT priority. 4041 * sys_sched_get_priority_min - return minimum RT priority.
3998 * @policy: scheduling class. 4042 * @policy: scheduling class.
3999 * 4043 *
4000 * this syscall returns the minimum rt_priority that can be used 4044 * Return: On success, this syscall returns the minimum
4001 * by a given scheduling class. 4045 * rt_priority that can be used by a given scheduling class.
4046 * On failure, a negative error code is returned.
4002 */ 4047 */
4003SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 4048SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4004{ 4049{
@@ -4024,6 +4069,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4024 * 4069 *
4025 * this syscall writes the default timeslice value of a given process 4070 * this syscall writes the default timeslice value of a given process
4026 * into the user-space timespec buffer. A value of '0' means infinity. 4071 * into the user-space timespec buffer. A value of '0' means infinity.
4072 *
4073 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
4074 * an error code.
4027 */ 4075 */
4028SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 4076SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4029 struct timespec __user *, interval) 4077 struct timespec __user *, interval)
@@ -4133,7 +4181,7 @@ void show_state_filter(unsigned long state_filter)
4133 debug_show_all_locks(); 4181 debug_show_all_locks();
4134} 4182}
4135 4183
4136void __cpuinit init_idle_bootup_task(struct task_struct *idle) 4184void init_idle_bootup_task(struct task_struct *idle)
4137{ 4185{
4138 idle->sched_class = &idle_sched_class; 4186 idle->sched_class = &idle_sched_class;
4139} 4187}
@@ -4146,7 +4194,7 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4146 * NOTE: this function does not set the idle thread's NEED_RESCHED 4194 * NOTE: this function does not set the idle thread's NEED_RESCHED
4147 * flag, to make booting more robust. 4195 * flag, to make booting more robust.
4148 */ 4196 */
4149void __cpuinit init_idle(struct task_struct *idle, int cpu) 4197void init_idle(struct task_struct *idle, int cpu)
4150{ 4198{
4151 struct rq *rq = cpu_rq(cpu); 4199 struct rq *rq = cpu_rq(cpu);
4152 unsigned long flags; 4200 unsigned long flags;
@@ -4630,7 +4678,7 @@ static void set_rq_offline(struct rq *rq)
4630 * migration_call - callback that gets triggered when a CPU is added. 4678 * migration_call - callback that gets triggered when a CPU is added.
4631 * Here we can start up the necessary migration thread for the new CPU. 4679 * Here we can start up the necessary migration thread for the new CPU.
4632 */ 4680 */
4633static int __cpuinit 4681static int
4634migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 4682migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
4635{ 4683{
4636 int cpu = (long)hcpu; 4684 int cpu = (long)hcpu;
@@ -4684,12 +4732,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
4684 * happens before everything else. This has to be lower priority than 4732 * happens before everything else. This has to be lower priority than
4685 * the notifier in the perf_event subsystem, though. 4733 * the notifier in the perf_event subsystem, though.
4686 */ 4734 */
4687static struct notifier_block __cpuinitdata migration_notifier = { 4735static struct notifier_block migration_notifier = {
4688 .notifier_call = migration_call, 4736 .notifier_call = migration_call,
4689 .priority = CPU_PRI_MIGRATION, 4737 .priority = CPU_PRI_MIGRATION,
4690}; 4738};
4691 4739
4692static int __cpuinit sched_cpu_active(struct notifier_block *nfb, 4740static int sched_cpu_active(struct notifier_block *nfb,
4693 unsigned long action, void *hcpu) 4741 unsigned long action, void *hcpu)
4694{ 4742{
4695 switch (action & ~CPU_TASKS_FROZEN) { 4743 switch (action & ~CPU_TASKS_FROZEN) {
@@ -4702,7 +4750,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
4702 } 4750 }
4703} 4751}
4704 4752
4705static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, 4753static int sched_cpu_inactive(struct notifier_block *nfb,
4706 unsigned long action, void *hcpu) 4754 unsigned long action, void *hcpu)
4707{ 4755{
4708 switch (action & ~CPU_TASKS_FROZEN) { 4756 switch (action & ~CPU_TASKS_FROZEN) {
@@ -4914,7 +4962,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
4914 SD_BALANCE_FORK | 4962 SD_BALANCE_FORK |
4915 SD_BALANCE_EXEC | 4963 SD_BALANCE_EXEC |
4916 SD_SHARE_CPUPOWER | 4964 SD_SHARE_CPUPOWER |
4917 SD_SHARE_PKG_RESOURCES); 4965 SD_SHARE_PKG_RESOURCES |
4966 SD_PREFER_SIBLING);
4918 if (nr_node_ids == 1) 4967 if (nr_node_ids == 1)
4919 pflags &= ~SD_SERIALIZE; 4968 pflags &= ~SD_SERIALIZE;
4920 } 4969 }
@@ -5083,18 +5132,23 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5083 * two cpus are in the same cache domain, see cpus_share_cache(). 5132 * two cpus are in the same cache domain, see cpus_share_cache().
5084 */ 5133 */
5085DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5134DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5135DEFINE_PER_CPU(int, sd_llc_size);
5086DEFINE_PER_CPU(int, sd_llc_id); 5136DEFINE_PER_CPU(int, sd_llc_id);
5087 5137
5088static void update_top_cache_domain(int cpu) 5138static void update_top_cache_domain(int cpu)
5089{ 5139{
5090 struct sched_domain *sd; 5140 struct sched_domain *sd;
5091 int id = cpu; 5141 int id = cpu;
5142 int size = 1;
5092 5143
5093 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 5144 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5094 if (sd) 5145 if (sd) {
5095 id = cpumask_first(sched_domain_span(sd)); 5146 id = cpumask_first(sched_domain_span(sd));
5147 size = cpumask_weight(sched_domain_span(sd));
5148 }
5096 5149
5097 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5150 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5151 per_cpu(sd_llc_size, cpu) = size;
5098 per_cpu(sd_llc_id, cpu) = id; 5152 per_cpu(sd_llc_id, cpu) = id;
5099} 5153}
5100 5154
@@ -5118,6 +5172,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5118 tmp->parent = parent->parent; 5172 tmp->parent = parent->parent;
5119 if (parent->parent) 5173 if (parent->parent)
5120 parent->parent->child = tmp; 5174 parent->parent->child = tmp;
5175 /*
5176 * Transfer SD_PREFER_SIBLING down in case of a
5177 * degenerate parent; the spans match for this
5178 * so the property transfers.
5179 */
5180 if (parent->flags & SD_PREFER_SIBLING)
5181 tmp->flags |= SD_PREFER_SIBLING;
5121 destroy_sched_domain(parent, cpu); 5182 destroy_sched_domain(parent, cpu);
5122 } else 5183 } else
5123 tmp = tmp->parent; 5184 tmp = tmp->parent;
@@ -6184,8 +6245,9 @@ match1:
6184 ; 6245 ;
6185 } 6246 }
6186 6247
6248 n = ndoms_cur;
6187 if (doms_new == NULL) { 6249 if (doms_new == NULL) {
6188 ndoms_cur = 0; 6250 n = 0;
6189 doms_new = &fallback_doms; 6251 doms_new = &fallback_doms;
6190 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 6252 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6191 WARN_ON_ONCE(dattr_new); 6253 WARN_ON_ONCE(dattr_new);
@@ -6193,7 +6255,7 @@ match1:
6193 6255
6194 /* Build new domains */ 6256 /* Build new domains */
6195 for (i = 0; i < ndoms_new; i++) { 6257 for (i = 0; i < ndoms_new; i++) {
6196 for (j = 0; j < ndoms_cur && !new_topology; j++) { 6258 for (j = 0; j < n && !new_topology; j++) {
6197 if (cpumask_equal(doms_new[i], doms_cur[j]) 6259 if (cpumask_equal(doms_new[i], doms_cur[j])
6198 && dattrs_equal(dattr_new, i, dattr_cur, j)) 6260 && dattrs_equal(dattr_new, i, dattr_cur, j))
6199 goto match2; 6261 goto match2;
@@ -6632,6 +6694,8 @@ void normalize_rt_tasks(void)
6632 * @cpu: the processor in question. 6694 * @cpu: the processor in question.
6633 * 6695 *
6634 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6696 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6697 *
6698 * Return: The current task for @cpu.
6635 */ 6699 */
6636struct task_struct *curr_task(int cpu) 6700struct task_struct *curr_task(int cpu)
6637{ 6701{
@@ -6763,7 +6827,7 @@ void sched_move_task(struct task_struct *tsk)
6763 if (unlikely(running)) 6827 if (unlikely(running))
6764 tsk->sched_class->put_prev_task(rq, tsk); 6828 tsk->sched_class->put_prev_task(rq, tsk);
6765 6829
6766 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, 6830 tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
6767 lockdep_is_held(&tsk->sighand->siglock)), 6831 lockdep_is_held(&tsk->sighand->siglock)),
6768 struct task_group, css); 6832 struct task_group, css);
6769 tg = autogroup_task_group(tsk, tg); 6833 tg = autogroup_task_group(tsk, tg);
@@ -7085,23 +7149,22 @@ int sched_rt_handler(struct ctl_table *table, int write,
7085 7149
7086#ifdef CONFIG_CGROUP_SCHED 7150#ifdef CONFIG_CGROUP_SCHED
7087 7151
7088/* return corresponding task_group object of a cgroup */ 7152static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
7089static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7090{ 7153{
7091 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 7154 return css ? container_of(css, struct task_group, css) : NULL;
7092 struct task_group, css);
7093} 7155}
7094 7156
7095static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) 7157static struct cgroup_subsys_state *
7158cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7096{ 7159{
7097 struct task_group *tg, *parent; 7160 struct task_group *parent = css_tg(parent_css);
7161 struct task_group *tg;
7098 7162
7099 if (!cgrp->parent) { 7163 if (!parent) {
7100 /* This is early initialization for the top cgroup */ 7164 /* This is early initialization for the top cgroup */
7101 return &root_task_group.css; 7165 return &root_task_group.css;
7102 } 7166 }
7103 7167
7104 parent = cgroup_tg(cgrp->parent);
7105 tg = sched_create_group(parent); 7168 tg = sched_create_group(parent);
7106 if (IS_ERR(tg)) 7169 if (IS_ERR(tg))
7107 return ERR_PTR(-ENOMEM); 7170 return ERR_PTR(-ENOMEM);
@@ -7109,41 +7172,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7109 return &tg->css; 7172 return &tg->css;
7110} 7173}
7111 7174
7112static int cpu_cgroup_css_online(struct cgroup *cgrp) 7175static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7113{ 7176{
7114 struct task_group *tg = cgroup_tg(cgrp); 7177 struct task_group *tg = css_tg(css);
7115 struct task_group *parent; 7178 struct task_group *parent = css_tg(css_parent(css));
7116 7179
7117 if (!cgrp->parent) 7180 if (parent)
7118 return 0; 7181 sched_online_group(tg, parent);
7119
7120 parent = cgroup_tg(cgrp->parent);
7121 sched_online_group(tg, parent);
7122 return 0; 7182 return 0;
7123} 7183}
7124 7184
7125static void cpu_cgroup_css_free(struct cgroup *cgrp) 7185static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
7126{ 7186{
7127 struct task_group *tg = cgroup_tg(cgrp); 7187 struct task_group *tg = css_tg(css);
7128 7188
7129 sched_destroy_group(tg); 7189 sched_destroy_group(tg);
7130} 7190}
7131 7191
7132static void cpu_cgroup_css_offline(struct cgroup *cgrp) 7192static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7133{ 7193{
7134 struct task_group *tg = cgroup_tg(cgrp); 7194 struct task_group *tg = css_tg(css);
7135 7195
7136 sched_offline_group(tg); 7196 sched_offline_group(tg);
7137} 7197}
7138 7198
7139static int cpu_cgroup_can_attach(struct cgroup *cgrp, 7199static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7140 struct cgroup_taskset *tset) 7200 struct cgroup_taskset *tset)
7141{ 7201{
7142 struct task_struct *task; 7202 struct task_struct *task;
7143 7203
7144 cgroup_taskset_for_each(task, cgrp, tset) { 7204 cgroup_taskset_for_each(task, css, tset) {
7145#ifdef CONFIG_RT_GROUP_SCHED 7205#ifdef CONFIG_RT_GROUP_SCHED
7146 if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) 7206 if (!sched_rt_can_attach(css_tg(css), task))
7147 return -EINVAL; 7207 return -EINVAL;
7148#else 7208#else
7149 /* We don't support RT-tasks being in separate groups */ 7209 /* We don't support RT-tasks being in separate groups */
@@ -7154,18 +7214,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7154 return 0; 7214 return 0;
7155} 7215}
7156 7216
7157static void cpu_cgroup_attach(struct cgroup *cgrp, 7217static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
7158 struct cgroup_taskset *tset) 7218 struct cgroup_taskset *tset)
7159{ 7219{
7160 struct task_struct *task; 7220 struct task_struct *task;
7161 7221
7162 cgroup_taskset_for_each(task, cgrp, tset) 7222 cgroup_taskset_for_each(task, css, tset)
7163 sched_move_task(task); 7223 sched_move_task(task);
7164} 7224}
7165 7225
7166static void 7226static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
7167cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 7227 struct cgroup_subsys_state *old_css,
7168 struct task_struct *task) 7228 struct task_struct *task)
7169{ 7229{
7170 /* 7230 /*
7171 * cgroup_exit() is called in the copy_process() failure path. 7231 * cgroup_exit() is called in the copy_process() failure path.
@@ -7179,15 +7239,16 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7179} 7239}
7180 7240
7181#ifdef CONFIG_FAIR_GROUP_SCHED 7241#ifdef CONFIG_FAIR_GROUP_SCHED
7182static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7242static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
7183 u64 shareval) 7243 struct cftype *cftype, u64 shareval)
7184{ 7244{
7185 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); 7245 return sched_group_set_shares(css_tg(css), scale_load(shareval));
7186} 7246}
7187 7247
7188static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 7248static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
7249 struct cftype *cft)
7189{ 7250{
7190 struct task_group *tg = cgroup_tg(cgrp); 7251 struct task_group *tg = css_tg(css);
7191 7252
7192 return (u64) scale_load_down(tg->shares); 7253 return (u64) scale_load_down(tg->shares);
7193} 7254}
@@ -7309,26 +7370,28 @@ long tg_get_cfs_period(struct task_group *tg)
7309 return cfs_period_us; 7370 return cfs_period_us;
7310} 7371}
7311 7372
7312static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) 7373static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
7374 struct cftype *cft)
7313{ 7375{
7314 return tg_get_cfs_quota(cgroup_tg(cgrp)); 7376 return tg_get_cfs_quota(css_tg(css));
7315} 7377}
7316 7378
7317static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, 7379static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
7318 s64 cfs_quota_us) 7380 struct cftype *cftype, s64 cfs_quota_us)
7319{ 7381{
7320 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); 7382 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
7321} 7383}
7322 7384
7323static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) 7385static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
7386 struct cftype *cft)
7324{ 7387{
7325 return tg_get_cfs_period(cgroup_tg(cgrp)); 7388 return tg_get_cfs_period(css_tg(css));
7326} 7389}
7327 7390
7328static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7391static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
7329 u64 cfs_period_us) 7392 struct cftype *cftype, u64 cfs_period_us)
7330{ 7393{
7331 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); 7394 return tg_set_cfs_period(css_tg(css), cfs_period_us);
7332} 7395}
7333 7396
7334struct cfs_schedulable_data { 7397struct cfs_schedulable_data {
@@ -7409,10 +7472,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7409 return ret; 7472 return ret;
7410} 7473}
7411 7474
7412static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, 7475static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
7413 struct cgroup_map_cb *cb) 7476 struct cgroup_map_cb *cb)
7414{ 7477{
7415 struct task_group *tg = cgroup_tg(cgrp); 7478 struct task_group *tg = css_tg(css);
7416 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7479 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7417 7480
7418 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7481 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
@@ -7425,26 +7488,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7425#endif /* CONFIG_FAIR_GROUP_SCHED */ 7488#endif /* CONFIG_FAIR_GROUP_SCHED */
7426 7489
7427#ifdef CONFIG_RT_GROUP_SCHED 7490#ifdef CONFIG_RT_GROUP_SCHED
7428static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 7491static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
7429 s64 val) 7492 struct cftype *cft, s64 val)
7430{ 7493{
7431 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 7494 return sched_group_set_rt_runtime(css_tg(css), val);
7432} 7495}
7433 7496
7434static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 7497static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
7498 struct cftype *cft)
7435{ 7499{
7436 return sched_group_rt_runtime(cgroup_tg(cgrp)); 7500 return sched_group_rt_runtime(css_tg(css));
7437} 7501}
7438 7502
7439static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 7503static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
7440 u64 rt_period_us) 7504 struct cftype *cftype, u64 rt_period_us)
7441{ 7505{
7442 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 7506 return sched_group_set_rt_period(css_tg(css), rt_period_us);
7443} 7507}
7444 7508
7445static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 7509static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
7510 struct cftype *cft)
7446{ 7511{
7447 return sched_group_rt_period(cgroup_tg(cgrp)); 7512 return sched_group_rt_period(css_tg(css));
7448} 7513}
7449#endif /* CONFIG_RT_GROUP_SCHED */ 7514#endif /* CONFIG_RT_GROUP_SCHED */
7450 7515
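
The sched/core.c conversion above is largely mechanical: every cgroup file handler now takes a struct cgroup_subsys_state directly, and css_tg() maps it back to the embedding task_group with container_of(), returning NULL when asked about the root's nonexistent parent. A small userspace sketch of that embedding pattern; the *_sim types and css_tg_sim() are invented for illustration and are not the cgroup API.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct css_sim {			/* stand-in for struct cgroup_subsys_state */
	struct css_sim *parent;
};

struct task_group_sim {			/* stand-in for struct task_group */
	int shares;
	struct css_sim css;		/* generic state embedded in the controller state */
};

static struct task_group_sim *css_tg_sim(struct css_sim *css)
{
	return css ? container_of(css, struct task_group_sim, css) : NULL;
}

int main(void)
{
	struct task_group_sim root  = { .shares = 1024, .css = { .parent = NULL } };
	struct task_group_sim child = { .shares = 512,  .css = { .parent = &root.css } };

	/* From the child's css, reach both its own and its parent's group. */
	printf("child shares:  %d\n", css_tg_sim(&child.css)->shares);
	printf("parent shares: %d\n", css_tg_sim(child.css.parent)->shares);
	printf("root's parent: %p\n", (void *)css_tg_sim(root.css.parent));	/* NULL */
	return 0;
}
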
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dbb7e2cd95eb..f64722ff0299 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -33,30 +33,20 @@ struct cpuacct {
33 struct kernel_cpustat __percpu *cpustat; 33 struct kernel_cpustat __percpu *cpustat;
34}; 34};
35 35
36/* return cpu accounting group corresponding to this container */ 36static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
37static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
38{ 37{
39 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 38 return css ? container_of(css, struct cpuacct, css) : NULL;
40 struct cpuacct, css);
41} 39}
42 40
43/* return cpu accounting group to which this task belongs */ 41/* return cpu accounting group to which this task belongs */
44static inline struct cpuacct *task_ca(struct task_struct *tsk) 42static inline struct cpuacct *task_ca(struct task_struct *tsk)
45{ 43{
46 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 44 return css_ca(task_css(tsk, cpuacct_subsys_id));
47 struct cpuacct, css);
48}
49
50static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
51{
52 return cgroup_ca(ca->css.cgroup->parent);
53} 45}
54 46
55static inline struct cpuacct *parent_ca(struct cpuacct *ca) 47static inline struct cpuacct *parent_ca(struct cpuacct *ca)
56{ 48{
57 if (!ca->css.cgroup->parent) 49 return css_ca(css_parent(&ca->css));
58 return NULL;
59 return cgroup_ca(ca->css.cgroup->parent);
60} 50}
61 51
62static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); 52static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
@@ -66,11 +56,12 @@ static struct cpuacct root_cpuacct = {
66}; 56};
67 57
68/* create a new cpu accounting group */ 58/* create a new cpu accounting group */
69static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) 59static struct cgroup_subsys_state *
60cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
70{ 61{
71 struct cpuacct *ca; 62 struct cpuacct *ca;
72 63
73 if (!cgrp->parent) 64 if (!parent_css)
74 return &root_cpuacct.css; 65 return &root_cpuacct.css;
75 66
76 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 67 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
@@ -96,9 +87,9 @@ out:
96} 87}
97 88
98/* destroy an existing cpu accounting group */ 89/* destroy an existing cpu accounting group */
99static void cpuacct_css_free(struct cgroup *cgrp) 90static void cpuacct_css_free(struct cgroup_subsys_state *css)
100{ 91{
101 struct cpuacct *ca = cgroup_ca(cgrp); 92 struct cpuacct *ca = css_ca(css);
102 93
103 free_percpu(ca->cpustat); 94 free_percpu(ca->cpustat);
104 free_percpu(ca->cpuusage); 95 free_percpu(ca->cpuusage);
@@ -141,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
141} 132}
142 133
143/* return total cpu usage (in nanoseconds) of a group */ 134/* return total cpu usage (in nanoseconds) of a group */
144static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 135static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
145{ 136{
146 struct cpuacct *ca = cgroup_ca(cgrp); 137 struct cpuacct *ca = css_ca(css);
147 u64 totalcpuusage = 0; 138 u64 totalcpuusage = 0;
148 int i; 139 int i;
149 140
@@ -153,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
153 return totalcpuusage; 144 return totalcpuusage;
154} 145}
155 146
156static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 147static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
157 u64 reset) 148 u64 reset)
158{ 149{
159 struct cpuacct *ca = cgroup_ca(cgrp); 150 struct cpuacct *ca = css_ca(css);
160 int err = 0; 151 int err = 0;
161 int i; 152 int i;
162 153
@@ -172,10 +163,10 @@ out:
172 return err; 163 return err;
173} 164}
174 165
175static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 166static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css,
176 struct seq_file *m) 167 struct cftype *cft, struct seq_file *m)
177{ 168{
178 struct cpuacct *ca = cgroup_ca(cgroup); 169 struct cpuacct *ca = css_ca(css);
179 u64 percpu; 170 u64 percpu;
180 int i; 171 int i;
181 172
@@ -192,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = {
192 [CPUACCT_STAT_SYSTEM] = "system", 183 [CPUACCT_STAT_SYSTEM] = "system",
193}; 184};
194 185
195static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 186static int cpuacct_stats_show(struct cgroup_subsys_state *css,
196 struct cgroup_map_cb *cb) 187 struct cftype *cft, struct cgroup_map_cb *cb)
197{ 188{
198 struct cpuacct *ca = cgroup_ca(cgrp); 189 struct cpuacct *ca = css_ca(css);
199 int cpu; 190 int cpu;
200 s64 val = 0; 191 s64 val = 0;
201 192
@@ -281,7 +272,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
281 while (ca != &root_cpuacct) { 272 while (ca != &root_cpuacct) {
282 kcpustat = this_cpu_ptr(ca->cpustat); 273 kcpustat = this_cpu_ptr(ca->cpustat);
283 kcpustat->cpustat[index] += val; 274 kcpustat->cpustat[index] += val;
284 ca = __parent_ca(ca); 275 ca = parent_ca(ca);
285 } 276 }
286 rcu_read_unlock(); 277 rcu_read_unlock();
287} 278}
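
cpuacct_account_field() above now walks plain parent_ca() pointers, charging the same delta to every group from the task's group up to, but not including, the root, which is accounted separately in the real code. A toy restatement of that loop with invented types:

#include <stdio.h>

struct acct_group {
	const char *name;
	unsigned long long stat;
	struct acct_group *parent;	/* NULL only above the root */
};

static struct acct_group root_grp = { "root", 0, NULL };

static void account_field(struct acct_group *grp, unsigned long long val)
{
	/* Stop at the root; the root's stats come from a different path. */
	while (grp != &root_grp) {
		grp->stat += val;
		grp = grp->parent;
	}
}

int main(void)
{
	struct acct_group a = { "a",   0, &root_grp };
	struct acct_group b = { "a/b", 0, &a };

	account_field(&b, 1000);
	printf("%s=%llu %s=%llu %s=%llu\n",
	       b.name, b.stat, a.name, a.stat, root_grp.name, root_grp.stat);
	return 0;
}
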
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 1095e878a46f..8b836b376d91 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -62,7 +62,7 @@ static int convert_prio(int prio)
62 * any discrepancies created by racing against the uncertainty of the current 62 * any discrepancies created by racing against the uncertainty of the current
63 * priority configuration. 63 * priority configuration.
64 * 64 *
65 * Returns: (int)bool - CPUs were found 65 * Return: (int)bool - CPUs were found
66 */ 66 */
67int cpupri_find(struct cpupri *cp, struct task_struct *p, 67int cpupri_find(struct cpupri *cp, struct task_struct *p,
68 struct cpumask *lowest_mask) 68 struct cpumask *lowest_mask)
@@ -203,7 +203,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
203 * cpupri_init - initialize the cpupri structure 203 * cpupri_init - initialize the cpupri structure
204 * @cp: The cpupri context 204 * @cp: The cpupri context
205 * 205 *
206 * Returns: -ENOMEM if memory fails. 206 * Return: -ENOMEM on memory allocation failure.
207 */ 207 */
208int cpupri_init(struct cpupri *cp) 208int cpupri_init(struct cpupri *cp)
209{ 209{
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a7959e05a9d5..ace34f95e200 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -121,7 +121,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
121 * is the only cgroup, then nothing else should be necessary. 121 * is the only cgroup, then nothing else should be necessary.
122 * 122 *
123 */ 123 */
124 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; 124 __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
125 125
126 cpuacct_account_field(p, index, tmp); 126 cpuacct_account_field(p, index, tmp);
127} 127}
@@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
378#ifdef CONFIG_VIRT_CPU_ACCOUNTING 378#ifdef CONFIG_VIRT_CPU_ACCOUNTING
379 379
380#ifndef __ARCH_HAS_VTIME_TASK_SWITCH 380#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
381void vtime_task_switch(struct task_struct *prev) 381void vtime_common_task_switch(struct task_struct *prev)
382{ 382{
383 if (!vtime_accounting_enabled())
384 return;
385
386 if (is_idle_task(prev)) 383 if (is_idle_task(prev))
387 vtime_account_idle(prev); 384 vtime_account_idle(prev);
388 else 385 else
@@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev)
404 * vtime_account(). 401 * vtime_account().
405 */ 402 */
406#ifndef __ARCH_HAS_VTIME_ACCOUNT 403#ifndef __ARCH_HAS_VTIME_ACCOUNT
407void vtime_account_irq_enter(struct task_struct *tsk) 404void vtime_common_account_irq_enter(struct task_struct *tsk)
408{ 405{
409 if (!vtime_accounting_enabled())
410 return;
411
412 if (!in_interrupt()) { 406 if (!in_interrupt()) {
413 /* 407 /*
414 * If we interrupted user, context_tracking_in_user() 408 * If we interrupted user, context_tracking_in_user()
@@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
428 } 422 }
429 vtime_account_system(tsk); 423 vtime_account_system(tsk);
430} 424}
431EXPORT_SYMBOL_GPL(vtime_account_irq_enter); 425EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
432#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 426#endif /* __ARCH_HAS_VTIME_ACCOUNT */
433#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ 427#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
434 428
@@ -559,12 +553,6 @@ static void cputime_adjust(struct task_cputime *curr,
559{ 553{
560 cputime_t rtime, stime, utime, total; 554 cputime_t rtime, stime, utime, total;
561 555
562 if (vtime_accounting_enabled()) {
563 *ut = curr->utime;
564 *st = curr->stime;
565 return;
566 }
567
568 stime = curr->stime; 556 stime = curr->stime;
569 total = stime + curr->utime; 557 total = stime + curr->utime;
570 558
@@ -664,23 +652,17 @@ static void __vtime_account_system(struct task_struct *tsk)
664 652
665void vtime_account_system(struct task_struct *tsk) 653void vtime_account_system(struct task_struct *tsk)
666{ 654{
667 if (!vtime_accounting_enabled())
668 return;
669
670 write_seqlock(&tsk->vtime_seqlock); 655 write_seqlock(&tsk->vtime_seqlock);
671 __vtime_account_system(tsk); 656 __vtime_account_system(tsk);
672 write_sequnlock(&tsk->vtime_seqlock); 657 write_sequnlock(&tsk->vtime_seqlock);
673} 658}
674 659
675void vtime_account_irq_exit(struct task_struct *tsk) 660void vtime_gen_account_irq_exit(struct task_struct *tsk)
676{ 661{
677 if (!vtime_accounting_enabled())
678 return;
679
680 write_seqlock(&tsk->vtime_seqlock); 662 write_seqlock(&tsk->vtime_seqlock);
663 __vtime_account_system(tsk);
681 if (context_tracking_in_user()) 664 if (context_tracking_in_user())
682 tsk->vtime_snap_whence = VTIME_USER; 665 tsk->vtime_snap_whence = VTIME_USER;
683 __vtime_account_system(tsk);
684 write_sequnlock(&tsk->vtime_seqlock); 666 write_sequnlock(&tsk->vtime_seqlock);
685} 667}
686 668
@@ -688,12 +670,8 @@ void vtime_account_user(struct task_struct *tsk)
688{ 670{
689 cputime_t delta_cpu; 671 cputime_t delta_cpu;
690 672
691 if (!vtime_accounting_enabled())
692 return;
693
694 delta_cpu = get_vtime_delta(tsk);
695
696 write_seqlock(&tsk->vtime_seqlock); 673 write_seqlock(&tsk->vtime_seqlock);
674 delta_cpu = get_vtime_delta(tsk);
697 tsk->vtime_snap_whence = VTIME_SYS; 675 tsk->vtime_snap_whence = VTIME_SYS;
698 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); 676 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
699 write_sequnlock(&tsk->vtime_seqlock); 677 write_sequnlock(&tsk->vtime_seqlock);
@@ -701,22 +679,27 @@ void vtime_account_user(struct task_struct *tsk)
701 679
702void vtime_user_enter(struct task_struct *tsk) 680void vtime_user_enter(struct task_struct *tsk)
703{ 681{
704 if (!vtime_accounting_enabled())
705 return;
706
707 write_seqlock(&tsk->vtime_seqlock); 682 write_seqlock(&tsk->vtime_seqlock);
708 tsk->vtime_snap_whence = VTIME_USER;
709 __vtime_account_system(tsk); 683 __vtime_account_system(tsk);
684 tsk->vtime_snap_whence = VTIME_USER;
710 write_sequnlock(&tsk->vtime_seqlock); 685 write_sequnlock(&tsk->vtime_seqlock);
711} 686}
712 687
713void vtime_guest_enter(struct task_struct *tsk) 688void vtime_guest_enter(struct task_struct *tsk)
714{ 689{
690 /*
691 * The flags must be updated under the lock with
692 * the vtime_snap flush and update.
 693 * That enforces the right ordering and update-sequence
 694 * synchronization against the reader (task_gtime()),
 695 * which can thus safely catch up with a tickless delta.
696 */
715 write_seqlock(&tsk->vtime_seqlock); 697 write_seqlock(&tsk->vtime_seqlock);
716 __vtime_account_system(tsk); 698 __vtime_account_system(tsk);
717 current->flags |= PF_VCPU; 699 current->flags |= PF_VCPU;
718 write_sequnlock(&tsk->vtime_seqlock); 700 write_sequnlock(&tsk->vtime_seqlock);
719} 701}
702EXPORT_SYMBOL_GPL(vtime_guest_enter);
720 703
721void vtime_guest_exit(struct task_struct *tsk) 704void vtime_guest_exit(struct task_struct *tsk)
722{ 705{
@@ -725,6 +708,7 @@ void vtime_guest_exit(struct task_struct *tsk)
725 current->flags &= ~PF_VCPU; 708 current->flags &= ~PF_VCPU;
726 write_sequnlock(&tsk->vtime_seqlock); 709 write_sequnlock(&tsk->vtime_seqlock);
727} 710}
711EXPORT_SYMBOL_GPL(vtime_guest_exit);
728 712
729void vtime_account_idle(struct task_struct *tsk) 713void vtime_account_idle(struct task_struct *tsk)
730{ 714{
@@ -733,11 +717,6 @@ void vtime_account_idle(struct task_struct *tsk)
733 account_idle_time(delta_cpu); 717 account_idle_time(delta_cpu);
734} 718}
735 719
736bool vtime_accounting_enabled(void)
737{
738 return context_tracking_active();
739}
740
741void arch_vtime_task_switch(struct task_struct *prev) 720void arch_vtime_task_switch(struct task_struct *prev)
742{ 721{
743 write_seqlock(&prev->vtime_seqlock); 722 write_seqlock(&prev->vtime_seqlock);
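
The vtime code above leans on tsk->vtime_seqlock so that a reader such as task_gtime() can take a consistent snapshot while the tick is off: the writer makes the sequence odd for the duration of an update, and the reader retries until it sees the same even value before and after. A stripped-down, single-writer sketch in C11 follows; it is illustrative only, and a real concurrent reader would also need race-free access to the payload itself.

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;			/* even: quiescent, odd: write in progress */
static unsigned long long vtime_snap;	/* the protected snapshot */

static void write_begin(void)
{
	atomic_fetch_add_explicit(&seq, 1, memory_order_acq_rel);	/* -> odd */
}

static void write_end(void)
{
	atomic_fetch_add_explicit(&seq, 1, memory_order_release);	/* -> even */
}

static unsigned long long read_snap(void)
{
	unsigned int s1, s2;
	unsigned long long v;

	do {
		s1 = atomic_load_explicit(&seq, memory_order_acquire);
		v = vtime_snap;
		atomic_thread_fence(memory_order_acquire);
		s2 = atomic_load_explicit(&seq, memory_order_relaxed);
	} while ((s1 & 1) || s1 != s2);		/* writer active, or we raced with one */
	return v;
}

int main(void)
{
	write_begin();
	vtime_snap += 1234;		/* e.g. flush a tickless delta */
	write_end();
	printf("snap=%llu\n", read_snap());
	return 0;
}
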
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f77f9c527449..7f0a5e6cdae0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -851,7 +851,7 @@ void task_numa_fault(int node, int pages, bool migrated)
851{ 851{
852 struct task_struct *p = current; 852 struct task_struct *p = current;
853 853
854 if (!sched_feat_numa(NUMA)) 854 if (!numabalancing_enabled)
855 return; 855 return;
856 856
857 /* FIXME: Allocate task-specific structure for placement policy here */ 857 /* FIXME: Allocate task-specific structure for placement policy here */
@@ -2032,6 +2032,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
2032 */ 2032 */
2033 update_entity_load_avg(curr, 1); 2033 update_entity_load_avg(curr, 1);
2034 update_cfs_rq_blocked_load(cfs_rq, 1); 2034 update_cfs_rq_blocked_load(cfs_rq, 1);
2035 update_cfs_shares(cfs_rq);
2035 2036
2036#ifdef CONFIG_SCHED_HRTICK 2037#ifdef CONFIG_SCHED_HRTICK
2037 /* 2038 /*
@@ -3017,6 +3018,23 @@ static unsigned long cpu_avg_load_per_task(int cpu)
3017 return 0; 3018 return 0;
3018} 3019}
3019 3020
3021static void record_wakee(struct task_struct *p)
3022{
3023 /*
3024	 * Roughly decay (wipe) the flip count to keep this cheap; we don't
3025	 * worry about the exact boundary, since a genuinely active task
3026	 * won't notice the loss.
3027 */
3028 if (jiffies > current->wakee_flip_decay_ts + HZ) {
3029 current->wakee_flips = 0;
3030 current->wakee_flip_decay_ts = jiffies;
3031 }
3032
3033 if (current->last_wakee != p) {
3034 current->last_wakee = p;
3035 current->wakee_flips++;
3036 }
3037}
3020 3038
3021static void task_waking_fair(struct task_struct *p) 3039static void task_waking_fair(struct task_struct *p)
3022{ 3040{
@@ -3037,6 +3055,7 @@ static void task_waking_fair(struct task_struct *p)
3037#endif 3055#endif
3038 3056
3039 se->vruntime -= min_vruntime; 3057 se->vruntime -= min_vruntime;
3058 record_wakee(p);
3040} 3059}
3041 3060
3042#ifdef CONFIG_FAIR_GROUP_SCHED 3061#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3155,6 +3174,28 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
3155 3174
3156#endif 3175#endif
3157 3176
3177static int wake_wide(struct task_struct *p)
3178{
3179 int factor = this_cpu_read(sd_llc_size);
3180
3181 /*
3182	 * A high flip count means the waker has many wakees or switches
3183	 * between them rapidly. Using the LLC size as the factor scales the
3184	 * threshold, so a bigger node tolerates more flips before pulling stops.
3185 */
3186 if (p->wakee_flips > factor) {
3187 /*
3188	 * The wakee is somewhat hot and needs a certain amount of CPU;
3189	 * if the waker is far hotter still, prefer to leave the wakee
3190	 * where it is.
3191 */
3192 if (current->wakee_flips > (factor * p->wakee_flips))
3193 return 1;
3194 }
3195
3196 return 0;
3197}
3198
3158static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 3199static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3159{ 3200{
3160 s64 this_load, load; 3201 s64 this_load, load;
@@ -3164,6 +3205,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3164 unsigned long weight; 3205 unsigned long weight;
3165 int balanced; 3206 int balanced;
3166 3207
3208 /*
3209 * If we wake multiple tasks be careful to not bounce
3210 * ourselves around too much.
3211 */
3212 if (wake_wide(p))
3213 return 0;
3214
3167 idx = sd->wake_idx; 3215 idx = sd->wake_idx;
3168 this_cpu = smp_processor_id(); 3216 this_cpu = smp_processor_id();
3169 prev_cpu = task_cpu(p); 3217 prev_cpu = task_cpu(p);
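
record_wakee() and wake_wide() above implement a cheap spreading heuristic: count how often the waker switches wakeup partners, decay that count roughly once a second, and skip the affine (pull-to-waker) path when both waker and wakee flip partners faster than the LLC-sized factor allows. A standalone restatement with seconds instead of jiffies; the *_sim names are invented for illustration.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct task_sim {
	unsigned int wakee_flips;	/* how often this task switches wakeup partners */
	time_t wakee_flip_decay_ts;	/* last time the count was decayed */
	struct task_sim *last_wakee;
};

/* Called on the waker each time it wakes @wakee (cf. record_wakee()). */
static void record_wakee_sim(struct task_sim *waker, struct task_sim *wakee, time_t now)
{
	if (now > waker->wakee_flip_decay_ts + 1) {	/* ~1 second decay window */
		waker->wakee_flips = 0;
		waker->wakee_flip_decay_ts = now;
	}
	if (waker->last_wakee != wakee) {
		waker->last_wakee = wakee;
		waker->wakee_flips++;
	}
}

/* Wide wakeup: both sides flip partners faster than the LLC size (cf. wake_wide()). */
static bool wake_wide_sim(struct task_sim *waker, struct task_sim *wakee, int llc_size)
{
	return wakee->wakee_flips > (unsigned int)llc_size &&
	       waker->wakee_flips > (unsigned int)llc_size * wakee->wakee_flips;
}

int main(void)
{
	struct task_sim waker = { 0 }, a = { 0 }, b = { 0 };
	time_t now = time(NULL);
	int i;

	for (i = 0; i < 100; i++) {		/* the waker ping-pongs between a and b */
		record_wakee_sim(&waker, &a, now);
		record_wakee_sim(&waker, &b, now);
	}
	a.wakee_flips = 8;			/* pretend the wakee switches partners too */
	printf("wake_wide=%d\n", wake_wide_sim(&waker, &a, 4));	/* 1: don't pull affine */
	return 0;
}
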
@@ -4171,47 +4219,48 @@ static void update_blocked_averages(int cpu)
4171} 4219}
4172 4220
4173/* 4221/*
4174 * Compute the cpu's hierarchical load factor for each task group. 4222 * Compute the hierarchical load factor for cfs_rq and all its ancestors.
4175 * This needs to be done in a top-down fashion because the load of a child 4223 * This needs to be done in a top-down fashion because the load of a child
4176 * group is a fraction of its parent's load. 4224 * group is a fraction of its parent's load.
4177 */ 4225 */
4178static int tg_load_down(struct task_group *tg, void *data) 4226static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
4179{
4180 unsigned long load;
4181 long cpu = (long)data;
4182
4183 if (!tg->parent) {
4184 load = cpu_rq(cpu)->avg.load_avg_contrib;
4185 } else {
4186 load = tg->parent->cfs_rq[cpu]->h_load;
4187 load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
4188 tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
4189 }
4190
4191 tg->cfs_rq[cpu]->h_load = load;
4192
4193 return 0;
4194}
4195
4196static void update_h_load(long cpu)
4197{ 4227{
4198 struct rq *rq = cpu_rq(cpu); 4228 struct rq *rq = rq_of(cfs_rq);
4229 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
4199 unsigned long now = jiffies; 4230 unsigned long now = jiffies;
4231 unsigned long load;
4200 4232
4201 if (rq->h_load_throttle == now) 4233 if (cfs_rq->last_h_load_update == now)
4202 return; 4234 return;
4203 4235
4204 rq->h_load_throttle = now; 4236 cfs_rq->h_load_next = NULL;
4237 for_each_sched_entity(se) {
4238 cfs_rq = cfs_rq_of(se);
4239 cfs_rq->h_load_next = se;
4240 if (cfs_rq->last_h_load_update == now)
4241 break;
4242 }
4205 4243
4206 rcu_read_lock(); 4244 if (!se) {
4207 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 4245 cfs_rq->h_load = rq->avg.load_avg_contrib;
4208 rcu_read_unlock(); 4246 cfs_rq->last_h_load_update = now;
4247 }
4248
4249 while ((se = cfs_rq->h_load_next) != NULL) {
4250 load = cfs_rq->h_load;
4251 load = div64_ul(load * se->avg.load_avg_contrib,
4252 cfs_rq->runnable_load_avg + 1);
4253 cfs_rq = group_cfs_rq(se);
4254 cfs_rq->h_load = load;
4255 cfs_rq->last_h_load_update = now;
4256 }
4209} 4257}
4210 4258
4211static unsigned long task_h_load(struct task_struct *p) 4259static unsigned long task_h_load(struct task_struct *p)
4212{ 4260{
4213 struct cfs_rq *cfs_rq = task_cfs_rq(p); 4261 struct cfs_rq *cfs_rq = task_cfs_rq(p);
4214 4262
4263 update_cfs_rq_h_load(cfs_rq);
4215 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, 4264 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
4216 cfs_rq->runnable_load_avg + 1); 4265 cfs_rq->runnable_load_avg + 1);
4217} 4266}
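The replacement for tg_load_down()/update_h_load() above computes the hierarchical load lazily and top-down: each level scales the parent's h_load by this group's share of the parent's runnable load, and task_h_load() applies one more scaling for the task itself. A rough numeric sketch of that propagation, with made-up contributions:

#include <stdio.h>
#include <stdint.h>

/*
 * Minimal sketch of the top-down h_load propagation used by
 * update_cfs_rq_h_load()/task_h_load(): every level multiplies the parent's
 * hierarchical load by this group's share of the parent's runnable load.
 * All numbers below are invented for illustration.
 */
int main(void)
{
	uint64_t h_load = 2048;            /* root cfs_rq: rq->avg.load_avg_contrib */

	/* (group se contrib, parent runnable_load_avg) for two nested groups */
	uint64_t levels[][2] = { {1024, 4096}, {512, 1024} };

	for (int i = 0; i < 2; i++) {
		h_load = h_load * levels[i][0] / (levels[i][1] + 1);
		printf("level %d h_load = %llu\n", i + 1,
		       (unsigned long long)h_load);
	}

	/* task_h_load() then scales once more by the task's own contribution. */
	uint64_t task_contrib = 256, leaf_runnable = 512;
	printf("task_h_load = %llu\n",
	       (unsigned long long)(task_contrib * h_load / (leaf_runnable + 1)));
	return 0;
}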
@@ -4220,10 +4269,6 @@ static inline void update_blocked_averages(int cpu)
4220{ 4269{
4221} 4270}
4222 4271
4223static inline void update_h_load(long cpu)
4224{
4225}
4226
4227static unsigned long task_h_load(struct task_struct *p) 4272static unsigned long task_h_load(struct task_struct *p)
4228{ 4273{
4229 return p->se.avg.load_avg_contrib; 4274 return p->se.avg.load_avg_contrib;
@@ -4232,54 +4277,62 @@ static unsigned long task_h_load(struct task_struct *p)
4232 4277
4233/********** Helpers for find_busiest_group ************************/ 4278/********** Helpers for find_busiest_group ************************/
4234/* 4279/*
4235 * sd_lb_stats - Structure to store the statistics of a sched_domain
4236 * during load balancing.
4237 */
4238struct sd_lb_stats {
4239 struct sched_group *busiest; /* Busiest group in this sd */
4240 struct sched_group *this; /* Local group in this sd */
4241 unsigned long total_load; /* Total load of all groups in sd */
4242 unsigned long total_pwr; /* Total power of all groups in sd */
4243 unsigned long avg_load; /* Average load across all groups in sd */
4244
4245 /** Statistics of this group */
4246 unsigned long this_load;
4247 unsigned long this_load_per_task;
4248 unsigned long this_nr_running;
4249 unsigned long this_has_capacity;
4250 unsigned int this_idle_cpus;
4251
4252 /* Statistics of the busiest group */
4253 unsigned int busiest_idle_cpus;
4254 unsigned long max_load;
4255 unsigned long busiest_load_per_task;
4256 unsigned long busiest_nr_running;
4257 unsigned long busiest_group_capacity;
4258 unsigned long busiest_has_capacity;
4259 unsigned int busiest_group_weight;
4260
4261 int group_imb; /* Is there imbalance in this sd */
4262};
4263
4264/*
4265 * sg_lb_stats - stats of a sched_group required for load_balancing 4280 * sg_lb_stats - stats of a sched_group required for load_balancing
4266 */ 4281 */
4267struct sg_lb_stats { 4282struct sg_lb_stats {
4268 unsigned long avg_load; /*Avg load across the CPUs of the group */ 4283 unsigned long avg_load; /*Avg load across the CPUs of the group */
4269 unsigned long group_load; /* Total load over the CPUs of the group */ 4284 unsigned long group_load; /* Total load over the CPUs of the group */
4270 unsigned long sum_nr_running; /* Nr tasks running in the group */
4271 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 4285 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
4272 unsigned long group_capacity; 4286 unsigned long load_per_task;
4273 unsigned long idle_cpus; 4287 unsigned long group_power;
4274 unsigned long group_weight; 4288 unsigned int sum_nr_running; /* Nr tasks running in the group */
4289 unsigned int group_capacity;
4290 unsigned int idle_cpus;
4291 unsigned int group_weight;
4275 int group_imb; /* Is there an imbalance in the group ? */ 4292 int group_imb; /* Is there an imbalance in the group ? */
4276 int group_has_capacity; /* Is there extra capacity in the group? */ 4293 int group_has_capacity; /* Is there extra capacity in the group? */
4277}; 4294};
4278 4295
4296/*
4297 * sd_lb_stats - Structure to store the statistics of a sched_domain
4298 * during load balancing.
4299 */
4300struct sd_lb_stats {
4301 struct sched_group *busiest; /* Busiest group in this sd */
4302 struct sched_group *local; /* Local group in this sd */
4303 unsigned long total_load; /* Total load of all groups in sd */
4304 unsigned long total_pwr; /* Total power of all groups in sd */
4305 unsigned long avg_load; /* Average load across all groups in sd */
4306
4307 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
4308 struct sg_lb_stats local_stat; /* Statistics of the local group */
4309};
4310
4311static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
4312{
4313 /*
4314 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
4315 * local_stat because update_sg_lb_stats() does a full clear/assignment.
4316 * We must however clear busiest_stat::avg_load because
4317 * update_sd_pick_busiest() reads this before assignment.
4318 */
4319 *sds = (struct sd_lb_stats){
4320 .busiest = NULL,
4321 .local = NULL,
4322 .total_load = 0UL,
4323 .total_pwr = 0UL,
4324 .busiest_stat = {
4325 .avg_load = 0UL,
4326 },
4327 };
4328}
4329
4279/** 4330/**
4280 * get_sd_load_idx - Obtain the load index for a given sched domain. 4331 * get_sd_load_idx - Obtain the load index for a given sched domain.
4281 * @sd: The sched_domain whose load_idx is to be obtained. 4332 * @sd: The sched_domain whose load_idx is to be obtained.
4282 * @idle: The Idle status of the CPU for whose sd load_icx is obtained. 4333 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
4334 *
4335 * Return: The load index.
4283 */ 4336 */
4284static inline int get_sd_load_idx(struct sched_domain *sd, 4337static inline int get_sd_load_idx(struct sched_domain *sd,
4285 enum cpu_idle_type idle) 4338 enum cpu_idle_type idle)
@@ -4457,33 +4510,99 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4457 return 0; 4510 return 0;
4458} 4511}
4459 4512
4513/*
4514 * Group imbalance indicates (and tries to solve) the problem where balancing
4515 * groups is inadequate due to tsk_cpus_allowed() constraints.
4516 *
4517 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
4518 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
4519 * Something like:
4520 *
4521 * { 0 1 2 3 } { 4 5 6 7 }
4522 * * * * *
4523 *
4524 * If we were to balance group-wise we'd place two tasks in the first group and
4525 * two tasks in the second group. Clearly this is undesired as it will overload
4526 * cpu 3 and leave one of the cpus in the second group unused.
4527 *
4528 * The current solution to this issue is detecting the skew in the first group
4529 * by noticing it has a cpu that is overloaded while the remaining cpus are
4530 * idle -- or rather, there's a distinct imbalance in the cpus; see
4531 * sg_imbalanced().
4532 *
4533 * When this is detected, the group becomes a candidate for busiest; see
4534 * update_sd_pick_busiest(). Then calculate_imbalance() and
4535 * find_busiest_group() skip some of the usual balance conditions to allow
4536 * the balancer to create an effective group imbalance.
4537 *
4538 * This is a somewhat tricky proposition since the next run might not find the
4539 * group imbalance and decide the groups need to be balanced again. A most
4540 * subtle and fragile situation.
4541 */
4542
4543struct sg_imb_stats {
4544 unsigned long max_nr_running, min_nr_running;
4545 unsigned long max_cpu_load, min_cpu_load;
4546};
4547
4548static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
4549{
4550 sgi->max_cpu_load = sgi->max_nr_running = 0UL;
4551 sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
4552}
4553
4554static inline void
4555update_sg_imb_stats(struct sg_imb_stats *sgi,
4556 unsigned long load, unsigned long nr_running)
4557{
4558 if (load > sgi->max_cpu_load)
4559 sgi->max_cpu_load = load;
4560 if (sgi->min_cpu_load > load)
4561 sgi->min_cpu_load = load;
4562
4563 if (nr_running > sgi->max_nr_running)
4564 sgi->max_nr_running = nr_running;
4565 if (sgi->min_nr_running > nr_running)
4566 sgi->min_nr_running = nr_running;
4567}
4568
4569static inline int
4570sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
4571{
4572 /*
4573 * Consider the group unbalanced when the imbalance is larger
4574 * than the average weight of a task.
4575 *
4576 * APZ: with cgroup the avg task weight can vary wildly and
4577 * might not be a suitable number - should we keep a
4578 * normalized nr_running number somewhere that negates
4579 * the hierarchy?
4580 */
4581 if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
4582 (sgi->max_nr_running - sgi->min_nr_running) > 1)
4583 return 1;
4584
4585 return 0;
4586}
4587
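sg_imbalanced() above turns the old inline test into a helper: a group counts as imbalanced when the load spread between its CPUs exceeds one average task and the nr_running spread is more than one. A tiny standalone check with invented statistics:

#include <stdio.h>

/* Same test as sg_imbalanced(), fed with made-up per-group statistics. */
static int sg_imbalanced(unsigned long max_load, unsigned long min_load,
			 unsigned long max_nr, unsigned long min_nr,
			 unsigned long load_per_task)
{
	return (max_load - min_load) >= load_per_task && (max_nr - min_nr) > 1;
}

int main(void)
{
	/* One CPU stuck with 3 pinned tasks while its siblings idle. */
	printf("%d\n", sg_imbalanced(3072, 0, 3, 0, 1024));  /* 1: imbalanced */
	/* Evenly spread group: spread smaller than one average task. */
	printf("%d\n", sg_imbalanced(1100, 900, 1, 1, 1024)); /* 0 */
	return 0;
}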
4460/** 4588/**
4461 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 4589 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
4462 * @env: The load balancing environment. 4590 * @env: The load balancing environment.
4463 * @group: sched_group whose statistics are to be updated. 4591 * @group: sched_group whose statistics are to be updated.
4464 * @load_idx: Load index of sched_domain of this_cpu for load calc. 4592 * @load_idx: Load index of sched_domain of this_cpu for load calc.
4465 * @local_group: Does group contain this_cpu. 4593 * @local_group: Does group contain this_cpu.
4466 * @balance: Should we balance.
4467 * @sgs: variable to hold the statistics for this group. 4594 * @sgs: variable to hold the statistics for this group.
4468 */ 4595 */
4469static inline void update_sg_lb_stats(struct lb_env *env, 4596static inline void update_sg_lb_stats(struct lb_env *env,
4470 struct sched_group *group, int load_idx, 4597 struct sched_group *group, int load_idx,
4471 int local_group, int *balance, struct sg_lb_stats *sgs) 4598 int local_group, struct sg_lb_stats *sgs)
4472{ 4599{
4473 unsigned long nr_running, max_nr_running, min_nr_running; 4600 struct sg_imb_stats sgi;
4474 unsigned long load, max_cpu_load, min_cpu_load; 4601 unsigned long nr_running;
4475 unsigned int balance_cpu = -1, first_idle_cpu = 0; 4602 unsigned long load;
4476 unsigned long avg_load_per_task = 0;
4477 int i; 4603 int i;
4478 4604
4479 if (local_group) 4605 init_sg_imb_stats(&sgi);
4480 balance_cpu = group_balance_cpu(group);
4481
4482 /* Tally up the load of all CPUs in the group */
4483 max_cpu_load = 0;
4484 min_cpu_load = ~0UL;
4485 max_nr_running = 0;
4486 min_nr_running = ~0UL;
4487 4606
4488 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 4607 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4489 struct rq *rq = cpu_rq(i); 4608 struct rq *rq = cpu_rq(i);
@@ -4492,24 +4611,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4492 4611
4493 /* Bias balancing toward cpus of our domain */ 4612 /* Bias balancing toward cpus of our domain */
4494 if (local_group) { 4613 if (local_group) {
4495 if (idle_cpu(i) && !first_idle_cpu &&
4496 cpumask_test_cpu(i, sched_group_mask(group))) {
4497 first_idle_cpu = 1;
4498 balance_cpu = i;
4499 }
4500
4501 load = target_load(i, load_idx); 4614 load = target_load(i, load_idx);
4502 } else { 4615 } else {
4503 load = source_load(i, load_idx); 4616 load = source_load(i, load_idx);
4504 if (load > max_cpu_load) 4617 update_sg_imb_stats(&sgi, load, nr_running);
4505 max_cpu_load = load;
4506 if (min_cpu_load > load)
4507 min_cpu_load = load;
4508
4509 if (nr_running > max_nr_running)
4510 max_nr_running = nr_running;
4511 if (min_nr_running > nr_running)
4512 min_nr_running = nr_running;
4513 } 4618 }
4514 4619
4515 sgs->group_load += load; 4620 sgs->group_load += load;
@@ -4519,46 +4624,25 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4519 sgs->idle_cpus++; 4624 sgs->idle_cpus++;
4520 } 4625 }
4521 4626
4522 /* 4627 if (local_group && (env->idle != CPU_NEWLY_IDLE ||
4523 * First idle cpu or the first cpu(busiest) in this sched group 4628 time_after_eq(jiffies, group->sgp->next_update)))
4524 * is eligible for doing load balancing at this and above 4629 update_group_power(env->sd, env->dst_cpu);
4525 * domains. In the newly idle case, we will allow all the cpu's
4526 * to do the newly idle load balance.
4527 */
4528 if (local_group) {
4529 if (env->idle != CPU_NEWLY_IDLE) {
4530 if (balance_cpu != env->dst_cpu) {
4531 *balance = 0;
4532 return;
4533 }
4534 update_group_power(env->sd, env->dst_cpu);
4535 } else if (time_after_eq(jiffies, group->sgp->next_update))
4536 update_group_power(env->sd, env->dst_cpu);
4537 }
4538 4630
4539 /* Adjust by relative CPU power of the group */ 4631 /* Adjust by relative CPU power of the group */
4540 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; 4632 sgs->group_power = group->sgp->power;
4633 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
4541 4634
4542 /*
4543 * Consider the group unbalanced when the imbalance is larger
4544 * than the average weight of a task.
4545 *
4546 * APZ: with cgroup the avg task weight can vary wildly and
4547 * might not be a suitable number - should we keep a
4548 * normalized nr_running number somewhere that negates
4549 * the hierarchy?
4550 */
4551 if (sgs->sum_nr_running) 4635 if (sgs->sum_nr_running)
4552 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 4636 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4637
4638 sgs->group_imb = sg_imbalanced(sgs, &sgi);
4553 4639
4554 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && 4640 sgs->group_capacity =
4555 (max_nr_running - min_nr_running) > 1) 4641 DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
4556 sgs->group_imb = 1;
4557 4642
4558 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
4559 SCHED_POWER_SCALE);
4560 if (!sgs->group_capacity) 4643 if (!sgs->group_capacity)
4561 sgs->group_capacity = fix_small_capacity(env->sd, group); 4644 sgs->group_capacity = fix_small_capacity(env->sd, group);
4645
4562 sgs->group_weight = group->group_weight; 4646 sgs->group_weight = group->group_weight;
4563 4647
4564 if (sgs->group_capacity > sgs->sum_nr_running) 4648 if (sgs->group_capacity > sgs->sum_nr_running)
@@ -4574,13 +4658,16 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4574 * 4658 *
4575 * Determine if @sg is a busier group than the previously selected 4659 * Determine if @sg is a busier group than the previously selected
4576 * busiest group. 4660 * busiest group.
4661 *
4662 * Return: %true if @sg is a busier group than the previously selected
4663 * busiest group. %false otherwise.
4577 */ 4664 */
4578static bool update_sd_pick_busiest(struct lb_env *env, 4665static bool update_sd_pick_busiest(struct lb_env *env,
4579 struct sd_lb_stats *sds, 4666 struct sd_lb_stats *sds,
4580 struct sched_group *sg, 4667 struct sched_group *sg,
4581 struct sg_lb_stats *sgs) 4668 struct sg_lb_stats *sgs)
4582{ 4669{
4583 if (sgs->avg_load <= sds->max_load) 4670 if (sgs->avg_load <= sds->busiest_stat.avg_load)
4584 return false; 4671 return false;
4585 4672
4586 if (sgs->sum_nr_running > sgs->group_capacity) 4673 if (sgs->sum_nr_running > sgs->group_capacity)
@@ -4613,11 +4700,11 @@ static bool update_sd_pick_busiest(struct lb_env *env,
4613 * @sds: variable to hold the statistics for this sched_domain. 4700 * @sds: variable to hold the statistics for this sched_domain.
4614 */ 4701 */
4615static inline void update_sd_lb_stats(struct lb_env *env, 4702static inline void update_sd_lb_stats(struct lb_env *env,
4616 int *balance, struct sd_lb_stats *sds) 4703 struct sd_lb_stats *sds)
4617{ 4704{
4618 struct sched_domain *child = env->sd->child; 4705 struct sched_domain *child = env->sd->child;
4619 struct sched_group *sg = env->sd->groups; 4706 struct sched_group *sg = env->sd->groups;
4620 struct sg_lb_stats sgs; 4707 struct sg_lb_stats tmp_sgs;
4621 int load_idx, prefer_sibling = 0; 4708 int load_idx, prefer_sibling = 0;
4622 4709
4623 if (child && child->flags & SD_PREFER_SIBLING) 4710 if (child && child->flags & SD_PREFER_SIBLING)
@@ -4626,17 +4713,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4626 load_idx = get_sd_load_idx(env->sd, env->idle); 4713 load_idx = get_sd_load_idx(env->sd, env->idle);
4627 4714
4628 do { 4715 do {
4716 struct sg_lb_stats *sgs = &tmp_sgs;
4629 int local_group; 4717 int local_group;
4630 4718
4631 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); 4719 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
4632 memset(&sgs, 0, sizeof(sgs)); 4720 if (local_group) {
4633 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); 4721 sds->local = sg;
4634 4722 sgs = &sds->local_stat;
4635 if (local_group && !(*balance)) 4723 }
4636 return;
4637 4724
4638 sds->total_load += sgs.group_load; 4725 memset(sgs, 0, sizeof(*sgs));
4639 sds->total_pwr += sg->sgp->power; 4726 update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
4640 4727
4641 /* 4728 /*
4642 * In case the child domain prefers tasks go to siblings 4729 * In case the child domain prefers tasks go to siblings
@@ -4648,26 +4735,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4648 * heaviest group when it is already under-utilized (possible 4735 * heaviest group when it is already under-utilized (possible
4649 * with a large weight task outweighs the tasks on the system). 4736 * with a large weight task outweighs the tasks on the system).
4650 */ 4737 */
4651 if (prefer_sibling && !local_group && sds->this_has_capacity) 4738 if (prefer_sibling && !local_group &&
4652 sgs.group_capacity = min(sgs.group_capacity, 1UL); 4739 sds->local && sds->local_stat.group_has_capacity)
4740 sgs->group_capacity = min(sgs->group_capacity, 1U);
4653 4741
4654 if (local_group) { 4742 /* Now, start updating sd_lb_stats */
4655 sds->this_load = sgs.avg_load; 4743 sds->total_load += sgs->group_load;
4656 sds->this = sg; 4744 sds->total_pwr += sgs->group_power;
4657 sds->this_nr_running = sgs.sum_nr_running; 4745
4658 sds->this_load_per_task = sgs.sum_weighted_load; 4746 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
4659 sds->this_has_capacity = sgs.group_has_capacity;
4660 sds->this_idle_cpus = sgs.idle_cpus;
4661 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
4662 sds->max_load = sgs.avg_load;
4663 sds->busiest = sg; 4747 sds->busiest = sg;
4664 sds->busiest_nr_running = sgs.sum_nr_running; 4748 sds->busiest_stat = *sgs;
4665 sds->busiest_idle_cpus = sgs.idle_cpus;
4666 sds->busiest_group_capacity = sgs.group_capacity;
4667 sds->busiest_load_per_task = sgs.sum_weighted_load;
4668 sds->busiest_has_capacity = sgs.group_has_capacity;
4669 sds->busiest_group_weight = sgs.group_weight;
4670 sds->group_imb = sgs.group_imb;
4671 } 4749 }
4672 4750
4673 sg = sg->next; 4751 sg = sg->next;
@@ -4691,7 +4769,7 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4691 * assuming a lower CPU number will be equivalent to a lower SMT thread 4769 * assuming a lower CPU number will be equivalent to a lower SMT thread
4692 * number. 4770 * number.
4693 * 4771 *
4694 * Returns 1 when packing is required and a task should be moved to 4772 * Return: 1 when packing is required and a task should be moved to
4695 * this CPU. The amount of the imbalance is returned in *imbalance. 4773 * this CPU. The amount of the imbalance is returned in *imbalance.
4696 * 4774 *
4697 * @env: The load balancing environment. 4775 * @env: The load balancing environment.
@@ -4712,7 +4790,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4712 return 0; 4790 return 0;
4713 4791
4714 env->imbalance = DIV_ROUND_CLOSEST( 4792 env->imbalance = DIV_ROUND_CLOSEST(
4715 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); 4793 sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
4794 SCHED_POWER_SCALE);
4716 4795
4717 return 1; 4796 return 1;
4718} 4797}
@@ -4730,24 +4809,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4730 unsigned long tmp, pwr_now = 0, pwr_move = 0; 4809 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4731 unsigned int imbn = 2; 4810 unsigned int imbn = 2;
4732 unsigned long scaled_busy_load_per_task; 4811 unsigned long scaled_busy_load_per_task;
4812 struct sg_lb_stats *local, *busiest;
4733 4813
4734 if (sds->this_nr_running) { 4814 local = &sds->local_stat;
4735 sds->this_load_per_task /= sds->this_nr_running; 4815 busiest = &sds->busiest_stat;
4736 if (sds->busiest_load_per_task > 4816
4737 sds->this_load_per_task) 4817 if (!local->sum_nr_running)
4738 imbn = 1; 4818 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
4739 } else { 4819 else if (busiest->load_per_task > local->load_per_task)
4740 sds->this_load_per_task = 4820 imbn = 1;
4741 cpu_avg_load_per_task(env->dst_cpu);
4742 }
4743 4821
4744 scaled_busy_load_per_task = sds->busiest_load_per_task 4822 scaled_busy_load_per_task =
4745 * SCHED_POWER_SCALE; 4823 (busiest->load_per_task * SCHED_POWER_SCALE) /
4746 scaled_busy_load_per_task /= sds->busiest->sgp->power; 4824 busiest->group_power;
4747 4825
4748 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 4826 if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >=
4749 (scaled_busy_load_per_task * imbn)) { 4827 (scaled_busy_load_per_task * imbn)) {
4750 env->imbalance = sds->busiest_load_per_task; 4828 env->imbalance = busiest->load_per_task;
4751 return; 4829 return;
4752 } 4830 }
4753 4831
@@ -4757,34 +4835,37 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4757 * moving them. 4835 * moving them.
4758 */ 4836 */
4759 4837
4760 pwr_now += sds->busiest->sgp->power * 4838 pwr_now += busiest->group_power *
4761 min(sds->busiest_load_per_task, sds->max_load); 4839 min(busiest->load_per_task, busiest->avg_load);
4762 pwr_now += sds->this->sgp->power * 4840 pwr_now += local->group_power *
4763 min(sds->this_load_per_task, sds->this_load); 4841 min(local->load_per_task, local->avg_load);
4764 pwr_now /= SCHED_POWER_SCALE; 4842 pwr_now /= SCHED_POWER_SCALE;
4765 4843
4766 /* Amount of load we'd subtract */ 4844 /* Amount of load we'd subtract */
4767 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / 4845 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
4768 sds->busiest->sgp->power; 4846 busiest->group_power;
4769 if (sds->max_load > tmp) 4847 if (busiest->avg_load > tmp) {
4770 pwr_move += sds->busiest->sgp->power * 4848 pwr_move += busiest->group_power *
4771 min(sds->busiest_load_per_task, sds->max_load - tmp); 4849 min(busiest->load_per_task,
4850 busiest->avg_load - tmp);
4851 }
4772 4852
4773 /* Amount of load we'd add */ 4853 /* Amount of load we'd add */
4774 if (sds->max_load * sds->busiest->sgp->power < 4854 if (busiest->avg_load * busiest->group_power <
4775 sds->busiest_load_per_task * SCHED_POWER_SCALE) 4855 busiest->load_per_task * SCHED_POWER_SCALE) {
4776 tmp = (sds->max_load * sds->busiest->sgp->power) / 4856 tmp = (busiest->avg_load * busiest->group_power) /
4777 sds->this->sgp->power; 4857 local->group_power;
4778 else 4858 } else {
4779 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / 4859 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
4780 sds->this->sgp->power; 4860 local->group_power;
4781 pwr_move += sds->this->sgp->power * 4861 }
4782 min(sds->this_load_per_task, sds->this_load + tmp); 4862 pwr_move += local->group_power *
4863 min(local->load_per_task, local->avg_load + tmp);
4783 pwr_move /= SCHED_POWER_SCALE; 4864 pwr_move /= SCHED_POWER_SCALE;
4784 4865
4785 /* Move if we gain throughput */ 4866 /* Move if we gain throughput */
4786 if (pwr_move > pwr_now) 4867 if (pwr_move > pwr_now)
4787 env->imbalance = sds->busiest_load_per_task; 4868 env->imbalance = busiest->load_per_task;
4788} 4869}
4789 4870
4790/** 4871/**
@@ -4796,11 +4877,18 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4796static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) 4877static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4797{ 4878{
4798 unsigned long max_pull, load_above_capacity = ~0UL; 4879 unsigned long max_pull, load_above_capacity = ~0UL;
4880 struct sg_lb_stats *local, *busiest;
4799 4881
4800 sds->busiest_load_per_task /= sds->busiest_nr_running; 4882 local = &sds->local_stat;
4801 if (sds->group_imb) { 4883 busiest = &sds->busiest_stat;
4802 sds->busiest_load_per_task = 4884
4803 min(sds->busiest_load_per_task, sds->avg_load); 4885 if (busiest->group_imb) {
4886 /*
4887 * In the group_imb case we cannot rely on group-wide averages
4888 * to ensure cpu-load equilibrium, look at wider averages. XXX
4889 */
4890 busiest->load_per_task =
4891 min(busiest->load_per_task, sds->avg_load);
4804 } 4892 }
4805 4893
4806 /* 4894 /*
@@ -4808,21 +4896,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4808 * max load less than avg load(as we skip the groups at or below 4896 * max load less than avg load(as we skip the groups at or below
4809 * its cpu_power, while calculating max_load..) 4897 * its cpu_power, while calculating max_load..)
4810 */ 4898 */
4811 if (sds->max_load < sds->avg_load) { 4899 if (busiest->avg_load < sds->avg_load) {
4812 env->imbalance = 0; 4900 env->imbalance = 0;
4813 return fix_small_imbalance(env, sds); 4901 return fix_small_imbalance(env, sds);
4814 } 4902 }
4815 4903
4816 if (!sds->group_imb) { 4904 if (!busiest->group_imb) {
4817 /* 4905 /*
4818 * Don't want to pull so many tasks that a group would go idle. 4906 * Don't want to pull so many tasks that a group would go idle.
4907 * Except of course for the group_imb case, since then we might
4908 * have to drop below capacity to reach cpu-load equilibrium.
4819 */ 4909 */
4820 load_above_capacity = (sds->busiest_nr_running - 4910 load_above_capacity =
4821 sds->busiest_group_capacity); 4911 (busiest->sum_nr_running - busiest->group_capacity);
4822 4912
4823 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); 4913 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
4824 4914 load_above_capacity /= busiest->group_power;
4825 load_above_capacity /= sds->busiest->sgp->power;
4826 } 4915 }
4827 4916
4828 /* 4917 /*
@@ -4832,15 +4921,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4832 * we also don't want to reduce the group load below the group capacity 4921 * we also don't want to reduce the group load below the group capacity
4833 * (so that we can implement power-savings policies etc). Thus we look 4922 * (so that we can implement power-savings policies etc). Thus we look
4834 * for the minimum possible imbalance. 4923 * for the minimum possible imbalance.
4835 * Be careful of negative numbers as they'll appear as very large values
4836 * with unsigned longs.
4837 */ 4924 */
4838 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 4925 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
4839 4926
4840 /* How much load to actually move to equalise the imbalance */ 4927 /* How much load to actually move to equalise the imbalance */
4841 env->imbalance = min(max_pull * sds->busiest->sgp->power, 4928 env->imbalance = min(
4842 (sds->avg_load - sds->this_load) * sds->this->sgp->power) 4929 max_pull * busiest->group_power,
4843 / SCHED_POWER_SCALE; 4930 (sds->avg_load - local->avg_load) * local->group_power
4931 ) / SCHED_POWER_SCALE;
4844 4932
4845 /* 4933 /*
4846 * if *imbalance is less than the average load per runnable task 4934 * if *imbalance is less than the average load per runnable task
@@ -4848,9 +4936,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4848 * a think about bumping its value to force at least one task to be 4936 * a think about bumping its value to force at least one task to be
4849 * moved 4937 * moved
4850 */ 4938 */
4851 if (env->imbalance < sds->busiest_load_per_task) 4939 if (env->imbalance < busiest->load_per_task)
4852 return fix_small_imbalance(env, sds); 4940 return fix_small_imbalance(env, sds);
4853
4854} 4941}
4855 4942
4856/******* find_busiest_group() helpers end here *********************/ 4943/******* find_busiest_group() helpers end here *********************/
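Pulling the pieces together, calculate_imbalance() above picks the smaller of what the busiest group carries above the domain average and what the local group can absorb below it, each scaled by the group's power. A small worked example with invented stats (no capacity cap applied):

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL

/* Same arithmetic as calculate_imbalance(), with invented example values. */
int main(void)
{
	unsigned long busiest_avg = 2048, local_avg = 512, sds_avg = 1280;
	unsigned long busiest_power = 1024, local_power = 1024;
	unsigned long load_above_capacity = ~0UL;    /* no capacity cap here */

	unsigned long max_pull = busiest_avg - sds_avg;
	if (load_above_capacity < max_pull)
		max_pull = load_above_capacity;

	unsigned long pull_from_busiest = max_pull * busiest_power;
	unsigned long room_at_local = (sds_avg - local_avg) * local_power;
	unsigned long imbalance =
		(pull_from_busiest < room_at_local ?
		 pull_from_busiest : room_at_local) / SCHED_POWER_SCALE;

	printf("imbalance = %lu\n", imbalance);   /* min(768, 768) -> 768 */
	return 0;
}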
@@ -4866,69 +4953,62 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4866 * to restore balance. 4953 * to restore balance.
4867 * 4954 *
4868 * @env: The load balancing environment. 4955 * @env: The load balancing environment.
4869 * @balance: Pointer to a variable indicating if this_cpu
4870 * is the appropriate cpu to perform load balancing at this_level.
4871 * 4956 *
4872 * Returns: - the busiest group if imbalance exists. 4957 * Return: - The busiest group if imbalance exists.
4873 * - If no imbalance and user has opted for power-savings balance, 4958 * - If no imbalance and user has opted for power-savings balance,
4874 * return the least loaded group whose CPUs can be 4959 * return the least loaded group whose CPUs can be
4875 * put to idle by rebalancing its tasks onto our group. 4960 * put to idle by rebalancing its tasks onto our group.
4876 */ 4961 */
4877static struct sched_group * 4962static struct sched_group *find_busiest_group(struct lb_env *env)
4878find_busiest_group(struct lb_env *env, int *balance)
4879{ 4963{
4964 struct sg_lb_stats *local, *busiest;
4880 struct sd_lb_stats sds; 4965 struct sd_lb_stats sds;
4881 4966
4882 memset(&sds, 0, sizeof(sds)); 4967 init_sd_lb_stats(&sds);
4883 4968
4884 /* 4969 /*
4885 * Compute the various statistics relevant for load balancing at 4970 * Compute the various statistics relevant for load balancing at
4886 * this level. 4971 * this level.
4887 */ 4972 */
4888 update_sd_lb_stats(env, balance, &sds); 4973 update_sd_lb_stats(env, &sds);
4889 4974 local = &sds.local_stat;
4890 /* 4975 busiest = &sds.busiest_stat;
4891 * this_cpu is not the appropriate cpu to perform load balancing at
4892 * this level.
4893 */
4894 if (!(*balance))
4895 goto ret;
4896 4976
4897 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && 4977 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4898 check_asym_packing(env, &sds)) 4978 check_asym_packing(env, &sds))
4899 return sds.busiest; 4979 return sds.busiest;
4900 4980
4901 /* There is no busy sibling group to pull tasks from */ 4981 /* There is no busy sibling group to pull tasks from */
4902 if (!sds.busiest || sds.busiest_nr_running == 0) 4982 if (!sds.busiest || busiest->sum_nr_running == 0)
4903 goto out_balanced; 4983 goto out_balanced;
4904 4984
4905 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; 4985 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
4906 4986
4907 /* 4987 /*
4908 * If the busiest group is imbalanced the below checks don't 4988 * If the busiest group is imbalanced the below checks don't
4909 * work because they assumes all things are equal, which typically 4989 * work because they assume all things are equal, which typically
4910 * isn't true due to cpus_allowed constraints and the like. 4990 * isn't true due to cpus_allowed constraints and the like.
4911 */ 4991 */
4912 if (sds.group_imb) 4992 if (busiest->group_imb)
4913 goto force_balance; 4993 goto force_balance;
4914 4994
4915 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 4995 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4916 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 4996 if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
4917 !sds.busiest_has_capacity) 4997 !busiest->group_has_capacity)
4918 goto force_balance; 4998 goto force_balance;
4919 4999
4920 /* 5000 /*
4921 * If the local group is more busy than the selected busiest group 5001 * If the local group is more busy than the selected busiest group
4922 * don't try and pull any tasks. 5002 * don't try and pull any tasks.
4923 */ 5003 */
4924 if (sds.this_load >= sds.max_load) 5004 if (local->avg_load >= busiest->avg_load)
4925 goto out_balanced; 5005 goto out_balanced;
4926 5006
4927 /* 5007 /*
4928 * Don't pull any tasks if this group is already above the domain 5008 * Don't pull any tasks if this group is already above the domain
4929 * average load. 5009 * average load.
4930 */ 5010 */
4931 if (sds.this_load >= sds.avg_load) 5011 if (local->avg_load >= sds.avg_load)
4932 goto out_balanced; 5012 goto out_balanced;
4933 5013
4934 if (env->idle == CPU_IDLE) { 5014 if (env->idle == CPU_IDLE) {
@@ -4938,15 +5018,16 @@ find_busiest_group(struct lb_env *env, int *balance)
4938 * there is no imbalance between this and busiest group 5018 * there is no imbalance between this and busiest group
4939 * wrt to idle cpu's, it is balanced. 5019 * wrt to idle cpu's, it is balanced.
4940 */ 5020 */
4941 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && 5021 if ((local->idle_cpus < busiest->idle_cpus) &&
4942 sds.busiest_nr_running <= sds.busiest_group_weight) 5022 busiest->sum_nr_running <= busiest->group_weight)
4943 goto out_balanced; 5023 goto out_balanced;
4944 } else { 5024 } else {
4945 /* 5025 /*
4946 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use 5026 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4947 * imbalance_pct to be conservative. 5027 * imbalance_pct to be conservative.
4948 */ 5028 */
4949 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) 5029 if (100 * busiest->avg_load <=
5030 env->sd->imbalance_pct * local->avg_load)
4950 goto out_balanced; 5031 goto out_balanced;
4951 } 5032 }
4952 5033
@@ -4956,7 +5037,6 @@ force_balance:
4956 return sds.busiest; 5037 return sds.busiest;
4957 5038
4958out_balanced: 5039out_balanced:
4959ret:
4960 env->imbalance = 0; 5040 env->imbalance = 0;
4961 return NULL; 5041 return NULL;
4962} 5042}
@@ -4968,10 +5048,10 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4968 struct sched_group *group) 5048 struct sched_group *group)
4969{ 5049{
4970 struct rq *busiest = NULL, *rq; 5050 struct rq *busiest = NULL, *rq;
4971 unsigned long max_load = 0; 5051 unsigned long busiest_load = 0, busiest_power = 1;
4972 int i; 5052 int i;
4973 5053
4974 for_each_cpu(i, sched_group_cpus(group)) { 5054 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4975 unsigned long power = power_of(i); 5055 unsigned long power = power_of(i);
4976 unsigned long capacity = DIV_ROUND_CLOSEST(power, 5056 unsigned long capacity = DIV_ROUND_CLOSEST(power,
4977 SCHED_POWER_SCALE); 5057 SCHED_POWER_SCALE);
@@ -4980,9 +5060,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4980 if (!capacity) 5060 if (!capacity)
4981 capacity = fix_small_capacity(env->sd, group); 5061 capacity = fix_small_capacity(env->sd, group);
4982 5062
4983 if (!cpumask_test_cpu(i, env->cpus))
4984 continue;
4985
4986 rq = cpu_rq(i); 5063 rq = cpu_rq(i);
4987 wl = weighted_cpuload(i); 5064 wl = weighted_cpuload(i);
4988 5065
@@ -4998,11 +5075,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4998 * the weighted_cpuload() scaled with the cpu power, so that 5075 * the weighted_cpuload() scaled with the cpu power, so that
4999 * the load can be moved away from the cpu that is potentially 5076 * the load can be moved away from the cpu that is potentially
5000 * running at a lower capacity. 5077 * running at a lower capacity.
5078 *
5079 * Thus we're looking for max(wl_i / power_i), crosswise
5080 * multiplication to rid ourselves of the division works out
5081 * to: wl_i * power_j > wl_j * power_i; where j is our
5082 * previous maximum.
5001 */ 5083 */
5002 wl = (wl * SCHED_POWER_SCALE) / power; 5084 if (wl * busiest_power > busiest_load * power) {
5003 5085 busiest_load = wl;
5004 if (wl > max_load) { 5086 busiest_power = power;
5005 max_load = wl;
5006 busiest = rq; 5087 busiest = rq;
5007 } 5088 }
5008 } 5089 }
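The rewritten find_busiest_queue() loop above replaces the per-cpu division with a cross-multiplied comparison: wl_i/power_i > wl_j/power_j holds exactly when wl_i*power_j > wl_j*power_i for positive powers. A quick standalone check with invented loads and powers:

#include <stdio.h>

/*
 * Cross-multiplied comparison used by the new find_busiest_queue() loop:
 * wl_i / power_i > wl_j / power_j  <=>  wl_i * power_j > wl_j * power_i
 * (for positive powers). Values are invented.
 */
int main(void)
{
	unsigned long wl[]    = { 900, 1500, 700 };
	unsigned long power[] = { 1024, 2048, 512 };
	unsigned long busiest_load = 0, busiest_power = 1;
	int busiest = -1;

	for (int i = 0; i < 3; i++) {
		if (wl[i] * busiest_power > busiest_load * power[i]) {
			busiest_load = wl[i];
			busiest_power = power[i];
			busiest = i;
		}
	}
	/* Ratios: 0.88, 0.73, 1.37 -> index 2 wins. */
	printf("busiest cpu index = %d\n", busiest);
	return 0;
}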
@@ -5039,13 +5120,47 @@ static int need_active_balance(struct lb_env *env)
5039 5120
5040static int active_load_balance_cpu_stop(void *data); 5121static int active_load_balance_cpu_stop(void *data);
5041 5122
5123static int should_we_balance(struct lb_env *env)
5124{
5125 struct sched_group *sg = env->sd->groups;
5126 struct cpumask *sg_cpus, *sg_mask;
5127 int cpu, balance_cpu = -1;
5128
5129 /*
5130 * In the newly idle case, we will allow all the cpu's
5131 * to do the newly idle load balance.
5132 */
5133 if (env->idle == CPU_NEWLY_IDLE)
5134 return 1;
5135
5136 sg_cpus = sched_group_cpus(sg);
5137 sg_mask = sched_group_mask(sg);
5138 /* Try to find first idle cpu */
5139 for_each_cpu_and(cpu, sg_cpus, env->cpus) {
5140 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
5141 continue;
5142
5143 balance_cpu = cpu;
5144 break;
5145 }
5146
5147 if (balance_cpu == -1)
5148 balance_cpu = group_balance_cpu(sg);
5149
5150 /*
5151 * First idle cpu or the first cpu(busiest) in this sched group
5152 * is eligible for doing load balancing at this and above domains.
5153 */
5154 return balance_cpu != env->dst_cpu;
5155}
5156
5042/* 5157/*
5043 * Check this_cpu to ensure it is balanced within domain. Attempt to move 5158 * Check this_cpu to ensure it is balanced within domain. Attempt to move
5044 * tasks if there is an imbalance. 5159 * tasks if there is an imbalance.
5045 */ 5160 */
5046static int load_balance(int this_cpu, struct rq *this_rq, 5161static int load_balance(int this_cpu, struct rq *this_rq,
5047 struct sched_domain *sd, enum cpu_idle_type idle, 5162 struct sched_domain *sd, enum cpu_idle_type idle,
5048 int *balance) 5163 int *continue_balancing)
5049{ 5164{
5050 int ld_moved, cur_ld_moved, active_balance = 0; 5165 int ld_moved, cur_ld_moved, active_balance = 0;
5051 struct sched_group *group; 5166 struct sched_group *group;
@@ -5075,11 +5190,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5075 schedstat_inc(sd, lb_count[idle]); 5190 schedstat_inc(sd, lb_count[idle]);
5076 5191
5077redo: 5192redo:
5078 group = find_busiest_group(&env, balance); 5193 if (!should_we_balance(&env)) {
5079 5194 *continue_balancing = 0;
5080 if (*balance == 0)
5081 goto out_balanced; 5195 goto out_balanced;
5196 }
5082 5197
5198 group = find_busiest_group(&env);
5083 if (!group) { 5199 if (!group) {
5084 schedstat_inc(sd, lb_nobusyg[idle]); 5200 schedstat_inc(sd, lb_nobusyg[idle]);
5085 goto out_balanced; 5201 goto out_balanced;
@@ -5108,7 +5224,6 @@ redo:
5108 env.src_rq = busiest; 5224 env.src_rq = busiest;
5109 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 5225 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
5110 5226
5111 update_h_load(env.src_cpu);
5112more_balance: 5227more_balance:
5113 local_irq_save(flags); 5228 local_irq_save(flags);
5114 double_rq_lock(env.dst_rq, busiest); 5229 double_rq_lock(env.dst_rq, busiest);
@@ -5292,7 +5407,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5292 rcu_read_lock(); 5407 rcu_read_lock();
5293 for_each_domain(this_cpu, sd) { 5408 for_each_domain(this_cpu, sd) {
5294 unsigned long interval; 5409 unsigned long interval;
5295 int balance = 1; 5410 int continue_balancing = 1;
5296 5411
5297 if (!(sd->flags & SD_LOAD_BALANCE)) 5412 if (!(sd->flags & SD_LOAD_BALANCE))
5298 continue; 5413 continue;
@@ -5300,7 +5415,8 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5300 if (sd->flags & SD_BALANCE_NEWIDLE) { 5415 if (sd->flags & SD_BALANCE_NEWIDLE) {
5301 /* If we've pulled tasks over stop searching: */ 5416 /* If we've pulled tasks over stop searching: */
5302 pulled_task = load_balance(this_cpu, this_rq, 5417 pulled_task = load_balance(this_cpu, this_rq,
5303 sd, CPU_NEWLY_IDLE, &balance); 5418 sd, CPU_NEWLY_IDLE,
5419 &continue_balancing);
5304 } 5420 }
5305 5421
5306 interval = msecs_to_jiffies(sd->balance_interval); 5422 interval = msecs_to_jiffies(sd->balance_interval);
@@ -5506,7 +5622,7 @@ void nohz_balance_enter_idle(int cpu)
5506 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 5622 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5507} 5623}
5508 5624
5509static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, 5625static int sched_ilb_notifier(struct notifier_block *nfb,
5510 unsigned long action, void *hcpu) 5626 unsigned long action, void *hcpu)
5511{ 5627{
5512 switch (action & ~CPU_TASKS_FROZEN) { 5628 switch (action & ~CPU_TASKS_FROZEN) {
@@ -5538,7 +5654,7 @@ void update_max_interval(void)
5538 */ 5654 */
5539static void rebalance_domains(int cpu, enum cpu_idle_type idle) 5655static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5540{ 5656{
5541 int balance = 1; 5657 int continue_balancing = 1;
5542 struct rq *rq = cpu_rq(cpu); 5658 struct rq *rq = cpu_rq(cpu);
5543 unsigned long interval; 5659 unsigned long interval;
5544 struct sched_domain *sd; 5660 struct sched_domain *sd;
@@ -5570,7 +5686,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5570 } 5686 }
5571 5687
5572 if (time_after_eq(jiffies, sd->last_balance + interval)) { 5688 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5573 if (load_balance(cpu, rq, sd, idle, &balance)) { 5689 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
5574 /* 5690 /*
5575 * The LBF_SOME_PINNED logic could have changed 5691 * The LBF_SOME_PINNED logic could have changed
5576 * env->dst_cpu, so we can't know our idle 5692 * env->dst_cpu, so we can't know our idle
@@ -5593,7 +5709,7 @@ out:
5593 * CPU in our sched group which is doing load balancing more 5709 * CPU in our sched group which is doing load balancing more
5594 * actively. 5710 * actively.
5595 */ 5711 */
5596 if (!balance) 5712 if (!continue_balancing)
5597 break; 5713 break;
5598 } 5714 }
5599 rcu_read_unlock(); 5715 rcu_read_unlock();
@@ -5786,7 +5902,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
5786 entity_tick(cfs_rq, se, queued); 5902 entity_tick(cfs_rq, se, queued);
5787 } 5903 }
5788 5904
5789 if (sched_feat_numa(NUMA)) 5905 if (numabalancing_enabled)
5790 task_tick_numa(rq, curr); 5906 task_tick_numa(rq, curr);
5791 5907
5792 update_rq_runnable_avg(rq, 1); 5908 update_rq_runnable_avg(rq, 1);
@@ -5889,11 +6005,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
5889 * and ensure we don't carry in an old decay_count if we 6005 * and ensure we don't carry in an old decay_count if we
5890 * switch back. 6006 * switch back.
5891 */ 6007 */
5892 if (p->se.avg.decay_count) { 6008 if (se->avg.decay_count) {
5893 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); 6009 __synchronize_entity_decay(se);
5894 __synchronize_entity_decay(&p->se); 6010 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
5895 subtract_blocked_load_contrib(cfs_rq,
5896 p->se.avg.load_avg_contrib);
5897 } 6011 }
5898#endif 6012#endif
5899} 6013}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef0a7b2439dd..b3c5653e1dca 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -285,7 +285,6 @@ struct cfs_rq {
285 /* Required to track per-cpu representation of a task_group */ 285 /* Required to track per-cpu representation of a task_group */
286 u32 tg_runnable_contrib; 286 u32 tg_runnable_contrib;
287 unsigned long tg_load_contrib; 287 unsigned long tg_load_contrib;
288#endif /* CONFIG_FAIR_GROUP_SCHED */
289 288
290 /* 289 /*
291 * h_load = weight * f(tg) 290 * h_load = weight * f(tg)
@@ -294,6 +293,9 @@ struct cfs_rq {
294 * this group. 293 * this group.
295 */ 294 */
296 unsigned long h_load; 295 unsigned long h_load;
296 u64 last_h_load_update;
297 struct sched_entity *h_load_next;
298#endif /* CONFIG_FAIR_GROUP_SCHED */
297#endif /* CONFIG_SMP */ 299#endif /* CONFIG_SMP */
298 300
299#ifdef CONFIG_FAIR_GROUP_SCHED 301#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -429,9 +431,6 @@ struct rq {
429#ifdef CONFIG_FAIR_GROUP_SCHED 431#ifdef CONFIG_FAIR_GROUP_SCHED
430 /* list of leaf cfs_rq on this cpu: */ 432 /* list of leaf cfs_rq on this cpu: */
431 struct list_head leaf_cfs_rq_list; 433 struct list_head leaf_cfs_rq_list;
432#ifdef CONFIG_SMP
433 unsigned long h_load_throttle;
434#endif /* CONFIG_SMP */
435#endif /* CONFIG_FAIR_GROUP_SCHED */ 434#endif /* CONFIG_FAIR_GROUP_SCHED */
436 435
437#ifdef CONFIG_RT_GROUP_SCHED 436#ifdef CONFIG_RT_GROUP_SCHED
@@ -595,6 +594,7 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
595} 594}
596 595
597DECLARE_PER_CPU(struct sched_domain *, sd_llc); 596DECLARE_PER_CPU(struct sched_domain *, sd_llc);
597DECLARE_PER_CPU(int, sd_llc_size);
598DECLARE_PER_CPU(int, sd_llc_id); 598DECLARE_PER_CPU(int, sd_llc_id);
599 599
600struct sched_group_power { 600struct sched_group_power {
@@ -665,9 +665,9 @@ extern int group_balance_cpu(struct sched_group *sg);
665/* 665/*
666 * Return the group to which this task belongs. 666 * Return the group to which this task belongs.
667 * 667 *
668 * We cannot use task_subsys_state() and friends because the cgroup 668 * We cannot use task_css() and friends because the cgroup subsystem
669 * subsystem changes that value before the cgroup_subsys::attach() method 669 * changes that value before the cgroup_subsys::attach() method is called,
670 * is called, therefore we cannot pin it and might observe the wrong value. 670 * therefore we cannot pin it and might observe the wrong value.
671 * 671 *
672 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup 672 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
673 * core changes this before calling sched_move_task(). 673 * core changes this before calling sched_move_task().
diff --git a/kernel/smp.c b/kernel/smp.c
index 4dba0f7b72ad..449b707fc20d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -73,7 +73,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
73 return NOTIFY_OK; 73 return NOTIFY_OK;
74} 74}
75 75
76static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { 76static struct notifier_block hotplug_cfd_notifier = {
77 .notifier_call = hotplug_cfd, 77 .notifier_call = hotplug_cfd,
78}; 78};
79 79
@@ -186,25 +186,13 @@ void generic_smp_call_function_single_interrupt(void)
186 186
187 while (!list_empty(&list)) { 187 while (!list_empty(&list)) {
188 struct call_single_data *csd; 188 struct call_single_data *csd;
189 unsigned int csd_flags;
190 189
191 csd = list_entry(list.next, struct call_single_data, list); 190 csd = list_entry(list.next, struct call_single_data, list);
192 list_del(&csd->list); 191 list_del(&csd->list);
193 192
194 /*
195 * 'csd' can be invalid after this call if flags == 0
196 * (when called through generic_exec_single()),
197 * so save them away before making the call:
198 */
199 csd_flags = csd->flags;
200
201 csd->func(csd->info); 193 csd->func(csd->info);
202 194
203 /* 195 csd_unlock(csd);
204 * Unlocked CSDs are valid through generic_exec_single():
205 */
206 if (csd_flags & CSD_FLAG_LOCK)
207 csd_unlock(csd);
208 } 196 }
209} 197}
210 198
@@ -278,8 +266,6 @@ EXPORT_SYMBOL(smp_call_function_single);
278 * @wait: If true, wait until function has completed. 266 * @wait: If true, wait until function has completed.
279 * 267 *
280 * Returns 0 on success, else a negative status code (if no cpus were online). 268 * Returns 0 on success, else a negative status code (if no cpus were online).
281 * Note that @wait will be implicitly turned on in case of allocation failures,
282 * since we fall back to on-stack allocation.
283 * 269 *
284 * Selection preference: 270 * Selection preference:
285 * 1) current cpu if in @mask 271 * 1) current cpu if in @mask
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 02fc5c933673..eb89e1807408 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -24,7 +24,7 @@
24 */ 24 */
25static DEFINE_PER_CPU(struct task_struct *, idle_threads); 25static DEFINE_PER_CPU(struct task_struct *, idle_threads);
26 26
27struct task_struct * __cpuinit idle_thread_get(unsigned int cpu) 27struct task_struct *idle_thread_get(unsigned int cpu)
28{ 28{
29 struct task_struct *tsk = per_cpu(idle_threads, cpu); 29 struct task_struct *tsk = per_cpu(idle_threads, cpu);
30 30
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ca25e6e704a2..be3d3514c325 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -699,7 +699,7 @@ void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
699} 699}
700EXPORT_SYMBOL(send_remote_softirq); 700EXPORT_SYMBOL(send_remote_softirq);
701 701
702static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, 702static int remote_softirq_cpu_notify(struct notifier_block *self,
703 unsigned long action, void *hcpu) 703 unsigned long action, void *hcpu)
704{ 704{
705 /* 705 /*
@@ -728,7 +728,7 @@ static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
728 return NOTIFY_OK; 728 return NOTIFY_OK;
729} 729}
730 730
731static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { 731static struct notifier_block remote_softirq_cpu_notifier = {
732 .notifier_call = remote_softirq_cpu_notify, 732 .notifier_call = remote_softirq_cpu_notify,
733}; 733};
734 734
@@ -830,7 +830,7 @@ static void takeover_tasklets(unsigned int cpu)
830} 830}
831#endif /* CONFIG_HOTPLUG_CPU */ 831#endif /* CONFIG_HOTPLUG_CPU */
832 832
833static int __cpuinit cpu_callback(struct notifier_block *nfb, 833static int cpu_callback(struct notifier_block *nfb,
834 unsigned long action, 834 unsigned long action,
835 void *hcpu) 835 void *hcpu)
836{ 836{
@@ -845,7 +845,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
845 return NOTIFY_OK; 845 return NOTIFY_OK;
846} 846}
847 847
848static struct notifier_block __cpuinitdata cpu_nfb = { 848static struct notifier_block cpu_nfb = {
849 .notifier_call = cpu_callback 849 .notifier_call = cpu_callback
850}; 850};
851 851
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ac09d98490aa..07f6fc468e17 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2346,7 +2346,11 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
2346 int write, void *data) 2346 int write, void *data)
2347{ 2347{
2348 if (write) { 2348 if (write) {
2349 *valp = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); 2349 unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp);
2350
2351 if (jif > INT_MAX)
2352 return 1;
2353 *valp = (int)jif;
2350 } else { 2354 } else {
2351 int val = *valp; 2355 int val = *valp;
2352 unsigned long lval; 2356 unsigned long lval;
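The sysctl change above guards the ms-to-jiffies conversion: on 64-bit kernels msecs_to_jiffies() returns an unsigned long that can exceed INT_MAX, and storing that into the int-sized sysctl value would silently truncate, so the write is rejected instead. A simplified userspace sketch of the case being rejected (the conversion below assumes HZ=1000 and only stands in for the kernel helper):

#include <stdio.h>
#include <limits.h>

/* Simplified stand-in for msecs_to_jiffies() at HZ=1000: 1 ms == 1 jiffy. */
static unsigned long msecs_to_jiffies(unsigned long msecs)
{
	return msecs;
}

int main(void)
{
	unsigned long msecs = 3000000000UL;        /* roughly 34 days in milliseconds */
	unsigned long jif = msecs_to_jiffies(msecs);

	if (jif > INT_MAX)
		printf("rejected: %lu jiffies does not fit in an int\n", jif);
	else
		printf("stored: %d\n", (int)jif);   /* would truncate otherwise */
	return 0;
}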
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 70f27e89012b..2b62fe86f9ec 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -105,7 +105,6 @@ config NO_HZ_FULL
105 select RCU_USER_QS 105 select RCU_USER_QS
106 select RCU_NOCB_CPU 106 select RCU_NOCB_CPU
107 select VIRT_CPU_ACCOUNTING_GEN 107 select VIRT_CPU_ACCOUNTING_GEN
108 select CONTEXT_TRACKING_FORCE
109 select IRQ_WORK 108 select IRQ_WORK
110 help 109 help
111 Adaptively try to shutdown the tick whenever possible, even when 110 Adaptively try to shutdown the tick whenever possible, even when
@@ -134,6 +133,56 @@ config NO_HZ_FULL_ALL
134 Note the boot CPU will still be kept outside the range to 133 Note the boot CPU will still be kept outside the range to
135 handle the timekeeping duty. 134 handle the timekeeping duty.
136 135
136config NO_HZ_FULL_SYSIDLE
137 bool "Detect full-system idle state for full dynticks system"
138 depends on NO_HZ_FULL
139 default n
140 help
141 At least one CPU must keep the scheduling-clock tick running for
142 timekeeping purposes whenever there is a non-idle CPU, where
143 "non-idle" also includes dynticks CPUs as long as they are
144 running non-idle tasks. Because the underlying adaptive-tick
145 support cannot distinguish between all CPUs being idle and
146 all CPUs each running a single task in dynticks mode, the
147 underlying support simply ensures that there is always a CPU
148 handling the scheduling-clock tick, whether or not all CPUs
149 are idle. This Kconfig option enables scalable detection of
150 the all-CPUs-idle state, thus allowing the scheduling-clock
151 tick to be disabled when all CPUs are idle. Note that scalable
152 detection of the all-CPUs-idle state means that larger systems
153 will be slower to declare the all-CPUs-idle state.
154
155 Say Y if you would like to help debug all-CPUs-idle detection.
156
157 Say N if you are unsure.
158
159config NO_HZ_FULL_SYSIDLE_SMALL
160 int "Number of CPUs above which large-system approach is used"
161 depends on NO_HZ_FULL_SYSIDLE
162 range 1 NR_CPUS
163 default 8
164 help
165 The full-system idle detection mechanism takes a lazy approach
166 on large systems, as is required to attain decent scalability.
167 However, on smaller systems, scalability is not anywhere near as
168 large a concern as is energy efficiency. The sysidle subsystem
169 therefore uses a fast but non-scalable algorithm for small
170 systems and a lazier but scalable algorithm for large systems.
171 This Kconfig parameter defines the number of CPUs in the largest
172 system that will be considered to be "small".
173
174 The default value will be fine in most cases. Battery-powered
175 systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger
176 numbers of CPUs, and (3) are suffering from battery-lifetime
177 problems due to long sysidle latencies might wish to experiment
178 with larger values for this Kconfig parameter. On the other
179 hand, they might be even better served by disabling NO_HZ_FULL
180 entirely, given that NO_HZ_FULL is intended for HPC and
181 real-time workloads that at present do not tend to be run on
182 battery-powered systems.
183
184 Take the default if you are unsure.
185
137config NO_HZ 186config NO_HZ
138 bool "Old Idle dynticks config" 187 bool "Old Idle dynticks config"
139 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS 188 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index a326f27d7f09..0b479a6a22bb 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -121,7 +121,7 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
121 BUG_ON(bits > 32); 121 BUG_ON(bits > 32);
122 WARN_ON(!irqs_disabled()); 122 WARN_ON(!irqs_disabled());
123 read_sched_clock = read; 123 read_sched_clock = read;
124 sched_clock_mask = (1 << bits) - 1; 124 sched_clock_mask = (1ULL << bits) - 1;
125 cd.rate = rate; 125 cd.rate = rate;
126 126
127 /* calculate the mult/shift to convert counter ticks to ns. */ 127 /* calculate the mult/shift to convert counter ticks to ns. */
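The sched_clock change above matters for bits == 32: shifting a plain int by its full width is undefined, so (1 << 32) - 1 does not produce the intended mask, while the 1ULL constant does. A generic C illustration (not kernel code):

#include <stdio.h>

int main(void)
{
	int bits = 32;

	/* Old form: shifting a 32-bit int by 32 is undefined behaviour. */
	/* unsigned long long bad = (1 << bits) - 1; */

	/* New form: the 64-bit constant makes the shift well defined. */
	unsigned long long mask = (1ULL << bits) - 1;
	printf("mask = %#llx\n", mask);   /* 0xffffffff */
	return 0;
}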
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 69601726a745..3612fc77f834 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -23,6 +23,7 @@
23#include <linux/irq_work.h> 23#include <linux/irq_work.h>
24#include <linux/posix-timers.h> 24#include <linux/posix-timers.h>
25#include <linux/perf_event.h> 25#include <linux/perf_event.h>
26#include <linux/context_tracking.h>
26 27
27#include <asm/irq_regs.h> 28#include <asm/irq_regs.h>
28 29
@@ -148,8 +149,8 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
148} 149}
149 150
150#ifdef CONFIG_NO_HZ_FULL 151#ifdef CONFIG_NO_HZ_FULL
151static cpumask_var_t nohz_full_mask; 152cpumask_var_t tick_nohz_full_mask;
152bool have_nohz_full_mask; 153bool tick_nohz_full_running;
153 154
154static bool can_stop_full_tick(void) 155static bool can_stop_full_tick(void)
155{ 156{
@@ -182,7 +183,8 @@ static bool can_stop_full_tick(void)
182 * Don't allow the user to think they can get 183 * Don't allow the user to think they can get
183 * full NO_HZ with this machine. 184 * full NO_HZ with this machine.
184 */ 185 */
185 WARN_ONCE(1, "NO_HZ FULL will not work with unstable sched clock"); 186 WARN_ONCE(tick_nohz_full_running,
187 "NO_HZ FULL will not work with unstable sched clock");
186 return false; 188 return false;
187 } 189 }
188#endif 190#endif
@@ -196,7 +198,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
196 * Re-evaluate the need for the tick on the current CPU 198 * Re-evaluate the need for the tick on the current CPU
197 * and restart it if necessary. 199 * and restart it if necessary.
198 */ 200 */
199void tick_nohz_full_check(void) 201void __tick_nohz_full_check(void)
200{ 202{
201 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 203 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
202 204
@@ -210,7 +212,7 @@ void tick_nohz_full_check(void)
210 212
211static void nohz_full_kick_work_func(struct irq_work *work) 213static void nohz_full_kick_work_func(struct irq_work *work)
212{ 214{
213 tick_nohz_full_check(); 215 __tick_nohz_full_check();
214} 216}
215 217
216static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { 218static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
@@ -229,7 +231,7 @@ void tick_nohz_full_kick(void)
229 231
230static void nohz_full_kick_ipi(void *info) 232static void nohz_full_kick_ipi(void *info)
231{ 233{
232 tick_nohz_full_check(); 234 __tick_nohz_full_check();
233} 235}
234 236
235/* 237/*
@@ -238,12 +240,13 @@ static void nohz_full_kick_ipi(void *info)
238 */ 240 */
239void tick_nohz_full_kick_all(void) 241void tick_nohz_full_kick_all(void)
240{ 242{
241 if (!have_nohz_full_mask) 243 if (!tick_nohz_full_running)
242 return; 244 return;
243 245
244 preempt_disable(); 246 preempt_disable();
245 smp_call_function_many(nohz_full_mask, 247 smp_call_function_many(tick_nohz_full_mask,
246 nohz_full_kick_ipi, NULL, false); 248 nohz_full_kick_ipi, NULL, false);
249 tick_nohz_full_kick();
247 preempt_enable(); 250 preempt_enable();
248} 251}
249 252
@@ -252,7 +255,7 @@ void tick_nohz_full_kick_all(void)
252 * It might need the tick due to per task/process properties: 255 * It might need the tick due to per task/process properties:
253 * perf events, posix cpu timers, ... 256 * perf events, posix cpu timers, ...
254 */ 257 */
255void tick_nohz_task_switch(struct task_struct *tsk) 258void __tick_nohz_task_switch(struct task_struct *tsk)
256{ 259{
257 unsigned long flags; 260 unsigned long flags;
258 261
@@ -268,37 +271,29 @@ out:
268 local_irq_restore(flags); 271 local_irq_restore(flags);
269} 272}
270 273
271int tick_nohz_full_cpu(int cpu)
272{
273 if (!have_nohz_full_mask)
274 return 0;
275
276 return cpumask_test_cpu(cpu, nohz_full_mask);
277}
278
279/* Parse the boot-time nohz CPU list from the kernel parameters. */ 274/* Parse the boot-time nohz CPU list from the kernel parameters. */
280static int __init tick_nohz_full_setup(char *str) 275static int __init tick_nohz_full_setup(char *str)
281{ 276{
282 int cpu; 277 int cpu;
283 278
284 alloc_bootmem_cpumask_var(&nohz_full_mask); 279 alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
285 if (cpulist_parse(str, nohz_full_mask) < 0) { 280 if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
286 pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); 281 pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
287 return 1; 282 return 1;
288 } 283 }
289 284
290 cpu = smp_processor_id(); 285 cpu = smp_processor_id();
291 if (cpumask_test_cpu(cpu, nohz_full_mask)) { 286 if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
292 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); 287 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
293 cpumask_clear_cpu(cpu, nohz_full_mask); 288 cpumask_clear_cpu(cpu, tick_nohz_full_mask);
294 } 289 }
295 have_nohz_full_mask = true; 290 tick_nohz_full_running = true;
296 291
297 return 1; 292 return 1;
298} 293}
299__setup("nohz_full=", tick_nohz_full_setup); 294__setup("nohz_full=", tick_nohz_full_setup);
300 295
301static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, 296static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
302 unsigned long action, 297 unsigned long action,
303 void *hcpu) 298 void *hcpu)
304{ 299{
@@ -310,7 +305,7 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb,
310 * If we handle the timekeeping duty for full dynticks CPUs, 305 * If we handle the timekeeping duty for full dynticks CPUs,
311 * we can't safely shutdown that CPU. 306 * we can't safely shutdown that CPU.
312 */ 307 */
313 if (have_nohz_full_mask && tick_do_timer_cpu == cpu) 308 if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
314 return NOTIFY_BAD; 309 return NOTIFY_BAD;
315 break; 310 break;
316 } 311 }
@@ -329,14 +324,14 @@ static int tick_nohz_init_all(void)
329 int err = -1; 324 int err = -1;
330 325
331#ifdef CONFIG_NO_HZ_FULL_ALL 326#ifdef CONFIG_NO_HZ_FULL_ALL
332 if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { 327 if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
333 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); 328 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
334 return err; 329 return err;
335 } 330 }
336 err = 0; 331 err = 0;
337 cpumask_setall(nohz_full_mask); 332 cpumask_setall(tick_nohz_full_mask);
338 cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); 333 cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
339 have_nohz_full_mask = true; 334 tick_nohz_full_running = true;
340#endif 335#endif
341 return err; 336 return err;
342} 337}
@@ -345,17 +340,18 @@ void __init tick_nohz_init(void)
345{ 340{
346 int cpu; 341 int cpu;
347 342
348 if (!have_nohz_full_mask) { 343 if (!tick_nohz_full_running) {
349 if (tick_nohz_init_all() < 0) 344 if (tick_nohz_init_all() < 0)
350 return; 345 return;
351 } 346 }
352 347
348 for_each_cpu(cpu, tick_nohz_full_mask)
349 context_tracking_cpu_set(cpu);
350
353 cpu_notifier(tick_nohz_cpu_down_callback, 0); 351 cpu_notifier(tick_nohz_cpu_down_callback, 0);
354 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); 352 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask);
355 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); 353 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
356} 354}
357#else
358#define have_nohz_full_mask (0)
359#endif 355#endif
360 356
361/* 357/*
@@ -733,7 +729,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
733 return false; 729 return false;
734 } 730 }
735 731
736 if (have_nohz_full_mask) { 732 if (tick_nohz_full_enabled()) {
737 /* 733 /*
738 * Keep the tick alive to guarantee timekeeping progression 734 * Keep the tick alive to guarantee timekeeping progression
739 * if there are full dynticks CPUs around 735 * if there are full dynticks CPUs around
@@ -827,13 +823,10 @@ void tick_nohz_irq_exit(void)
827{ 823{
828 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 824 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
829 825
830 if (ts->inidle) { 826 if (ts->inidle)
831 /* Cancel the timer because CPU already waken up from the C-states*/
832 menu_hrtimer_cancel();
833 __tick_nohz_idle_enter(ts); 827 __tick_nohz_idle_enter(ts);
834 } else { 828 else
835 tick_nohz_full_stop_tick(ts); 829 tick_nohz_full_stop_tick(ts);
836 }
837} 830}
838 831
839/** 832/**
@@ -931,8 +924,6 @@ void tick_nohz_idle_exit(void)
931 924
932 ts->inidle = 0; 925 ts->inidle = 0;
933 926
934 /* Cancel the timer because CPU already waken up from the C-states*/
935 menu_hrtimer_cancel();
936 if (ts->idle_active || ts->tick_stopped) 927 if (ts->idle_active || ts->tick_stopped)
937 now = ktime_get(); 928 now = ktime_get();
938 929
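The tick-sched hunks rename the file-local nohz_full_mask/have_nohz_full_mask to the globally visible tick_nohz_full_mask/tick_nohz_full_running, and give tick_nohz_full_check()/tick_nohz_task_switch() a double-underscore prefix, which suggests the cheap "is full NO_HZ running at all?" test now lives in inline wrappers in the tick header rather than behind a function call. A sketch of what those helpers plausibly look like (assumed shape, not the verbatim header contents):

```c
/*
 * Sketch of the header-side helpers that the now-global
 * tick_nohz_full_mask / tick_nohz_full_running enable.
 */
#ifdef CONFIG_NO_HZ_FULL
extern cpumask_var_t tick_nohz_full_mask;
extern bool tick_nohz_full_running;

static inline bool tick_nohz_full_enabled(void)
{
	return tick_nohz_full_running;
}

static inline bool tick_nohz_full_cpu(int cpu)
{
	if (!tick_nohz_full_enabled())
		return false;
	return cpumask_test_cpu(cpu, tick_nohz_full_mask);
}
#else
static inline bool tick_nohz_full_enabled(void) { return false; }
static inline bool tick_nohz_full_cpu(int cpu) { return false; }
#endif
```

This matches the use of tick_nohz_full_enabled() in can_stop_idle_tick() later in the same file.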
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 3bdf28323012..61ed862cdd37 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -265,10 +265,9 @@ static inline void timer_list_header(struct seq_file *m, u64 now)
265static int timer_list_show(struct seq_file *m, void *v) 265static int timer_list_show(struct seq_file *m, void *v)
266{ 266{
267 struct timer_list_iter *iter = v; 267 struct timer_list_iter *iter = v;
268 u64 now = ktime_to_ns(ktime_get());
269 268
270 if (iter->cpu == -1 && !iter->second_pass) 269 if (iter->cpu == -1 && !iter->second_pass)
271 timer_list_header(m, now); 270 timer_list_header(m, iter->now);
272 else if (!iter->second_pass) 271 else if (!iter->second_pass)
273 print_cpu(m, iter->cpu, iter->now); 272 print_cpu(m, iter->cpu, iter->now);
274#ifdef CONFIG_GENERIC_CLOCKEVENTS 273#ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -298,33 +297,41 @@ void sysrq_timer_list_show(void)
298 return; 297 return;
299} 298}
300 299
301static void *timer_list_start(struct seq_file *file, loff_t *offset) 300static void *move_iter(struct timer_list_iter *iter, loff_t offset)
302{ 301{
303 struct timer_list_iter *iter = file->private; 302 for (; offset; offset--) {
304 303 iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
305 if (!*offset) { 304 if (iter->cpu >= nr_cpu_ids) {
306 iter->cpu = -1;
307 iter->now = ktime_to_ns(ktime_get());
308 } else if (iter->cpu >= nr_cpu_ids) {
309#ifdef CONFIG_GENERIC_CLOCKEVENTS 305#ifdef CONFIG_GENERIC_CLOCKEVENTS
310 if (!iter->second_pass) { 306 if (!iter->second_pass) {
311 iter->cpu = -1; 307 iter->cpu = -1;
312 iter->second_pass = true; 308 iter->second_pass = true;
313 } else 309 } else
314 return NULL; 310 return NULL;
315#else 311#else
316 return NULL; 312 return NULL;
317#endif 313#endif
314 }
318 } 315 }
319 return iter; 316 return iter;
320} 317}
321 318
319static void *timer_list_start(struct seq_file *file, loff_t *offset)
320{
321 struct timer_list_iter *iter = file->private;
322
323 if (!*offset)
324 iter->now = ktime_to_ns(ktime_get());
325 iter->cpu = -1;
326 iter->second_pass = false;
327 return move_iter(iter, *offset);
328}
329
322static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) 330static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset)
323{ 331{
324 struct timer_list_iter *iter = file->private; 332 struct timer_list_iter *iter = file->private;
325 iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
326 ++*offset; 333 ++*offset;
327 return timer_list_start(file, offset); 334 return move_iter(iter, 1);
328} 335}
329 336
330static void timer_list_stop(struct seq_file *seq, void *v) 337static void timer_list_stop(struct seq_file *seq, void *v)
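The timer_list change factors the "advance the iterator" loop out of timer_list_start() into move_iter(), so start() can honour an arbitrary resume offset (seq_file may reopen a partially read file at any *offset) while next() advances by exactly one; it also reuses iter->now for the header so the timestamp matches the per-CPU output. A compilable user-space analogue of the pattern, with purely illustrative names:

```c
#include <stdio.h>

/*
 * User-space analogue of the move_iter() pattern above: start() resets the
 * iterator and replays 'offset' steps, next() advances by one, and both
 * share the same stepping helper.
 */
struct iter_state {
	int cpu;		/* -1 encodes the "header" row */
	int second_pass;
	int ncpus;
};

static struct iter_state *move_iter(struct iter_state *it, long offset)
{
	for (; offset; offset--) {
		it->cpu++;
		if (it->cpu >= it->ncpus) {
			if (it->second_pass)
				return NULL;	/* both passes done */
			it->cpu = -1;		/* restart for second pass */
			it->second_pass = 1;
		}
	}
	return it;
}

static struct iter_state *iter_start(struct iter_state *it, long *pos)
{
	it->cpu = -1;
	it->second_pass = 0;
	return move_iter(it, *pos);
}

static struct iter_state *iter_next(struct iter_state *it, long *pos)
{
	++*pos;
	return move_iter(it, 1);
}

int main(void)
{
	struct iter_state it = { .ncpus = 2 };
	long pos = 0;

	for (struct iter_state *v = iter_start(&it, &pos); v; v = iter_next(v, &pos))
		printf("pos=%ld cpu=%d pass=%d\n", pos, v->cpu, v->second_pass);
	return 0;
}
```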
diff --git a/kernel/timer.c b/kernel/timer.c
index 15bc1b41021d..4296d13db3d1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1505,11 +1505,11 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1505} 1505}
1506EXPORT_SYMBOL(schedule_timeout_uninterruptible); 1506EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1507 1507
1508static int __cpuinit init_timers_cpu(int cpu) 1508static int init_timers_cpu(int cpu)
1509{ 1509{
1510 int j; 1510 int j;
1511 struct tvec_base *base; 1511 struct tvec_base *base;
1512 static char __cpuinitdata tvec_base_done[NR_CPUS]; 1512 static char tvec_base_done[NR_CPUS];
1513 1513
1514 if (!tvec_base_done[cpu]) { 1514 if (!tvec_base_done[cpu]) {
1515 static char boot_done; 1515 static char boot_done;
@@ -1577,7 +1577,7 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
1577 } 1577 }
1578} 1578}
1579 1579
1580static void __cpuinit migrate_timers(int cpu) 1580static void migrate_timers(int cpu)
1581{ 1581{
1582 struct tvec_base *old_base; 1582 struct tvec_base *old_base;
1583 struct tvec_base *new_base; 1583 struct tvec_base *new_base;
@@ -1610,7 +1610,7 @@ static void __cpuinit migrate_timers(int cpu)
1610} 1610}
1611#endif /* CONFIG_HOTPLUG_CPU */ 1611#endif /* CONFIG_HOTPLUG_CPU */
1612 1612
1613static int __cpuinit timer_cpu_notify(struct notifier_block *self, 1613static int timer_cpu_notify(struct notifier_block *self,
1614 unsigned long action, void *hcpu) 1614 unsigned long action, void *hcpu)
1615{ 1615{
1616 long cpu = (long)hcpu; 1616 long cpu = (long)hcpu;
@@ -1635,7 +1635,7 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1635 return NOTIFY_OK; 1635 return NOTIFY_OK;
1636} 1636}
1637 1637
1638static struct notifier_block __cpuinitdata timers_nb = { 1638static struct notifier_block timers_nb = {
1639 .notifier_call = timer_cpu_notify, 1639 .notifier_call = timer_cpu_notify,
1640}; 1640};
1641 1641
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 67708f46baae..a6d098c6df3f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1441,12 +1441,22 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1441 * the hashes are freed with call_rcu_sched(). 1441 * the hashes are freed with call_rcu_sched().
1442 */ 1442 */
1443static int 1443static int
1444ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) 1444ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
1445{ 1445{
1446 struct ftrace_hash *filter_hash; 1446 struct ftrace_hash *filter_hash;
1447 struct ftrace_hash *notrace_hash; 1447 struct ftrace_hash *notrace_hash;
1448 int ret; 1448 int ret;
1449 1449
1450#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
1451 /*
1452 * There's a small race when adding ops that the ftrace handler
1453 * that wants regs, may be called without them. We can not
1454 * allow that handler to be called if regs is NULL.
1455 */
1456 if (regs == NULL && (ops->flags & FTRACE_OPS_FL_SAVE_REGS))
1457 return 0;
1458#endif
1459
1450 filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); 1460 filter_hash = rcu_dereference_raw_notrace(ops->filter_hash);
1451 notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); 1461 notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash);
1452 1462
@@ -2159,12 +2169,57 @@ static cycle_t ftrace_update_time;
2159static unsigned long ftrace_update_cnt; 2169static unsigned long ftrace_update_cnt;
2160unsigned long ftrace_update_tot_cnt; 2170unsigned long ftrace_update_tot_cnt;
2161 2171
2162static int ops_traces_mod(struct ftrace_ops *ops) 2172static inline int ops_traces_mod(struct ftrace_ops *ops)
2163{ 2173{
2164 struct ftrace_hash *hash; 2174 /*
2175 * Filter_hash being empty will default to trace module.
2176 * But notrace hash requires a test of individual module functions.
2177 */
2178 return ftrace_hash_empty(ops->filter_hash) &&
2179 ftrace_hash_empty(ops->notrace_hash);
2180}
2181
2182/*
2183 * Check if the current ops references the record.
2184 *
2185 * If the ops traces all functions, then it was already accounted for.
2186 * If the ops does not trace the current record function, skip it.
2187 * If the ops ignores the function via notrace filter, skip it.
2188 */
2189static inline bool
2190ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
2191{
2192 /* If ops isn't enabled, ignore it */
2193 if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
2194 return 0;
2195
2196 /* If ops traces all mods, we already accounted for it */
2197 if (ops_traces_mod(ops))
2198 return 0;
2199
2200 /* The function must be in the filter */
2201 if (!ftrace_hash_empty(ops->filter_hash) &&
2202 !ftrace_lookup_ip(ops->filter_hash, rec->ip))
2203 return 0;
2204
2205 /* If in notrace hash, we ignore it too */
2206 if (ftrace_lookup_ip(ops->notrace_hash, rec->ip))
2207 return 0;
2165 2208
2166 hash = ops->filter_hash; 2209 return 1;
2167 return ftrace_hash_empty(hash); 2210}
2211
2212static int referenced_filters(struct dyn_ftrace *rec)
2213{
2214 struct ftrace_ops *ops;
2215 int cnt = 0;
2216
2217 for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
2218 if (ops_references_rec(ops, rec))
2219 cnt++;
2220 }
2221
2222 return cnt;
2168} 2223}
2169 2224
2170static int ftrace_update_code(struct module *mod) 2225static int ftrace_update_code(struct module *mod)
@@ -2173,6 +2228,7 @@ static int ftrace_update_code(struct module *mod)
2173 struct dyn_ftrace *p; 2228 struct dyn_ftrace *p;
2174 cycle_t start, stop; 2229 cycle_t start, stop;
2175 unsigned long ref = 0; 2230 unsigned long ref = 0;
2231 bool test = false;
2176 int i; 2232 int i;
2177 2233
2178 /* 2234 /*
@@ -2186,9 +2242,12 @@ static int ftrace_update_code(struct module *mod)
2186 2242
2187 for (ops = ftrace_ops_list; 2243 for (ops = ftrace_ops_list;
2188 ops != &ftrace_list_end; ops = ops->next) { 2244 ops != &ftrace_list_end; ops = ops->next) {
2189 if (ops->flags & FTRACE_OPS_FL_ENABLED && 2245 if (ops->flags & FTRACE_OPS_FL_ENABLED) {
2190 ops_traces_mod(ops)) 2246 if (ops_traces_mod(ops))
2191 ref++; 2247 ref++;
2248 else
2249 test = true;
2250 }
2192 } 2251 }
2193 } 2252 }
2194 2253
@@ -2198,12 +2257,16 @@ static int ftrace_update_code(struct module *mod)
2198 for (pg = ftrace_new_pgs; pg; pg = pg->next) { 2257 for (pg = ftrace_new_pgs; pg; pg = pg->next) {
2199 2258
2200 for (i = 0; i < pg->index; i++) { 2259 for (i = 0; i < pg->index; i++) {
2260 int cnt = ref;
2261
2201 /* If something went wrong, bail without enabling anything */ 2262 /* If something went wrong, bail without enabling anything */
2202 if (unlikely(ftrace_disabled)) 2263 if (unlikely(ftrace_disabled))
2203 return -1; 2264 return -1;
2204 2265
2205 p = &pg->records[i]; 2266 p = &pg->records[i];
2206 p->flags = ref; 2267 if (test)
2268 cnt += referenced_filters(p);
2269 p->flags = cnt;
2207 2270
2208 /* 2271 /*
2209 * Do the initial record conversion from mcount jump 2272 * Do the initial record conversion from mcount jump
@@ -2223,7 +2286,7 @@ static int ftrace_update_code(struct module *mod)
2223 * conversion puts the module to the correct state, thus 2286 * conversion puts the module to the correct state, thus
2224 * passing the ftrace_make_call check. 2287 * passing the ftrace_make_call check.
2225 */ 2288 */
2226 if (ftrace_start_up && ref) { 2289 if (ftrace_start_up && cnt) {
2227 int failed = __ftrace_replace_code(p, 1); 2290 int failed = __ftrace_replace_code(p, 1);
2228 if (failed) 2291 if (failed)
2229 ftrace_bug(failed, p->ip); 2292 ftrace_bug(failed, p->ip);
@@ -3374,6 +3437,12 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
3374 return add_hash_entry(hash, ip); 3437 return add_hash_entry(hash, ip);
3375} 3438}
3376 3439
3440static void ftrace_ops_update_code(struct ftrace_ops *ops)
3441{
3442 if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled)
3443 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3444}
3445
3377static int 3446static int
3378ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, 3447ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3379 unsigned long ip, int remove, int reset, int enable) 3448 unsigned long ip, int remove, int reset, int enable)
@@ -3416,9 +3485,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3416 3485
3417 mutex_lock(&ftrace_lock); 3486 mutex_lock(&ftrace_lock);
3418 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 3487 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
3419 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED 3488 if (!ret)
3420 && ftrace_enabled) 3489 ftrace_ops_update_code(ops);
3421 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3422 3490
3423 mutex_unlock(&ftrace_lock); 3491 mutex_unlock(&ftrace_lock);
3424 3492
@@ -3645,9 +3713,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
3645 mutex_lock(&ftrace_lock); 3713 mutex_lock(&ftrace_lock);
3646 ret = ftrace_hash_move(iter->ops, filter_hash, 3714 ret = ftrace_hash_move(iter->ops, filter_hash,
3647 orig_hash, iter->hash); 3715 orig_hash, iter->hash);
3648 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) 3716 if (!ret)
3649 && ftrace_enabled) 3717 ftrace_ops_update_code(iter->ops);
3650 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3651 3718
3652 mutex_unlock(&ftrace_lock); 3719 mutex_unlock(&ftrace_lock);
3653 } 3720 }
@@ -4218,7 +4285,7 @@ static inline void ftrace_startup_enable(int command) { }
4218# define ftrace_shutdown_sysctl() do { } while (0) 4285# define ftrace_shutdown_sysctl() do { } while (0)
4219 4286
4220static inline int 4287static inline int
4221ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) 4288ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
4222{ 4289{
4223 return 1; 4290 return 1;
4224} 4291}
@@ -4241,7 +4308,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4241 do_for_each_ftrace_op(op, ftrace_control_list) { 4308 do_for_each_ftrace_op(op, ftrace_control_list) {
4242 if (!(op->flags & FTRACE_OPS_FL_STUB) && 4309 if (!(op->flags & FTRACE_OPS_FL_STUB) &&
4243 !ftrace_function_local_disabled(op) && 4310 !ftrace_function_local_disabled(op) &&
4244 ftrace_ops_test(op, ip)) 4311 ftrace_ops_test(op, ip, regs))
4245 op->func(ip, parent_ip, op, regs); 4312 op->func(ip, parent_ip, op, regs);
4246 } while_for_each_ftrace_op(op); 4313 } while_for_each_ftrace_op(op);
4247 trace_recursion_clear(TRACE_CONTROL_BIT); 4314 trace_recursion_clear(TRACE_CONTROL_BIT);
@@ -4274,7 +4341,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4274 */ 4341 */
4275 preempt_disable_notrace(); 4342 preempt_disable_notrace();
4276 do_for_each_ftrace_op(op, ftrace_ops_list) { 4343 do_for_each_ftrace_op(op, ftrace_ops_list) {
4277 if (ftrace_ops_test(op, ip)) 4344 if (ftrace_ops_test(op, ip, regs))
4278 op->func(ip, parent_ip, op, regs); 4345 op->func(ip, parent_ip, op, regs);
4279 } while_for_each_ftrace_op(op); 4346 } while_for_each_ftrace_op(op);
4280 preempt_enable_notrace(); 4347 preempt_enable_notrace();
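The new referenced_filters()/ops_references_rec() pair in the ftrace hunks replaces the old "count every enabled ops" heuristic, so that a freshly loaded module's records start with a reference count reflecting which enabled ops can actually trace them. The decision the diff encodes can be summarised by this stand-alone model (plain booleans instead of struct ftrace_hash; names illustrative):

```c
#include <stdbool.h>

/*
 * Plain-C model of the predicate ops_references_rec() implements: an ops
 * "references" a function record unless its filter/notrace hashes say
 * otherwise. Purely illustrative; the real code works on hash lookups.
 */
struct ops_model {
	bool enabled;
	bool filter_empty;	/* empty filter hash => traces everything  */
	bool notrace_empty;	/* empty notrace hash => excludes nothing  */
	bool ip_in_filter;	/* record's ip present in the filter hash  */
	bool ip_in_notrace;	/* record's ip present in the notrace hash */
};

static bool ops_references_rec_model(const struct ops_model *ops)
{
	if (!ops->enabled)
		return false;
	/* traces every module function: already counted via 'ref' */
	if (ops->filter_empty && ops->notrace_empty)
		return false;
	/* a non-empty filter must name this function */
	if (!ops->filter_empty && !ops->ip_in_filter)
		return false;
	/* the notrace hash explicitly excludes it */
	if (ops->ip_in_notrace)
		return false;
	return true;
}

int main(void)
{
	struct ops_model ops = { .enabled = true, .ip_in_filter = true };

	return !ops_references_rec_model(&ops);	/* exits 0: record referenced */
}
```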
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e444ff88f0a4..cc2f66f68dc5 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -36,11 +36,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
36{ 36{
37 int ret; 37 int ret;
38 38
39 ret = trace_seq_printf(s, "# compressed entry header\n"); 39 ret = trace_seq_puts(s, "# compressed entry header\n");
40 ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); 40 ret = trace_seq_puts(s, "\ttype_len : 5 bits\n");
41 ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); 41 ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n");
42 ret = trace_seq_printf(s, "\tarray : 32 bits\n"); 42 ret = trace_seq_puts(s, "\tarray : 32 bits\n");
43 ret = trace_seq_printf(s, "\n"); 43 ret = trace_seq_putc(s, '\n');
44 ret = trace_seq_printf(s, "\tpadding : type == %d\n", 44 ret = trace_seq_printf(s, "\tpadding : type == %d\n",
45 RINGBUF_TYPE_PADDING); 45 RINGBUF_TYPE_PADDING);
46 ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", 46 ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
@@ -1066,7 +1066,7 @@ static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
1066} 1066}
1067 1067
1068/** 1068/**
1069 * check_pages - integrity check of buffer pages 1069 * rb_check_pages - integrity check of buffer pages
1070 * @cpu_buffer: CPU buffer with pages to test 1070 * @cpu_buffer: CPU buffer with pages to test
1071 * 1071 *
1072 * As a safety measure we check to make sure the data pages have not 1072 * As a safety measure we check to make sure the data pages have not
@@ -1258,7 +1258,7 @@ static int rb_cpu_notify(struct notifier_block *self,
1258#endif 1258#endif
1259 1259
1260/** 1260/**
1261 * ring_buffer_alloc - allocate a new ring_buffer 1261 * __ring_buffer_alloc - allocate a new ring_buffer
1262 * @size: the size in bytes per cpu that is needed. 1262 * @size: the size in bytes per cpu that is needed.
1263 * @flags: attributes to set for the ring buffer. 1263 * @flags: attributes to set for the ring buffer.
1264 * 1264 *
@@ -1607,6 +1607,7 @@ static void update_pages_handler(struct work_struct *work)
1607 * ring_buffer_resize - resize the ring buffer 1607 * ring_buffer_resize - resize the ring buffer
1608 * @buffer: the buffer to resize. 1608 * @buffer: the buffer to resize.
1609 * @size: the new size. 1609 * @size: the new size.
1610 * @cpu_id: the cpu buffer to resize
1610 * 1611 *
1611 * Minimum size is 2 * BUF_PAGE_SIZE. 1612 * Minimum size is 2 * BUF_PAGE_SIZE.
1612 * 1613 *
@@ -3956,11 +3957,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
3956 * expected. 3957 * expected.
3957 * 3958 *
3958 * After a sequence of ring_buffer_read_prepare calls, the user is 3959 * After a sequence of ring_buffer_read_prepare calls, the user is
3959 * expected to make at least one call to ring_buffer_prepare_sync. 3960 * expected to make at least one call to ring_buffer_read_prepare_sync.
3960 * Afterwards, ring_buffer_read_start is invoked to get things going 3961 * Afterwards, ring_buffer_read_start is invoked to get things going
3961 * for real. 3962 * for real.
3962 * 3963 *
3963 * This overall must be paired with ring_buffer_finish. 3964 * This overall must be paired with ring_buffer_read_finish.
3964 */ 3965 */
3965struct ring_buffer_iter * 3966struct ring_buffer_iter *
3966ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) 3967ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
@@ -4009,7 +4010,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
4009 * an intervening ring_buffer_read_prepare_sync must have been 4010 * an intervening ring_buffer_read_prepare_sync must have been
4010 * performed. 4011 * performed.
4011 * 4012 *
4012 * Must be paired with ring_buffer_finish. 4013 * Must be paired with ring_buffer_read_finish.
4013 */ 4014 */
4014void 4015void
4015ring_buffer_read_start(struct ring_buffer_iter *iter) 4016ring_buffer_read_start(struct ring_buffer_iter *iter)
@@ -4031,7 +4032,7 @@ ring_buffer_read_start(struct ring_buffer_iter *iter)
4031EXPORT_SYMBOL_GPL(ring_buffer_read_start); 4032EXPORT_SYMBOL_GPL(ring_buffer_read_start);
4032 4033
4033/** 4034/**
4034 * ring_buffer_finish - finish reading the iterator of the buffer 4035 * ring_buffer_read_finish - finish reading the iterator of the buffer
4035 * @iter: The iterator retrieved by ring_buffer_start 4036 * @iter: The iterator retrieved by ring_buffer_start
4036 * 4037 *
4037 * This re-enables the recording to the buffer, and frees the 4038 * This re-enables the recording to the buffer, and frees the
@@ -4346,6 +4347,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
4346/** 4347/**
4347 * ring_buffer_alloc_read_page - allocate a page to read from buffer 4348 * ring_buffer_alloc_read_page - allocate a page to read from buffer
4348 * @buffer: the buffer to allocate for. 4349 * @buffer: the buffer to allocate for.
4350 * @cpu: the cpu buffer to allocate.
4349 * 4351 *
4350 * This function is used in conjunction with ring_buffer_read_page. 4352 * This function is used in conjunction with ring_buffer_read_page.
4351 * When reading a full page from the ring buffer, these functions 4353 * When reading a full page from the ring buffer, these functions
@@ -4403,7 +4405,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
4403 * to swap with a page in the ring buffer. 4405 * to swap with a page in the ring buffer.
4404 * 4406 *
4405 * for example: 4407 * for example:
4406 * rpage = ring_buffer_alloc_read_page(buffer); 4408 * rpage = ring_buffer_alloc_read_page(buffer, cpu);
4407 * if (!rpage) 4409 * if (!rpage)
4408 * return error; 4410 * return error;
4409 * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); 4411 * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
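Besides converting constant strings from trace_seq_printf() to trace_seq_puts()/trace_seq_putc(), the ring_buffer hunks fix kernel-doc so the documented pairing matches the real symbol names: prepare an iterator per CPU, synchronize once, then start, consume, finish. A usage sketch under that assumption (function name and error handling are illustrative, not taken from the tree):

```c
#include <linux/errno.h>
#include <linux/ring_buffer.h>

/* Assumed usage of the iterator API whose kernel-doc is corrected above. */
static int dump_cpu_buffer(struct ring_buffer *buffer, int cpu)
{
	struct ring_buffer_iter *iter;
	struct ring_buffer_event *event;
	u64 ts;

	iter = ring_buffer_read_prepare(buffer, cpu);
	if (!iter)
		return -ENOMEM;
	ring_buffer_read_prepare_sync();	/* one sync after all prepares */
	ring_buffer_read_start(iter);

	while ((event = ring_buffer_read(iter, &ts)) != NULL)
		;	/* consume 'event' here */

	ring_buffer_read_finish(iter);		/* re-enables recording */
	return 0;
}
```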
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0cd500bffd9b..496f94d57698 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -243,20 +243,25 @@ int filter_current_check_discard(struct ring_buffer *buffer,
243} 243}
244EXPORT_SYMBOL_GPL(filter_current_check_discard); 244EXPORT_SYMBOL_GPL(filter_current_check_discard);
245 245
246cycle_t ftrace_now(int cpu) 246cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
247{ 247{
248 u64 ts; 248 u64 ts;
249 249
250 /* Early boot up does not have a buffer yet */ 250 /* Early boot up does not have a buffer yet */
251 if (!global_trace.trace_buffer.buffer) 251 if (!buf->buffer)
252 return trace_clock_local(); 252 return trace_clock_local();
253 253
254 ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu); 254 ts = ring_buffer_time_stamp(buf->buffer, cpu);
255 ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts); 255 ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts);
256 256
257 return ts; 257 return ts;
258} 258}
259 259
260cycle_t ftrace_now(int cpu)
261{
262 return buffer_ftrace_now(&global_trace.trace_buffer, cpu);
263}
264
260/** 265/**
261 * tracing_is_enabled - Show if global_trace has been disabled 266 * tracing_is_enabled - Show if global_trace has been disabled
262 * 267 *
@@ -1211,7 +1216,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf)
1211 /* Make sure all commits have finished */ 1216 /* Make sure all commits have finished */
1212 synchronize_sched(); 1217 synchronize_sched();
1213 1218
1214 buf->time_start = ftrace_now(buf->cpu); 1219 buf->time_start = buffer_ftrace_now(buf, buf->cpu);
1215 1220
1216 for_each_online_cpu(cpu) 1221 for_each_online_cpu(cpu)
1217 ring_buffer_reset_cpu(buffer, cpu); 1222 ring_buffer_reset_cpu(buffer, cpu);
@@ -1219,23 +1224,17 @@ void tracing_reset_online_cpus(struct trace_buffer *buf)
1219 ring_buffer_record_enable(buffer); 1224 ring_buffer_record_enable(buffer);
1220} 1225}
1221 1226
1222void tracing_reset_current(int cpu) 1227/* Must have trace_types_lock held */
1223{
1224 tracing_reset(&global_trace.trace_buffer, cpu);
1225}
1226
1227void tracing_reset_all_online_cpus(void) 1228void tracing_reset_all_online_cpus(void)
1228{ 1229{
1229 struct trace_array *tr; 1230 struct trace_array *tr;
1230 1231
1231 mutex_lock(&trace_types_lock);
1232 list_for_each_entry(tr, &ftrace_trace_arrays, list) { 1232 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
1233 tracing_reset_online_cpus(&tr->trace_buffer); 1233 tracing_reset_online_cpus(&tr->trace_buffer);
1234#ifdef CONFIG_TRACER_MAX_TRACE 1234#ifdef CONFIG_TRACER_MAX_TRACE
1235 tracing_reset_online_cpus(&tr->max_buffer); 1235 tracing_reset_online_cpus(&tr->max_buffer);
1236#endif 1236#endif
1237 } 1237 }
1238 mutex_unlock(&trace_types_lock);
1239} 1238}
1240 1239
1241#define SAVED_CMDLINES 128 1240#define SAVED_CMDLINES 128
@@ -2843,6 +2842,17 @@ static int s_show(struct seq_file *m, void *v)
2843 return 0; 2842 return 0;
2844} 2843}
2845 2844
2845/*
2846 * Should be used after trace_array_get(), trace_types_lock
2847 * ensures that i_cdev was already initialized.
2848 */
2849static inline int tracing_get_cpu(struct inode *inode)
2850{
2851 if (inode->i_cdev) /* See trace_create_cpu_file() */
2852 return (long)inode->i_cdev - 1;
2853 return RING_BUFFER_ALL_CPUS;
2854}
2855
2846static const struct seq_operations tracer_seq_ops = { 2856static const struct seq_operations tracer_seq_ops = {
2847 .start = s_start, 2857 .start = s_start,
2848 .next = s_next, 2858 .next = s_next,
@@ -2851,9 +2861,9 @@ static const struct seq_operations tracer_seq_ops = {
2851}; 2861};
2852 2862
2853static struct trace_iterator * 2863static struct trace_iterator *
2854__tracing_open(struct trace_array *tr, struct trace_cpu *tc, 2864__tracing_open(struct inode *inode, struct file *file, bool snapshot)
2855 struct inode *inode, struct file *file, bool snapshot)
2856{ 2865{
2866 struct trace_array *tr = inode->i_private;
2857 struct trace_iterator *iter; 2867 struct trace_iterator *iter;
2858 int cpu; 2868 int cpu;
2859 2869
@@ -2894,8 +2904,8 @@ __tracing_open(struct trace_array *tr, struct trace_cpu *tc,
2894 iter->trace_buffer = &tr->trace_buffer; 2904 iter->trace_buffer = &tr->trace_buffer;
2895 iter->snapshot = snapshot; 2905 iter->snapshot = snapshot;
2896 iter->pos = -1; 2906 iter->pos = -1;
2907 iter->cpu_file = tracing_get_cpu(inode);
2897 mutex_init(&iter->mutex); 2908 mutex_init(&iter->mutex);
2898 iter->cpu_file = tc->cpu;
2899 2909
2900 /* Notify the tracer early; before we stop tracing. */ 2910 /* Notify the tracer early; before we stop tracing. */
2901 if (iter->trace && iter->trace->open) 2911 if (iter->trace && iter->trace->open)
@@ -2971,45 +2981,22 @@ static int tracing_open_generic_tr(struct inode *inode, struct file *filp)
2971 filp->private_data = inode->i_private; 2981 filp->private_data = inode->i_private;
2972 2982
2973 return 0; 2983 return 0;
2974
2975}
2976
2977static int tracing_open_generic_tc(struct inode *inode, struct file *filp)
2978{
2979 struct trace_cpu *tc = inode->i_private;
2980 struct trace_array *tr = tc->tr;
2981
2982 if (tracing_disabled)
2983 return -ENODEV;
2984
2985 if (trace_array_get(tr) < 0)
2986 return -ENODEV;
2987
2988 filp->private_data = inode->i_private;
2989
2990 return 0;
2991
2992} 2984}
2993 2985
2994static int tracing_release(struct inode *inode, struct file *file) 2986static int tracing_release(struct inode *inode, struct file *file)
2995{ 2987{
2988 struct trace_array *tr = inode->i_private;
2996 struct seq_file *m = file->private_data; 2989 struct seq_file *m = file->private_data;
2997 struct trace_iterator *iter; 2990 struct trace_iterator *iter;
2998 struct trace_array *tr;
2999 int cpu; 2991 int cpu;
3000 2992
3001 /* Writes do not use seq_file, need to grab tr from inode */
3002 if (!(file->f_mode & FMODE_READ)) { 2993 if (!(file->f_mode & FMODE_READ)) {
3003 struct trace_cpu *tc = inode->i_private; 2994 trace_array_put(tr);
3004
3005 trace_array_put(tc->tr);
3006 return 0; 2995 return 0;
3007 } 2996 }
3008 2997
2998 /* Writes do not use seq_file */
3009 iter = m->private; 2999 iter = m->private;
3010 tr = iter->tr;
3011 trace_array_put(tr);
3012
3013 mutex_lock(&trace_types_lock); 3000 mutex_lock(&trace_types_lock);
3014 3001
3015 for_each_tracing_cpu(cpu) { 3002 for_each_tracing_cpu(cpu) {
@@ -3023,6 +3010,9 @@ static int tracing_release(struct inode *inode, struct file *file)
3023 if (!iter->snapshot) 3010 if (!iter->snapshot)
3024 /* reenable tracing if it was previously enabled */ 3011 /* reenable tracing if it was previously enabled */
3025 tracing_start_tr(tr); 3012 tracing_start_tr(tr);
3013
3014 __trace_array_put(tr);
3015
3026 mutex_unlock(&trace_types_lock); 3016 mutex_unlock(&trace_types_lock);
3027 3017
3028 mutex_destroy(&iter->mutex); 3018 mutex_destroy(&iter->mutex);
@@ -3042,15 +3032,6 @@ static int tracing_release_generic_tr(struct inode *inode, struct file *file)
3042 return 0; 3032 return 0;
3043} 3033}
3044 3034
3045static int tracing_release_generic_tc(struct inode *inode, struct file *file)
3046{
3047 struct trace_cpu *tc = inode->i_private;
3048 struct trace_array *tr = tc->tr;
3049
3050 trace_array_put(tr);
3051 return 0;
3052}
3053
3054static int tracing_single_release_tr(struct inode *inode, struct file *file) 3035static int tracing_single_release_tr(struct inode *inode, struct file *file)
3055{ 3036{
3056 struct trace_array *tr = inode->i_private; 3037 struct trace_array *tr = inode->i_private;
@@ -3062,8 +3043,7 @@ static int tracing_single_release_tr(struct inode *inode, struct file *file)
3062 3043
3063static int tracing_open(struct inode *inode, struct file *file) 3044static int tracing_open(struct inode *inode, struct file *file)
3064{ 3045{
3065 struct trace_cpu *tc = inode->i_private; 3046 struct trace_array *tr = inode->i_private;
3066 struct trace_array *tr = tc->tr;
3067 struct trace_iterator *iter; 3047 struct trace_iterator *iter;
3068 int ret = 0; 3048 int ret = 0;
3069 3049
@@ -3071,16 +3051,17 @@ static int tracing_open(struct inode *inode, struct file *file)
3071 return -ENODEV; 3051 return -ENODEV;
3072 3052
3073 /* If this file was open for write, then erase contents */ 3053 /* If this file was open for write, then erase contents */
3074 if ((file->f_mode & FMODE_WRITE) && 3054 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
3075 (file->f_flags & O_TRUNC)) { 3055 int cpu = tracing_get_cpu(inode);
3076 if (tc->cpu == RING_BUFFER_ALL_CPUS) 3056
3057 if (cpu == RING_BUFFER_ALL_CPUS)
3077 tracing_reset_online_cpus(&tr->trace_buffer); 3058 tracing_reset_online_cpus(&tr->trace_buffer);
3078 else 3059 else
3079 tracing_reset(&tr->trace_buffer, tc->cpu); 3060 tracing_reset(&tr->trace_buffer, cpu);
3080 } 3061 }
3081 3062
3082 if (file->f_mode & FMODE_READ) { 3063 if (file->f_mode & FMODE_READ) {
3083 iter = __tracing_open(tr, tc, inode, file, false); 3064 iter = __tracing_open(inode, file, false);
3084 if (IS_ERR(iter)) 3065 if (IS_ERR(iter))
3085 ret = PTR_ERR(iter); 3066 ret = PTR_ERR(iter);
3086 else if (trace_flags & TRACE_ITER_LATENCY_FMT) 3067 else if (trace_flags & TRACE_ITER_LATENCY_FMT)
@@ -3447,6 +3428,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
3447static int tracing_trace_options_open(struct inode *inode, struct file *file) 3428static int tracing_trace_options_open(struct inode *inode, struct file *file)
3448{ 3429{
3449 struct trace_array *tr = inode->i_private; 3430 struct trace_array *tr = inode->i_private;
3431 int ret;
3450 3432
3451 if (tracing_disabled) 3433 if (tracing_disabled)
3452 return -ENODEV; 3434 return -ENODEV;
@@ -3454,7 +3436,11 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file)
3454 if (trace_array_get(tr) < 0) 3436 if (trace_array_get(tr) < 0)
3455 return -ENODEV; 3437 return -ENODEV;
3456 3438
3457 return single_open(file, tracing_trace_options_show, inode->i_private); 3439 ret = single_open(file, tracing_trace_options_show, inode->i_private);
3440 if (ret < 0)
3441 trace_array_put(tr);
3442
3443 return ret;
3458} 3444}
3459 3445
3460static const struct file_operations tracing_iter_fops = { 3446static const struct file_operations tracing_iter_fops = {
@@ -3537,14 +3523,14 @@ static const char readme_msg[] =
3537 "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" 3523 "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n"
3538 "\t\t\t Read the contents for more information\n" 3524 "\t\t\t Read the contents for more information\n"
3539#endif 3525#endif
3540#ifdef CONFIG_STACKTRACE 3526#ifdef CONFIG_STACK_TRACER
3541 " stack_trace\t\t- Shows the max stack trace when active\n" 3527 " stack_trace\t\t- Shows the max stack trace when active\n"
3542 " stack_max_size\t- Shows current max stack size that was traced\n" 3528 " stack_max_size\t- Shows current max stack size that was traced\n"
3543 "\t\t\t Write into this file to reset the max size (trigger a new trace)\n" 3529 "\t\t\t Write into this file to reset the max size (trigger a new trace)\n"
3544#ifdef CONFIG_DYNAMIC_FTRACE 3530#ifdef CONFIG_DYNAMIC_FTRACE
3545 " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" 3531 " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n"
3546#endif 3532#endif
3547#endif /* CONFIG_STACKTRACE */ 3533#endif /* CONFIG_STACK_TRACER */
3548; 3534;
3549 3535
3550static ssize_t 3536static ssize_t
@@ -3941,8 +3927,7 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
3941 3927
3942static int tracing_open_pipe(struct inode *inode, struct file *filp) 3928static int tracing_open_pipe(struct inode *inode, struct file *filp)
3943{ 3929{
3944 struct trace_cpu *tc = inode->i_private; 3930 struct trace_array *tr = inode->i_private;
3945 struct trace_array *tr = tc->tr;
3946 struct trace_iterator *iter; 3931 struct trace_iterator *iter;
3947 int ret = 0; 3932 int ret = 0;
3948 3933
@@ -3958,6 +3943,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3958 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 3943 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
3959 if (!iter) { 3944 if (!iter) {
3960 ret = -ENOMEM; 3945 ret = -ENOMEM;
3946 __trace_array_put(tr);
3961 goto out; 3947 goto out;
3962 } 3948 }
3963 3949
@@ -3987,9 +3973,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3987 if (trace_clocks[tr->clock_id].in_ns) 3973 if (trace_clocks[tr->clock_id].in_ns)
3988 iter->iter_flags |= TRACE_FILE_TIME_IN_NS; 3974 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
3989 3975
3990 iter->cpu_file = tc->cpu; 3976 iter->tr = tr;
3991 iter->tr = tc->tr; 3977 iter->trace_buffer = &tr->trace_buffer;
3992 iter->trace_buffer = &tc->tr->trace_buffer; 3978 iter->cpu_file = tracing_get_cpu(inode);
3993 mutex_init(&iter->mutex); 3979 mutex_init(&iter->mutex);
3994 filp->private_data = iter; 3980 filp->private_data = iter;
3995 3981
@@ -4012,8 +3998,7 @@ fail:
4012static int tracing_release_pipe(struct inode *inode, struct file *file) 3998static int tracing_release_pipe(struct inode *inode, struct file *file)
4013{ 3999{
4014 struct trace_iterator *iter = file->private_data; 4000 struct trace_iterator *iter = file->private_data;
4015 struct trace_cpu *tc = inode->i_private; 4001 struct trace_array *tr = inode->i_private;
4016 struct trace_array *tr = tc->tr;
4017 4002
4018 mutex_lock(&trace_types_lock); 4003 mutex_lock(&trace_types_lock);
4019 4004
@@ -4166,6 +4151,7 @@ waitagain:
4166 memset(&iter->seq, 0, 4151 memset(&iter->seq, 0,
4167 sizeof(struct trace_iterator) - 4152 sizeof(struct trace_iterator) -
4168 offsetof(struct trace_iterator, seq)); 4153 offsetof(struct trace_iterator, seq));
4154 cpumask_clear(iter->started);
4169 iter->pos = -1; 4155 iter->pos = -1;
4170 4156
4171 trace_event_read_lock(); 4157 trace_event_read_lock();
@@ -4366,15 +4352,16 @@ static ssize_t
4366tracing_entries_read(struct file *filp, char __user *ubuf, 4352tracing_entries_read(struct file *filp, char __user *ubuf,
4367 size_t cnt, loff_t *ppos) 4353 size_t cnt, loff_t *ppos)
4368{ 4354{
4369 struct trace_cpu *tc = filp->private_data; 4355 struct inode *inode = file_inode(filp);
4370 struct trace_array *tr = tc->tr; 4356 struct trace_array *tr = inode->i_private;
4357 int cpu = tracing_get_cpu(inode);
4371 char buf[64]; 4358 char buf[64];
4372 int r = 0; 4359 int r = 0;
4373 ssize_t ret; 4360 ssize_t ret;
4374 4361
4375 mutex_lock(&trace_types_lock); 4362 mutex_lock(&trace_types_lock);
4376 4363
4377 if (tc->cpu == RING_BUFFER_ALL_CPUS) { 4364 if (cpu == RING_BUFFER_ALL_CPUS) {
4378 int cpu, buf_size_same; 4365 int cpu, buf_size_same;
4379 unsigned long size; 4366 unsigned long size;
4380 4367
@@ -4401,7 +4388,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
4401 } else 4388 } else
4402 r = sprintf(buf, "X\n"); 4389 r = sprintf(buf, "X\n");
4403 } else 4390 } else
4404 r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10); 4391 r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10);
4405 4392
4406 mutex_unlock(&trace_types_lock); 4393 mutex_unlock(&trace_types_lock);
4407 4394
@@ -4413,7 +4400,8 @@ static ssize_t
4413tracing_entries_write(struct file *filp, const char __user *ubuf, 4400tracing_entries_write(struct file *filp, const char __user *ubuf,
4414 size_t cnt, loff_t *ppos) 4401 size_t cnt, loff_t *ppos)
4415{ 4402{
4416 struct trace_cpu *tc = filp->private_data; 4403 struct inode *inode = file_inode(filp);
4404 struct trace_array *tr = inode->i_private;
4417 unsigned long val; 4405 unsigned long val;
4418 int ret; 4406 int ret;
4419 4407
@@ -4427,8 +4415,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
4427 4415
4428 /* value is in KB */ 4416 /* value is in KB */
4429 val <<= 10; 4417 val <<= 10;
4430 4418 ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode));
4431 ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu);
4432 if (ret < 0) 4419 if (ret < 0)
4433 return ret; 4420 return ret;
4434 4421
@@ -4482,7 +4469,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
4482 4469
4483 /* disable tracing ? */ 4470 /* disable tracing ? */
4484 if (trace_flags & TRACE_ITER_STOP_ON_FREE) 4471 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
4485 tracing_off(); 4472 tracer_tracing_off(tr);
4486 /* resize the ring buffer to 0 */ 4473 /* resize the ring buffer to 0 */
4487 tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); 4474 tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
4488 4475
@@ -4647,12 +4634,12 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4647 * New clock may not be consistent with the previous clock. 4634 * New clock may not be consistent with the previous clock.
4648 * Reset the buffer so that it doesn't have incomparable timestamps. 4635 * Reset the buffer so that it doesn't have incomparable timestamps.
4649 */ 4636 */
4650 tracing_reset_online_cpus(&global_trace.trace_buffer); 4637 tracing_reset_online_cpus(&tr->trace_buffer);
4651 4638
4652#ifdef CONFIG_TRACER_MAX_TRACE 4639#ifdef CONFIG_TRACER_MAX_TRACE
4653 if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) 4640 if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
4654 ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); 4641 ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
4655 tracing_reset_online_cpus(&global_trace.max_buffer); 4642 tracing_reset_online_cpus(&tr->max_buffer);
4656#endif 4643#endif
4657 4644
4658 mutex_unlock(&trace_types_lock); 4645 mutex_unlock(&trace_types_lock);
@@ -4689,8 +4676,7 @@ struct ftrace_buffer_info {
4689#ifdef CONFIG_TRACER_SNAPSHOT 4676#ifdef CONFIG_TRACER_SNAPSHOT
4690static int tracing_snapshot_open(struct inode *inode, struct file *file) 4677static int tracing_snapshot_open(struct inode *inode, struct file *file)
4691{ 4678{
4692 struct trace_cpu *tc = inode->i_private; 4679 struct trace_array *tr = inode->i_private;
4693 struct trace_array *tr = tc->tr;
4694 struct trace_iterator *iter; 4680 struct trace_iterator *iter;
4695 struct seq_file *m; 4681 struct seq_file *m;
4696 int ret = 0; 4682 int ret = 0;
@@ -4699,26 +4685,29 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file)
4699 return -ENODEV; 4685 return -ENODEV;
4700 4686
4701 if (file->f_mode & FMODE_READ) { 4687 if (file->f_mode & FMODE_READ) {
4702 iter = __tracing_open(tr, tc, inode, file, true); 4688 iter = __tracing_open(inode, file, true);
4703 if (IS_ERR(iter)) 4689 if (IS_ERR(iter))
4704 ret = PTR_ERR(iter); 4690 ret = PTR_ERR(iter);
4705 } else { 4691 } else {
4706 /* Writes still need the seq_file to hold the private data */ 4692 /* Writes still need the seq_file to hold the private data */
4693 ret = -ENOMEM;
4707 m = kzalloc(sizeof(*m), GFP_KERNEL); 4694 m = kzalloc(sizeof(*m), GFP_KERNEL);
4708 if (!m) 4695 if (!m)
4709 return -ENOMEM; 4696 goto out;
4710 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 4697 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
4711 if (!iter) { 4698 if (!iter) {
4712 kfree(m); 4699 kfree(m);
4713 return -ENOMEM; 4700 goto out;
4714 } 4701 }
4702 ret = 0;
4703
4715 iter->tr = tr; 4704 iter->tr = tr;
4716 iter->trace_buffer = &tc->tr->max_buffer; 4705 iter->trace_buffer = &tr->max_buffer;
4717 iter->cpu_file = tc->cpu; 4706 iter->cpu_file = tracing_get_cpu(inode);
4718 m->private = iter; 4707 m->private = iter;
4719 file->private_data = m; 4708 file->private_data = m;
4720 } 4709 }
4721 4710out:
4722 if (ret < 0) 4711 if (ret < 0)
4723 trace_array_put(tr); 4712 trace_array_put(tr);
4724 4713
@@ -4873,11 +4862,11 @@ static const struct file_operations tracing_pipe_fops = {
4873}; 4862};
4874 4863
4875static const struct file_operations tracing_entries_fops = { 4864static const struct file_operations tracing_entries_fops = {
4876 .open = tracing_open_generic_tc, 4865 .open = tracing_open_generic_tr,
4877 .read = tracing_entries_read, 4866 .read = tracing_entries_read,
4878 .write = tracing_entries_write, 4867 .write = tracing_entries_write,
4879 .llseek = generic_file_llseek, 4868 .llseek = generic_file_llseek,
4880 .release = tracing_release_generic_tc, 4869 .release = tracing_release_generic_tr,
4881}; 4870};
4882 4871
4883static const struct file_operations tracing_total_entries_fops = { 4872static const struct file_operations tracing_total_entries_fops = {
@@ -4929,8 +4918,7 @@ static const struct file_operations snapshot_raw_fops = {
4929 4918
4930static int tracing_buffers_open(struct inode *inode, struct file *filp) 4919static int tracing_buffers_open(struct inode *inode, struct file *filp)
4931{ 4920{
4932 struct trace_cpu *tc = inode->i_private; 4921 struct trace_array *tr = inode->i_private;
4933 struct trace_array *tr = tc->tr;
4934 struct ftrace_buffer_info *info; 4922 struct ftrace_buffer_info *info;
4935 int ret; 4923 int ret;
4936 4924
@@ -4948,10 +4936,8 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
4948 4936
4949 mutex_lock(&trace_types_lock); 4937 mutex_lock(&trace_types_lock);
4950 4938
4951 tr->ref++;
4952
4953 info->iter.tr = tr; 4939 info->iter.tr = tr;
4954 info->iter.cpu_file = tc->cpu; 4940 info->iter.cpu_file = tracing_get_cpu(inode);
4955 info->iter.trace = tr->current_trace; 4941 info->iter.trace = tr->current_trace;
4956 info->iter.trace_buffer = &tr->trace_buffer; 4942 info->iter.trace_buffer = &tr->trace_buffer;
4957 info->spare = NULL; 4943 info->spare = NULL;
@@ -5268,14 +5254,14 @@ static ssize_t
5268tracing_stats_read(struct file *filp, char __user *ubuf, 5254tracing_stats_read(struct file *filp, char __user *ubuf,
5269 size_t count, loff_t *ppos) 5255 size_t count, loff_t *ppos)
5270{ 5256{
5271 struct trace_cpu *tc = filp->private_data; 5257 struct inode *inode = file_inode(filp);
5272 struct trace_array *tr = tc->tr; 5258 struct trace_array *tr = inode->i_private;
5273 struct trace_buffer *trace_buf = &tr->trace_buffer; 5259 struct trace_buffer *trace_buf = &tr->trace_buffer;
5260 int cpu = tracing_get_cpu(inode);
5274 struct trace_seq *s; 5261 struct trace_seq *s;
5275 unsigned long cnt; 5262 unsigned long cnt;
5276 unsigned long long t; 5263 unsigned long long t;
5277 unsigned long usec_rem; 5264 unsigned long usec_rem;
5278 int cpu = tc->cpu;
5279 5265
5280 s = kmalloc(sizeof(*s), GFP_KERNEL); 5266 s = kmalloc(sizeof(*s), GFP_KERNEL);
5281 if (!s) 5267 if (!s)
@@ -5328,9 +5314,10 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
5328} 5314}
5329 5315
5330static const struct file_operations tracing_stats_fops = { 5316static const struct file_operations tracing_stats_fops = {
5331 .open = tracing_open_generic, 5317 .open = tracing_open_generic_tr,
5332 .read = tracing_stats_read, 5318 .read = tracing_stats_read,
5333 .llseek = generic_file_llseek, 5319 .llseek = generic_file_llseek,
5320 .release = tracing_release_generic_tr,
5334}; 5321};
5335 5322
5336#ifdef CONFIG_DYNAMIC_FTRACE 5323#ifdef CONFIG_DYNAMIC_FTRACE
@@ -5519,10 +5506,20 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
5519 return tr->percpu_dir; 5506 return tr->percpu_dir;
5520} 5507}
5521 5508
5509static struct dentry *
5510trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
5511 void *data, long cpu, const struct file_operations *fops)
5512{
5513 struct dentry *ret = trace_create_file(name, mode, parent, data, fops);
5514
5515 if (ret) /* See tracing_get_cpu() */
5516 ret->d_inode->i_cdev = (void *)(cpu + 1);
5517 return ret;
5518}
5519
5522static void 5520static void
5523tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) 5521tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
5524{ 5522{
5525 struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
5526 struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); 5523 struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
5527 struct dentry *d_cpu; 5524 struct dentry *d_cpu;
5528 char cpu_dir[30]; /* 30 characters should be more than enough */ 5525 char cpu_dir[30]; /* 30 characters should be more than enough */
@@ -5538,28 +5535,28 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
5538 } 5535 }
5539 5536
5540 /* per cpu trace_pipe */ 5537 /* per cpu trace_pipe */
5541 trace_create_file("trace_pipe", 0444, d_cpu, 5538 trace_create_cpu_file("trace_pipe", 0444, d_cpu,
5542 (void *)&data->trace_cpu, &tracing_pipe_fops); 5539 tr, cpu, &tracing_pipe_fops);
5543 5540
5544 /* per cpu trace */ 5541 /* per cpu trace */
5545 trace_create_file("trace", 0644, d_cpu, 5542 trace_create_cpu_file("trace", 0644, d_cpu,
5546 (void *)&data->trace_cpu, &tracing_fops); 5543 tr, cpu, &tracing_fops);
5547 5544
5548 trace_create_file("trace_pipe_raw", 0444, d_cpu, 5545 trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu,
5549 (void *)&data->trace_cpu, &tracing_buffers_fops); 5546 tr, cpu, &tracing_buffers_fops);
5550 5547
5551 trace_create_file("stats", 0444, d_cpu, 5548 trace_create_cpu_file("stats", 0444, d_cpu,
5552 (void *)&data->trace_cpu, &tracing_stats_fops); 5549 tr, cpu, &tracing_stats_fops);
5553 5550
5554 trace_create_file("buffer_size_kb", 0444, d_cpu, 5551 trace_create_cpu_file("buffer_size_kb", 0444, d_cpu,
5555 (void *)&data->trace_cpu, &tracing_entries_fops); 5552 tr, cpu, &tracing_entries_fops);
5556 5553
5557#ifdef CONFIG_TRACER_SNAPSHOT 5554#ifdef CONFIG_TRACER_SNAPSHOT
5558 trace_create_file("snapshot", 0644, d_cpu, 5555 trace_create_cpu_file("snapshot", 0644, d_cpu,
5559 (void *)&data->trace_cpu, &snapshot_fops); 5556 tr, cpu, &snapshot_fops);
5560 5557
5561 trace_create_file("snapshot_raw", 0444, d_cpu, 5558 trace_create_cpu_file("snapshot_raw", 0444, d_cpu,
5562 (void *)&data->trace_cpu, &snapshot_raw_fops); 5559 tr, cpu, &snapshot_raw_fops);
5563#endif 5560#endif
5564} 5561}
5565 5562
@@ -5868,17 +5865,6 @@ struct dentry *trace_instance_dir;
5868static void 5865static void
5869init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); 5866init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer);
5870 5867
5871static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf)
5872{
5873 int cpu;
5874
5875 for_each_tracing_cpu(cpu) {
5876 memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu));
5877 per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu;
5878 per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr;
5879 }
5880}
5881
5882static int 5868static int
5883allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) 5869allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
5884{ 5870{
@@ -5896,8 +5882,6 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
5896 return -ENOMEM; 5882 return -ENOMEM;
5897 } 5883 }
5898 5884
5899 init_trace_buffers(tr, buf);
5900
5901 /* Allocate the first page for all buffers */ 5885 /* Allocate the first page for all buffers */
5902 set_buffer_entries(&tr->trace_buffer, 5886 set_buffer_entries(&tr->trace_buffer,
5903 ring_buffer_size(tr->trace_buffer.buffer, 0)); 5887 ring_buffer_size(tr->trace_buffer.buffer, 0));
@@ -5964,17 +5948,15 @@ static int new_instance_create(const char *name)
5964 if (allocate_trace_buffers(tr, trace_buf_size) < 0) 5948 if (allocate_trace_buffers(tr, trace_buf_size) < 0)
5965 goto out_free_tr; 5949 goto out_free_tr;
5966 5950
5967 /* Holder for file callbacks */
5968 tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
5969 tr->trace_cpu.tr = tr;
5970
5971 tr->dir = debugfs_create_dir(name, trace_instance_dir); 5951 tr->dir = debugfs_create_dir(name, trace_instance_dir);
5972 if (!tr->dir) 5952 if (!tr->dir)
5973 goto out_free_tr; 5953 goto out_free_tr;
5974 5954
5975 ret = event_trace_add_tracer(tr->dir, tr); 5955 ret = event_trace_add_tracer(tr->dir, tr);
5976 if (ret) 5956 if (ret) {
5957 debugfs_remove_recursive(tr->dir);
5977 goto out_free_tr; 5958 goto out_free_tr;
5959 }
5978 5960
5979 init_tracer_debugfs(tr, tr->dir); 5961 init_tracer_debugfs(tr, tr->dir);
5980 5962
@@ -6120,13 +6102,13 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6120 tr, &tracing_iter_fops); 6102 tr, &tracing_iter_fops);
6121 6103
6122 trace_create_file("trace", 0644, d_tracer, 6104 trace_create_file("trace", 0644, d_tracer,
6123 (void *)&tr->trace_cpu, &tracing_fops); 6105 tr, &tracing_fops);
6124 6106
6125 trace_create_file("trace_pipe", 0444, d_tracer, 6107 trace_create_file("trace_pipe", 0444, d_tracer,
6126 (void *)&tr->trace_cpu, &tracing_pipe_fops); 6108 tr, &tracing_pipe_fops);
6127 6109
6128 trace_create_file("buffer_size_kb", 0644, d_tracer, 6110 trace_create_file("buffer_size_kb", 0644, d_tracer,
6129 (void *)&tr->trace_cpu, &tracing_entries_fops); 6111 tr, &tracing_entries_fops);
6130 6112
6131 trace_create_file("buffer_total_size_kb", 0444, d_tracer, 6113 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
6132 tr, &tracing_total_entries_fops); 6114 tr, &tracing_total_entries_fops);
@@ -6141,11 +6123,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6141 &trace_clock_fops); 6123 &trace_clock_fops);
6142 6124
6143 trace_create_file("tracing_on", 0644, d_tracer, 6125 trace_create_file("tracing_on", 0644, d_tracer,
6144 tr, &rb_simple_fops); 6126 tr, &rb_simple_fops);
6145 6127
6146#ifdef CONFIG_TRACER_SNAPSHOT 6128#ifdef CONFIG_TRACER_SNAPSHOT
6147 trace_create_file("snapshot", 0644, d_tracer, 6129 trace_create_file("snapshot", 0644, d_tracer,
6148 (void *)&tr->trace_cpu, &snapshot_fops); 6130 tr, &snapshot_fops);
6149#endif 6131#endif
6150 6132
6151 for_each_tracing_cpu(cpu) 6133 for_each_tracing_cpu(cpu)
@@ -6439,10 +6421,6 @@ __init static int tracer_alloc_buffers(void)
6439 6421
6440 global_trace.flags = TRACE_ARRAY_FL_GLOBAL; 6422 global_trace.flags = TRACE_ARRAY_FL_GLOBAL;
6441 6423
6442 /* Holder for file callbacks */
6443 global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
6444 global_trace.trace_cpu.tr = &global_trace;
6445
6446 INIT_LIST_HEAD(&global_trace.systems); 6424 INIT_LIST_HEAD(&global_trace.systems);
6447 INIT_LIST_HEAD(&global_trace.events); 6425 INIT_LIST_HEAD(&global_trace.events);
6448 list_add(&global_trace.list, &ftrace_trace_arrays); 6426 list_add(&global_trace.list, &ftrace_trace_arrays);
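The bulk of the trace.c change drops struct trace_cpu as the private data of the per-CPU debugfs files: trace_create_cpu_file() stashes "cpu + 1" in the inode's otherwise unused i_cdev field and tracing_get_cpu() decodes it, with 0 (never set) mapping to RING_BUFFER_ALL_CPUS so the top-level files keep working unchanged. The encode/decode pair in isolation, as a user-space model (names illustrative):

```c
#include <assert.h>

#define RING_BUFFER_ALL_CPUS	-1

/*
 * Model of the i_cdev trick used by trace_create_cpu_file() /
 * tracing_get_cpu(): store cpu + 1 so that a never-initialised slot
 * (NULL) decodes to "all CPUs".
 */
static void encode_cpu(void **slot, long cpu)
{
	*slot = (void *)(cpu + 1);
}

static long decode_cpu(void *slot)
{
	if (slot)
		return (long)slot - 1;
	return RING_BUFFER_ALL_CPUS;
}

int main(void)
{
	void *slot = NULL;

	assert(decode_cpu(slot) == RING_BUFFER_ALL_CPUS);	/* top-level file */
	encode_cpu(&slot, 3);
	assert(decode_cpu(slot) == 3);				/* per-cpu file */
	return 0;
}
```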
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4a4f6e1828b6..fe39acd4c1aa 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -130,19 +130,12 @@ enum trace_flag_type {
130 130
131struct trace_array; 131struct trace_array;
132 132
133struct trace_cpu {
134 struct trace_array *tr;
135 struct dentry *dir;
136 int cpu;
137};
138
139/* 133/*
140 * The CPU trace array - it consists of thousands of trace entries 134 * The CPU trace array - it consists of thousands of trace entries
141 * plus some other descriptor data: (for example which task started 135 * plus some other descriptor data: (for example which task started
142 * the trace, etc.) 136 * the trace, etc.)
143 */ 137 */
144struct trace_array_cpu { 138struct trace_array_cpu {
145 struct trace_cpu trace_cpu;
146 atomic_t disabled; 139 atomic_t disabled;
147 void *buffer_page; /* ring buffer spare */ 140 void *buffer_page; /* ring buffer spare */
148 141
@@ -196,7 +189,6 @@ struct trace_array {
196 bool allocated_snapshot; 189 bool allocated_snapshot;
197#endif 190#endif
198 int buffer_disabled; 191 int buffer_disabled;
199 struct trace_cpu trace_cpu; /* place holder */
200#ifdef CONFIG_FTRACE_SYSCALLS 192#ifdef CONFIG_FTRACE_SYSCALLS
201 int sys_refcount_enter; 193 int sys_refcount_enter;
202 int sys_refcount_exit; 194 int sys_refcount_exit;
@@ -214,7 +206,6 @@ struct trace_array {
214 struct dentry *event_dir; 206 struct dentry *event_dir;
215 struct list_head systems; 207 struct list_head systems;
216 struct list_head events; 208 struct list_head events;
217 struct task_struct *waiter;
218 int ref; 209 int ref;
219}; 210};
220 211
@@ -680,6 +671,15 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
680 struct trace_array *tr); 671 struct trace_array *tr);
681extern int trace_selftest_startup_branch(struct tracer *trace, 672extern int trace_selftest_startup_branch(struct tracer *trace,
682 struct trace_array *tr); 673 struct trace_array *tr);
674/*
675 * Tracer data references selftest functions that only occur
676 * on boot up. These can be __init functions. Thus, when selftests
677 * are enabled, the tracers need to reference __init functions.
678 */
679#define __tracer_data __refdata
680#else
681/* Tracers are seldom changed. Optimize when selftests are disabled. */
682#define __tracer_data __read_mostly
683#endif /* CONFIG_FTRACE_STARTUP_TEST */ 683#endif /* CONFIG_FTRACE_STARTUP_TEST */
684 684
685extern void *head_page(struct trace_array_cpu *data); 685extern void *head_page(struct trace_array_cpu *data);
@@ -1022,6 +1022,9 @@ extern struct list_head ftrace_events;
1022extern const char *__start___trace_bprintk_fmt[]; 1022extern const char *__start___trace_bprintk_fmt[];
1023extern const char *__stop___trace_bprintk_fmt[]; 1023extern const char *__stop___trace_bprintk_fmt[];
1024 1024
1025extern const char *__start___tracepoint_str[];
1026extern const char *__stop___tracepoint_str[];
1027
1025void trace_printk_init_buffers(void); 1028void trace_printk_init_buffers(void);
1026void trace_printk_start_comm(void); 1029void trace_printk_start_comm(void);
1027int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); 1030int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
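
The trace.h hunk drops struct trace_cpu from both trace_array and trace_array_cpu, exports the __tracepoint_str section bounds, and adds __tracer_data: tracer definitions become __refdata when CONFIG_FTRACE_STARTUP_TEST makes them reference __init selftest code, and stay __read_mostly otherwise. A minimal, illustrative use of the annotation ("example" is not a real tracer):

	#include "trace.h"

	static int example_init(struct trace_array *tr)
	{
		return 0;
	}

	/* placed in .ref.data with selftests built in, .data..read_mostly without */
	static struct tracer example_trace __tracer_data = {
		.name	= "example",
		.init	= example_init,
	};
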
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 84b1e045faba..80c36bcf66e8 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -236,6 +236,10 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
236 236
237 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); 237 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
238 238
239 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
240 "perf buffer not large enough"))
241 return NULL;
242
239 pc = preempt_count(); 243 pc = preempt_count();
240 244
241 *rctxp = perf_swevent_get_recursion_context(); 245 *rctxp = perf_swevent_get_recursion_context();
@@ -266,6 +270,10 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
266 struct pt_regs regs; 270 struct pt_regs regs;
267 int rctx; 271 int rctx;
268 272
273 head = this_cpu_ptr(event_function.perf_events);
274 if (hlist_empty(head))
275 return;
276
269#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ 277#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
270 sizeof(u64)) - sizeof(u32)) 278 sizeof(u64)) - sizeof(u32))
271 279
@@ -279,8 +287,6 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
279 287
280 entry->ip = ip; 288 entry->ip = ip;
281 entry->parent_ip = parent_ip; 289 entry->parent_ip = parent_ip;
282
283 head = this_cpu_ptr(event_function.perf_events);
284 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, 290 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
285 1, &regs, head, NULL); 291 1, &regs, head, NULL);
286 292
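
Both trace_event_perf.c hunks follow the same idea: the oversize check moves into perf_trace_buf_prepare() so every caller gets it, and perf_ftrace_function_call() looks up the per-CPU perf_events list first so it can return before any buffer work when nothing is attached. Schematically (the handler name is illustrative):

	#include <linux/ftrace_event.h>
	#include <linux/percpu.h>

	static void example_perf_handler(struct ftrace_event_call *call)
	{
		struct hlist_head *head;

		head = this_cpu_ptr(call->perf_events);
		if (hlist_empty(head))
			return;	/* no perf event on this CPU: skip sizing,
				 * recursion context and buffer prepare */

		/* ... perf_trace_buf_prepare() / perf_trace_buf_submit() as before ... */
	}
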
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7d854290bf81..29a7ebcfb426 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -409,33 +409,42 @@ static void put_system(struct ftrace_subsystem_dir *dir)
409 mutex_unlock(&event_mutex); 409 mutex_unlock(&event_mutex);
410} 410}
411 411
412/* 412static void remove_subsystem(struct ftrace_subsystem_dir *dir)
413 * Open and update trace_array ref count.
414 * Must have the current trace_array passed to it.
415 */
416static int tracing_open_generic_file(struct inode *inode, struct file *filp)
417{ 413{
418 struct ftrace_event_file *file = inode->i_private; 414 if (!dir)
419 struct trace_array *tr = file->tr; 415 return;
420 int ret;
421 416
422 if (trace_array_get(tr) < 0) 417 if (!--dir->nr_events) {
423 return -ENODEV; 418 debugfs_remove_recursive(dir->entry);
419 list_del(&dir->list);
420 __put_system_dir(dir);
421 }
422}
424 423
425 ret = tracing_open_generic(inode, filp); 424static void *event_file_data(struct file *filp)
426 if (ret < 0) 425{
427 trace_array_put(tr); 426 return ACCESS_ONCE(file_inode(filp)->i_private);
428 return ret;
429} 427}
430 428
431static int tracing_release_generic_file(struct inode *inode, struct file *filp) 429static void remove_event_file_dir(struct ftrace_event_file *file)
432{ 430{
433 struct ftrace_event_file *file = inode->i_private; 431 struct dentry *dir = file->dir;
434 struct trace_array *tr = file->tr; 432 struct dentry *child;
435 433
436 trace_array_put(tr); 434 if (dir) {
435 spin_lock(&dir->d_lock); /* probably unneeded */
436 list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) {
437 if (child->d_inode) /* probably unneeded */
438 child->d_inode->i_private = NULL;
439 }
440 spin_unlock(&dir->d_lock);
437 441
438 return 0; 442 debugfs_remove_recursive(dir);
443 }
444
445 list_del(&file->list);
446 remove_subsystem(file->system);
447 kmem_cache_free(file_cachep, file);
439} 448}
440 449
441/* 450/*
@@ -679,15 +688,25 @@ static ssize_t
679event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, 688event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
680 loff_t *ppos) 689 loff_t *ppos)
681{ 690{
682 struct ftrace_event_file *file = filp->private_data; 691 struct ftrace_event_file *file;
692 unsigned long flags;
683 char buf[4] = "0"; 693 char buf[4] = "0";
684 694
685 if (file->flags & FTRACE_EVENT_FL_ENABLED && 695 mutex_lock(&event_mutex);
686 !(file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) 696 file = event_file_data(filp);
697 if (likely(file))
698 flags = file->flags;
699 mutex_unlock(&event_mutex);
700
701 if (!file)
702 return -ENODEV;
703
704 if (flags & FTRACE_EVENT_FL_ENABLED &&
705 !(flags & FTRACE_EVENT_FL_SOFT_DISABLED))
687 strcpy(buf, "1"); 706 strcpy(buf, "1");
688 707
689 if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED || 708 if (flags & FTRACE_EVENT_FL_SOFT_DISABLED ||
690 file->flags & FTRACE_EVENT_FL_SOFT_MODE) 709 flags & FTRACE_EVENT_FL_SOFT_MODE)
691 strcat(buf, "*"); 710 strcat(buf, "*");
692 711
693 strcat(buf, "\n"); 712 strcat(buf, "\n");
@@ -699,13 +718,10 @@ static ssize_t
699event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 718event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
700 loff_t *ppos) 719 loff_t *ppos)
701{ 720{
702 struct ftrace_event_file *file = filp->private_data; 721 struct ftrace_event_file *file;
703 unsigned long val; 722 unsigned long val;
704 int ret; 723 int ret;
705 724
706 if (!file)
707 return -EINVAL;
708
709 ret = kstrtoul_from_user(ubuf, cnt, 10, &val); 725 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
710 if (ret) 726 if (ret)
711 return ret; 727 return ret;
@@ -717,8 +733,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
717 switch (val) { 733 switch (val) {
718 case 0: 734 case 0:
719 case 1: 735 case 1:
736 ret = -ENODEV;
720 mutex_lock(&event_mutex); 737 mutex_lock(&event_mutex);
721 ret = ftrace_event_enable_disable(file, val); 738 file = event_file_data(filp);
739 if (likely(file))
740 ret = ftrace_event_enable_disable(file, val);
722 mutex_unlock(&event_mutex); 741 mutex_unlock(&event_mutex);
723 break; 742 break;
724 743
@@ -825,65 +844,39 @@ enum {
825 844
826static void *f_next(struct seq_file *m, void *v, loff_t *pos) 845static void *f_next(struct seq_file *m, void *v, loff_t *pos)
827{ 846{
828 struct ftrace_event_call *call = m->private; 847 struct ftrace_event_call *call = event_file_data(m->private);
829 struct ftrace_event_field *field;
830 struct list_head *common_head = &ftrace_common_fields; 848 struct list_head *common_head = &ftrace_common_fields;
831 struct list_head *head = trace_get_fields(call); 849 struct list_head *head = trace_get_fields(call);
850 struct list_head *node = v;
832 851
833 (*pos)++; 852 (*pos)++;
834 853
835 switch ((unsigned long)v) { 854 switch ((unsigned long)v) {
836 case FORMAT_HEADER: 855 case FORMAT_HEADER:
837 if (unlikely(list_empty(common_head))) 856 node = common_head;
838 return NULL; 857 break;
839
840 field = list_entry(common_head->prev,
841 struct ftrace_event_field, link);
842 return field;
843 858
844 case FORMAT_FIELD_SEPERATOR: 859 case FORMAT_FIELD_SEPERATOR:
845 if (unlikely(list_empty(head))) 860 node = head;
846 return NULL; 861 break;
847
848 field = list_entry(head->prev, struct ftrace_event_field, link);
849 return field;
850 862
851 case FORMAT_PRINTFMT: 863 case FORMAT_PRINTFMT:
852 /* all done */ 864 /* all done */
853 return NULL; 865 return NULL;
854 } 866 }
855 867
856 field = v; 868 node = node->prev;
857 if (field->link.prev == common_head) 869 if (node == common_head)
858 return (void *)FORMAT_FIELD_SEPERATOR; 870 return (void *)FORMAT_FIELD_SEPERATOR;
859 else if (field->link.prev == head) 871 else if (node == head)
860 return (void *)FORMAT_PRINTFMT; 872 return (void *)FORMAT_PRINTFMT;
861 873 else
862 field = list_entry(field->link.prev, struct ftrace_event_field, link); 874 return node;
863
864 return field;
865}
866
867static void *f_start(struct seq_file *m, loff_t *pos)
868{
869 loff_t l = 0;
870 void *p;
871
872 /* Start by showing the header */
873 if (!*pos)
874 return (void *)FORMAT_HEADER;
875
876 p = (void *)FORMAT_HEADER;
877 do {
878 p = f_next(m, p, &l);
879 } while (p && l < *pos);
880
881 return p;
882} 875}
883 876
884static int f_show(struct seq_file *m, void *v) 877static int f_show(struct seq_file *m, void *v)
885{ 878{
886 struct ftrace_event_call *call = m->private; 879 struct ftrace_event_call *call = event_file_data(m->private);
887 struct ftrace_event_field *field; 880 struct ftrace_event_field *field;
888 const char *array_descriptor; 881 const char *array_descriptor;
889 882
@@ -904,8 +897,7 @@ static int f_show(struct seq_file *m, void *v)
904 return 0; 897 return 0;
905 } 898 }
906 899
907 field = v; 900 field = list_entry(v, struct ftrace_event_field, link);
908
909 /* 901 /*
910 * Smartly shows the array type(except dynamic array). 902 * Smartly shows the array type(except dynamic array).
911 * Normal: 903 * Normal:
@@ -932,8 +924,25 @@ static int f_show(struct seq_file *m, void *v)
932 return 0; 924 return 0;
933} 925}
934 926
927static void *f_start(struct seq_file *m, loff_t *pos)
928{
929 void *p = (void *)FORMAT_HEADER;
930 loff_t l = 0;
931
932 /* ->stop() is called even if ->start() fails */
933 mutex_lock(&event_mutex);
934 if (!event_file_data(m->private))
935 return ERR_PTR(-ENODEV);
936
937 while (l < *pos && p)
938 p = f_next(m, p, &l);
939
940 return p;
941}
942
935static void f_stop(struct seq_file *m, void *p) 943static void f_stop(struct seq_file *m, void *p)
936{ 944{
945 mutex_unlock(&event_mutex);
937} 946}
938 947
939static const struct seq_operations trace_format_seq_ops = { 948static const struct seq_operations trace_format_seq_ops = {
@@ -945,7 +954,6 @@ static const struct seq_operations trace_format_seq_ops = {
945 954
946static int trace_format_open(struct inode *inode, struct file *file) 955static int trace_format_open(struct inode *inode, struct file *file)
947{ 956{
948 struct ftrace_event_call *call = inode->i_private;
949 struct seq_file *m; 957 struct seq_file *m;
950 int ret; 958 int ret;
951 959
@@ -954,7 +962,7 @@ static int trace_format_open(struct inode *inode, struct file *file)
954 return ret; 962 return ret;
955 963
956 m = file->private_data; 964 m = file->private_data;
957 m->private = call; 965 m->private = file;
958 966
959 return 0; 967 return 0;
960} 968}
@@ -962,45 +970,47 @@ static int trace_format_open(struct inode *inode, struct file *file)
962static ssize_t 970static ssize_t
963event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) 971event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
964{ 972{
965 struct ftrace_event_call *call = filp->private_data; 973 int id = (long)event_file_data(filp);
966 struct trace_seq *s; 974 char buf[32];
967 int r; 975 int len;
968 976
969 if (*ppos) 977 if (*ppos)
970 return 0; 978 return 0;
971 979
972 s = kmalloc(sizeof(*s), GFP_KERNEL); 980 if (unlikely(!id))
973 if (!s) 981 return -ENODEV;
974 return -ENOMEM;
975 982
976 trace_seq_init(s); 983 len = sprintf(buf, "%d\n", id);
977 trace_seq_printf(s, "%d\n", call->event.type);
978 984
979 r = simple_read_from_buffer(ubuf, cnt, ppos, 985 return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
980 s->buffer, s->len);
981 kfree(s);
982 return r;
983} 986}
984 987
985static ssize_t 988static ssize_t
986event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, 989event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
987 loff_t *ppos) 990 loff_t *ppos)
988{ 991{
989 struct ftrace_event_call *call = filp->private_data; 992 struct ftrace_event_call *call;
990 struct trace_seq *s; 993 struct trace_seq *s;
991 int r; 994 int r = -ENODEV;
992 995
993 if (*ppos) 996 if (*ppos)
994 return 0; 997 return 0;
995 998
996 s = kmalloc(sizeof(*s), GFP_KERNEL); 999 s = kmalloc(sizeof(*s), GFP_KERNEL);
1000
997 if (!s) 1001 if (!s)
998 return -ENOMEM; 1002 return -ENOMEM;
999 1003
1000 trace_seq_init(s); 1004 trace_seq_init(s);
1001 1005
1002 print_event_filter(call, s); 1006 mutex_lock(&event_mutex);
1003 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 1007 call = event_file_data(filp);
1008 if (call)
1009 print_event_filter(call, s);
1010 mutex_unlock(&event_mutex);
1011
1012 if (call)
1013 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
1004 1014
1005 kfree(s); 1015 kfree(s);
1006 1016
@@ -1011,9 +1021,9 @@ static ssize_t
1011event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, 1021event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
1012 loff_t *ppos) 1022 loff_t *ppos)
1013{ 1023{
1014 struct ftrace_event_call *call = filp->private_data; 1024 struct ftrace_event_call *call;
1015 char *buf; 1025 char *buf;
1016 int err; 1026 int err = -ENODEV;
1017 1027
1018 if (cnt >= PAGE_SIZE) 1028 if (cnt >= PAGE_SIZE)
1019 return -EINVAL; 1029 return -EINVAL;
@@ -1028,7 +1038,12 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
1028 } 1038 }
1029 buf[cnt] = '\0'; 1039 buf[cnt] = '\0';
1030 1040
1031 err = apply_event_filter(call, buf); 1041 mutex_lock(&event_mutex);
1042 call = event_file_data(filp);
1043 if (call)
1044 err = apply_event_filter(call, buf);
1045 mutex_unlock(&event_mutex);
1046
1032 free_page((unsigned long) buf); 1047 free_page((unsigned long) buf);
1033 if (err < 0) 1048 if (err < 0)
1034 return err; 1049 return err;
@@ -1218,6 +1233,7 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
1218 1233
1219static int ftrace_event_avail_open(struct inode *inode, struct file *file); 1234static int ftrace_event_avail_open(struct inode *inode, struct file *file);
1220static int ftrace_event_set_open(struct inode *inode, struct file *file); 1235static int ftrace_event_set_open(struct inode *inode, struct file *file);
1236static int ftrace_event_release(struct inode *inode, struct file *file);
1221 1237
1222static const struct seq_operations show_event_seq_ops = { 1238static const struct seq_operations show_event_seq_ops = {
1223 .start = t_start, 1239 .start = t_start,
@@ -1245,14 +1261,13 @@ static const struct file_operations ftrace_set_event_fops = {
1245 .read = seq_read, 1261 .read = seq_read,
1246 .write = ftrace_event_write, 1262 .write = ftrace_event_write,
1247 .llseek = seq_lseek, 1263 .llseek = seq_lseek,
1248 .release = seq_release, 1264 .release = ftrace_event_release,
1249}; 1265};
1250 1266
1251static const struct file_operations ftrace_enable_fops = { 1267static const struct file_operations ftrace_enable_fops = {
1252 .open = tracing_open_generic_file, 1268 .open = tracing_open_generic,
1253 .read = event_enable_read, 1269 .read = event_enable_read,
1254 .write = event_enable_write, 1270 .write = event_enable_write,
1255 .release = tracing_release_generic_file,
1256 .llseek = default_llseek, 1271 .llseek = default_llseek,
1257}; 1272};
1258 1273
@@ -1264,7 +1279,6 @@ static const struct file_operations ftrace_event_format_fops = {
1264}; 1279};
1265 1280
1266static const struct file_operations ftrace_event_id_fops = { 1281static const struct file_operations ftrace_event_id_fops = {
1267 .open = tracing_open_generic,
1268 .read = event_id_read, 1282 .read = event_id_read,
1269 .llseek = default_llseek, 1283 .llseek = default_llseek,
1270}; 1284};
@@ -1323,6 +1337,15 @@ ftrace_event_open(struct inode *inode, struct file *file,
1323 return ret; 1337 return ret;
1324} 1338}
1325 1339
1340static int ftrace_event_release(struct inode *inode, struct file *file)
1341{
1342 struct trace_array *tr = inode->i_private;
1343
1344 trace_array_put(tr);
1345
1346 return seq_release(inode, file);
1347}
1348
1326static int 1349static int
1327ftrace_event_avail_open(struct inode *inode, struct file *file) 1350ftrace_event_avail_open(struct inode *inode, struct file *file)
1328{ 1351{
@@ -1336,12 +1359,19 @@ ftrace_event_set_open(struct inode *inode, struct file *file)
1336{ 1359{
1337 const struct seq_operations *seq_ops = &show_set_event_seq_ops; 1360 const struct seq_operations *seq_ops = &show_set_event_seq_ops;
1338 struct trace_array *tr = inode->i_private; 1361 struct trace_array *tr = inode->i_private;
1362 int ret;
1363
1364 if (trace_array_get(tr) < 0)
1365 return -ENODEV;
1339 1366
1340 if ((file->f_mode & FMODE_WRITE) && 1367 if ((file->f_mode & FMODE_WRITE) &&
1341 (file->f_flags & O_TRUNC)) 1368 (file->f_flags & O_TRUNC))
1342 ftrace_clear_events(tr); 1369 ftrace_clear_events(tr);
1343 1370
1344 return ftrace_event_open(inode, file, seq_ops); 1371 ret = ftrace_event_open(inode, file, seq_ops);
1372 if (ret < 0)
1373 trace_array_put(tr);
1374 return ret;
1345} 1375}
1346 1376
1347static struct event_subsystem * 1377static struct event_subsystem *
@@ -1496,8 +1526,8 @@ event_create_dir(struct dentry *parent,
1496 1526
1497#ifdef CONFIG_PERF_EVENTS 1527#ifdef CONFIG_PERF_EVENTS
1498 if (call->event.type && call->class->reg) 1528 if (call->event.type && call->class->reg)
1499 trace_create_file("id", 0444, file->dir, call, 1529 trace_create_file("id", 0444, file->dir,
1500 id); 1530 (void *)(long)call->event.type, id);
1501#endif 1531#endif
1502 1532
1503 /* 1533 /*
@@ -1522,33 +1552,16 @@ event_create_dir(struct dentry *parent,
1522 return 0; 1552 return 0;
1523} 1553}
1524 1554
1525static void remove_subsystem(struct ftrace_subsystem_dir *dir)
1526{
1527 if (!dir)
1528 return;
1529
1530 if (!--dir->nr_events) {
1531 debugfs_remove_recursive(dir->entry);
1532 list_del(&dir->list);
1533 __put_system_dir(dir);
1534 }
1535}
1536
1537static void remove_event_from_tracers(struct ftrace_event_call *call) 1555static void remove_event_from_tracers(struct ftrace_event_call *call)
1538{ 1556{
1539 struct ftrace_event_file *file; 1557 struct ftrace_event_file *file;
1540 struct trace_array *tr; 1558 struct trace_array *tr;
1541 1559
1542 do_for_each_event_file_safe(tr, file) { 1560 do_for_each_event_file_safe(tr, file) {
1543
1544 if (file->event_call != call) 1561 if (file->event_call != call)
1545 continue; 1562 continue;
1546 1563
1547 list_del(&file->list); 1564 remove_event_file_dir(file);
1548 debugfs_remove_recursive(file->dir);
1549 remove_subsystem(file->system);
1550 kmem_cache_free(file_cachep, file);
1551
1552 /* 1565 /*
1553 * The do_for_each_event_file_safe() is 1566 * The do_for_each_event_file_safe() is
1554 * a double loop. After finding the call for this 1567 * a double loop. After finding the call for this
@@ -1700,16 +1713,53 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
1700 destroy_preds(call); 1713 destroy_preds(call);
1701} 1714}
1702 1715
1716static int probe_remove_event_call(struct ftrace_event_call *call)
1717{
1718 struct trace_array *tr;
1719 struct ftrace_event_file *file;
1720
1721#ifdef CONFIG_PERF_EVENTS
1722 if (call->perf_refcount)
1723 return -EBUSY;
1724#endif
1725 do_for_each_event_file(tr, file) {
1726 if (file->event_call != call)
1727 continue;
1728 /*
1729 * We can't rely on the ftrace_event_enable_disable(enable => 0)
1730 * we are going to do, because FTRACE_EVENT_FL_SOFT_MODE can suppress
1731 * TRACE_REG_UNREGISTER.
1732 */
1733 if (file->flags & FTRACE_EVENT_FL_ENABLED)
1734 return -EBUSY;
1735 /*
1736 * The do_for_each_event_file() is
1737 * a double loop. After finding the call for this
1738 * trace_array, we use break to jump to the next
1739 * trace_array.
1740 */
1741 break;
1742 } while_for_each_event_file();
1743
1744 __trace_remove_event_call(call);
1745
1746 return 0;
1747}
1748
1703/* Remove an event_call */ 1749/* Remove an event_call */
1704void trace_remove_event_call(struct ftrace_event_call *call) 1750int trace_remove_event_call(struct ftrace_event_call *call)
1705{ 1751{
1752 int ret;
1753
1706 mutex_lock(&trace_types_lock); 1754 mutex_lock(&trace_types_lock);
1707 mutex_lock(&event_mutex); 1755 mutex_lock(&event_mutex);
1708 down_write(&trace_event_sem); 1756 down_write(&trace_event_sem);
1709 __trace_remove_event_call(call); 1757 ret = probe_remove_event_call(call);
1710 up_write(&trace_event_sem); 1758 up_write(&trace_event_sem);
1711 mutex_unlock(&event_mutex); 1759 mutex_unlock(&event_mutex);
1712 mutex_unlock(&trace_types_lock); 1760 mutex_unlock(&trace_types_lock);
1761
1762 return ret;
1713} 1763}
1714 1764
1715#define for_each_event(event, start, end) \ 1765#define for_each_event(event, start, end) \
@@ -2278,12 +2328,8 @@ __trace_remove_event_dirs(struct trace_array *tr)
2278{ 2328{
2279 struct ftrace_event_file *file, *next; 2329 struct ftrace_event_file *file, *next;
2280 2330
2281 list_for_each_entry_safe(file, next, &tr->events, list) { 2331 list_for_each_entry_safe(file, next, &tr->events, list)
2282 list_del(&file->list); 2332 remove_event_file_dir(file);
2283 debugfs_remove_recursive(file->dir);
2284 remove_subsystem(file->system);
2285 kmem_cache_free(file_cachep, file);
2286 }
2287} 2333}
2288 2334
2289static void 2335static void
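
Most of the trace_events.c changes converge on one pattern: debugfs callbacks stop caching the ftrace_event_file or ftrace_event_call at open time and instead re-read inode->i_private under event_mutex on every operation, since remove_event_file_dir() may clear it while the file is still held open. A condensed sketch of that pattern (example_read is illustrative; event_file_data() mirrors the helper added above):

	#include <linux/fs.h>
	#include <linux/mutex.h>
	#include <linux/ftrace_event.h>
	#include "trace.h"

	static void *event_file_data(struct file *filp)
	{
		return ACCESS_ONCE(file_inode(filp)->i_private);
	}

	static ssize_t example_read(struct file *filp, char __user *ubuf,
				    size_t cnt, loff_t *ppos)
	{
		struct ftrace_event_file *file;
		unsigned long flags = 0;

		mutex_lock(&event_mutex);
		file = event_file_data(filp);
		if (file)
			flags = file->flags;	/* copy what is needed while locked */
		mutex_unlock(&event_mutex);

		if (!file)
			return -ENODEV;		/* the event directory was removed */

		/* ... format `flags` into ubuf ... */
		return 0;
	}
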
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 0d883dc057d6..97daa8cf958d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -637,17 +637,15 @@ static void append_filter_err(struct filter_parse_state *ps,
637 free_page((unsigned long) buf); 637 free_page((unsigned long) buf);
638} 638}
639 639
640/* caller must hold event_mutex */
640void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) 641void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
641{ 642{
642 struct event_filter *filter; 643 struct event_filter *filter = call->filter;
643 644
644 mutex_lock(&event_mutex);
645 filter = call->filter;
646 if (filter && filter->filter_string) 645 if (filter && filter->filter_string)
647 trace_seq_printf(s, "%s\n", filter->filter_string); 646 trace_seq_printf(s, "%s\n", filter->filter_string);
648 else 647 else
649 trace_seq_printf(s, "none\n"); 648 trace_seq_puts(s, "none\n");
650 mutex_unlock(&event_mutex);
651} 649}
652 650
653void print_subsystem_event_filter(struct event_subsystem *system, 651void print_subsystem_event_filter(struct event_subsystem *system,
@@ -660,7 +658,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
660 if (filter && filter->filter_string) 658 if (filter && filter->filter_string)
661 trace_seq_printf(s, "%s\n", filter->filter_string); 659 trace_seq_printf(s, "%s\n", filter->filter_string);
662 else 660 else
663 trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); 661 trace_seq_puts(s, DEFAULT_SYS_FILTER_MESSAGE "\n");
664 mutex_unlock(&event_mutex); 662 mutex_unlock(&event_mutex);
665} 663}
666 664
@@ -1841,23 +1839,22 @@ static int create_system_filter(struct event_subsystem *system,
1841 return err; 1839 return err;
1842} 1840}
1843 1841
1842/* caller must hold event_mutex */
1844int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1843int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1845{ 1844{
1846 struct event_filter *filter; 1845 struct event_filter *filter;
1847 int err = 0; 1846 int err;
1848
1849 mutex_lock(&event_mutex);
1850 1847
1851 if (!strcmp(strstrip(filter_string), "0")) { 1848 if (!strcmp(strstrip(filter_string), "0")) {
1852 filter_disable(call); 1849 filter_disable(call);
1853 filter = call->filter; 1850 filter = call->filter;
1854 if (!filter) 1851 if (!filter)
1855 goto out_unlock; 1852 return 0;
1856 RCU_INIT_POINTER(call->filter, NULL); 1853 RCU_INIT_POINTER(call->filter, NULL);
1857 /* Make sure the filter is not being used */ 1854 /* Make sure the filter is not being used */
1858 synchronize_sched(); 1855 synchronize_sched();
1859 __free_filter(filter); 1856 __free_filter(filter);
1860 goto out_unlock; 1857 return 0;
1861 } 1858 }
1862 1859
1863 err = create_filter(call, filter_string, true, &filter); 1860 err = create_filter(call, filter_string, true, &filter);
@@ -1884,8 +1881,6 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1884 __free_filter(tmp); 1881 __free_filter(tmp);
1885 } 1882 }
1886 } 1883 }
1887out_unlock:
1888 mutex_unlock(&event_mutex);
1889 1884
1890 return err; 1885 return err;
1891} 1886}
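
print_event_filter() and apply_event_filter() no longer take event_mutex themselves; the new "caller must hold event_mutex" comments make locking the caller's responsibility, which is exactly what the trace_events.c hunks above now provide. A schematic caller (the function name is illustrative):

	#include <linux/mutex.h>
	#include "trace.h"

	static int example_set_filter(struct ftrace_event_call *call, char *buf)
	{
		int err = -ENODEV;

		mutex_lock(&event_mutex);
		if (call)			/* NULL once the event has been removed */
			err = apply_event_filter(call, buf);
		mutex_unlock(&event_mutex);

		return err;
	}
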
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index b863f93b30f3..38fe1483c508 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -199,7 +199,7 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
199 return 0; 199 return 0;
200} 200}
201 201
202static struct tracer function_trace __read_mostly = 202static struct tracer function_trace __tracer_data =
203{ 203{
204 .name = "function", 204 .name = "function",
205 .init = function_trace_init, 205 .init = function_trace_init,
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8388bc99f2ee..b5c09242683d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -446,7 +446,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
446 446
447 /* First spaces to align center */ 447 /* First spaces to align center */
448 for (i = 0; i < spaces / 2; i++) { 448 for (i = 0; i < spaces / 2; i++) {
449 ret = trace_seq_printf(s, " "); 449 ret = trace_seq_putc(s, ' ');
450 if (!ret) 450 if (!ret)
451 return TRACE_TYPE_PARTIAL_LINE; 451 return TRACE_TYPE_PARTIAL_LINE;
452 } 452 }
@@ -457,7 +457,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
457 457
458 /* Last spaces to align center */ 458 /* Last spaces to align center */
459 for (i = 0; i < spaces - (spaces / 2); i++) { 459 for (i = 0; i < spaces - (spaces / 2); i++) {
460 ret = trace_seq_printf(s, " "); 460 ret = trace_seq_putc(s, ' ');
461 if (!ret) 461 if (!ret)
462 return TRACE_TYPE_PARTIAL_LINE; 462 return TRACE_TYPE_PARTIAL_LINE;
463 } 463 }
@@ -503,7 +503,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
503 ------------------------------------------ 503 ------------------------------------------
504 504
505 */ 505 */
506 ret = trace_seq_printf(s, 506 ret = trace_seq_puts(s,
507 " ------------------------------------------\n"); 507 " ------------------------------------------\n");
508 if (!ret) 508 if (!ret)
509 return TRACE_TYPE_PARTIAL_LINE; 509 return TRACE_TYPE_PARTIAL_LINE;
@@ -516,7 +516,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
516 if (ret == TRACE_TYPE_PARTIAL_LINE) 516 if (ret == TRACE_TYPE_PARTIAL_LINE)
517 return TRACE_TYPE_PARTIAL_LINE; 517 return TRACE_TYPE_PARTIAL_LINE;
518 518
519 ret = trace_seq_printf(s, " => "); 519 ret = trace_seq_puts(s, " => ");
520 if (!ret) 520 if (!ret)
521 return TRACE_TYPE_PARTIAL_LINE; 521 return TRACE_TYPE_PARTIAL_LINE;
522 522
@@ -524,7 +524,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
524 if (ret == TRACE_TYPE_PARTIAL_LINE) 524 if (ret == TRACE_TYPE_PARTIAL_LINE)
525 return TRACE_TYPE_PARTIAL_LINE; 525 return TRACE_TYPE_PARTIAL_LINE;
526 526
527 ret = trace_seq_printf(s, 527 ret = trace_seq_puts(s,
528 "\n ------------------------------------------\n\n"); 528 "\n ------------------------------------------\n\n");
529 if (!ret) 529 if (!ret)
530 return TRACE_TYPE_PARTIAL_LINE; 530 return TRACE_TYPE_PARTIAL_LINE;
@@ -645,7 +645,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
645 ret = print_graph_proc(s, pid); 645 ret = print_graph_proc(s, pid);
646 if (ret == TRACE_TYPE_PARTIAL_LINE) 646 if (ret == TRACE_TYPE_PARTIAL_LINE)
647 return TRACE_TYPE_PARTIAL_LINE; 647 return TRACE_TYPE_PARTIAL_LINE;
648 ret = trace_seq_printf(s, " | "); 648 ret = trace_seq_puts(s, " | ");
649 if (!ret) 649 if (!ret)
650 return TRACE_TYPE_PARTIAL_LINE; 650 return TRACE_TYPE_PARTIAL_LINE;
651 } 651 }
@@ -657,9 +657,9 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
657 return ret; 657 return ret;
658 658
659 if (type == TRACE_GRAPH_ENT) 659 if (type == TRACE_GRAPH_ENT)
660 ret = trace_seq_printf(s, "==========>"); 660 ret = trace_seq_puts(s, "==========>");
661 else 661 else
662 ret = trace_seq_printf(s, "<=========="); 662 ret = trace_seq_puts(s, "<==========");
663 663
664 if (!ret) 664 if (!ret)
665 return TRACE_TYPE_PARTIAL_LINE; 665 return TRACE_TYPE_PARTIAL_LINE;
@@ -668,7 +668,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
668 if (ret != TRACE_TYPE_HANDLED) 668 if (ret != TRACE_TYPE_HANDLED)
669 return ret; 669 return ret;
670 670
671 ret = trace_seq_printf(s, "\n"); 671 ret = trace_seq_putc(s, '\n');
672 672
673 if (!ret) 673 if (!ret)
674 return TRACE_TYPE_PARTIAL_LINE; 674 return TRACE_TYPE_PARTIAL_LINE;
@@ -705,13 +705,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
705 len += strlen(nsecs_str); 705 len += strlen(nsecs_str);
706 } 706 }
707 707
708 ret = trace_seq_printf(s, " us "); 708 ret = trace_seq_puts(s, " us ");
709 if (!ret) 709 if (!ret)
710 return TRACE_TYPE_PARTIAL_LINE; 710 return TRACE_TYPE_PARTIAL_LINE;
711 711
712 /* Print remaining spaces to fit the row's width */ 712 /* Print remaining spaces to fit the row's width */
713 for (i = len; i < 7; i++) { 713 for (i = len; i < 7; i++) {
714 ret = trace_seq_printf(s, " "); 714 ret = trace_seq_putc(s, ' ');
715 if (!ret) 715 if (!ret)
716 return TRACE_TYPE_PARTIAL_LINE; 716 return TRACE_TYPE_PARTIAL_LINE;
717 } 717 }
@@ -731,13 +731,13 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
731 /* No real data, just filling the column with spaces */ 731 /* No real data, just filling the column with spaces */
732 switch (duration) { 732 switch (duration) {
733 case DURATION_FILL_FULL: 733 case DURATION_FILL_FULL:
734 ret = trace_seq_printf(s, " | "); 734 ret = trace_seq_puts(s, " | ");
735 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 735 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
736 case DURATION_FILL_START: 736 case DURATION_FILL_START:
737 ret = trace_seq_printf(s, " "); 737 ret = trace_seq_puts(s, " ");
738 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 738 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
739 case DURATION_FILL_END: 739 case DURATION_FILL_END:
740 ret = trace_seq_printf(s, " |"); 740 ret = trace_seq_puts(s, " |");
741 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 741 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
742 } 742 }
743 743
@@ -745,10 +745,10 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
745 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { 745 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
746 /* Duration exceeded 100 msecs */ 746 /* Duration exceeded 100 msecs */
747 if (duration > 100000ULL) 747 if (duration > 100000ULL)
748 ret = trace_seq_printf(s, "! "); 748 ret = trace_seq_puts(s, "! ");
749 /* Duration exceeded 10 msecs */ 749 /* Duration exceeded 10 msecs */
750 else if (duration > 10000ULL) 750 else if (duration > 10000ULL)
751 ret = trace_seq_printf(s, "+ "); 751 ret = trace_seq_puts(s, "+ ");
752 } 752 }
753 753
754 /* 754 /*
@@ -757,7 +757,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
757 * to fill out the space. 757 * to fill out the space.
758 */ 758 */
759 if (ret == -1) 759 if (ret == -1)
760 ret = trace_seq_printf(s, " "); 760 ret = trace_seq_puts(s, " ");
761 761
762 /* Catching here any failure that happened above */ 762 /* Catching here any failure that happened above */
763 if (!ret) 763 if (!ret)
@@ -767,7 +767,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
767 if (ret != TRACE_TYPE_HANDLED) 767 if (ret != TRACE_TYPE_HANDLED)
768 return ret; 768 return ret;
769 769
770 ret = trace_seq_printf(s, "| "); 770 ret = trace_seq_puts(s, "| ");
771 if (!ret) 771 if (!ret)
772 return TRACE_TYPE_PARTIAL_LINE; 772 return TRACE_TYPE_PARTIAL_LINE;
773 773
@@ -817,7 +817,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
817 817
818 /* Function */ 818 /* Function */
819 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 819 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
820 ret = trace_seq_printf(s, " "); 820 ret = trace_seq_putc(s, ' ');
821 if (!ret) 821 if (!ret)
822 return TRACE_TYPE_PARTIAL_LINE; 822 return TRACE_TYPE_PARTIAL_LINE;
823 } 823 }
@@ -858,7 +858,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
858 858
859 /* Function */ 859 /* Function */
860 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 860 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
861 ret = trace_seq_printf(s, " "); 861 ret = trace_seq_putc(s, ' ');
862 if (!ret) 862 if (!ret)
863 return TRACE_TYPE_PARTIAL_LINE; 863 return TRACE_TYPE_PARTIAL_LINE;
864 } 864 }
@@ -917,7 +917,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
917 if (ret == TRACE_TYPE_PARTIAL_LINE) 917 if (ret == TRACE_TYPE_PARTIAL_LINE)
918 return TRACE_TYPE_PARTIAL_LINE; 918 return TRACE_TYPE_PARTIAL_LINE;
919 919
920 ret = trace_seq_printf(s, " | "); 920 ret = trace_seq_puts(s, " | ");
921 if (!ret) 921 if (!ret)
922 return TRACE_TYPE_PARTIAL_LINE; 922 return TRACE_TYPE_PARTIAL_LINE;
923 } 923 }
@@ -1117,7 +1117,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1117 1117
1118 /* Closing brace */ 1118 /* Closing brace */
1119 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 1119 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
1120 ret = trace_seq_printf(s, " "); 1120 ret = trace_seq_putc(s, ' ');
1121 if (!ret) 1121 if (!ret)
1122 return TRACE_TYPE_PARTIAL_LINE; 1122 return TRACE_TYPE_PARTIAL_LINE;
1123 } 1123 }
@@ -1129,7 +1129,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1129 * belongs to, write out the function name. 1129 * belongs to, write out the function name.
1130 */ 1130 */
1131 if (func_match) { 1131 if (func_match) {
1132 ret = trace_seq_printf(s, "}\n"); 1132 ret = trace_seq_puts(s, "}\n");
1133 if (!ret) 1133 if (!ret)
1134 return TRACE_TYPE_PARTIAL_LINE; 1134 return TRACE_TYPE_PARTIAL_LINE;
1135 } else { 1135 } else {
@@ -1179,13 +1179,13 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1179 /* Indentation */ 1179 /* Indentation */
1180 if (depth > 0) 1180 if (depth > 0)
1181 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { 1181 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) {
1182 ret = trace_seq_printf(s, " "); 1182 ret = trace_seq_putc(s, ' ');
1183 if (!ret) 1183 if (!ret)
1184 return TRACE_TYPE_PARTIAL_LINE; 1184 return TRACE_TYPE_PARTIAL_LINE;
1185 } 1185 }
1186 1186
1187 /* The comment */ 1187 /* The comment */
1188 ret = trace_seq_printf(s, "/* "); 1188 ret = trace_seq_puts(s, "/* ");
1189 if (!ret) 1189 if (!ret)
1190 return TRACE_TYPE_PARTIAL_LINE; 1190 return TRACE_TYPE_PARTIAL_LINE;
1191 1191
@@ -1216,7 +1216,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1216 s->len--; 1216 s->len--;
1217 } 1217 }
1218 1218
1219 ret = trace_seq_printf(s, " */\n"); 1219 ret = trace_seq_puts(s, " */\n");
1220 if (!ret) 1220 if (!ret)
1221 return TRACE_TYPE_PARTIAL_LINE; 1221 return TRACE_TYPE_PARTIAL_LINE;
1222 1222
@@ -1448,7 +1448,7 @@ static struct trace_event graph_trace_ret_event = {
1448 .funcs = &graph_functions 1448 .funcs = &graph_functions
1449}; 1449};
1450 1450
1451static struct tracer graph_trace __read_mostly = { 1451static struct tracer graph_trace __tracer_data = {
1452 .name = "function_graph", 1452 .name = "function_graph",
1453 .open = graph_trace_open, 1453 .open = graph_trace_open,
1454 .pipe_open = graph_trace_open, 1454 .pipe_open = graph_trace_open,
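
The bulk of the trace_functions_graph.c hunk is mechanical: every trace_seq_printf() whose format string is a bare literal becomes trace_seq_puts(), and single characters become trace_seq_putc(), avoiding a pointless vsnprintf pass on hot output paths. Return-value handling is unchanged, roughly (a condensed illustration, not a function from the file):

	#include <linux/trace_seq.h>

	static int example_emit(struct trace_seq *s)
	{
		if (!trace_seq_puts(s, " | "))	/* was trace_seq_printf(s, " | ") */
			return 0;		/* seq buffer full */
		return trace_seq_putc(s, '\n');	/* was trace_seq_printf(s, "\n") */
	}
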
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 7ed6976493c8..243f6834d026 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -95,7 +95,7 @@ static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp)
95} 95}
96 96
97static int register_probe_event(struct trace_probe *tp); 97static int register_probe_event(struct trace_probe *tp);
98static void unregister_probe_event(struct trace_probe *tp); 98static int unregister_probe_event(struct trace_probe *tp);
99 99
100static DEFINE_MUTEX(probe_lock); 100static DEFINE_MUTEX(probe_lock);
101static LIST_HEAD(probe_list); 101static LIST_HEAD(probe_list);
@@ -243,11 +243,11 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
243static int 243static int
244disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) 244disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
245{ 245{
246 struct event_file_link *link = NULL;
247 int wait = 0;
246 int ret = 0; 248 int ret = 0;
247 249
248 if (file) { 250 if (file) {
249 struct event_file_link *link;
250
251 link = find_event_file_link(tp, file); 251 link = find_event_file_link(tp, file);
252 if (!link) { 252 if (!link) {
253 ret = -EINVAL; 253 ret = -EINVAL;
@@ -255,10 +255,7 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
255 } 255 }
256 256
257 list_del_rcu(&link->list); 257 list_del_rcu(&link->list);
258 /* synchronize with kprobe_trace_func/kretprobe_trace_func */ 258 wait = 1;
259 synchronize_sched();
260 kfree(link);
261
262 if (!list_empty(&tp->files)) 259 if (!list_empty(&tp->files))
263 goto out; 260 goto out;
264 261
@@ -271,8 +268,22 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
271 disable_kretprobe(&tp->rp); 268 disable_kretprobe(&tp->rp);
272 else 269 else
273 disable_kprobe(&tp->rp.kp); 270 disable_kprobe(&tp->rp.kp);
271 wait = 1;
274 } 272 }
275 out: 273 out:
274 if (wait) {
275 /*
276 * Synchronize with kprobe_trace_func/kretprobe_trace_func
277 * to ensure the disabling is complete (all running handlers have
278 * finished). This is not only for kfree(); the caller,
279 * trace_remove_event_call(), also relies on it when releasing
280 * event_call related objects, which will be accessed in
281 * kprobe_trace_func/kretprobe_trace_func.
282 */
283 synchronize_sched();
284 kfree(link); /* Ignored if link == NULL */
285 }
286
276 return ret; 287 return ret;
277} 288}
278 289
@@ -340,9 +351,12 @@ static int unregister_trace_probe(struct trace_probe *tp)
340 if (trace_probe_is_enabled(tp)) 351 if (trace_probe_is_enabled(tp))
341 return -EBUSY; 352 return -EBUSY;
342 353
354 /* Will fail if probe is being used by ftrace or perf */
355 if (unregister_probe_event(tp))
356 return -EBUSY;
357
343 __unregister_trace_probe(tp); 358 __unregister_trace_probe(tp);
344 list_del(&tp->list); 359 list_del(&tp->list);
345 unregister_probe_event(tp);
346 360
347 return 0; 361 return 0;
348} 362}
@@ -621,7 +635,9 @@ static int release_all_trace_probes(void)
621 /* TODO: Use batch unregistration */ 635 /* TODO: Use batch unregistration */
622 while (!list_empty(&probe_list)) { 636 while (!list_empty(&probe_list)) {
623 tp = list_entry(probe_list.next, struct trace_probe, list); 637 tp = list_entry(probe_list.next, struct trace_probe, list);
624 unregister_trace_probe(tp); 638 ret = unregister_trace_probe(tp);
639 if (ret)
640 goto end;
625 free_trace_probe(tp); 641 free_trace_probe(tp);
626 } 642 }
627 643
@@ -1087,9 +1103,6 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)
1087 __size = sizeof(*entry) + tp->size + dsize; 1103 __size = sizeof(*entry) + tp->size + dsize;
1088 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1104 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1089 size -= sizeof(u32); 1105 size -= sizeof(u32);
1090 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1091 "profile buffer not large enough"))
1092 return;
1093 1106
1094 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1107 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1095 if (!entry) 1108 if (!entry)
@@ -1120,9 +1133,6 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,
1120 __size = sizeof(*entry) + tp->size + dsize; 1133 __size = sizeof(*entry) + tp->size + dsize;
1121 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1134 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1122 size -= sizeof(u32); 1135 size -= sizeof(u32);
1123 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1124 "profile buffer not large enough"))
1125 return;
1126 1136
1127 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1137 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1128 if (!entry) 1138 if (!entry)
@@ -1242,11 +1252,15 @@ static int register_probe_event(struct trace_probe *tp)
1242 return ret; 1252 return ret;
1243} 1253}
1244 1254
1245static void unregister_probe_event(struct trace_probe *tp) 1255static int unregister_probe_event(struct trace_probe *tp)
1246{ 1256{
1257 int ret;
1258
1247 /* tp->event is unregistered in trace_remove_event_call() */ 1259 /* tp->event is unregistered in trace_remove_event_call() */
1248 trace_remove_event_call(&tp->call); 1260 ret = trace_remove_event_call(&tp->call);
1249 kfree(tp->call.print_fmt); 1261 if (!ret)
1262 kfree(tp->call.print_fmt);
1263 return ret;
1250} 1264}
1251 1265
1252/* Make a debugfs interface for controlling probe points */ 1266/* Make a debugfs interface for controlling probe points */
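
trace_kprobe.c now treats event removal as fallible: trace_remove_event_call() returns -EBUSY while ftrace or perf still uses the event, unregister_probe_event() propagates that, and call.print_fmt is only freed once removal has succeeded. The calling convention, in outline (my_remove() is an illustrative stand-in for unregister_probe_event(); struct trace_probe is the type local to trace_kprobe.c):

	static int my_remove(struct trace_probe *tp)
	{
		int ret = trace_remove_event_call(&tp->call);

		if (ret)
			return ret;		/* still enabled, or held by perf */

		kfree(tp->call.print_fmt);	/* freed only after removal succeeds */
		return 0;
	}
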
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index a5e8f4878bfa..b3dcfb2f0fef 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -90,7 +90,7 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
90 if (drv) 90 if (drv)
91 ret += trace_seq_printf(s, " %s\n", drv->name); 91 ret += trace_seq_printf(s, " %s\n", drv->name);
92 else 92 else
93 ret += trace_seq_printf(s, " \n"); 93 ret += trace_seq_puts(s, " \n");
94 return ret; 94 return ret;
95} 95}
96 96
@@ -107,7 +107,7 @@ static void mmio_pipe_open(struct trace_iterator *iter)
107 struct header_iter *hiter; 107 struct header_iter *hiter;
108 struct trace_seq *s = &iter->seq; 108 struct trace_seq *s = &iter->seq;
109 109
110 trace_seq_printf(s, "VERSION 20070824\n"); 110 trace_seq_puts(s, "VERSION 20070824\n");
111 111
112 hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); 112 hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);
113 if (!hiter) 113 if (!hiter)
@@ -209,7 +209,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
209 (rw->value >> 0) & 0xff, rw->pc, 0); 209 (rw->value >> 0) & 0xff, rw->pc, 0);
210 break; 210 break;
211 default: 211 default:
212 ret = trace_seq_printf(s, "rw what?\n"); 212 ret = trace_seq_puts(s, "rw what?\n");
213 break; 213 break;
214 } 214 }
215 if (ret) 215 if (ret)
@@ -245,7 +245,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
245 secs, usec_rem, m->map_id, 0UL, 0); 245 secs, usec_rem, m->map_id, 0UL, 0);
246 break; 246 break;
247 default: 247 default:
248 ret = trace_seq_printf(s, "map what?\n"); 248 ret = trace_seq_puts(s, "map what?\n");
249 break; 249 break;
250 } 250 }
251 if (ret) 251 if (ret)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index bb922d9ee51b..34e7cbac0c9c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -78,7 +78,7 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
78 78
79 trace_assign_type(field, entry); 79 trace_assign_type(field, entry);
80 80
81 ret = trace_seq_printf(s, "%s", field->buf); 81 ret = trace_seq_puts(s, field->buf);
82 if (!ret) 82 if (!ret)
83 return TRACE_TYPE_PARTIAL_LINE; 83 return TRACE_TYPE_PARTIAL_LINE;
84 84
@@ -558,14 +558,14 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
558 if (ret) 558 if (ret)
559 ret = trace_seq_puts(s, "??"); 559 ret = trace_seq_puts(s, "??");
560 if (ret) 560 if (ret)
561 ret = trace_seq_puts(s, "\n"); 561 ret = trace_seq_putc(s, '\n');
562 continue; 562 continue;
563 } 563 }
564 if (!ret) 564 if (!ret)
565 break; 565 break;
566 if (ret) 566 if (ret)
567 ret = seq_print_user_ip(s, mm, ip, sym_flags); 567 ret = seq_print_user_ip(s, mm, ip, sym_flags);
568 ret = trace_seq_puts(s, "\n"); 568 ret = trace_seq_putc(s, '\n');
569 } 569 }
570 570
571 if (mm) 571 if (mm)
@@ -579,7 +579,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
579 int ret; 579 int ret;
580 580
581 if (!ip) 581 if (!ip)
582 return trace_seq_printf(s, "0"); 582 return trace_seq_putc(s, '0');
583 583
584 if (sym_flags & TRACE_ITER_SYM_OFFSET) 584 if (sym_flags & TRACE_ITER_SYM_OFFSET)
585 ret = seq_print_sym_offset(s, "%s", ip); 585 ret = seq_print_sym_offset(s, "%s", ip);
@@ -964,14 +964,14 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
964 goto partial; 964 goto partial;
965 965
966 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { 966 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
967 if (!trace_seq_printf(s, " <-")) 967 if (!trace_seq_puts(s, " <-"))
968 goto partial; 968 goto partial;
969 if (!seq_print_ip_sym(s, 969 if (!seq_print_ip_sym(s,
970 field->parent_ip, 970 field->parent_ip,
971 flags)) 971 flags))
972 goto partial; 972 goto partial;
973 } 973 }
974 if (!trace_seq_printf(s, "\n")) 974 if (!trace_seq_putc(s, '\n'))
975 goto partial; 975 goto partial;
976 976
977 return TRACE_TYPE_HANDLED; 977 return TRACE_TYPE_HANDLED;
@@ -1210,7 +1210,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1210 1210
1211 if (!seq_print_ip_sym(s, *p, flags)) 1211 if (!seq_print_ip_sym(s, *p, flags))
1212 goto partial; 1212 goto partial;
1213 if (!trace_seq_puts(s, "\n")) 1213 if (!trace_seq_putc(s, '\n'))
1214 goto partial; 1214 goto partial;
1215 } 1215 }
1216 1216
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index a9077c1b4ad3..2900817ba65c 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -244,12 +244,31 @@ static const char **find_next(void *v, loff_t *pos)
244{ 244{
245 const char **fmt = v; 245 const char **fmt = v;
246 int start_index; 246 int start_index;
247 int last_index;
247 248
248 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; 249 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
249 250
250 if (*pos < start_index) 251 if (*pos < start_index)
251 return __start___trace_bprintk_fmt + *pos; 252 return __start___trace_bprintk_fmt + *pos;
252 253
254 /*
255 * The __tracepoint_str section is treated the same as the
256 * __trace_printk_fmt section. The difference is that the
257 * __trace_printk_fmt section should only be used by trace_printk()
258 * in a debugging environment, as if anything exists in that section
259 * the trace_prink() helper buffers are allocated, which would just
260 * waste space in a production environment.
261 *
262 * The __tracepoint_str sections on the other hand are used by
263 * tracepoints which need to map pointers to their strings to
264 * the ASCII text for userspace.
265 */
266 last_index = start_index;
267 start_index = __stop___tracepoint_str - __start___tracepoint_str;
268
269 if (*pos < last_index + start_index)
270 return __start___tracepoint_str + (*pos - last_index);
271
253 return find_next_mod_format(start_index, v, fmt, pos); 272 return find_next_mod_format(start_index, v, fmt, pos);
254} 273}
255 274
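
The find_next() change folds a second linker section, __tracepoint_str, into the index behind the printk_formats file: positions below the size of the __trace_bprintk_fmt section still come from that section, the next block maps into __tracepoint_str, and anything past both falls through to the per-module formats. The lookup order, modelled with stand-in arrays in place of the __start/__stop linker symbols:

	#include <linux/types.h>

	static const char **pick_format(const char **bprintk_fmt, long nr_bprintk,
					const char **tracepoint_str, long nr_tp_str,
					loff_t pos)
	{
		if (pos < nr_bprintk)
			return bprintk_fmt + pos;		/* __trace_bprintk_fmt */
		if (pos < nr_bprintk + nr_tp_str)
			return tracepoint_str + (pos - nr_bprintk); /* __tracepoint_str */
		return NULL;	/* caller falls back to the module format list */
	}
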
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 322e16461072..8fd03657bc7d 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -175,7 +175,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
175 entry = syscall_nr_to_meta(syscall); 175 entry = syscall_nr_to_meta(syscall);
176 176
177 if (!entry) { 177 if (!entry) {
178 trace_seq_printf(s, "\n"); 178 trace_seq_putc(s, '\n');
179 return TRACE_TYPE_HANDLED; 179 return TRACE_TYPE_HANDLED;
180 } 180 }
181 181
@@ -566,15 +566,15 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
566 if (!sys_data) 566 if (!sys_data)
567 return; 567 return;
568 568
569 head = this_cpu_ptr(sys_data->enter_event->perf_events);
570 if (hlist_empty(head))
571 return;
572
569 /* get the size after alignment with the u32 buffer size field */ 573 /* get the size after alignment with the u32 buffer size field */
570 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); 574 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
571 size = ALIGN(size + sizeof(u32), sizeof(u64)); 575 size = ALIGN(size + sizeof(u32), sizeof(u64));
572 size -= sizeof(u32); 576 size -= sizeof(u32);
573 577
574 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
575 "perf buffer not large enough"))
576 return;
577
578 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 578 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
579 sys_data->enter_event->event.type, regs, &rctx); 579 sys_data->enter_event->event.type, regs, &rctx);
580 if (!rec) 580 if (!rec)
@@ -583,8 +583,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
583 rec->nr = syscall_nr; 583 rec->nr = syscall_nr;
584 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 584 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
585 (unsigned long *)&rec->args); 585 (unsigned long *)&rec->args);
586
587 head = this_cpu_ptr(sys_data->enter_event->perf_events);
588 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 586 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
589} 587}
590 588
@@ -642,18 +640,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
642 if (!sys_data) 640 if (!sys_data)
643 return; 641 return;
644 642
643 head = this_cpu_ptr(sys_data->exit_event->perf_events);
644 if (hlist_empty(head))
645 return;
646
645 /* We can probably do that at build time */ 647 /* We can probably do that at build time */
646 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); 648 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
647 size -= sizeof(u32); 649 size -= sizeof(u32);
648 650
649 /*
650 * Impossible, but be paranoid with the future
651 * How to put this check outside runtime?
652 */
653 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
654 "exit event has grown above perf buffer size"))
655 return;
656
657 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 651 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
658 sys_data->exit_event->event.type, regs, &rctx); 652 sys_data->exit_event->event.type, regs, &rctx);
659 if (!rec) 653 if (!rec)
@@ -661,8 +655,6 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
661 655
662 rec->nr = syscall_nr; 656 rec->nr = syscall_nr;
663 rec->ret = syscall_get_return_value(current, regs); 657 rec->ret = syscall_get_return_value(current, regs);
664
665 head = this_cpu_ptr(sys_data->exit_event->perf_events);
666 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 658 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
667} 659}
668 660
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index d5d0cd368a56..272261b5f94f 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -70,7 +70,7 @@ struct trace_uprobe {
70 (sizeof(struct probe_arg) * (n))) 70 (sizeof(struct probe_arg) * (n)))
71 71
72static int register_uprobe_event(struct trace_uprobe *tu); 72static int register_uprobe_event(struct trace_uprobe *tu);
73static void unregister_uprobe_event(struct trace_uprobe *tu); 73static int unregister_uprobe_event(struct trace_uprobe *tu);
74 74
75static DEFINE_MUTEX(uprobe_lock); 75static DEFINE_MUTEX(uprobe_lock);
76static LIST_HEAD(uprobe_list); 76static LIST_HEAD(uprobe_list);
@@ -164,11 +164,17 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou
164} 164}
165 165
166/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */ 166/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */
167static void unregister_trace_uprobe(struct trace_uprobe *tu) 167static int unregister_trace_uprobe(struct trace_uprobe *tu)
168{ 168{
169 int ret;
170
171 ret = unregister_uprobe_event(tu);
172 if (ret)
173 return ret;
174
169 list_del(&tu->list); 175 list_del(&tu->list);
170 unregister_uprobe_event(tu);
171 free_trace_uprobe(tu); 176 free_trace_uprobe(tu);
177 return 0;
172} 178}
173 179
174/* Register a trace_uprobe and probe_event */ 180/* Register a trace_uprobe and probe_event */
@@ -181,9 +187,12 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
181 187
182 /* register as an event */ 188 /* register as an event */
183 old_tp = find_probe_event(tu->call.name, tu->call.class->system); 189 old_tp = find_probe_event(tu->call.name, tu->call.class->system);
184 if (old_tp) 190 if (old_tp) {
185 /* delete old event */ 191 /* delete old event */
186 unregister_trace_uprobe(old_tp); 192 ret = unregister_trace_uprobe(old_tp);
193 if (ret)
194 goto end;
195 }
187 196
188 ret = register_uprobe_event(tu); 197 ret = register_uprobe_event(tu);
189 if (ret) { 198 if (ret) {
@@ -256,6 +265,8 @@ static int create_trace_uprobe(int argc, char **argv)
256 group = UPROBE_EVENT_SYSTEM; 265 group = UPROBE_EVENT_SYSTEM;
257 266
258 if (is_delete) { 267 if (is_delete) {
268 int ret;
269
259 if (!event) { 270 if (!event) {
260 pr_info("Delete command needs an event name.\n"); 271 pr_info("Delete command needs an event name.\n");
261 return -EINVAL; 272 return -EINVAL;
@@ -269,9 +280,9 @@ static int create_trace_uprobe(int argc, char **argv)
269 return -ENOENT; 280 return -ENOENT;
270 } 281 }
271 /* delete an event */ 282 /* delete an event */
272 unregister_trace_uprobe(tu); 283 ret = unregister_trace_uprobe(tu);
273 mutex_unlock(&uprobe_lock); 284 mutex_unlock(&uprobe_lock);
274 return 0; 285 return ret;
275 } 286 }
276 287
277 if (argc < 2) { 288 if (argc < 2) {
@@ -408,16 +419,20 @@ fail_address_parse:
408 return ret; 419 return ret;
409} 420}
410 421
411static void cleanup_all_probes(void) 422static int cleanup_all_probes(void)
412{ 423{
413 struct trace_uprobe *tu; 424 struct trace_uprobe *tu;
425 int ret = 0;
414 426
415 mutex_lock(&uprobe_lock); 427 mutex_lock(&uprobe_lock);
416 while (!list_empty(&uprobe_list)) { 428 while (!list_empty(&uprobe_list)) {
417 tu = list_entry(uprobe_list.next, struct trace_uprobe, list); 429 tu = list_entry(uprobe_list.next, struct trace_uprobe, list);
418 unregister_trace_uprobe(tu); 430 ret = unregister_trace_uprobe(tu);
431 if (ret)
432 break;
419 } 433 }
420 mutex_unlock(&uprobe_lock); 434 mutex_unlock(&uprobe_lock);
435 return ret;
421} 436}
422 437
423/* Probes listing interfaces */ 438/* Probes listing interfaces */
@@ -462,8 +477,13 @@ static const struct seq_operations probes_seq_op = {
462 477
463static int probes_open(struct inode *inode, struct file *file) 478static int probes_open(struct inode *inode, struct file *file)
464{ 479{
465 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) 480 int ret;
466 cleanup_all_probes(); 481
482 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
483 ret = cleanup_all_probes();
484 if (ret)
485 return ret;
486 }
467 487
468 return seq_open(file, &probes_seq_op); 488 return seq_open(file, &probes_seq_op);
469} 489}
@@ -818,8 +838,6 @@ static void uprobe_perf_print(struct trace_uprobe *tu,
818 838
819 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); 839 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
820 size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); 840 size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32);
821 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
822 return;
823 841
824 preempt_disable(); 842 preempt_disable();
825 head = this_cpu_ptr(call->perf_events); 843 head = this_cpu_ptr(call->perf_events);
@@ -970,12 +988,17 @@ static int register_uprobe_event(struct trace_uprobe *tu)
970 return ret; 988 return ret;
971} 989}
972 990
973static void unregister_uprobe_event(struct trace_uprobe *tu) 991static int unregister_uprobe_event(struct trace_uprobe *tu)
974{ 992{
993 int ret;
994
975 /* tu->event is unregistered in trace_remove_event_call() */ 995 /* tu->event is unregistered in trace_remove_event_call() */
976 trace_remove_event_call(&tu->call); 996 ret = trace_remove_event_call(&tu->call);
997 if (ret)
998 return ret;
977 kfree(tu->call.print_fmt); 999 kfree(tu->call.print_fmt);
978 tu->call.print_fmt = NULL; 1000 tu->call.print_fmt = NULL;
1001 return 0;
979} 1002}
980 1003
981/* Make a trace interface for controling probe points */ 1004/* Make a trace interface for controling probe points */
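
The trace_uprobe.c hunks above change the teardown path from void to int so that a failure in trace_remove_event_call() (for example, the event still being in use) propagates up and the probe is neither unlinked nor freed. A minimal userspace sketch of that pattern follows; remove_event(), unregister_probe() and cleanup_all() are illustrative stand-ins, not the kernel functions:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct probe {
    struct probe *next;
    int busy;                    /* stands in for "the event still has users" */
};

static struct probe *probe_list;

/* May fail, like trace_remove_event_call() returning -EBUSY. */
static int remove_event(struct probe *p)
{
    return p->busy ? -EBUSY : 0;
}

/* Only unlink and free the probe once its event is really gone.
 * (For brevity this only handles the list head, which is all the
 * cleanup loop below ever passes in.) */
static int unregister_probe(struct probe *p)
{
    int ret = remove_event(p);

    if (ret)
        return ret;
    probe_list = p->next;
    free(p);
    return 0;
}

/* Tear everything down, stopping at the first failure, like cleanup_all_probes(). */
static int cleanup_all(void)
{
    int ret = 0;

    while (probe_list) {
        ret = unregister_probe(probe_list);
        if (ret)
            break;
    }
    return ret;
}

int main(void)
{
    struct probe *p = calloc(1, sizeof(*p));

    p->busy = 1;
    probe_list = p;
    printf("cleanup: %d\n", cleanup_all());   /* -EBUSY: probe stays registered */
    p->busy = 0;
    printf("cleanup: %d\n", cleanup_all());   /* 0: list is now empty */
    return 0;
}

The ordering mirrored from the diff is the important part: the fallible unregister step runs first, and only a successful return allows the unlink and free.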
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index d8c30db06c5b..9064b919a406 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -62,6 +62,9 @@ int create_user_ns(struct cred *new)
62 kgid_t group = new->egid; 62 kgid_t group = new->egid;
63 int ret; 63 int ret;
64 64
65 if (parent_ns->level > 32)
66 return -EUSERS;
67
65 /* 68 /*
66 * Verify that we can not violate the policy of which files 69 * Verify that we can not violate the policy of which files
67 * may be accessed that is specified by the root directory, 70 * may be accessed that is specified by the root directory,
@@ -92,6 +95,7 @@ int create_user_ns(struct cred *new)
92 atomic_set(&ns->count, 1); 95 atomic_set(&ns->count, 1);
93 /* Leave the new->user_ns reference with the new user namespace. */ 96 /* Leave the new->user_ns reference with the new user namespace. */
94 ns->parent = parent_ns; 97 ns->parent = parent_ns;
98 ns->level = parent_ns->level + 1;
95 ns->owner = owner; 99 ns->owner = owner;
96 ns->group = group; 100 ns->group = group;
97 101
@@ -105,16 +109,21 @@ int create_user_ns(struct cred *new)
105int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) 109int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
106{ 110{
107 struct cred *cred; 111 struct cred *cred;
112 int err = -ENOMEM;
108 113
109 if (!(unshare_flags & CLONE_NEWUSER)) 114 if (!(unshare_flags & CLONE_NEWUSER))
110 return 0; 115 return 0;
111 116
112 cred = prepare_creds(); 117 cred = prepare_creds();
113 if (!cred) 118 if (cred) {
114 return -ENOMEM; 119 err = create_user_ns(cred);
120 if (err)
121 put_cred(cred);
122 else
123 *new_cred = cred;
124 }
115 125
116 *new_cred = cred; 126 return err;
117 return create_user_ns(cred);
118} 127}
119 128
120void free_user_ns(struct user_namespace *ns) 129void free_user_ns(struct user_namespace *ns)
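
The create_user_ns() hunk introduces a nesting limit: every child namespace records its parent's level plus one, and creation fails with -EUSERS once the parent is already more than 32 levels deep. A self-contained sketch of that depth accounting, using a made-up struct ns and create_child_ns() rather than the kernel types:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_NS_LEVEL 32

struct ns {
    struct ns *parent;
    int level;                   /* 0 for the initial namespace */
};

static int create_child_ns(struct ns *parent, struct ns **out)
{
    struct ns *child;

    /* Refuse to nest any deeper, as the create_user_ns() hunk does. */
    if (parent->level > MAX_NS_LEVEL)
        return -EUSERS;

    child = calloc(1, sizeof(*child));
    if (!child)
        return -ENOMEM;
    child->parent = parent;
    child->level = parent->level + 1;
    *out = child;
    return 0;
}

int main(void)
{
    struct ns root = { .parent = NULL, .level = 0 };
    struct ns *cur = &root;
    int depth = 0, ret;

    /* Children are intentionally leaked; this only demonstrates the depth check. */
    while (!(ret = create_child_ns(cur, &cur)))
        depth++;
    printf("stopped after %d levels with %d\n", depth, ret);   /* 33 levels, then -EUSERS */
    return 0;
}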
diff --git a/kernel/wait.c b/kernel/wait.c
index ce0daa320a26..d550920e040c 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -333,7 +333,8 @@ int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,
333 prepare_to_wait(wq, &q->wait, mode); 333 prepare_to_wait(wq, &q->wait, mode);
334 val = q->key.flags; 334 val = q->key.flags;
335 if (atomic_read(val) == 0) 335 if (atomic_read(val) == 0)
336 ret = (*action)(val); 336 break;
337 ret = (*action)(val);
337 } while (!ret && atomic_read(val) != 0); 338 } while (!ret && atomic_read(val) != 0);
338 finish_wait(wq, &q->wait); 339 finish_wait(wq, &q->wait);
339 return ret; 340 return ret;
@@ -362,8 +363,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
362 363
363/** 364/**
364 * wake_up_atomic_t - Wake up a waiter on a atomic_t 365 * wake_up_atomic_t - Wake up a waiter on a atomic_t
365 * @word: The word being waited on, a kernel virtual address 366 * @p: The atomic_t being waited on, a kernel virtual address
366 * @bit: The bit of the word being waited on
367 * 367 *
368 * Wake up anyone waiting for the atomic_t to go to zero. 368 * Wake up anyone waiting for the atomic_t to go to zero.
369 * 369 *
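
The __wait_on_atomic_t() change inverts the test inside the loop. Previously the caller-supplied action, which is the function that actually sleeps, was only invoked once the atomic_t had already reached zero, so a waiter spun while the count was non-zero and made one needless action call when there was nothing left to wait for. The fixed loop breaks out as soon as the value is zero and otherwise sleeps and re-checks. A self-contained C11 sketch of the corrected control flow; wait_on_atomic() and fake_sleep() are illustrative stand-ins for the kernel machinery:

#include <stdatomic.h>
#include <stdio.h>

static int sleeps;

/*
 * Stand-in for the caller-supplied action, which in the kernel puts the task
 * to sleep until the atomic_t is woken.  Here it just counts the calls and
 * pretends the awaited event makes progress while we "sleep".
 */
static int fake_sleep(atomic_int *val)
{
    sleeps++;
    if (atomic_load(val) > 0)
        atomic_fetch_sub(val, 1);
    return 0;
}

/* Mirrors the fixed loop: sleep only while the value is still non-zero. */
static int wait_on_atomic(atomic_int *val, int (*action)(atomic_int *))
{
    int ret = 0;

    do {
        /* prepare_to_wait() would go here in the kernel version. */
        if (atomic_load(val) == 0)
            break;
        ret = action(val);
    } while (!ret && atomic_load(val) != 0);
    /* finish_wait() would go here. */
    return ret;
}

int main(void)
{
    atomic_int done = 0, pending = 2;

    wait_on_atomic(&done, fake_sleep);      /* already zero: no sleep at all */
    wait_on_atomic(&pending, fake_sleep);   /* sleeps twice, until it reaches zero */
    printf("slept %d times\n", sleeps);     /* 2 */
    return 0;
}

Under the old ordering the already-zero case would have called the action once, and the pending case would have spun without ever sleeping.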
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 1241d8c91d5e..51c4f34d258e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -553,14 +553,6 @@ void __init lockup_detector_init(void)
553{ 553{
554 set_sample_period(); 554 set_sample_period();
555 555
556#ifdef CONFIG_NO_HZ_FULL
557 if (watchdog_user_enabled) {
558 watchdog_user_enabled = 0;
559 pr_warning("Disabled lockup detectors by default for full dynticks\n");
560 pr_warning("You can reactivate it with 'sysctl -w kernel.watchdog=1'\n");
561 }
562#endif
563
564 if (watchdog_user_enabled) 556 if (watchdog_user_enabled)
565 watchdog_enable_all_cpus(); 557 watchdog_enable_all_cpus();
566} 558}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f02c4a4a0c3c..29b79852a845 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -16,9 +16,10 @@
16 * 16 *
17 * This is the generic async execution mechanism. Work items as are 17 * This is the generic async execution mechanism. Work items as are
18 * executed in process context. The worker pool is shared and 18 * executed in process context. The worker pool is shared and
19 * automatically managed. There is one worker pool for each CPU and 19 * automatically managed. There are two worker pools for each CPU (one for
20 * one extra for works which are better served by workers which are 20 * normal work items and the other for high priority ones) and some extra
21 * not bound to any specific CPU. 21 * pools for workqueues which are not bound to any specific CPU - the
22 * number of these backing pools is dynamic.
22 * 23 *
23 * Please read Documentation/workqueue.txt for details. 24 * Please read Documentation/workqueue.txt for details.
24 */ 25 */
@@ -2033,8 +2034,11 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
2033 * multiple times. Does GFP_KERNEL allocations. 2034 * multiple times. Does GFP_KERNEL allocations.
2034 * 2035 *
2035 * RETURNS: 2036 * RETURNS:
2036 * spin_lock_irq(pool->lock) which may be released and regrabbed 2037 * %false if the pool doesn't need management and the caller can safely start
2037 * multiple times. Does GFP_KERNEL allocations. 2038 * processing works, %true indicates that the function released pool->lock
2039 * and reacquired it to perform some management function and that the
2040 * conditions that the caller verified while holding the lock before
2041 * calling the function might no longer be true.
2038 */ 2042 */
2039static bool manage_workers(struct worker *worker) 2043static bool manage_workers(struct worker *worker)
2040{ 2044{
@@ -2201,6 +2205,15 @@ __acquires(&pool->lock)
2201 dump_stack(); 2205 dump_stack();
2202 } 2206 }
2203 2207
2208 /*
2209 * The following prevents a kworker from hogging CPU on !PREEMPT
2210 * kernels, where a requeueing work item waiting for something to
2211 * happen could deadlock with stop_machine as such work item could
2212 * indefinitely requeue itself while all other CPUs are trapped in
2213 * stop_machine.
2214 */
2215 cond_resched();
2216
2204 spin_lock_irq(&pool->lock); 2217 spin_lock_irq(&pool->lock);
2205 2218
2206 /* clear cpu intensive status */ 2219 /* clear cpu intensive status */
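
The cond_resched() added to process_one_work() addresses a livelock on !PREEMPT kernels: a work item that keeps requeueing itself while waiting for an event never leaves the kworker, the CPU never passes through the scheduler, and stop_machine can wait forever with every other CPU already trapped. An explicit preemption point between work items bounds how long one kworker can hold the CPU. A kernel-style sketch of that loop shape, with a hypothetical item type and list standing in for the real pool->worklist handling:

#include <linux/list.h>
#include <linux/sched.h>

/* Hypothetical item type; the real code walks pool->worklist. */
struct item {
    struct list_head entry;
    void (*fn)(struct item *item);
};

static void process_items(struct list_head *queue)
{
    while (!list_empty(queue)) {
        struct item *item = list_first_entry(queue, struct item, entry);

        list_del_init(&item->entry);
        item->fn(item);            /* the callback may requeue itself */

        /*
         * Voluntary preemption point: on !PREEMPT kernels this is the
         * only chance for stop_machine (or anything else) to reclaim
         * the CPU between items.
         */
        cond_resched();
    }
}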
@@ -2817,6 +2830,19 @@ already_gone:
2817 return false; 2830 return false;
2818} 2831}
2819 2832
2833static bool __flush_work(struct work_struct *work)
2834{
2835 struct wq_barrier barr;
2836
2837 if (start_flush_work(work, &barr)) {
2838 wait_for_completion(&barr.done);
2839 destroy_work_on_stack(&barr.work);
2840 return true;
2841 } else {
2842 return false;
2843 }
2844}
2845
2820/** 2846/**
2821 * flush_work - wait for a work to finish executing the last queueing instance 2847 * flush_work - wait for a work to finish executing the last queueing instance
2822 * @work: the work to flush 2848 * @work: the work to flush
@@ -2830,18 +2856,10 @@ already_gone:
2830 */ 2856 */
2831bool flush_work(struct work_struct *work) 2857bool flush_work(struct work_struct *work)
2832{ 2858{
2833 struct wq_barrier barr;
2834
2835 lock_map_acquire(&work->lockdep_map); 2859 lock_map_acquire(&work->lockdep_map);
2836 lock_map_release(&work->lockdep_map); 2860 lock_map_release(&work->lockdep_map);
2837 2861
2838 if (start_flush_work(work, &barr)) { 2862 return __flush_work(work);
2839 wait_for_completion(&barr.done);
2840 destroy_work_on_stack(&barr.work);
2841 return true;
2842 } else {
2843 return false;
2844 }
2845} 2863}
2846EXPORT_SYMBOL_GPL(flush_work); 2864EXPORT_SYMBOL_GPL(flush_work);
2847 2865
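
The flush_work() refactoring keeps the lockdep annotation (the acquire/release of the work's lockdep_map) in the exported function but moves the actual wait into __flush_work(), which performs no annotation. The final hunk in this file then lets work_on_cpu() call __flush_work() directly: its work item is on-stack and cannot deadlock through flushing, and the annotation was producing spurious lockdep reports when work_on_cpu() calls nest. A generic sketch of the idiom, with illustrative flush_thing()/__flush_thing() names rather than the functions shown in the diff:

#include <linux/lockdep.h>
#include <linux/types.h>

struct thing {
    struct lockdep_map lockdep_map;    /* initialised elsewhere with lockdep_init_map() */
    /* ... */
};

/* Does the actual wait; deliberately carries no lockdep annotation. */
static bool __flush_thing(struct thing *t)
{
    /* ... wait for the in-flight user of @t ... */
    return true;
}

bool flush_thing(struct thing *t)
{
    /*
     * Tell lockdep that waiting on @t nests like taking a lock, so genuine
     * flush-from-within-the-callback deadlocks are still reported.
     */
    lock_map_acquire(&t->lockdep_map);
    lock_map_release(&t->lockdep_map);

    return __flush_thing(t);
}

Trusted internal callers that know they cannot deadlock go straight to the helper and avoid the false positive, while every ordinary caller of the public entry point keeps full lockdep coverage.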
@@ -3081,25 +3099,26 @@ static struct workqueue_struct *dev_to_wq(struct device *dev)
3081 return wq_dev->wq; 3099 return wq_dev->wq;
3082} 3100}
3083 3101
3084static ssize_t wq_per_cpu_show(struct device *dev, 3102static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
3085 struct device_attribute *attr, char *buf) 3103 char *buf)
3086{ 3104{
3087 struct workqueue_struct *wq = dev_to_wq(dev); 3105 struct workqueue_struct *wq = dev_to_wq(dev);
3088 3106
3089 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); 3107 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
3090} 3108}
3109static DEVICE_ATTR_RO(per_cpu);
3091 3110
3092static ssize_t wq_max_active_show(struct device *dev, 3111static ssize_t max_active_show(struct device *dev,
3093 struct device_attribute *attr, char *buf) 3112 struct device_attribute *attr, char *buf)
3094{ 3113{
3095 struct workqueue_struct *wq = dev_to_wq(dev); 3114 struct workqueue_struct *wq = dev_to_wq(dev);
3096 3115
3097 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); 3116 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
3098} 3117}
3099 3118
3100static ssize_t wq_max_active_store(struct device *dev, 3119static ssize_t max_active_store(struct device *dev,
3101 struct device_attribute *attr, 3120 struct device_attribute *attr, const char *buf,
3102 const char *buf, size_t count) 3121 size_t count)
3103{ 3122{
3104 struct workqueue_struct *wq = dev_to_wq(dev); 3123 struct workqueue_struct *wq = dev_to_wq(dev);
3105 int val; 3124 int val;
@@ -3110,12 +3129,14 @@ static ssize_t wq_max_active_store(struct device *dev,
3110 workqueue_set_max_active(wq, val); 3129 workqueue_set_max_active(wq, val);
3111 return count; 3130 return count;
3112} 3131}
3132static DEVICE_ATTR_RW(max_active);
3113 3133
3114static struct device_attribute wq_sysfs_attrs[] = { 3134static struct attribute *wq_sysfs_attrs[] = {
3115 __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), 3135 &dev_attr_per_cpu.attr,
3116 __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), 3136 &dev_attr_max_active.attr,
3117 __ATTR_NULL, 3137 NULL,
3118}; 3138};
3139ATTRIBUTE_GROUPS(wq_sysfs);
3119 3140
3120static ssize_t wq_pool_ids_show(struct device *dev, 3141static ssize_t wq_pool_ids_show(struct device *dev,
3121 struct device_attribute *attr, char *buf) 3142 struct device_attribute *attr, char *buf)
@@ -3265,7 +3286,7 @@ static struct device_attribute wq_sysfs_unbound_attrs[] = {
3265 3286
3266static struct bus_type wq_subsys = { 3287static struct bus_type wq_subsys = {
3267 .name = "workqueue", 3288 .name = "workqueue",
3268 .dev_attrs = wq_sysfs_attrs, 3289 .dev_groups = wq_sysfs_groups,
3269}; 3290};
3270 3291
3271static int __init wq_sysfs_init(void) 3292static int __init wq_sysfs_init(void)
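
The sysfs hunks move the workqueue bus from per-device attributes (bus_type.dev_attrs with open-coded __ATTR() entries) to attribute groups (bus_type.dev_groups). DEVICE_ATTR_RO()/DEVICE_ATTR_RW() require the callbacks to be named <attr>_show and <attr>_store, which is why wq_per_cpu_show() becomes per_cpu_show() and so on, and ATTRIBUTE_GROUPS(wq_sysfs) generates wq_sysfs_groups from wq_sysfs_attrs. A condensed sketch of the same idiom for a hypothetical read-only "foo" attribute on an equally hypothetical bus:

#include <linux/device.h>
#include <linux/sysfs.h>

/* DEVICE_ATTR_RO() expects a callback literally named <attr>_show. */
static ssize_t foo_show(struct device *dev, struct device_attribute *attr,
                        char *buf)
{
    return scnprintf(buf, PAGE_SIZE, "%d\n", 42);
}
static DEVICE_ATTR_RO(foo);

static struct attribute *example_attrs[] = {
    &dev_attr_foo.attr,
    NULL,
};
/* Generates example_groups[], ready for bus_type.dev_groups and friends. */
ATTRIBUTE_GROUPS(example);

static struct bus_type example_bus = {
    .name       = "example",
    .dev_groups = example_groups,
};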
@@ -3411,6 +3432,12 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
3411{ 3432{
3412 to->nice = from->nice; 3433 to->nice = from->nice;
3413 cpumask_copy(to->cpumask, from->cpumask); 3434 cpumask_copy(to->cpumask, from->cpumask);
3435 /*
3436 * Unlike hash and equality test, this function doesn't ignore
3437 * ->no_numa as it is used for both pool and wq attrs. Instead,
3438 * get_unbound_pool() explicitly clears ->no_numa after copying.
3439 */
3440 to->no_numa = from->no_numa;
3414} 3441}
3415 3442
3416/* hash value of the content of @attr */ 3443/* hash value of the content of @attr */
@@ -3578,6 +3605,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3578 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ 3605 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
3579 copy_workqueue_attrs(pool->attrs, attrs); 3606 copy_workqueue_attrs(pool->attrs, attrs);
3580 3607
3608 /*
3609 * no_numa isn't a worker_pool attribute, always clear it. See
3610 * 'struct workqueue_attrs' comments for detail.
3611 */
3612 pool->attrs->no_numa = false;
3613
3581 /* if cpumask is contained inside a NUMA node, we belong to that node */ 3614 /* if cpumask is contained inside a NUMA node, we belong to that node */
3582 if (wq_numa_enabled) { 3615 if (wq_numa_enabled) {
3583 for_each_node(node) { 3616 for_each_node(node) {
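
copy_workqueue_attrs() now copies no_numa as well, because the same attrs structure describes both workqueues, where the flag is meaningful, and worker pools, where it is not and must not influence pool hashing or equality tests; get_unbound_pool() therefore clears the field right after the copy. A minimal sketch of that copy-then-neutralise pattern with a made-up attrs type:

#include <stdbool.h>
#include <string.h>

/* Made-up attrs shared by a queue and the pools that back it. */
struct attrs {
    int nice;
    bool no_numa;    /* only meaningful at the workqueue level */
};

static void copy_attrs(struct attrs *to, const struct attrs *from)
{
    /* Copy every field; role-specific fixups are the caller's job. */
    memcpy(to, from, sizeof(*to));
}

static void init_pool_attrs(struct attrs *pool, const struct attrs *wq)
{
    copy_attrs(pool, wq);
    /* no_numa is not a pool attribute: clear it so it can never affect
     * pool hashing or equality tests. */
    pool->no_numa = false;
}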
@@ -4644,7 +4677,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
4644 * Workqueues should be brought up before normal priority CPU notifiers. 4677 * Workqueues should be brought up before normal priority CPU notifiers.
4645 * This will be registered high priority CPU notifier. 4678 * This will be registered high priority CPU notifier.
4646 */ 4679 */
4647static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, 4680static int workqueue_cpu_up_callback(struct notifier_block *nfb,
4648 unsigned long action, 4681 unsigned long action,
4649 void *hcpu) 4682 void *hcpu)
4650{ 4683{
@@ -4697,7 +4730,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
4697 * Workqueues should be brought down after normal priority CPU notifiers. 4730 * Workqueues should be brought down after normal priority CPU notifiers.
4698 * This will be registered as low priority CPU notifier. 4731 * This will be registered as low priority CPU notifier.
4699 */ 4732 */
4700static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, 4733static int workqueue_cpu_down_callback(struct notifier_block *nfb,
4701 unsigned long action, 4734 unsigned long action,
4702 void *hcpu) 4735 void *hcpu)
4703{ 4736{
@@ -4756,7 +4789,14 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
4756 4789
4757 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); 4790 INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
4758 schedule_work_on(cpu, &wfc.work); 4791 schedule_work_on(cpu, &wfc.work);
4759 flush_work(&wfc.work); 4792
4793 /*
4794 * The work item is on-stack and can't lead to deadlock through
4795 * flushing. Use __flush_work() to avoid spurious lockdep warnings
4796 * when work_on_cpu()s are nested.
4797 */
4798 __flush_work(&wfc.work);
4799
4760 return wfc.ret; 4800 return wfc.ret;
4761} 4801}
4762EXPORT_SYMBOL_GPL(work_on_cpu); 4802EXPORT_SYMBOL_GPL(work_on_cpu);